Example Python crawling code (scraping paper information from the PubMed site)
A close friend of mine, a physician, was collecting papers from PubMed by hand, so I spent about two hours in total (including the time to refine the first draft) writing the web scraper below in Python. With this scraper you can collect, for every paper matching a search term, its URL, publication date, authors, affiliations, keywords, and journal name.
In particular, advanced searches that include search options (field tags such as [mesh] or [affiliation], as in the examples below) are also supported. And because it uses $\texttt{urlopen}$ from the standard library, it works even on machines where crawling via the $\texttt{requests}$ library is blocked.
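As a quick illustration of those two points, here is a minimal standalone sketch (separate from the scraper itself; the variable names are only for this example). It URL-encodes a field-tagged query with $\texttt{quote}$ and downloads the corresponding search-results page using $\texttt{urlopen}$ alone, without $\texttt{requests}$:

from urllib.request import urlopen
from urllib.parse import quote

query = "Venoms[mesh] AND pathology[affiliation]"  # field-tagged PubMed query, as in the scraper below
url = "https://pubmed.ncbi.nlm.nih.gov/?term=" + quote(query) + "&page=1"
html = urlopen(url).read().decode("utf-8")  # fetched with urlopen only, no requests
print(len(html), "characters downloaded")   # rough sanity check that the results page arrived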
Code
from urllib.request import urlopen
from urllib.parse import quote
from bs4 import BeautifulSoup
import pandas as pd
import time
from datetime import datetime
# Each tuple in the list variable "searchwordlist" contains a search term as its first element
# and the desired number of search-result pages to scrape as its second element
searchwordlist = [
    ("""
Venoms[mesh] AND pathology[affiliation]
""", 1),
    ("""
Turtle[mesh] AND pathology[affiliation]
""", 16)
]
### Below is the code for scraping
def get_html(url):
    try:
        return urlopen(url)
    except Exception:
        # retry if fetching the HTML of the target webpage failed
        print("Retrying")
        time.sleep(1)
        return get_html(url)
starttime = time.time()
for (searchword, numpages) in searchwordlist:
    urls = []  # initialize the lists that will hold the scraped data
    papertitle = []
    date = []
    affiliations = []
    authors = []
    keywords = []
    journals = []
    for i in range(numpages):  # for each search-result page
        url_searchpage = "https://pubmed.ncbi.nlm.nih.gov/?term=" + quote(searchword) + "&page={}".format(i + 1)  # build the URL of the search-result page
        getdata = get_html(url_searchpage)  # get the HTML of the page
        soup = BeautifulSoup(getdata, "html.parser", from_encoding="utf-8")  # parse the HTML (urlopen returns bytes, so the encoding is given here)
        titles = soup.find_all("a", {"class": "docsum-title"})
        for title in titles:  # for each paper listed on the page
            url_paperpage = "https://pubmed.ncbi.nlm.nih.gov/" + title["data-article-id"]  # build the URL of the paper page (data-article-id is the paper's PubMed ID)
            urls.append(url_paperpage)  # store the URL
            getdata = get_html(url_paperpage)  # get the HTML of the paper page
            soup = BeautifulSoup(getdata, "html.parser", from_encoding="utf-8")  # parse the HTML
            papertitle.append(soup.find("h1", {"class": "heading-title"}).text.strip())  # store the title, stripped of surrounding whitespace
            if soup.find("span", {"class": "cit"}) is None:
                date.append("No date")  # no publication date available: store a default message
            else:
                date.append(soup.find("span", {"class": "cit"}).text.split(";")[0])  # store the publication date
            if soup.find("div", {"class": "authors"}) is None:
                authors.append("No author")  # no author information: store a default message
            else:
                authors.append(soup.find("div", {"class": "authors"}).text.replace(" ", "").replace("Authors", "").replace("Author", "").strip())  # store the authors, with spaces and the heading label removed
            if soup.find("div", {"class": "affiliations"}) is None:
                affiliations.append("No affiliation")  # no affiliation information: store a default message
            else:
                affiliations.append(soup.find("div", {"class": "affiliations"}).text.replace("Affiliations", "").replace("Affiliation", "").strip())  # store the affiliations, with the heading label removed
            abstract = soup.find("div", {"class": "abstract"})  # the keyword list, when present, sits at the end of the abstract block
            if abstract is None or len(abstract.text.split("Keywords:")) < 2:
                keywords.append("No keywords")  # no abstract or no keyword line: store a default message
            else:
                keywords.append(abstract.text.split("Keywords:")[1].strip())  # store the keywords
            if soup.find("span", {"class": "journal"}) is None:
                journals.append("No journal")  # no journal information: store a default message
            else:
                journals.append(soup.find("span", {"class": "journal"}).text.strip())  # store the journal name
        checktime = time.time()
        print("{} of {} pages scraped. {} seconds elapsed".format(i + 1, numpages, round(checktime - starttime, 1)))  # display a progress message
    df_papers = pd.DataFrame(  # combine the lists holding the data into a dataframe
        {
            "Title": papertitle,
            "Authors": authors,
            "Affiliations": affiliations,
            "Keywords": keywords,
            "Date": date,
            "Journal": journals,
            "URL": urls,
            "Searchword:{}".format(searchword.strip()): None  # empty column whose header records the search term
        }
    )
    df_papers.to_excel('results_{}.xlsx'.format(str(datetime.now().replace(microsecond=0)).replace(":", "-")))  # export the dataframe as an Excel file (needs an Excel writer such as openpyxl)
    print("Excel file exported.")