Example Python crawling code (scraping paper information from the PubMed site)
A close friend of mine, a physician, was collecting papers from PubMed by hand, so I spent about two hours in total (including the time to refine the first draft) writing the web scraper below in Python. With this scraper you can collect, for every paper matching a search term, its URL, publication date, authors, affiliations, keywords, and journal name.
In particular, advanced searches that include search options (field tags such as [mesh] or [affiliation], as in the examples below) are also supported. And because it uses $\texttt{urlopen}$ from the standard library, it works even on machines where crawling via the $\texttt{requests}$ library is blocked.
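As a quick illustration of those two points, here is a minimal standalone sketch (separate from the scraper itself; the variable names are only for this example). It URL-encodes a field-tagged query with $\texttt{quote}$ and downloads the corresponding search-results page using $\texttt{urlopen}$ alone, without $\texttt{requests}$:

from urllib.request import urlopen
from urllib.parse import quote

query = "Venoms[mesh] AND pathology[affiliation]"  # field-tagged PubMed query, as in the scraper below
url = "https://pubmed.ncbi.nlm.nih.gov/?term=" + quote(query) + "&page=1"
html = urlopen(url).read().decode("utf-8")  # fetched with urlopen only, no requests
print(len(html), "characters downloaded")   # rough sanity check that the results page arrived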
Code
from urllib.request import urlopen
from urllib.parse import quote
from bs4 import BeautifulSoup
import pandas as pd
import time
from datetime import datetime
# Each tuple in the list variable "searchwordlist" contains a search term as its first element
# and the desired number of search-result pages to scrape as its second element
searchwordlist = [
    ("""
Venoms[mesh] AND pathology[affiliation]
""", 1),
    ("""
Turtle[mesh] AND pathology[affiliation]
""", 16)
]
### Below is the code for scraping
def get_html(url):
    try:
        return urlopen(url)
    except Exception:
        # retry if fetching the HTML of the target webpage failed
        print("Retrying")
        time.sleep(1)
        return get_html(url)
starttime = time.time()
for (searchword, numpages) in searchwordlist:
    urls = []  # initialize the lists that will hold the scraped data
    papertitle = []
    date = []
    affiliations = []
    authors = []
    keywords = []
    journals = []
    for i in range(numpages):  # for each search-result page
        url_searchpage = "https://pubmed.ncbi.nlm.nih.gov/?term=" + quote(searchword) + "&page={}".format(i + 1)  # build the URL of the search-result page
        getdata = get_html(url_searchpage)  # get the HTML of the page
        soup = BeautifulSoup(getdata, "html.parser", from_encoding="utf-8")  # parse the HTML (urlopen returns bytes, so the encoding is given here)
        titles = soup.find_all("a", {"class": "docsum-title"})
        for title in titles:  # for each paper listed on the page
            url_paperpage = "https://pubmed.ncbi.nlm.nih.gov/" + title["data-article-id"]  # build the URL of the paper page (data-article-id is the paper's PubMed ID)
            urls.append(url_paperpage)  # store the URL
            getdata = get_html(url_paperpage)  # get the HTML of the paper page
            soup = BeautifulSoup(getdata, "html.parser", from_encoding="utf-8")  # parse the HTML
            papertitle.append(soup.find("h1", {"class": "heading-title"}).text.strip())  # store the title, stripped of surrounding whitespace
            if soup.find("span", {"class": "cit"}) is None:
                date.append("No date")  # no publication date available: store a default message
            else:
                date.append(soup.find("span", {"class": "cit"}).text.split(";")[0])  # store the publication date
            if soup.find("div", {"class": "authors"}) is None:
                authors.append("No author")  # no author information: store a default message
            else:
                authors.append(soup.find("div", {"class": "authors"}).text.replace(" ", "").replace("Authors", "").replace("Author", "").strip())  # store the authors, with spaces and the heading label removed
            if soup.find("div", {"class": "affiliations"}) is None:
                affiliations.append("No affiliation")  # no affiliation information: store a default message
            else:
                affiliations.append(soup.find("div", {"class": "affiliations"}).text.replace("Affiliations", "").replace("Affiliation", "").strip())  # store the affiliations, with the heading label removed
            abstract = soup.find("div", {"class": "abstract"})  # the keyword list, when present, sits at the end of the abstract block
            if abstract is None or len(abstract.text.split("Keywords:")) < 2:
                keywords.append("No keywords")  # no abstract or no keyword line: store a default message
            else:
                keywords.append(abstract.text.split("Keywords:")[1].strip())  # store the keywords
            if soup.find("span", {"class": "journal"}) is None:
                journals.append("No journal")  # no journal information: store a default message
            else:
                journals.append(soup.find("span", {"class": "journal"}).text.strip())  # store the journal name
        checktime = time.time()
        print("{} of {} pages scraped. {} seconds elapsed".format(i + 1, numpages, round(checktime - starttime, 1)))  # display a progress message
    df_papers = pd.DataFrame(  # combine the lists holding the data into a dataframe
        {
            "Title": papertitle,
            "Authors": authors,
            "Affiliations": affiliations,
            "Keywords": keywords,
            "Date": date,
            "Journal": journals,
            "URL": urls,
            "Searchword:{}".format(searchword.strip()): None  # empty column whose header records the search term
        }
    )
    df_papers.to_excel('results_{}.xlsx'.format(str(datetime.now().replace(microsecond=0)).replace(":", "-")))  # export the dataframe as an Excel file (needs an Excel writer such as openpyxl)
    print("Excel file exported.")