Python 크롤링 코드 예시 (PubMed 사이트에서 논문 정보 크롤링)
의사인 절친이 PubMed 사이트에서 수동으로 논문을 수집하고 있길래, 총 두 시간 정도를 써서 (초안 작성 후 개량 시간 포함) Python으로 아래의 web scraper를 만들어 준 적이 있다. 아래의 web scraper로 검색어에 대응하는 논문들의 URL/ 출판일/ 저자/ 소속/ 키워드/ 저널명 수집이 가능하다.
특히 검색 option을 포함한 상세 검색도 가능하다. 또한 $\texttt{urllib}$ 라이브러리의 $\texttt{urlopen}$ 함수를 사용했기에, $\texttt{requests}$ 라이브러리를 통한 크롤링이 막혀 있는 컴퓨터들에서도 사용이 가능하다.
from urllib.request import urlopen
from urllib.parse import quote
from bs4 import BeautifulSoup
import pandas as pd
import time
from datetime import datetime
# Search jobs to run. Each tuple is (search term, number of result pages to scrape).
# The search term may use PubMed field tags such as [mesh] or [affiliation];
# it is URL-quoted before being sent, so it can be written here as plain text.
searchwordlist = [
    ("""
Venoms[mesh] AND pathology[affiliation]
""", 1),
    ("""
Turtle[mesh] AND pathology[affiliation]
""", 16),
]
### Below is the code for scraping
def get_html(url, max_attempts=5, delay=1.0, timeout=30.0):
    """Open *url* with ``urlopen`` and return the response object.

    A failed request is retried, but only a bounded number of times and with
    a pause between attempts, so a permanently failing URL ends in an
    exception instead of endless recursion.

    Args:
        url: Address to open.
        max_attempts: Total number of attempts to make (must be at least 1).
        delay: Seconds to sleep between two attempts.
        timeout: Timeout in seconds handed to ``urlopen`` for each attempt.

    Returns:
        The object returned by ``urlopen``. The caller should close it
        (for example by using it in a ``with`` statement).

    Raises:
        ValueError: If *max_attempts* is smaller than 1.
        OSError: If the last attempt failed with a network or URL error
            (``urllib.error.URLError`` is a subclass of ``OSError``).
        http.client.HTTPException: If the last attempt failed with a
            protocol-level error.
    """
    # Imported locally so the file's top-level import block stays untouched.
    from http.client import HTTPException

    if max_attempts < 1:
        raise ValueError("max_attempts must be at least 1")

    for attempt in range(1, max_attempts + 1):
        try:
            return urlopen(url, timeout=timeout)
        except (OSError, HTTPException):
            # Give up after the final attempt and let the caller see the error.
            if attempt == max_attempts:
                raise
            # Pause before retrying instead of re-requesting immediately.
            time.sleep(delay)
# Base address of PubMed; search pages are "<base>?term=...&page=N" and
# paper pages are "<base><article id>".
_PUBMED_URL = "https://pubmed.ncbi.nlm.nih.gov/"

# Column order of the exported sheet.
_COLUMNS = ["Title", "Authors", "Affiliations", "Keywords", "Date", "Journal", "URL"]


def _node_text(soup, tag, css_class):
    """Return the text of the first ``<tag class=css_class>`` element, or None if there is none."""
    node = soup.find(tag, {"class": css_class})
    return None if node is None else node.text


def _parse_paper(soup, url):
    """Build one result row (a dict keyed by ``_COLUMNS``) from a parsed paper page.

    Every field falls back to a default message when the element that
    normally carries it is missing from the page.
    """
    title = _node_text(soup, "h1", "heading-title")
    citation = _node_text(soup, "span", "cit")
    authors = _node_text(soup, "div", "authors")
    affiliations = _node_text(soup, "div", "affiliations")
    abstract = _node_text(soup, "div", "abstract")
    journal = _node_text(soup, "span", "journal")

    if authors is not None:
        # Drop the section label, then collapse every run of whitespace
        # (newlines, non-breaking spaces, indentation) into a single space.
        # NOTE(review): the published snippet removed *all* spaces here, which
        # would glue author names together; the whitespace arguments appear to
        # have been lost, so whitespace is normalised instead. Confirm.
        authors = " ".join(authors.replace("Authors", "").replace("Author", "").split())

    if affiliations is not None:
        affiliations = affiliations.replace("Affiliations", "").replace("Affiliation", "").strip()

    # Keywords, when present, follow a "Keywords:" label inside the abstract block.
    keyword_parts = [] if abstract is None else abstract.split("Keywords:")

    return {
        "Title": "No title" if title is None else title.strip(),
        "Authors": "No author" if authors is None else authors,
        "Affiliations": "No affiliation" if affiliations is None else affiliations,
        "Keywords": "No keywords" if len(keyword_parts) < 2 else keyword_parts[1].strip(),
        # The citation text is cut at the first ";" and the leading part is kept as the date.
        "Date": "No date" if citation is None else citation.split(";")[0],
        "Journal": "No journal" if journal is None else journal.strip(),
        "URL": url,
    }


starttime = time.time()
for (searchword, numpages) in searchwordlist:
    records = []  # one dict per scraped paper for this search term
    for i in range(numpages):  # for each search-result page
        url_searchpage = _PUBMED_URL + "?term=" + quote(searchword) + "&page={}".format(i + 1)
        # The response is closed as soon as its HTML has been parsed.
        with get_html(url_searchpage) as response:
            soup = BeautifulSoup(response, 'html.parser')
        for title in soup.find_all("a", {"class": "docsum-title"}):  # for each paper on the page
            # title["data-article-id"] is the ID of the paper in PubMed
            url_paperpage = _PUBMED_URL + title["data-article-id"]
            with get_html(url_paperpage) as response:
                paper_soup = BeautifulSoup(response, 'html.parser')
            records.append(_parse_paper(paper_soup, url_paperpage))
        checktime = time.time()
        # Progress message; the elapsed time is measured from the start of the whole run.
        print("{} of {} pages scraped. {} seconds elapsed".format(i+1,numpages,round(checktime-starttime,1)))

    # Combine the rows into a dataframe; the extra, empty column only records
    # which search term produced this sheet.
    df_papers = pd.DataFrame(records, columns=_COLUMNS)
    df_papers["Searchword:{}".format(searchword)] = None
    # A timestamp makes each export file name unique; ":" is replaced by "-"
    # in the file name.
    df_papers.to_excel('results_{}.xlsx'.format(str(datetime.now()).replace(":","-")))
    print("Excel file exported.")