Scraper
Finnit
Jun 8th, 2017
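
A Python scraper that walks the issue archive of a list of journal sites (the URL patterns and element ids look like Open Journal Systems), visits every article it finds, and writes journal name, year, volume, author, title, keywords, and abstract to data.xlsx. The journal base URLs are read from links.txt; a sample input file is sketched after the code.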
from bs4 import BeautifulSoup
import codecs
import requests
import re
import sys, signal, os
import xlsxwriter

# Column indices for the output spreadsheet
JOURNAL_NAME    = 0
YEAR            = 1
VOLUME          = 2
AUTHOR          = 3
TITLE           = 4
KEYWORDS        = 5
ABSTRACT        = 6

workBook = xlsxwriter.Workbook('data.xlsx')
workSheet = workBook.add_worksheet()

# Close the workbook cleanly on Ctrl+C so the rows scraped so far are not lost
def handler(sig, frame):
    workBook.close()
    sys.exit(0)

signal.signal(signal.SIGINT, handler)

bold = workBook.add_format({'bold': True})

# Header row (the column labels are in Portuguese: journal, year, volume,
# author, title, keywords, abstract)
workSheet.write(0, JOURNAL_NAME, 'Periódico', bold)
workSheet.write(0, YEAR, 'Ano', bold)
workSheet.write(0, VOLUME, 'Volume', bold)
workSheet.write(0, AUTHOR, 'Autor', bold)
workSheet.write(0, TITLE, 'Título', bold)
workSheet.write(0, KEYWORDS, 'Palavras-chave', bold)
workSheet.write(0, ABSTRACT, 'Resumo', bold)

# One journal base URL per line is expected in links.txt
inFile = open('links.txt', 'r')

# Next empty spreadsheet row (row 0 holds the header)
row = 1

# Send a browser-like User-Agent so the journal sites serve the normal pages
h = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36"}

def processArticle(link, journalName, year, volume):
    """Scrape one article page and write a single row to the spreadsheet."""
    global row
    print('\t\t\t' + link)
    soup = BeautifulSoup(requests.get(link, headers=h).text, 'lxml')
    workSheet.write(row, JOURNAL_NAME, journalName)
    workSheet.write(row, YEAR, year)
    workSheet.write(row, VOLUME, volume)
    workSheet.write(row, AUTHOR, soup.find('div', attrs={'id': 'authorString'}).em.text)
    workSheet.write(row, TITLE, soup.find('div', attrs={'id': 'articleTitle'}).h3.text)
    keywords = soup.find('div', attrs={'id': 'articleSubject'})
    workSheet.write(row, KEYWORDS, keywords.div.text if keywords is not None else 'Nenhuma palavra-chave encontrada')
    workSheet.write(row, ABSTRACT, soup.find('div', attrs={'id': 'articleAbstract'}).div.text)
    row += 1

def processIssue(link, journalName, year):
    """Scrape an issue's table of contents and process every article in it."""
    print('\t\t' + link)
    soup = BeautifulSoup(requests.get(link, headers=h).text, 'lxml')
    # set() drops duplicate links; the issue page title doubles as the volume label
    for articleLink in set(soup.find_all('a', href=re.compile('/article/view/[0-9]+$'))):
        processArticle(articleLink['href'], journalName, year, soup.title.text)

def processPage(link, journalName):
    print('\t' + link)
    soup = BeautifulSoup(requests.get(link, headers=h).text, 'lxml')
    # Collect the issue links in document order, skipping duplicates: a plain set()
    # is unordered and would break the pairing with the year headings in #issues
    issueHrefs = []
    for a in soup.find_all('a', href=re.compile('/issue/view/[0-9]+$')):
        if a['href'] not in issueHrefs:
            issueHrefs.append(a['href'])
    for issueHref, year in zip(issueHrefs, soup.find('div', attrs={'id': 'issues'}).find_all('h3')):
        processIssue(issueHref + '/showToc', journalName, year.text)

for journalLink in inFile:
    journalLink = re.sub('\n', '', journalLink)
    print(journalLink)
    soup = BeautifulSoup(requests.get(journalLink + '/issue/archive', headers=h).text, 'lxml')
    journalName = soup.find('meta', attrs={'name': 'description'})['content']
    processPage(journalLink + '/issue/archive', journalName)
    # The archive may be paginated; follow the links to pages 2 and onwards
    for pageLink in set(soup.find_all('a', href=re.compile('issuesPage=[2-9][0-9]*#issues$'))):
        processPage(pageLink['href'], journalName)

workBook.close()
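
The script expects links.txt next to it, with one journal base URL per line ('/issue/archive' is appended to it directly, so no trailing slash). A minimal sketch with hypothetical URLs:

https://example.org/ojs/index.php/revista-a
https://example.org/ojs/index.php/revista-b

Each run rewrites data.xlsx from scratch; pressing Ctrl+C triggers the SIGINT handler, which closes the workbook so the rows scraped up to that point are still saved.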