Advertisement
Not a member of Pastebin yet?
Sign up — it unlocks many cool features!
- from bs4 import BeautifulSoup
- import codecs
- import requests
- import re
- import sys, signal, os
- import xlsxwriter
# Worksheet column indices, in left-to-right output order.
(JOURNAL_NAME, YEAR, VOLUME, AUTHOR,
 TITLE, KEYWORDS, ABSTRACT) = range(7)
# Output spreadsheet: one row per scraped article, created in the CWD.
workBook = xlsxwriter.Workbook('data.xlsx')
workSheet = workBook.add_worksheet()
def handler(signum, frame):
    """SIGINT handler: flush the workbook to disk, then exit.

    Without this, interrupting the scrape with Ctrl-C would leave
    data.xlsx unfinalized/corrupt.

    Fix: the parameter was named ``signal``, shadowing the ``signal``
    module inside the handler; renamed to the conventional ``signum``
    (handlers are invoked positionally, so callers are unaffected).
    """
    workBook.close()
    sys.exit(0)
# Save partial results on Ctrl-C instead of losing the workbook.
signal.signal(signal.SIGINT,handler)
# Bold cell format for the header row.
bold = workBook.add_format({'bold': True})
# Header row (column titles in Portuguese: journal, year, volume,
# author, title, keywords, abstract).
workSheet.write(0,JOURNAL_NAME,'Periódico',bold)
workSheet.write(0,YEAR,'Ano',bold)
workSheet.write(0,VOLUME,'Volume',bold)
workSheet.write(0,AUTHOR,'Autor',bold)
workSheet.write(0,TITLE,'Título',bold)
workSheet.write(0,KEYWORDS,'Palavras-chave',bold)
workSheet.write(0,ABSTRACT,'Resumo',bold)
# One journal base URL per line.
inFile = open('links.txt','r')
# Next worksheet row to write; row 0 is the header.
row = 1
# Browser-like User-Agent so the journal site doesn't block the scraper.
h = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36"}
def processArticle(link, journalName, year, volume):
    """Scrape one article landing page and append its metadata as a row.

    link        -- URL of the article's landing page
    journalName -- journal title (JOURNAL_NAME column)
    year        -- publication year string (YEAR column)
    volume      -- issue/volume label (VOLUME column)

    Side effects: network I/O via ``requests``, writes one row to the
    module-level ``workSheet`` and increments the module-level ``row``.

    Fixes: the original compared with ``!= None`` (PEP 8 says ``is not
    None``) and only guarded the keywords section — a page missing the
    author/title/abstract divs raised AttributeError. All four lookups
    now go through one guarded helper.
    """
    global row
    print('\t\t\t' + link)
    soup = BeautifulSoup(requests.get(link, headers=h).text, 'lxml')

    def _section_text(div_id, child, missing):
        # Text of <div id=div_id>'s first `child` tag, or `missing` when
        # the page lacks that section (guards against AttributeError).
        node = soup.find('div', attrs={'id': div_id})
        sub = getattr(node, child, None) if node is not None else None
        return sub.text if sub is not None else missing

    workSheet.write(row, JOURNAL_NAME, journalName)
    workSheet.write(row, YEAR, year)
    workSheet.write(row, VOLUME, volume)
    workSheet.write(row, AUTHOR,
                    _section_text('authorString', 'em', 'Nenhum autor encontrado'))
    workSheet.write(row, TITLE,
                    _section_text('articleTitle', 'h3', 'Nenhum título encontrado'))
    workSheet.write(row, KEYWORDS,
                    _section_text('articleSubject', 'div', 'Nenhuma palavra-chave encontrada'))
    workSheet.write(row, ABSTRACT,
                    _section_text('articleAbstract', 'div', 'Nenhum resumo encontrado'))
    row += 1
def processIssue(link, journalName, year):
    """Scrape an issue's table of contents and process every article in it.

    The issue page's <title> is passed along as the volume label; article
    links are the anchors whose href ends in /article/view/<id>.
    """
    print('\t\t' + link)
    page = requests.get(link, headers=h).text
    toc = BeautifulSoup(page, 'lxml')
    anchors = set(toc.find_all('a', href=re.compile('/article/view/[0-9]+$')))
    for anchor in anchors:
        processArticle(anchor['href'], journalName, year, toc.title.text)
def processPage(link, journalName):
    """Scrape one archive page, pairing each issue link with its year heading.

    BUG FIX: the original zipped a ``set`` of issue anchors (arbitrary
    iteration order) against the in-document-order <h3> year headings, so
    years could be attached to the wrong issues. Deduplicate by href while
    preserving document order instead, then zip.
    """
    print('\t' + link)
    soup = BeautifulSoup(requests.get(link, headers=h).text, 'lxml')
    # Issue links in document order, first occurrence of each href kept.
    seen = set()
    issue_anchors = []
    for anchor in soup.find_all('a', href=re.compile('/issue/view/[0-9]+$')):
        if anchor['href'] not in seen:
            seen.add(anchor['href'])
            issue_anchors.append(anchor)
    years = soup.find('div', attrs={'id': 'issues'}).find_all('h3')
    for anchor, year in zip(issue_anchors, years):
        processIssue(anchor['href'] + '/showToc', journalName, year.text)
# Main driver: each line of links.txt is a journal base URL. For every
# journal, scrape its /issue/archive listing plus any pagination pages.
for journalLink in inFile:
    # Idiomatic trailing-newline strip (was: re.sub('\n','',journalLink)).
    journalLink = journalLink.rstrip('\n')
    print(journalLink)
    soup = BeautifulSoup(requests.get(journalLink + '/issue/archive', headers=h).text, 'lxml')
    # The journal title lives in the archive page's <meta name="description">.
    journalName = soup.find('meta', attrs={'name': 'description'})['content']
    processPage(journalLink + '/issue/archive', journalName)
    # Follow pagination links (issuesPage=2, 3, ...) beyond the first page.
    for pageLink in set(soup.find_all('a', href=re.compile('issuesPage=[2-9][0-9]*#issues$'))):
        processPage(pageLink['href'], journalName)
inFile.close()  # fix: the input file handle was never closed
workBook.close()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement