# -*- coding: utf-8 -*-
"""
Created on Thu Oct 18 09:13:32 2018

@author: Gawen

Scrapes the abstract of every paper listed on a target IEEE results page,
translates each abstract with the Baidu Translate API,
and saves the link together with the translated abstract to a text file.
You must apply for your own Baidu API appid and secretkey;
the free tier covers 2 million characters per month.
Python version: 3.6
Required packages: requests, beautifulsoup4, selenium
Required software: chromedriver
"""
# example url: https://ieeexplore.ieee.org/xpl/mostRecentIssue.jsp?punumber=8360187&punumber=8360187&filter=issueId%20EQ%20%228363090%22&pageNumber=9&pageNumber=10
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import time
from selenium.webdriver.chrome.options import Options
import hashlib
import urllib.parse
import random
import json


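# Append one record (link plus translated abstract) to the output text file.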
def writetxt(file, url, abstract):
    with open(file, 'a', encoding='GBK') as file_txt:
        file_txt.write('链接:\n' + url)
        file_txt.write('\n')
        file_txt.write('摘要:\n' + abstract)
        file_txt.write('\n')
        file_txt.write('\n')


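# Translate an English string to Chinese through the Baidu Translate HTTP API.
# The request is signed with MD5(appid + query + salt + secretkey), as the API requires.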
def trans(q):
    appid = ''  # your own Baidu Translate appid
    secretkey = ''  # your own Baidu Translate secretkey
    myurl = '/api/trans/vip/translate'
    fromLang = 'en'
    toLang = 'zh'
    salt = random.randint(32768, 65536)
    sign = appid + q + str(salt) + secretkey
    sign = hashlib.md5(sign.encode(encoding='utf-8')).hexdigest()
    myurl = myurl + '?appid=' + appid + '&q=' + urllib.parse.quote(q) + '&from=' + fromLang + '&to=' + toLang + '&salt=' + str(salt) + '&sign=' + sign
    print(myurl)
    try:
        r = requests.get('http://api.fanyi.baidu.com' + myurl)
        print(r.content.decode('utf-8'))
    except Exception as e:
        print(e)
        return 'error'
    # The API returns JSON, so parse it directly.
    text_dict = json.loads(r.content.decode('utf-8'))
    if 'error_code' in text_dict:
        return 'error'
    return text_dict['trans_result'][0]['dst']


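# Fetch the IEEE listing page and collect the "View HTML" link for every paper title on it.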
url = input('please input the url that you want to download:\n')
fore = 'https://ieeexplore.ieee.org'
r = requests.get(url)
html = r.content.decode('utf-8')
soup = BeautifulSoup(html, 'lxml')
h3 = soup.find('div', class_='cf jrnl-results-filter').find_all('h3')
h3text = []
errtitle = []
links = []
for h in h3:
    h3text.append(h.text.strip())
print(h3text)
for i in range(len(h3text)):
    if soup.find('a', attrs={'aria-label': 'View HTML: ' + h3text[i]}) is None:
        errtitle.append(h3text[i])
        continue
    href = soup.find('a', attrs={'aria-label': 'View HTML: ' + h3text[i]})['href']
    links.append(fore + href)
print(links)
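# IEEE article pages are rendered client-side, so a headless Chrome session is used to load them.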
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
driver = webdriver.Chrome(chrome_options=chrome_options)
count = 0
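# Visit each article page, extract the abstract, translate it, and append it to the output file.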
for link in links:
    driver.get(link)
    driver.implicitly_wait(20)
    ps = driver.page_source
    lsoup = BeautifulSoup(ps, 'lxml')
    nodes = lsoup.select('body > div > div > div > div > div > div > xpl-root > xpl-document-details > div > div > div > div > section > div > div > xpl-document-abstract > section > div > div > div > div > div')
    if not nodes:
        # abstract not found in the rendered page
        errtitle.append(link)
        continue
    abstract = nodes[0].text
    abstract = trans(abstract)
    if abstract == 'error':
        errtitle.append(link)
        continue
    writetxt(r'C:\Users\Gawen\Desktop\abstract.txt', link, abstract)  # output path
    count += 1
    print(count)
    time.sleep(5)
driver.quit()
print("共有" + str(len(errtitle)) + "篇论文下载失败")
for err in errtitle:
    print(err)