Python 爬取中国知网部分论文信息

爬取指定主题的论文，并以相关度排序。

 1 #!/usr/bin/python3
 2 # -*- coding: utf-8 -*- 
 3 import requests
 4 import linecache
 5 import random
 6 from bs4 import BeautifulSoup
 7 
 8 if __name__=="__main__":
 9     keywords='通信' ### 查询的主题 
10     n=0
11     target='http://search.cnki.net/search.aspx?q='+str(keywords)+'&rank=relevant&cluster=all&val=CJFDTOTAL&p={}'
12     user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
13     headers = {'User-Agent':user_agent}
14     for i in range(10):
15         i=i*15
16         target=target.format(i)
17         req=requests.get(url=target)
18         html=req.text
19         html=html.replace('<br>',' ').replace('<br/>',' ').replace('/>','>')
20         bf=BeautifulSoup(html,"html.parser")
21         texts=bf.find('div',class_='articles')
22         texts_div=texts.find_all('div',class_='wz_content')
23         for item in texts_div:
24             item_name=item.find('a').text
25             item_href=item.find('a')['href']
26             item_refer2=item.find('span',class_='count').text
27             print('{} {} {}\n'.format(item_name,item_href,item_refer2))
28     print(n)

 

posted @ 2019-05-23 17:57  会武术之白猫  阅读(4807)  评论(0)  编辑  收藏  举报