爬取校园新闻首页的新闻的详情,使用正则表达式,函数抽离

 1 import requests
 2 from bs4 import BeautifulSoup
 3 res=requests.get('http://news.gzcc.cn/html/xiaoyuanxinwen/')
 4 res.encoding= 'utf-8'
 5 soup =BeautifulSoup(res.text,'html.parser')
 6 import re
 7 
 8 def getclick(link):
 9     newId = re.search('\_(.*).html', link).group(1).split('/')[1]
10     click = requests.get('http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newId))
11     return click.text.split('.html')[-1].lstrip("('").rstrip("');")
12 
# Walk every <li> of the index page; entries without a '.news-list-title'
# element are skipped.
a = soup.select('li')

for news in a:
    if not news.select('.news-list-title'):
        continue

    # Summary fields taken from the index-page entry itself.
    title = news.select('.news-list-title')[0].text
    description = news.select('.news-list-description')[0].text
    link = news.a.attrs['href']

    # Fetch and parse the article's detail page.
    resd = requests.get(link)
    resd.encoding = 'utf-8'
    soupd = BeautifulSoup(resd.text, 'html.parser')

    content = soupd.select('.show-content')[0].text
    info = soupd.select('.show-info')[0].text

    # The info line is split on whitespace; the first two tokens are joined
    # back together as the date, the next three are taken positionally.
    divide = info.split()
    date = '{} {}'.format(divide[0], divide[1])
    author = divide[2]
    auditing = divide[3]
    source = divide[4]
    clickcount = getclick(link)

    print(
        '新闻链接:' + link,
        '新闻标题:' + title,
        '新闻描述:' + description,
        '新闻正文:' + content,
        date,
        author,
        auditing,
        source,
        '点击次数:' + clickcount,
        sep='\n',
    )
    # Only the first news entry is processed.
    break
43 
44 
from datetime import datetime

# Current local time, followed by a fixed timestamp string parsed back into
# a datetime object ('%f' consumes the trailing microseconds field).
now = datetime.now()
time = '2018-04-04 14:53:25 942204'
date = datetime.strptime(time, '%Y-%m-%d %H:%M:%S %f')

# One value per line: now, the type of the raw string, the parsed datetime,
# and the type of the parsed datetime.
print(now, type(time), date, type(date), sep='\n')

截图:

posted @ 2018-04-08 20:45  150颜杰文  阅读(136)  评论(0)  编辑  收藏  举报