import requests
from re import *
from bs4 import BeautifulSoup
import pandas
import sqlite3
# Fetch the click count of one news article (served by a separate counter
# API), wrapped as a function.
def getclick(newurl):
    """Return the click count (int) for the news article at *newurl*.

    The article id is pulled out of the article URL and looked up against
    the site's counter API, which answers with a small JS snippet like
    "...html('1234');" from which the number is parsed.

    Raises ValueError when *newurl* does not look like an article URL.
    """
    # Article id is the last path component before ".html",
    # e.g. ".../xiaoyuanxinwen_0404/9183.html" -> "0404/9183" -> "9183".
    match = search(r'_(.*)\.html', newurl)
    if match is None:
        raise ValueError("cannot extract article id from url: " + newurl)
    num = match.group(1).split('/')[1]
    api = "http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80".format(num)
    # timeout so a stalled counter endpoint cannot hang the whole crawl
    text = requests.get(api, timeout=30).text
    # Strip the JS wrapper "...html('<count>');" down to the bare number.
    return int(text.split('.')[-1].lstrip("html('").rstrip("');"))
# Extract the details of a single news item — title, link, time, source,
# body text and click count — wrapped as a function.
def getcontent(s):
    """Scan the anchor tags in *s* for news-list entries and return a dict
    with keys title/time/content/url/click/show for a matching news item.

    Returns None when no entry in *s* looks like a news-list item (the
    original code raised UnboundLocalError in that case).

    NOTE(review): a single dict is returned even when the page holds many
    news anchors — indentation was lost in this source, so whether the
    original returned the first or the last match cannot be confirmed;
    this version keeps scanning and returns the last one.
    """
    new = None  # stays None when no news-list entry is found
    for i in s:
        # Only anchors carrying a ".news-list-title" child are news items.
        if len(i.select(".news-list-title")) > 0:
            new = {}
            new["title"] = i.select(".news-list-title")[0].text
            new["time"] = i.select(".news-list-info")[0].contents[0].text
            new["content"] = i.select(".news-list-info")[0].contents[1].text
            new["url"] = i.attrs['href']
            url = i.attrs['href']
            # Fetch the article page itself to get the body text.
            re1 = requests.get(url, timeout=30)
            re1.encoding = "utf-8"
            soup1 = BeautifulSoup(re1.text, 'html.parser', from_encoding="utf-8")
            new["click"] = getclick(url)
            # ".show-content" holds the article body; select() already
            # searches the whole document, so there is no need to walk
            # every <div> looking for it.
            body = soup1.select(".show-content")
            if body:
                new["show"] = body[0].text
    return new
# Crawl one news-list page given its URL.
def onepage(urlpage):
    """Fetch the news-list page at *urlpage* and return a one-element list
    holding the news dict produced by getcontent().

    The list wrapper is kept so existing callers that do
    ``listtotal.extend(onepage(...))`` keep working unchanged.
    """
    # Renamed locals: the original shadowed the builtin `list` and used
    # the name `re`, which collides with the re module / other globals.
    resp = requests.get(urlpage, timeout=30)
    resp.encoding = "utf-8"
    soup = BeautifulSoup(resp.text, 'html.parser', from_encoding="utf-8")
    anchors = soup.select("a")
    return [getcontent(anchors)]
# --- Main script: crawl the campus-news section page by page, then save ---

# First list page: scrape it and seed the result list.
# (Renamed: the original shadowed the builtin `list` and the name `re`.)
resp = requests.get("http://news.gzcc.cn/html/xiaoyuanxinwen/", timeout=30)
resp.encoding = "utf-8"
soup = BeautifulSoup(resp.text, 'html.parser', from_encoding="utf-8")
anchors = soup.select("a")
listtotal = []
listtotal.append(getcontent(anchors))

# Work out how many list pages exist: ".a1" holds the total item count,
# e.g. "978条" (10 items per page).
total_items = int(soup.select(".a1")[0].text.rstrip('条'))
pages = total_items // 10 + 1

# Crawl the remaining list pages (page 1 was handled above).
# BUG FIX: the computed page count is now actually used — the original
# threw it away and hard-coded range(2, 50).
for page in range(2, pages + 1):
    urlpage = "http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html".format(page)
    listtotal.extend(onepage(urlpage))

df = pandas.DataFrame(listtotal)
df.to_excel('gzccnews.xlsx')

# Persist to SQLite; if_exists='replace' so reruns overwrite the table
# instead of failing because it already exists.
with sqlite3.connect('gzccnewsdb.sqlite') as db:
    df.to_sql('gzccnewsdb', con=db, if_exists='replace')