# Data structuring and saving (数据结构化与保存)

import requests
from re import *
from bs4 import BeautifulSoup
import pandas
import sqlite3

# Fetch the click counter for one news article via the site's counter API.
def getclick(newurl):
    """Return the click count (int) for the article at *newurl*.

    The article id is the number after the last '/' in URLs shaped like
    ".../xiaoyuanxinwen_1009/8460.html".  The counter API answers with a
    jsonp-style payload ending in "...html('123');"; the digits inside the
    quotes are the count.

    Raises ValueError when *newurl* does not look like an article URL
    (the original code raised AttributeError on `.group` instead).
    """
    import re  # local import: module top only does `from re import *`
    # Fix: the dot in ".html" was unescaped ('_(.*).html'), so it matched
    # any character; use a raw, escaped pattern.
    match = re.search(r'_(.*)\.html', newurl)
    if match is None:
        raise ValueError("unrecognized news URL: {}".format(newurl))
    num = match.group(1).split('/')[1]
    url = "http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80".format(num)
    # Keep only the trailing "html('123');" fragment, then peel the wrapper.
    payload = requests.get(url).text.split('.')[-1]
    return int(payload.lstrip("html('").rstrip("');"))

# Extract one news item's title / link / time / source / body / click count.
def getcontent(s):
    """Scan the anchor tags *s* of a list page and build a news dict.

    s: iterable of <a> Tag elements (BeautifulSoup) from a news list page.

    Returns a dict with keys "title", "time", "content" (source line),
    "url", "click" and — when the article page has a ".show-content"
    div — "show" (the article body).  NOTE: every matching anchor
    overwrites the previous one, so only the LAST news item on the page
    is returned (preserved from the original implementation).

    Returns None when no anchor carries a ".news-list-title" element
    (the original raised UnboundLocalError in that case).
    """
    new = None  # fix: was unbound when no anchor matched
    for i in s:
        if len(i.select(".news-list-title")) > 0:
            new = {}
            new["title"] = i.select(".news-list-title")[0].text
            info = i.select(".news-list-info")[0]
            new["time"] = info.contents[0].text
            new["content"] = info.contents[1].text
            url = i.attrs['href']
            new["url"] = url
            # Fetch the article page itself for the body text.
            re1 = requests.get(url)
            re1.encoding = "utf-8"
            soup1 = BeautifulSoup(re1.text, 'html.parser', from_encoding="utf-8")
            new["click"] = getclick(url)
            for x in soup1.select("div"):
                if len(x.select(".show-content")) > 0:
                    new["show"] = x.select(".show-content")[0].text
                    break
    return new
        
# Crawl one news-list page URL and return the extracted item(s).
def onepage(urlpage):
    """Fetch *urlpage*, parse its anchors, and return [getcontent(anchors)].

    Returns a one-element list (getcontent yields a single dict per page)
    so the caller can `extend()` its running total.
    """
    # Renamed locals: the original shadowed the builtins `re` and `list`.
    resp = requests.get(urlpage)
    resp.encoding = "utf-8"
    soup = BeautifulSoup(resp.text, 'html.parser', from_encoding="utf-8")
    anchors = soup.select("a")
    return [getcontent(anchors)]


# ---- main crawl: first list page, then every remaining list page ----
# Renamed locals: the original shadowed the builtins `re` and `list`.
resp = requests.get("http://news.gzcc.cn/html/xiaoyuanxinwen/")
resp.encoding = "utf-8"
soup = BeautifulSoup(resp.text, 'html.parser', from_encoding="utf-8")

listtotal = []
listtotal.append(getcontent(soup.select("a")))  # page 1 is the index page

# The ".a1" element holds the total news count; 10 items per list page.
# (The original called `.rstrip('')`, which strips nothing — dropped.
# NOTE(review): if the element text carries a unit suffix such as '条',
# strip it here before int() — confirm against the live page.)
total_news = int(soup.select(".a1")[0].text)
pages = total_news // 10 + 1

# Fix: the computed page count was calculated and then ignored in favor
# of a hard-coded range(2, 50); use the real count.
for i in range(2, pages + 1):
    urlpage = "http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html".format(i)
    listtotal.extend(onepage(urlpage))

df = pandas.DataFrame(listtotal)
df.to_excel('gzccnews.xlsx')
# Persist the same frame into a local SQLite database.
with sqlite3.connect('gzccnewsdb.sqlite') as db:
    df.to_sql('gzccnewsdb', con=db)

# posted @ 2017-10-19 20:23  36-林秋雁  阅读(86)  评论(0)  [blog-footer residue from the original source page]