Python Web Scraper, Day 1, Improved Version [Study Notes]
In the evening I improved the version I wrote in the afternoon, and it can now fetch Sina news pages in batches. That said, it is still mostly borrowed work: someone else wrote it and I copied it over.
from bs4 import BeautifulSoup as bs
import requests
import csv
import json, re
import pandas
#csv_file = open("Newslist.csv","w",newline="",encoding="utf-8-sig")
#writer = csv.writer(csv_file)
#writer.writerow(["标题","时间","内容","来源","链接"])
news_total=[]
url = "https://feed.sina.com.cn/api/roll/get?pageid=121&lid=1356&num=20&versionNumber=1.2.4&page={}&encode=utf-8&callback"
commentsURL = "https://comment.sina.com.cn/page/info?version=1&format=json&channel=gn&newsid=comos-{}&group=undefined&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=3&t_size=3&h_size=3&thread=1&callback"
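# "url" is the paged roll-news list API; the page number is filled in via url.format(i) in the
# main loop below. "commentsURL" is the per-article comment-count API, filled in with the news
# id that getCommentCounts() extracts from each article URL.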
def parsListLinks(url):
    newsdetails = []                    # list collecting per-article details
    res = requests.get(url)             # fetch one page of the roll-news list
    jd = json.loads(res.text)           # parse the JSON response
    for ent in jd["result"]["data"]:    # iterate over the article entries
        newsdetails.append(getNewsDetail(ent["url"]))  # fetch and parse each article page
    return newsdetails
def getNewsDetail(newsurl):
    result = {}                                     # dict holding one article's fields
    res = requests.get(newsurl)                     # fetch the article page
    res.encoding = "utf-8"
    soup = bs(res.text, "html.parser")
    result["title"] = soup.select(".main-title")[0].text   # headline
    # body text: join every paragraph except the last one
    result["article"] = " ".join([p.text.strip() for p in soup.select(".article p")[:-1]])
    result["dt"] = soup.select(".date")[0].text     # publication time
    #print(result["dt"])
    #result["source"] = soup.select(".source")[0]["href"]  # source name and link
    result["comments"] = getCommentCounts(newsurl)  # comment count
    result["links"] = newsurl                       # article URL
    return result
def getCommentCounts(newsurl):
    m = re.search("doc-i(.*).shtml", newsurl)             # pull the news id out of the article URL
    newsid = m.group(1)
    comments = requests.get(commentsURL.format(newsid))   # query the comment API for this article
    jd = json.loads(comments.text)
    return jd["result"]["count"]["total"]                 # total comment count
for i in range(1, 8):                   # pages 1 through 7 of the roll-news list
    newsurl = url.format(i)
    newsary = parsListLinks(newsurl)
    news_total.extend(newsary)

df = pandas.DataFrame(news_total)
print(df.head())                        # preview the first few rows
df.to_excel("news.xlsx")                # save everything to an Excel file
#writer.writerow([title,date,article,source[0].text,source[0]["href"]])
#csv_file.close();
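To sanity-check the export, the spreadsheet can be read back with pandas. This is a minimal sketch; it assumes the script above has already written news.xlsx and that openpyxl is installed for Excel support.
import pandas

df = pandas.read_excel("news.xlsx")              # load the exported results
print(df.shape)                                  # how many articles and columns were saved
print(df[["title", "dt", "comments"]].head())    # spot-check a few fields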