团队冲刺第一天
今天已经完成新闻的数据爬取,目前爬取的新闻网站有腾讯、新浪网。
爬取新浪网:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import json
import pymysql
def _first_text(soup, selector):
    """Return the text of the first element matching *selector*, or None."""
    matches = soup.select(selector)
    return matches[0].text if matches else None


def _strip_editor_label(text):
    """Remove a leading '责任编辑' label (and its colon) from *text*.

    ``str.strip(chars)`` treats *chars* as a set of characters to trim from
    both ends, so using it with the label would also eat characters that
    belong to the editor's name. The label is removed as a prefix instead.
    """
    label = '责任编辑'
    text = text.strip()
    if text.startswith(label):
        # Accept both the ASCII colon and the full-width colon (U+FF1A).
        text = text[len(label):].lstrip(':\uff1a')
    return text.strip()


def getnewsdetail(newsurl, timeout=10):
    """Download one news page and extract its main fields.

    Args:
        newsurl: URL of the article page.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection cannot block the crawl indefinitely.

    Returns:
        A list ``[title, timesource, place, editor, articleall]``. Any field
        whose element is missing from the page is set to the placeholder
        string "异常爬取".
    """
    missing = "异常爬取"

    res = requests.get(newsurl, timeout=timeout)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')

    title = _first_text(soup, '.main-title')
    if title is None:
        title = missing

    # Publication time is returned exactly as it appears on the page.
    # The previous version parsed it with strptime and threw the result
    # away; that parse could only raise on an unexpected format, so it
    # has been dropped.
    timesource = _first_text(soup, '.date-source span')
    if timesource is None:
        timesource = missing

    # News source: prefer the link, then the alternative span layout.
    place = _first_text(soup, '.date-source a')
    if place is None:
        place = _first_text(
            soup, '#top_bar > div > div.date-source > span.source')
    if place is None:
        place = missing

    # Every paragraph but the last is article text; the last paragraph is
    # read as the editor line.
    paragraphs = soup.select('#article p')
    articleall = ' '.join(p.text.strip() for p in paragraphs[:-1])
    if paragraphs:
        editor = _strip_editor_label(paragraphs[-1].text)
    else:
        editor = missing

    return [title, timesource, place, editor, articleall]
def _unwrap_jsonp(text):
    """Return the JSON payload found inside a ``callback(...);`` wrapper.

    The payload is taken from just after the first ``(`` up to the last
    ``);``, so the result does not depend on the length of the callback name
    or of any trailing ``}catch(e){};`` boilerplate.

    Raises:
        ValueError: if *text* does not contain such a wrapper.
    """
    start = text.find('(')
    end = text.rfind(');')
    if start == -1 or end <= start:
        raise ValueError('response is not a JSONP document')
    return text[start + 1:end]


def parseListLinks(url, timeout=10):
    """Fetch one page of the news feed and scrape every article it lists.

    Args:
        url: Feed URL returning a JSONP document whose payload has the
            article entries under ``result.data``, each with a ``url`` key.
        timeout: Seconds passed to ``requests.get`` for the feed request.

    Returns:
        A list with one ``getnewsdetail`` result per feed entry.
    """
    res = requests.get(url, timeout=timeout)
    jd = json.loads(_unwrap_jsonp(res.text))
    return [getnewsdetail(ent['url']) for ent in jd['result']['data']]
# Feed endpoint; the ``{}`` placeholder is filled with the page number.
url = 'https://feed.sina.com.cn/api/roll/get?pageid=121&lid=1356&num=20&versionNumber=1.2.4&page={}&encode=utf-8&callback=feedCardJsonpCallback&_=1619529999063'
news_total = []
# Pages to crawl.
# NOTE(review): the original comment said pages 0-130 are crawled and that
# nothing exists after page 130, yet this range covers 130-149 — confirm
# which range is intended.
for i in range(130, 150):
    newsurl = url.format(i)
    newsary = parseListLinks(newsurl)
    news_total.extend(newsary)
    print(i)  # progress: page just finished
tuplist = tuple(news_total)

# NOTE(review): database credentials are hard-coded here; consider loading
# them from configuration or the environment.
db = pymysql.connect(host="localhost", user="root", password="1229", database="lianxi", charset='utf8')
cursor = db.cursor()
sql_xilang = "INSERT INTO xilang values (%s,%s,%s,%s,%s)"
try:
    cursor.executemany(sql_xilang, tuplist)
    db.commit()
except Exception as exc:
    # Report the cause instead of hiding it, then undo the partial insert.
    # A bare ``except:`` would also intercept KeyboardInterrupt/SystemExit.
    print('执行失败,进入回调3', exc)
    db.rollback()
finally:
    # Close the connection even if commit or rollback itself fails.
    db.close()
新闻界面目前还在规划中,算法部分将由团队中的一名成员负责实现。
明天会使用 ViewPager 和 TabLayout 来实现界面的一些基本功能。

浙公网安备 33010602011771号