# 爬虫大作业 (web crawler assignment)
import re
from datetime import datetime

import jieba
import requests
from bs4 import BeautifulSoup

# Pre-compiled timestamp pattern matching the '%Y-%m-%d %H:%M:%S' layout below.
_DATE_RE = re.compile(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}')


def getNewsDetail(newsurl):
    """Fetch one news page, print its metadata/body/keywords, and save the body.

    Parameters
    ----------
    newsurl : str
        URL of the news article page.

    Side effects: prints date, title, click count, author, body text and the
    keyword list; writes the body text to ``C:\python/pachong.txt``.
    """
    resd = requests.get(newsurl)
    resd.encoding = 'utf-8'
    soupd = BeautifulSoup(resd.text, 'html.parser')

    click = soupd.select('.like')[0].text.split(" ")[0]
    title = soupd.select('h1')[0].text
    info = soupd.select('.pdate')[0].text
    # BUG FIX: str.lstrip('发布时间:') strips a *character set*, not a prefix,
    # so it could also eat leading characters of the date. Pull the timestamp
    # out with a regex instead.
    m = _DATE_RE.search(info)
    dt = datetime.strptime(m.group(), '%Y-%m-%d %H:%M:%S')

    # NOTE(review): assumes the first <p> is "label author" separated by a
    # space — TODO confirm against the page markup.
    author = soupd.select('p')[0].text.split(" ")[1].strip('<p>')

    delcontent = soupd.select('p')[0].text
    maintext = soupd.select('.maintext')[0].text
    # BUG FIX: lstrip(delcontent) removed any leading chars *contained in*
    # delcontent, which can chew into the article body. Remove the exact
    # prefix instead.
    if maintext.startswith(delcontent):
        newscontent = maintext[len(delcontent):]
    else:
        newscontent = maintext

    keyWords = getKeyWords(newscontent)

    print(dt)
    print(title)
    print(click)
    print(author)
    print(newscontent)
    print(keyWords)

    # BUG FIX: use a context manager so the file handle is closed even if
    # write() raises. Path string kept byte-identical to the original.
    with open("C:\python/pachong.txt", 'w', encoding='utf8') as f:
        f.write(newscontent)


def getKeyWords(newscontent):
    """Return up to 20 most frequent Chinese words (length >= 2) in the text.

    Parameters
    ----------
    newscontent : str
        Raw article text; non-Chinese characters are discarded first.

    Returns
    -------
    list[str]
        Words ordered by descending frequency, at most 20 entries.
    """
    # Keep only CJK unified ideographs before segmenting.
    newscontent = ''.join(re.findall('[\u4e00-\u9fa5]', newscontent))
    # BUG FIX: jieba._lcut is a private alias; use the public jieba.lcut.
    wordSet = set(jieba.lcut(newscontent))
    # Count each distinct word, dropping single-character tokens up front
    # (the original built the dict then deleted short keys in a second pass).
    wordDict = {w: newscontent.count(w) for w in wordSet if len(w) >= 2}
    dictList = sorted(wordDict.items(), key=lambda item: item[1], reverse=True)
    # BUG FIX: slice instead of range(20) so texts with fewer than 20
    # distinct words no longer raise IndexError.
    return [word for word, _count in dictList[:20]]


if __name__ == "__main__":
    newsurl = "http://news.gdufe.edu.cn/11499"
    getNewsDetail(newsurl)