数据结构化与保存
import re

import pandas
import requests
from bs4 import BeautifulSoup
def uslHtml(url):
    """Fetch *url*, decode the body as UTF-8 and return a parsed soup tree."""
    response = requests.get(url)
    response.encoding = "utf-8"
    return BeautifulSoup(response.text, "html.parser")
def page(url):
    """Scrape one news-list page and append every article body to ``f``.

    For each ``<li>`` under ``.news-list`` the linked article page is
    fetched, its ``#content`` element's text is written to the
    module-level file handle ``f`` and echoed to stdout.

    NOTE: relies on a global, already-open file object ``f``.
    """
    soup = uslHtml(url)
    items = soup.select(".news-list")[0].select("li")
    for item in items:
        link = item.select("a")[0].attrs["href"]
        # (A dead `re.search(...).group(0)` on the link was removed: its
        # result was never used and it raised AttributeError whenever the
        # pattern did not match.)
        detail = requests.get(link)
        detail.encoding = "utf-8"
        detail_soup = BeautifulSoup(detail.text, "html.parser")
        content = detail_soup.select("#content")[0].text
        f.write(content)
        print(content)
# --- Part 1: crawl article bodies and save them to SchoolNews.txt ---
so = uslHtml("http://news.gzcc.cn/html/xiaoyuanxinwen/")
# Total article count, e.g. "1234条" -> 1234 ...
n = int(so.select("#pages")[0].select(".a1")[0].text.strip("条"))
# ... then converted to a list-page count (10 articles per page).
# NOTE(review): n is computed but the loop below is hard-coded to page 2
# only; use range(2, n + 1) to crawl every page.
n = int(n / 10) + 1

# `with` guarantees the file is closed even if a request raises.
with open("SchoolNews.txt", "a+", encoding="utf-8") as f:
    page("http://news.gzcc.cn/html/xiaoyuanxinwen/")
    for i in range(2, 3):
        page("http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html".format(i))
2. Structured extraction: collect each news item's title and description into newsArr
def Url(newsUrl):
    """Fetch *newsUrl* and return it parsed as a BeautifulSoup tree."""
    # Renamed local: the original called this `re`, shadowing the re module.
    response = requests.get(newsUrl)
    response.encoding = "utf-8"
    return BeautifulSoup(response.text, "html.parser")


# Accumulates one {"title": ..., "description": ...} dict per scraped item.
newsArr = []


def pageNumber():
    """Return the number of news-list pages (10 items per page)."""
    soup = Url("http://news.gzcc.cn/html/xiaoyuanxinwen/")
    total = int(soup.select("#pages")[0].select(".a1")[0].text.rstrip("条"))
    return int((total / 10) + 1)


def site():
    """Scrape the last list page and the one after it into newsArr."""
    last = pageNumber()
    # NOTE(review): this visits exactly pages (last, last + 1); page
    # last + 1 likely does not exist — range(2, last + 1) was probably
    # intended. Original behavior preserved.
    for i in (last, last + 1):
        News(Url("http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html".format(i)))


def News(soup):
    """Append the title/description of each news entry in *soup* to newsArr."""
    for entry in soup.select(".news-list-text"):
        # Renamed local: the original called this `dict`, shadowing the builtin.
        item = {}
        item["title"] = entry.select(".news-list-title")[0].text
        item["description"] = entry.select(".news-list-description")[0].text
        # append == the original insert(len(newsArr), ...).
        newsArr.append(item)
    print(newsArr)


site()
3. Save the scraped records to Excel and run a few example pandas queries
# --- Part 3: persist the scraped records and demonstrate filtering ---
# NOTE(review): part 2 only fills "title"/"description" keys, so the
# 'click' and 'sources' columns selected below would raise KeyError unless
# newsArr is populated elsewhere with those fields — verify against Part 2.
df = pandas.DataFrame(newsArr)
df.to_excel("title.xlsx")

# First 6 rows of selected columns.
print(df[['click', 'title', 'sources']].head(6))
# Rows from 学校综合办 with more than 3000 clicks.
print(df[(df['click'] > 3000) & (df['sources'] == '学校综合办')])
# Rows whose source is one of the listed departments.
sou = ['国际学院', '学生工作处']
print(df[df['sources'].isin(sou)])
浙公网安备 33010602011771号