第5课-中国天气网爬虫案例
一、中国天气网爬虫案例
#中国天气网爬虫
import requests
from pyecharts.charts import Bar
from bs4 import BeautifulSoup
import copy
import html5lib
# Result list: the scraper appends one record dict per parsed city row.
datas = []

# Record template naming the fields of one entry; every field starts as None.
data = dict.fromkeys(("city", "day", "higher_temp", "lower_temp"))

# HTTP headers sent with each page request.
HEADERS = dict(
    [
        (
            "User-Agent",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/78.0.3904.108 Safari/537.36",
        ),
        ("Referer", "http://www.weather.com.cn/textFC/db.shtml"),
    ]
)
def weather_spider_dome(url):
    """Scrape one forecast page and append its city rows to ``datas``.

    The page is fetched with the module-level ``HEADERS``, decoded as
    UTF-8 and parsed with the ``html5lib`` parser.  For every element
    with class ``conMidtab`` the table rows are walked and each data row
    becomes a dict (built from the module-level ``data`` template) with
    the keys ``city``, ``day``, ``higher_temp`` and ``lower_temp``.

    Row layout expected by the parser (taken from the cell indices used):
      * row 0: third cell holds the date between parentheses;
      * row 1: skipped (no data is read from it);
      * row 2: has one extra leading cell (presumably the province /
        region name -- confirm against the live page), so the city is in
        cell 1 and the temperatures in cells 4 and 7;
      * rows 3+: city in cell 0, temperatures in cells 3 and 6.

    Args:
        url: Address of the page to scrape.  When it contains ``"gat"``
            only the tables inside the first ``conMidtab2`` element of
            each ``conMidtab`` are read; otherwise the rows of every
            ``conMidtab2`` element are read.

    Returns:
        None.  Records are appended to ``datas``; the last parsed date
        label is also stored in the module-level ``cur_day``.

    Raises:
        requests.exceptions.RequestException: if the download fails or
            exceeds the timeout.
        AttributeError, IndexError: if a row does not have the cell
            layout described above.
    """
    # Kept for backward compatibility: the parsed date has always been
    # published as a module-level name.
    global cur_day

    # Without a timeout a stalled connection would block the crawl forever.
    response = requests.get(url=url, headers=HEADERS, timeout=10)
    html = response.content.decode("utf-8")
    soup = BeautifulSoup(html, "html5lib")

    # str.find() returns the *integer* -1 on a miss; the previous check
    # compared it with the string "-1", which is always unequal, so every
    # page was treated as the "gat" page and the general branch was dead.
    is_gat_page = "gat" in url

    for conMidtab in soup.find_all(attrs={"class": "conMidtab"}):
        if is_gat_page:
            first_block = conMidtab.find(attrs={"class": "conMidtab2"})
            if first_block is None:
                # Nothing to read in this section; avoid calling
                # find_all on None.
                continue
            row_groups = [
                table.find_all("tr") for table in first_block.find_all("table")
            ]
        else:
            row_groups = [
                block.find_all("tr")
                for block in conMidtab.find_all(attrs={"class": "conMidtab2"})
            ]

        for trs in row_groups:
            for i, tr in enumerate(trs):
                tds = tr.find_all("td")
                if i == 0:
                    # The date is the text between "(" and ")".
                    header = tds[2].string
                    cur_day = header[header.find("(") + 1:header.find(")")]
                elif i >= 2:
                    # Row 2 carries one extra leading cell, shifting every
                    # column of interest one position to the right.
                    shift = 1 if i == 2 else 0
                    # Build a fresh record from the template instead of
                    # mutating the shared module-level dict.
                    datas.append(dict(
                        data,
                        city=list(tds[shift].stripped_strings)[0],
                        day=cur_day,
                        higher_temp=tds[3 + shift].string,
                        lower_temp=tds[6 + shift].string,
                    ))
if __name__ == "__main__":
    # File-name stems of the pages to crawl, in crawl order.
    page_stems = ("hb", "db", "hd", "hz", "hn", "xb", "xn", "gat")
    urls = [
        "http://www.weather.com.cn/textFC/{}.shtml".format(stem)
        for stem in page_stems
    ]

    # Scrape every page; each call appends its records to ``datas``.
    for url in urls:
        weather_spider_dome(url)

    # Dump all collected records, one per line.
    for i in datas:
        print(i)
# cities = []
# temp = []
# for i in datas:
# if i["day"] == "12月11日":
# cities.append(i["city"])
# cities.append(i["city"])
# temp.append(i["higher_temp"])
# temp.append(i["lower_temp"])
# print(cities)
# print(temp)
# bar = Bar()
#
#
# bar.add_xaxis(cities)
# bar.add_yaxis("12月11日", temp)
# bar.render("weather.html")

浙公网安备 33010602011771号