1 """中国天气网爬虫"""
2
3 import requests
4 from bs4 import BeautifulSoup
5
6 HEADERS = {
7 'User-Agent': 'Mozilla/5.0',
8 }
9
def parse_detail_page(url, is_html5lib):
    """Scrape the city/minimum-temperature data from one region's detail page."""

    response = requests.get(url, headers=HEADERS)
    text = response.content.decode('utf-8')
    # with open('weather.html', 'w', encoding='utf-8') as fp:
    #     fp.write(text)
    if not is_html5lib:
        soup = BeautifulSoup(text, 'lxml')
    else:
        soup = BeautifulSoup(text, 'html5lib')
    # Extract the data from the forecast tables below.
    conMidtab = soup.find_all('div', attrs={'class': 'conMidtab'})
    tables = conMidtab[0].find_all('table')
    for table in tables:
        # Skip the two header rows of each province table.
        trs = table.find_all('tr')[2:]
        for index, tr in enumerate(trs):
            tds = tr.find_all('td')
            city_td = tds[0]
            # In the first data row the first cell holds the province name,
            # so the city name is in the second cell.
            if index == 0:
                city_td = tds[1]
            city = list(city_td.stripped_strings)[0]
            temp_td = tds[-2]
            min_temp = list(temp_td.stripped_strings)[0]
            # Print each city together with its minimum temperature.
            print({'city': city, 'min_temp': min_temp})

    print("=" * 40)

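# Illustrative usage (a sketch, assuming the site's current text-forecast layout):
# parse the North China page with the default lxml parser.
#
#     parse_detail_page("http://www.weather.com.cn/textFC/hb.shtml", is_html5lib=False)
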
def get_detail_urls(url, base_url):
    """Collect the detail-page links for North China, Northeast China, East China,
    Central China, South China, Northwest, Southwest, and Hong Kong/Macao/Taiwan."""

    urllists = []  # List of detail-page URLs
    response = requests.get(url, headers=HEADERS)
    text = response.content.decode('utf-8')
    soup = BeautifulSoup(text, 'lxml')
    # Extract the region tab links.
    uls = soup.find_all('ul', class_='lq_contentboxTab2')
    alists = uls[0].find_all('a')
    for a in alists:
        newurl = base_url + a['href']
        urllists.append(newurl)

    return urllists

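# Illustrative usage (a sketch, assuming the region tab markup is unchanged):
# collect the region links starting from the North China page.
#
#     links = get_detail_urls("http://www.weather.com.cn/textFC/hb.shtml",
#                             "http://www.weather.com.cn")
#     # e.g. ['http://www.weather.com.cn/textFC/hb.shtml', ...,
#     #       'http://www.weather.com.cn/textFC/gat.shtml']
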
def spider():
    """Entry point: collect the region links and parse each detail page."""

    # Initial page to scrape
    src_url = "http://www.weather.com.cn/textFC/hb.shtml"
    base_url = "http://www.weather.com.cn"
    urllists = get_detail_urls(src_url, base_url)
    # print(urllists)
    is_html5lib = False  # Whether to parse the page with html5lib
    for index, urllist in enumerate(urllists):
        if index != len(urllists) - 1:
            parse_detail_page(urllist, is_html5lib)
        else:
            is_html5lib = True
            # The last page ("http://www.weather.com.cn/textFC/gat.shtml",
            # Hong Kong/Macao/Taiwan) must be parsed with html5lib,
            # otherwise the extracted data is wrong.
            parse_detail_page(urllist, is_html5lib)

if __name__ == '__main__':
    spider()