#!/usr/bin/env python
# coding: utf-8
import re
import time
from urllib import parse

import requests
from bs4 import BeautifulSoup

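# Browser-like headers attached to every request so the crawler looks like
# a regular Chrome session to the server; stats.gov.cn appears to serve
# anti-crawler pages to bare clients (see the charset retry in gethtml).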
header = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Connection": "keep-alive",
    "Host": "www.stats.gov.cn",
    "Referer": "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/index.html",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"
}
url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/index.html'
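
# Fetch one page and decode it as GBK. The site declares charset=gb2312 in
# the HTML but not in the HTTP headers, so the raw bytes are decoded
# explicitly rather than trusting requests' fallback encoding. A response
# missing the expected charset declaration usually signals a bad page, so
# it is fetched again until a real one comes back.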
def gethtml(url_str):
    try:
        html_s = req.get(url_str)
        if html_s.status_code == 200:
            # print('fetch succeeded, page length:', len(html_s.text))
            test_html = html_s.content.decode('gbk', errors='replace')
            while re.findall('charset=(.*?)"', test_html) != ['gb2312']:
                print('charset mismatch, refetching')
                html_s = req.get(url_str)
                test_html = html_s.content.decode('gbk', errors='replace')
            return test_html
        return ''
    except Exception as e:
        print('fetch error:', e)
        return ''
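
# Append one line to the output files: 'url' entries go to the crawl queue
# (D:\curl.txt), everything else to the data file (D:\city.txt).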
def writetxt(text, text_type):
    if text_type == 'url':
        file_str = r'D:\curl.txt'
    else:
        file_str = r'D:\city.txt'
    with open(file_str, 'a+', encoding='utf-8') as s_f:
        s_f.write(text + '\n')
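
# Level 1 (provinces): the index page lists provinces in rows with class
# 'provincetr'; each <a> carries the province code in its href and links
# to that province's city page.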
def province(url_str):
    if url_str != '':
        soup_html = re.findall(r"</td></tr>\r\n(.*)\r\n</table>", gethtml(url_str), re.S)
        soup_html = soup_html[0] if soup_html else ''
        soup = BeautifulSoup(soup_html, 'lxml')
        for soup_tr in soup.find_all('tr', class_='provincetr'):
            for soup_td in soup_tr.find_all(name='a'):
                soup_sid = soup_td['href'].split('.')[0]
                soup_txt = soup_td.get_text()
                soup_url = parse.urljoin(url_str, soup_td['href'])
                print('level_1', ['0', soup_sid, soup_txt, soup_url])
                writetxt(str(['level_1', '0', soup_sid, soup_txt, soup_url]), 'data')
                writetxt(soup_url, 'url')
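
# Levels 2-5: city, county, town and village pages share the same table
# layout and differ only in row class. The level is derived from the length
# of the numeric code in the current URL: a 2-digit parent code means the
# page lists cities (level 2), 4 digits counties (level 3), 6 digits towns
# (level 4), and 9 digits villages (level 5).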
def getcity(url_str):
    if url_str != '':
        soup_html = re.findall(r"</td></tr>\r\n(.*)\r\n</table>", gethtml(url_str), re.S)
        soup_html = soup_html[0] if soup_html else ''
        soup = BeautifulSoup(soup_html, 'lxml')
        Parent_url = re.findall(r"(\d+)\.html", url_str)
        Parent_url = Parent_url[0] if Parent_url else ''
        level = str(int(len(Parent_url) / 2 + 1))
        class_str = {'2': 'citytr', '3': 'countytr', '4': 'towntr', '5': 'villagetr'}
        for soup_tr in soup.find_all('tr', class_=class_str[level]):
            soup_sid = re.findall(r'\d+', soup_tr.get_text())
            soup_sid = soup_sid[0] if soup_sid else ''
            soup_txt = re.findall(r'\D+', soup_tr.get_text())
            soup_txt = soup_txt[0] if soup_txt else ''
            soup_url = re.findall('href="(.*?)">', str(soup_tr))
            soup_url = parse.urljoin(url_str, soup_url[0]) if soup_url else ''
            print('level_' + level, [Parent_url, soup_sid, soup_txt, soup_url])
            writetxt(str(['level_' + level, Parent_url, soup_sid, soup_txt, soup_url]), 'data')
            writetxt(soup_url, 'url')
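
# D:\curl.txt doubles as a FIFO work queue on disk: geturl() peeks at the
# first line and updateurl() removes it once it has been processed.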
def updateurl():
    file_str = r'D:\curl.txt'
    with open(file_str, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    with open(file_str, 'w+', encoding='utf-8') as f_w:
        if lines:
            lines[0] = ''
            f_w.writelines(lines)
def geturl():
    file_str = r'D:\curl.txt'
    lines_str = ''
    with open(file_str, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        if lines:
            lines_str = lines[0].strip()
    return lines_str

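# Seed the queue with the province pages, then drain it: every URL taken
# off the front of curl.txt may append child URLs to the back, so the crawl
# walks the hierarchy breadth-first. The loop has no exit condition; stop
# the script manually once curl.txt stays empty.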
req = requests.Session()
req.headers = header
province(url)

while True:
    try:
        current_url = geturl()
        getcity(current_url)
        updateurl()
    except Exception as e:
        print(e)
        time.sleep(1)