python之天气爬虫
代码已调试通过
# Import standard-library modules (random, re, time) and third-party packages (pandas, requests)
import random
import re
import time
import pandas as pd
import requests
# Request headers sent with every download so the request resembles one made
# by a desktop browser.
# Header names must be spelled exactly: the original 'Accept -Enconding' and
# 'conection' keys were not valid HTTP header names, so the intended
# Accept-Encoding / Connection headers were never actually sent.
headers = {
    'Accept': '*/*',
    'Accept-Encoding': 'gzip,deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0 (windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/63.0.3236.0 '
                  'Safari/537.36 '
}
# Build the list of monthly history URLs to download.
# Only 2012-2016 use the URL scheme below (year followed by the month without
# zero padding). For later years no URL is generated; a notice is printed once
# per year instead.
# NOTE(review): the original indentation was lost; this assumes the `break`
# belonged to the `if month < 10` branch, which makes the month loop for
# 2017+ stop on its first iteration. Confirm against the intended behaviour.
urls = []
for year in range(2012, 2019):
    if year > 2016:
        print("未获取天气数据")
        continue
    for month in range(1, 13):
        urls.append('http://tianqi.2345.com/t/wea_history/js/58362_%s%s.js' % (year, month))
# One DataFrame per downloaded month is collected here.
info = []


def _parse_month(js_text):
    """Extract the weather fields of one monthly payload into a DataFrame.

    Every field is pulled out of the raw text with a regular expression of
    the form ``name:'value'`` followed by either ``,`` or ``}``.

    Args:
        js_text: Raw response body (text) of one monthly history URL.

    Returns:
        A DataFrame with one column per field (city, ymd, high, low, tianqi,
        fengxiang, fengli, aqi, aqiInfo, aqiLevel, maxWendu, minWendu,
        avgbWendu, avgyWendu). Fields with fewer matches than the longest one
        are padded with missing values.
    """

    def grab(field, prefix='', end="',"):
        # `prefix` lets a caller anchor the field name; `end` is the text that
        # must follow the closing quote of the value.
        return re.findall("%s%s:'(.*?)%s" % (prefix, field, end), js_text)

    aqi = grab('aqi')
    if aqi:
        aqi_info = grab('aqiInfo')
        aqi_level = grab('aqiLevel', end="'}")
        # With air-quality data present, fengli / avgyWendu are followed by
        # further fields, i.e. by a comma.
        tail = "',"
    else:
        # Months without air-quality data (2012-2015 according to the original
        # author): no AQI values, and fengli / avgyWendu are then directly
        # followed by a closing brace.
        aqi = aqi_info = aqi_level = ''
        tail = "'}"

    columns = {
        'city': grab('city'),
        'ymd': grab('ymd'),
        # The leading comma keeps bWendu / yWendu from also matching inside
        # avgbWendu / avgyWendu.
        'high': grab('bWendu', prefix=','),
        'low': grab('yWendu', prefix=','),
        'tianqi': grab('tianqi'),
        'fengxiang': grab('fengxiang'),
        'fengli': grab('fengli', end=tail),
        'aqi': aqi,
        'aqiInfo': aqi_info,
        'aqiLevel': aqi_level,
        'maxWendu': grab('maxWendu'),
        # Fixed: the original reused the maxWendu pattern here, so the
        # minimum-temperature column duplicated the maximum.
        'minWendu': grab('minWendu'),
        'avgbWendu': grab('avgbWendu'),
        'avgyWendu': grab('avgyWendu', end=tail),
    }
    # orient='index' tolerates fields of different lengths; transposing then
    # turns each field into a column.
    return pd.DataFrame.from_dict(columns, orient='index').transpose()


for url in urls:
    # Download the payload as text. The timeout prevents a stalled connection
    # from blocking the whole run indefinitely.
    response = requests.get(url, headers=headers, timeout=30).text
    print(response)
    info.append(_parse_month(response))
    # Pause a random 3-6 seconds between requests (the original drew the
    # random number but discarded it and always slept exactly 3 seconds).
    time.sleep(random.randint(3, 6))
# Merge all per-month tables into a single table.
weather = pd.concat(info)
# Export to CSV; the file name carries the current local time as
# YYYYmmddHHMMSS. The stamp is stored under its own name so the `time`
# module is not overwritten by a string (strftime without a time argument
# formats the current local time).
timestamp = time.strftime('%Y%m%d%H%M%S')
weather.to_csv('weather_new' + timestamp + '.csv', index=False)
运行结果如下:


浙公网安备 33010602011771号