第二次作业
作业①
1)、爬取与存储天气预报数据实验
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request
import sqlite3
class WeatherDB:
    """Thin sqlite3 wrapper that stores weather forecasts keyed by (city, date)."""

    def openDB(self):
        """Open (or create) weathers.db and make sure the table exists and is empty.

        The original code issued a plain CREATE TABLE and relied on a bare
        ``except:`` to detect "table already exists", then cleared it.  Using
        ``create table if not exists`` plus an explicit DELETE reaches the same
        end state (empty table) without swallowing unrelated errors.
        """
        self.con = sqlite3.connect("weathers.db")
        self.cursor = self.con.cursor()
        self.cursor.execute(
            "create table if not exists weathers ("
            "wNum varchar(16),wCity varchar(16),wDate varchar(16),"
            "wWeather varchar(64),wTemp varchar(32),"
            "constraint pk_weather primary key (wCity,wDate))"
        )
        # Start every crawl from a clean table, as the original delete did.
        self.cursor.execute("delete from weathers")

    def closeDB(self):
        """Commit all pending inserts and close the connection."""
        self.con.commit()
        self.con.close()

    def insert(self, num, city, date, weather, temp):
        """Insert one forecast row.

        A row violating the (wCity, wDate) primary key is reported to stdout
        rather than raised, so a duplicate does not abort the whole crawl.
        """
        try:
            self.cursor.execute(
                "insert into weathers (wNum,wCity,wDate,wWeather,wTemp) "
                "values(?,?,?,?,?)",
                (num, city, date, weather, temp),
            )
        except sqlite3.Error as err:
            print(err)

    def show(self):
        """Pretty-print every stored row to stdout."""
        self.cursor.execute("select * from weathers")
        rows = self.cursor.fetchall()
        fmt = "%-16s%-16s%-16s%-32s%-16s"
        print(fmt % ("序号", "地区", "日期", "天气信息", "温度"))
        for row in rows:
            print(fmt % (row[0], row[1], row[2], row[3], row[4]))
class WeatherForecast:
    """Crawl 7-day forecasts from www.weather.com.cn and store them via WeatherDB."""

    def __init__(self):
        # Browser-like User-Agent so the site serves the regular page.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"
        }
        # City name -> weather.com.cn city code embedded in the page URL.
        self.cityCode = {"北京": "101010100", "上海": "101020100", "广州": "101280101", "深圳": "101280601"}

    def forecastCity(self, city):
        """Fetch, print and store the forecast for one city.

        Requires ``self.db`` (a WeatherDB) to already be set, as process() does.
        Unknown cities are reported and skipped.
        """
        if city not in self.cityCode:
            print(city + " code cannot be found")
            return
        url = "http://www.weather.com.cn/weather/" + self.cityCode[city] + ".shtml"
        try:
            req = urllib.request.Request(url, headers=self.headers)
            raw = urllib.request.urlopen(req).read()
            # The site serves utf-8 or gbk depending on mirror; let
            # UnicodeDammit guess the encoding.
            dammit = UnicodeDammit(raw, ["utf-8", "gbk"])
            soup = BeautifulSoup(dammit.unicode_markup, "lxml")
            lis = soup.select("ul[class='t clearfix'] li")
            num = 0
            for li in lis:
                try:
                    num = num + 1
                    # The original reused the name `data` for both the raw
                    # page bytes and this date string; a distinct name avoids
                    # the shadowing.
                    date = li.select('h1')[0].text
                    weather = li.select('p[class="wea"]')[0].text
                    # NOTE(review): the <span> (daytime high) disappears from
                    # the page in the evening, which raises IndexError here
                    # and is reported by the handler below.
                    temp = li.select('p[class="tem"] span')[0].text + "/" + li.select('p[class="tem"] i')[0].text
                    print(num, city, date, weather, temp)
                    self.db.insert(num, city, date, weather, temp)
                except Exception as err:
                    # One malformed <li> should not abort the whole city.
                    print(err)
        except Exception as err:
            print(err)

    def process(self, cities):
        """Crawl every city in ``cities``, then dump the table and close the DB."""
        self.db = WeatherDB()
        self.db.openDB()
        for city in cities:
            self.forecastCity(city)
        self.db.show()
        self.db.closeDB()
# Crawl the four configured cities and report completion.
cities = ["北京", "上海", "广州", "深圳"]
WeatherForecast().process(cities)
print("completed")

2)、心得体会
本次作业是要求爬取网站上的天气预报信息,主要是模仿课本上的例题,对书上的代码进行复现并理解。本次作业的代码中对于数据库的存储的相关操作是之前没有碰到过的,通过此次实验也学习到了相关的知识。有趣的是,在第一次完成了这次的代码的时候,已经都调试完成没有问题了,但是过了几天在晚上的时候进行调试代码时,出现了“list index out of range”的错误提示,获取不到当天的天气信息,我重新去到网站查看网站的源代码时发现当天的温度标签下的span标签消失了,因此获取不到span标签的内容,才出现了错误信息。因为爬取的是一直在动态变化的网站,网站的源代码也是会有稍微的变化的,看来有时候想要代码不出错误信息也是需要一点运气。。。=。=
作业②
1)、爬取股票相关信息实验
import json
import urllib.request
# Crawl all pages of eastmoney.com's Shanghai A-share quote API and print
# code/name/price/etc. for every stock.  The data is not in the page HTML;
# it is served as JSONP from the push2 endpoint, so we request that directly.
# Browser-like headers (including session cookies) so the API answers.
headers = {
'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language':'zh-CN,zh;q=0.9',
'Connection': 'keep-alive',
'Cookie':'qgqp_b_id=b12cd1da193f892cee63c6eb376e704e; cowCookie=true; intellpositionL=1215.35px; '
'intellpositionT=4102.2px; waptgshowtime=2020101; st_si=12487757093798; st_asi=delete; em_hq_fls=js; '
'st_pvi=38642806579130; st_sp=2020-09-30%2009%3A43%3A24; st_inirUrl=https%3A%2F%2Fwww.eastmoney.com%2F; '
'st_sn=16; st_psi=20201001161438311-113200301321-0460904339',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/81.0.4044.138 Safari/537.36',
'Referer': 'http://quote.eastmoney.com/center/gridlist.html',
'Host': '66.push2.eastmoney.com'
}

num = 0  # running row number across all pages
for page in range(1, 92):  # the board has 91 pages of 20 stocks each
    url = ("http://66.push2.eastmoney.com/api/qt/clist/get?cb=jQuery1124040761773020239334_1601540078171&pn="
           + str(page)
           + "&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3&fs=m:1+t:2,"
             "m:1+t:23&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,"
             "f11,f62,f128,f136,f115,f152 ")
    req = urllib.request.Request(url, headers=headers)
    text = urllib.request.urlopen(req).read().decode('utf-8')
    # The response is JSONP: jQuery...( <json> );  Strip the callback wrapper
    # by locating the outermost parentheses instead of slicing a hard-coded
    # 43-character prefix, which breaks whenever the callback name length
    # changes.
    payload = text[text.find('(') + 1: text.rfind(')')]
    responseJson = json.loads(payload)
    stocks = responseJson.get('data').get('diff')  # list of per-stock dicts
    if page == 1:
        # Header label fixed: f4 is 涨跌额 (the original printed 跌涨额).
        print("{:^2}{:^10}{:^10}{:^10}{:^10}{:^10}{:^10}{:^10}{:>10}".format("序号", "代码", "名称", "最新价", "涨跌幅", "涨跌额", "成交量", "成交额", "涨幅"))
    for stock in stocks:
        num = num + 1
        code = stock.get('f12')
        name = stock.get('f14')
        new_price = stock.get('f2')
        price_limit = stock.get('f3')
        change_amount = stock.get('f4')
        turnover = stock.get('f5')
        volume = stock.get('f6')
        rise = stock.get('f7')
        print("{:^2}{:>10}{:>10}{:>10}{:>10}{:>12}{:>13}{:>15}{:>12}".format(num, code, name, new_price, price_limit, change_amount, turnover, volume, rise))

2)、心得体会
本次作业与之前爬取的数据的形式有些不同,一开始我尝试用之前的方式直接读取网站的数据,但读了个空= =。带着疑问在网上搜索了资料后,才知道原来这个网站的数据不是直接存在源代码里,而是通过向服务器发送json请求来获取加载数据,如果直接查看网页的源代码会发现我们是找不到我们想要的数据的,而是只有一个框架。这次作业爬取的网站也是跟以前爬取的类型不一样,一开始我也是遇到了些困难,但经过查找资料以后也是收获了许多,我觉得主要有这三点收获:
一是知道了网站不仅仅可以将数据直接写在源代码里,还能通过向服务器发送json请求来获取数据,如果是在源代码里的数据我们可以直接就爬取下来,但如果是通过json请求获取的数据,我们则需要抓取json请求来解析获得数据。
二是学会了如何抓取json,一开始找json包时也是找了好大一会儿,当时是在打开了网页加载完以后才打开F12找json包,结果当然是找不到=.=,原来是得先打开F12再加载网站才能获取得到。
三是学会如何处理json格式的数据,可以通过python的json库函数来对其进行处理,会方便许多,就是得注意传入处理函数的数据格式应该是很严格的json的字符串,如果有其他杂七杂八的东西,就会一直报错。一开始我也是没有把前面的jQuery.....一大串东西处理掉导致报错,也是处理了好一会儿。
作业③
1)、根据自选3位数+学号后3位选取股票,获取并打印该股票信息实验
import json
import urllib.request
# 爬取的是东方财富网的上证A股栏目的所有股票信息,网址:http://quote.eastmoney.com/center/gridlist.html#sh_a_board
# 选取数字600+学号尾数202,打印代码为改数字的股票
# Crawl the Shanghai A-share board of eastmoney.com and print only the stock
# whose code is 600202 (chosen as the digits 600 + student-id suffix 202).
# Board page: http://quote.eastmoney.com/center/gridlist.html#sh_a_board
headers = {
'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language':'zh-CN,zh;q=0.9',
'Connection': 'keep-alive',
'Cookie':'qgqp_b_id=b12cd1da193f892cee63c6eb376e704e; cowCookie=true; intellpositionL=1215.35px; '
'intellpositionT=4102.2px; waptgshowtime=2020101; st_si=12487757093798; st_asi=delete; em_hq_fls=js; '
'st_pvi=38642806579130; st_sp=2020-09-30%2009%3A43%3A24; st_inirUrl=https%3A%2F%2Fwww.eastmoney.com%2F; '
'st_sn=16; st_psi=20201001161438311-113200301321-0460904339',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/81.0.4044.138 Safari/537.36',
'Referer': 'http://quote.eastmoney.com/center/gridlist.html',
'Host': '66.push2.eastmoney.com'
}

found = False
for page in range(1, 92):  # 91 pages on the board
    url = ("http://66.push2.eastmoney.com/api/qt/clist/get?cb=jQuery1124040761773020239334_1601540078171&pn="
           + str(page)
           + "&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3&fs=m:1+t:2,"
             "m:1+t:23&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,"
             "f11,f62,f128,f136,f115,f152 ")
    req = urllib.request.Request(url, headers=headers)
    text = urllib.request.urlopen(req).read().decode('utf-8')
    # Strip the JSONP callback wrapper (jQuery...(...);) by locating the
    # outer parentheses rather than slicing a hard-coded prefix length.
    payload = text[text.find('(') + 1: text.rfind(')')]
    responseJson = json.loads(payload)
    stocks = responseJson.get('data').get('diff')
    if page == 1:
        print("{:^10}{:^10}{:^10}{:^10}{:^10}{:^10}{:^10}".format("代码", "名称", "今开", "最高", "涨停", "换手", "成交量"))
    for stock in stocks:
        code = stock.get('f12')
        if code != "600202":
            continue
        name = stock.get('f14')
        # `open` would shadow the builtin open(); use a descriptive name.
        open_price = stock.get('f17')
        highest = stock.get('f15')
        # The board has no limit-up column; estimate it as previous close
        # (f18) * 1.1 per the exchange's 10% daily price-limit rule.
        stop = stock.get('f18') * 1.1
        change = stock.get('f8')
        turnover = stock.get('f5')
        print("{:>10}{:>8}{:>10}{:>12}{:>12}{:>10}{:>12}".format(code, name, open_price, highest, stop, change, turnover))
        found = True
        break
    if found:
        # Stock codes are unique, so skip the remaining pages once printed.
        break

2)、心得体会
这一题我主要就是在上一题的基础上加了一个if语句,找到股票代码是我所选的号码的那支股票然后打印出来。数据处理和获取思路基本与上题一样。
浙公网安备 33010602011771号