Homework 2
Task 1
- Requirement
Scrape the 7-day weather forecasts for a given set of cities from the China Weather Network (http://www.weather.com.cn) and save them to a database.
- Code
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request
import sqlite3

class WeatherDB:
    # Wraps the SQLite database that stores the forecasts
    def openDB(self):
        self.con = sqlite3.connect("weathers.db")
        self.cursor = self.con.cursor()
        try:
            self.cursor.execute("create table weathers (wCity varchar(16),wDate varchar(16),wWeather varchar(64),wTemp varchar(32),constraint pk_weather primary key (wCity,wDate))")
        except:
            # The table already exists, so just clear the old records
            self.cursor.execute("delete from weathers")

    def closeDB(self):
        self.con.commit()
        self.con.close()

    def insert(self, city, date, weather, temp):
        try:
            self.cursor.execute("insert into weathers (wCity,wDate,wWeather,wTemp) values (?,?,?,?)",
                                (city, date, weather, temp))
        except Exception as err:
            print(err)

    def show(self):
        self.cursor.execute("select * from weathers")
        rows = self.cursor.fetchall()
        print("%-16s%-16s%-32s%-16s" % ("city", "date", "weather", "temp"))
        for row in rows:
            print("%-16s%-16s%-32s%-16s" % (row[0], row[1], row[2], row[3]))

class WeatherForecast:
    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"}
        self.cityCode = {"北京": "101010100", "上海": "101020100", "广州": "101280101", "深圳": "101280601"}

    def forecastCity(self, city):
        if city not in self.cityCode.keys():
            print(city + " code cannot be found")
            return
        url = "http://www.weather.com.cn/weather/" + self.cityCode[city] + ".shtml"
        try:
            req = urllib.request.Request(url, headers=self.headers)
            data = urllib.request.urlopen(req)
            data = data.read()
            # Guess the page encoding (utf-8 or gbk) before parsing
            dammit = UnicodeDammit(data, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            soup = BeautifulSoup(data, "lxml")
            lis = soup.select("ul[class='t clearfix'] li")
            for li in lis:
                try:
                    date = li.select('h1')[0].text
                    weather = li.select('p[class="wea"]')[0].text
                    temp = li.select('p[class="tem"] span')[0].text + "/" + li.select('p[class="tem"] i')[0].text
                    print(city, date, weather, temp)
                    self.db.insert(city, date, weather, temp)
                except Exception as err:
                    print(err)
        except Exception as err:
            print(err)

    def process(self, cities):
        self.db = WeatherDB()
        self.db.openDB()
        for city in cities:
            self.forecastCity(city)
        self.db.show()
        self.db.closeDB()

ws = WeatherForecast()
ws.process(["北京", "上海", "广州", "深圳"])
print("completed")
- Screenshot of the results

- Reflections
Reproducing the code from the textbook consolidated my grasp of BeautifulSoup and made me familiar with creating and using a database.
Task 2
- Requirement
Use the requests and BeautifulSoup libraries to crawl stock information from designated sites.
- Candidate sites
Eastmoney: https://www.eastmoney.com/
Sina Finance: http://finance.sina.com.cn/stock/
- Approach
1. Since the page data is loaded dynamically, first find the url that actually serves the data by inspecting the js requests.
2. To fetch stock information from different boards and different page numbers, modify the corresponding values in the url; this turns pages and switches boards.
3. Extract the stock records with a regular expression, then process them according to the structure of the data to obtain the fields needed (a small parsing sketch follows this list).
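The API answers with JSON wrapped in a JSONP callback. As a minimal sketch of step 3 (the sample string below is made up, and the assignment code itself pulls the records apart by string splitting rather than with the json module), the payload could also be unwrapped and parsed like this:

import json
import re

# Made-up JSONP response body, for illustration only
sample = 'jQuery112403324490377009397_1602209502288({"data":{"diff":[{"f12":"600000","f14":"浦发银行"}]}});'

# Strip the callback wrapper to recover the bare JSON object
payload = re.search(r'\((\{.*\})\)', sample, re.S).group(1)
obj = json.loads(payload)
for rec in obj["data"]["diff"]:
    print(rec["f12"], rec["f14"])  # stock code, stock name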
- Code
import requests
import re
import prettytable as pt

x = pt.PrettyTable()
x.field_names = ["序号", "代码", "名称", "最新价", "涨跌幅", "涨跌额", "成交量", "成交额", "振幅", "最高", "最低", "今开", "昨收"]

# Fetch the page data from the server with a GET request
def getHtml(cmd, page, fields):
    headers = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) "
                             "Gecko/2008072421 Minefield/3.0.2pre"}
    # Vary the url parameters to request different boards and different page numbers
    url = "http://19.push2.eastmoney.com/api/qt/clist/get?cb=jQuery112403324490377009397_1602209502288" \
          "&pn=" + str(page) + "&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2" \
          "&fid=f3&fs=" + cmd + "&fields=" + fields + "&_=1602209502289"
    r = requests.get(url, headers=headers)
    r.encoding = "utf-8"
    html = r.text
    pat = r'"diff":\[(.*?)\]'
    # With re.S the pattern is matched against the response as a whole, across newlines
    data = re.compile(pat, re.S).findall(html)
    return data

# Get the stock data of a single page
def getOnePageStock(cmd, page, fields):
    data = getHtml(cmd, page, fields)
    # Split the records on "},{" and strip the leading "{" and trailing "}"
    datas = data[0].split("},{")
    datas[0] = datas[0].replace('{', '')
    datas[-1] = datas[-1].replace('}', '')
    stocks = []
    for i in range(len(datas)):
        # Split each record on "," to separate the fields of one stock
        stock = datas[i].split(",")
        stocks.append(stock)
    return stocks

def main():
    cmd = {
        "沪深A股": "m:0+t:6,m:0+t:13,m:0+t:80,m:1+t:2,m:1+t:23",
        "上证A股": "m:1+t:2,m:1+t:23"
    }
    results = []
    # Fields to fetch:
    # f12: code, f14: name, f2: latest price, f3: change percent, f4: change amount, f5: volume,
    # f6: turnover, f7: amplitude, f15: high, f16: low, f17: open, f18: previous close
    fields = "f12,f14,f2,f3,f4,f5,f6,f7,f15,f16,f17,f18"
    for i in cmd.keys():
        page = 1
        # Crawl the first two pages of each board
        while page < 3:
            result = getOnePageStock(cmd[i], page, fields)
            results += result
            page += 1
    i = 0
    for stock in results:
        i = i + 1
        x.add_row([i, stock[6].split(":")[1], stock[7].split(":")[1], stock[0].split(":")[1],
                   stock[1].split(":")[1], stock[2].split(":")[1], stock[3].split(":")[1],
                   stock[4].split(":")[1], stock[5].split(":")[1], stock[8].split(":")[1],
                   stock[9].split(":")[1], stock[10].split(":")[1], stock[11].split(":")[1]])
    print(x)

main()
- Screenshots of the results

- Reflections
1. I learned how dynamically loaded page data is crawled and how to find the url that serves the data by inspecting the js files.
2. I became familiar with handling pagination.
3. I learned how to use prettytable to print the data (a minimal standalone sketch follows).
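For reference, the basic prettytable pattern used above, as a minimal standalone sketch (the headers and the row are example values only):

import prettytable as pt

# Set the column headers, add rows, then print the rendered table
table = pt.PrettyTable()
table.field_names = ["code", "name"]
table.add_row(["600000", "浦发银行"])
print(table)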
Task 3
- Requirement
Select stocks according to a self-chosen 3-digit number plus the last 3 digits of the student ID, and fetch the corresponding stock information. The packet-capture method is the same as in Task 2.
- Candidate sites
Eastmoney: https://www.eastmoney.com/
Sina Finance: http://finance.sina.com.cn/stock/
- Approach
1. By comparing the urls used for stocks with different codes, work out how to query the required information for any stock code directly (a sketch of the observed convention follows this list).
2. Once the url for the stock with the desired code is known, the rest follows the same approach as Task 2.
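The secid parameter in the url encodes both the exchange and the stock code. The helper below, make_secid, is hypothetical; the prefix rule is only inferred from the urls captured here (codes starting with "6" are Shanghai and take the prefix "1.", the others here are Shenzhen and take "0.") and may not cover every board:

# make_secid is a hypothetical helper; the prefix rule is inferred from the
# captured urls (1 = Shanghai, 0 = Shenzhen) and is an assumption.
def make_secid(code: str) -> str:
    prefix = "1" if code.startswith("6") else "0"
    return prefix + "." + code

print(make_secid("600125"))  # 1.600125
print(make_secid("002125"))  # 0.002125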
- Code
import requests
import re
import prettytable as pt

x = pt.PrettyTable()
x.field_names = ["股票代码", "股票名称", "今日开", "今日最高", "今日最低"]

# Fetch the page data from the server with a GET request
def getHtml(num, fields):
    headers = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko"
                             "/2008072421 Minefield/3.0.2pre"}
    # Vary the secid to query a different stock
    url = "http://push2.eastmoney.com/api/qt/stock/get?ut=fa5fd1943c7b386f172d6893dbfba10b&invt=2" \
          "&fltt=2&fields=" + fields + "&secid=" + num + "&cb=jQuery11240959380450062036_1601458369843&_=1601458369844"
    r = requests.get(url, headers=headers)
    r.encoding = "utf-8"
    html = r.text
    pat = r'"data":\{(.*?)\}'
    # With re.S the pattern is matched against the response as a whole, across newlines
    data = re.compile(pat, re.S).findall(html)
    return data

# Get the data of a single stock and add it to the table
def getOnePageStock(num, fields):
    data = getHtml(num, fields)
    datas = data[0].strip("'").split(',')
    result = []
    for i in datas:
        # Keep only the value of each "key:value" pair
        result.append(i.split(':')[1])
    # The response lists the fields in key order: f44 high, f45 low, f46 open, f57 code, f58 name
    x.add_row([result[3].strip('"'), result[4].strip('"'), result[2], result[0], result[1]])

def main():
    nums = ["0.002125", "0.300125", "1.600125"]
    # Fields to fetch: f44: high, f45: low, f46: open, f57: code, f58: name
    fields = "f44,f45,f46,f57,f58"
    for num in nums:
        try:
            getOnePageStock(num, fields)
        except:
            print("没有股票:" + num)
    print(x)

main()
- Screenshot of the results

- Reflections
Building on Task 2, only a few changes to the url are needed to obtain the desired data.
