2023数据采集与融合技术实践作业二
作业①:
- 要求:在中国气象网(http://www.weather.com.cn)爬取给定城市集的 7 日天气预报,并保存在数据库。 - 输出信息:
| 序号 | 地区 | 日期 | 天气信息 | 温度 |
|---|---|---|---|---|
| 1 | 北京 | 15日(今天) | 多云 | ... |
- Gitee文件夹链接:实践作业2
代码:
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request
import sqlite3
class WeatherDB:
    """Thin wrapper around an SQLite database that stores 7-day forecasts.

    Table schema: weathers(wCity, wDate, wWeather, wTemp) with a composite
    primary key on (wCity, wDate), i.e. one row per city per day.
    """

    def openDB(self, path="weathers.db"):
        """Open (or create) the database at *path* and start from an empty table.

        *path* defaults to the original hard-coded file name so existing
        callers are unaffected; pass ":memory:" for a throwaway database.
        """
        self.con = sqlite3.connect(path)
        self.cursor = self.con.cursor()
        try:
            self.cursor.execute(
                "create table weathers ("
                "wCity varchar(16),wDate varchar(16),"
                "wWeather varchar(64),wTemp varchar(32),"
                "constraint pk_weather primary key (wCity,wDate))"
            )
        except sqlite3.OperationalError:
            # Table already exists from a previous run: clear it and reuse it.
            # (The original bare `except:` also swallowed unrelated errors.)
            self.cursor.execute("delete from weathers")

    def closeDB(self):
        """Commit all pending inserts and close the connection."""
        self.con.commit()
        self.con.close()

    def insert(self, city, date, weather, temp):
        """Insert one forecast row.

        Failures (e.g. a duplicate (city, date) primary key) are printed
        rather than raised, so a partial scrape still stores the rest.
        """
        try:
            self.cursor.execute(
                "insert into weathers (wCity,wDate,wWeather,wTemp) values (?,?,?,?)",
                (city, date, weather, temp),
            )
        except Exception as err:
            print(err)

    def show(self):
        """Print every stored row in fixed-width columns."""
        self.cursor.execute("select * from weathers")
        rows = self.cursor.fetchall()
        print("%-16s%-16s%-32s%-16s" % ("city", "date", "weather", "temp"))
        for row in rows:
            print("%-16s%-16s%-32s%-16s" % (row[0], row[1], row[2], row[3]))
class WeatherForecast:
    """Scrapes weather.com.cn 7-day forecast pages for the configured cities
    and stores each day's forecast through a WeatherDB instance."""

    def __init__(self):
        # Browser-like UA so the site serves the normal HTML page.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"}
        self.cityCode = {"北京": "101010100", "上海": "101020100", "广州": "101280101", "深圳": "101280601"}

    def forecastCity(self, city):
        """Fetch one city's forecast page and insert a row per day."""
        code = self.cityCode.get(city)
        if code is None:
            print(city + " code cannot be found")
            return
        url = "http://www.weather.com.cn/weather/" + code + ".shtml"
        try:
            req = urllib.request.Request(url, headers=self.headers)
            raw = urllib.request.urlopen(req).read()
            # Let UnicodeDammit decide between utf-8 and gbk.
            html = UnicodeDammit(raw, ["utf-8", "gbk"]).unicode_markup
            soup = BeautifulSoup(html, "lxml")
            for li in soup.select("ul[class='t clearfix'] li"):
                try:
                    date = li.select('h1')[0].text
                    weather = li.select('p[class="wea"]')[0].text
                    temp = li.select('p[class="tem"] span')[0].text + "/" + li.select('p[class="tem"] i')[0].text
                    print(city, date, weather, temp)
                    self.db.insert(city, date, weather, temp)
                except Exception as err:
                    # A malformed <li> (e.g. today's entry missing the span)
                    # is reported and skipped rather than aborting the page.
                    print(err)
        except Exception as err:
            print(err)

    def process(self, cities):
        """Open the DB once, scrape every requested city, then commit/close."""
        self.db = WeatherDB()
        self.db.openDB()
        for name in cities:
            self.forecastCity(name)
        self.db.closeDB()
# Scrape all four configured cities and store the results in weathers.db.
# NOTE(review): this runs at import time; an `if __name__ == "__main__":`
# guard would be the usual idiom — left unchanged to preserve behavior.
ws = WeatherForecast()
ws.process(["北京", "上海", "广州", "深圳"])
print("completed")
- 结果截图:
![结果1]()
实验心得:
主要是进行复现,并没有什么难度
作业②
-
要求:用requests和BeautifulSoup库方法定向爬取股票相关信息,并存储在数据库中。
-
候选网站:东方财富网:https://www.eastmoney.com/
新浪股票:http://finance.sina.com.cn/stock/ -
技巧:在谷歌浏览器中进入F12调试模式进行抓包,查找股票列表加载使用的url,并分析api返回的值,并根据所要求的参数可适当更改api的请求参数。根据URL可观察请求的参数f1、f2可获取不同的数值,根据情况可删减请求的参数。
-
输出信息:
![输出2]()
-
Gitee文件夹链接:实践作业2
代码:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
def getData(url, gupiao):
    """Fetch one Eastmoney JSONP stock-list response and save it to Excel.

    url     -- push2.eastmoney.com clist API url (JSONP-wrapped payload)
    gupiao  -- board name; the output file is "<gupiao>.xlsx"

    The payload is JSONP rather than plain JSON, so each field is pulled
    out of the raw text with a regex.  String fields (f12, f14) are quoted
    in the payload; numeric fields are not.
    """
    response = requests.get(url)
    # Consistent with the ranking scraper: fail loudly on HTTP errors.
    response.raise_for_status()
    text = response.text
    # Column order matches the headers below.  Bug fix: the original code
    # appended 名称 before 代码 while its headers declared 代码 first, so
    # those two columns were swapped in the saved spreadsheet.
    # NOTE(review): f4 is labelled 跌涨幅 and f3 跌涨额 as in the original;
    # the Eastmoney API commonly defines f3=涨跌幅, f4=涨跌额 — verify
    # against a captured response.
    extractors = [
        re.findall(r'f12":"(.*?)",', text),  # 代码
        re.findall(r'f14":"(.*?)",', text),  # 名称
        re.findall(r'f2":(.*?),', text),     # 最新价格
        re.findall(r'f4":(.*?),', text),     # 跌涨幅
        re.findall(r'f3":(.*?),', text),     # 跌涨额
        re.findall(r'f5":(.*?),', text),     # 成交量
        re.findall(r'f6":(.*?),', text),     # 成交额
        re.findall(r'f7":(.*?),', text),     # 振幅
        re.findall(r'f15":(.*?),', text),    # 最高
        re.findall(r'f16":(.*?),', text),    # 最低
        re.findall(r'f17":(.*?),', text),    # 今开
        re.findall(r'f18":(.*?),', text),    # 昨收
    ]
    # Use the shortest field list so ragged matches cannot raise IndexError.
    row_count = min((len(col) for col in extractors), default=0)
    database = [[str(i + 1)] + [col[i] for col in extractors]
                for i in range(row_count)]
    columns = ["序号", "代码", "名称", "最新价格", "跌涨幅", "跌涨额",
               "成交量", "成交额", "振幅", "最高", "最低", "今开", "昨收"]
    df = pd.DataFrame(database, columns=columns)
    docename = gupiao + ".xlsx"
    df.to_excel(docename, index=False)
    print("已保存" + docename)
def main():
    """Download page 1 of every configured board and save each to Excel."""
    desirable_page = 1
    # Board name -> the `fs` market-filter parameter of the clist API.
    # NOTE(review): "北证A股" and "新股" share the same filter string here —
    # looks like a copy/paste; verify against the site's actual requests.
    fs = {
        "沪深京A股": "m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048",
        "上证A股": "m:1+t:2,m:1+t:23",
        "深证A股": "m:0+t:6,m:0+t:80",
        "北证A股": "m:0+t:81+s:2048",
        "新股": "m:0+t:81+s:2048",
        "创业板": "m:0+t:80"
    }
    for gupiao, market_filter in fs.items():
        url = (
            "https://89.push2.eastmoney.com/api/qt/clist/get"
            "?cb=jQuery112408428773349332392_1697292673356"
            f"&pn={desirable_page}&pz=20&po=1&np=1"
            "&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2"
            "&wbp2u=|0|0|0|web&fid=f3"
            f"&fs={market_filter}"
            "&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23"
            "&_=1697292673357"
        )
        getData(url, gupiao)


if __name__ == "__main__":
    main()
- 抓包过程:
![11]()
- 结果截图:
![s]()
![a]()
![d]()
实验心得:
学习到一种新的方式,用抓包的方式去获得url,但获取到的数据不像能在开发者模式下用元素那么容易看懂,另外还需要使用json解析,但我直接使用了正则表达式来提取信息,最后保存在excel里。
作业③:
-
要求:爬取中国大学2021主榜(https://www.shanghairanking.cn/rankings/bcur/2021)所有院校信息,并存储在数据库中,同时将浏览器F12调试分析的过程录制Gif加入至博客中。
-
技巧:分析该网站的发包情况,分析获取数据的api
-
输出信息:
![4]()
-
Gitee文件夹链接:实践作业2
代码:
import re
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Single-letter codes used by the ranking site's payload.js for provinces.
# The payload stores these values as bare JS variable names (a, b, c, ...);
# this table maps the letters observed in the 2021 data back to names.
# NOTE(review): the letter->province assignment was read off one snapshot of
# payload.js and may change between site builds — verify before reuse.
province = {
'k': '江苏','n': '山东', 'o': '河南','p': '河北','q': '北京','r': '辽宁','s': '陕西','t': '四川','u': '广东',
'v': '湖北','w': '湖南','x': '浙江','y': '安徽','z': '江西','A': '黑龙江','B': '吉林','C': '上海','D': '福建','E': '山西',
'F': '云南','G': '广西','I': '贵州','J': '甘肃','K': '内蒙古','L': '重庆','M': '天津','N': '新疆','Y': '海南'
}
# Same scheme for university-category codes; letters missing from this table
# fall through to the raw code inside getData.
unicata = {
'f': '综合',
'e': '理工',
'h': '师范',
'm': '农业',
'T': '林业',
}
def getData(url):
    """Scrape the 2021 ShanghaiRanking payload.js and save it as Excel.

    The payload is JavaScript, not JSON: province and category appear as
    bare single-letter variable names, which the module-level `province`
    and `unicata` tables translate back to Chinese labels.
    """
    response = requests.get(url)
    response.raise_for_status()
    response.encoding = response.apparent_encoding
    text = response.text
    uniname = re.findall(r'univNameCn:"(.*?)",', text)
    uniscore = re.findall(r'score:(.*?),', text)
    uniprovince = re.findall(r'province:(.*?),', text)
    unica = re.findall(r'univCategory:(.*?),', text)
    database = []
    for i in range(len(unica)):
        # Fall back to the raw letter code when it is missing from the maps.
        # (Bug fix: the original province fallback assigned the whole
        # `uniprovince` list instead of the i-th element.)
        shengshi = province.get(uniprovince[i], uniprovince[i])
        leixing = unicata.get(unica[i], unica[i])
        database.append([str(i + 1), uniname[i], shengshi, leixing, uniscore[i]])
    columns = ["排名", "学校名称", "省市", "学校类型", "学校总分"]
    df = pd.DataFrame(database, columns=columns)
    df.to_excel('school_rank.xlsx', index=False)
    print("已保存school_rank.xlsx")
# Entry point: the payload.js url captured from the browser's network tab.
# NOTE(review): the numeric segment in the path looks like a build timestamp
# and likely changes between site deployments — re-capture if requests 404.
url = "http://www.shanghairanking.cn/_nuxt/static/1697106492/rankings/bcur/2021/payload.js"
getData(url)
- 抓包过程:
![给]()
- 结果截图:
![33]()
![212]()
![2121]()
实验心得:
其实操作和之前差不多,也对正则表达式的使用更加熟练。












浙公网安备 33010602011771号