2023 Data Collection and Fusion Technology Practice: Assignment 2
Task ①:
* Requirement: Crawl the 7-day weather forecast for a given set of cities from the China Weather website (http://www.weather.com.cn) and save it to a database.
Code:
from bs4 import BeautifulSoup
from bs4.dammit import UnicodeDammit
import urllib.request
import sqlite3

# Wrapper around the SQLite database that stores the forecasts
class WeatherDB:
    def __init__(self):
        self.cursor = None
        self.con = None

    def openDB(self):
        self.con = sqlite3.connect("weathers.db")
        self.cursor = self.con.cursor()
        try:
            self.cursor.execute(
                "create table weathers (wCity varchar(16),"
                "wDate varchar(16),"
                "wWeather varchar(64),"
                "wTemp varchar(32),"
                "constraint pk_weather primary key (wCity,wDate))")
        except Exception as err:
            # the table already exists: report it and clear the old rows
            print(err)
            self.cursor.execute("delete from weathers")

    def closeDB(self):
        self.con.commit()
        self.con.close()

    def insert(self, city, date, weather, temp):
        try:
            self.cursor.execute("insert into weathers (wCity,wDate,wWeather,wTemp) values (?,?,?,?)",
                                (city, date, weather, temp))
        except Exception as err:
            print(err)

    def show(self):
        self.cursor.execute("select * from weathers")
        rows = self.cursor.fetchall()
        print("%-16s%-16s%-32s%-16s" % ("city", "date", "weather", "temp"))
        for row in rows:
            print("%-16s%-16s%-32s%-16s" % (row[0], row[1], row[2], row[3]))

# Crawler for the 7-day forecast pages
class WeatherForecast:
    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) "
                          "Gecko/2008072421 Minefield/3.0.2pre"}
        self.cityCode = {"北京": "101010100", "上海": "101020100", "广州": "101280101", "深圳": "101280601"}

    def forecastCity(self, city):
        if city not in self.cityCode.keys():
            print(city + " city code not found")
            return
        url = "http://www.weather.com.cn/weather/" + self.cityCode[city] + ".shtml"
        try:
            req = urllib.request.Request(url, headers=self.headers)
            data = urllib.request.urlopen(req)
            data = data.read()
            dammit = UnicodeDammit(data, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            soup = BeautifulSoup(data, "lxml")
            lis = soup.select("ul[class='t clearfix'] li")
            x = 0
            row = 1
            for li in lis:
                try:
                    date = li.select('h1')[0].text
                    weather = li.select('p[class="wea"]')[0].text
                    if x == 0:  # today's entry only carries a single temperature, e.g. <i>14℃</i>
                        x += 1
                        temp = li.select('p[class="tem"] i')[0].text
                    else:
                        temp = li.select('p[class="tem"] span')[0].text + "/" + li.select('p[class="tem"] i')[0].text
                    print("{:<5}\t{:<4}\t{:<9}\t{:<12}\t{:<6}\t".format(row, city, date, weather, temp))
                    row = row + 1
                    self.db.insert(city, date, weather, temp)
                except Exception as err:
                    print(err)
        except Exception as err:
            print(err)

    def process(self, cities):
        self.db = WeatherDB()
        self.db.openDB()
        for city in cities:
            self.forecastCity(city)
        # self.db.show()
        self.db.closeDB()

ws = WeatherForecast()
ws.process(["北京", "上海", "广州", "深圳"])
Run results
Reflections
This task deepened my understanding of information extraction with BeautifulSoup and of regular expressions, and taught me how to connect a crawler program to a database.
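To confirm that the forecasts really landed in the database, the table can be read back directly. A minimal verification sketch, assuming the weathers.db file created by WeatherDB above sits in the working directory:

```python
import sqlite3

# read back the rows that WeatherDB.insert() wrote (assumes weathers.db is in the working directory)
con = sqlite3.connect("weathers.db")
cursor = con.cursor()
cursor.execute("select wCity, wDate, wWeather, wTemp from weathers order by wCity, wDate")
for wCity, wDate, wWeather, wTemp in cursor.fetchall():
    print(f"{wCity}\t{wDate}\t{wWeather}\t{wTemp}")
con.close()
```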
Task ②:
* Requirement: Use the requests and BeautifulSoup libraries to crawl targeted stock information and store it in a database.
Code:
import urllib.request
import re
import json
import pandas as pd
import openpyxl  # engine pandas uses to write the .xlsx file

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.47'
}

def getdata(data_ls, count_ls):
    count = 1
    for page_num in range(1, 6):  # first 5 pages, 20 stocks per page
        url = ('http://25.push2.eastmoney.com/api/qt/clist/get?cb=jQuery11240213'
               '13927342030325_1696658971596&pn=%d&pz=20&po=1&np=1&ut=bd1d9ddb040'
               '89700cf9c27f6f7426281&fltt=2&invt=2&wbp2u=|0|0|0|web&fid=f3&fs=m:0'
               '+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048&fields=f2,f3,f4,f5,f6,f7,f12,f14,f15,f16,f17,f18&_=1696658971636')
        url = url % page_num
        req = urllib.request.Request(url=url, headers=headers)
        data = urllib.request.urlopen(req).read().decode()
        # the response is JSONP: pull the "diff" array out of the callback wrapper,
        # then parse each {...} record as JSON
        data = re.compile(r'"diff":\[(.*?)\]', re.S).findall(data)
        for one_data in re.compile(r'\{(.*?)\}', re.S).findall(data[0]):
            data_dic = json.loads('{' + one_data + '}')
            data_ls.append(data_dic)
            count_ls.append(count)
            count += 1
    return data_ls, count_ls

if __name__ == "__main__":
    columns = {'f2': '最新价', 'f3': '涨跌幅(%)', 'f4': '涨跌额', 'f5': '成交量', 'f6': '成交额', 'f7': '振幅(%)',
               'f12': '代码', 'f14': '名称', 'f15': '最高', 'f16': '最低', 'f17': '今开', 'f18': '昨收'}
    data_ls = []
    count_ls = []
    getdata(data_ls, count_ls)
    num = pd.DataFrame(count_ls)
    # print(num)
    df = pd.DataFrame(data_ls)
    df.rename(columns=columns, inplace=True)
    df.insert(0, column='序号', value=num)
    df.to_excel('./stock.xlsx', index=False)
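Since the requirement asks for database storage while the script above only writes an Excel file, the same DataFrame could also be pushed into SQLite. A rough sketch; the stocks.db file name and the stocks table name are my own choices, not part of the original code:

```python
import sqlite3

# hypothetical extra step: persist the crawled DataFrame to SQLite as well
con = sqlite3.connect("stocks.db")                          # file name is an assumption
df.to_sql("stocks", con, if_exists="replace", index=False)  # table name is an assumption
con.close()
```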
Run results
Reflections
Gained a deeper understanding of the role JavaScript (here, the JSONP callback that wraps the stock data) plays in delivering a page's data.
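The cb=jQuery... parameter is what wraps the response in that JSONP callback and forces the regex step above. As far as I can tell, the endpoint returns plain JSON when cb is omitted, which would let json.loads parse the body directly; a sketch under that assumption, with the URL trimmed to the parameters the script actually uses:

```python
import json
import urllib.request

# same Eastmoney endpoint as above, but without the cb= callback parameter;
# the server is assumed to answer with plain JSON in that case
url = ('http://25.push2.eastmoney.com/api/qt/clist/get?pn=1&pz=20&po=1&np=1'
       '&fltt=2&invt=2&fid=f3&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048'
       '&fields=f2,f3,f4,f5,f6,f7,f12,f14,f15,f16,f17,f18')
req = urllib.request.Request(url=url, headers=headers)
payload = json.loads(urllib.request.urlopen(req).read().decode())
for stock in payload["data"]["diff"]:  # "diff" is the per-stock record list seen above
    print(stock["f12"], stock["f14"], stock["f2"])
```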
Task ③:
* Requirement: Crawl the university information from the 2021 Shanghai Ranking main list (https://www.shanghairanking.cn/rankings/bcur/2021) and store it.
Debugging and analysis process
Code:
import requests
import pandas as pd
import re

url = "https://www.shanghairanking.cn/_nuxt/static/1695811954/rankings/bcur/2021/payload.js"
response = requests.get(url=url)

# university names
name_grep = ',univNameCn:"(.*?)",'
name = re.findall(name_grep, response.text)
# total scores
score_grep = ',score:(.*?),'
score = re.findall(score_grep, response.text)
# university categories
category_grep = ',univCategory:(.*?),'
category = re.findall(category_grep, response.text)
# provinces
province_grep = ',province:(.*?),'
province = re.findall(province_grep, response.text)

# payload.js is an immediately-invoked function: grab its formal parameter list
# (the single-letter placeholders used inside the data) and store it in code_name
code_name_grep = 'function(.*?){'
code_name = re.findall(code_name_grep, response.text)
start_code = code_name[0].find('a')
end_code = code_name[0].find('pE')
code_name = code_name[0][start_code:end_code].split(',')

# grab the actual arguments passed at the call site, i.e. the real values that
# the placeholders stand for, and store them in value_name
value_name_grep = 'mutations:(.*?);'
value_name = re.findall(value_name_grep, response.text)
start_value = value_name[0].find('(')
end_value = value_name[0].find(')')
value_name = value_name[0][start_value + 1:end_value].split(",")

df = pd.DataFrame(columns=["排名", "学校", "省份", "类型", "总分"])
for i in range(len(name)):
    # map each placeholder back to its real value and strip the surrounding quotes
    province_name = value_name[code_name.index(province[i])][1:-1]
    category_name = value_name[code_name.index(category[i])][1:-1]
    df.loc[i] = [i + 1, name[i], province_name, category_name, score[i]]
print(df)
df.to_excel("./rank.xlsx")
Run screenshot
Reflections
Gained a deeper understanding of packet capture for crawlers, i.e. using the browser's network panel to find the request (here, payload.js) that actually carries the data.
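What the capture reveals is that payload.js is an immediately-invoked function: most field values in the data are single-letter formal parameters, and the real values are supplied as arguments at the call site, which is why the script pairs code_name with value_name. A toy illustration of the same mapping idea; the sample string below is made up and much smaller than the real payload:

```python
import re

# miniature, made-up version of the payload.js pattern
sample = 'function(a,b,c){return {province:a,univCategory:b,score:c}}("北京","综合",969.2)'

params = re.findall(r'function\((.*?)\)', sample)[0].split(',')  # ['a', 'b', 'c']
args = re.findall(r'\}\((.*?)\)', sample)[0].split(',')          # ['"北京"', '"综合"', '969.2']

letter = re.findall(r'province:(.*?),', sample)[0]               # 'a'
print(args[params.index(letter)].strip('"'))                     # -> 北京
```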