作业一:爬取给定城市集的7日天气预报
点击查看代码
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request
import sqlite3
class WeatherDB:
    """Small SQLite wrapper that stores scraped forecasts in ``weathers.db``.

    Rows are keyed by ``(wCity, wDate)``.  The table is emptied whenever the
    database is opened and the table already exists, so it only holds the
    rows inserted since the last ``openDB`` call.
    """

    def openDB(self):
        """Connect to ``weathers.db`` and leave an empty ``weathers`` table ready."""
        self.con = sqlite3.connect("weathers.db")
        self.cursor = self.con.cursor()
        try:
            self.cursor.execute(
                "create table weathers (wCity varchar(16),wDate varchar(16),"
                "wWeather varchar(64),wTemp varchar(32),"
                "constraint pk_weather primary key (wCity,wDate))"
            )
        except sqlite3.OperationalError:
            # CREATE fails when the table is left over from an earlier run:
            # keep the table but discard its stale rows.  Only the SQLite
            # error is caught so unrelated failures are not mistaken for
            # "table already exists".
            self.cursor.execute("delete from weathers")

    def closeDB(self):
        """Commit pending inserts and close the connection."""
        self.con.commit()
        self.con.close()

    def insert(self, city, date, weather, temp):
        """Insert one forecast row; a database error is printed, not raised.

        Args:
            city: City name (first half of the primary key).
            date: Forecast date text (second half of the primary key).
            weather: Weather description.
            temp: Temperature text.
        """
        try:
            self.cursor.execute(
                "insert into weathers (wCity,wDate,wWeather,wTemp) values (?,?,?,?)",
                (city, date, weather, temp),
            )
        except sqlite3.Error as err:
            # Best effort: e.g. a duplicate (city, date) must not stop the run.
            print(err)

    def show(self):
        """Print every stored row as a fixed-width table."""
        self.cursor.execute("select * from weathers")
        rows = self.cursor.fetchall()
        layout = "%-16s%-16s%-32s%-16s"
        print(layout % ("city", "date", "weather", "temp"))
        for row in rows:
            print(layout % (row[0], row[1], row[2], row[3]))
class WeatherForecast:
    """Scrape forecast pages from www.weather.com.cn and store them in a WeatherDB."""

    def __init__(self, timeout=10):
        """Prepare the request headers and the city-name -> page-code table.

        Args:
            timeout: Seconds to wait on the HTTP request before giving up.
        """
        self.timeout = timeout
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"}
        self.cityCode = {"北京": "101010100", "上海": "101020100", "广州": "101280101", "深圳": "101280601"}

    def forecastCity(self, city):
        """Download the forecast page of *city*, print each entry and store it.

        A city missing from ``self.cityCode`` is reported and skipped.  Any
        download or parsing error is printed rather than raised, so one
        failing city does not abort the remaining ones.

        Args:
            city: City name; must be a key of ``self.cityCode``.
        """
        if city not in self.cityCode:
            print(city + " code cannot be found")
            return
        url = "http://www.weather.com.cn/weather/" + self.cityCode[city] + ".shtml"
        try:
            req = urllib.request.Request(url, headers=self.headers)
            # The context manager closes the HTTP response; the timeout keeps
            # an unresponsive server from blocking the run indefinitely.
            with urllib.request.urlopen(req, timeout=self.timeout) as resp:
                raw = resp.read()
            # Let bs4 choose between the two candidate encodings.
            dammit = UnicodeDammit(raw, ["utf-8", "gbk"])
            soup = BeautifulSoup(dammit.unicode_markup, "lxml")
            for li in soup.select("ul[class='t clearfix'] li"):
                try:
                    date = li.select('h1')[0].text
                    weather = li.select('p[class="wea"]')[0].text
                    first_temp = li.select('p[class="tem"] span')
                    second_temp = li.select('p[class="tem"] i')[0].text
                    # An entry whose <span> is absent used to be dropped with
                    # "list index out of range"; keep it with the <i> value only.
                    if first_temp:
                        temp = first_temp[0].text + "/" + second_temp
                    else:
                        temp = second_temp
                    print(city, date, weather, temp)
                    self.db.insert(city, date, weather, temp)
                except Exception as err:
                    # Best effort per entry: report it and go on with the next.
                    print(err)
        except Exception as err:
            # Best effort per city: network and parser errors are only reported.
            print(err)

    def process(self, cities):
        """Open the database, scrape every city in *cities*, then close it.

        Args:
            cities: Iterable of city names passed to ``forecastCity``.
        """
        self.db = WeatherDB()
        self.db.openDB()
        try:
            for city in cities:
                self.forecastCity(city)
        finally:
            # Commit and close even if the loop is interrupted.
            self.db.closeDB()
# Script entry: scrape the forecast for the listed cities, then report completion.
target_cities = ["北京"]
ws = WeatherForecast()
ws.process(target_cities)
print("completed")
[码云链接](https://gitee.com/w-jking/crawl_project/blob/master/%E4%BD%9C%E4%B8%9A2/1 "码云链接")
运行结果
![image]()
作业二:定向爬取股票相关信息
抓包过程
![image]()
点击查看代码
import sqlite3
import re
import requests
import pandas as pd
def getHtml(page, timeout=10):
    """Fetch one page of the Eastmoney stock-list endpoint and return its text.

    Args:
        page: Page number, sent as the ``pn`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        requests.RequestException: On timeout, connection failure, or an
            HTTP error status.
    """
    # The query string is kept exactly as captured from the browser; only
    # the page number varies.
    url = (
        "https://2.push2.eastmoney.com/api/qt/clist/get"
        "?cb=jQuery112409158120399002962_1696660033552"
        "&pn=" + str(page) +
        "&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281"
        "&fltt=2&invt=2&wbp2u=|0|0|0|web&fid=f3"
        "&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048"
        "&fields=f12,f14,f2,f3,f4,f5,f6,f7,f15,f16,f17,f18"
        "&_=1696660033553"
    )
    # Without a timeout a stalled connection would hang the script forever.
    rep = requests.get(url, timeout=timeout)
    # Fail loudly instead of handing an error page to the parser.
    rep.raise_for_status()
    rep.encoding = "UTF-8"
    return rep.text
# Response fields in the order of the ``stock_data`` columns they fill:
# code, name, price, change_percent, change_amount, volume, turnover,
# amplitude, high, low, open, close.
_STOCK_FIELDS = ("f12", "f14", "f2", "f3", "f4", "f5", "f6", "f7",
                 "f15", "f16", "f17", "f18")

_CREATE_STOCK_TABLE = '''
    CREATE TABLE IF NOT EXISTS stock_data (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        code TEXT,
        name TEXT,
        price REAL,
        change_percent REAL,
        change_amount REAL,
        volume INTEGER,
        turnover REAL,
        amplitude REAL,
        high REAL,
        low REAL,
        open REAL,
        close REAL
    )
'''

_INSERT_STOCK_ROW = '''
    INSERT INTO stock_data (code, name, price, change_percent, change_amount, volume, turnover, amplitude, high, low, open, close)
    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
'''


def _parse_stock_rows(html):
    """Return one 12-value list per stock record found in the response text."""
    import json

    rows = []
    # Every record is a flat ``{...}`` object (no nested braces).  Decoding
    # each object as JSON keeps the fields of one stock together and removes
    # the surrounding quotes that a raw regex capture would leave on strings.
    for chunk in re.findall(r"\{[^{}]*\}", html):
        try:
            record = json.loads(chunk)
        except ValueError:
            continue  # not a JSON object; ignore it
        if all(field in record for field in _STOCK_FIELDS):
            rows.append([record[field] for field in _STOCK_FIELDS])
    return rows


def _store_stock_rows(rows, db_path):
    """Create the ``stock_data`` table if needed and append *rows* to it."""
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        cursor.execute(_CREATE_STOCK_TABLE)
        cursor.executemany(_INSERT_STOCK_ROW, rows)
        conn.commit()
    finally:
        # Release the file handle even when an insert fails.
        conn.close()


def main(pages=range(1, 10), db_path='stock_data.db', fetch=None):
    """Download the stock list page by page and save it to SQLite.

    Args:
        pages: Page numbers to request; defaults to pages 1 through 9.
        db_path: SQLite file that receives the rows.
        fetch: Callable taking a page number and returning the response
            text; defaults to this script's ``getHtml``.
    """
    if fetch is None:
        fetch = getHtml
    rows = []
    for page in pages:
        rows.extend(_parse_stock_rows(fetch(page)))
    _store_stock_rows(rows, db_path)
    print("数据已成功存入SQLite数据库。")


if __name__ == "__main__":
    main()
[码云链接](https://gitee.com/w-jking/crawl_project/blob/master/%E4%BD%9C%E4%B8%9A2/2 "码云链接")
结果截图
![]()
作业三:爬取中国大学2021主榜所有院校信息
抓包过程
![image]()
点击查看代码
import re
import requests
import pandas as pd
def getHtml(url="https://www.shanghairanking.cn/_nuxt/static/1695811954/rankings/bcur/2021/payload.js",
            timeout=10):
    """Download the ranking payload script and return its text.

    Args:
        url: Address of the ``payload.js`` file to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        requests.RequestException: On timeout, connection failure, or an
            HTTP error status.
    """
    # Without a timeout a stalled connection would hang the script forever.
    rep = requests.get(url, timeout=timeout)
    # Fail loudly instead of handing an error page to the parser.
    rep.raise_for_status()
    rep.encoding = "UTF-8"
    return rep.text
def _parse_rankings(html):
    """Return ``[school, score]`` pairs extracted from the payload text."""
    names = re.findall('univNameCn:"(.*?)"', html)
    # NOTE(review): the captured score is kept verbatim as text; it is not
    # converted to a number here.
    scores = re.findall('score:(.*?),', html)
    # zip stops at the shorter list, so a missing score cannot raise.
    return [[name, score] for name, score in zip(names, scores)]


def main():
    """Download the ranking payload, print it, and save it to ``data.xlsx``."""
    html = getHtml()
    database = _parse_rankings(html)
    print("排名\t学校\t总分")
    # The row position is printed as the rank so the output matches the
    # header; this assumes the payload lists schools in ranking order.
    # TODO confirm against the site.
    for rank, (name, score) in enumerate(database, start=1):
        print("%d\t%s\t%s" % (rank, name, score))
    df = pd.DataFrame(database, columns=["学校", "总分"])
    df.to_excel("data.xlsx", index=False)
    # Report the file that was actually written.
    print("已保存data.xlsx")


if __name__ == "__main__":
    main()
结果截图
![image]()
码云链接