第二次作业

问题1

   要求:在中国气象网(http://www.weather.com.cn)爬取给定城市集的7日天气预报,并保存在数据库。
   输出信息:Gitee文件夹链接

image

代码及运行结果

码云地址
https://gitee.com/chenming333/chenmingproject/blob/master/作业2/1.py

from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request
import sqlite3

class WeatherDB:
    """SQLite-backed store for 7-day weather forecasts.

    Manages the weathers.db database: table creation, row insertion,
    and a formatted dump of all stored rows.
    """

    def openDB(self):
        """Open (or create) weathers.db and start each run with an empty table."""
        self.con = sqlite3.connect("weathers.db")  # connect to the SQLite database file
        self.cursor = self.con.cursor()
        # Create the table only if missing instead of relying on a bare
        # except to detect "table already exists". (wCity, wDate) is a
        # composite primary key, so each city/date pair is unique.
        self.cursor.execute(
            "create table if not exists weathers ("
            "wCity varchar(16),wDate varchar(16),"
            "wWeather varchar(64),wTemp varchar(32),"
            "constraint pk_weather primary key (wCity,wDate))"
        )
        # Clear any rows from a previous run — same net effect as the
        # old create-or-delete logic: the table is always empty here.
        self.cursor.execute("delete from weathers")

    def closeDB(self):
        """Commit pending changes and close the database connection."""
        self.con.commit()
        self.con.close()

    def insert(self, city, date, weather, temp):
        """Insert one forecast row; report errors (e.g. duplicate key) without raising."""
        try:
            self.cursor.execute(
                "insert into weathers (wCity,wDate,wWeather,wTemp) values (?,?,?,?)",
                (city, date, weather, temp))
        except Exception as err:
            # Best-effort insert: a failed row is reported and skipped
            # so one bad record does not abort the whole scrape.
            print(err)

    def show(self):
        """Print all stored weather rows in fixed-width columns."""
        self.cursor.execute("select * from weathers")
        rows = self.cursor.fetchall()
        # Header row, then one line per record.
        print("%-16s%-16s%-32s%-16s" % ("city", "date", "weather", "temp"))
        for row in rows:
            print("%-16s%-16s%-32s%-16s" % (row[0], row[1], row[2], row[3]))

class WeatherForecast:
    """Scrape 7-day forecasts from weather.com.cn and persist them via WeatherDB."""

    def __init__(self):
        # Browser-like User-Agent so the site serves the normal HTML page.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"}
        # Mapping of supported city names to weather.com.cn city codes.
        self.cityCode={"北京":"101010100","上海":"101020100","广州":"101280101","深圳":"101280601"}

    def forecastCity(self, city):
        """Fetch, parse and store the 7-day forecast for a single city.

        Unknown cities are reported and skipped; parse errors on
        individual days are reported without aborting the page.
        """
        # Membership test directly on the dict — .keys() is redundant.
        if city not in self.cityCode:
            print(city+" code cannot be found")
            return

        url = "http://www.weather.com.cn/weather/" + self.cityCode[city] + ".shtml"
        try:
            req = urllib.request.Request(url, headers=self.headers)
            data = urllib.request.urlopen(req).read()
            # UnicodeDammit auto-detects the page encoding (utf-8 or gbk).
            dammit = UnicodeDammit(data, ["utf-8", "gbk"])
            soup = BeautifulSoup(dammit.unicode_markup, "lxml")
            # One <li> per forecast day inside the "t clearfix" list.
            lis = soup.select("ul[class='t clearfix'] li")

            for li in lis:
                try:
                    date = li.select('h1')[0].text
                    weather = li.select('p[class="wea"]')[0].text
                    # Temperature formatted as "high/low".
                    temp = li.select('p[class="tem"] span')[0].text + "/" + li.select('p[class="tem"] i')[0].text

                    print(city, date, weather, temp)
                    self.db.insert(city, date, weather, temp)
                except Exception as err:
                    # Skip entries that fail to parse (page layout varies per day).
                    print(err)
        except Exception as err:
            # Network or top-level parse failure for this city.
            print(err)

    def process(self, cities):
        """Open the database, scrape every city in *cities*, then close it."""
        self.db = WeatherDB()
        self.db.openDB()

        for city in cities:
            self.forecastCity(city)
        self.db.closeDB()  # commit and close the database connection

# Entry point: scrape the configured cities and store results in SQLite.
forecaster = WeatherForecast()
forecaster.process(["北京", "上海", "广州", "深圳"])
print("completed")

运行结果

image

运行过程

通过搜索对比,可以看到城市名称和城市代码的映射,下面分别对应北京和上海
image
image
检查网页源码,找到7日天气预报信息所在位置
image

心得体会

通过实践天气数据采集项目,我学到了可以用UnicodeDammit自动检测编码,这样就不用去猜网页使用的是什么编码,而且也能应对混合编码的问题

问题2

   要求:用requests和BeautifulSoup库方法定向爬取股票相关信息,并存储在数据库中。网站:东方财富网:https://www.eastmoney.com/
   输出信息:Gitee文件夹链接

image

代码及运行结果

码云地址
https://gitee.com/chenming333/chenmingproject/blob/master/作业2/2.py

import requests
import json
import sqlite3
from datetime import datetime

# Eastmoney stock-list API endpoint. The `fields` query parameter selects
# which per-stock attributes are returned (e.g. f12=code, f14=name,
# f2=price — see the field mapping noted later in this post); pn/pz are
# page number/page size.
url = "https://push2.eastmoney.com/api/qt/clist/get?np=1&fltt=1&invt=2&fs=m%3A0%2Bt%3A6%2Bf%3A!2%2Cm%3A0%2Bt%3A80%2Bf%3A!2%2Cm%3A1%2Bt%3A2%2Bf%3A!2%2Cm%3A1%2Bt%3A23%2Bf%3A!2%2Cm%3A0%2Bt%3A81%2Bs%3A262144%2Bf%3A!2&fields=f12,f14,f2,f3,f4,f5,f6,f7,f8,f10,f15,f16,f17,f18&fid=f3&pn=1&pz=50&po=1&ut=fa5fd1943c7b386f172d6893dbfba10b&_=1761721260894"

# Open (or create) the local SQLite database file.
conn = sqlite3.connect('stock_data.db')
cursor = conn.cursor()

# Create the stock table if it does not exist yet; `id` auto-increments,
# every other column holds one market metric per snapshot row.
cursor.execute('''
CREATE TABLE IF NOT EXISTS stocks (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    stock_code TEXT,
    stock_name TEXT,
    current_price REAL,
    change_percent REAL,
    change_amount REAL,
    volume REAL,
    amount REAL,
    amplitude REAL,
    high_price REAL,
    low_price REAL,
    open_price REAL,
    prev_close REAL,
    volume_ratio REAL,
    turnover_rate REAL,
    update_time TEXT
)
''')

def to_float(value):
    """Best-effort float conversion; returns 0.0 for None or invalid input."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return 0.0

try:
    # timeout keeps the script from hanging forever on a dead connection
    response = requests.get(url, timeout=10)
    data = response.json()
    stocks = data['data']['diff']

    print("序号  代码      名称      最新价   涨跌幅   涨跌额   成交量(万)  成交额(亿)  振幅   最高    最低    今开    昨收    量比   换手率")
    print("-" * 140)

    # Show and store at most the first 10 stocks.
    for i in range(min(10, len(stocks))):
        s = stocks[i]

        code = str(s.get('f12', ''))
        name = str(s.get('f14', ''))
        # The API returns scaled integers; divide to recover real units
        # (prices /100 to yuan, volume /10000 to 万手, amount /1e8 to 亿元).
        price = to_float(s.get('f2')) / 100
        change_pct = to_float(s.get('f3')) / 100
        change_amt = to_float(s.get('f4')) / 100
        volume = to_float(s.get('f5')) / 10000
        amount = to_float(s.get('f6')) / 100000000
        amplitude = to_float(s.get('f7')) / 100
        high = to_float(s.get('f15')) / 100
        low = to_float(s.get('f16')) / 100
        open_p = to_float(s.get('f17')) / 100
        prev_close = to_float(s.get('f18')) / 100
        vol_ratio = to_float(s.get('f10')) / 100
        turnover = to_float(s.get('f8')) / 100

        # Fixed-width console row matching the header above.
        print(f"{i+1:2d}    {code:6}    {name:6}    {price:6.2f}    {change_pct:5.2f}%   {change_amt:5.2f}    "
              f"{volume:7.1f}    {amount:7.2f}    {amplitude:4.1f}%   {high:6.2f}   {low:6.2f}   {open_p:6.2f}   "
              f"{prev_close:6.2f}   {vol_ratio:4.2f}   {turnover:5.1f}%")

        # Persist this stock snapshot.
        cursor.execute('''
        INSERT INTO stocks (
            stock_code, stock_name, current_price, change_percent, change_amount,
            volume, amount, amplitude, high_price, low_price, open_price, 
            prev_close, volume_ratio, turnover_rate, update_time
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        ''', (
            code, name, price, change_pct, change_amt,
            volume, amount, amplitude, high, low, open_p,
            prev_close, vol_ratio, turnover, datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        ))

    conn.commit()
    print(f"\n成功保存 {min(10, len(stocks))} 条股票数据到数据库")

    # Dump everything currently stored in the database for verification.
    print("\n" + "="*140)
    print("数据库中的完整股票数据:")
    print("="*140)

    cursor.execute('''
    SELECT stock_code, stock_name, current_price, change_percent, change_amount, 
           volume, amount, amplitude, high_price, low_price, open_price, 
           prev_close, volume_ratio, turnover_rate 
    FROM stocks
    ''')
    db_data = cursor.fetchall()

    print("代码      名称      最新价   涨跌幅   涨跌额   成交量(万)  成交额(亿)  振幅   最高    最低    今开    昨收    量比   换手率")
    print("-" * 140)
    for i, row in enumerate(db_data, 1):
        (code, name, price, change_pct, change_amt, volume, amount, 
         amplitude, high, low, open_p, prev_close, vol_ratio, turnover) = row

        print(f"{i:2d}    {code:6}    {name:6}    {price:6.2f}    {change_pct:5.2f}%   {change_amt:5.2f}    "
              f"{volume:7.1f}    {amount:7.2f}    {amplitude:4.1f}%   {high:6.2f}   {low:6.2f}   {open_p:6.2f}   "
              f"{prev_close:6.2f}   {vol_ratio:4.2f}   {turnover:5.1f}%")

except Exception as e:
    print(f"错误: {e}")
    import traceback
    traceback.print_exc()
finally:
    # Always release the database handles, even on failure.
    cursor.close()
    conn.close()
    print("\n数据库连接已关闭")

运行结果

image
保存到数据库之中
image

实验过程

找到所要数据位置,发现所需要的数据和每一个标签对应,例如f2对应价格,提取出的数据还需做数据处理,包括百分比转换等等
image

心得体会

我学会了使用Python的格式化字符串来控制列宽和精度,比如通过:6、:7.2f等格式说明符让数据整齐排列。在处理数据类型转换时,我专门编写了安全转换函数来解决字符串转数字的难题。同时注意单位换算,将原始数据中的价格除以100,成交量转换为万手,成交额转换为亿元。这些是让数据从杂乱无章变得清晰可读的关键

问题3

   要求:爬取中国大学2021主榜(https://www.shanghairanking.cn/rankings/bcur/2021)所有院校信息,并存储在数据库中,同时将浏览器F12调试分析的过程录制Gif加入至博客中。
   输出信息:Gitee文件夹链接
排名 学校名称 省市 学校类型 总分
1 清华大学 北京 综合 852.5
2

代码及运行结果

码云地址
https://gitee.com/chenming333/chenmingproject/blob/master/作业2/3.py

代码

import requests
import sqlite3

# 连接数据库
# Open (or create) the local SQLite database.
conn = sqlite3.connect('school.db')
cur = conn.cursor()

# Create the table that stores one row per university.
cur.execute('''
CREATE TABLE IF NOT EXISTS schools (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    rank INTEGER,
    name TEXT,
    province TEXT,
    type TEXT,
    score REAL
)
''')

# ShanghaiRanking public API for the 2021 main ranking (bcur_type=11).
url = "https://www.shanghairanking.cn/api/pub/v1/bcur?bcur_type=11&year=2021"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
}

# timeout prevents the script from hanging on network problems
response = requests.get(url, headers=headers, timeout=10)
data = response.json()
schools = data['data']['rankings']

# Insert every university record into the database.
for school in schools:
    cur.execute('''
        INSERT INTO schools (rank, name, province, type, score)
        VALUES (?, ?, ?, ?, ?)
    ''', (
        school['ranking'],
        school['univNameCn'],
        school['province'],
        school['univCategory'],
        school['score']
    ))

conn.commit()

# Display the stored results ordered by rank.
print("排名  学校名称        地区    类型    总分")
print("-" * 50)

cur.execute("SELECT * FROM schools ORDER BY rank")
results = cur.fetchall()

for row in results:
    # `school_type` instead of `type`, which would shadow the builtin.
    rank, name, province, school_type, score = row[1], row[2], row[3], row[4], row[5]
    print(f"{rank:<4}  {name:<12}  {province:<6}  {school_type:<6}  {score:.1f}")

cur.close()
conn.close()

image
image

实验过程

13406916044058618
找到我们所需的所有数据在payload.js之下,查看所有数据

import requests
import json

# Static Nuxt payload file that bundles the full ranking data for the
# page (located via the browser F12 network panel).
url = "https://www.shanghairanking.cn/_nuxt/static/1761118404/rankings/bcur/2020/payload.js"

# Fetch the raw payload and print it for manual inspection.
response = requests.get(url)
data = response.text
print(data)

查看数据

image
image

心得体会

这次任务是要爬取所有大学数据,之前是只做一个页面的,通过检查页面源代码,找到了所有数据存在哪里,解决了问题

posted @ 2025-11-06 23:39  chenming333  阅读(4)  评论(0)    收藏  举报