102302114_比山布·努尔兰_Assignment 2

Task 1

Crawl the 7-day weather forecasts for a given set of cities from the China Weather site (http://www.weather.com.cn) and save them to a database.

Code

import requests
import pymysql
from bs4 import BeautifulSoup
from datetime import datetime

# Shared database configuration
DB_CONFIG = {
    "host": "localhost",
    "port": 3306,
    "user": "root",
    "password": "123456",
    "database": "spider_db",
    "charset": "utf8mb4"
}


class WeatherCrawler:
    # Initialize crawler configuration (request headers and city codes)
    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"}
        self.city_code = {"北京": "101010100", "上海": "101020100", "广州": "101280101", "深圳": "101280601"}

    # Create the weather table if it does not exist
    def init_weather_table(self):
        conn = pymysql.connect(**DB_CONFIG)
        cursor = conn.cursor()
        create_sql = '''
        CREATE TABLE IF NOT EXISTS weather_forecast (
            id INT AUTO_INCREMENT PRIMARY KEY COMMENT 'primary key',
            city_name VARCHAR(16) COMMENT 'city name',
            forecast_date VARCHAR(16) COMMENT 'forecast date',
            weather_info VARCHAR(64) COMMENT 'weather condition',
            temperature VARCHAR(32) COMMENT 'temperature range',
            crawl_time DATETIME DEFAULT CURRENT_TIMESTAMP COMMENT 'crawl time',
            UNIQUE KEY uk_city_date (city_name, forecast_date)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
        '''
        cursor.execute(create_sql)
        conn.commit()
        cursor.close()
        conn.close()

    # Crawl the 7-day forecast for a single city
    def crawl_city_weather(self, city):
        if city not in self.city_code:
            print(f"{city} 无对应编码,跳过")
            return
        url = f"http://www.weather.com.cn/weather/{self.city_code[city]}.shtml"
        try:
            resp = requests.get(url, headers=self.headers, timeout=15)
            resp.encoding = "utf-8"
            soup = BeautifulSoup(resp.text, "html.parser")
            # Locate the 7-day forecast list (one <li> per day)
            weather_list = soup.select("ul.t.clearfix li")[:7]

            conn = pymysql.connect(**DB_CONFIG)
            cursor = conn.cursor()
            for item in weather_list:
                date = item.find("h1").text.strip() if item.find("h1") else ""
                weather = item.find("p", class_="wea").text.strip() if item.find("p", class_="wea") else ""
                # Handle varying temperature markup: the current day's entry may
                # omit the <span> high temperature and carry only the <i> low
                temp_p = item.find("p", class_="tem")
                temp_span = temp_p.find("span") if temp_p else None
                temp_i = temp_p.find("i") if temp_p else None
                temp = f"{temp_span.text.strip()}/{temp_i.text.strip()}" if temp_span and temp_i \
                    else (temp_i.text.strip() if temp_i else "")

                if date and weather:
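                    # Upsert: the uk_city_date unique key lets re-runs refresh a
                    # day's row instead of inserting a duplicate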
                    insert_sql = '''
                    INSERT INTO weather_forecast (city_name, forecast_date, weather_info, temperature)
                    VALUES (%s, %s, %s, %s) ON DUPLICATE KEY UPDATE
                    weather_info=VALUES(weather_info), temperature=VALUES(temperature);
                    '''
                    cursor.execute(insert_sql, (city, date, weather, temp))
                    print(f"{city} {date}:{weather} {temp}")
            conn.commit()
            cursor.close()
            conn.close()
        except Exception as e:
            print(f"{city} 爬取失败:{e}")

    # Process a batch of cities, then verify what was stored
    def batch_process(self, cities):
        self.init_weather_table()
        for city in cities:
            self.crawl_city_weather(city)
        # Verify stored data
        conn = pymysql.connect(**DB_CONFIG)
        cursor = conn.cursor()
        cursor.execute("SELECT city_name, forecast_date, weather_info, temperature FROM weather_forecast LIMIT 10")
        print("\n爬取结果验证(前10条):")
        print("%-16s%-16s%-32s%-16s" % ("城市", "日期", "天气", "温度"))
        for row in cursor.fetchall():
            print("%-16s%-16s%-32s%-16s" % (row[0], row[1], row[2], row[3]))
        cursor.close()
        conn.close()


if __name__ == "__main__":
    crawler = WeatherCrawler()
    crawler.batch_process(["北京", "上海", "广州", "深圳"])
    print("\n天气数据爬取并入库完成")

Run results

(run result screenshots)

Reflections

Through this exercise I learned how to parse web pages, store the crawled data in a local MySQL database, and handle weather data for multiple cities.

Task 2

Requirements:
– Use the requests and BeautifulSoup libraries to crawl stock information and store it in a database.
– Candidate sites: Eastmoney: https://www.eastmoney.com/
Sina Finance: http://finance.sina.com.cn/stock/
– Tip: open Chrome's F12 developer tools and capture the network traffic to find the URL that loads the stock list, then analyze the values the API returns. Request parameters such as f1 and f2 select different fields, so they can be changed or dropped as needed (see the sketch below).
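As an illustration of trimming the request, here is a minimal sketch. It assumes the endpoint answers a reduced query without the ut token, and that pn, pz, fid, fs and fields act as page number, page size, sort field, market filter and requested f-columns; those meanings are inferred from the captured URL used in the code below, not from documented API behavior.

import requests

# Request only stock code (f12), name (f14) and latest price (f2) for 20 rows
params = {
    "pn": 1,                  # page number (inferred)
    "pz": 20,                 # page size (inferred)
    "po": 1, "np": 1,
    "fltt": 2,                # decimals instead of the x100 integers fltt=1 returns
    "fid": "f3",              # sort by f3 (change percent)
    "fs": "m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23",  # market filter (inferred)
    "fields": "f2,f12,f14",
}
resp = requests.get("https://push2.eastmoney.com/api/qt/clist/get", params=params, timeout=15)
data = resp.json().get("data") or {}
for row in data.get("diff", []):
    print(row["f12"], row["f14"], row["f2"])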

Code

import requests
import pymysql
from datetime import datetime

# Database configuration
DB_CONFIG = {
    "host": "localhost",
    "port": 3306,
    "user": "root",
    "password": "123456",
    "database": "homework2_2",
    "charset": "utf8mb4"
}

# Eastmoney stock list API endpoint (captured via F12)
STOCK_API_URL = "https://push2.eastmoney.com/api/qt/clist/get?np=1&fltt=1&invt=2&fs=m%3A0%2Bt%3A6%2Bf%3A!2%2Cm%3A0%2Bt%3A80%2Bf%3A!2%2Cm%3A1%2Bt%3A2%2Bf%3A!2%2Cm%3A1%2Bt%3A23%2Bf%3A!2%2Cm%3A0%2Bt%3A81%2Bs%3A262144%2Bf%3A!2&fields=f12,f14,f2,f3,f4,f5,f6,f7,f8,f10,f15,f16,f17,f18&fid=f3&pn=1&pz=50&po=1&ut=fa5fd1943c7b386f172d6893dbfba10b&_=1761721260894"
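# Field meanings, as mapped to table columns by the insert below: f12 code,
# f14 name, f2 latest price, f3 change %, f4 change amount, f5 volume,
# f6 amount, f7 amplitude, f8 turnover rate, f10 volume ratio, f15 high,
# f16 low, f17 open, f18 previous close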


def init_stock_table():
    # Create the stock table if it does not exist
    conn = pymysql.connect(**DB_CONFIG)
    cursor = conn.cursor()
    create_sql = '''
    CREATE TABLE IF NOT EXISTS eastmoney_stocks (
        id INT AUTO_INCREMENT PRIMARY KEY COMMENT 'primary key',
        stock_code VARCHAR(20) COMMENT 'stock code',
        stock_name VARCHAR(50) COMMENT 'stock name',
        current_price FLOAT COMMENT 'latest price',
        change_percent FLOAT COMMENT 'change percent (%)',
        change_amount FLOAT COMMENT 'change amount',
        volume FLOAT COMMENT 'volume (10k lots)',
        amount FLOAT COMMENT 'amount (100M CNY)',
        amplitude FLOAT COMMENT 'amplitude (%)',
        high_price FLOAT COMMENT 'daily high',
        low_price FLOAT COMMENT 'daily low',
        open_price FLOAT COMMENT 'opening price',
        prev_close FLOAT COMMENT 'previous close',
        volume_ratio FLOAT COMMENT 'volume ratio',
        turnover_rate FLOAT COMMENT 'turnover rate (%)',
        update_time DATETIME COMMENT 'update time'
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
    '''
    cursor.execute(create_sql)
    conn.commit()
    cursor.close()
    conn.close()


# Convert an API value to float; the captured URL uses fltt=1, under which
# price-like fields arrive as integers scaled by 100, hence the division.
# Empty or missing values become 0.0
def format_value(val):
    try:
        return float(val) / 100 if val else 0.0
    except (TypeError, ValueError):
        return 0.0


if __name__ == "__main__":
    # Create the data table
    init_stock_table()
    conn = pymysql.connect(**DB_CONFIG)
    cursor = conn.cursor()

    try:
        # Fetch stock data from the API
        response = requests.get(STOCK_API_URL, timeout=15)
        data = response.json()
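        # The JSON response is shaped like {"data": {"diff": [ {...}, ... ]}}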
        stock_list = data['data']['diff'][:10]  # keep the first 10 records

        # Insert the rows into the database
        for stock in stock_list:
            insert_sql = '''
            INSERT INTO eastmoney_stocks (
                stock_code, stock_name, current_price, change_percent, change_amount,
                volume, amount, amplitude, high_price, low_price, open_price,
                prev_close, volume_ratio, turnover_rate, update_time
            ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            '''
            # Build the insert parameters (map f-fields to table columns)
            params = (
                stock.get('f12', ''), stock.get('f14', ''),
                format_value(stock.get('f2')), format_value(stock.get('f3')),
                format_value(stock.get('f4')), format_value(stock.get('f5')) / 10000,
                format_value(stock.get('f6')) / 100000000, format_value(stock.get('f7')),
                format_value(stock.get('f15')), format_value(stock.get('f16')),
                format_value(stock.get('f17')), format_value(stock.get('f18')),
                format_value(stock.get('f10')), format_value(stock.get('f8')),
                datetime.now()
            )
            cursor.execute(insert_sql, params)

        conn.commit()
        print(f"成功插入{len(stock_list)}条股票数据")

        # Verify stored data
        cursor.execute("SELECT stock_code, stock_name, current_price FROM eastmoney_stocks LIMIT 5")
        print("\n数据库验证(前5条):")
        for row in cursor.fetchall():
            print(f"代码:{row[0]} | 名称:{row[1]} | 最新价:{row[2]:.2f}")

    except Exception as e:
        print(f"执行错误:{e}")
        conn.rollback()
    finally:
        # Close the database connection
        cursor.close()
        conn.close()
        print("\n数据库连接已关闭")

Run results

(run result screenshots)

Reflections

Through this exercise I learned how to crawl data that a page loads dynamically through an API.

Task 3

Requirements: crawl the information of every university on the 2021 main ranking of Chinese universities (https://www.shanghairanking.cn/rankings/bcur/2021) and store it in a database; also record the F12 debugging and analysis process as a GIF and include it in the blog post.
Tip: analyze how the site issues its requests and identify the API that returns the data (a probe sketch follows below).
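For reference, a minimal probe of the endpoint found in the Network tab might look like the sketch below; the response shape data.rankings and the record keys are taken from the code that follows, not from any published API documentation.

import requests

# Fetch the captured endpoint and inspect the shape of one record
url = "https://www.shanghairanking.cn/api/pub/v1/bcur?bcur_type=11&year=2021"
headers = {"User-Agent": "Mozilla/5.0"}
data = requests.get(url, headers=headers, timeout=15).json()
rankings = data["data"]["rankings"]
print(len(rankings), "records")
print(sorted(rankings[0].keys()))  # expect keys such as ranking, univNameCn, province, univCategory, score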

Code

import requests
import pymysql
from datetime import datetime

# Shared database configuration
DB_CONFIG = {
    "host": "localhost",
    "port": 3306,
    "user": "root",
    "password": "123456",
    "database": "spider_db",
    "charset": "utf8mb4"
}

# ShanghaiRanking 2021 BCUR API endpoint (captured via F12)
UNIV_API_URL = "https://www.shanghairanking.cn/api/pub/v1/bcur?bcur_type=11&year=2021"
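# Query parameters: year=2021 selects the 2021 edition; bcur_type=11 is the
# value the main-ranking page was observed to send in the F12 capture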


def init_univ_table():
    # Create the ranking table if it does not exist
    conn = pymysql.connect(**DB_CONFIG)
    cursor = conn.cursor()
    create_sql = '''
    CREATE TABLE IF NOT EXISTS shanghairanking_2021 (
        id INT AUTO_INCREMENT PRIMARY KEY COMMENT 'primary key',
        univ_rank INT COMMENT 'university rank',
        univ_name VARCHAR(100) COMMENT 'university name',
        province VARCHAR(50) COMMENT 'province',
        univ_type VARCHAR(20) COMMENT 'university category',
        total_score FLOAT COMMENT 'total score',
        crawl_time DATETIME DEFAULT CURRENT_TIMESTAMP COMMENT 'crawl time'
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
    '''
    cursor.execute(create_sql)
    conn.commit()
    cursor.close()
    conn.close()


# Normalize a rank value: the API may return an int, a digit string, or a
# placeholder such as '未公布' (not published), which becomes None
def format_data(val):
    try:
        return int(val)
    except (TypeError, ValueError):
        return None


if __name__ == "__main__":
    # Create the data table
    init_univ_table()
    conn = pymysql.connect(**DB_CONFIG)
    cursor = conn.cursor()

    try:
        # Request the university ranking data
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
        response = requests.get(UNIV_API_URL, headers=headers, timeout=15)
        data = response.json()
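        # The JSON response is shaped like {"data": {"rankings": [ {...}, ... ]}}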
        univ_list = data['data']['rankings']

        # Insert the rows into the database
        for univ in univ_list:
            insert_sql = '''
            INSERT INTO shanghairanking_2021 (
                univ_rank, univ_name, province, univ_type, total_score
            ) VALUES (%s, %s, %s, %s, %s)
            '''
            # Build the insert parameters
            params = (
                format_data(univ.get('ranking', '')),
                univ.get('univNameCn', ''),
                univ.get('province', ''),
                univ.get('univCategory', ''),
                float(univ.get('score', 0)) if univ.get('score') else None
            )
            cursor.execute(insert_sql, params)

        conn.commit()
        print(f"成功插入{len(univ_list)}条大学排名数据")

        # Verify stored data (print the first 10 rows)
        cursor.execute(
            "SELECT univ_rank, univ_name, province, univ_type, total_score FROM shanghairanking_2021 ORDER BY univ_rank LIMIT 10")
        print("\n2021软科大学排名(前10条):")
        print("排名  学校名称        地区    类型    总分")
        print("-" * 50)
        for row in cursor.fetchall():
            rank, name, province, type_, score = row
            score = f"{score:.1f}" if score else "未公布"
            print(f"{rank:<4}  {name:<12}  {province:<6}  {type_:<6}  {score}")

    except Exception as e:
        print(f"执行错误:{e}")
        conn.rollback()
    finally:
        # Close the database connection
        cursor.close()
        conn.close()
        print("\n数据库连接已关闭")

Run results

(run result screenshots)

Reflections

Through this exercise I learned how to use the browser's F12 developer tools to analyze a page's network requests and locate its data API.
