102302126李坤铭第二次作业

作业①:
– 要求:在中国气象网(http://www.weather.com.cn)给定城市集的7日天气预报,并保存在数据库。

1)代码内容:

点击查看代码
import sqlite3
import urllib.request
from bs4 import BeautifulSoup
from datetime import datetime


def initialize_database():
    """初始化天气数据库表"""
    with sqlite3.connect("beijing_weather.db") as db_connection:
        db_connection.execute('''
            CREATE TABLE IF NOT EXISTS weather_records (
                record_date TEXT PRIMARY KEY,
                weather_condition TEXT,
                temperature_range TEXT
            )
        ''')


def fetch_weather_info():
    """从网络获取北京天气信息"""
    weather_url = "http://www.weather.com.cn/weather/101010100.shtml"

    try:
        # 发起HTTP请求获取网页内容
        with urllib.request.urlopen(weather_url, timeout=10) as response:
            html_content = BeautifulSoup(response.read(), "html.parser")

        daily_weather = []
        # 解析未来7天天气预报
        for day_element in html_content.select("ul.t.clearfix li")[:7]:
            date_info = day_element.select_one('h1').get_text(strip=True)
            condition = day_element.select_one('p.wea').get_text(strip=True)

            # 处理温度数据
            high_temp_elem = day_element.select_one('p.tem span')
            low_temp_elem = day_element.select_one('p.tem i')
            
            if high_temp_elem:
                temperature = f"{high_temp_elem.text}/{low_temp_elem.text.replace('℃', '')}℃"
            else:
                temperature = low_temp_elem.text

            daily_weather.append((date_info, condition, temperature))
            print(f"{date_info}: {condition} {temperature}")

        return daily_weather
    except Exception as error:
        print(f"天气数据获取失败: {error}")
        return None


def store_weather_data(weather_records):
    """将天气数据存入数据库"""
    with sqlite3.connect("beijing_weather.db") as db_connection:
        for record in weather_records:
            db_connection.execute(
                "INSERT OR REPLACE INTO weather_records VALUES (?, ?, ?)",
                record
            )


def display_weather_forecast():
    """展示数据库中的天气预报"""
    with sqlite3.connect("beijing_weather.db") as db_connection:
        weather_entries = db_connection.execute(
            "SELECT * FROM weather_records ORDER BY record_date"
        ).fetchall()

    print(f"\n{'日期':<12}{'天气状况':<12}{'温度范围':<10}")
    print("-" * 35)
    for entry in weather_entries:
        print(f"{entry[0]:<12}{entry[1]:<12}{entry[2]:<10}")


if __name__ == "__main__":
    print("=== 北京天气数据采集程序 ===")
    initialize_database()

    weather_records = fetch_weather_info()
    if weather_records:
        store_weather_data(weather_records)
        display_weather_forecast()
输出结果:

95acb8c3-4602-4a00-af13-a2d48e65a8bc
2)心得体会:
要注意数据库里的内容,需在代码运行时对原有的存储数据进行删除。
作业②:
– 要求:用requests和BeautifulSoup库方法定向爬取股票相关信息,并存储在数据库中。
– 网站:东方财富网:https://www.eastmoney.com/
– 技巧:在谷歌浏览器中进入F12调试模式进行抓包,查找股票列表加载使用的url,并分析api返回的值,并根据所要求的参数可适当更改api的请求参数。根据URL可观察请求的参数f1、f2可获取不同的数值,根据情况可删减请求的参数。

1)代码内容:

点击查看代码
import requests
import pandas as pd
import json
import sqlite3
from datetime import datetime


def fetch_stock_market_data():
    """从东方财富API获取股票市场数据"""
    api_url = "https://push2.eastmoney.com/api/qt/clist/get"
    query_params = {
        'cb': 'jQuery371048325911427590795_1761723060607',
        'fid': 'f3', 'po': '1', 'pz': '20', 'pn': '1', 'np': '1',
        'fltt': '2', 'invt': '2', 'ut': 'fa5fd1943c7b386f172d6893dbfba10b',
        'fs': 'm:0 t:6,m:0 t:80,m:1 t:2,m:1 t:23',
        'fields': 'f12,f13,f14,f1,f2,f4,f3,f152,f5,f6,f7,f15,f18,f16,f17,f10,f8,f9,f23',
        '_': '1761723060609'
    }

    try:
        response = requests.get(api_url, params=query_params, timeout=10)
        response.raise_for_status()
        
        # 解析JSONP格式的响应
        raw_data = response.text
        json_content = raw_data[raw_data.find('(')+1 : raw_data.rfind(')')]
        api_response = json.loads(json_content)

        # 验证响应数据
        if not api_response or api_response.get('rc') != 0 or not api_response.get('data'):
            print("警告: API返回了空数据或错误代码")
            return pd.DataFrame()

        # 处理股票数据
        stock_list = []
        stock_items = api_response['data']['diff']
        if isinstance(stock_items, dict):  # 兼容不同格式的响应
            stock_items = stock_items.values()

        for stock in stock_items:
            stock_info = {
                '代码': stock.get('f12', ''),
                '名称': stock.get('f14', ''),
                '最新价': round(float(stock.get('f2', 0)), 2),
                '涨跌幅': f"{float(stock.get('f3', 0)):.2f}%",
                '涨跌额': round(float(stock.get('f4', 0)), 2),
                '成交量': int(stock.get('f5', 0)),
                '成交额': float(stock.get('f6', 0)),
                '振幅': f"{float(stock.get('f7', 0)):.2f}%",
                '最高价': round(float(stock.get('f15', 0)), 2),
                '最低价': round(float(stock.get('f16', 0)), 2),
                '开盘价': round(float(stock.get('f17', 0)), 2),
                '昨日收盘': round(float(stock.get('f18', 0)), 2)
            }
            stock_list.append(stock_info)

        return pd.DataFrame(stock_list)

    except requests.exceptions.RequestException as req_err:
        print(f"网络请求失败: {req_err}")
    except json.JSONDecodeError as json_err:
        print(f"JSON解析错误: {json_err}")
    except Exception as err:
        print(f"数据处理异常: {err}")
    
    return pd.DataFrame()


def store_to_database(stock_df, database="stock_market.db"):
    """将股票数据存储到SQLite数据库"""
    if stock_df.empty:
        print("警告: 尝试存储空数据到数据库")
        return False

    try:
        with sqlite3.connect(database) as conn:
            cursor = conn.cursor()
            
            # 创建表(如果不存在)
            cursor.execute('''
                CREATE TABLE IF NOT EXISTS stock_quotes (
                    record_id INTEGER PRIMARY KEY AUTOINCREMENT,
                    stock_code TEXT, 
                    stock_name TEXT, 
                    current_price REAL, 
                    price_change_pct TEXT, 
                    price_change REAL, 
                    volume INTEGER, 
                    turnover REAL, 
                    amplitude_pct TEXT,
                    high_price REAL, 
                    low_price REAL, 
                    open_price REAL, 
                    prev_close REAL,
                    update_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )
            ''')

            # 批量插入数据(使用executemany提高效率)
            records = [tuple(row) for _, row in stock_df.iterrows()]
            cursor.executemany('''
                INSERT INTO stock_quotes 
                (stock_code, stock_name, current_price, price_change_pct, 
                 price_change, volume, turnover, amplitude_pct, 
                 high_price, low_price, open_price, prev_close)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ''', records)
            
            conn.commit()
        return True

    except sqlite3.Error as db_err:
        print(f"数据库操作失败: {db_err}")
        return False


def format_volume(volume):
    """格式化成交量显示"""
    if volume > 100000000:  # 超过1亿
        return f"{volume/100000000:.2f}亿手"
    elif volume > 10000:  # 超过1万
        return f"{volume/10000:.2f}万手"
    return f"{volume}手"


def format_turnover(amount):
    """格式化成交额显示"""
    if amount > 100000000:  # 超过1亿
        return f"{amount/100000000:.2f}亿元"
    return f"{amount/10000:.2f}万元"


def display_stock_table():
    """显示格式化后的股票数据表"""
    stock_data = fetch_stock_market_data()
    
    if stock_data.empty:
        print("错误: 未能获取有效的股票数据")
        return

    # 创建显示用的DataFrame副本
    display_df = stock_data.copy()
    display_df.insert(0, '序号', range(1, len(display_df)+1))
    
    # 应用格式化
    display_df['成交量'] = display_df['成交量'].apply(format_volume)
    display_df['成交额'] = display_df['成交额'].apply(format_turnover)
    
    # 配置显示选项
    pd.set_option('display.unicode.east_asian_width', True)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', 120)

    # 打印表格
    print("\n实时股票行情")
    print("="*120)
    print(display_df.to_string(index=False))
    print("="*120)
    print(f"数据更新时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"股票总数: {len(display_df)}")

    # 存储到数据库
    if store_to_database(stock_data):
        print("状态: 数据已成功保存到数据库")
    else:
        print("警告: 数据保存失败")


if __name__ == "__main__":
    print("=== 股票数据采集系统 ===")
    print(f"启动时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
    display_stock_table()
输出结果:

03a6bb0f-1a31-4bb9-807c-9007d99688b4
2)心得体会:
可以通过调取api的方式爬取网页中的数据。
作业③:

– 要求:爬取中国大学2021主榜(https://www.shanghairanking.cn/rankings/bcur/2021 )所有院校信息,并存储在数据库中,同时将浏览器F12调试分析的过程录制Gif加入至博客中。
– 技巧:分析该网站的发包情况,分析获取数据的api

1)代码内容:

点击查看代码
import requests
import sqlite3
import os

class UniversityRankingFetcher:
    def __init__(self, ranking_type=11, target_year=2021):
        self.api_endpoint = f"https://www.shanghairanking.cn/api/pub/v1/bcur?bcur_type={ranking_type}&year={target_year}"
        self.request_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Referer': 'https://www.shanghairanking.cn/',
        }
        self.database_file = f"university_rankings_{target_year}.db"
        self.data_table = "university_rankings"

    def fetch_api_data(self):
        """从API接口获取大学排名数据"""
        try:
            response = requests.get(self.api_endpoint, headers=self.request_headers, timeout=10)
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            print(f"请求API时出错: {e}")
            return None

    def store_to_db(self, api_data):
        """将API数据保存到SQLite数据库"""
        if not api_data or 'data' not in api_data or 'rankings' not in api_data['data']:
            print("警告: 返回数据格式不符合预期")
            return False

        university_list = api_data['data']['rankings']

        # 删除已存在的数据库文件
        if os.path.exists(self.database_file):
            os.remove(self.database_file)

        connection = None
        try:
            connection = sqlite3.connect(self.database_file)
            cursor = connection.cursor()
            print(f"已创建数据库连接: {self.database_file}")

            # 创建数据表
            cursor.execute(f'''
            CREATE TABLE IF NOT EXISTS {self.data_table} (
                rank_position INTEGER,
                university_name TEXT,
                region TEXT,
                category TEXT,
                total_score REAL
            )
            ''')
            print(f"数据表 {self.data_table} 创建成功")

            # 准备批量插入的数据
            records = [
                (
                    item['rankOverall'],
                    item['univNameCn'],
                    item['province'],
                    item['univCategory'],
                    item['score']
                )
                for item in university_list
            ]

            # 执行批量插入
            cursor.executemany(
                f'''INSERT INTO {self.data_table} 
                (rank_position, university_name, region, category, total_score) 
                VALUES (?, ?, ?, ?, ?)''',
                records
            )

            connection.commit()
            print(f"成功保存 {len(records)} 条大学排名记录")
            return True

        except sqlite3.Error as db_error:
            print(f"数据库操作异常: {db_error}")
            return False
        finally:
            if connection:
                connection.close()

    def display_from_db(self):
        """从数据库读取并显示排名数据"""
        if not os.path.exists(self.database_file):
            print(f"错误: 数据库文件 {self.database_file} 不存在")
            return

        try:
            with sqlite3.connect(self.database_file) as conn:
                cursor = conn.cursor()
                cursor.execute(f'''
                SELECT rank_position, university_name, region, category, total_score 
                FROM {self.data_table} 
                ORDER BY rank_position
                ''')
                
                print(f"\n--- {self.api_endpoint.split('year=')[1]}年大学排名 ---")
                print("{:<6}{:<20}{:<10}{:<10}{:<8}".format(
                    "排名", "学校名称", "地区", "类型", "总分"))
                print("-" * 60)

                for record in cursor.fetchall():
                    rank, name, region, category, score = record
                    score_str = f"{score:.2f}" if score is not None else "N/A"
                    print(f"{rank:<6}{name[:18]:<20}{region:<10}{category:<10}{score_str:<8}")

        except sqlite3.Error as e:
            print(f"读取数据库时发生错误: {e}")

    def execute(self):
        """执行完整流程: 获取数据 -> 存储数据库 -> 显示结果"""
        raw_data = self.fetch_api_data()
        if raw_data:
            if self.store_to_db(raw_data):
                self.display_from_db()

if __name__ == "__main__":
    fetcher = UniversityRankingFetcher()
    fetcher.execute()
输出结果:

6be84200-e205-4cd0-bfce-3a8ba42fbbc8
2)心得体会:
静态页面无法进行完整爬取,可借由js文件爬取内容。

posted @ 2026-01-04 03:17  李坤铭  阅读(2)  评论(0)    收藏  举报