Assignment 2

Source code: https://gitee.com/wsmlhqqwwn/LH/tree/master/作业2

Assignment ①:

Requirement: Scrape the 7-day weather forecast for a given set of cities from the China Weather Network (http://www.weather.com.cn) and save it to a database.
1.1 Assignment code and screenshots:

import requests
from bs4 import BeautifulSoup
import sqlite3


def get_beijing_weather():
    """Scrape the 7-day forecast for Beijing from weather.com.cn."""
    beijing_code = '101010100'  # weather.com.cn city code for Beijing
    url = f"http://www.weather.com.cn/weather/{beijing_code}.shtml"

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    try:
        response = requests.get(url, headers=headers)
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')

        weather_data = []
        # the 7-day forecast is rendered as <ul class="t clearfix"> with one <li> per day
        weather_list = soup.find('ul', class_='t clearfix')

        if not weather_list:
            print("Weather list not found")
            return []

        days = weather_list.find_all('li')

        for day in days:
            try:
                date_tag = day.find('h1')
                weather_tag = day.find('p', class_='wea')
                temp_tag = day.find('p', class_='tem')
                wind_tag = day.find('p', class_='win')

                if not all([date_tag, weather_tag, temp_tag, wind_tag]):
                    continue

                date = date_tag.text
                weather = weather_tag.text

                # high temperature is in <span>, low temperature in <i>
                high_temp = temp_tag.find('span')
                low_temp = temp_tag.find('i')
                temperature = ""
                if high_temp:
                    temperature += high_temp.text
                if low_temp:
                    temperature += "/" + low_temp.text

                wind = wind_tag.find('i')
                wind_text = wind.text if wind else ""

                weather_data.append({
                    'date': date,
                    'weather': weather,
                    'temperature': temperature,
                    'wind': wind_text
                })

            except Exception as e:
                print(f"Error parsing one day's entry: {e}")
                continue

        return weather_data

    except Exception as e:
        print(f"Request failed: {e}")
        return []


def create_db():
    conn = sqlite3.connect('weather.db')
    cursor = conn.cursor()
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS weather (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            city TEXT,
            date TEXT,
            weather TEXT,
            temp TEXT,
            wind TEXT
        )
    ''')
    conn.commit()
    conn.close()


def save_to_db(data):
    conn = sqlite3.connect('weather.db')
    cursor = conn.cursor()

    for item in data:
        cursor.execute('''
            INSERT INTO weather (city, date, weather, temp, wind)
            VALUES (?, ?, ?, ?, ?)
        ''', ('北京', item['date'], item['weather'], item['temperature'], item['wind']))

    conn.commit()
    conn.close()
    print(f"成功保存{len(data)}条数据")


def main():
    create_db()
    weather_data = get_beijing_weather()

    if weather_data:
        save_to_db(weather_data)
        print("北京7日天气:")
        for item in weather_data:
            print(f"{item['date']} {item['weather']} {item['temperature']} {item['wind']}")
    else:
        print("没拿到数据")


if __name__ == "__main__":
    main()
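
The requirement mentions a given set of cities, while the listing above handles only Beijing. Below is a minimal sketch of how the same logic could be parameterized over a city-code mapping; only Beijing's code is taken from the listing, the other entries are placeholders that would have to be looked up on weather.com.cn.

import requests
from bs4 import BeautifulSoup

# Only Beijing's code comes from the listing above; add real codes from the site.
CITY_CODES = {
    "北京": "101010100",
    # "上海": "...",  # placeholder - look up the real code on weather.com.cn
}

def get_city_weather(city_code):
    """Same request and parsing entry point as get_beijing_weather(), with the code as a parameter."""
    url = f"http://www.weather.com.cn/weather/{city_code}.shtml"
    resp = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
    resp.encoding = 'utf-8'
    soup = BeautifulSoup(resp.text, 'html.parser')
    weather_list = soup.find('ul', class_='t clearfix')
    return weather_list.find_all('li') if weather_list else []

def scrape_all_cities():
    for city, code in CITY_CODES.items():
        days = get_city_weather(code)
        print(city, len(days), "days found")
        # each <li> would then be parsed and saved exactly as above,
        # with save_to_db() taking the city name instead of the hard-coded '北京'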

1.2 Assignment 1: Reflections
In this assignment, scraping the China Weather Network came down to three problems: first, sending requests.get() with a User-Agent header to get past the basic anti-scraping check; then using BeautifulSoup's find()/find_all() to locate the <ul>/<li> tags that hold the daily weather data; the hardest part was parsing the temperatures, whose format is inconsistent (some entries carry a "℃" symbol and some do not), which I finally solved by extracting the plain digits with re.findall(r'\d+', text).
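
As a small illustration of that last step, here is a sketch of normalizing mixed-format temperature strings with a regular expression; the sample inputs are made up for demonstration.

import re

def parse_temps(text):
    """Extract the numeric temperatures from strings like '25℃/13℃' or '25/13'."""
    nums = re.findall(r'-?\d+', text)  # grab signed integers, ignoring any ℃ symbol
    return [int(n) for n in nums]

print(parse_temps("25℃/13℃"))  # [25, 13]
print(parse_temps("7/-2"))      # [7, -2]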

Assignment ②:

– Requirement: Use the requests and BeautifulSoup libraries to scrape stock information from a specific source and store it in a database.
– Site: Eastmoney: https://www.eastmoney.com/
– Tip: Open F12 DevTools in Chrome and capture the network traffic to find the URL that loads the stock list, then analyze the values the API returns and adjust the request parameters as needed. From the URL you can see that parameters such as f1 and f2 select different values, and unneeded parameters can be trimmed (see the sketch below).
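
Instead of carrying the whole captured query string around, the same request can be expressed with requests' params argument. This is only a sketch based on the URL captured in the listing below: treating pn/pz as page number and page size is an inference from how the URL behaves, and omitting the cb callback is assumed to make the server return plain JSON instead of JSONP.

import requests

API_URL = "https://push2delay.eastmoney.com/api/qt/clist/get"

def fetch_page(page, size=20):
    params = {
        "pn": page,   # page number (inferred)
        "pz": size,   # page size (inferred)
        "po": 1, "np": 1, "fltt": 1, "invt": 2, "fid": "f3",
        # market filter, decoded from the captured URL below
        "fs": "m:0+t:6+f:!2,m:0+t:80+f:!2,m:1+t:2+f:!2,m:1+t:23+f:!2,m:0+t:81+s:262144+f:!2",
        # only the fields the table actually uses
        "fields": "f2,f3,f4,f5,f6,f7,f12,f14,f15,f16,f17,f18",
    }
    headers = {"User-Agent": "Mozilla/5.0", "Referer": "http://quote.eastmoney.com/"}
    resp = requests.get(API_URL, params=params, headers=headers, timeout=10)
    resp.raise_for_status()
    return resp.json()["data"]["diff"]  # assumes plain JSON when cb is omitted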
2.1 Assignment code and screenshots:

import requests
import re
import json
import time


def format_volume(vol_raw):
    """Convert the raw volume (in lots) into an 'xx.xx万' string."""
    if vol_raw is None:
        return "N/A"
    return f"{vol_raw / 10000:.2f}万"


def format_turnover(turnover_raw):
    """Convert the raw turnover (in yuan) into an 'xx.xx亿' string."""
    if turnover_raw is None:
        return "N/A"
    return f"{turnover_raw / 100000000:.2f}亿"


def fetch_stock_data_from_api(page):
    """
    Fetch one page of the stock list from the Eastmoney API found via F12.
    """

    # Stock-list API URL captured from the F12 Network panel
    base_url = "https://push2delay.eastmoney.com/api/qt/clist/get?np=1&fltt=1&invt=2&cb=jQuery37106028979929363425_1761725264988&fs=m%3A0%2Bt%3A6%2Bf%3A!2%2Cm%3A0%2Bt%3A80%2Bf%3A!2%2Cm%3A1%2Bt%3A2%2Bf%3A!2%2Cm%3A1%2Bt%3A23%2Bf%3A!2%2Cm%3A0%2Bt%3A81%2Bs%3A262144%2Bf%3A!2&fields=f12%2Cf13%2Cf14%2Cf1%2Cf2%2Cf4%2Cf3%2Cf152%2Cf5%2Cf6%2Cf7%2Cf15%2Cf18%2Cf16%2Cf17%2Cf10%2Cf8%2Cf9%2Cf23&fid=f3&pn=1&pz=20&po=1&dect=1&ut=fa5fd1943c7b386f172d6893dbfba10b&wbp2u=%7C0%7C0%2C0%7Cweb&_=1761725265102"

    # Swap 'pn=1' in the captured URL for the requested page number
    request_url = base_url.replace("pn=1", f"pn={page}")

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Referer': 'http://quote.eastmoney.com/'
    }

    try:
        response = requests.get(request_url, headers=headers, timeout=10)
        response.raise_for_status()

        # Parse the JSONP response: strip the jQuery callback wrapper
        match = re.search(r'\((.*)\)', response.text)
        if not match:
            print(f"第 {page} 页 - 未能解析JSONP响应")
            return []

        json_string = match.group(1)
        data = json.loads(json_string)

        if not data.get("data") or not data["data"].get("diff"):
            print(f"第 {page} 页 - 返回的数据中没有 'data.diff' 字段")
            return []

        stock_list = data["data"]["diff"]

        processed_data = []
        for stock in stock_list:
            # (the meaning of each fN field was worked out from the F12 capture)
            data_tuple = (
                stock.get("f12"),  # 股票代码
                stock.get("f14"),  # 股票名称
                stock.get("f2") / 100.0 if stock.get("f2") is not None else 0.0,  # 最新股价
                f"{stock.get('f3') / 100.0:.2f}%" if stock.get("f3") is not None else "0.00%",  # 涨跌幅
                stock.get("f4") / 100.0 if stock.get("f4") is not None else 0.0,  # 涨跌额
                format_volume(stock.get("f5")) if stock.get("f5") is not None else "N/A",  # 成交量 (格式化)
                format_turnover(stock.get("f6")) if stock.get("f6") is not None else "N/A",  # 成交额 (格式化)
                f"{stock.get('f7') / 100.0:.2f}%" if stock.get("f7") is not None else "0.00%",  # 振幅
                stock.get("f15") / 100.0 if stock.get("f15") is not None else 0.0,  # 最高
                stock.get("f16") / 100.0 if stock.get("f16") is not None else 0.0,  # 最低
                stock.get("f17") / 100.0 if stock.get("f17") is not None else 0.0,  # 今开
                stock.get("f18") / 100.0 if stock.get("f18") is not None else 0.0  # 昨收
            )
            processed_data.append(data_tuple)

        return processed_data

    except requests.RequestException as e:
        print(f"Request failed: {e}")
        return []
    except json.JSONDecodeError:
        print(f"Page {page} - JSON parsing failed.")
        return []
    except Exception as e:
        print(f"Unexpected error while processing page {page}: {e}")
        return []


# --- Main entry point ---
if __name__ == "__main__":

    all_stocks_to_print = []
    total_pages_to_fetch = 5  # change this to fetch more or fewer pages

    for page_num in range(1, total_pages_to_fetch + 1):
        print(f"--- 正在爬取第 {page_num} 页 ---")

        stock_data = fetch_stock_data_from_api(page_num)

        if stock_data:
            print(f"Page {page_num} fetched successfully, {len(stock_data)} records.")
            all_stocks_to_print.extend(stock_data)
            time.sleep(1)  # polite delay to avoid getting blocked
        else:
            print(f"Page {page_num} returned no data, stopping.")
            break

    # --- Print all results (all 12 fields) ---
    if all_stocks_to_print:
        print("\n" + "=" * 120)
        print("                                                 --- Scraping results ---")
        print("=" * 120 + "\n")

        # Print a full header row so the table is easier to read
        print(
            f"{'Code':<10} {'Name':<10} {'Latest':<8} {'Chg%':<10} {'Chg':<8} {'Volume':<12} {'Turnover':<12} {'Ampl':<10} {'High':<8} {'Low':<8} {'Open':<8} {'Prev':<8}")
        print("-" * 120)

        # Print every record
        for stock in all_stocks_to_print:
            # stock[0] .. stock[11] cover all 12 fields
            print(
                f"{str(stock[0]):<10} {str(stock[1]):<10} {stock[2]:<8.2f} {str(stock[3]):<10} {stock[4]:<8.2f} {str(stock[5]):<12} {str(stock[6]):<12} {str(stock[7]):<10} {stock[8]:<8.2f} {stock[9]:<8.2f} {stock[10]:<8.2f} {stock[11]:<8.2f}")

    else:
        print("\nNo data was scraped. Check the network or whether the API has changed.")

    print("\nAssignment ② (full print version) finished!")

2.2 Reflections
The biggest gain from this assignment was a real appreciation of how important F12 debugging is. The BeautifulSoup approach mentioned in the requirement turned out not to apply here: packet capture in the Network panel showed that the data does not come from static HTML but from a dynamic push2delay JSONP endpoint. I learned how to request this API with requests and parse its response with the re and json libraries, and also worked out the meaning of fields such as f12 (code) and f14 (name); this analysis step was the key to completing the task.
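
To keep that field analysis in one place, the inferred fN meanings can be recorded in a dict; this is only a convenience sketch, since Eastmoney does not document these field IDs and the mapping below is the one inferred from the F12 analysis above.

# Field meanings inferred from the F12 analysis (not officially documented)
FIELD_MEANINGS = {
    "f12": "code", "f14": "name", "f2": "latest price", "f3": "change %",
    "f4": "change amount", "f5": "volume", "f6": "turnover", "f7": "amplitude",
    "f15": "high", "f16": "low", "f17": "open", "f18": "previous close",
}

def describe(stock):
    """Turn one raw API record into a readable dict using the inferred mapping."""
    return {FIELD_MEANINGS[k]: v for k, v in stock.items() if k in FIELD_MEANINGS}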

Assignment ③:

– Requirement: Scrape all universities in the 2021 main ranking of Chinese universities (https://www.shanghairanking.cn/rankings/bcur/2021) and store them in a database; also record the F12 debugging and analysis process as a GIF and include it in the blog post.
– Tip: Analyze the site's network requests and find the API that returns the data (see the sketch below).
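
A minimal sketch of calling that API once it has been spotted in the Network panel, passing the query parameters through requests' params argument; the endpoint and parameter values are the same ones used in the listing below.

import requests

API_URL = "https://www.shanghairanking.cn/api/pub/v1/bcur"

def fetch_rankings(year=2021, bcur_type=11):
    """Fetch the ranking JSON; bcur_type=11 selects the 2021 main list used below."""
    params = {"bcur_type": bcur_type, "year": year}
    headers = {"User-Agent": "Mozilla/5.0"}
    resp = requests.get(API_URL, params=params, headers=headers, timeout=10)
    resp.raise_for_status()
    return resp.json()["data"]["rankings"]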
3.1 Assignment code and screenshots:

import requests
import sqlite3
import json

def get_university_data():
    url = "https://www.shanghairanking.cn/api/pub/v1/bcur?bcur_type=11&year=2021"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    }
    
    try:
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            data_dict = json.loads(response.text)
            ranking_data = data_dict["data"]["rankings"]
            print(f"成功拿到{len(ranking_data)}所大学的数据!\n")
            return ranking_data
        else:
            print(f"请求失败,状态码:{response.status_code}")
            return None
    except Exception as e:
        print(f"爬取出错了:{str(e)}")
        return None

def print_ranking(data):
    print("="*85)
    print("2021 Chinese University Ranking - Main List (top 582)")
    print("="*85)
    print(f"{'No.':<4} {'Rank':<6} {'University':<20} {'Province':<8} {'Type':<8} {'Score':<6}")
    print("-"*85)
    
    for i in range(len(data)):
        school = data[i]
        seq = i + 1
        rank = school["ranking"]
        name = school["univNameCn"]
        province = school["province"]
        type_ = school["univCategory"]
        score = school["score"]
        
        print(f"{seq:<4} {rank:<6} {name:<20} {province:<8} {type_:<8} {score:<6}")
    
    print("="*85)
    print(f"总共打印了{len(data)}所大学的排名\n")

def save_to_db(data):
    conn = None
    cursor = None
    try:
        conn = sqlite3.connect("university_rank_2021.db")
        cursor = conn.cursor()
        
        create_table_sql = """
        CREATE TABLE IF NOT EXISTS university (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            ranking INTEGER NOT NULL,
            school_name TEXT NOT NULL,
            province TEXT NOT NULL,
            school_type TEXT NOT NULL,
            total_score REAL NOT NULL
        )
        """
        cursor.execute(create_table_sql)
        print("数据表创建好了(或者已经存在)")
        
        cursor.execute("DELETE FROM university")
        
        for school in data:
            insert_sql = """
            INSERT INTO university (ranking, school_name, province, school_type, total_score)
            VALUES (?, ?, ?, ?, ?)
            """
            cursor.execute(insert_sql, (
                school["ranking"],
                school["univNameCn"],
                school["province"],
                school["univCategory"],
                school["score"]
            ))
        
        conn.commit()
        print(f"数据全存进去了,一共{len(data)}条")
        
    except Exception as e:
        print(f"Database error: {str(e)}")
        if conn:
            conn.rollback()
    finally:
        if cursor:
            cursor.close()
        if conn:
            conn.close()
        print("Database connection closed\n")

if __name__ == "__main__":
    print("开始爬中国大学2021主榜啦!\n")
    
    university_data = get_university_data()
    
    if university_data:
        print_ranking(university_data)
        save_to_db(university_data)
    
    print("搞定!")

3.2 Reflections
This assignment, scraping the 2021 main ranking of Chinese universities, gave me hands-on practice with scraping dynamic pages and storing data. At first I wanted to parse the page HTML directly; then, using Chrome's F12 tools, I found the API that returns JSON data under the XHR tab of the Network panel, which saved a lot of parsing work. I also discovered that the ranking field is called "ranking" rather than the "rank" I had guessed, which taught me to confirm the data structure first.
For storage I used SQLite, which needs no extra server. When creating the table I matched the columns to the fields returned by the API, cleared the table before inserting to avoid duplicates, and remembered to commit the transaction at the end. I had previously forgotten to call commit and the data never got saved, so that lesson stuck. When printing the ranking I used left-aligned formatting so that the university name, province and other columns line up and read more clearly. The whole process, from finding the API to fixing the field-name mistake and the storage issue, needed care at every step, and it made me much more comfortable with the scraping workflow and database operations.
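
One way to avoid the forgotten-commit pitfall mentioned above is to use the sqlite3 connection as a context manager: the with-block commits the transaction when it succeeds and rolls it back on an exception. A small sketch against the university table created in the listing above:

import sqlite3

def save_rows(rows, db_path="university_rank_2021.db"):
    """Insert (ranking, name, province, type, score) tuples; commit is handled by the with-block."""
    with sqlite3.connect(db_path) as conn:
        conn.executemany(
            "INSERT INTO university (ranking, school_name, province, school_type, total_score) "
            "VALUES (?, ?, ?, ?, ?)",
            rows
        )
    conn.close()  # the context manager commits/rolls back but does not close the connection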

posted @ 2025-11-11 22:57  七年qn