数据采集第二次作业 - yy71

1.作业①:
要求：在中国气象网（http://www.weather.com.cn）爬取给定城市集的7日天气预报，并保存在数据库中。

点击查看代码

import requests
from bs4 import BeautifulSoup
import re
import sqlite3
import time
import random


def _extract_temp_range(temp_text):
    """Return the temperature part of a forecast cell, or "未知" if none is found."""
    # Prefer a full "high/low" range. The sign is optional so that sub-zero
    # values such as "3℃/-5℃" are kept instead of being reported as unknown.
    match = re.search(r"(-?\d+℃?/-?\d+℃)", temp_text)
    if match is None:
        # Fall back to a single reading when the text holds only one value.
        match = re.search(r"(-?\d+℃)", temp_text)
    return match.group(1) if match else "未知"


def get_weather(city_code="101010100", city_name=None):
    """Fetch the 7-day forecast for one city from weather.com.cn.

    Args:
        city_code: City code used to build the forecast page URL.
        city_name: Optional display name stored under the "地区" key. When
            omitted the previous behaviour is kept: "北京" for code
            "101010100" and "未知" for any other code.

    Returns:
        A list of at most seven dicts with the keys "序号", "地区", "日期",
        "天气信息" and "温度".

    Raises:
        requests.exceptions.RequestException: On a network error, a timeout
            or a non-2xx HTTP status.
        IndexError: If the page does not contain the forecast list.
    """
    url = f"http://www.weather.com.cn/weather/{city_code}.shtml"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
    }
    if city_name is None:
        city_name = "北京" if city_code == "101010100" else "未知"

    # Random delay so that repeated calls do not hit the site too frequently.
    time.sleep(random.uniform(1, 3))
    # A timeout keeps the crawler from hanging forever on a stalled connection.
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    response.encoding = "utf-8"
    soup = BeautifulSoup(response.text, "html.parser")

    # The forecast is the first <ul class="t clearfix"> on the page.
    forecast = soup.select_one("ul.t.clearfix")
    if forecast is None:
        raise IndexError(
            f"forecast list (ul.t.clearfix) not found for city code {city_code}"
        )

    weather_list = []
    for i, li in enumerate(forecast.select("li")[:7]):  # first seven days only
        weather_list.append({
            "序号": i + 1,
            "地区": city_name,
            "日期": li.select_one("h1").text.strip(),
            "天气信息": li.select_one("p.wea").text.strip(),
            "温度": _extract_temp_range(li.select_one("p.tem").text.strip()),
        })
    return weather_list


def save_to_db(weather_data):
    """Append forecast rows to the ``weather`` table of ``weather.db``.

    The database file is opened in the current working directory and the
    table is created on first use. Rows are appended; existing rows are not
    replaced.

    Args:
        weather_data: Iterable of dicts with the keys "序号", "地区", "日期",
            "天气信息" and "温度".

    Raises:
        KeyError: If an item lacks one of the expected keys. The rows are
            built before the database is opened, so nothing is written then.
        sqlite3.Error: If the database cannot be opened or written.
    """
    rows = [
        (item["序号"], item["地区"], item["日期"], item["天气信息"], item["温度"])
        for item in weather_data
    ]

    conn = sqlite3.connect("weather.db")
    try:
        cursor = conn.cursor()
        # Create the table on first use.
        cursor.execute("""
                   CREATE TABLE IF NOT EXISTS weather
                   (
                       id
                       INTEGER
                       PRIMARY
                       KEY
                       AUTOINCREMENT,
                       序号
                       INTEGER,
                       地区
                       TEXT,
                       日期
                       TEXT,
                       天气信息
                       TEXT,
                       温度
                       TEXT
                   )
                   """)
        # Insert every row with a single parameterized statement.
        cursor.executemany("""
                       INSERT INTO weather (序号, 地区, 日期, 天气信息, 温度)
                       VALUES (?, ?, ?, ?, ?)
                       """, rows)
        conn.commit()
    finally:
        # Always release the file handle, even when an insert fails.
        conn.close()
    print("数据已成功保存到weather.db数据库")


if __name__ == "__main__":
    # Fetch the 7-day forecast for Beijing (city code 101010100).
    weather_data = get_weather("101010100")

    # Echo the rows as a tab-separated table, one forecast day per line.
    print("序号\t地区\t日期\t\t天气信息\t\t\t温度")
    columns = ("序号", "地区", "日期", "天气信息", "温度")
    for row in weather_data:
        print("\t".join(str(row[column]) for column in columns))

    # Persist the same rows to the SQLite database.
    save_to_db(weather_data)

**运行结果如下：**

心得体会:
一开始用 requests 拿数据老报错，后来才发现是没加请求头，被网站挡了。解析天气信息时，标签层级老找不对，对着网页源码一点点试才弄明白。
2.作业②
要求：用requests和json解析方法定向爬取股票相关信息，并存储在数据库中。东方财富网：https://www.eastmoney.com/

点击查看代码

import requests
import re
import pandas as pd

# HTTP headers passed to requests.get() for every stock-list request.
# NOTE(review): the Cookie value looks like a captured browser session and is
# probably stale by now — confirm whether the API needs it at all.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Cookie": "qgqp_b_id=18c28b304dff3b8ce113d0cca03e6727; websitepoptg_api_time=1703860143525; st_si=92728505415389; st_asi=delete; HAList=ty-100-HSI-%u6052%u751F%u6307%u6570; st_pvi=46517537371152; st_sp=2023-10-29%2017%3A00%3A19; st_inirUrl=https%3A%2F%2Fcn.bing.com%2F; st_sn=8; st_psi=20231229230312485-113200301321-2076002087"
}


def get_html(cmd, page):
    """Fetch one page of the Eastmoney stock list and return the decoded JSON.

    Args:
        cmd: Text spliced after ``fid=`` in the query string (the values in
            the ``cmd`` table also carry an ``fs=`` filter).
        page: Page number sent as ``pn``; each request asks for 20 rows.

    Returns:
        The JSON object of the response as a dict. JSON ``null`` is decoded
        to ``None``.

    Raises:
        requests.exceptions.RequestException: On a network error, a timeout
            or a non-2xx HTTP status.
        ValueError: If the response is not wrapped as ``callback(...)`` or
            the wrapped text is not valid JSON.
    """
    import json  # local import: this script's import block does not provide json

    url = f"https://7.push2.eastmoney.com/api/qt/clist/get?cb=jQuery112409467675731682619_1703939377395&pn={page}&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&wbp2u=|0|0|0|web&fid={cmd}&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152&_=1703939377396"
    response = requests.get(url, headers=header, timeout=10)
    response.raise_for_status()
    text = response.text

    # The reply is JSONP: ``callback({...});``. Take the text between the
    # first "(" and the last ")" and parse it as data. It is remote input and
    # must never be handed to eval().
    start = text.find("(")
    end = text.rfind(")")
    if start == -1 or end < start:
        raise ValueError("unexpected response: JSONP wrapper not found")
    return json.loads(text[start + 1:end])


# Board name -> text appended after ``fid=`` when requesting that board.
cmd = {
    "沪深京A股": "f3&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048",
}

# Output column name -> field key of one row in the API response.
_STOCK_FIELDS = {
    "代码": "f12",
    "名称": "f14",
    "最新价": "f2",
    "涨跌幅": "f3",
    "涨跌额": "f4",
    "成交量（手）": "f5",
    "成交额": "f6",
    "振幅(%)": "f7",
    "最高": "f15",
    "最低": "f16",
    "今开": "f17",
    "昨收": "f18",
    "量比": "f10",
    "换手率": "f8",
    "市盈率(动态)": "f9",
    "市净率": "f23",
}

# Crawl only when run as a script, not when the module is imported.
if __name__ == "__main__":
    for board, board_cmd in cmd.items():
        stocks = []
        page = 0
        while True:
            page += 1
            payload = get_html(board_cmd, page)
            page_data = payload.get("data")
            # An empty or null "data" member marks the end of the listing.
            if not page_data:
                break
            print("正在爬取" + board + "第" + str(page) + "页")
            stocks.extend(
                {column: row[key] for column, key in _STOCK_FIELDS.items()}
                for row in page_data["diff"]
            )
        # One spreadsheet per board, columns in the order of _STOCK_FIELDS.
        pd.DataFrame(stocks).to_excel("股票_" + board + ".xlsx", index=False)

**运行结果如下：**

心得体会：
这次爬股票信息的作业，真是边踩坑边学。一开始找股票接口就费老劲，试了好几个网站才找到返回 JSON 的。请求时老被拒，加了 User-Agent 才好使。解析 JSON 时，嵌套层级太深，对着结构一点点扒数据。存数据库时，股价字段类型总弄错，改了好几次才对。虽然麻烦，但搞定后看着数据库里的股票数据，还挺有成就感的。
3.作业③:
要求：爬取中国大学2021主榜（https://www.shanghairanking.cn/rankings/bcur/2021）
所有院校信息，并存储在数据库中，同时将浏览器F12调试分析的过程录制Gif加入至博客中。
调试分析

点击查看代码

import requests
import json
import sqlite3


def _parse_university(item):
    """Turn one API record into a (rank, name, score, province, type) row.

    Returns None when a core field (ranking, name, score) is missing or empty,
    or when the ranking or score cannot be converted to a number.
    """
    rank = item.get("ranking")
    name = item.get("univNameCn")
    score = item.get("score")
    if not (rank and name and score):
        return None
    try:
        return (
            int(rank),
            name,
            float(score),
            item.get("province"),
            item.get("univCategory"),
        )
    except (ValueError, TypeError):
        return None


def crawl_universities():
    """Download the 2021 university ranking and store it in SQLite.

    Requests the JSON endpoint, keeps the records that carry a usable ranking,
    name and score, and writes them to a freshly created ``universities``
    table in ``china_universities_2021.json.db``. Progress and errors are
    reported with print(); nothing is returned and no exception escapes.
    """
    # Asynchronous data endpoint found by inspecting the page's network traffic.
    # NOTE(review): the URL and the "data" -> "list" layout are carried over
    # unchanged — confirm them against the live response.
    api_url = "https://www.shanghairanking.cn/api/pub/v1/bcur?year=2021&targetType=all&index=1"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
        "Referer": "https://www.shanghairanking.cn/rankings/bcur/2021",  # send the source page; the API may reject requests without it
        "Accept": "application/json"
    }

    try:
        response = requests.get(api_url, headers=headers, timeout=15)
        response.raise_for_status()
        json_data = response.json()

        # "data" may be absent or null; treat both as "no records".
        universities = (json_data.get("data") or {}).get("list") or []
        if not universities:
            print("未从接口获取到院校数据，请检查接口地址是否有效")
            return

        rows = [
            row
            for row in map(_parse_university, universities)
            if row is not None
        ]

        conn = sqlite3.connect("china_universities_2021.json.db")
        try:
            cursor = conn.cursor()
            cursor.execute("DROP TABLE IF EXISTS universities")
            # The "type" column holds the school category. SQLite does not
            # accept "#" comments inside SQL text, so the note lives here.
            # NOTE(review): rank is the primary key, so two records sharing a
            # ranking abort the insert — confirm the API never returns ties.
            cursor.execute("""
                       CREATE TABLE universities
                       (
                           rank     INTEGER PRIMARY KEY,
                           name     TEXT NOT NULL,
                           score    REAL,
                           province TEXT,
                           type     TEXT
                       )
                       """)
            cursor.executemany("""
                           INSERT INTO universities (rank, name, score, province, type)
                           VALUES (?, ?, ?, ?, ?)
                           """, rows)
            conn.commit()
        finally:
            # Close the connection on success and on failure alike.
            conn.close()

        print(f"JSON接口爬取成功！共获取 {len(rows)} 所院校数据")
        print("包含字段：排名、学校名称、总分、省份、学校类型")

    except json.JSONDecodeError:
        print("接口返回数据不是有效的JSON格式")
    except requests.exceptions.RequestException as e:
        print(f"网络请求错误：{str(e)}")
    except Exception as e:
        print(f"其他错误：{str(e)}")


if __name__ == "__main__":
    # Script entry point: run the crawl only when this file is executed directly.
    crawl_universities()

**运行结果如下：**

心得体会：
提取总分时总碰到乱码，后来用正则过滤掉奇怪字符才好。存数据库时，排名字段老重复报错，原来忘了设为主键。
代码链接：
https://gitee.com/yangruyi777/2025_crawl_project/tree/homework2

posted on 2025-11-02 14:09 yy71 阅读(20) 评论(0) 收藏举报

刷新页面返回顶部