102302156 李子贤 — Data Collection, Assignment 2

Assignment 1
Scrape the 7-day weather forecast for a given set of cities from the China Weather Network (http://www.weather.com.cn) and store it in a database.

(1) Code and results

from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request
import sqlite3


class WeatherDB:

    def openDB(self):

        # Connect to the SQLite database
        self.con = sqlite3.connect("weathers.db")
        self.cursor = self.con.cursor()
        try:
            # Create the weather table; the primary key is (city, date)
            self.cursor.execute("""
                create table weathers (
                    wCity varchar(16),  
                    wDate varchar(16),  
                    wWeather varchar(64),  
                    wTemp varchar(32),  
                    constraint pk_weather primary key (wCity, wDate)
                )
            """)
        except sqlite3.OperationalError:
            # The table already exists, so just clear its existing rows
            self.cursor.execute("delete from weathers")

    def closeDB(self):
        self.con.commit()  
        self.con.close()

    def insert(self, city, date, weather, temp):
        try:
            self.cursor.execute("""
                insert into weathers (wCity, wDate, wWeather, wTemp) 
                values (?, ?, ?, ?)  
            """, (city, date, weather, temp))
        except Exception as err:
            print(f"插入数据出错: {err}")

    def show(self):
        self.cursor.execute("select * from weathers")
        rows = self.cursor.fetchall()
        # Print the header row
        print("%-16s%-16s%-32s%-16s" % ("city", "date", "weather", "temp"))
        # Print each stored row
        for row in rows:
            print("%-16s%-16s%-32s%-16s" % (row[0], row[1], row[2], row[3]))


class WeatherForecast:

    def __init__(self):
        # Request headers that mimic a browser
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"
        }
        # Map each city name to its weather.com.cn city code (used to build the request URL)
        self.cityCode = {
            "北京": "101010100",
            "上海": "101020100",
            "广州": "101280101",
            "深圳": "101280601"
        }

    def forecastCity(self, city):
        # Make sure the city is one we have a code for
        if city not in self.cityCode:
            print(f"{city} code cannot be found")
            return

        # Build the 7-day forecast URL for this city
        url = f"http://www.weather.com.cn/weather/{self.cityCode[city]}.shtml"
        try:
            # Send the HTTP request
            req = urllib.request.Request(url, headers=self.headers)
            data = urllib.request.urlopen(req)
            data = data.read()  # Read the raw response bytes

            # Detect the page encoding and convert it to Unicode
            dammit = UnicodeDammit(data, ["utf-8", "gbk"])  
            data = dammit.unicode_markup
            # Parse the HTML with BeautifulSoup
            soup = BeautifulSoup(data, "lxml")
            # Locate the list items holding the daily forecasts (via a CSS selector)
            lis = soup.select("ul[class='t clearfix'] li")

            # Iterate over each day's forecast
            for li in lis:
                try:
                    # Extract the date (text of the h1 tag)
                    date = li.select('h1')[0].text
                    # Extract the weather description (p tag with class wea)
                    weather = li.select('p[class="wea"]')[0].text
                    # Extract the temperature range (span = high, i = low)
                    temp = li.select('p[class="tem"] span')[0].text + "/" + li.select('p[class="tem"] i')[0].text
                    # Print the extracted values
                    print(city, date, weather, temp)
                    # Store the record via the database helper
                    self.db.insert(city, date, weather, temp)
                except Exception as err:
                    print(f"解析单条天气数据出错: {err}")
        except Exception as err:
            print(f"请求或解析页面出错: {err}")

    def process(self, cities):
        # Initialise the database connection
        self.db = WeatherDB()
        self.db.openDB()

        # Crawl each city in turn
        for city in cities:
            self.forecastCity(city)
        # Close the database connection
        self.db.closeDB()


if __name__ == "__main__":
    ws = WeatherForecast()
    ws.process(["北京", "上海", "广州", "深圳"])
    print("completed")

[Screenshot: run results]

(2) Reflections

[Screenshot: forecast URLs for different cities]

As the URLs in the screenshot above show, switching between cities only requires changing the city code in the URL, so we can build a mapping table from city names to codes and use it to visit each city's page.
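
A minimal sketch of that idea (the two codes below are taken from the mapping used in the full script above); building each URL is just string formatting:

city_code = {"北京": "101010100", "上海": "101020100"}  # subset of the mapping used above

for city, code in city_code.items():
    # Each city's 7-day forecast lives at the same path; only the code changes.
    url = f"http://www.weather.com.cn/weather/{code}.shtml"
    print(city, url)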

[Screenshot: F12 element view of the ul with class 't clearfix']

Through F12 we can see clearly that the data we want is stored in a ul with class='t clearfix'; traversing each li under that ul and locating the fields with CSS selectors yields exactly the results we need.
In short, whether you use re or CSS selectors, once you are clear about the structure of the page you are scraping, the data can be extracted easily.
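
For comparison, here is a minimal sketch of the re route mentioned above. The li markup is a simplified, made-up stand-in for one day's entry, not the live page, so the exact tags are assumptions:

import re

# Hypothetical, simplified li markup for one day's forecast.
html = '<li><h1>31日(今天)</h1><p class="wea">多云</p><p class="tem"><span>20</span>/<i>12℃</i></p></li>'

date = re.search(r'<h1>(.*?)</h1>', html).group(1)
weather = re.search(r'<p class="wea">(.*?)</p>', html).group(1)
temp = re.search(r'<span>(.*?)</span>/<i>(.*?)</i>', html)
print(date, weather, temp.group(1) + "/" + temp.group(2))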

Assignment 2
Use the requests and BeautifulSoup library methods to scrape stock-related information from a target source and store it in a database.

(1) Code and results

import requests
import sqlite3
import time
import json
from datetime import datetime

def init_db():
    # Connect to the SQLite database
    conn = sqlite3.connect('stock_data.db')
    cursor = conn.cursor()
    # Create the stock table with its columns and types
    cursor.execute('''
    CREATE TABLE IF NOT EXISTS stocks (
        id INTEGER PRIMARY KEY AUTOINCREMENT,  
        code TEXT,                             
        name TEXT,                             
        latest_price REAL,                     
        change_percent REAL,                   
        change_amount REAL,                    
        volume INTEGER,                       
        turnover REAL,                         
        increase REAL,                        
        crawl_time DATETIME                    
    )
    ''')
    conn.commit()  # Commit the transaction
    conn.close()   # Close the connection


# Save the scraped stock records to the database
def save_to_db(data_list):
    conn = sqlite3.connect('stock_data.db')
    cursor = conn.cursor()
    crawl_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    # Insert each stock record into the database
    for data in data_list:
        cursor.execute('''
        INSERT INTO stocks (code, name, latest_price, change_percent, 
                          change_amount, volume, turnover, increase, crawl_time)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) 
        ''', (
            data['code'],
            data['name'],
            data['latest_price'],
            data['change_percent'],
            data['change_amount'],
            data['volume'],
            data['turnover'],
            data['increase'],
            crawl_time
        ))

    conn.commit()
    conn.close()


# Fetch one page of stock data and convert the raw values
def crawl_stock_data(page=1, pagesize=20):
    # Eastmoney stock-list API endpoint captured from the browser's network panel
    url = f"https://push2.eastmoney.com/api/qt/clist/get?np=1&fltt=1&invt=2&cb=jQuery37107275901173684839_1761722704966&fs=m%3A0%2Bt%3A6%2Bf%3A!2%2Cm%3A0%2Bt%3A80%2Bf%3A!2%2Cm%3A1%2Bt%3A2%2Bf%3A!2%2Cm%3A1%2Bt%3A23%2Bf%3A!2%2Cm%3A0%2Bt%3A81%2Bs%3A262144%2Bf%3A!2&fields=f12%2Cf13%2Cf14%2Cf1%2Cf2%2Cf4%2Cf3%2Cf152%2Cf5%2Cf6%2Cf7%2Cf15%2Cf18%2Cf16%2Cf17%2Cf10%2Cf8%2Cf9%2Cf23&fid=f3&pn={page}&pz={pagesize}&po=1&dect=1&ut=fa5fd1943c7b386f172d6893dbfba10b&wbp2u=%7C0%7C0%7C0%7Cweb&_=1761722704968"

    try:
        # Request headers that mimic a browser
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Referer": "https://www.eastmoney.com/",
            "Accept": "text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01",
            "X-Requested-With": "XMLHttpRequest"
        }

        # Send the GET request
        response = requests.get(url, headers=headers, timeout=10)
        response.encoding = 'utf-8'  # Force UTF-8 decoding to avoid garbled Chinese text

        # Handle the JSONP-wrapped payload
        if '(' in response.text and ')' in response.text:
            json_str = response.text[response.text.find('(') + 1: response.text.rfind(')')]
        else:
            print("返回数据非JSONP格式,无法解析")
            return []

        # Parse the JSON string into a dict
        data = json.loads(json_str)

        # Extract the core data (the stock list is under data.diff in the response)
        if data.get('data') and 'diff' in data['data']:
            stock_list = []
            for item in data['data']['diff']:
                # Unit conversion (the API returns raw integers in the smallest unit; scale them to display units)
                stock_info = {
                    'code': item.get('f12', ''),
                    'name': item.get('f14', ''),
                    'latest_price': item.get('f2', 0) / 100,
                    'change_percent': item.get('f3', 0) / 100,
                    'change_amount': item.get('f4', 0) / 100,
                    'volume': item.get('f5', 0),
                    'turnover': item.get('f6', 0) / 10000,
                    'increase': item.get('f3', 0) / 100
                }
                if stock_info['code'] and stock_info['name']:
                    stock_list.append(stock_info)
            return stock_list
        else:
            print(f"API返回数据结构异常:{data.get('rc', '未知错误码')},{data.get('rt', '无返回信息')}")
            return []

    except requests.exceptions.RequestException as e:
        print(f"网络请求错误: {e}")  # 捕获请求超时、连接失败等网络错误
        return []
    except json.JSONDecodeError as e:
        print(f"JSON解析错误(返回内容:{response.text[:100]}...): {e}")  # 捕获JSON格式错误
        return []
    except Exception as e:
        print(f"其他爬取错误: {e}")  # 捕获其他未预料的错误
        return []


# Pretty-print the stock records
def print_stock_data(stock_list):
    # Print the header row; tabs and left-alignment keep the columns tidy
    print(f"\n{'序号':<4}\t{'代码':<8}\t{'名称':<8}\t{'最新价':<8}\t{'涨跌幅(%)':<10}\t{'涨跌额':<8}\t{'成交量':<10}\t{'成交额(万)':<10}\t{'涨幅(%)':<8}")
    print("-" * 120)  # 分隔线
    # Print each stock record in the same column layout
    for i, stock in enumerate(stock_list, 1):
        print(f"{i:<4}\t{stock['code']:<8}\t{stock['name']:<8}\t{stock['latest_price']:<8.2f}\t"
              f"{stock['change_percent']:<10.2f}\t{stock['change_amount']:<8.2f}\t{stock['volume']:<10}\t"
              f"{stock['turnover']:<10.1f}\t{stock['increase']:<8.2f}")



def main():
    init_db()
    print("数据库初始化完成,开始爬取股票数据...")

    total_pages = 3  # Number of pages to crawl
    pagesize = 20    # Records per page

    # Crawl the pages one by one
    for page in range(1, total_pages + 1):
        print(f"\n=== 正在爬取第{page}/{total_pages}页数据 ===")
        # Fetch the current page
        stock_list = crawl_stock_data(page=page, pagesize=pagesize)

        # If valid data came back, print and store it
        if stock_list:
            print_stock_data(stock_list)  # Print the records
            save_to_db(stock_list)        # Store the records
            print(f"第{page}页数据已保存至stock_data.db(共{len(stock_list)}条记录)")
        else:
            print(f"第{page}页未获取到有效股票数据")

        time.sleep(3)

    print(f"\n爬取完成!数据已存储至stock_data.db")



if __name__ == "__main__":
    main()

[Screenshot: run results]

(2) Reflections
[Screenshot: F12 network capture of the stock data API]

Using F12 to capture the network traffic, we can find the response that carries the data and inspect it in the Preview tab. Copying the request URL from that packet's headers and opening it in the browser shows that the information we need is stored under diff inside data.
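
A minimal sketch of that drill-down, assuming the JSONP wrapper has already been stripped off exactly as crawl_stock_data() does above:

import json

def first_codes_and_names(json_str):
    # json_str is the JSON text left after removing the jQuery callback wrapper.
    data = json.loads(json_str)
    diff = (data.get("data") or {}).get("diff") or []  # the stock list sits under data.diff
    return [(item.get("f12"), item.get("f14")) for item in diff]  # (code, name) pairs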

[Screenshot: JSON response with the f-coded fields]

The response structure is tidy and clear, so once we know which f-coded parameter holds each value, a simple GET request is enough to retrieve the data we need.
This task used an approach that was new to me, and at first I could not handle it fluently: while capturing packets I had to click through the requests one by one to check whether each was the one I wanted. When the code first ran, I also found that the API returns its raw values in the smallest units, so the data has to be processed and converted into the usual display units.
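
The sketch below isolates that conversion step. The sample item is made up, and the field meanings and scale factors are the same assumptions the script above relies on (price-like fields scaled by 100, turnover reported in yuan), not values from official API documentation:

# Hypothetical raw entry shaped like one element of data.diff.
raw = {"f12": "600519", "f14": "贵州茅台", "f2": 145023, "f3": 152, "f6": 1234567890.0}

stock = {
    "code": raw["f12"],
    "name": raw["f14"],
    "latest_price": raw["f2"] / 100,    # 145023 -> 1450.23
    "change_percent": raw["f3"] / 100,  # 152 -> 1.52 (%)
    "turnover": raw["f6"] / 10000,      # yuan -> 万 (10k yuan)
}
print(stock)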

Assignment 3
Scrape the information of all universities in the 2021 main ranking of Chinese universities (https://www.shanghairanking.cn/rankings/bcur/2021), store it in a database, and record the browser F12 debugging analysis as a GIF to include in the blog post.
(1) Code and results

import os
import re
import datetime
import requests
import sqlite3
from sqlite3 import Error


def download_payload_js(url, save_path="payload.js"):
    """下载payload.js文件到本地"""
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
        }

        print(f"开始下载: {url}")
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()

        with open(save_path, "w", encoding="utf-8") as f:
            f.write(response.text)

        print(f"下载成功,保存至: {os.path.abspath(save_path)}")
        return True

    except requests.exceptions.RequestException as e:
        print(f"下载失败: {str(e)}")
        return False


def parse_all_rankings(file_path="payload.js"):
    """解析本地文件,提取所有排名数据"""
    if not os.path.exists(file_path):
        print(f"{file_path}不存在,无法解析")
        return []

    # Lookup tables for the encoded province / category / tag values
    province_map = {
        "q": "北京", "D": "上海", "x": "浙江", "k": "江苏", "v": "湖北",
        "y": "安徽", "u": "广东", "B": "黑龙江", "C": "吉林", "n": "山东",
        "o": "河南", "p": "河北", "N": "天津", "G": "山西", "F": "福建",
        "t": "四川", "s": "陕西", "w": "湖南", "M": "重庆", "K": "甘肃",
        "L": "内蒙古", "H": "云南", "I": "广西", "J": "贵州", "az": "宁夏",
        "aA": "青海", "aB": "西藏", "O": "新疆", "Y": "海南", "r": "辽宁"
    }
    category_map = {
        "f": "综合类", "e": "理工类", "h": "师范类", "m": "农业类",
        "S": "林业类", "dS": "政法类", "21": "医药类", "22": "财经类",
        "19": "语言类", "23": "艺术类", "24": "体育类", "25": "民族类"
    }
    tag_map = {"i": "双一流", "l": "985", "j": "211"}

    try:
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()

        # Locate the core univData block
        univ_data_match = re.search(r'univData:\s*\[(.*?)\],\s*indList:', content, re.S)
        if not univ_data_match:
            print("未找到核心数据区域")
            return []

        # Extract every university object literal
        univ_items = re.findall(r'\{[^}]*univNameCn:"[^"]+"[^}]*\}', univ_data_match.group(1), re.S)
        all_rankings = []

        for idx, item in enumerate(univ_items, 1):
            # Basic information: Chinese and English names
            name_cn_match = re.search(r'univNameCn:"(.*?)"', item)
            name_cn = name_cn_match.group(1) if name_cn_match else f"未知大学_{idx}"
            name_en_match = re.search(r'univNameEn:"(.*?)"', item)
            name_en = name_en_match.group(1) if name_en_match else "Unknown University"

            # Ranking
            rank_match = re.search(r'ranking:([^,]+)', item)
            if rank_match and rank_match.group(1).strip('"').isdigit():
                rank = int(rank_match.group(1).strip('"'))
            else:
                rank = idx  # Fall back to the loop index if the rank cannot be extracted

            # Province
            province_match = re.search(r'province:([^,]+)', item)
            province_code = province_match.group(1).strip('"') if province_match else ""
            province = province_map.get(province_code, "未知省份")

            # University category (strip quotes, as done for the province code)
            category_match = re.search(r'univCategory:([^,]+)', item)
            category_code = category_match.group(1).strip('"') if category_match else ""
            category = category_map.get(category_code, "未知类型")

            # Total score
            score_match = re.search(r'score:([^,]+)', item)
            if score_match:
                score_str = score_match.group(1).strip('"')
                score = float(score_str) if score_str.replace('.', '').isdigit() else None
            else:
                score = None

            # Tags (双一流 / 985 / 211)
            tags = "无"
            tags_match = re.search(r'univTags:\[([^\]]*)\]', item)
            if tags_match:
                tag_codes = [t.strip() for t in tags_match.group(1).split(',') if t.strip()]
                tags = ",".join([tag_map.get(t, t) for t in tag_codes])

            # Indicator scores
            ind_data = {}
            ind_match = re.search(r'indData:(\{[^}]+\})', item)
            if ind_match:
                ind_str = ind_match.group(1)
                ind_name_map = {"159": "办学层次", "160": "学科水平", "163": "人才培养"}
                for code, name in ind_name_map.items():
                    val_match = re.search(f'"{code}":"?([^,"]+)"?', ind_str)
                    if val_match:
                        val = val_match.group(1).strip()
                        ind_data[name] = float(val) if val.replace('.', '').isdigit() else val
                    else:
                        ind_data[name] = None

            # Assemble the record
            all_rankings.append({
                "rank": rank,
                "name_cn": name_cn,
                "name_en": name_en,
                "province": province,
                "category": category,
                "score": score,
                "tags": tags,
                "school_level": ind_data.get("办学层次"),
                "discipline_level": ind_data.get("学科水平"),
                "talent_train": ind_data.get("人才培养"),
                "crawl_time": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            })

        print(f"成功解析所有 {len(all_rankings)} 条排名数据")
        return all_rankings

    except Exception as e:
        print(f"解析失败: {str(e)}")
        return []


def create_db_connection(db_file):
    """创建与SQLite数据库的连接"""
    conn = None
    try:
        conn = sqlite3.connect(db_file)
        print(f"数据库连接成功 (SQLite版本: {sqlite3.version})")
        return conn
    except Error as e:
        print(f"数据库连接失败: {e}")
    return conn


def create_ranking_table(conn):
    """创建大学排名表"""
    try:
        sql_create_rankings_table = """
        CREATE TABLE IF NOT EXISTS university_rankings (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            rank INTEGER NOT NULL,
            name_cn TEXT NOT NULL,
            name_en TEXT,
            province TEXT,
            category TEXT,
            score REAL,
            tags TEXT,
            school_level REAL,
            discipline_level REAL,
            talent_train REAL,
            crawl_time TEXT
        );
        """
        cursor = conn.cursor()
        cursor.execute(sql_create_rankings_table)
        print("大学排名表创建成功")
    except Error as e:
        print(f"创建表失败: {e}")


def insert_rankings(conn, rankings):
    """将排名数据插入数据库"""
    if not rankings:
        print("没有数据可插入")
        return False

    try:
        # Clear existing rows first
        cursor = conn.cursor()
        cursor.execute("DELETE FROM university_rankings")

        # Insert the new records
        sql = """
        INSERT INTO university_rankings (
            rank, name_cn, name_en, province, category, score, 
            tags, school_level, discipline_level, talent_train, crawl_time
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """

        # Prepare the rows for executemany
        data_to_insert = []
        for item in rankings:
            data_to_insert.append((
                item['rank'],
                item['name_cn'],
                item['name_en'],
                item['province'],
                item['category'],
                item['score'],
                item['tags'],
                item['school_level'],
                item['discipline_level'],
                item['talent_train'],
                item['crawl_time']
            ))

        # Bulk insert
        cursor.executemany(sql, data_to_insert)
        conn.commit()
        print(f"成功插入 {cursor.rowcount} 条数据")
        return True

    except Error as e:
        print(f"插入数据失败: {e}")
        conn.rollback()
        return False


def main():
    """主函数:下载-解析-存储到数据库全流程"""
    # 配置参数
    js_url = "https://www.shanghairanking.cn/_nuxt/static/1761118404/rankings/bcur/2021/payload.js"
    local_file = "payload.js"
    db_file = "university_rankings.db"  # SQLite数据库文件

    # Step 1: download the data file if it is not already present
    if not os.path.exists(local_file):
        if not download_payload_js(js_url, local_file):
            print("无法继续,缺少必要的数据源文件")
            return

    # Step 2: parse all ranking records
    all_rankings = parse_all_rankings(local_file)
    if not all_rankings:
        print("解析过程出现问题,无法获取数据")
        return

    # Step 3: database operations
    conn = create_db_connection(db_file)
    if conn is not None:
        # Create the table
        create_ranking_table(conn)
        # Insert the records
        insert_rankings(conn, all_rankings)
        # Close the connection
        conn.close()
        print("数据库连接已关闭")
    else:
        print("无法建立数据库连接,数据未保存")

    # Step 4: print a short preview of the results
    print("\n数据预览:")
    print(f"{'排名':<6}{'学校':<18}{'省市':<6}{'类型':<8}{'总分':<6}")
    print("-" * 45)
    for item in all_rankings[:10]:  # preview only the first few rows
        print(
            f"{item['rank']:<6}{item['name_cn']:<18}{item['province']:<6}{item['category']:<8}{str(item['score']):<6}")

    print(f"\n共解析 {len(all_rankings)} 所大学的排名数据并存储到数据库")


if __name__ == "__main__":
    main()

[Screenshot: run results]

[Screen recording 2025-10-31 191638: F12 debugging analysis]

(2) Reflections

[Screenshot: F12 network capture of payload.js]

As before, I opened the page and captured the network traffic to find the response that stores the data, but when previewing it in the browser I found that its content was garbled.

[Screenshot: garbled preview of the response]

This made it impossible to see the data structure clearly. A classmate suggested an alternative: download the js file and parse it locally.

[Screenshot: contents of the downloaded payload.js]

After downloading and parsing the file, the structure of the stored data is clearly visible, and re regular expressions can be used to match fields such as univNameCn (university name), ranking (rank), and province (province).
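
A minimal sketch of that matching, run against a made-up fragment shaped like one univData entry (the real payload.js wraps these objects in a function call, which the full script above handles):

import re

# Hypothetical fragment in the same shape as a univData entry.
item = '{univNameCn:"清华大学",univNameEn:"Tsinghua University",ranking:1,province:q,score:969.2}'

name = re.search(r'univNameCn:"(.*?)"', item).group(1)
rank = re.search(r'ranking:([^,]+)', item).group(1)
prov = re.search(r'province:([^,]+)', item).group(1)
print(name, rank, prov)  # 清华大学 1 q  -- the province is still an encoded token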

[Screenshot: encoded tokens in the payload data]

While examining the structure, however, we found that to keep the data from being scraped too easily, the site stores some fields as short encoded tokens rather than plain text. So we have to add mapping tables: first scrape the tokens, then translate them back into readable values.
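
Decoding is then just a dictionary lookup. The two entries in each table below are taken from the mapping tables in the script above; decode() and its fallback default are hypothetical helpers added for illustration:

# Subsets of the lookup tables defined in the full script above.
province_map = {"q": "北京", "D": "上海"}
category_map = {"f": "综合类", "e": "理工类"}

def decode(token, table, default="未知"):
    # Translate an encoded token into its readable value, with a fallback for unknown codes.
    return table.get(token, default)

print(decode("q", province_map), decode("f", category_map))  # 北京 综合类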

Code repository: https://gitee.com/lizixian66/shujucaiji/tree/homework2/

posted @ 2025-10-31 19:37  helllo_x