Data Collection and Fusion Technology Practice Assignment 2_102302107_林诗樾

Assignment ①:
– Requirement: Crawl the 7-day weather forecast for a given set of cities from the China Weather Network (http://www.weather.com.cn) and store it in a database.
– Output:
Gitee folder link
1) Experiment code and run results

import requests
from bs4 import BeautifulSoup
import pandas as pd
import sqlite3
from datetime import datetime
import tabulate


def init_weather_database():
    conn = sqlite3.connect("weather.db")
    cursor = conn.cursor()
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS weather (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            area TEXT NOT NULL,          -- city / area name
            date TEXT NOT NULL,          -- forecast date
            weather_info TEXT NOT NULL,  -- weather description
            temperature TEXT NOT NULL,   -- temperature range
            crawl_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP  -- time of crawl
        )
    ''')
    conn.commit()
    return conn


def get_weather_html(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Referer": "http://www.weather.com.cn/"
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        response.encoding = 'utf-8'
        return response.text
    except Exception as e:
        print(f"获取页面失败:{str(e)}")
        return ""


def parse_weather_data(html, area):
    weather_list = []
    soup = BeautifulSoup(html, "html.parser")

    # 根据中国天气网的实际结构解析
    forecast_container = soup.find("div", class_="tqtongji1")
    if not forecast_container:
        forecast_container = soup.find("ul", class_="t clearfix")

    if not forecast_container:
        print(f"未找到{area}的天气预报容器,尝试其他选择器")
        # 尝试其他可能的选择器
        forecast_container = soup.find("div", id="7d")
        if not forecast_container:
            return weather_list

    # The ul container ("t clearfix") lists each day as an <li>; the div fallbacks use other markup
    if forecast_container.name == "ul":
        day_items = forecast_container.find_all("li")[:7]
    else:
        day_items = (forecast_container.find_all("div", class_="tqtongji2")
                     or forecast_container.find_all("li"))[:7]

    for idx, item in enumerate(day_items, 1):
        try:
            # 尝试多种日期选择器
            date_elem = item.find("h1") or item.find("div", class_="tqtongji21") or item.find("span")
            date = date_elem.text.strip() if date_elem else f"第{idx}天"

            # 天气信息选择器
            weather_elem = item.find("p", class_="wea") or item.find("div", class_="tqtongji22")
            weather_info = weather_elem.text.strip() if weather_elem else "未知天气"

            # 温度选择器
            temp_elem = item.find("p", class_="tem") or item.find("div", class_="tqtongji23")
            temperature = temp_elem.text.strip() if temp_elem else "未知温度"

            weather_list.append({
                "序号": idx,
                "地区": area,
                "日期": date,
                "天气信息": weather_info,
                "温度": temperature
            })
        except Exception as e:
            print(f"解析第{idx}天数据时出错:{str(e)}")
            continue

    return weather_list


def save_weather_to_db(conn, weather_data):
    cursor = conn.cursor()
    try:
        for data in weather_data:
            cursor.execute('''
                SELECT id FROM weather WHERE area = ? AND date = ?
            ''', (data["地区"], data["日期"]))
            if not cursor.fetchone():
                cursor.execute('''
                    INSERT INTO weather (area, date, weather_info, temperature)
                    VALUES (?, ?, ?, ?)
                ''', (data["地区"], data["日期"], data["天气信息"], data["温度"]))
        conn.commit()
        print(f"成功保存{len(weather_data)}条数据到数据库")
    except Exception as e:
        conn.rollback()
        print(f"数据插入失败:{str(e)}")


def main_weather_crawl():
    # 使用真实的中国天气网URL
    target_cities = {
        "北京": "101010100",
        "上海": "101020100",
        "广州": "101280101"
    }

    conn = init_weather_database()
    all_weather_data = []

    for city_name, city_id in target_cities.items():
        # 使用真实的中国天气网URL格式
        weather_url = f"http://www.weather.com.cn/weather/{city_id}.shtml"
        print(f"开始爬取{city_name}天气:{weather_url}")

        html = get_weather_html(weather_url)
        if not html:
            print(f"获取{city_name}页面失败,跳过")
            continue

        weather_data = parse_weather_data(html, city_name)
        if not weather_data:
            print(f"解析{city_name}天气数据失败,跳过")
            continue

        all_weather_data.extend(weather_data)
        save_weather_to_db(conn, weather_data)
        print(f"{city_name}数据存储完成!获取到{len(weather_data)}天数据\n")

    conn.close()

    if all_weather_data:
        # 生成Excel表格
        df = pd.DataFrame(all_weather_data)
        df.to_excel("weather_data.xlsx", index=False)
        print("天气数据表格已保存至 weather_data.xlsx")

        # 终端打印表格
        headers = ["序号", "地区", "日期", "天气信息", "温度"]
        table_data = [list(data.values()) for data in all_weather_data]
        print(tabulate.tabulate(table_data, headers=headers, tablefmt="grid"))

        print(f"\n总共爬取到{len(all_weather_data)}条天气数据")
    else:
        print("未能获取到任何天气数据,请检查网络连接或网站结构")


if __name__ == "__main__":
    main_weather_crawl()

Run results:
(screenshots of the run results)

2) Reflections: The key takeaway from the weather crawler project is that production-grade crawler development relies on multiple defensive strategies (browser emulation, dynamic encoding detection, fallback parsing paths) to cope with the uncertainties of real network environments, so the program keeps running even when the site structure changes or the network fluctuates.
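
As a small illustration of the "dynamic encoding detection + fallback parsing" idea mentioned above, here is a minimal sketch in the same spirit (not part of the submitted script; the helper names and selector list are illustrative):

import requests
from bs4 import BeautifulSoup


def fetch_html(url, timeout=10):
    """Fetch a page and detect the encoding instead of hard-coding utf-8."""
    headers = {"User-Agent": "Mozilla/5.0"}
    resp = requests.get(url, headers=headers, timeout=timeout)
    resp.raise_for_status()
    # apparent_encoding is guessed by requests from the response body
    resp.encoding = resp.apparent_encoding or resp.encoding
    return resp.text


def first_match(soup, selectors):
    """Return the first element matched by any CSS selector in the fallback list."""
    for sel in selectors:
        node = soup.select_one(sel)
        if node:
            return node
    return None


# usage sketch: try several containers in order, as the main script does
# html = fetch_html("http://www.weather.com.cn/weather/101010100.shtml")
# soup = BeautifulSoup(html, "html.parser")
# container = first_match(soup, ["ul.t.clearfix", "div#7d", "div.tqtongji1"])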

Assignment ②
– Requirement: Use the requests and BeautifulSoup libraries to crawl targeted stock information and store it in a database.
– Candidate sites: Eastmoney: https://www.eastmoney.com/
Sina Finance (stocks): http://finance.sina.com.cn/stock/
– Tip: Open the F12 developer tools in Chrome to capture packets, find the URL used to load the stock list, and analyze the values returned by the API; the request parameters can then be adjusted to match the fields you need. From the URL you can see that parameters such as f1 and f2 correspond to different values, and unneeded parameters can be trimmed as appropriate.
Reference: https://zhuanlan.zhihu.com/p/50099084
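
To make the tip concrete, here is a minimal probe of the same list endpoint that keeps only a few fields (f12 = code, f14 = name, f2 = latest price, f3 = change rate). It assumes the endpoint returns plain JSON when the cb callback parameter is omitted and that unused f* fields can simply be dropped; check against the captured traffic if the interface behaves differently:

import requests

# Trimmed-down probe of the list API (assumption: without the 'cb' JSONP callback
# the endpoint returns plain JSON, and unused f* fields can be dropped).
url = "http://push2.eastmoney.com/api/qt/clist/get"
params = {
    "pn": 1,              # page number
    "pz": 5,              # page size
    "po": 1, "np": 1, "fltt": 2, "invt": 2,
    "fid": "f3",          # sort by change rate
    "fs": "m:0 t:6,m:0 t:80,m:1 t:2,m:1 t:23",   # same market filter as the full script
    "fields": "f12,f14,f2,f3",                   # code, name, latest price, change rate
}
resp = requests.get(url, params=params,
                    headers={"User-Agent": "Mozilla/5.0", "Referer": "http://quote.eastmoney.com/"},
                    timeout=10)
for item in (resp.json().get("data") or {}).get("diff", []):
    print(item.get("f12"), item.get("f14"), item.get("f2"), item.get("f3"))
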
1) Experiment code

import requests
import json
import sqlite3
import time
from datetime import datetime


class StockSpider:
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Referer': 'http://quote.eastmoney.com/'
        }
        self.base_url = "http://push2.eastmoney.com/api/qt/clist/get"

    def get_stock_params(self, page=1):
        """构造请求参数"""
        params = {
            'cb': 'jQuery112406995981980579432_1625123456789',
            'pn': page,
            'pz': 20,  # 每页数量
            'po': 1,
            'np': 1,
            'ut': 'bd1d9ddb04089700cf9c27f6f7426281',
            'fltt': 2,
            'invt': 2,
            'fid': 'f3',
            'fs': 'm:0 t:6,m:0 t:80,m:1 t:2,m:1 t:23',
            'fields': 'f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152',
            '_': str(int(time.time() * 1000))
        }
        return params

    def parse_stock_data(self, data):
        """解析股票数据"""
        stocks = []
        if data and 'diff' in data:
            for item in data['diff']:
                stock = {
                    'code': item.get('f12', ''),  # 股票代码
                    'name': item.get('f14', ''),  # 股票名称
                    'current_price': item.get('f2', 0),  # 当前价格
                    'change_amount': item.get('f4', 0),  # 涨跌额
                    'change_rate': item.get('f3', 0),  # 涨跌幅
                    'volume': item.get('f5', 0),  # 成交量(手)
                    'turnover': item.get('f6', 0),  # 成交额
                    'amplitude': item.get('f7', 0),  # 振幅
                    'high_price': item.get('f15', 0),  # 最高价
                    'low_price': item.get('f16', 0),  # 最低价
                    'open_price': item.get('f17', 0),  # 开盘价
                    'close_price': item.get('f18', 0),  # 昨收价
                    'pe_ratio': item.get('f9', 0),  # 市盈率
                    'pb_ratio': item.get('f23', 0),  # 市净率
                    'total_market_cap': item.get('f20', 0),  # 总市值
                    'circulating_market_cap': item.get('f21', 0),  # 流通市值
                    'update_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                }
                stocks.append(stock)
        return stocks

    def create_database(self):
        """创建股票数据库"""
        conn = sqlite3.connect('stocks.db')
        cursor = conn.cursor()

        create_table_sql = '''
        CREATE TABLE IF NOT EXISTS stocks (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            code TEXT NOT NULL,
            name TEXT NOT NULL,
            current_price REAL,
            change_amount REAL,
            change_rate REAL,
            volume REAL,
            turnover REAL,
            amplitude REAL,
            high_price REAL,
            low_price REAL,
            open_price REAL,
            close_price REAL,
            pe_ratio REAL,
            pb_ratio REAL,
            total_market_cap REAL,
            circulating_market_cap REAL,
            update_time TEXT
        )
        '''
        cursor.execute(create_table_sql)
        conn.commit()
        conn.close()

    def save_stocks_to_db(self, stocks):
        """保存股票数据到数据库"""
        conn = sqlite3.connect('stocks.db')
        cursor = conn.cursor()

        for stock in stocks:
            # 检查是否已存在该股票数据
            cursor.execute("SELECT id FROM stocks WHERE code = ?", (stock['code'],))
            exists = cursor.fetchone()

            if exists:
                # 更新数据
                update_sql = '''
                UPDATE stocks SET 
                    current_price=?, change_amount=?, change_rate=?, volume=?, turnover=?,
                    amplitude=?, high_price=?, low_price=?, open_price=?, close_price=?,
                    pe_ratio=?, pb_ratio=?, total_market_cap=?, circulating_market_cap=?,
                    update_time=?
                WHERE code=?
                '''
                cursor.execute(update_sql, (
                    stock['current_price'], stock['change_amount'], stock['change_rate'],
                    stock['volume'], stock['turnover'], stock['amplitude'], stock['high_price'],
                    stock['low_price'], stock['open_price'], stock['close_price'], stock['pe_ratio'],
                    stock['pb_ratio'], stock['total_market_cap'], stock['circulating_market_cap'],
                    stock['update_time'], stock['code']
                ))
            else:
                # 插入新数据
                insert_sql = '''
                INSERT INTO stocks (
                    code, name, current_price, change_amount, change_rate, volume, turnover,
                    amplitude, high_price, low_price, open_price, close_price, pe_ratio,
                    pb_ratio, total_market_cap, circulating_market_cap, update_time
                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                '''
                cursor.execute(insert_sql, (
                    stock['code'], stock['name'], stock['current_price'], stock['change_amount'],
                    stock['change_rate'], stock['volume'], stock['turnover'], stock['amplitude'],
                    stock['high_price'], stock['low_price'], stock['open_price'], stock['close_price'],
                    stock['pe_ratio'], stock['pb_ratio'], stock['total_market_cap'],
                    stock['circulating_market_cap'], stock['update_time']
                ))

        conn.commit()
        conn.close()

    def crawl_stocks(self, pages=5):
        """爬取多页股票数据"""
        all_stocks = []

        for page in range(1, pages + 1):
            print(f"正在爬取第{page}页股票数据...")

            try:
                params = self.get_stock_params(page)
                response = requests.get(self.base_url, params=params, headers=self.headers, timeout=10)

                # 处理JSONP响应
                content = response.text
                if content.startswith('jQuery'):
                    # 提取JSON数据
                    json_str = content[content.find('(') + 1:content.rfind(')')]
                    data = json.loads(json_str)

                    stocks = self.parse_stock_data(data.get('data', {}))
                    all_stocks.extend(stocks)

                    print(f"第{page}页获取到{len(stocks)}只股票数据")

                time.sleep(1)  # 延迟避免请求过快

            except Exception as e:
                print(f"第{page}页爬取失败: {e}")
                continue

        return all_stocks

    def run(self):
        """主运行函数"""
        print("开始爬取股票数据...")
        self.create_database()

        stocks = self.crawl_stocks(pages=5)

        if stocks:
            self.save_stocks_to_db(stocks)
            print(f"成功保存{len(stocks)}只股票数据到数据库")
        else:
            print("未获取到股票数据")


if __name__ == "__main__":
    spider = StockSpider()
    spider.run()

Run results:
(screenshots of the run results)
2) Reflections: Through this experiment I learned how to crawl stock data by combining requests and BeautifulSoup with API packet capture, came to appreciate how crucial analyzing the interface parameters and data structures is for targeted crawling, and saw the practical value of database storage for data persistence.

Assignment ③:
– Requirement: Crawl the information of all universities in the 2021 main ranking of Chinese universities (https://www.shanghairanking.cn/rankings/bcur/2021) and store it in a database; also record the browser F12 debugging/analysis process as a GIF and include it in the blog post.
– Tip: Analyze the site's network requests and identify the API that returns the data.
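
For reference, here is a minimal probe of the endpoint found through F12 (the same URL and parameters used by the script below); printing the top-level keys is a quick way to confirm where the ranking list lives before writing the full parser. The expected key names are an assumption and should be checked against the actual response:

import requests

# Probe the ranking API discovered via F12 (same endpoint as the full script below).
url = "https://www.shanghairanking.cn/api/pub/v1/bcur"
resp = requests.get(url, params={"bcur_type": 11, "year": 2021},
                    headers={"User-Agent": "Mozilla/5.0"}, timeout=20)
payload = resp.json()
print(list(payload.keys()))                       # top-level structure of the response
rankings = (payload.get("data") or {}).get("rankings", [])
print(len(rankings))                              # number of universities returned
if rankings:
    sample = rankings[0]
    print({k: sample.get(k) for k in ("ranking", "univNameCn", "province", "score")})
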
1) Experiment code

import requests
import sqlite3
import pandas as pd
from datetime import datetime
import time
import os


class UniversityRankingSpider:
    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Referer": "https://www.shanghairanking.cn/rankings/bcur/2021",
            "Accept": "application/json, text/plain, */*",
        }
        self.api_url = "https://www.shanghairanking.cn/api/pub/v1/bcur"
        self.params = {"bcur_type": 11, "year": 2021}

    def create_database(self):
        """创建数据库表结构 - 已修复"""
        conn = sqlite3.connect('university_ranking.db')
        cursor = conn.cursor()

        # 删除已存在的表(确保重新创建)
        cursor.execute('DROP TABLE IF EXISTS university_ranking')

        # 创建大学排名表 - 包含所有必要字段
        cursor.execute('''
            CREATE TABLE university_ranking (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                ranking INTEGER NOT NULL,
                name TEXT NOT NULL,
                score REAL,
                province TEXT,
                category TEXT,
                year INTEGER,
                crawl_time TEXT,
                UNIQUE(ranking, name, year)
            )
        ''')

        conn.commit()
        conn.close()
        print("数据库创建成功")

    def fetch_rankings(self):
        """从API获取大学排名数据"""
        print("开始从API获取大学排名数据...")
        print(f"API地址: {self.api_url}")
        print(f"请求参数: {self.params}")

        try:
            # 发送API请求
            response = requests.get(
                self.api_url,
                params=self.params,
                headers=self.headers,
                timeout=20
            )
            response.raise_for_status()  # 检查HTTP状态码

            print(f"API响应状态: {response.status_code}")

            # 解析JSON数据
            data = response.json()
            print(f"API响应数据结构: {list(data.keys())}")

            # 关键数据在 data.rankings
            lst = (data.get("data") or {}).get("rankings", [])
            print(f"获取到 {len(lst)} 条大学数据")

            # 处理数据
            rows = []
            for it in lst:
                rank = it.get("ranking") or it.get("rank")
                name = it.get("univNameCn") or it.get("univName")
                province = it.get("province")  # 省市
                utype = it.get("univCategory")  # 类型:综合/理工/...
                score = it.get("score")

                if rank and name:
                    rows.append({
                        'ranking': int(rank),
                        'name': str(name),
                        'province': str(province or ""),
                        'category': str(utype or ""),
                        'score': float(score or 0),
                        'year': 2021,
                        'crawl_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                    })

            return rows

        except requests.exceptions.RequestException as e:
            print(f"网络请求错误: {e}")
            return None
        except Exception as e:
            print(f"数据处理错误: {e}")
            return None

    def save_to_database(self, universities):
        """保存数据到数据库"""
        if not universities:
            print("没有数据需要保存")
            return 0

        conn = sqlite3.connect('university_ranking.db')
        cursor = conn.cursor()

        success_count = 0
        for uni in universities:
            try:
                cursor.execute('''
                    INSERT INTO university_ranking 
                    (ranking, name, score, province, category, year, crawl_time)
                    VALUES (?, ?, ?, ?, ?, ?, ?)
                ''', (
                    uni['ranking'],
                    uni['name'],
                    uni['score'],
                    uni['province'],
                    uni['category'],
                    uni['year'],
                    uni['crawl_time']
                ))
                success_count += 1
            except Exception as e:
                print(f"保存 {uni['name']} 失败: {e}")

        conn.commit()
        conn.close()

        print(f"成功保存 {success_count} 条数据到数据库")
        return success_count

    def display_table(self, universities, count=30):
        """以表格形式展示数据"""
        if not universities:
            print("没有数据可以展示")
            return None

        # 限制显示数量
        display_data = universities[:count]

        # 创建DataFrame
        df = pd.DataFrame(display_data)
        df = df[['ranking', 'name', 'score', 'province', 'category']]

        # 设置列名
        df.columns = ['排名', '学校名称', '总分', '省份', '类型']

        # 设置索引从1开始
        df.index = range(1, len(df) + 1)

        # 设置显示选项
        pd.set_option('display.unicode.ambiguous_as_wide', True)
        pd.set_option('display.unicode.east_asian_width', True)
        pd.set_option('display.width', 120)
        pd.set_option('display.max_colwidth', 20)

        # 打印表格
        print("\n" + "=" * 80)
        print("2021年中国大学排名(前30名)")
        print("=" * 80)
        print(df.to_string(index=True, header=True))
        print("=" * 80)
        print(f"共显示 {len(display_data)} 所大学数据")

        return df

    def export_to_excel(self, universities, filename=None):
        """导出数据到Excel - 修复并增强功能"""
        if not universities:
            print("没有数据可以导出")
            return None

        if not filename:
            filename = f"university_ranking_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx"

        try:
            # 创建DataFrame
            df = pd.DataFrame(universities)

            # 选择需要的列并重命名
            df = df[['ranking', 'name', 'score', 'province', 'category', 'year', 'crawl_time']]
            df.columns = ['排名', '学校名称', '总分', '省份', '类型', '年份', '爬取时间']

            # 按排名排序
            df = df.sort_values('排名')

            # 导出到Excel
            df.to_excel(filename, index=False, engine='openpyxl')

            # 获取文件绝对路径
            abs_path = os.path.abspath(filename)

            print(f"✓ 数据已成功导出到Excel文件: {abs_path}")
            print(f"✓ 共导出 {len(df)} 条记录")

            return abs_path

        except ImportError:
            print(" 导出失败: 请安装openpyxl库 - pip install openpyxl")
            return None
        except Exception as e:
            print(f" 导出Excel失败: {e}")
            return None

    def run(self):
        """主运行函数 """
        print("=" * 60)
        print("中国大学排名爬虫 - 基于API分析")
        print("=" * 60)

        # 创建数据库
        self.create_database()

        # 获取数据
        universities = self.fetch_rankings()

        if universities:
            print(f"\n成功获取 {len(universities)} 所大学数据")

            # 保存到数据库
            saved_count = self.save_to_database(universities)

            # 显示表格
            self.display_table(universities, 30)

            # 导出到Excel - 这是新增的关键调用
            excel_file = self.export_to_excel(universities)

            if excel_file:
                print(f"\n🎉 所有操作完成!")
                print(f"   - 数据库: university_ranking.db")
                print(f"   - Excel文件: {excel_file}")
                print(f"   - 总记录数: {len(universities)} 条")
            else:
                print("\n 数据保存到数据库成功,但Excel导出失败")
        else:
            print("未能获取大学排名数据")


# 运行爬虫
if __name__ == "__main__":
    spider = UniversityRankingSpider()
    spider.run()

Run results:
(screenshots of the run results; additional screenshots omitted)

Data Collection Assignment 2

2) Reflections
Through this university ranking crawler project, I came to appreciate the importance of API analysis and JSON parsing in data crawling. While handling database storage, I learned that the table schema must strictly match the data fields. Successfully persisting the data to SQLite and Excel improved my data-processing and debugging skills.
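
As a quick check of the "schema must match the data fields" point, the following sketch (assuming university_ranking.db was produced by the script above) reads the table definition and a few stored rows back out of SQLite:

import sqlite3

# Verify that the stored table columns line up with the fields that were inserted
# (assumes university_ranking.db was created by the script above).
conn = sqlite3.connect("university_ranking.db")
cursor = conn.cursor()

cursor.execute("PRAGMA table_info(university_ranking)")
columns = [row[1] for row in cursor.fetchall()]    # row[1] is the column name
print("columns:", columns)

expected = {"ranking", "name", "score", "province", "category", "year", "crawl_time"}
print("missing fields:", expected - set(columns))  # should print an empty set

cursor.execute("SELECT ranking, name, score FROM university_ranking ORDER BY ranking LIMIT 5")
for row in cursor.fetchall():
    print(row)
conn.close()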

posted @ 2025-11-05 14:33  ls樾