102302139 尚子骐 Data Collection and Fusion, Assignment 3

  • Assignment 1

1. Complete Code and Run Results

Single-threaded version (complete code)

import requests
from bs4 import BeautifulSoup
import os
import time
from urllib.parse import urljoin, urlparse

# 配置参数
BASE_URL = "http://www.weather.com.cn"
MAX_PAGES = 39  # 总页数限制
MAX_IMAGES = 139  # 总图片数限制
SAVE_DIR = "./images"  # 图片保存目录
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

def init_save_dir():
    """初始化图片保存目录"""
    if not os.path.exists(SAVE_DIR):
        os.makedirs(SAVE_DIR)
        print(f"创建图片保存目录:{os.path.abspath(SAVE_DIR)}")
    else:
        print(f"图片保存目录已存在:{os.path.abspath(SAVE_DIR)}")

def get_page_urls():
    """获取前39个待爬页面URL"""
    page_urls = [BASE_URL]  # 首页
    try:
        response = requests.get(BASE_URL, headers=HEADERS, timeout=10)
        response.encoding = "utf-8"
        soup = BeautifulSoup(response.text, "html.parser")
        # 提取导航栏链接
        nav_div = soup.find("div", class_="nav")
        if nav_div:
            nav_links = nav_div.find_all("a")
            for link in nav_links[:MAX_PAGES-1]:  # 补充剩余38个页面
                href = link.get("href")
                if href:
                    full_url = urljoin(BASE_URL, href)
                    # 过滤本站有效页面
                    if urlparse(full_url).netloc.endswith("weather.com.cn") and full_url.endswith((".shtml", ".html")):
                        if full_url not in page_urls:
                            page_urls.append(full_url)
        page_urls = page_urls[:MAX_PAGES]  # 确保不超过39页
        print(f"成功获取 {len(page_urls)} 个待爬页面")
        return page_urls
    except Exception as e:
        print(f"获取页面URL失败:{str(e)}")
        return [BASE_URL]

def download_image(image_url, save_path):
    """下载单张图片"""
    try:
        response = requests.get(image_url, headers=HEADERS, timeout=15, stream=True)
        if response.status_code == 200:
            with open(save_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
            return True
        else:
            print(f"图片下载失败(状态码:{response.status_code}):{image_url}")
            return False
    except Exception as e:
        print(f"图片下载异常:{str(e)} | URL:{image_url}")
        return False

def main():
    init_save_dir()
    page_urls = get_page_urls()
    downloaded_count = 0
    image_urls = set()  # 去重

    print("\n开始单线程爬取图片...")
    start_time = time.time()

    for page_idx, page_url in enumerate(page_urls, 1):
        print(f"\n=== 爬取第 {page_idx}/{len(page_urls)} 页:{page_url} ===")
        try:
            response = requests.get(page_url, headers=HEADERS, timeout=10)
            response.encoding = "utf-8"
            soup = BeautifulSoup(response.text, "html.parser")
            img_tags = soup.find_all("img")
            print(f"该页面找到 {len(img_tags)} 个图片标签")

            for img in img_tags:
                if downloaded_count >= MAX_IMAGES:
                    print(f"已达到最大下载数 {MAX_IMAGES},停止爬取")
                    break
                # 获取图片真实URL
                img_src = img.get("src") or img.get("data-src")
                if not img_src:
                    continue
                full_img_url = urljoin(BASE_URL, img_src)
                # 过滤有效图片格式
                if full_img_url.lower().endswith((".jpg", ".jpeg", ".png", ".gif")) and full_img_url not in image_urls:
                    image_urls.add(full_img_url)
                    # 生成保存文件名
                    img_name = full_img_url.split("/")[-1].split("?")[0]
                    img_name = img_name[-50:] if len(img_name) > 50 else img_name  # 限制文件名长度
                    save_path = os.path.join(SAVE_DIR, img_name)
                    # 下载
                    if download_image(full_img_url, save_path):
                        downloaded_count += 1
                        print(f"[{downloaded_count}/{MAX_IMAGES}] 下载成功:{full_img_url}")
                        print(f"保存路径:{save_path}")
        except Exception as e:
            print(f"爬取页面 {page_url} 失败:{str(e)}")
            continue

    end_time = time.time()
    print(f"\n=== 单线程爬取完成 ===")
    print(f"总爬取页面数:{len(page_urls)}")
    print(f"总下载图片数:{downloaded_count}")
    print(f"爬取耗时:{end_time - start_time:.2f} 秒")
    print(f"所有图片URL:{image_urls}")

if __name__ == "__main__":
    main()

Multi-threaded version (complete code)

import requests
from bs4 import BeautifulSoup
import os
import time
import threading
from queue import Queue, Empty
from urllib.parse import urljoin, urlparse

# 配置参数
BASE_URL = "http://www.weather.com.cn"
MAX_PAGES = 39  # 学号后2位
MAX_IMAGES = 139  # 学号后3位
SAVE_DIR = "./images"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
THREAD_NUM = 5  # 线程数

# 线程安全全局变量
image_queue = Queue()  # 图片URL队列
downloaded_count = 0  # 已下载计数
lock = threading.Lock()  # 锁保护

def init_save_dir():
    if not os.path.exists(SAVE_DIR):
        os.makedirs(SAVE_DIR)
        print(f"创建图片保存目录:{os.path.abspath(SAVE_DIR)}")

def get_page_urls():
    """获取前39个待爬页面"""
    page_urls = [BASE_URL]
    try:
        response = requests.get(BASE_URL, headers=HEADERS, timeout=10)
        response.encoding = "utf-8"
        soup = BeautifulSoup(response.text, "html.parser")
        nav_div = soup.find("div", class_="nav")
        if nav_div:
            nav_links = nav_div.find_all("a")
            for link in nav_links[:MAX_PAGES-1]:
                href = link.get("href")
                if href:
                    full_url = urljoin(BASE_URL, href)
                    if urlparse(full_url).netloc.endswith("weather.com.cn") and full_url.endswith((".shtml", ".html")):
                        if full_url not in page_urls:
                            page_urls.append(full_url)
        page_urls = page_urls[:MAX_PAGES]
        print(f"成功获取 {len(page_urls)} 个待爬页面")
        return page_urls
    except Exception as e:
        print(f"获取页面URL失败:{str(e)}")
        return [BASE_URL]

def extract_image_urls(page_url):
    """提取单个页面的图片URL到队列"""
    try:
        response = requests.get(page_url, headers=HEADERS, timeout=10)
        response.encoding = "utf-8"
        soup = BeautifulSoup(response.text, "html.parser")
        img_tags = soup.find_all("img")
        for img in img_tags:
            img_src = img.get("src") or img.get("data-src")
            if img_src and img_src.lower().endswith((".jpg", ".jpeg", ".png", ".gif")):
                full_img_url = urljoin(BASE_URL, img_src)
                image_queue.put(full_img_url)
        print(f"页面 {page_url} 提取 {len(img_tags)} 个图片标签,已加入队列")
    except Exception as e:
        print(f"提取页面 {page_url} 图片失败:{str(e)}")

def download_worker():
    """下载线程工作函数"""
    global downloaded_count
    while True:
        with lock:
            if downloaded_count >= MAX_IMAGES:
                break
        try:
            image_url = image_queue.get(timeout=5)  # 队列超时5秒
            # 生成保存路径
            img_name = image_url.split("/")[-1].split("?")[0]
            img_name = img_name[-50:] if len(img_name) > 50 else img_name
            save_path = os.path.join(SAVE_DIR, img_name)
            # 去重(跳过已存在图片)
            if os.path.exists(save_path):
                print(f"图片已存在,跳过:{image_url}")
                image_queue.task_done()
                continue
            # 下载图片
            response = requests.get(image_url, headers=HEADERS, timeout=15, stream=True)
            if response.status_code == 200:
                with open(save_path, "wb") as f:
                    for chunk in response.iter_content(chunk_size=1024):
                        if chunk:
                            f.write(chunk)
                # 原子更新计数
                with lock:
                    downloaded_count += 1
                print(f"[{downloaded_count}/{MAX_IMAGES}] 下载成功:{image_url}")
                print(f"保存路径:{save_path}")
            else:
                print(f"下载失败(状态码:{response.status_code}):{image_url}")
            image_queue.task_done()
        except Empty:
            print("图片队列为空,线程退出")
            break
        except Exception as e:
            print(f"下载异常:{str(e)} | URL:{image_url}")
            image_queue.task_done()

def main():
    init_save_dir()
    page_urls = get_page_urls()
    start_time = time.time()

    print("\n开始多线程爬取图片...")
    # 第一步:启动线程提取图片URL
    extract_threads = []
    for page_url in page_urls:
        t = threading.Thread(target=extract_image_urls, args=(page_url,))
        extract_threads.append(t)
        t.start()
    # 等待所有提取线程完成
    for t in extract_threads:
        t.join()

    # 第二步:启动下载线程
    download_threads = []
    for _ in range(THREAD_NUM):
        t = threading.Thread(target=download_worker)
        download_threads.append(t)
        t.start()
    # 等待下载线程完成
    for t in download_threads:
        t.join()

    end_time = time.time()
    print(f"\n=== 多线程爬取完成 ===")
    print(f"总爬取页面数:{len(page_urls)}")
    print(f"总下载图片数:{downloaded_count}")
    print(f"爬取耗时:{end_time - start_time:.2f} 秒")

if __name__ == "__main__":
    main()

Run results and downloaded images

(Screenshots: crawler console output and the downloaded image files)

2. Reflections

1) Crawl-scope logic
The start page is http://www.weather.com.cn.
From the homepage navigation bar (the div with class="nav"), up to 38 additional sub-pages are collected, so that the total including the homepage is capped by MAX_PAGES = 39.
Only pages on the site itself (domain ending in weather.com.cn) whose URLs end in .shtml or .html are kept, as the small example below illustrates.
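A relative nav link is resolved with urljoin and then checked with urlparse; a tiny illustration (the href value here is made up, not taken from the real page):

from urllib.parse import urljoin, urlparse

href = "/forecast/index.shtml"  # hypothetical nav link
full_url = urljoin("http://www.weather.com.cn", href)
print(full_url)  # http://www.weather.com.cn/forecast/index.shtml
is_valid = (urlparse(full_url).netloc.endswith("weather.com.cn")
            and full_url.endswith((".shtml", ".html")))
print(is_valid)  # True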

2) Single-threaded flow
get_page_urls() collects the 39 page URLs to crawl.
The script then loops over these pages one by one, sends an HTTP request for each, parses the HTML with BeautifulSoup, extracts all <img> tags, and keeps only image URLs ending in .jpg/.jpeg/.png/.gif.

3) Multi-threaded flow
Image-URL extraction runs concurrently: one thread per page parses the page and pushes the image URLs it finds into the image_queue.
Then 5 download threads (THREAD_NUM = 5) pull URLs from image_queue and download them.

4) Takeaway: the single-threaded version executes strictly serially, handling one task at a time, while the multi-threaded version processes several tasks concurrently. For this I/O-bound workload the single-threaded run not only takes noticeably longer, it also spends most of its wall-clock time idle, waiting on network responses.
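The same idea can be expressed with much less boilerplate using concurrent.futures. The sketch below is not part of the assignment code; it assumes the download_image() helper from the single-threaded script above and an already-collected list of image URLs.

# Minimal sketch: concurrent downloads with a thread pool.
# Assumes download_image(url, path) from the single-threaded script above.
import os
from concurrent.futures import ThreadPoolExecutor, as_completed

def download_all(image_urls, save_dir="./images", workers=5):
    os.makedirs(save_dir, exist_ok=True)
    results = []
    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = {
            pool.submit(
                download_image, url,
                os.path.join(save_dir, url.split("/")[-1].split("?")[0])
            ): url
            for url in image_urls
        }
        for future in as_completed(futures):
            results.append((futures[future], future.result()))
    return results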

  • Assignment 2

1. Complete Code and Run Results

Core spider code

import scrapy
import json
from stock_spider.items import StockItem

class EastMoneySpider(scrapy.Spider):
    name = 'eastmoney'
    allowed_domains = ['eastmoney.com']
    # 真实数据接口(分页参数:pn=页码,pz=每页条数)
    start_urls = ['https://push2.eastmoney.com/api/qt/clist/get?pn=1&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152&_=1732400000000']

    def parse(self, response):
        # 解析接口返回的 JSON 数据
        json_data = json.loads(response.text)
        if json_data.get('data') and json_data['data'].get('diff'):
            stock_list = json_data['data']['diff']  # 股票数据列表
            for idx, stock in enumerate(stock_list, 1):
                item = StockItem()
                item['id'] = str(idx)
                item['stock_no'] = str(stock.get('f12', ''))  # 股票代码
                item['stock_name'] = stock.get('f14', '').strip()  # 股票名称
                item['latest_price'] = str(stock.get('f2', 0))  # 最新报价
                item['price_change_rate'] = f"{stock.get('f3', 0)}%"  # 涨跌幅
                item['price_change'] = str(stock.get('f4', 0))  # 涨跌额
                item['volume'] = str(stock.get('f5', 0))  # 成交量(手)
                item['amplitude'] = f"{stock.get('f7', 0)}%"  # 振幅
                item['highest'] = str(stock.get('f15', 0))  # 最高
                item['lowest'] = str(stock.get('f16', 0))  # 最低
                item['open_today'] = str(stock.get('f17', 0))  # 今开
                item['close_yesterday'] = str(stock.get('f18', 0))  # 昨收

                # 过滤无效数据
                if item['stock_no'] and item['stock_name']:
                    yield item

            # 分页爬取(自动翻页,爬取前5页示例)
            current_page = int(json_data['data'].get('pn', 1))
            total_page = int(json_data['data'].get('totalPage', 1))
            if current_page < 5:  # 限制爬取页数,避免爬取过多
                next_page = current_page + 1
                # 构造下一页 URL
                next_url = f'https://push2.eastmoney.com/api/qt/clist/get?pn={next_page}&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152&_=1732400000000'
                yield scrapy.Request(url=next_url, callback=self.parse)

Project items.py code

import scrapy

class StockItem(scrapy.Item):
    id = scrapy.Field()          # 序号
    stock_no = scrapy.Field()    # 股票代码(bStockNo)
    stock_name = scrapy.Field()  # 股票名称(bStockName)
    latest_price = scrapy.Field()# 最新报价(fLatestPrice)
    price_change_rate = scrapy.Field()# 涨跌幅(fPriceChangeRate)
    price_change = scrapy.Field()# 涨跌额(fPriceChange)
    volume = scrapy.Field()      # 成交量(fVolume)
    amplitude = scrapy.Field()   # 振幅(fAmplitude)
    highest = scrapy.Field()     # 最高(fHighest)
    lowest = scrapy.Field()      # 最低(fLowest)
    open_today = scrapy.Field()  # 今开(fOpenToday)
    close_yesterday = scrapy.Field()# 昨收(fCloseYesterday)

Project pipelines.py code

# stock_spider/stock_spider/pipelines.py
import pymysql
from itemadapter import ItemAdapter
from scrapy.utils.project import get_project_settings


class MySQLPipeline:
    def __init__(self):
        # 从 settings.py 读取 MySQL 配置(无需手动写死密码)
        settings = get_project_settings()
        self.host = settings.get('MYSQL_HOST', 'localhost')
        self.user = settings.get('MYSQL_USER', 'root')
        self.password = settings.get('MYSQL_PASSWORD', '')  # 读取 settings 中的密码
        self.database = settings.get('MYSQL_DATABASE', 'spider_db')
        self.port = settings.get('MYSQL_PORT', 3306)

    def open_spider(self, spider):
        # 连接 MySQL 数据库(数据库 spider_db 需提前创建好,且账号具有相应权限)
        self.db = pymysql.connect(
            host=self.host,
            user=self.user,
            password=self.password,
            database=self.database,
            port=self.port,
            charset='utf8mb4'
        )
        self.cursor = self.db.cursor()

        # 创建股票数据表(字段名和爬虫 item 完全匹配,避免字段不存在报错)
        create_table_sql = """
        CREATE TABLE IF NOT EXISTS stock_info (
            id VARCHAR(10) NOT NULL COMMENT '序号',
            stock_no VARCHAR(20) NOT NULL COMMENT '股票代码',
            stock_name VARCHAR(50) NOT NULL COMMENT '股票名称',
            latest_price VARCHAR(20) DEFAULT '0' COMMENT '最新报价',
            price_change_rate VARCHAR(20) DEFAULT '0%' COMMENT '涨跌幅',
            price_change VARCHAR(20) DEFAULT '0' COMMENT '涨跌额',
            volume VARCHAR(20) DEFAULT '0' COMMENT '成交量(手)',
            amplitude VARCHAR(20) DEFAULT '0%' COMMENT '振幅',
            highest VARCHAR(20) DEFAULT '0' COMMENT '最高',
            lowest VARCHAR(20) DEFAULT '0' COMMENT '最低',
            open_today VARCHAR(20) DEFAULT '0' COMMENT '今开',
            close_yesterday VARCHAR(20) DEFAULT '0' COMMENT '昨收',
            PRIMARY KEY (stock_no)  # 股票代码唯一,作为主键避免重复数据
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='东方财富网A股股票信息表';
        """
        self.cursor.execute(create_table_sql)
        self.db.commit()
        spider.logger.info("MySQL 数据库连接成功,数据表创建/验证完成!")

    def close_spider(self, spider):
        # 关闭数据库连接
        self.cursor.close()
        self.db.close()
        spider.logger.info("MySQL 数据库连接已关闭!")

    def process_item(self, item, spider):
        # 插入/更新数据(字段顺序与表结构完全一致)
        insert_sql = """
        INSERT INTO stock_info (
            id, stock_no, stock_name, latest_price, price_change_rate,
            price_change, volume, amplitude, highest, lowest,
            open_today, close_yesterday
        ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        ON DUPLICATE KEY UPDATE  # 股票代码重复时,自动更新最新数据
            latest_price = VALUES(latest_price),
            price_change_rate = VALUES(price_change_rate),
            price_change = VALUES(price_change),
            volume = VALUES(volume),
            amplitude = VALUES(amplitude),
            highest = VALUES(highest),
            lowest = VALUES(lowest),
            open_today = VALUES(open_today),
            close_yesterday = VALUES(close_yesterday);
        """
        try:
            # 提取爬虫 item 中的数据(顺序与 SQL 字段对应,避免错位)
            data = (
                item.get('id', ''),  # 序号
                item.get('stock_no', ''),  # 股票代码
                item.get('stock_name', ''),  # 股票名称
                item.get('latest_price', '0'),  # 最新报价
                item.get('price_change_rate', '0%'),  # 涨跌幅
                item.get('price_change', '0'),  # 涨跌额
                item.get('volume', '0'),  # 成交量
                item.get('amplitude', '0%'),  # 振幅
                item.get('highest', '0'),  # 最高
                item.get('lowest', '0'),  # 最低
                item.get('open_today', '0'),  # 今开
                item.get('close_yesterday', '0')  # 昨收
            )
            self.cursor.execute(insert_sql, data)
            self.db.commit()
            spider.logger.info(f" MySQL 插入成功:{item['stock_name']}({item['stock_no']})")
        except Exception as e:
            self.db.rollback()  # 插入失败时回滚事务
            spider.logger.error(f" MySQL 插入失败:{str(e)} | 数据:{item}")
        return item

# 注意:如果之前有其他 Pipeline 类,保留即可,确保 MySQLPipeline 是启用的
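The pipeline reads its connection parameters from settings.py, which is not shown above. The following is a sketch of what those entries might look like: the key names (MYSQL_HOST, MYSQL_USER, MYSQL_PASSWORD, MYSQL_DATABASE, MYSQL_PORT) are the ones the pipeline queries, the values are placeholders, and the pipeline path assumes the project module is named stock_spider, as in the spider's import.

# Sketch of settings.py entries (values are placeholders, not the real credentials)
MYSQL_HOST = 'localhost'
MYSQL_PORT = 3306
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'your_password'
MYSQL_DATABASE = 'spider_db'

# Enable the MySQL pipeline (priority 300 is an arbitrary conventional value)
ITEM_PIPELINES = {
    'stock_spider.pipelines.MySQLPipeline': 300,
}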

Crawl results and scraped data

(Screenshots: Scrapy run output and the scraped stock data)

2. Reflections

1) The data source is Eastmoney's stock list API; the exact URL is https://push2.eastmoney.com/api/qt/clist/get?pn=1&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152&_=1732400000000
Because this endpoint returns JSON and paging is controlled by the pn parameter, XPath extraction does not work here; I tried XPath-based extraction first and it failed, since there is no server-rendered HTML table to select against. The spider therefore parses the JSON payload directly; XPath would only appear in a possible fallback for HTML pages, and the overall crawling approach stays the same.
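To sanity-check the field mapping used in parse() (f12 = stock code, f14 = name, f2 = latest price, f3 = change rate), the endpoint can be inspected with a short standalone requests script like the sketch below; trimming the fields parameter down to those four is an assumption, and the full list from start_urls can be substituted.

# Sketch: inspect the JSON endpoint outside Scrapy.
import json
import requests

URL = ('https://push2.eastmoney.com/api/qt/clist/get?pn=1&pz=20&po=1&np=1'
       '&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3'
       '&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048'
       '&fields=f2,f3,f12,f14')

resp = requests.get(URL, timeout=10)
payload = json.loads(resp.text)
for stock in ((payload.get('data') or {}).get('diff') or [])[:5]:
    # f12 = code, f14 = name, f2 = latest price, f3 = change rate
    print(stock.get('f12'), stock.get('f14'), stock.get('f2'), stock.get('f3'))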

2) Execution flow
The spider issues the initial request from start_urls, parses the returned JSON, and extracts the stock fields. It then reads the current page number, builds the next-page URL, and keeps crawling; my code caps this at five pages. Each record is packaged as a StockItem, and MySQLPipeline connects to the database, creates the stock_info table, and inserts the items, updating the existing row when a stock code is already present. Finally the stock_info table is read back from MySQL and written to an Excel file with openpyxl for easier inspection; a sketch of that export follows.
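The export script itself is not included above; this is a minimal sketch of what it might look like, assuming the same connection values as in settings.py (shown here as placeholders).

# Sketch: dump the stock_info table to an Excel file with openpyxl.
import pymysql
from openpyxl import Workbook

db = pymysql.connect(host='localhost', user='root', password='your_password',
                     database='spider_db', charset='utf8mb4')
cursor = db.cursor()
cursor.execute("SELECT * FROM stock_info")
rows = cursor.fetchall()
headers = [col[0] for col in cursor.description]  # column names from the cursor

wb = Workbook()
ws = wb.active
ws.title = "stock_info"
ws.append(headers)
for row in rows:
    ws.append(list(row))
wb.save("stock_info.xlsx")

cursor.close()
db.close()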

Excel export

(Screenshot: the exported Excel file)

  • Assignment 3

1. Complete Code and Run Results

Core spider code

# forex_spider/forex_spider/spiders/boc.py
import scrapy
from forex_spider.items import ForexItem


class BocSpider(scrapy.Spider):
    name = 'boc'
    allowed_domains = ['bankofchina.com']
    start_urls = ['https://www.bankofchina.com/sourcedb/whpj/']

    def parse(self, response):
        # 定位包含汇率数据的行
        forex_rows = response.xpath(
            '//tr[contains(., ".") and string-length(translate(., "0123456789.", "")) < string-length(.) - 3]')
        self.logger.info(f"找到疑似数据行:{len(forex_rows)} 行")

        valid_count = 0  # 有效数据计数器,确保序号连续
        for row in forex_rows:
            # 提取行内所有非空文本,过滤空格和空字符串
            row_texts = [text.strip() for text in row.xpath('.//text()').getall() if text.strip()]

            # 筛选条件:至少7个字段、首字段为中文货币名称、汇率值为有效数字
            if len(row_texts) >= 7 and any(char.isalpha() for char in row_texts[0]):
                try:
                    # 验证核心汇率字段为有效数字,过滤异常值
                    bank_buy = float(row_texts[1])
                    bank_sell = float(row_texts[3])
                    if bank_buy <= 0 or bank_buy > 10000 or bank_sell <= 0 or bank_sell > 10000:
                        continue
                except (ValueError, IndexError):
                    continue  # 跳过数值异常或字段缺失的行

                # 构造数据项,字段顺序严格匹配中国银行返回格式
                item = ForexItem()
                item['id'] = str(valid_count + 1)
                item['currency_code'] = row_texts[0]  # 货币名称
                item['currency_name'] = row_texts[0]  # 货币名称
                item['bank_buy'] = row_texts[1]  # 现汇买入价
                item['cash_buy'] = row_texts[2]  # 现钞买入价
                item['bank_sell'] = row_texts[3]  # 现汇卖出价
                item['cash_sell'] = row_texts[4]  # 现钞卖出价
                item['middle_rate'] = row_texts[5]  # 中间价
                item['publish_time'] = row_texts[6]  # 发布时间

                yield item
                valid_count += 1
                self.logger.info(
                    f"提取成功[{valid_count}]:{item['currency_name']} - 现汇买入价:{item['bank_buy']} | 现汇卖出价:{item['bank_sell']}")

        self.logger.info(f"爬取完成!共提取有效外汇数据:{valid_count} 条")

Project items.py code

# items.py:定义外汇数据字段
import scrapy

class ForexItem(scrapy.Item):
    id = scrapy.Field()  # 序号
    currency_code = scrapy.Field()  # 货币代码
    currency_name = scrapy.Field()  # 货币名称
    bank_buy = scrapy.Field()  # 现汇买入价
    bank_sell = scrapy.Field()  # 现汇卖出价
    cash_buy = scrapy.Field()  # 现钞买入价
    cash_sell = scrapy.Field()  # 现钞卖出价
    middle_rate = scrapy.Field()  # 中间价
    publish_time = scrapy.Field()  # 发布时间

Project pipelines.py code

# pipelines.py:自动创建外汇表+数据插入
import pymysql
from itemadapter import ItemAdapter
from scrapy.utils.project import get_project_settings


class MySQLPipeline:
    def __init__(self):
        # 从 settings.py 读取 MySQL 配置
        settings = get_project_settings()
        self.host = settings.get('MYSQL_HOST', 'localhost')
        self.user = settings.get('MYSQL_USER', 'root')
        self.password = settings.get('MYSQL_PASSWORD', '')
        self.database = settings.get('MYSQL_DATABASE', 'spider_db')
        self.port = settings.get('MYSQL_PORT', 3306)

    def open_spider(self, spider):
        # 连接数据库,创建外汇数据表
        self.db = pymysql.connect(
            host=self.host,
            user=self.user,
            password=self.password,
            database=self.database,
            port=self.port,
            charset='utf8mb4'
        )
        self.cursor = self.db.cursor()

        # 创建外汇表(字段和 Item 完全匹配)
        create_table_sql = """
        CREATE TABLE IF NOT EXISTS forex_info (
            id VARCHAR(10) NOT NULL COMMENT '序号',
            currency_code VARCHAR(10) NOT NULL COMMENT '货币代码(如 USD)',
            currency_name VARCHAR(50) NOT NULL COMMENT '货币名称(如 美元)',
            bank_buy VARCHAR(20) DEFAULT '0' COMMENT '现汇买入价',
            bank_sell VARCHAR(20) DEFAULT '0' COMMENT '现汇卖出价',
            cash_buy VARCHAR(20) DEFAULT '0' COMMENT '现钞买入价',
            cash_sell VARCHAR(20) DEFAULT '0' COMMENT '现钞卖出价',
            middle_rate VARCHAR(20) DEFAULT '0' COMMENT '中间价',
            publish_time VARCHAR(30) DEFAULT '' COMMENT '发布时间',
            PRIMARY KEY (currency_code)  # 货币代码唯一,避免重复
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='中国银行外汇汇率表';
        """
        self.cursor.execute(create_table_sql)
        self.db.commit()
        spider.logger.info(" 外汇爬虫 - MySQL 连接成功,数据表创建完成!")

    def close_spider(self, spider):
        # 关闭数据库连接
        self.cursor.close()
        self.db.close()
        spider.logger.info(" 外汇爬虫 - MySQL 连接已关闭!")

    def process_item(self, item, spider):
        # 插入/更新外汇数据
        insert_sql = """
        INSERT INTO forex_info (
            id, currency_code, currency_name, bank_buy, bank_sell,
            cash_buy, cash_sell, middle_rate, publish_time
        ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
        ON DUPLICATE KEY UPDATE  # 货币代码重复时更新最新数据
            bank_buy = VALUES(bank_buy),
            bank_sell = VALUES(bank_sell),
            cash_buy = VALUES(cash_buy),
            cash_sell = VALUES(cash_sell),
            middle_rate = VALUES(middle_rate),
            publish_time = VALUES(publish_time);
        """
        try:
            data = (
                item.get('id', ''),
                item.get('currency_code', ''),
                item.get('currency_name', ''),
                item.get('bank_buy', '0'),
                item.get('bank_sell', '0'),
                item.get('cash_buy', '0'),
                item.get('cash_sell', '0'),
                item.get('middle_rate', '0'),
                item.get('publish_time', '')
            )
            self.cursor.execute(insert_sql, data)
            self.db.commit()
            spider.logger.info(f" 外汇插入成功:{item['currency_name']}({item['currency_code']})")
        except Exception as e:
            self.db.rollback()
            spider.logger.error(f" 外汇插入失败:{str(e)} | 数据:{item}")
        return item

Crawl results and scraped data

(Screenshots: Scrapy run output and the scraped forex data)

Querying a specific currency: USD (美元)

(Screenshot: query result for USD)

2. Reflections

1) The page crawled is https://www.bankofchina.com/sourcedb/whpj/
2) Execution flow: the spider issues a request from start_urls and parses the response in parse(). It locates the table rows that contain exchange-rate data with XPath, filters out invalid rows, extracts each row's currency name, buying prices, selling prices and other fields, and yields them as ForexItem objects. MySQLPipeline then creates the forex_info table automatically and inserts the data with SQL. Finally all forex records are read back from MySQL and written to an Excel file with openpyxl.
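The single-currency lookup shown in the screenshots above can be reproduced with a short pymysql query; this is a sketch assuming the same placeholder connection values as the pipeline and that the USD row is stored under the Chinese name 美元, as extracted by the spider.

# Sketch: look up one currency (USD, stored as 美元) in forex_info.
import pymysql

db = pymysql.connect(host='localhost', user='root', password='your_password',
                     database='spider_db', charset='utf8mb4')
cursor = db.cursor()
cursor.execute(
    "SELECT currency_name, bank_buy, bank_sell, cash_buy, cash_sell, "
    "middle_rate, publish_time FROM forex_info WHERE currency_name = %s",
    ("美元",)
)
for row in cursor.fetchall():
    print(row)
cursor.close()
db.close()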

Excel export

(Screenshot: the exported Excel file)

3) In boc.py the data is extracted mainly with XPath:
forex_rows = response.xpath( '//tr[contains(., ".") and string-length(translate(., "0123456789.", "")) < string-length(.) - 3]' )
//tr[...] matches every table row whose text contains a '.' and which, after digits and decimal points are stripped with translate(), becomes shorter by more than 3 characters, i.e. rows containing at least a few numeric characters, which is exactly what the rate rows on this page look like. The matching <tr> rows of the crawled page are shown below.

(Screenshot: the matching <tr> rows in the page source)
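The row-selection XPath can also be exercised outside the spider; the sketch below runs it with lxml against a fabricated table fragment (the HTML and the numbers are made up for illustration, not copied from the live page).

# Sketch: testing the row-selection XPath on a made-up HTML fragment.
from lxml import etree

html = """
<table>
  <tr><th>货币名称</th><th>现汇买入价</th><th>现钞买入价</th></tr>
  <tr><td>美元</td><td>710.55</td><td>704.70</td><td>713.56</td>
      <td>713.56</td><td>711.05</td><td>2025-11-24 10:30:00</td></tr>
</table>
"""
tree = etree.HTML(html)
rows = tree.xpath('//tr[contains(., ".") and '
                  'string-length(translate(., "0123456789.", "")) '
                  '< string-length(.) - 3]')
for row in rows:
    # Only the data row matches; the header row contains no '.' or digits.
    print([t.strip() for t in row.xpath('.//text()') if t.strip()])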

Gitee folder and code: https://gitee.com/sike-0420/kuku/tree/master/作业3

posted @ 2025-11-24 20:38  Sike0420