Data Collection and Fusion Technology — Assignment 3

Assignment ①:

Requirement: pick a website and crawl all of its images, e.g. the China Weather Network (http://www.weather.com.cn). Implement the crawl in both single-threaded and multi-threaded versions.
Be sure to limit the crawl: cap the total number of pages (last 2 digits of the student ID) and the total number of downloaded images (last 3 digits of the ID).

Code logic:


Single-threaded version:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

ID = "152301219"
PAGE_LIMIT = int(ID[-2:])   # last 2 digits: 19 pages
IMG_LIMIT = int(ID[-3:])    # last 3 digits: 219 images

BASE_URL = "http://www.weather.com.cn"
START_URL = "http://www.weather.com.cn/"
SAVE_DIR = "single_thread_images"
os.makedirs(SAVE_DIR, exist_ok=True)

def fetch_page(url):
    try:
        resp = requests.get(url, timeout=5)
        resp.encoding = resp.apparent_encoding
        return resp.text
    except requests.RequestException:
        return ""

def extract_images(html, base_url):
    soup = BeautifulSoup(html, "html.parser")
    imgs = []
    for img in soup.find_all("img"):
        src = img.get("src")
        if src:
            imgs.append(urljoin(base_url, src))
    return imgs

def download_image(url, idx):
    try:
        # Crude extension guess: last dot-separated token, capped at 4 chars
        ext = url.split(".")[-1][:4]
        filename = os.path.join(SAVE_DIR, f"img_{idx}.{ext}")
        r = requests.get(url, timeout=5)
        with open(filename, "wb") as f:
            f.write(r.content)
        print(f"Downloaded {filename}")
    except requests.RequestException:
        pass  # skip images that fail to download

def single_thread_crawl():
    print("=== 单线程爬取开始 ===")

    to_visit = [START_URL]
    visited = set()
    page_count = 0
    img_count = 0

    while to_visit and page_count < PAGE_LIMIT and img_count < IMG_LIMIT:
        url = to_visit.pop(0)
        if url in visited:
            continue
        visited.add(url)

        html = fetch_page(url)
        if not html:
            continue

        page_count += 1
        print(f"[Single] Visiting page {page_count}: {url}")
        imgs = extract_images(html, BASE_URL)
        for img in imgs:
            if img_count >= IMG_LIMIT:
                break
            download_image(img, img_count)
            img_count += 1
        soup = BeautifulSoup(html, "html.parser")
        for a in soup.find_all("a"):
            href = a.get("href")
            if href and href.startswith("http") and BASE_URL in href:
                to_visit.append(href)

    print("=== 单线程爬取结束 ===")

if __name__ == "__main__":
    single_thread_crawl()

Multi-threaded version:
import os
import threading
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

ID = "152301219"
PAGE_LIMIT = int(ID[-2:])   # last 2 digits: 19 pages
IMG_LIMIT = int(ID[-3:])    # last 3 digits: 219 images

BASE_URL = "http://www.weather.com.cn"
START_URL = "http://www.weather.com.cn/"
SAVE_DIR = "multi_thread_images"
os.makedirs(SAVE_DIR, exist_ok=True)

def fetch_page(url):
    try:
        resp = requests.get(url, timeout=5)
        resp.encoding = resp.apparent_encoding
        return resp.text
    except requests.RequestException:
        return ""

def extract_images(html, base_url):
    soup = BeautifulSoup(html, "html.parser")
    imgs = []
    for img in soup.find_all("img"):
        src = img.get("src")
        if src:
            imgs.append(urljoin(base_url, src))
    return imgs

def download_image(url, idx):
    try:
        # Crude extension guess: last dot-separated token, capped at 4 chars
        ext = url.split(".")[-1][:4]
        filename = os.path.join(SAVE_DIR, f"img_{idx}.{ext}")
        r = requests.get(url, timeout=5)
        with open(filename, "wb") as f:
            f.write(r.content)
        print(f"[Thread] Downloaded {filename}")
    except requests.RequestException:
        pass  # skip images that fail to download

def multi_thread_crawl():
    print("=== 多线程爬取开始 ===")

    to_visit = [START_URL]
    visited = set()
    page_count = 0
    img_count = 0
    threads = []

    while to_visit and page_count < PAGE_LIMIT and img_count < IMG_LIMIT:
        url = to_visit.pop(0)
        if url in visited:
            continue
        visited.add(url)

        html = fetch_page(url)
        if not html:
            continue

        page_count += 1
        print(f"[Multi] Visiting page {page_count}: {url}")

        imgs = extract_images(html, BASE_URL)
        for img in imgs:
            if img_count >= IMG_LIMIT:
                break

            # One thread per image; acceptable at these limits, though a pool scales better
            t = threading.Thread(target=download_image, args=(img, img_count))
            t.start()
            threads.append(t)

            img_count += 1

        soup = BeautifulSoup(html, "html.parser")
        for a in soup.find_all("a"):
            href = a.get("href")
            if href and href.startswith("http") and BASE_URL in href:
                to_visit.append(href)

    for t in threads:
        t.join()

    print("=== 多线程爬取结束 ===")


if __name__ == "__main__":
    multi_thread_crawl()

Reflections:

The code parses pages with the BeautifulSoup library. In extract_images, the HTML text is first loaded into a soup object; soup.find_all("img") then iterates over every img tag on the page, extracts each tag's src attribute, and urljoin joins relative paths onto the base URL, producing a list of absolute, directly fetchable image links. The same BeautifulSoup parsing is used in the page-traversal step to extract qualifying links that expand the crawl frontier. Single-threaded crawling, however, is clearly too slow for large numbers of pages and images: runtime grows linearly with PAGE_LIMIT and IMG_LIMIT, which is why the multi-threaded crawler is noticeably faster. One caveat is that the multi-threaded version spawns one thread per image, which can mean hundreds of threads at larger limits.
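As a minimal sketch (not part of the submitted code), the per-image thread creation could be replaced with a bounded pool via concurrent.futures.ThreadPoolExecutor; it reuses the download_image helper defined above, and max_workers=8 is an arbitrary illustrative choice:

import concurrent.futures

def pooled_download(img_urls, start_idx, max_workers=8):
    # Download a batch of images with at most max_workers concurrent threads
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = [
            pool.submit(download_image, url, start_idx + i)
            for i, url in enumerate(img_urls)
        ]
        # Block until the batch finishes; any uncaught worker exception surfaces here
        for future in concurrent.futures.as_completed(futures):
            future.result()

Called as pooled_download(imgs[:IMG_LIMIT - img_count], img_count), it replaces both the per-image Thread creation and the final join loop.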

Assignment ②

Requirement: master the serialized output of Item and Pipeline data in Scrapy; crawl stock information using the Scrapy framework + XPath + MySQL storage.
Candidate site: Eastmoney:
https://www.eastmoney.com/

Code logic:


Define the StockItem class, declaring the fields to crawl (e.g. bStockNo for the stock code, bPrice for the price) to standardize the data format.

import scrapy

class StockItem(scrapy.Item):
    id = scrapy.Field()
    bStockNo = scrapy.Field()
    bName = scrapy.Field()
    bPrice = scrapy.Field()
    bChangeRate = scrapy.Field()
    bChangeAmount = scrapy.Field()
    bVolume = scrapy.Field()
    bAmplitude = scrapy.Field()
    bHigh = scrapy.Field()
    bLow = scrapy.Field()
    bOpen = scrapy.Field()
    bPrevClose = scrapy.Field()

StockScrapyPipeline handles the database work: when the spider starts it connects to MySQL and creates the stocks table, each processed item is inserted into the table, and the connection is closed when the spider finishes. The connection parameters come from the project settings; a sketch of those entries follows.
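A minimal settings.py excerpt that would satisfy this pipeline (all values are placeholders, and the stock_scrapy.pipelines module path is inferred from the project layout rather than shown in the source):

# settings.py (excerpt) — placeholder values
MYSQL_HOST = 'localhost'
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'your_password'
MYSQL_DB = 'stock_db'
MYSQL_TABLE = 'stocks'
ITEM_PIPELINES = {'stock_scrapy.pipelines.StockScrapyPipeline': 300}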
The pipeline implementation:
import pymysql
from scrapy.utils.project import get_project_settings


class StockScrapyPipeline:

    def __init__(self):
        settings = get_project_settings()
        self.host = settings.get('MYSQL_HOST')
        self.user = settings.get('MYSQL_USER')
        self.password = settings.get('MYSQL_PASSWORD')
        self.db_name = settings.get('MYSQL_DB')
        self.table_name = settings.get('MYSQL_TABLE')
        self.conn = None
        self.cursor = None

    def open_spider(self, spider):
        self.conn = pymysql.connect(
            host=self.host,
            user=self.user,
            password=self.password,
            database=self.db_name,
            charset='utf8mb4',
            cursorclass=pymysql.cursors.DictCursor
        )
        self.cursor = self.conn.cursor()
        spider.logger.info("MySQL Connection Established.")

        create_table_sql = f"""
            CREATE TABLE IF NOT EXISTS {self.table_name} (
                id INT PRIMARY KEY,
                bStockNo VARCHAR(20) NOT NULL UNIQUE, 
                bName VARCHAR(100),
                bPrice DECIMAL(10, 4),
                bChangeRate DECIMAL(10, 4),
                bChangeAmount DECIMAL(10, 4),
                bVolume DECIMAL(20, 4),
                bAmplitude DECIMAL(10, 4),
                bHigh DECIMAL(10, 4),
                bLow DECIMAL(10, 4),
                bOpen DECIMAL(10, 4),
                bPrevClose DECIMAL(10, 4)
            ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
        """

        self.cursor.execute(create_table_sql)
        self.conn.commit()

    def process_item(self, item, spider):
        insert_sql = f"""
            INSERT INTO {self.table_name} (
                id, bStockNo, bName, bPrice, bChangeRate, bChangeAmount, 
                bVolume, bAmplitude, bHigh, bLow, bOpen, bPrevClose
            ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE
                bName=VALUES(bName),
                bPrice=VALUES(bPrice), 
                bChangeRate=VALUES(bChangeRate), 
                bChangeAmount=VALUES(bChangeAmount), 
                bVolume=VALUES(bVolume), 
                bAmplitude=VALUES(bAmplitude), 
                bHigh=VALUES(bHigh), 
                bLow=VALUES(bLow), 
                bOpen=VALUES(bOpen), 
                bPrevClose=VALUES(bPrevClose);
        """

        values = (
            item.get("id"),
            item.get("bStockNo"),
            item.get("bName"),
            item.get("bPrice"),
            item.get("bChangeRate"),
            item.get("bChangeAmount"),
            item.get("bVolume"),
            item.get("bAmplitude"),
            item.get("bHigh"),
            item.get("bLow"),
            item.get("bOpen"),
            item.get("bPrevClose"),
        )

        self.cursor.execute(insert_sql, values)
        self.conn.commit()

        return item

    def close_spider(self, spider):
        if self.conn:
            self.conn.close()
            spider.logger.info("MySQL Connection Closed.")
Define StocksSpider, pointing it at Eastmoney's stock-list API: the parse method parses the returned JSON, extracts fields such as the stock code, name, and price, wraps them in a StockItem, and yields it.
import scrapy
import json
from stock_scrapy.items import StockItem

class StocksSpider(scrapy.Spider):
    name = "stocks"
    custom_settings = {
        'DOWNLOAD_DELAY': 0.5,  
    }
    api_url = (
        "http://push2.eastmoney.com/api/qt/clist/get?"
        "pn=1&pz=1000&np=1&fltt=2&fid=f3&fs=m:1+t:2"
        "&fields=f2,f3,f4,f5,f6,f7,f8,f9,f12,f13,f14,f15,f16,f17,f18"
    )
    start_urls = [api_url]

    def parse(self, response):
        try:
            data = json.loads(response.text)
            stocks = data.get("data", {}).get("diff", [])
        except json.JSONDecodeError as e:
            self.logger.error(f"JSON decoding failed: {e}")
            return

        for idx, s in enumerate(stocks, start=1):
            item = StockItem()
            item["id"] = idx + (1 - 1) * 1000  # 假设 pn=1, pz=1000
            item["bStockNo"] = s.get("f12")  # 股票代码
            item["bName"] = s.get("f14")  # 股票名称
            item["bPrice"] = s.get("f2")  # 最新价
            item["bChangeRate"] = s.get("f3")  # 涨跌幅
            item["bChangeAmount"] = s.get("f4")  # 涨跌额
            item["bVolume"] = s.get("f5")  # 成交量
            item["bAmplitude"] = s.get("f7")  # 振幅
            item["bHigh"] = s.get("f15")  # 最高价
            item["bLow"] = s.get("f16")  # 最低价
            item["bOpen"] = s.get("f17")  # 开盘价
            item["bPrevClose"] = s.get("f18")  # 昨收价
            if item["bStockNo"] and item["bName"]:
                yield item
            else:
                self.logger.warning(f"Skipping item due to missing key data: {s}")

Reflections:

The goal of this stock-data crawler project was to store the data Scrapy collects into MySQL efficiently, and the process deepened my understanding of engineering a data flow.
On the spider side, the main work was the structured extraction of the Eastmoney API data. Accurately mapping the API's shorthand fields (such as f12 and f2) onto our StockItem fields is the foundation; it keeps the data clean before it enters the Pipeline. The core mechanism is StocksSpider pushing data objects downstream via yield item, a decoupled design that works very well. A small refactor that makes the field mapping explicit is sketched below.
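As an illustrative sketch (the FIELD_MAP name and to_item helper are my own, not part of the submitted code), the repetitive assignments in parse could be driven by a lookup table:

# Map Eastmoney's shorthand keys to StockItem field names
FIELD_MAP = {
    "f12": "bStockNo", "f14": "bName", "f2": "bPrice",
    "f3": "bChangeRate", "f4": "bChangeAmount", "f5": "bVolume",
    "f7": "bAmplitude", "f15": "bHigh", "f16": "bLow",
    "f17": "bOpen", "f18": "bPrevClose",
}

def to_item(s):
    # Build a StockItem from one raw API record
    item = StockItem()
    for api_key, field in FIELD_MAP.items():
        item[field] = s.get(api_key)
    return item

Adding a new field then means touching one dict entry instead of another assignment line.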
In practice, though, I ran into two key logic problems that deserve serious reflection:
Anti-crawling: the spider's very first request was blocked by the site's robots.txt rules. Technically this is fixed by changing ROBOTSTXT_OBEY (see the settings sketch right after this point), but it is a reminder that crawler design must account for anti-crawling measures, such as User-Agent management and proxy pools, to cope with real-world conditions.
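A minimal settings sketch for that fix (the User-Agent string is only an illustrative placeholder):

# settings.py (excerpt)
ROBOTSTXT_OBEY = False  # stop Scrapy from honoring robots.txt for this API
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"  # browser-like placeholder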
Missing data completeness: the spider currently issues only a single request, so it captures just one slice of the dataset. This is the biggest logical flaw: there is no pagination. A proper crawler must use the total-count information in the API response to dynamically construct and issue next-page requests, forming a complete loop that covers the full dataset; a sketch follows.
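As a hedged sketch of that loop (assuming the response JSON carries a data.total count next to data.diff, which I have not re-verified against the live endpoint), parse could track the page number via request meta and keep requesting until the total is covered:

PAGE_SIZE = 1000  # matches the pz=1000 query argument in api_url

def start_requests(self):
    yield scrapy.Request(self.api_url, callback=self.parse, meta={"pn": 1})

def parse(self, response):
    data = json.loads(response.text).get("data") or {}
    stocks = data.get("diff", [])
    total = data.get("total", 0)  # assumed total record count
    pn = response.meta["pn"]

    for idx, s in enumerate(stocks, start=1):
        item = StockItem()
        item["id"] = (pn - 1) * PAGE_SIZE + idx  # globally unique row id
        item["bStockNo"] = s.get("f12")
        # ... remaining field assignments exactly as in the listing above ...
        yield item

    # Request the next page while records remain
    if pn * PAGE_SIZE < total:
        next_url = response.url.replace(f"pn={pn}", f"pn={pn + 1}")
        yield scrapy.Request(next_url, callback=self.parse, meta={"pn": pn + 1})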

Assignment ③:

Requirement: master the serialized output of Item and Pipeline data in Scrapy; crawl foreign-exchange data using the Scrapy framework + XPath + MySQL storage.
Candidate site: Bank of China: https://www.boc.cn/sourcedb/whpj/

Code logic:


Item definition

import scrapy

class BocFxItem(scrapy.Item):
    # Currency name
    Currency = scrapy.Field()
    # Spot exchange buying rate (TBP: Telegraphic Transfer Buying Price)
    TBP = scrapy.Field()
    # Cash buying rate (CBP: Cash Buying Price)
    CBP = scrapy.Field()
    # Spot exchange selling rate (TSP: Telegraphic Transfer Selling Price)
    TSP = scrapy.Field()
    # Cash selling rate (CSP: Cash Selling Price)
    CSP = scrapy.Field()
    # Publish time
    Time = scrapy.Field()
Pipeline
import pymysql
from scrapy.utils.project import get_project_settings
from itemadapter import ItemAdapter

class BocFxPipeline:
    def __init__(self):
        settings = get_project_settings()
        self.host = settings.get('MYSQL_HOST')
        self.user = settings.get('MYSQL_USER')
        self.password = settings.get('MYSQL_PASSWORD')
        self.db_name = settings.get('MYSQL_DB')
        self.table_name = settings.get('MYSQL_TABLE')

        self.conn = None
        self.cursor = None

    def open_spider(self, spider):
        spider.logger.info("正在连接 MySQL 数据库...")
        self.conn = pymysql.connect(
            host=self.host,
            user=self.user,
            password=self.password,
            database=self.db_name,
            charset='utf8mb4',
            cursorclass=pymysql.cursors.Cursor
        )
        self.cursor = self.conn.cursor()
        create_table_sql = f"""
        CREATE TABLE IF NOT EXISTS {self.table_name} (
            id INT AUTO_INCREMENT PRIMARY KEY,
            currency VARCHAR(50) UNIQUE,
            tbp VARCHAR(50),
            cbp VARCHAR(50),
            tsp VARCHAR(50),
            csp VARCHAR(50),
            time VARCHAR(50)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
        """
        self.cursor.execute(create_table_sql)
        self.conn.commit()
        spider.logger.info(f"MySQL 连接成功,表 '{self.table_name}' 准备就绪。")

    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        insert_sql = f"""
        INSERT INTO {self.table_name} (currency, tbp, cbp, tsp, csp, time)
        VALUES (%s, %s, %s, %s, %s, %s)
        ON DUPLICATE KEY UPDATE
            tbp = VALUES(tbp),
            cbp = VALUES(cbp),
            tsp = VALUES(tsp),
            csp = VALUES(csp),
            time = VALUES(time);
        """
        data = (
            adapter.get('Currency', ''),
            adapter.get('TBP', ''),
            adapter.get('CBP', ''),
            adapter.get('TSP', ''),
            adapter.get('CSP', ''),
            adapter.get('Time', '')
        )
        self.cursor.execute(insert_sql, data)
        self.conn.commit()

        return item

    def close_spider(self, spider):
        if self.conn:
            self.conn.close()
            spider.logger.info("MySQL 数据库连接已关闭。")
Spider
import scrapy
from boc_fx.items import BocFxItem


class BocSpiderSpider(scrapy.Spider):
    name = "boc_spider"
    allowed_domains = ["boc.cn"]
    start_urls = ["https://www.boc.cn/sourcedb/whpj/"]

    def parse(self, response):
        self.logger.info(f"成功访问页面,状态码: {response.status}")

        rows = response.xpath('//div[@class="publish"]/div/table/tr[position()>1]')

        if not rows:
            self.logger.warning("未找到表格数据,可能是页面结构变化或反爬拦截。")

        for row in rows:
            item = BocFxItem()
            # Extract the currency name; skip rows without one
            currency_name = row.xpath('./td[1]/text()').extract_first()
            if not currency_name:
                continue
            item['Currency'] = currency_name.strip()
            tbp_text = row.xpath('./td[2]/text()').extract_first()
            item['TBP'] = tbp_text.strip() if tbp_text else None

            cbp_text = row.xpath('./td[3]/text()').extract_first()
            item['CBP'] = cbp_text.strip() if cbp_text else None

            tsp_text = row.xpath('./td[4]/text()').extract_first()
            item['TSP'] = tsp_text.strip() if tsp_text else None

            csp_text = row.xpath('./td[5]/text()').extract_first()
            item['CSP'] = csp_text.strip() if csp_text else None

            time_text = row.xpath('./td[7]/text()').extract_first()  # column 7: publish time
            item['Time'] = time_text.strip() if time_text else None

            yield item

Reflections:

The part of development that most tested my attention to detail was XPath targeting: at first I overlooked the table's row index and extracted empty values, and things only became stable after adding non-empty checks. While storing data in MySQL, I also hit insert failures, which I resolved with transaction rollback and exception handling (a sketch of that handling follows). The whole process showed me that a crawler must not only parse pages accurately; the robustness of the storage layer matters just as much, and detail handling directly determines how reliable the final result is.
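The pipeline listing above omits that error handling, so here is a minimal sketch of how process_item could wrap the insert (same self.conn / self.cursor / self.table_name attributes as BocFxPipeline above; pymysql.MySQLError is the base class of the driver's errors):

import pymysql
from itemadapter import ItemAdapter

def process_item(self, item, spider):
    adapter = ItemAdapter(item)
    insert_sql = f"""
    INSERT INTO {self.table_name} (currency, tbp, cbp, tsp, csp, time)
    VALUES (%s, %s, %s, %s, %s, %s)
    ON DUPLICATE KEY UPDATE
        tbp = VALUES(tbp), cbp = VALUES(cbp),
        tsp = VALUES(tsp), csp = VALUES(csp), time = VALUES(time);
    """
    data = tuple(adapter.get(k, '') for k in ('Currency', 'TBP', 'CBP', 'TSP', 'CSP', 'Time'))
    try:
        self.cursor.execute(insert_sql, data)
        self.conn.commit()
    except pymysql.MySQLError as e:
        self.conn.rollback()  # undo the failed statement so the connection stays usable
        spider.logger.error(f"Insert failed and was rolled back: {e}")
    return item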
This project crawls Bank of China FX quotation data with the Scrapy framework. First, boc_spider.py defines the spider class, specifying the target URL and parsing rules: XPath locates the currency name, the spot/cash buy and sell rates, and the publish time in the page's table, and the values are wrapped into a BocFxItem (defined in items.py). BocFxPipeline then processes each Item: it connects to the MySQL database and creates the table when the spider starts, inserts the data during the crawl, and closes the connection at the end, completing the persistence step.

Gitee repo: https://gitee.com/li-zhiyang-dejavu/2025_crawl_project/tree/master/3
