Data Collection and Fusion Technology: Assignment 3

Assignment ①:

Requirement: Pick a website and crawl all of the images on it, for example China Weather Network (http://www.weather.com.cn). Implement both single-threaded and multi-threaded crawling.
– Be sure to apply crawl-limiting measures such as capping the total number of pages (last two digits of the student ID) and the total number of downloaded images (last three digits).

Code logic:


Single-threaded crawler:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

ID = "152301219"
PAGE_LIMIT = int(ID[-2:])   # last two digits of the student ID -> 19 pages
IMG_LIMIT = int(ID[-3:])    # last three digits -> 219 images

BASE_URL = "http://www.weather.com.cn"
START_URL = "http://www.weather.com.cn/"
SAVE_DIR = "single_thread_images"
os.makedirs(SAVE_DIR, exist_ok=True)


def fetch_page(url):
    """Fetch a page and return its HTML text, or "" on failure."""
    try:
        resp = requests.get(url, timeout=5)
        resp.encoding = resp.apparent_encoding
        return resp.text
    except Exception:
        return ""


def extract_images(html, base_url):
    """Collect absolute image URLs from every <img> tag on the page."""
    soup = BeautifulSoup(html, "html.parser")
    imgs = []
    for img in soup.find_all("img"):
        src = img.get("src")
        if src:
            imgs.append(urljoin(base_url, src))
    return imgs

def download_image(url, idx):
    """Download one image and save it as img_<idx>.<ext> under SAVE_DIR."""
    try:
        ext = url.split(".")[-1][:4]  # crude extension guess from the URL
        filename = os.path.join(SAVE_DIR, f"img_{idx}.{ext}")
        r = requests.get(url, timeout=5)
        with open(filename, "wb") as f:
            f.write(r.content)
        print(f"Downloaded {filename}")
    except Exception:
        pass


def single_thread_crawl():
    print("=== Single-threaded crawl started ===")

    to_visit = [START_URL]
    visited = set()
    page_count = 0
    img_count = 0

    while to_visit and page_count < PAGE_LIMIT and img_count < IMG_LIMIT:
        url = to_visit.pop(0)
        if url in visited:
            continue
        visited.add(url)

        html = fetch_page(url)
        if not html:
            continue

        page_count += 1
        print(f"[Single] Visiting page {page_count}: {url}")
        imgs = extract_images(html, url)  # resolve relative image paths against the current page
        for img in imgs:
            if img_count >= IMG_LIMIT:
                break
            download_image(img, img_count)
            img_count += 1
        # collect same-site links to keep the crawl going
        soup = BeautifulSoup(html, "html.parser")
        for a in soup.find_all("a"):
            href = a.get("href")
            if href and href.startswith("http") and BASE_URL in href:
                to_visit.append(href)

    print("=== 单线程爬取结束 ===")

if __name__ == "__main__":
    single_thread_crawl()

Multi-threaded crawler:
import os
import threading
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

ID = "152301219"
PAGE_LIMIT = int(ID[-2:])   # last two digits of the student ID -> 19 pages
IMG_LIMIT = int(ID[-3:])    # last three digits -> 219 images

BASE_URL = "http://www.weather.com.cn"
START_URL = "http://www.weather.com.cn/"
SAVE_DIR = "multi_thread_images"
os.makedirs(SAVE_DIR, exist_ok=True)


def fetch_page(url):
    """Fetch a page and return its HTML text, or "" on failure."""
    try:
        resp = requests.get(url, timeout=5)
        resp.encoding = resp.apparent_encoding
        return resp.text
    except Exception:
        return ""


def extract_images(html, base_url):
    """Collect absolute image URLs from every <img> tag on the page."""
    soup = BeautifulSoup(html, "html.parser")
    imgs = []
    for img in soup.find_all("img"):
        src = img.get("src")
        if src:
            imgs.append(urljoin(base_url, src))
    return imgs

def download_image(url, idx):
    """Download one image and save it as img_<idx>.<ext> under SAVE_DIR."""
    try:
        ext = url.split(".")[-1][:4]  # crude extension guess from the URL
        filename = os.path.join(SAVE_DIR, f"img_{idx}.{ext}")
        r = requests.get(url, timeout=5)
        with open(filename, "wb") as f:
            f.write(r.content)
        print(f"[Thread] Downloaded {filename}")
    except Exception:
        pass

def multi_thread_crawl():
    print("=== 多线程爬取开始 ===")

    to_visit = [START_URL]
    visited = set()
    page_count = 0
    img_count = 0
    threads = []

    while to_visit and page_count < PAGE_LIMIT and img_count < IMG_LIMIT:
        url = to_visit.pop(0)
        if url in visited:
            continue
        visited.add(url)

        html = fetch_page(url)
        if not html:
            continue

        page_count += 1
        print(f"[Multi] Visiting page {page_count}: {url}")

        imgs = extract_images(html, url)  # resolve relative image paths against the current page
        for img in imgs:
            if img_count >= IMG_LIMIT:
                break

            # spawn one download thread per image (unbounded concurrency)
            t = threading.Thread(target=download_image, args=(img, img_count))
            t.start()
            threads.append(t)

            img_count += 1

        # collect same-site links to keep the crawl going
        soup = BeautifulSoup(html, "html.parser")
        for a in soup.find_all("a"):
            href = a.get("href")
            if href and href.startswith("http") and BASE_URL in href:
                to_visit.append(href)

    # wait for all download threads to finish
    for t in threads:
        t.join()

    print("=== 多线程爬取结束 ===")


if __name__ == "__main__":
    multi_thread_crawl()

Reflections:

The code parses pages with the BeautifulSoup library. In extract_images, the HTML text is first loaded into a soup object; soup.find_all("img") then walks every image tag on the page, the src attribute is read from each tag, and urljoin turns relative paths into absolute URLs, yielding a list of image links that can be downloaded directly. The same approach drives page traversal: BeautifulSoup parses the <a> tags and qualifying links are added to the crawl queue. The single-threaded mode, however, is clearly too slow once many pages and images are involved; its running time grows linearly with PAGE_LIMIT and IMG_LIMIT, which is why the multi-threaded crawler is faster.
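The submitted multi-threaded version starts one threading.Thread per image, which can spawn a large number of threads. As a hedged alternative (not part of the original code), the sketch below bounds concurrency with a thread pool; it reuses the download_image helper defined above, and the max_workers value of 8 is an arbitrary assumption.

from concurrent.futures import ThreadPoolExecutor, as_completed

def pooled_download(img_urls, start_idx=0, max_workers=8):
    """Download a batch of images with a bounded thread pool."""
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # submit one task per image; the pool reuses a fixed set of worker threads
        futures = [
            pool.submit(download_image, url, start_idx + i)
            for i, url in enumerate(img_urls)
        ]
        for fut in as_completed(futures):
            fut.result()  # wait for each download; download_image swallows its own errors

Inside the crawl loop, pooled_download(imgs[:IMG_LIMIT - img_count], img_count) could replace the per-image thread creation while keeping the same page and image limits.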

Assignment ②

Requirement: Become proficient with the serialized output of Item and Pipeline data in Scrapy; crawl stock information using the Scrapy framework + XPath + MySQL storage technical route.
Candidate site: Eastmoney:
https://www.eastmoney.com/

Code logic:

Define the StockItem class and declare the fields to be crawled (e.g. bStockNo for the stock code, bPrice for the price) to standardize the data format.

import scrapy

class StockItem(scrapy.Item):
    id = scrapy.Field()
    bStockNo = scrapy.Field()
    bName = scrapy.Field()
    bPrice = scrapy.Field()
    bChangeRate = scrapy.Field()
    bChangeAmount = scrapy.Field()
    bVolume = scrapy.Field()
    bAmplitude = scrapy.Field()
    bHigh = scrapy.Field()
    bLow = scrapy.Field()
    bOpen = scrapy.Field()
    bPrevClose = scrapy.Field()

StockScrapyPipeline handles the database work: when the spider starts it connects to SQLite and creates the stocks table, each item is inserted into the table as it is processed, and the connection is closed when the spider finishes.
import sqlite3


class StockScrapyPipeline:
    def open_spider(self, spider):
        """Connect to SQLite and create the stocks table when the spider starts."""
        self.conn = sqlite3.connect("stocks.db")
        self.cursor = self.conn.cursor()
        self.cursor.execute("""
            CREATE TABLE IF NOT EXISTS stocks (
                id INTEGER PRIMARY KEY,
                bStockNo TEXT,
                bName TEXT,
                bPrice REAL,
                bChangeRate REAL,
                bChangeAmount REAL,
                bVolume REAL,
                bAmplitude REAL,
                bHigh REAL,
                bLow REAL,
                bOpen REAL,
                bPrevClose REAL
            );
        """)
        self.conn.commit()

    def process_item(self, item, spider):
        """Insert one StockItem into the stocks table."""
        self.cursor.execute("""
            INSERT INTO stocks VALUES (?,?,?,?,?,?,?,?,?,?,?,?)
        """, (
            item["id"],
            item["bStockNo"],
            item["bName"],
            item["bPrice"],
            item["bChangeRate"],
            item["bChangeAmount"],
            item["bVolume"],
            item["bAmplitude"],
            item["bHigh"],
            item["bLow"],
            item["bOpen"],
            item["bPrevClose"],
        ))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        """Close the database connection when the spider finishes."""
        self.conn.close()
StocksSpider targets the Eastmoney stock-list API; its parse method parses the returned JSON, extracts fields such as the stock code, name and price, wraps them into a StockItem and yields it.
import scrapy
import json
from stock_scrapy.items import StockItem


class StocksSpider(scrapy.Spider):
    name = "stocks"
    # Eastmoney backend API (A-share list)
    api_url = (
        "http://push2.eastmoney.com/api/qt/clist/get?"
        "pn=1&pz=1000&np=1&fltt=2&fid=f3&fs=m:1+t:2"
        "&fields=f2,f3,f4,f5,f6,f7,f8,f9,f12,f13,f14,f15,f16,f17,f18"
    )
    start_urls = [api_url]

    def parse(self, response):
        # the API returns JSON, so parse it directly instead of using XPath
        data = json.loads(response.text)
        stocks = data["data"]["diff"]

        for idx, s in enumerate(stocks, start=1):
            item = StockItem()
            item["id"] = idx
            item["bStockNo"] = s["f12"]
            item["bName"] = s["f14"]
            item["bPrice"] = s["f2"]
            item["bChangeRate"] = s["f3"]
            item["bChangeAmount"] = s["f4"]
            item["bVolume"] = s["f5"]
            item["bAmplitude"] = s["f7"]
            item["bHigh"] = s["f15"]
            item["bLow"] = s["f16"]
            item["bOpen"] = s["f17"]
            item["bPrevClose"] = s["f18"]

            yield item

Reflections:

The Scrapy framework has a clear structure: the Spider focuses on crawling and parsing, the Item standardizes the data, and the Pipeline handles storage, so the components are decoupled and easy to maintain. Storing the data in SQLite is simple and efficient and suits small-scale needs. During actual crawling, the API's anti-crawling measures need attention; adding request headers or a download delay helps. This modular design keeps the crawler logic clear and makes later extensions such as multi-page crawling and incremental updates straightforward.
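As a hedged illustration of the header/delay point, the snippet below uses standard Scrapy settings (DOWNLOAD_DELAY, RANDOMIZE_DOWNLOAD_DELAY, CONCURRENT_REQUESTS, DEFAULT_REQUEST_HEADERS); the concrete values are assumptions rather than settings taken from the submitted project.

# settings.py (sketch): throttle requests and send browser-like headers
DOWNLOAD_DELAY = 1                 # wait about 1 second between requests
RANDOMIZE_DOWNLOAD_DELAY = True    # add jitter so the delay is less mechanical
CONCURRENT_REQUESTS = 4            # keep concurrency modest
DEFAULT_REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Referer": "https://www.eastmoney.com/",
}

Multi-page crawling could be added in the same spirit by incrementing the pn parameter of api_url and yielding additional scrapy.Request objects from parse, although that goes beyond the version shown here.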

Assignment ③:

Requirement: Become proficient with the serialized output of Item and Pipeline data in Scrapy; crawl data from a foreign-exchange website using the Scrapy framework + XPath + MySQL storage technical route.
Candidate site: Bank of China: https://www.boc.cn/sourcedb/whpj/

Code logic:


Item definition

import scrapy

class BocFxItem(scrapy.Item):
    # currency name
    Currency = scrapy.Field()
    # TBP: telegraphic transfer (spot) buying price
    TBP = scrapy.Field()
    # CBP: cash buying price
    CBP = scrapy.Field()
    # TSP: telegraphic transfer (spot) selling price
    TSP = scrapy.Field()
    # CSP: cash selling price
    CSP = scrapy.Field()
    # publication time
    Time = scrapy.Field()
Pipeline
import sqlite3


class BocFxPipeline:
    def __init__(self):
        # database file name
        self.db_name = 'boc_data.db'
        self.conn = None
        self.cursor = None

    def open_spider(self, spider):
        """爬虫启动时调用:连接数据库并创建表"""
        print("正在连接 SQLite 数据库...")
        self.conn = sqlite3.connect(self.db_name)
        self.cursor = self.conn.cursor()

        # SQL to create the table if it does not already exist
        create_table_sql = """
        CREATE TABLE IF NOT EXISTS exchange_rates (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            currency TEXT,
            tbp TEXT,
            cbp TEXT,
            tsp TEXT,
            csp TEXT,
            time TEXT
        );
        """
        self.cursor.execute(create_table_sql)
        self.conn.commit()

    def process_item(self, item, spider):
        """处理每一个 Item:插入数据"""
        insert_sql = """
        INSERT INTO exchange_rates (currency, tbp, cbp, tsp, csp, time)
        VALUES (?, ?, ?, ?, ?, ?)
        """

        # build the data tuple
        data = (
            item.get('Currency', ''),
            item.get('TBP', ''),
            item.get('CBP', ''),
            item.get('TSP', ''),
            item.get('CSP', ''),
            item.get('Time', '')
        )

        try:
            self.cursor.execute(insert_sql, data)
            self.conn.commit()
        except Exception as e:
            print(f"插入数据出错: {e}")
            self.conn.rollback()

        return item

    def close_spider(self, spider):
        """爬虫关闭时调用:关闭数据库连接"""
        print("正在关闭数据库连接...")
        if self.cursor:
            self.cursor.close()
        if self.conn:
            self.conn.close()
Spider
import scrapy
from boc_fx.items import BocFxItem


class BocSpiderSpider(scrapy.Spider):
    name = "boc_spider"
    allowed_domains = ["boc.cn"]
    start_urls = ["https://www.boc.cn/sourcedb/whpj/"]

    def parse(self, response):
        self.logger.info(f"成功访问页面,状态码: {response.status}")

        rows = response.xpath('//div[@class="publish"]/div/table/tr[position()>1]')

        if not rows:
            self.logger.warning("未找到表格数据,可能是页面结构变化或反爬拦截。")

        for row in rows:
            item = BocFxItem()
            # extract the currency name and skip rows where it is missing
            currency_name = row.xpath('./td[1]/text()').extract_first()
            if not currency_name:
                continue
            item['Currency'] = currency_name.strip()
            tbp_text = row.xpath('./td[2]/text()').extract_first()
            item['TBP'] = tbp_text.strip() if tbp_text else None

            cbp_text = row.xpath('./td[3]/text()').extract_first()
            item['CBP'] = cbp_text.strip() if cbp_text else None

            tsp_text = row.xpath('./td[4]/text()').extract_first()
            item['TSP'] = tsp_text.strip() if tsp_text else None

            csp_text = row.xpath('./td[5]/text()').extract_first()
            item['CSP'] = csp_text.strip() if csp_text else None

            time_text = row.xpath('./td[7]/text()').extract_first()
            item['Time'] = time_text.strip() if time_text else None

            yield item

Reflections:

What demanded the most care during development was XPath positioning: at first, overlooking the table-row index made the extraction return empty values, and it only became stable after the non-empty checks were added. When saving data with SQLite, failed inserts also came up and were resolved with exception handling and transaction rollback. The whole process made it clear that a crawler must not only parse pages precisely; the robustness of the data storage matters just as much, and how the details are handled directly determines how reliable the final result is.

Project logic:

This project crawls Bank of China foreign-exchange quotations with the Scrapy framework. First, boc_spider.py defines the spider class, specifying the target URL and the parsing rules; XPath locates the currency name, the spot/cash buy and sell prices and the publication time in the page table, and each row is wrapped into a BocFxItem (defined in items.py). BocFxPipeline then processes the items: it connects to the SQLite database and creates the table when the spider starts, inserts the data as it is crawled, and closes the connection at the end, completing the data persistence.
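The assignment statement asks for MySQL storage, while the submitted pipeline uses SQLite. As a hedged sketch of what the swap might look like, the variant below uses the pymysql package; the connection parameters (host, user, password, database name) are placeholders, not values from the original project.

import pymysql


class BocFxMySQLPipeline:
    """Variant of BocFxPipeline that writes to MySQL instead of SQLite (sketch)."""

    def open_spider(self, spider):
        # placeholder credentials -- adjust to the real MySQL instance
        self.conn = pymysql.connect(host="localhost", user="root",
                                    password="password", database="boc_fx",
                                    charset="utf8mb4")
        self.cursor = self.conn.cursor()
        self.cursor.execute("""
            CREATE TABLE IF NOT EXISTS exchange_rates (
                id INT AUTO_INCREMENT PRIMARY KEY,
                currency VARCHAR(64),
                tbp VARCHAR(32),
                cbp VARCHAR(32),
                tsp VARCHAR(32),
                csp VARCHAR(32),
                time VARCHAR(32)
            )
        """)
        self.conn.commit()

    def process_item(self, item, spider):
        try:
            self.cursor.execute(
                "INSERT INTO exchange_rates (currency, tbp, cbp, tsp, csp, time) "
                "VALUES (%s, %s, %s, %s, %s, %s)",
                (item.get('Currency', ''), item.get('TBP', ''), item.get('CBP', ''),
                 item.get('TSP', ''), item.get('CSP', ''), item.get('Time', '')))
            self.conn.commit()
        except Exception as e:
            print(f"Insert failed: {e}")
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()

Using it would also mean registering this class in ITEM_PIPELINES in settings.py in place of BocFxPipeline.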

Gitee repository: https://gitee.com/li-zhiyang-dejavu/2025_crawl_project/tree/master/3
