102302142 罗伟钊: Assignment 3

Assignment ①:
– Requirement: pick a website and crawl all of the images on it, for example the China Weather site (http://www.weather.com.cn). Implement both a single-threaded and a multi-threaded version of the crawl.

1) Code:

SingleThreadImageDownloader.py

--------------------------------------------------

import requests
from bs4 import BeautifulSoup
import os
import time
from urllib.parse import urljoin, urlparse


class SingleThreadImageDownloader:
    def __init__(self, base_url, download_dir="images"):
        self.base_url = base_url
        self.download_dir = download_dir
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    def create_download_directory(self):
        """创建下载目录"""
        if not os.path.exists(self.download_dir):
            os.makedirs(self.download_dir)
            print(f"创建目录: {self.download_dir}")

    def is_valid_image_url(self, url):
        """检查是否为有效的图片URL"""
        if not url or url.startswith('data:'):
            return False

        # Check the file extension against the supported list
        valid_extensions = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp']
        parsed_url = urlparse(url)
        path = parsed_url.path.lower()
        return any(path.endswith(ext) for ext in valid_extensions)

    def download_image(self, img_url, filename):
        """下载单个图片"""
        try:
            response = self.session.get(img_url, stream=True, timeout=10)
            response.raise_for_status()

            filepath = os.path.join(self.download_dir, filename)

            with open(filepath, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)

            print(f"✓ 下载成功: {img_url} -> {filepath}")
            return True

        except Exception as e:
            print(f"✗ 下载失败: {img_url} - 错误: {str(e)}")
            return False

    def extract_image_urls(self, url):
        """从网页中提取图片URL"""
        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            response.encoding = 'utf-8'

            soup = BeautifulSoup(response.text, 'html.parser')
            img_tags = soup.find_all('img')

            image_urls = []
            for img in img_tags:
                src = img.get('src') or img.get('data-src')
                if src:
                    # Resolve relative paths to absolute URLs
                    full_url = urljoin(url, src)
                    if self.is_valid_image_url(full_url):
                        image_urls.append(full_url)

            return list(set(image_urls))  # de-duplicate

        except Exception as e:
            print(f"提取图片URL失败: {str(e)}")
            return []

    def run(self):
        """运行单线程下载"""
        print("=" * 60)
        print("开始单线程图片下载")
        print("=" * 60)

        self.create_download_directory()
        image_urls = self.extract_image_urls(self.base_url)

        print(f"找到 {len(image_urls)} 张图片")
        print("-" * 60)

        success_count = 0
        start_time = time.time()

        for i, img_url in enumerate(image_urls, 1):
            # Derive the extension from the URL instead of hard-coding .jpg
            ext = os.path.splitext(urlparse(img_url).path)[1] or '.jpg'
            filename = f"image_{i}_{hash(img_url) % 10000:04d}{ext}"

            # Download the image
            if self.download_image(img_url, filename):
                success_count += 1

        end_time = time.time()
        elapsed_time = end_time - start_time

        print("-" * 60)
        print(f"单线程下载完成!")
        print(f"总图片数: {len(image_urls)}")
        print(f"成功下载: {success_count}")
        print(f"失败: {len(image_urls) - success_count}")
        print(f"耗时: {elapsed_time:.2f} 秒")
        print(f"图片保存位置: {os.path.abspath(self.download_dir)}")

        return success_count

--------------------------------------------------

MultiThreadImageDownloader.py

--------------------------------------------------

import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urlparse
from SingleThreadImageDownloader import SingleThreadImageDownloader
import time
import os


class MultiThreadImageDownloader(SingleThreadImageDownloader):
    def __init__(self, base_url, download_dir="images", max_workers=5):
        super().__init__(base_url, download_dir)
        self.max_workers = max_workers
        self.downloaded_count = 0
        self.lock = threading.Lock()

    def download_image_thread(self, args):
        """多线程下载图片的包装函数"""
        img_url, filename = args
        result = self.download_image(img_url, filename)

        with self.lock:
            if result:
                self.downloaded_count += 1
        return result

    def run(self):
        """运行多线程下载"""
        print("=" * 60)
        print("开始多线程图片下载")
        print("=" * 60)

        self.create_download_directory()
        image_urls = self.extract_image_urls(self.base_url)

        print(f"找到 {len(image_urls)} 张图片")
        print(f"使用 {self.max_workers} 个线程")
        print("-" * 60)

        self.downloaded_count = 0
        start_time = time.time()

        # Prepare (url, filename) task tuples
        tasks = []
        for i, img_url in enumerate(image_urls, 1):
            # Same naming scheme as the single-threaded version
            ext = os.path.splitext(urlparse(img_url).path)[1] or '.jpg'
            filename = f"image_{i}_{hash(img_url) % 10000:04d}{ext}"
            tasks.append((img_url, filename))

        # Run the downloads on a thread pool
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            future_to_task = {
                executor.submit(self.download_image_thread, task): task
                for task in tasks
            }

            # Wait for all tasks to finish
            for future in as_completed(future_to_task):
                try:
                    future.result()
                except Exception as e:
                    print(f"任务执行异常: {e}")

        end_time = time.time()
        elapsed_time = end_time - start_time

        print("-" * 60)
        print(f"多线程下载完成!")
        print(f"总图片数: {len(image_urls)}")
        print(f"成功下载: {self.downloaded_count}")
        print(f"失败: {len(image_urls) - self.downloaded_count}")
        print(f"耗时: {elapsed_time:.2f} 秒")
        print(f"图片保存位置: {os.path.abspath(self.download_dir)}")

        return self.downloaded_count

--------------------------------------------------

image_downloader.py

--------------------------------------------------

from SingleThreadImageDownloader import SingleThreadImageDownloader
from MultiThreadImageDownloader import MultiThreadImageDownloader


def main():
    # Target site
    base_url = "http://www.weather.com.cn"

    print("中国气象网图片爬虫")
    print("目标网站:", base_url)
    print()

    # Single-threaded download
    single_downloader = SingleThreadImageDownloader(base_url, "images_single")
    single_success = single_downloader.run()

    print("\n" + "=" * 80 + "\n")

    # Multi-threaded download
    multi_downloader = MultiThreadImageDownloader(base_url, "images_multi", max_workers=5)
    multi_success = multi_downloader.run()

    print("\n" + "=" * 80)
    print("下载总结:")
    print(f"单线程: 成功下载 {single_success} 张图片")
    print(f"多线程: 成功下载 {multi_success} 张图片")
    print("=" * 80)


if __name__ == "__main__":
    main()

--------------------------------------------------


The multi-threaded version runs the downloads on a thread pool, so the system does not create a new thread for every image; a fixed set of worker threads is reused across tasks.
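
As a quick standalone check (not part of the crawler itself), printing the worker thread's name makes the reuse visible: with three workers and nine tasks, only three distinct thread names appear.

--------------------------------------------------

# Standalone sketch: observe thread reuse in a ThreadPoolExecutor.
# Each task reports the name of the worker thread that runs it; with
# max_workers=3 and nine tasks, only three distinct names show up.
import threading
import time
from concurrent.futures import ThreadPoolExecutor

def task(i):
    time.sleep(0.1)  # simulate a download
    return f"task {i} ran on {threading.current_thread().name}"

with ThreadPoolExecutor(max_workers=3) as executor:
    for line in executor.map(task, range(9)):
        print(line)

--------------------------------------------------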

Output:

(Screenshot of the console output omitted.)

2) Reflections:
In the single-threaded version, images are always downloaded in order, from the first to the last. In the multi-threaded version, the completion order is nondeterministic.
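
A small standalone sketch of that nondeterminism, using the same as_completed pattern as the downloader (the sleep times are arbitrary stand-ins for network latency):

--------------------------------------------------

# Standalone sketch: as_completed yields futures in completion order,
# which generally differs from submission order.
import random
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

def fetch(i):
    time.sleep(random.uniform(0, 0.3))  # simulate a variable download time
    return i

with ThreadPoolExecutor(max_workers=4) as executor:
    futures = [executor.submit(fetch, i) for i in range(8)]
    print([f.result() for f in as_completed(futures)])  # e.g. [3, 0, 5, ...]

--------------------------------------------------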

Assignment ②:
– Requirement: become proficient with the serialized output of Item and Pipeline data in Scrapy; crawl stock information using the Scrapy + XPath + MySQL storage technology route.
– Site: East Money: https://www.eastmoney.com/


1) Code:

eastmoney.py

--------------------------------------------------

import scrapy
import json
import urllib.parse
from stock_scrapy.items import StockItem

class EastmoneySpider(scrapy.Spider):
    name = 'eastmoney'
    allowed_domains = ['eastmoney.com']
    
    # Requesting the endpoint without the 'cb' (JSONP callback) parameter
    # returns plain JSON, so no wrapper needs to be stripped.
    base_url = "https://push2.eastmoney.com/api/qt/clist/get"
    
    def start_requests(self):
        # Crawl first 5 pages as an example
        for page in range(1, 6):
            params = {
                'fid': 'f3',
                'po': '1',
                'pz': '20',
                'pn': str(page),
                'np': '1',
                'fltt': '2',
                'invt': '2',
                'ut': 'fa5fd1943c7b386f172d6893dbfba10b',
                'fs': 'm:0 t:6,m:0 t:80,m:1 t:2,m:1 t:23',
                'fields': 'f12,f13,f14,f1,f2,f4,f3,f152,f5,f6,f7,f15,f18,f16,f17,f10,f8,f9,f23',
            }
            # Build the request URL from the query parameters
            url = f"{self.base_url}?{urllib.parse.urlencode(params)}"
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        try:
            data = json.loads(response.text)
            if data.get('data') and data['data'].get('diff'):
                items = data['data']['diff']
                # 'diff' may be a list or a dict keyed by row index depending
                # on the API version, so handle both.
                if isinstance(items, dict):
                    items = items.values()
                
                for item_data in items:
                    item = StockItem()
                    item['bStockNo'] = item_data.get('f12')
                    item['bStockName'] = item_data.get('f14')
                    item['latest_price'] = item_data.get('f2')
                    
                    # Format percentage
                    f3 = item_data.get('f3')
                    item['change_percent'] = f"{f3}%" if f3 is not None else "-"
                    
                    item['change_amount'] = item_data.get('f4')
                    
                    # Format volume and turnover with 万/亿 units, matching
                    # how East Money displays them (e.g. "26.13万", "7.6亿")
                    item['volume'] = self.format_number(item_data.get('f5'))
                    item['turnover'] = self.format_number(item_data.get('f6'), is_currency=True)
                    
                    item['amplitude'] = f"{item_data.get('f7')}%" if item_data.get('f7') is not None else "-"
                    item['high'] = item_data.get('f15')
                    item['low'] = item_data.get('f16')
                    item['open'] = item_data.get('f17')
                    item['prev_close'] = item_data.get('f18')
                    
                    yield item
        except json.JSONDecodeError:
            self.logger.error(f"Failed to decode JSON from {response.url}")

    def format_number(self, num, is_currency=False):
        if num is None:
            return "0"
        try:
            num = float(num)
            if is_currency:
                # Turnover (f6) is reported in yuan; scale to 亿 or 万
                if num > 100000000:
                    return f"{num/100000000:.2f}亿"
                elif num > 10000:
                    return f"{num/10000:.2f}万"
                return f"{num:.2f}"
            else:
                # Volume (f5) is the share volume (typically in hands);
                # scale large values to 万
                if num > 10000:
                    return f"{num/10000:.2f}万"
                return str(num)
        except (TypeError, ValueError):
            return str(num)

--------------------------------------------------
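
The spider imports StockItem from stock_scrapy.items, but items.py itself is not included in this post. A minimal sketch consistent with the fields the spider assigns (the field-to-fxx mapping follows the parse method above) would be:

--------------------------------------------------

# stock_scrapy/items.py -- a sketch reconstructed from the fields used
# by EastmoneySpider; the actual file is not shown in this post.
import scrapy

class StockItem(scrapy.Item):
    bStockNo = scrapy.Field()        # f12: stock code
    bStockName = scrapy.Field()      # f14: stock name
    latest_price = scrapy.Field()    # f2
    change_percent = scrapy.Field()  # f3, formatted as "x.xx%"
    change_amount = scrapy.Field()   # f4
    volume = scrapy.Field()          # f5, formatted with 万/亿
    turnover = scrapy.Field()        # f6, formatted with 万/亿
    amplitude = scrapy.Field()       # f7, formatted as "x.xx%"
    high = scrapy.Field()            # f15
    low = scrapy.Field()             # f16
    open = scrapy.Field()            # f17
    prev_close = scrapy.Field()      # f18

--------------------------------------------------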

(Screenshot of run.py omitted.)

The screenshot showed run.py, a launcher script whose sole job is to start the Scrapy spider by running a single Python file instead of invoking the scrapy CLI.
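
Since the file itself is not reproduced, the following is an assumed reconstruction of a typical launcher of this kind, not the original:

--------------------------------------------------

# run.py -- sketch of a typical Scrapy launcher script; the original
# file was only shown as a screenshot.
from scrapy.cmdline import execute

if __name__ == "__main__":
    # Equivalent to running `scrapy crawl eastmoney` from the project root
    execute(["scrapy", "crawl", "eastmoney"])

--------------------------------------------------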

Output:

(Screenshot of the crawl output omitted.)

2) Reflections:

In this exercise of collecting stock data from East Money, I gained a much deeper understanding of how the Scrapy framework operates, and in particular mastered the complete flow from data scraping to persistent storage.
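
The pipeline and settings files are not reproduced in the post; a minimal MySQL pipeline in the spirit of the assignment (a sketch using pymysql; the connection parameters, database name, and schema are my assumptions, and only a subset of the item's fields is shown) could look like this:

--------------------------------------------------

# Sketch of stock_scrapy/pipelines.py; the actual pipeline is not shown
# in this post. Connection settings and table schema are assumptions.
import pymysql


class MySQLPipeline:
    def open_spider(self, spider):
        self.conn = pymysql.connect(host="localhost", user="root",
                                    password="123456", database="stocks",
                                    charset="utf8mb4")
        self.cursor = self.conn.cursor()
        self.cursor.execute("""
            CREATE TABLE IF NOT EXISTS stock (
                id INT AUTO_INCREMENT PRIMARY KEY,
                bStockNo VARCHAR(16),
                bStockName VARCHAR(32),
                latest_price VARCHAR(16),
                change_percent VARCHAR(16),
                volume VARCHAR(16),
                turnover VARCHAR(16)
            )
        """)

    def process_item(self, item, spider):
        # str() guards against numeric values coming back from the API
        self.cursor.execute(
            "INSERT INTO stock (bStockNo, bStockName, latest_price, "
            "change_percent, volume, turnover) VALUES (%s, %s, %s, %s, %s, %s)",
            (item.get("bStockNo"), item.get("bStockName"),
             str(item.get("latest_price")), item.get("change_percent"),
             item.get("volume"), item.get("turnover")))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()

--------------------------------------------------

A pipeline like this would be registered in settings.py via ITEM_PIPELINES = {"stock_scrapy.pipelines.MySQLPipeline": 300}.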

Assignment ③:

– Requirement: become proficient with the serialized output of Item and Pipeline data in Scrapy; crawl a foreign-exchange site using the Scrapy framework, XPath, and MySQL database storage.
– Site: Bank of China exchange rates: https://www.boc.cn/sourcedb/whpj/

1) Code:

boc.py

--------------------------------------------------

import scrapy
from currency_scrapy.items import CurrencyItem

class BocSpider(scrapy.Spider):
    name = 'boc'
    allowed_domains = ['boc.cn']
    start_urls = ['https://www.boc.cn/sourcedb/whpj/']

    def parse(self, response):
        # The rate table sits inside a div with class "publish"; select its
        # rows, falling back to any table row if that class is not found.
        rows = response.xpath('//div[@class="publish"]//table//tr')
        if not rows:
            rows = response.xpath('//table//tr')

        for row in rows:
            # Header rows contain <th>; data rows have at least 6 <td> cells
            tds = row.xpath('./td')
            if len(tds) < 6:
                continue
            
            item = CurrencyItem()
            
            # Extract text and strip whitespace
            # Column 1: Currency Name
            item['currency'] = tds[0].xpath('string(.)').get().strip()
            
            # Column 2: TBP (现汇买入价)
            item['tbp'] = tds[1].xpath('string(.)').get().strip()
            
            # Column 3: CBP (现钞买入价)
            item['cbp'] = tds[2].xpath('string(.)').get().strip()
            
            # Column 4: TSP (现汇卖出价)
            item['tsp'] = tds[3].xpath('string(.)').get().strip()
            
            # Column 5: CSP (现钞卖出价)
            item['csp'] = tds[4].xpath('string(.)').get().strip()
            
            # Column layout is usually: Name, TBP, CBP, TSP, CSP,
            # middle rate, publish date, publish time.
            if len(tds) >= 8:
                # Full layout: the last column (index 7) is the publish time
                item['time'] = tds[7].xpath('string(.)').get().strip()
            elif len(tds) >= 7:
                # Shorter layout: take the last column as the time
                item['time'] = tds[-1].xpath('string(.)').get().strip()
            else:
                item['time'] = ""

            # Filter out empty rows or header rows that might have slipped through
            if item['currency'] and item['currency'] != "货币名称":
                yield item

--------------------------------------------------
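
As with the stock project, currency_scrapy/items.py is not included in the post; a minimal sketch matching the fields used by the spider above:

--------------------------------------------------

# currency_scrapy/items.py -- a sketch reconstructed from the fields
# used by BocSpider; the actual file is not shown in this post.
import scrapy

class CurrencyItem(scrapy.Item):
    currency = scrapy.Field()  # currency name (货币名称)
    tbp = scrapy.Field()       # telegraphic transfer buying price (现汇买入价)
    cbp = scrapy.Field()       # cash buying price (现钞买入价)
    tsp = scrapy.Field()       # telegraphic transfer selling price (现汇卖出价)
    csp = scrapy.Field()       # cash selling price (现钞卖出价)
    time = scrapy.Field()      # publication time

--------------------------------------------------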

(Screenshot of the visualization script omitted.)

The screenshot showed the visualization code, which reads the stored rows back out of the database and prints them.
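
The script itself is not reproduced; a minimal read-and-print sketch (the connection settings and the table name `rates` are assumptions, chosen to match a pipeline like the one sketched for Assignment ②) could be:

--------------------------------------------------

# Sketch: read the stored exchange rates back from MySQL and print them.
# Connection parameters and the table/column names are assumptions.
import pymysql

conn = pymysql.connect(host="localhost", user="root", password="123456",
                       database="currency", charset="utf8mb4")
try:
    with conn.cursor() as cursor:
        cursor.execute("SELECT currency, tbp, cbp, tsp, csp, time FROM rates")
        for row in cursor.fetchall():
            print("\t".join(str(col) for col in row))
finally:
    conn.close()

--------------------------------------------------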

Output:

(Screenshot of the printed exchange-rate table omitted.)

2) Reflections:

In this exercise of collecting Bank of China exchange-rate data, I came to appreciate how powerful Scrapy is when combined with XPath parsing. Using XPath selectors, I could precisely locate every row of the table and extract specific fields such as the currency name, the telegraphic transfer buying price, and the cash buying price with relative paths. This made me realize how important and efficient a good command of XPath syntax is when dealing with loosely structured web data.
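
That row-then-relative-path pattern can be exercised offline with a Scrapy Selector; the HTML below is a toy stand-in for the live page, not actual Bank of China markup:

--------------------------------------------------

# Toy demonstration of the XPath pattern used in BocSpider: select rows
# first, then extract cells with relative paths. The HTML is a stand-in.
from scrapy import Selector

html = """
<div class="publish"><table>
  <tr><th>货币名称</th><th>现汇买入价</th></tr>
  <tr><td>美元</td><td>709.55</td></tr>
</table></div>
"""
sel = Selector(text=html)
for row in sel.xpath('//div[@class="publish"]//table//tr'):
    tds = row.xpath('./td')
    if tds:  # header rows have <th>, not <td>, so they are skipped
        print(tds[0].xpath('string(.)').get().strip(),
              tds[1].xpath('string(.)').get().strip())
# prints: 美元 709.55

--------------------------------------------------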

Code: https://gitee.com/sui123feng/20251015/tree/master/102302142罗伟钊_实践3
