Assignment 3 - 102302127

Task ①

Weather website image crawling

Requirements

Requirement: pick a website and crawl all of the images on it, for example the China Weather Network (http://www.weather.com.cn). Implement both single-threaded and multi-threaded crawling.

Core code and results

import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from concurrent.futures import ThreadPoolExecutor  

target_url = "http://www.weather.com.cn"
headers = {'User-Agent': 'Mozilla/5.0'}


def get_image_links(url):
    print(f"正在分析网页: {url} ...")
    try:
        resp = requests.get(url, headers=headers, timeout=10)
        soup = BeautifulSoup(resp.content, 'html.parser')
        links = []
        for img in soup.find_all('img'):
            src = img.get('src') or img.get('data-src')
            if src:
                full_url = urljoin(url, src)
                links.append(full_url)
        # Deduplicate the collected links before returning
        return list(set(links))
    except Exception as e:
        print(f"网页打开失败: {e}")
        return []


def download_one(img_url, index, save_folder):
    try:
        # File names: pic_1.jpg, pic_2.jpg ...
        filename = f"pic_{index}.jpg"
        filepath = os.path.join(save_folder, filename)
        if os.path.exists(filepath):
            print(f"跳过(已存在): {filename} \n   -> 来源: {img_url}")
            return
        data = requests.get(img_url, headers=headers, timeout=5).content
        with open(filepath, 'wb') as f:
            f.write(data)
        print(f"√ 下载成功: {filename} \n   -> 来源: {img_url}")
    except Exception as e:
        print(f"× 下载失败: {img_url} -> {e}")

def run_single_thread():
    print("\n" + "=" * 40)
    print("单线程下载")
    print("=" * 40)

    save_folder = "images_single"
    os.makedirs(save_folder, exist_ok=True)

    links = get_image_links(target_url)
    print(f"找到 {len(links)} 张图片,开始排队下载...\n")

    # Plain loop: the next image only starts after the previous one has finished
    for i, link in enumerate(links):
        download_one(link, i + 1, save_folder)

    print("\n单线程下载全部完成")

def run_multi_thread():
    print("\n" + "=" * 40)
    print("多线程下载 ")
    print("=" * 40)

    save_folder = "images_multi"
    os.makedirs(save_folder, exist_ok=True)

    links = get_image_links(target_url)
    print(f"找到 {len(links)} 张图片,启动 10 个线程同时下载...\n")

    # Thread pool: max_workers=10 lets up to 10 downloads run at the same time
    with ThreadPoolExecutor(max_workers=10) as executor:
        for i, link in enumerate(links):
            executor.submit(download_one, link, i + 1, save_folder)

    print("\n多线程下载指令已全部发送")

if __name__ == "__main__":
    run_single_thread()
    run_multi_thread()

(Screenshots of the single-threaded and multi-threaded download results)

Reflections

I used requests to fetch the page source and BeautifulSoup to parse out the image links, then reused a single download function for both the sequential single-threaded run and the concurrent multi-threaded run based on ThreadPoolExecutor.
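As a small extension, the futures returned by submit can be collected and waited on explicitly, which makes it easy to report progress per task. Below is a minimal sketch reusing the functions above; the function name run_multi_thread_tracked and its folder name are illustrative, not part of the original script.

from concurrent.futures import ThreadPoolExecutor, as_completed

def run_multi_thread_tracked(save_folder="images_multi_tracked"):
    # Variant of run_multi_thread that waits on every future explicitly.
    os.makedirs(save_folder, exist_ok=True)
    links = get_image_links(target_url)
    with ThreadPoolExecutor(max_workers=10) as executor:
        # Map each future back to its URL so progress can be reported per task.
        futures = {executor.submit(download_one, link, i + 1, save_folder): link
                   for i, link in enumerate(links)}
        done = 0
        for future in as_completed(futures):
            future.result()  # re-raises anything download_one did not catch itself
            done += 1
            print(f"progress: {done}/{len(futures)}")

Since the with block in run_multi_thread already waits for all submitted tasks before exiting, this variant mainly adds per-task progress reporting and surfaces unexpected exceptions instead of losing them silently.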

Task ②

Stock information crawling

Requirements

Crawl stock information with Scrapy and XPath.

Core code and results

import scrapy
from datetime import datetime
from stock_spider.items import StockItem
import json


class EastmoneySpider(scrapy.Spider):
    name = "eastmoney"
    allowed_domains = ["eastmoney.com"]

    # Eastmoney back-end list API that returns quotation data as JSON
    base_url = 'http://80.push2.eastmoney.com/api/qt/clist/get'

    def start_requests(self):
        params = {
            'pn': 1,
            'pz': 100,
            'po': 1,
            'np': 1,
            'fltt': 2,
            'invt': 2,
            'fid': 'f3',
            'fs': 'm:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23',
            'fields': 'f2,f3,f4,f5,f6,f7,f12,f14,f15,f16,f17,f18'
        }
        url = self.base_url + '?' + '&'.join([f"{k}={v}" for k, v in params.items()])
        yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        try:
            data = json.loads(response.text)
            stocks = data.get('data', {}).get('diff', [])
            for index, stock in enumerate(stocks, start=1):
                item = StockItem()
                item['sequence_id'] = index
                item['stock_code'] = stock.get('f12', '')
                item['stock_name'] = stock.get('f14', '')
                item['latest_price'] = float(stock.get('f2', 0))
                item['change_percent'] = f"{stock.get('f3', 0):.2f}%"
                item['change_amount'] = float(stock.get('f4', 0))
                item['volume'] = f"{float(stock.get('f5', 0) / 10000):.2f}万"
                item['turnover'] = f"{float(stock.get('f6', 0) / 100000000):.2f}亿"
                item['amplitude'] = f"{stock.get('f7', 0):.2f}%"
                item['highest'] = float(stock.get('f15', 0))
                item['lowest'] = float(stock.get('f16', 0))
                item['today_open'] = float(stock.get('f17', 0))
                item['yesterday_close'] = float(stock.get('f18', 0))
                item['crawl_time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                yield item
        except Exception as e:
            self.logger.error(f"✗ 数据解析失败: {e}")

(Screenshots of the crawled stock data)

Reflections

Using the Scrapy framework, I bypassed the front-end page and sent requests directly to Eastmoney's back-end JSON API, then parsed the returned JSON and mapped specific keys such as f12 and f14 to the stock fields. I initially had trouble creating the Scrapy project and configuring the pipeline file, which I worked through by asking a large language model and searching online.
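Since the items and pipeline configuration were the tricky part, here is a minimal sketch of what stock_spider/items.py needs to declare, with one Field per key the spider assigns; the pipeline class name in the commented settings line is a placeholder, not necessarily the project's actual class.

# stock_spider/items.py: one Field per key assigned in EastmoneySpider.parse
import scrapy

class StockItem(scrapy.Item):
    sequence_id = scrapy.Field()
    stock_code = scrapy.Field()
    stock_name = scrapy.Field()
    latest_price = scrapy.Field()
    change_percent = scrapy.Field()
    change_amount = scrapy.Field()
    volume = scrapy.Field()
    turnover = scrapy.Field()
    amplitude = scrapy.Field()
    highest = scrapy.Field()
    lowest = scrapy.Field()
    today_open = scrapy.Field()
    yesterday_close = scrapy.Field()
    crawl_time = scrapy.Field()

# stock_spider/settings.py: register the pipeline (class name is illustrative)
# ITEM_PIPELINES = {"stock_spider.pipelines.StockPipeline": 300}

With the project laid out this way, the spider runs with scrapy crawl eastmoney, or scrapy crawl eastmoney -o stocks.csv to dump the items to a file without a custom pipeline.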

Task ③

Foreign exchange rate crawling

Requirements

Crawl foreign exchange rate data with Scrapy + XPath + MySQL.

Core code and results

import scrapy
from scrapy_project.items import CurrencyItem
from datetime import datetime


class CurrencySpider(scrapy.Spider):
    name = "currency"
    allowed_domains = ["boc.cn"]
    start_urls = ["https://www.boc.cn/sourcedb/whpj/"]

    def parse(self, response):
        rows = response.xpath('//table[@align="left" and @width="100%"]//tr')
        update_time = None
        time_xpath_options = [
            '//div[@align="right"]//text()',
            '//p[contains(text(), "更新时间")]//text()',
            '//font[contains(text(), "更新时间")]//text()',
            '//td[contains(text(), "更新时间")]//text()',
            '//*[contains(text(), "更新时间")]//text()',
        ]
        for xpath in time_xpath_options:
            time_texts = response.xpath(xpath).getall()
            for text in time_texts:
                if '更新时间' in text or ':' in text:
                    update_time = text.strip()
                    self.logger.info(f"找到更新时间: {update_time}")
                    break
            if update_time:
                break
        if not update_time:
            update_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            self.logger.warning(f"未找到更新时间,使用当前时间: {update_time}")
        self.logger.info(f"找到 {len(rows)} 行数据")
        for i, row in enumerate(rows):
            if i == 0:
                continue
            cells = row.xpath('.//td/text()').getall()
            cells = [cell.strip() for cell in cells if cell.strip()]
            if len(cells) < 6:
                self.logger.warning(f"第 {i} 行数据不完整: {cells}")
                continue
            try:
                item = CurrencyItem()
                item['currency'] = cells[0]  # currency name
                item['tbp'] = self.parse_float(cells[1])  # spot exchange buying rate
                item['cbp'] = self.parse_float(cells[2])  # cash buying rate
                item['tsp'] = self.parse_float(cells[3])  # spot exchange selling rate
                item['csp'] = self.parse_float(cells[4])  # cash selling rate
                item['time'] = update_time

                yield item

            except Exception as e:
                self.logger.error(f"解析第 {i} 行时出错: {e}, 数据: {cells}")

    def parse_float(self, value):
        try:
            value = value.strip()
            if not value or value == '--':
                return 0.0
            return float(value)
        except (ValueError, AttributeError):
            return 0.0

(Screenshots of the crawled exchange rate data)

Reflections

With Scrapy I fetched the Bank of China foreign exchange quotation page, tried several XPath expressions to locate the page's update time, and iterated over the HTML table rows to extract and clean each currency's spot/cash buying and selling rates, yielding structured CurrencyItem objects. The main difficulty was reading the update time: the first locator returned nothing, so I fell back through XPaths targeting specific layout attributes and keyword-bearing tags, and finally a page-wide wildcard search, so that a timestamp can always be obtained.
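The assignment also calls for storing the results in MySQL, and that pipeline is not shown above. The following is only a sketch of one way to write it with pymysql; the connection parameters, the table name forex_rates, and the class name MySQLPipeline are placeholders to adapt to the local setup.

# scrapy_project/pipelines.py: illustrative MySQL pipeline, not the original one
import pymysql

class MySQLPipeline:
    def open_spider(self, spider):
        # Placeholder credentials; adjust to the local MySQL installation.
        self.conn = pymysql.connect(host="localhost", user="root",
                                    password="password", database="spider_db",
                                    charset="utf8mb4")
        self.cursor = self.conn.cursor()
        self.cursor.execute(
            "CREATE TABLE IF NOT EXISTS forex_rates ("
            "id INT AUTO_INCREMENT PRIMARY KEY, currency VARCHAR(32), "
            "tbp FLOAT, cbp FLOAT, tsp FLOAT, csp FLOAT, time VARCHAR(32))")
        self.conn.commit()

    def process_item(self, item, spider):
        # Insert one CurrencyItem per table row.
        self.cursor.execute(
            "INSERT INTO forex_rates (currency, tbp, cbp, tsp, csp, time) "
            "VALUES (%s, %s, %s, %s, %s, %s)",
            (item["currency"], item["tbp"], item["cbp"],
             item["tsp"], item["csp"], item["time"]))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()

It would be enabled with ITEM_PIPELINES = {"scrapy_project.pipelines.MySQLPipeline": 300} in settings.py (module path and class name again illustrative).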
