102302147 傅乐宜 Assignment 3

1. Pick a website and crawl all of the images on it, e.g. the China Weather Network (http://www.weather.com.cn). Implement both single-threaded and multi-threaded crawling.

Content

Since this site has already been crawled several times in earlier assignments, the page structure is not shown again.

Core code

1. Single-threaded

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import os

if not os.path.exists('images'):
    os.makedirs('images')
max_image=47
count=0
base_url="http://www.weather.com.cn"
headers = {'User-Agent': 'Mozilla/5.0'}
response=requests.get(base_url, headers=headers, timeout=10)
print(response)
response.encoding=response.apparent_encoding
soup=BeautifulSoup(response.text,'lxml')
images=soup.select("img")
urls=[]

def download(img_url):
    global count
    try:
        print(f"Downloading: {img_url}")
        
        # Request the image
        response = requests.get(img_url, headers=headers, timeout=10)
        response.raise_for_status()
        
        # Derive a file name from the URL
        filename = os.path.basename(img_url)
        if not filename or '.' not in filename:
            filename = f"image_{count + 1}.jpg"
        
        # Full path for the saved file
        filepath = os.path.join('images', filename)
        
        # Save the image
        with open(filepath, 'wb') as f:
            f.write(response.content)
        
        count += 1
        print(f"✓ Downloaded image {count}: {filename}")
        return True
        
    except Exception as e:
        print(f"✗ Download failed: {str(e)}")
        return False

for image in images:
    if count >= max_image:
        print("Reached the limit, stopping")
        break
    try:
        src = image["src"]
        url = urljoin(base_url, src)
        if url not in urls:
            urls.append(url)
            print(url)
            download(url)
    except Exception as err:
        print(err)

2. Multi-threaded
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import os
import concurrent.futures
import threading

if not os.path.exists('images1'):
    os.makedirs('images1')

max_image = 47
base_url = "http://www.weather.com.cn"
headers = {'User-Agent': 'Mozilla/5.0'}
response = requests.get(base_url, headers=headers, timeout=10)
print(response)
response.encoding = response.apparent_encoding
soup = BeautifulSoup(response.text, 'lxml')
images = soup.select("img")
urls = []

# Global counter and lock
download_count = 0
lock = threading.Lock()
# Thread-safe download worker
def download(img_url):
    global download_count
    try:
        print(f"Downloading: {img_url}")
        
        # Request the image
        response = requests.get(img_url, headers=headers, timeout=10)
        response.raise_for_status()
        
        # Derive a file name from the URL
        filename = os.path.basename(img_url)
        if not filename or '.' not in filename:
            filename = f"image_{hash(img_url)}.jpg"
        
        # Full path for the saved file
        filepath = os.path.join('images1', filename)
        
        # Save the image
        with open(filepath, 'wb') as f:
            f.write(response.content)
        
        # Update the shared counter under the lock
        with lock:
            download_count += 1
            print(f"✓ Downloaded image {download_count}: {filename}")
        
        return True
        
    except Exception as e:
        print(f"✗ Download failed: {str(e)}")
        return False

# Use a ThreadPoolExecutor for concurrent downloads
def main():
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        futures = []
        for image in images:
            # Enforce the cap (max_image was unused in the original loop)
            if len(urls) >= max_image:
                print("Reached the limit, stopping")
                break
            try:
                src = image["src"]
                url = urljoin(base_url, src)
                if url not in urls:
                    urls.append(url)
                    print(url)
                    futures.append(executor.submit(download, url))
            except Exception as err:
                print(err)
        
        # Wait for all tasks to finish
        for future in concurrent.futures.as_completed(futures):
            future.result()

if __name__ == "__main__":
    main()

Results

(screenshot: 2025-11-25 162034)

Reflections

The single-threaded and multi-threaded versions only really differ in the download phase: unlike the single-threaded crawler, the multi-threaded one is never stuck waiting on a single slow download.
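
To make the difference concrete, here is a minimal timing sketch (slow_download is a hypothetical stand-in for a real request): a sequential loop accumulates the waiting time of every download, while a thread pool overlaps it.

import time
import concurrent.futures

def slow_download(i):
    time.sleep(1)  # simulates one slow network request
    return i

start = time.time()
for i in range(5):
    slow_download(i)
print(f"sequential: {time.time() - start:.1f}s")  # roughly 5s

start = time.time()
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as ex:
    list(ex.map(slow_download, range(5)))
print(f"threaded: {time.time() - start:.1f}s")  # roughly 1s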

Links:
https://gitee.com/wugao00882999/data-collection/blob/master/%E4%BD%9C%E4%B8%9A3/weather1.py
https://gitee.com/wugao00882999/data-collection/blob/master/%E4%BD%9C%E4%B8%9A3/weather2.py

2. Master the serialized output of Item and Pipeline data in Scrapy; crawl stock information using the Scrapy framework + XPath + MySQL database storage.

Content

This site has also been crawled in a previous assignment, so the page structure is not shown again.

Core code

spider

import scrapy
import json
import re
from Stock.items import StockItem

class EastMoneySpider(scrapy.Spider):
    name = 'eastmoney'
    allowed_domains = ['push2.eastmoney.com']
    
    def start_requests(self):
        url = "https://push2.eastmoney.com/api/qt/clist/get"
        
        params = {
            "np": 1,
            "fltt": 1,
            "invt": 2,
            "cb": "jQuery37106236146953184138_1761719786814",
            "fs": "m:0+t:6+f:!2,m:0+t:80+f:!2,m:1+t:2+f:!2,m:1+t:23+f:!2,m:0+t:81+s:262144+f:!2",
            "fields": "f12,f13,f14,f1,f2,f4,f3,f152,f5,f6,f7,f15,f18,f16,f17,f10,f8,f9,f23",
            "fid": "f3",
            "pn": 1,
            "pz": 20,
            "po": 1,
            "dect": 1,
            "ut": "fa5fd1943c7b386f172d6893dbfba10b",
            "wbp2u": "|0|0|0|web",
            "_": "1761719786819"
        }
        
        yield scrapy.Request(
            url=f"{url}?{'&'.join([f'{k}={v}' for k, v in params.items()])}",
            callback=self.parse
        )
    
    def parse(self, response):
        content = response.text
        # Strip the JSONP (jQuery callback) wrapper to get the raw JSON
        pattern = r'^.*?\((.*)\);$'
        match = re.match(pattern, content)
        
        if match:
            json_str = match.group(1)
            data = json.loads(json_str)
            
            if 'data' in data and 'diff' in data['data']:
                stocks = data['data']['diff']
                
                for stock in stocks:
                    item = StockItem()
                    item['stock_code'] = stock['f12']
                    item['stock_name'] = stock['f14']
                    
                    # With fltt=1 the API appears to return prices and
                    # percentages as integers scaled by 100 (an observed
                    # convention); dividing restores two decimals for
                    # values of any length, which the original fixed-
                    # position string splice only got right for four-digit
                    # values. Suspended stocks report "-", so guard for
                    # non-numeric values.
                    price = stock['f2']
                    item['latest_price'] = f"{price / 100:.2f}" if isinstance(price, (int, float)) else str(price)
                    
                    pct = stock['f3']
                    item['change_percentage'] = f"{pct / 100:.2f}%" if isinstance(pct, (int, float)) else str(pct)
                    
                    yield item
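
The spider imports StockItem from Stock.items, which is not shown above. A minimal sketch of what that items.py presumably contains, with field names taken from the spider and pipeline code:

import scrapy

class StockItem(scrapy.Item):
    # Fields match what the spider assigns and the pipeline reads
    stock_code = scrapy.Field()
    stock_name = scrapy.Field()
    latest_price = scrapy.Field()
    change_percentage = scrapy.Field()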

pipeline

from itemadapter import ItemAdapter
import pymysql
import json
class MySQLPipeline:
    def __init__(self, mysql_host, mysql_db, mysql_user, mysql_password):
        self.mysql_host = mysql_host
        self.mysql_db = mysql_db
        self.mysql_user = mysql_user
        self.mysql_password = mysql_password
    
    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mysql_host=crawler.settings.get('MYSQL_HOST'),
            mysql_db=crawler.settings.get('MYSQL_DATABASE'),
            mysql_user=crawler.settings.get('MYSQL_USER'),
            mysql_password=crawler.settings.get('MYSQL_PASSWORD')
        )
    
    def open_spider(self, spider):
        self.connection = pymysql.connect(
            host=self.mysql_host,
            user=self.mysql_user,
            password=self.mysql_password,
            database=self.mysql_db,
            charset='utf8mb4'
        )
        self.cursor = self.connection.cursor()
        
        # Create the table if it does not exist
        self.cursor.execute('''
            CREATE TABLE IF NOT EXISTS stocks (
                id INT AUTO_INCREMENT PRIMARY KEY,
                stock_code VARCHAR(20) NOT NULL,
                stock_name VARCHAR(100) NOT NULL,
                latest_price VARCHAR(20),
                change_percentage VARCHAR(20),
                UNIQUE KEY unique_stock (stock_code)
            )
        ''')
        self.connection.commit()
    
    def close_spider(self, spider):
        self.connection.close()
    
    def process_item(self, item, spider):
        # Use INSERT ... ON DUPLICATE KEY UPDATE to avoid duplicate rows
        sql = '''
            INSERT INTO stocks (stock_code, stock_name, latest_price, change_percentage)
            VALUES (%s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE
            stock_name = VALUES(stock_name),
            latest_price = VALUES(latest_price),
            change_percentage = VALUES(change_percentage)
        '''
        
        self.cursor.execute(sql, (
            item['stock_code'],
            item['stock_name'],
            item['latest_price'],
            item['change_percentage']
        ))
        self.connection.commit()
        
        return item

class JsonWriterPipeline:
    def open_spider(self, spider):
        self.file = open('stocks.json', 'w', encoding='utf-8')
        self.file.write('[\n')
        self.first_item = True
    
    def close_spider(self, spider):
        self.file.write('\n]')
        self.file.close()
    
    def process_item(self, item, spider):
        line = '' if self.first_item else ',\n'
        self.first_item = False
        line += json.dumps(dict(item), ensure_ascii=False, indent=2)
        self.file.write(line)
        return item
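
Both pipelines have to be registered in settings.py, and MySQLPipeline pulls its credentials from the crawler settings; neither file is shown above. A minimal sketch of the relevant entries (the module path follows the project name Stock, and the credential values are placeholders):

# settings.py (sketch; adjust credentials to your environment)
MYSQL_HOST = 'localhost'
MYSQL_DATABASE = 'stock_db'      # hypothetical database name
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'your_password'

ITEM_PIPELINES = {
    'Stock.pipelines.MySQLPipeline': 300,
    'Stock.pipelines.JsonWriterPipeline': 400,
}

With these in place, the spider runs with scrapy crawl eastmoney from the project root.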

Results

(screenshot: 2025-11-25 170652)

(screenshot: 2025-11-25 191153)

Reflections

This assignment gave me a much clearer picture of the structure and execution flow of a Scrapy project, and of how to port an existing crawler onto the framework.
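
For reference, this is the standard layout scrapy startproject Stock generates, into which the files above slot (the spider filename is hypothetical):

Stock/
├── scrapy.cfg
└── Stock/
    ├── items.py          # StockItem
    ├── pipelines.py      # MySQLPipeline, JsonWriterPipeline
    ├── settings.py
    └── spiders/
        └── eastmoney.py  # EastMoneySpider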
Link: https://gitee.com/wugao00882999/data-collection/tree/master/%E4%BD%9C%E4%B8%9A3/Stock

3. Master the serialized output of Item and Pipeline data in Scrapy; crawl foreign-exchange data using the Scrapy framework + XPath + MySQL database storage.

Content

Page structure

(screenshot: 2025-11-25 171742)

(screenshot: 2025-11-25 182625)

The page structure is simple: the rates sit in a plain table, so the tr and td elements are easy to locate.

Core code

spider

import scrapy
from Forex.items import BocExchangeItem
from datetime import datetime

class BocSpider(scrapy.Spider):
    name = 'boc_exchange'
    allowed_domains = ['boc.cn']
    start_urls = ['https://www.boc.cn/sourcedb/whpj/']
    
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'DOWNLOAD_DELAY': 1,
        'CONCURRENT_REQUESTS': 1,
    }

    def parse(self, response):
        # Locate the table rows
        rows = response.xpath('//div[@class="publish"]//table/tr')
        
        # Skip the header row and process the data rows
        for row in rows[1:]:
            columns = row.xpath('./td/text()').getall()
            
            if len(columns) >= 6:
                item = BocExchangeItem()
                item['currency_name'] = columns[0].strip() if columns[0].strip() else None
                item['buying_rate'] = columns[1]
                item['cash_buying_rate'] = columns[2]
                item['selling_rate'] = columns[3]
                item['cash_selling_rate'] = columns[4]
                item['middle_rate'] = columns[5]
                item['pub_time'] = columns[6].strip() if len(columns) > 6 else None
                item['crawl_time'] = datetime.now()
                
                yield item
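
The spider imports BocExchangeItem from Forex.items, which is not shown above. A minimal sketch of that items.py, with field names taken from the spider and pipeline code:

import scrapy

class BocExchangeItem(scrapy.Item):
    # Fields match what the spider assigns and the pipeline stores
    currency_name = scrapy.Field()
    buying_rate = scrapy.Field()
    cash_buying_rate = scrapy.Field()
    selling_rate = scrapy.Field()
    cash_selling_rate = scrapy.Field()
    middle_rate = scrapy.Field()
    pub_time = scrapy.Field()
    crawl_time = scrapy.Field()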



pipeline

from itemadapter import ItemAdapter
import sqlite3
import logging

class SQLitePipeline:
    def __init__(self, sqlite_db):
        self.sqlite_db = sqlite_db
        self.conn = None
        self.cursor = None

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            sqlite_db=crawler.settings.get('SQLITE_DB', 'exchange_rates.db')
        )

    def open_spider(self, spider):
        # Connect to the database
        self.conn = sqlite3.connect(self.sqlite_db)
        self.cursor = self.conn.cursor()
        self.create_table()

    def close_spider(self, spider):
        # Close the database connection
        if self.conn:
            self.conn.close()

    def create_table(self):
        # Create the data table
        create_table_sql = """
        CREATE TABLE IF NOT EXISTS exchange_rates (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            currency_name VARCHAR(50) NOT NULL,
            buying_rate DECIMAL(10,4),
            cash_buying_rate DECIMAL(10,4),
            selling_rate DECIMAL(10,4),
            cash_selling_rate DECIMAL(10,4),
            middle_rate DECIMAL(10,4),
            pub_time DATETIME,
            crawl_time TIMESTAMP
        )
        """
        self.cursor.execute(create_table_sql)
        self.conn.commit()

    def process_item(self, item, spider):
        # Only insert if this record does not already exist
        if not self.data_exists(item):
            self.insert_data(item)
            spider.logger.info(f"Inserted: {item['currency_name']}")
        else:
            spider.logger.info(f"Already exists: {item['currency_name']}")
        
        return item

    def data_exists(self, item):
        # Check for an existing row with the same currency and publish time
        check_sql = """
        SELECT COUNT(*) FROM exchange_rates 
        WHERE currency_name = ? AND pub_time = ?
        """
        self.cursor.execute(check_sql, (item['currency_name'], item['pub_time']))
        return self.cursor.fetchone()[0] > 0

    def insert_data(self, item):
        # Insert a new row
        insert_sql = """
        INSERT INTO exchange_rates 
        (currency_name, buying_rate, cash_buying_rate, selling_rate, 
         cash_selling_rate, middle_rate, pub_time, crawl_time)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
        """
        self.cursor.execute(insert_sql, (
            item['currency_name'],
            item['buying_rate'],
            item['cash_buying_rate'],
            item['selling_rate'],
            item['cash_selling_rate'],
            item['middle_rate'],
            item['pub_time'],
            item['crawl_time']
        ))
        self.conn.commit()
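
The pipeline reads SQLITE_DB from the crawler settings (falling back to exchange_rates.db) and must be registered in settings.py; a minimal sketch (the module path follows the project name Forex):

# settings.py (sketch)
SQLITE_DB = 'exchange_rates.db'

ITEM_PIPELINES = {
    'Forex.pipelines.SQLitePipeline': 300,
}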

Results

(screenshot: 2025-11-25 185012)

Reflections

Link: https://gitee.com/wugao00882999/data-collection/tree/master/%E4%BD%9C%E4%B8%9A3/Forex
