Data Collection: Assignment 3

Assignment 1

Code and Results

Code

The single-threaded code is as follows:

import requests
from bs4 import BeautifulSoup
import os
import time
import urllib.parse

class WeatherImageDownloader:
    def __init__(self):
        self.download_count = 0
        self.max_images = 15  # student ID ends in 15
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        })

    def setup_environment(self):
        """Prepare the download directory"""
        download_dir = "downloaded_images"
        if not os.path.exists(download_dir):
            os.makedirs(download_dir)
            print(f"Created download directory: {download_dir}")
        return download_dir
    
    def fetch_page_content(self, url):
        """Fetch the page HTML"""
        try:
            response = self.session.get(url, timeout=15)
            response.encoding = 'utf-8'
            return response.text
        except Exception as e:
            print(f"Failed to fetch page: {e}")
            return None
    
    def extract_image_links(self, html_content, base_url):
        """Extract image URLs from the HTML"""
        image_links = []
        soup = BeautifulSoup(html_content, 'html.parser')

        img_elements = soup.find_all('img')
        print(f"Found {len(img_elements)} <img> tags")

        for img in img_elements:
            if len(image_links) >= self.max_images:
                break

            src = img.get('src') or img.get('data-src')
            if src:
                # Build the absolute URL
                if src.startswith('//'):
                    full_url = 'https:' + src
                elif src.startswith('http'):
                    full_url = src
                else:
                    full_url = urllib.parse.urljoin(base_url, src)

                if full_url not in image_links:
                    image_links.append(full_url)
                    print(f"Added image link: {full_url}")

        return image_links
    
    def download_image_file(self, img_url, save_path, file_index):
        """Download a single image file"""
        try:
            response = self.session.get(img_url, timeout=20, stream=True)
            if response.status_code == 200:
                # Determine the file extension
                file_ext = self.get_file_extension(img_url, response.headers.get('content-type', ''))
                filename = f"img_{file_index}{file_ext}"
                full_path = os.path.join(save_path, filename)

                with open(full_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)

                self.download_count += 1
                print(f"✓ Downloaded [{self.download_count}/{self.max_images}]: {filename}")
                return True
        except Exception as e:
            print(f"✗ Download failed {img_url}: {e}")
        return False
    
    def get_file_extension(self, url, content_type):
        """Determine the file extension"""
        # Try the URL path first
        if '.' in url:
            path = url.split('?')[0]
            ext = '.' + path.split('.')[-1].lower()
            if ext in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp']:
                return ext

        # Fall back to the Content-Type header
        if 'jpeg' in content_type or 'jpg' in content_type:
            return '.jpg'
        elif 'png' in content_type:
            return '.png'
        elif 'gif' in content_type:
            return '.gif'
        elif 'webp' in content_type:
            return '.webp'

        return '.jpg'
    
    def execute_download(self, target_url):
        """Run the download task"""
        print(f"Starting single-threaded image download, target count: {self.max_images}")
        start_time = time.time()

        download_dir = self.setup_environment()

        # Fetch the page
        html = self.fetch_page_content(target_url)
        if not html:
            print("Could not fetch the page, aborting")
            return

        # Extract image links
        image_urls = self.extract_image_links(html, target_url)
        print(f"Extracted {len(image_urls)} candidate image links")

        # Download the images
        for idx, img_url in enumerate(image_urls):
            if self.download_count >= self.max_images:
                break
            self.download_image_file(img_url, download_dir, idx + 1)

        end_time = time.time()
        print(f"\nDownload finished! Successfully downloaded {self.download_count} images")
        print(f"Total time: {end_time - start_time:.2f} seconds")

# Entry point
if __name__ == "__main__":
    downloader = WeatherImageDownloader()
    weather_url = "http://www.weather.com.cn/weather/101280601.shtml"
    downloader.execute_download(weather_url)
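
As a quick reference, this is how the URL normalisation in extract_image_links resolves the three kinds of src values it handles (the example paths below are made up purely for illustration):

import urllib.parse

base = "http://www.weather.com.cn/weather/101280601.shtml"

# protocol-relative src -> prefix a scheme
print("https:" + "//example.invalid/pic/a.png")
# already absolute src -> used as-is
print("http://example.invalid/pic/b.jpg")
# relative src -> resolved against the page URL
print(urllib.parse.urljoin(base, "pic/c.gif"))    # .../weather/pic/c.gif
print(urllib.parse.urljoin(base, "/pic/d.gif"))   # http://www.weather.com.cn/pic/d.gif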

The multi-threaded code is as follows:

import requests
from bs4 import BeautifulSoup
import os
import threading
import time
import urllib.parse
from concurrent.futures import ThreadPoolExecutor, as_completed

class ConcurrentImageDownloader:
    def __init__(self, max_workers=5):
        self.downloaded = 0
        self.max_images = 15
        self.lock = threading.Lock()
        self.max_workers = max_workers
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        })
        self.should_stop = False  
    
    def prepare_directory(self):
        """Prepare the output directory"""
        output_dir = "concurrent_images"
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        return output_dir

    def get_webpage(self, url):
        """Fetch the page HTML"""
        try:
            resp = self.session.get(url, timeout=10)
            resp.encoding = 'utf-8'
            return resp.text
        except Exception as e:
            print(f"Page request failed: {e}")
            return None
    
    def parse_image_urls(self, html_doc, base_domain):
        """Parse image URLs from the page"""
        found_urls = []
        soup = BeautifulSoup(html_doc, 'html.parser')

        for img_tag in soup.find_all('img'):
            # Honour the stop flag
            if self.should_stop:
                break

            img_src = img_tag.get('src') or img_tag.get('data-src')
            if img_src:
                # Normalise the URL
                if img_src.startswith('//'):
                    complete_url = 'https:' + img_src
                elif img_src.startswith('/'):
                    complete_url = urllib.parse.urljoin(base_domain, img_src)
                elif not img_src.startswith('http'):
                    complete_url = urllib.parse.urljoin(base_domain, img_src)
                else:
                    complete_url = img_src

                if complete_url not in found_urls:
                    found_urls.append(complete_url)

        return found_urls
    
    def save_single_image(self, task_data):
        """Download and save a single image"""
        url, index, save_folder = task_data

        # Check the stop flag again before downloading
        if self.should_stop:
            return None

        try:
            response = self.session.get(url, timeout=25, stream=True)
            if response.status_code == 200:
                # Acquire the lock and re-check the count before writing
                with self.lock:
                    if self.downloaded >= self.max_images:
                        self.should_stop = True
                        return None

                    # Determine the file extension
                    extension = self.determine_extension(url, response.headers.get('content-type', ''))
                    file_name = f"picture_{index}{extension}"
                    full_path = os.path.join(save_folder, file_name)

                    # Write the file
                    with open(full_path, 'wb') as file_obj:
                        for data_chunk in response.iter_content(chunk_size=8192):
                            file_obj.write(data_chunk)

                    self.downloaded += 1
                    current_count = self.downloaded

                print(f"Thread finished download [{current_count}/{self.max_images}]: {file_name}")

                # Set the stop flag once the target count is reached
                if current_count >= self.max_images:
                    with self.lock:
                        self.should_stop = True

                return file_name
        except Exception as e:
            print(f"✗ Download error {url}: {e}")

        return None
    
    def determine_extension(self, url, mime_type):
        """Determine the file extension"""
        valid_extensions = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp']

        # Try the URL path first
        path_segments = url.split('/')
        if path_segments:
            last_segment = path_segments[-1].split('?')[0]
            if '.' in last_segment:
                potential_ext = '.' + last_segment.split('.')[-1].lower()
                if potential_ext in valid_extensions:
                    return potential_ext

        # Fall back to the MIME type
        type_mapping = {
            'image/jpeg': '.jpg',
            'image/jpg': '.jpg',
            'image/png': '.png',
            'image/gif': '.gif',
            'image/webp': '.webp',
            'image/bmp': '.bmp'
        }

        return type_mapping.get(mime_type, '.jpg')
    
    def run_concurrent_download(self, start_url):
        """Run the concurrent download"""
        print(f"Starting multi-threaded download, workers: {self.max_workers}, target count: {self.max_images}")
        begin_time = time.time()

        storage_dir = self.prepare_directory()

        page_content = self.get_webpage(start_url)
        if not page_content:
            return

        image_list = self.parse_image_urls(page_content, start_url)
        print(f"Parsed {len(image_list)} image URLs")

        download_tasks = []
        for i, image_url in enumerate(image_list):
            with self.lock:
                if self.downloaded >= self.max_images:
                    break
            task = (image_url, i + 1, storage_dir)
            download_tasks.append(task)

        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            futures = []

            for task in download_tasks:
                if self.should_stop:
                    break
                future = executor.submit(self.save_single_image, task)
                futures.append(future)

            completed_count = 0
            for future in as_completed(futures):
                if self.should_stop and completed_count >= self.max_images:
                    for f in futures:
                        f.cancel()
                    executor.shutdown(wait=False)
                    break

                try:
                    future.result(timeout=30)
                    completed_count += 1
                except Exception as e:
                    print(f"Task raised an exception: {e}")

        end_time = time.time()
        actual_downloaded = min(self.downloaded, self.max_images)
        print(f"\nConcurrent download finished! Images downloaded: {actual_downloaded}")
        print(f"Total time: {end_time - begin_time:.2f} seconds")

# Run the multi-threaded downloader
if __name__ == "__main__":
    concurrent_downloader = ConcurrentImageDownloader(max_workers=6)
    target_website = "http://www.weather.com.cn/weather/101280601.shtml"
    concurrent_downloader.run_concurrent_download(target_website)
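
One design note on save_single_image above: the lock is held for the entire response read and file write, which keeps the counter exact but serialises most of the I/O, so the worker threads gain less than they could. A minimal alternative sketch (not the code that produced the results below) would reserve a slot under the lock and do the slow I/O outside it; it is written as a drop-in method for the ConcurrentImageDownloader class and reuses its existing attributes:

    def save_single_image_alt(self, task_data):
        """Sketch: reserve a download slot under the lock, then download outside it."""
        url, index, save_folder = task_data

        # Hold the lock only long enough to claim one of the remaining slots.
        with self.lock:
            if self.downloaded >= self.max_images or self.should_stop:
                return None
            self.downloaded += 1
            slot = self.downloaded  # 1-based position of this download

        try:
            # Network and disk I/O now run concurrently across threads.
            response = self.session.get(url, timeout=25, stream=True)
            response.raise_for_status()
            extension = self.determine_extension(url, response.headers.get('content-type', ''))
            full_path = os.path.join(save_folder, f"picture_{index}{extension}")
            with open(full_path, 'wb') as fh:
                for chunk in response.iter_content(chunk_size=8192):
                    fh.write(chunk)
            print(f"[{slot}/{self.max_images}] saved {full_path}")
            if slot >= self.max_images:
                self.should_stop = True
            return full_path
        except Exception as exc:
            # Give the slot back so another URL can still be tried.
            with self.lock:
                self.downloaded -= 1
            print(f"Download failed for {url}: {exc}")
            return None
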
Results

Single-threaded run: (screenshot)

Multi-threaded run: (screenshot)

Downloaded images: (screenshot)

Assignment 2

Code and Results

Code

items.py:

import scrapy

class StockItem(scrapy.Item):
    # Stock data fields
    record_no = scrapy.Field()        # record number
    stock_id = scrapy.Field()         # stock code
    stock_title = scrapy.Field()      # stock name
    current_val = scrapy.Field()      # latest price
    change_pct = scrapy.Field()       # change percentage
    change_amt = scrapy.Field()       # change amount
    trade_qty = scrapy.Field()        # trading volume
    price_swing = scrapy.Field()      # amplitude

spider.py:

import scrapy
import json
import re
from stock_crawler.items import StockItem

class StockDataSpider(scrapy.Spider):
    name = "stock_data"
    allowed_domains = ["eastmoney.com"]

    def start_requests(self):
        """Generate the initial request"""
        base_api = "http://quote.eastmoney.com/center/api/sidemenu.json"
        yield scrapy.Request(url=base_api, callback=self.process_api_response)

    def process_api_response(self, response):
        """Handle the API response"""
        try:
            market_info = json.loads(response.text)
            # Request the stock listing pages
            for page in range(1, 6):
                page_url = f"http://quote.eastmoney.com/center/gridlist.html#p={page}"
                yield scrapy.Request(url=page_url, callback=self.extract_stock_data,
                                   meta={'page_num': page})
        except Exception as e:
            self.logger.error(f"Failed to process API response: {e}")
    
    def extract_stock_data(self, response):
        """Extract stock rows from the listing page"""
        current_page = response.meta['page_num']
        self.logger.info(f"Processing page {current_page}")

        # Select the data rows of the quote table
        data_rows = response.xpath('//table[contains(@class, "table")]//tr[position()>1]')

        for row_idx, row in enumerate(data_rows):
            item = StockItem()
            global_idx = (current_page - 1) * len(data_rows) + row_idx + 1

            # Extract the table cells
            cells = row.xpath('./td')
            if len(cells) >= 8:
                item['record_no'] = str(global_idx)
                item['stock_id'] = self.clean_content(cells[1].xpath('string(.)').get())
                item['stock_title'] = self.clean_content(cells[2].xpath('string(.)').get())
                item['current_val'] = self.clean_content(cells[3].xpath('string(.)').get())
                item['change_pct'] = self.clean_content(cells[4].xpath('string(.)').get())
                item['change_amt'] = self.clean_content(cells[5].xpath('string(.)').get())
                item['trade_qty'] = self.clean_content(cells[6].xpath('string(.)').get())
                item['price_swing'] = self.clean_content(cells[7].xpath('string(.)').get())

                if global_idx <= 5:
                    self.logger.info(f"Processing stock: {item['stock_id']} - {item['stock_title']}")

                yield item

        self.logger.info(f"Finished page {current_page}")

    def clean_content(self, text):
        """Normalise whitespace in a text value"""
        if text:
            return re.sub(r'\s+', ' ', text).strip()
        return ""

pipelines.py:

import sqlite3
import os

class DataStoragePipeline:
    def __init__(self, db_path):
        self.db_path = db_path
        self.conn = None
        self.cursor = None
    
    @classmethod
    def from_crawler(cls, crawler):
        db_path = crawler.settings.get('SQLITE_DB_PATH', 'stock_info.db')
        return cls(db_path)
    
    def open_spider(self, spider):
        # Create the parent directory only if the path actually contains one
        db_dir = os.path.dirname(self.db_path)
        if db_dir:
            os.makedirs(db_dir, exist_ok=True)

        self.conn = sqlite3.connect(self.db_path)
        self.cursor = self.conn.cursor()
        self.setup_table()
        spider.logger.info("Connected to the SQLite database")
    
    def setup_table(self):
        """设置数据表"""
        table_sql = """
        CREATE TABLE IF NOT EXISTS stock_records (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            record_no TEXT,
            stock_id TEXT NOT NULL UNIQUE,
            stock_title TEXT,
            current_val REAL,
            change_pct REAL,
            change_amt REAL,
            trade_qty TEXT,
            price_swing REAL
        )
        """
        self.cursor.execute(table_sql)
        self.conn.commit()
    
    def process_item(self, item, spider):
        """处理数据项"""
        insert_sql = """
        INSERT OR REPLACE INTO stock_records 
        (record_no, stock_id, stock_title, current_val, change_pct, change_amt, trade_qty, price_swing)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
        """
        
        try:
            self.cursor.execute(insert_sql, (
                item['record_no'],
                item['stock_id'],
                item['stock_title'],
                self.to_float(item['current_val']),
                self.to_percent(item['change_pct']),
                self.to_float(item['change_amt']),
                item['trade_qty'],
                self.to_percent(item['price_swing'])
            ))
            self.conn.commit()
            spider.logger.info(f"数据保存: {item['stock_id']} - {item['stock_title']}")
        except Exception as e:
            spider.logger.error(f"保存失败 {item['stock_id']}: {e}")
        
        return item
    
    def to_float(self, value):
        """Convert a text value to float"""
        if not value:
            return 0.0
        try:
            cleaned = value.replace(',', '').replace('--', '0').strip()
            return float(cleaned) if cleaned else 0.0
        except (ValueError, AttributeError):
            return 0.0
    
    def to_percent(self, value):
        """Convert a percentage string to float"""
        if not value:
            return 0.0
        try:
            cleaned = value.replace('%', '').replace(',', '').replace('--', '0').strip()
            return float(cleaned) if cleaned else 0.0
        except (ValueError, AttributeError):
            return 0.0
    
    def close_spider(self, spider):
        if self.conn:
            self.conn.close()
        spider.logger.info("数据库连接关闭")
Results

(screenshot)

Assignment 3

Code and Results

Code

items.py:

import scrapy

class ForexDataItem(scrapy.Item):
    # Foreign exchange data fields
    currency = scrapy.Field()          # currency name
    telegraphic_buy = scrapy.Field()   # telegraphic transfer buying rate
    cash_buy = scrapy.Field()          # cash buying rate
    telegraphic_sell = scrapy.Field()  # telegraphic transfer selling rate
    cash_sell = scrapy.Field()         # cash selling rate
    central_rate = scrapy.Field()      # central parity rate
    pub_date = scrapy.Field()          # publication date
    pub_time = scrapy.Field()          # publication time

spiders.py:

import scrapy
from stock_crawler.items import ForexDataItem

class ForexRateSpider(scrapy.Spider):
    name = "forex_rates"
    allowed_domains = ["boc.cn"]
    start_urls = ["https://www.boc.cn/sourcedb/whpj/"]

    def parse(self, response):
        """Parse the foreign exchange rate table"""
        self.logger.info("Fetching foreign exchange rate data")

        # Locate the data rows
        data_rows = response.xpath('//table[@cellpadding="0" and @cellspacing="0"]/tr[position()>1]')

        self.logger.info(f"Found {len(data_rows)} rows of exchange rate data")

        for idx, row in enumerate(data_rows):
            item = ForexDataItem()

            # Extract each field
            cells = row.xpath('./td')
            if len(cells) >= 7:
                item['currency'] = self.get_cell_text(cells[0])
                item['telegraphic_buy'] = self.get_cell_text(cells[1])
                item['cash_buy'] = self.get_cell_text(cells[2])
                item['telegraphic_sell'] = self.get_cell_text(cells[3])
                item['cash_sell'] = self.get_cell_text(cells[4])
                item['central_rate'] = self.get_cell_text(cells[5])

                # Split the publication timestamp
                datetime_text = self.get_cell_text(cells[6])
                date_part, time_part = self.split_datetime(datetime_text)
                item['pub_date'] = date_part
                item['pub_time'] = time_part

                # Fill missing rate fields with a default
                self.validate_item(item)

                if idx < 5:
                    self.logger.info(f"Currency: {item['currency']} TT buy: {item['telegraphic_buy']}")

                yield item

        self.logger.info("Finished parsing exchange rate data")
    
    def get_cell_text(self, cell):
        """Get the text content of a table cell"""
        text = cell.xpath('string(.)').get()
        return text.strip() if text else ""

    def split_datetime(self, datetime_str):
        """Split a 'date time' string into its two parts"""
        if not datetime_str:
            return "", ""
        parts = datetime_str.split()
        if len(parts) >= 2:
            return parts[0], parts[1]
        return datetime_str, ""

    def validate_item(self, item):
        """Fill empty rate fields with a default value"""
        for field in ['telegraphic_buy', 'cash_buy', 'telegraphic_sell', 'cash_sell']:
            if not item[field]:
                item[field] = '0.0000'
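
The notes at the end mention that the Assignment 2 pipeline was adapted for these forex fields, but that adapted pipeline is not reproduced here. Below is only a sketch of what such an adaptation could look like, reusing the same SQLite approach (the database path, table name and column layout are assumptions):

import sqlite3

class ForexStoragePipeline:
    """Sketch: store ForexDataItem records in SQLite, mirroring DataStoragePipeline."""

    def open_spider(self, spider):
        self.conn = sqlite3.connect("forex_rates.db")  # assumed path
        self.conn.execute("""
            CREATE TABLE IF NOT EXISTS forex_records (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                currency TEXT NOT NULL,
                telegraphic_buy TEXT,
                cash_buy TEXT,
                telegraphic_sell TEXT,
                cash_sell TEXT,
                central_rate TEXT,
                pub_date TEXT,
                pub_time TEXT
            )
        """)
        self.conn.commit()

    def process_item(self, item, spider):
        self.conn.execute(
            "INSERT INTO forex_records (currency, telegraphic_buy, cash_buy, "
            "telegraphic_sell, cash_sell, central_rate, pub_date, pub_time) "
            "VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
            (item["currency"], item["telegraphic_buy"], item["cash_buy"],
             item["telegraphic_sell"], item["cash_sell"], item["central_rate"],
             item["pub_date"], item["pub_time"]),
        )
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.conn.close()
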
Results

(screenshot)

Reflections:

Task 1: image crawling
The single-threaded crawler is fairly slow; switching to a multi-threaded version sped up the downloads noticeably. The concurrency had to be controlled precisely so that the downloads neither exceed the target count nor fall short of it, and the storage path had to be set correctly so the images actually end up in the intended folder.
Task 2: stock data collection
Scraping the rendered page directly returned no data at first; it turned out the quotes have to be fetched through the site's data interface. With Scrapy, defining the fields in an Item and writing them into SQLite through a Pipeline made storage straightforward. The key point is that the item fields must correspond exactly to the database table structure, otherwise records fail to load.
Task 3: foreign exchange data
The overall workflow is similar to Assignment 2, but the field mappings in the Item and Pipeline had to be adjusted to the actual data. When scraping the bank's website, the page elements must be located precisely, otherwise the extracted exchange rates can be wrong.

Gitee repository link: https://gitee.com/fang-pu666/fp888/tree/homework/
