022304105叶骋恺数据采集第三次作业
作业1
代码与运行结果
class ImageDownloader:
    """Crawl weather.com.cn pages and download the images they reference.

    Provides both a single-threaded and a thread-pool based download mode
    so the two can be timed against each other.  Crawling is capped per
    the assignment constraints: at most ``max_pages`` pages and
    ``max_images`` images.
    """

    def __init__(self):
        # Local import keeps this self-contained snippet runnable even
        # though the file's import section is not shown.
        import threading

        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        # Directory where downloaded images are stored; created up front.
        self.image_dir = "images"
        os.makedirs(self.image_dir, exist_ok=True)  # idempotent, race-free mkdir
        self.max_pages = 5          # maximum number of pages to crawl
        self.max_images = 105       # maximum number of images to download
        self.downloaded_count = 0   # images downloaded so far
        # download_image runs concurrently on a thread pool, so the
        # counter must be updated under a lock (the original unlocked
        # check-then-increment could exceed max_images).
        self._count_lock = threading.Lock()

    def download_image(self, img_url, page_url):
        """Download a single image.

        Normalizes *img_url* to an absolute URL, fetches it and writes it
        into ``self.image_dir``.  Returns True on success, False when the
        image cap has been reached or the request/save fails.  Network
        errors are reported and swallowed so one bad image cannot kill
        the whole crawl.
        """
        with self._count_lock:
            if self.downloaded_count >= self.max_images:
                return False
        # Normalize protocol-relative, root-relative and bare paths.
        if img_url.startswith('//'):
            img_url = 'http:' + img_url
        elif img_url.startswith('/'):
            img_url = 'http://www.weather.com.cn' + img_url
        elif not img_url.startswith('http'):
            img_url = 'http://www.weather.com.cn/' + img_url
        try:
            req = urllib.request.Request(img_url, headers=self.headers)
            with urllib.request.urlopen(req, timeout=10) as response:
                img_data = response.read()
        except Exception as exc:  # best-effort: log and keep crawling
            print(f"下载失败: {img_url} ({exc})")
            return False
        # Derive a filename from the URL (query string stripped); fall
        # back to a counter-based name when the URL has no usable one.
        filename = os.path.basename(img_url.split('?')[0])
        if not filename or '.' not in filename:
            filename = f"image_{self.downloaded_count + 1}.jpg"
        filepath = os.path.join(self.image_dir, filename)
        try:
            with open(filepath, 'wb') as f:
                f.write(img_data)
        except OSError as exc:
            print(f"下载失败: {img_url} ({exc})")
            return False
        # Re-check the cap and bump the counter atomically.
        with self._count_lock:
            if self.downloaded_count >= self.max_images:
                return False
            self.downloaded_count += 1
            count = self.downloaded_count
        print(f"下载成功 [{count}/{self.max_images}]: {img_url}")
        print(f"  来源页面: {page_url}")
        print(f"  保存路径: {filepath}")
        return True

    def extract_images_from_page(self, url):
        """Fetch *url* and return every non-empty <img src> found on it.

        Returns an empty list when the page cannot be fetched or parsed,
        so a single bad page does not abort the crawl.
        """
        images = []
        try:
            req = urllib.request.Request(url, headers=self.headers)
            data = urllib.request.urlopen(req, timeout=10).read()
            # The site mixes encodings; let UnicodeDammit pick utf-8/gbk.
            data = UnicodeDammit(data, ["utf-8", "gbk"]).unicode_markup
            soup = BeautifulSoup(data, "lxml")
        except Exception as exc:
            print(f"页面抓取失败: {url} ({exc})")
            return images
        for img in soup.find_all('img'):
            img_src = img.get('src')
            if img_src:  # skip <img> tags with an empty/missing src
                images.append(img_src)
        return images

    def get_weather_pages(self):
        """Return the list of weather pages to crawl, capped at max_pages."""
        base_url = "http://www.weather.com.cn"
        pages = [
            base_url,
            base_url + "/weather/101010100.shtml",  # Beijing
            base_url + "/weather/101020100.shtml",  # Shanghai
            base_url + "/weather/101280101.shtml",  # Guangzhou
            base_url + "/weather/101280601.shtml",  # Shenzhen
            base_url + "/weather/101230101.shtml",  # Fuzhou
        ]
        return pages[:self.max_pages]  # enforce the page cap

    def single_thread_download(self):
        """Crawl all pages sequentially and download their images."""
        print("=" * 50)
        print("开始单线程下载")
        print("=" * 50)
        start_time = time.time()
        self.downloaded_count = 0
        pages = self.get_weather_pages()
        print(f"总共爬取 {len(pages)} 个页面")
        print(f"最大下载图片数量: {self.max_images}")
        print("-" * 50)
        for page_url in pages:
            if self.downloaded_count >= self.max_images:
                print("已达到最大下载数量,停止爬取")
                break
            print(f"\n正在处理页面: {page_url}")
            images = self.extract_images_from_page(page_url)
            print(f"在该页面找到 {len(images)} 张图片")
            for img_url in images:
                if self.downloaded_count >= self.max_images:
                    print("已达到最大下载数量,停止下载")
                    break
                self.download_image(img_url, page_url)
        end_time = time.time()
        print("\n" + "=" * 50)
        print(f"单线程下载完成!")
        print(f"总下载图片: {self.downloaded_count} 张")
        print(f"总耗时: {end_time - start_time:.2f} 秒")
        print("=" * 50)

    def multi_thread_download(self):
        """Crawl pages and download images concurrently on a 5-worker pool."""
        print("=" * 50)
        print("开始多线程下载")
        print("=" * 50)
        start_time = time.time()
        self.downloaded_count = 0
        pages = self.get_weather_pages()
        print(f"总共爬取 {len(pages)} 个页面")
        print(f"最大下载图片数量: {self.max_images}")
        print(f"线程数: 5")
        print("-" * 50)
        with ThreadPoolExecutor(max_workers=5) as executor:
            futures = []
            for page_url in pages:
                if self.downloaded_count >= self.max_images:
                    print("已达到最大下载数量,停止爬取")
                    break
                print(f"\n正在处理页面: {page_url}")
                images = self.extract_images_from_page(page_url)
                print(f"在该页面找到 {len(images)} 张图片")
                # Queue one download task per image; download_image itself
                # enforces the cap under the lock, this check just avoids
                # submitting obviously-redundant tasks.
                for img_url in images:
                    if self.downloaded_count >= self.max_images:
                        print("已达到最大下载数量,停止添加任务")
                        break
                    future = executor.submit(self.download_image, img_url, page_url)
                    futures.append(future)
            # Wait for every queued download to finish (and surface errors).
            for future in futures:
                future.result()
        end_time = time.time()
        print("\n" + "=" * 50)
        print(f"多线程下载完成!")
        print(f"总下载图片: {self.downloaded_count} 张")
        print(f"总耗时: {end_time - start_time:.2f} 秒")
        print("=" * 50)
作业心得
本次天气网图片爬虫作业,我成功实现了一个兼具单线程与多线程下载功能的爬虫程序。通过分析中国气象网的页面结构,我利用BeautifulSoup精准定位并提取了图片URL,并设计了完善的URL处理逻辑,能够自动将相对路径转换为绝对地址。在控制方面,我严格遵循了学号尾数设定的限制,将最大爬取页数控制在5页,总下载图片数量限制在105张。程序运行时会实时在控制台输出每张图片的下载链接、来源页面及本地保存路径,所有图片均被有序存储在自动创建的images文件夹中。通过对比实验,我验证了多线程模式相较于单线程在I/O密集型任务上的显著效率优势,圆满完成了作业要求。
作业2
代码与运行结果
import scrapy
from stock_spider.items import StockItem
import re
class EastmoneySpider(scrapy.Spider):
    """Spider that scrapes A-share quotes from Eastmoney's grid-list page."""

    name = 'eastmoney'
    allowed_domains = ['eastmoney.com']

    # Numeric columns: item field -> xpath of the table cell holding it.
    # All of these go through clean_number(); volume is kept as raw text.
    _NUMERIC_FIELDS = {
        'current_price': './/td[5]//text()',
        'change_percent': './/td[6]//text()',
        'change_amount': './/td[7]//text()',
        'amplitude': './/td[10]//text()',
        'high': './/td[11]//text()',
        'low': './/td[12]//text()',
        'open_price': './/td[13]//text()',
        'close_price': './/td[14]//text()',
    }

    def start_requests(self):
        """Start the crawl at the Shanghai A-share board listing."""
        url = "https://quote.eastmoney.com/center/gridlist.html#sh_a_board"
        yield scrapy.Request(url, self.parse)

    def parse(self, response):
        """Yield one StockItem per table row that has both a code and a name."""
        for row in response.xpath('//table//tbody/tr'):
            item = StockItem()
            item['stock_code'] = row.xpath('.//td[2]//a/text()').get(default='').strip()
            item['stock_name'] = row.xpath('.//td[3]//a/text()').get(default='').strip()
            # Rows missing either identifier are header/placeholder rows.
            if not (item['stock_code'] and item['stock_name']):
                continue
            # Price columns share one extraction + cleaning path.
            for field, cell_xpath in self._NUMERIC_FIELDS.items():
                item[field] = self.clean_number(row.xpath(cell_xpath).get())
            item['volume'] = row.xpath('.//td[8]//text()').get(default='').strip()
            yield item

    def clean_number(self, text):
        """Strip everything but digits, dots and minus signs; 0.0 on failure."""
        if not text:
            return 0.0
        digits = re.sub(r'[^\d.-]', '', str(text).strip())
        if not digits:
            return 0.0
        try:
            return float(digits)
        except ValueError:
            return 0.0
作业心得
使用 Scrapy 框架爬取东方财富网股票信息时,通过中间件机制在请求层面集成Selenium处理JavaScript渲染,管道系统自动管理数据库连接并实现数据持久化,Item类明确定义数据结构,配置化的设置便于环境管理,全面的异常处理保证程序稳定运行,模块化设计让各组件职责清晰、易于维护。
作业3
代码与运行结果
import scrapy
from boc_whpj.items import BocWHPJItem
from datetime import datetime
class BocSpider(scrapy.Spider):
    """Spider for the Bank of China foreign-exchange rate table (whpj)."""

    name = 'boc_whpj'
    allowed_domains = ['boc.cn']
    start_urls = ['https://www.boc.cn/sourcedb/whpj/']

    def parse(self, response):
        """Parse the rate table and yield one BocWHPJItem per currency row.

        The rates are expected in the second <table> on the page; the
        first row of that table is the header and is skipped.
        """
        self.logger.info(f"开始解析页面,URL: {response.url}")
        self.logger.info(f"响应状态: {response.status}")
        tables = response.xpath('//table')
        self.logger.info(f"找到 {len(tables)} 个表格")
        # Guard: bail out with a logged error instead of raising
        # IndexError when the page layout changed and tables[1] is gone.
        if len(tables) < 2:
            self.logger.error("未找到数据表格,页面结构可能已变化")
            return
        data_table = tables[1]
        rows = data_table.xpath('./tr')
        self.logger.info(f"数据表格有 {len(rows)} 行")
        for i, row in enumerate(rows[1:], 1):  # skip the header row
            item = BocWHPJItem()
            currency = row.xpath('./td[1]//text()').get()
            if not currency or currency.strip() == '':
                self.logger.debug(f"第{i}行跳过,货币名称为空")
                continue
            item['currency'] = currency.strip()
            # Rate columns 2-5 (missing cells become None):
            #   tbp = 现汇买入价, cbp = 现钞买入价,
            #   tsp = 现汇卖出价, csp = 现钞卖出价
            tbp = row.xpath('./td[2]//text()').get()
            item['tbp'] = tbp.strip() if tbp else None
            cbp = row.xpath('./td[3]//text()').get()
            item['cbp'] = cbp.strip() if cbp else None
            tsp = row.xpath('./td[4]//text()').get()
            item['tsp'] = tsp.strip() if tsp else None
            csp = row.xpath('./td[5]//text()').get()
            item['csp'] = csp.strip() if csp else None
            # Publication time, column 8.  The strptime round-trip only
            # validates HH:MM:SS; unparseable values are kept verbatim so
            # no information is lost.
            time_str = row.xpath('./td[8]//text()').get()
            if time_str:
                try:
                    time_obj = datetime.strptime(time_str.strip(), '%H:%M:%S')
                    item['time'] = time_obj.strftime('%H:%M:%S')
                except ValueError:
                    item['time'] = time_str.strip()
                    self.logger.warning(f"时间格式解析失败: {time_str}")
            else:
                item['time'] = None
            self.logger.info(f"提取到数据 {i}: 货币={item['currency']}, 现汇买入={item['tbp']}, 时间={item['time']}")
            yield item
作业心得
爬取中国银行外汇数据的过程中,通过 Scrapy 框架结合 XPath 精准提取了货币名称、现汇买入价、现钞卖出价等核心字段,借助 Pipeline 将数据有序存储到 MySQL 数据库,严格对应 Currency、TBP 等指定字段格式,不仅巩固了 Scrapy 数据序列化输出的方法,还深刻认识到外汇数据时效性强、格式严谨的特点,以及规范数据存储对后续数据分析的重要意义。
代码地址:https://gitee.com/xiaoyeabc/2025_crawl_project/tree/master/作业3

浙公网安备 33010602011771号