102302156 李子贤 — Data Collection, Assignment 3
Assignment 1
Requirement: pick a website (for example, the China Weather site, http://www.weather.com.cn) and crawl all of the images on it, implementing both a single-threaded and a multi-threaded version.
– Be sure to limit the crawl: the total number of pages is capped by the last two digits of the student ID, and the total number of downloaded images by the last three digits.
Output: print the URL of every downloaded image to the console, store the downloaded images in the images subfolder, and provide screenshots.
(1) Code and results
Single-threaded crawler
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Configuration
BASE_URL = "http://www.weather.com.cn"
MAX_PAGES = 56    # maximum number of pages to crawl
MAX_IMAGES = 156  # maximum number of images to download
SAVE_DIR = "images"

# Create the output directory
os.makedirs(SAVE_DIR, exist_ok=True)

def download_image(img_url, count):
    """Download a single image and save it to disk."""
    try:
        # Resolve relative URLs
        if not img_url.startswith('http'):
            img_url = urljoin(BASE_URL, img_url)
        # Print the image URL
        print(f"Downloading image {count}/{MAX_IMAGES}: {img_url}")
        # Send the request
        response = requests.get(img_url, timeout=10)
        response.raise_for_status()
        # Build the local file name
        filename = os.path.join(SAVE_DIR, f"img_{count}.jpg")
        # Save the image
        with open(filename, 'wb') as f:
            f.write(response.content)
        return True
    except Exception as e:
        print(f"Download failed {img_url}: {str(e)}")
        return False

def crawl_page(url, img_count):
    """Crawl the images on a single page."""
    try:
        response = requests.get(url, timeout=10)
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')
        # Find all <img> tags
        img_tags = soup.find_all('img')
        for img_tag in img_tags:
            if img_count >= MAX_IMAGES:
                return img_count
            img_url = img_tag.get('src')
            if img_url and img_url.endswith(('.jpg', '.jpeg', '.png', '.gif')):
                if download_image(img_url, img_count + 1):
                    img_count += 1
        return img_count
    except Exception as e:
        print(f"Failed to crawl page {url}: {str(e)}")
        return img_count

def main():
    img_count = 0
    page_count = 0
    # Crawl the home page first
    print(f"Crawling page 1: {BASE_URL}")
    img_count = crawl_page(BASE_URL, img_count)
    page_count += 1
    # Crawl further pages discovered via in-site links
    try:
        response = requests.get(BASE_URL, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        links = soup.find_all('a', href=True)
        page_urls = [urljoin(BASE_URL, link['href']) for link in links]
        # Deduplicate and keep only valid in-site links
        page_urls = list(set(page_urls))
        valid_urls = [url for url in page_urls if url.startswith(BASE_URL) and url != BASE_URL]
        # Keep crawling until the page or image limit is reached
        for url in valid_urls:
            if page_count >= MAX_PAGES or img_count >= MAX_IMAGES:
                break
            page_count += 1
            print(f"Crawling page {page_count}: {url}")
            img_count = crawl_page(url, img_count)
    except Exception as e:
        print(f"Failed to collect links: {str(e)}")
    print(f"\nDone: downloaded {img_count} images from {page_count} pages")

if __name__ == "__main__":
    main()



Multi-threaded crawler
import os
import requests
import threading
from queue import Queue
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Configuration
BASE_URL = "http://www.weather.com.cn"
MAX_PAGES = 56
MAX_IMAGES = 156
SAVE_DIR = "images"
THREAD_NUM = 5  # number of worker threads (adjustable)

# Create the output directory
os.makedirs(SAVE_DIR, exist_ok=True)

# Global counters and a lock (to avoid races between threads)
img_count = 0
page_count = 0
count_lock = threading.Lock()  # protects the counters

# Queues for inter-thread communication
page_queue = Queue()  # page URLs waiting to be crawled
img_queue = Queue()   # image URLs waiting to be downloaded

def is_valid_image(url):
    """Return True if the URL looks like an image."""
    return url and url.endswith(('.jpg', '.jpeg', '.png', '.gif', '.bmp'))

def download_worker():
    """Image download worker: each thread keeps pulling tasks from the queue."""
    global img_count
    while True:
        img_url = img_queue.get()  # block until an image URL is available
        # Exit signal: a None item terminates the thread
        if img_url is None:
            img_queue.task_done()
            break
        # Check the image limit under the lock (so two threads cannot both bump the counter)
        with count_lock:
            current_count = img_count + 1
            if current_count > MAX_IMAGES:
                img_queue.task_done()
                continue
            img_count = current_count  # update the counter
        try:
            # Resolve relative URLs into absolute ones
            if not img_url.startswith(('http:', 'https:')):
                img_url = urljoin(BASE_URL, img_url)
            # Print progress
            print(f"Downloading image {current_count}/{MAX_IMAGES}: {img_url}")
            # Download the image
            response = requests.get(img_url, timeout=10)
            response.raise_for_status()  # raise on HTTP errors such as 404
            # Save the image to disk
            filename = os.path.join(SAVE_DIR, f"img_{current_count}.jpg")
            with open(filename, 'wb') as f:
                f.write(response.content)
        except Exception as e:
            print(f"Download failed {img_url}: {str(e)}")
        img_queue.task_done()  # mark this task as finished

def crawl_worker():
    """Page crawl worker: pull page URLs from the queue and extract image links."""
    global page_count
    while True:
        page_url = page_queue.get()  # block until a page URL is available
        # Exit signal: a None item terminates the thread
        if page_url is None:
            page_queue.task_done()
            break
        # Check the page limit under the lock
        with count_lock:
            current_page = page_count + 1
            if current_page > MAX_PAGES:
                page_queue.task_done()
                continue
            page_count = current_page  # update the page counter
        try:
            # Fetch the page
            print(f"Crawling page {current_page}/{MAX_PAGES}: {page_url}")
            response = requests.get(page_url, timeout=10)
            response.encoding = 'utf-8'  # make sure Chinese text decodes correctly
            soup = BeautifulSoup(response.text, 'html.parser')
            # Extract all image links on the page
            img_tags = soup.find_all('img')
            for img_tag in img_tags:
                img_url = img_tag.get('src')  # image URL
                if is_valid_image(img_url):
                    img_queue.put(img_url)  # hand the image URL to the download queue
        except Exception as e:
            print(f"Failed to crawl page {page_url}: {str(e)}")
        page_queue.task_done()  # mark this task as finished

def main():
    # 1. Seed the page queue: the home page first, then links extracted from it
    page_queue.put(BASE_URL)
    try:
        response = requests.get(BASE_URL, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        links = soup.find_all('a', href=True)  # all <a> tags with an href
        # Keep only in-site links, deduplicated
        valid_links = set()
        for link in links:
            href = urljoin(BASE_URL, link['href'])  # resolve relative paths
            if href.startswith(BASE_URL) and href not in valid_links:
                valid_links.add(href)
        # Add the collected links to the page queue
        for link in valid_links:
            page_queue.put(link)
    except Exception as e:
        print(f"Failed to extract links from the home page: {str(e)}")
    # 2. Start the crawl threads (extract image links from pages)
    crawl_threads = []
    for _ in range(THREAD_NUM):
        t = threading.Thread(target=crawl_worker)
        t.daemon = True  # daemon threads exit with the main program
        t.start()
        crawl_threads.append(t)
    # 3. Start the download threads (download the images)
    download_threads = []
    for _ in range(THREAD_NUM):
        t = threading.Thread(target=download_worker)
        t.daemon = True
        t.start()
        download_threads.append(t)
    # 4. Wait until every page has been processed, then until every image has been downloaded
    page_queue.join()
    img_queue.join()
    # 5. Send an exit signal to every thread so none of them blocks forever
    for _ in range(THREAD_NUM):
        page_queue.put(None)  # stop the crawl threads
        img_queue.put(None)   # stop the download threads
    # 6. Wait for all threads to finish
    for t in crawl_threads:
        t.join()
    for t in download_threads:
        t.join()
    print(f"\nDone: crawled {page_count} pages and downloaded {img_count} images")

if __name__ == "__main__":
    main()



(2) Reflections
Crawling images from the China Weather site gave me a concrete feel for the difference between single-threaded and multi-threaded crawling. The single-threaded version is simple to write, has no concurrency conflicts to handle, and its steady request rate is less likely to trigger anti-scraping measures, but it is slow: each image download waits for the previous response, so the total run time is long. The multi-threaded version issues requests concurrently from a pool of worker threads, which shortens the crawl considerably and uses resources better, but it has to address thread safety: a lock must protect the shared counters (and any shared set of visited URLs), and the number of concurrent threads has to stay small enough not to trip the site's anti-scraping defenses.
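As a minimal sketch of the thread-pool idea mentioned above (an alternative to the queue-based workers in the submitted code, not what was actually run), assuming the image URLs have already been collected into a list and reusing download_image from the single-threaded script:

from concurrent.futures import ThreadPoolExecutor, as_completed

def download_all(image_urls, max_workers=5):
    """Download a list of image URLs concurrently; return the number of successful downloads."""
    success = 0
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # download_image(img_url, count) is the helper defined in the single-threaded version
        futures = [pool.submit(download_image, url, i + 1) for i, url in enumerate(image_urls)]
        for future in as_completed(futures):
            if future.result():
                success += 1
    return success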
Gitee folder: https://gitee.com/lizixian66/shujucaiji/tree/homework3/shixun4
Assignment 2
Requirement: become familiar with serializing data through Scrapy's Item and Pipeline; use the Scrapy framework + XPath + MySQL storage to crawl stock information.
Candidate site: East Money: https://www.eastmoney.com/
Output: the data is stored in MySQL with the storage and output format shown below.
Column headers use English names, e.g. id for the sequence number, bStockNo for the stock code, and so on; the exact schema is designed by each student.
(1) Code and results
1. Create the Scrapy project
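For reference, the project was presumably created with the standard Scrapy command (the project name is inferred from the pipeline path StockSpider.pipelines.StockspiderPipeline used in settings.py):

scrapy startproject StockSpider
cd StockSpider
# then add the spider shown below as spiders/stock_spider.py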

2. Define the data structure (items.py)
import scrapy

class StockspiderItem(scrapy.Item):
    bStockNo = scrapy.Field()         # stock code
    bStockName = scrapy.Field()       # stock name
    latestPrice = scrapy.Field()      # latest price
    priceChangeRate = scrapy.Field()  # change rate
    priceChange = scrapy.Field()      # change amount
    volume = scrapy.Field()           # trading volume
    turnover = scrapy.Field()         # turnover
    amplitude = scrapy.Field()        # amplitude
    highest = scrapy.Field()          # daily high
    lowest = scrapy.Field()           # daily low
    openToday = scrapy.Field()        # today's open
    closeYesterday = scrapy.Field()   # yesterday's close
    crawl_time = scrapy.Field()       # crawl timestamp
3. Configure project settings (settings.py)
# Enable the pipeline (dotted path to the Pipeline class)
ITEM_PIPELINES = {
    'StockSpider.pipelines.StockspiderPipeline': 300,
}
# Request header that mimics a real browser
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
# Do not obey robots.txt
ROBOTSTXT_OBEY = False
4. Write the spider (stock_spider.py)
import scrapy
import json
from datetime import datetime
from ..items import StockspiderItem

class StockApiSpider(scrapy.Spider):
    name = 'stock_api'
    api_url = "https://push2.eastmoney.com/api/qt/clist/get?np=1&fltt=1&invt=2&cb=jQuery37102428814534866044_1763278238593&fs=m%3A1%2Bt%3A2%2Bf%3A!2%2Cm%3A1%2Bt%3A23%2Bf%3A!2&fields=f12%2Cf13%2Cf14%2Cf1%2Cf2%2Cf4%2Cf3%2Cf152%2Cf5%2Cf6%2Cf7%2Cf15%2Cf18%2Cf16%2Cf17%2Cf10%2Cf8%2Cf9%2Cf23&fid=f3&pn=1&pz=20&po=1&dect=1&ut=fa5fd1943c7b386f172d6893dbfba10b&wbp2u=%7C0%7C0%7C0%7Cweb&_=1763278238597"

    def start_requests(self):
        yield scrapy.Request(
            url=self.api_url,
            callback=self.parse_api,
            headers={
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
                "X-Requested-With": "XMLHttpRequest"
            }
        )

    def parse_api(self, response):
        # The API returns JSONP: strip the jQuery callback wrapper before parsing
        raw_data = response.text
        json_str = raw_data.split('(')[1].rsplit(')', 1)[0]
        data = json.loads(json_str)
        stock_list = data.get('data', {}).get('diff', [])
        crawl_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        for item in stock_list:
            stock_item = StockspiderItem()
            stock_item['bStockNo'] = item.get('f12')    # stock code
            stock_item['bStockName'] = item.get('f14')  # stock name
            # Unit conversion: divide by 100, round to 2 decimals, store as a string
            stock_item['latestPrice'] = str(round(item.get('f2', 0) / 100, 2))
            stock_item['priceChangeRate'] = str(round(item.get('f3', 0) / 100, 2)) + '%'
            stock_item['priceChange'] = str(round(item.get('f4', 0) / 100, 2))
            stock_item['volume'] = str(round(item.get('f5', 0) / 10000, 2))        # volume (10k lots)
            stock_item['turnover'] = str(round(item.get('f6', 0) / 100000000, 2))  # turnover (100 million)
            stock_item['amplitude'] = str(round(item.get('f7', 0) / 100, 2)) + '%'
            stock_item['highest'] = str(round(item.get('f15', 0) / 100, 2))
            stock_item['lowest'] = str(round(item.get('f16', 0) / 100, 2))
            stock_item['openToday'] = str(round(item.get('f17', 0) / 100, 2))
            stock_item['closeYesterday'] = str(round(item.get('f18', 0) / 100, 2))
            stock_item['crawl_time'] = crawl_time
            yield stock_item
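The API URL above returns a single page of 20 rows. Judging only from the query string, pn looks like the page index and pz the page size (an assumption, not something documented in this write-up); if more rows were needed, start_requests could be rewritten roughly as follows:

    # Hypothetical pagination sketch: pn is assumed to be the page index, pz the page size.
    def start_requests(self):
        for page in range(1, 6):  # first 5 pages of 20 rows each
            url = self.api_url.replace("pn=1", f"pn={page}")
            yield scrapy.Request(
                url=url,
                callback=self.parse_api,
                headers={
                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
                    "X-Requested-With": "XMLHttpRequest",
                },
            )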
5. Data processing and storage (pipelines.py)
import mysql.connector
from datetime import datetime

class StockspiderPipeline:
    def open_spider(self, spider):
        self.db = mysql.connector.connect(
            host='localhost',
            user='root',
            password='lzx2022666',
            database='stock_db'
        )
        self.cursor = self.db.cursor()

    def process_item(self, item, spider):
        insert_sql = """
            INSERT INTO stock_info
            (bStockNo, bStockName, latestPrice, priceChangeRate, priceChange,
             volume, turnover, amplitude, highest, lowest, openToday, closeYesterday, crawl_time)
            VALUES
            (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        """
        values = (
            item['bStockNo'],
            item['bStockName'],
            item['latestPrice'],  # stored as a string (no float() conversion)
            item['priceChangeRate'],
            item['priceChange'],
            item['volume'],
            item['turnover'],
            item['amplitude'],
            item['highest'],
            item['lowest'],
            item['openToday'],
            item['closeYesterday'],
            item['crawl_time']
        )
        self.cursor.execute(insert_sql, values)
        self.db.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.db.close()
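The pipeline assumes that the stock_db database and the stock_info table already exist. A one-off setup sketch (the column types are an assumption; plain VARCHAR columns match the string values the spider produces):

import mysql.connector

conn = mysql.connector.connect(host='localhost', user='root', password='lzx2022666')
cursor = conn.cursor()
cursor.execute("CREATE DATABASE IF NOT EXISTS stock_db CHARACTER SET utf8mb4")
cursor.execute("USE stock_db")
cursor.execute("""
    CREATE TABLE IF NOT EXISTS stock_info (
        id INT AUTO_INCREMENT PRIMARY KEY,   -- sequence number
        bStockNo VARCHAR(10),                -- stock code
        bStockName VARCHAR(50),              -- stock name
        latestPrice VARCHAR(20),
        priceChangeRate VARCHAR(20),
        priceChange VARCHAR(20),
        volume VARCHAR(20),
        turnover VARCHAR(20),
        amplitude VARCHAR(20),
        highest VARCHAR(20),
        lowest VARCHAR(20),
        openToday VARCHAR(20),
        closeYesterday VARCHAR(20),
        crawl_time VARCHAR(20)
    )
""")
conn.commit()
cursor.close()
conn.close()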
Results:


(2) Reflections
When I first tried this task, I extracted the data with XPath expressions copied from the page, but nothing ever came back. After asking a large language model, I learned that the page is rendered dynamically, so Scrapy's default requests never see the data. I then fell back on a technique from an earlier lab, capturing the JavaScript API endpoint in the browser, and crawling that endpoint worked.
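Before wiring a captured endpoint into Scrapy, it can be checked with a plain requests call; a quick sketch using the same URL and the same JSONP-stripping logic as the spider:

import json
import requests

api_url = "https://push2.eastmoney.com/api/qt/clist/get?np=1&fltt=1&invt=2&cb=jQuery37102428814534866044_1763278238593&fs=m%3A1%2Bt%3A2%2Bf%3A!2%2Cm%3A1%2Bt%3A23%2Bf%3A!2&fields=f12%2Cf13%2Cf14%2Cf1%2Cf2%2Cf4%2Cf3%2Cf152%2Cf5%2Cf6%2Cf7%2Cf15%2Cf18%2Cf16%2Cf17%2Cf10%2Cf8%2Cf9%2Cf23&fid=f3&pn=1&pz=20&po=1&dect=1&ut=fa5fd1943c7b386f172d6893dbfba10b&wbp2u=%7C0%7C0%7C0%7Cweb&_=1763278238597"
resp = requests.get(api_url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
# The body is JSONP, i.e. callbackName({...}); strip the wrapper before json.loads
json_str = resp.text.split('(', 1)[1].rsplit(')', 1)[0]
data = json.loads(json_str)
print(len(data.get('data', {}).get('diff', [])))  # number of stock rows returned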
Gitee folder: https://gitee.com/lizixian66/shujucaiji/tree/homework3/StockSpider
Assignment 3
Requirement: become familiar with serializing data through Scrapy's Item and Pipeline; use the Scrapy framework + XPath + MySQL storage to crawl foreign-exchange data.
Candidate site: Bank of China: https://www.boc.cn/sourcedb/whpj/
(1) Code and results
1. Create the Scrapy project
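For reference, the project was presumably created the same way as in Assignment 2; the project name boc_exchange below is an inference from the database name and the Gitee folder, not something stated in the original write-up:

scrapy startproject boc_exchange
cd boc_exchange
# then add the spider shown below under spiders/ and the pipeline class to pipelines.py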

2. Define the data structure (items.py)
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy

class BocExchangeItem(scrapy.Item):
    currency = scrapy.Field()  # currency name
    tbp = scrapy.Field()       # telegraphic transfer (spot exchange) buying price
    cbp = scrapy.Field()       # cash buying price
    tsp = scrapy.Field()       # telegraphic transfer (spot exchange) selling price
    csp = scrapy.Field()       # cash selling price
    mid_rate = scrapy.Field()  # BOC conversion (middle) rate
    time = scrapy.Field()      # publication time
3. Configure project settings (settings.py)
ROBOTSTXT_OBEY = True
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
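One more setting is needed for the pipeline further below to run at all: it has to be registered in ITEM_PIPELINES. A sketch, assuming the project module is named boc_exchange (adjust the dotted path to the actual project name):

ITEM_PIPELINES = {
    'boc_exchange.pipelines.BocExchangePipeline': 300,
}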
4. Write the spider
import scrapy
from ..items import BocExchangeItem

class BocSpider(scrapy.Spider):
    name = 'boc'
    allowed_domains = ['boc.cn']
    start_urls = ['https://www.boc.cn/sourcedb/whpj/']

    def parse(self, response):
        # Locate the data rows of the rate table, skipping the header row
        rows = response.xpath('//div[@class="BOC_main"]//table[@cellpadding="0"]//tr[position() > 1]')
        print(f"Matched {len(rows)} data rows")
        for row in rows:
            item = BocExchangeItem()
            # Currency name (strip surrounding whitespace)
            item['currency'] = row.xpath('td[1]/text()').get(default='').strip()
            # Spot exchange buying price
            item['tbp'] = row.xpath('td[2]/text()').get(default='').strip()
            # Cash buying price
            item['cbp'] = row.xpath('td[3]/text()').get(default='').strip()
            # Spot exchange selling price
            item['tsp'] = row.xpath('td[4]/text()').get(default='').strip()
            # Cash selling price
            item['csp'] = row.xpath('td[5]/text()').get(default='').strip()
            # BOC conversion rate
            item['mid_rate'] = row.xpath('td[6]/text()').get(default='').strip()
            # Publication time
            item['time'] = row.xpath('td[7]/text()').get(default='').strip()
            yield item
5. Data processing and storage (pipelines.py)
import mysql.connector
from mysql.connector import errorcode

class BocExchangePipeline:
    def open_spider(self, spider):
        # Connect to MySQL
        try:
            self.conn = mysql.connector.connect(
                host='localhost',
                user='root',              # MySQL user name
                password='lzx2022666',    # MySQL password
                database='boc_exchange',  # database name (must be created beforehand)
                charset='utf8mb4',
                use_unicode=True
            )
            self.cursor = self.conn.cursor()
            # Create the table if it does not exist yet
            create_table_sql = '''
                CREATE TABLE IF NOT EXISTS exchange_data (
                    id INT AUTO_INCREMENT PRIMARY KEY,
                    currency VARCHAR(50),
                    tbp DECIMAL(10,2),
                    cbp DECIMAL(10,2),
                    tsp DECIMAL(10,2),
                    csp DECIMAL(10,2),
                    time VARCHAR(20)
                )
            '''
            self.cursor.execute(create_table_sql)
            self.conn.commit()
            spider.logger.info("Connected to the database and ensured the table exists")
        except mysql.connector.Error as err:
            if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
                spider.logger.error("Wrong user name or password")
            elif err.errno == errorcode.ER_BAD_DB_ERROR:
                spider.logger.error("Database does not exist; create it first")
            else:
                spider.logger.error(f"Database connection error: {err}")
            raise  # re-raise to stop the spider

    def process_item(self, item, spider):
        # Insert one row
        try:
            insert_sql = '''
                INSERT INTO exchange_data (currency, tbp, cbp, tsp, csp, time)
                VALUES (%s, %s, %s, %s, %s, %s)
            '''
            self.cursor.execute(insert_sql, (
                item['currency'],
                item['tbp'],
                item['cbp'],
                item['tsp'],
                item['csp'],
                item['time']
            ))
            self.conn.commit()
            spider.logger.debug(f"Inserted row: {item['currency']}")
        except mysql.connector.Error as err:
            self.conn.rollback()  # roll back on error
            spider.logger.error(f"Insert failed: {err}, data: {item}")
        return item

    def close_spider(self, spider):
        # Close the database connection
        if hasattr(self, 'cursor'):
            self.cursor.close()
        if hasattr(self, 'conn') and self.conn.is_connected():
            self.conn.close()
        spider.logger.info("Database connection closed")
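As noted in the connection comment, the boc_exchange database must exist before the spider runs; a minimal one-off setup sketch:

import mysql.connector

conn = mysql.connector.connect(host='localhost', user='root', password='lzx2022666')
cursor = conn.cursor()
cursor.execute("CREATE DATABASE IF NOT EXISTS boc_exchange CHARACTER SET utf8mb4")
cursor.close()
conn.close()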


(2) Reflections

By copying the XPath of the element I wanted in the browser, I got the rule //div[@class="BOC_main"]//table[@cellpadding="0"]//tr[position() > 1] almost for free, which is very convenient. After that, it was mostly a matter of understanding what the Scrapy framework provides and filling in the pieces it expects (item, spider, pipeline, settings), and the crawl was done.
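A convenient way to verify such an XPath before running the whole spider is scrapy shell, for example:

scrapy shell "https://www.boc.cn/sourcedb/whpj/"
>>> rows = response.xpath('//div[@class="BOC_main"]//table[@cellpadding="0"]//tr[position() > 1]')
>>> rows[0].xpath('td[1]/text()').get()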
Gitee folder: https://gitee.com/lizixian66/shujucaiji/tree/homework3/boc_exchange
