Data Collection and Fusion Technology: Assignment 3
Task ①:
Requirement: pick a website and crawl all of the images on it, for example the China Weather site (http://www.weather.com.cn). Implement both a single-threaded and a multi-threaded crawler.
Be sure to limit the crawl: cap the total number of pages at the last 2 digits of the student ID and the total number of downloaded images at the last 3 digits.
Code logic:


Single-threaded crawler:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

ID = "152301219"
PAGE_LIMIT = int(ID[-2:])   # last 2 digits of the ID -> 19 pages
IMG_LIMIT = int(ID[-3:])    # last 3 digits of the ID -> 219 images
BASE_URL = "http://www.weather.com.cn"
START_URL = "http://www.weather.com.cn/"
SAVE_DIR = "single_thread_images"
os.makedirs(SAVE_DIR, exist_ok=True)

def fetch_page(url):
    try:
        resp = requests.get(url, timeout=5)
        resp.encoding = resp.apparent_encoding
        return resp.text
    except Exception:
        return ""

def extract_images(html, base_url):
    soup = BeautifulSoup(html, "html.parser")
    imgs = []
    for img in soup.find_all("img"):
        src = img.get("src")
        if src:
            imgs.append(urljoin(base_url, src))
    return imgs

def download_image(url, idx):
    try:
        ext = url.split(".")[-1][:4]
        filename = os.path.join(SAVE_DIR, f"img_{idx}.{ext}")
        r = requests.get(url, timeout=5)
        with open(filename, "wb") as f:
            f.write(r.content)
        print(f"Downloaded {filename}")
    except Exception:
        pass

def single_thread_crawl():
    print("=== Single-threaded crawl started ===")
    to_visit = [START_URL]
    visited = set()
    page_count = 0
    img_count = 0
    while to_visit and page_count < PAGE_LIMIT and img_count < IMG_LIMIT:
        url = to_visit.pop(0)
        if url in visited:
            continue
        visited.add(url)
        html = fetch_page(url)
        if not html:
            continue
        page_count += 1
        print(f"[Single] Visiting page {page_count}: {url}")
        # resolve relative image paths against the current page, not the site root
        imgs = extract_images(html, url)
        for img in imgs:
            if img_count >= IMG_LIMIT:
                break
            download_image(img, img_count)
            img_count += 1
        # collect in-site links for the next rounds of crawling
        soup = BeautifulSoup(html, "html.parser")
        for a in soup.find_all("a"):
            href = a.get("href")
            if href and href.startswith("http") and BASE_URL in href:
                to_visit.append(href)
    print("=== Single-threaded crawl finished ===")

if __name__ == "__main__":
    single_thread_crawl()
Multi-threaded crawler:
import os
import threading
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

ID = "152301219"
PAGE_LIMIT = int(ID[-2:])   # last 2 digits of the ID -> 19 pages
IMG_LIMIT = int(ID[-3:])    # last 3 digits of the ID -> 219 images
BASE_URL = "http://www.weather.com.cn"
START_URL = "http://www.weather.com.cn/"
SAVE_DIR = "multi_thread_images"
os.makedirs(SAVE_DIR, exist_ok=True)

def fetch_page(url):
    try:
        resp = requests.get(url, timeout=5)
        resp.encoding = resp.apparent_encoding
        return resp.text
    except Exception:
        return ""

def extract_images(html, base_url):
    soup = BeautifulSoup(html, "html.parser")
    imgs = []
    for img in soup.find_all("img"):
        src = img.get("src")
        if src:
            imgs.append(urljoin(base_url, src))
    return imgs

def download_image(url, idx):
    try:
        ext = url.split(".")[-1][:4]
        filename = os.path.join(SAVE_DIR, f"img_{idx}.{ext}")
        r = requests.get(url, timeout=5)
        with open(filename, "wb") as f:
            f.write(r.content)
        print(f"[Thread] Downloaded {filename}")
    except Exception:
        pass

def multi_thread_crawl():
    print("=== Multi-threaded crawl started ===")
    to_visit = [START_URL]
    visited = set()
    page_count = 0
    img_count = 0
    threads = []
    while to_visit and page_count < PAGE_LIMIT and img_count < IMG_LIMIT:
        url = to_visit.pop(0)
        if url in visited:
            continue
        visited.add(url)
        html = fetch_page(url)
        if not html:
            continue
        page_count += 1
        print(f"[Multi] Visiting page {page_count}: {url}")
        imgs = extract_images(html, url)
        for img in imgs:
            if img_count >= IMG_LIMIT:
                break
            # one download thread per image; the main thread keeps crawling pages
            t = threading.Thread(target=download_image, args=(img, img_count))
            t.start()
            threads.append(t)
            img_count += 1
        soup = BeautifulSoup(html, "html.parser")
        for a in soup.find_all("a"):
            href = a.get("href")
            if href and href.startswith("http") and BASE_URL in href:
                to_visit.append(href)
    # wait for all download threads to finish
    for t in threads:
        t.join()
    print("=== Multi-threaded crawl finished ===")

if __name__ == "__main__":
    multi_thread_crawl()
Reflections:
The code parses pages with the BeautifulSoup library. In extract_images, the HTML text is first loaded into a soup object; soup.find_all("img") then iterates over all image tags on the page, the src attribute of each tag is extracted, and urljoin turns relative paths into absolute URLs, producing a list of image links that can be downloaded directly. Link discovery works the same way: BeautifulSoup parses the <a> tags and collects the qualifying in-site links that feed the next rounds of crawling. The single-threaded version, however, is clearly too slow once many pages and images are involved; its runtime grows roughly linearly as PAGE_LIMIT and IMG_LIMIT increase, which is why the multi-threaded crawler is used to speed up the downloads.
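Creating one thread per image, as in the script above, puts no cap on concurrency. As a minimal alternative sketch, the snippet below uses concurrent.futures.ThreadPoolExecutor to bound the number of simultaneous downloads; it assumes the download_image function from the multi-threaded script, and max_workers=8 is an arbitrary choice, not part of the original code.

# Sketch: bounded-concurrency downloads with a thread pool.
# Assumes download_image(url, idx) from the multi-threaded script above.
from concurrent.futures import ThreadPoolExecutor, as_completed

def pooled_download(img_urls, max_workers=8):
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = [pool.submit(download_image, url, idx)
                   for idx, url in enumerate(img_urls)]
        for fut in as_completed(futures):
            fut.result()  # surfaces any exception raised in a worker thread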
Task ②:
Requirement: become proficient with serialized output of data via Item and Pipeline in Scrapy; crawl stock information using the Scrapy framework + XPath + MySQL storage technology stack.
Candidate site: Eastmoney (https://www.eastmoney.com/)
Code logic:


Define a StockItem class that declares the fields to be scraped (such as bStockNo for the stock code and bPrice for the price), standardizing the data format.
items.py:
import scrapy

class StockItem(scrapy.Item):
    id = scrapy.Field()
    bStockNo = scrapy.Field()
    bName = scrapy.Field()
    bPrice = scrapy.Field()
    bChangeRate = scrapy.Field()
    bChangeAmount = scrapy.Field()
    bVolume = scrapy.Field()
    bAmplitude = scrapy.Field()
    bHigh = scrapy.Field()
    bLow = scrapy.Field()
    bOpen = scrapy.Field()
    bPrevClose = scrapy.Field()
pipelines.py:
import sqlite3

class StockScrapyPipeline:
    def open_spider(self, spider):
        self.conn = sqlite3.connect("stocks.db")
        self.cursor = self.conn.cursor()
        self.cursor.execute("""
            CREATE TABLE IF NOT EXISTS stocks (
                id INTEGER PRIMARY KEY,
                bStockNo TEXT,
                bName TEXT,
                bPrice REAL,
                bChangeRate REAL,
                bChangeAmount REAL,
                bVolume REAL,
                bAmplitude REAL,
                bHigh REAL,
                bLow REAL,
                bOpen REAL,
                bPrevClose REAL
            );
        """)
        self.conn.commit()

    def process_item(self, item, spider):
        self.cursor.execute("""
            INSERT INTO stocks VALUES (?,?,?,?,?,?,?,?,?,?,?,?)
        """, (
            item["id"],
            item["bStockNo"],
            item["bName"],
            item["bPrice"],
            item["bChangeRate"],
            item["bChangeAmount"],
            item["bVolume"],
            item["bAmplitude"],
            item["bHigh"],
            item["bLow"],
            item["bOpen"],
            item["bPrevClose"],
        ))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.conn.close()
Spider (stocks.py):
import scrapy
import json
from stock_scrapy.items import StockItem

class StocksSpider(scrapy.Spider):
    name = "stocks"
    # Eastmoney back-end API (A-share list)
    api_url = (
        "http://push2.eastmoney.com/api/qt/clist/get?"
        "pn=1&pz=1000&np=1&fltt=2&fid=f3&fs=m:1+t:2"
        "&fields=f2,f3,f4,f5,f6,f7,f8,f9,f12,f13,f14,f15,f16,f17,f18"
    )
    start_urls = [api_url]

    def parse(self, response):
        data = json.loads(response.text)
        stocks = data["data"]["diff"]
        for idx, s in enumerate(stocks, start=1):
            item = StockItem()
            item["id"] = idx
            item["bStockNo"] = s["f12"]
            item["bName"] = s["f14"]
            item["bPrice"] = s["f2"]
            item["bChangeRate"] = s["f3"]
            item["bChangeAmount"] = s["f4"]
            item["bVolume"] = s["f5"]
            item["bAmplitude"] = s["f7"]
            item["bHigh"] = s["f15"]
            item["bLow"] = s["f16"]
            item["bOpen"] = s["f17"]
            item["bPrevClose"] = s["f18"]
            yield item
Reflections:
The Scrapy framework has a clear structure: the Spider focuses on fetching and parsing, the Item standardizes the data, and the Pipeline handles storage, so the components are decoupled and easy to maintain. Storing the data in SQLite is simple and efficient, which suits small datasets. When actually crawling, the API's anti-scraping measures have to be considered; request headers or download delays can be added. This modular design keeps the crawler logic clear and makes later extensions (such as multi-page crawling or incremental updates) straightforward.
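The paragraph above mentions request headers, delays, and pipeline registration; a minimal settings.py sketch along those lines is shown below. The header value, delay, and pipeline priority are assumptions, and the module path assumes the project package is named stock_scrapy, as in the spider's import.

# settings.py sketch: assumed values for the stock_scrapy project above
BOT_NAME = "stock_scrapy"

# Register the SQLite pipeline defined above (300 is an arbitrary priority)
ITEM_PIPELINES = {
    "stock_scrapy.pipelines.StockScrapyPipeline": 300,
}

# Browser-like identification and a small delay to reduce the chance of being blocked
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
DEFAULT_REQUEST_HEADERS = {
    "Referer": "https://www.eastmoney.com/",
}
DOWNLOAD_DELAY = 1      # seconds between requests
ROBOTSTXT_OBEY = False  # often disabled when hitting a JSON API in course projects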
Task ③:
Requirement: become proficient with serialized output of data via Item and Pipeline in Scrapy; crawl foreign-exchange data using the Scrapy framework + XPath + MySQL storage technology stack.
Candidate site: Bank of China exchange rates (https://www.boc.cn/sourcedb/whpj/)
Code logic:

Item definition (items.py):
import scrapy

class BocFxItem(scrapy.Item):
    # currency name
    Currency = scrapy.Field()
    # telegraphic transfer (spot exchange) buying price
    TBP = scrapy.Field()
    # cash buying price
    CBP = scrapy.Field()
    # telegraphic transfer (spot exchange) selling price
    TSP = scrapy.Field()
    # cash selling price
    CSP = scrapy.Field()
    # publication time
    Time = scrapy.Field()
pipelines.py:
import sqlite3

class BocFxPipeline:
    def __init__(self):
        # database file name
        self.db_name = 'boc_data.db'
        self.conn = None
        self.cursor = None

    def open_spider(self, spider):
        """Called when the spider starts: connect to the database and create the table."""
        print("Connecting to the SQLite database...")
        self.conn = sqlite3.connect(self.db_name)
        self.cursor = self.conn.cursor()
        # create the table if it does not exist yet
        create_table_sql = """
            CREATE TABLE IF NOT EXISTS exchange_rates (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                currency TEXT,
                tbp TEXT,
                cbp TEXT,
                tsp TEXT,
                csp TEXT,
                time TEXT
            );
        """
        self.cursor.execute(create_table_sql)
        self.conn.commit()

    def process_item(self, item, spider):
        """Handle each Item: insert one row."""
        insert_sql = """
            INSERT INTO exchange_rates (currency, tbp, cbp, tsp, csp, time)
            VALUES (?, ?, ?, ?, ?, ?)
        """
        # build the data tuple
        data = (
            item.get('Currency', ''),
            item.get('TBP', ''),
            item.get('CBP', ''),
            item.get('TSP', ''),
            item.get('CSP', ''),
            item.get('Time', '')
        )
        try:
            self.cursor.execute(insert_sql, data)
            self.conn.commit()
        except Exception as e:
            print(f"Failed to insert row: {e}")
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        """Called when the spider closes: close the database connection."""
        print("Closing the database connection...")
        if self.cursor:
            self.cursor.close()
        if self.conn:
            self.conn.close()
Spider (boc_spider.py):
import scrapy
from boc_fx.items import BocFxItem

class BocSpiderSpider(scrapy.Spider):
    name = "boc_spider"
    allowed_domains = ["boc.cn"]
    start_urls = ["https://www.boc.cn/sourcedb/whpj/"]

    def parse(self, response):
        self.logger.info(f"Page fetched, status code: {response.status}")
        # select the rate table rows, skipping the header row
        rows = response.xpath('//div[@class="publish"]/div/table/tr[position()>1]')
        if not rows:
            self.logger.warning("No table rows found; the page structure may have changed or the request was blocked.")
        for row in rows:
            item = BocFxItem()
            # extract the currency name and skip empty rows
            currency_name = row.xpath('./td[1]/text()').extract_first()
            if not currency_name:
                continue
            item['Currency'] = currency_name.strip()
            tbp_text = row.xpath('./td[2]/text()').extract_first()
            item['TBP'] = tbp_text.strip() if tbp_text else None
            cbp_text = row.xpath('./td[3]/text()').extract_first()
            item['CBP'] = cbp_text.strip() if cbp_text else None
            tsp_text = row.xpath('./td[4]/text()').extract_first()
            item['TSP'] = tsp_text.strip() if tsp_text else None
            csp_text = row.xpath('./td[5]/text()').extract_first()
            item['CSP'] = csp_text.strip() if csp_text else None
            time_text = row.xpath('./td[7]/text()').extract_first()
            item['Time'] = time_text.strip() if time_text else None
            yield item
Reflections:
The part that demanded the most care was the XPath targeting: at first I overlooked the table-row index and kept extracting empty values, and the spider only became stable after I added non-empty checks. While storing data in SQLite I also hit insert failures, which I solved with exception catching and transaction rollback. The whole exercise showed me that a crawler not only has to parse pages precisely; the robustness of the storage layer matters just as much, and these details directly determine how reliable the final results are.
Project logic:
The project crawls Bank of China foreign-exchange quotation data with the Scrapy framework. First, boc_spider.py defines the spider class and specifies the target URL and parsing rules: XPath locates the currency name, the spot/cash buy and sell prices, and the publication time in the page table, and the values are wrapped into a BocFxItem (defined in items.py). BocFxPipeline then processes each Item: it connects to the SQLite database and creates the table when the spider starts, inserts rows while crawling, and closes the connection at the end, completing the data persistence.
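The task statement asks for MySQL storage, while the pipeline above uses SQLite for simplicity. Below is a minimal sketch of a pymysql-based variant of the same pipeline; the host, user, password, and database values are placeholder assumptions, and the main difference from the SQLite version is the %s parameter placeholders.

# Sketch: MySQL variant of BocFxPipeline using pymysql.
# Connection parameters, database name, and table name are assumed placeholders.
import pymysql

class BocFxMySQLPipeline:
    def open_spider(self, spider):
        self.conn = pymysql.connect(
            host="localhost", user="root", password="123456",
            database="boc_fx", charset="utf8mb4"
        )
        self.cursor = self.conn.cursor()
        self.cursor.execute("""
            CREATE TABLE IF NOT EXISTS exchange_rates (
                id INT AUTO_INCREMENT PRIMARY KEY,
                currency VARCHAR(64), tbp VARCHAR(32), cbp VARCHAR(32),
                tsp VARCHAR(32), csp VARCHAR(32), time VARCHAR(32)
            )
        """)
        self.conn.commit()

    def process_item(self, item, spider):
        try:
            # pymysql uses %s placeholders instead of sqlite3's ?
            self.cursor.execute(
                "INSERT INTO exchange_rates (currency, tbp, cbp, tsp, csp, time) "
                "VALUES (%s, %s, %s, %s, %s, %s)",
                (item.get('Currency'), item.get('TBP'), item.get('CBP'),
                 item.get('TSP'), item.get('CSP'), item.get('Time'))
            )
            self.conn.commit()
        except Exception as e:
            spider.logger.error(f"Insert failed: {e}")
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()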
Gitee repository: https://gitee.com/li-zhiyang-dejavu/2025_crawl_project/tree/master/3
