Advanced Web Scraping: A Practical Guide to Data Parsing and Storage
1. Core Data Parsing Techniques
1.1 Parsing with BeautifulSoup (bs4)
from bs4 import BeautifulSoup
import requests

# Create a parser object
response = requests.get('https://example.com')
soup = BeautifulSoup(response.text, 'lxml')

# Four ways to locate tags
div_tag = soup.find('div', class_='header')     # locate a single element
all_links = soup.find_all('a')                  # locate all matching elements
css_select = soup.select('#main > .article')    # CSS selector
nested_tag = soup.select('.container .title')   # nested selection

# Extracting data
print(div_tag.text)        # all text inside the tag
link_tag = soup.find('a')
print(link_tag['href'])    # attribute value (use link_tag.get('href') if it may be missing)
1.2 Efficient Parsing with XPath
import requests
from lxml import etree

# Create a parser object
tree = etree.HTML(requests.get('https://example.com').text)

# Four common XPath location patterns
title = tree.xpath('//h1/text()')[0]               # extract text
links = tree.xpath('//a/@href')                    # extract attribute values
third_div = tree.xpath('//div[3]')                 # locate by index
nested_elem = tree.xpath('//div[@id="main"]//p')   # locate by hierarchy
2. Hands-On Case Studies
2.1 Scraping a Complete Novel (碧血剑)
import os
import requests
from bs4 import BeautifulSoup

BASE_URL = 'https://bixuejian.5000yan.com/'
HEADERS = {'User-Agent': 'Mozilla/5.0'}

def get_chapters():
    response = requests.get(BASE_URL, headers=HEADERS)
    soup = BeautifulSoup(response.text, 'lxml')
    return soup.select('.paiban > li > a')

def save_chapter(chapter):
    title = chapter.text.strip()
    detail_url = chapter['href']
    # Fetch the chapter page
    detail_res = requests.get(detail_url, headers=HEADERS)
    detail_soup = BeautifulSoup(detail_res.text, 'lxml')
    content = detail_soup.find('div', class_='grap').text
    # Create the output directory if it does not exist
    os.makedirs('碧血剑', exist_ok=True)
    # Save the chapter as a text file
    with open(f'碧血剑/{title}.txt', 'w', encoding='utf-8') as f:
        f.write(f"{title}\n\n{content}")
    print(f'Saved: {title}')

if __name__ == '__main__':
    chapters = get_chapters()
    for chap in chapters:
        save_chapter(chap)
2.2 Batch Image Downloading
import os
import requests
from lxml import etree
from urllib.parse import urljoin

BASE_URL = 'https://pic.netbian.com/4kmeinjpg/'
HEADERS = {'User-Agent': 'Mozilla/5.0'}

def download_images(page=5):
    os.makedirs('images', exist_ok=True)
    for p in range(1, page + 1):
        url = f'{BASE_URL}index_{p}.html' if p > 1 else BASE_URL
        response = requests.get(url, headers=HEADERS)
        response.encoding = 'gbk'   # the site is GBK-encoded
        tree = etree.HTML(response.text)
        img_list = tree.xpath('//div[@class="slist"]//li')
        for img in img_list:
            title = img.xpath('.//b/text()')[0] + '.jpg'
            img_url = urljoin(BASE_URL, img.xpath('.//img/@src')[0])
            # Download the image binary
            img_data = requests.get(img_url, headers=HEADERS).content
            with open(f'images/{title}', 'wb') as f:
                f.write(img_data)
            print(f'Downloaded: {title}')

if __name__ == '__main__':
    download_images()
3. Optimizing Data Storage
3.1 Storing Data with Pandas
import pandas as pd
from lxml import etree

# Create an empty DataFrame with the target columns
df = pd.DataFrame(columns=['title', 'link', 'downloads'])

# Parse the scraped page and fill in the rows
tree = etree.parse('data.html', etree.HTMLParser())
items = tree.xpath('//div[@class="item"]')
for i, item in enumerate(items):
    title = item.xpath('.//h3/text()')[0]
    link = item.xpath('.//a/@href')[0]
    downloads = item.xpath('.//span/text()')[0]
    df.loc[i] = [title, link, downloads]

# Export to Excel
df.to_excel('data.xlsx', index=False)
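Row-by-row assignment with df.loc is fine for small result sets, but for larger crawls it is usually cleaner to collect the rows in a plain list first and build the DataFrame once. A minimal sketch, assuming the same data.html structure as above (the CSV filename and the utf-8-sig choice are illustrative):

import pandas as pd
from lxml import etree

# Collect rows in a list, then build the DataFrame in one step
tree = etree.parse('data.html', etree.HTMLParser())
rows = []
for item in tree.xpath('//div[@class="item"]'):
    rows.append({
        'title': item.xpath('.//h3/text()')[0],
        'link': item.xpath('.//a/@href')[0],
        'downloads': item.xpath('.//span/text()')[0],
    })

df = pd.DataFrame(rows)
# utf-8-sig keeps non-ASCII text readable when the CSV is opened in Excel
df.to_csv('data.csv', index=False, encoding='utf-8-sig')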
4. Homework Walkthrough
4.1 Downloading Resume Templates
import os
import requests
from lxml import etree
from urllib.parse import urljoin

HEADERS = {'User-Agent': 'Mozilla/5.0'}

def download_resumes():
    base_url = 'https://sc.chinaz.com/jianli/free.html'
    response = requests.get(base_url, headers=HEADERS)
    tree = etree.HTML(response.text)
    resumes = tree.xpath('//div[@id="main"]//div[@class="box"]')
    os.makedirs('resumes', exist_ok=True)
    for res in resumes:
        name = res.xpath('.//img/@alt')[0]
        zip_url = res.xpath('.//a[contains(@href,"jianli")]/@href')[0]
        # Download the archive
        zip_data = requests.get(urljoin(base_url, zip_url), headers=HEADERS).content
        with open(f'resumes/{name}.zip', 'wb') as f:
            f.write(zip_data)
        print(f'Downloaded resume: {name}')
4.2 Key Points for Scraping Image Sites
- Handle lazy loading: the real image URL often sits in the data-src attribute rather than src
- Handle multiple resolutions: parse the srcset attribute (@srcset in XPath)
- Anti-scraping countermeasure: set the Referer request header (see the sketch below)
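A minimal sketch that ties these three points together; the listing URL, the //img selector, and the attribute preference order are assumptions for illustration rather than a recipe for any specific site:

import requests
from lxml import etree
from urllib.parse import urljoin

# Hypothetical listing page; adjust selectors to the real site structure
LIST_URL = 'https://example.com/gallery/'
HEADERS = {
    'User-Agent': 'Mozilla/5.0',
    'Referer': 'https://example.com/',   # some image hosts reject requests without a Referer
}

def pick_image_url(img_node):
    """Prefer the lazy-load attribute, then a srcset candidate, then plain src."""
    lazy = img_node.xpath('./@data-src')
    if lazy:
        return lazy[0]
    srcset = img_node.xpath('./@srcset')
    if srcset:
        # srcset looks like "small.jpg 480w, large.jpg 1920w"; take the last candidate (often the largest)
        return srcset[0].split(',')[-1].split()[0]
    return img_node.xpath('./@src')[0]

html = requests.get(LIST_URL, headers=HEADERS).text
tree = etree.HTML(html)
for img in tree.xpath('//img'):
    real_url = urljoin(LIST_URL, pick_image_url(img))
    print(real_url)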
5. Scraping Best Practices
- Rate limiting: add time.sleep(random.uniform(1, 3)) between requests
- Exception handling:
  try:
      response = requests.get(url, timeout=10)
  except Exception as e:
      print(f'Request failed: {e}')
- Proxy configuration:
  proxies = {
      'http': 'http://10.10.1.10:3128',
      'https': 'http://10.10.1.10:1080',
  }
  response = requests.get(url, proxies=proxies)
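A minimal sketch that folds the three points above into one helper; the polite_get name, the retry count, and the delay range are illustrative choices, not values prescribed by the article:

import random
import time
import requests

HEADERS = {'User-Agent': 'Mozilla/5.0'}

def polite_get(url, proxies=None, retries=3):
    """Fetch a URL with a random delay, a timeout, and a few retries."""
    for attempt in range(1, retries + 1):
        time.sleep(random.uniform(1, 3))   # rate limiting between requests
        try:
            response = requests.get(url, headers=HEADERS,
                                    proxies=proxies, timeout=10)
            response.raise_for_status()    # treat 4xx/5xx responses as errors
            return response
        except Exception as e:             # exception handling with retry
            print(f'Request failed (attempt {attempt}/{retries}): {e}')
    return None

# Usage: pass proxies={'http': ..., 'https': ...} to route through a proxy
resp = polite_get('https://example.com')
if resp is not None:
    print(resp.status_code)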
Note: all of the examples in this article must be used in compliance with the target sites' robots.txt; please throttle your request rate to avoid putting pressure on the target servers.
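Checking robots.txt before crawling can be done with the standard library; a minimal sketch, where the site URL, target path, and user-agent string are placeholders:

from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser

site = 'https://example.com/'
rp = RobotFileParser(urljoin(site, '/robots.txt'))
rp.read()   # fetch and parse the site's robots.txt

target = urljoin(site, '/some/page.html')
if rp.can_fetch('Mozilla/5.0', target):
    print('Allowed by robots.txt:', target)
else:
    print('Disallowed by robots.txt:', target)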
