Advanced Web Scraping: A Practical Guide to Data Parsing and Storage

1. Core Data Parsing Techniques

1.1 Parsing with BeautifulSoup (bs4)

from bs4 import BeautifulSoup
import requests

# Fetch the page and build a BeautifulSoup object
response = requests.get('https://example.com')
soup = BeautifulSoup(response.text, 'lxml')

# Four ways to locate tags
div_tag = soup.find('div', class_='header')  # locate a single element
all_links = soup.find_all('a')  # locate all matching elements
css_select = soup.select('#main > .article')  # CSS selector
nested_tag = soup.select('.container .title')  # nested selection

# Data extraction
print(div_tag.text)  # get all text inside the tag
print(all_links[0]['href'])  # get an attribute value (subscript a tag like a dict)
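
find_all() returns a list of Tag objects; when an attribute may be missing, Tag.get() is the safer accessor because it returns None instead of raising KeyError. A short sketch continuing from the soup object above:

# Iterate over matches and read attributes defensively
for link in soup.find_all('a'):
    href = link.get('href')  # None if the attribute is absent, no KeyError
    text = link.get_text(strip=True)  # text with surrounding whitespace stripped
    if href:
        print(text, href)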

1.2 Efficient Parsing with XPath

from lxml import etree

# Build the parse tree
tree = etree.HTML(requests.get('https://example.com').text)

# Four XPath location patterns
title = tree.xpath('//h1/text()')[0]  # text extraction
links = tree.xpath('//a/@href')  # attribute extraction
third_div = tree.xpath('//div[3]')  # index-based location (XPath indices start at 1)
nested_elem = tree.xpath('//div[@id="main"]//p')  # hierarchical location
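
The case studies below also run XPath relative to a single element node rather than the whole document. A minimal sketch of that pattern, continuing from the tree above (the class name "item" is just a placeholder):

# Relative XPath: start the expression with './/' so it searches inside this element only
for item in tree.xpath('//div[@class="item"]'):
    name = item.xpath('.//h3/text()')
    link = item.xpath('.//a/@href')
    if name and link:
        print(name[0], link[0])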

2. Hands-On Case Studies

2.1 Scraping a Complete Novel (碧血剑)

import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin  # for resolving possibly relative chapter links

BASE_URL = 'https://bixuejian.5000yan.com/'
HEADERS = {'User-Agent': 'Mozilla/5.0'}

def get_chapters():
    response = requests.get(BASE_URL, headers=HEADERS)
    soup = BeautifulSoup(response.text, 'lxml')
    return soup.select('.paiban > li > a')

def save_chapter(chapter):
    title = chapter.text.strip()
    detail_url = urljoin(BASE_URL, chapter['href'])  # works for absolute and relative links

    # Fetch the chapter content
    detail_res = requests.get(detail_url, headers=HEADERS)
    detail_soup = BeautifulSoup(detail_res.text, 'lxml')
    content = detail_soup.find('div', class_='grap').text

    # Create the output directory if it does not exist yet
    os.makedirs('碧血剑', exist_ok=True)

    # Save to a text file (explicit UTF-8 so the Chinese text is written correctly on any platform)
    with open(f'碧血剑/{title}.txt', 'w', encoding='utf-8') as f:
        f.write(f"{title}\n\n{content}")
    print(f'Saved: {title}')

if __name__ == '__main__':
    chapters = get_chapters()
    for chap in chapters:
        save_chapter(chap)

2.2 Batch Image Download

import os
import requests
from lxml import etree
from urllib.parse import urljoin

BASE_URL = 'https://pic.netbian.com/4kmeinjpg/'
HEADERS = {'User-Agent': 'Mozilla/5.0'}

def download_images(pages=5):
    os.makedirs('images', exist_ok=True)  # make sure the output directory exists
    for p in range(1, pages + 1):
        # Page 1 has no suffix; later pages use index_N.html
        url = f'{BASE_URL}index_{p}.html' if p > 1 else BASE_URL
        response = requests.get(url, headers=HEADERS)
        response.encoding = 'gbk'  # the site serves GBK-encoded pages

        tree = etree.HTML(response.text)
        img_list = tree.xpath('//div[@class="slist"]//li')

        for img in img_list:
            title = img.xpath('.//b/text()')[0] + '.jpg'
            img_url = urljoin(BASE_URL, img.xpath('.//img/@src')[0])

            # Download the image
            img_data = requests.get(img_url, headers=HEADERS).content
            with open(f'images/{title}', 'wb') as f:
                f.write(img_data)
            print(f'Downloaded: {title}')

if __name__ == '__main__':
    download_images()
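
For larger images, a streamed download avoids holding the whole file in memory. This is a standard requests pattern, shown here as a hedged sketch (save_image is a hypothetical helper, reusing HEADERS from above):

def save_image(img_url, path, headers=HEADERS):
    # Stream the response body to disk in chunks instead of loading it all at once
    with requests.get(img_url, headers=headers, stream=True) as r:
        r.raise_for_status()
        with open(path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)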

3. Data Storage Optimization

3.1 Storing Data with Pandas

import pandas as pd
from lxml import etree

# Create an empty DataFrame
df = pd.DataFrame(columns=['title', 'link', 'downloads'])

# Parse the scraped page and fill in the rows
tree = etree.parse('data.html', etree.HTMLParser())  # use an HTML parser, not the default XML parser
items = tree.xpath('//div[@class="item"]')
for i, item in enumerate(items):
    title = item.xpath('.//h3/text()')[0]
    link = item.xpath('.//a/@href')[0]
    downloads = item.xpath('.//span/text()')[0]
    df.loc[i] = [title, link, downloads]

# Export to Excel (requires the openpyxl package)
df.to_excel('data.xlsx', index=False)
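
Appending rows with df.loc inside a loop works for small datasets, but collecting the rows in a list and building the DataFrame once is faster and more idiomatic. A sketch under the same assumptions about data.html, reusing the items list above:

# Collect rows first, then construct the DataFrame in a single call
rows = []
for item in items:
    rows.append({
        'title': item.xpath('.//h3/text()')[0],
        'link': item.xpath('.//a/@href')[0],
        'downloads': item.xpath('.//span/text()')[0],
    })
df = pd.DataFrame(rows)
df.to_csv('data.csv', index=False, encoding='utf-8-sig')  # CSV alternative; utf-8-sig keeps Excel happy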

4. Assignment Walkthrough

4.1 Downloading Résumé Templates

import os
import requests
from lxml import etree
from urllib.parse import urljoin

HEADERS = {'User-Agent': 'Mozilla/5.0'}

def download_resumes():
    base_url = 'https://sc.chinaz.com/jianli/free.html'
    response = requests.get(base_url, headers=HEADERS)
    tree = etree.HTML(response.text)

    os.makedirs('resumes', exist_ok=True)  # make sure the output directory exists
    resumes = tree.xpath('//div[@id="main"]//div[@class="box"]')
    for res in resumes:
        name = res.xpath('.//img/@alt')[0]
        zip_url = res.xpath('.//a[contains(@href,"jianli")]/@href')[0]

        # Download the zip archive
        zip_data = requests.get(urljoin(base_url, zip_url), headers=HEADERS).content
        with open(f'resumes/{name}.zip', 'wb') as f:
            f.write(zip_data)
        print(f'Downloaded résumé template: {name}')

4.2 Key Points for Scraping Image Sites

  1. Handle lazy loading: the real image URL is often stored in the data-src attribute rather than src (a sketch follows this list)
  2. Handle multiple resolutions: parse the srcset attribute (e.g. via @srcset in XPath)
  3. Anti-scraping countermeasure: set the Referer request header
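
A minimal sketch of points 1 and 3, assuming the listing page marks lazy-loaded images with a data-src attribute (the attribute name and the page URL vary by site; both are placeholders here):

import requests
from lxml import etree
from urllib.parse import urljoin

PAGE_URL = 'https://example.com/gallery/'  # placeholder listing page
HEADERS = {
    'User-Agent': 'Mozilla/5.0',
    'Referer': PAGE_URL,  # many image hosts reject requests without a matching Referer
}

tree = etree.HTML(requests.get(PAGE_URL, headers=HEADERS).text)
for img in tree.xpath('//img'):
    # Prefer the lazy-load attribute and fall back to the normal src
    src = img.xpath('./@data-src') or img.xpath('./@src')
    if src:
        print(urljoin(PAGE_URL, src[0]))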

5. Scraping Best Practices

  1. Rate limiting: add time.sleep(random.uniform(1, 3)) between requests (see the helper sketch after this list)
  2. Exception handling:
try:
    response = requests.get(url, timeout=10)
except requests.exceptions.RequestException as e:
    print(f'Request failed: {e}')
  3. Proxy configuration:
proxies = {
    'http': 'http://10.10.1.10:3128',
    'https': 'http://10.10.1.10:1080',
}
response = requests.get(url, proxies=proxies)
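
A small helper that ties the three points above together; polite_get is a hypothetical name and the proxy settings remain placeholders:

import random
import time
import requests

def polite_get(url, headers=None, proxies=None, timeout=10):
    """Fetch a URL with a random delay and basic error handling."""
    time.sleep(random.uniform(1, 3))  # rate limiting between requests
    try:
        response = requests.get(url, headers=headers, proxies=proxies, timeout=timeout)
        response.raise_for_status()  # turn HTTP error codes into exceptions
        return response
    except requests.exceptions.RequestException as e:
        print(f'Request failed: {e}')
        return None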

Note: all examples in this article must be used in compliance with the target sites' robots.txt, and you should throttle your request rate to avoid putting pressure on the target servers.
