某网站试题爬取
一、项目背景与目标
需求:爬取某网站每个考试分类下的每日一练试题(含题目、选项、答案)
技术挑战:
- 需要处理动态参数(sign/subsign)
- 需要模拟登录维持Cookie
- 接口参数关联性解析
二、完整实现代码(带注释版)
import requests
from lxml import etree
import pandas as pd
from urllib.parse import urljoin
import time
class WangxiaoSpider:
    """Crawl the daily-practice questions of every exam category on the site.

    One ``requests.Session`` is shared by all requests so that the cookies
    obtained by :meth:`login` are sent with the later page and API calls.
    """

    # Seconds to wait for a single HTTP response. requests applies no timeout
    # by default, so a stalled connection would otherwise block the crawl.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        """Create the shared session, attach the headers and log in."""
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        }
        self.session = requests.Session()  # keeps cookies between requests
        # The headers must be installed on the session, otherwise they are
        # never sent with any request.
        self.session.headers.update(self.headers)
        self.base_url = 'https://ks.wangxiao.cn'
        self.login()  # log in as part of construction

    def login(self):
        """Post the credentials and print whether the login succeeded."""
        # NOTE(review): 'xxx.cn' looks like a redacted placeholder host - confirm.
        login_url = 'https://xxx.cn/login'
        data = {
            'username': 'your_username',  # replace with a real account
            'password': 'your_password',
            # Placeholder only: nothing in this class solves a captcha.
            'verifycode': 'auto',
        }
        self.session.post(login_url, data=data, timeout=self.REQUEST_TIMEOUT)
        print('登录状态:', '成功' if self.check_login() else '失败')

    def check_login(self):
        """Return True when the user-centre page contains the '个人中心' marker."""
        test_url = 'https://xxx.cn/user/center'
        resp = self.session.get(test_url, timeout=self.REQUEST_TIMEOUT)
        return '个人中心' in resp.text

    def get_category_links(self):
        """Collect the first- and second-level categories from the home page.

        Returns:
            A list of ``(first_level_name, second_level_name, absolute_url)``
            tuples. Menu entries missing a name or an ``href`` are skipped
            instead of aborting the whole crawl.
        """
        resp = self.session.get(self.base_url, timeout=self.REQUEST_TIMEOUT)
        tree = etree.HTML(resp.text)
        categories = []
        for li in tree.xpath('//ul[@class="first-title"]/li'):
            c1_names = li.xpath('./p/span/text()')
            if not c1_names:
                continue
            c1_name = c1_names[0]
            for a in li.xpath('./div/a'):
                c2_names = a.xpath('./text()')
                hrefs = a.xpath('./@href')
                if not c2_names or not hrefs:
                    continue
                c2_link = urljoin(self.base_url, hrefs[0])
                categories.append((c1_name, c2_names[0], c2_link))
        return categories

    def convert_to_daily_url(self, exam_url):
        """Turn a mock-exam list URL into the matching daily-practice URL.

        Everything after the last ``?`` of *exam_url* is reused as the query
        string of the new URL.
        """
        sign = exam_url.split('?')[-1]
        return f'{self.base_url}/practice/listEveryday?{sign}'

    @staticmethod
    def _parse_query(href):
        """Return the query-string parameters of *href* as a dict.

        Each pair is split on its first ``=`` only, so a value that itself
        contains ``=`` is kept intact. Pairs without ``=`` are ignored and
        values are not URL-decoded.
        """
        params = {}
        for pair in href.partition('?')[2].split('&'):
            key, sep, value = pair.partition('=')
            if sep:
                params[key] = value
        return params

    def extract_dynamic_params(self, daily_url):
        """Read ``sign`` and ``subsign`` from the page's startExercise link.

        Returns:
            A ``(sign, subsign)`` tuple.

        Raises:
            IndexError: the page has no link containing ``startExercise``.
            KeyError: the link lacks a ``sign`` or ``subsign`` parameter.
        """
        resp = self.session.get(daily_url, timeout=self.REQUEST_TIMEOUT)
        tree = etree.HTML(resp.text)
        hrefs = tree.xpath('//a[contains(@href, "startExercise")]/@href')
        if not hrefs:
            raise IndexError(f'no startExercise link found on {daily_url}')
        params = self._parse_query(hrefs[0])
        return params['sign'], params['subsign']

    def fetch_questions(self, sign, subsign):
        """Request today's questions and return the ``questions`` list.

        The payload's ``day`` field is the current local date as ``YYYYMMDD``.
        An HTTP error status raises before the body is decoded.
        """
        api_url = 'https://xxx.cn/practice/listQuestions'
        payload = {
            "practiceType": 1,
            "sign": sign,
            "subsign": subsign,
            "day": time.strftime("%Y%m%d"),
        }
        resp = self.session.post(api_url, json=payload, timeout=self.REQUEST_TIMEOUT)
        resp.raise_for_status()
        return resp.json()['Data'][0]['questions']

    def parse_question(self, question):
        """Convert one raw question dict into an output row.

        Options are lettered A, B, C... in their given order; the answer is
        the comma-joined letters of every option whose ``isRight`` equals 1,
        so multiple-answer questions are supported.
        """
        options = []
        correct_answer = []
        for idx, opt in enumerate(question['options']):
            letter = chr(65 + idx)
            options.append(f"{letter}. {opt['content']}")
            if opt['isRight'] == 1:
                correct_answer.append(letter)
        return {
            '题目': question['content'],
            '选项': options,
            '答案': ','.join(correct_answer),
        }

    def run(self):
        """Crawl every category and save all questions to one Excel file.

        A failure in one category is printed and the crawl moves on to the
        next one.
        """
        columns = ['一级类目', '二级类目', '题目', '选项', '答案']
        # Rows are gathered in a list and the frame is built once at the end:
        # DataFrame.append no longer exists in pandas 2.x.
        rows = []
        for c1, c2, exam_url in self.get_category_links():
            try:
                # Step 1: build the daily-practice URL
                daily_url = self.convert_to_daily_url(exam_url)
                # Step 2: extract the dynamic parameters
                sign, subsign = self.extract_dynamic_params(daily_url)
                # Step 3: fetch the questions
                questions = self.fetch_questions(sign, subsign)
                # Step 4: parse them into rows
                for q in questions:
                    rows.append({
                        '一级类目': c1,
                        '二级类目': c2,
                        **self.parse_question(q),
                    })
                print(f'已完成:{c1} - {c2}')
            except Exception as e:
                print(f'【错误】{c1}-{c2}: {str(e)}')
            # Pause after every category, failed ones included, so errors do
            # not turn into a burst of back-to-back requests.
            time.sleep(1)
        df = pd.DataFrame(rows, columns=columns)
        df.to_excel('每日一练试题库.xlsx', index=False)
        print('所有数据已保存!')
if __name__ == '__main__':
    # Constructing the spider logs in; run() then crawls and writes the file.
    spider = WangxiaoSpider()
    spider.run()
三、关键步骤详解
1. 登录状态维持
self.session = requests.Session() # the key object: one session reused for every request
self.session.post(login_url, data=data) # cookies from the login response stay on the session
- 使用Session对象自动处理Cookie
- 需处理验证码:示例中的 'verifycode': 'auto' 只是占位值,并不会真正识别验证码;实际使用时需要接入 OCR 或打码平台
2. 链接转换技巧
# Original link: https://xxx.cn/TestPaper/list?sign=jzs1
# New link:      https://xxx.cn/practice/listEveryday?sign=jzs1
def convert_to_daily_url(self, exam_url):
    """Turn a mock-exam list URL into the matching daily-practice URL.

    Everything after the last ``?`` of *exam_url* is reused as the query
    string of the new URL.
    """
    sign = exam_url.split('?')[-1]
    return f'{self.base_url}/practice/listEveryday?{sign}'
3. 动态参数提取
# Extract the parameters from a link such as /practice/startExercise?sign=xxx&subsign=yyy
start_btn = tree.xpath('//a[contains(@href, "startExercise")]/@href')[0]
# Split each pair on its first '=' only, so a value containing '=' stays intact.
params = dict(param.split('=', 1) for param in start_btn.split('?')[1].split('&'))
4. 数据解析优化
def parse_question(self, question):
    """Convert one raw question dict into an output row.

    Options are lettered A, B, C... in their given order. Every option whose
    ``isRight`` equals 1 contributes its letter to the answer, so questions
    with several correct options are handled.
    """
    options = []
    correct_answer = []
    for idx, opt in enumerate(question['options']):
        letter = chr(65 + idx)
        options.append(f"{letter}. {opt['content']}")
        if opt['isRight'] == 1:
            correct_answer.append(letter)
    return {
        '题目': question['content'],
        '选项': options,
        '答案': ','.join(correct_answer),
    }
四、运行结果示例
| 一级类目 | 二级类目 | 题目 | 选项 | 答案 |
|---|---|---|---|---|
| 建筑工程 | 一级建造师 | 关于混凝土结构特点的说法... | ['A. 强度高', 'B. 耐久性好'...] | A,B |
| 法律职业 | 司法考试 | 根据《民法典》规定... | ['A. 正确', 'B. 错误'] | A |
注意事项:
本代码仅用于技术交流,请遵守网站Robots协议及相关法律法规,合理控制请求频率。
欢迎在评论区交流爬虫技术细节!

浙公网安备 33010602011771号