某网站试题爬取

一、项目背景与目标

需求:爬取某网站每个考试分类下的每日一练试题(含题目、选项、答案)

技术挑战

  1. 需要处理动态参数(sign/subsign)
  2. 需要模拟登录维持Cookie
  3. 接口参数关联性解析

二、完整实现代码(带注释版)

import requests
from lxml import etree
import pandas as pd
from urllib.parse import urljoin
import time

class WangxiaoSpider:
    """Spider for the daily-practice questions of each exam category.

    A logged-in ``requests.Session`` is created at construction time so
    that every subsequent request carries the authentication cookies.
    """

    # Per-request timeout in seconds; without it a stalled connection
    # would block the whole crawl forever.
    TIMEOUT = 10

    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        }
        self.session = requests.Session()  # keeps cookies across requests
        # Bug fix: the original built self.headers but never sent it —
        # attach it to the session so every request carries the User-Agent.
        self.session.headers.update(self.headers)
        self.base_url = 'https://ks.wangxiao.cn'
        self.login()  # log in immediately so later calls are authenticated

    def login(self):
        """Log in so the session holds a valid authentication cookie."""
        login_url = 'https://xxx.cn/login'
        data = {
            'username': 'your_username',  # replace with a real account
            'password': 'your_password',
            'verifycode': 'auto'  # captcha handling is simplified here
        }
        self.session.post(login_url, data=data, timeout=self.TIMEOUT)
        print('登录状态:', '成功' if self.check_login() else '失败')

    def check_login(self):
        """Return True if the session is currently authenticated.

        Heuristic: the user-center page only shows '个人中心' when logged in.
        """
        test_url = 'https://xxx.cn/user/center'
        resp = self.session.get(test_url, timeout=self.TIMEOUT)
        return '个人中心' in resp.text

    def get_category_links(self):
        """Return a list of (top_category, sub_category, absolute_url) tuples.

        Entries whose expected text or href is missing from the page are
        skipped instead of raising IndexError.
        """
        resp = self.session.get(self.base_url, timeout=self.TIMEOUT)
        tree = etree.HTML(resp.text)

        categories = []
        for li in tree.xpath('//ul[@class="first-title"]/li'):
            names = li.xpath('./p/span/text()')
            if not names:  # malformed entry: no top-level category name
                continue
            c1_name = names[0]
            for a in li.xpath('./div/a'):
                texts = a.xpath('./text()')
                hrefs = a.xpath('./@href')
                if not texts or not hrefs:  # malformed sub-category link
                    continue
                c2_link = urljoin(self.base_url, hrefs[0])
                categories.append((c1_name, texts[0], c2_link))
        return categories

    def convert_to_daily_url(self, exam_url):
        """Turn a mock-exam URL into the daily-practice URL, keeping its query string."""
        sign = exam_url.split('?')[-1]
        return f'{self.base_url}/practice/listEveryday?{sign}'

    def extract_dynamic_params(self, daily_url):
        """Return the (sign, subsign) pair embedded in the start-exercise link.

        Raises KeyError if either parameter is absent, IndexError if the
        start button is not found on the page.
        """
        resp = self.session.get(daily_url, timeout=self.TIMEOUT)
        tree = etree.HTML(resp.text)

        # Locate the "start exercise" button; its href carries the params.
        start_btn = tree.xpath('//a[contains(@href, "startExercise")]/@href')[0]
        # split('=', 1) keeps values intact even if they contain '='.
        params = dict(
            param.split('=', 1)
            for param in start_btn.split('?')[1].split('&')
        )
        return params['sign'], params['subsign']

    def fetch_questions(self, sign, subsign):
        """POST the practice API and return the question list for today."""
        api_url = 'https://xxx.cn/practice/listQuestions'
        payload = {
            "practiceType": 1,
            "sign": sign,
            "subsign": subsign,
            "day": time.strftime("%Y%m%d")  # today's date, server expects YYYYMMDD
        }
        resp = self.session.post(api_url, json=payload, timeout=self.TIMEOUT)
        return resp.json()['Data'][0]['questions']

    def parse_question(self, question):
        """Parse one raw question dict into {题目, 选项, 答案}.

        Options are labelled A, B, C, ... in order; every option with
        isRight == 1 contributes to the answer, so multi-choice works.
        """
        options = []
        correct_answer = []
        for idx, opt in enumerate(question['options']):
            options.append(f"{chr(65 + idx)}. {opt['content']}")
            if opt['isRight'] == 1:
                correct_answer.append(chr(65 + idx))
        return {
            '题目': question['content'],
            '选项': options,
            '答案': ','.join(correct_answer)
        }

    def run(self):
        """Crawl every category, parse its daily questions, export to Excel."""
        # Collect plain dicts and build the DataFrame once at the end:
        # DataFrame.append was removed in pandas 2.0 and was O(n^2) anyway.
        rows = []

        for c1, c2, exam_url in self.get_category_links():
            try:
                # Step 1: build the daily-practice URL for this category.
                daily_url = self.convert_to_daily_url(exam_url)

                # Step 2: pull the dynamic sign/subsign parameters.
                sign, subsign = self.extract_dynamic_params(daily_url)

                # Step 3: fetch the raw question data.
                questions = self.fetch_questions(sign, subsign)

                # Step 4: normalize each question into a flat row.
                for q in questions:
                    item = self.parse_question(q)
                    rows.append({'一级类目': c1, '二级类目': c2, **item})

                print(f'已完成:{c1} - {c2}')
                time.sleep(1)  # polite crawl delay

            except Exception as e:
                # Best-effort crawl: one broken category must not stop the rest.
                print(f'【错误】{c1}-{c2}: {str(e)}')
                continue

        df = pd.DataFrame(rows, columns=['一级类目', '二级类目', '题目', '选项', '答案'])
        df.to_excel('每日一练试题库.xlsx', index=False)
        print('所有数据已保存!')

if __name__ == '__main__':
    # Entry point: the constructor performs the login, after which the
    # full crawl-and-export pipeline runs.
    WangxiaoSpider().run()

三、关键步骤详解

1. 登录状态维持

self.session = requests.Session()  # 关键对象
self.session.post(login_url, data=data)  # 保持Cookie
  • 使用Session对象自动处理Cookie
  • 需处理验证码(示例简化,实际需要OCR或打码平台)

2. 链接转换技巧

# 原链接:https://xxx.cn/TestPaper/list?sign=jzs1
# 新链接:https://xxx.cn/practice/listEveryday?sign=jzs1
def convert_to_daily_url(self, exam_url):
    sign = exam_url.split('?')[-1]
    return f'{self.base_url}/practice/listEveryday?{sign}'

3. 动态参数提取

# 从类似/practice/startExercise?sign=xxx&subsign=yyy的链接提取参数
start_btn = tree.xpath('//a[contains(@href, "startExercise")]/@href')[0]
params = dict(param.split('=', 1) for param in start_btn.split('?')[1].split('&'))

4. 数据解析优化

def parse_question(self, question):
    # 处理多选题情况
    correct_answer = [chr(65+idx) for idx, opt in enumerate(question['options']) if opt['isRight'] == 1]
    return {
        '题目': question['content'],
        '选项': [f"{chr(65+i)}. {o['content']}" for i, o in enumerate(question['options'])],
        '答案': ','.join(correct_answer)
    }

四、运行结果示例

一级类目 二级类目 题目 选项 答案
建筑工程 一级建造师 关于混凝土结构特点的说法... ['A. 强度高', 'B. 耐久性好'...] A,B
法律职业 司法考试 根据《民法典》规定... ['A. 正确', 'B. 错误'] A

注意事项
本代码仅用于技术交流,请遵守网站Robots协议及相关法律法规,合理控制请求频率。


欢迎在评论区交流爬虫技术细节!

posted @ 2025-03-23 14:47  千陌666  阅读(59)  评论(0)    收藏  举报