# 利用BeautifulSoup库解析HTML

# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup
import requests
import pandas as pd


class HTMLUtil:
    """Load an exam HTML page, extract its questions and export them to Excel.

    Typical use: call one of the ``get_html_content_from_*`` loaders, then
    ``parse_html_content()``, then ``save_result()``.

    Attributes:
        html_content: Raw HTML text from the most recent successful load, or
            ``None`` when nothing has been loaded yet.
        result: One dict per parsed question with the keys '题型' (question
            type), '序号' (number), '题目' (title) and '选项' (mapping of
            option label to option text). Each parse call appends to it.
    """

    def __init__(self):
        self.html_content = None
        self.result = []

    def get_html_content_from_file(self, html_file_path):
        """Read a UTF-8 encoded HTML file into ``self.html_content``.

        Failures are reported on stdout instead of being raised; in that case
        ``self.html_content`` keeps whatever value it had before the call.

        Args:
            html_file_path: Path of the HTML file to read.
        """
        try:
            with open(html_file_path, 'r', encoding='utf-8') as f:
                self.html_content = f.read()
        except FileNotFoundError:
            print('文件未找到')
        except UnicodeDecodeError:
            print('解码文件时出错,请检查编码')

    def get_html_content_from_internet(self, url, timeout=10):
        """Download a page and store its text in ``self.html_content``.

        Only a 200 response is accepted. Any other status code, or a
        ``requests.RequestException``, is reported on stdout and leaves
        ``self.html_content`` unchanged.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server before giving up. Without
                a timeout ``requests`` may block indefinitely.
        """
        try:
            resp = requests.get(url, timeout=timeout)
            if resp.status_code == 200:
                self.html_content = resp.text
            else:
                print(f'网络请求失败,状态码: {resp.status_code}')
        except requests.RequestException as e:
            print(f'请求发生错误: {e}')

    def parse_html_content(self):
        """Extract every question from ``self.html_content`` into ``self.result``.

        The page is expected to contain ``div.exam-mask-list`` sections, each
        with a ``div.exam-mask-title`` (question type) and any number of
        ``div.exam-list`` question entries. Entries without a title block are
        skipped. If no HTML has been loaded, a message is printed and nothing
        is parsed.
        """
        if self.html_content is None:
            # A failed load leaves html_content as None; there is nothing to parse.
            print('HTML内容为空,请先加载HTML')
            return

        soup = BeautifulSoup(self.html_content, 'html.parser')
        for section in soup.find_all('div', class_='exam-mask-list'):
            type_div = section.find('div', class_='exam-mask-title')
            # A section without a heading still has its questions parsed.
            question_type = type_div.get_text() if type_div is not None else ''
            for item in section.find_all('div', class_='exam-list'):
                question = self._parse_question(item, question_type)
                if question is not None:
                    self.result.append(question)

    @staticmethod
    def _clean_title(all_text, num):
        """Return the title text without its number prefix and button labels."""
        # Drop only the first occurrence of the number text so that the same
        # characters appearing later in the title (e.g. "1." in "1.5") survive.
        title = all_text.replace(num, '', 1)
        # '纠错' and '收藏' are stripped exactly as before; presumably they are
        # the labels of action buttons inside the title block.
        return title.replace('纠错', '').replace('收藏', '').strip()

    @staticmethod
    def _parse_options(item):
        """Return a dict mapping option label to option text for one question."""
        option_dict = {}
        for option in item.find_all('div', class_='exam-list-point'):
            label_span = option.find('span', class_='label-italic')
            # class_=None is meant to select the span carrying no class
            # attribute. NOTE(review): confirm with the installed bs4 version.
            value_span = option.find('span', class_=None)
            if label_span is None or value_span is None:
                # Incomplete option markup: skip it instead of crashing.
                continue
            option_index = label_span.get_text().strip()
            option_dict[option_index] = value_span.get_text().replace(' ', '').strip()
        return option_dict

    @classmethod
    def _parse_question(cls, item, question_type):
        """Build the result dict for one ``div.exam-list``, or None if it has no title."""
        title_div = item.find('div', class_='exam-list-title')
        if title_div is None:
            return None
        num_span = title_div.find('span', class_=None)
        num = num_span.get_text() if num_span is not None else ''
        return {
            '题型': question_type,
            '序号': num.replace('.', '').strip(),
            '题目': cls._clean_title(title_div.get_text(), num),
            '选项': cls._parse_options(item),
        }

    def save_result(self, result_file_path='./试题.xlsx'):
        """Write ``self.result`` to an Excel file, one row per question.

        Args:
            result_file_path: Destination workbook path; defaults to the
                original hard-coded location.
        """
        df = pd.DataFrame(self.result, columns=['题型', '序号', '题目', '选项'])
        df.to_excel(result_file_path, index=False)


if __name__ == '__main__':
    HTML_FILE_PATH = './html/练习.html'

    htmlUtil = HTMLUtil()

    # 1. Load the HTML content from the local file.
    htmlUtil.get_html_content_from_file(HTML_FILE_PATH)

    # The loader reports failures on stdout and leaves html_content as None;
    # stop here instead of handing None to the parser.
    if htmlUtil.html_content is None:
        raise SystemExit(1)

    # 2. Parse the HTML content into question records.
    htmlUtil.parse_html_content()

    # 3. Save the parsed records to the Excel workbook.
    htmlUtil.save_result()
# posted @ 2025-04-24 14:03  Steven0325  阅读(30)  评论(0)    收藏  举报