利用BeautifulSoup库解析HTML
利用BeautifulSoup解析HTML
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests
import pandas as pd
class HTMLUtil:
    """Load an exam HTML page, extract its questions and export them to Excel.

    Intended call order: one of the ``get_html_content_from_*`` loaders,
    then ``parse_html_content``, then ``save_result``.
    """

    # Literal substrings removed from every question title (they translate to
    # "report error" and "favorite"; presumably UI labels that live inside the
    # title node -- verify against the page markup).
    _TITLE_NOISE = ('纠错', '收藏')

    def __init__(self):
        # Raw HTML text; stays None until one of the loaders succeeds.
        self.html_content = None
        # One dict per parsed question, keyed by 题型 / 序号 / 题目 / 选项.
        self.result = []

    def get_html_content_from_file(self, html_file_path):
        """Read a local HTML file, decoded as UTF-8, into ``self.html_content``.

        A missing file or a decoding failure is reported with ``print`` and
        leaves ``self.html_content`` unchanged.

        Args:
            html_file_path: Path of the HTML file to read.
        """
        try:
            with open(html_file_path, 'r', encoding='utf-8') as f:
                self.html_content = f.read()
        except FileNotFoundError:
            print('文件未找到')
        except UnicodeDecodeError:
            print('解码文件时出错,请检查编码')

    def get_html_content_from_internet(self, url, timeout=10):
        """Download ``url`` and store the response text in ``self.html_content``.

        Request errors and non-200 responses are reported with ``print`` and
        leave ``self.html_content`` unchanged.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server before giving up; without
                a timeout ``requests`` may block indefinitely.
        """
        try:
            resp = requests.get(url, timeout=timeout)
        except requests.RequestException as e:
            print(f'请求发生错误: {e}')
            return
        if resp.status_code == 200:
            self.html_content = resp.text
        else:
            print(f'网络请求失败,状态码: {resp.status_code}')

    def parse_html_content(self):
        """Parse ``self.html_content`` and append one dict per question to ``self.result``.

        Every ``div.exam-mask-list`` is treated as a section whose
        ``div.exam-mask-title`` text is the question type; every
        ``div.exam-list`` inside it is one question. Results accumulate
        across calls. When no HTML has been loaded a message is printed and
        nothing is parsed.
        """
        if self.html_content is None:
            # The loaders only print on failure, so this state is reachable.
            print('HTML内容为空,无法解析')
            return
        soup = BeautifulSoup(self.html_content, 'html.parser')
        for section in soup.find_all('div', class_='exam-mask-list'):
            type_node = section.find('div', class_='exam-mask-title')
            # A section without a title keeps its questions, with an empty type.
            question_type = type_node.get_text() if type_node is not None else ''
            for item in section.find_all('div', class_='exam-list'):
                question = self._parse_question(item, question_type)
                if question is not None:
                    self.result.append(question)

    def _parse_question(self, item, question_type):
        """Build the result dict for one ``div.exam-list`` node, or None if it has no title node."""
        title_node = item.find('div', class_='exam-list-title')
        if title_node is None:
            return None
        all_text = title_node.get_text()
        # class_=None selects the first <span> that carries no class attribute;
        # its text is used as the question number.
        num_node = title_node.find('span', class_=None)
        num = num_node.get_text() if num_node is not None else ''
        # Drop only the first occurrence of the number text so that the same
        # characters appearing later inside the title are preserved.
        title = all_text.replace(num, '', 1)
        for noise in self._TITLE_NOISE:
            title = title.replace(noise, '')
        title = title.strip()

        option_dict = {}
        for option in item.find_all('div', class_='exam-list-point'):
            label_node = option.find('span', class_='label-italic')
            value_node = option.find('span', class_=None)
            if label_node is None or value_node is None:
                # Skip a malformed option instead of aborting the whole parse.
                continue
            option_index = label_node.get_text().strip()
            option_dict[option_index] = value_node.get_text().replace(' ', '').strip()

        return {
            '题型': question_type,
            '序号': num.replace('.', '').strip(),
            '题目': title,
            '选项': option_dict,
        }

    def save_result(self, result_file_path='./试题.xlsx'):
        """Write ``self.result`` to an Excel file without the index column.

        Args:
            result_file_path: Destination path of the workbook.
        """
        df = pd.DataFrame(self.result, columns=['题型', '序号', '题目', '选项'])
        df.to_excel(result_file_path, index=False)
if __name__ == '__main__':
    # Script entry point: load a local HTML file, parse it, export to Excel.
    HTML_FILE_PATH = './html/练习.html'

    html_util = HTMLUtil()
    # 1. Load the HTML content
    html_util.get_html_content_from_file(HTML_FILE_PATH)
    if html_util.html_content is None:
        # The loader has already printed why it failed; stop here instead of
        # parsing a missing document.
        raise SystemExit(1)
    # 2. Parse the HTML content
    html_util.parse_html_content()
    # 3. Save the result
    html_util.save_result()
God will send the rain when you are ready. You need to prepare your field to receive it.