法三、根据20250901 - 1-单版本（删解析和答案）.py里的代码优化。

指令：
根据以下代码和上面上传的 Word 文档优化代码，使其在处理该 Word 文档时不会删除题目类型（如“一、单项选择题”）。代码如下：【】

`import os
import re
from docx import Document
from docx.shared import Inches, Pt
from docx.enum.section import WD_SECTION
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT

# Section headings such as "一、单项选择题" or "十一、计算题".
_SECTION_HEADING_RE = re.compile(r'^[一二三四五六七八九十]+、')
# Numbered question stems such as "1．" or "12.".
_QUESTION_START_RE = re.compile(r'^\d+[．\.]')
# An option label ("A．" ... "D.") appearing anywhere inside a question stem.
_INLINE_OPTION_RE = re.compile(r'[A-D][．\.]')
# Prefixes of a paragraph that holds an option of a multiple-choice question.
_OPTION_PREFIXES = ('A．', 'B．', 'C．', 'D．')
# Paragraph prefixes that open an explanation / an answer (full- or half-width colon).
_EXPLANATION_PREFIXES = ('解析：', '解析:')
_ANSWER_PREFIXES = ('答案：', '答案:')


def _find_removable_indices(texts):
    """Return the indices of the paragraphs that must be deleted.

    Args:
        texts: Stripped text of every body paragraph, in document order.

    Returns:
        A set of indices into *texts*.  For multiple-choice questions only the
        explanation run is selected; for other questions both the explanation
        and the answer runs are selected.  Section headings, question stems and
        blank paragraphs are never selected.
    """
    removable = set()
    kind = None        # 'choice', 'non_choice', or None before the first question
    dropping = False   # True while inside a run of paragraphs being deleted

    for index, text in enumerate(texts):
        if not text:
            # A blank paragraph ends a multiple-choice explanation; a
            # non-choice answer run continues across blank paragraphs.
            if kind == 'choice':
                dropping = False
            continue

        if _SECTION_HEADING_RE.match(text):
            # Headings always end the current run and are always kept.
            dropping = False
            # Test "非选择题" first: it also contains the substring "选择题".
            if '非选择题' in text:
                kind = 'non_choice'
            elif '选择题' in text:
                kind = 'choice'
            continue

        if _QUESTION_START_RE.match(text):
            dropping = False
            following = texts[index + 1] if index + 1 < len(texts) else ''
            # A question counts as multiple-choice when its stem carries an
            # option label or the very next paragraph is an option.
            has_options = (
                _INLINE_OPTION_RE.search(text) is not None
                or following.startswith(_OPTION_PREFIXES)
            )
            kind = 'choice' if has_options else 'non_choice'
            continue

        if kind == 'choice':
            if text.startswith(_EXPLANATION_PREFIXES):
                dropping = True
                removable.add(index)
            elif dropping:
                # The answer of a multiple-choice question is kept, so both an
                # option line and an answer line terminate the explanation.
                if text.startswith(_OPTION_PREFIXES) or text.startswith(_ANSWER_PREFIXES):
                    dropping = False
                else:
                    removable.add(index)
        elif kind == 'non_choice':
            if text.startswith(_EXPLANATION_PREFIXES) or text.startswith(_ANSWER_PREFIXES):
                dropping = True
            if dropping:
                removable.add(index)

    return removable


def _set_a4_page_size(doc):
    """Set every section of *doc* to portrait A4 (210 mm x 297 mm)."""
    # Local import: the file-level import only brings in Inches and Pt.
    from docx.shared import Mm

    for section in doc.sections:
        section.page_width = Mm(210)
        section.page_height = Mm(297)


def process_docx_file(input_path, output_path):
    """Strip explanations (and non-choice answers) from one Word document.

    The document at *input_path* is loaded, every section is resized to A4,
    explanation paragraphs of multiple-choice questions and explanation/answer
    paragraphs of all other questions are removed, tables are cleaned through
    ``process_tables`` and the result is written to *output_path*.

    Args:
        input_path: Path of the .docx file to read.
        output_path: Path the processed document is saved to.

    Returns:
        True when the document was processed and saved, False when any step
        raised; the error is printed rather than propagated so that a batch
        run can continue with the next file.
    """
    try:
        doc = Document(input_path)
        _set_a4_page_size(doc)

        # Snapshot the paragraphs once: ``doc.paragraphs`` rebuilds its list on
        # every access, and holding the objects keeps deletions independent of
        # index shifts.
        paragraphs = list(doc.paragraphs)
        texts = [paragraph.text.strip() for paragraph in paragraphs]

        for index in _find_removable_indices(texts):
            element = paragraphs[index]._element
            parent = element.getparent()
            if parent is not None:
                parent.remove(element)

        # Explanations and answers stored inside tables.
        process_tables(doc)

        doc.save(output_path)
        print(f"✓ 成功处理: {os.path.basename(input_path)}")
        return True

    except Exception as e:
        # Per-file boundary of the batch job: report the failure and move on.
        print(f"✗ 处理文件 {os.path.basename(input_path)} 时出错: {str(e)}")
        return False

def process_tables(doc):
    """Remove explanation and answer paragraphs from the tables of *doc*.

    Every paragraph of every cell whose stripped text starts with "解析" or
    "答案" followed by a full- or half-width colon is deleted.  When that would
    leave a cell without any paragraph, the last such paragraph is emptied
    instead of deleted, because Word requires each table cell to contain a
    paragraph.

    Args:
        doc: A python-docx document object; only ``doc.tables`` is read.
    """
    marker_prefixes = ('解析：', '解析:', '答案：', '答案:')

    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                # Snapshot once; ``cell.paragraphs`` is rebuilt on every access.
                cell_paragraphs = list(cell.paragraphs)
                doomed = [
                    paragraph
                    for paragraph in cell_paragraphs
                    if paragraph.text.strip().startswith(marker_prefixes)
                ]
                if not doomed:
                    continue

                if len(doomed) == len(cell_paragraphs):
                    # Deleting everything would produce an empty cell, which
                    # Word rejects; keep the last paragraph but drop its content.
                    doomed.pop().clear()

                for paragraph in doomed:
                    element = paragraph._element
                    parent = element.getparent()
                    if parent is not None:
                        parent.remove(element)

def process_all_docx_files(input_dir='test', output_dir='out'):
    """Process every .docx file found directly inside *input_dir*.

    Each file is passed to ``process_docx_file`` and written under the same
    name into *output_dir*.  Word lock files (names starting with "~$") are
    skipped, the extension check ignores case, and files are handled in sorted
    order.  The output directory is created only once there is something to
    process.

    Args:
        input_dir: Directory holding the source documents.
        output_dir: Directory receiving the processed documents.

    Returns:
        None.  Progress and a final summary are printed.
    """
    # Validate the input before touching the file system.
    if not os.path.isdir(input_dir):
        print(f"错误: 输入目录 '{input_dir}' 不存在")
        return

    docx_files = sorted(
        entry
        for entry in os.listdir(input_dir)
        # "~$name.docx" is the lock file Word keeps next to an open document;
        # it is not a real document and cannot be opened.
        if entry.lower().endswith('.docx') and not entry.startswith('~$')
    )

    if not docx_files:
        print(f"在目录 '{input_dir}' 中未找到docx文件")
        return

    if not os.path.isdir(output_dir):
        os.makedirs(output_dir, exist_ok=True)
        print(f"创建输出目录: {output_dir}")

    print(f"找到 {len(docx_files)} 个docx文件，开始处理...")
    print("-" * 50)

    success_count = 0
    for filename in docx_files:
        input_path = os.path.join(input_dir, filename)
        output_path = os.path.join(output_dir, filename)

        if process_docx_file(input_path, output_path):
            success_count += 1

    print("-" * 50)
    print(f"处理完成！成功处理 {success_count}/{len(docx_files)} 个文件")
    print(f"输出文件保存在: {os.path.abspath(output_dir)}")

def install_requirements():
    """Check that python-docx can be imported and install it with pip if not.

    pip is run through the interpreter executing this script, so the package
    lands in the environment that will actually import it.  The outcome of the
    installation is reported from pip's exit code.

    Returns:
        None.  Status messages are printed.
    """
    import subprocess
    import sys

    try:
        from docx import Document  # noqa: F401 - imported only as an availability probe
    except ImportError:
        print("正在安装 python-docx...")
        # Argument list (no shell) and ``sys.executable -m pip`` instead of a
        # bare "pip", which may belong to a different Python installation.
        result = subprocess.run(
            [sys.executable, "-m", "pip", "install", "python-docx"],
            check=False,
        )
        if result.returncode == 0:
            print("安装完成")
        else:
            print(f"安装失败，pip 退出码: {result.returncode}")
    else:
        print("python-docx 已安装")

if name == "main":
# 检查依赖
install_requirements()

# 处理所有文件
process_all_docx_files()

# 使用说明
print("\n使用说明:")
print("1. 确保在脚本同级目录下有 'test' 文件夹")
print("2. 将要处理的docx文件放入 'test' 文件夹")
print("3. 运行此脚本，处理后的文件将保存在 'out' 文件夹")
print("4. 处理后的文档将保持原有字体格式，页面设置为A4大小")
print("5. 题目类型（如一、单项选择题）会被保留")
print("6. 选择题的解析和非选择题的解析与答案会被删除")`

结果：
主要优化点：

题目类型识别：增加了对"一、单项选择题"、"二、双项选择题"、"三、非选择题"等题目类型的识别和保护

边界条件处理：改进了解析和答案区域的结束条件判断，确保不会误删题目类型

表格处理：添加了process_tables()函数来处理表格中的解析和答案内容

更精确的匹配：使用更严格的正则表达式来匹配题目类型，避免误删

错误处理：增强了错误处理机制，确保程序稳定性

这个优化后的代码应该能够：

保留所有题目类型（如"一、单项选择题"）

删除选择题的解析部分

删除非选择题的解析和答案部分

保持原有的字体格式和页面设置

正确处理表格中的内容

posted @ 2025-09-04 23:29 itkong 阅读(6) 评论(0) 收藏举报

刷新页面返回顶部

itkong

法三、根据20250901 - 1-单版本（删解析和答案）.py里的代码优化。

公告