Python处理文档中的文本
- 提取所有 Word 文档中的指定部分内容,并合并到一个 txt 文件中:
import glob
import docx
def get_text_between_headings(doc, heading1, heading2):
    """Return the text of the paragraphs lying between two heading paragraphs.

    Args:
        doc: An object exposing ``paragraphs``, a sequence of items that each
            have a ``text`` attribute (e.g. a python-docx ``Document``).
        heading1: Exact text of the paragraph that opens the section.
        heading2: Exact text of the paragraph that closes the section.

    Returns:
        The texts of the paragraphs strictly between ``heading1`` and the
        first ``heading2`` that follows it, joined with newlines. If
        ``heading1`` occurs several times before that ``heading2``, the last
        occurrence is used. Returns an empty string when ``heading1`` is
        absent or no ``heading2`` follows it.
    """
    # Read the paragraph texts once instead of re-reading doc.paragraphs on
    # every index access.
    texts = [paragraph.text for paragraph in doc.paragraphs]
    start = None
    for i, text in enumerate(texts):
        if text == heading1:
            start = i
        elif start is not None and text == heading2:
            # Only a closing heading that comes after the opening heading
            # counts; an earlier one (e.g. in a table of contents) is ignored.
            return '\n'.join(texts[start + 1:i])
    # Opening heading missing, or never closed: nothing to extract.
    return ''
# Collect the Word files in the current directory. The "[!~]" prefix skips
# the "~$name.docx" owner files Word creates while a document is open (they
# are not valid .docx packages). Sorting makes the merge order deterministic,
# because glob returns matches in arbitrary order.
file_paths = sorted(glob.glob('./[!~]*.docx'))
# Create the merged text file.
with open('./output.txt', 'w', encoding='utf-8') as f:
    # Extract the section between the two headings from every document.
    for file_path in file_paths:
        doc = docx.Document(file_path)
        fruit = get_text_between_headings(doc, "Done", "Introspection")
        # Skip documents with nothing to extract, and extracts that begin
        # with a figure caption or a bracketed marker.
        if not fruit or fruit.startswith(('Figure', '[')):
            continue
        # Terminate each extract with a newline so the last line of one
        # document does not run into the first line of the next.
        f.write(fruit + '\n')
- 给txt文档中的所有空行按顺序添加指定内容:
# Read the merged text. The encoding is stated explicitly because output.txt
# is written as UTF-8, while open() otherwise falls back to the platform
# default encoding, which may differ.
with open('output.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()
week_no = 0
with open('processed_file.txt', 'w', encoding='utf-8') as file:
    for line in lines:
        if not line.strip():  # blank (empty or whitespace-only) line
            week_no += 1
            # Prefix the blank line with its running label; the line's own
            # newline, written below, terminates it.
            file.write(f'Week {week_no}')
        file.write(line)
作者:艾孜尔江
转载请务必标明出处!
本文来自博客园,作者:艾孜尔江,转载请注明原文链接:https://www.cnblogs.com/ezhar/p/17381815.html

 
                
            
         
浙公网安备 33010602011771号