python 读取word、pdf文件内容

import docx2txt
import fitz
import docx
from docx.oxml import parse_xml


def get_doc_content(filepath):
    """Extract the text of a Word (.docx) file, paragraphs and tables in order.

    Each paragraph contributes its text as one line. Each table is rendered
    one row per line with the cell texts joined by "|". A cell that spans
    several grid columns (a horizontal merge) is reported by python-docx once
    per column it covers; its text is emitted only for the first of those
    columns and the remaining ones are left empty.

    Args:
        filepath: Path of the .docx file to read.

    Returns:
        The extracted text as a single newline-joined string. If the
        structured extraction raises, the result of
        ``docx2txt.process(filepath)`` is returned instead.
    """
    try:
        doc = docx.Document(filepath)
        content = []
        # Walk the body's direct children so paragraphs and tables are
        # collected in the order they appear; other child kinds are skipped.
        for element in doc.element.body:
            kind = type(element).__name__
            if kind == 'CT_P':  # paragraph
                # The element is already a paragraph element, so it can be
                # wrapped directly without serialising and re-parsing its XML.
                paragraph = docx.text.paragraph.Paragraph(element, parent=None)
                content.append(paragraph.text)
            elif kind == 'CT_Tbl':  # table
                table = docx.table.Table(element, parent=None)
                table_texts = []
                for row in table.rows:
                    row_texts = []
                    previous_tc = None
                    for cell in row.cells:
                        # Detect a merged cell by the identity of its
                        # underlying <w:tc> element rather than by its text,
                        # so that two distinct cells which merely hold the
                        # same text are both kept.
                        tc = cell._tc
                        row_texts.append("" if tc is previous_tc else cell.text)
                        previous_tc = tc
                    table_texts.append("|".join(row_texts))
                content.append("\n".join(table_texts))
        content = "\n".join(content)
    except Exception:
        # Best-effort fallback: let docx2txt try the file when the
        # structured walk above fails for any reason.
        content = docx2txt.process(filepath)
    return content


def get_pdf_content(filepath):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        filepath: Path of the PDF file, opened with ``fitz.Document``.

    Returns:
        One string made of each page's ``get_text()`` output. No separator is
        inserted between pages; a document without pages yields "".
    """
    with fitz.Document(filepath) as doc:
        # str.join assembles the result in one pass instead of re-copying the
        # accumulated text on every page as repeated += would.
        return "".join(page.get_text() for page in doc)


def get_file_content(filepath):
    """Read the text of a .docx, .pdf or .txt file, chosen by its extension.

    The extension is matched case-insensitively, and ``filepath`` may be a
    ``str`` or any ``os.PathLike`` object. Plain-text files are decoded as
    UTF-8.

    Args:
        filepath: Path of the file to read.

    Returns:
        The file's text, or ``None`` when the extension is not one of the
        supported ones or when reading fails (missing file, undecodable
        text, parser error, unusable path value). Failures are reported as
        ``None`` rather than raised.
    """
    import os  # local import so the function does not rely on file-level imports

    try:
        path = os.fspath(filepath)
        lowered = path.lower()  # so ".PDF" / ".Docx" / ".TXT" are recognised too
        if lowered.endswith(".docx"):
            return get_doc_content(path)
        if lowered.endswith(".pdf"):
            return get_pdf_content(path)
        if lowered.endswith(".txt"):
            with open(path, "r", encoding="utf-8") as f:
                return f.read()
    except Exception:
        # Best-effort contract: any read or parse error becomes None. Catching
        # Exception (not a bare except) lets KeyboardInterrupt and SystemExit
        # propagate instead of being silently swallowed.
        return None
    # Unsupported extension.
    return None
posted @ 2025-01-22 11:41  二月雪  阅读(100)  评论(0)    收藏  举报