贝隆

  博客园  :: 首页  :: 新随笔  :: 联系 :: 订阅 订阅  :: 管理
import json
from collections import defaultdict
from typing import List, Dict, Any, Optional
from docx import Document

from llama_index.core.schema import Document as LlamaDocument, TextNode
from llama_index.core.node_parser import NodeParser
from llama_index.core import VectorStoreIndex

class DocxTitleParser:
    """Parse the heading structure of a Word document into a tree of nodes.

    Call :meth:`extract_headings` first; the other methods work on the three
    parallel lists it fills:

    * ``positions``  -- heading level per entry (``0`` marks a body paragraph)
    * ``contents``   -- text of each entry
    * ``paragraphs`` -- the original python-docx paragraph objects
    """

    # Paragraph style names that start with this prefix are heading candidates.
    _HEADING_PREFIX = 'Heading'

    def __init__(self, file_path: str):
        """Remember the document path; nothing is read until extraction."""
        self.file_path = file_path
        self.positions = []   # heading level of each entry, 0 = body paragraph
        self.contents = []    # text of each entry
        self.paragraphs = []  # original paragraph objects

    @classmethod
    def _heading_level(cls, style_name: Optional[str]) -> Optional[int]:
        """Return the heading level encoded in a style name, or ``None``.

        Accepts both ``"Heading 2"`` and ``"Heading2"``. A style that merely
        starts with the prefix but carries no positive integer suffix is not
        treated as a heading.
        """
        if not style_name or not style_name.startswith(cls._HEADING_PREFIX):
            return None
        suffix = style_name[len(cls._HEADING_PREFIX):].strip()
        if not suffix.isdecimal():
            return None
        level = int(suffix)
        return level if level > 0 else None

    def extract_headings(self) -> None:
        """Read the document and record every heading and non-empty paragraph.

        The parallel lists are rebuilt from scratch on each call, so calling
        this method again does not duplicate entries.
        """
        doc = Document(self.file_path)

        positions, contents, paragraphs = [], [], []
        for paragraph in doc.paragraphs:
            level = self._heading_level(paragraph.style.name)
            if level is None:
                if not paragraph.text.strip():
                    continue  # skip empty body paragraphs
                level = 0  # 0 marks a body paragraph
            positions.append(level)
            contents.append(paragraph.text)
            paragraphs.append(paragraph)

        # Swap in the new state only after the whole document was read.
        self.positions = positions
        self.contents = contents
        self.paragraphs = paragraphs

    def generate_hierarchical_keys(self) -> List[str]:
        """Generate one hierarchical key per entry.

        Headings are numbered within their parent (``1``, ``1-1``, ``1-2``,
        ``2`` ...). Body paragraphs are numbered separately within the heading
        they belong to (``1-p1``, ``1-1-p1`` ...); paragraphs that precede any
        heading get ``p1``, ``p2`` ...

        Returns:
            A list of keys aligned with ``self.positions``; empty when nothing
            has been extracted.
        """
        if not self.positions:
            return []

        keys = []
        # Separate counters so that paragraphs do not consume heading numbers.
        heading_counts = defaultdict(int)
        paragraph_counts = defaultdict(int)
        stack = []  # (level, key) of the currently open headings

        for level in self.positions:
            if level > 0:
                # Only a heading closes headings of the same or deeper level;
                # a body paragraph must never unwind the stack.
                while stack and stack[-1][0] >= level:
                    stack.pop()
                parent_key = stack[-1][1] if stack else ''
                heading_counts[parent_key] += 1
                number = heading_counts[parent_key]
                key = f"{parent_key}-{number}" if parent_key else str(number)
                stack.append((level, key))
            else:
                parent_key = stack[-1][1] if stack else ''
                paragraph_counts[parent_key] += 1
                number = paragraph_counts[parent_key]
                key = f"{parent_key}-p{number}" if parent_key else f"p{number}"
            keys.append(key)

        return keys

    def build_tree_structure(self) -> List[Dict[str, Any]]:
        """Build the nested tree of headings and paragraphs.

        Returns:
            The list of root nodes. Each node is a dict with the keys ``id``,
            ``level``, ``key``, ``content``, ``children`` and ``is_heading``.
            A body paragraph becomes a child of the most recent open heading.
        """
        keys = self.generate_hierarchical_keys()

        root_nodes = []
        stack = []  # currently open heading nodes, outermost first

        for i, (level, content, key) in enumerate(
            zip(self.positions, self.contents, keys)
        ):
            node = {
                'id': i,
                'level': level,
                'key': key,
                'content': content,
                'children': [],
                'is_heading': level > 0
            }

            if node['is_heading']:
                # Close headings at the same or a deeper level. Paragraphs
                # (level 0) skip this so they stay under the open heading.
                while stack and stack[-1]['level'] >= level:
                    stack.pop()

            siblings = stack[-1]['children'] if stack else root_nodes
            siblings.append(node)

            if node['is_heading']:
                stack.append(node)

        return root_nodes

    def convert_to_llamaindex_nodes(self) -> "List[TextNode]":
        """Flatten the tree (depth-first, document order) into ``TextNode``s.

        Each node's metadata carries ``node_type``, ``hierarchy_level``,
        ``hierarchy_key`` and ``source_file``; children also get
        ``parent_key``, and heading nodes get ``title_context``. When the
        parent is a heading, its title is prepended to the child's text.
        """
        tree_structure = self.build_tree_structure()
        llama_nodes = []

        def process_node(node: Dict[str, Any],
                         parent_node: "Optional[TextNode]" = None) -> None:
            """Convert one tree node, then recurse into its children."""
            metadata = {
                'node_type': 'heading' if node['is_heading'] else 'paragraph',
                'hierarchy_level': node['level'],
                'hierarchy_key': node['key'],
                'source_file': self.file_path
            }

            if node['is_heading']:
                node_text = f"【标题{node['level']}】{node['content']}"
            else:
                node_text = node['content']

            # Explicit None check: do not rely on the truthiness of a node.
            if parent_node is not None:
                metadata['parent_key'] = parent_node.metadata.get('hierarchy_key', 'root')
                # Use the parent heading's title as retrieval context.
                parent_title = parent_node.metadata.get('title_context', '')
                if parent_title:
                    node_text = f"上下文:{parent_title}\n{node_text}"

            # Headings expose their own title so children can use it as context.
            if node['is_heading']:
                metadata['title_context'] = node['content']

            llama_node = TextNode(
                text=node_text,
                metadata=metadata
            )
            llama_nodes.append(llama_node)

            for child in node['children']:
                process_node(child, llama_node)

        for root_node in tree_structure:
            process_node(root_node)

        return llama_nodes

    def create_document_with_hierarchy(self) -> "LlamaDocument":
        """Create a single Document holding the whole hierarchy.

        The text is the tree rendered with two spaces of indentation per depth
        and ``## `` in front of headings. The metadata holds the source file,
        heading/paragraph counts and the tree serialized as JSON.
        """
        tree_structure = self.build_tree_structure()

        def tree_to_text(nodes: List[Dict[str, Any]], depth: int = 0) -> str:
            """Render ``nodes`` and their descendants as indented text."""
            indent = "  " * depth
            text_parts = []
            for node in nodes:
                if node['is_heading']:
                    text_parts.append(f"{indent}## {node['content']}")
                else:
                    text_parts.append(f"{indent}{node['content']}")

                if node['children']:
                    text_parts.append(tree_to_text(node['children'], depth + 1))

            return "\n".join(text_parts)

        full_text = tree_to_text(tree_structure)

        metadata = {
            'source_file': self.file_path,
            'total_headings': sum(1 for p in self.positions if p > 0),
            'total_paragraphs': sum(1 for p in self.positions if p == 0),
            'tree_structure': json.dumps(tree_structure, ensure_ascii=False, indent=2)
        }

        return LlamaDocument(text=full_text, metadata=metadata)

def main():
    """Run the end-to-end usage example against ``t.docx``.

    Prints the extracted structure, the hierarchical keys, the tree as JSON
    and every converted node, then builds a vector index and a full document.

    Returns:
        A dict with the keys ``parser``, ``llama_nodes``, ``index`` and
        ``full_document``.
    """
    # Parse the document and collect headings/paragraphs.
    parser = DocxTitleParser('t.docx')
    parser.extract_headings()

    print("=== 文档结构分析 ===")
    print(f"找到 {len(parser.positions)} 个段落/标题")
    print(f"标题级别分布: {parser.positions}")

    # One line per entry: index, type, level, key and a text preview.
    hierarchy_keys = parser.generate_hierarchical_keys()
    print(f"\n=== 层次化键名 ===")
    rows = zip(parser.positions, parser.contents, hierarchy_keys)
    for i, (level, content, key) in enumerate(rows):
        if level > 0:
            node_type = "标题"
        else:
            node_type = "段落"
        print(f"{i:2d}. [{node_type} L{level}] {key:8} | {content[:30]}...")

    # Dump the nested tree as JSON.
    tree_structure = parser.build_tree_structure()
    print(f"\n=== 树形结构 ===")
    tree_json = json.dumps(tree_structure, ensure_ascii=False, indent=2)
    print(tree_json)

    # Convert to LlamaIndex nodes and show each one.
    print(f"\n=== 转换为LlamaIndex Nodes ===")
    llama_nodes = parser.convert_to_llamaindex_nodes()

    for i, node in enumerate(llama_nodes):
        print(f"\n--- Node {i+1} ---")
        print(f"类型: {node.metadata['node_type']}")
        print(f"层级: {node.metadata.get('hierarchy_key', 'N/A')}")
        print(f"内容: {node.text[:50]}...")
        print(f"元数据: {node.metadata}")

    # Build the vector index over the converted nodes.
    print(f"\n=== 创建向量索引 ===")
    vector_index = VectorStoreIndex(llama_nodes)

    # A single document with the complete hierarchy is available as well.
    full_document = parser.create_document_with_hierarchy()
    print(f"\n完整文档文本长度: {len(full_document.text)}")
    print(f"完整文档元数据: {full_document.metadata.keys()}")

    outcome = {
        'parser': parser,
        'llama_nodes': llama_nodes,
        'index': vector_index,
        'full_document': full_document
    }
    return outcome

# Run the usage example only when this file is executed as a script; the dict
# returned by main() is kept in the module-level name `result`.
if __name__ == "__main__":
    result = main()

# Illustrative example of the metadata dict that convert_to_llamaindex_nodes
# attaches to a node ('node_type' is either 'heading' or 'paragraph').
metadata = {
'node_type': 'heading/paragraph',
'hierarchy_level': 2,
'hierarchy_key': '1-2-1',
'source_file': 't.docx',
'parent_key': '1-2' # key of the parent node
}

 

Node 1: 【标题1】第一章 介绍 (key: "1")
Node 2: 上下文:第一章 介绍\n这是第一章的介绍内容... (key: "1-p1")  
Node 3: 【标题2】1.1 背景 (key: "1-1")
Node 4: 上下文:1.1 背景\n背景描述... (key: "1-1-p1")
Node 5: 【标题2】1.2 目标 (key: "1-2")
Node 6: 上下文:1.2 目标\n目标描述... (key: "1-2-p1")
posted on 2025-11-05 23:44  贝隆  阅读(5)  评论(0)    收藏  举报