"""Parse the headings of a Word document into a tree and LlamaIndex nodes."""

import json
import re
from collections import defaultdict
from typing import List, Dict, Any, Optional

from docx import Document
from llama_index.core.schema import Document as LlamaDocument, TextNode
from llama_index.core.node_parser import NodeParser
from llama_index.core import VectorStoreIndex


class DocxTitleParser:
    """Word document heading parser that builds a tree-shaped node structure.

    Typical use: call :meth:`extract_headings` once, then any of the
    ``generate_*`` / ``build_*`` / ``convert_*`` / ``create_*`` methods, all
    of which read the ``positions`` and ``contents`` lists filled by the
    extraction step.
    """

    # Heading style names such as "Heading 1" or "Heading1"; the captured
    # group is the heading level. Styles that merely start with "Heading"
    # but carry no level number do not match.
    _HEADING_STYLE = re.compile(r'^Heading\s*([1-9]\d*)$')

    def __init__(self, file_path: str):
        """Remember the document path; nothing is read until extraction.

        Args:
            file_path: Path of the ``.docx`` file to parse.
        """
        self.file_path = file_path
        self.positions: List[int] = []   # level per entry; 0 = body paragraph
        self.contents: List[str] = []    # text of each entry
        self.paragraphs: List[Any] = []  # original paragraph objects

    def extract_headings(self) -> None:
        """Extract the headings and non-empty body paragraphs of the document.

        Fills ``positions``, ``contents`` and ``paragraphs`` in document
        order. Headings are recorded with their level (>= 1) even when their
        text is empty; body paragraphs are recorded with level 0 and skipped
        when they contain only whitespace.
        """
        # Start from a clean slate so a second call does not duplicate entries.
        self.positions = []
        self.contents = []
        self.paragraphs = []

        doc = Document(self.file_path)
        for paragraph in doc.paragraphs:
            style = paragraph.style
            style_name = (style.name if style is not None else None) or ''
            heading = self._HEADING_STYLE.match(style_name)

            if heading:
                level = int(heading.group(1))
            elif paragraph.text.strip():
                level = 0  # non-empty body paragraph
            else:
                continue   # blank paragraph: nothing to record

            self.positions.append(level)
            self.contents.append(paragraph.text)
            self.paragraphs.append(paragraph)

    def generate_hierarchical_keys(self) -> List[str]:
        """Generate hierarchical keys such as ``1``, ``1-1``, ``1-p1``, ``2``.

        Headings are numbered among their sibling headings; body paragraphs
        are numbered separately with a ``p`` prefix under the nearest
        preceding heading (or at the root when no heading precedes them).

        Returns:
            One key per entry of ``positions``, in the same order. An empty
            list when nothing has been extracted.
        """
        keys: List[str] = []
        # Separate counters keep paragraph numbering from consuming heading
        # numbers (a paragraph under "1" must not turn the next heading
        # into "1-2").
        heading_counts: Dict[str, int] = defaultdict(int)
        paragraph_counts: Dict[str, int] = defaultdict(int)
        stack: List[tuple] = []  # (level, key) of the currently open headings

        for level in self.positions:
            if level > 0:
                # Only headings close previously opened headings; a body
                # paragraph (level 0) must leave the path untouched.
                while stack and stack[-1][0] >= level:
                    stack.pop()

            parent_key = stack[-1][1] if stack else None
            scope = parent_key if parent_key is not None else 'root'

            if level > 0:
                heading_counts[scope] += 1
                number = heading_counts[scope]
                key = str(number) if parent_key is None else f"{parent_key}-{number}"
                stack.append((level, key))
            else:
                paragraph_counts[scope] += 1
                number = paragraph_counts[scope]
                key = f"p{number}" if parent_key is None else f"{parent_key}-p{number}"

            keys.append(key)

        return keys

    def build_tree_structure(self) -> List[Dict[str, Any]]:
        """Build the nested tree of headings and paragraphs.

        Returns:
            The list of root nodes. Every node is a dict with the keys
            ``id``, ``level``, ``key``, ``content``, ``children`` and
            ``is_heading``; ``children`` holds nested nodes of the same
            shape.
        """
        keys = self.generate_hierarchical_keys()

        nodes = [
            {
                'id': index,
                'level': level,
                'key': key,
                'content': content,
                'children': [],
                'is_heading': level > 0,
            }
            for index, (level, content, key) in enumerate(
                zip(self.positions, self.contents, keys)
            )
        ]

        root_nodes: List[Dict[str, Any]] = []
        open_headings: List[Dict[str, Any]] = []

        for node in nodes:
            if node['is_heading']:
                # A heading closes every open heading of the same or a
                # deeper level. Paragraphs never close anything, otherwise
                # they would detach themselves and all following
                # sub-headings from the current section.
                while open_headings and open_headings[-1]['level'] >= node['level']:
                    open_headings.pop()

            siblings = open_headings[-1]['children'] if open_headings else root_nodes
            siblings.append(node)

            if node['is_heading']:
                open_headings.append(node)

        return root_nodes

    def convert_to_llamaindex_nodes(self) -> List[TextNode]:
        """Convert the tree into a flat, depth-first list of ``TextNode``.

        Each node carries ``node_type``, ``hierarchy_level``,
        ``hierarchy_key`` and ``source_file`` metadata; nodes with a parent
        also carry ``parent_key`` and have the parent heading prepended to
        their text as context. Heading nodes additionally store their own
        title under ``title_context``.

        Returns:
            The created nodes in pre-order (parent before its children).
        """
        tree_structure = self.build_tree_structure()
        llama_nodes: List[TextNode] = []

        def process_node(node: Dict[str, Any],
                         parent_node: Optional[TextNode] = None) -> None:
            """Create the TextNode for ``node``, then recurse into children."""
            metadata = {
                'node_type': 'heading' if node['is_heading'] else 'paragraph',
                'hierarchy_level': node['level'],
                'hierarchy_key': node['key'],
                'source_file': self.file_path,
            }

            if node['is_heading']:
                node_text = f"【标题{node['level']}】{node['content']}"
            else:
                node_text = node['content']

            if parent_node:
                metadata['parent_key'] = parent_node.metadata.get('hierarchy_key', 'root')
                # Prefix the parent heading so the node keeps its context
                # when it is retrieved on its own.
                parent_title = parent_node.metadata.get('title_context', '')
                if parent_title:
                    node_text = f"上下文:{parent_title}\n{node_text}"

            llama_node = TextNode(text=node_text, metadata=metadata)

            # Must be set before recursing: children read it from the parent.
            if node['is_heading']:
                llama_node.metadata['title_context'] = node['content']

            llama_nodes.append(llama_node)

            for child in node['children']:
                process_node(child, llama_node)

        for root_node in tree_structure:
            process_node(root_node)

        return llama_nodes

    def create_document_with_hierarchy(self) -> LlamaDocument:
        """Create a single Document holding the whole hierarchy.

        The text is the tree rendered with one indent step per depth level
        and headings prefixed by ``## ``. The metadata holds the source
        file, heading/paragraph counts and the tree serialized as JSON.

        Returns:
            The assembled LlamaIndex document.
        """
        tree_structure = self.build_tree_structure()

        def tree_to_text(nodes: List[Dict[str, Any]], depth: int = 0) -> str:
            """Render ``nodes`` and their descendants as indented text."""
            indent = " " * depth
            text_parts = []
            for node in nodes:
                if node['is_heading']:
                    text_parts.append(f"{indent}## {node['content']}")
                else:
                    text_parts.append(f"{indent}{node['content']}")
                if node['children']:
                    text_parts.append(tree_to_text(node['children'], depth + 1))
            return "\n".join(text_parts)

        full_text = tree_to_text(tree_structure)

        metadata = {
            'source_file': self.file_path,
            'total_headings': sum(1 for level in self.positions if level > 0),
            'total_paragraphs': sum(1 for level in self.positions if level == 0),
            'tree_structure': json.dumps(tree_structure, ensure_ascii=False, indent=2),
        }

        # The serialized tree repeats the entire document text, so keep it
        # out of the text sent to the embedding model and the LLM; it stays
        # available in ``metadata`` for programmatic use.
        return LlamaDocument(
            text=full_text,
            metadata=metadata,
            excluded_embed_metadata_keys=['tree_structure'],
            excluded_llm_metadata_keys=['tree_structure'],
        )


def main() -> Dict[str, Any]:
    """Usage example: parse ``t.docx``, print the structure, build an index.

    Returns:
        A dict with the ``parser``, the ``llama_nodes``, the vector
        ``index`` and the ``full_document``.
    """
    parser = DocxTitleParser('t.docx')

    parser.extract_headings()

    print("=== 文档结构分析 ===")
    print(f"找到 {len(parser.positions)} 个段落/标题")
    print(f"标题级别分布: {parser.positions}")

    keys = parser.generate_hierarchical_keys()
    print("\n=== 层次化键名 ===")
    for i, (level, content, key) in enumerate(zip(parser.positions, parser.contents, keys)):
        node_type = "标题" if level > 0 else "段落"
        print(f"{i:2d}. [{node_type} L{level}] {key:8} | {content[:30]}...")

    tree = parser.build_tree_structure()
    print("\n=== 树形结构 ===")
    print(json.dumps(tree, ensure_ascii=False, indent=2))

    print("\n=== 转换为LlamaIndex Nodes ===")
    llama_nodes = parser.convert_to_llamaindex_nodes()

    for i, node in enumerate(llama_nodes):
        print(f"\n--- Node {i+1} ---")
        print(f"类型: {node.metadata['node_type']}")
        print(f"层级: {node.metadata.get('hierarchy_key', 'N/A')}")
        print(f"内容: {node.text[:50]}...")
        print(f"元数据: {node.metadata}")

    print("\n=== 创建向量索引 ===")
    index = VectorStoreIndex(llama_nodes)

    # The whole document can also be produced as a single Document.
    full_document = parser.create_document_with_hierarchy()
    print(f"\n完整文档文本长度: {len(full_document.text)}")
    print(f"完整文档元数据: {full_document.metadata.keys()}")

    return {
        'parser': parser,
        'llama_nodes': llama_nodes,
        'index': index,
        'full_document': full_document,
    }


if __name__ == "__main__":
    result = main()
# Illustrative shape of the metadata attached to a node. The values are
# samples; 'heading/paragraph' presumably stands for "either 'heading' or
# 'paragraph'", the two node types the parser emits.
metadata = dict(
    node_type='heading/paragraph',
    hierarchy_level=2,
    hierarchy_key='1-2-1',
    source_file='t.docx',
    parent_key='1-2',  # key of the parent node
)
# Illustrative example of the resulting nodes and their hierarchy keys:
#   Node 1: 【标题1】第一章 介绍 (key: "1")
#   Node 2: 上下文:第一章 介绍\n这是第一章的介绍内容... (key: "1-p1")
#   Node 3: 【标题2】1.1 背景 (key: "1-1")
#   Node 4: 上下文:1.1 背景\n背景描述... (key: "1-1-p1")
#   Node 5: 【标题2】1.2 目标 (key: "1-2")
#   Node 6: 上下文:1.2 目标\n目标描述... (key: "1-2-p1")
浙公网安备 33010602011771号