解析docx文档中XML段落的个数(Python实现)

# -*- coding:utf-8 -*-
'''
anslysis_doc.py

Count the XML tag paragraphs contained in Word (.docx) documents.

Steps:
    1. Open each document with python-docx.
    2. Find the first paragraph that consists of a single upper-case
       tag, e.g. <ACL>.
    3. Count that paragraph plus every later paragraph that starts with
       the same tag.
    4. Print the count per document and the grand total.
'''
import os
import re
import sys

from docx import Document

# A paragraph made up of exactly one upper-case tag, e.g. <ACL>.
_TAG_PATTERN = re.compile(r'^<[A-Z]+>$')

# Directory scanned when no directory is given on the command line.
DEFAULT_DOC_DIR = 'D:\\workspace_py\\FILES'


def _count_tag_paragraphs(texts):
    '''Count the paragraphs that open with the first stand-alone tag in texts.

    :param texts: iterable of paragraph texts, in document order
    :return: 0 when no paragraph is a stand-alone tag; otherwise 1 (the
             first tag paragraph) plus the number of later paragraphs
             that start with that same tag
    '''
    tag = None
    count = 0
    for text in texts:
        if tag is None:
            # Still searching for the first stand-alone tag, e.g. <ACL>.
            found = _TAG_PATTERN.match(text)
            if found:
                tag = found.group(0)
                count += 1
        elif text.startswith(tag):
            # The tag holds only '<', 'A'-'Z' and '>', so a plain prefix
            # test is equivalent to using it as a regex with re.match().
            count += 1
    return count


def get_xml_count(path):
    '''Return the number of XML tag paragraphs in one document.

    :param path: path of the .docx document
    :return: number of paragraphs that start with the first stand-alone
             upper-case tag found in the document (0 if there is none)
    '''
    # print('doc file: %s' % path)
    doc = Document(path)
    return _count_tag_paragraphs(paragraph.text for paragraph in doc.paragraphs)


def _list_docx_files(path_dir):
    '''Return the sorted paths of the .docx files directly inside path_dir.'''
    paths = []
    for name in sorted(os.listdir(path_dir)):
        # '~$...' entries are the lock files Word keeps next to an open
        # document; they are not valid documents and cannot be parsed.
        if name.startswith('~$') or not name.lower().endswith('.docx'):
            continue
        full_path = os.path.join(path_dir, name)
        if os.path.isfile(full_path):
            paths.append(full_path)
    return paths


def main(path_dir=DEFAULT_DOC_DIR):
    '''Print the tag count of every .docx file in path_dir and the total.

    :param path_dir: directory containing the documents
    :return: sum of the counts of all processed documents
    '''
    total = 0
    for path in _list_docx_files(path_dir):
        count = get_xml_count(path)
        print('%s文件中,符合条件XML个数: %s' % (path, count))
        total += count
    print('总的符合条件XML个数:%s' % total)
    return total


if __name__ == '__main__':
    # An optional first command-line argument overrides the default directory.
    main(sys.argv[1] if len(sys.argv) > 1 else DEFAULT_DOC_DIR)

 

posted @ 2022-05-27 11:56  宇宙刘  阅读(66)  评论(0)    收藏  举报