# -*- coding:utf-8 -*-
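'''
anslysis_doc.py
Purpose: count the XML tag paragraphs in Word documents.
Steps:
    1. Open each document in the target directory with python-docx.
    2. Find the first paragraph that is exactly an upper-case tag, e.g. <ACL>.
    3. Count that paragraph plus every later paragraph starting with the same tag.
    4. Print the count per file and the total over all files.
'''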
import os
import re
from docx import Document
def get_xml_count(path):
    '''
    Count the XML tag paragraphs in a Word document.

    The first paragraph whose text is exactly one upper-case tag (for
    example ``<ACL>``) fixes the tag to look for. That paragraph and every
    later paragraph whose text begins with the same tag are counted.

    :param path: path of the Word document to open with ``Document``
    :return: number of counted paragraphs; 0 when no tag paragraph is found
    '''
    # One shared iterator: the search phase consumes paragraphs up to and
    # including the first tag, the counting phase continues from there.
    texts = (paragraph.text for paragraph in Document(path).paragraphs)

    first_tag = None
    for text in texts:
        opening = re.match(r'^<[A-Z]+>$', text)
        if opening:
            first_tag = opening.group(0)
            break

    if first_tag is None:
        return 0

    # The tag itself is used as the pattern; it only contains '<', 'A'-'Z'
    # and '>', so re.match() acts as a "starts with this tag" check.
    following = sum(1 for text in texts if re.match(first_tag, text))
    return 1 + following
if __name__ == '__main__':
    # Script entry point: count the XML paragraphs of every .docx file in
    # path_dir, print one line per file and the grand total at the end.
    # To check a single document, call get_xml_count() with its path.
    path_dir = 'D:\\workspace_py\\FILES'
    total = 0  # named "total" so the builtin sum() is not shadowed
    for file_name in os.listdir(path_dir):
        # os.path.join avoids mixing '/' into a backslash-separated path.
        path = os.path.join(path_dir, file_name)
        # Document() expects a .docx file. Skip sub-directories, other file
        # types and Word's "~$" lock files so that one stray directory entry
        # does not abort the whole run.
        if not os.path.isfile(path):
            continue
        if file_name.startswith('~$') or not file_name.lower().endswith('.docx'):
            continue
        count = get_xml_count(path)
        print('%s文件中,符合条件XML个数: %s' % (path, count))
        total += count
    print('总的符合条件XML个数:%s' % total)