#-*- coding:gbk -*-
import os
import docx
from win32com import client as wc
import xlwt
import xlsxwriter
# Collect every file under the `filepath` folder, including sub-folders.
def getfilelist(filepath):
    """Return the paths of all files found under ``filepath``, recursively.

    Entries are visited in the order ``os.listdir`` reports them; when an
    entry is a directory, the files found beneath it are spliced in at that
    position.

    Args:
        filepath: Directory to scan.

    Returns:
        A list of file paths, each formed by joining ``filepath`` with the
        names found below it. Empty when the directory holds no files.

    Raises:
        OSError: Propagated from ``os.listdir`` when a directory cannot be
            listed.
    """
    files = []
    for name in os.listdir(filepath):
        # os.path.join uses the platform separator, so the isdir() check and
        # the recursion work everywhere instead of depending on a literal
        # backslash being a valid separator.
        child = os.path.join(filepath, name)
        if os.path.isdir(child):
            files.extend(getfilelist(child))
        else:
            files.append(child)
    return files
# Extract the text of a Word (.docx) file.
def getDocx(fileName):
    """Return the text that ``docx.getdocumenttext`` extracts from ``fileName``.

    The file is opened with ``docx.opendocx`` and the resulting document is
    handed straight to ``docx.getdocumenttext``.

    Args:
        fileName: Path of the .docx file to read.

    Returns:
        Whatever ``docx.getdocumenttext`` returns for the opened document
        (presumably a sequence of paragraph strings -- confirm against the
        installed ``docx`` version).
    """
    return docx.getdocumenttext(docx.opendocx(fileName))
# Convert a .doc file to .docx.
def doc2Docx(fileName):
    """Convert ``fileName`` to .docx through Word automation, then delete it.

    The document is opened in a Word COM instance and saved next to the
    original under ``fileName + "x"`` (so ``report.doc`` becomes
    ``report.docx``). The original file is removed only after the save and
    the close have both succeeded.

    Args:
        fileName: Path of the .doc file to convert.

    Raises:
        Exception: Any COM error raised by Word while opening, saving or
            closing the document is propagated; in that case the original
            file is left in place.
        OSError: If the original file cannot be removed after conversion.
    """
    word = wc.Dispatch("Word.Application")
    try:
        doc = word.Documents.Open(fileName)
        try:
            # 12 is the file-format argument (wdFormatXMLDocument, i.e. .docx,
            # in Word's WdSaveFormat enumeration).
            doc.SaveAs(fileName + "x", 12, False, "", True, "", False, False, False, False)
        finally:
            # Close the document even when SaveAs fails.
            doc.Close()
    finally:
        # Always shut Word down so a failed conversion does not leave a
        # background Word process running.
        word.Quit()
    # Reached only when the conversion succeeded.
    os.remove(fileName)
# Script body: gather the .docx files under `filepath` and write one
# worksheet row per file -- the path components in the leading columns,
# followed by the document text in the next column.
filepath = "C:\\xxx\\xx\\xx\\xx\\数据集"
filelist = getfilelist(filepath)
## If the folder only contains .doc files, convert all of them to .docx
## first with doc2Docx:
##for path in filelist:
##    doc2Docx(path)
# Keep only the Word files that can be read. The list is called docx_files
# (it used to be `list`) so the builtin is no longer shadowed.
docx_files = [path for path in filelist if path.endswith("docx")]
# Writing to Excel with xlwt fails when a document is large:
# "Exception: String longer than 32767 characters"
##for i in range(len(docx_files)):
##    fileName = docx_files[i]
##    doc = getDocx(fileName)
##    filePaths = fileName.split("\\")
##    string = ""
##    for j in range(len(doc)):
##        string += doc[j] + "\n"
##    if (len(string) > 10000):
##        string = string[:10000]
##    filePaths.append(string)
##    for j in range(20, -1, -1):
##        if j < len(filePaths):
##            worksheet.write(i, j, label = filePaths[j])
##workbook.save('Excel_Workbook.xls')
# Use xlsxwriter instead, to cope with Word text longer than 32767 characters.
# NOTE(review): check how the installed xlsxwriter treats strings above
# Excel's 32767-character cell limit.
workbook = xlsxwriter.Workbook(u'数据.xlsx')
worksheet = workbook.add_worksheet(u"数据")
for i, fileName in enumerate(docx_files):
    # The reader defined in this file is getDocx; the previous call to
    # get_docx raised NameError on the first document.
    doc = getDocx(fileName)
    filePaths = fileName.split("\\")
    # One line per element returned by getDocx, each terminated by "\n".
    filePaths.append("".join(line + "\n" for line in doc))
    # Only columns 0-20 are written, as before.
    for j, cell in enumerate(filePaths[:21]):
        worksheet.write(i, j, cell)
workbook.close()