import os
import os.path
import codecs
import numpy
import pandas
import jieba
# Build the corpus: one row (file path, stripped text) per file found under
# the sample directory. Rows are gathered in a plain list and converted to a
# DataFrame once, because growing a DataFrame with .loc[...] re-allocates it
# on every insert.
corpus_rows = []
for root, dirs, files in os.walk(
    "D:\\PDM\\2.1\\SogouC.mini\\Sample\\C000007\\"
):
    for name in files:
        # os.path.join avoids the doubled separator that manual
        # concatenation produced when `root` already ends with one.
        filePath = os.path.join(root, name)
        # codecs.open is kept (rather than the builtin open) so the decoded
        # text is not subject to newline translation; the `with` block
        # guarantees the handle is closed even if reading/decoding fails.
        with codecs.open(filePath, 'r', 'utf-8') as f:
            content = f.read()
        corpus_rows.append([filePath, content.strip()])

# The index starts at 1 to match the labels the row-by-row insertion used.
corpos = pandas.DataFrame(
    corpus_rows,
    columns=['filePath', 'content'],
    index=pandas.RangeIndex(1, len(corpus_rows) + 1),
)
# Tokenise every document with jieba, producing one row per token.
# Each token is tagged with the path of the document it came from: the path
# and content columns are walked in lockstep, instead of reusing whatever
# `filePath` happened to hold after the file-reading loop (which labelled
# every token with the last file read).
segment_rows = []
for doc_path, doc_text in zip(corpos['filePath'], corpos['content']):
    segment_rows.extend((doc_path, seg) for seg in jieba.cut(doc_text))

# Build the frame in one step (appending via .loc per token is quadratic);
# the index starts at 1 to match the labels the per-row insertion used.
segments = pandas.DataFrame(
    segment_rows,
    columns=["filePath", 'segment'],
    index=pandas.RangeIndex(1, len(segment_rows) + 1),
)