# coding:utf-8
# Tokenize the corpus with jieba, drop stopwords, and write the segmented text.
import jieba
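
# Expected corpus format (an assumption, inferred from the slicing below):
# one document per line, wrapped in tags, e.g.
#   <content>...</content>
# line[9:-11] removes the 9-char opening tag and the 10-char closing tag plus '\n'.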
# Build the stopword set; one stopword per line in the file.
with open('/Test/orgindata/stopwords.txt', encoding='utf-8') as f:
    stoplist = set(line.strip() for line in f)
input_file = open('/Test/orgindata/corpus.txt', 'r', encoding='utf-8')
output = open('/Test/process2/corpus-seg.txt', 'w', encoding='utf-8')
line = input_file.readline()
index = 0
text = ''
# readline() returns '' at EOF (never None); a line shorter than the
# wrapping tags also ends the loop.
while len(line) > 4:
    # Strip the leading <content> and the trailing </content> plus newline.
    line = line[9:-11]
    # jieba.cut returns a generator of tokens.
    segments = jieba.cut(line)
    # Drop stopwords and single-character tokens, then join with spaces.
    words = [w for w in segments if w not in stoplist and len(w) > 1]
    result = ' '.join(words)
    line = input_file.readline()
    if len(result) > 4:
        text += result + ' '
        index += 1
        if index % 100 == 0:
            # Flush the buffered tokens of the last 100 documents as one line.
            output.write(text + '\n')
            text = ''
            print('line ' + str(index))
# Write out any leftover buffer (fewer than 100 documents) and clean up.
if text:
    output.write(text + '\n')
input_file.close()
output.close()
print('Processing complete')
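
# Usage note: jieba must be installed (pip install jieba); run this script
# directly with Python 3. Each line of corpus-seg.txt will then hold the
# space-separated tokens of up to 100 input documents.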