# -*- coding: utf-8 -*-
# MM (forward maximum matching)
# Chinese word segmentation implemented with the forward maximum matching algorithm
# Module-level word list: init() appends the dictionary entries to it and
# if_contain() searches it.
dic = []
# Longest candidate length (in characters) that spl() tries at each position.
MAX_LENGTH = 5
def init(path="test.txt"):
    """
    Load the word dictionary from a text file into the module-level ``dic`` list.

    Each line contributes one entry: the text before the first comma (any
    later fields are ignored) with surrounding whitespace removed. Lines whose
    first field is empty are skipped. Entries are appended, so calling this
    function more than once adds to ``dic`` instead of replacing its content.
    After loading, every entry currently in ``dic`` is printed, one per line.

    :param path: dictionary file to read; defaults to ``test.txt``, resolved
        against the current working directory.
    :return: None
    :raises OSError: if the file cannot be opened.
    :raises UnicodeDecodeError: if the file content is not valid UTF-8.
    """
    # An explicit encoding keeps the result independent of the platform
    # locale. "utf-8-sig" reads plain UTF-8 and also drops a leading
    # byte-order mark, which would otherwise stick to the first entry.
    # NOTE(review): assumes the dictionary file is UTF-8 encoded - confirm.
    with open(path, encoding="utf-8-sig") as dict_file:
        for line in dict_file:
            # Stripping matters for lines without a comma: the trailing
            # newline would otherwise become part of the entry, and the entry
            # could never equal a slice of an input sentence.
            word = line.split(',', 1)[0].strip()
            if word:
                dic.append(word)
    for d in dic:
        print(d)
def if_contain(words):
    """
    Report whether a candidate string is an entry of the module-level ``dic``.

    :param words: candidate string to look up.
    :return: True when an entry equal to ``words`` exists, otherwise False.
    """
    # The membership operator performs the same equality scan over the list
    # as an explicit loop and stops at the first hit.
    return words in dic
def spl(sentence, dictionary=None, max_length=None):
    """
    Segment a sentence with the forward maximum matching algorithm.

    At each position the longest candidate prefix (at most ``max_length``
    characters) is tried first and shortened one character at a time until it
    is found in the dictionary. Characters at which no candidate matches are
    collected into one unmatched token, which is emitted as soon as the next
    dictionary word is found or the sentence ends. Every candidate tried is
    printed together with its length.

    :param sentence: text to segment.
    :param dictionary: optional container of words supporting ``in``; when
        omitted, lookups go through ``if_contain``.
    :param max_length: optional longest candidate length; when omitted,
        ``MAX_LENGTH`` is used.
    :return: the tokens in order, each followed by ``'/'``; an empty string
        for an empty sentence.
    """
    if dictionary is None:
        lookup = if_contain
    else:
        lookup = lambda candidate: candidate in dictionary
    longest = MAX_LENGTH if max_length is None else max_length

    words = []
    # Run of consecutive characters that no dictionary word starts at. It is
    # kept as a single token so text without any dictionary word still comes
    # back in one piece, while words that follow the run are still found.
    unmatched = []
    while len(sentence) > 0:
        for i in range(longest, 0, -1):
            temp = sentence[:i]  # slicing past the end just yields the rest
            print(i, temp)
            if lookup(temp):
                if unmatched:
                    words.append(''.join(unmatched))
                    unmatched = []
                words.append(temp)
                sentence = sentence[i:]
                break
        else:
            # No candidate of any length matched here: set the first
            # character aside and resume matching right after it.
            unmatched.append(sentence[0])
            sentence = sentence[1:]
    if unmatched:
        words.append(''.join(unmatched))
    return ''.join(w + '/' for w in words)
def main():
    """
    Run the interactive segmentation loop.

    Loads the dictionary with init(), then repeatedly prompts with ``>``,
    segments the entered line with spl() and prints the result. The loop ends
    on an empty line or when standard input is exhausted.

    :return: None
    """
    init()
    while True:
        try:
            input_str = input(">")
        except EOFError:
            # Piped or redirected input ends without a blank line; finish
            # quietly instead of terminating with a traceback.
            break
        if not input_str:
            break
        result = spl(input_str)
        print("分词结果为:")
        print(result)
# Call main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
