# Split the text according to the marker string; this can be sentence segmentation or word segmentation.
def segment(text, segs):
    """Split ``text`` into chunks at the positions marked in ``segs``.

    ``segs`` is a string of '0'/'1' flags. A '1' at index ``i`` places a
    boundary immediately after ``text[i]``. Whatever follows the last
    boundary is always appended as the final chunk, so the returned list
    is never empty.

    Returns the list of chunks in their original order.
    """
    words = []
    last = 0
    for i, flag in enumerate(segs):
        if flag == '1':
            words.append(text[last:i + 1])
            last = i + 1
    # The tail after the final boundary forms the last chunk.
    words.append(text[last:])
    return words
# Compute the score (objective value) of a segmentation.
def evaluate(text, segs):
    """Return the score of segmenting ``text`` according to ``segs``.

    The score is the number of chunks produced by ``segment`` plus the
    length of the distinct chunks joined with single spaces.
    """
    words = segment(text, segs)
    text_size = len(words)
    # Lexicon cost: characters of every distinct chunk plus one separator
    # between each pair. The join order does not affect the length.
    lexicon_size = len(' '.join(set(words)))
    return text_size + lexicon_size
from random import randint
# Flip one position of a marker string (1 becomes 0, 0 becomes 1).
def flip(segs, pos):
    """Return a copy of ``segs`` with the '0'/'1' flag at index ``pos`` inverted."""
    # 1 - int(flag) maps '0' -> 1 and '1' -> 0.
    flipped = str(1 - int(segs[pos]))
    return segs[:pos] + flipped + segs[pos + 1:]
# Given an integer N, randomly flip N positions to form a candidate (guess) sequence.
def flip_n(segs, n):
    """Return ``segs`` after ``n`` random single-position flips.

    Each position is drawn independently with ``randint``, so the same
    index may be picked more than once. An empty ``segs`` has no position
    to flip and is returned unchanged.
    """
    if not segs:
        # randint(0, -1) would raise on an empty range.
        return segs
    for _ in range(n):
        segs = flip(segs, randint(0, len(segs) - 1))
    return segs
# Simulated annealing.
def anneal(text, segs, iterations, cooling_rate):
    """Search for a lower-scoring segmentation of ``text`` by simulated annealing.

    The temperature starts at ``len(segs)``. At each temperature step,
    ``iterations`` candidates are generated by randomly flipping
    ``int(round(temperature))`` positions of the current flag string, and
    the lowest-scoring string among the current one and the candidates is
    kept. The temperature is then divided by ``cooling_rate``; the search
    stops once it is 0.5 or lower. The score and segmentation are printed
    after every step, followed by one blank line at the end.

    ``cooling_rate`` should be greater than 1: for non-empty ``segs`` a
    value in (0, 1] never lowers the temperature, so the loop would not
    terminate.

    Returns the final flag string.
    """
    temperature = float(len(segs))
    while temperature > 0.5:
        best_segs, best = segs, evaluate(text, segs)
        for _ in range(iterations):
            guess = flip_n(segs, int(round(temperature)))
            score = evaluate(text, guess)
            # Compare the guess with the current best and keep whichever
            # sequence has the smaller score.
            if score < best:
                best, best_segs = score, guess
        segs = best_segs
        temperature = temperature / cooling_rate
        # ``best`` is the score of the sequence just kept. A single
        # pre-formatted argument prints the same text under both the
        # print statement and the print function.
        print("%s %s" % (best, segment(text, segs)))
    print("")
    return segs
# Example
>>> text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"
>>> seg1 = "0000000000000001000000000010000000000000000100000000000"
>>> anneal(text, seg1, 5000, 1.2)
60['doyouseetheki', 'tty', 'see', 'thedoggy', 'doyouliketh', 'ekittylike', 'thedoggy']
58['doy', 'ouseetheki', 'ttysee', 'thedoggy', 'doy', 'o', 'ulikethekittylike', 'thedoggy']
56['doyou', 'seetheki', 'ttysee', 'thedoggy', 'doyou', 'liketh', 'ekittylike', 'thedoggy']
54['doyou', 'seethekit', 'tysee', 'thedoggy', 'doyou', 'likethekittylike', 'thedoggy']
120
53['doyou', 'seethekit', 'tysee', 'thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']
51['doyou', 'seethekittysee', 'thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']
42['doyou', 'see', 'thekitty', 'see', 'thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']
'0000100100000001001000000010000100010000000100010000000'