#coding=utf-8
from __future__ import division
from nltk.book import *
# Print the name of the text
print text1
# Show every occurrence of the word "monstrous" in context (concordance prints its output directly)
text2.concordance("monstrous")
# Find other words used in contexts similar to "monstrous" (prints directly)
text2.similar("monstrous")
# Find contexts shared by two or more words (prints directly)
text2.common_contexts(["monstrous", "very"])
# Plot where certain words appear across the text (dispersion plot)
text4.dispersion_plot(["citizens", "democracy", "freedom", "duties", "America"])
# Number of tokens (words and punctuation) in the text
print len(text3)
# Sorted vocabulary (distinct words and punctuation) and its size
print sorted(set(text3))
print len(set(text3))
# Average number of times each word is used in the text
print len(text3)/len(set(text3))
# Count occurrences of a given word; wrap the diversity calculation above in a function
print text3.count("smote")
def lexical_diversity(text):
    return len(text)/len(set(text))
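# Illustrative check: calling the helper on text3 should reproduce the
# average-usage figure printed above.
print lexical_diversity(text3)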
# Percentage of the text taken up by a given word; wrap it in a function
print 100 * text4.count("a")/len(text4)
def percentage(count, total):
    return 100 * count/total
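# Illustrative check: this call should match the percentage printed above
# for the word "a" in text4.
print percentage(text4.count("a"), len(text4))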
# Treat a text as a list of words and operate on it
sent1 = ["Call", "me", "Zty", "."]
sent2 = ["Hello NLP!"]
print len(sent1)
print lexical_diversity(sent1)
print sent1 + sent2
sent1.append("some")
print sent1
print text4[173]
print text4.index("awaken")
print text5[16715:16735]
sent = ["1","2","3","4","5","6","7","8","9","10"]
print sent[0], sent[9]
print sent[5:8], sent[:3]
print text2[141525:]
sent[0] = "first"
sent[9] = "last"
sent[1:9] = ["second", "third"]
print sent
my_sent = ["bold", "Sir", "Robin"]
print sorted(my_sent)
name = "Monty"
print name[0]
print name[:4]
print name*2
print name+"!"
print " ".join(["Monty", "Python"])
print "Monty Python".split()
# Use FreqDist to find the 50 most common words in the text
fdist1 = FreqDist(text1)
print fdist1
print len(text1)
print len(set(text1))
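# Note: in the NLTK release this script targets, FreqDist.keys() is ordered
# from most to least frequent, which is why slicing the first 50 keys below
# yields the 50 most common words.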
vocab1 = fdist1.keys()
print vocab1[:50]
print fdist1["whale"]
# Show a cumulative frequency plot of the 50 most common words
fdist1.plot(50, cumulative=True)
# Words that occur only once (hapaxes)
print fdist1.hapaxes()
# Words longer than 15 characters
V = set(text1)
long_words = [w for w in V if len(w)>15]
print sorted(long_words)
# Words longer than 7 characters that occur more than 7 times
fdist5 = FreqDist(text5)
print sorted([w for w in set(text5) if len(w)>7 and fdist5[w]>7])
# List collocations (word pairs that occur together unusually often; prints directly)
text4.collocations()
# Distribution of word lengths in the text
ls = [len(w) for w in text1]
fdist = FreqDist(ls)
print fdist.keys()
print fdist.items()
print fdist.max()
print fdist[3]
print fdist.freq(3)
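# Illustrative check: freq(3) is the count of length-3 tokens divided by the
# total number of tokens, so this should print the same value as fdist.freq(3).
print fdist[3] / len(ls)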