阶段作业1:完整的中英文词频统计
1.英文小说 词频统计
# English novel: word-frequency statistics.
from collections import Counter

# Punctuation marks that are turned into spaces before the text is split.
ENGLISH_SEPARATORS = '''.,;:?!-_'''
# Very common words that are left out of the statistics.
ENGLISH_EXCLUDE = frozenset({'a', 'the', 'and', 'i', 'you'})


def normalize_english(text, separators=ENGLISH_SEPARATORS):
    """Return *text* with each separator replaced by a space, lower-cased."""
    # One translate pass instead of one str.replace call per separator.
    table = str.maketrans({ch: ' ' for ch in separators})
    return text.translate(table).lower()


def count_english_words(text, separators=ENGLISH_SEPARATORS,
                        exclude=ENGLISH_EXCLUDE):
    """Count how often each non-excluded word occurs in *text*.

    The text is normalized with normalize_english() and split on
    whitespace; words found in *exclude* are not counted.

    Returns:
        A collections.Counter mapping word -> number of occurrences.
    """
    words = normalize_english(text, separators).split()
    # Single pass over the words; the earlier list.count() per distinct
    # word rescanned the whole list every time.
    return Counter(word for word in words if word not in exclude)


def top_english_words(counts, n=20):
    """Return up to *n* (word, count) pairs, most frequent first.

    Fewer than *n* pairs are returned when there are fewer distinct
    words, instead of failing with an IndexError.
    """
    return counts.most_common(n)


def english_word_frequency(path='zz.txt', top_n=20):
    """Read the UTF-8 text file at *path* and print its word statistics.

    Prints the normalized text, the word list, the vocabulary, the
    frequency table (unsorted, then sorted by count) and finally the
    *top_n* most frequent words, one per line.

    Returns:
        The list of (word, count) pairs that were printed last.
    """
    # Read the whole text into a string; `with` closes the file even if
    # reading fails.
    with open(path, 'r', encoding='utf-8') as source:
        text = source.read()

    normalized = normalize_english(text)
    print(normalized)

    words = normalized.split()
    print(len(words), words)

    vocabulary = set(words) - ENGLISH_EXCLUDE
    print(len(vocabulary), vocabulary)

    counts = Counter(word for word in words if word not in ENGLISH_EXCLUDE)
    print(len(counts), dict(counts))

    print(list(counts.items()))
    print(counts.most_common())

    top = top_english_words(counts, top_n)
    for entry in top:
        print(entry)
    return top


if __name__ == '__main__':
    english_word_frequency()

2.中文小说 词频统计
# Chinese novel: word-frequency statistics.

# Punctuation marks that are turned into spaces before segmentation.
# Both the ASCII and the full-width forms of , ; : ! ? are listed, because
# Chinese text normally uses the full-width ones.
CHINESE_PUNCTUATION = ',。;:-!?、“”，；：！？'


def strip_chinese_punctuation(text, punctuation=CHINESE_PUNCTUATION):
    """Return *text* with every character of *punctuation* replaced by a space."""
    table = str.maketrans({ch: ' ' for ch in punctuation})
    return text.translate(table)


def count_chinese_words(tokens):
    """Count the tokens that are longer than one character.

    Single-character tokens are skipped and not counted.

    Returns:
        A dict mapping token -> number of occurrences, in first-seen order.
    """
    counts = {}
    for token in tokens:
        if len(token) > 1:
            counts[token] = counts.get(token, 0) + 1
    return counts


def top_chinese_words(counts, n=15):
    """Return up to *n* (word, count) pairs, most frequent first.

    Fewer than *n* pairs are returned when *counts* holds fewer words,
    instead of failing with an IndexError.
    """
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    return ranked[:n]


def chinese_word_frequency(path='a.txt', top_n=15):
    """Read the UTF-8 text file at *path* and print its word statistics.

    Prints the raw text, the output of jieba's three segmentation calls,
    the punctuation-stripped text, the whitespace-separated fragments, the
    token list, the frequency table sorted by count and finally the
    *top_n* most frequent words, one per line.

    Returns:
        The list of (word, count) pairs that were printed last.
    """
    # Third-party dependency; imported here so the counting helpers above
    # can be used without jieba installed.
    import jieba

    with open(path, 'r', encoding='utf-8') as source:
        text = source.read()
    print(text)

    # The three jieba segmentation calls: default cut, cut_all=True, and
    # cut_for_search.
    print(list(jieba.cut(text)))
    print(list(jieba.cut(text, cut_all=True)))
    print(list(jieba.cut_for_search(text)))

    # Turn punctuation into spaces.
    text = strip_chinese_punctuation(text)
    print(text)

    fragments = text.split()
    print(len(fragments), fragments)

    # Segment into words and count them, skipping single-character tokens.
    tokens = jieba.lcut(text)
    counts = count_chinese_words(tokens)
    print(tokens)

    # Sort by frequency, highest first.
    print(top_chinese_words(counts, len(counts)))

    # Print the most frequent words.
    top = top_chinese_words(counts, top_n)
    for entry in top:
        print(entry)
    return top


if __name__ == '__main__':
    chinese_word_frequency()

浙公网安备 33010602011771号