第四次作业

一、英文小说词频统计

 

##统计英文小说词频

# Read the whole novel into one string.
# The with-statement closes the file even if read() raises.
with open('like.txt', 'r', encoding='utf-8') as f:
    like = f.read()
print(like)

 

运行结果

 

# String preprocessing

# Convert everything to lower case.
# The result is written back to `like` because the following steps all read
# `like`; storing it only in `a` would leave the word counts case-sensitive
# ("The" and "the" counted separately).
like = like.lower()
a = like  # same lower-cased text, still available under the name `a`
print(a)

运行结果

# Remove punctuation.
# Each mark is turned into a space rather than deleted, so two words that are
# separated only by punctuation ("yes,no") are not fused into one token.
# Extra spaces are harmless: str.split() with no argument treats any run of
# whitespace as a single separator.
sep = ",.、!:?…"
like = like.translate(str.maketrans(sep, " " * len(sep)))
print(like)

运行结果

# Break the text into a list of words, splitting on any run of whitespace.
strList = str.split(like)
print(len(strList), strList)

运行结果

# Distinct words: a set keeps exactly one copy of each word in the list.
strSet = set()
strSet.update(strList)
print(len(strSet), strSet)

运行结果

# Word-count dictionary: word -> number of occurrences.
# A single pass over the word list replaces one strList.count() call per
# distinct word, each of which rescans the entire list.
strDict = {}
for word in strList:
    strDict[word] = strDict.get(word, 0) + 1
# Print once, after the dictionary is complete, rather than dumping the whole
# dictionary on every loop iteration.
print(len(strDict), strDict)

运行结果

# Rank by frequency: build (word, count) pairs, show them unsorted, then show
# them again ordered by count with the highest first.
ccList = [pair for pair in strDict.items()]
print(ccList)
ccList = sorted(ccList, key=lambda pair: pair[1], reverse=True)
print(ccList)

运行结果

# Print the TOP 20 entries of the ranking.
# A slice stops at the end of the list, so a text with fewer than 20 distinct
# words prints what it has instead of raising IndexError.
for b in ccList[:20]:
    print(b)

运行结果

# Exclude grammatical words (pronouns, articles, conjunctions, ...) that carry
# no meaning of their own.
exclude = {'a', 'the', 'and', 'oh'}
strSet = strSet - exclude
print(len(strSet), strSet)
# ccList was ranked before this filter existed, so removing words from strSet
# alone never changes the reported frequencies. Show the TOP 20 again with the
# excluded words skipped.
for b in [pair for pair in ccList if pair[0] not in exclude][:20]:
    print(b)

运行结果

 

二、中文小说词频统计

##统计中文小说词频
import jieba

# Read the whole novel into one string; lower() only affects any Latin letters.
# The with-statement closes the file even if read() raises.
with open('xiyouji.txt', 'r', encoding='utf-8') as t:
    xiyouji = t.read().lower()
print(xiyouji)

# String preprocessing

# Remove punctuation.
# Full-width Chinese marks are listed as well, because Chinese text uses those
# rather than the ASCII forms. Each mark becomes a space instead of being
# deleted so that neighbouring clauses are not glued together.
sep = ",.、!:?…" + ",。!?:;“”‘’()《》"
xiyouji = xiyouji.translate(str.maketrans(sep, " " * len(sep)))
print(xiyouji)

# Break the text into a list of words.
# Chinese is written without spaces between words, so str.split() only yields
# whitespace-delimited chunks, not words. jieba (imported at the top of this
# section) performs the actual word segmentation; lcut() returns a list.
# Whitespace-only tokens (spaces, newlines) are dropped.
strList = [token for token in jieba.lcut(xiyouji) if token.strip()]
print(len(strList), strList)

# Distinct words: a set keeps exactly one copy of each word in the list.
strSet = set()
strSet.update(strList)
print(len(strSet), strSet)

# Word-count dictionary: word -> number of occurrences.
# A single pass over the word list replaces one strList.count() call per
# distinct word, each of which rescans the entire list.
strDict = {}
for word in strList:
    strDict[word] = strDict.get(word, 0) + 1
# Print once, after the dictionary is complete, rather than dumping the whole
# dictionary on every loop iteration.
print(len(strDict), strDict)

# Rank by frequency: build (word, count) pairs, show them unsorted, then show
# them again ordered by count with the highest first.
ccList = [pair for pair in strDict.items()]
print(ccList)
ccList = sorted(ccList, key=lambda pair: pair[1], reverse=True)
print(ccList)

# Print the TOP 20 entries of the ranking.
# A slice stops at the end of the list, so a text with fewer than 20 distinct
# words prints what it has instead of raising IndexError.
for b in ccList[:20]:
    print(b)

# Exclude grammatical words (pronouns, particles, conjunctions, ...) that carry
# no meaning of their own.
# A set made only of empty strings removes nothing, so common Chinese function
# words are listed here; extend the set as needed. '' is kept so the previous
# contents are still covered.
exclude = {'', '的', '了', '是', '我', '你', '他', '在', '也', '这', '那', '和'}
strSet = strSet - exclude
print(len(strSet), strSet)
# ccList was ranked before this filter existed, so removing words from strSet
# alone never changes the reported frequencies. Show the TOP 20 again with the
# excluded words skipped.
for b in [pair for pair in ccList if pair[0] not in exclude][:20]:
    print(b)

 

运行结果

 

posted @ 2018-09-29 10:49  SuperLIi  阅读(101)  评论(0)    收藏  举报