文本词频和词云
day 06
文本操作
#文本的读出
# f=open(r"D:\pycharm\PyCharm 2018.1.4\代码\6\实习.txt",'r')#r表示读
# data=f.read()
# print(data)
# f.close()#读完后记得关闭文件
#文本的写入(覆盖写入)
#f=open(r"D:\pycharm\PyCharm 2018.1.4\代码\6\test.py","w")#w表示写
# f.write("king")
# f.close()
sort
# Sort three-field records in ascending order of their third field.
# The data is bound to `heroes` rather than `list` so that the built-in
# list() stays usable by any code that runs after this snippet.
heroes = [("曹操", "c", 100), ("刘备", "l", 120), ("阿斗", "a", 3), ("董卓", "d", 80), ("小乔", "x", 60)]


def func(i):
    """Return the third field of record *i*; used as the sort key."""
    return i[2]


heroes.sort(key=func)  # in-place sort, smallest third field first
print(heroes)
结果:
[('阿斗', 'a', 3), ('小乔', 'x', 60), ('董卓', 'd', 80), ('曹操', 'c', 100), ('刘备', 'l', 120)]
文本词频标记(英文)
# Word frequency: the number of times a word occurs.
#
# A dict maps each word to its count; here it is built in one literal.
dic = {'the': 0, 'hamlet': 10}
print(dic)
# Output: {'the': 0, 'hamlet': 10}
# Count how often each whitespace-separated word occurs in hamlet.txt and
# print the ten most frequent words with their counts.
# The context manager closes the file even if reading raises.
with open(r"D:\pycharm\PyCharm 2018.1.4\代码\6\hamlet.txt", "r", encoding="utf8") as f:
    data = f.read().lower()  # lower-case so "The" and "the" share one count

# split() cuts on whitespace only, so punctuation stays attached to words.
data_split = data.split()

count_dict = {}  # word -> number of occurrences
for word in data_split:
    count_dict[word] = count_dict.get(word, 0) + 1


def func(i):
    """Return the count from a (word, count) pair; used as the sort key."""
    return i[1]


# sorted() already returns a list, so this does not rely on the name `list`
# still referring to the built-in at this point in the file.
# Ascending sort followed by reverse() keeps the original ordering of ties.
lt = sorted(count_dict.items(), key=func)
lt.reverse()
for i in lt[0:10]:  # the ten most frequent words
    print(f'{i[0]:^7}{i[1]:^5}')
#结果:
# the 947
# and 831
# to 624
# of 577
# a 454
# my 446
# i 418
# in 361
# you 361
# hamlet 315
文本词频标记(中文)
注:需要使用到jieba库
import jieba
# Segment threekingdoms.txt with jieba, count the words that are at least two
# characters long and not in the exclusion set, and print the ten most
# frequent ones.
# The context manager closes the file even if reading raises.
with open(r"D:\pycharm\PyCharm 2018.1.4\代码\6\threekingdoms.txt", "r", encoding="utf8") as f:
    data = f.read()
data_jieba = jieba.lcut(data)
#print(data_jieba)

# Words that are filtered out of the ranking.
excluded = {"东吴","天下","丞相","将军", "却说", "荆州", "二人", "不可", "不能", "如此", "商议","如何","主公","军士", "左右","军马","引兵","次日","大喜","云长","关公"}

count_dict = {}  # word -> number of occurrences
for word in data_jieba:
    # Strip "曰" first so that the length and exclusion filters below see the
    # final form of the word (e.g. "孔明曰" is counted as "孔明", and a token
    # that shrinks to one character or to an excluded word is dropped).
    word = word.replace("曰", "")
    if len(word) < 2:
        continue
    if word in excluded:
        continue
    count_dict[word] = count_dict.get(word, 0) + 1


def func(i):
    """Return the count from a (word, count) pair; used as the sort key."""
    return i[1]


# sorted() already returns a list, so this does not rely on the name `list`
# still referring to the built-in at this point in the file.
# Ascending sort followed by reverse() keeps the original ordering of ties.
data_list = sorted(count_dict.items(), key=func)
data_list.reverse()
#print(data_list)
for i in data_list[0:10]:  # the ten most frequent words
    print(f'{i[0]:^7}{i[1]:^5}')
#结果:
# 孔明 1226
# 玄德 975
# 曹操 953
# 张飞 358
# 吕布 300
# 赵云 278
# 刘备 277
# 孙权 264
# 于是 250
# 今日 243
词云
#使用前需要先安装以下几个库(可在 PyCharm 的 Settings 中安装,或使用下面的 pip 命令):matplotlib、wordcloud、scipy、pillow、imageio
#pip install matplotlib
# pip install wordcloud
# pip install scipy
# pip install pillow
# pip install imageio
import wordcloud
from imageio import imread
# Build a word cloud from threekingdoms.txt, shaped by a mask image, and save
# it to outfile.png.
mask = imread(r"D:\pycharm\PyCharm 2018.1.4\代码\6\11.png")  # image that gives the cloud its shape
# The context manager closes the text file even if reading raises.
with open(r'D:\pycharm\PyCharm 2018.1.4\代码\6\threekingdoms.txt', 'r', encoding='utf8') as f:
    data = f.read()
# For Chinese text, font_path must name a Chinese font found on this machine,
# for example: font_path=r'C:\Windows\Fonts\simkai'
# NOTE(review): the raw text is passed to generate() without jieba
# segmentation - confirm that this is intended.
w = wordcloud.WordCloud(font_path=r'C:\Windows\Fonts\simkai', mask=mask, width=819, height=460, background_color="black")
w.generate(data)
w.to_file('outfile.png')
结果:


浙公网安备 33010602011771号