文本词频和词云

day 06

文本操作

#文本的读出
# f=open(r"D:\pycharm\PyCharm 2018.1.4\代码\6\实习.txt",'r')#r表示读
# data=f.read()
# print(data)
# f.close()#记得写

#文本的写入(覆盖写入)
#f=open(r"D:\pycharm\PyCharm 2018.1.4\代码\6\test.py","w")#w表示写
# f.write("king")
# f.close()

sort

# Sample records: 3-tuples whose last field is the number to sort by.
# Named ``heroes`` rather than ``list`` so the builtin ``list`` stays usable.
heroes = [("曹操", "c", 100), ("刘备", "l", 120), ("阿斗", "a", 3), ("董卓", "d", 80), ("小乔", "x", 60)]


def func(i):
    """Return the third field of record ``i``; used as the sort key."""
    return i[2]


# Sort in place, ascending by the third field of each record.
heroes.sort(key=func)
print(heroes)
结果:
[('阿斗', 'a', 3), ('小乔', 'x', 60), ('董卓', 'd', 80), ('曹操', 'c', 100), ('刘备', 'l', 120)]

文本词频标记(英文)

# Word frequency: the number of times each word occurs.
# A dict maps each word to its count; keys keep their insertion order.
dic = {'the': 0, 'hamlet': 10}
print(dic)
# Output: {'the': 0, 'hamlet': 10}

# Count how often each whitespace-separated word occurs in hamlet.txt and
# print the ten most frequent words.

# The ``with`` block closes the file even if reading fails.
with open(r"D:\pycharm\PyCharm 2018.1.4\代码\6\hamlet.txt", "r", encoding="utf8") as f:
    data = f.read().lower()  # lowercase so "The" and "the" count as one word

data_split = data.split()  # split on runs of whitespace
count_dict = {}  # word -> number of occurrences

for word in data_split:
    # First occurrence starts from 0, later ones add to the stored count.
    count_dict[word] = count_dict.get(word, 0) + 1


def func(i):
    """Return the count from a ``(word, count)`` pair; used as the sort key."""
    return i[1]


# Stable ascending sort followed by a reversal yields descending counts; words
# with equal counts end up in reverse order of first appearance.
# ``sorted`` builds the list directly, so this does not rely on the name
# ``list`` still referring to the builtin.
lt = sorted(count_dict.items(), key=func)
lt.reverse()

for i in lt[0:10]:  # the ten most frequent words
    print(f'{i[0]:^7}{i[1]:^5}')


#结果:
#   the   947
#   and   831
#   to    624
#   of    577
#    a    454
#   my    446
#    i    418
#   in    361
#   you   361
# hamlet  315

文本词频标记(中文)

注:需要使用到jieba库
import jieba
# Segment threekingdoms.txt with jieba, count the words of two or more
# characters that are not excluded, and print the ten most frequent ones.

# The ``with`` block closes the file even if reading fails.
with open(r"D:\pycharm\PyCharm 2018.1.4\代码\6\threekingdoms.txt", "r", encoding="utf8") as f:
    data = f.read()

data_jieba = jieba.lcut(data)  # segmented words as a list

# Words left out of the ranking.
excluded_words = {"东吴", "天下", "丞相", "将军", "却说", "荆州", "二人", "不可", "不能", "如此", "商议", "如何", "主公", "军士", "左右", "军马", "引兵", "次日", "大喜", "云长", "关公"}

count_dict = {}  # word -> number of occurrences
for word in data_jieba:
    # Strip "曰" before filtering so the checks below see the final word;
    # otherwise e.g. a two-character word ending in "曰" would be counted as
    # a single character, and an excluded word followed by "曰" would slip
    # past the exclusion list.
    word = word.replace("曰", "")
    if len(word) < 2:  # skip single characters and words emptied by the strip
        continue
    if word in excluded_words:
        continue
    count_dict[word] = count_dict.get(word, 0) + 1


def func(i):
    """Return the count from a ``(word, count)`` pair; used as the sort key."""
    return i[1]


# Stable ascending sort followed by a reversal yields descending counts.
# ``sorted`` builds the list directly, so this does not rely on the name
# ``list`` still referring to the builtin.
data_list = sorted(count_dict.items(), key=func)
data_list.reverse()

for i in data_list[0:10]:  # the ten most frequent words
    print(f'{i[0]:^7}{i[1]:^5}')
#结果:
#  孔明   1226 
#  玄德    975 
#  曹操    953 
#  张飞    358 
#  吕布    300 
#  赵云    278 
#  刘备    277 
#  孙权    264 
#  于是    250 
#  今日    243 

词云

#使用前需要先安装以下几个库(可在 PyCharm 的 Settings 中安装,也可以用下面的 pip 命令安装):matplotlib、wordcloud、scipy、pillow、imageio

#pip install matplotlib
# pip install wordcloud
# pip install scipy
# pip install pillow
# pip install imageio
import wordcloud
from imageio import imread

# Render a word cloud from threekingdoms.txt and save it as outfile.png.
mask = imread(r"D:\pycharm\PyCharm 2018.1.4\代码\6\11.png")  # image used as the shape of the cloud

# The ``with`` block closes the file even if reading fails.
with open(r'D:\pycharm\PyCharm 2018.1.4\代码\6\threekingdoms.txt', 'r', encoding='utf8') as f:
    data = f.read()

# For Chinese text, font_path must name a Chinese font found on your machine,
# for example: font_path=r'C:\Windows\Fonts\simkai'
# NOTE(review): the wordcloud docs say width/height are ignored when a mask is
# supplied (the mask's shape is used instead) — confirm before relying on them.
w = wordcloud.WordCloud(font_path=r'C:\Windows\Fonts\simkai', mask=mask, width=819, height=460, background_color="black")

w.generate(data)
w.to_file('outfile.png')

结果:

posted @ 2019-07-19 15:03  夜元  阅读(390)  评论(0)    收藏  举报