综合练习:词频统计
f = open("peng.txt", "r", encoding='utf-8')
song = f.read()
f.close()
sep = ''',.?—!"'''
exclude = {'the', 'and', 'i', 'in', "i'm", 'a', 'of', 'an', 'on', 'to', 'with'}
for c in sep:
song = song.replace(c, ' ')
swl = song.lower().split()
swd = {}
sws = set(swl) - exclude
for w in sws:
swd[w] = swl.count(w)
fl = list(swd.items())
fl.sort(key=lambda x: x[1], reverse=True)
for i in fl:
print(i)
f = open("result.txt", "w")
for i in range(20):
f.write(fl[i][0] + " " + str(fl[i][1]) + "\n")
f.close()

import jieba
f = open('weicheng.txt', 'r', encoding='utf-8')
text = f.read()
f.close()
p = ''',。‘’“”:;()!?、 '''
a = {
'的', '\n', '\u3000',
'曰', '之', '不', '人', '一', '大', '马', '来', '有', '于', '下', '此',
}
for i in p:
text = text.replace(i, '')
print(list(jieba.cut(text)))
t = list(jieba.lcut(text))
print(t)
count = {}
wl = list(set(t) - a)
print(wl)
for i in range(0, len(wl)):
count[wl[i]] = text.count(str(wl[i]))
cl = list(count.items())
cl.sort(key=lambda x: x[1], reverse=True)
print(cl)
f = open('wcCount.txt', 'a')
for i in range(20):
f.write(cl[i][0] + ':' + str(cl[i][1]) + '\n')
f.close()
