import re
from operator import itemgetter
import matplotlib.pyplot as plt
from collections import Counter
# English text: count word frequencies and check them against Zipf's law.
BOOK_PATH = "Alice's adventures in wonderland.txt"

# A "word" is 3-10 letters: one letter of either case followed by 2-9
# lowercase letters, delimited by word boundaries on both sides.
WORD_PATTERN = re.compile(r"(\b[A-Za-z][a-z]{2,9}\b)")

# How many of the most frequent words are printed and plotted.
TOP_N = 100


def count_words(text: str) -> Counter:
    """Return a Counter mapping each word matched in *text* to its count.

    Matching is case-sensitive, so "The" and "the" are tallied as two
    different words. Words shorter than 3 or longer than 10 letters are
    not matched by WORD_PATTERN and therefore never counted.
    """
    return Counter(WORD_PATTERN.findall(text))


def ranked_frequencies(counts, limit: int = TOP_N):
    """Return ``(ranks, freqs)`` for the *limit* highest counts.

    *counts* is any mapping of word -> count. ``freqs`` holds the counts
    in descending order; ``ranks`` is ``1..len(freqs)``.

    Ranks start at 1 because rank 0 has no position on a logarithmic
    axis, and the two lists always have equal length, so a text with
    fewer than *limit* distinct words can still be plotted.
    """
    freqs = sorted(counts.values(), reverse=True)[:limit]
    ranks = list(range(1, len(freqs) + 1))
    return ranks, freqs


def main() -> None:
    """Print the TOP_N most frequent words of the book and plot them."""
    # Only ASCII letters are ever matched, so undecodable bytes are
    # replaced instead of aborting the run; the explicit encoding avoids
    # depending on the platform default.
    with open(BOOK_PATH, encoding="utf-8", errors="replace") as f:
        file_to_string = f.read()

    frequency = count_words(file_to_string)

    # Print the top TOP_N words with their counts.
    for key, value in frequency.most_common(TOP_N):
        print(key, value)

    ranks, sorted_freq = ranked_frequencies(frequency, TOP_N)

    # Check Zipf's law with matplotlib: rank vs. frequency on log-log axes.
    plt.title("Zipf-Law")
    plt.xlabel("rank")
    plt.ylabel("freq")
    plt.loglog(ranks, sorted_freq)
    plt.show()

    # Bar chart of the same data.
    plt.bar(ranks, sorted_freq)
    plt.show()


if __name__ == "__main__":
    main()
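# NOTE: two empty image placeholders followed here in the original notes
# (presumably the two plots produced by this script).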