from nltk.tokenize import word_tokenize, sent_tokenize
import re
from nltk.corpus import stopwords as nltk_stopwords
from collections import Counter
# Read the whole corpus into a single string.
with open("timemachine.txt", encoding="utf8") as f:
    lines = f.read()

# English stop-word list. NOTE: NLTK's list is entirely lowercase, so
# membership tests must be done on lowercased tokens.
stopwords = set(nltk_stopwords.words('english'))

# Split the text into sentences.
sentences = sent_tokenize(lines)

# Collapse every run of non-alphanumeric characters (punctuation,
# newlines, etc.) into a single space.
sentences = [re.sub(r'[^A-Za-z0-9]+', ' ', string=x) for x in sentences]

# Tokenize each sentence, lowercase the tokens, and drop stop words.
# BUG FIX: lowercase BEFORE the stop-word test — previously the check was
# done on the original-cased token, so capitalized stop words such as
# "The" or "And" escaped the filter and ended up in the counts.
sentences = [
    [token for token in (word.lower() for word in word_tokenize(sentence))
     if token not in stopwords]
    for sentence in sentences
]

# Count word frequencies across all sentences, then unzip into two
# parallel tuples (tokens, counts) sorted by descending frequency.
# Counter.most_common() is equivalent to the stable reverse sort on the
# count and handles the ordering in one call; guard against an empty
# corpus, where zip(*[]) would raise ValueError.
freq_counter = Counter(word for sentence in sentences for word in sentence)
token, freq = zip(*freq_counter.most_common()) if freq_counter else ((), ())