import pandas as pd
import jieba
from collections import Counter
import re
def load_data(file_path):
    """Read a CSV file into a pandas DataFrame.

    Args:
        file_path: Location of the CSV file, passed straight to
            ``pandas.read_csv``.

    Returns:
        The DataFrame produced by ``pandas.read_csv`` with default options.
    """
    frame = pd.read_csv(file_path)
    return frame
def clean_text(text):
    """Strip everything except characters in the range U+4E00..U+9FA5.

    The input is converted with ``str()`` first, so non-string values are
    accepted. Digits, Latin letters, punctuation and whitespace all fall
    outside the kept range and are removed.

    Args:
        text: Value to clean.

    Returns:
        A string made up only of the kept characters, in their original
        order. It is empty when the input contains none of them.
    """
    raw = str(text)
    # The negated character class matches every character that must go.
    return re.sub(r'[^\u4e00-\u9fa5]', '', raw)
# User-dictionary paths already handed to jieba, so each one is loaded only
# once per process instead of on every segment_words() call.
_LOADED_USERDICTS = set()


def segment_words(text, userdict_path='dict.txt'):
    """Segment text with jieba and keep only multi-character tokens.

    Args:
        text: String to segment.
        userdict_path: Path of a jieba user dictionary to load before
            segmenting. Defaults to ``'dict.txt'``. Each distinct path is
            loaded the first time it is seen and skipped afterwards, so
            edits made to the file later in the same process are not picked
            up. Pass ``None` to segment without loading any user dictionary.

    Returns:
        A list of the tokens produced by ``jieba.cut`` whose length is
        greater than one, in the order jieba yields them.

    Raises:
        Any exception raised by ``jieba.load_userdict`` (for example when
        the dictionary file cannot be opened) propagates to the caller. The
        path is not marked as loaded in that case, so a later call retries.
    """
    if userdict_path is not None and userdict_path not in _LOADED_USERDICTS:
        jieba.load_userdict(userdict_path)
        _LOADED_USERDICTS.add(userdict_path)

    # Keep only tokens longer than one character.
    return [word for word in jieba.cut(text) if len(word) > 1]
def count_word_frequency(df):
    """Count word frequencies for recommended and not-recommended comments.

    Args:
        df: DataFrame with a boolean ``is_recommended`` column (it is used
            as a boolean mask and negated with ``~``) and a ``comment``
            column holding the comment text.

    Returns:
        A ``(recommended_freq, not_recommended_freq)`` tuple of
        ``collections.Counter`` objects mapping each word to its number of
        occurrences in the respective group of comments.
    """
    is_recommended = df['is_recommended']

    def _count(comments):
        """Clean, segment and count the words of one group of comments."""
        # Join with a newline instead of an empty string: gluing comments
        # directly together lets the segmenter build words from the tail of
        # one comment and the head of the next. clean_text() removes every
        # newline inside a comment, so '\n' occurs only at the boundaries,
        # and being a single character it is dropped by segment_words().
        joined = '\n'.join(comments.apply(clean_text))
        return Counter(segment_words(joined))

    recommended_freq = _count(df[is_recommended]['comment'])
    not_recommended_freq = _count(df[~is_recommended]['comment'])
    return recommended_freq, not_recommended_freq
def save_word_frequency(recommended_freq, not_recommended_freq, output_file):
    """Write both frequency tables to a tab-separated UTF-8 text file.

    Each output line has the form ``word<TAB>count<TAB>flag``, where the flag
    is ``True`` for words from recommended comments and ``False`` for words
    from not-recommended comments. All recommended rows are written first,
    then all not-recommended rows; within each group rows follow
    ``Counter.most_common()`` order (highest count first).

    Args:
        recommended_freq: ``collections.Counter`` for recommended comments.
        not_recommended_freq: ``collections.Counter`` for not-recommended
            comments.
        output_file: Destination path. Missing parent directories are
            created; an existing file is overwritten.
    """
    # Local import so this function does not depend on a file-level import.
    from pathlib import Path

    # Create the target directory up front; open() would otherwise raise
    # FileNotFoundError when it does not exist yet.
    Path(output_file).parent.mkdir(parents=True, exist_ok=True)

    groups = ((True, recommended_freq), (False, not_recommended_freq))
    with open(output_file, 'w', encoding='utf-8') as f:
        for flag, freq_table in groups:
            f.writelines(
                f'{word}\t{freq}\t{flag}\n'
                for word, freq in freq_table.most_common()
            )
def main():
    """Run the pipeline: load the CSV, count word frequencies, save them.

    Reads ``../csv/processed_data.csv``, writes the result to
    ``../output/word_frequency.txt`` and prints a completion message.
    """
    input_csv = '../csv/processed_data.csv'
    output_txt = '../output/word_frequency.txt'

    # count_word_frequency returns (recommended, not_recommended) counters.
    frequencies = count_word_frequency(load_data(input_csv))
    save_word_frequency(*frequencies, output_txt)

    print("词频统计完成,结果已保存到 word_frequency.txt")
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()