6月7日

import pandas as pd
import jieba
from collections import Counter
import re

def load_data(file_path):
    """Read a CSV file into a pandas DataFrame.

    Args:
        file_path: Path (or any other source accepted by ``pd.read_csv``)
            of the CSV data to read.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with default options.
    """
    frame = pd.read_csv(file_path)
    return frame

def clean_text(text):
    """Keep only the Chinese characters of the input.

    The argument is converted with ``str()`` first, so non-string values
    are accepted. Every character outside the range U+4E00-U+9FA5 is
    removed, which drops digits, Latin letters, punctuation and whitespace.

    Args:
        text: The value to clean.

    Returns:
        A string made of the remaining Chinese characters, possibly empty.
    """
    non_chinese = re.compile(r'[^\u4e00-\u9fa5]')
    return non_chinese.sub('', str(text))

def segment_words(text):
    """Segment text into words with jieba, keeping multi-character words.

    The custom dictionary file ``dict.txt`` is registered with jieba the
    first time this function is called successfully; later calls reuse the
    already-loaded dictionary instead of re-reading the file.

    Args:
        text: The string to segment.

    Returns:
        A list, in order of appearance, of the tokens produced by
        ``jieba.cut`` whose length is greater than one character.
    """
    # Load the user dictionary once per process. The flag lives on the
    # function object so that no module-level state is required, and it is
    # only set after a successful load so a failed load is retried.
    if not getattr(segment_words, '_userdict_loaded', False):
        jieba.load_userdict('dict.txt')
        segment_words._userdict_loaded = True

    # Single-character tokens are discarded.
    return [token for token in jieba.cut(text) if len(token) > 1]

def count_word_frequency(df):
    """Count word frequencies for recommended and not-recommended comments.

    Args:
        df: DataFrame with an ``is_recommended`` column, used as a boolean
            mask, and a ``comment`` column holding the review text.

    Returns:
        A tuple ``(recommended_freq, not_recommended_freq)`` of ``Counter``
        objects mapping each segmented word to its number of occurrences
        in the corresponding group of comments.
    """
    def _frequency_for(mask):
        # Clean every selected comment, then segment them in a single pass.
        cleaned = df[mask]['comment'].apply(clean_text)
        # Join with a newline instead of an empty string: otherwise the end
        # of one comment and the start of the next can be segmented as a
        # word that never appeared in any comment. clean_text leaves only
        # Chinese characters, so the newline is unambiguous; jieba emits
        # whitespace as its own one-character token, which segment_words
        # drops through its length filter.
        joined = cleaned.str.cat(sep='\n')
        return Counter(segment_words(joined))

    is_recommended = df['is_recommended']
    recommended_freq = _frequency_for(is_recommended)
    not_recommended_freq = _frequency_for(~is_recommended)

    return recommended_freq, not_recommended_freq

def save_word_frequency(recommended_freq, not_recommended_freq, output_file):
    """Write both word-frequency tables to a tab-separated UTF-8 text file.

    Each line has the form ``word<TAB>count<TAB>flag``. The recommended
    words come first with the flag ``True``, followed by the
    not-recommended words with the flag ``False``; within each group the
    order is that of ``Counter.most_common()``.

    Args:
        recommended_freq: Counter of words from recommended comments.
        not_recommended_freq: Counter of words from not-recommended comments.
        output_file: Path of the file to write; it is opened in ``'w'`` mode.
    """
    with open(output_file, 'w', encoding='utf-8') as out:
        # Rows for the recommended comments.
        out.writelines(
            f'{word}\t{freq}\tTrue\n'
            for word, freq in recommended_freq.most_common()
        )
        # Rows for the not-recommended comments.
        out.writelines(
            f'{word}\t{freq}\tFalse\n'
            for word, freq in not_recommended_freq.most_common()
        )

def main():
    """Run the pipeline: load the CSV, count word frequencies, save them."""
    input_path = '../csv/processed_data.csv'
    output_path = '../output/word_frequency.txt'

    # Load the data and compute one frequency table per group.
    frequencies = count_word_frequency(load_data(input_path))
    recommended_freq, not_recommended_freq = frequencies

    # Persist the result, then report completion.
    save_word_frequency(recommended_freq, not_recommended_freq, output_path)
    print("词频统计完成,结果已保存到 word_frequency.txt")

# Run the word-frequency pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main() 

 

posted @ 2025-06-09 14:57  KuanDong24  阅读(7)  评论(0)    收藏  举报