import pandas as pd
import jieba
import jieba.posseg as pseg
from snownlp import SnowNLP
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import re
from collections import Counter
import os
import sys
# Use a CJK-capable font so matplotlib titles render Chinese correctly.
# Note: this only affects matplotlib text; WordCloud needs its own font_path.
plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]
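# Optional: keep the minus sign rendering correctly when a CJK font is active.
plt.rcParams["axes.unicode_minus"] = False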
# Load the custom dictionary and stopwords
def load_resources():
    """Load the custom dictionary, stopword list, and sentiment lexicons."""
    # Load a custom user dictionary (if one is available)
    # jieba.load_userdict('user_dict.txt')
    # Load the stopword list
    stopwords_path = 'stopwords.txt'
    if os.path.exists(stopwords_path):
        with open(stopwords_path, 'r', encoding='utf-8') as f:
            stopwords = {line.strip() for line in f if line.strip()}
    else:
        # Fall back to a built-in default stopword list
stopwords = set(
['的', '了', '在', '是', '我', '有', '和', '就', '不', '人', '都', '一', '一个', '上', '也', '很', '到',
'说', '要', '去', '你', '会', '着', '没有', '看', '好', '自己', '这', '那', '啊', '吧', '呀', '呢', '啦',
'唉', '哎', '哟', '嗯', '哦', '哈', '哈哈'])
    # Sentiment lexicons (extend as needed)
positive_words = set(
['好', '优秀', '棒', '满意', '喜欢', '舒服', '开心', '愉快', '快乐', '积极', '出色', '完美', '漂亮', '精彩',
'成功', '值得', '高效', '热情', '专业', '贴心'])
negative_words = set(
['坏', '差', '糟糕', '不满意', '讨厌', '难受', '伤心', '难过', '消极', '差劲', '垃圾', '烂', '失败', '失望',
'贵', '慢', '低效', '冷漠', '不负责任'])
return stopwords, positive_words, negative_words
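# Expected stopwords.txt format (an assumption consistent with the loader
# above): a plain UTF-8 file with one stopword per line, e.g.
#     的
#     了
#     这个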
def load_data(file_path):
    """Load the CSV data."""
    try:
        df = pd.read_csv(file_path)
        print(f"Data loaded successfully: {len(df)} comments")
        return df
    except FileNotFoundError:
        print(f"Error: file '{file_path}' not found")
        sys.exit(1)
def is_emotional_word(word, pos, positive_words, negative_words):
    """Check whether a word carries sentiment."""
    # Words in either sentiment lexicon always qualify
    if word in positive_words or word in negative_words:
        return True
    # Otherwise fall back on part of speech: keep adjectives (a*) and verbs
    # (v*), since verbs such as 喜欢/讨厌 often carry sentiment. This is a
    # deliberately coarse filter and will also keep some neutral verbs.
    if pos.startswith('a') or pos.startswith('v'):
        return True
    return False
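# Illustration (exact tags depend on jieba's model): pseg.lcut("很喜欢") usually
# yields pairs like ('很', 'd') and ('喜欢', 'v'), so '喜欢' passes the verb
# check above while the single-character adverb '很' does not.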
def clean_text(text):
    """Clean the text: strip URLs, mentions, emoji, etc.; always return a non-empty string."""
    if not isinstance(text, str):
        return ""
    # Strip URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text, flags=re.MULTILINE)
    # Strip @mentions and #hashtags
    text = re.sub(r'@\w+|#\w+', '', text)
    # Strip emoji
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (regional indicators)
                               "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)
    # Collapse whitespace and drop punctuation (\w is Unicode-aware in
    # Python 3, so Chinese characters are preserved)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s]', '', text)
    # Guarantee a non-empty return value ("无内容" = "no content" sentinel)
    cleaned_text = text.strip()
    return cleaned_text if cleaned_text else "无内容"
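# Illustration with a hypothetical input:
#   clean_text("这游戏真棒!😄 http://t.cn/abc @小明")
# drops the URL, the mention, the emoji, and the punctuation, leaving
# roughly "这游戏真棒".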
def analyze_sentiment(text, positive_words, negative_words):
    """Fine-grained sentiment analysis; returns (label, score)."""
    if not isinstance(text, str) or not text.strip():
        return "neutral", 0.5
    # Clean the text first
    cleaned_text = clean_text(text)
    # If nothing survives cleaning, treat the comment as neutral
    if not cleaned_text or cleaned_text == "无内容":
        return "neutral", 0.5
    # Score the text with SnowNLP (0 = most negative, 1 = most positive)
    try:
        s = SnowNLP(cleaned_text)
        sentiment_score = s.sentiments
    except Exception as e:
        print(f"Warning: sentiment analysis failed for '{cleaned_text}': {e}")
        return "neutral", 0.5
    # Adjust the score with the sentiment lexicons
    words = jieba.lcut(cleaned_text)
    positive_count = sum(1 for word in words if word in positive_words)
    negative_count = sum(1 for word in words if word in negative_words)
    # Simple heuristic: nudge the score toward whichever polarity has more
    # explicit lexicon hits
    if positive_count > negative_count:
        sentiment_score = min(1.0, sentiment_score + 0.1)
    elif negative_count > positive_count:
        sentiment_score = max(0.0, sentiment_score - 0.1)
    # Map the score onto a label
    if sentiment_score > 0.7:
        return "positive", sentiment_score
    elif sentiment_score < 0.3:
        return "negative", sentiment_score
    else:
        return "neutral", sentiment_score
def extract_keywords(text, stopwords, positive_words, negative_words):
    """Extract keywords that carry sentiment."""
    if not isinstance(text, str) or not text.strip():
        return []
    # Clean the text first
    cleaned_text = clean_text(text)
    # If nothing survives cleaning, return an empty list
    if not cleaned_text or cleaned_text == "无内容":
        return []
    # Segment with jieba and attach part-of-speech tags
    words = pseg.cut(cleaned_text)
    # Drop stopwords, single characters, and words that carry no sentiment
    filtered_words = []
    for word, flag in words:
        if word not in stopwords and len(word) > 1 and is_emotional_word(word, flag, positive_words, negative_words):
            filtered_words.append(word)
    return filtered_words
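# Illustration (hypothetical input): extract_keywords("剧情精彩但是差劲", ...)
# should keep sentiment-bearing words such as '精彩' and '差劲' while dropping
# the noun '剧情' and the conjunction '但是'.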
def generate_wordcloud(words, title, font_path=None):
    """Generate and display a word cloud."""
    if not words:
        print(f"Warning: no keywords for '{title}'; skipping word cloud")
        return
    # Join the keyword list into one space-separated string
    text = " ".join(words)
    # Build the word cloud
    wordcloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
        font_path=font_path,  # path to a Chinese-capable font file
        max_words=100,
        max_font_size=100,
        random_state=42
    ).generate(text)
    # Display it
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title)
    plt.show()
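# Optional: to save the image instead of (or as well as) displaying it, a
# WordCloud object supports to_file, e.g. wordcloud.to_file(f"{title}.png"),
# and the figure can be saved with plt.savefig(...) before plt.show().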
def main():
    # Input path; adjust as needed
    file_path = 'wukong_cleaned.csv'
    # Load stopwords and sentiment lexicons
    stopwords, positive_words, negative_words = load_resources()
    # Load the data
    df = load_data(file_path)
    # The comment column is assumed to be named '评论'; adjust to match your CSV
    comment_column = '评论'
    if comment_column not in df.columns:
        print(f"Error: no column named '{comment_column}'. Available columns: {list(df.columns)}")
        sys.exit(1)
    # Run the sentiment analysis and attach the results
    print("Running fine-grained sentiment analysis...")
    sentiment_results = df[comment_column].apply(lambda x: analyze_sentiment(x, positive_words, negative_words))
    df['sentiment'] = [result[0] for result in sentiment_results]
    df['sentiment_score'] = [result[1] for result in sentiment_results]
    # Summarize the sentiment distribution
    sentiment_counts = df['sentiment'].value_counts()
    print("\nSentiment distribution:")
    print(sentiment_counts)
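    # Optional sketch: visualize the distribution as a bar chart, e.g.
    #   sentiment_counts.plot(kind='bar', rot=0, title='Sentiment distribution')
    #   plt.show()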
    # Extract sentiment keywords
    print("\nExtracting sentiment keywords...")
    df['keywords'] = df[comment_column].apply(lambda x: extract_keywords(x, stopwords, positive_words, negative_words))
    # Collect the keywords per sentiment class
    positive_keywords = []
    negative_keywords = []
    neutral_keywords = []
    for index, row in df.iterrows():
        if row['sentiment'] == 'positive':
            positive_keywords.extend(row['keywords'])
        elif row['sentiment'] == 'negative':
            negative_keywords.extend(row['keywords'])
        else:
            neutral_keywords.extend(row['keywords'])
    # Count keyword frequencies
    positive_freq = Counter(positive_keywords)
    negative_freq = Counter(negative_keywords)
    neutral_freq = Counter(neutral_keywords)
    # Print the most common keywords
    print("\nTop 10 positive keywords:")
    for word, count in positive_freq.most_common(10):
        print(f"{word}: {count}")
    print("\nTop 10 negative keywords:")
    for word, count in negative_freq.most_common(10):
        print(f"{word}: {count}")
    # Save the results (utf-8-sig so Excel opens the Chinese text correctly)
    output_file = 'sentiment_analysis_result.csv'
    df.to_csv(output_file, index=False, encoding='utf-8-sig')
    print(f"\nSentiment analysis results saved to {output_file}")
    # Try the default Windows Chinese font for the word clouds
    font_path = r'C:\Windows\Fonts\simhei.ttf'
    if not os.path.exists(font_path):
        font_path = None
    # If no system font was found, point font_path at a downloaded font file
    if font_path is None:
        print("Warning: no Chinese system font found; the word clouds may not render Chinese.")
        print("Set font_path to a Chinese font file manually, e.g. font_path='C:/Windows/Fonts/simhei.ttf'")
    # Generate the word clouds
    print("\nGenerating word clouds...")
    generate_wordcloud(positive_keywords, "Positive sentiment word cloud", font_path)
    generate_wordcloud(negative_keywords, "Negative sentiment word cloud", font_path)
    generate_wordcloud(neutral_keywords, "Neutral sentiment word cloud", font_path)
if __name__ == "__main__":
main()