azure011328

导航

 

import os
import re
import sys
from collections import Counter

import jieba
import jieba.posseg as pseg
import matplotlib.pyplot as plt
import pandas as pd
from snownlp import SnowNLP
from wordcloud import WordCloud

# Register CJK-capable fallback fonts so matplotlib (and the word-cloud figures
# rendered through it) can display Chinese characters instead of empty boxes.
plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]


# 加载自定义词典和停用词
def load_resources():
    """Load the stopword list and two small hand-maintained sentiment lexicons.

    Returns:
        Tuple ``(stopwords, positive_words, negative_words)`` — three sets of
        strings.
    """
    # A custom jieba user dictionary could be registered here:
    # jieba.load_userdict('user_dict.txt')

    # Prefer an on-disk stopword file; otherwise fall back to a built-in list
    # of common Chinese function words and interjections.
    stopwords_path = 'stopwords.txt'
    if os.path.exists(stopwords_path):
        with open(stopwords_path, 'r', encoding='utf-8') as f:
            stopwords = {line.strip() for line in f}
    else:
        stopwords = set(
            '的 了 在 是 我 有 和 就 不 人 都 一 一个 上 也 很 到 说 要 去 你 会 着 '
            '没有 看 好 自己 这 那 啊 吧 呀 呢 啦 唉 哎 哟 嗯 哦 哈 哈哈'.split())

    # Compact sentiment lexicons; extend as needed for the target domain.
    positive_words = set(
        '好 优秀 棒 满意 喜欢 舒服 开心 愉快 快乐 积极 出色 完美 漂亮 精彩 '
        '成功 值得 高效 热情 专业 贴心'.split())
    negative_words = set(
        '坏 差 糟糕 不满意 讨厌 难受 伤心 难过 消极 差劲 垃圾 烂 失败 失望 '
        '贵 慢 低效 冷漠 不负责任'.split())

    return stopwords, positive_words, negative_words


def load_data(file_path):
    """Read the comment CSV at *file_path* into a DataFrame.

    Prints a summary on success. When the file does not exist, prints an
    error and terminates the process with status 1 (this matches the
    script's original command-line behavior); other read errors propagate.
    """
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"错误:找不到文件 '{file_path}'")
        # sys.exit(1), not the builtin exit(): exit() is injected by the
        # `site` module and is missing under `python -S` or in frozen apps.
        sys.exit(1)
    print(f"成功加载数据,共{len(df)}条评论")
    return df


def is_emotional_word(word, pos, positive_words, negative_words):
    """Return True when *word* carries sentiment: either it appears in one of
    the lexicons, or its jieba POS tag marks it as an adjective ('a*') or a
    verb ('v*', e.g. "喜欢"/"讨厌")."""
    in_lexicon = word in positive_words or word in negative_words
    return in_lexicon or pos.startswith(('a', 'v'))


def clean_text(text):
    """Normalize a raw comment: strip URLs, @mentions/#hashtags, common emoji
    blocks and punctuation, and collapse whitespace.

    Non-string input yields ""; a string that becomes empty after cleaning
    yields the placeholder "无内容", so callers always get a usable string.
    """
    if not isinstance(text, str):
        return ""

    # URLs, then @mentions and #hashtags.
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'@\w+|#\w+', '', text)

    # Common emoji code-point blocks (faces, symbols, transport, flags).
    emoji_ranges = (
        u"\U0001F600-\U0001F64F"
        u"\U0001F300-\U0001F5FF"
        u"\U0001F680-\U0001F6FF"
        u"\U0001F1E0-\U0001F1FF"
    )
    text = re.sub("[%s]+" % emoji_ranges, "", text, flags=re.UNICODE)

    # Collapse runs of whitespace, then drop remaining punctuation
    # (\w is Unicode-aware, so CJK characters survive).
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s]', '', text)

    stripped = text.strip()
    return stripped or "无内容"


def analyze_sentiment(text, positive_words, negative_words):
    """Score *text* with SnowNLP, nudge the score with the lexicons, and
    classify it.

    Returns:
        ``(label, score)`` where label is one of "积极"/"消极"/"中性" and
        score is in [0, 1]. Unusable input maps to ("中性", 0.5).
    """
    neutral = ("中性", 0.5)
    if not isinstance(text, str) or not text.strip():
        return neutral

    cleaned = clean_text(text)
    if not cleaned or cleaned == "无内容":
        return neutral

    # Base polarity from SnowNLP; fall back to neutral on analyzer failure.
    try:
        score = SnowNLP(cleaned).sentiments
    except Exception as e:
        print(f"警告: 情感分析失败 - 文本: '{cleaned}', 错误: {e}")
        return neutral

    # Count explicit lexicon hits and nudge the score (+/- 0.1, clamped)
    # toward whichever polarity dominates.
    tokens = jieba.lcut(cleaned)
    pos_hits = sum(w in positive_words for w in tokens)
    neg_hits = sum(w in negative_words for w in tokens)
    if pos_hits > neg_hits:
        score = min(1.0, score + 0.1)
    elif neg_hits > pos_hits:
        score = max(0.0, score - 0.1)

    # Thresholds: >0.7 positive, <0.3 negative, otherwise neutral.
    if score > 0.7:
        label = "积极"
    elif score < 0.3:
        label = "消极"
    else:
        label = "中性"
    return label, score


def extract_keywords(text, stopwords, positive_words, negative_words):
    """Return the emotionally meaningful tokens of *text*.

    Tokenizes the cleaned text with jieba's POS tagger and keeps only
    multi-character words that are not stopwords and pass
    ``is_emotional_word``. Unusable input yields an empty list.
    """
    if not isinstance(text, str) or not text.strip():
        return []

    cleaned = clean_text(text)
    if not cleaned or cleaned == "无内容":
        return []

    # Single characters (len <= 1) are dropped — they are rarely informative.
    return [
        word
        for word, flag in pseg.cut(cleaned)
        if word not in stopwords
        and len(word) > 1
        and is_emotional_word(word, flag, positive_words, negative_words)
    ]


def generate_wordcloud(words, title, font_path=None):
    """Render and display a word cloud built from the *words* list.

    Args:
        words: Keywords to visualize; an empty list is skipped with a warning.
        title: Figure title.
        font_path: Path to a CJK-capable font file; None uses the default
            (which may not render Chinese correctly).
    """
    if not words:
        print(f"警告:{title}的词云数据为空,无法生成词云")
        return

    cloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
        font_path=font_path,  # CJK font needed for Chinese glyphs
        max_words=100,
        max_font_size=100,
        random_state=42,      # fixed seed -> reproducible layout
    ).generate(" ".join(words))

    plt.figure(figsize=(10, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title)
    plt.show()


def main():
    """End-to-end pipeline: load comments, score sentiment per comment,
    extract sentiment keywords, save results to CSV and show one word cloud
    per sentiment class."""
    # Input CSV path; adjust as needed.
    file_path = 'wukong_cleaned.csv'

    stopwords, positive_words, negative_words = load_resources()
    df = load_data(file_path)

    # Column holding the raw comment text; change to match the dataset.
    comment_column = '评论'
    if comment_column not in df.columns:
        print(f"错误:数据集中没有名为'{comment_column}'的列。可用列名:{list(df.columns)}")
        sys.exit(1)  # sys.exit, not the site-injected builtin exit()

    # Per-comment sentiment label and score.
    print("正在进行精细化情感分析...")
    sentiment_results = df[comment_column].apply(
        lambda x: analyze_sentiment(x, positive_words, negative_words))
    df['sentiment'] = [label for label, _ in sentiment_results]
    df['sentiment_score'] = [score for _, score in sentiment_results]

    sentiment_counts = df['sentiment'].value_counts()
    print("\n情感分布统计:")
    print(sentiment_counts)

    # Per-comment emotional keywords.
    print("\n正在提取情感关键词...")
    df['keywords'] = df[comment_column].apply(
        lambda x: extract_keywords(x, stopwords, positive_words, negative_words))

    # Bucket keywords by predicted sentiment (zip over the two columns is
    # much cheaper than DataFrame.iterrows).
    positive_keywords, negative_keywords, neutral_keywords = [], [], []
    buckets = {'积极': positive_keywords, '消极': negative_keywords}
    for sentiment, keywords in zip(df['sentiment'], df['keywords']):
        buckets.get(sentiment, neutral_keywords).extend(keywords)

    positive_freq = Counter(positive_keywords)
    negative_freq = Counter(negative_keywords)

    print("\n积极情感最常见的10个关键词:")
    for word, count in positive_freq.most_common(10):
        print(f"{word}: {count}次")

    print("\n消极情感最常见的10个关键词:")
    for word, count in negative_freq.most_common(10):
        print(f"{word}: {count}次")

    # utf-8-sig keeps the BOM so Excel opens the Chinese text correctly.
    output_file = 'sentiment_analysis_result.csv'
    df.to_csv(output_file, index=False, encoding='utf-8-sig')
    print(f"\n情感分析结果已保存至 {output_file}")

    # Locate a CJK-capable font for the word clouds (Windows default path);
    # os.path.exists never raises, so no try/except is needed here.
    font_path = r'C:\Windows\Fonts\simhei.ttf'
    if not os.path.exists(font_path):
        font_path = None
        print("警告: 未找到系统中文字体,词云可能无法正确显示中文。")
        print("请手动设置font_path指向中文字体文件,例如: font_path='C:/Windows/Fonts/simhei.ttf'")

    print("\n正在生成词云图...")
    generate_wordcloud(positive_keywords, "积极情感词云", font_path)
    generate_wordcloud(negative_keywords, "消极情感词云", font_path)
    generate_wordcloud(neutral_keywords, "中性情感词云", font_path)


# Run the full pipeline only when executed as a script (not on import).
if __name__ == "__main__":
    main()

posted on 2025-06-11 09:52  淮竹i  阅读(13)  评论(0)    收藏  举报