azure011328

导航

 

import pandas as pd
import jieba
import jieba.posseg as pseg
from snownlp import SnowNLP
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import re
from collections import Counter, defaultdict
import os

# Configure Matplotlib with CJK-capable fallback fonts so that word-cloud
# figures and plot titles can render Chinese text correctly.
plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]


# Load the stopword list and sentiment lexicons.
def load_resources():
    """Load stopwords and the positive/negative sentiment lexicons.

    Returns:
        tuple[set[str], set[str], set[str]]: (stopwords, positive_words,
        negative_words).

    Reads 'stopwords.txt' (UTF-8, one word per line) from the working
    directory when present; otherwise falls back to a built-in Chinese
    stopword list.
    """
    # A custom jieba user dictionary could be loaded here if one exists.
    # jieba.load_userdict('user_dict.txt')

    stopwords_path = 'stopwords.txt'
    if os.path.exists(stopwords_path):
        with open(stopwords_path, 'r', encoding='utf-8') as f:
            # Fix: skip blank lines so the empty string never enters the set
            # (a trailing newline previously added '' to the stopwords).
            stopwords = {line.strip() for line in f if line.strip()}
    else:
        # Fallback default stopword list.
        stopwords = set(
            ['的', '了', '在', '是', '我', '有', '和', '就', '不', '人', '都', '一', '一个', '上', '也', '很', '到',
             '说', '要', '去', '你', '会', '着', '没有', '看', '好', '自己', '这', '那', '啊', '吧', '呀', '呢', '啦',
             '唉', '哎', '哟', '嗯', '哦', '哈', '哈哈'])

    # Sentiment lexicons (extend as needed).
    positive_words = set(
        ['好', '优秀', '棒', '满意', '喜欢', '舒服', '开心', '愉快', '快乐', '积极', '出色', '完美', '漂亮', '精彩',
         '成功', '值得', '高效', '热情', '专业', '贴心', '方便', '实用', '优质', '美观', '可爱', '好玩'])
    negative_words = set(
        ['坏', '差', '糟糕', '不满意', '讨厌', '难受', '伤心', '难过', '消极', '差劲', '垃圾', '烂', '失败', '失望',
         '贵', '慢', '低效', '冷漠', '不负责任', '难用', '麻烦', '复杂', '破损', '劣质', '不好玩'])

    return stopwords, positive_words, negative_words


def load_data(file_path):
    """Read the comment CSV at *file_path* into a DataFrame.

    Prints a summary on success; exits the process with status 1 when the
    file does not exist.
    """
    try:
        frame = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"错误:找不到文件 '{file_path}'")
        exit(1)
    print(f"成功加载数据,共{len(frame)}条评论")
    return frame


def is_emotional_word(word, pos, positive_words, negative_words):
    """Return True when *word* carries sentiment.

    A word qualifies if it appears in either sentiment lexicon, or if its
    part-of-speech tag marks it as an adjective ('a*') or a verb ('v*').
    """
    # Lexicon hit wins outright.
    if word in positive_words:
        return True
    if word in negative_words:
        return True
    # POS-based fallback: adjectives and (some) verbs tend to be emotional.
    return pos.startswith(('a', 'v'))


def clean_text(text):
    """Normalize a raw comment string.

    Strips URLs, @mentions, #hashtags, emoji and punctuation, and collapses
    runs of whitespace.  Non-string input yields ""; a string that ends up
    empty yields the placeholder "无内容".
    """
    if not isinstance(text, str):
        return ""

    # Strip URLs first so their punctuation never survives to later passes.
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Drop @mentions and #hashtags.
    text = re.sub(r'@\w+|#\w+', '', text)

    # Remove common emoji code-point blocks:
    # emoticons, symbols & pictographs, transport & map, regional indicators.
    emoji_ranges = ("[\U0001F600-\U0001F64F"
                    "\U0001F300-\U0001F5FF"
                    "\U0001F680-\U0001F6FF"
                    "\U0001F1E0-\U0001F1FF]+")
    text = re.sub(emoji_ranges, '', text)

    # Collapse whitespace, then drop everything that is not word/space.
    text = re.sub(r'[^\w\s]', '', re.sub(r'\s+', ' ', text))

    stripped = text.strip()
    return stripped or "无内容"


def analyze_sentiment(text, positive_words, negative_words):
    """Classify *text* as 积极/消极/中性 with a score in [0, 1].

    Combines SnowNLP's sentiment score with a lexicon-based nudge: when the
    text contains clearly more positive (negative) lexicon words, the score
    is shifted up (down) by 0.1, clamped to [0, 1].  Blank or non-string
    input is treated as neutral with score 0.5.
    """
    neutral = ("中性", 0.5)

    if not isinstance(text, str) or not text.strip():
        return neutral

    cleaned_text = clean_text(text)
    if not cleaned_text or cleaned_text == "无内容":
        return neutral

    # Base score from SnowNLP; fall back to neutral on any failure.
    try:
        score = SnowNLP(cleaned_text).sentiments
    except Exception as e:
        print(f"警告: 情感分析失败 - 文本: '{cleaned_text}', 错误: {e}")
        return neutral

    # Count lexicon hits among the tokens.
    tokens = jieba.lcut(cleaned_text)
    pos_hits = sum(1 for t in tokens if t in positive_words)
    neg_hits = sum(1 for t in tokens if t in negative_words)

    # Nudge the score toward the dominant lexicon polarity.
    if pos_hits > neg_hits:
        score = min(1.0, score + 0.1)
    elif neg_hits > pos_hits:
        score = max(0.0, score - 0.1)

    # Thresholded classification.
    if score > 0.7:
        return "积极", score
    if score < 0.3:
        return "消极", score
    return "中性", score


def extract_keywords(text, stopwords, positive_words, negative_words):
    """Extract sentiment-bearing keywords from *text*.

    Returns a list of tokens that are longer than one character, are not
    stopwords, and pass the is_emotional_word check (lexicon hit or
    adjective/verb POS tag).  Blank or non-string input yields [].
    """
    if not isinstance(text, str) or not text.strip():
        return []

    cleaned = clean_text(text)
    if not cleaned or cleaned == "无内容":
        return []

    # POS-tag the cleaned text and keep only multi-character emotional words.
    return [
        token for token, tag in pseg.cut(cleaned)
        if token not in stopwords
        and len(token) > 1
        and is_emotional_word(token, tag, positive_words, negative_words)
    ]


def generate_wordcloud(words, title, font_path=None):
    """Render and display a word cloud built from *words*.

    Skips rendering (with a warning) when *words* is empty.  *font_path*
    should point to a font file containing CJK glyphs so Chinese words
    display correctly.
    """
    if not words:
        print(f"警告:{title}的词云数据为空,无法生成词云")
        return

    # Build the cloud from the space-joined keyword list.
    cloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
        font_path=font_path,  # CJK-capable font file
        max_words=100,
        max_font_size=100,
        random_state=42,
    ).generate(" ".join(words))

    # Show the rendered image without axes.
    plt.figure(figsize=(10, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title)
    plt.show()


def build_keyword_mapping(df, comment_column):
    """Map each extracted keyword to its source comments.

    Returns a defaultdict(list) keyed by keyword; each value is a list of
    (sentiment, comment_text, row_index) tuples, preserving row order.
    """
    mapping = defaultdict(list)
    for idx, row in df.iterrows():
        # One shared entry per row; appended once per keyword of that row.
        entry = (row['sentiment'], row[comment_column], idx)
        for kw in row['keywords']:
            mapping[kw].append(entry)
    return mapping


def display_keyword_comments(keyword, keyword_to_comments, limit=5):
    """Print up to *limit* source comments associated with *keyword*.

    *keyword_to_comments* maps keyword -> [(sentiment, comment, index), ...].
    Prints a notice when the keyword is unknown, and a trailer when more
    than *limit* comments exist.
    """
    if keyword not in keyword_to_comments:
        print(f"关键词 '{keyword}' 没有找到对应的评论")
        return

    matches = keyword_to_comments[keyword]
    total = len(matches)
    print(f"\n关键词 '{keyword}' 对应的评论 ({total}条,显示前{limit}条):")

    for rank, (sentiment, comment, idx) in enumerate(matches[:limit], start=1):
        print(f"{rank}. [#{idx}][{sentiment}] {comment}")

    hidden = total - limit
    if hidden > 0:
        print(f"... 还有 {hidden} 条评论未显示")


def display_comments_by_sentiment(df, comment_column, sentiment_type, page_size=10):
    """Interactively page through comments of one sentiment class.

    Filters *df* by df['sentiment'] == *sentiment_type* and prints pages of
    *page_size* rows; the user can press Enter for the next page, type a
    page number to jump, or 'q' to return.
    """
    sentiment_df = df[df['sentiment'] == sentiment_type]
    total = len(sentiment_df)

    if total == 0:
        print(f"没有找到{sentiment_type}评论")
        return

    print(f"\n共有 {total} 条{sentiment_type}评论")

    # Ceiling division for the page count.
    total_pages = (total + page_size - 1) // page_size
    current_page = 1

    while True:
        start_idx = (current_page - 1) * page_size
        end_idx = min(start_idx + page_size, total)

        print(f"\n=== 第 {current_page}/{total_pages} 页 ===")
        # Each row: running number, original index, sentiment, score,
        # first 80 characters of the comment, then its keywords.
        for i, (_, row) in enumerate(sentiment_df.iloc[start_idx:end_idx].iterrows(), 1):
            print(
                f"{start_idx + i}. [#{row.name}][{row['sentiment']}][{row['sentiment_score']:.2f}] {row[comment_column][:80]}...")
            print(f" 关键词: {', '.join(row['keywords'])}")

        if current_page < total_pages:
            next_choice = input("\n按 Enter 继续下一页,输入 'q' 返回,输入数字跳转到指定页: ").strip().lower()
            if next_choice == 'q':
                break
            elif next_choice.isdigit():
                # Jump to a specific page if it is within range; otherwise
                # warn and redisplay the current page.
                page_num = int(next_choice)
                if 1 <= page_num <= total_pages:
                    current_page = page_num
                else:
                    print(f"无效页码,范围是 1-{total_pages}")
            else:
                # Plain Enter (or any other input) advances one page.
                current_page += 1
        else:
            input("\n已显示全部内容,按 Enter 返回...")
            break


def display_comment_details(df, comment_column):
    """Interactively show one comment's sentiment, score, text and keywords.

    Repeatedly prompts for a positional row number (0-based, via df.iloc)
    until the user types 'q'.  Invalid numbers print an error and re-prompt.
    """
    print("\n===== 评论详情查看 =====")
    while True:
        try:
            command = input("请输入评论编号(例如 '10')查看详情,或输入 'q' 退出:").strip()

            if command.lower() == 'q':
                break

            # Parse the comment number; int() raises ValueError on junk,
            # which is handled below.
            index = int(command)
            if 0 <= index < len(df):
                row = df.iloc[index]
                print(f"\n评论 #{index}:")
                print(f"情感分类: {row['sentiment']} ({row['sentiment_score']:.2f})")
                print(f"评论内容: {row[comment_column]}")
                print(f"关键词: {', '.join(row['keywords'])}")
            else:
                print(f"错误:评论编号 {index} 超出范围(有效范围: 0-{len(df) - 1})")
        except ValueError:
            print("错误:请输入有效的评论编号或命令")


def main():
    """Script entry point: run the full sentiment-analysis pipeline.

    Loads the comment CSV, scores each comment's sentiment, extracts
    sentiment keywords, saves the annotated result CSV, renders word
    clouds per sentiment class, and finally enters an interactive menu.
    """
    # Input file path; adjust as needed.
    file_path = 'wukong_cleaned.csv'

    # Load stopwords and sentiment lexicons.
    stopwords, positive_words, negative_words = load_resources()

    # Load the dataset (exits on missing file).
    df = load_data(file_path)

    # Show available columns so the user can verify the comment column name.
    print(f"数据集中的列名:{list(df.columns)}")

    # Column holding the comment text; change to match the actual dataset.
    comment_column = '评论'  # <-- change to your actual column name
    if comment_column not in df.columns:
        print(f"错误:数据集中没有名为'{comment_column}'的列。可用列名:{list(df.columns)}")
        exit(1)

    # Per-comment sentiment classification; results are unpacked into two
    # new columns ('sentiment' label, 'sentiment_score' float).
    print("正在进行精细化情感分析...")
    sentiment_results = df[comment_column].apply(lambda x: analyze_sentiment(x, positive_words, negative_words))
    df['sentiment'] = [result[0] for result in sentiment_results]
    df['sentiment_score'] = [result[1] for result in sentiment_results]

    # Sentiment distribution summary.
    sentiment_counts = df['sentiment'].value_counts()
    print("\n情感分布统计:")
    print(sentiment_counts)

    # Keyword extraction per comment (list stored in a 'keywords' column).
    print("\n正在提取情感关键词...")
    df['keywords'] = df[comment_column].apply(lambda x: extract_keywords(x, stopwords, positive_words, negative_words))

    # Reverse index: keyword -> list of (sentiment, comment, row index).
    print("\n正在构建关键词到评论的映射...")
    keyword_to_comments = build_keyword_mapping(df, comment_column)

    # Collect keywords per sentiment class for frequency stats / clouds.
    positive_keywords = []
    negative_keywords = []
    neutral_keywords = []

    for index, row in df.iterrows():
        if row['sentiment'] == '积极':
            positive_keywords.extend(row['keywords'])
        elif row['sentiment'] == '消极':
            negative_keywords.extend(row['keywords'])
        else:
            neutral_keywords.extend(row['keywords'])

    # Keyword frequencies.  NOTE(review): neutral_freq is computed but
    # never used below (only the positive/negative top-10s are printed).
    positive_freq = Counter(positive_keywords)
    negative_freq = Counter(negative_keywords)
    neutral_freq = Counter(neutral_keywords)

    # Print the most common keywords per polarity.
    print("\n积极情感最常见的10个关键词:")
    for word, count in positive_freq.most_common(10):
        print(f"{word}: {count}次")

    print("\n消极情感最常见的10个关键词:")
    for word, count in negative_freq.most_common(10):
        print(f"{word}: {count}次")

    # Persist the annotated DataFrame (utf-8-sig so Excel opens it cleanly).
    output_file = 'sentiment_analysis_result.csv'
    df.to_csv(output_file, index_label='comment_id', encoding='utf-8-sig')
    print(f"\n情感分析结果已保存至 {output_file}")

    # Try the default Windows CJK font path.
    # NOTE(review): the bare `except` is overly broad — os.path.exists does
    # not normally raise here.
    font_path = None
    try:
        font_path = r'C:\Windows\Fonts\simhei.ttf'
        if not os.path.exists(font_path):
            font_path = None
    except:
        font_path = None

    # Without a CJK font, word clouds will render Chinese as boxes.
    if font_path is None:
        print("警告: 未找到系统中文字体,词云可能无法正确显示中文。")
        print("请手动设置font_path指向中文字体文件,例如: font_path='C:/Windows/Fonts/simhei.ttf'")

    # One word cloud per sentiment class.
    print("\n正在生成词云图...")
    generate_wordcloud(positive_keywords, "积极情感词云", font_path)
    generate_wordcloud(negative_keywords, "消极情感词云", font_path)
    generate_wordcloud(neutral_keywords, "中性情感词云", font_path)

    # Interactive browsing menu; loops until the user enters 'q'.
    while True:
        print("\n===== 交互式查看 =====")
        print("1. 按关键词查看评论")
        print("2. 查看积极评论")
        print("3. 查看消极评论")
        print("4. 查看中性评论")
        print("5. 按编号查看评论详情")
        print("q. 退出程序")

        choice = input("请选择操作 (1/2/3/4/5/q): ").strip().lower()

        if choice == '1':
            keyword = input("请输入要查看的关键词:").strip()
            display_keyword_comments(keyword, keyword_to_comments)
        elif choice == '2':
            display_comments_by_sentiment(df, comment_column, "积极")
        elif choice == '3':
            display_comments_by_sentiment(df, comment_column, "消极")
        elif choice == '4':
            display_comments_by_sentiment(df, comment_column, "中性")
        elif choice == '5':
            display_comment_details(df, comment_column)
        elif choice == 'q':
            break
        else:
            print("无效选择,请重新输入")


# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

posted on 2025-06-11 09:52  淮竹i  阅读(11)  评论(0)    收藏  举报