azure011328

导航

 

import pandas as pd
import jieba
import jieba.posseg as pseg
from snownlp import SnowNLP
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import re
from collections import Counter, defaultdict
import os

# Configure Matplotlib with CJK-capable fallback fonts so that word-cloud
# figures and plot titles can render Chinese text correctly.
plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]


# Load the stopword list and sentiment lexicons.
def load_resources():
    """Load stopwords and the positive/negative sentiment lexicons.

    Returns:
        tuple[set[str], set[str], set[str]]: (stopwords, positive_words,
        negative_words).

    Reads 'stopwords.txt' (UTF-8, one word per line) from the working
    directory when present; otherwise falls back to a built-in Chinese
    stopword list.
    """
    # A custom jieba user dictionary could be loaded here if one exists.
    # jieba.load_userdict('user_dict.txt')

    stopwords_path = 'stopwords.txt'
    if os.path.exists(stopwords_path):
        with open(stopwords_path, 'r', encoding='utf-8') as f:
            # Fix: skip blank lines so the empty string never enters the set
            # (a trailing newline previously added '' to the stopwords).
            stopwords = {line.strip() for line in f if line.strip()}
    else:
        # Fallback default stopword list.
        stopwords = set(
            ['的', '了', '在', '是', '我', '有', '和', '就', '不', '人', '都', '一', '一个', '上', '也', '很', '到',
             '说', '要', '去', '你', '会', '着', '没有', '看', '好', '自己', '这', '那', '啊', '吧', '呀', '呢', '啦',
             '唉', '哎', '哟', '嗯', '哦', '哈', '哈哈'])

    # Sentiment lexicons (extend as needed).
    positive_words = set(
        ['好', '优秀', '棒', '满意', '喜欢', '舒服', '开心', '愉快', '快乐', '积极', '出色', '完美', '漂亮', '精彩',
         '成功', '值得', '高效', '热情', '专业', '贴心', '方便', '实用', '优质', '美观', '可爱', '好玩'])
    negative_words = set(
        ['坏', '差', '糟糕', '不满意', '讨厌', '难受', '伤心', '难过', '消极', '差劲', '垃圾', '烂', '失败', '失望',
         '贵', '慢', '低效', '冷漠', '不负责任', '难用', '麻烦', '复杂', '破损', '劣质', '不好玩'])

    return stopwords, positive_words, negative_words


def load_data(file_path):
    """Read the comment CSV at *file_path* into a DataFrame.

    Prints a summary on success; exits the process with status 1 when the
    file does not exist.
    """
    try:
        frame = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"错误:找不到文件 '{file_path}'")
        exit(1)
    print(f"成功加载数据,共{len(frame)}条评论")
    return frame


def is_emotional_word(word, pos, positive_words, negative_words):
    """Return True when *word* carries sentiment.

    A word qualifies if it appears in either sentiment lexicon, or if its
    part-of-speech tag marks it as an adjective ('a*') or a verb ('v*').
    """
    # Lexicon hit wins outright.
    if word in positive_words:
        return True
    if word in negative_words:
        return True
    # POS-based fallback: adjectives and (some) verbs tend to be emotional.
    return pos.startswith(('a', 'v'))


def clean_text(text):
    """Normalize a raw comment string.

    Strips URLs, @mentions, #hashtags, emoji and punctuation, and collapses
    runs of whitespace.  Non-string input yields ""; a string that ends up
    empty yields the placeholder "无内容".
    """
    if not isinstance(text, str):
        return ""

    # Strip URLs first so their punctuation never survives to later passes.
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Drop @mentions and #hashtags.
    text = re.sub(r'@\w+|#\w+', '', text)

    # Remove common emoji code-point blocks:
    # emoticons, symbols & pictographs, transport & map, regional indicators.
    emoji_ranges = ("[\U0001F600-\U0001F64F"
                    "\U0001F300-\U0001F5FF"
                    "\U0001F680-\U0001F6FF"
                    "\U0001F1E0-\U0001F1FF]+")
    text = re.sub(emoji_ranges, '', text)

    # Collapse whitespace, then drop everything that is not word/space.
    text = re.sub(r'[^\w\s]', '', re.sub(r'\s+', ' ', text))

    stripped = text.strip()
    return stripped or "无内容"


def analyze_sentiment(text, positive_words, negative_words):
    """Classify *text* as 积极/消极/中性 with a score in [0, 1].

    Combines SnowNLP's sentiment score with a lexicon-based nudge: when the
    text contains clearly more positive (negative) lexicon words, the score
    is shifted up (down) by 0.1, clamped to [0, 1].  Blank or non-string
    input is treated as neutral with score 0.5.
    """
    neutral = ("中性", 0.5)

    if not isinstance(text, str) or not text.strip():
        return neutral

    cleaned_text = clean_text(text)
    if not cleaned_text or cleaned_text == "无内容":
        return neutral

    # Base score from SnowNLP; fall back to neutral on any failure.
    try:
        score = SnowNLP(cleaned_text).sentiments
    except Exception as e:
        print(f"警告: 情感分析失败 - 文本: '{cleaned_text}', 错误: {e}")
        return neutral

    # Count lexicon hits among the tokens.
    tokens = jieba.lcut(cleaned_text)
    pos_hits = sum(1 for t in tokens if t in positive_words)
    neg_hits = sum(1 for t in tokens if t in negative_words)

    # Nudge the score toward the dominant lexicon polarity.
    if pos_hits > neg_hits:
        score = min(1.0, score + 0.1)
    elif neg_hits > pos_hits:
        score = max(0.0, score - 0.1)

    # Thresholded classification.
    if score > 0.7:
        return "积极", score
    if score < 0.3:
        return "消极", score
    return "中性", score


def extract_keywords(text, stopwords, positive_words, negative_words):
    """Extract sentiment-bearing keywords from *text*.

    Returns a list of tokens that are longer than one character, are not
    stopwords, and pass the is_emotional_word check (lexicon hit or
    adjective/verb POS tag).  Blank or non-string input yields [].
    """
    if not isinstance(text, str) or not text.strip():
        return []

    cleaned = clean_text(text)
    if not cleaned or cleaned == "无内容":
        return []

    # POS-tag the cleaned text and keep only multi-character emotional words.
    return [
        token for token, tag in pseg.cut(cleaned)
        if token not in stopwords
        and len(token) > 1
        and is_emotional_word(token, tag, positive_words, negative_words)
    ]


def generate_wordcloud(words, title, font_path=None):
    """Render and display a word cloud built from *words*.

    Skips rendering (with a warning) when *words* is empty.  *font_path*
    should point to a font file containing CJK glyphs so Chinese words
    display correctly.
    """
    if not words:
        print(f"警告:{title}的词云数据为空,无法生成词云")
        return

    # Build the cloud from the space-joined keyword list.
    cloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
        font_path=font_path,  # CJK-capable font file
        max_words=100,
        max_font_size=100,
        random_state=42,
    ).generate(" ".join(words))

    # Show the rendered image without axes.
    plt.figure(figsize=(10, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title)
    plt.show()


def build_keyword_mapping(df, comment_column):
    """Map each extracted keyword to its source comments.

    Returns a defaultdict(list) keyed by keyword; each value is a list of
    (sentiment, comment_text, row_index) tuples, preserving row order.
    """
    mapping = defaultdict(list)
    for idx, row in df.iterrows():
        # One shared entry per row; appended once per keyword of that row.
        entry = (row['sentiment'], row[comment_column], idx)
        for kw in row['keywords']:
            mapping[kw].append(entry)
    return mapping


def display_keyword_comments(keyword, keyword_to_comments, limit=5):
    """Print up to *limit* source comments associated with *keyword*.

    *keyword_to_comments* maps keyword -> [(sentiment, comment, index), ...].
    Prints a notice when the keyword is unknown, and a trailer when more
    than *limit* comments exist.
    """
    if keyword not in keyword_to_comments:
        print(f"关键词 '{keyword}' 没有找到对应的评论")
        return

    matches = keyword_to_comments[keyword]
    total = len(matches)
    print(f"\n关键词 '{keyword}' 对应的评论 ({total}条,显示前{limit}条):")

    for rank, (sentiment, comment, idx) in enumerate(matches[:limit], start=1):
        print(f"{rank}. [#{idx}][{sentiment}] {comment}")

    hidden = total - limit
    if hidden > 0:
        print(f"... 还有 {hidden} 条评论未显示")


def display_comments_by_sentiment(df, comment_column, sentiment_type, page_size=10):
    """Interactively page through comments of one sentiment class.

    Filters *df* by df['sentiment'] == *sentiment_type* and prints pages of
    *page_size* rows; the user can press Enter for the next page, type a
    page number to jump, or 'q' to return.
    """
    sentiment_df = df[df['sentiment'] == sentiment_type]
    total = len(sentiment_df)

    if total == 0:
        print(f"没有找到{sentiment_type}评论")
        return

    print(f"\n共有 {total} 条{sentiment_type}评论")

    # Ceiling division for the page count.
    total_pages = (total + page_size - 1) // page_size
    current_page = 1

    while True:
        start_idx = (current_page - 1) * page_size
        end_idx = min(start_idx + page_size, total)

        print(f"\n=== 第 {current_page}/{total_pages} 页 ===")
        # Each row: running number, original index, sentiment, score,
        # first 80 characters of the comment, then its keywords.
        for i, (_, row) in enumerate(sentiment_df.iloc[start_idx:end_idx].iterrows(), 1):
            print(
                f"{start_idx + i}. [#{row.name}][{row['sentiment']}][{row['sentiment_score']:.2f}] {row[comment_column][:80]}...")
            print(f" 关键词: {', '.join(row['keywords'])}")

        if current_page < total_pages:
            next_choice = input("\n按 Enter 继续下一页,输入 'q' 返回,输入数字跳转到指定页: ").strip().lower()
            if next_choice == 'q':
                break
            elif next_choice.isdigit():
                # Jump to a specific page if it is within range; otherwise
                # warn and redisplay the current page.
                page_num = int(next_choice)
                if 1 <= page_num <= total_pages:
                    current_page = page_num
                else:
                    print(f"无效页码,范围是 1-{total_pages}")
            else:
                # Plain Enter (or any other input) advances one page.
                current_page += 1
        else:
            input("\n已显示全部内容,按 Enter 返回...")
            break


def display_comment_details(df, comment_column):
    """Interactively show one comment's sentiment, score, text and keywords.

    Repeatedly prompts for a positional row number (0-based, via df.iloc)
    until the user types 'q'.  Invalid numbers print an error and re-prompt.
    """
    print("\n===== 评论详情查看 =====")
    while True:
        try:
            command = input("请输入评论编号(例如 '10')查看详情,或输入 'q' 退出:").strip()

            if command.lower() == 'q':
                break

            # Parse the comment number; int() raises ValueError on junk,
            # which is handled below.
            index = int(command)
            if 0 <= index < len(df):
                row = df.iloc[index]
                print(f"\n评论 #{index}:")
                print(f"情感分类: {row['sentiment']} ({row['sentiment_score']:.2f})")
                print(f"评论内容: {row[comment_column]}")
                print(f"关键词: {', '.join(row['keywords'])}")
            else:
                print(f"错误:评论编号 {index} 超出范围(有效范围: 0-{len(df) - 1})")
        except ValueError:
            print("错误:请输入有效的评论编号或命令")


def main():
    """Script entry point: run the full sentiment-analysis pipeline.

    Loads the comment CSV, scores each comment's sentiment, extracts
    sentiment keywords, saves the annotated result CSV, renders word
    clouds per sentiment class, and finally enters an interactive menu.
    """
    # Input file path; adjust as needed.
    file_path = 'wukong_cleaned.csv'

    # Load stopwords and sentiment lexicons.
    stopwords, positive_words, negative_words = load_resources()

    # Load the dataset (exits on missing file).
    df = load_data(file_path)

    # Show available columns so the user can verify the comment column name.
    print(f"数据集中的列名:{list(df.columns)}")

    # Column holding the comment text; change to match the actual dataset.
    comment_column = '评论'  # <-- change to your actual column name
    if comment_column not in df.columns:
        print(f"错误:数据集中没有名为'{comment_column}'的列。可用列名:{list(df.columns)}")
        exit(1)

    # Per-comment sentiment classification; results are unpacked into two
    # new columns ('sentiment' label, 'sentiment_score' float).
    print("正在进行精细化情感分析...")
    sentiment_results = df[comment_column].apply(lambda x: analyze_sentiment(x, positive_words, negative_words))
    df['sentiment'] = [result[0] for result in sentiment_results]
    df['sentiment_score'] = [result[1] for result in sentiment_results]

    # Sentiment distribution summary.
    sentiment_counts = df['sentiment'].value_counts()
    print("\n情感分布统计:")
    print(sentiment_counts)

    # Keyword extraction per comment (list stored in a 'keywords' column).
    print("\n正在提取情感关键词...")
    df['keywords'] = df[comment_column].apply(lambda x: extract_keywords(x, stopwords, positive_words, negative_words))

    # Reverse index: keyword -> list of (sentiment, comment, row index).
    print("\n正在构建关键词到评论的映射...")
    keyword_to_comments = build_keyword_mapping(df, comment_column)

    # Collect keywords per sentiment class for frequency stats / clouds.
    positive_keywords = []
    negative_keywords = []
    neutral_keywords = []

    for index, row in df.iterrows():
        if row['sentiment'] == '积极':
            positive_keywords.extend(row['keywords'])
        elif row['sentiment'] == '消极':
            negative_keywords.extend(row['keywords'])
        else:
            neutral_keywords.extend(row['keywords'])

    # Keyword frequencies.  NOTE(review): neutral_freq is computed but
    # never used below (only the positive/negative top-10s are printed).
    positive_freq = Counter(positive_keywords)
    negative_freq = Counter(negative_keywords)
    neutral_freq = Counter(neutral_keywords)

    # Print the most common keywords per polarity.
    print("\n积极情感最常见的10个关键词:")
    for word, count in positive_freq.most_common(10):
        print(f"{word}: {count}次")

    print("\n消极情感最常见的10个关键词:")
    for word, count in negative_freq.most_common(10):
        print(f"{word}: {count}次")

    # Persist the annotated DataFrame (utf-8-sig so Excel opens it cleanly).
    output_file = 'sentiment_analysis_result.csv'
    df.to_csv(output_file, index_label='comment_id', encoding='utf-8-sig')
    print(f"\n情感分析结果已保存至 {output_file}")

    # Try the default Windows CJK font path.
    # NOTE(review): the bare `except` is overly broad — os.path.exists does
    # not normally raise here.
    font_path = None
    try:
        font_path = r'C:\Windows\Fonts\simhei.ttf'
        if not os.path.exists(font_path):
            font_path = None
    except:
        font_path = None

    # Without a CJK font, word clouds will render Chinese as boxes.
    if font_path is None:
        print("警告: 未找到系统中文字体,词云可能无法正确显示中文。")
        print("请手动设置font_path指向中文字体文件,例如: font_path='C:/Windows/Fonts/simhei.ttf'")

    # One word cloud per sentiment class.
    print("\n正在生成词云图...")
    generate_wordcloud(positive_keywords, "积极情感词云", font_path)
    generate_wordcloud(negative_keywords, "消极情感词云", font_path)
    generate_wordcloud(neutral_keywords, "中性情感词云", font_path)

    # Interactive browsing menu; loops until the user enters 'q'.
    while True:
        print("\n===== 交互式查看 =====")
        print("1. 按关键词查看评论")
        print("2. 查看积极评论")
        print("3. 查看消极评论")
        print("4. 查看中性评论")
        print("5. 按编号查看评论详情")
        print("q. 退出程序")

        choice = input("请选择操作 (1/2/3/4/5/q): ").strip().lower()

        if choice == '1':
            keyword = input("请输入要查看的关键词:").strip()
            display_keyword_comments(keyword, keyword_to_comments)
        elif choice == '2':
            display_comments_by_sentiment(df, comment_column, "积极")
        elif choice == '3':
            display_comments_by_sentiment(df, comment_column, "消极")
        elif choice == '4':
            display_comments_by_sentiment(df, comment_column, "中性")
        elif choice == '5':
            display_comment_details(df, comment_column)
        elif choice == 'q':
            break
        else:
            print("无效选择,请重新输入")


# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

posted on 2025-06-11 09:52  淮竹i  阅读(11)  评论(0)    收藏  举报