import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import jieba
from wordcloud import WordCloud
import matplotlib
import os
import sys
import logging
from pathlib import Path
import matplotlib.font_manager as fm
from collections import Counter
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import traceback
# Configure root logging: INFO and above, timestamped records, written to
# stdout instead of the default stderr stream.
logging.basicConfig(
    handlers=[logging.StreamHandler(sys.stdout)],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Module-level logger used by every function in this file.
logger = logging.getLogger(__name__)
# Directory that receives every generated image. It is resolved against the
# current working directory and created at import time when missing.
OUTPUT_DIR = Path('../output')
if not OUTPUT_DIR.is_dir():
    # mkdir() still raises FileExistsError if the path exists but is a file.
    OUTPUT_DIR.mkdir()
# Configure a CJK-capable font for matplotlib so Chinese titles and labels
# are rendered with real glyphs.
#
# ``font_path`` is a module-level name that generate_wordcloud() also reads,
# so it is initialised before the try block and exists even if setup fails.
font_path = None
try:
    # Candidate font files, in order of preference.
    font_paths = [
        '/usr/share/fonts/truetype/wqy/wqy-microhei.ttc',
        '/usr/share/fonts/truetype/wqy/wqy-zenhei.ttc',
        '/usr/share/fonts/truetype/droid/DroidSansFallbackFull.ttf',
    ]
    font_path = next((path for path in font_paths if os.path.exists(path)), None)
    if font_path:
        # Register the file that was actually found and select it by its own
        # family name. Hard-coding 'WenQuanYi Micro Hei' is wrong whenever one
        # of the other candidates is the one present on disk.
        fm.fontManager.addfont(font_path)
        font_name = fm.FontProperties(fname=font_path).get_name()
        # Put the CJK font first but keep the existing fallbacks behind it.
        fallback_fonts = [
            name for name in matplotlib.rcParams['font.sans-serif']
            if name != font_name
        ]
        matplotlib.rcParams['font.family'] = 'sans-serif'
        matplotlib.rcParams['font.sans-serif'] = [font_name, *fallback_fonts]
        # Draw the minus sign as ASCII '-' rather than U+2212.
        matplotlib.rcParams['axes.unicode_minus'] = False
        logger.info(f"使用字体: {font_path}")
    else:
        logger.warning("未找到中文字体,将使用默认字体")
except Exception as e:
    # Best effort: a font problem must not stop the analysis.
    logger.warning(f"设置中文字体时出错: {e}")
    logger.warning(traceback.format_exc())
def clean_text(text):
    """Normalise a raw review comment.

    Characters are kept only if they are alphanumeric (``str.isalnum``),
    whitespace, or inside U+4E00..U+9FFF; everything else is dropped. Runs of
    whitespace are then collapsed to one space and the ends are trimmed.

    Args:
        text: Raw cell value. Anything ``pd.isna`` reports as missing is
            treated as an empty comment; other values are passed to ``str``.

    Returns:
        The cleaned string, or ``""`` for missing or punctuation-only input.
    """
    if pd.isna(text):
        return ""
    kept = ''.join(
        char for char in str(text)
        if char.isalnum() or char.isspace() or '\u4e00' <= char <= '\u9fff'
    )
    # Collapse whitespace after filtering. Doing it before left double spaces
    # where punctuation sat between words and turned punctuation-only comments
    # into " ", which then counted as non-empty.
    return ' '.join(kept.split())
def plot_daily_reviews(df):
    """Plot the number of reviews per day and save it as ``daily_reviews.png``.

    Rows are grouped by the '发布时间(2024年)' column and counted; the counts
    are drawn as a line chart saved into ``OUTPUT_DIR``. Failures are logged
    and swallowed so the remaining charts can still be produced.

    Args:
        df: DataFrame containing a '发布时间(2024年)' column.
    """
    fig = None
    try:
        logger.info("开始生成每日评价人数曲线图...")
        daily_reviews = df.groupby('发布时间(2024年)').size()
        fig = plt.figure(figsize=(12, 6))
        daily_reviews.plot(kind='line')
        plt.title('每日评价人数')
        plt.xlabel('日期')
        plt.ylabel('评价人数')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig(OUTPUT_DIR / 'daily_reviews.png')
        logger.info("成功生成每日评价人数曲线图")
    except Exception as e:
        logger.error(f"生成每日评价人数曲线图失败: {e}")
        logger.error(traceback.format_exc())
    finally:
        # Close on the failure path too; otherwise the figure stays
        # registered with pyplot and leaks.
        if fig is not None:
            plt.close(fig)
def plot_recommendation_pie(df, date=None):
    """Draw the recommend / not-recommend split as ``recommendation_pie.png``.

    Failures are logged and swallowed so the remaining charts can still be
    produced.

    Args:
        df: DataFrame with a '是否推荐' column (and '发布时间(2024年)' when
            ``date`` is given).
        date: Optional value of '发布时间(2024年)' to restrict the chart to.
            A falsy value means "all rows".
    """
    fig = None
    try:
        logger.info("开始生成推荐/不推荐饼图...")
        if date:
            df = df[df['发布时间(2024年)'] == date]
        recommendation_counts = df['是否推荐'].value_counts()
        if recommendation_counts.empty:
            logger.warning("没有可用于生成推荐/不推荐饼图的数据")
            return
        fig = plt.figure(figsize=(8, 8))
        # value_counts() orders by frequency, so take the labels from its
        # index. A fixed ['推荐', '不推荐'] list mislabels the slices when
        # '不推荐' is more frequent and breaks when only one value occurs.
        plt.pie(
            recommendation_counts.to_numpy(),
            labels=[str(label) for label in recommendation_counts.index],
            autopct='%1.1f%%',
        )
        plt.title('推荐/不推荐比例')
        plt.savefig(OUTPUT_DIR / 'recommendation_pie.png')
        logger.info("成功生成推荐/不推荐饼图")
    except Exception as e:
        logger.error(f"生成推荐/不推荐饼图失败: {e}")
        logger.error(traceback.format_exc())
    finally:
        # Close on the failure path too so the figure does not leak.
        if fig is not None:
            plt.close(fig)
def analyze_playtime_recommendation(df):
    """Box-plot play time per recommendation value.

    Draws '大圣游戏时长' grouped by '是否推荐' and saves the chart as
    ``playtime_recommendation.png`` in ``OUTPUT_DIR``. Failures are logged
    and swallowed so the remaining outputs can still be produced.

    Args:
        df: DataFrame with '是否推荐' and '大圣游戏时长' columns.
    """
    fig = None
    try:
        logger.info("开始生成游戏时长与推荐关系图...")
        fig = plt.figure(figsize=(10, 6))
        sns.boxplot(x='是否推荐', y='大圣游戏时长', data=df)
        plt.title('游戏时长与推荐关系')
        plt.xlabel('是否推荐')
        plt.ylabel('游戏时长(小时)')
        plt.savefig(OUTPUT_DIR / 'playtime_recommendation.png')
        logger.info("成功生成游戏时长与推荐关系图")
    except Exception as e:
        logger.error(f"生成游戏时长与推荐关系图失败: {e}")
        logger.error(traceback.format_exc())
    finally:
        # Close on the failure path too so the figure does not leak.
        if fig is not None:
            plt.close(fig)
def _measure_text(draw, word, font):
    """Return the (width, height) that *word* occupies when drawn with *font*."""
    if hasattr(draw, 'textbbox'):
        # ImageDraw.textsize() was removed in Pillow 10. textbbox() is
        # measured from the drawing origin, so its right/bottom edges are the
        # extent to reserve for text drawn at that origin.
        _, _, right, bottom = draw.textbbox((0, 0), word, font=font)
        return right, bottom
    # Older Pillow releases that predate textbbox().
    return draw.textsize(word, font=font)


def generate_wordcloud(df, is_recommended):
    """Render the most frequent words of one review group as a PNG.

    Comments whose '是否推荐' equals ``is_recommended`` are cleaned with
    ``clean_text``, tokenised with jieba and counted. The 50 most frequent
    tokens longer than one character are drawn left to right, top to bottom,
    onto an 800x400 canvas with a font size between 8 and 40 proportional to
    their frequency. Failures are logged and swallowed.

    Args:
        df: DataFrame with '是否推荐' and '评论' columns.
        is_recommended: Value of '是否推荐' to select. The strings '推荐' and
            '不推荐' (as passed by ``main``) and plain booleans are accepted.
    """
    # Both '推荐' and '不推荐' are truthy strings, so a plain truth test always
    # chose the "recommended" wording and file name, and the second call
    # overwrote the first image. Derive the flag explicitly instead.
    recommended = bool(is_recommended) and is_recommended != '不推荐'
    label = '推荐' if recommended else '不推荐'
    try:
        logger.info(f"开始生成{label}评论词云图...")
        # Select and clean the comments of the requested group.
        cleaned = df[df['是否推荐'] == is_recommended]['评论'].apply(clean_text)
        logger.info(f"{label}评论数量: {len(cleaned)}")
        comments = [comment for comment in cleaned if comment.strip()]
        if not comments:
            logger.warning(f"没有足够的评论来生成{label}评论的词云图")
            return

        # Tokenise everything at once and count tokens longer than one char.
        words = jieba.cut(' '.join(comments))
        word_freq = Counter(word for word in words if len(word.strip()) > 1)
        if not word_freq:
            logger.warning(f"分词后没有足够的词来生成{label}评论的词云图")
            return
        logger.info(f"分词后词数: {len(word_freq)}")

        img = Image.new('RGBA', (800, 400), color='white')
        draw = ImageDraw.Draw(img)

        top_words = word_freq.most_common(50)
        max_freq = top_words[0][1]  # highest count, used to scale font sizes

        # The font file is chosen by the module-level font setup; look it up
        # defensively so a missing name just means "no CJK font".
        font_file = globals().get('font_path')
        if not font_file:
            logger.warning("未找到中文字体,将使用默认字体")
        fonts = {}  # font size -> loaded font, so each size is loaded once

        # Layout state: cursor position, drawable area, current row height.
        x, y = 20, 20
        max_width = 760
        max_height = 360
        line_height = 0

        for word, freq in top_words:
            # Scale the font size into the 8..40 range by relative frequency.
            font_size = int(8 + (freq / max_freq) * 32)
            font = fonts.get(font_size)
            if font is None:
                if font_file:
                    try:
                        font = ImageFont.truetype(font_file, font_size)
                    except Exception as e:
                        logger.warning(f"加载字体失败,使用默认字体: {e}")
                        # Do not retry (and warn again) for every other size.
                        font_file = None
                if font is None:
                    font = ImageFont.load_default()
                fonts[font_size] = font

            try:
                width, height = _measure_text(draw, word, font)
            except Exception as e:
                logger.warning(f"获取文本大小失败: {e}")
                continue

            # Wrap to the next row when the word does not fit horizontally.
            if x + width > max_width:
                x = 20
                y += line_height + 5
                line_height = 0
            # Stop once the canvas is full vertically.
            if y + height > max_height:
                break
            line_height = max(line_height, height)

            try:
                draw.text((x, y), word, font=font, fill='black')
                x += width + 10  # gap between words
            except Exception as e:
                logger.warning(f"绘制文本失败: {e}")
                continue

        output_path = OUTPUT_DIR / f'wordcloud_{"recommended" if recommended else "not_recommended"}.png'
        img.save(output_path)
        logger.info(f"成功生成{label}评论词云图: {output_path}")
    except Exception as e:
        logger.error(f"生成词云图失败: {e}")
        logger.error(traceback.format_exc())
def load_data(file_path):
    """Read the review CSV at *file_path* into a DataFrame.

    The reason is logged and the process exits with status 1 when the file
    does not exist or when reading it fails.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    try:
        logger.info(f"开始加载数据文件: {file_path}")
        if os.path.exists(file_path):
            frame = pd.read_csv(file_path)
            logger.info(f"成功加载数据,共 {len(frame)} 行")
            return frame
        logger.error(f"数据文件不存在: {file_path}")
        # SystemExit is not an Exception subclass, so the handler below does
        # not intercept this exit.
        sys.exit(1)
    except Exception as err:
        logger.error(f"加载数据失败: {err}")
        logger.error(traceback.format_exc())
        sys.exit(1)
def main(file_path='../csv/wukong.csv'):
    """Run the full analysis: load, clean, report counts, and write all charts.

    Any unexpected error is logged with its traceback and the process exits
    with status 1.

    Args:
        file_path: CSV file to analyse. Defaults to the relative path the
            script has always used, so ``main()`` behaves as before.
    """
    try:
        logger.info("开始执行分析...")
        df = load_data(file_path)
        # Clean the comment column once, up front.
        df['评论'] = df['评论'].apply(clean_text)

        # Summary statistics.
        recommendation = df['是否推荐']
        logger.info(f"总评论数: {len(df)}")
        logger.info(f"推荐评论数: {int((recommendation == '推荐').sum())}")
        logger.info(f"不推荐评论数: {int((recommendation == '不推荐').sum())}")
        logger.info(f"非空评论数: {int((df['评论'].str.len() > 0).sum())}")

        # A few sample comments, useful when debugging the cleaning step.
        logger.info('评论样本:')
        logger.info(df['评论'].head())

        # Charts.
        plot_daily_reviews(df)
        plot_recommendation_pie(df)
        analyze_playtime_recommendation(df)

        # One word cloud per recommendation value.
        logger.info('开始生成词云图...')
        generate_wordcloud(df, '推荐')
        generate_wordcloud(df, '不推荐')
        logger.info('所有分析完成')
    except Exception as e:
        logger.error(f'程序执行失败: {e}')
        logger.error(traceback.format_exc())
        sys.exit(1)
# Run the analysis only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()