2023 级课前测试试卷-电子商务大数据分析
测试要求:
1、数据采集:
要求Python 编写程序爬取京东手机评论数据,生成Json形式的数据文件。
2、开发MR程序清洗电商评论数据(数据清洗):
数据清洗是对数据进行重新审查和校验的过程,目的在于删除重复信息、纠正存在的错误,并提供数据一致性。
要求使用MapReduce,对大量的Json文件,进行清洗,以得到结构化的文本文件。
3、数据加载:生成Hive用户评论数据:
(1)在HIVE中创建自己的数据库;
(2)并将手机评论数据导入用户评价表中。
4、数据统计:生成Hive用户评论数据:
使用Hive对以下指标进行统计并可视化演示:
(1)用户评论周期(收到货后,一般多久进行评论)
(2)会员级别统计(判断购买此商品的用户级别)
(3)每天评论量(大体能反映出下单时间)
(4)自定义UDF,功能为:去掉评论时间的时分秒,只保留年月日
并进行可视化展示
5、利用Sqoop进行数据迁移至Mysql数据库:
要求生成mysql用户评论表。
6、中文分词实现用户评价分析。
(1)实现用户评价信息中的中文分词及词频统计;
(2)在 hive 中新建词频统计表并加载分词数据;
要求实现:
(1)实现用户评价信息中的中文分词;
(2)实现中文分词后的词频统计;
(3)在 hive 中新建词频统计表加载分词数据;
首先爬取评论
import time
import json
from datetime import datetime
from DrissionPage import ChromiumPage

# Launch the browser and open the product page.
browser = ChromiumPage()
browser.get('https://item.jd.com/100071383534.html')
time.sleep(3)  # let the page settle

# Start intercepting the packets that carry the comment data.
browser.listen.start('client.action')

# Expand the full comment list; a missing button is not fatal.
try:
    browser.ele('css:.all-btn .arrow').click()
    time.sleep(2)
except Exception as e:
    print(f"无法点击加载全部评论按钮,尝试继续...: {e}")

collected = []   # every raw commentInfo dict captured so far
fail_streak = 0  # consecutive failures; three in a row aborts the run

for page in range(1, 30):
    print(f'正在采集第{page}页的数据')

    # Wait for the next intercepted packet (15-second timeout).
    try:
        packet = browser.listen.wait(timeout=15)
    except Exception as e:
        print(f"等待第{page}页数据包时出错: {e}")
        fail_streak += 1
        if fail_streak >= 3:
            print("连续3次等待数据包出错,结束采集")
            break
        continue
    if packet is None:
        print(f"第{page}页等待数据包超时")
        fail_streak += 1
        if fail_streak >= 3:
            print("连续3次等待数据包超时,结束采集")
            break
        continue

    # Pull the JSON body out of the intercepted response.
    try:
        payload = packet.response.body
    except Exception as e:
        print(f"获取第{page}页响应数据时出错: {e}")
        fail_streak += 1
        if fail_streak >= 3:
            print("连续3次获取响应数据出错,结束采集")
            break
        continue

    # Extract the comment entries from the payload.
    # NOTE(review): floors[2] is where this page keeps the comment list —
    # confirm if JD changes the response layout.
    try:
        entries = payload['result']['floors'][2]['data']
        for entry in entries:
            if 'commentInfo' in entry:
                # Store the raw commentInfo untouched; cleaning happens later.
                collected.append(entry['commentInfo'])
                print(f"采集到评论: {entry['commentInfo']['userNickName']}")
        fail_streak = 0  # a successfully parsed page resets the streak
    except Exception as e:
        print(f"解析第{page}页数据时出错: {e}")
        fail_streak += 1
        if fail_streak >= 3:
            print("连续3次解析数据失败,结束采集")
            break
        continue

    # Scroll the review container to the bottom to trigger the next page.
    try:
        container = browser.ele('css:div._rateListContainer_1ygkr_45')
        container.scroll.to_bottom()
        time.sleep(2)  # wait for lazy loading
    except Exception as e:
        print(f"滚动加载失败: {e}")
        # A failed scroll is not a parse failure: don't touch the streak.
        continue

# Persist the raw data as JSON Lines (one object per line) so downstream
# MapReduce jobs can split the file by line.
output_filename = f'jd_comments_raw_{datetime.now().strftime("%Y%m%d_%H%M%S")}.json'
try:
    with open(output_filename, 'w', encoding='utf-8') as f:
        for item in collected:
            json.dump(item, f, ensure_ascii=False)
            f.write('\n')
    print(f"原始数据已保存到 {output_filename},共采集 {len(collected)} 条评论")
except Exception as e:
    print(f"保存文件时出错: {e}")

# Shut the browser down.
browser.quit()
第一遍清洗
import json
import csv
import os
from datetime import datetime
from typing import Dict, List, Any
import re
def clean_jd_comments_data(input_file: str = None, output_dir: str = "cleaned_data"):
    """
    Clean raw JD comment data (JSON Lines) into structured JSON and CSV files.

    Args:
        input_file: path to the raw file; when None the newest
            jd_comments_raw_*.json in the current directory is used.
        output_dir: directory the cleaned files are written to.

    Returns:
        (cleaned_comments, json_path, csv_path) on success, or
        (None, None, None) when no input could be read — a 3-tuple in every
        case, so callers can always unpack the result safely.
    """
    # 1. Locate the input file.
    if input_file is None:
        json_files = [f for f in os.listdir()
                      if f.startswith('jd_comments_raw_') and f.endswith('.json')]
        if not json_files:
            print("未找到原始数据文件,请确保文件在当前目录")
            return None, None, None
        # Timestamped names sort chronologically, so the lexicographic max is newest.
        json_files.sort(reverse=True)
        input_file = json_files[0]
    print(f"正在处理文件: {input_file}")

    # 2. Read the raw data (one JSON object per line).
    all_comments = []
    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            for line in f:
                if line.strip():  # skip blank lines
                    all_comments.append(json.loads(line.strip()))
    except FileNotFoundError:
        print(f"文件不存在: {input_file}")
        return None, None, None
    except json.JSONDecodeError as e:
        print(f"JSON解析错误: {e}")
        return None, None, None
    print(f"成功读取 {len(all_comments)} 条原始评论")

    # 3. Build the cleaned records.
    cleaned_comments = []
    for idx, comment in enumerate(all_comments):
        try:
            # Search once and reuse the match (the original ran re.search twice).
            buy_match = re.search(r'\d+', comment.get("buyCountText", "0"))
            cleaned = {
                # user info
                "user_nickname": comment.get("userNickName", ""),
                "user_img_url": comment.get("userImgURL", ""),
                "officer_level": int(comment.get("officerLevel", 0)),  # member level
                # comment info
                "comment_id": comment.get("commentId", ""),
                "comment_date": comment.get("commentDate", ""),
                "comment_content": comment.get("commentData", ""),
                "tag_comment_content": comment.get("tagCommentContent", ""),  # tagged variant
                # score
                "comment_score": int(comment.get("commentScore", 0)),
                "comment_score_text": comment.get("commentScoreText", ""),
                # product specification
                "product_specifications": comment.get("productSpecifications", ""),
                "ware_attribute": [],  # product attribute dicts, filled below
                # images
                "has_images": False,
                "image_count": 0,
                "image_urls": [],        # thumbnail URLs
                "large_image_urls": [],  # full-size URLs
                # interaction counters
                "praise_count": int(comment.get("praiseCnt", 0)),
                "reply_count": int(comment.get("replyCnt", 0)),
                "buy_count": int(buy_match[0]) if buy_match else 0,  # digits from buyCountText
                # flags
                "is_repurchase": "该店铺购买" in comment.get("repurchaseInfo", ""),  # repeat buyer
                "is_plus_member": "plusUrl" in comment,  # PLUS membership marker
                "product_id": comment.get("productId", "")
            }

            # Copy product attributes, keeping only well-formed dict entries.
            ware_attr = comment.get("wareAttribute", [])
            if isinstance(ware_attr, list):
                for attr in ware_attr:
                    if isinstance(attr, dict):
                        cleaned["ware_attribute"].append(attr)

            # Collect image URLs.
            picture_info = comment.get("pictureInfoList", [])
            if picture_info and isinstance(picture_info, list):
                cleaned["has_images"] = True
                cleaned["image_count"] = len(picture_info)
                for pic in picture_info:
                    if isinstance(pic, dict):
                        if pic.get("picURL"):
                            cleaned["image_urls"].append(pic["picURL"])
                        if pic.get("largePicURL"):
                            cleaned["large_image_urls"].append(pic["largePicURL"])

            # Content length in characters (falls back to the tagged content).
            content = cleaned["comment_content"] or cleaned["tag_comment_content"]
            cleaned["content_length"] = len(str(content))

            # Parse the date into a datetime for later analysis; None on failure.
            try:
                date_str = cleaned["comment_date"]
                cleaned["comment_datetime"] = (
                    datetime.strptime(date_str, "%Y-%m-%d %H:%M:%S") if date_str else None
                )
            except (ValueError, TypeError):  # narrowed from a bare except
                cleaned["comment_datetime"] = None

            cleaned_comments.append(cleaned)
        except Exception as e:
            # One malformed record must not abort the whole run.
            print(f"处理第 {idx + 1} 条评论时出错: {e}")
            continue
    print(f"成功清洗 {len(cleaned_comments)} 条评论数据")

    # 4. Prepare the output directory and a shared timestamp.
    os.makedirs(output_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # 5. Save as JSON (default=str serialises the datetime objects).
    json_output = os.path.join(output_dir, f"jd_comments_cleaned_{timestamp}.json")
    with open(json_output, 'w', encoding='utf-8') as f:
        json.dump(cleaned_comments, f, ensure_ascii=False, indent=2, default=str)
    print(f"已保存JSON格式数据: {json_output}")

    # 6. Save as CSV with nested fields flattened for spreadsheet use.
    csv_output = os.path.join(output_dir, f"jd_comments_cleaned_{timestamp}.csv")
    if cleaned_comments:
        csv_rows = []
        for comment in cleaned_comments:
            csv_rows.append({
                "user_nickname": comment["user_nickname"],
                "officer_level": comment["officer_level"],
                "comment_id": comment["comment_id"],
                "comment_date": comment["comment_date"],
                # Truncate very long reviews so cells stay manageable.
                "comment_content": (comment["comment_content"] or comment["tag_comment_content"])[:200],
                "comment_score": comment["comment_score"],
                "comment_score_text": comment["comment_score_text"],
                "product_specifications": comment["product_specifications"],
                "ware_attribute": json.dumps(comment["ware_attribute"], ensure_ascii=False),
                "has_images": comment["has_images"],
                "image_count": comment["image_count"],
                "image_urls": "; ".join(comment["image_urls"]) if comment["image_urls"] else "",
                "praise_count": comment["praise_count"],
                "reply_count": comment["reply_count"],
                "buy_count": comment["buy_count"],
                "is_repurchase": comment["is_repurchase"],
                "is_plus_member": comment["is_plus_member"],
                "content_length": comment["content_length"],
                "product_id": comment["product_id"]
            })
        # utf-8-sig adds a BOM so Excel detects the encoding.
        with open(csv_output, 'w', encoding='utf-8-sig', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=csv_rows[0].keys())
            writer.writeheader()
            writer.writerows(csv_rows)
        print(f"已保存CSV格式数据: {csv_output}")

    # 7. Emit a plain-text summary report alongside the data files.
    summary_report = os.path.join(output_dir, f"data_summary_{timestamp}.txt")
    generate_summary_report(cleaned_comments, summary_report, input_file)
    return cleaned_comments, json_output, csv_output
def generate_summary_report(comments: List[Dict], output_file: str, input_file: str):
    """Write a plain-text summary report for the cleaned comments.

    Covers score distribution, image/repurchase counts, member-level
    distribution, content-length stats and praise/reply totals. Does
    nothing when *comments* is empty.
    """
    if not comments:
        return

    total_comments = len(comments)

    # Frequency tables built in one pass each.
    score_dist = {}
    level_dist = {}
    for c in comments:
        s = c.get("comment_score", 0)
        score_dist[s] = score_dist.get(s, 0) + 1
        lv = c.get("officer_level", 0)
        level_dist[lv] = level_dist.get(lv, 0) + 1

    with_images = sum(1 for c in comments if c.get("has_images"))
    repurchase_users = sum(1 for c in comments if c.get("is_repurchase"))

    # Content-length statistics (guaranteed non-empty here).
    content_lengths = [c.get("content_length", 0) for c in comments]
    avg_length = sum(content_lengths) / len(content_lengths)
    max_length = max(content_lengths)
    min_length = min(content_lengths)

    # Praise / reply totals.
    total_praises = sum(c.get("praise_count", 0) for c in comments)
    total_replies = sum(c.get("reply_count", 0) for c in comments)

    # Assemble the whole report, then write it out in one go.
    sep = "=" * 50
    lines = [
        sep + "\n",
        "京东评论数据清洗报告\n",
        sep + "\n\n",
        f"原始数据文件: {input_file}\n",
        f"清洗时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n",
        f"处理记录数: {total_comments}\n\n",
        "1. 评分分布:\n",
    ]
    for score in sorted(score_dist):
        count = score_dist[score]
        percentage = (count / total_comments) * 100
        lines.append(f" {score}星: {count}条 ({percentage:.1f}%)\n")
    lines.append(f"\n2. 带图评论: {with_images}条 ({(with_images / total_comments * 100):.1f}%)\n")
    lines.append(f"3. 复购用户: {repurchase_users}条 ({(repurchase_users / total_comments * 100):.1f}%)\n")
    lines.append("\n4. 用户等级分布:\n")
    for level in sorted(level_dist):
        lines.append(f" 等级{level}: {level_dist[level]}条\n")
    lines.append(f"\n5. 评论内容长度统计:\n")
    lines.append(f" 平均长度: {avg_length:.0f}字符\n")
    lines.append(f" 最大长度: {max_length}字符\n")
    lines.append(f" 最小长度: {min_length}字符\n")
    lines.append(f"\n6. 点赞/回复统计:\n")
    lines.append(f" 总点赞数: {total_praises}\n")
    lines.append(f" 总回复数: {total_replies}\n")
    lines.append(f" 平均每条点赞: {total_praises / total_comments:.1f}\n")
    lines.append(f" 平均每条回复: {total_replies / total_comments:.1f}\n")

    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(lines)
    print(f"已生成数据摘要报告: {output_file}")
def find_image_comments(comments: List[Dict], min_images: int = 1):
    """Return slimmed-down views of comments carrying at least *min_images* pictures.

    Each result keeps only user, date, truncated content (100 chars),
    image count and thumbnail URLs.
    """
    def _slim(c):
        # Project just the fields useful for a quick image-comment listing.
        return {
            "user": c.get("user_nickname"),
            "date": c.get("comment_date"),
            "content": c.get("comment_content", "")[:100],
            "image_count": c.get("image_count"),
            "image_urls": c.get("image_urls", []),
        }

    return [
        _slim(c)
        for c in comments
        if c.get("has_images") and c.get("image_count", 0) >= min_images
    ]
def filter_by_score(comments: List[Dict], min_score: int = 4):
    """Keep only comments whose score is at least *min_score* (missing score counts as 0)."""
    kept = []
    for c in comments:
        if c.get("comment_score", 0) >= min_score:
            kept.append(c)
    return kept
# Script entry point.
if __name__ == "__main__":
    # Method 1: automatically process the newest raw file.
    result = clean_jd_comments_data()
    # Method 2: process a specific file.
    # result = clean_jd_comments_data("jd_comments_raw_20250101_120000.json")

    # Fix: the cleaner returns None/empty results when no input file could be
    # read; the original unconditionally unpacked the return value and crashed
    # with a TypeError in that case.
    cleaned_data = json_file = csv_file = None
    if result:
        cleaned_data, json_file, csv_file = result

    if cleaned_data:
        print(f"\n数据清洗完成!")
        print(f"JSON文件: {json_file}")
        print(f"CSV文件: {csv_file}")
        # Fix: the report lives NEXT TO the CSV as data_summary_<ts>.txt; the
        # original replaced 'cleaned_data' in the path and printed a wrong
        # directory ('data_summary/...') that is never created.
        summary_path = csv_file.replace('jd_comments_cleaned_', 'data_summary_').replace('.csv', '.txt')
        print(f"摘要报告: {summary_path}")

        # Example: comments carrying at least two images.
        image_comments = find_image_comments(cleaned_data, min_images=2)
        print(f"\n找到 {len(image_comments)} 条带有2张以上图片的评论")

        # Example: highly-rated comments.
        high_score_comments = filter_by_score(cleaned_data, min_score=4)
        print(
            f"找到 {len(high_score_comments)} 条4星及以上评分评论 ({(len(high_score_comments) / len(cleaned_data) * 100):.1f}%)")

        # Persist the image-comment selection for later inspection.
        if image_comments:
            with open("image_comments.json", 'w', encoding='utf-8') as f:
                json.dump(image_comments, f, ensure_ascii=False, indent=2)
            print("已保存带图片评论到: image_comments.json")
到hive生成Hive用户评论数据
点击查看代码
hive> -- 创建用户评论表
hive> CREATE TABLE IF NOT EXISTS user_reviews (
> user_nickname STRING,
> officer_level INT,
> comment_id STRING,
> comment_date STRING,
> comment_content STRING,
> comment_score INT,
> comment_score_text STRING,
> product_specifications STRING,
> ware_attribute STRING,
> has_images BOOLEAN,
> image_count INT,
> image_urls STRING,
> praise_count INT,
> reply_count INT,
> buy_count INT,
> is_repurchase BOOLEAN,
> is_plus_member BOOLEAN,
> content_length INT,
> product_id STRING
> )
> ROW FORMAT DELIMITED
> FIELDS TERMINATED BY ','
> STORED AS TEXTFILE
> TBLPROPERTIES ("skip.header.line.count"="1");
点击查看代码
hive> DROP TABLE IF EXISTS review_cycle_analysis;
OK
Time taken: 0.127 seconds
hive> CREATE TABLE review_cycle_analysis (
> cycle_period STRING,
> review_count INT
> );
OK
Time taken: 0.102 seconds
hive> INSERT INTO TABLE review_cycle_analysis
> SELECT
> CASE
> WHEN hour(from_unixtime(unix_timestamp(comment_date, 'yyyy-MM-dd HH:mm:ss'))) BETWEEN 0 AND 6 THEN '凌晨(0-6点)'
> WHEN hour(from_unixtime(unix_timestamp(comment_date, 'yyyy-MM-dd HH:mm:ss'))) BETWEEN 7 AND 12 THEN '上午(7-12点)'
> WHEN hour(from_unixtime(unix_timestamp(comment_date, 'yyyy-MM-dd HH:mm:ss'))) BETWEEN 13 AND 18 THEN '下午(13-18点)'
> ELSE '晚上(19-23点)'
> END as cycle_period,
> COUNT(*) as review_count
> FROM user_reviews
> GROUP BY
> CASE
> WHEN hour(from_unixtime(unix_timestamp(comment_date, 'yyyy-MM-dd HH:mm:ss'))) BETWEEN 0 AND 6 THEN '凌晨(0-6点)'
> WHEN hour(from_unixtime(unix_timestamp(comment_date, 'yyyy-MM-dd HH:mm:ss'))) BETWEEN 7 AND 12 THEN '上午(7-12点)'
> WHEN hour(from_unixtime(unix_timestamp(comment_date, 'yyyy-MM-dd HH:mm:ss'))) BETWEEN 13 AND 18 THEN '下午(13-18点)'
> ELSE '晚上(19-23点)'
> END;
点击查看代码
hive> CREATE TABLE member_level_stats (
> officer_level INT,
> user_count INT,
> percentage DOUBLE
> );
hive> INSERT INTO TABLE member_level_stats
> SELECT
> officer_level,
> COUNT(*) as user_count,
> ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) FROM user_reviews), 2) as percentage
> FROM user_reviews
> GROUP BY officer_level
> ORDER BY officer_level;
点击查看代码
hive> CREATE TABLE daily_review_stats (
> review_date STRING,
> review_count INT
> );
hive> INSERT INTO TABLE daily_review_stats
> SELECT
> substr(comment_date, 1, 10) as review_date,
> COUNT(*) as review_count
> FROM user_reviews
> GROUP BY substr(comment_date, 1, 10)
> ORDER BY review_date;
在mysql建表,用sqoop迁移导入mysql

python远程连接mysql可视化数据
# local_analysis.py
# Pull the Hive-derived statistics tables out of MySQL on the VM and render
# a 2x2 dashboard (pie / bar / line / text summary) saved as a PNG.
import mysql.connector
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib

# Make CJK labels render and keep the minus sign readable.
matplotlib.rcParams['font.sans-serif'] = ['SimHei', 'Arial Unicode MS', 'DejaVu Sans']
matplotlib.rcParams['axes.unicode_minus'] = False

# Connect to the VM's MySQL instance.
conn = None
try:
    conn = mysql.connector.connect(
        host="192.168.60.10",  # replace with your VM's IP
        user="root",
        password="root",  # your MySQL password
        database="phone_reviews"
    )
    print("✅ 成功连接到MySQL")

    # 1. Review-cycle distribution.
    print("\n📊 1. 评论周期分析:")
    df_cycle = pd.read_sql("SELECT * FROM review_cycle_analysis", conn)
    print(df_cycle)

    # 2. Member-level statistics.
    print("\n👥 2. 会员级别统计:")
    df_member = pd.read_sql("SELECT * FROM member_level_stats ORDER BY officer_level", conn)
    print(df_member)

    # 3. Daily review counts (first 10 rows).
    # NOTE(review): comparing review_date to the string 'NULL' assumes Sqoop
    # exported SQL NULLs as that literal — verify against the exported data.
    print("\n📅 3. 每日评论统计(前10条):")
    df_daily = pd.read_sql("""
        SELECT *
        FROM daily_review_stats
        WHERE review_date != 'NULL'
        ORDER BY review_date
        LIMIT 10
    """, conn)
    print(df_daily)

    # 4. Build the 2x2 dashboard.
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # 4.1 Pie chart — review time-of-day distribution.
    axes[0, 0].pie(df_cycle['review_count'], labels=df_cycle['cycle_period'],
                   autopct='%1.1f%%', startangle=90)
    axes[0, 0].set_title('用户评论时间分布', fontsize=12, fontweight='bold')
    axes[0, 0].axis('equal')

    # 4.2 Bar chart — member-level distribution.
    bars = axes[0, 1].bar(df_member['officer_level'].astype(str),
                          df_member['user_count'],
                          color='skyblue', edgecolor='black')
    axes[0, 1].set_title('会员级别分布', fontsize=12, fontweight='bold')
    axes[0, 1].set_xlabel('会员级别', fontsize=10)
    axes[0, 1].set_ylabel('用户数量', fontsize=10)
    axes[0, 1].grid(axis='y', alpha=0.3)
    # Annotate each bar with its percentage.
    for bar, pct in zip(bars, df_member['percentage']):
        height = bar.get_height()
        axes[0, 1].text(bar.get_x() + bar.get_width() / 2., height + 2,
                        f'{pct}%', ha='center', va='bottom', fontsize=9)

    # 4.3 Line chart — daily review trend.
    axes[1, 0].plot(df_daily['review_date'], df_daily['review_count'],
                    'o-', linewidth=2, markersize=6, color='green')
    axes[1, 0].set_title('每日评论数量趋势', fontsize=12, fontweight='bold')
    axes[1, 0].set_xlabel('日期', fontsize=10)
    axes[1, 0].set_ylabel('评论数量', fontsize=10)
    axes[1, 0].tick_params(axis='x', rotation=45)
    axes[1, 0].grid(True, alpha=0.3)

    # 4.4 Text panel — headline numbers.
    total_comments = df_cycle['review_count'].sum()
    total_users = df_member['user_count'].sum()
    summary_text = f"""数据汇总:
• 总评论数: {total_comments}
• 总用户数: {total_users}
• 分析时间段数: {len(df_cycle)}
• 会员级别数: {len(df_member)}
• 记录天数: {len(df_daily)}天"""
    axes[1, 1].text(0.1, 0.5, summary_text, fontsize=11,
                    verticalalignment='center', linespacing=1.5)
    axes[1, 1].set_title('数据统计摘要', fontsize=12, fontweight='bold')
    axes[1, 1].axis('off')

    plt.suptitle('京东手机评论数据分析报告', fontsize=16, fontweight='bold', y=1.02)
    plt.tight_layout()

    # Save, then display the figure.
    plt.savefig('phone_reviews_analysis.png', dpi=300, bbox_inches='tight')
    print(f"\n✅ 图表已保存为: phone_reviews_analysis.png")
    plt.show()
except mysql.connector.Error as err:
    print(f"❌ 连接错误: {err}")
    print("\n💡 请检查:")
    print("1. 虚拟机IP地址是否正确")
    print("2. MySQL服务是否运行 (systemctl status mysqld)")
    print("3. 防火墙是否开放3306端口")
    print("4. 用户名密码是否正确")
except Exception as e:
    print(f"❌ 错误: {e}")
finally:
    # Fix: the original closed the connection only on the success path,
    # leaking it whenever a query or plotting step raised.
    if conn is not None:
        conn.close()

浙公网安备 33010602011771号