2023 级课前测试试卷-电子商务大数据分析
测试要求:
1、数据采集:
要求Python 编写程序爬取京东手机评论数据,生成Json形式的数据文件。
2、开发MR程序清洗电商评论数据(数据清洗):
数据清洗是对数据进行重新审查和校验的过程,目的在于删除重复信息、纠正存在的错误,并提供数据一致性。
要求使用MapReduce,对大量的Json文件,进行清洗,以得到结构化的文本文件。
3、数据加载:生成Hive用户评论数据:
(1)在HIVE中创建自己的数据库;
(2)并将手机评论数据导入用户评价表中。
4、数据统计:生成Hive用户评论数据:
使用Hive对以下指标进行统计并可视化演示:
(1)用户评论周期(收到货后,一般多久进行评论)
(2)会员级别统计(判断购买此商品的用户级别)
(3)每天评论量(大体能反映出下单时间)
(4)自定义UDF,功能为:去掉评论时间的时分秒,只保留年月日
并进行可视化展示
5、利用Sqoop进行数据迁移至Mysql数据库:
要求生成mysql用户评论表。
6、中文分词实现用户评价分析。
(1)实现用户评价信息中的中文分词及词频统计;
(2)在 hive 中新建词频统计表并加载分词数据;
要求实现:
(1)实现用户评价信息中的中文分词;
(2)实现中文分词后的词频统计;
(3)在 hive 中新建词频统计表加载分词数据;
首先爬取评论
import time
import json
from datetime import datetime
from DrissionPage import ChromiumPage

# Launch the browser and open the product page.
browser = ChromiumPage()
browser.get('https://item.jd.com/100071383534.html')
time.sleep(3)  # let the page settle

# Start intercepting the packets that carry the comment data.
browser.listen.start('client.action')

# Expand the full comment list; a missing button is not fatal.
try:
    browser.ele('css:.all-btn .arrow').click()
    time.sleep(2)
except Exception as e:
    print(f"无法点击加载全部评论按钮,尝试继续...: {e}")

collected = []   # every raw commentInfo dict captured so far
fail_streak = 0  # consecutive failures; three in a row aborts the run

for page in range(1, 30):
    print(f'正在采集第{page}页的数据')

    # Wait for the next intercepted packet (15-second timeout).
    try:
        packet = browser.listen.wait(timeout=15)
    except Exception as e:
        print(f"等待第{page}页数据包时出错: {e}")
        fail_streak += 1
        if fail_streak >= 3:
            print("连续3次等待数据包出错,结束采集")
            break
        continue
    if packet is None:
        print(f"第{page}页等待数据包超时")
        fail_streak += 1
        if fail_streak >= 3:
            print("连续3次等待数据包超时,结束采集")
            break
        continue

    # Pull the JSON body out of the intercepted response.
    try:
        payload = packet.response.body
    except Exception as e:
        print(f"获取第{page}页响应数据时出错: {e}")
        fail_streak += 1
        if fail_streak >= 3:
            print("连续3次获取响应数据出错,结束采集")
            break
        continue

    # Extract the comment entries from the payload.
    # NOTE(review): floors[2] is where this page keeps the comment list —
    # confirm if JD changes the response layout.
    try:
        entries = payload['result']['floors'][2]['data']
        for entry in entries:
            if 'commentInfo' in entry:
                # Store the raw commentInfo untouched; cleaning happens later.
                collected.append(entry['commentInfo'])
                print(f"采集到评论: {entry['commentInfo']['userNickName']}")
        fail_streak = 0  # a successfully parsed page resets the streak
    except Exception as e:
        print(f"解析第{page}页数据时出错: {e}")
        fail_streak += 1
        if fail_streak >= 3:
            print("连续3次解析数据失败,结束采集")
            break
        continue

    # Scroll the review container to the bottom to trigger the next page.
    try:
        container = browser.ele('css:div._rateListContainer_1ygkr_45')
        container.scroll.to_bottom()
        time.sleep(2)  # wait for lazy loading
    except Exception as e:
        print(f"滚动加载失败: {e}")
        # A failed scroll is not a parse failure: don't touch the streak.
        continue

# Persist the raw data as JSON Lines (one object per line) so downstream
# MapReduce jobs can split the file by line.
output_filename = f'jd_comments_raw_{datetime.now().strftime("%Y%m%d_%H%M%S")}.json'
try:
    with open(output_filename, 'w', encoding='utf-8') as f:
        for item in collected:
            json.dump(item, f, ensure_ascii=False)
            f.write('\n')
    print(f"原始数据已保存到 {output_filename},共采集 {len(collected)} 条评论")
except Exception as e:
    print(f"保存文件时出错: {e}")

# Shut the browser down.
browser.quit()
第一遍清洗
import json
import csv
import os
from datetime import datetime
from typing import Dict, List, Any
import re
def clean_jd_comments_data(input_file: str = None, output_dir: str = "cleaned_data"):
    """
    Clean raw JD comment data (JSON Lines) into structured JSON and CSV files.

    Args:
        input_file: path to the raw file; when None the newest
            jd_comments_raw_*.json in the current directory is used.
        output_dir: directory the cleaned files are written to.

    Returns:
        (cleaned_comments, json_path, csv_path) on success, or
        (None, None, None) when no input could be read — a 3-tuple in every
        case, so callers can always unpack the result safely.
    """
    # 1. Locate the input file.
    if input_file is None:
        json_files = [f for f in os.listdir()
                      if f.startswith('jd_comments_raw_') and f.endswith('.json')]
        if not json_files:
            print("未找到原始数据文件,请确保文件在当前目录")
            return None, None, None
        # Timestamped names sort chronologically, so the lexicographic max is newest.
        json_files.sort(reverse=True)
        input_file = json_files[0]
    print(f"正在处理文件: {input_file}")

    # 2. Read the raw data (one JSON object per line).
    all_comments = []
    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            for line in f:
                if line.strip():  # skip blank lines
                    all_comments.append(json.loads(line.strip()))
    except FileNotFoundError:
        print(f"文件不存在: {input_file}")
        return None, None, None
    except json.JSONDecodeError as e:
        print(f"JSON解析错误: {e}")
        return None, None, None
    print(f"成功读取 {len(all_comments)} 条原始评论")

    # 3. Build the cleaned records.
    cleaned_comments = []
    for idx, comment in enumerate(all_comments):
        try:
            # Search once and reuse the match (the original ran re.search twice).
            buy_match = re.search(r'\d+', comment.get("buyCountText", "0"))
            cleaned = {
                # user info
                "user_nickname": comment.get("userNickName", ""),
                "user_img_url": comment.get("userImgURL", ""),
                "officer_level": int(comment.get("officerLevel", 0)),  # member level
                # comment info
                "comment_id": comment.get("commentId", ""),
                "comment_date": comment.get("commentDate", ""),
                "comment_content": comment.get("commentData", ""),
                "tag_comment_content": comment.get("tagCommentContent", ""),  # tagged variant
                # score
                "comment_score": int(comment.get("commentScore", 0)),
                "comment_score_text": comment.get("commentScoreText", ""),
                # product specification
                "product_specifications": comment.get("productSpecifications", ""),
                "ware_attribute": [],  # product attribute dicts, filled below
                # images
                "has_images": False,
                "image_count": 0,
                "image_urls": [],        # thumbnail URLs
                "large_image_urls": [],  # full-size URLs
                # interaction counters
                "praise_count": int(comment.get("praiseCnt", 0)),
                "reply_count": int(comment.get("replyCnt", 0)),
                "buy_count": int(buy_match[0]) if buy_match else 0,  # digits from buyCountText
                # flags
                "is_repurchase": "该店铺购买" in comment.get("repurchaseInfo", ""),  # repeat buyer
                "is_plus_member": "plusUrl" in comment,  # PLUS membership marker
                "product_id": comment.get("productId", "")
            }

            # Copy product attributes, keeping only well-formed dict entries.
            ware_attr = comment.get("wareAttribute", [])
            if isinstance(ware_attr, list):
                for attr in ware_attr:
                    if isinstance(attr, dict):
                        cleaned["ware_attribute"].append(attr)

            # Collect image URLs.
            picture_info = comment.get("pictureInfoList", [])
            if picture_info and isinstance(picture_info, list):
                cleaned["has_images"] = True
                cleaned["image_count"] = len(picture_info)
                for pic in picture_info:
                    if isinstance(pic, dict):
                        if pic.get("picURL"):
                            cleaned["image_urls"].append(pic["picURL"])
                        if pic.get("largePicURL"):
                            cleaned["large_image_urls"].append(pic["largePicURL"])

            # Content length in characters (falls back to the tagged content).
            content = cleaned["comment_content"] or cleaned["tag_comment_content"]
            cleaned["content_length"] = len(str(content))

            # Parse the date into a datetime for later analysis; None on failure.
            try:
                date_str = cleaned["comment_date"]
                cleaned["comment_datetime"] = (
                    datetime.strptime(date_str, "%Y-%m-%d %H:%M:%S") if date_str else None
                )
            except (ValueError, TypeError):  # narrowed from a bare except
                cleaned["comment_datetime"] = None

            cleaned_comments.append(cleaned)
        except Exception as e:
            # One malformed record must not abort the whole run.
            print(f"处理第 {idx + 1} 条评论时出错: {e}")
            continue
    print(f"成功清洗 {len(cleaned_comments)} 条评论数据")

    # 4. Prepare the output directory and a shared timestamp.
    os.makedirs(output_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # 5. Save as JSON (default=str serialises the datetime objects).
    json_output = os.path.join(output_dir, f"jd_comments_cleaned_{timestamp}.json")
    with open(json_output, 'w', encoding='utf-8') as f:
        json.dump(cleaned_comments, f, ensure_ascii=False, indent=2, default=str)
    print(f"已保存JSON格式数据: {json_output}")

    # 6. Save as CSV with nested fields flattened for spreadsheet use.
    csv_output = os.path.join(output_dir, f"jd_comments_cleaned_{timestamp}.csv")
    if cleaned_comments:
        csv_rows = []
        for comment in cleaned_comments:
            csv_rows.append({
                "user_nickname": comment["user_nickname"],
                "officer_level": comment["officer_level"],
                "comment_id": comment["comment_id"],
                "comment_date": comment["comment_date"],
                # Truncate very long reviews so cells stay manageable.
                "comment_content": (comment["comment_content"] or comment["tag_comment_content"])[:200],
                "comment_score": comment["comment_score"],
                "comment_score_text": comment["comment_score_text"],
                "product_specifications": comment["product_specifications"],
                "ware_attribute": json.dumps(comment["ware_attribute"], ensure_ascii=False),
                "has_images": comment["has_images"],
                "image_count": comment["image_count"],
                "image_urls": "; ".join(comment["image_urls"]) if comment["image_urls"] else "",
                "praise_count": comment["praise_count"],
                "reply_count": comment["reply_count"],
                "buy_count": comment["buy_count"],
                "is_repurchase": comment["is_repurchase"],
                "is_plus_member": comment["is_plus_member"],
                "content_length": comment["content_length"],
                "product_id": comment["product_id"]
            })
        # utf-8-sig adds a BOM so Excel detects the encoding.
        with open(csv_output, 'w', encoding='utf-8-sig', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=csv_rows[0].keys())
            writer.writeheader()
            writer.writerows(csv_rows)
        print(f"已保存CSV格式数据: {csv_output}")

    # 7. Emit a plain-text summary report alongside the data files.
    summary_report = os.path.join(output_dir, f"data_summary_{timestamp}.txt")
    generate_summary_report(cleaned_comments, summary_report, input_file)
    return cleaned_comments, json_output, csv_output
def generate_summary_report(comments: List[Dict], output_file: str, input_file: str):
    """Write a plain-text summary report for the cleaned comments.

    Covers score distribution, image/repurchase counts, member-level
    distribution, content-length stats and praise/reply totals. Does
    nothing when *comments* is empty.
    """
    if not comments:
        return

    total_comments = len(comments)

    # Frequency tables built in one pass each.
    score_dist = {}
    level_dist = {}
    for c in comments:
        s = c.get("comment_score", 0)
        score_dist[s] = score_dist.get(s, 0) + 1
        lv = c.get("officer_level", 0)
        level_dist[lv] = level_dist.get(lv, 0) + 1

    with_images = sum(1 for c in comments if c.get("has_images"))
    repurchase_users = sum(1 for c in comments if c.get("is_repurchase"))

    # Content-length statistics (guaranteed non-empty here).
    content_lengths = [c.get("content_length", 0) for c in comments]
    avg_length = sum(content_lengths) / len(content_lengths)
    max_length = max(content_lengths)
    min_length = min(content_lengths)

    # Praise / reply totals.
    total_praises = sum(c.get("praise_count", 0) for c in comments)
    total_replies = sum(c.get("reply_count", 0) for c in comments)

    # Assemble the whole report, then write it out in one go.
    sep = "=" * 50
    lines = [
        sep + "\n",
        "京东评论数据清洗报告\n",
        sep + "\n\n",
        f"原始数据文件: {input_file}\n",
        f"清洗时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n",
        f"处理记录数: {total_comments}\n\n",
        "1. 评分分布:\n",
    ]
    for score in sorted(score_dist):
        count = score_dist[score]
        percentage = (count / total_comments) * 100
        lines.append(f" {score}星: {count}条 ({percentage:.1f}%)\n")
    lines.append(f"\n2. 带图评论: {with_images}条 ({(with_images / total_comments * 100):.1f}%)\n")
    lines.append(f"3. 复购用户: {repurchase_users}条 ({(repurchase_users / total_comments * 100):.1f}%)\n")
    lines.append("\n4. 用户等级分布:\n")
    for level in sorted(level_dist):
        lines.append(f" 等级{level}: {level_dist[level]}条\n")
    lines.append(f"\n5. 评论内容长度统计:\n")
    lines.append(f" 平均长度: {avg_length:.0f}字符\n")
    lines.append(f" 最大长度: {max_length}字符\n")
    lines.append(f" 最小长度: {min_length}字符\n")
    lines.append(f"\n6. 点赞/回复统计:\n")
    lines.append(f" 总点赞数: {total_praises}\n")
    lines.append(f" 总回复数: {total_replies}\n")
    lines.append(f" 平均每条点赞: {total_praises / total_comments:.1f}\n")
    lines.append(f" 平均每条回复: {total_replies / total_comments:.1f}\n")

    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(lines)
    print(f"已生成数据摘要报告: {output_file}")
def find_image_comments(comments: List[Dict], min_images: int = 1):
    """Return slimmed-down views of comments carrying at least *min_images* pictures.

    Each result keeps only user, date, truncated content (100 chars),
    image count and thumbnail URLs.
    """
    def _slim(c):
        # Project just the fields useful for a quick image-comment listing.
        return {
            "user": c.get("user_nickname"),
            "date": c.get("comment_date"),
            "content": c.get("comment_content", "")[:100],
            "image_count": c.get("image_count"),
            "image_urls": c.get("image_urls", []),
        }

    return [
        _slim(c)
        for c in comments
        if c.get("has_images") and c.get("image_count", 0) >= min_images
    ]
def filter_by_score(comments: List[Dict], min_score: int = 4):
    """Keep only comments whose score is at least *min_score* (missing score counts as 0)."""
    kept = []
    for c in comments:
        if c.get("comment_score", 0) >= min_score:
            kept.append(c)
    return kept
# Script entry point.
if __name__ == "__main__":
    # Method 1: automatically process the newest raw file.
    result = clean_jd_comments_data()
    # Method 2: process a specific file.
    # result = clean_jd_comments_data("jd_comments_raw_20250101_120000.json")

    # Fix: the cleaner returns None/empty results when no input file could be
    # read; the original unconditionally unpacked the return value and crashed
    # with a TypeError in that case.
    cleaned_data = json_file = csv_file = None
    if result:
        cleaned_data, json_file, csv_file = result

    if cleaned_data:
        print(f"\n数据清洗完成!")
        print(f"JSON文件: {json_file}")
        print(f"CSV文件: {csv_file}")
        # Fix: the report lives NEXT TO the CSV as data_summary_<ts>.txt; the
        # original replaced 'cleaned_data' in the path and printed a wrong
        # directory ('data_summary/...') that is never created.
        summary_path = csv_file.replace('jd_comments_cleaned_', 'data_summary_').replace('.csv', '.txt')
        print(f"摘要报告: {summary_path}")

        # Example: comments carrying at least two images.
        image_comments = find_image_comments(cleaned_data, min_images=2)
        print(f"\n找到 {len(image_comments)} 条带有2张以上图片的评论")

        # Example: highly-rated comments.
        high_score_comments = filter_by_score(cleaned_data, min_score=4)
        print(
            f"找到 {len(high_score_comments)} 条4星及以上评分评论 ({(len(high_score_comments) / len(cleaned_data) * 100):.1f}%)")

        # Persist the image-comment selection for later inspection.
        if image_comments:
            with open("image_comments.json", 'w', encoding='utf-8') as f:
                json.dump(image_comments, f, ensure_ascii=False, indent=2)
            print("已保存带图片评论到: image_comments.json")
到hive生成Hive用户评论数据
点击查看代码
hive> -- 创建用户评论表
hive> CREATE TABLE IF NOT EXISTS user_reviews (
> user_nickname STRING,
> officer_level INT,
> comment_id STRING,
> comment_date STRING,
> comment_content STRING,
> comment_score INT,
> comment_score_text STRING,
> product_specifications STRING,
> ware_attribute STRING,
> has_images BOOLEAN,
> image_count INT,
> image_urls STRING,
> praise_count INT,
> reply_count INT,
> buy_count INT,
> is_repurchase BOOLEAN,
> is_plus_member BOOLEAN,
> content_length INT,
> product_id STRING
> )
> ROW FORMAT DELIMITED
> FIELDS TERMINATED BY ','
> STORED AS TEXTFILE
> TBLPROPERTIES ("skip.header.line.count"="1");
点击查看代码
hive> DROP TABLE IF EXISTS review_cycle_analysis;
OK
Time taken: 0.127 seconds
hive> CREATE TABLE review_cycle_analysis (
> cycle_period STRING,
> review_count INT
> );
OK
Time taken: 0.102 seconds
hive> INSERT INTO TABLE review_cycle_analysis
> SELECT
> CASE
> WHEN hour(from_unixtime(unix_timestamp(comment_date, 'yyyy-MM-dd HH:mm:ss'))) BETWEEN 0 AND 6 THEN '凌晨(0-6点)'
> WHEN hour(from_unixtime(unix_timestamp(comment_date, 'yyyy-MM-dd HH:mm:ss'))) BETWEEN 7 AND 12 THEN '上午(7-12点)'
> WHEN hour(from_unixtime(unix_timestamp(comment_date, 'yyyy-MM-dd HH:mm:ss'))) BETWEEN 13 AND 18 THEN '下午(13-18点)'
> ELSE '晚上(19-23点)'
> END as cycle_period,
> COUNT(*) as review_count
> FROM user_reviews
> GROUP BY
> CASE
> WHEN hour(from_unixtime(unix_timestamp(comment_date, 'yyyy-MM-dd HH:mm:ss'))) BETWEEN 0 AND 6 THEN '凌晨(0-6点)'
> WHEN hour(from_unixtime(unix_timestamp(comment_date, 'yyyy-MM-dd HH:mm:ss'))) BETWEEN 7 AND 12 THEN '上午(7-12点)'
> WHEN hour(from_unixtime(unix_timestamp(comment_date, 'yyyy-MM-dd HH:mm:ss'))) BETWEEN 13 AND 18 THEN '下午(13-18点)'
> ELSE '晚上(19-23点)'
> END;
点击查看代码
hive> CREATE TABLE member_level_stats (
> officer_level INT,
> user_count INT,
> percentage DOUBLE
> );
hive> INSERT INTO TABLE member_level_stats
> SELECT
> officer_level,
> COUNT(*) as user_count,
> ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) FROM user_reviews), 2) as percentage
> FROM user_reviews
> GROUP BY officer_level
> ORDER BY officer_level;
点击查看代码
hive> CREATE TABLE daily_review_stats (
> review_date STRING,
> review_count INT
> );
hive> INSERT INTO TABLE daily_review_stats
> SELECT
> substr(comment_date, 1, 10) as review_date,
> COUNT(*) as review_count
> FROM user_reviews
> GROUP BY substr(comment_date, 1, 10)
> ORDER BY review_date;
在mysql建表,用sqoop迁移导入mysql

python远程连接mysql可视化数据
# local_analysis.py
# Pull the Hive-derived statistics tables out of MySQL on the VM and render
# a 2x2 dashboard (pie / bar / line / text summary) saved as a PNG.
import mysql.connector
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib

# Make CJK labels render and keep the minus sign readable.
matplotlib.rcParams['font.sans-serif'] = ['SimHei', 'Arial Unicode MS', 'DejaVu Sans']
matplotlib.rcParams['axes.unicode_minus'] = False

# Connect to the VM's MySQL instance.
conn = None
try:
    conn = mysql.connector.connect(
        host="192.168.60.10",  # replace with your VM's IP
        user="root",
        password="root",  # your MySQL password
        database="phone_reviews"
    )
    print("✅ 成功连接到MySQL")

    # 1. Review-cycle distribution.
    print("\n📊 1. 评论周期分析:")
    df_cycle = pd.read_sql("SELECT * FROM review_cycle_analysis", conn)
    print(df_cycle)

    # 2. Member-level statistics.
    print("\n👥 2. 会员级别统计:")
    df_member = pd.read_sql("SELECT * FROM member_level_stats ORDER BY officer_level", conn)
    print(df_member)

    # 3. Daily review counts (first 10 rows).
    # NOTE(review): comparing review_date to the string 'NULL' assumes Sqoop
    # exported SQL NULLs as that literal — verify against the exported data.
    print("\n📅 3. 每日评论统计(前10条):")
    df_daily = pd.read_sql("""
        SELECT *
        FROM daily_review_stats
        WHERE review_date != 'NULL'
        ORDER BY review_date
        LIMIT 10
    """, conn)
    print(df_daily)

    # 4. Build the 2x2 dashboard.
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # 4.1 Pie chart — review time-of-day distribution.
    axes[0, 0].pie(df_cycle['review_count'], labels=df_cycle['cycle_period'],
                   autopct='%1.1f%%', startangle=90)
    axes[0, 0].set_title('用户评论时间分布', fontsize=12, fontweight='bold')
    axes[0, 0].axis('equal')

    # 4.2 Bar chart — member-level distribution.
    bars = axes[0, 1].bar(df_member['officer_level'].astype(str),
                          df_member['user_count'],
                          color='skyblue', edgecolor='black')
    axes[0, 1].set_title('会员级别分布', fontsize=12, fontweight='bold')
    axes[0, 1].set_xlabel('会员级别', fontsize=10)
    axes[0, 1].set_ylabel('用户数量', fontsize=10)
    axes[0, 1].grid(axis='y', alpha=0.3)
    # Annotate each bar with its percentage.
    for bar, pct in zip(bars, df_member['percentage']):
        height = bar.get_height()
        axes[0, 1].text(bar.get_x() + bar.get_width() / 2., height + 2,
                        f'{pct}%', ha='center', va='bottom', fontsize=9)

    # 4.3 Line chart — daily review trend.
    axes[1, 0].plot(df_daily['review_date'], df_daily['review_count'],
                    'o-', linewidth=2, markersize=6, color='green')
    axes[1, 0].set_title('每日评论数量趋势', fontsize=12, fontweight='bold')
    axes[1, 0].set_xlabel('日期', fontsize=10)
    axes[1, 0].set_ylabel('评论数量', fontsize=10)
    axes[1, 0].tick_params(axis='x', rotation=45)
    axes[1, 0].grid(True, alpha=0.3)

    # 4.4 Text panel — headline numbers.
    total_comments = df_cycle['review_count'].sum()
    total_users = df_member['user_count'].sum()
    summary_text = f"""数据汇总:
• 总评论数: {total_comments}
• 总用户数: {total_users}
• 分析时间段数: {len(df_cycle)}
• 会员级别数: {len(df_member)}
• 记录天数: {len(df_daily)}天"""
    axes[1, 1].text(0.1, 0.5, summary_text, fontsize=11,
                    verticalalignment='center', linespacing=1.5)
    axes[1, 1].set_title('数据统计摘要', fontsize=12, fontweight='bold')
    axes[1, 1].axis('off')

    plt.suptitle('京东手机评论数据分析报告', fontsize=16, fontweight='bold', y=1.02)
    plt.tight_layout()

    # Save, then display the figure.
    plt.savefig('phone_reviews_analysis.png', dpi=300, bbox_inches='tight')
    print(f"\n✅ 图表已保存为: phone_reviews_analysis.png")
    plt.show()
except mysql.connector.Error as err:
    print(f"❌ 连接错误: {err}")
    print("\n💡 请检查:")
    print("1. 虚拟机IP地址是否正确")
    print("2. MySQL服务是否运行 (systemctl status mysqld)")
    print("3. 防火墙是否开放3306端口")
    print("4. 用户名密码是否正确")
except Exception as e:
    print(f"❌ 错误: {e}")
finally:
    # Fix: the original closed the connection only on the success path,
    # leaking it whenever a query or plotting step raised.
    if conn is not None:
        conn.close()

浙公网安备 33010602011771号