石家庄铁道大学2025年春季 2022 级课堂测试试卷—数据分析
第一步
//建表语句
CREATE TABLE default.data (
ip STRING,
`date` STRING,
day INT,
traffic INT,
type STRING,
id INT
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE;
//通过rz命令上传txt,然后删除多余空格
sed -i 's/[[:space:]]//g' result.txt
//然后通过logo命令导入数据
LOAD DATA LOCAL INPATH '/export/server/hive/result.txt' INTO TABLE data;

第二步
统计代码
from pyhive import hive import pandas as pd from sqlalchemy import create_engine # 创建SQLAlchemy连接 from sqlalchemy import create_engine # 连接到 Hive,不传递密码 hive_engine = create_engine('hive://hadoop@node1:10000/default') # 查询最受欢迎的视频/文章 Top 10 query1 = """ SELECT type, id, COUNT(*) AS view_count FROM data GROUP BY type, id ORDER BY view_count DESC LIMIT 10 """ df_top10_video_article = pd.read_sql(query1, hive_engine) # 按照地市统计最受欢迎的 Top 10 课程 query2 = """ SELECT ip, COUNT(*) AS course_count FROM data GROUP BY ip ORDER BY course_count DESC LIMIT 10 """ df_top10_courses_ip = pd.read_sql(query2, hive_engine) # 按照流量统计最受欢迎的 Top 10 课程 query3 = """ SELECT id, SUM(traffic) AS total_traffic FROM data GROUP BY id ORDER BY total_traffic DESC LIMIT 10 """ df_top10_courses_traffic = pd.read_sql(query3, hive_engine) # 打印结果查看 print("Top 10 Video/Article Popularity:") print(df_top10_video_article) print("\nTop 10 Courses by IP:") print(df_top10_courses_ip) print("\nTop 10 Courses by Traffic:") print(df_top10_courses_traffic)

将统计数据插入到MySQL
# 连接到 MySQL import pymysql # 使用pymysql连接MySQL数据库 mysql_conn = pymysql.connect( host="localhost", # MySQL主机 user="lty", # MySQL用户名 password="、", # MySQL密码 database="sixdatabase" # 数据库名称 ) # 创建一个游标 cursor = mysql_conn.cursor() # 创建表 cursor.execute(""" CREATE TABLE IF NOT EXISTS top_video_article ( type VARCHAR(20), id INT, view_count INT ) """) cursor.execute(""" CREATE TABLE IF NOT EXISTS top_courses_ip ( ip VARCHAR(20), course_count INT ) """) cursor.execute(""" CREATE TABLE IF NOT EXISTS top_courses_traffic ( id INT, total_traffic INT ) """) # 将统计结果插入 MySQL # 插入 Top 10 视频/文章 for row in df_top10_video_article.itertuples(): cursor.execute(""" INSERT INTO top_video_article (type, id, view_count) VALUES (%s, %s, %s) """, (row.type, row.id, row.view_count)) # 插入按 IP 统计的 Top 10 课程 for row in df_top10_courses_ip.itertuples(): cursor.execute(""" INSERT INTO top_courses_ip (ip, course_count) VALUES (%s, %s) """, (row.ip, row.course_count)) # 插入按流量统计的 Top 10 课程 for row in df_top10_courses_traffic.itertuples(): cursor.execute(""" INSERT INTO top_courses_traffic (id, total_traffic) VALUES (%s, %s) """, (row.id, row.total_traffic)) # 提交事务 mysql_conn.commit() # 关闭 MySQL 连接 cursor.close() mysql_conn.close() print("Data has been inserted into MySQL.")

第三步
从MySQL中提取数据,然后可视化
import pandas as pd import json import pymysql from sqlalchemy import create_engine # 创建SQLAlchemy连接 hive_engine = create_engine('hive://hadoop@node1:10000/default') # 查询最受欢迎的视频/文章 Top 10 query1 = """ SELECT type, id, COUNT(*) AS view_count FROM data GROUP BY type, id ORDER BY view_count DESC LIMIT 10 """ df_top10_video_article = pd.read_sql(query1, hive_engine) # 按照地市统计最受欢迎的 Top 10 课程 query2 = """ SELECT ip, COUNT(*) AS course_count FROM data GROUP BY ip ORDER BY course_count DESC LIMIT 10 """ df_top10_courses_ip = pd.read_sql(query2, hive_engine) # 按照流量统计最受欢迎的 Top 10 课程 query3 = """ SELECT id, SUM(traffic) AS total_traffic FROM data GROUP BY id ORDER BY total_traffic DESC LIMIT 10 """ df_top10_courses_traffic = pd.read_sql(query3, hive_engine) # 连接到 MySQL mysql_conn = pymysql.connect( host="localhost", user="lty", password="", database="sixdatabase" ) cursor = mysql_conn.cursor() # 创建表 cursor.execute(""" CREATE TABLE IF NOT EXISTS top_video_article ( type VARCHAR(20), id INT, view_count INT ) """) cursor.execute(""" CREATE TABLE IF NOT EXISTS top_courses_ip ( ip VARCHAR(20), course_count INT ) """) cursor.execute(""" CREATE TABLE IF NOT EXISTS top_courses_traffic ( id INT, total_traffic INT ) """) # 将统计结果插入 MySQL # 插入 Top 10 视频/文章 for row in df_top10_video_article.itertuples(): cursor.execute(""" INSERT INTO top_video_article (type, id, view_count) VALUES (%s, %s, %s) """, (row.type, row.id, row.view_count)) # 插入按 IP 统计的 Top 10 课程 for row in df_top10_courses_ip.itertuples(): cursor.execute(""" INSERT INTO top_courses_ip (ip, course_count) VALUES (%s, %s) """, (row.ip, row.course_count)) # 插入按流量统计的 Top 10 课程 for row in df_top10_courses_traffic.itertuples(): cursor.execute(""" INSERT INTO top_courses_traffic (id, total_traffic) VALUES (%s, %s) """, (row.id, row.total_traffic)) # 提交事务 mysql_conn.commit() # 关闭 MySQL 连接 cursor.close() mysql_conn.close() # 获取插入后的数据 mysql_conn = pymysql.connect( host="localhost", user="lty", password="LItianyu20041030.", database="sixdatabase" ) cursor = mysql_conn.cursor() # 查询 Top 10 视频/文章 cursor.execute("SELECT * FROM top_video_article") top_video_article_data = cursor.fetchall() # 查询按 IP 统计的 Top 10 课程 cursor.execute("SELECT * FROM top_courses_ip") top_courses_ip_data = cursor.fetchall() # 查询按流量统计的 Top 10 课程 cursor.execute("SELECT * FROM top_courses_traffic") top_courses_traffic_data = cursor.fetchall() # 关闭连接 cursor.close() mysql_conn.close() # 格式化为 ECharts 数据格式 video_article_data = [{"type": row[0], "id": row[1], "view_count": row[2]} for row in top_video_article_data] courses_ip_data = [{"ip": row[0], "course_count": row[1]} for row in top_courses_ip_data] courses_traffic_data = [{"id": row[0], "total_traffic": row[1]} for row in top_courses_traffic_data] # 将数据转换为 JSON 格式 video_article_json = json.dumps(video_article_data) courses_ip_json = json.dumps(courses_ip_data) courses_traffic_json = json.dumps(courses_traffic_data) # 写入到 HTML 文件 html_content = f""" <!DOCTYPE html> <html lang="en"> <head> <meta charset="UTF-8"> <meta name="viewport" content="width=device-width, initial-scale=1.0"> <title>ECharts Visualization</title> <script src="https://cdn.jsdelivr.net/npm/echarts@5.0.2/dist/echarts.min.js"></script> </head> <body> <div id="video_article_chart" style="width: 600px; height: 400px;"></div> <div id="courses_ip_chart" style="width: 600px; height: 400px;"></div> <div id="courses_traffic_chart" style="width: 600px; height: 400px;"></div> <script> var video_article_data = {video_article_json}; var courses_ip_data = {courses_ip_json}; var courses_traffic_data = {courses_traffic_json}; var video_article_chart = echarts.init(document.getElementById('video_article_chart')); var courses_ip_chart = echarts.init(document.getElementById('courses_ip_chart')); var courses_traffic_chart = echarts.init(document.getElementById('courses_traffic_chart')); var video_article_option = {{ title: {{ text: 'Top 10 Video/Article Popularity' }}, xAxis: {{ type: 'category', data: video_article_data.map(item => item.id) }}, yAxis: {{ type: 'value' }}, series: [{{ data: video_article_data.map(item => item.view_count), type: 'bar' }}] }}; var courses_ip_option = {{ title: {{ text: 'Top 10 Courses by IP' }}, xAxis: {{ type: 'category', data: courses_ip_data.map(item => item.ip) }}, yAxis: {{ type: 'value' }}, series: [{{ data: courses_ip_data.map(item => item.course_count), type: 'bar' }}] }}; var courses_traffic_option = {{ title: {{ text: 'Top 10 Courses by Traffic' }}, xAxis: {{ type: 'category', data: courses_traffic_data.map(item => item.id) }}, yAxis: {{ type: 'value' }}, series: [{{ data: courses_traffic_data.map(item => item.total_traffic), type: 'bar' }}] }}; video_article_chart.setOption(video_article_option); courses_ip_chart.setOption(courses_ip_option); courses_traffic_chart.setOption(courses_traffic_option); </script> </body> </html> """ # 将 HTML 内容保存到文件 with open("charts.html", "w") as file: file.write(html_content) print("HTML file with charts has been generated successfully!")
运行html在pycharm中的效果

浙公网安备 33010602011771号