石家庄铁道大学2025年春季 2022 级课堂测试试卷—数据分析

第一步

//建表语句

CREATE TABLE default.data (
ip STRING,
`date` STRING,
day INT,
traffic INT,
type STRING,
id INT
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE;

//通过rz命令上传txt,然后删除多余空格

sed -i 's/[[:space:]]//g' result.txt

//然后通过logo命令导入数据

LOAD DATA LOCAL INPATH '/export/server/hive/result.txt' INTO TABLE data; 

 

第二步

 

统计代码

from pyhive import hive
import pandas as pd
from sqlalchemy import create_engine

# 创建SQLAlchemy连接
from sqlalchemy import create_engine

# 连接到 Hive,不传递密码
hive_engine = create_engine('hive://hadoop@node1:10000/default')


# 查询最受欢迎的视频/文章 Top 10
query1 = """
SELECT type, id, COUNT(*) AS view_count
FROM data
GROUP BY type, id
ORDER BY view_count DESC
LIMIT 10
"""
df_top10_video_article = pd.read_sql(query1, hive_engine)

# 按照地市统计最受欢迎的 Top 10 课程
query2 = """
SELECT ip, COUNT(*) AS course_count
FROM data
GROUP BY ip
ORDER BY course_count DESC
LIMIT 10
"""
df_top10_courses_ip = pd.read_sql(query2, hive_engine)

# 按照流量统计最受欢迎的 Top 10 课程
query3 = """
SELECT id, SUM(traffic) AS total_traffic
FROM data
GROUP BY id
ORDER BY total_traffic DESC
LIMIT 10
"""
df_top10_courses_traffic = pd.read_sql(query3, hive_engine)

# 打印结果查看
print("Top 10 Video/Article Popularity:")
print(df_top10_video_article)

print("\nTop 10 Courses by IP:")
print(df_top10_courses_ip)

print("\nTop 10 Courses by Traffic:")
print(df_top10_courses_traffic)

 

将统计数据插入到MySQL

# 连接到 MySQL
import pymysql

# 使用pymysql连接MySQL数据库
mysql_conn = pymysql.connect(
    host="localhost",  # MySQL主机
    user="lty",       # MySQL用户名
    password="",  # MySQL密码
    database="sixdatabase"  # 数据库名称
)


# 创建一个游标
cursor = mysql_conn.cursor()

# 创建表
cursor.execute("""
CREATE TABLE IF NOT EXISTS top_video_article (
    type VARCHAR(20),
    id INT,
    view_count INT
)
""")

cursor.execute("""
CREATE TABLE IF NOT EXISTS top_courses_ip (
    ip VARCHAR(20),
    course_count INT
)
""")

cursor.execute("""
CREATE TABLE IF NOT EXISTS top_courses_traffic (
    id INT,
    total_traffic INT
)
""")

# 将统计结果插入 MySQL
# 插入 Top 10 视频/文章
for row in df_top10_video_article.itertuples():
    cursor.execute("""
    INSERT INTO top_video_article (type, id, view_count)
    VALUES (%s, %s, %s)
    """, (row.type, row.id, row.view_count))

# 插入按 IP 统计的 Top 10 课程
for row in df_top10_courses_ip.itertuples():
    cursor.execute("""
    INSERT INTO top_courses_ip (ip, course_count)
    VALUES (%s, %s)
    """, (row.ip, row.course_count))

# 插入按流量统计的 Top 10 课程
for row in df_top10_courses_traffic.itertuples():
    cursor.execute("""
    INSERT INTO top_courses_traffic (id, total_traffic)
    VALUES (%s, %s)
    """, (row.id, row.total_traffic))

# 提交事务
mysql_conn.commit()

# 关闭 MySQL 连接
cursor.close()
mysql_conn.close()

print("Data has been inserted into MySQL.")

 

 第三步

从MySQL中提取数据,然后可视化

 

import pandas as pd
import json
import pymysql
from sqlalchemy import create_engine

# 创建SQLAlchemy连接
hive_engine = create_engine('hive://hadoop@node1:10000/default')

# 查询最受欢迎的视频/文章 Top 10
query1 = """
SELECT type, id, COUNT(*) AS view_count
FROM data
GROUP BY type, id
ORDER BY view_count DESC
LIMIT 10
"""
df_top10_video_article = pd.read_sql(query1, hive_engine)

# 按照地市统计最受欢迎的 Top 10 课程
query2 = """
SELECT ip, COUNT(*) AS course_count
FROM data
GROUP BY ip
ORDER BY course_count DESC
LIMIT 10
"""
df_top10_courses_ip = pd.read_sql(query2, hive_engine)

# 按照流量统计最受欢迎的 Top 10 课程
query3 = """
SELECT id, SUM(traffic) AS total_traffic
FROM data
GROUP BY id
ORDER BY total_traffic DESC
LIMIT 10
"""
df_top10_courses_traffic = pd.read_sql(query3, hive_engine)

# 连接到 MySQL
mysql_conn = pymysql.connect(
    host="localhost",
    user="lty",
    password="",
    database="sixdatabase"
)

cursor = mysql_conn.cursor()

# 创建表
cursor.execute("""
CREATE TABLE IF NOT EXISTS top_video_article (
    type VARCHAR(20),
    id INT,
    view_count INT
)
""")
cursor.execute("""
CREATE TABLE IF NOT EXISTS top_courses_ip (
    ip VARCHAR(20),
    course_count INT
)
""")
cursor.execute("""
CREATE TABLE IF NOT EXISTS top_courses_traffic (
    id INT,
    total_traffic INT
)
""")

# 将统计结果插入 MySQL
# 插入 Top 10 视频/文章
for row in df_top10_video_article.itertuples():
    cursor.execute("""
    INSERT INTO top_video_article (type, id, view_count)
    VALUES (%s, %s, %s)
    """, (row.type, row.id, row.view_count))

# 插入按 IP 统计的 Top 10 课程
for row in df_top10_courses_ip.itertuples():
    cursor.execute("""
    INSERT INTO top_courses_ip (ip, course_count)
    VALUES (%s, %s)
    """, (row.ip, row.course_count))

# 插入按流量统计的 Top 10 课程
for row in df_top10_courses_traffic.itertuples():
    cursor.execute("""
    INSERT INTO top_courses_traffic (id, total_traffic)
    VALUES (%s, %s)
    """, (row.id, row.total_traffic))

# 提交事务
mysql_conn.commit()

# 关闭 MySQL 连接
cursor.close()
mysql_conn.close()

# 获取插入后的数据
mysql_conn = pymysql.connect(
    host="localhost",
    user="lty",
    password="LItianyu20041030.",
    database="sixdatabase"
)
cursor = mysql_conn.cursor()

# 查询 Top 10 视频/文章
cursor.execute("SELECT * FROM top_video_article")
top_video_article_data = cursor.fetchall()

# 查询按 IP 统计的 Top 10 课程
cursor.execute("SELECT * FROM top_courses_ip")
top_courses_ip_data = cursor.fetchall()

# 查询按流量统计的 Top 10 课程
cursor.execute("SELECT * FROM top_courses_traffic")
top_courses_traffic_data = cursor.fetchall()

# 关闭连接
cursor.close()
mysql_conn.close()

# 格式化为 ECharts 数据格式
video_article_data = [{"type": row[0], "id": row[1], "view_count": row[2]} for row in top_video_article_data]
courses_ip_data = [{"ip": row[0], "course_count": row[1]} for row in top_courses_ip_data]
courses_traffic_data = [{"id": row[0], "total_traffic": row[1]} for row in top_courses_traffic_data]

# 将数据转换为 JSON 格式
video_article_json = json.dumps(video_article_data)
courses_ip_json = json.dumps(courses_ip_data)
courses_traffic_json = json.dumps(courses_traffic_data)

# 写入到 HTML 文件
html_content = f"""
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>ECharts Visualization</title>
    <script src="https://cdn.jsdelivr.net/npm/echarts@5.0.2/dist/echarts.min.js"></script>
</head>
<body>
    <div id="video_article_chart" style="width: 600px; height: 400px;"></div>
    <div id="courses_ip_chart" style="width: 600px; height: 400px;"></div>
    <div id="courses_traffic_chart" style="width: 600px; height: 400px;"></div>

    <script>
        var video_article_data = {video_article_json};
        var courses_ip_data = {courses_ip_json};
        var courses_traffic_data = {courses_traffic_json};

        var video_article_chart = echarts.init(document.getElementById('video_article_chart'));
        var courses_ip_chart = echarts.init(document.getElementById('courses_ip_chart'));
        var courses_traffic_chart = echarts.init(document.getElementById('courses_traffic_chart'));

        var video_article_option = {{
            title: {{ text: 'Top 10 Video/Article Popularity' }},
            xAxis: {{ type: 'category', data: video_article_data.map(item => item.id) }},
            yAxis: {{ type: 'value' }},
            series: [{{
                data: video_article_data.map(item => item.view_count),
                type: 'bar'
            }}]
        }};

        var courses_ip_option = {{
            title: {{ text: 'Top 10 Courses by IP' }},
            xAxis: {{ type: 'category', data: courses_ip_data.map(item => item.ip) }},
            yAxis: {{ type: 'value' }},
            series: [{{
                data: courses_ip_data.map(item => item.course_count),
                type: 'bar'
            }}]
        }};

        var courses_traffic_option = {{
            title: {{ text: 'Top 10 Courses by Traffic' }},
            xAxis: {{ type: 'category', data: courses_traffic_data.map(item => item.id) }},
            yAxis: {{ type: 'value' }},
            series: [{{
                data: courses_traffic_data.map(item => item.total_traffic),
                type: 'bar'
            }}]
        }};

        video_article_chart.setOption(video_article_option);
        courses_ip_chart.setOption(courses_ip_option);
        courses_traffic_chart.setOption(courses_traffic_option);
    </script>
</body>
</html>
"""

# 将 HTML 内容保存到文件
with open("charts.html", "w") as file:
    file.write(html_content)

print("HTML file with charts has been generated successfully!")

运行html在pycharm中的效果

 

 

posted on 2025-02-19 18:31  临江柔  阅读(16)  评论(0)    收藏  举报