2025.2.19

今天进行课堂测试,要求如下:

Result文件数据说明:

Ip106.39.41.166,(城市)

Date10/Nov/2016:00:01:02 +0800,(日期)

Day10,(天数)

Traffic: 54 ,(流量)

Type: video,(类型:视频video或文章article

Id: 8701(视频或者文章的id

测试要求:

1、 数据清洗:按照进行数据清洗,并将清洗后的数据导入hive数据库中

两阶段数据清洗:

1)第一阶段:把需要的信息从原始日志中提取出来

ip:    199.30.25.88

time:  10/Nov/2016:00:01:03 +0800

traffic:  62

文章: article/11325

视频: video/3235

2)第二阶段:根据提取出来的信息做精细化操作

ip--->城市 cityIP

date--> time:2016-11-10 00:01:03

day: 10

traffic:62

type:article/video

id:11325

3hive数据库表结构:

create table data(  ip string,  time string , day string, traffic bigint,

type string, id   string )

2数据分析:在HIVE统计下列数据。

1统计最受欢迎的视频/文章的Top10访问次数 (video/article

2按照地市统计最受欢迎的Top10课程 (ip

3按照流量统计最受欢迎的Top10课程 (traffic

3、数据可视化:

将统计结果倒入MySql数据库中,通过图形化展示的方式展现出来。

 

最终在课上只完成了前两个阶段,因为在hive上统计的过程十分缓慢,最开始还因为没有完全理解三项统计的要求而导致sql语句的查询条件错误,浪费了大量时间,最后在宿舍完成了所有要求,具体代码如下:

from pyecharts.charts import Pie, Bar, Line, Page
from pyecharts import options as opts
from pyecharts.commons.utils import JsCode
import pymysql

# 数据库配置
DB_CONFIG = {
    "host": "localhost",
    "port": 3306,
    "user": "root",
    "password": "123456",
    "db": "hive",
    "charset": "utf8mb4"
}


def fetch_data_from_mysql(table_name, fields):
    """从MySQL获取数据"""
    connection = pymysql.connect(**DB_CONFIG)
    try:
        with connection.cursor() as cursor:
            sql = f"SELECT {','.join(fields)} FROM {table_name}"
            cursor.execute(sql)
            return cursor.fetchall()
    finally:
        connection.close()


def process_table1_data():
    """处理表1数据(type)"""
    raw_data = fetch_data_from_mysql("video_article_top10", ["type", "id", "visit_count"])
    type_dict = {}
    for item in raw_data:
        type_name = item[0]
        id = item[1]
        num = item[2]
        type_dict[f"{type_name}/{id}"] = num
    return [(k, v) for k, v in type_dict.items()]


def process_table2_data():
    """处理表2数据(ip)"""
    raw_data = fetch_data_from_mysql("city_course_top10", ["city", "id", "visit_count"])
    ip_dict = {}
    for item in raw_data:
        ip = item[0]
        id = item[1]
        num = item[2]
        ip_dict[f"{ip}/{id}"] = num
    return [(k, v) for k, v in ip_dict.items()]


def process_table3_data():
    """处理表3数据(traffic)"""
    raw_data = fetch_data_from_mysql("traffic_course_top10", ["id", "total_traffic"])
    traffic_data = sorted(raw_data, key=lambda x: x[1], reverse=True)[:10]  # 取前10大流量
    return [(str(item[0]), item[1] / 10000) for item in traffic_data]  # 转换为万单位


def create_pie_page():
    """创建饼图页面"""
    page = Page(layout=Page.SimplePageLayout)

    # 表1饼图
    pie1 = (
        Pie(
            init_opts=opts.InitOpts(width="1200px", height="800px")
        )
        .add("", process_table1_data(), radius=["30%", "70%"])
        .set_global_opts(
            title_opts=opts.TitleOpts(title="表1分类分布", pos_left="center"),
            legend_opts=opts.LegendOpts(
                orient="vertical",
                pos_top="15%",
                pos_right="2%",
                type_="scroll",
                is_show=True
            )
        )
        .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
        .set_colors(["#C1232B", "#B5C334", "#FCCE10", "#E87C25", "#27727B"])
    )

    # 表2饼图
    pie2 = (
        Pie(
            init_opts=opts.InitOpts(width="1200px", height="800px")
        )
        .add("", process_table2_data(), radius=["30%", "70%"])
        .set_global_opts(
            title_opts=opts.TitleOpts(title="表2IP分布", pos_left="center"),
            legend_opts=opts.LegendOpts(
                orient="vertical",
                pos_top="15%",
                pos_right="2%",
                type_="scroll",
                is_show=True
            )
        )
        .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
        .set_colors(["#C1232B", "#B5C334", "#FCCE10", "#E87C25", "#27727B", "#FE8463", "#9BCA63", "#FAD860", "#F3A43B",
                     "#60C0DD"])
    )

    # 表3饼图
    pie3 = (
        Pie(
            init_opts=opts.InitOpts(width="1200px", height="800px")
        )
        .add("", process_table3_data(), radius=["30%", "70%"])
        .set_global_opts(
            title_opts=opts.TitleOpts(title="表3流量分布", pos_left="center"),
            legend_opts=opts.LegendOpts(
                orient="vertical",
                pos_top="15%",
                pos_right="2%",
                type_="scroll",
                is_show=True
            )
        )
        .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
        .set_colors(["#C1232B", "#B5C334", "#FCCE10", "#E87C25", "#27727B", "#FE8463", "#9BCA63", "#FAD860", "#F3A43B",
                     "#60C0DD", "#D7504B", "#C6E579", "#F4E001", "#F0805A", "#26C0C0"])
    )

    page.add(pie1, pie2, pie3)
    page.render("pie_charts.html")


def create_bar_page():
    """创建柱状图页面"""
    page = Page(layout=Page.SimplePageLayout)

    # 表1柱状图
    table1_data = process_table1_data()
    bar1 = (
        Bar(
            init_opts=opts.InitOpts(width="900px", height="600px")
        )
        .add_xaxis([x[0] for x in table1_data])
        .add_yaxis("数量(次)", [x[1] for x in table1_data])
        .set_global_opts(
            title_opts=opts.TitleOpts(title="表1分类统计"),
            xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=20)),

            # datazoom_opts=[opts.DataZoomOpts(type_="slider")],
        )
    )

    # 表2柱状图
    table2_data = process_table2_data()
    bar2 = (
        Bar(
            init_opts=opts.InitOpts(width="900px", height="600px")
        )
        .add_xaxis([x[0] for x in table2_data])
        .add_yaxis("数量(次)", [x[1] for x in table2_data])
        .set_global_opts(
            title_opts=opts.TitleOpts(title="表2IP统计"),
            xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=20)),
            # datazoom_opts=[opts.DataZoomOpts(type_="slider")],
        )
    )

    # 表3柱状图
    table3_data = process_table3_data()
    bar3 = (
        Bar(
            init_opts=opts.InitOpts(width="900px", height="600px")
        )
        .add_xaxis([x[0] for x in table3_data])
        .add_yaxis("流量(万)", [x[1] for x in table3_data])
        .set_global_opts(
            title_opts=opts.TitleOpts(title="表3流量统计"),
            xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=20)),
            # datazoom_opts=[opts.DataZoomOpts(type_="slider")],
        )
    )

    page.add(bar1, bar2, bar3)
    page.render("bar_charts.html")


def create_line_page():
    """创建折线图页面"""
    page = Page(layout=Page.SimplePageLayout)

    # 表1折线图
    table1_data = process_table1_data()
    line1 = (
        Line(
            init_opts=opts.InitOpts(width="900px", height="600px")
        )
        .add_xaxis([x[0] for x in table1_data])
        .add_yaxis("数量(次)", [x[1] for x in table1_data])
        .set_global_opts(
            title_opts=opts.TitleOpts(title="表1分类趋势"),
            xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=20)),
            # datazoom_opts=[opts.DataZoomOpts(type_="slider")],
        )
    )

    # 表2折线图
    table2_data = process_table2_data()
    line2 = (
        Line(
            init_opts=opts.InitOpts(width="900px", height="600px")
        )
        .add_xaxis([x[0] for x in table2_data])
        .add_yaxis("数量(次)", [x[1] for x in table2_data])
        .set_global_opts(
            title_opts=opts.TitleOpts(title="表2IP趋势"),
            xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=20)),
            # datazoom_opts=[opts.DataZoomOpts(type_="slider")],
        )
    )

    # 表3折线图
    table3_data = process_table3_data()
    line3 = (
        Line(
            init_opts=opts.InitOpts(width="900px", height="600px")
        )
        .add_xaxis([x[0] for x in table3_data])
        .add_yaxis("流量(万)", [x[1] for x in table3_data])
        .set_global_opts(
            title_opts=opts.TitleOpts(title="表3流量趋势"),
            xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=20)),
            # datazoom_opts=[opts.DataZoomOpts(type_="slider")],
        )
    )

    page.add(line1, line2, line3)
    page.render("line_charts.html")


if __name__ == "__main__":
    # 生成三个页面
    create_pie_page()
    create_bar_page()
    create_line_page()
    print("图表已生成:pie_charts.html, bar_charts.html, line_charts.html")

可视化页面展示:

 

 

 

 

 

posted @ 2025-02-19 20:54  贾贾鱼  阅读(7)  评论(0)    收藏  举报