2025.2.19
今天进行课堂测试,要求如下:
Result文件数据说明:
Ip:106.39.41.166,(城市)
Date:10/Nov/2016:00:01:02 +0800,(日期)
Day:10,(天数)
Traffic: 54 ,(流量)
Type: video,(类型:视频video或文章article)
Id: 8701(视频或者文章的id)
测试要求:
1、 数据清洗:按照进行数据清洗,并将清洗后的数据导入hive数据库中。
两阶段数据清洗:
(1)第一阶段:把需要的信息从原始日志中提取出来
ip: 199.30.25.88
time: 10/Nov/2016:00:01:03 +0800
traffic: 62
文章: article/11325
视频: video/3235
(2)第二阶段:根据提取出来的信息做精细化操作
ip--->城市 city(IP)
date--> time:2016-11-10 00:01:03
day: 10
traffic:62
type:article/video
id:11325
(3)hive数据库表结构:
create table data( ip string, time string , day string, traffic bigint,
type string, id string )
2、数据分析:在HIVE统计下列数据。
(1)统计最受欢迎的视频/文章的Top10访问次数 (video/article)
(2)按照地市统计最受欢迎的Top10课程 (ip)
(3)按照流量统计最受欢迎的Top10课程 (traffic)
3、数据可视化:
将统计结果倒入MySql数据库中,通过图形化展示的方式展现出来。
最终在课上只完成了前两个阶段,因为在hive上统计的过程十分缓慢,最开始还因为没有完全理解三项统计的要求而导致sql语句的查询条件错误,浪费了大量时间,最后在宿舍完成了所有要求,具体代码如下:
from pyecharts.charts import Pie, Bar, Line, Page from pyecharts import options as opts from pyecharts.commons.utils import JsCode import pymysql # 数据库配置 DB_CONFIG = { "host": "localhost", "port": 3306, "user": "root", "password": "123456", "db": "hive", "charset": "utf8mb4" } def fetch_data_from_mysql(table_name, fields): """从MySQL获取数据""" connection = pymysql.connect(**DB_CONFIG) try: with connection.cursor() as cursor: sql = f"SELECT {','.join(fields)} FROM {table_name}" cursor.execute(sql) return cursor.fetchall() finally: connection.close() def process_table1_data(): """处理表1数据(type)""" raw_data = fetch_data_from_mysql("video_article_top10", ["type", "id", "visit_count"]) type_dict = {} for item in raw_data: type_name = item[0] id = item[1] num = item[2] type_dict[f"{type_name}/{id}"] = num return [(k, v) for k, v in type_dict.items()] def process_table2_data(): """处理表2数据(ip)""" raw_data = fetch_data_from_mysql("city_course_top10", ["city", "id", "visit_count"]) ip_dict = {} for item in raw_data: ip = item[0] id = item[1] num = item[2] ip_dict[f"{ip}/{id}"] = num return [(k, v) for k, v in ip_dict.items()] def process_table3_data(): """处理表3数据(traffic)""" raw_data = fetch_data_from_mysql("traffic_course_top10", ["id", "total_traffic"]) traffic_data = sorted(raw_data, key=lambda x: x[1], reverse=True)[:10] # 取前10大流量 return [(str(item[0]), item[1] / 10000) for item in traffic_data] # 转换为万单位 def create_pie_page(): """创建饼图页面""" page = Page(layout=Page.SimplePageLayout) # 表1饼图 pie1 = ( Pie( init_opts=opts.InitOpts(width="1200px", height="800px") ) .add("", process_table1_data(), radius=["30%", "70%"]) .set_global_opts( title_opts=opts.TitleOpts(title="表1分类分布", pos_left="center"), legend_opts=opts.LegendOpts( orient="vertical", pos_top="15%", pos_right="2%", type_="scroll", is_show=True ) ) .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}")) .set_colors(["#C1232B", "#B5C334", "#FCCE10", "#E87C25", "#27727B"]) ) # 表2饼图 pie2 = ( Pie( init_opts=opts.InitOpts(width="1200px", height="800px") ) .add("", process_table2_data(), radius=["30%", "70%"]) .set_global_opts( title_opts=opts.TitleOpts(title="表2IP分布", pos_left="center"), legend_opts=opts.LegendOpts( orient="vertical", pos_top="15%", pos_right="2%", type_="scroll", is_show=True ) ) .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}")) .set_colors(["#C1232B", "#B5C334", "#FCCE10", "#E87C25", "#27727B", "#FE8463", "#9BCA63", "#FAD860", "#F3A43B", "#60C0DD"]) ) # 表3饼图 pie3 = ( Pie( init_opts=opts.InitOpts(width="1200px", height="800px") ) .add("", process_table3_data(), radius=["30%", "70%"]) .set_global_opts( title_opts=opts.TitleOpts(title="表3流量分布", pos_left="center"), legend_opts=opts.LegendOpts( orient="vertical", pos_top="15%", pos_right="2%", type_="scroll", is_show=True ) ) .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}")) .set_colors(["#C1232B", "#B5C334", "#FCCE10", "#E87C25", "#27727B", "#FE8463", "#9BCA63", "#FAD860", "#F3A43B", "#60C0DD", "#D7504B", "#C6E579", "#F4E001", "#F0805A", "#26C0C0"]) ) page.add(pie1, pie2, pie3) page.render("pie_charts.html") def create_bar_page(): """创建柱状图页面""" page = Page(layout=Page.SimplePageLayout) # 表1柱状图 table1_data = process_table1_data() bar1 = ( Bar( init_opts=opts.InitOpts(width="900px", height="600px") ) .add_xaxis([x[0] for x in table1_data]) .add_yaxis("数量(次)", [x[1] for x in table1_data]) .set_global_opts( title_opts=opts.TitleOpts(title="表1分类统计"), xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=20)), # datazoom_opts=[opts.DataZoomOpts(type_="slider")], ) ) # 表2柱状图 table2_data = process_table2_data() bar2 = ( Bar( init_opts=opts.InitOpts(width="900px", height="600px") ) .add_xaxis([x[0] for x in table2_data]) .add_yaxis("数量(次)", [x[1] for x in table2_data]) .set_global_opts( title_opts=opts.TitleOpts(title="表2IP统计"), xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=20)), # datazoom_opts=[opts.DataZoomOpts(type_="slider")], ) ) # 表3柱状图 table3_data = process_table3_data() bar3 = ( Bar( init_opts=opts.InitOpts(width="900px", height="600px") ) .add_xaxis([x[0] for x in table3_data]) .add_yaxis("流量(万)", [x[1] for x in table3_data]) .set_global_opts( title_opts=opts.TitleOpts(title="表3流量统计"), xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=20)), # datazoom_opts=[opts.DataZoomOpts(type_="slider")], ) ) page.add(bar1, bar2, bar3) page.render("bar_charts.html") def create_line_page(): """创建折线图页面""" page = Page(layout=Page.SimplePageLayout) # 表1折线图 table1_data = process_table1_data() line1 = ( Line( init_opts=opts.InitOpts(width="900px", height="600px") ) .add_xaxis([x[0] for x in table1_data]) .add_yaxis("数量(次)", [x[1] for x in table1_data]) .set_global_opts( title_opts=opts.TitleOpts(title="表1分类趋势"), xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=20)), # datazoom_opts=[opts.DataZoomOpts(type_="slider")], ) ) # 表2折线图 table2_data = process_table2_data() line2 = ( Line( init_opts=opts.InitOpts(width="900px", height="600px") ) .add_xaxis([x[0] for x in table2_data]) .add_yaxis("数量(次)", [x[1] for x in table2_data]) .set_global_opts( title_opts=opts.TitleOpts(title="表2IP趋势"), xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=20)), # datazoom_opts=[opts.DataZoomOpts(type_="slider")], ) ) # 表3折线图 table3_data = process_table3_data() line3 = ( Line( init_opts=opts.InitOpts(width="900px", height="600px") ) .add_xaxis([x[0] for x in table3_data]) .add_yaxis("流量(万)", [x[1] for x in table3_data]) .set_global_opts( title_opts=opts.TitleOpts(title="表3流量趋势"), xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=20)), # datazoom_opts=[opts.DataZoomOpts(type_="slider")], ) ) page.add(line1, line2, line3) page.render("line_charts.html") if __name__ == "__main__": # 生成三个页面 create_pie_page() create_bar_page() create_line_page() print("图表已生成:pie_charts.html, bar_charts.html, line_charts.html")
可视化页面展示:





 
                    
                     
                    
                 
                    
                
 
                
            
         
         浙公网安备 33010602011771号
浙公网安备 33010602011771号