hive数据分析
题目:
样表(sales_sample_20170310)字段说明: day_id 日期编号; sale_nbr 卖出方代码; buy_nbr 买入方代码; cnt 数量 round 金额 卖出方和买入方又分为3种类型: 以'C'开头的表示类型为C,代表“航空公司”,只可以卖出,不可以买入; 以'O'开头的表示类型为O,代表“代理人”,既可以卖出,也可以买入,并且允许自己卖给自己(简单来讲:每个“代理人”代码可能对应多个售票点,售票点之前有交换票的情况,所以体现为自己卖给了自己); 'PAX'表示类型为PAX,代表“旅客”,只可以买入,不可以卖出。
先正常启动hive
准备数据
create table sales_sample_20170310 ( `day_id` string comment '时间', `sale_nbr` string comment '卖出方', `buy_nbr` string comment '买入方', `cnt` int comment '数量', `round` int comment '金额' ) row format delimited fields terminated by ',' lines terminated by '\n';
load data inpath '/hivedata/sales_sample_20170310.csv' into table sales_sample_20170310;
inpath之后为文件在hdfs中的位置
验证:
select * from sales_sample_20170310 limit 10000;
简单清洗:
insert overwrite table sales_sample_20170310 select concat('2022-10-', day_id), sale_nbr, buy_nbr, cnt, round from sales_sample_20170310; create table sales_sample111 as select to_date(from_unixtime(UNIX_TIMESTAMP(day_id, 'yyyy-MM-dd'))) as day_id, sale_nbr, buy_nbr, cnt, round from sales_sample_20170310; select * from sales_sample111 limit 1000;
所需的四个表:
-- 1 CREATE TABLE sale_hangkong AS SELECT day_id, sale_nbr, SUM(cnt) AS cnt, SUM(round) AS round FROM sales_sample111 WHERE sale_nbr LIKE 'C%' GROUP BY day_id, sale_nbr; select * from sale_hangkong limit 10000; -- 2 create table day_sale as SELECT day_id, sale_nbr, SUM(cnt) AS cnt, SUM(round) AS rount FROM sales_sample111 WHERE sale_nbr LIKE 'O%' GROUP BY day_id, sale_nbr; select * from day_sale limit 1000; -- 3 create table huoyuedumaichu as select day_id, sale_nbr, count(*) as sale_number from sales_sample111 where sale_nbr like 'O%' group by sale_nbr, day_id; create table huoyuedumairu as select day_id, buy_nbr, count(*) as sale_number from sales_sample111 where buy_nbr like 'O%' group by buy_nbr, day_id; create table huoyuedu as select a.day_id day_id, a.sale_nbr sale_nbr, a.sale_number + h.sale_number sale_number from huoyuedumaichu a join huoyuedumairu h on a.day_id = h.day_id and a.sale_nbr = h.buy_nbr; select * from huoyuedu limit 1000; -- 4 --计算代理商买入数量金额 drop table mairu; create table mairu as select day_id, buy_nbr, sum(cnt) as cnt, sum(round) as round from sales_sample111 where buy_nbr like 'O%' group by day_id, buy_nbr; select * from mairu limit 1000; create table maichu as select day_id, sale_nbr, sum(cnt) as cnt, sum(round) as round from sales_sample111 where sale_nbr like 'O%' group by day_id, sale_nbr; select * from maichu limit 1000; create table lirun as select a.day_id as day_id, b.sale_nbr as nbr, a.cnt as cnt_buy, a.round as rount_buy, b.cnt as cnt_sale, b.round as round_sale, b.round - a.round as liren from mairu a join maichu b on a.buy_nbr = b.sale_nbr and a.day_id = b.day_id where a.day_id between '2022-10-01' and '2022-10-15'; select * from lirun limit 1000;
将数据通过sqoop传到mysql中:
第一个表类似
create table day_sale( day_id varchar(20) not null , sale_nbr varchar(20), cnt_sum int , round_sum int ); bin/sqoop export --connect "mysql链接-到数据库?useUnicode=true&characterEncoding=utf-8" --username root --password 123456 --table day_sale --export-dir /user/hive/warehouse/db_msg.db/day_sale --fields-terminated-by "\001" --input-null-non-string '\\N' create table huoyuedu( day_id varchar(50) , sale_nbr varchar(20), sale_number int ); bin/sqoop export --connect "mysql链接-到数据库?useUnicode=true&characterEncoding=utf-8" --username root --password 123456 --table huoyuedu --export-dir /user/hive/warehouse/db_msg.db/huoyuedu --fields-terminated-by "\001" --input-null-non-string '\\N' create table lirun( day_id varchar(50) , nbr varchar(20), cnt_buy int, rount_buy int, cnt_sale int, round_sale int, lirun int ); bin/sqoop export --connect "mysql链接-到数据库?useUnicode=true&characterEncoding=utf-8" --username root --password 123456 --table lirun --export-dir /user/hive/warehouse/db_msg.db/lirun --fields-terminated-by "\001" --input-null-non-string '\\N'
最后进行springboot链接到虚拟机中的mysql与平常一样,就不再表述

浙公网安备 33010602011771号