select a.id, a.month from user_b a union all select b.id, b.day from user_b b;
Bucketed tables: similar to hash partitioning
=============================================
bucket // clustered by (id) into 2 buckets
Bucketing avoids the flood of directories that an over-partitioned table would create
Each bucket exists as a file segment inside the table (or partition) directory
Inserting into a table that is both partitioned and bucketed (partition values are placeholders):
insert into xx partition(<col>='<value>', <col>='<value>') select 1,'tom',20;
000000 // first bucket file
000001 // second bucket file
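A minimal end-to-end sketch of bucketing (table and source names are made up for illustration):
-- older Hive versions need this so inserts actually hash rows into buckets
set hive.enforce.bucketing=true;
create table user_bucket(id int, name string, age int)
clustered by (id) into 2 buckets
row format delimited fields terminated by '\t';
-- rows are hashed on id into the two bucket file segments (the 000000/000001 files above)
insert into user_bucket select id, name, age from user_src;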
select size(arr) // number of elements in a collection
select array(1,2,3) // build an array
select map(1,'tom',2,'tomas') // build a map
select struct(1,'tom','male') // build an anonymous struct ['col1':1,'col2':'tom' ...]
select named_struct('id',1,'name','tom','sex','male') // build a named struct
select current_date(); // current date
select current_timestamp(); // current time (millisecond precision)
select date_format(current_date(),'yyyyMMdd') // format a date
select date_format(current_timestamp(),'yyyyMMdd') // format a timestamp
select from_unixtime(bigint,'yyyyMMdd') // convert a unix timestamp (bigint seconds) to a date string
Hive file formats: stored as parquet;
===========================================
Load path: create a text-format table first, then insert into xx select * from it (sketch below)
Row-oriented formats
text
seqFile // 1M blocks
Column-oriented formats // projection queries can skip the columns they do not need
rcFile // 4M blocks
orcFile // supports larger blocks, up to 256M
parquet // supported by more components of the Hadoop ecosystem
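A sketch of that load path with illustrative table names: stage the data in a text table, then rewrite it into a columnar format with an insert-select.
create table logs_text(id int, msg string)
row format delimited fields terminated by '\t'
stored as textfile;
create table logs_parquet(id int, msg string)
stored as parquet;
-- the insert rewrites the text rows as parquet files
insert into logs_parquet select * from logs_text;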
SerDe:
=========================================
serialize && deserialize
Maps the fields of a file onto the columns of a Hive table
Applied after the InputFormat stage
textFile: LazySimpleSerDe
OpenCSVSerde
JsonSerDe // the table's column names must match the keys in the JSON
Specifying a SerDe when creating a table:
row format serde 'org.xxx.xxx.JsonSerde';
Column names that start with special characters must be backquoted:
create table xx(`_location` string);
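A sketch of a JSON table, assuming the HCatalog JsonSerDe that ships with Hive (org.apache.hive.hcatalog.data.JsonSerDe; use whichever SerDe jar is actually on the classpath), with a backquoted column as above:
create table merchant_json(
  `_location` string,
  id int,
  name string
)
row format serde 'org.apache.hive.hcatalog.data.JsonSerDe'
stored as textfile;
-- remember: the column names must match the JSON keys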
Analytic (window) functions:
=================================
sum()over(sort by age rows between unbounded preceding and unbounded following); // ordering plus an explicit ROWS frame
sum()over(sort by age range between unbounded preceding and unbounded following); // same, but a RANGE frame
sum()over(partition by province order by age range between unbounded preceding and unbounded following); // partition + ordering + frame
current row // the current row
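A concrete version of those frame clauses, using the user1 table (id, name, age, province, city) that appears later in these notes:
select id, province, age,
  sum(age) over (partition by province order by age
                 rows between unbounded preceding and current row) as running_sum,
  sum(age) over (partition by province) as province_total
from user1;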
Hive ranking functions
row_number() gives a tie-free sequence (even equal rows get different numbers); it can be used for pagination
dense_rank() ranks without gaps: two rows tied at 2nd are followed by 3rd
rank() ranks with gaps: two rows tied at 2nd are followed by 4th
-- percent_rank(): relative rank expressed as a percentage
SELECT empno, deptno, salary, percent_rank() OVER (PARTITION BY deptno ORDER BY salary desc) from emp;
rank() // ties, then skip: 1 1 3
dense_rank() // no skip: 1 1 2
row_number() // strict sequence: 1 2 3
ntile(n) // split the rows into n roughly equal groups
first_value() // first value in the window
lead() // look ahead to a following row
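The functions above side by side, on the emp table from the percent_rank example:
select deptno, empno, salary,
  rank()        over (partition by deptno order by salary desc) as rk,
  dense_rank()  over (partition by deptno order by salary desc) as drk,
  row_number()  over (partition by deptno order by salary desc) as rn,
  ntile(3)      over (partition by deptno order by salary desc) as grp,
  first_value(salary) over (partition by deptno order by salary desc) as top_salary,
  lead(salary)  over (partition by deptno order by salary desc) as next_salary
from emp;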
// users active in two consecutive months
select id , m , n from
(select id, (month+1) as m , lead(month)over(partition by id order by month) as n from user_c) a
where a.m=a.n;
// users active in three consecutive months
select distinct id from (select id, (month+2) as first , (lead(month)over(partition by id order by month)+1)
as second, lead(month,2)over(partition by id order by month) as third from user_c) a
where a.first=a.second and a.second=a.third;
lag() // look back to a preceding row
pv: page view // total number of page views
uv: unique visitors // number of distinct users
Computing uv per month and per day from the pv data:
select month,day, count(distinct id) as uv from user_b group by month,day;
Advanced aggregation functions: **********************
grouping__id // identifies which grouping produced the row
grouping sets // compute monthly-active and daily-active counts in a single query
select month,day, count(distinct id) as uv, grouping__id from user_b group by month,day
grouping sets(month,day) order by grouping__id ;
+----------+-------------+-----+---------------+--+
| month | day | uv | grouping__id |
+----------+-------------+-----+---------------+--+
| 2015-04 | NULL | 6 | 1 |
| 2015-03 | NULL | 5 | 1 |
| 2015-02 | NULL | 2 | 1 |
| NULL | 2015-04-16 | 2 | 2 |
| NULL | 2015-04-15 | 2 | 2 |
| NULL | 2015-04-13 | 3 | 2 |
| NULL | 2015-04-12 | 2 | 2 |
| NULL | 2015-03-12 | 1 | 2 |
| NULL | 2015-03-10 | 4 | 2 |
| NULL | 2015-02-16 | 2 | 2 |
+----------+-------------+-----+---------------+--+
rollup // hierarchical subtotals plus a grand total
select month,day, count(distinct id) as uv, grouping__id from user_b group by month,day
with rollup order by grouping__id ;
+----------+-------------+-----+---------------+--+
| month | day | uv | grouping__id |
+----------+-------------+-----+---------------+--+
| NULL | NULL | 7 | 0 |
| 2015-04 | NULL | 6 | 1 |
| 2015-03 | NULL | 5 | 1 |
| 2015-02 | NULL | 2 | 1 |
| 2015-04 | 2015-04-16 | 2 | 3 |
| 2015-04 | 2015-04-15 | 2 | 3 |
| 2015-04 | 2015-04-13 | 3 | 3 |
| 2015-04 | 2015-04-12 | 2 | 3 |
| 2015-03 | 2015-03-12 | 1 | 3 |
| 2015-03 | 2015-03-10 | 4 | 3 |
| 2015-02 | 2015-02-16 | 2 | 3 |
+----------+-------------+-----+---------------+--+
select day, month, count(distinct id) as uv, grouping__id from user_b group by day,month
with rollup order by grouping__id ;
+-------------+----------+-----+---------------+--+
| day | month | uv | grouping__id |
+-------------+----------+-----+---------------+--+
| NULL | NULL | 7 | 0 |
| 2015-04-16 | NULL | 2 | 2 |
| 2015-04-15 | NULL | 2 | 2 |
| 2015-04-13 | NULL | 3 | 2 |
| 2015-04-12 | NULL | 2 | 2 |
| 2015-03-12 | NULL | 1 | 2 |
| 2015-03-10 | NULL | 4 | 2 |
| 2015-02-16 | NULL | 2 | 2 |
| 2015-04-16 | 2015-04 | 2 | 3 |
| 2015-04-13 | 2015-04 | 3 | 3 |
| 2015-04-12 | 2015-04 | 2 | 3 |
| 2015-03-12 | 2015-03 | 1 | 3 |
| 2015-03-10 | 2015-03 | 4 | 3 |
| 2015-02-16 | 2015-02 | 2 | 3 |
| 2015-04-15 | 2015-04 | 2 | 3 |
+-------------+----------+-----+---------------+--+
cube // every combination of the grouping columns
select month,day, count(distinct id) as uv, grouping__id from user_b group by month,day
with cube order by grouping__id ;
With n grouping columns, cube produces every subset of them (2^n groupings, including the empty one):
2 columns: (), (1), (2), (1,2)
3 columns: (), (1), (2), (3), (1,2), (1,3), (2,3), (1,2,3)
4 columns: 16 groupings in total
+----------+-------------+-----+---------------+--+
| month | day | uv | grouping__id |
+----------+-------------+-----+---------------+--+
| NULL | NULL | 7 | 0 |
| 2015-02 | NULL | 2 | 1 |
| 2015-04 | NULL | 6 | 1 |
| 2015-03 | NULL | 5 | 1 |
| NULL | 2015-02-16 | 2 | 2 |
| NULL | 2015-04-16 | 2 | 2 |
| NULL | 2015-04-15 | 2 | 2 |
| NULL | 2015-04-13 | 3 | 2 |
| NULL | 2015-04-12 | 2 | 2 |
| NULL | 2015-03-12 | 1 | 2 |
| NULL | 2015-03-10 | 4 | 2 |
| 2015-04 | 2015-04-12 | 2 | 3 |
| 2015-03 | 2015-03-12 | 1 | 3 |
| 2015-03 | 2015-03-10 | 4 | 3 |
| 2015-04 | 2015-04-16 | 2 | 3 |
| 2015-02 | 2015-02-16 | 2 | 3 |
| 2015-04 | 2015-04-15 | 2 | 3 |
| 2015-04 | 2015-04-13 | 3 | 3 |
+----------+-------------+-----+---------------+--+
cookie1 1
cookie2 5
cookie2 4
cookie1 3
cookie2 7
cookie1 4
cookie2 2
cookie3 2
cookie2 3
cookie3 5
cookie1 6
cookie3 10
cookie2 8
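The rows above look like (cookie id, pv) practice data; assuming they live in a table cookie_pv(cookieid string, pv int), a typical window query over them would be:
select cookieid, pv,
  sum(pv)  over (partition by cookieid order by pv
                 rows between unbounded preceding and current row) as running_pv,
  lag(pv)  over (partition by cookieid order by pv) as prev_pv,
  lead(pv) over (partition by cookieid order by pv) as next_pv
from cookie_pv;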
Hive transactions (ACID):
===================================
1. Configuration
SET hive.support.concurrency=true;
SET hive.enforce.bucketing=true;
SET hive.exec.dynamic.partition.mode=nonstrict;
SET hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
SET hive.compactor.initiator.on=true;
SET hive.compactor.worker.threads=1;
2. The table must be bucketed
3. The table must be declared transactional
4. The table must be stored as ORC
create table user3(id int, name string, age int, province string, city string)
clustered by(province) into 2 buckets
row format delimited
fields terminated by '\t'
stored as orc
tblproperties('transactional'='true');
5、insert into user3 select * from user1;
6、update user3 set age=200 where id=5;
Sampling: take a subset of a large dataset
============================================
Random sampling:
select * from user2 distribute by rand() sort by rand() limit 10;
random distribution across reducers + random sort within each reducer
Bucket sampling:
an optimized form of sampling for bucketed tables
select name from user1 TABLESAMPLE(bucket 1 <bucket to pick> out of 4 <total buckets> on rand());
randomly picks the data of one bucket
create table user1_bucket(id int, name string, age int, province string, city string)
clustered by(province) into 4 buckets
row format delimited fields terminated by '\t';
insert into user1_bucket select * from user1;
1 tom 30 anhui anhui
2 tomas 30 hefei
1 tom 30 heilongjiang
2 tomas 30 jilin
1 tom 30 liaoning
2 tomas 30 neimenggu
1 tom 30 shandong
2 tomas 30 shanxi
1 tom 30 qinghai
2 tomas 30 jiangsu
1 tom 30 gansu
2 tomas 30 ningxia
1 tom 30 hubei
2 tomas 30 sichuan
1 tom 30 chongqing
2 tomas 30 taiwan
1 tom 30 xianggang
2 tomas 30 guangdong
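With user1_bucket in place, sampling on the clustering column lets Hive read only the matching bucket file instead of the whole table (a sketch):
-- bucket 1 of 4 on province, the clustering column, so only that file segment is scanned
select * from user1_bucket tablesample(bucket 1 out of 4 on province);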
Block sampling: sample by percentage, by data size, or by row count
select * from users TABLESAMPLE(5 percent);
select * from users TABLESAMPLE(30M);
select * from users TABLESAMPLE(2 rows);
User-defined functions
=====================
1. UDF // one row in, one row out: add(id,name) ===> idname
// processes one row at a time and returns one row
entry point: extend the UDF class
2. UDTF // one row in, many rows out: explode(arr) ===> arr[1]
arr[2]
arr[3]
3. UDAF // many rows in, one row out: sum(age) ===> 500
Writing a custom Hive function:
==========================================
1. pom dependency
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-exec</artifactId>
<version>2.1.1</version>
</dependency>
2. Write the code
Note: Hive UDFs do not support varargs, but a List parameter works
package com.oldboy.hive;

import java.util.List;

import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;

@Description(
    name = "add",
    value = "this is an add function",
    extended = "select add(1,2,3) ====> 6 ; select add('hello','world') ====> helloworld"
)
public class UDFAdd extends UDF {
    /**
     * Add two integers.
     */
    public Integer evaluate(int i, int j) {
        return i + j;
    }

    /** Concatenate two strings. */
    public String evaluate(String i, String j) {
        return i + j;
    }

    /** Sum every element of a list, e.g. add(array(1,2,3)). */
    public int evaluate(List<Integer> i) {
        int j = 0;
        for (Integer integer : i) {
            j += integer;
        }
        return j;
    }
}
3. Build and load the jar
Option 1: package the jar and copy it into Hive's lib directory // Hive's default classpath
Option 2: add jar /soft/hive/lib/myhive-1.0-SNAPSHOT.jar; // add it to the classpath by hand
Option 3: edit hive-site.xml // hive.aux.jars.path=/x/x/x.jar
4. Register the class as a Hive function
Create the function [temporary function]
beeline> create temporary function add as 'com.oldboy.hive.UDFAdd';
[permanent function]
beeline> create function add as 'com.oldboy.hive.UDFAdd'
using jar 'hdfs:///path/to/x.jar';
This approach matters: on a fully distributed cluster the jar must be registered this way, otherwise the worker nodes cannot find the function class.
5. Dropping the function:
drop temporary function add ;
1. Parse each merchant's tags with a UDF and return them
json ==== udf + fastjson ===> {'good taste','good service'}
How to use fastjson inside Hive? // copy its jar into Hive's lib directory and restart Hive
1) Write and package the program
2) Copy the UDF jar and the fastjson jar into Hive's lib directory
3) Register a temporary function: create temporary function parseJson as 'com.oldboy.hive.TempTagUDF';
4) Create the temptags table with id and json columns, delimited by '\t'
5) Load TempTag.txt into the temptags table
6) Run the UDF over the merchant reviews
7) Count each merchant's distinct tags
select id, count(distinct tag) from temptags lateral view explode(parseJson(json)) xx as tag group by id;
8) Count how many times each merchant received each tag
select id, tag, count(tag) from temptags lateral view explode(parseJson(json)) xx as tag group by id,tag ;
!!!! Problem: ClassNotFoundException
Caused by: java.lang.ClassNotFoundException: com.oldboy.hive.TempTagUDF
Cause: when Hive runs the query as an MR job, the other nodes never receive the jar that contains TempTagUDF
Fix: copy the UDF jar and the fastjson jar into ${HADOOP_HOME}/share/hadoop/common/lib and sync them to every node
No need to restart Hadoop
Set the Maven compiler to JDK 1.8 (the default is 1.5) // goes under the <project> tag
<build>
  <plugins>
    <plugin>
      <artifactId>maven-compiler-plugin</artifactId>
      <configuration>
        <source>1.8</source>
        <target>1.8</target>
      </configuration>
    </plugin>
  </plugins>
</build>
Virtual columns (lateral view):
======================
> select name, explode(work_place) from default.employee; // a UDTF cannot be used alongside other SELECT expressions
UDTF's are not supported outside the SELECT clause, nor nested in expressions (state=42000,code=10081)
lateral view // complements table-generating functions
select id, tag from temptags lateral view explode(parseJson(json)) xx as tag ;
// virtual column
// lateral view explode(parseJson(json)) xx as tag (xx is just a placeholder alias with no meaning)
select id, explode(...) // not allowed without lateral view
UDTF:
===================================
StructObjectInspector class
ObjectInspector[]
create temporary function wc as 'com.oldboy.hive.TestUDTF';
The lazy format in LazySimpleSerDe: fields default to string and are converted only when they are used or a type is declared
Hive optimization:
=====================================
Performance tools:
EXPLAIN // shows the overall flow of the MR jobs Hive will run
//explain select count(1) from wc;
ANALYZE // gathers statistics so the next run can use the CBO (cost-based optimizer)
//analyze table wc compute STATISTICS
//desc formatted wc =====> shows row counts and file sizes
Design optimization:
Partitioned tables: // partition by date+time, location, or a business key; speeds up WHERE-clause pruning
//create table xx() partitioned by (province string, city string);
Bucketed tables: // enable sampling and speed up joins (if the bucketing column matches the join column,
//the join reads only the matching bucket file segments instead of scanning whole files)
//create table xx() clustered by(province) into 2 buckets;
Indexes: // CREATE INDEX idx_id_employee_id ON TABLE employee_id (id)
AS 'COMPACT' WITH DEFERRED REBUILD; // compact index
// CREATE INDEX idx_id_employee_id ON TABLE employee_id (id)
AS 'BITMAP' WITH DEFERRED REBUILD; // bitmap index
Execution engine: // Hive 2 no longer recommends MR as the execution engine
//Spark or Tez is recommended instead
Data file format optimization:
Formats:
text
seqFile
RCFile
ORCFile
Parquet
Compression codecs (DefaultCodec/gzip/lzo/bzip2/lz4/snappy)
Intermediate data compression: // relieves network and disk pressure between stages
//set hive.exec.compress.intermediate=true
//set hive.intermediate.compression.codec=org.apache.hadoop.io.compress.SnappyCodec
Output compression // reduces the disk footprint of the final output
//set hive.exec.compress.output=true
//set mapred.output.compression.codec=org.apache.hadoop.io.compress.SnappyCodec
Storage optimization:
1. Use HAR (Hadoop archive) files
2. Use the SequenceFile format
3. Use CombineFileInputFormat // Hive's automatic optimization: many small files are combined into one split
4. Use HDFS federation // spread namenode metadata across extra nodes to scale capacity horizontally (rarely needed here)
5. Settings:
set hive.merge.mapfiles=true // merge small output files of map-only jobs
set hive.merge.mapredfiles=true // merge small output files of MR jobs
set hive.merge.size.per.task=256000000 // target size of the merged files
set hive.merge.smallfiles.avgsize=16000000 // average-file-size threshold that triggers the merge
Job and query optimization:
Local mode:
beeline local mode (no need to start hiveserver2): beeline -u jdbc:hive2:// -n centos
Conditions for Hive to run a query in local mode automatically:
SET hive.exec.mode.local.auto=true; // allow automatic local mode
SET hive.exec.mode.local.auto.inputbytes.max=50000000; // input-size threshold; above it the query leaves local mode
SET hive.exec.mode.local.auto.input.files.max=5; // input-file-count threshold; above it the query leaves local mode
JVM reuse: only applies to MRv1, not to YARN
tasks of an MR job share JVM instances instead of each starting its own
set mapred.job.reuse.jvm.num.tasks=5
Parallel computation: usually means distributed jobs, the same job processed on different hosts at the same time
Concurrent execution: usually refers to threads
The stages of a Hive job do not always depend on one another; independent stages can run in parallel:
set hive.exec.parallel=true;
set hive.exec.parallel.thread.number=16;
Join optimization: **********
Common join // reduce-side join
//hint the large table with /*+ STREAMTABLE(bigtable) */
Map join // map-side join
//hint the small table with /*+ MAPJOIN(smalltable) */ (example below)
SET hive.auto.convert.join=true;
--default false
SET hive.mapjoin.smalltable.filesize=600000000;
--default 25M; above this size the join falls back to a reduce-side join
SET hive.auto.convert.join.noconditionaltask=true;
--default false; true means no hint is needed
SET hive.auto.convert.join.noconditionaltask.size=10000000;
--matches the table size against available memory
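A hinted map-side join might look like this (dept is an illustrative dimension table; with hive.auto.convert.join=true the hint is usually unnecessary):
select /*+ MAPJOIN(d) */ e.empno, e.salary, d.dname
from emp e join dept d on e.deptno = d.deptno;
-- the STREAMTABLE hint works the same way, naming the large table instead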
Bucket map join: //SET hive.auto.convert.join=true; --default false
//SET hive.optimize.bucketmapjoin=true; --default false
!!!!!!!! Handling data skew in joins
//SET hive.optimize.skewjoin=true; enables skew-join load balancing
SET hive.skewjoin.key=100000; // if a reducer receives more rows than this for a key, the extra rows are automatically sent to idle reducers
group by optimization
!!!!!!!! Handling data skew in group by
SET hive.groupby.skewindata=true;
1. Rewrite the following statement in a Hive-friendly way:
SELECT a.key,a.value
FROM a
WHERE a.key not in (SELECT b.key FROM b)
答案:
select a.key, a.value from a left outer join b on a.key = b.key where b.key is null;
(equivalently: select a.key, a.value from a where not exists (select 1 from b where b.key = a.key);)
2. Multi-group-by is a very useful Hive feature; give an example. // multiple groupings from one table scan
from A
insert overwrite table B
select A.a, count(distinct A.b) group by A.a
insert overwrite table C
select A.c, count(distinct A.b) group by A.c
3. Write the statement that loads text.txt into the '2016-10-10' partition of Hive table test, whose partition column is l_date.
LOAD DATA LOCAL INPATH '/your/path/test.txt' OVERWRITE INTO TABLE test PARTITION (l_date='2016-10-10')
https://blog.csdn.net/haohaixingyun/article/details/52819588 (reference link)
https://blog.csdn.net/ukakasu/article/details/47860647 (interview questions with worked examples on large tables; includes links to the original questions)