Hive学习笔记——在Hive中使用AvroSerde

Hive支持使用avro serde作为序列化的方式,参考:

https://cwiki.apache.org/confluence/display/hive/avroserde
https://www.docs4dev.com/docs/zh/apache-hive/3.1.1/reference/AvroSerDe.html
https://github.com/jghoman/haivvreo

以及CDH官方的文档

https://docs.cloudera.com/documentation/enterprise/6/6.3/topics/cdh_ig_avro_usage.html

  

1.定义avro schema,kst.avsc

{
  "namespace": "com.linkedin.haivvreo",
  "name": "test_serializer",
  "type": "record",
  "fields": [
    { "name":"string1", "type":"string" },
    { "name":"int1", "type":"int" },
    { "name":"tinyint1", "type":"int" },
    { "name":"smallint1", "type":"int" },
    { "name":"bigint1", "type":"long" },
    { "name":"boolean1", "type":"boolean" },
    { "name":"float1", "type":"float" },
    { "name":"double1", "type":"double" },
    { "name":"list1", "type":{"type":"array", "items":"string"} },
    { "name":"map1", "type":{"type":"map", "values":"int"} },
    { "name":"struct1", "type":{"type":"record", "name":"struct1_name", "fields": [
          { "name":"sInt", "type":"int" }, { "name":"sBoolean", "type":"boolean" }, { "name":"sString", "type":"string" } ] } },
    { "name":"union1", "type":["float", "boolean", "string"] },
    { "name":"enum1", "type":{"type":"enum", "name":"enum1_values", "symbols":["BLUE","RED", "GREEN"]} },
    { "name":"nullableint", "type":["int", "null"] },
    { "name":"bytes1", "type":"bytes" },
    { "name":"fixed1", "type":{"type":"fixed", "name":"threebytes", "size":3} }
  ] }

将schema文件其放到HDFS上

hadoop fs -ls /user/hive/schema
Found 1 items
-rw-r--r--   3 hive hive       1131 2021-12-04 13:53 /user/hive/schema/kst.avsc

2.建Hive表

CREATE TABLE default.kst
  PARTITIONED BY (ds string)
  ROW FORMAT SERDE
  'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
  STORED AS INPUTFORMAT
  'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
  OUTPUTFORMAT
  'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
  TBLPROPERTIES (
    'avro.schema.url'='hdfs:///user/hive/schema/kst.avsc');
    

或者直接在TBLPROPERTIES中指定schema

CREATE TABLE default.kst
  PARTITIONED BY (ds string)
  ROW FORMAT SERDE
  'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
  STORED AS INPUTFORMAT
  'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
  OUTPUTFORMAT
  'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
  TBLPROPERTIES (
    'avro.schema.literal'='{
  "namespace": "com.linkedin.haivvreo",
  "name": "test_serializer",
  "type": "record",
  "fields": [
    { "name":"string1", "type":"string" },
    { "name":"int1", "type":"int" },
    { "name":"tinyint1", "type":"int" },
    { "name":"smallint1", "type":"int" },
    { "name":"bigint1", "type":"long" },
    { "name":"boolean1", "type":"boolean" },
    { "name":"float1", "type":"float" },
    { "name":"double1", "type":"double" },
    { "name":"list1", "type":{"type":"array", "items":"string"} },
    { "name":"map1", "type":{"type":"map", "values":"int"} },
    { "name":"struct1", "type":{"type":"record", "name":"struct1_name", "fields": [
          { "name":"sInt", "type":"int" }, { "name":"sBoolean", "type":"boolean" }, { "name":"sString", "type":"string" } ] } },
    { "name":"union1", "type":["float", "boolean", "string"] },
    { "name":"enum1", "type":{"type":"enum", "name":"enum1_values", "symbols":["BLUE","RED", "GREEN"]} },
    { "name":"nullableint", "type":["int", "null"] },
    { "name":"bytes1", "type":"bytes" },
    { "name":"fixed1", "type":{"type":"fixed", "name":"threebytes", "size":3} }
  ] }');

3.查看Hive表的schema

describe kst;

 

使用show create table查看表结构

CREATE TABLE `default.kst`(
	  `string1` string COMMENT '', 
	  `int1` int COMMENT '', 
	  `tinyint1` int COMMENT '', 
	  `smallint1` int COMMENT '', 
	  `bigint1` bigint COMMENT '', 
	  `boolean1` boolean COMMENT '', 
	  `float1` float COMMENT '', 
	  `double1` double COMMENT '', 
	  `list1` array<string> COMMENT '', 
	  `map1` map<string,int> COMMENT '', 
	  `struct1` struct<sint:int,sboolean:boolean,sstring:string> COMMENT '', 
	  `union1` uniontype<float,boolean,string> COMMENT '', 
	  `enum1` string COMMENT '', 
	  `nullableint` int COMMENT '', 
	  `bytes1` binary COMMENT '', 
	  `fixed1` binary COMMENT '')
	PARTITIONED BY ( 
	  `ds` string)
	ROW FORMAT SERDE 
	  'org.apache.hadoop.hive.serde2.avro.AvroSerDe' 
	STORED AS INPUTFORMAT 
	  'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat' 
	OUTPUTFORMAT 
	  'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
	LOCATION
	  'hdfs://master:8020/user/hive/warehouse/kst'
	TBLPROPERTIES (
	  'avro.schema.url'='hdfs:///user/hive/schema/kst.avsc', 

4.插入数据

insert into table default.kst partition(ds="2019-08-20") select "test",2,1Y,2S,1000L,true,1.111,2.22222,array("test1","test2","test3"),map("test123",2222,"test321",4444),
named_struct('sInt',123,'sBoolean',true, 'sString','London') as struct1,create_union(0,cast(0.2 as float),false,"test3"),"BLUE",12345,"00008DAC",'111';

5.查看刚刚insert的数据

 

posted @ 2016-03-04 14:15  tonglin0325  阅读(236)  评论(0编辑  收藏  举报