parquet-tools使用
使用parquet-tools的方法有2种
1.在安装了CDH的机器上,会自动有parquet-tools命令
lintong@master:/opt/cloudera/parcels/CDH/bin$ ls | grep parquet-tools
parquet-tools
lintong@master:/opt/cloudera/parcels/CDH/bin$ parquet-tools
2.自行编译jar
git clone时需要指定分支(或tag),因为master分支已经删除了parquet-tools模块
git clone git@github.com:apache/parquet-mr.git -b apache-parquet-1.10.1
编译
cd parquet-mr/parquet-tools && mvn clean package -Plocal
parquet-tools可以使用的命令,参考:How to build and use parquet-tools to read parquet files
1.查看parquet文件的schema
由AvroParquet写的parquet文件的schema
lintong@lintongdeMacBook-Pro ~/coding/java/parquet-mr/parquet-tools/target $ java -jar parquet-tools-1.10.1.jar schema /xxx/avro_parquet/part-r-00000.snappy.parquet
message com.linkedin.haivvreo.test_serializer {
required binary string1 (UTF8);
required int32 int1;
required int32 tinyint1;
required int32 smallint1;
required int64 bigint1;
required boolean boolean1;
required float float1;
required double double1;
required group list1 (LIST) {
repeated binary array (UTF8);
}
required group map1 (MAP) {
repeated group map (MAP_KEY_VALUE) {
required binary key (UTF8);
required int32 value;
}
}
required group struct1 {
required int32 sInt;
required boolean sBoolean;
required binary sString (UTF8);
}
required binary enum1 (ENUM);
optional int32 nullableint;
}
由ThriftParquet写的parquet文件的schema
lintong@lintongdeMacBook-Pro ~/coding/java/parquet-mr/parquet-tools/target $ java -jar parquet-tools-1.10.1.jar schema /xxx/thrift_parquet/part-r-00000.snappy.parquet
message ParquetSchema {
required binary string1 (UTF8) = 1;
required int32 int1 = 2;
required int32 tinyint1 = 3;
required int32 smallint1 = 4;
required int64 bigint1 = 5;
required boolean boolean1 = 6;
required double float1 = 7;
required double double1 = 8;
required group list1 (LIST) = 9 {
repeated binary list1_tuple (UTF8);
}
required group map1 (MAP) = 10 {
repeated group map (MAP_KEY_VALUE) {
required binary key (UTF8);
optional int32 value;
}
}
required group struct1 = 11 {
required int32 sInt = 1;
required boolean sBoolean = 2;
required binary sString (UTF8) = 3;
}
required binary enum1 (UTF8) = 12;
optional int32 nullableint = 13;
}
由hive job写的parquet文件的schema
message hive_schema {
optional binary appid (UTF8);
optional int64 ts;
optional int32 userid;
optional binary countries (UTF8);
}
2.查看parquet文件的head
java -jar parquet-tools-1.10.1.jar head -n 1 /xxx/thrift_parquet/part-r-00000.snappy.parquet
string1 = ecAsz6ca7E
int1 = 64676
tinyint1 = 8
smallint1 = 0
bigint1 = -9081354042296389692
boolean1 = true
float1 = 0.1271180510520935
double1 = 0.011293589263621895
list1:
.list1_tuple = v8gCJFRBIb
.list1_tuple = nfvrI1Rltp
map1:
.map:
..key = v8gCJFRBIb
..value = 428
.map:
..key = nfvrI1Rltp
..value = 1257
struct1:
.sInt = 740564
.sBoolean = true
.sString = RuiVISF2BI
enum1 = BLUE
nullableint = 5559
本文只发表于博客园和tonglin0325的博客,作者:tonglin0325,转载请注明原文链接:https://www.cnblogs.com/tonglin0325/p/4715351.html

浙公网安备 33010602011771号