将parquet schema转换成avro schema
1.引入依赖
<!--parquet-->
<dependency>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-avro</artifactId>
<version>1.10.0</version>
</dependency>
<dependency>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-hadoop</artifactId>
<version>1.10.0</version>
</dependency>
<!--hadoop-->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.7.3</version>
</dependency>
2.从parquet文件的footer读取parquet schema
import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.schema.MessageType;
// Hadoop configuration used to resolve the file system for the path below.
Configuration config = new Configuration();
Path parquetPath = new Path("file:///Users/lintong/Downloads/xxxx.snappy.parquet");
// Declared outside the try block so the schema stays in scope for later steps.
MessageType parquetSchema;
// ParquetFileReader is Closeable; try-with-resources releases the underlying
// file handle as soon as the footer metadata has been read.
try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(parquetPath, config))) {
    parquetSchema = reader.getFooter().getFileMetaData().getSchema();
}
System.out.println(parquetSchema);
输出
message TestSerializer {
optional binary string1 (UTF8);
optional int32 int1;
optional int32 tinyint1;
optional int32 smallint1;
optional int64 bigint1;
optional boolean boolean1;
optional double float1;
optional double double1;
optional group list1 (LIST) {
repeated binary array (UTF8);
}
optional group map1 (LIST) {
repeated group array {
optional binary key (UTF8);
optional int32 value;
}
}
optional group struct1 {
optional int32 sInt;
optional boolean sBoolean;
optional binary sString (UTF8);
}
optional binary enum1 (UTF8);
optional int32 nullableint;
}
3.将parquet schema转换成avro schema
import org.apache.parquet.avro.AvroSchemaConverter;
import org.apache.avro.Schema;

// Build a converter from the Hadoop configuration, then translate the
// Parquet MessageType (parquetSchema) into its Avro Schema and print it.
AvroSchemaConverter schemaConverter = new AvroSchemaConverter(config);
Schema avroSchema = schemaConverter.convert(parquetSchema);
System.out.println(avroSchema);
输出
{
"type":"record",
"name":"TestSerializer",
"fields":[
{
"name":"string1",
"type":[
"null",
"string"
],
"default":null
},
{
"name":"int1",
"type":[
"null",
"int"
],
"default":null
},
{
"name":"tinyint1",
"type":[
"null",
"int"
],
"default":null
},
{
"name":"smallint1",
"type":[
"null",
"int"
],
"default":null
},
{
"name":"bigint1",
"type":[
"null",
"long"
],
"default":null
},
{
"name":"boolean1",
"type":[
"null",
"boolean"
],
"default":null
},
{
"name":"float1",
"type":[
"null",
"double"
],
"default":null
},
{
"name":"double1",
"type":[
"null",
"double"
],
"default":null
},
{
"name":"list1",
"type":[
"null",
{
"type":"array",
"items":"string"
}
],
"default":null
},
{
"name":"map1",
"type":[
"null",
{
"type":"array",
"items":{
"type":"record",
"name":"array",
"fields":[
{
"name":"key",
"type":[
"null",
"string"
],
"default":null
},
{
"name":"value",
"type":[
"null",
"int"
],
"default":null
}
]
}
}
],
"default":null
},
{
"name":"struct1",
"type":[
"null",
{
"type":"record",
"name":"struct1",
"fields":[
{
"name":"sInt",
"type":[
"null",
"int"
],
"default":null
},
{
"name":"sBoolean",
"type":[
"null",
"boolean"
],
"default":null
},
{
"name":"sString",
"type":[
"null",
"string"
],
"default":null
}
]
}
],
"default":null
},
{
"name":"enum1",
"type":[
"null",
"string"
],
"default":null
},
{
"name":"nullableint",
"type":[
"null",
"int"
],
"default":null
}
]
}
参考:https://stackoverflow.com/questions/54159454/how-to-convert-parquet-schema-to-avro-in-java-scala
本文只发表于博客园和tonglin0325的博客,作者:tonglin0325,转载请注明原文链接:https://www.cnblogs.com/tonglin0325/p/5323978.html

浙公网安备 33010602011771号