Java 合并 HDFS ORC 文件

package com.filemerge;

import java.io.*;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.*;
import org.apache.orc.Reader;
import org.apache.orc.Writer;
import org.joda.time.LocalDate;

/**
 * Command-line tool that merges all ORC files under a table/partition directory
 * into a single Snappy-compressed ORC file in that same directory.
 *
 * <p>Usage: {@code Merge_File <table-or-partition-dir> <schema-resource>}, for example
 * {@code Merge_File hdfs://namenode:8020/user/hive/warehouse/some_table journald.schema}.
 *
 * <p>The merged data is first written to a hidden working file
 * ({@code .<year><month><day>.orc.working}). Only after that file has been closed
 * successfully are the source files deleted and the working file renamed to
 * {@code <year><month><day>.orc}, so a failure part-way through never removes input data.
 */
public class Merge_File {
    private static final String UTF_8 = "UTF-8";
    private static final String WORKING_SUFFIX = ".working";

    /**
     * Entry point.
     *
     * @param args {@code args[0]} is the table/partition directory to merge;
     *             {@code args[1]} is the name of a classpath resource holding the ORC
     *             schema string used for the merged file
     * @throws IllegalArgumentException if fewer than two arguments are given
     * @throws IOException if reading, writing, deleting or renaming files fails
     */
    public static void main(String[] args) throws IOException {
        if (args.length < 2) {
            throw new IllegalArgumentException(
                    "Usage: Merge_File <table-or-partition-dir> <schema-resource>");
        }
        System.out.println("----------------BEGIN---------------------");
        String mergeFilePath = args[0]; // table / partition directory
        String struct = args[1];        // schema template resource

        Configuration conf = new Configuration();
        conf.set("fs.hdfs.impl", "org.apache.hadoop.hdfs.DistributedFileSystem");
        System.setProperty("HADOOP_USER_NAME", "hive");

        // Read the clock once so year/month/day cannot disagree around midnight.
        LocalDate today = LocalDate.now();

        Path path = new Path(mergeFilePath);
        FileSystem fileSystem = path.getFileSystem(conf);

        // Describe the output file. Both CR and LF are stripped so a schema file saved
        // with Windows line endings still parses.
        TypeDescription schema = TypeDescription.fromString(getSchema(struct).replaceAll("[\\r\\n]", ""));
        System.out.println("--------------------schema---------------------------------" + schema);
        // NOTE(review): month/day are not zero-padded, so the name is ambiguous
        // (e.g. 2022-01-11 and 2022-11-01 both give "2022111"); kept for compatibility.
        String outFile = "." + today.getYear() + today.getMonthOfYear() + today.getDayOfMonth()
                + ".orc" + WORKING_SUFFIX;
        Path outFilePath = new Path(mergeFilePath + "/" + outFile);
        System.out.println("----------------定义合并输出文件路径---------------------" + outFilePath);

        // Collect the files to merge before the working file is created.
        List<String> orcFiles = getAllFilePath(path, fileSystem);
        System.out.println("----------------获取需要合并的ORC文件---------------------" + orcFiles);

        if (fileSystem.exists(outFilePath)) {
            // A leftover working file means an earlier run did not finish; stop here
            // instead of continuing without a writer.
            System.out.println(outFilePath + " 文件已经存在, 请删除后再执行合并.");
            return;
        }

        System.out.println("----------------开始合并文件---------------------");
        boolean merged = false;
        try {
            Writer writer = OrcFile.createWriter(outFilePath, OrcFile.writerOptions(conf)
                    .setSchema(schema).compress(CompressionKind.SNAPPY).version(OrcFile.Version.V_0_12));
            try {
                for (String orcFile : orcFiles) {
                    appendOrcFile(new Path(orcFile), conf, writer);
                }
            } finally {
                writer.close();
            }
            merged = true;
        } finally {
            if (!merged) {
                // The sources are still intact at this point, so the partial working
                // file can be dropped to let the next run start cleanly.
                fileSystem.delete(outFilePath, false);
            }
        }
        System.out.println("---------------文件合并结束----------------------");

        // Remove the sources only now that the merged file is completely written.
        // This must happen before the rename: a source may carry the final name
        // (output of an earlier merge on the same day).
        for (String orcFile : orcFiles) {
            if (!fileSystem.delete(new Path(orcFile), false)) {
                System.err.println("Could not delete merged source file: " + orcFile);
            }
        }

        // Merge finished: strip the leading "." and the ".working" suffix.
        String finalName = outFile.substring(1, outFile.length() - WORKING_SUFFIX.length());
        Path finalPath = new Path(outFilePath.getParent(), finalName);
        if (!fileSystem.rename(outFilePath, finalPath)) {
            throw new IOException("Failed to rename " + outFilePath + " to " + finalPath);
        }
        System.out.println("----------------END---------------------");
    }

    /**
     * Copies every row batch of one ORC file into the given writer.
     *
     * @param source ORC file to read
     * @param conf   Hadoop configuration used to open the file
     * @param writer destination writer; it is not closed by this method
     * @throws IOException if the file cannot be read or a batch cannot be written
     */
    private static void appendOrcFile(Path source, Configuration conf, Writer writer) throws IOException {
        Reader reader = OrcFile.createReader(source, OrcFile.readerOptions(conf));
        VectorizedRowBatch batch = reader.getSchema().createRowBatch();
        RecordReader rows = reader.rows();
        try {
            while (rows.nextBatch(batch)) {
                writer.addRowBatch(batch);
            }
        } finally {
            rows.close();
        }
    }

    /**
     * Loads the schema text from a classpath resource.
     *
     * @param resource resource name relative to the classpath root (without leading "/")
     * @return the resource content decoded as UTF-8
     * @throws FileNotFoundException if the resource is not on the classpath
     * @throws IOException if the resource cannot be read
     */
    public static String getSchema(String resource) throws IOException {
        try (InputStream input = Merge_File.class.getResourceAsStream("/" + resource)) {
            if (input == null) {
                throw new FileNotFoundException("Schema resource not found on classpath: /" + resource);
            }
            return IOUtils.toString(input, UTF_8);
        }
    }

    /**
     * Recursively lists the files under a directory.
     *
     * <p>Entries whose name starts with "." or "_" are skipped (files and directories
     * alike). The working file written by {@link #main} uses the same leading-dot
     * convention, and marker/staging entries are not ORC data that should be merged
     * and deleted.
     *
     * @param filePath directory to scan
     * @param fs       file system that owns {@code filePath}
     * @return the full path strings of all visible files below {@code filePath}
     * @throws FileNotFoundException if {@code filePath} does not exist
     * @throws IOException if listing fails
     */
    private static List<String> getAllFilePath(Path filePath, FileSystem fs) throws FileNotFoundException, IOException {
        List<String> fileList = new ArrayList<String>();
        for (FileStatus fileStat : fs.listStatus(filePath)) {
            String name = fileStat.getPath().getName();
            if (name.startsWith(".") || name.startsWith("_")) {
                continue;
            }
            if (fileStat.isDirectory()) {
                fileList.addAll(getAllFilePath(fileStat.getPath(), fs));
            } else {
                fileList.add(fileStat.getPath().toString());
            }
        }
        return fileList;
    }
}
posted @ 2022-09-21 18:32  付十一。  阅读(352)  评论(0)    收藏  举报