package com.filemerge;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.*;
import org.apache.orc.Reader;
import org.apache.orc.Writer;
import org.joda.time.LocalDate;
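/**
 * Merges the small ORC files under a Hive table/partition directory into a single
 * Snappy-compressed ORC file, deletes the originals, and renames the hidden
 * ".working" output into place.
 *
 * Usage: Merge_File &lt;table/partition directory&gt; &lt;schema template&gt;
 */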
public class Merge_File {
    private static final String UTF_8 = "UTF-8";
    private static Writer writer;
    public static void main(String[] args) throws IOException {
        // Example arguments: hdfs://121.89.199.2:8020/user/hive/warehouse/test_version_02 journald.schema
        System.out.println("----------------BEGIN---------------------");
        /* Values used for local debugging:
           String mergeFilePath = "hdfs://121.89.199.2:8020/user/hive/warehouse/test_version_02"; // table/partition directory
           String struct = "C:\\Users\\fubo\\Desktop\\journald.schema"; // schema template */
        String mergeFilePath = args[0]; // table/partition directory
        String struct = args[1];        // schema template
        Configuration conf = new Configuration();
        conf.set("fs.hdfs.impl", "org.apache.hadoop.hdfs.DistributedFileSystem");
        System.setProperty("HADOOP_USER_NAME", "hive");
        // Datestamp for the merged file's name, zero-padded so names stay unambiguous (e.g. "20231107").
        String datestamp = LocalDate.now().toString("yyyyMMdd");
        Path path = new Path(mergeFilePath);
        FileSystem fileSystem = path.getFileSystem(conf);
        // Describe the schema of the merged output file.
        TypeDescription schema = TypeDescription.fromString(getSchema(struct).replaceAll("\n", ""));
        System.out.println("--------------------schema---------------------------------" + schema);
        // Write to a hidden ".working" file first; it is renamed once the merge succeeds.
        String outFile = "." + datestamp + ".orc.working";
        Path outFilePath = new Path(mergeFilePath + "/" + outFile);
        System.out.println("----------------merge output path---------------------" + outFilePath);
        // Collect the paths of the ORC files to merge.
        List<String> orcFiles = getAllFilePath(new Path(mergeFilePath), fileSystem);
        System.out.println("----------------ORC files to merge---------------------" + orcFiles);
        if (fileSystem.exists(outFilePath)) {
            System.out.println(outFilePath + " already exists; delete it before rerunning the merge.");
            // Return here: the original fell through with a null writer and crashed below.
            return;
        }
        writer = OrcFile.createWriter(outFilePath, OrcFile.writerOptions(conf)
                .setSchema(schema).compress(CompressionKind.SNAPPY).version(OrcFile.Version.V_0_12));
System.out.println("----------------开始合并文件---------------------");
for (int j = 0; j < orcFiles.size(); j++) {
Reader reader = OrcFile.createReader(new Path(orcFiles.get(j)), OrcFile.readerOptions(conf));
VectorizedRowBatch batch = reader.getSchema().createRowBatch();
RecordReader rows = reader.rows();
while (rows.nextBatch(batch)) {
if (batch != null) {
writer.addRowBatch(batch);
}
}
rows.close();
fileSystem.delete(new Path(orcFiles.get(j)), false);
}
System.out.println("---------------文件合并结束----------------------");
writer.close();
        // Merge complete: strip the leading "." and the ".working" suffix to publish the file.
        outFile = outFilePath.getName();
        if (outFile.startsWith(".")) {
            outFile = outFile.substring(1);
            int lastIndexOf = outFile.lastIndexOf(".working");
            outFile = outFile.substring(0, lastIndexOf);
        }
        Path parent = outFilePath.getParent();
        fileSystem.rename(outFilePath, new Path(parent, outFile));
        System.out.println("----------------END---------------------");
    }
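    /**
     * Reads the ORC schema template as a UTF-8 string, trying the classpath first
     * and falling back to the local filesystem.
     */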
    public static String getSchema(String resource) throws IOException {
        InputStream input = Merge_File.class.getResourceAsStream("/" + resource);
        if (input == null) {
            // Not on the classpath; fall back to a plain filesystem path.
            input = new FileInputStream(resource);
        }
        try (InputStream in = input) {
            return IOUtils.toString(in, UTF_8);
        }
    }
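    /**
     * Recursively lists the data files under {@code filePath}, skipping hidden and
     * marker files that would break the ORC reader.
     */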
    private static List<String> getAllFilePath(Path filePath, FileSystem fs) throws IOException {
        List<String> fileList = new ArrayList<String>();
        FileStatus[] fileStatus = fs.listStatus(filePath);
        for (FileStatus fileStat : fileStatus) {
            String name = fileStat.getPath().getName();
            if (name.startsWith(".") || name.startsWith("_")) {
                // Skip hidden and marker files (e.g. a stale ".*.working" file or "_SUCCESS");
                // they are not readable ORC inputs.
                continue;
            }
            if (fileStat.isDirectory()) {
                fileList.addAll(getAllFilePath(fileStat.getPath(), fs));
            } else {
                fileList.add(fileStat.getPath().toString());
            }
        }
        return fileList;
    }
}
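// A hypothetical invocation (jar name assumed; cluster address and schema file
// taken from the example comment in main):
//   hadoop jar file-merge.jar com.filemerge.Merge_File \
//       hdfs://121.89.199.2:8020/user/hive/warehouse/test_version_02 journald.schema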