如何读取Hadoop中压缩的文件

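最近在处理离线数据导入 HBase 的问题,涉及从 HDFS 中读取 gz 压缩文件,把思路记录下来,以备后用。思路是:用 CompressionCodecFactory 根据文件名后缀(如 .gz)查找对应的 CompressionCodec;找到则用它包装 HDFS 输入流进行解压读取,找不到(返回 null)则按未压缩文件直接读取。具体代码如下: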

package org.dba.util;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.CompressionInputStream;

/**
 * Command-line helper that previews the first line of a file stored on a Hadoop
 * file system, transparently decompressing it when the file name carries a
 * compression suffix known to {@link CompressionCodecFactory} (for example {@code .gz}).
 */
public class ReadHdfs {
    /** Maximum number of characters of the first line that are printed. */
    private static final int PREVIEW_LENGTH = 100;

    /**
     * Prints at most the first {@value #PREVIEW_LENGTH} characters of the first line
     * of the given file to standard output.
     *
     * <p>The codec is looked up from the file name; when none matches, the file is
     * read as-is. Nothing is printed for an empty file. The content is decoded as
     * UTF-8 so the output does not depend on the JVM's default charset.
     *
     * @param fileName path of the file to read; may be a fully qualified URI
     *                 (e.g. {@code hdfs://host:port/dir/file.gz}) or a path on the
     *                 default file system
     * @throws IOException if the file cannot be opened, decompressed or read
     */
    public static void ReadFile(String fileName) throws IOException{
        Configuration conf = new Configuration();
        Path file = new Path(fileName);
        // Resolve the file system from the path itself so that fully qualified URIs
        // pointing at a non-default file system do not fail with "Wrong FS".
        // The returned instance is cached and shared by Hadoop, so it is not closed here.
        FileSystem fs = file.getFileSystem(conf);
        CompressionCodecFactory factory = new CompressionCodecFactory(conf);
        // getCodec returns null when the file name suffix matches no registered codec.
        CompressionCodec codec = factory.getCodec(file);

        // The raw stream is declared as its own resource so it is still closed if
        // creating the decompression stream fails; the reader closes the wrapper chain.
        try (FSDataInputStream hdfsInstream = fs.open(file);
             BufferedReader reader = new BufferedReader(new InputStreamReader(
                     codec == null ? hdfsInstream : codec.createInputStream(hdfsInstream),
                     java.nio.charset.StandardCharsets.UTF_8))) {
            String firstLine = reader.readLine();
            if (firstLine == null) {
                // Empty file: there is nothing to preview.
                return;
            }
            // Clamp the end index so lines shorter than the preview length do not throw.
            int end = Math.min(PREVIEW_LENGTH, firstLine.length());
            System.out.println(firstLine.substring(0, end));
        }
    }

    /**
     * Entry point: previews the file whose path is given as the first argument.
     *
     * @param args command-line arguments; {@code args[0]} is the file path
     * @throws IOException if the file cannot be read
     */
    public static void main(String[] args) throws IOException{
        if (args.length < 1) {
            System.err.println("Usage: ReadHdfs <file>");
            System.exit(1);
        }
        ReadFile(args[0]);
    }

}

 

posted @ 2017-03-25 11:27 ballwql 阅读(...) 评论(...) 编辑 收藏