Compression and SequenceFile Storage

1. Compression

  Compressing files at the mapper's output and decompressing them at the reducer's input cuts both disk storage and network transfer time.
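For the intermediate data this is usually switched on in the job configuration. A minimal sketch, assuming a Hadoop 2.x Job named job and Snappy as the codec (both are illustrative choices, not from the original post):

import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.SnappyCodec;

// Compress the intermediate map output; any installed codec can replace Snappy.
job.getConfiguration().setBoolean("mapreduce.map.output.compress", true);
job.getConfiguration().setClass("mapreduce.map.output.compress.codec",
        SnappyCodec.class, CompressionCodec.class);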

 

 · Of these algorithms, only bzip2 and LZO support splitting. A large compressed file that cannot be split can only be fed to a single map task, losing parallelism.
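A minimal sketch of how an input format typically decides this, assuming the file path arrives in args[0] (the class name SplitCheck is illustrative): the codec is looked up by file extension, and splittability is tested through the SplittableCompressionCodec interface, which BZip2Codec implements but GzipCodec does not.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.SplittableCompressionCodec;

public class SplitCheck {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        CompressionCodecFactory factory = new CompressionCodecFactory(conf);
        // The codec is inferred from the file extension, e.g. .bz2 -> BZip2Codec.
        CompressionCodec codec = factory.getCodec(new Path(args[0]));
        if (codec == null) {
            System.out.println("Not compressed: splittable as plain text.");
        } else {
            System.out.println(codec.getClass().getSimpleName() + " splittable: "
                    + (codec instanceof SplittableCompressionCodec));
        }
    }
}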


import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.CompressionInputStream;
import org.apache.hadoop.io.compress.CompressionOutputStream;
import org.apache.hadoop.util.ReflectionUtils;
// Adapted from https://www.cnblogs.com/frankdeng/p/9255935.html
public class Compress {
    
    public static void main(String[] args) throws Exception {
        String uri = "hdfs://node:9000/result/Compress/LICENSE.txt";
        compress(uri, "org.apache.hadoop.io.compress.BZip2Codec");
        decompress(uri + ".bz2");
        System.out.print("OK!");
    }
    public static void compress(String uri, String method) throws ClassNotFoundException, IOException {

        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(uri), conf);
        FSDataInputStream in = fs.open(new Path(uri));   // for a local file: new FileInputStream(new File(filename))

        Class<?> codecClass = Class.forName(method);     // load the codec class by name
        CompressionCodec codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);

        // Append the codec's default extension (e.g. ".bz2") to the output path.
        FSDataOutputStream out = fs.create(new Path(uri + codec.getDefaultExtension()));
        CompressionOutputStream cout = codec.createOutputStream(out);   // wrap the raw output stream
        IOUtils.copyBytes(in, cout, 1024 * 1024 * 5, false);            // 5 MB copy buffer

        in.close();
        cout.close();
        out.close();
    }
    public static void decompress(String uri) throws FileNotFoundException, IOException {

        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(uri), conf);
        CompressionCodecFactory factory = new CompressionCodecFactory(conf);
        CompressionCodec codec = factory.getCodec(new Path(uri));   // infer the codec from the file extension
        if (null == codec) {                                        // no registered codec matches this file
            System.out.println("Cannot find codec for file: " + uri);
            return;
        }

        // Wrap the input stream; for a local file: codec.createInputStream(new FileInputStream(filename))
        CompressionInputStream cin = codec.createInputStream(fs.open(new Path(uri)));
        FSDataOutputStream out = fs.create(new Path(uri + ".decoded"));
        IOUtils.copyBytes(cin, out, 1024 * 1024 * 5, false);

        cin.close();
        out.close();
    }
}

 Hadoop is developed in Java, but some operations do not suit Java well; Hadoop's native libraries (Native Libraries) perform better for these. The native libraries live under lib/native.
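Whether the native code actually loaded can be checked with the hadoop checknative command, or programmatically; a minimal sketch (the class name NativeCheck is illustrative):

import org.apache.hadoop.util.NativeCodeLoader;

public class NativeCheck {
    public static void main(String[] args) {
        // Hadoop falls back to the pure-Java codec implementations when this prints false.
        System.out.println("native hadoop loaded: " + NativeCodeLoader.isNativeCodeLoaded());
    }
}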

  gzip has a standard Linux command and can be used directly. Suited to files that each compress to under 128 MB (roughly one HDFS block); it does not support splitting.

  bzip2 also has a standard Linux command. Suited to squeezing a single very large file as much as possible when speed does not matter; it supports splitting.

  LZO has a Linux command, but Hadoop itself does not ship it, so it must be installed separately. Suited to files that each compress to over 200 MB; an index must be built for LZO files to be splittable, which is what the input-format line below relies on.

// Use the hadoop-lzo input format so indexed LZO files can be split (see https://blog.csdn.net/wisgood/article/details/17080361)
job.setInputFormatClass(LzoTextInputFormat.class);
// Compress the job output via configuration keys (see https://blog.csdn.net/haizhaopeng/article/details/47120043)
job.getConfiguration().setBoolean("mapred.output.compress", true);
job.getConfiguration().setClass("mapred.output.compression.codec", BZip2Codec.class, CompressionCodec.class);
// Or the equivalent FileOutputFormat helpers (see https://blog.csdn.net/qq262593421/article/details/101685223)
FileOutputFormat.setCompressOutput(job, true);
FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
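Note that mapred.output.compress and mapred.output.compression.codec are the old property names; Hadoop 2.x renames them to mapreduce.output.fileoutputformat.compress and mapreduce.output.fileoutputformat.compress.codec, so the FileOutputFormat helpers above are the more portable way to express the same thing.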

 2. File-based data structures: SequenceFile storage

 The HDFS and MapReduce subframeworks are designed for large files; masses of small files waste resources and run inefficiently. One fix is to wrap each small file into a single record of a container file, as sketched below.
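A minimal packer sketch along those lines, assuming a local directory of small files in args[0] and a target SequenceFile URI in args[1] (the class name and argument layout are illustrative): each file name becomes the key and its raw bytes the value.

import java.io.File;
import java.net.URI;
import java.nio.file.Files;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class SmallFilePacker {
    @SuppressWarnings("deprecation")
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(args[1]), conf);
        SequenceFile.Writer writer = null;
        try {
            writer = SequenceFile.createWriter(fs, conf, new Path(args[1]),
                    Text.class, BytesWritable.class);
            for (File f : new File(args[0]).listFiles()) {
                byte[] bytes = Files.readAllBytes(f.toPath());   // one small file -> one record
                writer.append(new Text(f.getName()), new BytesWritable(bytes));
            }
        } finally {
            IOUtils.closeStream(writer);
        }
    }
}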

A SequenceFile writer comes in three compression flavors: Writer (no compression), RecordCompressWriter (record-level compression, values only), and BlockCompressWriter (block-level compression, keys and values compressed separately). The writer also embeds sync markers periodically so readers can resynchronize on record boundaries. Example programs to write and then read such a file:


import java.io.IOException;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;
public class SequenceFileWriteFile {
    private static String[] myValue = { 
        "hello world", 
        "bye world", 
        "hello hadoop", 
        "bye hadoop" 
    };
    @SuppressWarnings("deprecation")
    public static void main(String[] args) throws IOException {
        String uri = "hdfs://node:9000/result/Sequence/Block.txt";
        Configuration conf = new Configuration();
        // When running on Windows, disable the native gzip library and use the pure-Java one.
        conf.setBoolean("io.native.lib.available", false);
        FileSystem fs = FileSystem.get(URI.create(uri), conf);
        Path path = new Path(uri);
        
        IntWritable key = new IntWritable();
        Text value = new Text();
        SequenceFile.Writer writer = null;
        try {
            
            // Pick one of the three compression types:
//            writer = SequenceFile.createWriter(fs, conf, path, key.getClass(), value.getClass(), CompressionType.NONE);
//            writer = SequenceFile.createWriter(fs, conf, path, key.getClass(), value.getClass(), CompressionType.RECORD);
            writer = SequenceFile.createWriter(fs, conf, path, key.getClass(), value.getClass(), CompressionType.BLOCK);
            for (int i = 0; i < 1000; i++) {
                key.set(1000 - i);
                value.set(myValue[i % myValue.length]);                
                System.out.printf("[%s]\t%s\t%s\n", writer.getLength(),key,value);
                writer.append(key, value);
            }
        } finally {
            IOUtils.closeStream(writer);
        }
    }
}
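The matching reader walks the file record by record, printing each record's starting offset and flagging the sync points with *: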
import java.io.IOException;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.ReflectionUtils;

public class SequenceFileReadFile {
    @SuppressWarnings("deprecation")
    public static void main(String[] args) throws IOException {
        String uri = "hdfs://node:9000/result/Sequence/Block.txt";
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(uri), conf);   // the URI is on HDFS, so do not use getLocal()
        Path path = new Path(uri);
        SequenceFile.Reader reader = null;
        try {
            reader = new SequenceFile.Reader(fs, path, conf);   // open the SequenceFile
            // getKeyClass()/getValueClass() return the stored types; instantiate them via ReflectionUtils.
            Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
            Writable value = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
            long position = reader.getPosition();
            while (reader.next(key, value)) {
                String syncSeen = reader.syncSeen() ? "*" : "";
                
                System.out.printf("[%s%s]\t%s\t%s\n", position, syncSeen, key, value);
                position = reader.getPosition(); // beginning of next record
            }
        } finally {
            IOUtils.closeStream(reader);
        }
    }
}
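The * flags printed above are the sync markers mentioned earlier, and they are what makes a SequenceFile splittable. As a fragment reusing reader, key and value from the program above (the 2048-byte offset is an arbitrary example), a reader can jump into the middle of the file and resynchronize:

// sync() advances to the first sync point after the given offset,
// so the following next() starts cleanly on a record boundary.
reader.sync(2048);
if (reader.next(key, value)) {
    System.out.printf("[%s]\t%s\t%s\n", reader.getPosition(), key, value);
}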

 

