压缩和SequenceFile存储
一、压缩
文件的压缩和解压在mapper的输出和reducer的输入阶段,既可以减少存储空间,又能降低网络传输时间。

·这些压缩算法中只有bzip2和lzo(建立索引后)支持切分(splitting)。压缩后的大文件若不能切分,则只能作为一个单独的Map输入。
import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.net.URI; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IOUtils; import org.apache.hadoop.io.compress.CompressionCodec; import org.apache.hadoop.io.compress.CompressionCodecFactory; import org.apache.hadoop.io.compress.CompressionInputStream; import org.apache.hadoop.io.compress.CompressionOutputStream; import org.apache.hadoop.util.ReflectionUtils;
public class Compress { public static void main(String[] args) throws Exception, IOException { String uri="hdfs://node:9000/result/Compress/LICENSE.txt"; // compress(uri,"org.apache.hadoop.io.compress.BZip2Codec"); // decompres(uri+".bz2"); System.out.print("OK!"); } public static void compress(String uri, String method) throws ClassNotFoundException, IOException { Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(URI.create(uri), conf); FSDataInputStream in = fs.open(new Path(uri)); //InputStream in = new FileInputStream(new File(filername)); Class codecClass = Class.forName(method); // 获取压缩的方式的类 CompressionCodec codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf); // 4 该压缩方法对应的文件扩展名//OutputStream out = new FileOutputStream(new File(filename + codec.getDefaultExtension())); FSDataOutputStream out = fs.create(new Path(uri+codec.getDefaultExtension() )); CompressionOutputStream cout = codec.createOutputStream(out); // 流对接 IOUtils.copyBytes(in, cout, 1024 * 1024 * 5, false); // 缓冲区设为5MB in.close(); cout.close(); out.close(); } public static void decompres(String uri) throws FileNotFoundException, IOException { Configuration conf = new Configuration(); FileSystem fs=FileSystem.get(URI.create(uri),conf); CompressionCodecFactory factory = new CompressionCodecFactory(conf); CompressionCodec codec = factory.getCodec(new Path(uri)); if (null == codec) { // 判断该文件的压缩方法是否存在 System.out.println("Cannot find codec for file :" + uri); return; } // 流对接 InputStream cin = codec.createInputStream(new FileInputStream(filename)); CompressionInputStream cin = codec.createInputStream(fs.open(new Path(uri))); // OutputStream out = new FileOutputStream(new File(uri+ ".decoded")); FSDataOutputStream out = fs.create(new Path(uri+".decoded" )); IOUtils.copyBytes(cin, out, 1024 * 1024 * 5, false); cin.close(); out.close(); } }
Hadoop由Java开发,但某些需求与操作不适合使用Java,使用Hadoop的本地库(Native Libraries)性能更好。本地库位于lib/native
gzip有linux命令,可直接使用 。适用于每个文件压缩后小于128MB,但不支持split;
bzip2有linux命令,可直接使用。适用于很想压缩单个很大的文件,又不在乎速率,支持split;
lzo有linux命令,但hadoop本身不支持 需要去安装。适用于每个文件压缩后在200MB以上。
// Snippet: configuring MapReduce job input format and output compression.
// (Driver-code fragment, not a compilable unit on its own.)
// NOTE(review): the first statement is missing a ';' in the original notes.
job.setInputFormatClass(LzoTextInputFormat.class) // https://blog.csdn.net/wisgood/article/details/17080361
// Enable compression of job output via the old mapred.* property names.
job.getConfiguration().setBoolean("mapred.output.compress", true);// https://blog.csdn.net/haizhaopeng/article/details/47120043
job.getConfiguration().setClass("mapred.output.compression.codec", Bzip2Codec.class, CompressionCodec.class);
// Equivalent FileOutputFormat helper methods:
FileOutputFormat.setCompressOutput(job, true); //https://blog.csdn.net/qq262593421/article/details/101685223
FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
二、基于文件的数据结构 SequenceFile存储
HDFS和MapReduce这两个子框架针对大数据文件,小文件就效率低耗资源了。解决:把整个小文件包装成一个记录。
SequenceFile的写操作有3种压缩类型:Writer(无压缩)、RecordCompressWriter(记录级压缩,只压缩值)和
Sync记录同步点
BlockCompressWriter(块级压缩,键值分别压缩)。 示例读写文件:
import java.io.IOException; import java.net.URI; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IOUtils; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.SequenceFile.CompressionType; import org.apache.hadoop.io.Text;
public class SequenceFileWriteFile { private static String[] myValue = { "hello world", "bye world", "hello hadoop", "bye hadoop" }; @SuppressWarnings("deprecation") public static void main(String[] args) throws IOException, ClassNotFoundException { uri="hdfs://localhost:8010/usr/hadoop/mapfile" String uri = "hdfs://node:9000/result/Sequence/Block.txt"; Configuration conf = new Configuration(); //在window中运行,关闭读取本地gzip压缩库 conf.setBoolean("io.native.lib.available",false); FileSystem fs = FileSystem.get(URI.create(uri),conf); Path path = new Path(uri); IntWritable key = new IntWritable(); Text value = new Text(); SequenceFile.Writer writer = null; try { // writer = SequenceFile.createWriter(fs, conf, path, key.getClass(), value.getClass(),CompressionType.NONE);//128 // writer = SequenceFile.createWriter(fs, conf, path, key.getClass(), value.getClass(),CompressionType.RECORD);//128 writer = SequenceFile.createWriter(fs, conf, path, key.getClass(), value.getClass(),CompressionType.BLOCK);//128 for (int i = 0; i < 1000; i++) { key.set(1000 - i); value.set(myValue[i % myValue.length]); System.out.printf("[%s]\t%s\t%s\n", writer.getLength(),key,value); writer.append(key, value); } } finally { IOUtils.closeStream(writer); } } }
import java.io.IOException; import java.net.URI; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IOUtils; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Writable; import org.apache.hadoop.util.ReflectionUtils; public class SequenceFileReadFile { @SuppressWarnings("deprecation") //wo gai de public static void main(String[] args) throws IOException { String uri = "hdfs://node:9000/result/Sequence/Block.txt"; Configuration conf = new Configuration(); FileSystem fs = FileSystem.getLocal(conf); Path path = new Path(uri); SequenceFile.Reader reader = null; try { reader = new SequenceFile.Reader(fs, path, conf); //得到 SequenceFile //reader.getKeyClass(),getValueClass()得到key和value的类型,并通过ReflectionUtils实例化 Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf); Writable value = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf); long position = reader.getPosition(); while (reader.next(key, value)) { String syncSeen = reader.syncSeen() ? "*" : ""; System.out.printf("[%s%s]\t%s\t%s\n", position, syncSeen, key, value); position = reader.getPosition(); // beginning of next record } } finally { IOUtils.closeStream(reader); } } }
2020-06-10 12:45:42

浙公网安备 33010602011771号