mapreduce的多种格式文件输出-自定义OutputFormat

/**
 * MapReduce output format that writes several file formats from a single job.
 *
 * <p>The format of each output is selected by a "flag": the first directory
 * segment of the configured base output name, relative to the job output
 * directory. For example, with an output directory of {@code /out} and a base
 * output name of {@code /out/cleandata/part}, the flag is {@code cleandata}.
 * ORC files are written for the flags {@code cleandata}, {@code basetotaldata},
 * {@code calcdata} and {@code infodata}; delimited text files with a
 * {@code .csv} extension are written for {@code cleancsvdata}.
 *
 * @param <K> key type handed to the record writer
 * @param <V> value type handed to the record writer
 */
public class MultipleTypeOutputFormat<K, V> extends FileOutputFormat<K, V> {
    private static final String ORC_EXTENSION = ".orc";
    private static final String CSV_EXTENSION = ".csv";

    /** ORC schema for the cleaning detail, baseline and anomaly outputs ({@code dno} is a bigint). */
    private static final String NUMERIC_DNO_SCHEMA =
            "struct<did:string,dno:bigint,dtm:bigint,kind:int,typ:bigint,val:string>";
    /** ORC schema for the scene-recognition detail output ({@code dno} is a string). */
    private static final String STRING_DNO_SCHEMA =
            "struct<did:string,dno:string,dtm:bigint,kind:int,typ:bigint,val:string>";

    /** Configuration key: when {@code true}, files are placed directly in the job output path. */
    public static final String SKIP_TEMP_DIRECTORY = "orc.mapreduce.output.skip-temporary-directory";

    public MultipleTypeOutputFormat() {
    }

    /**
     * Creates the record writer that matches the format flag derived from the
     * configured base output name and output directory.
     *
     * @param job the information about the current task.
     * @return an ORC or delimited-text record writer, never {@code null}
     * @throws IOException if the output file cannot be created, or if no known
     *                     format flag can be derived from the configuration
     * @throws InterruptedException declared by the overridden method
     */
    @Override
    public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
        Configuration conf = job.getConfiguration();
        String outputName = conf.get(BASE_OUTPUT_NAME);
        String outputDir = conf.get(OUTDIR);
        String flag = resolveFormatFlag(outputName, outputDir);

        switch (flag) {
            case "cleandata":      // cleaning detail result, ORC
            case "basetotaldata":  // cleaning baseline values, ORC
            case "calcdata":       // anomaly records computed by cleaning, ORC
                return createOrcRecordWriter(job, NUMERIC_DNO_SCHEMA);
            case "infodata":       // scene-recognition detail data, ORC
                return createOrcRecordWriter(job, STRING_DNO_SCHEMA);
            case "cleancsvdata":   // cleaning result, delimited text
                return createCsvRecordWriter(job);
            default:
                // Returning null here would only surface later as an
                // uninformative NullPointerException on the first write/close.
                throw new IOException("Cannot determine output format: flag='" + flag
                        + "' derived from " + BASE_OUTPUT_NAME + "='" + outputName
                        + "' and " + OUTDIR + "='" + outputDir + "'");
        }
    }

    /**
     * Derives the format flag: the first path segment of {@code outputName}
     * below {@code outputDir}.
     *
     * @return the flag, or an empty string when either value is missing, the
     *         name is not located under the directory, or the remainder has no
     *         directory segment
     */
    private static String resolveFormatFlag(String outputName, String outputDir) {
        if (outputName == null || outputDir == null) {
            return "";
        }
        String namePath = stripSchemeAndAuthority(outputName);
        String dirPath = stripSchemeAndAuthority(outputDir);
        if (dirPath.isEmpty()) {
            return "";
        }
        // Require a separator after the directory so "/out" does not match "/output2/...".
        String dirPrefix = dirPath.endsWith("/") ? dirPath : dirPath + "/";
        if (!namePath.startsWith(dirPrefix)) {
            return "";
        }
        String relative = namePath.substring(dirPrefix.length());
        int cut = relative.indexOf('/');
        if (cut < 0) {
            // Plain indexOf: File.separator is "\" on Windows, which is not a valid regex for split().
            cut = relative.indexOf(File.separator);
        }
        return cut < 0 ? "" : relative.substring(0, cut);
    }

    /**
     * Removes a leading {@code scheme:} and, when present, the
     * {@code //authority} part (host and optional port) from a location, so
     * that {@code hdfs://nn:8020/out} and {@code file:/out} both become
     * {@code /out}.
     */
    private static String stripSchemeAndAuthority(String location) {
        int colon = location.indexOf(':');
        int slash = location.indexOf('/');
        // Only a ':' that appears before the first '/' terminates a scheme.
        if (colon <= 0 || (slash >= 0 && slash < colon)) {
            return location;
        }
        String rest = location.substring(colon + 1);
        if (rest.startsWith("//")) {
            int pathStart = rest.indexOf('/', 2);
            return pathStart < 0 ? "" : rest.substring(pathStart);
        }
        return rest;
    }

    /**
     * Opens a ZSTD-compressed ORC writer for the task's work file.
     *
     * <p>The writer options are built from the job configuration so that
     * settings supplied with the job are honoured. ZSTD is, per the original
     * author's note, only available in newer ORC versions.
     */
    @SuppressWarnings({"unchecked", "rawtypes"})
    private RecordWriter<K, V> createOrcRecordWriter(TaskAttemptContext job, String schemaString)
            throws IOException {
        Path file = this.getDefaultWorkFile(job, ORC_EXTENSION);
        TypeDescription schema = TypeDescription.fromString(schemaString);
        OrcFile.WriterOptions options = OrcFile.writerOptions(job.getConfiguration())
                .setSchema(schema)
                .compress(CompressionKind.ZSTD);
        Writer writer = OrcFile.createWriter(file, options);
        return new OrcMapreduceRecordWriter(writer);
    }

    /**
     * Opens a line-oriented text writer for the task's work file. The
     * key/value separator is read from {@code TextOutputFormat.SEPERATOR} and
     * defaults to a tab. The file is created without overwrite.
     */
    private RecordWriter<K, V> createCsvRecordWriter(TaskAttemptContext job) throws IOException {
        Path file = this.getDefaultWorkFile(job, CSV_EXTENSION);
        Configuration conf = job.getConfiguration();
        String keyValueSeparator = conf.get(TextOutputFormat.SEPERATOR, "\t");
        FileSystem fs = file.getFileSystem(conf);
        FSDataOutputStream fileOut = fs.create(file, false);
        return new LineRecordWriter<K, V>(fileOut, keyValueSeparator);
    }

    /**
     * Resolves the file a task should write to.
     *
     * <ol>
     *   <li>If {@link #SKIP_TEMP_DIRECTORY} is {@code true}, the file is placed
     *       directly under the job output path.</li>
     *   <li>Otherwise, if the configuration value {@code fileNameprefix} is not
     *       blank, the file is placed under the configuration value
     *       {@code outputPath} with a name built by
     *       {@link #getMUniqueFile(TaskAttemptContext, String, String)}. The
     *       {@code outputPath} value is passed to {@link Path} as-is, so it
     *       must be set whenever {@code fileNameprefix} is set.</li>
     *   <li>Otherwise the committer's work path is used (the default).</li>
     * </ol>
     *
     * @param context   the task context
     * @param extension an extension to add to the filename
     * @return the full path of the file to write
     * @throws IOException if the output committer cannot be obtained
     */
    @Override
    public Path getDefaultWorkFile(TaskAttemptContext context, String extension) throws IOException {
        Configuration conf = context.getConfiguration();
        if (conf.getBoolean(SKIP_TEMP_DIRECTORY, false)) {
            return new Path(getOutputPath(context), getUniqueFile(context, getOutputName(context), extension));
        }

        // Custom naming for map and reduce outputs.
        String fileNamePrefix = conf.get("fileNameprefix");
        if (StringUtils.isNotBlank(fileNamePrefix)) {
            String outputPath = conf.get("outputPath");
            String fileName = getMUniqueFile(context, fileNamePrefix, extension);
            return new Path(outputPath, fileName);
        }

        // Default behaviour.
        FileOutputCommitter committer = (FileOutputCommitter) getOutputCommitter(context);
        return new Path(committer.getWorkPath(), getUniqueFile(context, getOutputName(context), extension));
    }

    /**
     * Builds a file name of the form {@code <name>-<taskId><extension>}.
     *
     * <p>The task id is rendered as a plain decimal number, so the name does
     * not depend on the JVM default locale and never contains grouping
     * separators (for example {@code 1000}, not {@code 1,000}).
     *
     * @param context   the task context supplying the task id
     * @param name      the file name prefix
     * @param extension the extension appended to the name
     * @return the generated file name
     */
    public static String getMUniqueFile(TaskAttemptContext context, String name, String extension) {
        TaskID taskId = context.getTaskAttemptID().getTaskID();
        int partition = taskId.getId();
        StringBuilder result = new StringBuilder();
        result.append(name);
        result.append('-');
        result.append(partition);
        result.append(extension);
        return result.toString();
    }

    /**
     * Returns the output committer of the job; delegates to the superclass.
     *
     * @param context the task context
     * @return the committer supplied by {@link FileOutputFormat}
     * @throws IOException if the superclass fails to create the committer
     */
    @Override
    public synchronized OutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException {
        return super.getOutputCommitter(context);
    }

    /**
     * Writes one record per line as {@code key<separator>value}, encoded as
     * UTF-8. {@code null} and {@link NullWritable} keys or values are omitted.
     */
    protected static class LineRecordWriter<K, V>
            extends RecordWriter<K, V> {
        private static final java.nio.charset.Charset UTF8 = java.nio.charset.StandardCharsets.UTF_8;
        private static final byte[] NEWLINE = "\n".getBytes(UTF8);

        protected DataOutputStream out;
        private final byte[] keyValueSeparator;

        /**
         * Creates a writer with an explicit key/value separator.
         *
         * @param out               the stream records are written to
         * @param keyValueSeparator the text placed between key and value
         */
        public LineRecordWriter(DataOutputStream out, String keyValueSeparator) {
            this.out = out;
            this.keyValueSeparator = keyValueSeparator.getBytes(UTF8);
        }

        /**
         * Creates a writer that separates key and value with a tab.
         *
         * @param out the stream records are written to
         */
        public LineRecordWriter(DataOutputStream out) {
            this(out, "\t");
        }

        /**
         * Write the object to the byte stream, handling Text as a special
         * case.
         *
         * @param o the object to print
         * @throws IOException if the write throws, we pass it on
         */
        private void writeObject(Object o) throws IOException {
            if (o instanceof Text) {
                // Text exposes its backing buffer; only the first getLength() bytes are valid.
                Text to = (Text) o;
                out.write(to.getBytes(), 0, to.getLength());
            } else {
                out.write(o.toString().getBytes(UTF8));
            }
        }

        /**
         * Writes a single record followed by a newline. Nothing is written
         * when both key and value are absent; the separator is written only
         * when both are present.
         *
         * @param key   the record key, may be {@code null} or {@link NullWritable}
         * @param value the record value, may be {@code null} or {@link NullWritable}
         * @throws IOException if writing to the underlying stream fails
         */
        @Override
        public synchronized void write(K key, V value)
                throws IOException {
            boolean nullKey = key == null || key instanceof NullWritable;
            boolean nullValue = value == null || value instanceof NullWritable;
            if (nullKey && nullValue) {
                return;
            }
            if (!nullKey) {
                writeObject(key);
            }
            if (!(nullKey || nullValue)) {
                out.write(keyValueSeparator);
            }
            if (!nullValue) {
                writeObject(value);
            }
            out.write(NEWLINE);
        }

        /**
         * Closes the underlying stream.
         *
         * @param context the task context (unused)
         * @throws IOException if closing the stream fails
         */
        @Override
        public synchronized void close(TaskAttemptContext context) throws IOException {
            out.close();
        }
    }
}

注意：不同输出类型的标识取得比较粗糙，很难通用，下次注意。另外，reduce 输出的 <K,V> 需要指定为：

 // Declare the reduce output types
job.setOutputKeyClass(NullWritable.class);// key class of the reduce output
job.setOutputValueClass(Writable.class);// value class of the reduce output

posted on 2024-05-31 17:52 Yr-Zhang 阅读(58) 评论(0) 收藏举报

刷新页面返回顶部

☆☆☆★☆☆☆

导航

公告

mapreduce的多种格式文件输出-自定义OutputFormat