Notes on custom output in Hadoop
"OutputFormat describes the output-specification for a Map-Reduce job." (from the OutputFormat Javadoc)
First, extend the abstract class OutputFormat<K, V>, which defines that output specification for a Map-Reduce job, and implement its methods:
RecordWriter<KeyBaseDimension, BaseStatsValueWritable> getRecordWriter(...) is where you can open the database connection.
getRecordWriter must return a RecordWriter, so you also subclass RecordWriter and implement its write method to persist each record over JDBC, as in the sketch below.
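A minimal sketch of such an OutputFormat. The KeyBaseDimension and BaseStatsValueWritable types named above are assumed to be defined elsewhere; the "transformer.jdbc.url" config key and the stats table/column names are illustrative assumptions, not the original project's code:

import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;

public class TransformerOutputFormat extends OutputFormat<KeyBaseDimension, BaseStatsValueWritable> {

  @Override
  public RecordWriter<KeyBaseDimension, BaseStatsValueWritable> getRecordWriter(TaskAttemptContext context)
      throws IOException, InterruptedException {
    try {
      // Open one JDBC connection per task; the returned writer reuses it for every record.
      Configuration conf = context.getConfiguration();
      Connection conn = DriverManager.getConnection(conf.get("transformer.jdbc.url")); // assumed config key
      conn.setAutoCommit(false);
      return new JdbcRecordWriter(conn);
    } catch (SQLException e) {
      throw new IOException("failed to open JDBC connection", e);
    }
  }

  @Override
  public void checkOutputSpecs(JobContext context) {
    // Nothing to validate: output goes to an external database, not to HDFS files.
  }

  @Override
  public OutputCommitter getOutputCommitter(TaskAttemptContext context)
      throws IOException, InterruptedException {
    // No HDFS output to commit, so reuse NullOutputFormat's no-op committer.
    return new NullOutputFormat<KeyBaseDimension, BaseStatsValueWritable>().getOutputCommitter(context);
  }

  private static class JdbcRecordWriter extends RecordWriter<KeyBaseDimension, BaseStatsValueWritable> {
    private final Connection conn;

    JdbcRecordWriter(Connection conn) {
      this.conn = conn;
    }

    @Override
    public void write(KeyBaseDimension key, BaseStatsValueWritable value) throws IOException {
      // One INSERT per output record; a production writer would batch these. Table/columns are assumed.
      try (PreparedStatement ps = conn.prepareStatement(
          "INSERT INTO stats (dimension_key, stats_value) VALUES (?, ?)")) {
        ps.setString(1, key.toString());
        ps.setString(2, value.toString());
        ps.executeUpdate();
      } catch (SQLException e) {
        throw new IOException("JDBC write failed", e);
      }
    }

    @Override
    public void close(TaskAttemptContext context) throws IOException {
      try {
        conn.commit(); // flush the whole task's output in one transaction
        conn.close();
      } catch (SQLException e) {
        throw new IOException("failed to commit/close JDBC connection", e);
      }
    }
  }
}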
On the reduce side, the write method invoked when the reducer emits output is implemented in TaskInputOutputContextImpl:
private RecordWriter<KEYOUT, VALUEOUT> output;

public void write(KEYOUT key, VALUEOUT value) throws IOException, InterruptedException {
  output.write(key, value);
}
So ultimately it is the RecordWriter's write method that gets called, as the reducer sketch below illustrates.
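For context, a reducer reaches that path simply by calling context.write; a minimal sketch reusing the types above:

// Reducer sketch: context.write(...) delegates through TaskInputOutputContextImpl
// to the RecordWriter returned by the custom OutputFormat.
public static class StatsReducer
    extends Reducer<KeyBaseDimension, BaseStatsValueWritable, KeyBaseDimension, BaseStatsValueWritable> {
  @Override
  protected void reduce(KeyBaseDimension key, Iterable<BaseStatsValueWritable> values, Context context)
      throws IOException, InterruptedException {
    for (BaseStatsValueWritable value : values) {
      context.write(key, value);
    }
  }
}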
"Use this before submitting a TableMap job; it will set up the job appropriately." (from the initTableMapperJob Javadoc)
TableMapReduceUtil is the important class here: before submitting a job that reads HBase tables, you can attach a series of scans and filter operations through it:
public static void initTableMapperJob(List<Scan> scans,
    Class<? extends TableMapper> mapper,
    Class<?> outputKeyClass,
    Class<?> outputValueClass, Job job,
    boolean addDependencyJars,
    boolean initCredentials) throws IOException {
  job.setInputFormatClass(MultiTableInputFormat.class);
  if (outputValueClass != null) {
    job.setMapOutputValueClass(outputValueClass);
  }
  if (outputKeyClass != null) {
    job.setMapOutputKeyClass(outputKeyClass);
  }
  job.setMapperClass(mapper);
  Configuration conf = job.getConfiguration();
  HBaseConfiguration.merge(conf, HBaseConfiguration.create(conf));
  List<String> scanStrings = new ArrayList<String>();
  for (Scan scan : scans) {
    scanStrings.add(convertScanToString(scan));
  }
  job.getConfiguration().setStrings(MultiTableInputFormat.SCANS,
      scanStrings.toArray(new String[scanStrings.size()]));
  if (addDependencyJars) {
    addDependencyJars(job);
  }
  if (initCredentials) {
    initCredentials(job);
  }
}
Filtering in HBase before the map phase:
FilterList filterList = new FilterList(FilterList.Operator.MUST_PASS_ALL);
filterList.addFilter(
    new SingleColumnValueFilter(EventLogConstants.EVENT_LOGS_FAMILY_NAME_BYTES,
        Bytes.toBytes(EventLogConstants.LOG_COLUMN_NAME_EVENT_NAME),
        CompareOp.EQUAL, Bytes.toBytes(EventLogConstants.EventEnum.BC_SX.alias)));
filterList.addFilter(
    new SingleColumnValueFilter(EventLogConstants.EVENT_LOGS_FAMILY_NAME_BYTES,
        Bytes.toBytes(EventLogConstants.LOG_COLUMN_NAME_BC_STATUS),
        CompareOp.NOT_EQUAL, Bytes.toBytes("0")));
filterList.addFilter(
    new SingleColumnValueFilter(EventLogConstants.EVENT_LOGS_FAMILY_NAME_BYTES,
        Bytes.toBytes(EventLogConstants.LOG_COLUMN_NAME_CR_CP_ID),
        CompareOp.NOT_EQUAL, Bytes.toBytes("699004")));
filterList.addFilter(
    new SingleColumnValueFilter(EventLogConstants.EVENT_LOGS_FAMILY_NAME_BYTES,
        Bytes.toBytes(EventLogConstants.LOG_COLUMN_NAME_IS_DEL),
        CompareOp.EQUAL, Bytes.toBytes("0")));
String[] columns = new String[] {
    // These columns must be listed here even if the mapper never reads the event value.
    EventLogConstants.LOG_COLUMN_NAME_EVENT_NAME,
    EventLogConstants.LOG_COLUMN_NAME_BC_STATUS,
    EventLogConstants.LOG_COLUMN_NAME_CR_CP_ID,
    EventLogConstants.LOG_COLUMN_NAME_C_ID,
    EventLogConstants.LOG_COLUMN_NAME_BC_PERSON,
    EventLogConstants.LOG_COLUMN_NAME_IS_BC_RE
};
filterList.addFilter(this.getColumnFilter(columns)); // getColumnFilter: see the sketch after this listing
String statDate = conf.get(GlobalConstants.RUNNING_DATE_PARAMES);
Connection conn = null;
Admin admin = null;
List<Scan> scanList = new ArrayList<Scan>();
try {
  conn = ConnectionFactory.createConnection(conf);
  admin = conn.getAdmin();
  String tableName = EventLogConstants.HBASE_NAME_AUDIT_SX + GlobalConstants.UNDERLINE
      + statDate.replaceAll(GlobalConstants.KEY_SEPARATOR, "");
  if (admin.tableExists(TableName.valueOf(tableName))) {
    Scan scan = new Scan();
    // If an application wants to use multiple scans over different tables, each scan must
    // define this attribute with the appropriate table name by calling
    // scan.setAttribute(Scan.SCAN_ATTRIBUTES_TABLE_NAME, Bytes.toBytes(tableName));
    // static public final String SCAN_ATTRIBUTES_TABLE_NAME = "scan.attributes.table.name";
    scan.setAttribute(Scan.SCAN_ATTRIBUTES_TABLE_NAME, Bytes.toBytes(tableName));
    scan.setFilter(filterList);
    scanList.add(scan);
  }
} catch (Exception e) {
  throw new RuntimeException("failed to create the HBase Admin", e);
} finally {
  if (admin != null) {
    try {
      admin.close();
    } catch (IOException e) {
      e.printStackTrace();
    }
  }
  // Also close the connection so it is not leaked.
  if (conn != null) {
    try {
      conn.close();
    } catch (IOException e) {
      e.printStackTrace();
    }
  }
}
if (scanList.isEmpty()) {
  throw new IOException("no matching table exists, cannot build the scan list");
}
TableMapReduceUtil.initTableMapperJob(scanList, AuditorSXMapper.class,
    AuditorDimensionKey.class, Text.class, job, false);
}
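The getColumnFilter helper called above is not shown in the original; a plausible sketch, assuming it restricts the scan to the listed column qualifiers via HBase's MultipleColumnPrefixFilter:

// Assumed implementation: keep only the given column qualifiers (prefix match).
private Filter getColumnFilter(String[] columns) {
  byte[][] prefixes = new byte[columns.length][];
  for (int i = 0; i < columns.length; i++) {
    prefixes[i] = Bytes.toBytes(columns[i]);
  }
  return new MultipleColumnPrefixFilter(prefixes);
}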
Storm Trident: each(new Fields(...), function, new Fields(...)) applies a function to every tuple.
filter: implement the Filter interface's isKeep method; return false to drop a tuple.
partitionAggregate: aggregation within a partition. Implement Aggregator<T> (T is a class holding the aggregation state); aggregate() carries the aggregation logic, and complete() emits the result via the TridentCollector: collector.emit(new Values(aggregatedValue)).
A typical key-concatenation function implements the Function interface's execute method. Minimal sketches of all three follow.
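Minimal sketches of the three primitives, assuming Storm 1.x package names (org.apache.storm.trident.*) and illustrative field names:

import org.apache.storm.trident.operation.BaseAggregator;
import org.apache.storm.trident.operation.BaseFilter;
import org.apache.storm.trident.operation.BaseFunction;
import org.apache.storm.trident.operation.TridentCollector;
import org.apache.storm.trident.tuple.TridentTuple;
import org.apache.storm.tuple.Values;

// Filter: isKeep decides per tuple; returning false drops it.
public class EventNameFilter extends BaseFilter {
  @Override
  public boolean isKeep(TridentTuple tuple) {
    return "BC_SX".equals(tuple.getStringByField("eventName")); // field name assumed
  }
}

// Function: concatenate two fields into one grouping key.
public class DayAndContTypeKey extends BaseFunction {
  @Override
  public void execute(TridentTuple tuple, TridentCollector collector) {
    collector.emit(new Values(
        tuple.getStringByField("day") + "_" + tuple.getStringByField("contType")));
  }
}

// partitionAggregate: CountState holds the per-partition state; complete() emits it.
public class CountAggregator extends BaseAggregator<CountAggregator.CountState> {
  static class CountState {
    long count = 0;
  }

  @Override
  public CountState init(Object batchId, TridentCollector collector) {
    return new CountState();
  }

  @Override
  public void aggregate(CountState state, TridentTuple tuple, TridentCollector collector) {
    state.count++;
  }

  @Override
  public void complete(CountState state, TridentCollector collector) {
    collector.emit(new Values(state.count));
  }
}

Wired together on a stream, e.g.: stream.each(new Fields("eventName"), new EventNameFilter()).each(new Fields("day", "contType"), new DayAndContTypeKey(), new Fields("dayAndContType")).partitionAggregate(new Fields("dayAndContType"), new CountAggregator(), new Fields("partCount")).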
HBaseMapState.Options optsWait = new HBaseMapState.Options();
TridentState amtOfWaitState = partStream
    .project(new Fields("waitingTotalOfPartDay", "dayAndContType"))
    .groupBy(new Fields("dayAndContType"))
    .persistentAggregate(
        factoryWait,
        new Fields("waitingTotalOfPartDay"), new Sum(),
        new Fields("waitingGlobalOfDay"));
persistentAggregate persists the aggregation result: here it performs a global sum across all partitions, taking each partition's partial total as input and emitting the overall per-key total.
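Because persistentAggregate returns a TridentState, the persisted totals can later be read back, for example via a DRPC state query. A minimal sketch, assuming a TridentTopology named topology, a DRPC handle drpc, and a request whose argument is the exact dayAndContType key (MapGet is Storm's built-in org.apache.storm.trident.operation.builtin.MapGet):

// Hypothetical DRPC query: look up the persisted sum for the key passed in "args".
topology.newDRPCStream("waitSum", drpc)
    .stateQuery(amtOfWaitState, new Fields("args"), new MapGet(), new Fields("waitingGlobalOfDay"));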