hive-UDTF-源码及例子

GenericUDTF

UDTF需要继承GenericUDTF

/**
 * Base class that a user-defined table-generating function (UDTF) extends.
 *
 * <p>Subclasses implement {@link #process(Object[])} and {@link #close()} and
 * emit output rows through {@link #forward(Object)}, which hands each row to
 * the {@link Collector} installed via {@link #setCollector(Collector)}.
 */
public abstract class GenericUDTF {
  /** Sink for forwarded rows; stays {@code null} until {@link #setCollector} is called. */
  Collector collector;

  /**
   * Receives the {@link MapredContext}. The default implementation does nothing.
   *
   * @param mapredContext context supplied by the caller
   */
  public void configure(MapredContext mapredContext) {
  }

  /**
   * Unpacks the struct describing one input row into a flat array of field
   * inspectors and delegates to {@link #initialize(ObjectInspector[])}.
   *
   * @param argOIs struct inspector describing the input row
   * @return whatever the array-based overload returns
   * @throws UDFArgumentException propagated from the array-based overload
   */
  public StructObjectInspector initialize(StructObjectInspector argOIs)
      throws UDFArgumentException {
    // One StructField per input column of the row
    final List<? extends StructField> rowFields = argOIs.getAllStructFieldRefs();
    // Collect the per-column inspectors in field order
    final ObjectInspector[] columnInspectors = new ObjectInspector[rowFields.size()];
    int slot = 0;
    for (StructField rowField : rowFields) {
      columnInspectors[slot++] = rowField.getFieldObjectInspector();
    }
    // Hand over to the array-based overload and return its result
    return initialize(columnInspectors);
  }

  /**
   * Array-based initializer. This base implementation always throws, so it
   * only works when a subclass overrides it.
   *
   * @param argOIs inspectors for the input columns
   * @return never returns normally in this base implementation
   * @throws UDFArgumentException declared for overriding implementations
   * @throws IllegalStateException always, in this base implementation
   */
  @Deprecated
  public StructObjectInspector initialize(ObjectInspector[] argOIs)
      throws UDFArgumentException {
    throw new IllegalStateException("Should not be called directly");
  }

  /**
   * Handles one input row.
   *
   * @param args the values of the input row
   * @throws HiveException on processing failure
   */
  public abstract void process(Object[] args) throws HiveException;

  /**
   * Called when the function is closed.
   *
   * @throws HiveException on failure
   */
  public abstract void close() throws HiveException;

  /**
   * Installs the collector that {@link #forward(Object)} writes to.
   *
   * @param collector destination for forwarded rows
   */
  public final void setCollector(Collector collector) {
    this.collector = collector;
  }

  /**
   * Passes one output row to the installed collector.
   *
   * @param o the row to emit
   * @throws HiveException propagated from the collector
   */
  protected final void forward(Object o) throws HiveException {
    this.collector.collect(o);
  }

}

Collector接口如下

/**
 * Receives rows emitted by a UDTF.
 */
public interface Collector {
  /**
   * Accepts one emitted row.
   *
   * @param input the row being emitted
   * @throws HiveException if the row cannot be handled
   */
  void collect(Object input) throws HiveException;
}

实现类只有UDTFCollector,从GenericUDTF收集数据然后发送给UDTFOperator
构造时会传入UDTFOperator

/**
 * {@link Collector} that relays every row to a {@link UDTFOperator} and
 * counts how many rows have been relayed since the last {@link #reset()}.
 */
public class UDTFCollector implements Collector {
  /** Operator that receives every collected row. */
  final UDTFOperator op;
  /** Number of rows collected since construction or the last reset. */
  private transient int counter;

  /**
   * @param op operator that collected rows are forwarded to
   */
  public UDTFCollector(UDTFOperator op) {
    this.op = op;
  }

  /**
   * Forwards the row through {@link UDTFOperator#forwardUDTFOutput(Object)}
   * and then bumps the counter.
   *
   * @param input the row to forward
   * @throws HiveException propagated from the operator
   */
  public void collect(Object input) throws HiveException {
    this.op.forwardUDTFOutput(input);
    // Only counted once the forward call has returned normally
    this.counter += 1;
  }

  /** Sets the row counter back to zero. */
  public void reset() {
    this.counter = 0;
  }

  /**
   * @return number of rows collected since the last reset
   */
  public int getCounter() {
    return this.counter;
  }
}

UDTFOperator

/**
 * Operator that feeds each input row to a {@link GenericUDTF} and forwards
 * whatever rows the UDTF emits.
 */
public class UDTFOperator extends Operator<UDTFDesc> implements Serializable {
  /** Inspector for the rows handed to the UDTF; taken from input slot 0. */
  StructObjectInspector udtfInputOI = null;
  /** Reusable argument array, one slot per input field. */
  Object[] objToSendToUDTF = null;

  GenericUDTF genericUDTF;
  UDTFCollector collector;
  /** Row emitted for an outer lateral view when the UDTF produced nothing. */
  List outerObj;
  /** Periodic progress reporter; only created when enabled in the configuration. */
  transient AutoProgressor autoProgressor;

  /**
   * Wires the UDTF to a fresh collector, initializes it with the input
   * inspector and optionally starts automatic progress reporting.
   *
   * @param hconf configuration used to read the progress-reporting settings
   * @return the collection returned by the superclass implementation
   * @throws HiveException propagated from the superclass or the UDTF
   */
  protected Collection<Future<?>> initializeOp(Configuration hconf) throws HiveException {
    final Collection<Future<?>> pending = super.initializeOp(hconf);

    this.genericUDTF = conf.getGenericUDTF();
    this.collector = new UDTFCollector(this);
    this.genericUDTF.setCollector(this.collector);

    this.udtfInputOI = (StructObjectInspector) inputObjInspectors[0];
    final int inputWidth = this.udtfInputOI.getAllStructFieldRefs().size();
    this.objToSendToUDTF = new Object[inputWidth];

    // The context, when one exists, is set up before the UDTF is initialized
    final MapredContext mapredContext = MapredContext.get();
    if (mapredContext != null) {
      mapredContext.setup(this.genericUDTF);
    }
    final StructObjectInspector resultOI = this.genericUDTF.initialize(this.udtfInputOI);

    if (conf.isOuterLV()) {
      // Fixed-size list of nulls, one entry per output field
      final int outputWidth = resultOI.getAllStructFieldRefs().size();
      this.outerObj = Arrays.asList(new Object[outputWidth]);
    }

    // Since we're passing the object output by the UDTF directly to the next
    // operator, we can use the same OI.
    outputObjInspector = resultOI;

    // Set up periodic progress reporting in case the UDTF doesn't output rows
    // for a while
    final boolean autoProgressEnabled =
        HiveConf.getBoolVar(hconf, HiveConf.ConfVars.HIVEUDTFAUTOPROGRESS);
    if (autoProgressEnabled) {
      this.autoProgressor = new AutoProgressor(this.getClass().getName(), reporter,
          Utilities.getDefaultNotificationInterval(hconf),
          HiveConf.getTimeVar(
              hconf, HiveConf.ConfVars.HIVES_AUTO_PROGRESS_TIMEOUT, TimeUnit.MILLISECONDS));
      this.autoProgressor.go();
    }
    return pending;
  }

  /**
   * Copies the fields of {@code row} into the argument array, runs the UDTF
   * on it, and emits the placeholder row for an outer lateral view if the
   * UDTF collected nothing.
   *
   * @param row the incoming row
   * @param tag index of the input inspector that describes {@code row}
   * @throws HiveException propagated from the UDTF or the collector
   */
  public void process(Object row, int tag) throws HiveException {
    // The UDTF expects arguments in an object[]
    final StructObjectInspector rowOI = (StructObjectInspector) inputObjInspectors[tag];
    final List<? extends StructField> fieldRefs = rowOI.getAllStructFieldRefs();

    int slot = 0;
    for (StructField fieldRef : fieldRefs) {
      objToSendToUDTF[slot++] = rowOI.getStructFieldData(row, fieldRef);
    }

    genericUDTF.process(objToSendToUDTF);
    if (conf.isOuterLV()) {
      if (collector.getCounter() == 0) {
        collector.collect(outerObj);
      }
    }
    // Start the next row with a fresh count
    collector.reset();
  }

  /**
   * Forwards one UDTF output row using the operator's output inspector.
   *
   * @param o the row produced by the UDTF
   * @throws HiveException propagated from {@code forward}
   */
  public void forwardUDTFOutput(Object o) throws HiveException {
    // Since the output of the UDTF is a struct, we can just forward that
    forward(o, outputObjInspector);
  }

  @Override
  public String getName() {
    return getOperatorName();
  }

  /**
   * @return the fixed operator name {@code "UDTF"}
   */
  public static String getOperatorName() {
    return "UDTF";
  }

  @Override
  public OperatorType getType() {
    return OperatorType.UDTF;
  }

  /**
   * Closes the UDTF obtained from the operator's descriptor.
   *
   * @param abort not consulted by this implementation
   * @throws HiveException propagated from the UDTF's {@code close()}
   */
  @Override
  protected void closeOp(boolean abort) throws HiveException {
    conf.getGenericUDTF().close();
  }


}

Operator

Operator是抽象类
Operator传入一个泛型OperatorDesc,构造时一般是递增seqId并且设置operatorId
Node是作为图的节点

/**
 * Abridged excerpt of the abstract base class for operators.
 *
 * <p>Several members used below ({@code id}, {@code reporter},
 * {@code childOperatorsArray}, {@code childOperatorsTag}, {@code getName()},
 * {@code getDone()}, {@code setDone(boolean)}, {@code process(Object, int)})
 * are not declared in this excerpt.
 *
 * @param <T> descriptor type carried by the operator
 */
public abstract class Operator<T extends OperatorDesc> implements Serializable,Cloneable,Node {

  private transient Configuration configuration;
  // Parent and child operators of this node
  protected List<Operator<? extends OperatorDesc>> childOperators;
  protected List<Operator<? extends OperatorDesc>> parentOperators;

  protected String operatorId;
  private transient ExecMapperContext execContext;
  private transient boolean rootInitializeCalled = false;

  /** Source of sequence numbers for operators built with the no-arg constructor. */
  private static AtomicInteger seqId = new AtomicInteger(0);

  /** Lifecycle states of an operator. */
  public enum State {
    UNINIT, // initialization has not happened yet
    INIT,   // initialization was called, close has not been called yet
    CLOSE
  }
  protected transient State state = State.UNINIT;

  /**
   * Stores the id, derives the operator id from it, then creates empty
   * parent and child lists.
   *
   * @param name value stored as this operator's id
   */
  private Operator(String name) {
    this.id = name;
    initOperatorId();
    this.childOperators = new ArrayList<>();
    this.parentOperators = new ArrayList<>();
  }

  /** Creates an operator whose id is the next value of the shared sequence counter. */
  public Operator() {
    this(String.valueOf(seqId.getAndIncrement()));
  }

  /** Sets the operator id to {@code getName() + "_" + id}. */
  public void initOperatorId() {
    setOperatorId(getName() + "_" + this.id);
  }

  /**
   * @param operatorId new operator id
   */
  public void setOperatorId(String operatorId) {
    this.operatorId = operatorId;
  }

  /** Resets the shared sequence counter to zero. */
  public static void resetId() {
    seqId.set(0);
  }

  /**
   * Same as the no-arg constructor, additionally storing the reporter.
   *
   * @param reporter reporter to store on this operator
   */
  public Operator(Reporter reporter) {
    this();
    this.reporter = reporter;
  }


  protected T conf;
  private RowSchema rowSchema;

  protected transient ObjectInspector[] inputObjInspectors = new ObjectInspector[1];
  protected transient ObjectInspector outputObjInspector;
  protected Map<String, ExprNodeDesc> colExprMap;

  /**
   * Passes {@code row} to every child operator that is not done, and marks
   * this operator done once all of its children are.
   *
   * @param row the row to pass on
   * @param rowInspector not read by this implementation
   * @throws HiveException propagated from a child's {@code process}
   */
  protected void forward(Object row, ObjectInspector rowInspector)
      throws HiveException {

    if (getDone()) {
      return;
    }

    int finishedChildren = 0;
    for (int i = 0; i < childOperatorsArray.length; i++) {
      final Operator<? extends OperatorDesc> child = childOperatorsArray[i];
      if (!child.getDone()) {
        child.process(row, childOperatorsTag[i]);
        continue;
      }
      finishedChildren++;
    }

    // if all children are done, this operator is also done
    final boolean everyChildDone =
        finishedChildren != 0 && finishedChildren == childOperatorsArray.length;
    if (everyChildDone) {
      setDone(true);
    }
  }
}

OperatorDesc

/**
 * Descriptor attached to an operator; serializable and cloneable.
 */
public interface OperatorDesc extends Serializable, Cloneable {

  /**
   * @return a copy of this descriptor
   * @throws CloneNotSupportedException if the descriptor cannot be cloned
   */
  Object clone() throws CloneNotSupportedException;

  /** @return the statistics held by this descriptor */
  Statistics getStatistics();

  /** @param statistics the statistics to store on this descriptor */
  void setStatistics(Statistics statistics);

  /** @return the traits held by this descriptor */
  OpTraits getTraits();

  /** @param opTraits the traits to store on this descriptor */
  void setTraits(OpTraits opTraits);

  /** @return the operator properties as string key/value pairs */
  Map<String, String> getOpProps();
}

OperatorDesc的子类AbstractOperatorDesc,对于所有的Operator都有一个相应的OperatorDesc具体实现;

ObjectInspector

/**
 * Root interface for inspectors; every inspector reports a type name and one
 * of the five {@link Category} values.
 */
public interface ObjectInspector extends Cloneable {

  /** The five kinds of type an inspector can describe. */
  enum Category {
    PRIMITIVE,
    LIST,
    MAP,
    STRUCT,
    UNION
  }

  /** @return the name of the concrete type */
  String getTypeName();

  /** @return which of the five categories the type belongs to */
  Category getCategory();
}

ObjectInspector是数据类型的基类,下属:PRIMITIVE, LIST, MAP, STRUCT, UNION几种类型
比如PRIMITIVE代表了基本数据类型,内容如下

/**
 * Inspector for primitive-category types. On top of the general category it
 * exposes a finer-grained {@link PrimitiveCategory}.
 */
public interface PrimitiveObjectInspector extends ObjectInspector {

  /** Primitive categories available to a primitive inspector. */
  enum PrimitiveCategory {
    VOID, BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE, STRING,
    DATE, TIMESTAMP, BINARY, DECIMAL, VARCHAR, CHAR, INTERVAL_YEAR_MONTH, INTERVAL_DAY_TIME,
    UNKNOWN
  }

  /** @return the primitive type information for this inspector */
  PrimitiveTypeInfo getTypeInfo();

  /** @return the primitive category of the inspected type */
  PrimitiveCategory getPrimitiveCategory();

  // Writable-side accessors: the writable class and the writable form of a value

  /** @return the writable class associated with the primitive type */
  Class<?> getPrimitiveWritableClass();

  /**
   * @param o the value to read
   * @return the writable form of {@code o}
   */
  Object getPrimitiveWritableObject(Object o);

  // Java-side accessors

  /** @return the Java class associated with the primitive type */
  Class<?> getJavaPrimitiveClass();

  /**
   * @param o the value to read
   * @return the Java-object form of {@code o}
   */
  Object getPrimitiveJavaObject(Object o);

  /**
   * @param o the value to copy
   * @return a copy of {@code o}
   */
  Object copyObject(Object o);

  /** @return whether the writable form is preferred */
  boolean preferWritable();

  /** @return the precision of the type */
  int precision();

  /** @return the scale of the type */
  int scale();
}

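实现类AbstractPrimitiveObjectInspector,内部保存了一个PrimitiveTypeInfo;PrimitiveObjectInspector中与类型信息相关的方法(如getTypeName、getPrimitiveCategory、getJavaPrimitiveClass、getPrimitiveWritableClass、precision、scale)都是通过typeInfo获取的,其余方法仍由具体子类实现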

/**
 * Partial {@link PrimitiveObjectInspector} implementation that answers the
 * type-related queries from a stored {@link PrimitiveTypeInfo}.
 */
public abstract class AbstractPrimitiveObjectInspector implements
    PrimitiveObjectInspector {

  /** Type information that the type-related getters delegate to. */
  protected PrimitiveTypeInfo typeInfo;

  /** Creates an inspector without assigning {@link #typeInfo}. */
  protected AbstractPrimitiveObjectInspector() {
    super();
  }

  /**
   * @param typeInfo type information to store
   */
  protected AbstractPrimitiveObjectInspector(PrimitiveTypeInfo typeInfo) {
    this.typeInfo = typeInfo;
  }

  /** @return the Java class reported by the stored type information */
  public Class<?> getJavaPrimitiveClass() {
    return this.typeInfo.getPrimitiveJavaClass();
  }

  /** @return the primitive category reported by the stored type information */
  public PrimitiveCategory getPrimitiveCategory() {
    return this.typeInfo.getPrimitiveCategory();
  }

  /** @return the writable class reported by the stored type information */
  @Override
  public Class<?> getPrimitiveWritableClass() {
    return this.typeInfo.getPrimitiveWritableClass();
  }

  /** @return always {@link Category#PRIMITIVE} */
  @Override
  public Category getCategory() {
    return Category.PRIMITIVE;
  }

  /** @return the type name reported by the stored type information */
  @Override
  public String getTypeName() {
    return this.typeInfo.getTypeName();
  }

  /** @return the stored type information */
  @Override
  public PrimitiveTypeInfo getTypeInfo() {
    return this.typeInfo;
  }

  /** @return the precision {@code HiveDecimalUtils} derives from the stored type information */
  @Override
  public int precision() {
    return HiveDecimalUtils.getPrecisionForType(this.typeInfo);
  }

  /** @return the scale {@code HiveDecimalUtils} derives from the stored type information */
  @Override
  public int scale() {
    return HiveDecimalUtils.getScaleForType(this.typeInfo);
  }

}

同时有一个AbstractPrimitiveLazyObjectInspector,能够传入一个泛型

/**
 * Primitive inspector for values held in a {@code LazyPrimitive}; the
 * writable form is read straight from the lazy wrapper.
 *
 * @param <T> writable type produced by the inspected values
 */
public abstract class AbstractPrimitiveLazyObjectInspector<T extends Writable>
    extends AbstractPrimitiveObjectInspector {

  /** Creates an inspector without type information. */
  protected AbstractPrimitiveLazyObjectInspector() {
    super();
  }

  /**
   * @param typeInfo type information passed to the superclass
   */
  protected AbstractPrimitiveLazyObjectInspector(PrimitiveTypeInfo typeInfo) {
    super(typeInfo);
  }

  /**
   * @param o a {@code LazyPrimitive}, or {@code null}
   * @return the wrapper's writable object, or {@code null} when {@code o} is {@code null}
   */
  @Override
  public T getPrimitiveWritableObject(Object o) {
    if (o == null) {
      return null;
    }
    return ((LazyPrimitive<?, T>) o).getWritableObject();
  }

  /** @return always {@code true} */
  @Override
  public boolean preferWritable() {
    return true;
  }

}

LazyLongObjectInspector

/**
 * Lazy inspector for long values, backed by {@code LongWritable}.
 */
public class LazyLongObjectInspector extends
    AbstractPrimitiveLazyObjectInspector<LongWritable> implements
    LongObjectInspector {

  /** Registers the long type information with the superclass. */
  LazyLongObjectInspector() {
    super(TypeInfoFactory.longTypeInfo);
  }

  /**
   * @param o the lazy value to read
   * @return the primitive long held by the value's writable
   */
  @Override
  public long get(Object o) {
    final LongWritable writable = getPrimitiveWritableObject(o);
    return writable.get();
  }

  /**
   * @param o a {@code LazyLong}, or {@code null}
   * @return a new {@code LazyLong} built from {@code o}, or {@code null} for {@code null} input
   */
  @Override
  public Object copyObject(Object o) {
    if (o == null) {
      return null;
    }
    return new LazyLong((LazyLong) o);
  }

  /**
   * @param o the lazy value to read, or {@code null}
   * @return the boxed long, or {@code null} for {@code null} input
   */
  @Override
  public Object getPrimitiveJavaObject(Object o) {
    if (o == null) {
      return null;
    }
    return Long.valueOf(get(o));
  }
}

LongObjectInspector接口内容,可以获得对象存储的long基本类型

/**
 * Primitive inspector that can read a value as a primitive {@code long}.
 */
public interface LongObjectInspector extends PrimitiveObjectInspector {
  /**
   * @param o the value to read
   * @return the value as a primitive {@code long}
   */
  long get(Object o);
}

具体的DateRange

初始化部分检查输入参数

   /**
    * Validates the arguments of {@code DateRange} and declares the output row,
    * a struct with a single string column named {@code date}.
    *
    * <p>With two arguments both formatters default to {@code yyyy-MM-dd}.
    * With four arguments the formatters are left untouched here.
    *
    * @param inspectors inspectors for the call arguments; 2 or 4 are accepted
    * @return struct inspector describing the single {@code date} output column
    * @throws UDFArgumentException if the argument count is not 2 or 4, or an
    *         argument is not inspected by a {@code StringObjectInspector}
    */
   public StructObjectInspector initialize(ObjectInspector[] inspectors) throws UDFArgumentException {
        if (inspectors.length != 2 && inspectors.length != 4) {
            throw new UDFArgumentException("DateRange() takes 2 or 4 argument. \n Usage: DateRange(from_date, to_date, [in_format, out_format])");
        }

        // The row-processing method of this class casts every inspector to
        // StringObjectInspector, so reject anything else up front with a clear
        // message instead of failing with a ClassCastException per row.
        for (int i = 0; i < inspectors.length; i++) {
            if (!(inspectors[i] instanceof StringObjectInspector)) {
                throw new UDFArgumentException("DateRange() argument " + (i + 1)
                        + " must be a string, but got " + inspectors[i].getTypeName());
            }
        }

        this.inspectors = inspectors;

        if (inspectors.length != 4) {
            iformatter = oformatter = new SimpleDateFormat("yyyy-MM-dd");
        }

        // Plain lists instead of double-brace initialization: the anonymous
        // ArrayList subclasses that idiom creates keep a hidden reference to
        // this UDTF instance for as long as the lists are reachable.
        ArrayList<String> fieldNames = new ArrayList<String>(1);
        fieldNames.add("date");
        ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>(1);
        fieldOIs.add(PrimitiveObjectInspectorFactory.writableStringObjectInspector);

        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
   }

在初始化的部分需要返回StructObjectInspector
调用process发生的事

   /**
    * Emits one row per calendar day from {@code from_date} up to and including
    * {@code to_date}, each formatted with the output formatter.
    *
    * <p>If either date is {@code null} or cannot be parsed, a single empty row
    * is forwarded instead. When four arguments were supplied, the input and
    * output formatters are rebuilt from arguments 3 and 4 on every call.
    *
    * @param args the argument values of the current row, in call order
    * @throws HiveException propagated from {@code forward}
    */
   public void process(Object[] args) throws HiveException {
        // Read the two date arguments as Java strings
        String fromDateStr = ((StringObjectInspector) inspectors[0]).getPrimitiveJavaObject(args[0]);
        String toDateStr   = ((StringObjectInspector) inspectors[1]).getPrimitiveJavaObject(args[1]);

        if (fromDateStr == null || toDateStr == null) {
            forward(new ArrayList<>(0));
            return;
        }

        if (inspectors.length == 4) {
            String ifmt = ((StringObjectInspector) inspectors[2]).getPrimitiveJavaObject(args[2]);
            String ofmt = ((StringObjectInspector) inspectors[3]).getPrimitiveJavaObject(args[3]);
            iformatter = new SimpleDateFormat(ifmt);
            oformatter = new SimpleDateFormat(ofmt);
        }

        // Walk the range with a Calendar in the parser's time zone so that each
        // step is one calendar day. Adding a fixed 24 hours of milliseconds
        // drifts across daylight-saving transitions: a 25-hour day repeats a
        // date and a 23-hour day pushes the cursor past the end date.
        java.util.Calendar cursor = java.util.Calendar.getInstance(iformatter.getTimeZone());
        long endMillis;

        try {
            cursor.setTime(iformatter.parse(fromDateStr));
            endMillis = iformatter.parse(toDateStr).getTime();
        } catch (ParseException e) {
            // Pass the exception along so the log shows which input failed and where
            logger.error("invalid date format: " + fromDateStr + " or " + toDateStr, e);
            forward(new ArrayList<>(0));
            return;
        }

        // NOTE(review): do-while keeps the existing behavior of emitting the
        // start date even when it lies after the end date — confirm that a
        // reversed range should not produce an empty result instead.
        do {
            List<Text> result = new ArrayList<>();
            result.add(new Text(oformatter.format(cursor.getTime())));
            forward(result);
            cursor.add(java.util.Calendar.DATE, 1);
        } while (cursor.getTimeInMillis() <= endMillis);
   }
posted @ 2016-12-15 11:45  zhangshihai1232  阅读(1258)  评论(0)    收藏  举报