hive-UDTF-源码及例子
GenericUDTF
UDTF需要继承GenericUDTF
/**
 * Base class that every user-defined table-generating function (UDTF) extends.
 *
 * <p>A subclass receives its argument inspectors through one of the
 * {@code initialize} overloads, is fed rows through {@link #process(Object[])},
 * and emits output rows by calling {@link #forward(Object)}, which hands them to
 * the {@link Collector} installed via {@link #setCollector(Collector)}.
 */
public abstract class GenericUDTF {
  /** Destination for forwarded rows; stays null until {@link #setCollector} is called. */
  Collector collector;

  /**
   * Configuration hook; this base implementation does nothing.
   *
   * @param mapredContext context object supplied by the caller
   */
  public void configure(MapredContext mapredContext) {
    // no-op by default
  }

  /**
   * Unpacks the field inspectors of the input row struct into an array and
   * delegates to {@link #initialize(ObjectInspector[])}.
   *
   * @param argOIs struct inspector describing one input row
   * @return whatever the array-based overload returns
   * @throws UDFArgumentException propagated from the array-based overload
   */
  public StructObjectInspector initialize(StructObjectInspector argOIs)
      throws UDFArgumentException {
    // One StructField per input column of the row.
    List<? extends StructField> fieldRefs = argOIs.getAllStructFieldRefs();
    // Collect the per-column inspectors in field order.
    ObjectInspector[] fieldInspectors = new ObjectInspector[fieldRefs.size()];
    int slot = 0;
    for (StructField ref : fieldRefs) {
      fieldInspectors[slot++] = ref.getFieldObjectInspector();
    }
    return initialize(fieldInspectors);
  }

  /**
   * Array-based initialization. This base implementation always throws, so it
   * only works when a subclass overrides it.
   *
   * @param argOIs one inspector per argument
   * @return never returns normally in this base implementation
   * @throws UDFArgumentException declared for overriding implementations
   * @throws IllegalStateException always, unless overridden
   */
  @Deprecated
  public StructObjectInspector initialize(ObjectInspector[] argOIs)
      throws UDFArgumentException {
    throw new IllegalStateException("Should not be called directly");
  }

  /**
   * Handles one input row.
   *
   * @param args the argument values of the row
   * @throws HiveException on processing failure
   */
  public abstract void process(Object[] args) throws HiveException;

  /**
   * Called when no more rows will be supplied.
   *
   * @throws HiveException on failure
   */
  public abstract void close() throws HiveException;

  /**
   * Installs the collector that {@link #forward(Object)} writes to.
   *
   * @param collector the collector to use from now on
   */
  public final void setCollector(Collector collector) {
    this.collector = collector;
  }

  /**
   * Emits one output row by passing it to the installed collector.
   *
   * @param o the output row
   * @throws HiveException propagated from the collector
   */
  protected final void forward(Object o) throws HiveException {
    this.collector.collect(o);
  }
}
Collector接口如下
/**
 * Sink for rows produced by a UDTF; GenericUDTF.forward passes each output
 * row to {@link #collect(Object)}.
 */
public interface Collector {
  /**
   * Accepts one row.
   *
   * @param input the row to collect
   * @throws HiveException if the row cannot be handled
   */
  void collect(Object input) throws HiveException;
}
Collector的实现类只有UDTFCollector,它从GenericUDTF收集数据,然后转发给UDTFOperator
构造时会传入UDTFOperator
/**
 * {@link Collector} that relays every collected row to a {@link UDTFOperator}
 * and keeps a running count of the rows relayed since the last {@link #reset()}.
 */
public class UDTFCollector implements Collector {
  /** Operator that receives the collected rows. */
  final UDTFOperator op;
  /** Number of rows collected since construction or the last reset. */
  private transient int counter;

  /**
   * @param op operator whose forwardUDTFOutput method receives each row
   */
  public UDTFCollector(UDTFOperator op) {
    this.op = op;
  }

  /**
   * Forwards the row to the operator, then bumps the counter. The counter is
   * left unchanged if forwarding throws.
   *
   * @param input the row to relay
   * @throws HiveException propagated from the operator
   */
  public void collect(Object input) throws HiveException {
    this.op.forwardUDTFOutput(input);
    this.counter += 1;
  }

  /** @return how many rows have been collected since the last reset */
  public int getCounter() {
    return this.counter;
  }

  /** Sets the row counter back to zero. */
  public void reset() {
    this.counter = 0;
  }
}
UDTFOperator
/**
 * Operator that drives a {@link GenericUDTF}: it unpacks each incoming row into
 * an argument array, hands it to the UDTF, and forwards whatever rows the UDTF
 * emits (through a {@link UDTFCollector}) using the UDTF's output inspector.
 */
public class UDTFOperator extends Operator<UDTFDesc> implements Serializable {
  /** Inspector for the incoming row struct (input 0). */
  StructObjectInspector udtfInputOI = null;
  /** Reused argument buffer, one slot per input field. */
  Object[] objToSendToUDTF = null;
  GenericUDTF genericUDTF;
  UDTFCollector collector;
  /** Row emitted for outer lateral views when the UDTF produced nothing. */
  List outerObj;
  /** Sends periodic progress notifications when enabled by configuration. */
  transient AutoProgressor autoProgressor;

  /**
   * Wires the UDTF to a fresh collector, initializes it with the input
   * inspector, adopts its output inspector and optionally starts automatic
   * progress reporting.
   *
   * @param hconf configuration used for the auto-progress settings
   * @return the value returned by the superclass initialization
   * @throws HiveException on initialization failure
   */
  protected Collection<Future<?>> initializeOp(Configuration hconf) throws HiveException {
    Collection<Future<?>> pending = super.initializeOp(hconf);

    genericUDTF = conf.getGenericUDTF();
    collector = new UDTFCollector(this);
    genericUDTF.setCollector(collector);

    udtfInputOI = (StructObjectInspector) inputObjInspectors[0];
    int inputWidth = udtfInputOI.getAllStructFieldRefs().size();
    objToSendToUDTF = new Object[inputWidth];

    MapredContext mapredContext = MapredContext.get();
    if (mapredContext != null) {
      mapredContext.setup(genericUDTF);
    }

    StructObjectInspector resultOI = genericUDTF.initialize(udtfInputOI);
    if (conf.isOuterLV()) {
      // A fixed-size list of nulls, as wide as the UDTF's output struct.
      int outputWidth = resultOI.getAllStructFieldRefs().size();
      outerObj = Arrays.asList(new Object[outputWidth]);
    }

    // Since we're passing the object output by the UDTF directly to the next
    // operator, we can use the same OI.
    outputObjInspector = resultOI;

    // Set up periodic progress reporting in case the UDTF doesn't output rows
    // for a while
    boolean autoProgressEnabled =
        HiveConf.getBoolVar(hconf, HiveConf.ConfVars.HIVEUDTFAUTOPROGRESS);
    if (autoProgressEnabled) {
      autoProgressor = new AutoProgressor(this.getClass().getName(), reporter,
          Utilities.getDefaultNotificationInterval(hconf),
          HiveConf.getTimeVar(
              hconf, HiveConf.ConfVars.HIVES_AUTO_PROGRESS_TIMEOUT, TimeUnit.MILLISECONDS));
      autoProgressor.go();
    }
    return pending;
  }

  /**
   * Feeds one row to the UDTF. For outer lateral views, a row of nulls is
   * emitted when the UDTF collected nothing for this input row. The collector's
   * counter is reset afterwards.
   *
   * @param row the incoming row
   * @param tag index of the input whose inspector describes {@code row}
   * @throws HiveException propagated from the UDTF or the collector
   */
  public void process(Object row, int tag) throws HiveException {
    // The UDTF expects arguments in an object[]
    StructObjectInspector rowOI = (StructObjectInspector) inputObjInspectors[tag];
    List<? extends StructField> fieldRefs = rowOI.getAllStructFieldRefs();
    int slot = 0;
    for (StructField ref : fieldRefs) {
      objToSendToUDTF[slot++] = rowOI.getStructFieldData(row, ref);
    }

    genericUDTF.process(objToSendToUDTF);

    if (conf.isOuterLV()) {
      if (collector.getCounter() == 0) {
        collector.collect(outerObj);
      }
    }
    collector.reset();
  }

  /**
   * Passes a UDTF output row on, described by the operator's output inspector.
   *
   * @param o the row produced by the UDTF
   * @throws HiveException propagated from forwarding
   */
  public void forwardUDTFOutput(Object o) throws HiveException {
    // Since the output of the UDTF is a struct, we can just forward that
    forward(o, outputObjInspector);
  }

  @Override
  public String getName() {
    return getOperatorName();
  }

  /** @return the fixed operator name, {@code "UDTF"} */
  static public String getOperatorName() {
    return "UDTF";
  }

  @Override
  public OperatorType getType() {
    return OperatorType.UDTF;
  }

  /**
   * Closes the UDTF obtained from the operator descriptor.
   *
   * @param abort not used by this implementation
   * @throws HiveException propagated from the UDTF's close
   */
  @Override
  protected void closeOp(boolean abort) throws HiveException {
    conf.getGenericUDTF().close();
  }
}
Operator
Operator是抽象类
Operator传入一个泛型OperatorDesc,构造时一般是递增seqId并且设置operatorId
Node是作为图的节点
/**
 * Excerpt of the abstract base class for operators. An operator is
 * parameterized by its descriptor type, keeps lists of parent and child
 * operators, and receives an id of the form {@code <name>_<sequence>}.
 *
 * <p>NOTE(review): this excerpt uses members that are declared outside the
 * shown code ({@code id}, {@code reporter}, {@code childOperatorsArray},
 * {@code childOperatorsTag}, {@code getName()}, {@code getDone()},
 * {@code setDone(boolean)}, {@code process(Object, int)}).
 */
public abstract class Operator<T extends OperatorDesc> implements Serializable,Cloneable,Node {
  private transient Configuration configuration;
  // Parent and child operators of this node.
  protected List<Operator<? extends OperatorDesc>> childOperators;
  protected List<Operator<? extends OperatorDesc>> parentOperators;
  protected String operatorId;
  private transient ExecMapperContext execContext;
  private transient boolean rootInitializeCalled = false;
  /** Source of sequence numbers for operators built with the no-arg constructor. */
  private static AtomicInteger seqId = new AtomicInteger(0);

  /** Lifecycle states of an operator. */
  public static enum State {
    UNINIT, // initialization has not happened yet
    INIT,   // initialized, but close has not been called yet
    CLOSE
  }

  protected transient State state = State.UNINIT;

  /**
   * Stores the given id suffix, derives the operator id from it and creates
   * empty parent/child lists.
   *
   * @param name the id suffix for this operator
   */
  private Operator(String name) {
    id = name;
    initOperatorId();
    childOperators = new ArrayList<Operator<? extends OperatorDesc>>();
    parentOperators = new ArrayList<Operator<? extends OperatorDesc>>();
  }

  /** Creates an operator whose id suffix is the next value of the shared sequence. */
  public Operator() {
    this(Integer.toString(seqId.getAndIncrement()));
  }

  /** Sets the operator id to {@code getName() + "_" + id}. */
  public void initOperatorId() {
    setOperatorId(getName() + "_" + this.id);
  }

  /** @param operatorId the new operator id */
  public void setOperatorId(String operatorId) {
    this.operatorId = operatorId;
  }

  /** Restarts the shared sequence counter at zero. */
  public static void resetId() {
    seqId.set(0);
  }

  /**
   * Creates an operator with a sequence-based id and the given reporter.
   *
   * @param reporter the reporter to store
   */
  public Operator(Reporter reporter) {
    this();
    this.reporter = reporter;
  }

  protected T conf;
  private RowSchema rowSchema;
  protected transient ObjectInspector[] inputObjInspectors = new ObjectInspector[1];
  protected transient ObjectInspector outputObjInspector;
  protected Map<String, ExprNodeDesc> colExprMap;

  /**
   * Hands a row to every child operator that is not done yet. Does nothing when
   * this operator is already done; marks this operator done once it has at
   * least one child and all children report done.
   *
   * @param row the row to pass on
   * @param rowInspector inspector for the row (not read by the code shown here)
   * @throws HiveException propagated from a child's process call
   */
  protected void forward(Object row, ObjectInspector rowInspector)
      throws HiveException {
    if (getDone()) {
      return;
    }

    int finishedChildren = 0;
    for (int idx = 0; idx < childOperatorsArray.length; idx++) {
      Operator<? extends OperatorDesc> child = childOperatorsArray[idx];
      if (child.getDone()) {
        finishedChildren++;
        continue;
      }
      child.process(row, childOperatorsTag[idx]);
    }

    // if all children are done, this operator is also done
    boolean everyChildDone =
        finishedChildren != 0 && finishedChildren == childOperatorsArray.length;
    if (everyChildDone) {
      setDone(true);
    }
  }
}
OperatorDesc
/**
 * Descriptor attached to an operator: serializable, cloneable, and carrying
 * statistics, traits and a string property map.
 */
public interface OperatorDesc extends Serializable, Cloneable {
  /** @return a copy of this descriptor */
  Object clone() throws CloneNotSupportedException;

  /** @return the statistics stored on this descriptor */
  Statistics getStatistics();

  /** @param statistics the statistics to store */
  void setStatistics(Statistics statistics);

  /** @return the traits stored on this descriptor */
  OpTraits getTraits();

  /** @param opTraits the traits to store */
  void setTraits(OpTraits opTraits);

  /** @return the operator properties as a string-to-string map */
  Map<String, String> getOpProps();
}
OperatorDesc的实现类AbstractOperatorDesc;每一种Operator都有一个相应的OperatorDesc具体实现。
ObjectInspector
/**
 * Root interface of the inspector hierarchy: every inspector reports a type
 * name and one of five broad categories.
 */
public interface ObjectInspector extends Cloneable {
  /** The five broad kinds of data an inspector can describe. */
  enum Category {
    PRIMITIVE, LIST, MAP, STRUCT, UNION
  }

  /** @return the name of the concrete type */
  String getTypeName();

  /** @return which of the five categories this inspector belongs to */
  Category getCategory();
}
ObjectInspector是数据类型的基类,下属:PRIMITIVE, LIST, MAP, STRUCT, UNION几种类型
比如PRIMITIVE代表了基本数据类型,内容如下
/**
 * Inspector for values of category PRIMITIVE. Besides the general category it
 * exposes a finer-grained {@link PrimitiveCategory} and accessors for both the
 * Writable and the plain Java representation of a value.
 */
public interface PrimitiveObjectInspector extends ObjectInspector {
  /** Fine-grained classification of primitive types. */
  enum PrimitiveCategory {
    VOID, BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE, STRING,
    DATE, TIMESTAMP, BINARY, DECIMAL, VARCHAR, CHAR, INTERVAL_YEAR_MONTH, INTERVAL_DAY_TIME,
    UNKNOWN
  }

  /** @return the type info describing this primitive type */
  PrimitiveTypeInfo getTypeInfo();

  /** @return the primitive category of this inspector */
  PrimitiveCategory getPrimitiveCategory();

  /** @return the Writable class used for values of this type */
  Class<?> getPrimitiveWritableClass();

  /** @return the Writable form of {@code o} */
  Object getPrimitiveWritableObject(Object o);

  /** @return the Java class used for values of this type */
  Class<?> getJavaPrimitiveClass();

  /** @return the Java-object form of {@code o} */
  Object getPrimitiveJavaObject(Object o);

  /** @return a copy of {@code o} */
  Object copyObject(Object o);

  /** @return whether the Writable representation is preferred */
  boolean preferWritable();

  /** @return the precision of the type */
  int precision();

  /** @return the scale of the type */
  int scale();
}
实现类AbstractPrimitiveObjectInspector,内部保存了一个PrimitiveTypeInfo;PrimitiveObjectInspector中与类型相关的方法(类型名、PrimitiveCategory、Java类/Writable类、precision、scale等)都是通过typeInfo获取的,其余与具体数据相关的方法仍留给子类实现
/**
 * Skeleton {@link PrimitiveObjectInspector} that stores a
 * {@link PrimitiveTypeInfo} and answers all type-level questions from it.
 * Value-level methods are left to subclasses.
 */
public abstract class AbstractPrimitiveObjectInspector implements
    PrimitiveObjectInspector {
  /** Type description that backs the type-level accessors; unset by the no-arg constructor. */
  protected PrimitiveTypeInfo typeInfo;

  /** Creates an inspector without assigning a type info. */
  protected AbstractPrimitiveObjectInspector() {
    super();
  }

  /**
   * @param typeInfo the type description to answer type-level queries from
   */
  protected AbstractPrimitiveObjectInspector(PrimitiveTypeInfo typeInfo) {
    this.typeInfo = typeInfo;
  }

  /** @return the Java class reported by the type info */
  public Class<?> getJavaPrimitiveClass() {
    return this.typeInfo.getPrimitiveJavaClass();
  }

  /** @return the primitive category reported by the type info */
  public PrimitiveCategory getPrimitiveCategory() {
    return this.typeInfo.getPrimitiveCategory();
  }

  /** @return the Writable class reported by the type info */
  @Override
  public Class<?> getPrimitiveWritableClass() {
    return this.typeInfo.getPrimitiveWritableClass();
  }

  /** @return always {@code Category.PRIMITIVE} */
  @Override
  public Category getCategory() {
    return Category.PRIMITIVE;
  }

  /** @return the type name reported by the type info */
  @Override
  public String getTypeName() {
    return this.typeInfo.getTypeName();
  }

  /** @return the stored type info */
  @Override
  public PrimitiveTypeInfo getTypeInfo() {
    return typeInfo;
  }

  /** @return the precision that HiveDecimalUtils derives from the type info */
  @Override
  public int precision() {
    return HiveDecimalUtils.getPrecisionForType(this.typeInfo);
  }

  /** @return the scale that HiveDecimalUtils derives from the type info */
  @Override
  public int scale() {
    return HiveDecimalUtils.getScaleForType(this.typeInfo);
  }
}
同时还有一个AbstractPrimitiveLazyObjectInspector,能够传入一个泛型参数(Writable的子类型)
/**
 * Base inspector for lazily deserialized primitives whose Writable form has
 * type {@code T}. Values handed to it are expected to be LazyPrimitive
 * instances.
 *
 * @param <T> the Writable type produced for values of this inspector
 */
public abstract class AbstractPrimitiveLazyObjectInspector<T extends Writable>
    extends AbstractPrimitiveObjectInspector {

  /** Creates an inspector without a type info. */
  protected AbstractPrimitiveLazyObjectInspector() {
    super();
  }

  /**
   * @param typeInfo the type description passed to the superclass
   */
  protected AbstractPrimitiveLazyObjectInspector(PrimitiveTypeInfo typeInfo) {
    super(typeInfo);
  }

  /**
   * Returns the Writable held by the given lazy primitive.
   *
   * @param o a LazyPrimitive, or null
   * @return the wrapped Writable, or null when {@code o} is null
   */
  @Override
  public T getPrimitiveWritableObject(Object o) {
    if (o == null) {
      return null;
    }
    LazyPrimitive<?, T> lazyValue = (LazyPrimitive<?, T>) o;
    return lazyValue.getWritableObject();
  }

  /** @return always {@code true} */
  @Override
  public boolean preferWritable() {
    return true;
  }
}
LazyLongObjectInspector
/**
 * Inspector for lazily deserialized long values, backed by LongWritable and
 * the long type info from TypeInfoFactory.
 */
public class LazyLongObjectInspector extends
    AbstractPrimitiveLazyObjectInspector<LongWritable> implements
    LongObjectInspector {

  /** Package-private constructor; registers the long type info with the superclass. */
  LazyLongObjectInspector() {
    super(TypeInfoFactory.longTypeInfo);
  }

  /**
   * Reads the primitive long out of the value's Writable.
   *
   * @param o the lazy value; a null value is not guarded against here
   * @return the long held by the value
   */
  @Override
  public long get(Object o) {
    LongWritable writable = getPrimitiveWritableObject(o);
    return writable.get();
  }

  /**
   * @param o a LazyLong, or null
   * @return a new LazyLong built from {@code o}, or null when {@code o} is null
   */
  @Override
  public Object copyObject(Object o) {
    if (o == null) {
      return null;
    }
    return new LazyLong((LazyLong) o);
  }

  /**
   * @param o the lazy value, or null
   * @return the value boxed as a Long, or null when {@code o} is null
   */
  @Override
  public Object getPrimitiveJavaObject(Object o) {
    if (o == null) {
      return null;
    }
    return Long.valueOf(get(o));
  }
}
LongObjectInspector接口内容,可以获得对象存储的long基本类型
/**
 * Primitive inspector that can read a value as a Java {@code long}.
 */
public interface LongObjectInspector extends PrimitiveObjectInspector {
  /**
   * @param o the value to inspect
   * @return the primitive long stored in {@code o}
   */
  long get(Object o);
}
具体的DateRange
初始化部分检查输入参数
   /**
    * Validates the argument list and declares the output schema: a struct with
    * a single string column named {@code date}.
    *
    * <p>Accepts either 2 arguments (from_date, to_date) or 4 arguments
    * (from_date, to_date, in_format, out_format). With 2 arguments both the
    * input and output formatter are set to {@code yyyy-MM-dd}; with 4 arguments
    * the formatters are created per row in {@code process}.
    *
    * @param inspectors one inspector per call argument
    * @return struct inspector for the single-column output row
    * @throws UDFArgumentException if the argument count is not 2 or 4, or if an
    *         argument is not a string
    */
   public StructObjectInspector initialize(ObjectInspector[] inspectors) throws UDFArgumentException {
        if (inspectors.length != 2 && inspectors.length != 4) {
            throw new UDFArgumentException("DateRange() takes 2 or 4 argument. \n Usage: DateRange(from_date, to_date, [in_format, out_format])");
        }
        // process() casts every inspector to StringObjectInspector, so reject any
        // other argument type up front with a clear message instead of failing
        // with a ClassCastException on the first row.
        for (int i = 0; i < inspectors.length; i++) {
            if (!(inspectors[i] instanceof StringObjectInspector)) {
                String actual = inspectors[i] == null ? "null" : inspectors[i].getTypeName();
                throw new UDFArgumentException(
                        "DateRange() argument " + (i + 1) + " must be a string, but got " + actual);
            }
        }
        this.inspectors = inspectors;
        if (inspectors.length != 4) {
            iformatter = oformatter = new SimpleDateFormat("yyyy-MM-dd");
        }
        // Plain lists instead of double-brace initialization, which would create
        // an anonymous ArrayList subclass for each list.
        List<String> fieldNames = new ArrayList<String>(1);
        fieldNames.add("date");
        List<ObjectInspector> fieldInspectors = new ArrayList<ObjectInspector>(1);
        fieldInspectors.add(PrimitiveObjectInspectorFactory.writableStringObjectInspector);
        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldInspectors);
    }
在初始化的部分需要返回StructObjectInspector
调用process发生的事
   /**
    * Emits one output row per calendar day from {@code from_date} through
    * {@code to_date} (inclusive), each formatted with the output formatter.
    *
    * <p>An empty row is forwarded, and nothing else, when either date is null,
    * when a supplied format argument is null, or when a date cannot be parsed.
    * The first date is always emitted once parsing succeeds, even if it lies
    * after {@code to_date}.
    *
    * @param args the call arguments: from_date, to_date and optionally
    *        in_format, out_format
    * @throws HiveException propagated from forwarding a row
    */
   public void process(Object[] args) throws HiveException {
        String fromDateStr = ((StringObjectInspector) inspectors[0]).getPrimitiveJavaObject(args[0]);
        String toDateStr   = ((StringObjectInspector) inspectors[1]).getPrimitiveJavaObject(args[1]);
        if (fromDateStr == null || toDateStr == null) {
            forward(new ArrayList<>(0));
            return;
        }
        if (inspectors.length == 4) {
            String ifmt = ((StringObjectInspector) inspectors[2]).getPrimitiveJavaObject(args[2]);
            String ofmt = ((StringObjectInspector) inspectors[3]).getPrimitiveJavaObject(args[3]);
            // A null pattern would make the SimpleDateFormat constructor throw a
            // NullPointerException; treat it like a null date instead.
            if (ifmt == null || ofmt == null) {
                forward(new ArrayList<>(0));
                return;
            }
            iformatter = new SimpleDateFormat(ifmt);
            oformatter = new SimpleDateFormat(ofmt);
        }
        long startMillis;
        long endMillis;
        try {
            startMillis = iformatter.parse(fromDateStr).getTime();
            endMillis   = iformatter.parse(toDateStr).getTime();
        } catch (ParseException e) {
            // Pass the exception along so the log shows which input failed and why.
            logger.error("invalid date format: " + fromDateStr + " or " + toDateStr, e);
            forward(new ArrayList<>(0));
            return;
        }
        // Step by calendar days, not by a fixed 24 hours: a day is 23 or 25 hours
        // long around daylight-saving changes, so fixed-millisecond stepping can
        // repeat a date or drop the final one. The Calendar uses the input
        // formatter's time zone, i.e. the zone the dates were parsed in.
        // (Fully qualified so this method needs no additional import.)
        java.util.Calendar cursor = java.util.Calendar.getInstance(iformatter.getTimeZone());
        cursor.setTimeInMillis(startMillis);
        int dayOffset = 0;
        do {
            List<Text> result = new ArrayList<>();
            result.add(new Text(oformatter.format(cursor.getTime())));
            forward(result);
            // Recompute from the start date each time so that a day whose local
            // time had to be adjusted (e.g. a skipped midnight) cannot shift the
            // time of day of all following days.
            dayOffset++;
            cursor.setTimeInMillis(startMillis);
            cursor.add(java.util.Calendar.DATE, dayOffset);
        } while (cursor.getTimeInMillis() <= endMillis);
    }
 
                    
                     
                    
                 
                    
                
 
 
                
            
         
         浙公网安备 33010602011771号
浙公网安备 33010602011771号