Hive自定义UDTF

需求:
    原始数据为
        F8    楼梯    MULTIPOLYGON(((27.7363472618163 -76.4068597634723,27.7363472618163 -80.0131241459239,19.72894019261 -80.0118552827063,19.502938400954 -76.4055550890202,27.7363472618163 -76.4068597634723)))
    我自定义的UDTF需要把POI数据使用0、1;1、2、2、3;3、4...规则多行输出,如下:
        F8    楼梯    27.7363472618163    -76.4068597634723    27.7363472618163    -80.0131241459239
        F8    楼梯    27.7363472618163    -80.0131241459239    19.72894019261     -80.0118552827063
        F8    楼梯    19.72894019261     -80.0118552827063    19.502938400954     -76.4055550890202
        F8    楼梯    19.502938400954     -76.4055550890202    27.7363472618163 -76.4068597634723

        
如下使用自定义UDTF实现        
1、添加依赖jar
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.mengyao.dataformat</groupId>
    <artifactId>hortonworks</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <packaging>jar</packaging>

    <name>hortonworks</name>
    <url>http://maven.apache.org</url>

    <repositories>
        <!-- hortonworks -->
        <repository>
            <releases>
                <enabled>true</enabled>
                <updatePolicy>always</updatePolicy>
                <checksumPolicy>warn</checksumPolicy>
            </releases>
            <snapshots>
                <enabled>false</enabled>
                <updatePolicy>never</updatePolicy>
                <checksumPolicy>fail</checksumPolicy>
            </snapshots>
            <id>HDPReleases</id>
            <name>HDP Releases</name>
            <url>http://repo.hortonworks.com/content/repositories/releases/</url>
            <layout>default</layout>
        </repository>
        <!-- cloudera -->
        <!--
        <repository>
            <id>cloudera</id>
            <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
        </repository>
         -->
    </repositories>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <junit.version>4.10</junit.version>
        <hortonworks.hadoop.version>2.7.1.2.3.2.0-2950</hortonworks.hadoop.version>
        <hortonworks.hive.version>1.2.1.2.3.2.0-2950</hortonworks.hive.version>
        <slf4j.version>1.7.10</slf4j.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>${junit.version}</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>jdk.tools</groupId>
            <artifactId>jdk.tools</artifactId>
            <version>1.7</version>
            <scope>system</scope>
            <systemPath>${JAVA_HOME}/lib/tools.jar</systemPath>
        </dependency>
        <dependency>
            <groupId>org.mortbay.jetty</groupId>
            <artifactId>jetty</artifactId>
            <version>6.1.26</version>
        </dependency>

        <!-- HortonWorks Hadoop -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hortonworks.hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>${hortonworks.hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-core</artifactId>
            <version>${hortonworks.hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
            <version>${hortonworks.hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-common</artifactId>
            <version>${hortonworks.hadoop.version}</version>
        </dependency>

        <!-- Hortonworks Hive -->
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-jdbc</artifactId>
            <version>${hortonworks.hive.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-exec</artifactId>
            <version>${hortonworks.hive.version}</version>
        </dependency>

        <!-- slf4j -->
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>${slf4j.version}</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>${slf4j.version}</version>
        </dependency>

    </dependencies>
</project>


2、自定义UDTF
package com.mengyao.hadoop.hortonworks.hive.udf;

import java.util.ArrayList;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

public class Udf1 extends GenericUDTF {
    
    @Override
    public StructObjectInspector initialize(ObjectInspector[] args) throws UDFArgumentException {
        if (args.length!=3) {
            throw new UDFArgumentLengthException("参数小于3个字段");
        }
        if (args[0].getCategory() != ObjectInspector.Category.PRIMITIVE) {
            throw new UDFArgumentException("参数不是基本类型");
        }
        ArrayList<String> fieldNames = new ArrayList<String>();
        ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
        fieldNames.add("floor");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        fieldNames.add("poiName");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        fieldNames.add("x1");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        fieldNames.add("y1");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        fieldNames.add("x2");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        fieldNames.add("y2");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);

        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames,fieldOIs);
    }

    @Override
    public void close() throws HiveException {
    }

    @Override
    public void process(Object[] args) throws HiveException {
        final String floor = args[0].toString();
        final String poiNameCn = args[1].toString();
        final String geom = args[2].toString();
        String input = geom.replace("MULTIPOLYGON(((", "").replace(")))", "");
        //5
        String[] fields = input.split(",");
        for(int i=0; i<fields.length; i++) {
            try {
                String[] arr = new String[6];
                arr[0]=floor;
                arr[1]=poiNameCn;
                String[] tmp1 = fields[i].split(" ");
                arr[2]=tmp1[0];
                arr[3]=tmp1[1];
                if (i+1<fields.length) {
                    String[] tmp2 = fields[i+1].split(" ");
                    arr[4]=tmp2[0];
                    arr[5]=tmp2[1];
                }
                forward(arr);
            } catch (Exception e) {
                continue;
            }
        }
    }
    
}


3、原始数据
hive> select floor_id,poi_name_cn,geom from tmp_source_poi limit 10;
OK
F8    楼梯    MULTIPOLYGON(((27.7363472618163 -76.4068597634723,27.7363472618163 -80.0131241459239,19.72894019261 -80.0118552827063,19.502938400954 -76.4055550890202,27.7363472618163 -76.4068597634723)))
F8    避风塘    MULTIPOLYGON(((142.38873291015625 -89.830322265625,142.38848876953125 -92.31036376953125,142.3878173828125 -95.64105224609375,151.43511962890625 -95.62530517578125,151.4322509765625 -114.43475341796875,145.563720703125 -114.43389892578125,140.51824951171875 -114.43310546875,140.51849365234375 -111.59051513671875,140.52069091796875 -98.031005859375,125.305908203125 -98.02862548828125,125.3070068359375 -92.30804443359375,128.85321044921875 -92.30859375,138.40338134765625 -92.309814453125,138.40386962890625 -89.82965087890625,142.38873291015625 -89.830322265625)))
F8    咖啡时间    MULTIPOLYGON(((73.9320653229952 -73.1165769445031,73.9267330393195 -99.5226487368938,60.0283501408994 -99.5200368016607,42.2555900737643 -99.5584639401271,42.2556011453271 -95.2023675085028,39.3781932592392 -95.2019115369498,38.6626852564514 -95.2017981567676,38.6649185493588 -81.1082055937875,41.4105622805655 -81.108640672537,41.4118176437914 -73.1864483631389,73.9320653229952 -73.1165769445031)))
F8    电梯    MULTIPOLYGON(((27.7363472618163 -73.7509837455356,27.7363472618163 -76.4068597521423,27.0886035524309 -76.4067571138262,19.502938400954 -76.4055550890202,19.336468026042 -73.7496526911885,27.7363472618163 -73.7509837455356)))
F8    电梯    MULTIPOLYGON(((71.791599329561 -120.789716028472,71.7846846170723 -123.743601208178,63.5224315151572 -123.589247200859,63.5290417298674 -120.765438285686,71.791599329561 -120.789716028472)))
F8    电梯    MULTIPOLYGON(((151.438877351582 -71.140565793033,143.380713831633 -71.1289797308272,143.41783554107 -67.7106754227564,151.439391598105 -67.8952907441503,151.438877351582 -71.140565793033)))
F8    自动扶梯    MULTIPOLYGON(((129.16515929997 -72.9842409176565,129.16515929997 -69.2177793933323,136.625542271882 -69.2177793933323,136.599083889276 -72.9842409176565,129.16515929997 -72.9842409176565)))
F8    自动扶梯    MULTIPOLYGON(((90.0243225693702 -73.1198265228713,90.0237087868154 -76.1593672441145,83.4253928624093 -76.2595438690263,83.4661638364195 -73.1185022099924,90.0243225693702 -73.1198265228713)))
F8    卡乐时代    MULTIPOLYGON(((56.796247374266386 -117.50865583121777,56.6748490780592 -117.49650815501809,48.22472807019949 -117.49516914784908,48.26815593987703 -123.26608142256737,47.20688885077834 -123.24331614375114,47.17912720888853 -121.70872777327895,23.973091162741184 -120.98440674319863,22.212835498154163 -95.15799333155155,27.736347261816263 -95.15799333155155,27.736347261816263 -97.24820585176349,28.933990716934204 -97.24888809770346,29.609365351498127 -97.24910343065858,39.72476739436388 -97.26569728925824,39.74184823036194 -100.99478274583817,56.65072625130415 -100.9862755574286,56.796247374266386 -117.50865583121777)))
F8    一味一诚    MULTIPOLYGON(((104.10507772862911 -113.762516528368,104.10343369841576 -124.07710339874029,71.78468461707234 -123.74360121414065,71.80045375600457 -117.0072059892118,80.00307461991906 -117.11536817625165,80.00307461991906 -111.21669100224972,85.23863035440445 -113.57487217336893,104.10507772862911 -113.762516528368)))
Time taken: 0.052 seconds, Fetched: 10 row(s)
hive>


4、执行UDTF验证结果
hive> select poipoint(floor_id,poi_name_cn,geom) as (floor,poiName,x1,y1,x2,y2) from tmp_source_poi limit 10;
OK
F8    楼梯    27.7363472618163    -76.4068597634723    27.7363472618163    -80.0131241459239
F8    楼梯    27.7363472618163    -80.0131241459239    19.72894019261    -80.0118552827063
F8    楼梯    19.72894019261    -80.0118552827063    19.502938400954    -76.4055550890202
F8    楼梯    19.502938400954    -76.4055550890202    27.7363472618163    -76.4068597634723
F8    楼梯    27.7363472618163    -76.4068597634723    NULL    NULL
F8    避风塘    142.38873291015625    -89.830322265625    142.38848876953125    -92.31036376953125
F8    避风塘    142.38848876953125    -92.31036376953125    142.3878173828125    -95.64105224609375
F8    避风塘    142.3878173828125    -95.64105224609375    151.43511962890625    -95.62530517578125
F8    避风塘    151.43511962890625    -95.62530517578125    151.4322509765625    -114.43475341796875
F8    避风塘    151.4322509765625    -114.43475341796875    145.563720703125    -114.43389892578125
Time taken: 0.059 seconds, Fetched: 10 row(s)
hive>

 

posted @ 2013-03-12 13:52  孟尧  阅读(473)  评论(0编辑  收藏  举报