需求:
原始数据为
F8 楼梯 MULTIPOLYGON(((27.7363472618163 -76.4068597634723,27.7363472618163 -80.0131241459239,19.72894019261 -80.0118552827063,19.502938400954 -76.4055550890202,27.7363472618163 -76.4068597634723)))
我自定义的UDTF需要把POI数据使用0、1;1、2、2、3;3、4...规则多行输出,如下:
F8 楼梯 27.7363472618163 -76.4068597634723 27.7363472618163 -80.0131241459239
F8 楼梯 27.7363472618163 -80.0131241459239 19.72894019261 -80.0118552827063
F8 楼梯 19.72894019261 -80.0118552827063 19.502938400954 -76.4055550890202
F8 楼梯 19.502938400954 -76.4055550890202 27.7363472618163 -76.4068597634723
如下使用自定义UDTF实现
1、添加依赖jar
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.mengyao.dataformat</groupId>
<artifactId>hortonworks</artifactId>
<version>0.0.1-SNAPSHOT</version>
<packaging>jar</packaging>
<name>hortonworks</name>
<url>http://maven.apache.org</url>
<repositories>
<!-- hortonworks -->
<repository>
<releases>
<enabled>true</enabled>
<updatePolicy>always</updatePolicy>
<checksumPolicy>warn</checksumPolicy>
</releases>
<snapshots>
<enabled>false</enabled>
<updatePolicy>never</updatePolicy>
<checksumPolicy>fail</checksumPolicy>
</snapshots>
<id>HDPReleases</id>
<name>HDP Releases</name>
<url>http://repo.hortonworks.com/content/repositories/releases/</url>
<layout>default</layout>
</repository>
<!-- cloudera -->
<!--
<repository>
<id>cloudera</id>
<url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
</repository>
-->
</repositories>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<junit.version>4.10</junit.version>
<hortonworks.hadoop.version>2.7.1.2.3.2.0-2950</hortonworks.hadoop.version>
<hortonworks.hive.version>1.2.1.2.3.2.0-2950</hortonworks.hive.version>
<slf4j.version>1.7.10</slf4j.version>
</properties>
<dependencies>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>${junit.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>jdk.tools</groupId>
<artifactId>jdk.tools</artifactId>
<version>1.7</version>
<scope>system</scope>
<systemPath>${JAVA_HOME}/lib/tools.jar</systemPath>
</dependency>
<dependency>
<groupId>org.mortbay.jetty</groupId>
<artifactId>jetty</artifactId>
<version>6.1.26</version>
</dependency>
<!-- HortonWorks Hadoop -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>${hortonworks.hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>${hortonworks.hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-core</artifactId>
<version>${hortonworks.hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-jobclient</artifactId>
<version>${hortonworks.hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-common</artifactId>
<version>${hortonworks.hadoop.version}</version>
</dependency>
<!-- Hortonworks Hive -->
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-jdbc</artifactId>
<version>${hortonworks.hive.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-exec</artifactId>
<version>${hortonworks.hive.version}</version>
</dependency>
<!-- slf4j -->
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<version>${slf4j.version}</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<version>${slf4j.version}</version>
</dependency>
</dependencies>
</project>
2、自定义UDTF
package com.mengyao.hadoop.hortonworks.hive.udf;
import java.util.ArrayList;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
public class Udf1 extends GenericUDTF {
@Override
public StructObjectInspector initialize(ObjectInspector[] args) throws UDFArgumentException {
if (args.length!=3) {
throw new UDFArgumentLengthException("参数小于3个字段");
}
if (args[0].getCategory() != ObjectInspector.Category.PRIMITIVE) {
throw new UDFArgumentException("参数不是基本类型");
}
ArrayList<String> fieldNames = new ArrayList<String>();
ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
fieldNames.add("floor");
fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
fieldNames.add("poiName");
fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
fieldNames.add("x1");
fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
fieldNames.add("y1");
fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
fieldNames.add("x2");
fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
fieldNames.add("y2");
fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames,fieldOIs);
}
@Override
public void close() throws HiveException {
}
@Override
public void process(Object[] args) throws HiveException {
final String floor = args[0].toString();
final String poiNameCn = args[1].toString();
final String geom = args[2].toString();
String input = geom.replace("MULTIPOLYGON(((", "").replace(")))", "");
//5
String[] fields = input.split(",");
for(int i=0; i<fields.length; i++) {
try {
String[] arr = new String[6];
arr[0]=floor;
arr[1]=poiNameCn;
String[] tmp1 = fields[i].split(" ");
arr[2]=tmp1[0];
arr[3]=tmp1[1];
if (i+1<fields.length) {
String[] tmp2 = fields[i+1].split(" ");
arr[4]=tmp2[0];
arr[5]=tmp2[1];
}
forward(arr);
} catch (Exception e) {
continue;
}
}
}
}
3、原始数据
hive> select floor_id,poi_name_cn,geom from tmp_source_poi limit 10;
OK
F8 楼梯 MULTIPOLYGON(((27.7363472618163 -76.4068597634723,27.7363472618163 -80.0131241459239,19.72894019261 -80.0118552827063,19.502938400954 -76.4055550890202,27.7363472618163 -76.4068597634723)))
F8 避风塘 MULTIPOLYGON(((142.38873291015625 -89.830322265625,142.38848876953125 -92.31036376953125,142.3878173828125 -95.64105224609375,151.43511962890625 -95.62530517578125,151.4322509765625 -114.43475341796875,145.563720703125 -114.43389892578125,140.51824951171875 -114.43310546875,140.51849365234375 -111.59051513671875,140.52069091796875 -98.031005859375,125.305908203125 -98.02862548828125,125.3070068359375 -92.30804443359375,128.85321044921875 -92.30859375,138.40338134765625 -92.309814453125,138.40386962890625 -89.82965087890625,142.38873291015625 -89.830322265625)))
F8 咖啡时间 MULTIPOLYGON(((73.9320653229952 -73.1165769445031,73.9267330393195 -99.5226487368938,60.0283501408994 -99.5200368016607,42.2555900737643 -99.5584639401271,42.2556011453271 -95.2023675085028,39.3781932592392 -95.2019115369498,38.6626852564514 -95.2017981567676,38.6649185493588 -81.1082055937875,41.4105622805655 -81.108640672537,41.4118176437914 -73.1864483631389,73.9320653229952 -73.1165769445031)))
F8 电梯 MULTIPOLYGON(((27.7363472618163 -73.7509837455356,27.7363472618163 -76.4068597521423,27.0886035524309 -76.4067571138262,19.502938400954 -76.4055550890202,19.336468026042 -73.7496526911885,27.7363472618163 -73.7509837455356)))
F8 电梯 MULTIPOLYGON(((71.791599329561 -120.789716028472,71.7846846170723 -123.743601208178,63.5224315151572 -123.589247200859,63.5290417298674 -120.765438285686,71.791599329561 -120.789716028472)))
F8 电梯 MULTIPOLYGON(((151.438877351582 -71.140565793033,143.380713831633 -71.1289797308272,143.41783554107 -67.7106754227564,151.439391598105 -67.8952907441503,151.438877351582 -71.140565793033)))
F8 自动扶梯 MULTIPOLYGON(((129.16515929997 -72.9842409176565,129.16515929997 -69.2177793933323,136.625542271882 -69.2177793933323,136.599083889276 -72.9842409176565,129.16515929997 -72.9842409176565)))
F8 自动扶梯 MULTIPOLYGON(((90.0243225693702 -73.1198265228713,90.0237087868154 -76.1593672441145,83.4253928624093 -76.2595438690263,83.4661638364195 -73.1185022099924,90.0243225693702 -73.1198265228713)))
F8 卡乐时代 MULTIPOLYGON(((56.796247374266386 -117.50865583121777,56.6748490780592 -117.49650815501809,48.22472807019949 -117.49516914784908,48.26815593987703 -123.26608142256737,47.20688885077834 -123.24331614375114,47.17912720888853 -121.70872777327895,23.973091162741184 -120.98440674319863,22.212835498154163 -95.15799333155155,27.736347261816263 -95.15799333155155,27.736347261816263 -97.24820585176349,28.933990716934204 -97.24888809770346,29.609365351498127 -97.24910343065858,39.72476739436388 -97.26569728925824,39.74184823036194 -100.99478274583817,56.65072625130415 -100.9862755574286,56.796247374266386 -117.50865583121777)))
F8 一味一诚 MULTIPOLYGON(((104.10507772862911 -113.762516528368,104.10343369841576 -124.07710339874029,71.78468461707234 -123.74360121414065,71.80045375600457 -117.0072059892118,80.00307461991906 -117.11536817625165,80.00307461991906 -111.21669100224972,85.23863035440445 -113.57487217336893,104.10507772862911 -113.762516528368)))
Time taken: 0.052 seconds, Fetched: 10 row(s)
hive>
4、执行UDTF验证结果
hive> select poipoint(floor_id,poi_name_cn,geom) as (floor,poiName,x1,y1,x2,y2) from tmp_source_poi limit 10;
OK
F8 楼梯 27.7363472618163 -76.4068597634723 27.7363472618163 -80.0131241459239
F8 楼梯 27.7363472618163 -80.0131241459239 19.72894019261 -80.0118552827063
F8 楼梯 19.72894019261 -80.0118552827063 19.502938400954 -76.4055550890202
F8 楼梯 19.502938400954 -76.4055550890202 27.7363472618163 -76.4068597634723
F8 楼梯 27.7363472618163 -76.4068597634723 NULL NULL
F8 避风塘 142.38873291015625 -89.830322265625 142.38848876953125 -92.31036376953125
F8 避风塘 142.38848876953125 -92.31036376953125 142.3878173828125 -95.64105224609375
F8 避风塘 142.3878173828125 -95.64105224609375 151.43511962890625 -95.62530517578125
F8 避风塘 151.43511962890625 -95.62530517578125 151.4322509765625 -114.43475341796875
F8 避风塘 151.4322509765625 -114.43475341796875 145.563720703125 -114.43389892578125
Time taken: 0.059 seconds, Fetched: 10 row(s)
hive>