1. Add the dependency jars (pom.xml)
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.mengyao.dataformat</groupId>
  <artifactId>hortonworks</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <packaging>jar</packaging>
  <name>hortonworks</name>
  <url>http://maven.apache.org</url>

  <repositories>
    <!-- Hortonworks -->
    <repository>
      <releases>
        <enabled>true</enabled>
        <updatePolicy>always</updatePolicy>
        <checksumPolicy>warn</checksumPolicy>
      </releases>
      <snapshots>
        <enabled>false</enabled>
        <updatePolicy>never</updatePolicy>
        <checksumPolicy>fail</checksumPolicy>
      </snapshots>
      <id>HDPReleases</id>
      <name>HDP Releases</name>
      <url>http://repo.hortonworks.com/content/repositories/releases/</url>
      <layout>default</layout>
    </repository>
    <!-- Cloudera -->
    <!--
    <repository>
      <id>cloudera</id>
      <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
    </repository>
    -->
  </repositories>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <junit.version>4.10</junit.version>
    <hortonworks.hadoop.version>2.7.1.2.3.2.0-2950</hortonworks.hadoop.version>
    <hortonworks.hive.version>1.2.1.2.3.2.0-2950</hortonworks.hive.version>
    <slf4j.version>1.7.10</slf4j.version>
  </properties>

  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>${junit.version}</version>
      <scope>test</scope>
    </dependency>
    <!-- tools.jar: ${java.home} points at the JRE inside the JDK, hence the "/.." -->
    <dependency>
      <groupId>jdk.tools</groupId>
      <artifactId>jdk.tools</artifactId>
      <version>1.7</version>
      <scope>system</scope>
      <systemPath>${java.home}/../lib/tools.jar</systemPath>
    </dependency>
    <dependency>
      <groupId>org.mortbay.jetty</groupId>
      <artifactId>jetty</artifactId>
      <version>6.1.26</version>
    </dependency>
    <!-- Hortonworks Hadoop -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-common</artifactId>
      <version>${hortonworks.hadoop.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-hdfs</artifactId>
      <version>${hortonworks.hadoop.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-mapreduce-client-core</artifactId>
      <version>${hortonworks.hadoop.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
      <version>${hortonworks.hadoop.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-mapreduce-client-common</artifactId>
      <version>${hortonworks.hadoop.version}</version>
    </dependency>
    <!-- Hortonworks Hive -->
    <dependency>
      <groupId>org.apache.hive</groupId>
      <artifactId>hive-jdbc</artifactId>
      <version>${hortonworks.hive.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hive</groupId>
      <artifactId>hive-exec</artifactId>
      <version>${hortonworks.hive.version}</version>
    </dependency>
    <!-- slf4j -->
    <dependency>
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-api</artifactId>
      <version>${slf4j.version}</version>
    </dependency>
    <dependency>
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-log4j12</artifactId>
      <version>${slf4j.version}</version>
    </dependency>
  </dependencies>
</project>
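Note: on a machine that already runs HDP, the Hadoop and Hive dependencies above are typically given <scope>provided</scope>, since the cluster supplies those jars at runtime and they do not need to be bundled into the UDF jar.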
2. Write the custom Hive UDF
package com.mengyao.hadoop.hortonworks.hive.udf;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
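/**
 * Hive UDF that splits a Chinese address string into tab-separated
 * province/city/district/county/street/road/other segments.
 */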
public class AddrSplitUDF extends UDF {
    public static class AddrBean implements Writable {

        private String province;
        private String city;
        private String region;
        private String county;
        private String street;
        private String road;
        private String other;
        private String make;

        public AddrBean() {
        }

        @Override
        public void readFields(DataInput in) throws IOException {
            this.province = in.readUTF();
            this.city = in.readUTF();
            this.region = in.readUTF();
            this.county = in.readUTF();
            this.street = in.readUTF();
            this.road = in.readUTF();
            this.other = in.readUTF();
            this.make = in.readUTF();
        }

        // Note: writeUTF(null) throws a NullPointerException, so every field
        // must be set before this bean is serialized as a Writable.
        @Override
        public void write(DataOutput out) throws IOException {
            out.writeUTF(province);
            out.writeUTF(city);
            out.writeUTF(region);
            out.writeUTF(county);
            out.writeUTF(street);
            out.writeUTF(road);
            out.writeUTF(other);
            out.writeUTF(make);
        }

        public void set(String province, String city, String region, String county,
                String street, String road, String other, String make) {
            this.province = province;
            this.city = city;
            this.region = region;
            this.county = county;
            this.street = street;
            this.road = road;
            this.other = other;
            this.make = make;
        }

        public String getProvince() {
            return province;
        }

        public void setProvince(String province) {
            this.province = province;
        }

        public String getCity() {
            return city;
        }

        public void setCity(String city) {
            this.city = city;
        }

        public String getRegion() {
            return region;
        }

        public void setRegion(String region) {
            this.region = region;
        }

        public String getCounty() {
            return county;
        }

        public void setCounty(String county) {
            this.county = county;
        }

        public String getStreet() {
            return street;
        }

        public void setStreet(String street) {
            this.street = street;
        }

        public String getRoad() {
            return road;
        }

        public void setRoad(String road) {
            this.road = road;
        }

        public String getOther() {
            return other;
        }

        public void setOther(String other) {
            this.other = other;
        }

        public String getMake() {
            return make;
        }

        public void setMake(String make) {
            this.make = make;
        }

        @Override
        public String toString() {
            // Tab-separated output; fields that were never matched print as "null".
            return province + "\t" + city + "\t" + region + "\t" + county + "\t"
                    + street + "\t" + road + "\t" + other + "\t" + make;
        }
    }
    // Compiled once: the UDF runs per row, so avoid recompiling the pattern on
    // every call. Each alternative captures a run of characters ending in
    // 省/市/区/县/街/路 (province/city/district/county/street/road); anything
    // left over falls through to the final ".*" alternative.
    private static final Pattern ADDR_PATTERN =
            Pattern.compile("(((.*省)|(.*市)|(.*区)|(.*县)|(.*街)|(.*路)).*?|.*)");

    public static AddrBean splitAddr(String addrStr) {
        Matcher matcher = ADDR_PATTERN.matcher(addrStr);
        AddrBean addr = new AddrBean();
        while (matcher.find()) {
            String str = matcher.group();
            if (str.length() > 0) {
                if (str.endsWith("省")) {
                    addr.setProvince(str);
                } else if (str.endsWith("市")) {
                    addr.setCity(str);
                } else if (str.endsWith("区")) {
                    addr.setRegion(str);
                } else if (str.endsWith("县")) {
                    addr.setCounty(str);
                } else if (str.endsWith("街")) {
                    addr.setStreet(str);
                } else if (str.endsWith("路")) {
                    addr.setRoad(str);
                } else {
                    addr.setOther(str);
                }
            }
        }
        return addr;
    }
    // Called by Hive once per row; a null input yields a null output.
    public Text evaluate(final Text addr) {
        if (null == addr) {
            return null;
        }
        // splitAddr never returns null, so this check is purely defensive.
        AddrBean splitAddr = splitAddr(addr.toString());
        if (null == splitAddr) {
            return null;
        }
        return new Text(splitAddr.toString());
    }
}
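Since junit is already on the test classpath (step 1), the split logic can be sanity-checked locally before deploying to the cluster. This is a minimal sketch; the test class name, the sample address, and the segments it is expected to yield are illustrative assumptions:
package com.mengyao.hadoop.hortonworks.hive.udf;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNull;

import org.apache.hadoop.io.Text;
import org.junit.Test;

public class AddrSplitUDFTest {

    @Test
    public void splitsProvinceCityDistrict() {
        AddrSplitUDF udf = new AddrSplitUDF();
        // Hypothetical address: province + city + district + road + house number.
        Text result = udf.evaluate(new Text("河北省石家庄市长安区中山东路100号"));
        String[] parts = result.toString().split("\t");
        assertEquals("河北省", parts[0]);   // province
        assertEquals("石家庄市", parts[1]); // city
        assertEquals("长安区", parts[2]);   // region (district)
    }

    @Test
    public void nullInputYieldsNull() {
        assertNull(new AddrSplitUDF().evaluate((Text) null));
    }
}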
3. Package as a jar file: splitAddr-0.0.1-SNAPSHOT.jar
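With the pom.xml from step 1, the build is a plain Maven package:
mvn clean package
Note that the artifactId above is hortonworks, so by default Maven produces hortonworks-0.0.1-SNAPSHOT.jar; either rename the jar or add <finalName>splitAddr-0.0.1-SNAPSHOT</finalName> to the <build> section so the name matches the one used in step 4.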
4. Register the custom UDF in the Hive session (temporary)
4.1: Add a temporary function from within a Hive session
hive
hive> add jar /home/rta/mengyao/apps/udf/splitAddr-0.0.1-SNAPSHOT.jar;
Added [/home/rta/mengyao/apps/udf/splitAddr-0.0.1-SNAPSHOT.jar] to class path
Added resources: [/home/rta/mengyao/apps/udf/splitAddr-0.0.1-SNAPSHOT.jar]
hive> create temporary function splitAddr as 'com.mengyao.hadoop.hortonworks.hive.udf.AddrSplitUDF';
OK
Time taken: 0.444 seconds
Verify: select splitAddr(shop_addr) from rtc_nuomi limit 10;
4.2: Load an init script when starting Hive
Create an init script, e.g. vim init_func_splitAddr, with the following contents:
add jar /home/rta/mengyao/apps/udf/splitAddr-0.0.1-SNAPSHOT.jar;
create temporary function splitAddr as 'com.mengyao.hadoop.hortonworks.hive.udf.AddrSplitUDF';
Then start Hive with the script:
hive -i init_func_splitAddr
Verify: select splitAddr(shop_addr) from rtc_nuomi limit 10;
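Both approaches above register the function only for the lifetime of a session. As a permanent alternative, Hive 0.13 and later (including the Hive 1.2.1 used here) supports CREATE FUNCTION ... USING JAR with the jar uploaded to HDFS; the HDFS path below is a hypothetical example:
create function splitAddr as 'com.mengyao.hadoop.hortonworks.hive.udf.AddrSplitUDF' using jar 'hdfs:///apps/udf/splitAddr-0.0.1-SNAPSHOT.jar';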