Spark UDF 使用自定义函数

 

package org.onepiece.bigdata.windows

import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.{Row, SQLContext, SaveMode, SparkSession, types}
import scala.util.matching.Regex

/**
 * Demo application: registers two Spark SQL UDFs (`udf_CardId` and
 * `regex_CardId`) and runs them over a small in-memory list of orders.
 */
object App {

  /** Pattern used by the `regex_CardId` UDF; compiled once instead of once per row. */
  private val DigitsOnly: Regex = "^[0-9]*$".r

  /**
   * Suppresses unnecessary log output by raising the Spark loggers to ERROR.
   */
  def init(): Unit = {
    import org.apache.log4j.{Level, Logger}
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
    Logger.getLogger("org.apache.spark.sql").setLevel(Level.ERROR)
  }

  /** A single input row: a user name and that user's card id (may be null or blank). */
  case class Order(user: String, cardId: String)

  /**
   * Entry point: builds a local SparkSession, registers the UDFs, exposes the
   * sample orders as the temp view `cte_order`, and prints the query result.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    init()

    val conf = new SparkConf().setMaster("local[*]").setAppName("spark-udf-test")
    val spark = SparkSession.builder()
      .config(conf)
      .getOrCreate()

    try {
      import spark.implicits._

      // Register the UDFs before any SQL references them.
      udf_CardId(spark)
      regex_CardId(spark)

      val orders = List(
        Order("AA", "445321199910201030"),
        Order("BB", "44532120100910103X"),
        Order("CC", ""),
        Order("SS", "445198010201030"),
        Order("TT", "44520110910103X"),
        Order("NN", null),
        Order("XX", "445321"),
        Order("YY", "441900X")
      )

      orders.toDF().createOrReplaceTempView("cte_order")

      val result = spark.sql(
        """
          |select
          |  user, cardId
          |  ,udf_CardId(cardId) as udf_cardId
          |  ,regex_CardId(cardId) as regex_cardId
          |from cte_order
          |order by user
        """.stripMargin)

      result.show()
    } finally {
      // Release the session even when the query fails.
      spark.stop()
    }
  }

  /**
   * Registers the SQL function `udf_CardId`.
   *
   * The function returns the last 6 characters of the trimmed input when it is
   * exactly 18 characters long, otherwise the last 4 characters (or the whole
   * value when shorter). Null or blank input yields an empty string.
   *
   * @param sparkSession session whose UDF registry receives the function
   * @return the registered user-defined function
   */
  def udf_CardId(sparkSession: SparkSession): UserDefinedFunction =
    sparkSession.udf.register("udf_CardId", (str: String) => cardIdSuffix(str))

  /**
   * Registers the SQL function `regex_CardId`.
   *
   * The function returns true when the trimmed input is non-empty and matches
   * `^[0-9]*$`; null or blank input yields false.
   *
   * @param sparkSession session whose UDF registry receives the function
   * @return the registered user-defined function
   */
  def regex_CardId(sparkSession: SparkSession): UserDefinedFunction =
    sparkSession.udf.register("regex_CardId", (str: String) => regex_string(str, DigitsOnly))

  /**
   * Tests whether the trimmed `str` contains a match for the pattern `regex`.
   *
   * @param str   text to test; null or blank yields false
   * @param regex pattern source, compiled only when `str` is non-blank
   * @return true if a match is found in the trimmed text
   */
  def regex_string(str: String, regex: String): Boolean =
    trimmedNonEmpty(str).exists(s => new Regex(regex).findFirstMatchIn(s).nonEmpty)

  /**
   * Tests whether the trimmed `str` contains a match for the compiled `regex`.
   *
   * @param str   text to test; null or blank yields false
   * @param regex compiled pattern
   * @return true if a match is found in the trimmed text
   */
  def regex_string(str: String, regex: Regex): Boolean =
    trimmedNonEmpty(str).exists(s => regex.findFirstMatchIn(s).nonEmpty)

  /** Suffix rule behind `udf_CardId`: 6 characters for 18-character ids, otherwise 4. */
  private def cardIdSuffix(str: String): String =
    trimmedNonEmpty(str).fold("") { s =>
      if (s.length == 18) s.takeRight(6) else s.takeRight(4)
    }

  /** Trims `str`, mapping null and blank values to None. */
  private def trimmedNonEmpty(str: String): Option[String] =
    Option(str).map(_.trim).filter(_.nonEmpty)
}
root
 |-- user: string (nullable = true)
 |-- cardId: string (nullable = true)

+----+------------------+----------+------------+
|user|            cardId|udf_cardId|regex_cardId|
+----+------------------+----------+------------+
|  AA|445321199910201030|    201030|        true|
|  BB|44532120100910103X|    10103X|       false|
|  CC|                  |          |       false|
|  NN|              null|          |       false|
|  SS|   445198010201030|      1030|        true|
|  TT|   44520110910103X|      103X|       false|
|  XX|            445321|      5321|        true|
|  YY|           441900X|      900X|       false|
+----+------------------+----------+------------+

 

posted @ 2021-03-07 11:03  茗::流  阅读(588)  评论(0)    收藏  举报
如有雷同,纯属参考。如有侵犯你的版权,请联系我。