Spark -- UDF

[root@centos00 ~]$ cd /opt/cdh5.14.2/hadoop-2.6.0-cdh5.14.2/
[root@centos00 hadoop-2.6.0-cdh5.14.2]$ sbin/hadoop-daemon.sh start namenode
[root@centos00 hadoop-2.6.0-cdh5.14.2]$ sbin/hadoop-daemon.sh start datanode
         
[root@centos00 ~]$ cd /opt/cdh5.14.2/hive-1.1.0-cdh5.14.2/
[root@centos00 hive-1.1.0-cdh5.14.2]$ bin/hive --service metastore &
         
[root@centos00 hadoop-2.6.0-cdh5.14.2]$ cd ../spark-2.2.1-cdh5.14.2/
[root@centos00 spark-2.2.1-cdh5.14.2]$ sbin/start-master.sh
[root@centos00 spark-2.2.1-cdh5.14.2]$ sbin/start-slaves.sh
[root@centos00 spark-2.2.1-cdh5.14.2]$ bin/spark-shell --master local[2]


scala> val df = Seq(
     | ("01", "Jack", "08012345566", "28","SALES", "1000", 1),
     | ("02", "Tom",  "08056586761", "19","MANAGEMENT", "2500", 1),
     | ("03", "Mike", "08009097878", "25","MARKET", "2000", 1),
     | ("04", "Tina", "07099661234", "30","LOGISTICS", "3000", 0),
     | ("05", "Alex", "08019208960", "18","MARKET", "3500", 1),
     | ("06", "Bob", "08011223344", "22","CLERK", "1500", 1),
     | ("07", "Dvaid", "08022557788", "25","CLERK", "2500", 1),
     | ("08", "Ben", "08080201682", "35","MARKET", "500", 1),
     | ("09", "Allen", "08099206680", "20","MARKET", "2500", 1),
     | ("10", "Caesar", "09011020806", "32","SALES", "1000", 1)).toDF("id", "name", "cellphone", "age", "department", "expense", "gender")
df: org.apache.spark.sql.DataFrame = [id: string, name: string ... 5 more fields]

scala> val convertString = spark.udf.register(
     |   "convertString",
     |   (gender: Int) => {
     |     gender match {
     |       case 0 => "F"
     |       case 1 => "M"
     |     }
     |   }
     | )
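convertString: org.apache.spark.sql.expressions.UserDefinedFunction = UserDefinedFunction(<function1>,StringType,Some(List(IntegerType)))

Note: the match inside convertString only handles 0 and 1. Any other gender value raises scala.MatchError when the query runs, so add a default branch (for example `case _ => "Unknown"`) if the data can contain other codes.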

scala> df.select(col("id"), col("name"), convertString(col("gender"))).show
+---+------+-----------+
| id|  name|UDF(gender)|
+---+------+-----------+
| 01|  Jack|          M|
| 02|   Tom|          M|
| 03|  Mike|          M|
| 04|  Tina|          F|
| 05|  Alex|          M|
| 06|   Bob|          M|
| 07| Dvaid|          M|
| 08|   Ben|          M|
| 09| Allen|          M|
| 10|Caesar|          M|
+---+------+-----------+

scala> df.select(col("*"), convertString(col("gender")) as "sex").show
+---+------+-----------+---+----------+-------+------+---+
| id|  name|  cellphone|age|department|expense|gender|sex|
+---+------+-----------+---+----------+-------+------+---+
| 01|  Jack|08012345566| 28|     SALES|   1000|     1|  M|
| 02|   Tom|08056586761| 19|MANAGEMENT|   2500|     1|  M|
| 03|  Mike|08009097878| 25|    MARKET|   2000|     1|  M|
| 04|  Tina|07099661234| 30| LOGISTICS|   3000|     0|  F|
| 05|  Alex|08019208960| 18|    MARKET|   3500|     1|  M|
| 06|   Bob|08011223344| 22|     CLERK|   1500|     1|  M|
| 07| Dvaid|08022557788| 25|     CLERK|   2500|     1|  M|
| 08|   Ben|08080201682| 35|    MARKET|    500|     1|  M|
| 09| Allen|08099206680| 20|    MARKET|   2500|     1|  M|
| 10|Caesar|09011020806| 32|     SALES|   1000|     1|  M|
+---+------+-----------+---+----------+-------+------+---+

  

posted @ 2020-08-27 13:26  初入门径  阅读(199)  评论(0)    收藏  举报