SparkSQL行列变化
【1】数据 文件 rowcolumnData 数据如下
username,item,price
zhangsan,A,1
zhangsan,B,2
zhangsan,C,3
lisi,A,4
lisi,C,5
zhangsan,D,6
lisi,B,7
wangwu,C,8
【2】scala代码实现
1 package com.it.baizhan.scalacode.sparksql.examples 2 3 import org.apache.spark.sql.SparkSession 4 5 /** 6 * SparkSQL行列变化: 7 * +--------+----+-----+ 8 * |username|item|price| +--------+----+----+---+----+ 9 * +--------+----+-----+ |username| A| B| C| D| 10 * |zhangsan| A| 1| +--------+----+----+---+----+ 11 * |zhangsan| B| 2| | wangwu|null|null| 8|null| 12 * |zhangsan| C| 3| |zhangsan| 1| 2| 3| 6| 13 * | lisi| A| 4| | lisi| 4| 7| 5|null| 14 * | lisi| C| 5| +--------+----+----+---+----+ 15 * |zhangsan| D| 6| 16 * | lisi| B| 7| 17 * | wangwu| C| 8| 18 * +--------+----+-----+ 19 * SQL函数: 20 * str_to_map(字段,分隔符1,分隔符2) : 把当前字符串字段按照分隔符1切分成多条数据,再对每条数据按照分隔符2切割成K,V格式的数据组成Map 21 * map(K1,V1,K2,V2,K3,V3... ...) : 得到一个map集合 22 */ 23 object RowColumnTransfer2 { 24 def main(args: Array[String]): Unit = { 25 val session = SparkSession.builder().appName("test").master("local").getOrCreate() 26 session.sparkContext.setLogLevel("Error") 27 val frame = session.read.option("header",true).csv("./data/rowcolumnData") 28 frame.createTempView("temp1") 29 session.sql( 30 """ 31 | select 32 | username,concat_ws("#",collect_list(concat(item,",",price))) as cw 33 | from temp1 34 | group by username 35 """.stripMargin).createTempView("temp2") 36 37 /** 38 * +--------+---------------+ 39 * |username|cw | 40 * +--------+---------------+ 41 * |wangwu |C,8 | 42 * |zhangsan|A,1#B,2#C,3#D,6| 43 * |lisi |A,4#C,5#B,7 | 44 * +--------+---------------+ 45 */ 46 session.sql( 47 """ 48 | select 49 | username,str_to_map(cw,"#",",") as mp 50 | from temp2 51 """.stripMargin).createTempView("temp3") 52 53 /** 54 * +--------+--------------------------------+ 55 * |username|mp | 56 * +--------+--------------------------------+ 57 * |wangwu |[C -> 8] | 58 * |zhangsan|[A -> 1, B -> 2, C -> 3, D -> 6]| 59 * |lisi |[A -> 4, C -> 5, B -> 7] | 60 * +--------+--------------------------------+ 61 */ 62 63 session.sql( 64 """ 65 | select username ,mp['A'] as A,mp['B'] as B ,mp['C'] as C ,mp['D'] as D 66 | from temp3 67 """.stripMargin).createTempView("temp4") 68 69 /** 70 * +--------+----+----+---+----+ 71 * |username|A |B |C |D | 72 * +--------+----+----+---+----+ 73 * |wangwu |null|null|8 |null| 74 * |zhangsan|1 |2 |3 |6 | 75 * |lisi |4 |7 |5 |null| 76 * +--------+----+----+---+----+ 77 */ 78 79 session.sql( 80 """ 81 | select username,item,price 82 | from 83 | (select 84 | username,explode(map("A",A,"B",B,"C",C,"D",D)) as (item,price) 85 | from temp4) ttt 86 | where price is not null 87 """.stripMargin).show(false) 88 89 /** 90 * +--------+----+-----+ 91 * |username|item|price| 92 * +--------+----+-----+ 93 * |wangwu |C |8 | 94 * |zhangsan|A |1 | 95 * |zhangsan|B |2 | 96 * |zhangsan|C |3 | 97 * |zhangsan|D |6 | 98 * |lisi |A |4 | 99 * |lisi |B |7 | 100 * |lisi |C |5 | 101 * +--------+----+-----+ 102 */ 103 } 104 105 }

浙公网安备 33010602011771号