Rows to Columns (Pivot) in Spark SQL

First bring up HDFS, YARN, the Hive metastore, and the Spark standalone master and workers:
[root@centos00 ~]$ cd hadoop-2.6.0-cdh5.14.2/
[root@centos00 hadoop-2.6.0-cdh5.14.2]$ sbin/hadoop-daemon.sh start namenode
[root@centos00 hadoop-2.6.0-cdh5.14.2]$ sbin/hadoop-daemon.sh start datanode
[root@centos00 hadoop-2.6.0-cdh5.14.2]$ sbin/yarn-daemon.sh start resourcemanager
  
[root@centos00 ~]$ cd /opt/cdh5.14.2/hive-1.1.0-cdh5.14.2/
[root@centos00 hive-1.1.0-cdh5.14.2]$ bin/hive --service metastore &
  
[root@centos00 ~]$ cd /opt/cdh5.14.2/spark-2.2.1-cdh5.14.2/
[root@centos00 spark-2.2.1-cdh5.14.2]$ sbin/start-master.sh
[root@centos00 spark-2.2.1-cdh5.14.2]$ sbin/start-slaves.sh
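
As a quick sanity check (an extra step, not in the original post), jps should now list NameNode, DataNode, ResourceManager, the metastore's RunJar process, and the Spark Master and Worker:

[root@centos00 ~]$ jps

Then, in a spark-shell session on this cluster, build a small sample table of salaries per id and name: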

scala> val df = Seq((1,"Jack",50),(1,"Tony",100),(1,"Alex",125),(2,"Jack",75),(2,"Tony",150),(2,"Alex",175)).toDF("id","name","salary")
df: org.apache.spark.sql.DataFrame = [id: int, name: string ... 1 more field]

scala> df.show(false)
+---+----+------+
|id |name|salary|
+---+----+------+
|1  |Jack|50    |
|1  |Tony|100   |
|1  |Alex|125   |
|2  |Jack|75    |
|2  |Tony|150   |
|2  |Alex|175   |
+---+----+------+

groupBy("id").pivot("name") turns each distinct name into its own column, and max("salary") fills each (id, name) cell; since every (id, name) pair occurs exactly once in this data, any aggregate would return the same values:
scala> val df2 = df.groupBy("id").pivot("name").max("salary")
df2: org.apache.spark.sql.DataFrame = [id: int, Alex: int ... 2 more fields]

scala> df2.show(false)
+---+----+----+----+
|id |Alex|Jack|Tony|
+---+----+----+----+
|1  |125 |50  |100 |
|2  |175 |75  |150 |
+---+----+----+----+
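
Two variations are worth knowing. Both are sketches against the df built above, not commands from the original session:

import org.apache.spark.sql.functions.{col, max, when}

// 1) Same pivot, with the column values supplied up front so Spark can
//    skip the extra job that first collects the distinct names.
val df3 = df.groupBy("id").pivot("name", Seq("Alex", "Jack", "Tony")).max("salary")

// 2) The classic hand-rolled row-to-column pattern: one conditional
//    aggregate per target column (max ignores the nulls that when()
//    produces for non-matching rows).
val df4 = df.groupBy("id").agg(
  max(when(col("name") === "Alex", col("salary"))).as("Alex"),
  max(when(col("name") === "Jack", col("salary"))).as("Jack"),
  max(when(col("name") === "Tony", col("salary"))).as("Tony")
)

df3.show(false) and df4.show(false) should both print the same table as df2.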

  
