Integrating Spark with Kudu (Spark shell)

[root@centos00 hadoop-2.6.0-cdh5.14.2]$ sbin/hadoop-daemon.sh start namenode
[root@centos00 hadoop-2.6.0-cdh5.14.2]$ sbin/hadoop-daemon.sh start datanode
[root@centos00 hadoop-2.6.0-cdh5.14.2]$ sbin/yarn-daemon.sh start resourcemanager 

[root@centos00 ~]$ cd /opt/cdh5.14.2/hive-1.1.0-cdh5.14.2/
[root@centos00 hive-1.1.0-cdh5.14.2]$ bin/hive --service metastore &

[root@centos00 ~]$ cd /opt/cdh5.14.2/spark-2.2.1-cdh5.14.2/
[root@centos00 spark-2.2.1-cdh5.14.2]$ sbin/start-master.sh
[root@centos00 spark-2.2.1-cdh5.14.2]$ sbin/start-slaves.sh 

[root@centos00 ~]$ service kudu-master start
[root@centos00 ~]$ service kudu-tserver start

[root@centos00 spark-2.2.1-cdh5.14.2]$ bin/spark-shell --master local[2]  --packages org.apache.kudu:kudu-spark2_2.11:1.8.0
Ivy Default Cache set to: /home/root/.ivy2/cache
The jars for the packages stored in: /home/root/.ivy2/jars
:: loading settings :: url = jar:file:/opt/cdh5.14.2/spark-2.2.1-cdh5.14.2/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml
org.apache.kudu#kudu-spark2_2.11 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent;1.0
	confs: [default]
	found org.apache.kudu#kudu-spark2_2.11;1.8.0 in central
:: resolution report :: resolve 204ms :: artifacts dl 14ms
	:: modules in use:
	org.apache.kudu#kudu-spark2_2.11;1.8.0 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   1   |   0   |   0   |   0   ||   1   |   0   |
	---------------------------------------------------------------------
:: retrieving :: org.apache.spark#spark-submit-parent
	confs: [default]
	0 artifacts copied, 1 already retrieved (0kB/6ms)
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
20/09/09 18:59:25 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Spark context Web UI available at http://192.168.208.3:4040
Spark context available as 'sc' (master = local[2], app id = local-1599645566105).
Spark session available as 'spark'.
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 2.2.1
      /_/
         
Using Scala version 2.11.8 (Java HotSpot(TM) 64-Bit Server VM, Java 1.8.0_112)
Type in expressions to have them evaluated.
Type :help for more information.

===================================================== Read ===================================================== 

scala> import org.apache.kudu.spark.kudu._
import org.apache.kudu.spark.kudu._

scala> val df = spark.read.options(Map("kudu.master" -> "centos00:7051", "kudu.table" -> "my_kudu_table")).kudu
df: org.apache.spark.sql.DataFrame = [id: bigint, name: string ... 1 more field]

scala> df.show(3,false)
+---+-------+------+
|id |name   |salary|
+---+-------+------+
|9  |Richard|1800  |
|2  |Tony   |1000  |
|3  |David  |2000  |
+---+-------+------+
only showing top 3 rows

scala> df.select("id","name").show(3,false)
+---+-------+
|id |name   |
+---+-------+
|9  |Richard|
|2  |Tony   |
|3  |David  |
+---+-------+
only showing top 3 rows

scala> df.select("id","name","salary").filter("salary < 1500").show(3,false)
+---+----+------+
|id |name|salary|
+---+----+------+
|2  |Tony|1000  |
|6  |Alex|1400  |
+---+----+------+

// Register a temporary table (registerTempTable is deprecated since Spark 2.0; prefer createOrReplaceTempView, shown below)
scala> df.registerTempTable("t")

scala> spark.sql("select id, name, salary from t where id = 1").show(false)
+---+-------+------+
|id |name   |salary|
+---+-------+------+
|1  |Phoenix|2000  |
+---+-------+------+

// Create a temporary view
scala> df.createOrReplaceTempView("v")

scala> spark.sql("select id, name, salary from v where id = 1").show(false)
+---+-------+------+
|id |name   |salary|
+---+-------+------+
|1  |Phoenix|2000  |
+---+-------+------+

===================================================== insertRows ===================================================== 

// Create a KuduContext object
scala> import org.apache.kudu.spark.kudu.KuduContext
import org.apache.kudu.spark.kudu.KuduContext

scala> val kc = new KuduContext("centos00:7051",spark.sparkContext)
kc: org.apache.kudu.spark.kudu.KuduContext = org.apache.kudu.spark.kudu.KuduContext@70f5f57d

scala> df.schema
res0: org.apache.spark.sql.types.StructType = StructType(StructField(id,LongType,false), StructField(name,StringType,true), StructField(salary,StringType,true))

scala> df.schema.printTreeString
root
 |-- id: long (nullable = false)
 |-- name: string (nullable = true)
 |-- salary: string (nullable = true)


scala> import org.apache.kudu.client._
import org.apache.kudu.client._

scala> import collection.JavaConverters._
import collection.JavaConverters._

scala> kc.createTable("mykudu", df.schema, Seq("id"), new CreateTableOptions().setNumReplicas(1).addHashPartitions(List("id").asJava,3))
res1: org.apache.kudu.client.KuduTable = org.apache.kudu.client.KuduTable@ed6d97f

scala> val tmp = df.filter("id < 6")
tmp: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: bigint, name: string ... 1 more field]

scala> tmp.show
+---+-------+------+
| id|   name|salary|
+---+-------+------+
|  2|   Tony|  1000|
|  3|  David|  2000|
|  1|Phoenix|  2000|
|  5|   Jimy|  1900|
|  4|   Mike|  1500|
+---+-------+------+

scala> kc.insertRows(tmp, "mykudu")

scala> val m  = spark.read.options(Map("kudu.master" -> "centos00:7051", "kudu.table" -> "mykudu")).kudu
m: org.apache.spark.sql.DataFrame = [id: bigint, name: string ... 1 more field]

scala> m.show(false)
+---+-------+------+
|id |name   |salary|
+---+-------+------+
|1  |Phoenix|2000  |
|5  |Jimy   |1900  |
|2  |Tony   |1000  |
|3  |David  |2000  |
|4  |Mike   |1500  |
+---+-------+------+

===================================================== deleteRows =====================================================

scala> df.show(false)
+---+-------+------+
|id |name   |salary|
+---+-------+------+
|9  |Richard|1800  |
|2  |Tony   |1000  |
|3  |David  |2000  |
|10 |Phoniex|null  |
|1  |Phoenix|2000  |
|8  |Kevin  |8000  |
|5  |Jimy   |1900  |
|6  |Alex   |1400  |
|7  |Bob    |1600  |
|4  |Mike   |1500  |
+---+-------+------+

scala> tmp.show(false)
+---+-------+------+
|id |name   |salary|
+---+-------+------+
|2  |Tony   |1000  |
|3  |David  |2000  |
|1  |Phoenix|2000  |
|5  |Jimy   |1900  |
|4  |Mike   |1500  |
+---+-------+------+

scala> kc.deleteRows(tmp.select("id"), "my_kudu_table")

scala> df.show
+---+-------+------+
| id|   name|salary|
+---+-------+------+
|  9|Richard|  1800|
| 10|Phoniex|  null|
|  8|  Kevin|  8000|
|  6|   Alex|  1400|
|  7|    Bob|  1600|
+---+-------+------+

===================================================== upsertRows ===================================================== 

scala> df.orderBy("id").show(false)
+---+-------+------+
|id |name   |salary|
+---+-------+------+
|6  |Alex   |1400  |
|7  |Bob    |1600  |
|8  |Kevin  |8000  |
|9  |Richard|1800  |
|10 |Phoniex|null  |
+---+-------+------+

scala> df.filter("8 < id and id < 10").show
+---+-------+------+
| id|   name|salary|
+---+-------+------+
|  9|Richard|  1800|
+---+-------+------+

scala> kc.upsertRows(df.filter("8 < id and id < 10"), "mykudu")

scala> m.show(false)
+---+-------+------+
|id |name   |salary|
+---+-------+------+
|9  |Richard|1800  |
+---+-------+------+

===================================================== updateRows ===================================================== 

scala> val t  = spark.read.options(Map("kudu.master" -> "centos00:7051", "kudu.table" -> "kudutable")).kudu
t: org.apache.spark.sql.DataFrame = [id: string, name: string ... 1 more field]

scala> t.show(false)
+---+------+------+
|id |name  |salary|
+---+------+------+
|1  |Jordon|2500  |
+---+------+------+

scala> val d = sc.makeRDD(Seq(("1", "Lincoln", "3000"))).toDF("id", "name", "salary")
d: org.apache.spark.sql.DataFrame = [id: string, name: string ... 1 more field]

scala> d.show(false)
+---+-------+------+
|id |name   |salary|
+---+-------+------+
|1  |Lincoln|3000  |
+---+-------+------+

scala> kc.updateRows(d, "kudutable")

scala> t.show(false)
+---+-------+------+
|id |name   |salary|
+---+-------+------+
|1  |Lincoln|3000  |
+---+-------+------+

  

posted @ 2020-09-08 12:09  初入门径  阅读(442)  评论(0)    收藏  举报