spark

spark sql基础操作

SQLContext 可能需要自己创建（注意：Spark 2.x 中 SQLContext 已被标记为废弃，推荐使用 SparkSession，下文创建时会出现 deprecation 警告）。

var sqlContext = new org.apache.spark.sql.SQLContext(sc)

sc 是指spark context。

scala> val textFile=sc.textFile("file:///usr/local/spark/README.md")
textFile: org.apache.spark.rdd.RDD[String] = file:///usr/local/spark/README.md MapPartitionsRDD[1] at textFile at <console>:24

scala> textFile.count
res0: Long = 104

scala> textFile.first
res1: String = # Apache Spark

scala> textFile.filter(line=>line.contains("Spark")).count
res2: Long = 20

scala> var df=sqlContext.read.json("file:///usr/local/spark/examples/src/main/resources/people.json")
<console>:23: error: not found: value sqlContext
var df=sqlContext.read.json("file:///usr/local/spark/examples/src/main/resources/people.json")
^

scala>

 

scala> var sqlContext=new org.apache.spark.sql SQLContext(sc)
<console>:24: error: type sql is not a member of package org.apache.spark
var sqlContext=new org.apache.spark.sql SQLContext(sc)
^

scala> var sqlContext=new org.apache.spark.sql.SQLContext(sc)
warning: there was one deprecation warning; re-run with -deprecation for details
sqlContext: org.apache.spark.sql.SQLContext = org.apache.spark.sql.SQLContext@576a8b00

scala> var df=sqlContext.read.json("file:///usr/local/spark/examples/src/main/resources/people.json")
18/06/21 22:33:50 WARN metastore.ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 1.2.0
18/06/21 22:33:51 WARN metastore.ObjectStore: Failed to get database default, returning NoSuchObjectException
18/06/21 22:33:54 WARN metastore.ObjectStore: Failed to get database global_temp, returning NoSuchObjectException
df: org.apache.spark.sql.DataFrame = [age: bigint, name: string]

scala> df.show
+----+-------+
| age| name|
+----+-------+
|null|Michael|
| 30| Andy|
| 19| Justin|
+----+-------+


scala> df.select("name").show
+-------+
| name|
+-------+
|Michael|
| Andy|
| Justin|
+-------+


scala> df.select(df("name"),de("age")+1).show//age +1
<console>:29: error: not found: value de
df.select(df("name"),de("age")+1).show//age +1
^

scala> df.select(df("name"),df("age")+1).show//age +1
+-------+---------+
| name|(age + 1)|
+-------+---------+
|Michael| null|
| Andy| 31|
| Justin| 20|
+-------+---------+


scala> df.filter(df("age")>21).show
+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+


scala> df.groupBy("age").count().show
+----+-----+
| age|count|
+----+-----+
| 19| 1|
|null| 1|
| 30| 1|
+----+-----+


scala> df.registerTempTable("people")
warning: there was one deprecation warning; re-run with -deprecation for details

scala> val result=sqlContext.sql("select name,age from people")
result: org.apache.spark.sql.DataFrame = [name: string, age: bigint]

scala> result.show
+-------+----+
| name| age|
+-------+----+
|Michael|null|
| Andy| 30|
| Justin| 19|
+-------+----+

posted on 2018-06-21 22:50  whynotybb  阅读(279)  评论(0编辑  收藏  举报