Spark SQL: Programmatically Specifying the Schema

scala> val sqlcontext = new org.apache.spark.sql.SQLContext(sc)
warning: there was one deprecation warning (since 2.0.0); for details, enable `:setting -deprecation' or `:replay -deprecation'
sqlcontext: org.apache.spark.sql.SQLContext = org.apache.spark.sql.SQLContext@350a3df3
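
The deprecation warning appears because SQLContext was superseded by SparkSession in Spark 2.0. A minimal sketch of the modern entry point (in spark-shell a ready-made session named spark already exists, so this construction is only needed in a standalone application; the app name is hypothetical):

import org.apache.spark.sql.SparkSession

// SparkSession bundles the old SQLContext functionality since Spark 2.0.
val spark = SparkSession.builder()
  .appName("SchemaDemo")  // hypothetical application name
  .getOrCreate()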

scala> val Demo = sc.textFile("Demo.txt")
Demo: org.apache.spark.rdd.RDD[String] = Demo.txt MapPartitionsRDD[1] at textFile at <console>:24
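
Demo.txt itself is not shown in the post. Judging from the show() output at the end, and from the trim calls discussed below, it presumably holds comma-separated records with a space after each comma, along these lines (a reconstruction, not the actual file):

1201, satish, 25
1202, krishna, 28
1203, amith, 39
1204, javed, 23
1205, prudvi, 23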

scala> val schemastring = "id name age"
schemastring: String = id name age

scala> import org.apache.spark.sql.Row;
import org.apache.spark.sql.Row

scala> import org.apache.spark.sql.types.{StructType,StructField,StringType};
import org.apache.spark.sql.types.{StructType, StructField, StringType}

scala> val schema = StructType(schemastring.split(" ").map(fieldName => StructField(fieldName,StringType,true)))
schema: org.apache.spark.sql.types.StructType = StructType(StructField(id,StringType,true), StructField(name,StringType,true), StructField(age,StringType,true))

scala> val rowRDD = Demo.map(_.split(",")).map(e => Row(e(0),e(1),e(2)))
rowRDD: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = MapPartitionsRDD[10] at map at <console>:27

-- Note: I originally converted id and age here with e(0).trim.toInt, which raised an error. The cause was undetermined at the time; the likely reason is that the schema above declares every field as StringType, so createDataFrame rejects Int row values that do not match the declared string type.
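
A sketch of the likely fix (untested against the session above; names follow the transcript): declare id and age as IntegerType, so that the Int values produced by trim.toInt match the schema.

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StructType, StructField, StringType, IntegerType}

// Declare id and age as IntegerType so Int row values are valid for the schema.
val typedSchema = StructType(Seq(
  StructField("id", IntegerType, nullable = true),
  StructField("name", StringType, nullable = true),
  StructField("age", IntegerType, nullable = true)
))

// trim strips the spaces left around the split values; toInt now matches the schema.
val typedRowRDD = Demo.map(_.split(",")).map(e => Row(e(0).trim.toInt, e(1).trim, e(2).trim.toInt))

val typedDF = sqlcontext.createDataFrame(typedRowRDD, typedSchema)
typedDF.printSchema()
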
scala> val DemoDF = sqlcontext.createDataFrame(rowRDD,schema)
DemoDF: org.apache.spark.sql.DataFrame = [id: string, name: string ... 1 more field]

scala> DemoDF.registerTempTable("Demo")
warning: there was one deprecation warning (since 2.0.0); for details, enable `:setting -deprecation' or `:replay -deprecation'
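
This warning is the same story as above: registerTempTable was deprecated in Spark 2.0 in favor of createOrReplaceTempView. The equivalent non-deprecated call would be:

// Non-deprecated replacement for registerTempTable (Spark 2.0+)
DemoDF.createOrReplaceTempView("Demo")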

scala> val allrow = sqlcontext.sql("select * from Demo")
allrow: org.apache.spark.sql.DataFrame = [id: string, name: string ... 1 more field]

scala> allrow.show()
+----+--------+---+
|  id|    name|age|
+----+--------+---+
|1201|  satish| 25|
|1202| krishna| 28|
|1203|   amith| 39|
|1204|   javed| 23|
|1205|  prudvi| 23|
+----+--------+---+
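
For comparison, the same result can be obtained through the DataFrame API without writing an SQL string; a minimal sketch:

// Equivalent of "select * from Demo" via the DataFrame API
DemoDF.select("id", "name", "age").show()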

 
