DataFrame
Spark SQL provides two ways to convert an RDD into a DataFrame: by reflection using a case class (section 1.4) and by specifying a schema programmatically (section 1.5).
First, add the Spark SQL dependency to the Maven project's pom.xml:
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_${scala.version}</artifactId>
    <version>${spark.version}</version>
</dependency>
Sample data in/student.txt:
1,zhangshan,20
2,lisi,30
3,wangwu,40

Sample data in/user.json:
{"name": "aaa","age": 20}
{"name": "bbb","age": 30}
{"name": "ccc","age": 40}
1. DataFrame
1.1 Creating a DataFrame from an in-memory collection
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.DataFrame
import org.apache.spark._

object sql8 {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName("My scala word count").setMaster("local")
    val sc = new SparkContext(conf)
    val sqlContext = new sql.SQLContext(sc)
    import sqlContext.implicits._

    // Approach 1: build an RDD ourselves and convert it to a DataFrame
    val rdd: RDD[Int] = sc.makeRDD(List(1, 2, 3, 4))
    val df: DataFrame = rdd.toDF("id")
    df.show()

    val rdd2: RDD[(String, Int)] = sc.makeRDD(List(("zhangshan", 10), ("lisi", 20), ("wangwu", 30)))
    val df2: DataFrame = rdd2.toDF("name", "age")
    df2.show()
  }
}
1.2 Creating a DataFrame from a JSON file
import org.apache.spark.sql.DataFrame
import org.apache.spark._

object sql6 {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName("My scala word count").setMaster("local")
    val sc = new SparkContext(conf)
    val sqlContext = new sql.SQLContext(sc)

    // Approach 3: read a JSON file; sqlContext.read.json produces a DataFrame directly
    val personDF: DataFrame = sqlContext.read.json("in/user.json")
    personDF.show()
  }
}
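The JSON reader infers the schema from the data, and numeric JSON values come back as long (bigint), which matters later when mapping this DataFrame to a case class. A quick way to confirm the inferred types, continuing from the snippet above:

// Inspect the inferred schema; with the sample user.json, age is inferred as long
personDF.printSchema()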
1.3 Creating a DataFrame from a text file
import org.apache.spark.sql.DataFrame
import org.apache.spark._

object sql6 {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName("My scala word count").setMaster("local")
    val sc = new SparkContext(conf)
    val sqlContext = new sql.SQLContext(sc)

    // Approach 2: read a text file; sqlContext.read.text produces a DataFrame directly
    // (a single string column named "value", one row per line)
    val studentDF: DataFrame = sqlContext.read.text("in/student.txt")
    studentDF.show()
  }
}
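read.text does not split the lines, so each record ends up as one string in the value column. If column-level access is needed, a minimal sketch (assuming the student.txt sample above, continuing inside main) splits that column with the built-in split function:

import org.apache.spark.sql.functions.{col, split}

// Split the "value" column ("id,name,age") into typed columns
val parts = split(col("value"), ",")
val structuredDF: DataFrame = studentDF.select(
  parts.getItem(0).cast("int").as("id"),
  parts.getItem(1).as("name"),
  parts.getItem(2).cast("int").as("age")
)
structuredDF.show()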
1.4 Building a DataFrame via reflection
Scala supports converting an RDD to a DataFrame through a case class. The schema is derived from the case class: its parameter names are read via reflection and become the column names of the table. An RDD of case class instances can be converted to a DataFrame efficiently and then registered as a table, as shown in the sketch after the code below.
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.DataFrame
import org.apache.spark._

// todo: define a case class Person
case class Person(id: Int, name: String, age: Int) extends Serializable

object sql5 {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName("My scala word count").setMaster("local")
    val sc = new SparkContext(conf)
    val sqlContext = new sql.SQLContext(sc)
    import sqlContext.implicits._

    // todo: 3. load the data
    val dataRDD: RDD[String] = sc.textFile("in/student.txt")
    // todo: 4. split each line
    val lineArrayRDD: RDD[Array[String]] = dataRDD.map(_.split(","))
    // todo: 5. map each record to the Person case class
    val personRDD: RDD[Person] = lineArrayRDD.map(x => Person(x(0).toInt, x(1), x(2).toInt))
    val personDF: DataFrame = personRDD.toDF()
    personDF.show()

    sc.stop()
  }
}
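As mentioned above, a DataFrame obtained this way can be registered as a table and queried with SQL. A small sketch continuing inside main, before sc.stop() (the view name person is chosen here just for illustration):

// Register the DataFrame as a temporary view and query it with SQL
personDF.createOrReplaceTempView("person")
sqlContext.sql("select name, age from person where age >= 20").show()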
1.5 Building a DataFrame with an explicit schema
When a case class cannot be defined in advance, a DataFrame can be built programmatically in three steps:
(1) Convert the original RDD into an RDD of Row objects.
(2) Create a schema as a StructType that matches the Rows built in step 1.
(3) Apply the schema to the Row RDD with createDataFrame (on SparkSession, or on SQLContext as below).
To convert between RDDs, DataFrames, and Datasets, import sqlContext.implicits._ must be in scope.
import org.apache.spark.{SparkConf, SparkContext, sql}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row}

object sql4 {
  def main(args: Array[String]): Unit = {
    // todo: 1. create the SparkConf, specifying appName and master
    val conf: SparkConf = new SparkConf().setAppName("My scala word count").setMaster("local")
    // todo: 2. get the SparkContext
    val sc = new SparkContext(conf)
    // todo: 3. load the data
    val dataRDD: RDD[String] = sc.textFile("in/student.txt")
    // todo: 4. split each line
    val dataArrayRDD: RDD[Array[String]] = dataRDD.map(_.split(","))
    // todo: 5. load the fields into Row objects
    val personRDD: RDD[Row] = dataArrayRDD.map(x => Row(x(0).toInt, x(1), x(2).toInt))
    // todo: 6. create the schema
    val schema: StructType = StructType(Seq(
      StructField("id", IntegerType, false),
      StructField("name", StringType, false),
      StructField("age", IntegerType, false)
    ))
    // todo: 7. create the DataFrame from personRDD and the schema
    // first create an SQLContext from the existing SparkContext
    val sqlContext = new sql.SQLContext(sc)
    val personDF: DataFrame = sqlContext.createDataFrame(personRDD, schema)
    // todo: 8. show the DataFrame with the DSL
    personDF.show()
    sc.stop()
  }
}
1.6 DataFrame operations
import org.apache.spark.sql.DataFrame
import org.apache.spark._

object sql6 {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName("My scala word count").setMaster("local")
    val sc = new SparkContext(conf)
    // create an SQLContext from the existing SparkContext
    val sqlContext = new sql.SQLContext(sc)
    // needed for the $"column" syntax used below
    import sqlContext.implicits._

    // TODO create a DataFrame by reading a JSON file
    val personDF: DataFrame = sqlContext.read.json("in/user.json")
    personDF.show()

    // TODO approach 1: DSL (domain-specific language) style
    // 1. print the schema of the DataFrame
    personDF.printSchema()
    // 2. select some of the columns
    personDF.select("name").show()
    personDF.select($"name", $"age" + 1).show()
    // 3. filter, count and group
    personDF.filter($"age" >= 25).show()
    val count: Long = personDF.filter($"age" > 30).count()
    println(count)
    personDF.groupBy("age").count().show()

    // TODO approach 2: SQL style
    // register the DataFrame as a table (temporary view)
    // personDF.registerTempTable("people")   // older, deprecated API
    personDF.createOrReplaceTempView("people")
    // show the table's schema and run a query
    sqlContext.sql("desc people").show()
    sqlContext.sql("SELECT name, age FROM people where age >= 30").show()
  }
}
1.7 Reading data
package sparkSql

import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, SparkSession}

object dataRead {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName("my scala").setMaster("local")
    val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate()
    import spark.implicits._

    // Reading data
    // Approach 1: format-specific readers
    val df1: DataFrame = spark.read.json("in/user.json")
    // val df2 = spark.read.parquet("E:\\666\\users.parquet")
    df1.show()

    // Approach 2: generic reader with an explicit format
    val df2: DataFrame = spark.read.format("json").load("in/user.json")
    // val df4 = spark.read.format("parquet").load("E:\\666\\users.parquet")
    df2.show()

    // Approach 3: generic reader without a format; the default is Parquet
    // val df5 = spark.read.load("E:\\666\\users.parquet")
  }
}
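Spark 2.x also ships a built-in CSV source, which is a convenient alternative to read.text for delimited files such as the student.txt sample; a minimal sketch, with the column names assumed from the earlier examples:

// Read a comma-separated file with the CSV source and name the columns
val studentDF: DataFrame = spark.read
  .format("csv")
  .option("header", "false")        // student.txt has no header row
  .option("inferSchema", "true")    // infer int/string column types
  .load("in/student.txt")
  .toDF("id", "name", "age")
studentDF.show()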
1.8 Saving data
package sparkSql

import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, SparkSession}

object dataSave {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName("my scala").setMaster("local")
    val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate()

    // Read the source data
    val df1: DataFrame = spark.read.json("in/user.json")

    // Approach 1: format-specific writers
    df1.write.json("out/1.json")
    df1.write.parquet("out/2.parquet")

    // Approach 2: generic writer with an explicit format
    df1.write.format("json").save("out/3.json")
    df1.write.format("parquet").save("out/4.parquet")

    // Approach 3: generic writer without a format; the default is Parquet
    df1.write.save("out/5.parquet")
  }
}
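By default the writer fails if the output path already exists (SaveMode.ErrorIfExists). A save mode can be set to change that behaviour; a small sketch continuing from df1 above:

import org.apache.spark.sql.SaveMode

// Overwrite the output directory if it already exists
df1.write.mode(SaveMode.Overwrite).json("out/1.json")
// The mode can also be given as a string: "overwrite", "append", "ignore", "error"
df1.write.mode("append").parquet("out/2.parquet")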
2. DataSet
2.1 Creating a Dataset
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark._

// TODO define the case classes
// For data read from JSON, numeric fields are inferred as bigint (Long),
// so the matching case class must use Long rather than Int
case class People(id: Int, name: String, age: Int)
case class Person2(name: String, age: Long)

object sql7 {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName("My scala word count").setMaster("local")
    val sc = new SparkContext(conf)
    // create an SQLContext from the existing SparkContext
    val sqlContext = new sql.SQLContext(sc)
    // enables the implicit conversions to DataFrame/Dataset
    import sqlContext.implicits._

    // TODO List ==> Dataset : toDS
    val persons = List(Person2("zhangshan", 10), Person2("lisi", 20), Person2("wangwu", 30))
    val ds: Dataset[Person2] = persons.toDS
    ds.show()

    // TODO RDD ==> Dataset : toDS()
    val lineRDD: RDD[Array[String]] = sc.textFile("in/student.txt").map(_.split(","))
    val value: RDD[People] = lineRDD.map(x => People(x(0).toInt, x(1), x(2).toInt))
    val peopleDS: Dataset[People] = value.toDS()
    peopleDS.show()

    // TODO DataFrame ==> Dataset : as[...]
    val df: DataFrame = sqlContext.read.json("in/user.json")
    val ds2: Dataset[Person2] = df.as[Person2]
    ds2.show()
  }
}
2.2 Dataset operations
import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.{SparkConf, SparkContext}

case class Person23(name: String, age: Long)

object test23 {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName("My scala word count").setMaster("local")
    val sc = new SparkContext(conf)
    val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate()
    import spark.implicits._

    val rdd: RDD[(String, Int)] = sc.makeRDD(List(("zhangshan", 10), ("lisi", 20), ("wangwu", 30)))
    val df: DataFrame = rdd.toDF("name", "age")
    df.show()
    val ds: Dataset[Person23] = df.as[Person23]
    ds.show()

    // TODO approach 1: DSL (domain-specific language) style
    ds.select("name").show()

    // TODO approach 2: SQL style
    ds.createOrReplaceTempView("person")
    spark.sql("select name from person").show()

    spark.stop()
  }
}
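Because a Dataset is typed with the case class, it also supports compile-time-checked transformations that work directly on Person23 objects, in addition to the DSL and SQL styles above. A small sketch continuing from ds (placed before spark.stop()):

// Typed filter and map: the lambdas receive Person23 instances
ds.filter(p => p.age >= 15)
  .map(p => p.name)        // Dataset[String]; the encoder comes from spark.implicits._
  .show()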
2.3 Converting between RDD, DataFrame, and Dataset

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Dataset, Row}
import org.apache.spark._

// TODO define the case classes
// For data read from JSON, numeric fields are inferred as bigint (Long), hence age: Long in Person2
case class People(id: Int, name: String, age: Int)
case class Person2(name: String, age: Long)

object sql7 {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName("My scala word count").setMaster("local")
    val sc = new SparkContext(conf)
    val sqlContext = new sql.SQLContext(sc)
    import sqlContext.implicits._

    // TODO List ==> DataFrame : toDF
    val persons = List(("zhangshan", 10), ("lisi", 20), ("wangwu", 30))
    val df: DataFrame = persons.toDF("name", "age")
    df.show()

    // TODO List ==> Dataset : toDS
    val persons2 = List(Person2("zhangshan", 10), Person2("lisi", 20), Person2("wangwu", 30))
    val ds: Dataset[Person2] = persons2.toDS
    ds.show()

    val lineRDD: RDD[Array[String]] = sc.textFile("in/student.txt").map(_.split(","))
    val rdd: RDD[People] = lineRDD.map(x => People(x(0).toInt, x(1), x(2).toInt))

    // TODO RDD ==> DataFrame : toDF()
    val df2: DataFrame = rdd.toDF()
    df2.show()

    // TODO RDD ==> Dataset : toDS()
    val ds2: Dataset[People] = rdd.toDS()
    ds2.show()

    // TODO DataFrame ==> Dataset : as[...]
    val ds3: Dataset[Person2] = df.as[Person2]
    ds3.show()

    // TODO DataFrame ==> RDD : .rdd
    val rdd3: RDD[Row] = df2.rdd
    rdd3.foreach(row => {
      println(row.getInt(0), row.getString(1), row.getInt(2))
    })

    // TODO Dataset ==> DataFrame : toDF()
    val df3: DataFrame = ds2.toDF()
    df3.show()

    // TODO Dataset ==> RDD : .rdd
    val peopleRDD: RDD[People] = ds2.rdd
    peopleRDD.foreach(x => println(x.name + "," + x.age))
  }
}
3. Implicit conversions
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}

// todo: define a case class Person
case class Person(id: Int, name: String, age: Int) extends Serializable

object sql5 {
  def main(args: Array[String]): Unit = {
    // todo: 1. build the configuration, specifying appName and master
    val conf: SparkConf = new SparkConf().setAppName("My scala word count").setMaster("local")
    // todo: 2. get the SparkContext
    val sc = new SparkContext(conf)

    // todo: 6. creating a DataFrame requires importing the implicit conversions,
    // either from an SQLContext:
    //   val sqlContext = new sql.SQLContext(sc)
    //   import sqlContext.implicits._
    // or from a SparkSession:
    val sqlContext: SparkSession = SparkSession.builder().config(conf).getOrCreate()
    import sqlContext.implicits._

    // todo: 3. load the data
    val dataRDD: RDD[String] = sc.textFile("in/student.txt")
    // todo: 4. split each line
    val lineArrayRDD: RDD[Array[String]] = dataRDD.map(_.split(","))
    // todo: 5. map each record to the Person case class
    val personRDD: RDD[Person] = lineArrayRDD.map(x => Person(x(0).toInt, x(1), x(2).toInt))
    val personDF: DataFrame = personRDD.toDF()
    personDF.show()
  }
}
4. Data sources
4.1 JSON
4.2 Parquet
4.3 MySQL
package sparkSql

import java.util.Properties

import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, SparkSession}

object dataRead2 {
  def main(args: Array[String]): Unit = {
    // Suppress the verbose logging
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)

    val conf: SparkConf = new SparkConf().setAppName("my scala").setMaster("local")
    val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate()

    // Read data from MySQL
    val url = "jdbc:mysql://localhost:3306/spark" + "?serverTimezone=GMT%2B8"
    val table = "student"
    val properties = new Properties()
    properties.setProperty("user", "root")
    properties.setProperty("password", "*******")
    properties.setProperty("driver", "com.mysql.cj.jdbc.Driver")

    // read.jdbc takes the MySQL URL, the table name,
    // and the connection properties (user name and password)
    val df: DataFrame = spark.read.jdbc(url, table, properties)
    df.show()
  }
}
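Writing back to MySQL goes through the same JDBC options; a minimal sketch reusing url and properties from the snippet above (the target table name student_copy is just an illustration):

import org.apache.spark.sql.SaveMode

// Append the rows of df into another MySQL table
df.write.mode(SaveMode.Append).jdbc(url, "student_copy", properties)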
4.4 Hive
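Accessing Hive requires the spark-hive dependency on the classpath and a SparkSession built with Hive support enabled; a minimal sketch, where the table name default.student is only a placeholder:

import org.apache.spark.sql.SparkSession

object hiveRead {
  def main(args: Array[String]): Unit = {
    // enableHiveSupport connects Spark SQL to the Hive metastore
    val spark: SparkSession = SparkSession.builder()
      .appName("my scala")
      .master("local")
      .enableHiveSupport()
      .getOrCreate()

    // Query an existing Hive table with plain SQL
    spark.sql("show tables").show()
    spark.sql("select * from default.student").show()
  }
}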