Spark Programming -- Spark SQL DataFrame

Differences between DataFrame and RDD

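In short, an RDD is a distributed collection of opaque JVM objects, so Spark cannot look inside its elements, while a DataFrame organizes data into named columns with a schema, which lets Spark SQL optimize queries over it. A minimal sketch of the same data viewed both ways (not from the original post; it assumes a SparkSession named spark with import spark.implicits._ in scope, and the resources/people.txt file of comma-separated name,age lines used later in this post):

// Sketch: the same records as an RDD (no schema, opaque objects) vs. a DataFrame (named columns with a schema)
val lineRDD = spark.sparkContext.textFile("resources/people.txt") // RDD[String]: Spark knows nothing about its structure
val peopleDF = lineRDD
  .map(_.split(","))
  .map(a => (a(0), a(1).trim.toInt))
  .toDF("name", "age") // DataFrame: columns "name" and "age", usable by Spark SQL and its optimizer
peopleDF.printSchema()
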
Create a DataFrame from the sample file people.json and save it as a CSV file

package com.zwq
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
object DataFrame {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("DataFrameApp")
    val spark = SparkSession.builder().config(conf).getOrCreate()
    import spark.implicits._

    // val df  = spark.read.json("resources/people.json")
    // df.show()
    // read people.json into a DataFrame via the generic load API
    val peopleDF = spark.read.format("json").load("resources/people.json")
    // write the name and age columns out in CSV format
    peopleDF.select("name", "age").write.format("csv").save("resources/people.csv")
  }
}
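A quick check, not in the original post: save() produces a directory named people.csv containing part files, and that directory can be read back with the csv source (without a header option the columns come back as _c0 and _c1). A minimal sketch:

// Sketch: read the saved CSV directory back; columns default to _c0/_c1 because no header was written
val csvDF = spark.read.format("csv").load("resources/people.csv")
csvDF.show()
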

Common DataFrame operations

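The snippets below assume a DataFrame named df; a minimal sketch of how it could be created, reusing the same people.json as above:

// Assumed setup for the snippets below
val df = spark.read.format("json").load("resources/people.json")
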
df.printSchema() // print the schema

df.select(df("name"), df("age") + 1).show() // select columns, with an expression on age

df.filter(df("age") > 20).show() // filter rows

df.groupBy("age").count().show() // group and count

// sort by one column
df.sort(df("age").desc).show()

// sort by multiple columns
df.sort(df("age").desc, df("name").asc).show()

// rename a column
df.select(df("name").as("username"), df("age")).show()
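The same kind of queries can also be written as SQL once the DataFrame is registered as a temporary view (the reflection and programmatic examples below do exactly this); a small sketch:

// Sketch: SQL equivalents of the filter/groupBy operations above, via a temporary view
df.createOrReplaceTempView("people")
spark.sql("SELECT name, age + 1 AS age1 FROM people WHERE age > 20").show()
spark.sql("SELECT age, COUNT(*) AS cnt FROM people GROUP BY age").show()
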

Inferring an RDD schema using reflection; the key is to define a case class

package com.zwq

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

object RDD2DataFrame {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("DataFrameApp")
    val spark = SparkSession.builder().config(conf).getOrCreate()
    import spark.implicits._

    val peopleDF = spark.sparkContext.textFile("resources/people.txt")
      .map(_.split(","))
      .map(attributes => Person(attributes(0), attributes(1).trim.toInt))
      .toDF()
    peopleDF.createOrReplaceTempView("people") // must be registered as a temporary view before the SQL query below can use it
    val personsDF = spark.sql("select name, age from people where age > 20")

    personsDF.map(t => s"Name: ${t(0)}, Age: ${t(1)}").show()

    // peopleDF.show()
  }
}

// defined at the top level (outside main) so the reflection-based schema inference in toDF() can find it
case class Person(name: String, age: Long)
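A side note, not from the original post: with the case class in place, the query result can also be converted to a typed Dataset instead of mapping over untyped Rows. A minimal sketch, assuming the same spark session, the people temp view, and import spark.implicits._ in scope:

// Sketch: typed alternative to the Row-based map above
val personsDS = spark.sql("select name, age from people where age > 20").as[Person]
personsDS.map(p => s"Name: ${p.name}, Age: ${p.age}").show()
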

 

Defining an RDD schema programmatically, for cases where a case class cannot be defined in advance:

  1. Build the schema (the table header)

  2. Build the rows (the table records)

  3. Put the schema and the rows together

package com.zwq

import org.apache.spark.SparkConf
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types._

object RDD2DataFrame2 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("DataFrameApp")
    val spark = SparkSession.builder().config(conf).getOrCreate()
    import spark.implicits._

    // 1. build the schema (the table header)
    val fields = Array(StructField("name", StringType, true), StructField("age", IntegerType, true))
    val schema = StructType(fields)

    // 2. build the rows (the table records)
    val rowRDD = spark.sparkContext.textFile("resources/people.txt")
      .map(_.split(","))
      .map(attributes => Row(attributes(0), attributes(1).trim.toInt))

    // 3. put the schema and the rows together
    val peopleDF = spark.createDataFrame(rowRDD, schema)

    peopleDF.createOrReplaceTempView("people")
    val results = spark.sql("SELECT name, age FROM people")
    results.show()
  }
}

Connecting to a MySQL database via JDBC

package com.zwq

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

object JDBCApp {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("DataFrameApp")
    val spark = SparkSession.builder().config(conf).getOrCreate()
    import spark.implicits._

    // note: the MySQL JDBC driver JAR must be on the classpath (e.g. added via --jars when submitting)
    val jdbcDF = spark.read.format("jdbc")
      .option("url", "jdbc:mysql://localhost:3306/spark")
      .option("driver", "com.mysql.jdbc.Driver")
      .option("dbtable", "student")
      .option("user", "root")
      .option("password", "root")
      .load()
    jdbcDF.show()
  }
}
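For larger tables, the JDBC source can read in parallel using Spark's standard partitioning options; a hedged sketch, assuming the student table has a numeric id column roughly in the range 1 to 1000:

// Sketch only: parallel JDBC read split on the (assumed) numeric id column
val partitionedDF = spark.read.format("jdbc")
  .option("url", "jdbc:mysql://localhost:3306/spark")
  .option("driver", "com.mysql.jdbc.Driver")
  .option("dbtable", "student")
  .option("user", "root")
  .option("password", "root")
  .option("partitionColumn", "id")
  .option("lowerBound", "1")
  .option("upperBound", "1000")
  .option("numPartitions", "4")
  .load()
partitionedDF.show()
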

Writing data to a MySQL database

package com.zwq

import java.util.Properties

import org.apache.spark.SparkConf
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{Row, SparkSession}

object JDBCWriteApp {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("DataFrameApp")
    val spark = SparkSession.builder().config(conf).getOrCreate()
    import spark.implicits._

    val fields = List(StructField("id", IntegerType, true),
      StructField("name", StringType, true),
      StructField("gender", StringType, true),
      StructField("age", IntegerType, true))
    val schema = StructType(fields)

    // two new student records, built as an RDD of Rows matching the schema above
    val studentRDD = spark.sparkContext.parallelize(Array("3 Rongcheng M 26", "4 Guanhua M 27"))
      .map(_.split(" "))
    val rowRDD = studentRDD.map(p => Row(p(0).toInt, p(1).trim, p(2).trim, p(3).toInt))
    val studentDF = spark.createDataFrame(rowRDD, schema)

    // JDBC connection properties for the write
    val prop = new Properties()
    prop.put("user", "root")
    prop.put("password", "root")
    prop.put("driver", "com.mysql.jdbc.Driver")

    // append the new rows to the spark.student table
    studentDF.write.mode("append").jdbc("jdbc:mysql://localhost:3306/spark", "spark.student", prop)
  }
}
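The same write can also be expressed with the generic DataFrameWriter options instead of a java.util.Properties object; a minimal sketch with the same connection details:

// Sketch: equivalent JDBC write using option(...) and save()
studentDF.write.format("jdbc")
  .option("url", "jdbc:mysql://localhost:3306/spark")
  .option("driver", "com.mysql.jdbc.Driver")
  .option("dbtable", "student")
  .option("user", "root")
  .option("password", "root")
  .mode("append")
  .save()
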

 
