package com.shujia.spark.sql
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
object Demo4DFonRDD {
  def main(args: Array[String]): Unit = {

    val spark: SparkSession = SparkSession
      .builder()
      .appName("source")
      .master("local")
      .config("spark.sql.shuffle.partitions", "1")
      .getOrCreate()

    // import the implicit conversions (needed below for rdd.toDF)
    import spark.implicits._

    // import all of Spark SQL's built-in functions
    import org.apache.spark.sql.functions._
    /**
      * 1. The SparkContext can be obtained from the SparkSession.
      */
    val sc: SparkContext = spark.sparkContext

    val studentRDD: RDD[String] = sc.textFile("data/students.txt")
    /**
      * Convert an RDD to a DataFrame.
      */
    // split each line on commas and build a tuple per student
    val studentTuple: RDD[(String, String, Int, String, String)] = studentRDD.map(line => {
      val split: Array[String] = line.split(",")
      (split(0), split(1), split(2).toInt, split(3), split(4))
    })

    // an RDD of tuples can be converted with toDF by specifying the column names
    val studentDF: DataFrame = studentTuple.toDF("id", "name", "age", "gender", "clazz")

    studentDF.printSchema()
    studentDF.show()
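
    /**
      * A minimal sketch of another way to build the DataFrame: createDataFrame with an
      * explicit StructType schema. The schema definition and the names rowRDD /
      * studentSchemaDF are illustrative additions, not part of the original demo.
      */
    import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
    val rowRDD: RDD[Row] = studentTuple.map {
      case (id, name, age, gender, clazz) => Row(id, name, age, gender, clazz)
    }
    val schema: StructType = StructType(Seq(
      StructField("id", StringType),
      StructField("name", StringType),
      StructField("age", IntegerType),
      StructField("gender", StringType),
      StructField("clazz", StringType)
    ))
    val studentSchemaDF: DataFrame = spark.createDataFrame(rowRDD, schema)
    studentSchemaDF.printSchema()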
    val studentBean: RDD[Student] = studentRDD.map(line => {
      val split: Array[String] = line.split(",")
      Student(split(0), split(1), split(2).toInt, split(3), split(4))
    })

    // if the RDD's element type is a case class, the column names and types are
    // inferred from its fields, so toDF needs no arguments
    val studentBeanDF: DataFrame = studentBean.toDF()

    studentBeanDF.printSchema()
    studentBeanDF.show()
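
    /**
      * Side note (a sketch, not part of the original demo): with the Student case class
      * and spark.implicits._ in scope, the same RDD can also become a typed Dataset via
      * toDS; studentDS is an illustrative name.
      */
    val studentDS = studentBean.toDS()
    studentDS.printSchema()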
    /**
      * Convert a DataFrame back to an RDD.
      */
    val rdd: RDD[Row] = studentBeanDF.rdd

    // extract each field from the Row by column name and type
    rdd.map(row => {
      val id: String = row.getAs[String]("id")
      val name: String = row.getAs[String]("name")
      val age: Int = row.getAs[Int]("age")
      val gender: String = row.getAs[String]("gender")
      val clazz: String = row.getAs[String]("clazz")
      (id, name, age, gender, clazz)
    }).foreach(println)
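
    /**
      * A minimal alternative sketch (assumed, not from the original demo): convert the
      * DataFrame back to a typed Dataset with as[Student] and take its rdd, which avoids
      * pulling fields out of each Row one by one; studentsBack is an illustrative name.
      */
    val studentsBack: RDD[Student] = studentBeanDF.as[Student].rdd
    studentsBack.foreach(println)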
  }

  case class Student(id: String, name: String, age: Int, gender: String, clazz: String)

}