package com.shujia.spark.core
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel
import org.apache.spark.{SparkConf, SparkContext}
object Demo15Cache {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
      .setMaster("local")
      .setAppName("cache")

    val sc = new SparkContext(conf)
    val studentRDD: RDD[String] = sc.textFile("data/students.txt")

    val studentsRDD: RDD[(String, String, Int, String, String)] = studentRDD.map(student => {
      // This println runs every time the map is (re)computed, which makes the
      // effect of caching visible in the console output.
      println("processing studentsRDD")
      val split: Array[String] = student.split(",")
      val id: String = split(0)
      val name: String = split(1)
      val age: Int = split(2).toInt
      val gender: String = split(3)
      val clazz: String = split(4)
      (id, name, age, gender, clazz)
    })
    /**
      * By default an RDD does not persist its data: if the same RDD is used
      * by multiple actions, it is recomputed for each one.
      *
      * Choosing a persistence level:
      * 1. Data is small and memory is sufficient ---> MEMORY_ONLY
      * 2. Data exceeds the available memory      ---> MEMORY_AND_DISK_SER
      *    (compressed or not, reading from memory is faster than from disk)
      *
      * Compression ---> smaller footprint, but compressing and decompressing take time.
      */
    // cache() is equivalent to persist(StorageLevel.MEMORY_ONLY), the default level
    // studentsRDD.cache()
    studentsRDD.persist(StorageLevel.MEMORY_AND_DISK_SER)
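    // Illustrative check, not part of the original job: getStorageLevel reports
    // the level that was just set, which is handy to confirm the persist call
    // before any action triggers the actual caching.
    println(s"studentsRDD storage level: ${studentsRDD.getStorageLevel.description}")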
    // number of students per class
    val clazzNum: RDD[(String, Int)] = studentsRDD.map(stu => (stu._5, 1)).reduceByKey(_ + _)
    clazzNum.foreach(println)

    // number of students per gender
    val genderNum: RDD[(String, Int)] = studentsRDD.map(stu => (stu._4, 1)).reduceByKey(_ + _)
    genderNum.foreach(println)

    // number of students per age
    val ageNumRDD: RDD[(Int, Int)] = studentsRDD.map(stu => (stu._3, 1)).reduceByKey(_ + _)
    ageNumRDD.foreach(println)
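    // Minimal cleanup sketch, not in the original code: once all actions that
    // reuse studentsRDD have run, unpersist releases the cached blocks;
    // blocking = true waits until the blocks are actually removed.
    studentsRDD.unpersist(blocking = true)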
  }
}