import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{SQLContext, SparkSession}
import org.apache.log4j.{Level, Logger}
object App {

  /** Quiet Spark's internal logging down to ERROR so console output stays readable. */
  def init(): Unit = {
    //Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    org.apache.log4j.Logger.getLogger("org.apache.spark").setLevel(org.apache.log4j.Level.ERROR)
  }

  /**
   * Builds a student-score DataFrame from an in-memory list of
   * (classid, name, age, sex, course, score) tuples, registers it as the
   * temp view `cte_student`, and runs two group-by aggregations over it.
   *
   * Fix: the SparkContext is now stopped in a `finally` block so local Spark
   * resources are released even when a query throws.
   */
  def main(args: Array[String]): Unit = {
    init()
    val conf = new SparkConf().setMaster("local[*]").setAppName("spark-rdd-test")
    val sc = new SparkContext(conf)
    try {
      // spark-shell
      val sqlContext = new SQLContext(sc)
      import sqlContext.implicits._
      val df = sqlContext.createDataFrame(List(
        (12, "张小军", 25, "男", "chinese", 50.0), (12, "张小军", 25, "男", "math", 60.0), (12, "张小军", 25, "男", "english", 70.0),
        (12, "李小凤", 20, "女", "chinese", 80.0), (12, "李小凤", 20, "女", "math", 80.0), (12, "李小凤", 20, "女", "english", 80.0),
        (12, "王大力", 19, "男", "chinese", 70.0), (12, "王大力", 19, "男", "math", 80.0), (12, "王大力", 19, "男", "english", 90.0),
        (13, "张大明", 25, "男", "chinese", 55.0), (13, "张大明", 25, "男", "math", 65.0), (13, "张大明", 25, "男", "english", 60.0),
        (13, "李小华", 20, "男", "chinese", 95.0), (13, "李小华", 20, "男", "math", 92.0), (13, "李小华", 20, "男", "english", 91.0),
        (13, "王小芳", 19, "女", "chinese", 75.0), (13, "王小芳", 19, "女", "math", 85.0), (13, "王小芳", 19, "女", "english", 90.0)
      )).toDF("classid", "name", "age", "sex", "course", "score")
      df.createOrReplaceTempView("cte_student") // register as a temporary view
      df.cache() // reused by the two queries below; //df.persist(StorageLevel.MEMORY_ONLY)
      // Per-student total and average score (unordered)
      df.sqlContext.sql(" select name, sum(score) as s, avg(score) as c from cte_student group by name ").show()
      /*
      +-------+-----+-----------------+
      |   name|    s|                c|
      +-------+-----+-----------------+
      | 李小凤|240.0|             80.0|
      | 张小军|180.0|             60.0|
      | 李小华|278.0|92.66666666666667|
      | 王大力|240.0|             80.0|
      | 王小芳|250.0|83.33333333333333|
      | 张大明|180.0|             60.0|
      +-------+-----+-----------------+
      */
      // Same aggregation, ordered by total score (column 2) descending
      sqlContext.sql(
        """
          |select
          | name, sum(score) as s, avg(score) as c
          |from cte_student
          |group by name
          |order by 2 desc
        """.stripMargin).show()
      /*
      +-------+-----+-----------------+
      |   name|    s|                c|
      +-------+-----+-----------------+
      | 李小华|278.0|92.66666666666667|
      | 王小芳|250.0|83.33333333333333|
      | 李小凤|240.0|             80.0|
      | 王大力|240.0|             80.0|
      | 张小军|180.0|             60.0|
      | 张大明|180.0|             60.0|
      +-------+-----+-----------------+
      */
      df.unpersist()
    } finally {
      sc.stop() // always release the local Spark context
    }
  }
}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{SQLContext, SparkSession}
import org.apache.spark.storage.StorageLevel
import org.apache.log4j.{Level, Logger}
object App {

  /** Quiet Spark's internal logging down to ERROR so console output stays readable. */
  def init(): Unit = {
    //Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    org.apache.log4j.Logger.getLogger("org.apache.spark").setLevel(org.apache.log4j.Level.ERROR)
  }

  /**
   * SparkSession variant of the student-score demo: builds the DataFrame from
   * in-memory tuples, caches the `cte_student` temp view through the session
   * catalog, prints the schema and raw rows, then runs two aggregations.
   *
   * Fix: the SparkSession is now stopped in a `finally` block so local Spark
   * resources are released even when a query throws.
   */
  def main(args: Array[String]): Unit = {
    init()
    //val conf = new SparkConf().setMaster("local[*]").setAppName("spark-rdd-test")
    //val sc = new SparkContext(conf)
    val sparkSession = SparkSession.builder()
      .master("local[*]").appName("spark-rdd-test")
      .getOrCreate()
    import sparkSession.implicits._
    try {
      val df = sparkSession.createDataFrame(List(
        (12, "张小军", 25, "男", "chinese", 50.0), (12, "张小军", 25, "男", "math", 60.0), (12, "张小军", 25, "男", "english", 70.0),
        (12, "李小凤", 20, "女", "chinese", 80.0), (12, "李小凤", 20, "女", "math", 80.0), (12, "李小凤", 20, "女", "english", 80.0),
        (12, "王大力", 19, "男", "chinese", 70.0), (12, "王大力", 19, "男", "math", 80.0), (12, "王大力", 19, "男", "english", 90.0),
        (13, "张大明", 25, "男", "chinese", 55.0), (13, "张大明", 25, "男", "math", 65.0), (13, "张大明", 25, "男", "english", 60.0),
        (13, "李小华", 20, "男", "chinese", 95.0), (13, "李小华", 20, "男", "math", 92.0), (13, "李小华", 20, "男", "english", 91.0),
        (13, "王小芳", 19, "女", "chinese", 75.0), (13, "王小芳", 19, "女", "math", 85.0), (13, "王小芳", 19, "女", "english", 90.0)
      )).toDF("classid", "name", "age", "sex", "course", "score")
      df.createOrReplaceTempView("cte_student") // register as a temporary view
      // Cache the view via the catalog (equivalent to df.cache() with an explicit storage level)
      sparkSession.catalog.cacheTable("cte_student", StorageLevel.MEMORY_ONLY) //df.cache();
      df.printSchema()
      /*
      root
       |-- classid: integer (nullable = false)
       |-- name: string (nullable = true)
       |-- age: integer (nullable = false)
       |-- sex: string (nullable = true)
       |-- course: string (nullable = true)
       |-- score: double (nullable = false)
      */
      // Dump all rows, deterministically ordered
      sparkSession.sql(" select * from cte_student order by classid,name,course ").show()
      // Per-student total and average score (unordered)
      df.sqlContext.sql(" select name, sum(score) as s, avg(score) as c from cte_student group by name ").show()
      /*
      +-------+-----+-----------------+
      |   name|    s|                c|
      +-------+-----+-----------------+
      | 李小凤|240.0|             80.0|
      | 张小军|180.0|             60.0|
      | 李小华|278.0|92.66666666666667|
      | 王大力|240.0|             80.0|
      | 王小芳|250.0|83.33333333333333|
      | 张大明|180.0|             60.0|
      +-------+-----+-----------------+
      */
      // Same aggregation, ordered by total score (column 2) descending
      sparkSession.sql(
        """
          |select
          | name, sum(score) as s, avg(score) as c
          |from cte_student
          |group by name
          |order by 2 desc
        """.stripMargin).show()
      /*
      +-------+-----+-----------------+
      |   name|    s|                c|
      +-------+-----+-----------------+
      | 李小华|278.0|92.66666666666667|
      | 王小芳|250.0|83.33333333333333|
      | 王大力|240.0|             80.0|
      | 李小凤|240.0|             80.0|
      | 张小军|180.0|             60.0|
      | 张大明|180.0|             60.0|
      +-------+-----+-----------------+
      */
      sparkSession.catalog.uncacheTable("cte_student") //df.unpersist()
    } finally {
      sparkSession.stop() // always release the local Spark session
    }
  }
}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{SQLContext, SparkSession}
import org.apache.spark.storage.StorageLevel
import org.apache.log4j.{Level, Logger}
object App {

  /** Quiet Spark's internal logging down to ERROR so console output stays readable. */
  def init(): Unit = {
    //Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    org.apache.log4j.Logger.getLogger("org.apache.spark").setLevel(org.apache.log4j.Level.ERROR)
  }

  /**
   * File-based variant: reads whitespace-separated student records from a text
   * file, maps each line to a typed tuple, names the columns with toDF, and
   * runs two aggregations over the `cte_student` temp view.
   *
   * Generalized: the input path may be supplied as args(0); the original
   * hard-coded Windows path remains the default, so existing invocations
   * behave the same.
   * Fix: the SparkSession is now stopped in a `finally` block.
   */
  def main(args: Array[String]): Unit = {
    init()
    val sparkSession = SparkSession.builder()
      .master("local[*]")
      .appName("spark-rdd-test")
      .getOrCreate()
    import sparkSession.implicits._ // enable implicit conversions (toDF on Dataset)
    try {
      //spark-shell
      val inputPath = if (args.nonEmpty) args(0) else "F:\\files\\hdfs_file\\student.txt"
      val dataset = sparkSession.read.textFile(inputPath) // Dataset[String], one element per line
      println(dataset.collect().toList)
      // List(
      //   12 张小军 25 男 chinese 50, 12 张小军 25 男 math 60, 12 张小军 25 男 english 70, ...
      // )
      // Line layout: classid name age sex course score, separated by single spaces
      val df = dataset.map(line => line.split(" ")) // split on single space
        .map(x => (x(0).toInt, x(1), x(2).toInt, x(3), x(4), x(5).toDouble))
        .toDF("classid", "name", "age", "sex", "course", "score")
      // column names and types specified directly via the tuple + toDF
      df.cache() // reused by the two queries below
      df.printSchema()
      /*
      root
       |-- classid: integer (nullable = false)
       |-- name: string (nullable = true)
       |-- age: integer (nullable = false)
       |-- sex: string (nullable = true)
       |-- course: string (nullable = true)
       |-- score: double (nullable = false)
      */
      df.createOrReplaceTempView("cte_student") // register as a temporary view
      // Per-student total and average score (unordered)
      df.sqlContext.sql(" select name, sum(score) as s, avg(score) as c from cte_student group by name ").show()
      /*
      +-------+-----+-----------------+
      |   name|    s|                c|
      +-------+-----+-----------------+
      | 李小凤|240.0|             80.0|
      | 张小军|180.0|             60.0|
      | 李小华|278.0|92.66666666666667|
      | 王大力|240.0|             80.0|
      | 王小芳|250.0|83.33333333333333|
      | 张大明|180.0|             60.0|
      +-------+-----+-----------------+
      */
      // Same aggregation, ordered by total score (column 2) descending
      sparkSession.sql(
        """
          |select name, sum(score) as s, avg(score) as c
          |from cte_student
          |group by name
          |order by 2 desc
        """.stripMargin).show()
      /*
      +-------+-----+-----------------+
      |   name|    s|                c|
      +-------+-----+-----------------+
      | 李小华|278.0|92.66666666666667|
      | 王小芳|250.0|83.33333333333333|
      | 李小凤|240.0|             80.0|
      | 王大力|240.0|             80.0|
      | 张小军|180.0|             60.0|
      | 张大明|180.0|             60.0|
      +-------+-----+-----------------+
      */
      df.unpersist()
    } finally {
      sparkSession.stop() // always release the local Spark session
    }
  }
}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{SQLContext, SparkSession}
import org.apache.spark.storage.StorageLevel
import org.apache.log4j.{Level, Logger}
object App {

  /** Quiet Spark's internal logging down to ERROR so console output stays readable. */
  def init(): Unit = {
    //Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    org.apache.log4j.Logger.getLogger("org.apache.spark").setLevel(org.apache.log4j.Level.ERROR)
  }

  /** One student-course-score record; Spark infers the schema from this case class by reflection. */
  case class student(classid: Int, name: String, age: Int, sex: String, course: String, score: Double)

  /**
   * Reflection-based variant: reads whitespace-separated student records from a
   * text file, maps each line to a `student` case class (schema inferred via
   * reflection), and runs one aggregation over the `cte_student` temp view.
   *
   * Generalized: the input path may be supplied as args(0); the original
   * hard-coded Windows path remains the default.
   * Fix: the SparkContext is now stopped in a `finally` block.
   */
  def main(args: Array[String]): Unit = {
    init()
    val conf = new SparkConf().setMaster("local[*]").setAppName("spark-rdd-test")
    val sc = new SparkContext(conf)
    try {
      //spark-shell
      val sqlContext = new SQLContext(sc)
      import sqlContext.implicits._ // enable implicit conversions (toDF on Dataset)
      val inputPath = if (args.nonEmpty) args(0) else "F:\\files\\hdfs_file\\student.txt"
      val dataset = sqlContext.read.textFile(inputPath)
      // Line layout: classid name age sex course score, separated by single spaces
      val df = dataset
        .map(line => line.split(" ")) // split on single space
        .map(obj => student(obj(0).toInt, obj(1), obj(2).toInt, obj(3), obj(4), obj(5).toDouble)) // convert via reflection (case class)
        .toDF()
      df.cache() // reused by the collect and the query below
      df.printSchema()
      /*
      root
       |-- classid: integer (nullable = false)
       |-- name: string (nullable = true)
       |-- age: integer (nullable = false)
       |-- sex: string (nullable = true)
       |-- course: string (nullable = true)
       |-- score: double (nullable = false)
      */
      println(df.collect().toList)
      // List(
      //   [12,张小军,25,男,chinese,50.0], [12,张小军,25,男,math,60.0], [12,张小军,25,男,english,70.0], ...
      // )
      df.createOrReplaceTempView("cte_student") // register as a temporary view
      // Per-student total and average score (unordered)
      sqlContext.sql(" select name, sum(score) as s, avg(score) as c from cte_student group by name ").show()
      /*
      +-------+-----+-----------------+
      |   name|    s|                c|
      +-------+-----+-----------------+
      | 李小凤|240.0|             80.0|
      | 张小军|180.0|             60.0|
      | 李小华|278.0|92.66666666666667|
      | 王大力|240.0|             80.0|
      | 王小芳|250.0|83.33333333333333|
      | 张大明|180.0|             60.0|
      +-------+-----+-----------------+
      */
      df.unpersist()
    } finally {
      sc.stop() // always release the local Spark context
    }
  }
}
import org.apache.spark.sql.{Row, SQLContext, SparkSession}
import org.apache.spark.storage.StorageLevel
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.types._
import org.apache.spark.{SparkConf, SparkContext, sql}
import org.apache.spark.SparkContext._
import org.apache.spark.storage.StorageLevel
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.types
import org.apache.spark.sql.types._
import org.apache.spark.sql.Row
object App {

  /** Quiet Spark's internal logging down to ERROR so console output stays readable. */
  def init(): Unit = {
    //Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    org.apache.log4j.Logger.getLogger("org.apache.spark").setLevel(org.apache.log4j.Level.ERROR)
  }

  // Unused in this variant (the schema is built programmatically below); kept for parity with the other demos.
  case class student(classid: Int, name: String, age: Int, sex: String, course: String, score: Double)

  /**
   * Programmatic-schema variant: reads whitespace-separated student records,
   * builds an RDD[Row], attaches an explicitly constructed StructType schema,
   * and runs two aggregations over the `cte_student` temp view.
   *
   * Generalized: the input path may be supplied as args(0); the original
   * hard-coded Windows path remains the default.
   * Fix: the SparkContext is now stopped in a `finally` block.
   */
  def main(args: Array[String]): Unit = {
    init()
    val conf = new SparkConf().setMaster("local[*]").setAppName("spark-rdd-test")
    val sc = new SparkContext(conf)
    try {
      //spark-shell
      val sqlContext = new SQLContext(sc)
      val inputPath = if (args.nonEmpty) args(0) else "F:\\files\\hdfs_file\\student.txt"
      val fileRDD = sc.textFile(inputPath)
      // Build the schema explicitly: field name, field type, nullable
      val schema = StructType(List(
        StructField("classid", IntegerType, true),
        StructField("name", StringType, true),
        StructField("age", IntegerType, true),
        StructField("sex", StringType, true),
        StructField("course", StringType, true),
        StructField("score", DoubleType, true)
      ))
      // Line layout: classid name age sex course score, separated by single spaces
      val rowRDD = fileRDD.map(line => line.split(" "))
        .map(obj => Row(
          obj(0).toInt, obj(1), obj(2).toInt,
          obj(3), obj(4), obj(5).toDouble
        ))
      val df = sqlContext.createDataFrame(rowRDD, schema) // attach the programmatic schema (StructType)
      println(rowRDD.collect().toList)
      // List(
      //   [12,张小军,25,男,chinese,50.0], [12,张小军,25,男,math,60.0], [12,张小军,25,男,english,70.0], ...
      // )
      df.cache() // reused by the two queries below
      df.printSchema()
      /*
      root
       |-- classid: integer (nullable = true)
       |-- name: string (nullable = true)
       |-- age: integer (nullable = true)
       |-- sex: string (nullable = true)
       |-- course: string (nullable = true)
       |-- score: double (nullable = true)
      */
      //df.show()
      df.createOrReplaceTempView("cte_student") // register as a temporary view
      // Per-student total and average score (unordered)
      df.sqlContext.sql(" select name, sum(score) as s, avg(score) as c from cte_student group by name ").show()
      /*
      +-------+-----+-----------------+
      |   name|    s|                c|
      +-------+-----+-----------------+
      | 李小凤|240.0|             80.0|
      | 张小军|180.0|             60.0|
      | 李小华|278.0|92.66666666666667|
      | 王大力|240.0|             80.0|
      | 王小芳|250.0|83.33333333333333|
      | 张大明|180.0|             60.0|
      +-------+-----+-----------------+
      */
      // Same aggregation, ordered by total score (column 2) descending
      sqlContext.sql(" select name, sum(score) as s, avg(score) as c from cte_student group by name order by 2 desc ").show()
      /*
      +-------+-----+-----------------+
      |   name|    s|                c|
      +-------+-----+-----------------+
      | 李小华|278.0|92.66666666666667|
      | 王小芳|250.0|83.33333333333333|
      | 王大力|240.0|             80.0|
      | 李小凤|240.0|             80.0|
      | 张小军|180.0|             60.0|
      | 张大明|180.0|             60.0|
      +-------+-----+-----------------+
      */
      df.unpersist()
    } finally {
      sc.stop() // always release the local Spark context
    }
  }
}