RDD programming API:
1. All transformations on an RDD are lazily evaluated: they do not compute a result right away, they only record the operations applied to the base dataset (for example, a file). The transformations actually run only when an action is triggered that requires a result to be returned to the Driver.
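A minimal sketch to observe this laziness (the object and variable names here are only illustrative, not part of the original code): the println inside the map closure does not fire when the transformation is defined, only when the count() action forces the computation.
package day02

import org.apache.spark.{SparkConf, SparkContext}

object LazyEvalDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("LazyEvalDemo").setMaster("local")
    val sc = new SparkContext(conf)
    val numbers = sc.parallelize(Array(1, 2, 3, 4, 5), 1)

    // The transformation is only recorded here; no element is touched yet.
    val doubled = numbers.map { num =>
      println("computing " + num) // printed only once an action runs
      num * 2
    }
    println("transformation defined, nothing computed yet")

    // count() is an action, so only now are the map closures actually executed.
    println("count = " + doubled.count())
    sc.stop()
  }
}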
Summary of common Spark operators (Part 1)
package day02

import org.apache.spark.{SparkConf, SparkContext}

object RddTest {
  def main(args: Array[String]): Unit = {
    // Only run one of these examples at a time, since each method creates its own SparkContext.
    // map()
    // filter()
    // flatmap()
    // groupbykey()
    // reducebykey()
    // sortByKey()
    join()
  }
  def map(): Unit = {
    // Multiply every element of the collection by 2.
    val conf = new SparkConf().setAppName("SparkRDD").setMaster("local")
    val sc = new SparkContext(conf)
    val number = Array(1, 2, 3, 4, 5)
    val numberRDD = sc.parallelize(number, 1)
    // map takes a function object and maps each element to num * 2.
    val multipliedRdd = numberRDD.map { num => num * 2 }
    // Print each resulting num.
    multipliedRdd.foreach { num => println(num) }
  }
  def filter(): Unit = {
    // filter: keep only the elements that satisfy a predicate.
    val conf = new SparkConf().setAppName("SparkRDD").setMaster("local")
    val sc = new SparkContext(conf)
    val number = Array(1, 2, 3, 4, 5)
    val numberRDD = sc.parallelize(number, 1)
    // {} and () are interchangeable when passing the function.
    // Keep the even numbers: filter traverses the collection and applies the predicate to each element.
    val evenNumRDD = numberRDD.filter { num => num % 2 == 0 }
    evenNumRDD.foreach { num => println(num) }
  }
  def flatmap(): Unit = {
    // Split each line of text into individual words.
    val conf = new SparkConf().setAppName("SparkRDD").setMaster("local")
    val sc = new SparkContext(conf)
    val lineArray = Array("hello java", "hello python", "hello R", "hello you")
    val lines = sc.parallelize(lineArray, 1)
    val words = lines.flatMap { line => line.split(" ") }
    words.foreach { word => println(word) }
  }
  def groupbykey(): Unit = {
    val conf = new SparkConf().setAppName("SparkRDD").setMaster("local")
    val sc = new SparkContext(conf)
    val scoreList = Array(
      Tuple2("class1", 34), Tuple2("class2", 26),
      Tuple2("class1", 69), Tuple2("class2", 87)
    )
    val scores = sc.parallelize(scoreList, 1)
    // groupByKey gathers all values of the same key into one Iterable.
    val groupedScore = scores.groupByKey()
    groupedScore.foreach { score =>
      println(score._1)
      score._2.foreach(single => println(single))
    }
    /*
     class1 34 69
     class2 26 87
     */
  }
  def reducebykey(): Unit = {
    val conf = new SparkConf().setAppName("SparkRDD").setMaster("local")
    val sc = new SparkContext(conf)
    val scoreList = Array(
      Tuple2("class1", 34), Tuple2("class2", 26),
      Tuple2("class1", 69), Tuple2("class2", 87)
    )
    val scores = sc.parallelize(scoreList, 1)
    // Merge the values of identical keys, so each key ends up with a single record.
    val totalScore = scores.reduceByKey(_ + _)
    totalScore.foreach(classScore => println(classScore._1 + " " + classScore._2))
    /* class1 103   class2 113 */
  }
  def sortByKey(): Unit = {
    val conf = new SparkConf().setAppName("SparkRDD").setMaster("local")
    val sc = new SparkContext(conf)
    val sortList = Array(
      Tuple2(3, "xiaoming"), Tuple2(113, "xiaoqiang"), Tuple2(132, "xiaolv"), Tuple2(43, "xiaoxiao")
    )
    val scores = sc.parallelize(sortList, 1)
    val sortedScore = scores.sortByKey()
    sortedScore.foreach { sorted =>
      println(sorted._1 + " " + sorted._2)
    }
    /*
     3 xiaoming
     43 xiaoxiao
     113 xiaoqiang
     132 xiaolv
     */
  }
  def join(): Unit = {
    // Join the two RDDs on student id and print each student's score.
    val conf = new SparkConf().setAppName("SparkRDD").setMaster("local")
    val sc = new SparkContext(conf)
    val studentList = Array(
      Tuple2(1, "loe"),
      Tuple2(2, "jiek"),
      Tuple2(3, "tom")
    )
    val scoreList = Array(
      Tuple2(1, 23),
      Tuple2(2, 35),
      Tuple2(3, 24)
    )
    val students = sc.parallelize(studentList)
    val scores = sc.parallelize(scoreList)
    // join produces (id, (name, score)) for every id present in both RDDs.
    val studentScores = students.join(scores)
    studentScores.foreach { studentScore =>
      println("student id: " + studentScore._1)
      println("student name: " + studentScore._2._1)
      println("student score: " + studentScore._2._2)
      println("-------------------------------")
    }
    /*
     student id: 1
     student name: loe
     student score: 23
     -------------------------------
     student id: 3
     student name: tom
     student score: 24
     -------------------------------
     student id: 2
     student name: jiek
     student score: 35
     -------------------------------
     */
  }
}
Common action operations
package day02
import org.apache.spark.{SparkConf, SparkContext}
object ActionRDD {
def main(args: Array[String]): Unit = {
// reduce()
// countBykey()
// collect()
take()
}
  def reduce(): Unit = {
    val conf = new SparkConf().setAppName("SparkRDD").setMaster("local")
    val sc = new SparkContext(conf)
    val numberArray = Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
    val numbers = sc.parallelize(numberArray, 1)
    val sum = numbers.reduce(_ + _) // aggregate the elements into a single sum
    println(sum)
  }
  def collect(): Unit = {
    val conf = new SparkConf().setAppName("SparkRDD").setMaster("local")
    val sc = new SparkContext(conf)
    val numberArray = Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
    val numbers = sc.parallelize(numberArray, 1)
    val doubleNumbers = numbers.map(num => num * 2)
    println(doubleNumbers) // MapPartitionsRDD[1] at map at ActionRDD.scala:24
    println("------------------")
    // collect pulls the data of the distributed doubleNumbers RDD back to the driver;
    // for large datasets prefer processing the RDD with the foreach action instead.
    val doubleNumberArray = doubleNumbers.collect()
    for (num <- doubleNumberArray) {
      println(num) // 2 4 6 8 10 12 14 16 18 20
    }
  }
  def countBykey(): Unit = {
    val conf = new SparkConf().setAppName("SparkRDD").setMaster("local")
    val sc = new SparkContext(conf)
    val studentList = Array(
      Tuple2("class1", "jiek"), Tuple2("class2", "tpm"),
      Tuple2("class1", "root"), Tuple2("class2", "user")
    )
    val students = sc.parallelize(studentList, 1)
    // Count how many elements there are for each key.
    val studentCount = students.countByKey()
    println(studentCount)
    println("------------------------")
  }
  def take(): Unit = {
    val conf = new SparkConf().setAppName("SparkRDD").setMaster("local")
    val sc = new SparkContext(conf)
    val numberArray = Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
    val numbers = sc.parallelize(numberArray, 1)
    val doubleNumbers = numbers.map(num => num * 2)
    // take does not sort; it simply returns the first three elements of the RDD.
    val top3Nums = doubleNumbers.take(3)
    for (num <- top3Nums) {
      println(num) // 2 4 6
    }
  }
}
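Since take returns elements in their existing order rather than sorting them, here is a minimal sketch (a separate demo object, not part of the original code) contrasting take with the sorting actions top and takeOrdered:
package day02

import org.apache.spark.{SparkConf, SparkContext}

object TakeVsTop {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("TakeVsTop").setMaster("local")
    val sc = new SparkContext(conf)
    val numbers = sc.parallelize(Array(5, 1, 9, 3, 7), 1)

    // take(3): the first three elements in RDD order -> 5 1 9
    numbers.take(3).foreach(println)

    // top(3): the three largest elements -> 9 7 5
    numbers.top(3).foreach(println)

    // takeOrdered(3): the three smallest elements -> 1 3 5
    numbers.takeOrdered(3).foreach(println)

    sc.stop()
  }
}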
Summary of the join operator
package day02
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object joinOperation {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("SparkRDD").setMaster("local")
val sc = new SparkContext(conf)
    val nameList = Array(Tuple2(1, "xiao"), Tuple2(2, "cww"), Tuple2(3, "wd"), Tuple2(4, "wd"))
    val scoreList = Array(Tuple2(1, 123), Tuple2(2, 34), Tuple2(3, 87))
    // makeRDD turns a local collection into an RDD; the 3 means three partitions are created.
    // parallelize does the same thing: both are the first step of building an RDD from a local sequence.
    // RDD[(Int, String)]: Int is the key type of NameRDD, String is its value type.
    val NameRDD: RDD[(Int, String)] = sc.makeRDD(nameList, 3)
    val ScoreRDD = sc.parallelize(scoreList, 1)
    // RDD[(Int, (Int, String))]: the key, then (score from ScoreRDD, name from NameRDD).
    // join is an inner join, so Tuple2(4, "wd") has no match and is dropped.
    val resultRDD: RDD[(Int, (Int, String))] = ScoreRDD.join(NameRDD)
    // leftOuterJoin keeps every key of the left-hand RDD (here NameRDD);
    // missing right-hand values show up as None.
    val leftOuterJoinResultRDD = NameRDD.leftOuterJoin(ScoreRDD)
    resultRDD.foreachPartition { x =>
      while (x.hasNext) {
        val log = x.next
        val id = log._1
        val name = log._2._2
        val score = log._2._1
        println("id: " + id + "\t name:" + name + "\t score:" + score)
        /*
         id: 1	 name:xiao	 score:123
         id: 2	 name:cww	 score:34
         id: 3	 name:wd	 score:87
         */
      }
    }
    leftOuterJoinResultRDD.foreachPartition { x =>
      while (x.hasNext) {
        val log = x.next
        val id = log._1
        val name = log._2._1
        val score = log._2._2
        println("id: " + id + "\t name:" + name + "\t score:" + score)
        /*
         id: 3	 name:wd	 score:Some(87)
         id: 4	 name:wd	 score:None
         id: 1	 name:xiao	 score:Some(123)
         id: 2	 name:cww	 score:Some(34)
         */
      }
    }
}
}
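For completeness, a minimal sketch (again a separate demo object, not part of the original code) of the remaining join variants on the same data: rightOuterJoin keeps every key of the right-hand RDD, and fullOuterJoin keeps keys from both sides, wrapping missing values in Option.
package day02

import org.apache.spark.{SparkConf, SparkContext}

object OuterJoinDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("OuterJoinDemo").setMaster("local")
    val sc = new SparkContext(conf)

    val names = sc.parallelize(Array((1, "xiao"), (2, "cww"), (3, "wd"), (4, "wd")))
    val scores = sc.parallelize(Array((1, 123), (2, 34), (3, 87)))

    // rightOuterJoin: every key of scores is kept; the name side is an Option.
    // e.g. (1,(Some(xiao),123)) (2,(Some(cww),34)) (3,(Some(wd),87))
    names.rightOuterJoin(scores).foreach(println)

    // fullOuterJoin: keys from both sides are kept, both values wrapped in Option.
    // key 4 appears as (4,(Some(wd),None))
    names.fullOuterJoin(scores).foreach(println)

    sc.stop()
  }
}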
More common operators: mapPartitionsWithIndex, zipWithIndex, zip, and countByValue
package day02
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.mutable.ListBuffer
object Transformations {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local").setAppName("test")
val sc = new SparkContext(conf)
    val arr = Array("ABC", "Abc1", "Abc10")
    val rdds = sc.parallelize(arr, 3)
    val rdd = sc.parallelize(arr, 3)
    val rdd1 = rdds.mapPartitionsWithIndex((index, iter) => {
      val list = new ListBuffer[String]()
      while (iter.hasNext) {
        list.+=("rdds partition index = " + index + ",value = " + iter.next())
      }
      list.iterator // return an iterator over the buffered strings
    }, true)
    rdd1.foreach(println)
    /*
     rdds partition index = 0,value = ABC
     rdds partition index = 1,value = Abc1
     rdds partition index = 2,value = Abc10
     */
    println("--------------------")
    // zipWithIndex pairs each element with its index within the RDD.
    rdd.zipWithIndex().foreach(println)
    /*
     (ABC,0)
     (Abc1,1)
     (Abc10,2)
     */
    // zip pairs the elements of two RDDs positionally; both RDDs must have the
    // same number of partitions and the same number of elements per partition.
    // rdd.zip(rdds).foreach(println)
    /*
     (ABC,ABC)
     (Abc1,Abc1)
     (Abc10,Abc10)
     */
    // countByValue returns a Map from each distinct value to how often it occurs.
    rdd.countByValue().foreach(println)
    /*
     (ABC,1)
     (Abc1,1)
     (Abc10,1)
     */
sc.stop()
}
}
Build the jar with Maven and submit the job to the cluster; --class must be the fully qualified object name (here day02.ActionRDD):
sudo ./bin/spark-submit --class day02.ActionRDD --executor-memory 20M --executor-cores 1 /home/hadoop/spark-1.4.0-bin-hadoop2.3/lib/sfd-1.0-SNAPSHOT.jar