DistinctOperator
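
Spark's distinct operator is just a composition of simpler operators. The snippet below first builds the deduplication by hand with map + groupByKey, then compares it with the built-in distinct.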

package com.bjsxt.scala.spark.high.operator

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

object DistinctOperator {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local")
      .setAppName("DistinctOperator")
      .set("spark.rdd.compress", "true")
      
    val sc = new SparkContext(conf)
    val rdd = sc.makeRDD(Array(1,2,1,11,6,6,3,2))
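    // source data with duplicates: 1, 2 and 6 each appear twice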
    
    /**
     * map turns each element into a key/value pair, giving a KV-format RDD:
     * (1,null)
     * (2,null)
     * (2,null)
     *
     * The groupByKey/reduceByKey operators then deduplicate on the key:
     * 1 -> [null]
     * 2 -> [null, null]
     */
    rdd.map { (_,null) }.groupByKey().map(_._1).foreach { println }
    
    /**
     * This is exactly how distinct is composed under the hood:
     * rdd.distinct is equivalent to
     * rdd.map { (_, null) }.groupByKey().map(_._1)
     */
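    // A minimal sketch of how distinct can be hand-rolled with reduceByKey
    // instead of groupByKey (this mirrors Spark's internal implementation,
    // which avoids shuffling every duplicate value). The name manualDistinct
    // is just illustrative, not Spark API.
    val manualDistinct = rdd.map { (_, null) }.reduceByKey((a, _) => a).map(_._1)
    manualDistinct.foreach(println)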

    val rdd1 = sc.makeRDD(Array(("A", 1), ("B", 1), ("A", 2), ("B", 1)))

    // distinct compares whole elements, so only the duplicate ("B",1) is dropped
    val distinctRDD = rdd1.distinct

    distinctRDD.foreach(println)

    // mapValues transforms only the value side, leaving keys untouched:
    // ("A",1) becomes ("A",(1,1)), and so on
    val mapValuesRDD = rdd1.mapValues { x => (x, 1) }
    mapValuesRDD.foreach(println)

    sc.stop()
  }
}
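
Running this with a local master prints the deduplicated values for each RDD, e.g. 1, 11, 6, 3, 2 for the first one, though the order may vary once the data is shuffled across partitions.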
