Spark word count with sorting, secondary sort, and top N
package cn.spark.study.core;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;

import java.util.Arrays;

/**
 * @author: yangchun
 * @description:
 * @date: Created in 2020-05-08 12:50
 */
public class SortWordCount {
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setAppName("sortedWordCounts").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(sparkConf);
        JavaRDD<String> lines = sc.textFile("E:\\spark\\spark.txt");
        // split each line into words
        JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public Iterable<String> call(String s) throws Exception {
                return Arrays.asList(s.split(" "));
            }
        });
        // map each word to (word, 1)
        JavaPairRDD<String, Integer> pairs = words.mapToPair(new PairFunction<String, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(String s) throws Exception {
                return new Tuple2<>(s, 1);
            }
        });
        // sum the counts per word
        JavaPairRDD<String, Integer> wordCounts = pairs.reduceByKey(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer integer, Integer integer2) throws Exception {
                return integer + integer2;
            }
        });
        // swap to (count, word) so the count becomes the sort key
        JavaPairRDD<Integer, String> countWords = wordCounts.mapToPair(new PairFunction<Tuple2<String, Integer>, Integer, String>() {
            @Override
            public Tuple2<Integer, String> call(Tuple2<String, Integer> stringIntegerTuple2) throws Exception {
                return new Tuple2<>(stringIntegerTuple2._2, stringIntegerTuple2._1);
            }
        });
        // false = descending order
        JavaPairRDD<Integer, String> sortedCountWords = countWords.sortByKey(false);
        // swap back to (word, count)
        JavaPairRDD<String, Integer> sortedWordCounts = sortedCountWords.mapToPair(new PairFunction<Tuple2<Integer, String>, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(Tuple2<Integer, String> integerStringTuple2) throws Exception {
                return new Tuple2<>(integerStringTuple2._2, integerStringTuple2._1);
            }
        });
        sortedWordCounts.foreach(new VoidFunction<Tuple2<String, Integer>>() {
            @Override
            public void call(Tuple2<String, Integer> stringIntegerTuple2) throws Exception {
                System.out.println(stringIntegerTuple2._1 + " " + stringIntegerTuple2._2);
            }
        });
        sc.close();
    }
}
package cn.spark.study.core

import org.apache.spark.{SparkConf, SparkContext}

/**
 * @author: yangchun
 * @description:
 * @date: Created in 2020-05-08 13:04
 */
object SortedWordCounts {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("sortedWordCounts").setMaster("local")
    val sparkContext = new SparkContext(sparkConf)
    val lines = sparkContext.textFile("E:\\spark\\spark.txt")
    val words = lines.flatMap(line => line.split(" "))
    val pairs = words.map(word => (word, 1))
    val wordCounts = pairs.reduceByKey(_ + _)
    // swap to (count, word), sort by count descending, then swap back
    val countWords = wordCounts.map(wordCount => (wordCount._2, wordCount._1))
    val sortedCountWords = countWords.sortByKey(false)
    val sortedWordCounts = sortedCountWords.map(sortedCountWord => (sortedCountWord._2, sortedCountWord._1))
    sortedWordCounts.foreach(sortedWordCount => println(sortedWordCount._1 + " " + sortedWordCount._2))
  }
}
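Both versions rely on the same swap-sort-swap pattern: sortByKey sorts only by the key, so the (word, count) pairs are flipped to (count, word), sorted descending, and flipped back. As a quick sanity check with a hypothetical spark.txt containing:

hello spark
hello world

the job would print (the two count-1 words may come out in either order):

hello 2
spark 1
world 1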
Secondary sort
package cn.spark.study.core;

import scala.math.Ordered;

import java.io.Serializable;
import java.util.Objects;

/**
 * @author: yangchun
 * @description:
 * @date: Created in 2020-05-08 17:39
 */
public class SecondarySortKey implements Ordered<SecondarySortKey>, Serializable {
    private static final long serialVersionUID = -2366006422945129991L;
    private int first;
    private int second;

    public SecondarySortKey(int first, int second) {
        this.first = first;
        this.second = second;
    }

    @Override
    public int compare(SecondarySortKey that) {
        // compare by first; fall back to second only on ties
        if (this.first - that.first != 0) {
            return this.first - that.first;
        } else {
            return this.second - that.second;
        }
    }

    @Override
    public boolean $less(SecondarySortKey that) {
        if (this.first < that.first) {
            return true;
        } else if (this.first == that.first && this.second < that.second) {
            return true;
        }
        return false;
    }

    @Override
    public boolean $greater(SecondarySortKey that) {
        if (this.first > that.first) {
            return true;
        } else if (this.first == that.first && this.second > that.second) {
            return true;
        }
        return false;
    }

    @Override
    public boolean $less$eq(SecondarySortKey that) {
        if (this.first < that.first) {
            return true;
        } else if (this.first == that.first && this.second <= that.second) {
            return true;
        }
        return false;
    }

    @Override
    public boolean $greater$eq(SecondarySortKey that) {
        if (this.first > that.first) {
            return true;
        } else if (this.first == that.first && this.second >= that.second) {
            return true;
        }
        return false;
    }

    @Override
    public int compareTo(SecondarySortKey that) {
        return compare(that);
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (o == null || getClass() != o.getClass()) return false;
        SecondarySortKey that = (SecondarySortKey) o;
        return first == that.first && second == that.second;
    }

    @Override
    public int hashCode() {
        return Objects.hash(first, second);
    }

    public int getFirst() { return first; }
    public void setFirst(int first) { this.first = first; }
    public int getSecond() { return second; }
    public void setSecond(int second) { this.second = second; }
}
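Each of the four comparison operators above restates the ordering by hand, which is exactly where a flipped check sneaks in. A more defensive sketch derives all four from compare, so the ordering logic lives in one place:

@Override
public boolean $less(SecondarySortKey that) {
    return compare(that) < 0;
}

@Override
public boolean $greater(SecondarySortKey that) {
    return compare(that) > 0;
}

@Override
public boolean $less$eq(SecondarySortKey that) {
    return compare(that) <= 0;
}

@Override
public boolean $greater$eq(SecondarySortKey that) {
    return compare(that) >= 0;
}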
package cn.spark.study.core;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.*;
import scala.Tuple2;

/**
 * @author: yangchun
 * @description:
 * @date: Created in 2020-05-08 18:46
 */
public class SecondarySort {
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setAppName("sort").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(sparkConf);
        JavaRDD<String> lines = sc.textFile("E:\\spark\\sort.txt");
        // key each line "first second" by a composite SecondarySortKey
        JavaPairRDD<SecondarySortKey, String> pairs = lines.mapToPair(new PairFunction<String, SecondarySortKey, String>() {
            @Override
            public Tuple2<SecondarySortKey, String> call(String s) throws Exception {
                String[] number = s.split(" ");
                return new Tuple2<>(new SecondarySortKey(Integer.parseInt(number[0]), Integer.parseInt(number[1])), s);
            }
        });
        // false = descending order
        JavaPairRDD<SecondarySortKey, String> sortedPairs = pairs.sortByKey(false);
        // drop the key, keeping only the original line
        lines = sortedPairs.map(new Function<Tuple2<SecondarySortKey, String>, String>() {
            @Override
            public String call(Tuple2<SecondarySortKey, String> secondarySortKeyStringTuple2) throws Exception {
                return secondarySortKeyStringTuple2._2;
            }
        });
        lines.foreach(new VoidFunction<String>() {
            @Override
            public void call(String s) throws Exception {
                System.out.println(s);
            }
        });
        sc.close();
    }
}
scala
package cn.spark.study.core

/**
 * @author: yangchun
 * @description:
 * @date: Created in 2020-05-09 11:41
 */
class SecondSortKey(val first: Int, val second: Int) extends Ordered[SecondSortKey] with Serializable {
  override def compare(that: SecondSortKey): Int = {
    if (this.first == that.first) {
      this.second - that.second
    } else {
      this.first - that.first
    }
  }
}
package cn.spark.study.core

import org.apache.spark.{SparkConf, SparkContext}

/**
 * @author: yangchun
 * @description:
 * @date: Created in 2020-05-09 12:18
 */
object SecondSort {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("secondSort").setMaster("local")
    val sparkContext = new SparkContext(sparkConf)
    val lines = sparkContext.textFile("E:\\spark\\sort.txt")
    val pairs = lines.map(line => (new SecondSortKey(line.split(" ")(0).toInt, line.split(" ")(1).toInt), line))
    val sortPairs = pairs.sortByKey(false)
    val sortLines = sortPairs.map(sortPair => sortPair._2)
    sortLines.foreach(println(_))
  }
}
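As a worked example, take a hypothetical sort.txt:

1 5
2 4
3 6
1 3
2 1

With sortByKey(false) the keys are compared descending by the first field and, on ties, by the second, so both versions print:

3 6
2 4
2 1
1 5
1 3

Passing true (or omitting the argument) gives the usual ascending secondary sort.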
Top N
package cn.spark.study.core;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;

import java.util.List;

/**
 * @author: yangchun
 * @description:
 * @date: Created in 2020-05-09 12:29
 */
public class Top3 {
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setAppName("top3").setMaster("local");
        JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);
        JavaRDD<String> lines = sparkContext.textFile("E:\\spark\\top.txt");
        // key each line by its numeric value so it can be sorted
        JavaPairRDD<Integer, String> pairs = lines.mapToPair(new PairFunction<String, Integer, String>() {
            @Override
            public Tuple2<Integer, String> call(String s) throws Exception {
                return new Tuple2<>(Integer.parseInt(s), s);
            }
        });
        // descending sort, then drop the key
        JavaPairRDD<Integer, String> sortedPairs = pairs.sortByKey(false);
        JavaRDD<String> sortedNumbers = sortedPairs.map(new Function<Tuple2<Integer, String>, String>() {
            @Override
            public String call(Tuple2<Integer, String> integerStringTuple2) throws Exception {
                return integerStringTuple2._2;
            }
        });
        // take(3) pulls the first three elements of the sorted RDD to the driver
        List<String> top3 = sortedNumbers.take(3);
        for (String number : top3) {
            System.out.println(number);
        }
        sparkContext.close();
    }
}
scala
package cn.spark.study.core

import org.apache.spark.{SparkConf, SparkContext}

/**
 * @author: yangchun
 * @description:
 * @date: Created in 2020-05-09 12:53
 */
object Top3 {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("top").setMaster("local")
    val sparkContext = new SparkContext(sparkConf)
    val lines = sparkContext.textFile("E:\\spark\\top.txt")
    val pairs = lines.map(line => (line.toInt, line))
    val sortPairs = pairs.sortByKey(false)
    val sortLine = sortPairs.map(pair => pair._2)
    val top = sortLine.take(3)
    top.foreach(println(_))
  }
}
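Assuming top.txt holds one integer per line, for example:

3
9
6
1
8

take(3) returns the first three elements of the descending sort to the driver:

9
8
6

For a plain top N like this, Spark's built-in top(3) on the RDD would give the same result without keying and fully sorting the dataset; the sortByKey route is shown because it generalizes to the grouped version below.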
Top 3 per group
package cn.spark.study.core;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;

import java.util.Arrays;
import java.util.Iterator;

/**
 * @author: yangchun
 * @description:
 * @date: Created in 2020-05-09 12:29
 */
public class GroupTop3 {
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setAppName("top3").setMaster("local");
        JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);
        JavaRDD<String> lines = sparkContext.textFile("E:\\spark\\score.txt");
        // each line is "className score"
        JavaPairRDD<String, Integer> pairs = lines.mapToPair(new PairFunction<String, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(String s) throws Exception {
                String[] args = s.split(" ");
                return new Tuple2<>(args[0], Integer.parseInt(args[1]));
            }
        });
        // group all scores by class
        JavaPairRDD<String, Iterable<Integer>> groupPairs = pairs.groupByKey();
        // keep a descending, fixed-size top-3 buffer per class
        JavaPairRDD<String, Iterable<Integer>> sortedPairs = groupPairs.mapToPair(new PairFunction<Tuple2<String, Iterable<Integer>>, String, Iterable<Integer>>() {
            @Override
            public Tuple2<String, Iterable<Integer>> call(Tuple2<String, Iterable<Integer>> stringIterableTuple2) throws Exception {
                String classStr = stringIterableTuple2._1;
                Integer[] top3 = new Integer[3];
                Iterator<Integer> iterator = stringIterableTuple2._2.iterator();
                while (iterator.hasNext()) {
                    Integer score = iterator.next();
                    for (int i = 0; i < 3; i++) {
                        if (top3[i] == null) {
                            // empty slot: the score lands here
                            top3[i] = score;
                            break;
                        } else if (score > top3[i]) {
                            // shift smaller scores down, then insert
                            for (int j = 2; j > i; j--) {
                                top3[j] = top3[j - 1];
                            }
                            top3[i] = score;
                            break;
                        }
                    }
                }
                return new Tuple2<String, Iterable<Integer>>(classStr, Arrays.asList(top3));
            }
        });
        sortedPairs.foreach(new VoidFunction<Tuple2<String, Iterable<Integer>>>() {
            @Override
            public void call(Tuple2<String, Iterable<Integer>> stringIterableTuple2) throws Exception {
                System.out.println("============================================");
                System.out.println("class:" + stringIterableTuple2._1);
                Iterator<Integer> iterator = stringIterableTuple2._2.iterator();
                while (iterator.hasNext()) {
                    System.out.println(iterator.next());
                }
            }
        });
        sparkContext.close();
    }
}
scala
package cn.spark.study.core

import org.apache.spark.{SparkConf, SparkContext}

/**
 * @author: yangchun
 * @description:
 * @date: Created in 2020-05-09 13:32
 */
object GroupTop3 {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("top").setMaster("local")
    val sparkContext = new SparkContext(sparkConf)
    val lines = sparkContext.textFile("E:\\spark\\score.txt")
    val pairs = lines.map(line => {
      val args = line.split(" ")
      (args(0), args(1).toInt)
    })
    val groupPairs = pairs.groupByKey()
    val top3Scores = groupPairs.map(pair => {
      val scores = pair._2
      // 0 marks an empty slot, so this version assumes all scores are positive
      val top3 = new Array[Int](3)
      scores.foreach(score => {
        var flag = true
        for (i <- 0 to 2 if flag) {
          if (top3(i) == 0) {
            top3(i) = score
            flag = false
          } else if (score > top3(i)) {
            // shift smaller scores down, then insert
            for (j <- (i + 1 to 2).reverse) {
              top3(j) = top3(j - 1)
            }
            top3(i) = score
            flag = false
          }
        }
      })
      (pair._1, top3)
    })
    top3Scores.foreach(classScore => {
      println("========================")
      println(classScore._1)
      classScore._2.foreach(println(_))
    })
  }
}
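Assuming a hypothetical score.txt of class-name/score pairs such as:

class1 90
class2 87
class1 97
class2 89
class1 80
class2 75
class1 66

the Java version would print, per class (separator lines omitted, class order may vary):

class:class1
97
90
80
class:class2
89
87
75

The insertion loop itself only ever holds three values per class, but groupByKey still materializes every score for a key in memory before the loop runs. For skewed keys, combining the top-3 buffers as the data is aggregated (e.g. via aggregateByKey) avoids that cost; at this scale groupByKey is fine.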