11、TopN实战

1、Java版本:

    1.1、取前3

package sparkcore.java;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;
/**
 * 取前3
 */
public class Top3 {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("Top3")
                .setMaster("local");  
        JavaSparkContext sc = new JavaSparkContext(conf);
    /*
         3
        5
        6
        7
        1
        4
        5
        6
        9
        0
        3
     */
        JavaRDD<String> lines = sc.textFile("top.txt");
        
        JavaPairRDD<Integer, String> pairs = lines.mapToPair(
                
                new PairFunction<String, Integer, String>() {
                    private static final long serialVersionUID = 1L;
                    @Override
                    public Tuple2<Integer, String> call(String tthrows Exception {
                        return new Tuple2<Integer, String>(Integer.valueOf(t), t);
                    }
                    
                });
        
        JavaPairRDD<Integer, String> sortedPairs = pairs.sortByKey(false);
        
        JavaRDD<Integer> sortedNumbers = sortedPairs.map(
                
                new Function<Tuple2<Integer,String>, Integer>() {
                    private static final long serialVersionUID = 1L;
                    @Override
                    public Integer call(Tuple2<Integer, String> v1throws Exception {
                        return v1._1;
                    }
                    
                });
        
        List<Integer> sortedNumberList = sortedNumbers.take(3);
        
        for(Integer num : sortedNumberList) {
            System.out.println(num);
        }
        
        sc.close();
    }
}
输出:
9
7
6

    1.2、每组内取top3

package sparkcore.java;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;
/**
 * 每组内取top3
 */
public class GroupTop3 {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("Top3").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        /*
             class1 90
            class2 56
            class1 87
            class1 76
            class2 88
            class1 95
            class1 74
            class2 87
            class2 67
            class2 77
         */
        JavaRDD<String> lines = sc.textFile("score.txt");
        
        JavaPairRDD<String, Integer> pairs = lines.mapToPair(
                new PairFunction<String, String, Integer>() {
                    private static final long serialVersionUID = 1L;
                    @Override
                    public Tuple2<String, Integer> call(String linethrows Exception {
                        String[] lineSplited = line.split(" ");
                        return new Tuple2<String, Integer>(lineSplited[0], Integer.valueOf(lineSplited[1]));
                    }
                });
        JavaPairRDD<String, Iterable<Integer>> groupedPairs = pairs.groupByKey();
        
        JavaPairRDD<String, Iterable<Integer>> top3Score = groupedPairs.mapToPair(
                new PairFunction<Tuple2<String, Iterable<Integer>>, String, Iterable<Integer>>() {
                    private static final long serialVersionUID = 1L;
                    @Override
                    public Tuple2<String, Iterable<Integer>> call(Tuple2<String, Iterable<Integer>> classScores)
                            throws Exception {
                        Integer[] top3 = new Integer[3];
                        String className = classScores._1;
                        Iterator<Integer> scores = classScores._2.iterator();
                        List<Integer> l = new ArrayList<Integer>();
                        while (scores.hasNext()) {
                            l.add(scores.next());
                        }
                        Collections.sort(lnew Comparator<Integer>() {
                            public int compare(Integer o1, Integer o2) {
                                return -o1.compareTo(o2);
                            }
                        });
                        for (int i = 0; i < l.size(); i++) {
                            if (i < 3) {
                                top3[i] = l.get(i);
                            }
                        }
                        return new Tuple2<String, Iterable<Integer>>(className, Arrays.asList(top3));
                    }
                });
        top3Score.foreach(new VoidFunction<Tuple2<String, Iterable<Integer>>>() {
            private static final long serialVersionUID = 1L;
            @Override
            public void call(Tuple2<String, Iterable<Integer>> tthrows Exception {
                System.out.println("class: " + t._1);
                Iterator<Integer> scoreIterator = t._2.iterator();
                while (scoreIterator.hasNext()) {
                    Integer score = scoreIterator.next();
                    System.out.println(score);
                }
                System.out.println("=======================================");
            }
        });
        sc.close();
    }
}
输出:
class: class1
95
90
87
=======================================
class: class2
88
87
77
=======================================

2、Scala版本:

    2.1、取前3

package sparkcore.scala
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
/**
 * Top3: read integers (one per line) from top.txt and print the 3 largest.
 */
object Top3 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("Top3")
      .setMaster("local")
    val sc = new SparkContext(conf)
    // 1 = minPartitions; a single partition is enough for this small file.
    val lines = sc.textFile("top.txt", 1)
    // Key each line by its numeric value so sortByKey can order numerically.
    val pairs = lines.map { line => (line.toInt, line) }
    // false = descending order, largest values first.
    val sortedPairs = pairs.sortByKey(false)
    val sortedNumbers = sortedPairs.map(sortedPair => sortedPair._1)
    val top3Number = sortedNumbers.take(3)
    for (num <- top3Number) {
      println(num)
    }
    // Release Spark resources before exiting.
    sc.stop()
  }
}
输出:
9
7
6

    2.2、每组内取top3

package sparkcore.scala
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
/**
 * GroupTop3: read "className score" lines from score.txt and print the
 * top-3 scores per class.
 */
object GroupTop3 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("Top3")
      .setMaster("local")
    val sc = new SparkContext(conf)
    // 1 = minPartitions; a single partition is enough for this small file.
    val lines = sc.textFile("score.txt", 1)
    // Split each line once and map to (className, score).
    val linePair = lines.map { line =>
      val fields = line.split(" ")
      (fields(0), fields(1).toInt)
    }
    val groupRdd = linePair.groupByKey()
    // Sort each class's scores descending and keep the first 3.
    val sortRdd = groupRdd.map(g => (g._1,
      g._2.toList.sorted(new Ordering[Int]() {
        // Negated compare => descending order.
        def compare(x: Int, y: Int): Int = {
          - x.compare(y)
        }
      }).take(3)))
    sortRdd.foreach(e => println(e._1 + " : " + e._2))
    // Release Spark resources before exiting.
    sc.stop()
  }
}
输出:
class1 : List(95, 90, 87)
class2 : List(88, 87, 77)

posted @ 2017-07-31 18:03  江正军  阅读(...)  评论(... 编辑 收藏