Spark: wordcount sorted by key

java

import java.util.Arrays;
import java.util.Iterator;
import java.util.stream.Stream;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Tuple2;

/**
 * WordCount sorted by word frequency.
 * @author Tele
 */
public class SortWordCount {
    private static SparkConf conf = new SparkConf().setMaster("local").setAppName("sortwordcount");
    private static JavaSparkContext jsc = new JavaSparkContext(conf);
    private static String path = "D:\\inputword\\result.txt";

    public static void main(String[] args) {
        JavaRDD<String> rdd = jsc.textFile(path);

        // The (word, 1) pairs can also be built in two steps, flatMap followed by mapToPair:
        /*
         * JavaRDD<String> lines = rdd.flatMap(new FlatMapFunction<String, String>() {
         *
         *     private static final long serialVersionUID = 1L;
         *
         *     @Override
         *     public Iterator<String> call(String t) throws Exception {
         *         return Arrays.asList(t.split(" ")).iterator();
         *     }
         * });
         *
         * JavaPairRDD<String, Integer> tuples = lines.mapToPair(new PairFunction<String, String, Integer>() {
         *
         *     private static final long serialVersionUID = 1L;
         *
         *     @Override
         *     public Tuple2<String, Integer> call(String t) throws Exception {
         *         return new Tuple2<String, Integer>(t, 1);
         *     }
         * });
         */

        // Split each line into words and emit one (word, 1) pair per word
        JavaPairRDD<String, Integer> tuples = rdd.flatMapToPair(new PairFlatMapFunction<String, String, Integer>() {

            private static final long serialVersionUID = 1L;

            @Override
            public Iterator<Tuple2<String, Integer>> call(String t) throws Exception {
                Stream<Tuple2<String, Integer>> stream = Arrays.asList(t.split(" ")).stream()
                        .map(i -> new Tuple2<>(i, 1));
                return stream.iterator();
            }
        });

        // Sum the occurrences of each word
        JavaPairRDD<String, Integer> wc = tuples.reduceByKey(new Function2<Integer, Integer, Integer>() {

            private static final long serialVersionUID = 1L;

            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        });

        // Swap word and count so that the count becomes the key
        JavaPairRDD<Integer, String> cw = wc.mapToPair(new PairFunction<Tuple2<String, Integer>, Integer, String>() {

            private static final long serialVersionUID = 1L;

            @Override
            public Tuple2<Integer, String> call(Tuple2<String, Integer> t) throws Exception {
                return new Tuple2<Integer, String>(t._2, t._1);
            }
        });

        // Sort by key (the count) in descending order
        JavaPairRDD<Integer, String> result = cw.sortByKey(false);
        result.foreach(new VoidFunction<Tuple2<Integer, String>>() {

            private static final long serialVersionUID = 1L;

            @Override
            public void call(Tuple2<Integer, String> t) throws Exception {
                System.out.println(t._2 + "----" + t._1);
            }
        });

        // Alternatively, swap back to the (word, count) form after sorting:
        /*
         * JavaPairRDD<String, Integer> result = cw.sortByKey(false).mapToPair(
         *         new PairFunction<Tuple2<Integer, String>, String, Integer>() {
         *
         *     private static final long serialVersionUID = 1L;
         *
         *     @Override
         *     public Tuple2<String, Integer> call(Tuple2<Integer, String> t) throws Exception {
         *         return new Tuple2<String, Integer>(t._2, t._1);
         *     }
         * });
         *
         * result.foreach(new VoidFunction<Tuple2<String, Integer>>() {
         *
         *     private static final long serialVersionUID = 1L;
         *
         *     @Override
         *     public void call(Tuple2<String, Integer> t) throws Exception {
         *         System.out.println(t._1 + "-------" + t._2);
         *     }
         * });
         */

        jsc.close();
    }
}
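
Every anonymous class above implements a single-method, Serializable Spark function interface, so on Java 8+ the whole pipeline collapses to a few lambdas. A minimal sketch of the same job, reusing the jsc and path fields from the class above and assuming Spark 2.x (where flatMapToPair expects an Iterator):

        // Same pipeline with Java 8 lambdas (sketch; reuses jsc and path from above)
        JavaRDD<String> lines = jsc.textFile(path);
        lines.flatMapToPair(line -> Arrays.asList(line.split(" ")).stream()
                        .map(word -> new Tuple2<>(word, 1))
                        .iterator())
                .reduceByKey(Integer::sum)                // sum counts per word
                .mapToPair(t -> new Tuple2<>(t._2, t._1)) // (word, count) -> (count, word)
                .sortByKey(false)                         // descending by count
                .foreach(t -> System.out.println(t._2 + "----" + t._1));

The behavior is identical; the lambdas target the same function interfaces, only the syntax is shorter.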

scala

import org.apache.spark.{SparkConf, SparkContext}

object SortWordCount {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("sortwordcount")
    val sc = new SparkContext(conf)

    val rdd = sc.textFile("D:\\inputword\\result.txt", 1)

    // Count words, swap to (count, word), sort descending, swap back, print
    val wordcount = rdd.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)
    wordcount.map(t => (t._2, t._1))
      .sortByKey(false, 1)
      .map(t => (t._2, t._1))
      .foreach(t => println(t._1 + "-----" + t._2))

    sc.stop()
  }
}
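
The two map swaps exist only because sortByKey orders by the pair's key. RDD.sortBy takes an arbitrary key function, so the swap can be skipped entirely; a minimal sketch, reusing wordcount from main above:

    // Sort on the count field directly instead of swapping the pair (sketch)
    wordcount.sortBy(_._2, ascending = false, numPartitions = 1)
      .foreach(t => println(t._1 + "-----" + t._2))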
