Spark word count with sorting, secondary sort, and top N
package cn.spark.study.core;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;

import java.util.Arrays;

/**
 * @author: yangchun
 * @description:
 * @date: Created in 2020-05-08 12:50
 */
public class SortWordCount {
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setAppName("sortedWordCounts").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(sparkConf);
        JavaRDD<String> lines = sc.textFile("E:\\spark\\spark.txt");
        // split each line into words
        JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public Iterable<String> call(String s) throws Exception {
                return Arrays.asList(s.split(" "));
            }
        });
        // map each word to (word, 1)
        JavaPairRDD<String, Integer> pairs = words.mapToPair(new PairFunction<String, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(String s) throws Exception {
                return new Tuple2<>(s, 1);
            }
        });
        // sum the counts per word
        JavaPairRDD<String, Integer> wordCounts = pairs.reduceByKey(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer integer, Integer integer2) throws Exception {
                return integer + integer2;
            }
        });
        // swap to (count, word) so the count becomes the sort key
        JavaPairRDD<Integer, String> countWords = wordCounts.mapToPair(new PairFunction<Tuple2<String, Integer>, Integer, String>() {
            @Override
            public Tuple2<Integer, String> call(Tuple2<String, Integer> stringIntegerTuple2) throws Exception {
                return new Tuple2<>(stringIntegerTuple2._2, stringIntegerTuple2._1);
            }
        });
        // false = descending order
        JavaPairRDD<Integer, String> sortedCountWords = countWords.sortByKey(false);
        // swap back to (word, count)
        JavaPairRDD<String, Integer> sortedWordCounts = sortedCountWords.mapToPair(new PairFunction<Tuple2<Integer, String>, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(Tuple2<Integer, String> integerStringTuple2) throws Exception {
                return new Tuple2<>(integerStringTuple2._2, integerStringTuple2._1);
            }
        });
        sortedWordCounts.foreach(new VoidFunction<Tuple2<String, Integer>>() {
            @Override
            public void call(Tuple2<String, Integer> stringIntegerTuple2) throws Exception {
                System.out.println(stringIntegerTuple2._1 + " " + stringIntegerTuple2._2);
            }
        });
        sc.close();
    }
}
package cn.spark.study.core

import org.apache.spark.{SparkConf, SparkContext}

/**
 * @author: yangchun
 * @description:
 * @date: Created in 2020-05-08 13:04
 */
object SortedWordCounts {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("sortedWordCounts").setMaster("local")
    val sparkContext = new SparkContext(sparkConf)
    val lines = sparkContext.textFile("E:\\spark\\spark.txt")
    val words = lines.flatMap(line => line.split(" "))
    val pairs = words.map(word => (word, 1))
    val wordCounts = pairs.reduceByKey(_ + _)
    // swap to (count, word), sort by count descending, then swap back
    val countWords = wordCounts.map(wordCount => (wordCount._2, wordCount._1))
    val sortedCountWords = countWords.sortByKey(false)
    val sortedWordCounts = sortedCountWords.map(sortedCountWord => (sortedCountWord._2, sortedCountWord._1))
    sortedWordCounts.foreach(sortedWordCount => println(sortedWordCount._1 + " " + sortedWordCount._2))
  }
}
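Both versions rely on the same swap-sort-swap pattern: sortByKey sorts only by the key, so the (word, count) pairs are flipped to (count, word), sorted descending, and flipped back. As a quick sanity check with a hypothetical spark.txt containing:

hello spark
hello world

the job would print (the two count-1 words may come out in either order):

hello 2
spark 1
world 1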
Secondary sort
package cn.spark.study.core;

import scala.math.Ordered;

import java.io.Serializable;
import java.util.Objects;

/**
 * @author: yangchun
 * @description:
 * @date: Created in 2020-05-08 17:39
 */
public class SecondarySortKey implements Ordered<SecondarySortKey>, Serializable {
    private static final long serialVersionUID = -2366006422945129991L;
    private int first;
    private int second;

    public SecondarySortKey(int first, int second) {
        this.first = first;
        this.second = second;
    }

    @Override
    public int compare(SecondarySortKey that) {
        // compare by first; fall back to second only on ties
        if (this.first - that.first != 0) {
            return this.first - that.first;
        } else {
            return this.second - that.second;
        }
    }

    @Override
    public boolean $less(SecondarySortKey that) {
        if (this.first < that.first) {
            return true;
        } else if (this.first == that.first && this.second < that.second) {
            return true;
        }
        return false;
    }

    @Override
    public boolean $greater(SecondarySortKey that) {
        if (this.first > that.first) {
            return true;
        } else if (this.first == that.first && this.second > that.second) {
            return true;
        }
        return false;
    }

    @Override
    public boolean $less$eq(SecondarySortKey that) {
        if (this.first < that.first) {
            return true;
        } else if (this.first == that.first && this.second <= that.second) {
            return true;
        }
        return false;
    }

    @Override
    public boolean $greater$eq(SecondarySortKey that) {
        if (this.first > that.first) {
            return true;
        } else if (this.first == that.first && this.second >= that.second) {
            return true;
        }
        return false;
    }

    @Override
    public int compareTo(SecondarySortKey that) {
        return compare(that);
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (o == null || getClass() != o.getClass()) return false;
        SecondarySortKey that = (SecondarySortKey) o;
        return first == that.first && second == that.second;
    }

    @Override
    public int hashCode() {
        return Objects.hash(first, second);
    }

    public int getFirst() { return first; }
    public void setFirst(int first) { this.first = first; }
    public int getSecond() { return second; }
    public void setSecond(int second) { this.second = second; }
}
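Each of the four comparison operators above restates the ordering by hand, which is exactly where a flipped check sneaks in. A more defensive sketch derives all four from compare, so the ordering logic lives in one place:

@Override
public boolean $less(SecondarySortKey that) {
    return compare(that) < 0;
}

@Override
public boolean $greater(SecondarySortKey that) {
    return compare(that) > 0;
}

@Override
public boolean $less$eq(SecondarySortKey that) {
    return compare(that) <= 0;
}

@Override
public boolean $greater$eq(SecondarySortKey that) {
    return compare(that) >= 0;
}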
package cn.spark.study.core;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.*;
import scala.Tuple2;

/**
 * @author: yangchun
 * @description:
 * @date: Created in 2020-05-08 18:46
 */
public class SecondarySort {
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setAppName("sort").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(sparkConf);
        JavaRDD<String> lines = sc.textFile("E:\\spark\\sort.txt");
        // key each line "first second" by a composite SecondarySortKey
        JavaPairRDD<SecondarySortKey, String> pairs = lines.mapToPair(new PairFunction<String, SecondarySortKey, String>() {
            @Override
            public Tuple2<SecondarySortKey, String> call(String s) throws Exception {
                String[] number = s.split(" ");
                return new Tuple2<>(new SecondarySortKey(Integer.parseInt(number[0]), Integer.parseInt(number[1])), s);
            }
        });
        // false = descending order
        JavaPairRDD<SecondarySortKey, String> sortedPairs = pairs.sortByKey(false);
        // drop the key, keeping only the original line
        lines = sortedPairs.map(new Function<Tuple2<SecondarySortKey, String>, String>() {
            @Override
            public String call(Tuple2<SecondarySortKey, String> secondarySortKeyStringTuple2) throws Exception {
                return secondarySortKeyStringTuple2._2;
            }
        });
        lines.foreach(new VoidFunction<String>() {
            @Override
            public void call(String s) throws Exception {
                System.out.println(s);
            }
        });
        sc.close();
    }
}
scala
package cn.spark.study.core

/**
 * @author: yangchun
 * @description:
 * @date: Created in 2020-05-09 11:41
 */
class SecondSortKey(val first: Int, val second: Int) extends Ordered[SecondSortKey] with Serializable {
  override def compare(that: SecondSortKey): Int = {
    if (this.first == that.first) {
      this.second - that.second
    } else {
      this.first - that.first
    }
  }
}
package cn.spark.study.core

import org.apache.spark.{SparkConf, SparkContext}

/**
 * @author: yangchun
 * @description:
 * @date: Created in 2020-05-09 12:18
 */
object SecondSort {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("secondSort").setMaster("local")
    val sparkContext = new SparkContext(sparkConf)
    val lines = sparkContext.textFile("E:\\spark\\sort.txt")
    val pairs = lines.map(line => (new SecondSortKey(line.split(" ")(0).toInt, line.split(" ")(1).toInt), line))
    val sortPairs = pairs.sortByKey(false)
    val sortLines = sortPairs.map(sortPair => sortPair._2)
    sortLines.foreach(println(_))
  }
}
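As a worked example, take a hypothetical sort.txt:

1 5
2 4
3 6
1 3
2 1

With sortByKey(false) the keys are compared descending by the first field and, on ties, by the second, so both versions print:

3 6
2 4
2 1
1 5
1 3

Passing true (or omitting the argument) gives the usual ascending secondary sort.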
Top N
package cn.spark.study.core;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;

import java.util.List;

/**
 * @author: yangchun
 * @description:
 * @date: Created in 2020-05-09 12:29
 */
public class Top3 {
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setAppName("top3").setMaster("local");
        JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);
        JavaRDD<String> lines = sparkContext.textFile("E:\\spark\\top.txt");
        // key each line by its numeric value so it can be sorted
        JavaPairRDD<Integer, String> pairs = lines.mapToPair(new PairFunction<String, Integer, String>() {
            @Override
            public Tuple2<Integer, String> call(String s) throws Exception {
                return new Tuple2<>(Integer.parseInt(s), s);
            }
        });
        // descending sort, then drop the key
        JavaPairRDD<Integer, String> sortedPairs = pairs.sortByKey(false);
        JavaRDD<String> sortedNumbers = sortedPairs.map(new Function<Tuple2<Integer, String>, String>() {
            @Override
            public String call(Tuple2<Integer, String> integerStringTuple2) throws Exception {
                return integerStringTuple2._2;
            }
        });
        // take(3) pulls the first three elements of the sorted RDD to the driver
        List<String> top3 = sortedNumbers.take(3);
        for (String number : top3) {
            System.out.println(number);
        }
        sparkContext.close();
    }
}
scala
package cn.spark.study.core

import org.apache.spark.{SparkConf, SparkContext}

/**
 * @author: yangchun
 * @description:
 * @date: Created in 2020-05-09 12:53
 */
object Top3 {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("top").setMaster("local")
    val sparkContext = new SparkContext(sparkConf)
    val lines = sparkContext.textFile("E:\\spark\\top.txt")
    val pairs = lines.map(line => (line.toInt, line))
    val sortPairs = pairs.sortByKey(false)
    val sortLine = sortPairs.map(pair => pair._2)
    val top = sortLine.take(3)
    top.foreach(println(_))
  }
}
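Assuming top.txt holds one integer per line, for example:

3
9
6
1
8

take(3) returns the first three elements of the descending sort to the driver:

9
8
6

For a plain top N like this, Spark's built-in top(3) on the RDD would give the same result without keying and fully sorting the dataset; the sortByKey route is shown because it generalizes to the grouped version below.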
Top 3 per group
package cn.spark.study.core;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;

import java.util.Arrays;
import java.util.Iterator;

/**
 * @author: yangchun
 * @description:
 * @date: Created in 2020-05-09 12:29
 */
public class GroupTop3 {
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setAppName("top3").setMaster("local");
        JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);
        JavaRDD<String> lines = sparkContext.textFile("E:\\spark\\score.txt");
        // each line is "className score"
        JavaPairRDD<String, Integer> pairs = lines.mapToPair(new PairFunction<String, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(String s) throws Exception {
                String[] args = s.split(" ");
                return new Tuple2<>(args[0], Integer.parseInt(args[1]));
            }
        });
        // group all scores by class
        JavaPairRDD<String, Iterable<Integer>> groupPairs = pairs.groupByKey();
        // keep a descending, fixed-size top-3 buffer per class
        JavaPairRDD<String, Iterable<Integer>> sortedPairs = groupPairs.mapToPair(new PairFunction<Tuple2<String, Iterable<Integer>>, String, Iterable<Integer>>() {
            @Override
            public Tuple2<String, Iterable<Integer>> call(Tuple2<String, Iterable<Integer>> stringIterableTuple2) throws Exception {
                String classStr = stringIterableTuple2._1;
                Integer[] top3 = new Integer[3];
                Iterator<Integer> iterator = stringIterableTuple2._2.iterator();
                while (iterator.hasNext()) {
                    Integer score = iterator.next();
                    for (int i = 0; i < 3; i++) {
                        if (top3[i] == null) {
                            // empty slot: the score lands here
                            top3[i] = score;
                            break;
                        } else if (score > top3[i]) {
                            // shift smaller scores down, then insert
                            for (int j = 2; j > i; j--) {
                                top3[j] = top3[j - 1];
                            }
                            top3[i] = score;
                            break;
                        }
                    }
                }
                return new Tuple2<String, Iterable<Integer>>(classStr, Arrays.asList(top3));
            }
        });
        sortedPairs.foreach(new VoidFunction<Tuple2<String, Iterable<Integer>>>() {
            @Override
            public void call(Tuple2<String, Iterable<Integer>> stringIterableTuple2) throws Exception {
                System.out.println("============================================");
                System.out.println("class:" + stringIterableTuple2._1);
                Iterator<Integer> iterator = stringIterableTuple2._2.iterator();
                while (iterator.hasNext()) {
                    System.out.println(iterator.next());
                }
            }
        });
        sparkContext.close();
    }
}
scala
package cn.spark.study.core

import org.apache.spark.{SparkConf, SparkContext}

/**
 * @author: yangchun
 * @description:
 * @date: Created in 2020-05-09 13:32
 */
object GroupTop3 {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("top").setMaster("local")
    val sparkContext = new SparkContext(sparkConf)
    val lines = sparkContext.textFile("E:\\spark\\score.txt")
    val pairs = lines.map(line => {
      val args = line.split(" ")
      (args(0), args(1).toInt)
    })
    val groupPairs = pairs.groupByKey()
    val top3Scores = groupPairs.map(pair => {
      val scores = pair._2
      // 0 marks an empty slot, so this version assumes all scores are positive
      val top3 = new Array[Int](3)
      scores.foreach(score => {
        var flag = true
        for (i <- 0 to 2 if flag) {
          if (top3(i) == 0) {
            top3(i) = score
            flag = false
          } else if (score > top3(i)) {
            // shift smaller scores down, then insert
            for (j <- (i + 1 to 2).reverse) {
              top3(j) = top3(j - 1)
            }
            top3(i) = score
            flag = false
          }
        }
      })
      (pair._1, top3)
    })
    top3Scores.foreach(classScore => {
      println("========================")
      println(classScore._1)
      classScore._2.foreach(println(_))
    })
  }
}
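Assuming a hypothetical score.txt of class-name/score pairs such as:

class1 90
class2 87
class1 97
class2 89
class1 80
class2 75
class1 66

the Java version would print, per class (separator lines omitted, class order may vary):

class:class1
97
90
80
class:class2
89
87
75

The insertion loop itself only ever holds three values per class, but groupByKey still materializes every score for a key in memory before the loop runs. For skewed keys, combining the top-3 buffers as the data is aggregated (e.g. via aggregateByKey) avoids that cost; at this scale groupByKey is fine.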