Scala 复杂分词求和(二元组)

 1 package chapter07
 2 
 /**
  * Word-count demo over pre-counted sentences.
  *
  * Each input element is a `(sentence, count)` pair meaning "this sentence is
  * already known to occur `count` times". The program prints the three most
  * frequent words, computed in two different ways that must agree:
  *
  *  1. expand every sentence `count` times and run a plain word count;
  *  2. attach `count` to every word of the sentence and sum the weights.
  */
 object Test18_ComplexWordCount {

   /** How many of the most frequent words are reported. */
   private val TopN: Int = 3

   /**
    * Splits `text` into words on runs of whitespace.
    *
    * Leading/trailing blanks and repeated separators never yield an empty
    * token, so an empty (or blank) sentence contributes no words at all.
    */
   private def tokenize(text: String): List[String] =
     text.trim.split("\\s+").filter(_.nonEmpty).toList

   /**
    * Returns the `n` entries with the highest counts, largest first.
    * The sort is stable, so ties keep the iteration order of `counts`.
    */
   private def mostFrequent(counts: Map[String, Int], n: Int): List[(String, Int)] =
     counts.toList
       .sortBy(_._2)(Ordering[Int].reverse)
       .take(n)

   /**
    * Runs both counting strategies on a fixed sample and prints every
    * intermediate result followed by the top-3 list of each strategy.
    *
    * @param args command-line arguments (unused)
    */
   def main(args: Array[String]): Unit = {
     val tupleList: List[(String, Int)] = List(
       ("hello", 1),
       ("hello world", 2),
       ("hello scala", 3),
       ("hello spark from scala", 1),
       ("hello flink from scala", 2)
     )

     // Approach 1: expand each sentence `count` times, then do an ordinary word count.
     val newStringList: List[String] = tupleList.map {
       case (sentence, count) => (sentence.trim + " ") * count
     }
     println(newStringList)

     // From here on this is identical to the plain word-count exercise.
     val occurrences: Map[String, Int] = newStringList
       .flatMap(tokenize)                                  // split into words
       .groupBy(identity)                                  // group equal words
       .map { case (word, hits) => (word, hits.size) }     // count each group
     val wordCountList: List[(String, Int)] = mostFrequent(occurrences, TopN)

     println(wordCountList)

     println("================================")

     // Approach 2: work directly on the pre-counted data.
     // 1. Break each sentence into words and pair every word with the sentence's count,
     //    e.g. List((hello,1), (hello,2), (world,2), (hello,3), (scala,3), ...)
     val preCountList: List[(String, Int)] = tupleList.flatMap {
       case (sentence, count) => tokenize(sentence).map(word => (word, count))
     }
     println(preCountList)

     // 2. Group the pairs by word.
     val preCountMap: Map[String, List[(String, Int)]] = preCountList.groupBy(_._1)
     println(preCountMap)

     // 3. Sum the pre-counted weights of every word.
     //    A strict `map` is used instead of `mapValues`: `mapValues` is a lazy view and,
     //    from Scala 2.13 on, returns a MapView that is not a Map[String, Int].
     val countMap: Map[String, Int] = preCountMap.map {
       case (word, weighted) => (word, weighted.map(_._2).sum)
     }
     println(countMap)

     // 4. Convert to a list, sort by count descending and keep the top 3.
     val countList: List[(String, Int)] = mostFrequent(countMap, TopN)
     println(countList)
   }
 }
("hello", 1) 说明“hello”字符串已知出现了一次;同理,("hello world", 2) 说明“hello world”字符串已知出现了两次。
posted @ 2022-01-20 20:34  靠谱杨  阅读(144)  评论(0)    收藏  举报