package com.bjsxt.scala.spark.operator
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import scala.collection.mutable.ListBuffer
import scala.collection.mutable.HashMap
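
/**
 * Hot-channel top-user analysis: read a tab-separated user access log, keep only visits to a
 * small list of "hot" channels, and for every hot channel print the ten users with the highest
 * visit counts.
 */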
object PUVAnalyze {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("PUVAnalyze").setMaster("local")
    val sc = new SparkContext(conf)
    val rdd = sc.textFile("d:/demo/userLog")
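
    // Assumed input format (not stated in the original source): one visit per line, tab-separated,
    // with field index 2 holding the userId and field index 4 holding the channel name.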
    // Assume the list below holds the names of the hottest channels (boards)
    val list = List("Spark", "Kafka", "Flink")
    val hotChannelBroadcast = sc.broadcast(list)
    // Keep only visits to hot channels where the userId is not the literal string "null"
    val filteredRDD = rdd.filter { x =>
      val splited = x.split("\t")
      hotChannelBroadcast.value.contains(splited(4)) && !"null".equals(splited(2))
    }
    // k: userId, v: channel
    val userId2ChannelRDD = filteredRDD.map(x => {
      val splited = x.split("\t")
      (splited(2), splited(4))
    })
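
    // Stage 1: group the (userId, channel) pairs by userId, count each user's visits per channel,
    // and re-key the result as (channel, "userId_count") for the next stage.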
    userId2ChannelRDD
      .groupByKey()
      .flatMap((x: (String, Iterable[String])) => {
        val rest = new ListBuffer[(String, String)]()
        val map = new HashMap[String, Long]()
        val userId = x._1
        val channels = x._2
        val channelIter = channels.iterator
        // Count how many times this user visited each channel
        while (channelIter.hasNext) {
          val channel = channelIter.next()
          map.update(channel, map.getOrElse(channel, 0L) + 1)
        }
        // Emit one (channel, "userId_count") record per channel the user visited
        map.foreach(entry => {
          val channel = entry._1
          val count = entry._2
          rest.+=((channel, userId + "_" + count))
        })
        rest
      })
      .groupByKey()
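      // Stage 2: for each channel, keep the ten users with the highest visit counts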
      .foreach(x => {
        val channel = x._1
        val infos = x._2
        val infoIter = infos.iterator
        // arr holds the top-10 "userId_count" entries for this channel, ordered by count descending
        val arr = new Array[String](10)
        while (infoIter.hasNext) {
          val info = infoIter.next()
          // Compare counts numerically, not as strings
          val count = info.split("_")(1).toLong
          // Insertion-sort style: find the first slot that is empty or holds a smaller count,
          // shift the remaining entries down one position, and insert exactly once
          var inserted = false
          var i = 0
          while (i < arr.length && !inserted) {
            if (arr(i) == null) {
              arr(i) = info
              inserted = true
            } else if (count > arr(i).split("_")(1).toLong) {
              for (j <- (arr.length - 1).to(i + 1, -1)) {
                arr(j) = arr(j - 1)
              }
              arr(i) = info
              inserted = true
            }
            i += 1
          }
        }
        for (elem <- arr if elem != null) {
          println("hot channel:" + channel + "\thigh:" + elem)
        }
      })

    sc.stop()
  }
}