scala实现单词统计
---------------------
import scala.io.Source
/**
 * Word-count demo: reads a text file and prints how often each word occurs.
 */
object WCApp {

  /**
   * Counts the occurrences of every whitespace-separated word.
   *
   * @param lines input text, one element per line
   * @return map from word to its number of occurrences; empty tokens
   *         (blank lines, repeated or leading whitespace) are ignored
   */
  def countWords(lines: List[String]): Map[String, Int] = {
    // Split on runs of whitespace and drop the empty tokens that blank lines
    // or leading whitespace would otherwise turn into a bogus "" word.
    val words = lines.flatMap(_.split("\\s+")).filter(_.nonEmpty)
    // Tag every word with a 1, group the pairs by word, then size each group.
    val pairs = words.map(w => (w, 1))
    // A strict map is used instead of mapValues, which only builds a lazy view.
    pairs.groupBy(_._1).map { case (word, group) => (word, group.size) }
  }

  /**
   * Reads d:/mr/word.txt and prints one (word,count) pair per line.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // 1. Open the input file.
    val src = Source.fromFile("d:/mr/word.txt")
    // 2. Read every line eagerly so the handle can be released right away,
    //    even when reading fails.
    val lines = try src.getLines().toList finally src.close()
    // 3. Count the words and print the result.
    countWords(lines).foreach(println)
  }
}
scala实现单词统计2
---------------------
import scala.io.Source
/**
 * Word-count demo, second variant: the per-word total is obtained by reducing
 * the grouped (word, 1) pairs instead of taking the group size.
 */
object WCApp2 {

  /**
   * Counts the occurrences of every whitespace-separated word using reduce.
   *
   * @param lines input text, one element per line
   * @return map from word to its number of occurrences; empty tokens
   *         (blank lines, repeated or leading whitespace) are ignored
   */
  def countWords(lines: List[String]): Map[String, Int] = {
    // Flatten the lines into words, skipping empty tokens.
    val words = lines.flatMap(_.split("\\s+")).filter(_.nonEmpty)
    // Tag every word with a 1.
    val pairs = words.map(w => (w, 1))
    // Group by word: {hello -> [(hello,1), (hello,1), ...]}
    val grouped = pairs.groupBy(_._1)
    // Collapse each group into a single (word, total) pair. The groups produced
    // by groupBy are never empty, so reduce cannot fail here.
    grouped.map { case (_, group) =>
      group.reduce((a, b) => (a._1, a._2 + b._2))
    }
  }

  /**
   * Reads d:/mr/word.txt and prints one (word,count) pair per line.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // 1. Open the input file.
    val src = Source.fromFile("d:/mr/word.txt")
    // 2. Read every line eagerly, then release the file handle even on failure.
    val lines = try src.getLines().toList finally src.close()
    // 3. Count the words and print the result.
    countWords(lines).foreach(println)
  }
}
Bitmap实现topn统计
------------------------
import scala.io.Source
/**
 * Per-year top-N temperature query. The distinct temperatures of a year are
 * recorded as bits in a byte-array bitmap, which is then scanned from the
 * highest bit downwards.
 */
object TempTopN2_Bitmap {

  /** Size of the bitmap in bytes; values 0 until BitmapBytes * 8 fit in it. */
  private val BitmapBytes = 128

  /**
   * Returns the `n` highest distinct temperatures, highest first.
   *
   * @param temps temperature readings; values that are negative or do not fit
   *              in the bitmap (>= BitmapBytes * 8) cannot be represented and
   *              are skipped
   * @param n     how many values to return (defaults to 3)
   * @return the selected temperatures joined with ",", or "" if there are none
   */
  def topTemps(temps: Seq[Int], n: Int = 3): String = {
    val capacity = BitmapBytes * 8
    val bitmap = new Array[Byte](BitmapBytes)
    // Set bit (t % 8) of byte (t / 8) for every representable reading; duplicate
    // readings hit the same bit, which is what makes the result distinct.
    for (t <- temps if t >= 0 && t < capacity) {
      bitmap(t / 8) = (bitmap(t / 8) | (1 << (t % 8))).toByte
    }
    // Walk the bit positions from high to low. The iterator is lazy, so the
    // scan stops as soon as n set bits have been found.
    (capacity - 1 to 0 by -1).iterator
      .filter(t => ((bitmap(t / 8) >> (t % 8)) & 1) != 0)
      .take(n)
      .mkString(",")
  }

  /**
   * Reads d:/mr/temp.dat ("year temp" per line) and prints, for every year in
   * ascending order, the tuple (year, top three temperatures).
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // 1. Open the temperature file.
    val f = Source.fromFile("d:/mr/temp.dat")
    // 2. Read every line eagerly, then release the file handle even on failure.
    val lines = try f.getLines().toList finally f.close()
    // 3. Turn each non-blank line into a (year, temperature) tuple.
    val readings = lines.map(_.trim).filter(_.nonEmpty).map { line =>
      val fields = line.split("\\s+")
      (fields(0).toInt, fields(1).toInt)
    }
    // 4. Group the readings by year.
    val byYear = readings.groupBy(_._1)
    // 5. Reduce every year's readings to its top temperatures.
    val topByYear = byYear.map { case (year, group) =>
      (year, topTemps(group.map(_._2)))
    }
    // 6. Print the years in ascending order.
    topByYear.toList.sortBy(_._1).foreach(println(_))
  }
}
Scala实现商品评论标签统计
---------------------
1.TagUtil.java
package com.oldboy.scala.util;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import java.util.ArrayList;
import java.util.List;
/**
 * Tag utility class.
 */
public class TagUtil {
    /**
     * Extracts the review tags from one JSON record.
     *
     * Reads the array stored under "extInfoList", takes its first element and
     * returns every entry of that element's "values" array as a string.
     *
     * @param json JSON text of one record; may be null or blank
     * @return the extracted tags, or an empty list when the input is null/blank
     *         or any of the expected fields is missing; never null
     */
    public static List<String> extractTags(String json){
        // Result collection.
        List<String> tags = new ArrayList<String>();
        // Nothing to parse.
        if (json == null || json.trim().isEmpty()) {
            return tags;
        }
        // Parse the text into a JSON object. The parser can hand back null
        // (e.g. for the literal text "null"), so guard before dereferencing.
        JSONObject obj = JSON.parseObject(json);
        if (obj == null) {
            return tags;
        }
        // Outer array holding the extended-info entries.
        JSONArray array = obj.getJSONArray("extInfoList");
        if (array == null || array.size() == 0) {
            return tags;
        }
        // Only the first entry is inspected; it may itself be a JSON null.
        JSONObject first = array.getJSONObject(0);
        if (first == null) {
            return tags;
        }
        JSONArray values = first.getJSONArray("values");
        if (values == null) {
            return tags;
        }
        for (int i = 0; i < values.size(); i++) {
            tags.add(values.getString(i));
        }
        return tags;
    }
}
2.TaggenDemo
import javax.swing.text.html.HTML.Tag
import com.oldboy.scala.util.TagUtil
import scala.io.Source
/**
 * Tag statistics: for every business, prints its five most frequent review
 * tags together with their counts.
 */
object TaggenDemo {

  /**
   * Ranks the tags of every business.
   *
   * @param pairs one (busid, tag) element per tag occurrence
   * @param topN  how many tags to keep per business (defaults to 5)
   * @return (busid, tags) entries where tags holds the business's most frequent
   *         (tag, count) pairs in descending count order; the entries themselves
   *         are ordered by their highest tag count, descending
   */
  def rankTags(pairs: List[(String, String)],
               topN: Int = 5): List[(String, List[(String, Int)])] = {
    // Count identical (busid, tag) pairs: {(busid, tag) -> 300}
    val counts = pairs.groupBy(identity).map { case (pair, hits) => (pair, hits.size) }
    // Re-key by business: List((busid, (tag, cnt)), ...)
    val perBusiness = counts.toList.map { case ((busid, tag), cnt) => (busid, (tag, cnt)) }
    // Keep the topN most frequent tags of every business.
    val top = perBusiness.groupBy(_._1).map { case (busid, entries) =>
      (busid, entries.map(_._2).sortBy(-_._2).take(topN))
    }
    // Order the businesses by their best tag count. headOption keeps this safe
    // when topN leaves a business without any tag.
    top.toList.sortBy { case (_, tags) => -tags.headOption.map(_._2).getOrElse(0) }
  }

  /**
   * Reads d:/mr/temptags.txt (busid, a tab, then a JSON record per line) and
   * prints one "busid==>(tag,cnt);(tag,cnt);..." line per business.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // Explicit asScala conversion instead of the implicit JavaConversions.
    import scala.collection.JavaConverters._

    // 1. Open the input file.
    val file = Source.fromFile("d:/mr/temptags.txt")
    // 2. Read every line eagerly, then release the file handle even on failure.
    val lines = try file.getLines().toList finally file.close()
    // 3. Flatten every line into (busid, tag) pairs.
    val pairs = lines.flatMap { line =>
      val fields = line.split("\t")
      if (fields.length < 2) {
        // No JSON column on this line, so there is nothing to extract.
        Nil
      } else {
        val busid = fields(0)
        TagUtil.extractTags(fields(1)).asScala.toList.map(tag => (busid, tag))
      }
    }
    // 4. Rank and print.
    rankTags(pairs).foreach { case (busid, tags) =>
      println(busid + "==>" + tags.mkString(";"))
    }
  }
}