Computing Averages with MapReduce
1. Requirements
A text file, salary.txt, stores the salary details of employees across the company's departments. Its data format is as follows:
deptId name salary
1001 张三01 2000
1002 李四02 2500
1003 张三05 3000
1002 王五01 2600
Write a program that computes the average salary of each department and outputs the results in descending order.
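As a quick sanity check on the sample above: department 1002 has two employees, so its average is (2500 + 2600) / 2 = 2550, while departments 1001 and 1003 each have a single employee. Sorted in descending order, the program below should therefore print:

deptId: 1003 avgSalary: 3000.0
deptId: 1002 avgSalary: 2550.0
deptId: 1001 avgSalary: 2000.0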
2. MapReduce implementation
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.{LongWritable, NullWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import org.apache.hadoop.mapreduce.{Job, Mapper}

case class AggInfo(cnt: Int, sum: Double)
case class AggResult(deptId: String, avg: Double)

object DeptAvgSalaryMRApp {

  val path = "./spark/src/main/resources/salary.txt"

  // Approach 5: implemented with MapReduce
  def main(args: Array[String]): Unit = {
    useMapReduce()
  }

  // Shared accumulator: deptId -> (count, running sum of salaries)
  var aggMap: scala.collection.mutable.Map[String, AggInfo] =
    scala.collection.mutable.Map[String, AggInfo]()

  def useMapReduce(): Unit = {
    val conf = new Configuration()
    val job = Job.getInstance(conf, "avgSalary")
    val inputPath = new Path(path)
    val outPath = new Path("D:/output")
    FileInputFormat.setInputPaths(job, inputPath)
    // Delete the output directory if it already exists, otherwise the job fails.
    val fileSystem = FileSystem.get(conf)
    if (fileSystem.exists(outPath)) {
      fileSystem.delete(outPath, true)
    }
    FileOutputFormat.setOutputPath(job, outPath)
    job.setMapperClass(classOf[MyMapper])
    job.setNumReduceTasks(1)
    job.setMapOutputKeyClass(classOf[NullWritable])
    job.setMapOutputValueClass(classOf[NullWritable])

    val list: scala.collection.mutable.ListBuffer[AggResult] =
      scala.collection.mutable.ListBuffer[AggResult]()
    job.waitForCompletion(true)
    // Compute each department's average from the accumulated counts and sums.
    for (elem <- aggMap) {
      val key = elem._1
      val value = elem._2
      val avg: Double = value.sum / value.cnt
      list += AggResult(key, avg)
    }
    // Sort by average salary in descending order and print.
    val results = list.toList.sortWith((a, b) => a.avg > b.avg)
    for (elem <- results) {
      println(s"deptId: ${elem.deptId} avgSalary: ${elem.avg}")
    }
  }

  class MyMapper extends Mapper[LongWritable, Text, NullWritable, NullWritable] {
    override def map(key: LongWritable, value: Text,
                     context: Mapper[LongWritable, Text, NullWritable, NullWritable]#Context): Unit = {
      val string = value.toString
      if (!string.contains("deptId")) { // skip the header line
        val arrs = string.split(" ")
        val deptId = arrs(0)
        // arrs(0) = deptId, arrs(1) = name, arrs(2) = salary
        if (aggMap.contains(deptId)) {
          val aggInfo: AggInfo = aggMap(deptId)
          aggMap.put(deptId, AggInfo(1 + aggInfo.cnt, arrs(2).toDouble + aggInfo.sum))
        } else {
          aggMap.put(deptId, AggInfo(1, arrs(2).toDouble))
        }
      }
    }
  }
}
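Note that this implementation takes a shortcut: the mapper never writes anything to its context; instead it accumulates counts and sums into the driver's shared aggMap. That only works because the job runs in Hadoop local mode, where the mapper and the driver share one JVM; on a real cluster the mappers run in separate processes and aggMap would stay empty. A more conventional version moves the aggregation into a reducer. The sketch below is one way to do that (the class names SalaryMapper and AvgReducer are illustrative, not from the original): the mapper emits (deptId, salary) pairs, and the reducer averages them per key.

import org.apache.hadoop.io.{DoubleWritable, LongWritable, Text}
import org.apache.hadoop.mapreduce.{Mapper, Reducer}
import scala.collection.JavaConverters._

// Emits (deptId, salary) for every data row, skipping the header.
class SalaryMapper extends Mapper[LongWritable, Text, Text, DoubleWritable] {
  override def map(key: LongWritable, value: Text,
                   context: Mapper[LongWritable, Text, Text, DoubleWritable]#Context): Unit = {
    val line = value.toString
    if (!line.contains("deptId")) {
      val fields = line.split(" ")
      // fields(0) = deptId, fields(2) = salary
      context.write(new Text(fields(0)), new DoubleWritable(fields(2).toDouble))
    }
  }
}

// Receives all salaries of one department and writes the average.
class AvgReducer extends Reducer[Text, DoubleWritable, Text, DoubleWritable] {
  override def reduce(key: Text, values: java.lang.Iterable[DoubleWritable],
                      context: Reducer[Text, DoubleWritable, Text, DoubleWritable]#Context): Unit = {
    // Copy the primitive values out first: Hadoop reuses the same Writable instance.
    val salaries = values.asScala.map(_.get()).toList
    context.write(key, new DoubleWritable(salaries.sum / salaries.size))
  }
}

Wired in with job.setMapperClass(classOf[SalaryMapper]), job.setReducerClass(classOf[AvgReducer]), and Text/DoubleWritable as the map and job output key/value classes, this runs unchanged on a cluster; the descending sort would then be a post-processing step over the reducer's output file.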
Add the pom dependencies:
<!-- Hadoop dependencies -->
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>2.7.5</version>
    <scope>provided</scope>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-mapreduce-client-common</artifactId>
    <version>2.7.5</version>
</dependency>
