Computing Averages with MapReduce

1. Requirement

A text file holds the salary details for the employees of each department in a company. The data format of the salary.txt file is as follows:
deptId name salary
1001 张三01 2000
1002 李四02 2500
1003 张三05 3000
1002 王五01 2600
Write a program that computes the average salary of each department and prints the results in descending order.
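
For the sample data above, the expected output, computed by hand, is the following (department 1002 averages (2500 + 2600) / 2 = 2550):

deptId: 1003 average salary: 3000.0
deptId: 1002 average salary: 2550.0
deptId: 1001 average salary: 2000.0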

2. MapReduce implementation

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.{LongWritable, NullWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import org.apache.hadoop.mapreduce.{Job, Mapper}
case class AggInfo(cnt:Int,sum:Double)
case class AggResult(deptId: String,avg:Double)
object DeptAvgSalaryMRApp{
  val path = "./spark/src/main/resources/salary.txt"
  // Method 5: implemented with MapReduce
  def main(args: Array[String]): Unit = {
    useMapReduce()
  }
  // Driver-side mutable map that accumulates per-department counts and sums.
  // This only works because the job runs with the local job runner, so the mapper and the driver share one JVM.
  var aggMap:scala.collection.mutable.Map[String,AggInfo] =  scala.collection.mutable.Map[String,AggInfo]()
  def useMapReduce() = {
    val conf = new Configuration()
    val job = Job.getInstance(conf,"avgSalary")
    val inputPath = new Path(path)
    val outPath = new Path("D:/output")
    FileInputFormat.setInputPaths(job, inputPath)
    val fileSystem = FileSystem.get(conf)
    if (fileSystem.exists(outPath)) {
      fileSystem.delete(outPath, true)
    }
    FileOutputFormat.setOutputPath(job,outPath)
    job.setMapperClass(classOf[MyMapper])
    job.setNumReduceTasks(1)

    // The mapper never writes to the context, so the map output key/value types are only placeholders
    job.setMapOutputKeyClass(classOf[NullWritable])
    job.setMapOutputValueClass(classOf[NullWritable])
    var list:scala.collection.mutable.ListBuffer[AggResult] =  scala.collection.mutable.ListBuffer[AggResult]()
    job.waitForCompletion(true)
    for (elem <- aggMap) {
      val key = elem._1
      val value = elem._2
      val avg:Double = value.sum/value.cnt
      // println(s"deptId: ${key} average salary: ${avg}")
      list.+=(AggResult(key,avg))
    }
    val results = list.toList.sortWith((a, b) => a.avg > b.avg)
    for (elem <- results) {
      println(s"deptId: ${elem.deptId} average salary: ${elem.avg}")
    }
  }
  class MyMapper extends Mapper[LongWritable,Text,NullWritable, NullWritable]{
    override def map(key: LongWritable, value: Text, context: Mapper[LongWritable, Text, NullWritable, NullWritable]#Context): Unit = {
      val string = value.toString
      if(!string.contains("deptId")){
        // Column layout is "deptId name salary", so the salary sits at index 2
        val arrs = string.split(" ")
        val deptId = arrs(0)
        val salary = arrs(2).toDouble
        if(aggMap.contains(deptId)){
          val aggInfo:AggInfo = aggMap(deptId)
          aggMap.put(deptId,AggInfo(1+aggInfo.cnt,salary+aggInfo.sum))
        }else{
          aggMap.put(deptId,AggInfo(1,salary))
        }
      }
    }
  }
}
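
Note that the implementation above does not use the shuffle at all: the mapper writes into the driver-side aggMap, which only works with the local job runner, where mapper and driver share one JVM. For a job submitted to a real cluster, a minimal sketch of the conventional approach, where the mapper emits (deptId, salary) pairs and a reducer computes each department's average, could look like the following; the class names SalaryMapper and AvgSalaryReducer are illustrative assumptions, not part of the original post.

import org.apache.hadoop.io.{DoubleWritable, LongWritable, Text}
import org.apache.hadoop.mapreduce.{Mapper, Reducer}
import scala.collection.JavaConverters._

// Illustrative mapper: emit (deptId, salary) for every data line, skipping the header row
class SalaryMapper extends Mapper[LongWritable, Text, Text, DoubleWritable] {
  override def map(key: LongWritable, value: Text, context: Mapper[LongWritable, Text, Text, DoubleWritable]#Context): Unit = {
    val line = value.toString
    if (!line.contains("deptId")) {
      val fields = line.split(" ") // deptId name salary
      context.write(new Text(fields(0)), new DoubleWritable(fields(2).toDouble))
    }
  }
}

// Illustrative reducer: average all salaries that arrive for one deptId
class AvgSalaryReducer extends Reducer[Text, DoubleWritable, Text, DoubleWritable] {
  override def reduce(key: Text, values: java.lang.Iterable[DoubleWritable], context: Reducer[Text, DoubleWritable, Text, DoubleWritable]#Context): Unit = {
    val salaries = values.asScala.map(_.get()).toList
    context.write(key, new DoubleWritable(salaries.sum / salaries.size))
  }
}

The driver would then register these classes with job.setReducerClass(classOf[AvgSalaryReducer]), job.setMapOutputKeyClass(classOf[Text]) and job.setMapOutputValueClass(classOf[DoubleWritable]). Since MapReduce sorts by key rather than by value, producing the descending-by-average listing would still require sorting the reducer output afterwards.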

 

Add the required pom dependencies

<!-- Hadoop dependencies -->
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>2.7.5</version>
    <scope>provided</scope>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-mapreduce-client-common</artifactId>
    <version>2.7.5</version>
</dependency>

 
