MapReduce Exercise 2
Problem
For each department, compute the total salary, the average salary, the maximum age, the name of the oldest employee, the average age, and the name of the highest-paid employee, then sort the results in descending order of average salary / average age.
Data
7369,SMITH,CLERK,7902,1980/12/17,800,,20
7499,ALLEN,SALESMAN,7698,1981/2/20,1600,300,30
7521,WARD,SALESMAN,7698,1981/2/22,1250,500,30
7566,JONES,MANAGER,7839,1981/4/2,2975,,20
7654,MARTIN,SALESMAN,7698,1981/9/28,1250,1400,30
7698,BLAKE,MANAGER,7839,1981/5/1,2850,,30
7782,CLARK,MANAGER,7839,1981/6/9,2450,,10
7788,SCOTT,ANALYST,7566,1987/4/19,3000,,20
7839,KING,PRESIDENT,,1981/11/17,5000,,10
7844,TURNER,SALESMAN,7698,1981/9/8,1500,0,30
7876,ADAMS,CLERK,7788,1987/5/23,1100,,20
7900,JAMES,CLERK,7698,1981/12/3,950,,30
7902,FORD,ANALYST,7566,1981/12/3,3000,,20
7934,MILLER,CLERK,7782,1982/1/23,1300,,10
Approach
DeptReducer output:
ANALYST 21132.0,10566.0,SCOTT,42,39.0,FORD
CLERK 35320.0,8830.0,MILLER,43,40.5,SMITH
MANAGER 31792.0,10597.333333333334,JONES,42,42.0,CLARK
PRESIDENT 5000.0,5000.0,KING,42,42.0,KING
SALESMAN 36392.0,9098.0,ALLEN,42,42.0,TURNER
Setting
conf.set("mapreduce.job.inputformat.class", "org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat");
makes the second job read each line of the first job's output as a key/value pair, split at the first tab (KeyValueTextInputFormat's default separator), so the mapper's input key and value types both become Text. Note that the key used for sorting must implement the WritableComparable interface, not just Comparable; its contract is the same as Comparable's, so implementing compareTo is all that is needed.
Since the sort also needs the department name while the plan is to sort by key, I simply extended DeptOut and added a dept member variable (DeptSortOut).
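As a side note, the same input format can also be configured through the typed Job API instead of the string-keyed property; a minimal sketch, assuming the sortJob variable from the driver below and an import of org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat:
// Type-safe alternative to the conf.set(...) call above (sketch only, not the code used here).
sortJob.setInputFormatClass(KeyValueTextInputFormat.class);
// KeyValueTextInputFormat splits each line at the first tab by default; the separator
// can be changed through this property if a different delimiter is needed.
sortJob.getConfiguration().set("mapreduce.input.keyvaluelinerecordreader.key.value.separator", "\t");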
Final output:
ANALYST 21132.0,10566.0,SCOTT,42,39.0,FORD
MANAGER 31792.0,10597.333333333334,JONES,42,42.0,CLARK
CLERK 35320.0,8830.0,MILLER,43,40.5,SMITH
SALESMAN 36392.0,9098.0,ALLEN,42,42.0,TURNER
PRESIDENT 5000.0,5000.0,KING,42,42.0,KING
Code
DeptMapperOut.java
package edu.sugon;
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
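// Per-employee record emitted by DeptMapper: name, salary and age.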
public class DeptMapperOut implements Writable {
private String name;
private double salary;
private int age;
public void init(String name, double salary, int age) {
this.name = name;
this.salary = salary;
this.age = age;
}
@Override
public void write(DataOutput dataOutput) throws IOException {
dataOutput.writeUTF(name);
dataOutput.writeDouble(salary);
dataOutput.writeInt(age);
}
@Override
public void readFields(DataInput dataInput) throws IOException {
name = dataInput.readUTF();
salary = dataInput.readDouble();
age = dataInput.readInt();
}
public void setAge(int age) {
this.age = age;
}
public int getAge() {
return age;
}
public String getName() {
return name;
}
public double getSalary() {
return salary;
}
public void setName(String name) {
this.name = name;
}
public void setSalary(double salary) {
this.salary = salary;
}
}
DeptOut.java
package edu.sugon;
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
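// Aggregated result for one department: total and average salary, oldest and best-paid employee names, maximum and average age.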
public class DeptOut implements Writable {
private String maxAgeName;
private String maxSalaryName;
private double totSalary;
private double avgSalary;
private double avgAge;
private int maxAge;
public void init(String maxAgeName, String maxSalaryName, double totSalary, double avgSalary, int maxAge, double avgAge) {
this.maxAgeName = maxAgeName;
this.maxSalaryName = maxSalaryName;
this.totSalary = totSalary;
this.avgSalary = avgSalary;
this.avgAge = avgAge;
this.maxAge = maxAge;
}
@Override
public void write(DataOutput dataOutput) throws IOException {
dataOutput.writeUTF(maxAgeName);
dataOutput.writeUTF(maxSalaryName);
dataOutput.writeDouble(totSalary);
dataOutput.writeDouble(avgSalary);
dataOutput.writeDouble(avgAge);
dataOutput.writeInt(maxAge);
}
@Override
public void readFields(DataInput dataInput) throws IOException {
maxAgeName = dataInput.readUTF();
maxSalaryName = dataInput.readUTF();
totSalary = dataInput.readDouble();
avgSalary = dataInput.readDouble();
avgAge = dataInput.readDouble();
maxAge = dataInput.readInt();
}
@Override
public String toString() {
return totSalary + "," + avgSalary + "," + maxSalaryName + "," + maxAge + "," + avgAge + "," + maxAgeName;
}
public void setAvgSalary(double avgSalary) {
this.avgSalary = avgSalary;
}
public void setMaxAge(int maxAge) {
this.maxAge = maxAge;
}
public void setMaxAgeName(String maxAgeName) {
this.maxAgeName = maxAgeName;
}
public void setAvgAge(double avgAge) {
this.avgAge = avgAge;
}
public void setMaxSalaryName(String maxSalaryName) {
this.maxSalaryName = maxSalaryName;
}
public void setTotSalary(double totSalary) {
this.totSalary = totSalary;
}
public double getAvgSalary() {
return avgSalary;
}
public int getMaxAge() {
return maxAge;
}
public String getMaxAgeName() {
return maxAgeName;
}
public double getAvgAge() {
return avgAge;
}
public String getMaxSalaryName() {
return maxSalaryName;
}
public double getTotSalary() {
return totSalary;
}
}
DeptSortOut.java
package edu.sugon;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
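// DeptOut extended with the department name; used as the key of the sort job so records are ordered by compareTo.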
public class DeptSortOut extends DeptOut implements WritableComparable<DeptSortOut> {
private String dept;
public void init(String maxAgeName, String maxSalaryName, double totSalary, double avgSalary, int maxAge, double avgAge, String dept) {
super.init(maxAgeName, maxSalaryName, totSalary, avgSalary, maxAge, avgAge);
this.dept = dept;
}
public void init(DeptOut out, String dept) {
super.init(out.getMaxAgeName(), out.getMaxSalaryName(), out.getTotSalary(), out.getAvgSalary(), out.getMaxAge(), out.getAvgAge());
this.dept = dept;
}
@Override
public void write(DataOutput dataOutput) throws IOException {
super.write(dataOutput);
dataOutput.writeUTF(dept);
}
@Override
public void readFields(DataInput dataInput) throws IOException {
super.readFields(dataInput);
dept = dataInput.readUTF();
}
@Override
public int compareTo(DeptSortOut o) {
// MapReduce sorts keys in ascending compareTo order, so compare the other
// ratio against this one to get a descending sort by avgSalary / avgAge.
return Double.compare(o.getAvgSalary() / o.getAvgAge(), getAvgSalary() / getAvgAge());
}
@Override
public String toString() {
return dept + " " + super.toString();
}
public void setDept(String dept) {
this.dept = dept;
}
public String getDept() {
return dept;
}
}
DeptCount.java
package edu.sugon;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.time.LocalDate;
public class DeptCount {
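// Age is approximated as the current year minus the hire year (month and day ignored).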
public static int calcAge(int year) {
LocalDate current_date = LocalDate.now();
return current_date.getYear() - year;
}
static class DeptMapper extends Mapper<LongWritable, Text, Text, DeptMapperOut> {
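// First-stage mapper: parses one CSV line of emp data and emits the department field (attrs[2]) as the key with a DeptMapperOut value.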
@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, DeptMapperOut>.Context context) throws IOException, InterruptedException {
String row = value.toString();
String[] attrs = row.split(",");
String year = attrs[4].split("/")[0];
int age = calcAge(Integer.parseInt(year));
DeptMapperOut out = new DeptMapperOut();
double salary = 0;
salary += attrs[3].isEmpty()?0:Double.parseDouble(attrs[3]);
salary += attrs[5].isEmpty()?0:Double.parseDouble(attrs[5]);
out.init(attrs[1], salary, age);
context.write(new Text(attrs[2]), out);
}
}
static class DeptReducer extends Reducer<Text, DeptMapperOut, Text, DeptOut> {
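// First-stage reducer: aggregates all employees sharing one key into totals, averages and maxima.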
@Override
protected void reduce(Text key, Iterable<DeptMapperOut> values, Reducer<Text, DeptMapperOut, Text, DeptOut>.Context context) throws IOException, InterruptedException {
String maxAgeName, maxSalaryName;
double totSalary, avgSalary, avgAge, maxSalary;
int maxAge, cnt;
maxAgeName = maxSalaryName = null;
totSalary = avgSalary = avgAge = 0;
maxSalary = cnt = maxAge = 0;
for (DeptMapperOut out: values) {
totSalary += out.getSalary();
avgAge += out.getAge();
cnt += 1;
if (out.getSalary() > maxSalary) {
maxSalaryName = out.getName();
maxSalary = out.getSalary();
}
if (out.getAge() > maxAge) {
maxAgeName = out.getName();
maxAge = out.getAge();
}
}
avgSalary = totSalary / cnt;
avgAge = avgAge / cnt;
DeptOut out = new DeptOut();
out.init(maxAgeName, maxSalaryName, totSalary, avgSalary, maxAge, avgAge);
context.write(key, out);
}
}
static class SortMapper extends Mapper<Text, Text, DeptSortOut, NullWritable> {
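// Second-stage mapper: the first job's output arrives as (Text key = department, Text value = comma-separated DeptOut fields); rebuild a DeptSortOut and emit it as the key so the framework sorts by it.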
@Override
protected void map(Text key, Text value, Mapper<Text, Text, DeptSortOut, NullWritable>.Context context) throws IOException, InterruptedException {
String[] attrs = value.toString().split(",");
DeptSortOut out = new DeptSortOut();
int maxAge = Integer.parseInt(attrs[3]);
double totSalary = Double.parseDouble(attrs[0]);
double avgSalary = Double.parseDouble(attrs[1]), avgAge = Double.parseDouble(attrs[4]);
String maxSalaryName = attrs[2], maxAgeName = attrs[5];
out.init(maxAgeName, maxSalaryName, totSalary, avgSalary, maxAge, avgAge, key.toString());
context.write(out, NullWritable.get());
}
}
static class SortReducer extends Reducer<DeptSortOut, NullWritable, DeptSortOut, NullWritable> {
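// Identity reducer: keys already arrive sorted by DeptSortOut.compareTo and are written out unchanged.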
@Override
protected void reduce(DeptSortOut key, Iterable<NullWritable> values, Reducer<DeptSortOut, NullWritable, DeptSortOut, NullWritable>.Context context) throws IOException, InterruptedException {
for (NullWritable val: values) {
context.write(key, NullWritable.get());
}
}
}
public static void main(String[] args) throws Exception {
// test calcAge
//System.out.println(calcAge(2002));
System.setProperty("HADOOP_USER_NAME", "hadoop");
System.setProperty("hadoop.home.dir", "D:\\Program Files (x86)\\hadoop-3.1.1");
Configuration conf = new Configuration();
// the class that contains main(), used to locate the job jar
Job job = Job.getInstance(conf);
job.setJarByClass(DeptCount.class);
// set the mapper class
job.setMapperClass(DeptCount.DeptMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(DeptMapperOut.class);
// set the reducer class
job.setReducerClass(DeptCount.DeptReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(DeptOut.class);
// if the output directory already exists, delete it
String outPathStr = "hdfs://192.168.10.100:9000/2021030541035/data/output2";
Path outPath = new Path(outPathStr);
FileSystem fs = outPath.getFileSystem(conf);
if (fs.exists(outPath)) {
fs.delete(outPath, true);
}
// set the input and output paths
FileInputFormat.setInputPaths(job, new Path("hdfs://192.168.10.100:9000/2021030541035/data/emp.csv"));
FileOutputFormat.setOutputPath(job, outPath);
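// Submit the sort job only after the first job has finished successfully, since it reads that job's output; the property set below makes it use KeyValueTextInputFormat.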
if (job.waitForCompletion(true)) {
conf.set("mapreduce.job.inputformat.class", "org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat");
Job sortJob = Job.getInstance(conf);
sortJob.setJarByClass(DeptCount.class);
// set the mapper class
sortJob.setMapperClass(SortMapper.class);
sortJob.setMapOutputKeyClass(DeptSortOut.class);
sortJob.setMapOutputValueClass(NullWritable.class);
// set the reducer class
sortJob.setReducerClass(SortReducer.class);
sortJob.setOutputKeyClass(DeptSortOut.class);
sortJob.setOutputValueClass(NullWritable.class);
Path sortOutPath = new Path("hdfs://192.168.10.100:9000/2021030541035/data/output3");
if (fs.exists(sortOutPath)) {
fs.delete(sortOutPath, true);
}
// set the input (the first job's output files) and output paths
FileInputFormat.setInputPaths(sortJob, new Path(outPathStr + "/part-r-*"));
FileOutputFormat.setOutputPath(sortJob, sortOutPath);
sortJob.waitForCompletion(true);
}
}
}