MapReduce Exercise 2
Problem
For each department, compute the total salary, the average salary, the maximum age, the name of the oldest employee, the average age, and the name of the highest-paid employee, then sort the results in descending order of average salary / average age.
Data
7369,SMITH,CLERK,7902,1980/12/17,800,,20
7499,ALLEN,SALESMAN,7698,1981/2/20,1600,300,30
7521,WARD,SALESMAN,7698,1981/2/22,1250,500,30
7566,JONES,MANAGER,7839,1981/4/2,2975,,20
7654,MARTIN,SALESMAN,7698,1981/9/28,1250,1400,30
7698,BLAKE,MANAGER,7839,1981/5/1,2850,,30
7782,CLARK,MANAGER,7839,1981/6/9,2450,,10
7788,SCOTT,ANALYST,7566,1987/4/19,3000,,20
7839,KING,PRESIDENT,,1981/11/17,5000,,10
7844,TURNER,SALESMAN,7698,1981/9/8,1500,0,30
7876,ADAMS,CLERK,7788,1987/5/23,1100,,20
7900,JAMES,CLERK,7698,1981/12/3,950,,30
7902,FORD,ANALYST,7566,1981/12/3,3000,,20
7934,MILLER,CLERK,7782,1982/1/23,1300,,10
Approach
DeptReducer output:
ANALYST 21132.0,10566.0,SCOTT,42,39.0,FORD
CLERK 35320.0,8830.0,MILLER,43,40.5,SMITH
MANAGER 31792.0,10597.333333333334,JONES,42,42.0,CLARK
PRESIDENT 5000.0,5000.0,KING,42,42.0,KING
SALESMAN 36392.0,9098.0,ALLEN,42,42.0,TURNER
Setting
conf.set("mapreduce.job.inputformat.class", "org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat");
makes the second job read each line of the first job's output as a key/value pair, split at the first tab (KeyValueTextInputFormat's default separator), so the mapper's input key and value types both become Text. Note that the key used for sorting must implement the WritableComparable interface, not just Comparable; its contract is the same as Comparable's, so implementing compareTo is all that is needed.
Since the sort also needs the department name while the plan is to sort by key, I simply extended DeptOut and added a dept member variable (DeptSortOut).
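As a side note, the same input format can also be configured through the typed Job API instead of the string-keyed property; a minimal sketch, assuming the sortJob variable from the driver below and an import of org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat:
// Type-safe alternative to the conf.set(...) call above (sketch only, not the code used here).
sortJob.setInputFormatClass(KeyValueTextInputFormat.class);
// KeyValueTextInputFormat splits each line at the first tab by default; the separator
// can be changed through this property if a different delimiter is needed.
sortJob.getConfiguration().set("mapreduce.input.keyvaluelinerecordreader.key.value.separator", "\t");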
Final output:
ANALYST 21132.0,10566.0,SCOTT,42,39.0,FORD
MANAGER 31792.0,10597.333333333334,JONES,42,42.0,CLARK
CLERK 35320.0,8830.0,MILLER,43,40.5,SMITH
SALESMAN 36392.0,9098.0,ALLEN,42,42.0,TURNER
PRESIDENT 5000.0,5000.0,KING,42,42.0,KING
Code
DeptMapperOut.java
package edu.sugon;
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
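// Per-employee record emitted by DeptMapper: name, salary and age.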
public class DeptMapperOut implements Writable {
private String name;
private double salary;
private int age;
public void init(String name, double salary, int age) {
this.name = name;
this.salary = salary;
this.age = age;
}
@Override
public void write(DataOutput dataOutput) throws IOException {
dataOutput.writeUTF(name);
dataOutput.writeDouble(salary);
dataOutput.writeInt(age);
}
@Override
public void readFields(DataInput dataInput) throws IOException {
name = dataInput.readUTF();
salary = dataInput.readDouble();
age = dataInput.readInt();
}
public void setAge(int age) {
this.age = age;
}
public int getAge() {
return age;
}
public String getName() {
return name;
}
public double getSalary() {
return salary;
}
public void setName(String name) {
this.name = name;
}
public void setSalary(double salary) {
this.salary = salary;
}
}
DeptOut.java
package edu.sugon;
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
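// Aggregated result for one department: total and average salary, oldest and best-paid employee names, maximum and average age.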
public class DeptOut implements Writable {
private String maxAgeName;
private String maxSalaryName;
private double totSalary;
private double avgSalary;
private double avgAge;
private int maxAge;
public void init(String maxAgeName, String maxSalaryName, double totSalary, double avgSalary, int maxAge, double avgAge) {
this.maxAgeName = maxAgeName;
this.maxSalaryName = maxSalaryName;
this.totSalary = totSalary;
this.avgSalary = avgSalary;
this.avgAge = avgAge;
this.maxAge = maxAge;
}
@Override
public void write(DataOutput dataOutput) throws IOException {
dataOutput.writeUTF(maxAgeName);
dataOutput.writeUTF(maxSalaryName);
dataOutput.writeDouble(totSalary);
dataOutput.writeDouble(avgSalary);
dataOutput.writeDouble(avgAge);
dataOutput.writeInt(maxAge);
}
@Override
public void readFields(DataInput dataInput) throws IOException {
maxAgeName = dataInput.readUTF();
maxSalaryName = dataInput.readUTF();
totSalary = dataInput.readDouble();
avgSalary = dataInput.readDouble();
avgAge = dataInput.readDouble();
maxAge = dataInput.readInt();
}
@Override
public String toString() {
return totSalary + "," + avgSalary + "," + maxSalaryName + "," + maxAge + "," + avgAge + "," + maxAgeName;
}
public void setAvgSalary(double avgSalary) {
this.avgSalary = avgSalary;
}
public void setMaxAge(int maxAge) {
this.maxAge = maxAge;
}
public void setMaxAgeName(String maxAgeName) {
this.maxAgeName = maxAgeName;
}
public void setAvgAge(double avgAge) {
this.avgAge = avgAge;
}
public void setMaxSalaryName(String maxSalaryName) {
this.maxSalaryName = maxSalaryName;
}
public void setTotSalary(double totSalary) {
this.totSalary = totSalary;
}
public double getAvgSalary() {
return avgSalary;
}
public int getMaxAge() {
return maxAge;
}
public String getMaxAgeName() {
return maxAgeName;
}
public double getAvgAge() {
return avgAge;
}
public String getMaxSalaryName() {
return maxSalaryName;
}
public double getTotSalary() {
return totSalary;
}
}
DeptSortOut.java
package edu.sugon;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
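// DeptOut extended with the department name; used as the key of the sort job so records are ordered by compareTo.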
public class DeptSortOut extends DeptOut implements WritableComparable<DeptSortOut> {
private String dept;
public void init(String maxAgeName, String maxSalaryName, double totSalary, double avgSalary, int maxAge, double avgAge, String dept) {
super.init(maxAgeName, maxSalaryName, totSalary, avgSalary, maxAge, avgAge);
this.dept = dept;
}
public void init(DeptOut out, String dept) {
super.init(out.getMaxAgeName(), out.getMaxSalaryName(), out.getTotSalary(), out.getAvgSalary(), out.getMaxAge(), out.getAvgAge());
this.dept = dept;
}
@Override
public void write(DataOutput dataOutput) throws IOException {
super.write(dataOutput);
dataOutput.writeUTF(dept);
}
@Override
public void readFields(DataInput dataInput) throws IOException {
super.readFields(dataInput);
dept = dataInput.readUTF();
}
@Override
public int compareTo(DeptSortOut o) {
// MapReduce sorts keys in ascending compareTo order, so compare the other
// ratio against this one to get a descending sort by avgSalary / avgAge.
return Double.compare(o.getAvgSalary() / o.getAvgAge(), getAvgSalary() / getAvgAge());
}
@Override
public String toString() {
return dept + " " + super.toString();
}
public void setDept(String dept) {
this.dept = dept;
}
public String getDept() {
return dept;
}
}
DeptCount.java
package edu.sugon;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.time.LocalDate;
public class DeptCount {
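// Age is approximated as the current year minus the hire year (month and day ignored).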
public static int calcAge(int year) {
LocalDate current_date = LocalDate.now();
return current_date.getYear() - year;
}
static class DeptMapper extends Mapper<LongWritable, Text, Text, DeptMapperOut> {
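// First-stage mapper: parses one CSV line of emp data and emits the department field (attrs[2]) as the key with a DeptMapperOut value.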
@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, DeptMapperOut>.Context context) throws IOException, InterruptedException {
String row = value.toString();
String[] attrs = row.split(",");
String year = attrs[4].split("/")[0];
int age = calcAge(Integer.parseInt(year));
DeptMapperOut out = new DeptMapperOut();
double salary = 0;
salary += attrs[3].isEmpty()?0:Double.parseDouble(attrs[3]);
salary += attrs[5].isEmpty()?0:Double.parseDouble(attrs[5]);
out.init(attrs[1], salary, age);
context.write(new Text(attrs[2]), out);
}
}
static class DeptReducer extends Reducer<Text, DeptMapperOut, Text, DeptOut> {
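// First-stage reducer: aggregates all employees sharing one key into totals, averages and maxima.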
@Override
protected void reduce(Text key, Iterable<DeptMapperOut> values, Reducer<Text, DeptMapperOut, Text, DeptOut>.Context context) throws IOException, InterruptedException {
String maxAgeName, maxSalaryName;
double totSalary, avgSalary, avgAge, maxSalary;
int maxAge, cnt;
maxAgeName = maxSalaryName = null;
totSalary = avgSalary = avgAge = 0;
maxSalary = cnt = maxAge = 0;
for (DeptMapperOut out: values) {
totSalary += out.getSalary();
avgAge += out.getAge();
cnt += 1;
if (out.getSalary() > maxSalary) {
maxSalaryName = out.getName();
maxSalary = out.getSalary();
}
if (out.getAge() > maxAge) {
maxAgeName = out.getName();
maxAge = out.getAge();
}
}
avgSalary = totSalary / cnt;
avgAge = avgAge / cnt;
DeptOut out = new DeptOut();
out.init(maxAgeName, maxSalaryName, totSalary, avgSalary, maxAge, avgAge);
context.write(key, out);
}
}
static class SortMapper extends Mapper<Text, Text, DeptSortOut, NullWritable> {
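// Second-stage mapper: the first job's output arrives as (Text key = department, Text value = comma-separated DeptOut fields); rebuild a DeptSortOut and emit it as the key so the framework sorts by it.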
@Override
protected void map(Text key, Text value, Mapper<Text, Text, DeptSortOut, NullWritable>.Context context) throws IOException, InterruptedException {
String[] attrs = value.toString().split(",");
DeptSortOut out = new DeptSortOut();
int maxAge = Integer.parseInt(attrs[3]);
double totSalary = Double.parseDouble(attrs[0]);
double avgSalary = Double.parseDouble(attrs[1]), avgAge = Double.parseDouble(attrs[4]);
String maxSalaryName = attrs[2], maxAgeName = attrs[5];
out.init(maxAgeName, maxSalaryName, totSalary, avgSalary, maxAge, avgAge, key.toString());
context.write(out, NullWritable.get());
}
}
static class SortReducer extends Reducer<DeptSortOut, NullWritable, DeptSortOut, NullWritable> {
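// Identity reducer: keys already arrive sorted by DeptSortOut.compareTo and are written out unchanged.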
@Override
protected void reduce(DeptSortOut key, Iterable<NullWritable> values, Reducer<DeptSortOut, NullWritable, DeptSortOut, NullWritable>.Context context) throws IOException, InterruptedException {
for (NullWritable val: values) {
context.write(key, NullWritable.get());
}
}
}
public static void main(String[] args) throws Exception {
// test calcAge
//System.out.println(calcAge(2002));
System.setProperty("HADOOP_USER_NAME", "hadoop");
System.setProperty("hadoop.home.dir", "D:\\Program Files (x86)\\hadoop-3.1.1");
Configuration conf = new Configuration();
// the class that contains main(), used to locate the job jar
Job job = Job.getInstance(conf);
job.setJarByClass(DeptCount.class);
// set the mapper class
job.setMapperClass(DeptCount.DeptMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(DeptMapperOut.class);
// set the reducer class
job.setReducerClass(DeptCount.DeptReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(DeptOut.class);
// if the output directory already exists, delete it
String outPathStr = "hdfs://192.168.10.100:9000/2021030541035/data/output2";
Path outPath = new Path(outPathStr);
FileSystem fs = outPath.getFileSystem(conf);
if (fs.exists(outPath)) {
fs.delete(outPath, true);
}
// set the input and output paths
FileInputFormat.setInputPaths(job, new Path("hdfs://192.168.10.100:9000/2021030541035/data/emp.csv"));
FileOutputFormat.setOutputPath(job, outPath);
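// Submit the sort job only after the first job has finished successfully, since it reads that job's output; the property set below makes it use KeyValueTextInputFormat.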
if (job.waitForCompletion(true)) {
conf.set("mapreduce.job.inputformat.class", "org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat");
Job sortJob = Job.getInstance(conf);
sortJob.setJarByClass(DeptCount.class);
// set the mapper class
sortJob.setMapperClass(SortMapper.class);
sortJob.setMapOutputKeyClass(DeptSortOut.class);
sortJob.setMapOutputValueClass(NullWritable.class);
// set the reducer class
sortJob.setReducerClass(SortReducer.class);
sortJob.setOutputKeyClass(DeptSortOut.class);
sortJob.setOutputValueClass(NullWritable.class);
Path sortOutPath = new Path("hdfs://192.168.10.100:9000/2021030541035/data/output3");
if (fs.exists(sortOutPath)) {
fs.delete(sortOutPath, true);
}
// set the input (the first job's output files) and output paths
FileInputFormat.setInputPaths(sortJob, new Path(outPathStr + "/part-r-*"));
FileOutputFormat.setOutputPath(sortJob, sortOutPath);
sortJob.waitForCompletion(true);
}
}
}