Lab 5: Basic MapReduce Programming Practice
Lab Procedure
Given two input files, file A and file B, write a MapReduce program that merges the two files and removes duplicate records, producing a new output file C. A sample of the input and output files is given below for reference.
A sample of input file A:
20170101 x
20170102 y
20170103 x
20170104 y
20170105 z
20170106 x
A sample of input file B:
20170101 y
20170102 y
20170103 x
20170104 z
20170105 y
The output file C obtained by merging input files A and B:
20170101 x
20170101 y
20170102 y
20170103 x
20170104 y
20170104 z
20170105 y
20170105 z
20170106 x
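Before turning to MapReduce, the expected contents of C can be reproduced with a few lines of plain Java: a sorted set both removes duplicates and orders the records, which is essentially what the shuffle phase of the Task 1 job below does with its composite key. The sketch is only a local sanity check under assumed names (A.txt and B.txt are placeholder local copies of the two input files, and the class name is illustrative).

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.TreeSet;
public class MergeDedupSketch {
    public static void main(String[] args) throws IOException {
        // A TreeSet removes duplicates and keeps records sorted,
        // mimicking what the MapReduce shuffle does with the composite key
        TreeSet<String> merged = new TreeSet<>();
        for (String file : new String[]{"A.txt", "B.txt"}) { // placeholder local copies of A and B
            for (String line : Files.readAllLines(Paths.get(file))) {
                String trimmed = line.trim();
                if (!trimmed.isEmpty()) {
                    merged.add(trimmed.replaceAll("\\s+", "\t")); // normalize the separator to a tab
                }
            }
        }
        merged.forEach(System.out::println); // prints the expected contents of C
    }
}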
There are several input files, and every line of each file contains a single integer. Read the integers from all files, sort them in ascending order, and write the result to a new file with two integers per line: the first is the rank of the second integer in the sorted order, and the second is one of the original integers. A sample of the input and output files is given below for reference.
A sample of input file 1:
33
37
12
40
A sample of input file 2:
4
16
39
5
A sample of input file 3:
1
45
25
The output file produced from input files 1, 2, and 3:
1 1
2 4
3 5
4 12
5 16
6 25
7 33
8 37
9 39
10 40
11 45
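The ranking rule itself can also be checked locally: sort all the integers in ascending order and print rank/value pairs. The sketch below hard-codes the sample values (the class name is illustrative); the single-reducer MapReduce job in Task 2 produces exactly this output.

import java.util.Arrays;
public class RankSketch {
    public static void main(String[] args) {
        // All integers from the three sample input files
        int[] numbers = {33, 37, 12, 40, 4, 16, 39, 5, 1, 45, 25};
        Arrays.sort(numbers); // ascending order, like the MapReduce key sort
        int rank = 1;
        for (int n : numbers) {
            System.out.println(rank + "\t" + n); // first column: rank, second column: original value
            rank++;
        }
    }
}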
A child-parent table is given below. Mine the parent-child relationships in it and produce a table of grandchild-grandparent relationships.
The input file contains:
child parent
Steven Lucy
Steven Jack
Jone Lucy
Jone Jack
Lucy Mary
Lucy Frank
Jack Alice
Jack Jesse
David Alice
David Jesse
Philip David
Philip Alma
Mark David
Mark Alma
The output file should contain:
grandchild grandparent
Steven Alice
Steven Jesse
Jone Alice
Jone Jesse
Steven Mary
Steven Frank
Jone Mary
Jone Frank
Philip Alice
Philip Jesse
Mark Alice
Mark Jesse
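The grandchild-grandparent table is a self-join of the child-parent table on the shared middle person: if X is a child of Y and Y is a child of Z, then (X, Z) is a grandchild-grandparent pair. The in-memory sketch below (class name illustrative, sample rows hard-coded) makes that join explicit; the MapReduce program in Task 3 performs the same join on the reduce side.

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class GrandParentSketch {
    public static void main(String[] args) {
        // The child-parent rows from the sample input
        String[][] rows = {
            {"Steven", "Lucy"}, {"Steven", "Jack"}, {"Jone", "Lucy"}, {"Jone", "Jack"},
            {"Lucy", "Mary"}, {"Lucy", "Frank"}, {"Jack", "Alice"}, {"Jack", "Jesse"},
            {"David", "Alice"}, {"David", "Jesse"}, {"Philip", "David"}, {"Philip", "Alma"},
            {"Mark", "David"}, {"Mark", "Alma"}
        };
        // Map each person to the list of that person's parents
        Map<String, List<String>> parentsOf = new HashMap<>();
        for (String[] row : rows) {
            parentsOf.computeIfAbsent(row[0], k -> new ArrayList<>()).add(row[1]);
        }
        // For every (child, parent) row, every parent of that parent is a grandparent of the child
        for (String[] row : rows) {
            for (String grandparent : parentsOf.getOrDefault(row[1], new ArrayList<>())) {
                System.out.println(row[0] + "\t" + grandparent);
            }
        }
    }
}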
Source Code
Dependencies
pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.example</groupId>
<artifactId>step1</artifactId>
<version>1.0-SNAPSHOT</version>
<properties>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>3.3.4</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>3.3.4</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.8.1</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
</plugins>
</build>
</project>
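With this POM the project can be packaged into a jar (for example with mvn package) and the classes under com.example submitted to the cluster; the exact jar name depends on the artifactId and version declared above. hadoop-common and hadoop-client are kept at the same version here, since mixing Hadoop versions easily leads to classpath conflicts.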
Task 1
MergeDeduplicateMapper
package com.example;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class MergeDeduplicateMapper extends Mapper<LongWritable, Text, Text, Text> {
private final Text outputKey = new Text();
private final Text outputValue = new Text();
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String line = value.toString().trim();
// Skip empty lines
if (line.isEmpty()) {
return;
}
// Split on tabs or spaces
String[] parts = line.split("\\s+");
if (parts.length >= 2) {
// Use the date-value combination as the key so duplicate records are removed
String date = parts[0];
String val = parts[1];
// Build the composite key: date + value
String compositeKey = date + "\t" + val;
outputKey.set(compositeKey);
outputValue.set(""); // The value is left empty; only the key matters
context.write(outputKey, outputValue);
}
}
}
MergeDeduplicateReducer
package com.example;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class MergeDeduplicateReducer extends Reducer<Text, Text, Text, Text> {
@Override
protected void reduce(Text key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
// Because the composite key already deduplicated records in the map/shuffle phase, each key arrives here exactly once
// Simply emit the key, which contains both the date and the value
String compositeKey = key.toString();
String[] parts = compositeKey.split("\t");
if (parts.length == 2) {
Text outputKey = new Text(parts[0]);
Text outputValue = new Text(parts[1]);
context.write(outputKey, outputValue);
}
}
}
MergeDeduplicateDriver
package com.example;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
public class MergeDeduplicateDriver {
public static void main(String[] args) throws Exception {
if (args.length != 3) {
System.err.println("Usage: MergeDeduplicate <input path A> <input path B> <output path>");
System.exit(-1);
}
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "Merge and Deduplicate Files");
// Set the jar class
job.setJarByClass(MergeDeduplicateDriver.class);
// Set the Mapper and Reducer classes
job.setMapperClass(MergeDeduplicateMapper.class);
job.setReducerClass(MergeDeduplicateReducer.class);
// Set the output key/value types
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
// Set the input and output formats
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
// Set the input and output paths
FileInputFormat.addInputPath(job, new Path(args[0])); // file A
FileInputFormat.addInputPath(job, new Path(args[1])); // file B
FileOutputFormat.setOutputPath(job, new Path(args[2])); // output file C
// Set the number of reduce tasks
job.setNumReduceTasks(1);
// Wait for the job to complete
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
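As an optional variation (not required by the assignment), the driver can implement Hadoop's Tool interface and be launched through ToolRunner, so that generic options such as -D property=value are parsed automatically. The sketch below shows this for Task 1 only; the class name is illustrative, and the same pattern applies unchanged to the other two drivers.

package com.example;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class MergeDeduplicateTool extends Configured implements Tool {
    @Override
    public int run(String[] args) throws Exception {
        if (args.length != 3) {
            System.err.println("Usage: MergeDeduplicateTool <input path A> <input path B> <output path>");
            return -1;
        }
        // getConf() already contains any -D options parsed by ToolRunner
        Job job = Job.getInstance(getConf(), "Merge and Deduplicate Files (Tool)");
        job.setJarByClass(MergeDeduplicateTool.class);
        job.setMapperClass(MergeDeduplicateMapper.class);
        job.setReducerClass(MergeDeduplicateReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileInputFormat.addInputPath(job, new Path(args[1]));
        FileOutputFormat.setOutputPath(job, new Path(args[2]));
        job.setNumReduceTasks(1);
        return job.waitForCompletion(true) ? 0 : 1;
    }
    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new MergeDeduplicateTool(), args));
    }
}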
Task 2
SortMapper
package com.example;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class SortMapper extends Mapper<LongWritable, Text, IntWritable, IntWritable> {
private final IntWritable number = new IntWritable();
private static final IntWritable one = new IntWritable(1);
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String line = value.toString().trim();
// Skip empty lines
if (line.isEmpty()) {
return;
}
try {
// Parse the line as an integer
int num = Integer.parseInt(line);
number.set(num);
// Emit (number, 1); the 1 is only a placeholder
context.write(number, one);
} catch (NumberFormatException e) {
// Skip anything that is not a number
System.err.println("Skipping non-numeric content: " + line);
}
}
}
SortReducer
package com.example;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Reducer;
public class SortReducer extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {
private final IntWritable rank = new IntWritable(1);
@Override
protected void reduce(IntWritable key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
// MapReduce has already sorted the keys (the numbers), so we only need to emit the ranks in order
for (IntWritable value : values) {
context.write(rank, key);
// Increment the rank
rank.set(rank.get() + 1);
}
}
}
SortDriver
package com.example;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
public class SortDriver {
public static void main(String[] args) throws Exception {
if (args.length != 2) {
System.err.println("Usage: SortDriver <input path> <output path>");
System.exit(-1);
}
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "Number Sort with Rank");
// Set the jar class
job.setJarByClass(SortDriver.class);
// Set the Mapper and Reducer classes
job.setMapperClass(SortMapper.class);
job.setReducerClass(SortReducer.class);
// Set the Mapper output key/value types
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(IntWritable.class);
// Set the final output key/value types
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(IntWritable.class);
// Set the input and output formats
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
// Set the input and output paths
FileInputFormat.addInputPath(job, new Path(args[0])); // input directory (contains all input files)
FileOutputFormat.setOutputPath(job, new Path(args[1])); // output directory
// Use a single reduce task to guarantee a global sort
job.setNumReduceTasks(1);
// Wait for the job to complete
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
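A note on the design: the job forces a single reduce task, so every number reaches the same SortReducer instance, and the framework delivers the keys to it in ascending order; the instance-level rank counter therefore yields a correct global ranking, and duplicate numbers (if any) each receive their own rank because the reducer writes once per value. With more than one reducer, each output file would only be sorted within its own partition and the ranks would no longer be global.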
Task 3
GrandParentMapper
package com.example;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class GrandParentMapper extends Mapper<LongWritable, Text, Text, Text> {
private final Text outputKey = new Text();
private final Text outputValue = new Text();
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String line = value.toString().trim();
// Skip empty lines and header lines
if (line.isEmpty() || line.startsWith("child") || line.startsWith("grandchild")) {
return;
}
// Split on tabs or spaces
String[] parts = line.split("\\s+");
if (parts.length >= 2) {
String child = parts[0];
String parent = parts[1];
// Emit each record twice:
// 1. keyed by the child: key = child, value = "1:" + parent (tag 1 marks a parent of the key person)
// 2. keyed by the parent: key = parent, value = "2:" + child (tag 2 marks a child of the key person)
outputKey.set(child);
outputValue.set("1:" + parent);
context.write(outputKey, outputValue);
outputKey.set(parent);
outputValue.set("2:" + child);
context.write(outputKey, outputValue);
}
}
}
GrandParentReducer
package com.example;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class GrandParentReducer extends Reducer<Text, Text, Text, Text> {
private final Text outputKey = new Text();
private final Text outputValue = new Text();
@Override
protected void reduce(Text key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
// Collect this person's children and parents
List<String> children = new ArrayList<>();
List<String> parents = new ArrayList<>();
for (Text value : values) {
String valStr = value.toString();
if (valStr.startsWith("1:")) {
// Tag 1: the value is one of this person's parents
parents.add(valStr.substring(2));
} else if (valStr.startsWith("2:")) {
// Tag 2: the value is one of this person's children
children.add(valStr.substring(2));
}
}
// Generate grandchild-grandparent pairs: this person's parents (grandparents) x this person's children (grandchildren)
for (String parent : parents) {
for (String child : children) {
outputKey.set(child); // grandchild
outputValue.set(parent); // grandparent
context.write(outputKey, outputValue);
}
}
}
}
GrandParentDriver
package com.example;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
public class GrandParentDriver {
public static void main(String[] args) throws Exception {
if (args.length != 2) {
System.err.println("Usage: GrandParentDriver <input path> <output path>");
System.exit(-1);
}
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "Find Grandparent Relationships");
// Set the jar class
job.setJarByClass(GrandParentDriver.class);
// Set the Mapper and Reducer classes
job.setMapperClass(GrandParentMapper.class);
job.setReducerClass(GrandParentReducer.class);
// Set the output key/value types
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
// Set the input and output formats
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
// Set the input and output paths
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
// Set the number of reduce tasks
job.setNumReduceTasks(1);
// Wait for the job to complete
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
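Two small points about the output: the job emits only the data rows, so the grandchild grandparent header shown in the sample would have to be added separately if it is required, and since a single reducer is used all pairs end up in one part-r-00000 file (their order may differ from the sample, but the set of pairs is the same).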