MapReduce的代码编写----学生数据和总分数据关联(join)
程序代码
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
 * Reduce-side join of two HDFS datasets keyed by student id:
 * - student records: comma-separated "id,name,...,clazz" (clazz at index 4)
 * - total-score records: tab-separated "id\tscore" (output of a previous job)
 * Output per student: id,name,clazz,score
 */
public class Demo4Join {
    // Map side: detect which file each record came from, tag it with a marker,
    // and emit it keyed by student id so both sides meet in the same reduce call.
    public static class MyMapper extends Mapper<LongWritable, Text, LongWritable, Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Text has no split method, so convert to String first.
            String v = value.toString();
            if (v.contains(",")) {
                // Comma-delimited => student record.
                String[] stuSplit = v.split(",");
                // The id column is numeric; everything else stays as String.
                long id = Long.parseLong(stuSplit[0]);
                String name = stuSplit[1];
                String clazz = stuSplit[4];
                // Trailing "|" marks this value as student data for the reducer.
                context.write(new LongWritable(id), new Text(name + "," + clazz + "|"));
            } else {
                // Otherwise it is a total-score record, which is tab-delimited.
                String[] sumScoreSplit = v.split("\t");
                long sId = Long.parseLong(sumScoreSplit[0]);
                // The score is only concatenated into the output, so keep it a String.
                String sScore = sumScoreSplit[1];
                // Trailing "#" marks this value as score data for the reducer.
                context.write(new LongWritable(sId), new Text(sScore + "#"));
            }
        }
    }

    // Reduce side: for each student id, pick the student half and the score half
    // out of the grouped values (identified by their markers) and join them.
    public static class MyReducer extends Reducer<LongWritable, Text, LongWritable, Text> {
        @Override
        protected void reduce(LongWritable key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // Empty defaults so a missing side does not inject stray spaces.
            String stuV = "";
            String sumScoreV = "";
            for (Text value : values) {
                String v = value.toString();
                if (v.contains("|")) {
                    // Student data: strip the "|" marker entirely (replacing it with a
                    // space would leave a dangling blank before the separator).
                    stuV = v.replace("|", "");
                } else {
                    // Score data: strip the "#" marker.
                    sumScoreV = v.replace("#", "");
                }
            }
            // Emit "name,clazz,score" under the student id key.
            context.write(key, new Text(stuV + "," + sumScoreV));
        }
    }

    // Driver: wire up the job and run it.
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        // Separator between the output key and value (current property name;
        // the old "mapred.textoutputformat.separator" key is deprecated).
        conf.set("mapreduce.output.textoutputformat.separator", ",");
        Job job = Job.getInstance(conf);
        job.setJobName("Demo4Join");
        job.setJarByClass(Demo4Join.class);

        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        // Both input datasets feed the same mapper, which tells them apart by format.
        FileInputFormat.addInputPath(job, new Path("/student/input"));
        FileInputFormat.addInputPath(job, new Path("/student/score/output"));

        // The output directory must not pre-exist; delete a leftover one via the HDFS API.
        Path outPath = new Path("/student/join/output");
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outPath)) {
            fs.delete(outPath, true);
        }
        FileOutputFormat.setOutputPath(job, outPath);

        // Propagate job success/failure through the process exit code.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
运行结果
1500100001,施笑槐,文科六班 ,406
1500100002,吕金鹏,文科六班 ,440
1500100003,单乐蕊,理科六班 ,359
1500100004,葛德曜,理科三班 ,421
1500100005,宣谷芹,理科五班 ,395
1500100006,边昂雄,理科二班 ,314
1500100007,尚孤风,文科六班 ,418
1500100008,符半双,理科六班 ,363
1500100009,沈德昌,理科一班 ,251
1500100010,羿彦昌,理科六班 ,402
1500100011,宰运华,理科三班 ,282
1500100012,梁易槐,理科一班 ,459
1500100013,逯君昊,文科二班 ,369
1500100014,羿旭炎,理科五班 ,396
1500100015,宦怀绿,理科一班 ,309
1500100016,潘访烟,文科一班 ,359
1500100017,高芷天,理科五班 ,263
1500100018,骆怜雪,文科六班 ,425
1500100019,娄曦之,理科三班 ,433
...共6000行