MapReduce Case 5: Inverted Index

1. Sample Data

There are three files: a.txt, b.txt, and c.txt. Each file contains a number of words.

Contents of a.txt:

I Love Hadoop
he like ZhouSiYuan
I love me

Contents of b.txt:

I Love MapReduce
he like NBA
I love Hadoop

Contents of c.txt:

I Love MapReduce
I love me
I Love Hadoop

2. Requirement

  • Build a search index so that documents can be looked up by the words they contain.

3. Analysis

  • 1. First MapReduce job: count how many times each word appears in each file and append the file name to the word, producing records such as I--a.txt 2 and I--b.txt 2.

  • 2. Second MapReduce job: for each word, collect all the files it appears in together with its count in each file, producing records such as I  a.txt-->2  b.txt-->2  c.txt-->3, as illustrated below.
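
For the sample data above, the word I flows through the two jobs as follows (a listing derived from the sample files; the ordering of the entries within the final line is not guaranteed):

Output of the first job:

I--a.txt	2
I--b.txt	2
I--c.txt	3

Output of the second job:

I	a.txt-->2	b.txt-->2	c.txt-->3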

4. Code Implementation

  • 1. First-pass Mapper: the OneIndexMapper class:
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class OneIndexMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    Text k = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 1. Get the input split and the name of the source file
        FileSplit inputSplit = (FileSplit) context.getInputSplit();
        String name = inputSplit.getPath().getName();

        // 2. Get one line of input
        String line = value.toString();

        // 3. Split the line into words
        String[] words = line.split(" ");

        // 4. Associate each word with the file name and emit a count of 1
        for (String word : words) {
            k.set(word + "--" + name);
            
            context.write(k, new IntWritable(1));
        }
    }
}
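
A note on the cast in step 1: getInputSplit() can be cast to FileSplit here because the job uses the default, file-based TextInputFormat. If the input format were changed (for example to CombineTextInputFormat), the split would no longer be a FileSplit and the source file name would have to be obtained differently.
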
  • 2. First-pass Reducer: the OneIndexReducer class:
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class OneIndexReducer extends Reducer<Text, IntWritable, Text, IntWritable>{
    
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        
        int count = 0;
        // Sum the counts for this word--fileName key
        for(IntWritable value: values){
            count +=value.get();
        }
        
        // Write out the key and its total count
        context.write(key, new IntWritable(count));
    }
}

  • 3. First-pass Driver: the OneIndexDriver class:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class OneIndexDriver {

    public static void main(String[] args) throws Exception {

        // The data directory contains the three files a.txt, b.txt, and c.txt
        args = new String[]{"D:\\大数据API\\data","D:\\大数据API\\data1"};

        Configuration conf = new Configuration();

        Job job = Job.getInstance(conf);
        job.setJarByClass(OneIndexDriver.class);

        job.setMapperClass(OneIndexMapper.class);
        job.setReducerClass(OneIndexReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.waitForCompletion(true);
    }
}
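
Because OneIndexReducer does nothing but sum IntWritable counts, it can also be registered as a combiner so that counts are pre-aggregated on the map side before the shuffle. This is an optional optimization that is not part of the original driver; the sketch below is the single extra line you would add to OneIndexDriver's main method:

        // Optional: pre-aggregate counts on the map side (assumption, not in the original code)
        job.setCombinerClass(OneIndexReducer.class);
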
  • 4. Second-pass Mapper: the TwoIndexMapper class:
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class TwoIndexMapper extends Mapper<LongWritable, Text, Text, Text>{
    Text k = new Text();
    Text v = new Text();
    
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        
        // 1. Get one line of the first job's output
        String line = value.toString();
        
        // 2. Split on "--": fields[0] is the word, fields[1] is "fileName<TAB>count"
        String[] fields = line.split("--");
        
        k.set(fields[0]);
        v.set(fields[1]);
        
        // 3. Emit the word as the key and "fileName<TAB>count" as the value
        context.write(k, v);
    }
}
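
The split on "--" works because the first job's default TextOutputFormat writes each record as the key, a tab character, and the value, so an input line here looks like I--a.txt<TAB>2. After the split, fields[0] holds the word and fields[1] holds "fileName<TAB>count", which is exactly the string the second reducer later rewrites into fileName-->count.
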
  • 5. Second-pass Reducer: the TwoIndexReducer class:
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class TwoIndexReducer extends Reducer<Text, Text, Text, Text> {

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        // For key "I", the values arriving here are the pieces after "--":
        //   a.txt	2
        //   b.txt	2
        //   c.txt	3
        // The reducer concatenates them into one line:
        //   I	a.txt-->2	b.txt-->2	c.txt-->3
        // (the order of the values within the line is not guaranteed)

        StringBuilder sb = new StringBuilder();

        for (Text value : values) {
            sb.append(value.toString().replace("\t", "-->") + "\t");
        }
        
        context.write(key, new Text(sb.toString()));
    }
}
  • 6. Second-pass Driver: the TwoIndexDriver class:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class TwoIndexDriver {

    public static void main(String[] args) throws Exception {

        args = new String[]{"D:\\大数据API\\data1","D:\\大数据API\\data2"};

        Configuration config = new Configuration();
        Job job = Job.getInstance(config);
        job.setJarByClass(TwoIndexDriver.class);

        job.setMapperClass(TwoIndexMapper.class);
        job.setReducerClass(TwoIndexReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
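
The two drivers above are meant to be run one after the other, with the first job's output directory used as the second job's input. If you would rather launch both passes from a single entry point, the jobs can be chained by waiting for the first to finish before submitting the second. The following is only a sketch under that assumption: the class name IndexJobChain is hypothetical, it reuses the four Mapper/Reducer classes shown above, and it takes the input, intermediate, and output paths from the command line instead of hard-coding them.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class IndexJobChain {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path input = new Path(args[0]);        // directory containing a.txt, b.txt, c.txt
        Path intermediate = new Path(args[1]); // output of job 1, input of job 2
        Path output = new Path(args[2]);       // final inverted index

        // Job 1: count each word per file, keyed by "word--fileName"
        Job job1 = Job.getInstance(conf, "one-index");
        job1.setJarByClass(IndexJobChain.class);
        job1.setMapperClass(OneIndexMapper.class);
        job1.setReducerClass(OneIndexReducer.class);
        job1.setMapOutputKeyClass(Text.class);
        job1.setMapOutputValueClass(IntWritable.class);
        job1.setOutputKeyClass(Text.class);
        job1.setOutputValueClass(IntWritable.class);
        FileInputFormat.setInputPaths(job1, input);
        FileOutputFormat.setOutputPath(job1, intermediate);

        // Only run job 2 if job 1 succeeded
        if (!job1.waitForCompletion(true)) {
            System.exit(1);
        }

        // Job 2: merge the per-file counts of each word into one line
        Job job2 = Job.getInstance(conf, "two-index");
        job2.setJarByClass(IndexJobChain.class);
        job2.setMapperClass(TwoIndexMapper.class);
        job2.setReducerClass(TwoIndexReducer.class);
        job2.setMapOutputKeyClass(Text.class);
        job2.setMapOutputValueClass(Text.class);
        job2.setOutputKeyClass(Text.class);
        job2.setOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(job2, intermediate);
        FileOutputFormat.setOutputPath(job2, output);

        System.exit(job2.waitForCompletion(true) ? 0 : 1);
    }
}
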
  • Result screenshots

Output of the first MapReduce job:

Output of the second MapReduce job:
