【WordCount案例】(三)把键值按照ASCII码奇偶分区(Partitioner)

对于Partitioner,可以按 Ctrl+H 查看它的实现类,点击右侧的 HashPartitioner;在没有指定自定义分区类时,Hadoop 默认使用这种分区方式。

一、WordCountMapper类

package WordCount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Mapper of the word-count job: emits {@code <word, 1>} for every word of each input line.
 *
 * <p>The type parameters {@code <KEYIN, VALUEIN, KEYOUT, VALUEOUT>} are bound to
 * {@code <LongWritable, Text, Text, IntWritable>}.
 */
public class WordCountMapper extends Mapper <LongWritable, Text,Text, IntWritable> {
    // Output key/value holders, reused across map() calls and overwritten before each write.
    Text k = new Text();
    IntWritable v = new IntWritable();

    /**
     * Splits one input line on single spaces and writes {@code <word, 1>} for each non-empty token.
     *
     * <p>Empty tokens (produced by blank lines or by consecutive spaces) are skipped so that
     * they are neither counted as words nor handed to the partitioner as empty keys.
     *
     * @param key     input key supplied by the framework; not used here
     * @param value   one line of input text
     * @param context job context used to emit the output pairs
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 1. Convert the Hadoop type to a Java String.
        String line = value.toString();
        // 2. Split the line into tokens.
        String[] words = line.split(" ");
        // 3. Emit <word, 1>; the count is always 1, so set it once outside the loop.
        v.set(1);
        for (String word : words) {
            if (word.isEmpty()) {
                // Blank line or repeated separator: not a word.
                continue;
            }
            k.set(word);
            context.write(k, v);
        }
    }
}

二、分区类-WordCountPartitioner

package WordCount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

/**
 * Partitions word-count keys by the parity of the character code of the key's first character.
 *
 * <p>For ASCII characters the character code is the ASCII code. Keys that start with a digit
 * land in the same partition as the digit's own parity (the code of {@code '0'} is even).
 */
public class WordCountPartitioner extends Partitioner<Text, IntWritable> {
    /**
     * Chooses the partition for a key.
     *
     * @param key   the map output key (a word)
     * @param value the map output value; not used
     * @param i     the total number of partitions
     * @return {@code 0} when the first character's code is even or the key is empty,
     *         {@code 1} when it is odd; the result is folded into {@code [0, i)} so that
     *         a single-partition job always gets {@code 0}
     */
    @Override
    public int getPartition(Text key, IntWritable value, int i) {
        String word = key.toString();
        if (word.isEmpty()) {
            // No first character to inspect; send empty keys to the first partition.
            return 0;
        }
        // char is unsigned, so the remainder is always 0 or 1.
        int parity = word.charAt(0) % 2;
        // Never return a partition index that does not exist.
        return parity % i;
    }
}

三、WordCountReducer类

package WordCount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Reducer of the word-count job: adds up the counts received for one word.
 *
 * <p>The type parameters {@code <KEYIN, VALUEIN, KEYOUT, VALUEOUT>} are bound to
 * {@code <Text, IntWritable, Text, IntWritable>}.
 */
public class WordCountReducer extends Reducer<Text, IntWritable, Text,IntWritable> {
    // Output value holder, overwritten on every reduce() call.
    IntWritable v = new IntWritable();

    /**
     * Writes {@code <key, total>} where total is the sum of all values grouped under the key.
     *
     * @param key     the word
     * @param values  the counts collected for that word
     * @param context job context used to emit the result
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        v.set(sum(values));
        context.write(key, v);
    }

    /** Returns the sum of all the given counts, or 0 when there are none. */
    private static int sum(Iterable<IntWritable> counts) {
        int total = 0;
        for (IntWritable occurrence : counts) {
            total = total + occurrence.get();
        }
        return total;
    }
}

四、执行类-WordCountDriver

package WordCount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Driver of the word-count job: configures the mapper, partitioner and reducer, runs the job
 * and prints the elapsed wall-clock time in milliseconds.
 */
public class WordCountDriver {
    /**
     * Configures and runs the job.
     *
     * @param args {@code args[0]} is the input path and {@code args[1]} is the output
     *             directory; when fewer than two arguments are given, the built-in local
     *             Windows paths are used instead
     * @throws Exception if the job cannot be configured, submitted or monitored
     */
    public static void main(String[] args) throws Exception {

        // Timing: start.
        long startTime = System.currentTimeMillis();

        // Fall back to the local Windows input path and output directory only when the
        // caller did not supply both paths, so command-line arguments are honored.
        if (args == null || args.length < 2) {
            args = new String[]{"D:/new 1", "D:/HDFS/f"};
        }

        // 1. Load the configuration; one MapReduce run is one Job.
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        // 2. Register the classes that make up the job.
        job.setJarByClass(WordCountDriver.class);
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);

        job.setPartitionerClass(WordCountPartitioner.class);
        // Two reduce tasks, one per partition produced by WordCountPartitioner.
        job.setNumReduceTasks(2);

        // 3. Map output (= reduce input) key/value types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // Final (reduce) output key/value types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // 4. Input and output locations.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // 5. Submit the job and wait for it to finish; keep the outcome instead of dropping it.
        boolean succeeded = job.waitForCompletion(true);

        // Timing: end.
        long endTime = System.currentTimeMillis();
        System.out.println("程序运行的时间为:"+(endTime-startTime));

        // Report a failed job through the process exit status.
        if (!succeeded) {
            System.exit(1);
        }
    }
}

注意:如果 ReduceTask 的个数(setNumReduceTasks 设置的值)大于1,但小于分区器可能返回的分区编号个数,就会报错(Illegal partition);如果设置为1,则不会调用自定义分区器,所有数据都进入同一个分区。

 

博客园  ©  2004-2025
浙公网安备 33010602011771号 浙ICP备2021040463号-3