Hadoop Basic Programming [Under Construction]

1. Word Count

Count how many times each word appears in the input text.

map: split each line into words and emit each word as the key with 1 as the value. reduce: sum the 1s for each key to obtain that word's count.

Other: a combiner is also configured on the job.

import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

/**
 * Word count: count how many times each word appears in the input text.
 */
public class WordCount {

    public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
        private static final IntWritable one = new IntWritable(1);
        private Text word = new Text();

        public void map(Object key, Text value, Mapper<Object, Text, Text, IntWritable>.Context context) throws IOException, InterruptedException {
            //Split the line on whitespace: spaces, tabs (\t), newlines (\n) and carriage returns (\r)
            StringTokenizer itr = new StringTokenizer(value.toString());
            //hasMoreTokens() checks whether any tokens remain; nextToken() returns the next one
            while(itr.hasMoreTokens()) {
                this.word.set(itr.nextToken());
                context.write(this.word, one);
            }
        }
    }

    public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        public void reduce(Text key, Iterable<IntWritable> values, Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
            int sum = 0;

            //Sum all the counts emitted for this key
            for (IntWritable val : values) {
                sum += val.get();
            }

            this.result.set(sum);
            context.write(key, this.result);
        }
    }

    public static void main(String[] args) throws Exception{
        Configuration conf = new Configuration();//Initialize the job configuration
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();//Parse the arguments passed on the command line
        if (otherArgs.length < 2) {//At least one input path and one output path are required
            System.err.println("Usage: WordCount <in> [<in>...] <out>");
            System.exit(2);
        }

        Job job = Job.getInstance(conf, "word count");//Create the job object
        job.setJarByClass(WordCount.class);//Class whose jar is shipped to the cluster
        job.setMapperClass(TokenizerMapper.class);//Mapper class
        job.setCombinerClass(IntSumReducer.class);//Combiner class run during the shuffle
        job.setReducerClass(IntSumReducer.class);//Reducer class

        //Set the output types. When the Mapper and the Reducer have the same output types,
        //setMapOutputKeyClass and setMapOutputValueClass can be omitted.
        job.setOutputKeyClass(Text.class);//Output key type
        job.setOutputValueClass(IntWritable.class);//Output value type

        for(int i = 0; i < otherArgs.length - 1; i++) {
            FileInputFormat.addInputPath(job, new Path(otherArgs[i]));//Input paths
        }
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));//Output path

        System.exit(job.waitForCompletion(true) ? 0:1);//Submit the job and wait for completion
    }
}
WordCount.java 
input file
wordCount.txt :
import java io OException;
import java util Iterator;
import java util StringTokenizer;
import org apache hadoop
import org apache hadoop
import org apache hadoop io IntWritable;
class class class



output file
part-r-00000 :
IntWritable;    1
Iterator;       1
OException;     1
StringTokenizer;        1
apache  3
class   3
hadoop  3
import  6
io      2
java    3
org     3
util    2
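The job above reuses IntSumReducer as its combiner, which is safe because summing counts is associative and commutative: partial sums produced on the map side can be summed again on the reduce side without changing the result. Purely for illustration, a dedicated combiner would look like the sketch below; the class name IntSumCombiner is my own and is not part of the original code, and the class is assumed to sit inside WordCount so the imports above apply.

    public static class IntSumCombiner extends Reducer<Text, IntWritable, Text, IntWritable> {
        private final IntWritable partial = new IntWritable();

        public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            //Pre-aggregate the counts emitted by a single map task before they are shuffled
            for (IntWritable val : values) {
                sum += val.get();
            }
            partial.set(sum);
            context.write(key, partial);
        }
    }

It would be registered with job.setCombinerClass(IntSumCombiner.class) in place of IntSumReducer.class.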

 

2. Deduplication

Remove duplicate lines from the input files and write a new, deduplicated output file.

map: emit the element to deduplicate (the whole line) as the key. reduce: simply write each key back out once.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
/**
 * Deduplication: remove duplicate lines from the input and write a deduplicated output file.
 */
public class Dedup {

    public static class MyMapper extends Mapper<Object, Text, Text, Text> {

        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            //The whole line becomes the key; the value is an empty Text
            context.write(value, new Text());
        }
    }

    public static class MyReducer extends Reducer<Text, Text, Text, Text> {

        public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            //Duplicate lines arrive grouped under the same key, so writing each key once removes the duplicates
            context.write(key, new Text());
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();//Initialize the job configuration
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();//Parse the arguments passed on the command line

        if (otherArgs.length < 2) {//At least one input path and one output path are required
            System.err.println("Usage: Dedup <in> [<in>...] <out>");
            System.exit(2);
        }

        Job job = Job.getInstance(conf, "Dedup");//Create the job object
        job.setJarByClass(Dedup.class);//Class whose jar is shipped to the cluster
        job.setMapperClass(MyMapper.class);//Mapper class
        job.setReducerClass(MyReducer.class);//Reducer class

        //When the Mapper and the Reducer have the same output types,
        //setMapOutputKeyClass and setMapOutputValueClass can be omitted.
        job.setOutputKeyClass(Text.class);//Output key type
        job.setOutputValueClass(Text.class);//Output value type

        for(int i = 0; i < otherArgs.length-1; i++) {
            FileInputFormat.addInputPath(job, new Path(otherArgs[i]));//Input paths
        }
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length-1]));//Output path

        System.exit(job.waitForCompletion(true) ? 0:1);//Submit the job and wait for completion
    }

}
Dedup.java
input file
dedup1.txt :
1
2
3
4
4
3
2

dedup2.txt :
3
4
4



output file
part-r-00000 :
1
2
3
4
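A common variant of this job (my sketch, not part of the original post) uses NullWritable instead of an empty Text as the value, so that nothing but the keys travels through the shuffle. Only the value types change; with import org.apache.hadoop.io.NullWritable; added and the job's output value class set to NullWritable.class, the two classes become:

    public static class MyMapper extends Mapper<Object, Text, Text, NullWritable> {

        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            context.write(value, NullWritable.get());//the whole line is the key; NullWritable carries no payload
        }
    }

    public static class MyReducer extends Reducer<Text, NullWritable, Text, NullWritable> {

        public void reduce(Text key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
            context.write(key, NullWritable.get());//each distinct line is written exactly once
        }
    }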

 

3. Averaging

Compute each student's average score.

map: split each line, emitting the student name as the key and the score as the value. reduce: count the scores and sum them for each key, then divide to get the average.

import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

/**
 * Compute each student's average score
 */
public class Avg {

    public static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            //Hadoop's Text type assumes UTF-8; the line below decodes the raw bytes explicitly (equivalent to value.toString()).
            //If the input file were GBK-encoded, decode with "GBK" here instead to avoid garbled Chinese characters.
            String line = new String(value.getBytes(), 0, value.getLength(), "UTF-8");

            //Split the line on whitespace (spaces, \t, \n, \r)
            StringTokenizer itr = new StringTokenizer(line);
            //Read the tokens in order
            String strName = itr.nextToken();//student name
            String strScore = itr.nextToken();//student score
            Text name = new Text(strName);
            int score = Integer.parseInt(strScore);

            context.write(name, new IntWritable(score));
        }
    }

    public static class MyReducer extends Reducer<Text, IntWritable, Text, DoubleWritable> {

        public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            int count = 0;
            Iterator<IntWritable> iterator = values.iterator();
            while (iterator.hasNext()){
                sum += iterator.next().get();
                count++;
            }
            double avg = (sum*1.0)/count;//multiply by 1.0 so the division is done in floating point

            context.write(key, new DoubleWritable(avg));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();//Initialize the job configuration
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();//Parse the arguments passed on the command line

        if (otherArgs.length < 2) {//At least one input path and one output path are required
            System.err.println("Usage: Score <in> [<in>...] <out>");
            System.exit(2);
        }

        Job job = Job.getInstance(conf, "Avg");//Create the job object
        job.setJarByClass(Avg.class);//Class whose jar is shipped to the cluster
        job.setMapperClass(MyMapper.class);//Mapper class
        job.setReducerClass(MyReducer.class);//Reducer class

        job.setMapOutputKeyClass(Text.class);//Map output key type
        job.setMapOutputValueClass(IntWritable.class);//Map output value type
        job.setOutputKeyClass(Text.class);//Final output key type
        job.setOutputValueClass(DoubleWritable.class);//Final output value type

        for(int i = 0; i < otherArgs.length-1; i++) {
            FileInputFormat.addInputPath(job, new Path(otherArgs[i]));//Input paths
        }
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length-1]));//Output path

        System.exit(job.waitForCompletion(true) ? 0:1);//Submit the job and wait for completion
    }

}
Avg.java
input file
Avg1.txt :
a 1
b 2
a 3
b 3
a 5
b 7
c 3
c 5

Avg2.txt
a 1
b 7
c 5
a 1
c 3



output file
part-r-00000 :
a       2.2
b       4.75
c       4.0
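One caveat worth adding (my note, not from the original post): unlike WordCount, this reducer must not be reused as a combiner, because an average of partial averages is generally not the overall average. For the key a, Avg1.txt alone gives (1+3+5)/3 = 3.0 and Avg2.txt gives (1+1)/2 = 1.0; averaging those two intermediate results yields 2.0, whereas the true average is (1+3+5+1+1)/5 = 2.2, which is exactly what the output above shows. A correct combiner for this job would have to forward (sum, count) pairs rather than averages.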

 

4. Table Joins

4.1 Mining Grandchild-Grandparent Relationships

Given child-parent relationships, derive the grandchild-grandparent relationships.

map: split each line and emit two records, child as key with the prefixed parent as value, and parent as key with the prefixed child as value. reduce: for each key, separate the values into a child list and a parent list, then output the Cartesian product of the two lists.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

/**
 * Given a child-parent table, mine the relationships in it and produce a grandchild-grandparent table.
 */
public class ChildParent {

    public static class MyMapper extends Mapper<Object, Text, Text, Text> {

        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();

            if(!line.contains("child")){//Skip the header row. Suppose the fields obtained are <Steven Lucy>
                //Split the line on whitespace (spaces, \t, \n, \r)
                StringTokenizer itr = new StringTokenizer(line);
                String child = itr.nextToken();//the child, Steven
                String parent = itr.nextToken();//the parent, Lucy

                //Prefix the parent with "p": emit <Steven, pLucy>, representing the child-parent pair <Steven Lucy>
                context.write(new Text(child), new Text("p" +parent));
                //Prefix the child with "c": emit <Lucy, cSteven>, representing the parent-child pair <Lucy Steven>
                context.write(new Text(parent), new Text("c" +child));
            }
        }

    }

    public static class MyReducer extends Reducer<Text, Text, Text, Text> {
        private int i = 0;//flag so the header row is written only on the first reduce call (once per reducer task)

        public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            if(i == 0){//On the first reduce call, write the header row
                context.write(new Text("grandChild"), new Text("grandParent"));
                i++;
            }

            List<Text> child = new ArrayList<Text>();//children of this key
            List<Text> parent = new ArrayList<Text>();//parents of this key

            //Sort each value into the appropriate list
            for (Text t : values) {
                String str = t.toString();
                if (str.startsWith("p")) {//prefix "p" marks a parent
                    parent.add(new Text(str.substring(1)));//strip the prefix and add to the parent list
                } else {//prefix "c" marks a child
                    child.add(new Text(str.substring(1)));//strip the prefix and add to the child list
                }
            }

            //Cartesian product of the two lists: each child paired with each parent is a grandchild-grandparent pair
            for (int i = 0; i < child.size(); i++) {
                for (int j = 0; j < parent.size(); j++) {
                    context.write(child.get(i), parent.get(j));
                }
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();//Initialize the job configuration
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();//Parse the arguments passed on the command line
        if (otherArgs.length < 2) {//At least one input path and one output path are required
            System.err.println("Usage: ChildParent <in> [<in>...] <out>");
            System.exit(2);
        }

        Job job = Job.getInstance(conf, "Child Parent");//Create the job object
        job.setJarByClass(ChildParent.class);//Class whose jar is shipped to the cluster
        job.setMapperClass(MyMapper.class);//Mapper class
        job.setReducerClass(MyReducer.class);//Reducer class

        job.setOutputKeyClass(Text.class);//Output key type
        job.setOutputValueClass(Text.class);//Output value type

        for(int i = 0; i < otherArgs.length-1; i++) {
            FileInputFormat.addInputPath(job, new Path(otherArgs[i]));//Input paths
        }
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length-1]));//Output path

        System.exit(job.waitForCompletion(true) ? 0:1);//Submit the job and wait for completion
    }

}
ChildParent.java
input file
childParent.txt :
child parent
Steven Lucy
Steven Jack
Jone Lucy
Jone Jack
Lucy Mary
Lucy Frank
Jack Alice
Jack Jesse
David Alice
David Jesse
Philip David
Philip Alma
Mark David
Mark Alma



output file
part-r-00000 :
grandChild      grandParent
Philip  Jesse
Philip  Alice
Mark    Jesse
Mark    Alice
Steven  Jesse
Steven  Alice
Jone    Jesse
Jone    Alice
Steven  Frank
Steven  Mary
Jone    Frank
Jone    Mary
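Tracing one reduce call makes the join concrete. For the key Lucy, the mappers emit cSteven and cJone (from the rows "Steven Lucy" and "Jone Lucy") together with pMary and pFrank (from "Lucy Mary" and "Lucy Frank"). The reducer therefore builds child = {Steven, Jone} and parent = {Mary, Frank}, and their Cartesian product yields the four pairs Steven-Mary, Steven-Frank, Jone-Mary and Jone-Frank seen in the output. Also note that the header flag lives inside the reducer instance, so a job configured with more than one reduce task would write one header line per output file.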

4.2 Joining Student Name and Score Tables

Join a student name table with a score table to produce a name/total-score table. Input: (student ID - name - phone) and (student ID - subject - score); output: (student ID - name - phone - total score).

map: the student ID is the key; the rest of the line is the value, which is tokenized and wrapped in a custom class. reduce: separate the records for each key by type and sum the scores.

Other: custom Writable data types, an explicit input format (KeyValueTextInputFormat), and a separate Mapper class per input.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.util.GenericOptionsParser;

/**
 * Join a student name table with a score table to produce a total-score table.
 * Input:  Name table    student ID - name - phone
 * Input:  Result table  student ID - subject - score
 * Output: student ID - name - phone - total score
 */
public class StudentJoin {

    public static class NameMapper extends Mapper<Text, Text, Text, PublicClass> {

        public void map(Text key, Text value, Context context) throws IOException, InterruptedException {
            //KeyValueTextInputFormat has already split the line: the student ID is the key, the rest is the value; tokenize the value here.
            StringTokenizer itr = new StringTokenizer(value.toString());

            //Build the custom Name object
            Name name = new Name();
            if(itr.hasMoreTokens()){
                name.setName(itr.nextToken());
                name.setTelephone(itr.nextToken());
            }

            context.write(key, new PublicClass(name));
        }

    }

    public static class ResultMapper extends Mapper<Text, Text, Text, PublicClass> {

        public void map(Text key, Text value, Context context) throws IOException, InterruptedException {
            //KeyValueTextInputFormat has already split the line: the student ID is the key, the rest is the value; tokenize the value here.
            StringTokenizer itr = new StringTokenizer(value.toString());

            //Build the custom Result object
            Result result = new Result();
            if(itr.hasMoreTokens()){
                result.setSubject(itr.nextToken());
                result.setResult(Integer.parseInt(itr.nextToken()));
            }

            context.write(key, new PublicClass(result));
        }

    }

    public static class MyReducer extends Reducer<Text, PublicClass, Text, Text> {

        public void reduce(Text key, Iterable<PublicClass> values, Context context) throws IOException, InterruptedException {
            Name name = new Name();//each student ID has exactly one Name record, so no list is needed
            List<Result> resultList = new ArrayList<Result>();
            int g = 0;//running total of the scores

            //Copy the Name record into the local object and collect the Result records into a list
            for(PublicClass value : values){
                Writable obj = value.get();
                if(obj instanceof Name){
                    name.setName( ((Name)obj).getName() );
                    name.setTelephone( ((Name)obj).getTelephone() );
                }else{
                    resultList.add((Result)obj);
                }
            }

            //Sum the scores
            for(Result r : resultList){
                g = g + r.getResult();
            }

            context.write(key, new Text(name.getName()+ "\t" +name.getTelephone()+ "\t" +g));
        }

    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();//Initialize the job configuration
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();//Parse the arguments passed on the command line
        if (otherArgs.length != 3) {//Two input paths and one output path are required
            System.err.println("Usage: StudentJoin <in> <in> <out>");
            System.exit(2);
        }

        Job job = Job.getInstance(conf, "StudentJoin");//Create the job object
        job.setJarByClass(StudentJoin.class);//Class whose jar is shipped to the cluster
        //No setMapperClass call here: each input path is bound to its own Mapper class below
        //job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);//Reducer class

        job.setMapOutputKeyClass(Text.class);//Map output key type
        job.setMapOutputValueClass(PublicClass.class);//Map output value type
        job.setOutputKeyClass(Text.class);//Final output key type
        job.setOutputValueClass(Text.class);//Final output value type

        //For each input: the path, its input format, and the Mapper class that handles it
        MultipleInputs.addInputPath(job, new Path(otherArgs[0]), KeyValueTextInputFormat.class, NameMapper.class);//Name table path
        MultipleInputs.addInputPath(job, new Path(otherArgs[1]), KeyValueTextInputFormat.class, ResultMapper.class);//Result table path
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[2]));//Output path

        System.exit(job.waitForCompletion(true)?0:1);//Submit the job and wait for completion
    }

}
StudentJoin.java
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * Custom Writable type that stores one record of the Name table
 */
public class Name implements Writable {

    //fields
    private String name;
    private String telephone;

    //Serialize the fields to the output stream
    public void write(DataOutput output) throws IOException {
        output.writeUTF(name);
        output.writeUTF(telephone);
    }
    //Deserialize the fields from the input stream
    public void readFields(DataInput input) throws IOException {
        name = input.readUTF();
        telephone = input.readUTF();
    }

    //Getters and setters
    public String getName() {
        return name;
    }
    public void setName(String name) {
        this.name = name;
    }
    public String getTelephone() {
        return telephone;
    }
    public void setTelephone(String telephone) {
        this.telephone = telephone;
    }

}
Name.java
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * Custom Writable type that stores one record of the Result (score) table
 */
public class Result implements Writable {

    //fields
    private String subject;
    private int result;

    //Serialize the fields to the output stream
    public void write(DataOutput output) throws IOException {
        output.writeUTF(subject);
        output.writeInt(result);
    }
    //Deserialize the fields from the input stream
    public void readFields(DataInput input) throws IOException {
        subject = input.readUTF();
        result = input.readInt();
    }

    //Getters and setters
    public String getSubject() {
        return subject;
    }
    public void setSubject(String subject) {
        this.subject = subject;
    }
    public int getResult() {
        return result;
    }
    public void setResult(int result) {
        this.result = result;
    }

}
Result.java
import org.apache.hadoop.io.GenericWritable;
import org.apache.hadoop.io.Writable;

/**
 * GenericWritable wrapper over the Name and Result types, so both can be
 * shuffled under a single map output value class
 */
public class PublicClass extends GenericWritable {

    private static Class<? extends Writable>[] CLASSES = null;

    static {
        CLASSES = (Class<? extends Writable>[]) new Class[] {
                Name.class,
                Result.class
        };
    }

    public PublicClass() {}

    public PublicClass(Writable instance) {
        set(instance);
    }

    protected Class<? extends Writable>[] getTypes() {
        return CLASSES;
    }

}
PublicClass.java
input file
Name.txt :
2018001    jack    639987
2018002    rose    639988
2018003    smith    639989

Result.txt :
2018001    language    80
2018001    english    84
2018001    math    75
2018002    language    90
2018002    english    86
2018002    math    90
2018003    language    95
2018003    english    88
2018003    math    89



output file
part-r-00000 :
2018001 jack    639987  239
2018002 rose    639988  266
2018003 smith   639989  272
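A detail to watch when reproducing this example (my note, not from the original post): KeyValueTextInputFormat splits each input line into key and value at the first tab character by default, so Name.txt and Result.txt need a tab after the student ID, or the separator has to be changed in the configuration before the Job is created, for example:

        //Assumption: the input files separate the student ID from the rest of the line with a single space
        conf.set("mapreduce.input.keyvaluelinerecordreader.key.value.separator", " ");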

 

5. Sorting

5.1 Partitioned Sort After Sampling

Under construction

5.2 Sort with a Custom Partitioner

Under construction

5.3 Custom Sort Order (Descending)

Under construction
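Until this subsection is filled in, note that the usual approach is to register a comparator that inverts the natural order of the map output key via job.setSortComparatorClass. The sketch below assumes an IntWritable key and is not code from the original post:

    //Reverses IntWritable's natural byte-level ordering, so keys are sorted in descending order
    public static class DescendingIntComparator extends IntWritable.Comparator {
        @Override
        public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
            return -super.compare(b1, s1, l1, b2, s2, l2);
        }
    }

    //registered on the job with: job.setSortComparatorClass(DescendingIntComparator.class);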

5.4 Secondary Sort

Under construction

 

6. Matrix Computation

Under construction

 

7. Field Replacement

7.1 JSON Field Replacement

 

7.2 Field Replacement

 

7.3 Field Replacement

 

 

 

Related / reposted from:

1. hadoop大数据技术开发实战

2. Hadoop计算平均值 (computing averages in Hadoop) [repost]
