Quickly Developing the Hadoop Starter Program WordCount in IDEA

1. Development Environment

IDEA: 2019
Hadoop: 2.6.5 (adjust to the version you actually use)
Maven: 3.5.4 (best installed yourself; add the Aliyun mirror to your settings.xml, as shown in the snippet below)
Test data: any text you type yourself
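
A minimal mirror entry for Maven's settings.xml, assuming the public Aliyun repository address (it goes inside the <mirrors> element):

<!-- settings.xml: route requests for central through the Aliyun mirror -->
<mirror>
    <id>aliyunmaven</id>
    <mirrorOf>central</mirrorOf>
    <name>Aliyun Public Repository</name>
    <url>https://maven.aliyun.com/repository/public</url>
</mirror>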

2. Steps

2.0 Copy hadoop.dll into C:/Windows/System32/ (on Windows, Hadoop additionally expects winutils.exe, usually located via a HADOOP_HOME environment variable)
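
For example (the install path below is a placeholder; point it at wherever you unpacked the Windows binaries for Hadoop 2.6.5):

HADOOP_HOME=D:\hadoop-2.6.5
PATH=%PATH%;%HADOOP_HOME%\bin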

2.1 Open IDEA and create a new Maven project

2.2 Enter a project name and choose a directory on disk

2.3 Add the Maven dependencies (the coordinates can be found on mvnrepository.com; hadoop-client should already pull in hadoop-common and hadoop-hdfs transitively, but listing all three explicitly does no harm)

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.zzuli.tumint</groupId>
    <artifactId>Hadoop_test</artifactId>
    <version>1.0-SNAPSHOT</version>
    <properties>
        <!-- Hadoop version, shared by all Hadoop artifacts below -->
        <hadoop.version>2.6.5</hadoop.version>
    </properties>

    <dependencies>
        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>

        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-hdfs -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>${hadoop.version}</version>
        </dependency>

        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-client -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
    </dependencies>
</project>
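
After saving the pom, it is worth checking that the dependencies resolve before writing any code:

mvn clean compile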

2.4 Add the example code from the official Hadoop tutorial

  • WordCount main class
package test;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * @Author: 张今天
 * @Date: 2020/2/24 11:23
 */
public class WordCount {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(WordCountMapper.class);
        // The reducer doubles as a combiner: word counts are plain sums,
        // so partial sums on the map side give the same final result.
        job.setCombinerClass(WordCountReducer.class);
        job.setReducerClass(WordCountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));   // args[0]: input path
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // args[1]: output path (must not exist yet)
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
  • WordCountMapper class

package test;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;
import java.util.StringTokenizer;

/**
 * @Author: 张今天
 * @Date: 2020/2/24 11:24
 */
public class WordCountMapper extends Mapper<Object, Text, Text, IntWritable> {
    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    /**
     * Called once for each key/value pair in the input split. Splits the
     * line into whitespace-separated tokens and emits (word, 1) for each.
     *
     * @param key     byte offset of the line within the split (unused)
     * @param value   one line of input text
     * @param context used to emit output pairs
     */
    @Override
    protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
        StringTokenizer itr = new StringTokenizer(value.toString());
        while (itr.hasMoreTokens()) {
            word.set(itr.nextToken());
            context.write(word, one);
        }
    }
}
  • WordCountReducer class
package test;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * @Author: 张今天
 * @Date: 2020/2/24 11:24
 */
public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    private IntWritable result = new IntWritable(0);

    /**
     * Called once per key with all the counts emitted for that word.
     * Sums them up and writes (word, total).
     *
     * @param key     the word
     * @param values  all partial counts for this word
     * @param context used to emit the final pair
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        result.set(sum);
        context.write(key, result);
    }
}

2.5 Configure the run configuration in IDEA
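
When launched from IDEA without any cluster configuration, the job uses the local job runner and the local file system, so the input and output paths are plain directories relative to the project root. A minimal setup sketch (paths and sample text are placeholders):

# Run -> Edit Configurations -> Application
Main class:        test.WordCount
Program arguments: input output     (the output directory must not exist before the run)

# input/words.txt (the test data)
hello hadoop hello word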

3. Unit Tests

  • Test dependencies (the two version properties go into <properties> and the two dependencies into <dependencies> of the pom.xml above)

        <junit.version>4.12</junit.version>
        <mrunit.version>1.1.0</mrunit.version>

        <!-- junit -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>${junit.version}</version>
            <scope>test</scope>
        </dependency>
        <!-- mrunit: the hadoop2 classifier targets the new MapReduce API -->
        <dependency>
            <groupId>org.apache.mrunit</groupId>
            <artifactId>mrunit</artifactId>
            <version>${mrunit.version}</version>
            <classifier>hadoop2</classifier>
            <scope>test</scope>
        </dependency>
  • MapDriver test
package test;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.MapDriver;
import org.junit.Test;

import java.io.IOException;

/**
 * @Author: 张今天
 * @Date: 2020/2/26 18:11
 */
public class WordCountMapperTest {
    @Test
    public void mapperTest() throws IOException {
        Text value = new Text("hello hadoop hello word");
        // MRUnit compares the expected outputs in order, so list them
        // exactly as the mapper emits them (duplicates included).
        new MapDriver<Object, Text, Text, IntWritable>()
                .withMapper(new WordCountMapper())
                .withInput(new IntWritable(0), value)
                .withOutput(new Text("hello"), new IntWritable(1))
                .withOutput(new Text("hadoop"), new IntWritable(1))
                .withOutput(new Text("hello"), new IntWritable(1))
                .withOutput(new Text("word"), new IntWritable(1))
                .runTest();
    }
}

  • ReduceDriver test
package test;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.ReduceDriver;
import org.junit.Test;

import java.io.IOException;
import java.util.Arrays;

/**
 * @Author: 张今天
 * @Date: 2020/2/26 18:28
 */
public class WordCountReducerTest {
    @Test
    public void reduceTest() throws IOException {
        // 1 + 1 + 1 + 2 = 5: the values list may already contain partial
        // sums produced by the combiner.
        new ReduceDriver<Text, IntWritable, Text, IntWritable>()
                .withReducer(new WordCountReducer())
                .withInput(new Text("hello"), Arrays.asList(
                        new IntWritable(1),
                        new IntWritable(1),
                        new IntWritable(1),
                        new IntWritable(2)
                )).withOutput(new Text("hello"), new IntWritable(5))
                .runTest();
    }
}

  • MapReduceDriver test
package test;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.MapReduceDriver;
import org.junit.Test;

import java.io.IOException;

/**
 * @Author: 张今天
 * @Date: 2020/2/26 18:37
 */
public class WordCountTest {
    @Test
    public void driverTest() throws IOException {
        String test1 = "hello word hello mapreducer";

        // MapReduceDriver runs map -> shuffle/sort -> reduce, so the expected
        // outputs must be listed in key-sorted order, not emission order.
        new MapReduceDriver<Object, Text, Text, IntWritable, Text, IntWritable>(
                new WordCountMapper(), new WordCountReducer())
                .withInput(new IntWritable(0), new Text(test1))
                .withOutput(new Text("hello"), new IntWritable(2))
                .withOutput(new Text("mapreducer"), new IntWritable(1))
                .withOutput(new Text("word"), new IntWritable(1))
                .runTest();
    }
}
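
With the test dependencies in place, all three tests can be run from IDEA or from the command line:

mvn test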
