Hadoop WordCount

After setting up the Hadoop environment, run the WordCount example.

Here we count the number of occurrences of each word in hehe.txt.

Configure the build environment (add tools.jar to HADOOP_CLASSPATH so the hadoop command can invoke the Java compiler):

hadoop@muhe221:~/test$ vim ~/.bashrc
export HADOOP_CLASSPATH=${JAVA_HOME}/lib/tools.jar

The example source code can be extracted (e.g. with jar xf) from hadoop-3.2.0/share/hadoop/mapreduce/sources/hadoop-mapreduce-examples-3.2.0-sources.jar; the WordCount source is at:

org/apache/hadoop/examples/WordCount.java

package org.apache.hadoop.examples;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class WordCount {

  // Mapper: tokenize each input line and emit <word, 1> for every token.
  public static class TokenizerMapper 
       extends Mapper<Object, Text, Text, IntWritable>{
    
    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();
      
    public void map(Object key, Text value, Context context
                    ) throws IOException, InterruptedException {
      StringTokenizer itr = new StringTokenizer(value.toString());
      while (itr.hasMoreTokens()) {
        word.set(itr.nextToken());
        context.write(word, one);
      }
    }
  }
  
  // Reducer (also used as the combiner): sum the counts emitted for each word.
  public static class IntSumReducer 
       extends Reducer<Text,IntWritable,Text,IntWritable> {
    private IntWritable result = new IntWritable();

    public void reduce(Text key, Iterable<IntWritable> values, 
                       Context context
                       ) throws IOException, InterruptedException {
      int sum = 0;
      for (IntWritable val : values) {
        sum += val.get();
      }
      result.set(sum);
      context.write(key, result);
    }
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length < 2) {
      System.err.println("Usage: wordcount <in> [<in>...] <out>");
      System.exit(2);
    }
    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    for (int i = 0; i < otherArgs.length - 1; ++i) {
      FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
    }
    FileOutputFormat.setOutputPath(job,
      new Path(otherArgs[otherArgs.length - 1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}

Compile the source and package the classes into wc.jar:

hadoop@muhe221:~/test$ hadoop com.sun.tools.javac.Main WordCount.java
hadoop@muhe221:~/test$ jar -cf wc.jar WordCount*.class

Format HDFS, start the daemons, put the input file into HDFS and run the job:

hadoop@muhe221:~/hadoop-3.2.0$ hadoop namenode -format
WARNING: Use of this script to execute namenode is deprecated.
WARNING: Attempting to execute replacement "hdfs namenode" instead.

2016-11-20 04:33:07,670 INFO namenode.NameNode: STARTUP_MSG:
/************************************************************
STARTUP_MSG: Starting NameNode
STARTUP_MSG:   host = muhe221/10.121.63.240
STARTUP_MSG:   args = [-format]
STARTUP_MSG:   version = 3.2.0
STARTUP_MSG:   classpath = /home/hadoop/hadoop-3.2.0/etc/hadoop:/home/hadoop/hadoop-3.2.0/share/hadoop/common/lib/curator-recipes-2.12.0.jar......
STARTUP_MSG:   build = https://github.com/apache/hadoop.git -r e97acb3bd8f3befd27418996fa5d4b50bf2e17bf; compiled by 'sunilg' on 2019-01-08T06:08Z
STARTUP_MSG:   java = 1.8.0_121
************************************************************/
2016-11-20 04:33:07,681 INFO namenode.NameNode: registered UNIX signal handlers for [TERM, HUP, INT]
......
2016-11-20 04:33:08,618 INFO common.Storage: Storage directory /home/hadoop/hadoop/name has been successfully formatted.
2016-11-20 04:33:08,625 INFO namenode.FSImageFormatProtobuf: Saving image file /home/hadoop/hadoop/name/current/fsimage.ckpt_0000000000000000000 using no compression
2016-11-20 04:33:08,702 INFO namenode.FSImageFormatProtobuf: Image file /home/hadoop/hadoop/name/current/fsimage.ckpt_0000000000000000000 of size 401 bytes saved in 0 seconds .
2016-11-20 04:33:08,738 INFO namenode.NNStorageRetentionManager: Going to retain 1 images with txid >= 0
2016-11-20 04:33:08,742 INFO namenode.NameNode: SHUTDOWN_MSG:
/************************************************************
SHUTDOWN_MSG: Shutting down NameNode at muhe221/10.121.63.240
************************************************************/

hadoop@muhe221:~/hadoop-3.2.0$ cd sbin
hadoop@muhe221:~/hadoop-3.2.0/sbin$ ./start-all.sh
WARNING: Attempting to start all Apache Hadoop daemons as hadoop in 10 seconds.
WARNING: This is not a recommended production deployment configuration.
WARNING: Use CTRL-C to abort.
Starting namenodes on [muhe221]
Starting datanodes
Starting secondary namenodes [muhe221]
Starting resourcemanager
Starting nodemanagers

hadoop@muhe221:~$ vi hehe.txt
123456
789
123456

hadoop@muhe221:~$ hadoop fs -mkdir /input
hadoop@muhe221:~$ hadoop fs -ls /
Found 1 items
drwxr-xr-x   - hadoop supergroup          0 2016-11-20 04:40 /input
hadoop@muhe221:~$ hadoop fs -ls /input
hadoop@muhe221:~$ hadoop fs -put hehe.txt /input
hadoop@muhe221:~$ hadoop fs -ls /input
Found 1 items
-rw-r--r--   1 hadoop supergroup         18 2016-11-20 04:42 /input/hehe.txt

hadoop@muhe221:~/test$ hadoop jar wc.jar org.apache.hadoop.examples.WordCount /input /output
hadoop@muhe221:~$ hadoop jar ~/hadoop-3.2.0/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.2.0.jar wordcount /input /output
2016-11-20 04:43:19,837 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032
2016-11-20 04:43:20,386 INFO mapreduce.JobResourceUploader: Disabling Erasure Coding for path: /tmp/hadoop-yarn/staging/hadoop/.staging/job_1479587883124_0001
2016-11-20 04:43:20,626 INFO input.FileInputFormat: Total input files to process : 1
2016-11-20 04:43:20,927 INFO mapreduce.JobSubmitter: number of splits:1
2016-11-20 04:43:20,962 INFO Configuration.deprecation: yarn.resourcemanager.system-metrics-publisher.enabled is deprecated. Instead, use yarn.system-metrics-publisher.enabled
2016-11-20 04:43:21,487 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1479587883124_0001
2016-11-20 04:43:21,488 INFO mapreduce.JobSubmitter: Executing with tokens: []
2016-11-20 04:43:21,648 INFO conf.Configuration: resource-types.xml not found
2016-11-20 04:43:21,649 INFO resource.ResourceUtils: Unable to find 'resource-types.xml'.
2016-11-20 04:43:22,119 INFO impl.YarnClientImpl: Submitted application application_1479587883124_0001
2016-11-20 04:43:22,150 INFO mapreduce.Job: The url to track the job: http://muhe221:8088/proxy/application_1479587883124_0001/
2016-11-20 04:43:22,151 INFO mapreduce.Job: Running job: job_1479587883124_0001
2016-11-20 04:43:30,253 INFO mapreduce.Job: Job job_1479587883124_0001 running in uber mode : false
2016-11-20 04:43:30,254 INFO mapreduce.Job:  map 0% reduce 0%
2016-11-20 04:43:34,300 INFO mapreduce.Job:  map 100% reduce 0%
2016-11-20 04:43:40,348 INFO mapreduce.Job:  map 100% reduce 100%
2016-11-20 04:43:40,361 INFO mapreduce.Job: Job job_1479587883124_0001 completed successfully
2016-11-20 04:43:40,462 INFO mapreduce.Job: Counters: 54
    File System Counters
        FILE: Number of bytes read=29
        FILE: Number of bytes written=443559
        FILE: Number of read operations=0
        FILE: Number of large read operations=0
        FILE: Number of write operations=0
        HDFS: Number of bytes read=123
        HDFS: Number of bytes written=15
        HDFS: Number of read operations=8
        HDFS: Number of large read operations=0
        HDFS: Number of write operations=2
        HDFS: Number of bytes read erasure-coded=0
    Job Counters
        Launched map tasks=1
        Launched reduce tasks=1
        Data-local map tasks=1
        Total time spent by all maps in occupied slots (ms)=1998
        Total time spent by all reduces in occupied slots (ms)=2689
        Total time spent by all map tasks (ms)=1998
        Total time spent by all reduce tasks (ms)=2689
        Total vcore-milliseconds taken by all map tasks=1998
        Total vcore-milliseconds taken by all reduce tasks=2689
        Total megabyte-milliseconds taken by all map tasks=2045952
        Total megabyte-milliseconds taken by all reduce tasks=2753536
    Map-Reduce Framework
        Map input records=3
        Map output records=3
        Map output bytes=30
        Map output materialized bytes=29
        Input split bytes=105
        Combine input records=3
        Combine output records=2
        Reduce input groups=2
        Reduce shuffle bytes=29
        Reduce input records=2
        Reduce output records=2
        Spilled Records=4
        Shuffled Maps =1
        Failed Shuffles=0
        Merged Map outputs=1
        GC time elapsed (ms)=100
        CPU time spent (ms)=1200
        Physical memory (bytes) snapshot=488787968
        Virtual memory (bytes) snapshot=5287763968
        Total committed heap usage (bytes)=401080320
        Peak Map Physical memory (bytes)=295424000
        Peak Map Virtual memory (bytes)=2638876672
        Peak Reduce Physical memory (bytes)=193363968
        Peak Reduce Virtual memory (bytes)=2648887296
    Shuffle Errors
        BAD_ID=0
        CONNECTION=0
        IO_ERROR=0
        WRONG_LENGTH=0
        WRONG_MAP=0
        WRONG_REDUCE=0
    File Input Format Counters
        Bytes Read=18
    File Output Format Counters
        Bytes Written=15

hadoop@muhe221:~$ hadoop fs -ls /output
Found 2 items
-rw-r--r--   1 hadoop supergroup          0 2016-11-20 04:43 /output/_SUCCESS
-rw-r--r--   1 hadoop supergroup         15 2016-11-20 04:43 /output/part-r-00000
hadoop@muhe221:~$ hadoop fs -cat /output/part-r-00000    # view the result
123456  2
789     1

The Hadoop Map/Reduce framework spawns one map task for each InputSplit generated by the job's InputFormat.
A Mapper implementation can override JobConfigurable.configure(JobConf), which is passed a JobConf, to perform any initialization the Mapper needs. The framework then calls map(WritableComparable, Writable, OutputCollector, Reporter) once for each key/value pair in the task's InputSplit. (Those are the names from the old mapred API; the new mapreduce API used in the code above provides the same hooks as Mapper.setup(Context) and map(key, value, Context).)
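As an illustration of that initialization hook, here is a minimal sketch with the new API, reusing the package and imports of the WordCount source above. The class name, the wordcount.lowercase flag and the case-folding behaviour are invented for this example; they are not part of the shipped WordCount.

  // Hypothetical variant of TokenizerMapper that uses setup(Context) for one-time
  // initialization (reading a custom flag from the job Configuration).
  public static class LowerCaseTokenizerMapper
       extends Mapper<Object, Text, Text, IntWritable> {

    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();
    private boolean toLowerCase;

    @Override
    protected void setup(Context context) {
      // Called once per map task, before any map() call.
      toLowerCase = context.getConfiguration().getBoolean("wordcount.lowercase", false);
    }

    @Override
    public void map(Object key, Text value, Context context)
        throws IOException, InterruptedException {
      StringTokenizer itr = new StringTokenizer(value.toString());
      while (itr.hasMoreTokens()) {
        String token = itr.nextToken();
        word.set(toLowerCase ? token.toLowerCase() : token);
        context.write(word, one);
      }
    }
  }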

Reducer
A Reducer reduces the set of intermediate values that share a key to a smaller set of values.

Reducer has three primary phases: shuffle, sort and reduce.
Shuffle:
The input of the Reducer is the sorted output of the Mappers. In this phase the framework fetches, via HTTP, the partition of every Mapper's output that is relevant to this Reducer (the Partitioner sketch below shows how such a partition can be chosen).
Sort:
In this phase the framework groups the Reducer's input by key (different Mappers may have emitted the same key).
The shuffle and sort phases happen simultaneously; map outputs are merged while they are being fetched.
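Which map output partition goes to which Reducer is decided by the job's Partitioner; WordCount simply uses the default HashPartitioner. Below is a rough, purely illustrative sketch of a custom partitioner (the class name and the routing rule are made up, not part of the example):

  // Illustrative partitioner: route words starting with a digit to reducer 0,
  // everything else by hash of the key.
  public static class DigitFirstPartitioner
       extends org.apache.hadoop.mapreduce.Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
      String word = key.toString();
      if (numPartitions == 1 || (!word.isEmpty() && Character.isDigit(word.charAt(0)))) {
        return 0;
      }
      return (word.hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
  }

It would be enabled in main() with job.setPartitionerClass(DigitFirstPartitioner.class).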
Secondary Sort
If the rules for grouping the intermediate keys before the reduce need to differ from the rules used to sort them, a Comparator can be specified via JobConf.setOutputValueGroupingComparator(Class). Since JobConf.setOutputKeyComparatorClass(Class) controls how the intermediate keys are sorted, the two together can be used to simulate a secondary sort on the value. (In the new API these correspond to Job.setGroupingComparatorClass(Class) and Job.setSortComparatorClass(Class).)
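A minimal sketch of that wiring with the new API, assuming imports of org.apache.hadoop.io.WritableComparable and WritableComparator, and a made-up composite Text key of the form "primary\tsecondary" where only the part before the tab should define a reduce group (none of this is part of WordCount):

  // Sort comparator: orders the full composite key, so within one group the
  // part after the tab arrives at the reducer in sorted order.
  public static class FullKeyComparator extends WritableComparator {
    public FullKeyComparator() { super(Text.class, true); }
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
      return a.toString().compareTo(b.toString());
    }
  }

  // Grouping comparator: only the part before the tab decides which keys are
  // handed to the same reduce() call.
  public static class PrimaryPartComparator extends WritableComparator {
    public PrimaryPartComparator() { super(Text.class, true); }
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
      String pa = a.toString().split("\t", 2)[0];
      String pb = b.toString().split("\t", 2)[0];
      return pa.compareTo(pb);
    }
  }

  // In main(), after Job.getInstance(...):
  //   job.setSortComparatorClass(FullKeyComparator.class);         // how keys are sorted
  //   job.setGroupingComparatorClass(PrimaryPartComparator.class); // how keys are grouped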
Reduce:
In this phase the framework calls reduce(WritableComparable, Iterator, OutputCollector, Reporter) (reduce(key, Iterable<values>, Context) in the new API) once for every <key, (list of values)> pair in the grouped input.
The output of the reduce task is typically written to the FileSystem via OutputCollector.collect(WritableComparable, Writable), i.e. context.write(...) in the new API.
The application can use the Reporter (the Context in the new API) to report progress, set application-level status messages, update Counters, or simply indicate that it is alive.
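With the new API the Context plays the role of both OutputCollector and Reporter. A sketch of a reducer that also bumps a counter and sets a status message; the counter group/name and the status text are invented for the example, and the class is otherwise identical to IntSumReducer above:

  public static class CountingIntSumReducer
       extends Reducer<Text, IntWritable, Text, IntWritable> {
    private IntWritable result = new IntWritable();

    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
        throws IOException, InterruptedException {
      int sum = 0;
      for (IntWritable val : values) {
        sum += val.get();
      }
      result.set(sum);
      context.write(key, result);
      // Hypothetical application-level counter and status message.
      context.getCounter("WordCountStats", "DistinctWords").increment(1);
      context.setStatus("last word: " + key.toString());
    }
  }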

The output of the Reducer is not sorted.
The input and output types of a Map/Reduce job are:
(input) <k1, v1> -> map -> <k2, v2> -> combine -> <k2, v2> -> reduce -> <k3, v3> (output)
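In WordCount these types are <Object, Text> for the map input and <Text, IntWritable> for both the map/combine output and the reduce output. When the intermediate types <k2, v2> differ from the final types <k3, v3>, both pairs have to be declared on the Job. A small sketch of that wiring for main(); the LongWritable/Text value types here are hypothetical and not WordCount's:

    // If the mapper emitted <Text, LongWritable> but the reducer wrote <Text, Text>,
    // both pairs of types would be declared on the Job:
    job.setMapOutputKeyClass(Text.class);           // k2
    job.setMapOutputValueClass(LongWritable.class); // v2
    job.setOutputKeyClass(Text.class);              // k3
    job.setOutputValueClass(Text.class);            // v3
    // WordCount only calls setOutputKeyClass/setOutputValueClass because its
    // intermediate and final output types are the same.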

https://hadoop.apache.org/docs/r3.2.0/hadoop-mapreduce-client/hadoop-mapreduce-client-core/MapReduceTutorial.html

https://blog.csdn.net/khxu666/article/details/80764994
