Hadoop WordCount
After the environment is set up, run the WordCount example; here it counts how many times each word occurs in hehe.txt.
Configure the build environment: add the following line to ~/.bashrc so the JDK's tools.jar (which provides the javac entry point used below) is on the Hadoop classpath:
hadoop@muhe221:~/test$ vim ~/.bashrc
export HADOOP_CLASSPATH=${JAVA_HOME}/lib/tools.jar
The example source ships in hadoop-3.2.0/share/hadoop/mapreduce/sources/hadoop-mapreduce-examples-3.2.0-sources.jar; unpack it to find:
org/apache/hadoop/examples/WordCount.java
package org.apache.hadoop.examples;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class WordCount {

  // Emits (word, 1) for every token in the input line.
  public static class TokenizerMapper
       extends Mapper<Object, Text, Text, IntWritable> {

    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    public void map(Object key, Text value, Context context
                    ) throws IOException, InterruptedException {
      StringTokenizer itr = new StringTokenizer(value.toString());
      while (itr.hasMoreTokens()) {
        word.set(itr.nextToken());
        context.write(word, one);
      }
    }
  }

  // Sums the counts for each word; also used as the combiner.
  public static class IntSumReducer
       extends Reducer<Text, IntWritable, Text, IntWritable> {

    private IntWritable result = new IntWritable();

    public void reduce(Text key, Iterable<IntWritable> values,
                       Context context
                       ) throws IOException, InterruptedException {
      int sum = 0;
      for (IntWritable val : values) {
        sum += val.get();
      }
      result.set(sum);
      context.write(key, result);
    }
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length < 2) {
      System.err.println("Usage: wordcount <in> [<in>...] <out>");
      System.exit(2);
    }
    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    // All arguments but the last are input paths; the last is the output dir.
    for (int i = 0; i < otherArgs.length - 1; ++i) {
      FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
    }
    FileOutputFormat.setOutputPath(job,
        new Path(otherArgs[otherArgs.length - 1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}
Compile the source and package the classes into a jar:
hadoop@muhe221:~/test$ hadoop com.sun.tools.javac.Main WordCount.java
hadoop@muhe221:~/test$ jar -cf wc.jar WordCount*.class
Format HDFS:
hadoop@muhe221:~/hadoop-3.2.0$ hadoop namenode -format
WARNING: Use of this script to execute namenode is deprecated.
WARNING: Attempting to execute replacement "hdfs namenode" instead.
2016-11-20 04:33:07,670 INFO namenode.NameNode: STARTUP_MSG:
/************************************************************
STARTUP_MSG: Starting NameNode
STARTUP_MSG: host = muhe221/10.121.63.240
STARTUP_MSG: args = [-format]
STARTUP_MSG: version = 3.2.0
STARTUP_MSG: classpath = /home/hadoop/hadoop-3.2.0/etc/hadoop:/home/hadoop/hadoop-3.2.0/share/hadoop/common/lib/curator-recipes-2.12.0.jar......
STARTUP_MSG: build = https://github.com/apache/hadoop.git -r e97acb3bd8f3befd27418996fa5d4b50bf2e17bf; compiled by 'sunilg' on 2019-01-08T06:08Z
STARTUP_MSG: java = 1.8.0_121
************************************************************/
2016-11-20 04:33:07,681 INFO namenode.NameNode: registered UNIX signal handlers for [TERM, HUP, INT]
......
2016-11-20 04:33:08,618 INFO common.Storage: Storage directory /home/hadoop/hadoop/name has been successfully formatted.
2016-11-20 04:33:08,625 INFO namenode.FSImageFormatProtobuf: Saving image file /home/hadoop/hadoop/name/current/fsimage.ckpt_0000000000000000000 using no compression
2016-11-20 04:33:08,702 INFO namenode.FSImageFormatProtobuf: Image file /home/hadoop/hadoop/name/current/fsimage.ckpt_0000000000000000000 of size 401 bytes saved in 0 seconds .
2016-11-20 04:33:08,738 INFO namenode.NNStorageRetentionManager: Going to retain 1 images with txid >= 0
2016-11-20 04:33:08,742 INFO namenode.NameNode: SHUTDOWN_MSG:
/************************************************************
SHUTDOWN_MSG: Shutting down NameNode at muhe221/10.121.63.240
************************************************************/
Start the Hadoop daemons:
hadoop@muhe221:~/hadoop-3.2.0$ cd sbin
hadoop@muhe221:~/hadoop-3.2.0/sbin$ ./start-all.sh
WARNING: Attempting to start all Apache Hadoop daemons as hadoop in 10 seconds.
WARNING: This is not a recommended production deployment configuration.
WARNING: Use CTRL-C to abort.
Starting namenodes on [muhe221]
Starting datanodes
Starting secondary namenodes [muhe221]
Starting resourcemanager
Starting nodemanagers
Create a test input file:
hadoop@muhe221:~$ vi hehe.txt
123456
789
123456
Create an input directory in HDFS and upload the file:
hadoop@muhe221:~$ hadoop fs -mkdir /input
hadoop@muhe221:~$ hadoop fs -ls /
Found 1 items
drwxr-xr-x - hadoop supergroup 0 2016-11-20 04:40 /input
hadoop@muhe221:~$ hadoop fs -ls /input
hadoop@muhe221:~$ hadoop fs -put hehe.txt /input
hadoop@muhe221:~$ hadoop fs -ls /input
Found 1 items
-rw-r--r-- 1 hadoop supergroup 18 2016-11-20 04:42 /input/hehe.txt
Run the job, using either the jar built above or the bundled examples jar (the two commands are equivalent):
hadoop@muhe221:~/test$ hadoop jar wc.jar org.apache.hadoop.examples.WordCount /input /output
hadoop@muhe221:~$ hadoop jar ~/hadoop-3.2.0/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.2.0.jar wordcount /input /output
2016-11-20 04:43:19,837 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032
2016-11-20 04:43:20,386 INFO mapreduce.JobResourceUploader: Disabling Erasure Coding for path: /tmp/hadoop-yarn/staging/hadoop/.staging/job_1479587883124_0001
2016-11-20 04:43:20,626 INFO input.FileInputFormat: Total input files to process : 1
2016-11-20 04:43:20,927 INFO mapreduce.JobSubmitter: number of splits:1
2016-11-20 04:43:20,962 INFO Configuration.deprecation: yarn.resourcemanager.system-metrics-publisher.enabled is deprecated. Instead, use yarn.system-metrics-publisher.enabled
2016-11-20 04:43:21,487 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1479587883124_0001
2016-11-20 04:43:21,488 INFO mapreduce.JobSubmitter: Executing with tokens: []
2016-11-20 04:43:21,648 INFO conf.Configuration: resource-types.xml not found
2016-11-20 04:43:21,649 INFO resource.ResourceUtils: Unable to find 'resource-types.xml'.
2016-11-20 04:43:22,119 INFO impl.YarnClientImpl: Submitted application application_1479587883124_0001
2016-11-20 04:43:22,150 INFO mapreduce.Job: The url to track the job: http://muhe221:8088/proxy/application_1479587883124_0001/
2016-11-20 04:43:22,151 INFO mapreduce.Job: Running job: job_1479587883124_0001
2016-11-20 04:43:30,253 INFO mapreduce.Job: Job job_1479587883124_0001 running in uber mode : false
2016-11-20 04:43:30,254 INFO mapreduce.Job: map 0% reduce 0%
2016-11-20 04:43:34,300 INFO mapreduce.Job: map 100% reduce 0%
2016-11-20 04:43:40,348 INFO mapreduce.Job: map 100% reduce 100%
2016-11-20 04:43:40,361 INFO mapreduce.Job: Job job_1479587883124_0001 completed successfully
2016-11-20 04:43:40,462 INFO mapreduce.Job: Counters: 54
File System Counters
FILE: Number of bytes read=29
FILE: Number of bytes written=443559
FILE: Number of read operations=0
FILE: Number of large read operations=0
FILE: Number of write operations=0
HDFS: Number of bytes read=123
HDFS: Number of bytes written=15
HDFS: Number of read operations=8
HDFS: Number of large read operations=0
HDFS: Number of write operations=2
HDFS: Number of bytes read erasure-coded=0
Job Counters
Launched map tasks=1
Launched reduce tasks=1
Data-local map tasks=1
Total time spent by all maps in occupied slots (ms)=1998
Total time spent by all reduces in occupied slots (ms)=2689
Total time spent by all map tasks (ms)=1998
Total time spent by all reduce tasks (ms)=2689
Total vcore-milliseconds taken by all map tasks=1998
Total vcore-milliseconds taken by all reduce tasks=2689
Total megabyte-milliseconds taken by all map tasks=2045952
Total megabyte-milliseconds taken by all reduce tasks=2753536
Map-Reduce Framework
Map input records=3
Map output records=3
Map output bytes=30
Map output materialized bytes=29
Input split bytes=105
Combine input records=3
Combine output records=2
Reduce input groups=2
Reduce shuffle bytes=29
Reduce input records=2
Reduce output records=2
Spilled Records=4
Shuffled Maps =1
Failed Shuffles=0
Merged Map outputs=1
GC time elapsed (ms)=100
CPU time spent (ms)=1200
Physical memory (bytes) snapshot=488787968
Virtual memory (bytes) snapshot=5287763968
Total committed heap usage (bytes)=401080320
Peak Map Physical memory (bytes)=295424000
Peak Map Virtual memory (bytes)=2638876672
Peak Reduce Physical memory (bytes)=193363968
Peak Reduce Virtual memory (bytes)=2648887296
Shuffle Errors
BAD_ID=0
CONNECTION=0
IO_ERROR=0
WRONG_LENGTH=0
WRONG_MAP=0
WRONG_REDUCE=0
File Input Format Counters
Bytes Read=18
File Output Format Counters
Bytes Written=15
hadoop@muhe221:~$ hadoop fs -ls /output
Found 2 items
-rw-r--r-- 1 hadoop supergroup 0 2016-11-20 04:43 /output/_SUCCESS
-rw-r--r-- 1 hadoop supergroup 15 2016-11-20 04:43 /output/part-r-00000
hadoop@muhe221:~$ hadoop fs -cat /output/part-r-00000   # view the result
123456 2
789 1
The Hadoop Map/Reduce framework spawns one map task for each InputSplit, and the InputSplits are generated by the job's InputFormat.
A Mapper implementation can override JobConfigurable.configure(JobConf), which receives a JobConf and performs the Mapper's initialization. The framework then calls map(WritableComparable, Writable, OutputCollector, Reporter) once for every key/value pair in the task's InputSplit. (These signatures belong to the older org.apache.hadoop.mapred API; in the newer org.apache.hadoop.mapreduce API used by the WordCount source above, setup(Context) and map(...) play the same roles, as sketched below.)
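A minimal sketch of that lifecycle in the new API: setup(Context) runs once per map task before any map() call, taking over the role of configure(JobConf). The class name, the wordcount.lowercase property, and the case-folding behavior are invented for illustration and are not part of the WordCount job above.

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// New-API equivalent of configure(JobConf) + map(...): setup(Context)
// is called once per map task before any map() calls.
public class LowercaseTokenizerMapper
        extends Mapper<Object, Text, Text, IntWritable> {

    private static final IntWritable ONE = new IntWritable(1);
    private final Text word = new Text();
    private boolean lowercase;

    @Override
    protected void setup(Context context) {
        // Read per-job configuration, as configure(JobConf) did in the old API.
        // "wordcount.lowercase" is a made-up property name for illustration.
        lowercase = context.getConfiguration().getBoolean("wordcount.lowercase", false);
    }

    @Override
    public void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        StringTokenizer itr = new StringTokenizer(value.toString());
        while (itr.hasMoreTokens()) {
            String token = itr.nextToken();
            word.set(lowercase ? token.toLowerCase() : token);
            context.write(word, ONE);
        }
    }
}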
Reducer
A Reducer reduces the set of intermediate values that share a key to a smaller set of values.
A Reducer has three primary phases: shuffle, sort, and reduce.
Shuffle:
The Reducer's input is the sorted output of the Mappers. In this phase the framework fetches, via HTTP, the relevant partition of the output of every Mapper.
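Which reducer receives a given key's partition is decided on the map side by the job's Partitioner (the default, HashPartitioner, hashes the key). A minimal sketch of a custom one; the routing rule and class name are invented for illustration:

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Route words starting with a digit to reducer 0 and spread everything
// else across the remaining reducers.
public class DigitFirstPartitioner extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        if (numPartitions == 1) {
            return 0;
        }
        String s = key.toString();
        if (!s.isEmpty() && Character.isDigit(s.charAt(0))) {
            return 0;
        }
        // Mask the sign bit so the result is non-negative.
        return 1 + (s.hashCode() & Integer.MAX_VALUE) % (numPartitions - 1);
    }
}

// Enable in the driver with: job.setPartitionerClass(DigitFirstPartitioner.class);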
Sort:
In this phase the framework groups the Reducer's input by key (different Mappers may have produced the same key).
The shuffle and sort phases occur simultaneously: map outputs are merged as they are being fetched.
Secondary Sort
If the rules for grouping keys before the reduce differ from the rules for grouping the intermediate keys, a Comparator can be specified via JobConf.setOutputValueGroupingComparator(Class). Since JobConf.setOutputKeyComparatorClass(Class) controls how the intermediate keys are sorted, the two combined can be used to implement a secondary sort on values.
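A minimal sketch of that wiring in the new API, where Job.setSortComparatorClass and Job.setGroupingComparatorClass are the counterparts of the JobConf methods above. The "word#suffix" composite-key convention and the class name are invented for illustration:

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

// Keys are "word#suffix" composites. Full keys are sorted normally, but
// reduce() groups are formed by the part before the '#', so one reduce()
// call sees all suffixes of a word in sorted order: a secondary sort.
public class FirstFieldGroupingComparator extends WritableComparator {

    public FirstFieldGroupingComparator() {
        super(Text.class, true); // true: instantiate keys for compare()
    }

    @Override
    @SuppressWarnings("rawtypes")
    public int compare(WritableComparable a, WritableComparable b) {
        String ka = a.toString().split("#", 2)[0];
        String kb = b.toString().split("#", 2)[0];
        return ka.compareTo(kb);
    }
}

// Driver wiring (new-API equivalents of the JobConf methods above):
//   job.setSortComparatorClass(Text.Comparator.class);                  // full key order
//   job.setGroupingComparatorClass(FirstFieldGroupingComparator.class); // reduce groups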
Reduce:
In this phase the framework calls reduce(WritableComparable, Iterator, OutputCollector, Reporter) once for each <key, (list of values)> pair in the grouped input.
The output of the reduce task is typically written to the file system via OutputCollector.collect(WritableComparable, Writable).
The application can use the Reporter to report progress, set application-level status messages, update Counters, or simply indicate that it is alive.
The output of the Reducer is not re-sorted.
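In the new API the Context passed to map() and reduce() takes over the Reporter's role. A minimal sketch of updating a custom counter from a mapper; the LONG_WORDS counter and the length threshold are invented for illustration:

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class CountingMapper extends Mapper<Object, Text, Text, IntWritable> {

    // Custom counters appear in the job's counter dump, like the one above.
    enum WordCountCounters { LONG_WORDS }

    private static final IntWritable ONE = new IntWritable(1);
    private final Text word = new Text();

    @Override
    public void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        StringTokenizer itr = new StringTokenizer(value.toString());
        while (itr.hasMoreTokens()) {
            String token = itr.nextToken();
            if (token.length() > 10) {
                // Counter updates also signal to the framework that the task is alive.
                context.getCounter(WordCountCounters.LONG_WORDS).increment(1);
            }
            word.set(token);
            context.write(word, ONE);
        }
    }
}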
The input and output types of a Map/Reduce job are as follows:
(input) <k1, v1> -> map -> <k2, v2> -> combine -> <k2, v2> -> reduce -> <k3, v3> (output)
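For the WordCount job above this instantiates as <Object, Text> -> <Text, IntWritable> -> <Text, IntWritable>. When the map output pair <k2, v2> differs from the final <k3, v3>, the driver must declare both pairs explicitly; a sketch against the job object from main() above:

// Only needed when <k2, v2> differ from <k3, v3>; WordCount's types
// happen to coincide, so setOutputKeyClass/setOutputValueClass suffice.
job.setMapOutputKeyClass(Text.class);          // k2
job.setMapOutputValueClass(IntWritable.class); // v2
job.setOutputKeyClass(Text.class);             // k3
job.setOutputValueClass(IntWritable.class);    // v3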
References:
https://hadoop.apache.org/docs/r3.2.0/hadoop-mapreduce-client/hadoop-mapreduce-client-core/MapReduceTutorial.html
https://blog.csdn.net/khxu666/article/details/80764994
