Hadoop实战-MapReduce之WordCount(五)

环境介绍:

主服务器ip:192.168.80.128(master)  NameNode  SecondaryNameNode ResourceManager

从服务器ip:192.168.80.129(slave1)  DataNode NodeManager

从服务器ip: 192.168.80.130(slave2)  DataNode NodeManager

1.文件准备

1)在HDFS上创建文件夹

hadoop fs -mkdir -p /user/joe/wordcount/input

 

2)在本地创建文件夹

mkdir -p /home/chenyun/data/mapreduce

 

3)创建file01

cd /home/chenyun/data/mapreduce
touch file01

 

vi file01

往file01写入内容:

Hello World, Bye World!

 

4)创建file02

cd /home/chenyun/data/mapreduce 
touch file02
vi file02

 

往file02写入内容:

Hello Hadoop, Goodbye to hadoop.

 

5)把本地文件file01、file02上传到hdfs的/user/joe/wordcount/input目录

hadoop fs -put /home/chenyun/data/mapreduce/file01 /user/joe/wordcount/input 

hadoop fs -put /home/chenyun/data/mapreduce/file02 /user/joe/wordcount/input

 

2.编写mapreduce程序

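1)在Eclipse编写Mapreduce程序(后面的运行命令使用全类名 com.accp.mapreduce.WordCount,因此该类需放在 com.accp.mapreduce 包下,即源文件首行应为 package com.accp.mapreduce;)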

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.StringUtils;

/**
 * Word-count MapReduce job.
 *
 * <p>Usage: {@code wordcount <in> <out> [-skip skipPatternFile]}. Generic
 * Hadoop options (for example {@code -Dwordcount.case.sensitive=false}) are
 * consumed by {@link GenericOptionsParser} before the remaining arguments
 * are interpreted.
 */
public class WordCount {

	/**
	 * Splits each input line into whitespace-separated tokens and emits
	 * {@code (token, 1)} for every token.
	 *
	 * <p>Two configuration keys are read in {@link #setup}:
	 * {@code wordcount.case.sensitive} (default {@code true}; when false the
	 * line is lower-cased first) and {@code wordcount.skip.patterns} (default
	 * {@code false}; when true every line of each cache file is treated as a
	 * regular expression whose matches are removed from the input line).
	 */
	public static class TokenizerMapper extends
			Mapper<Object, Text, Text, IntWritable> {
		static enum CountersEnum {
			INPUT_WORDS
		}

		private final static IntWritable one = new IntWritable(1);

		private final Text word = new Text();
		private boolean caseSensitive;
		private final Set<String> patternsToSkip = new HashSet<String>();

		/**
		 * Reads the job configuration and, if skip patterns are enabled,
		 * loads them from the job's cache files.
		 */
		@Override
		public void setup(Context context) throws IOException,
				InterruptedException {
			Configuration conf = context.getConfiguration();
			caseSensitive = conf.getBoolean("wordcount.case.sensitive", true);
			if (conf.getBoolean("wordcount.skip.patterns", false)) {
				// Ask the task context directly instead of building a
				// throwaway Job just to read the cache-file list.
				URI[] patternsURIs = context.getCacheFiles();
				if (patternsURIs == null) {
					// The flag was set (e.g. via -D) but no file was
					// registered with -skip; nothing to load.
					return;
				}
				for (URI patternsURI : patternsURIs) {
					// Only the bare file name is used to open the file;
					// presumably the distributed cache makes it available
					// under that name in the task's working directory.
					String patternsFileName = new Path(patternsURI.getPath())
							.getName();
					parseSkipFile(patternsFileName);
				}
			}
		}

		/**
		 * Adds every line of {@code fileName} to the skip-pattern set.
		 * Failures are reported on stderr and otherwise ignored, so the job
		 * keeps running with whatever patterns were read so far.
		 */
		private void parseSkipFile(String fileName) {
			// try-with-resources guarantees the reader is closed even when
			// readLine() throws.
			try (BufferedReader reader = new BufferedReader(new FileReader(
					fileName))) {
				String pattern;
				while ((pattern = reader.readLine()) != null) {
					patternsToSkip.add(pattern);
				}
			} catch (IOException ioe) {
				System.err
						.println("Caught exception while parsing the cached file '"
								+ fileName + "' : "
								+ StringUtils.stringifyException(ioe));
			}
		}

		/**
		 * Emits {@code (token, 1)} for each token of the (optionally
		 * lower-cased, pattern-stripped) line and increments the
		 * {@code INPUT_WORDS} counter once per token.
		 */
		@Override
		public void map(Object key, Text value, Context context)
				throws IOException, InterruptedException {
			String line = (caseSensitive) ? value.toString() : value.toString()
					.toLowerCase();
			for (String pattern : patternsToSkip) {
				// Each pattern is a regular expression; matches are deleted.
				line = line.replaceAll(pattern, "");
			}
			StringTokenizer itr = new StringTokenizer(line);
			while (itr.hasMoreTokens()) {
				word.set(itr.nextToken());
				context.write(word, one);
				Counter counter = context.getCounter(
						CountersEnum.class.getName(),
						CountersEnum.INPUT_WORDS.toString());
				counter.increment(1);
			}
		}

	}

	/**
	 * Sums the integer values received for a key and emits
	 * {@code (key, sum)}. Registered as both combiner and reducer in
	 * {@link WordCount#main}.
	 */
	public static class IntSumReducer extends
			Reducer<Text, IntWritable, Text, IntWritable> {
		private final IntWritable result = new IntWritable();

		@Override
		public void reduce(Text key, Iterable<IntWritable> values,
				Context context) throws IOException, InterruptedException {
			int sum = 0;
			for (IntWritable val : values) {
				sum += val.get();
			}
			result.set(sum);
			context.write(key, result);
		}
	}

	/** Prints the command-line usage to stderr and exits with status 2. */
	private static void printUsageAndExit() {
		System.err
				.println("Usage: wordcount <in> <out> [-skip skipPatternFile]");
		System.exit(2);
	}

	/**
	 * Configures and runs the job, exiting with 0 on success, 1 on job
	 * failure and 2 on a usage error.
	 *
	 * @param args generic Hadoop options followed by
	 *             {@code <in> <out> [-skip skipPatternFile]}
	 */
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		GenericOptionsParser optionParser = new GenericOptionsParser(conf, args);
		String[] remainingArgs = optionParser.getRemainingArgs();
		if ((remainingArgs.length != 2) && (remainingArgs.length != 4)) {
			printUsageAndExit();
		}
		Job job = Job.getInstance(conf, "word count");
		job.setJarByClass(WordCount.class);
		job.setMapperClass(TokenizerMapper.class);
		job.setCombinerClass(IntSumReducer.class);
		job.setReducerClass(IntSumReducer.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);

		List<String> otherArgs = new ArrayList<String>();
		for (int i = 0; i < remainingArgs.length; ++i) {
			if ("-skip".equals(remainingArgs[i])) {
				if (i + 1 >= remainingArgs.length) {
					// "-skip" was the last argument: no pattern file given.
					printUsageAndExit();
				}
				job.addCacheFile(new Path(remainingArgs[++i]).toUri());
				job.getConfiguration().setBoolean("wordcount.skip.patterns",
						true);
			} else {
				otherArgs.add(remainingArgs[i]);
			}
		}
		if (otherArgs.size() != 2) {
			// Exactly one input and one output path are required.
			printUsageAndExit();
		}
		FileInputFormat.addInputPath(job, new Path(otherArgs.get(0)));
		FileOutputFormat.setOutputPath(job, new Path(otherArgs.get(1)));

		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}

}

 

2)导出mapreduce.jar

3) 上传到master的目录

/home/chenyun/project/mapreduce

3.运行wordCount

hadoop jar /home/chenyun/project/mapreduce/mapreduce.jar com.accp.mapreduce.WordCount /user/joe/wordcount/input /user/joe/wordcount/output

 

4)查看运行结果

hadoop fs -cat /user/joe/wordcount/output/part-r-00000

 

=======================================================================================================================

4.过滤不需要统计的字符

1)在本地创建/home/chenyun/data/mapreduce/patterns.txt ,在文件里加入

\.
\,
\!
to

 

2)把文件上传到hdfs上

hadoop fs -put /home/chenyun/data/mapreduce/patterns.txt /user/joe/wordcount

 

3)运行

hadoop jar /home/chenyun/project/mapreduce/mapreduce.jar com.accp.mapreduce.WordCount -Dwordcount.case.sensitive=true /user/joe/wordcount/input /user/joe/wordcount/output1 -skip /user/joe/wordcount/patterns.txt

 

4)查看运行结果

hadoop fs -cat /user/joe/wordcount/output1/part-r-00000

 

======================================================================================================================

5.忽略大小写,进行统计

1)运行

hadoop jar /home/chenyun/project/mapreduce/mapreduce.jar com.accp.mapreduce.WordCount -Dwordcount.case.sensitive=false /user/joe/wordcount/input /user/joe/wordcount/output5 -skip /user/joe/wordcount/patterns.txt

 2)查看运行结果

hadoop fs -cat /user/joe/wordcount/output5/part-r-00000

 

posted on 2017-05-07 23:18  简单明了  阅读(459)  评论(0)  编辑  收藏  举报