Mapreduce词频统计

maven建立quick-start工程。

pom.xml

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"

xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">

<artifactId>wordcount</artifactId>

<version>0.0.1-SNAPSHOT</version>

<name>wordcount</name>

<url>http://maven.apache.org</url>

<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>

</properties>

<groupId>junit</groupId>

<artifactId>junit</artifactId>

</dependency>

<groupId>org.apache.hadoop</groupId>

<artifactId>hadoop-common</artifactId>

</dependency>

<groupId>org.apache.hadoop</groupId>

<artifactId>hadoop-hdfs</artifactId>

</dependency>

<groupId>org.apache.hadoop</groupId>

<artifactId>hadoop-client</artifactId>

</dependency>

</dependencies>

</project>

3个java代码，mapper、reducer、runner主类：

mapper：

package cn.edu.bupt.wcy.wordcount;

import java.io.IOException;

import org.apache.commons.lang.StringUtils;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Mapper;

public class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable>{

@Override

protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, LongWritable>.Context context)

throws IOException, InterruptedException {

// TODO Auto-generated method stub

//super.map(key, value, context);

//String[] words = StringUtils.split(value.toString());

String[] words = StringUtils.split(value.toString(), " ");

for(String word:words)

{

context.write(new Text(word), new LongWritable(1));

}

reducer：

package cn.edu.bupt.wcy.wordcount;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Reducer;

public class WordCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {

@Override

protected void reduce(Text arg0, Iterable<LongWritable> arg1,

Reducer<Text, LongWritable, Text, LongWritable>.Context context) throws IOException, InterruptedException {

// TODO Auto-generated method stub

//super.reduce(arg0, arg1, arg2);

int sum=0;

for(LongWritable num:arg1)

{

sum += num.get();

}

context.write(arg0,new LongWritable(sum));

}

runner：

package cn.edu.bupt.wcy.wordcount;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class WordCountRunner {

public static void main(String[] args) throws IllegalArgumentException, IOException, ClassNotFoundException, InterruptedException {

Configuration conf = new Configuration();

Job job = new Job(conf);

job.setJarByClass(WordCountRunner.class);

job.setJobName("wordcount");

job.setOutputKeyClass(Text.class);

job.setOutputValueClass(LongWritable.class);

job.setMapperClass(WordCountMapper.class);

job.setReducerClass(WordCountReducer.class);

job.setInputFormatClass(TextInputFormat.class);

job.setOutputFormatClass(TextOutputFormat.class);

FileInputFormat.addInputPath(job, new Path(args[1]));

FileOutputFormat.setOutputPath(job, new Path(args[2]));

job.waitForCompletion(true);

}

打包成jar包后，放到集群上运行。先在集群上新建一个文件夹：

hdfs dfs -mkdir /input_wordcount 再放入单词文件，比如：

hello world

I like playing basketball

hello java。。。

运行hadoop jar WordCount.jar（jar包） WordCountRunner(主类) /input_wordcount /output_wordcount

运行完成后，查看：

hdfs dfs -ls /output_wordcount。已经生成了结果，在cat一下查看内容即可。

posted on 2021-10-23 17:04 风中明月阅读(114) 评论(0) 收藏举报