Java逻辑完成Reduce join
1.MapReduce处理数据
通过Hadoop的组件对拉取的数据根据客户的需求进行MapReduce,首先使用Java逻辑将客户所需要实现的需求进行实现,首先要创建一个main方法主类,在类中作如下的操作:
package com.shujia;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
 * Word-count driver: configures and submits the MapReduce job.
 * Usage: hadoop jar &lt;jar&gt; MapReduce &lt;hdfs input path&gt; &lt;hdfs output dir&gt;
 * The output directory must not exist yet, or the job fails at submission.
 */
public class MapReduce {
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        // Load default Hadoop configuration (core-site.xml etc. from the classpath).
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        // This name is what appears in the YARN UI.
        job.setJobName("mapreduce");
        job.setNumReduceTasks(2);
        // Tells Hadoop which jar to ship to the cluster (the one containing this class).
        job.setJarByClass(MapReduce.class);
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyRudece.class);
        // Map output types: word (Text) -> count (LongWritable).
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        // Final reduce output types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        // args[0] = input path on HDFS, args[1] = output DIRECTORY on HDFS.
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // FIX: propagate job success/failure to the shell. The original ignored the
        // boolean result of waitForCompletion, so a failed job still exited with 0.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
对于主类中设置的mapper类和reduce类进行代码的实现
mapper类进行过滤,
public class MyMapper extends Mapper<LongWritable,Text,Text, LongWritable>
对于map中,首先将数据以行号进行读取作为key,将读取的数据作为value,接着对每个数据进行再次map,然后以数据作为key,个数作为value,这就是上面的泛型里面所对应的类型的原由。
package com.shujia;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
 * Word-count mapper: key is the byte offset of the line, value is the line text.
 * Emits (word, 1) for every space-separated token on the line.
 */
public class MyMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

    // Hadoop serializes the writable on each context.write(), so a single reusable
    // instance per task avoids one allocation per token (standard Hadoop idiom).
    private final Text word = new Text();
    private final LongWritable one = new LongWritable(1L);

    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, LongWritable>.Context context) throws IOException, InterruptedException {
        // Split the line on single spaces; context writes each (word, 1) pair out.
        String[] tokens = value.toString().split(" ");
        for (String token : tokens) {
            // FIX: consecutive spaces yield empty tokens; the original counted "" as a word.
            if (token.isEmpty()) {
                continue;
            }
            word.set(token);
            context.write(word, one);
        }
    }
}
reduce类进行计算
map里面过滤的数据然后reduce进行计算,所以这里的输入数据类型就是map的输出数据类型,以此类推,reduce的输出类型也是一样,与map的输入类型一致。
package com.shujia;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
 * Word-count reducer: sums the 1s emitted by the mapper for each word.
 * Input/output types mirror the mapper's output: (Text word, LongWritable count).
 * NOTE: class name "MyRudece" is a typo for "MyReduce", but it is referenced by
 * the driver, so it is kept for compatibility.
 */
public class MyRudece extends Reducer<Text, LongWritable, Text, LongWritable> {
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Reducer<Text, LongWritable, Text, LongWritable>.Context context) throws IOException, InterruptedException {
        // FIX: use a primitive long accumulator; the original used boxed Long with
        // "sum += l", which autoboxes on every iteration.
        long sum = 0L;
        for (LongWritable value : values) {
            sum += value.get();
        }
        context.write(key, new LongWritable(sum));
    }
}
2.IK分词器处理数据(对于中文数据的处理)
对于英文数据使用一般的MapReduce逻辑可以很好的处理,对于中文的处理,可以使用以下分词器进行处理
这里首先需要将所需要的分词器依赖放入pom.xml文件中(或者父类的pom.xml),然后开始书写代码,同样也是三部分构成:主类,mapper,reducer,首先主类中的代码逻辑实现与上面的MapReduce的代码实现基本一致,main主类的代码具体逻辑
package ikanlaysiser;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.yarn.webapp.hamlet.Hamlet;
import java.io.BufferedReader;
import java.io.FileReader;
/**
 * Driver for the IK-analyzer Chinese word-count job.
 * Usage: hadoop jar &lt;jar&gt; ikanlaysis &lt;hdfs input path&gt; &lt;hdfs output dir&gt;
 */
public class ikanlaysis {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        // Ship the jar that contains this class to the cluster.
        job.setJarByClass(ikanlaysis.class);
        job.setMapperClass(IkMapper.class);
        job.setReducerClass(IkReduce.class);
        // Map output: (token, 1); reduce output: (token, total).
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // FIX: exit non-zero when the job fails; the original discarded the result.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
接着对于map中的逻辑代码有所不同,这个使用到了刚才所借助的依赖,Ik分词器,具体逻辑如下:
package ikanlaysiser;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapreduce.Mapper;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
/**
 * Mapper that tokenizes each line of Chinese text with the IK segmenter and
 * emits (name, 1) for the three character names of interest.
 */
public class IkMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

    // Reusable output writables: Hadoop serializes on write(), so one instance
    // per task avoids an allocation per matched token.
    private final Text outKey = new Text();
    private final LongWritable one = new LongWritable(1L);

    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, LongWritable>.Context context) throws IOException, InterruptedException {
        // IKSegmenter consumes a Reader; 'true' enables smart (coarse-grained) mode.
        IKSegmenter segmenter = new IKSegmenter(new StringReader(value.toString()), true);
        // Iterate tokens in iterator style: next() returns null when exhausted.
        Lexeme lexeme;
        while ((lexeme = segmenter.next()) != null) {
            String token = lexeme.getLexemeText();
            // Only count the three names we are interested in.
            if ("曹操".equals(token) || "董卓".equals(token) || "张飞".equals(token)) {
                outKey.set(token);
                context.write(outKey, one);
            }
        }
    }
}
这里是通过新建一个分词器对象,将对应的数据转化成对应的类型,从而添加到IKSegmenter中,然后定义一个Lexeme变量,类似于集合的迭代器的方式将数据进行遍历,如果数据满足条件,将数据加入结果中,由reduce进行计算,聚合数据得到最后的结果。
reduce代码逻辑,这里的reduce代码逻辑是直接遍历结果并累计相加。
package ikanlaysiser;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
 * Reducer for the IK word-count job: totals the per-occurrence 1s emitted by
 * {@code IkMapper} and writes (name, total) for each key.
 */
public class IkReduce extends Reducer<Text, LongWritable, Text, LongWritable> {
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Reducer<Text, LongWritable, Text, LongWritable>.Context context) throws IOException, InterruptedException {
        // Accumulate the occurrence count for this token.
        long total = 0;
        for (LongWritable occurrence : values) {
            total += occurrence.get();
        }
        context.write(key, new LongWritable(total));
    }
}
3.将两个表数据进行连接,进行多次MapReduce
同样也是有三部分组成
main
mapper
reduce
对于main的逻辑
package saldemo;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Driver for the reduce-side join of call records with the city lookup table.
 * Usage: hadoop jar &lt;jar&gt; saldemo.dianxing &lt;hdfs input path&gt; &lt;hdfs output dir&gt;
 * Both join inputs are read from the same input path; the mapper decides the
 * join side from the split's file name.
 */
public class dianxing {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(dianxing.class);
        job.setMapperClass(dxMapper.class);
        job.setReducerClass(dxReduce.class);
        // Both map and reduce emit (Text, Text): the join key and tagged payloads.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // FIX: exit non-zero when the job fails; the original ignored the result.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
对于mapper的逻辑,首先通过一个方法获取当下目录的文件,获取姓名,然后通过if判断姓名,对文件里面的数据进行切分,取出所需要的数据,作为对应的key和value(注意:这里的key要是一样的)
具体逻辑代码实现
package saldemo;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import java.io.IOException;
/**
 * Join-side mapper: tags each record with its source so the reducer can tell
 * the two tables apart. Records from the call-detail file ("part*") become
 * (area, "#phone\ttime"); records from the city file ("city*") become
 * (area, "$cityName"). The area code is the shared join key.
 */
public class dxMapper extends Mapper<LongWritable, Text, Text, Text> {

    // Reusable key object; set() replaces its contents before every write.
    private Text joinKey = new Text();

    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context) throws IOException, InterruptedException {
        String line = value.toString();
        // The split's file name tells us which table this record belongs to.
        FileSplit split = (FileSplit) context.getInputSplit();
        String fileName = split.getPath().getName();
        if (fileName.contains("part")) {
            // Call-detail record: phone \t area \t time -> tag with '#'.
            String[] fields = line.split("\t");
            joinKey.set(fields[1]);
            context.write(joinKey, new Text("#" + fields[0] + "\t" + fields[2]));
        } else if (fileName.contains("city")) {
            // City lookup record: area,cityName -> tag with '$'.
            String[] fields = line.split(",");
            joinKey.set(fields[0]);
            context.write(joinKey, new Text("$" + fields[1]));
        }
    }
}
对于reduce,将所获取的数据进行增强for遍历,对于其中的数据进行类型转换,根据条件判别所需要的数据,进行一系列操作,最终将相应的数据作为对应的key和value进行存储。(这里也可以定义两个与对应的键和值相同类型的集合,先将数据加入集合,最后集合写入)具体逻辑代码实现:
package saldemo;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.util.LinkedList;
/**
 * Join-side reducer: for each area key, collects the '#'-tagged call records
 * ("phone\ttime") and the '$'-tagged city names from the mapper, then emits the
 * cross product as (area, "cityName\tphone\ttime").
 *
 * BUG FIX: the original ran the place-by-id join loops INSIDE the loop over
 * {@code values}, so partial cross products were emitted once per incoming
 * value, producing duplicated output. The join now runs exactly once, after all
 * values for the key have been collected. Raw {@code LinkedList} types were
 * also replaced with generic {@code LinkedList<String>}.
 */
public class dxReduce extends Reducer<Text, Text, Text, Text> {
    @Override
    protected void reduce(Text key, Iterable<Text> values, Reducer<Text, Text, Text, Text>.Context context) throws IOException, InterruptedException {
        // '#'-tagged payloads: "phone\ttime" from the call-detail file.
        LinkedList<String> callRecords = new LinkedList<>();
        // '$'-tagged payloads: city names from the lookup file.
        LinkedList<String> cityNames = new LinkedList<>();
        for (Text value : values) {
            String s = value.toString();
            if (s.startsWith("#")) {
                callRecords.add(s.substring(1));
            } else if (s.startsWith("$")) {
                cityNames.add(s.substring(1));
            }
        }
        // Emit the join only after both sides are fully collected.
        for (String city : cityNames) {
            for (String record : callRecords) {
                String phone = record.split("\t")[0];
                String time = record.split("\t")[1];
                context.write(key, new Text(city + "\t" + phone + "\t" + time));
            }
        }
    }
}
4.打包
将所写的代码打成jar包(IK分词器需要添加相应的依赖才可以进行打包,否则运行报错)
iK分词器的子类pom.xml的依赖
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<version>3.3.0</version>
<configuration>
<descriptorRefs>
<!-- 打包出来的带依赖jar包名称 -->
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
</configuration>
<!--下面是为了使用 mvn package命令,如果不加则使用mvn assembly-->
<executions>
<execution>
<id>make-assemble</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
数据进行打包

出来这样说明jar包打包成功!!!!

对应的左边的目录会出现,如果不是分词器的,用第一个就可以了,如果是,用第二个

接着将打好的jar包放入对应的Linux目录下,使用Hadoop命令进行运行
例如
首先将所需要的文件进行上传到对应的目录中
hadoop fs -put dianxing.txt /shujia/
接着进行运行操作
hadoop jar java_mapreduce-1.0-SNAPSHOT-jar-with-dependencies.jar saldemo.dianxing /shujia/dianxing.txt /shujia/output/out1
运行过后通过相应的命令进行查看
hadoop fs -cat /shujia/output/out1/part-r-00000
hadoop jar:运行对应jar的命令
java_mapreduce-1.0-SNAPSHOT-jar-with-dependencies.jar;对应jar包名
saldemo.dianxing :所要执行main方法的主类名
/shujia/dianxing.txt:相应的所需执行文件
/shujia/output/out1:相应所需要输出路径

浙公网安备 33010602011771号