Data Analysis
This example: count the number of distinct users within each hour.
In essence, MapReduce is used to group together all records that fall into the same hour so that the users of that hour can be merged and counted.
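As a minimal illustration of the assumed input, the mapper below expects each line of the cleaned log (output2.txt from the previous step) to contain four tab-separated fields, with the IP address in field 0 and the hour in field 2; the concrete values shown here are hypothetical:
60.10.5.65	/thread-123-1-1.html	2013-05-30_17	200
61.135.216.104	/thread-456-1-1.html	2013-05-30_17	200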
(1) Mapper class
package com.simple.mr;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/*
* Statistical analysis:
* count the number of distinct users (unique IPs) for each hour.
*/
public class BBSMapper3 extends Mapper<LongWritable, Text, Text, Text> {
// Input: key is the byte offset of the line, value is the raw log line
// Output: key is the hour, value is the user's IP address
Text outputKey = new Text();
Text outputValue = new Text();
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException{
String line = value.toString(); // convert the Text value to a String
String[] fields = line.split("\t");
if(fields.length != 4) {
return;
}
// fields[2] is the hour, fields[0] is the user's IP address
outputKey.set(fields[2]);
outputValue.set(fields[0]);
context.write(outputKey, outputValue);
}
}
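With the hypothetical sample lines above, the mapper emits one (hour, IP) pair per record, for example:
2013-05-30_17	60.10.5.65
2013-05-30_17	61.135.216.104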
(2) Reducer class
The reducer merges all records that fall within the same hour. Note that one IP address corresponds to one user, and the same user may visit repeatedly within the same hour, so the IPs are collected into a Set to deduplicate them and each user is counted only once.
package com.simple.mr;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class BBSReducer3 extends Reducer<Text, Text, Text, LongWritable> {
LongWritable pv = new LongWritable(); // per-hour count of distinct users (unique IPs)
@Override
protected void reduce(Text key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
// Deduplicate: the same IP appearing several times within one hour is counted once
Set<String> ipSets = new HashSet<>();
for (Text ip : values) {
ipSets.add(ip.toString());
}
pv.set(ipSets.size());
context.write(key, pv);
}
}
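Continuing the hypothetical sample, the two records of the same hour come from two distinct IPs, so the final output for that key would look like:
2013-05-30_17	2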
(3) Driver (main) class
package com.simple.mr;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class BBSDriver3 extends Configured implements Tool {
static final String INPUT_PATH = "hdfs://localhost:9000/output2.txt";
static final String OUT_PATH = "hdfs://localhost:9000/output3";
public static void main(String[] args) {
Configuration conf = new Configuration();
try {
int res = ToolRunner.run(conf, new BBSDriver3(), args);
System.exit(res);
} catch (Exception e) {
e.printStackTrace();
}
}
@Override
public int run(String[] args) throws Exception {
// Delete the output directory if it already exists
FileSystem fs = FileSystem.get(new URI(INPUT_PATH), getConf());
Path outPath = new Path(OUT_PATH);
if (fs.exists(outPath)) {
fs.delete(outPath, true);
}
final Job job = Job.getInstance(getConf(), "BBS forum log analysis 3");
// Set the jar by class so the job can be packaged and run on the cluster
job.setJarByClass(BBSDriver3.class);
job.setMapperClass(BBSMapper3.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setReducerClass(BBSReducer3.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(LongWritable.class);
FileInputFormat.setInputPaths(job, INPUT_PATH);
FileOutputFormat.setOutputPath(job, new Path(OUT_PATH));
boolean success = job.waitForCompletion(true);
// Report whether the job completed successfully
if (success) {
System.out.println("Analysis job success!");
} else {
System.out.println("Analysis job failed!");
}
return success ? 0 : 1;
}
}
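A rough way to run the job, assuming the three classes are packaged into a jar (the jar name bbs-analysis.jar here is a placeholder), and then inspect the result under the OUT_PATH directory:
hadoop jar bbs-analysis.jar com.simple.mr.BBSDriver3
hdfs dfs -cat /output3/part-r-00000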
