数据分析

数据分析

本例:求同一个小时内的用户数量

其实就是通过MapReduce将同一小时内的用户进行合并

(1)mapper类

import java.io.IOException;

import java.net.URI;

import java.util.HashMap;

import org.apache.hadoop.fs.FSDataInputStream;

import org.apache.hadoop.fs.FileSystem;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.Mapper.Context;

 

/*

 * 统计分析

 * 统计每小时的pv值

 */

public class BBSMapper3 extends Mapper<LongWritable, Text, Text, Text> {

//输入:key为行偏移量,value为原文本

//输出:key为时间,value为用户ip地址

    Text outputKey = new Text();

    Text outputValue = new Text();

    @Override

    protected void map(LongWritable key, Text value, Context context)

            throws IOException, InterruptedException{

        String line = value.toString();            // 转换为String

        String[] fields = line.split("\t");

        if(fields.length != 4) {

            return;

        }

        // hour, ip

        outputKey.set(fields[2]);

        outputValue.set(fields[0]);

        context.write(outputKey, outputValue);

     

     }       

}

(2)reducer类

实现同一时间内的用户合并。注意:1个IP地址对应一个用户;同一个小时内同一个用户可能反复访问,所以用Set去重,每个用户只算一次!

package com.simple.mr;

import java.io.IOException;

import java.util.HashSet;

import java.util.Set;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Reducer;

public class BBSReducer3 extends Reducer<Text, Text, Text, LongWritable> {

    LongWritable pv = new LongWritable();

    protected void reduce(Text key, Iterable<Text> values, Context context)

            throws IOException, InterruptedException {

        Set<String> ipSets = new HashSet<>();

        for (Text ip : values) {

            ipSets.add(ip.toString());

        }

        pv.set(ipSets.size());

        context.write(key, pv);

    }

}

(3)主启动类

package com.simple.mr;

 

import java.net.URI;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.conf.Configured;

import org.apache.hadoop.fs.FileSystem;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.NullWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import org.apache.hadoop.util.Tool;

import org.apache.hadoop.util.ToolRunner;

 

public class BBSDriver3 extends Configured implements Tool {

 

    static final String INPUT_PATH = "hdfs://localhost:9000/output2.txt";

    static final String OUT_PATH = "hdfs://localhost:9000/output3";

 

    public static void main(String[] args) {

 

        Configuration conf = new Configuration();

 

        try {

            int res = ToolRunner.run(conf, new BBSDriver3(), args);

            System.exit(res);

        } catch (Exception e) {

            e.printStackTrace();

        }

 

    }

     

    @Override

    public int run(String[] args) throws Exception {

        // 清理已存在的输出文件

        FileSystem fs = FileSystem.get(new URI(INPUT_PATH), getConf());

        Path outPath = new Path(OUT_PATH);

        if (fs.exists(outPath)) {

            fs.delete(outPath, true);

        }

        final Job job = Job.getInstance(getConf(), "BBS论坛日志分析3");

        // 设置为可以打包运行

        job.setJarByClass(BBSDriver3.class);

        job.setMapperClass(BBSMapper3.class);

        job.setMapOutputKeyClass(Text.class);

        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(BBSReducer3.class);

        job.setOutputKeyClass(Text.class);

        job.setOutputValueClass(LongWritable.class);

        FileInputFormat.setInputPaths(job, INPUT_PATH);

        FileOutputFormat.setOutputPath(job, new Path(OUT_PATH));

        boolean success = job.waitForCompletion(true);

        // 如果成功输出

        if (success) {

            System.out.println("Clean process success!");

        }else {

            System.out.println("Clean process failed!");

        }

        return 0;

    }

 

}

posted @ 2025-04-07 00:21  Annaprincess  阅读(26)  评论(0)    收藏  举报