Cascading(二)之日志分析

同样是来自官方示例的代码: 

  1 package com.wyf.cascade;
  2 
  3 import java.util.Map;
  4 import java.util.Properties;
  5 
  6 import cascading.cascade.Cascade;
  7 import cascading.cascade.CascadeConnector;
  8 import cascading.cascade.Cascades;
  9 import cascading.flow.Flow;
 10 import cascading.flow.FlowConnector;
 11 import cascading.operation.aggregator.Count;
 12 import cascading.operation.expression.ExpressionFunction;
 13 import cascading.operation.regex.RegexParser;
 14 import cascading.operation.text.DateParser;
 15 import cascading.pipe.Each;
 16 import cascading.pipe.Every;
 17 import cascading.pipe.GroupBy;
 18 import cascading.pipe.Pipe;
 19 import cascading.scheme.TextLine;
 20 import cascading.tap.Hfs;
 21 import cascading.tap.Lfs;
 22 import cascading.tap.Tap;
 23 import cascading.tuple.Fields;
 24 
 25 /**
 26  * 日志分析 
 27  *  
 28  * @author: wyf  
 29  * @version: Jul 12, 2013 3:33:44 PM
 30  */
 31 public class LogAnalysis {
 32     public static void main(String[] args) {
 33         // set the current job jar
 34         //设置当前工作Jar
 35         Properties properties = new Properties();
 36         FlowConnector.setApplicationJarClass(properties, LogAnalysis.class);
 37         
 38         FlowConnector flowConnector = new FlowConnector(properties);
 39         CascadeConnector cascadeConnector = new CascadeConnector();
 40 
 41         String inputPath = "/home/wyf/workspace/HadoopCascading/data/apache.200.txt";
 42         String tmpPath = "/home/wyf/workspace/HadoopCascading/data";
 43         String logsPath = tmpPath + "/logs/";
 44         String arrivalRatePath = tmpPath + "/arrivalrate/";
 45         String arrivalRateSecPath = arrivalRatePath + "sec";
 46         String arrivalRateMinPath = arrivalRatePath + "min";
 47 
 48         //设置解析器
 49         Fields apacheFields = new Fields("ip", "time", "method", "event", "status", "size");
 50         String apacheRegex = "^([^ ]*) +[^ ]* +[^ ]* +\\[([^]]*)\\] +\\\"([^ ]*) ([^ ]*) [^ ]*\\\" ([^ ]*) ([^ ]*).*$";
 51         int[] apacheGroups = { 1, 2, 3, 4, 5, 6 };
 52         RegexParser parser = new RegexParser(apacheFields, apacheRegex, apacheGroups);
 53         Pipe importPipe = new Each("import", new Fields("line"), parser);
 54 
 55         //创建源头
 56         Tap localLogTap = new Lfs(new TextLine(), inputPath);
 57         //创建日志头
 58         Tap parsedLogTap = new Hfs(apacheFields, logsPath);
 59 
 60         //创建链接
 61         Flow importLogFlow = flowConnector.connect(localLogTap, parsedLogTap, importPipe);
 62 
 63         
 64         
 65         //取每一个时间戳管道
 66         Pipe tsPipe = new Each("arrival rate", new Fields("time"), new DateParser("dd/MM/yyyy:HH:mm:ss Z"));
 67 
 68         //装载tsPipe管道,并进行拆分
 69         Pipe tsCountPipe = new Pipe("tsCount", tsPipe);
 70         //此处针对tsPipe中的字段time进行分组,并取别名"ts"
 71         tsCountPipe = new GroupBy(tsCountPipe, new Fields("ts"));
 72         tsCountPipe = new Every(tsCountPipe, Fields.GROUP, new Count());
 73 
 74         //按时间戳的分钟粒度划分,并将处理后的ts命名为tm
 75         Pipe tmPipe = new Each(tsPipe, new ExpressionFunction(new Fields("tm"), "ts - (ts % (60 * 1000))", long.class));
 76         
 77         //装载tmPipe管道,并进行分组,计算每个组内元素个数
 78         Pipe tmCountPipe = new Pipe("tmCount", tmPipe);
 79         tmCountPipe = new GroupBy(tmCountPipe, new Fields("tm"));
 80         tmCountPipe = new Every(tmCountPipe, Fields.GROUP, new Count());
 81 
 82         
 83         
 84         
 85         //创建秒钟级的输出头
 86         Tap tsSinkTap = new Hfs(new TextLine(), arrivalRateSecPath);
 87         //创建分钟级的输出头
 88         Tap tmSinkTap = new Hfs(new TextLine(), arrivalRateMinPath);
 89 
 90         //绑定一群头到一群管道的转换方法,顺序是非常重要的
 91         Map<String, Tap> sinks = Cascades.tapsMap(Pipe.pipes(tsCountPipe, tmCountPipe), Tap.taps(tsSinkTap, tmSinkTap));
 92 
 93         //##############################
 94         //链接装配源头到出头,此处对进行分流处理
 95         //##############################
 96         Flow arrivalRateFlow = flowConnector.connect(parsedLogTap, sinks, tsCountPipe, tmCountPipe);
 97 
 98         //###############################################
 99         //可选操作,通过它们之间的依赖链接所有流,顺序是不重要的,会自动判断依赖
100         //###############################################
101         Cascade cascade = cascadeConnector.connect(importLogFlow, arrivalRateFlow);
102 
103         //执行cascade,将会在依赖顺序下执行每一个流
104         cascade.complete();
105     }
106 }

解释操作方法:

  Each:对每条记录进行处理

  GroupBy:对记录按字段分组

  Every:对每组中的记录做处理

  Count:类似SQL中的聚合函数COUNT

  

posted @ 2013-07-12 17:33  GeoPanda  阅读(368)  评论(0)    收藏  举报