Cascading(二)之日志分析
同样来自官方示例的代码:
package com.wyf.cascade;

import java.util.Map;
import java.util.Properties;

import cascading.cascade.Cascade;
import cascading.cascade.CascadeConnector;
import cascading.cascade.Cascades;
import cascading.flow.Flow;
import cascading.flow.FlowConnector;
import cascading.operation.aggregator.Count;
import cascading.operation.expression.ExpressionFunction;
import cascading.operation.regex.RegexParser;
import cascading.operation.text.DateParser;
import cascading.pipe.Each;
import cascading.pipe.Every;
import cascading.pipe.GroupBy;
import cascading.pipe.Pipe;
import cascading.scheme.TextLine;
import cascading.tap.Hfs;
import cascading.tap.Lfs;
import cascading.tap.Tap;
import cascading.tuple.Fields;

/**
 * Apache access-log analysis with Cascading.
 *
 * <p>Builds two flows and chains them in a {@link Cascade}:
 * <ol>
 *   <li>an import flow that parses raw Apache log lines into structured fields, and</li>
 *   <li>an arrival-rate flow that counts requests per second and per minute,
 *       writing the two results to separate sinks.</li>
 * </ol>
 *
 * <p>Usage: {@code LogAnalysis [inputPath [workDir]]} — both arguments are
 * optional; when omitted the original hard-coded local paths are used, so
 * existing invocations keep working unchanged.
 *
 * @author: wyf
 * @version: Jul 12, 2013 3:33:44 PM
 */
public class LogAnalysis {
    public static void main(String[] args) {
        // Register the current job jar so Hadoop ships this class to the cluster.
        Properties properties = new Properties();
        FlowConnector.setApplicationJarClass(properties, LogAnalysis.class);

        FlowConnector flowConnector = new FlowConnector(properties);
        CascadeConnector cascadeConnector = new CascadeConnector();

        // Paths may be overridden from the command line: [inputPath [workDir]].
        // Defaults preserve the original behavior.
        String inputPath = args.length > 0 ? args[0]
                : "/home/wyf/workspace/HadoopCascading/data/apache.200.txt";
        String tmpPath = args.length > 1 ? args[1]
                : "/home/wyf/workspace/HadoopCascading/data";
        String logsPath = tmpPath + "/logs/";
        String arrivalRatePath = tmpPath + "/arrivalrate/";
        String arrivalRateSecPath = arrivalRatePath + "sec";
        String arrivalRateMinPath = arrivalRatePath + "min";

        // Parser for the Apache common/combined log format: each capture group
        // in the regex maps (via apacheGroups) onto one of apacheFields.
        Fields apacheFields = new Fields("ip", "time", "method", "event", "status", "size");
        String apacheRegex = "^([^ ]*) +[^ ]* +[^ ]* +\\[([^]]*)\\] +\\\"([^ ]*) ([^ ]*) [^ ]*\\\" ([^ ]*) ([^ ]*).*$";
        int[] apacheGroups = { 1, 2, 3, 4, 5, 6 };
        RegexParser parser = new RegexParser(apacheFields, apacheRegex, apacheGroups);
        Pipe importPipe = new Each("import", new Fields("line"), parser);

        // Source: raw log file on the local filesystem.
        Tap localLogTap = new Lfs(new TextLine(), inputPath);
        // Intermediate sink: parsed log records on HDFS (SequenceFile of apacheFields).
        Tap parsedLogTap = new Hfs(apacheFields, logsPath);

        // Flow 1: import + parse the raw log.
        Flow importLogFlow = flowConnector.connect(localLogTap, parsedLogTap, importPipe);

        // Parse each "time" field into an epoch-millis long named "ts"
        // (DateParser's default output field).
        // FIX: Apache log timestamps look like "10/Oct/2000:13:55:36 -0700" —
        // the month is an abbreviated NAME, so the pattern needs "MMM", not "MM".
        Pipe tsPipe = new Each("arrival rate", new Fields("time"),
                new DateParser("dd/MMM/yyyy:HH:mm:ss Z"));

        // Branch A: group by exact second ("ts") and count requests per group.
        Pipe tsCountPipe = new Pipe("tsCount", tsPipe);
        tsCountPipe = new GroupBy(tsCountPipe, new Fields("ts"));
        tsCountPipe = new Every(tsCountPipe, Fields.GROUP, new Count());

        // Branch B: truncate "ts" to minute granularity as "tm"
        // (60 * 1000 ms per minute), then group and count.
        Pipe tmPipe = new Each(tsPipe,
                new ExpressionFunction(new Fields("tm"), "ts - (ts % (60 * 1000))", long.class));

        Pipe tmCountPipe = new Pipe("tmCount", tmPipe);
        tmCountPipe = new GroupBy(tmCountPipe, new Fields("tm"));
        tmCountPipe = new Every(tmCountPipe, Fields.GROUP, new Count());

        // Per-second and per-minute sinks.
        Tap tsSinkTap = new Hfs(new TextLine(), arrivalRateSecPath);
        Tap tmSinkTap = new Hfs(new TextLine(), arrivalRateMinPath);

        // Map each tail pipe to its sink; pipe and tap lists must be in the
        // same order for tapsMap to pair them correctly.
        Map<String, Tap> sinks = Cascades.tapsMap(
                Pipe.pipes(tsCountPipe, tmCountPipe),
                Tap.taps(tsSinkTap, tmSinkTap));

        // Flow 2: read the parsed records and fan out into both count branches.
        Flow arrivalRateFlow = flowConnector.connect(parsedLogTap, sinks, tsCountPipe, tmCountPipe);

        // Link the flows; the cascade infers dependency order automatically
        // (arrivalRateFlow consumes importLogFlow's sink), so listing order
        // does not matter.
        Cascade cascade = cascadeConnector.connect(importLogFlow, arrivalRateFlow);

        // Run every flow in dependency order and block until completion.
        cascade.complete();
    }
}
解释操作方法:
Each:对每条记录进行处理
GroupBy:对记录按字段分组
Every:对每组中的记录做处理
Count:类似SQL中的聚合函数(如COUNT),统计每组的记录数

浙公网安备 33010602011771号