Using Flume + HBase to Collect and Store Log Data

1. In this solution the data is to be stored in HBase, so we use the HBase sink that ships with Flume; at the same time, to clean and transform the log data, we implement our own AsyncHbaseEventSerializer:

package com.ncc.dlut;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;

import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.conf.ComponentConfiguration;
import org.apache.flume.sink.hbase.AsyncHbaseEventSerializer;
import org.apache.flume.sink.hbase.SimpleRowKeyGenerator;
import org.hbase.async.AtomicIncrementRequest;
import org.hbase.async.PutRequest;

public class AsyncHbaseLTEEventSerializer implements AsyncHbaseEventSerializer {
    //table name
    private byte[] table;
    //column family
    private byte[] colFam;
    //the event currently being serialized
    private Event currentEvent;
    //column names, taken from the sink configuration
    private byte[][] columnNames;
    //requests that will be sent to HBase in a batch
    private final List<PutRequest> puts = new ArrayList<PutRequest>();
    private final List<AtomicIncrementRequest> incs = new ArrayList<AtomicIncrementRequest>();
    //row key of the current event
    private byte[] currentRowKey;
    private final byte[] eventCountCol = "eventCount".getBytes();
    
    @Override
    public void configure(Context context) {
        //read the column names from the sink configuration (agent.sinks.<sink>.serializer.columns)
        String cols = context.getString("columns");
        String[] names = cols.split(",");
        columnNames = new byte[names.length][];
        int i = 0;
        for(String name : names){
            columnNames[i++] = name.getBytes();
        }
    }

    @Override
    public void configure(ComponentConfiguration conf) {
        // not used by this serializer
    }

    @Override
    public void cleanUp() {
        table = null;
        colFam = null;
        currentEvent = null;
        columnNames = null;
        currentRowKey = null;
    }

    @Override
    public List<PutRequest> getActions() {
        // split the event body to obtain the value of each column
        String eventStr = new String(currentEvent.getBody());
        String[] cols = logTokenize(eventStr);
        puts.clear();
        // the timestamp field carried in the data; left-pad it to 13 digits
        // so that row keys for the same cell sort chronologically
        String time = cols[1];
        int n1 = 13 - time.length();
        StringBuilder sb = new StringBuilder(time);
        for(int i = 0; i < n1; i++){
            sb.insert(0, '0');
        }
        try {
            // generate a unique row key with Flume's built-in generator, prefixed with "<cid>-<padded time>"
            currentRowKey = SimpleRowKeyGenerator.getUUIDKey(cols[0] + "-" + sb.toString());
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
        // one PutRequest per column; guard against lines that carry more fields than configured columns
        int n = Math.min(cols.length, columnNames.length);
        for(int i = 0; i < n; i++){
            PutRequest putReq = new PutRequest(table, currentRowKey, colFam, columnNames[i], cols[i].getBytes());
            puts.add(putReq);
        }
        return puts;
    }

    @Override
    public List<AtomicIncrementRequest> getIncrements() {
        // increment the counter of events received, kept in row "totalEvents"
        incs.clear();
        incs.add(new AtomicIncrementRequest(table, "totalEvents".getBytes(), colFam, eventCountCol));
        return incs;
    }

    @Override
    //initialize the table name and column family
    public void initialize(byte[] table, byte[] cf) {
        this.table = table;
        this.colFam = cf;
    }

    @Override
    public void setEvent(Event event) {
        this.currentEvent = event;
    }
    
    // extract the column values from a log line; each line is expected to be a
    // comma-separated list of key:value pairs, and only the values are kept,
    // in the order in which the pairs appear
    public String[] logTokenize(String eventStr) {
        String[] s = eventStr.split("[:,]");
        int n = s.length;
        String[] columns = new String[n/2];
        for(int i = 0; 2*i + 1 < n; i++){
            columns[i] = s[2*i + 1];
        }
        return columns;
    }

}
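To make the parsing contract explicit, below is a small, hypothetical smoke test for logTokenize (not part of the serializer itself): it assumes each spooled log line is a comma-separated list of key:value pairs whose values correspond, in order, to the column list configured for the sink; all field values are invented purely for illustration.

package com.ncc.dlut;

// Hypothetical demo; the sample field values are made up for illustration only.
public class SerializerParseDemo {
    public static void main(String[] args) {
        // one sample line in the expected "key:value,key:value,..." form,
        // with keys matching the configured columns cid,time,pci,st,ed,ta,lng,lat
        String sampleLine = "cid:12345,time:1448950000123,pci:101,st:1448950000000,"
                + "ed:1448950060000,ta:3,lng:121.5,lat:38.9";

        AsyncHbaseLTEEventSerializer serializer = new AsyncHbaseLTEEventSerializer();
        String[] values = serializer.logTokenize(sampleLine);

        // prints only the values (12345, 1448950000123, 101, ...), one per configured column
        for (String v : values) {
            System.out.println(v);
        }
    }
}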

 

The required jar files are listed below; judging from the imports, they include at least the Flume core/SDK/configuration jars, the flume-ng-hbase-sink jar, and the asynchbase jar.

All of these jars can be found in Flume's lib directory.

2. Package the program above into a jar and place it in Flume's lib directory (a sketch of this step follows).
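A minimal sketch, assuming the class is compiled against the jars already in Flume's lib directory and the output jar is named flume-hbase-serializer.jar (the jar name and paths are placeholders):

javac -cp "$FLUME_HOME/lib/*" com/ncc/dlut/AsyncHbaseLTEEventSerializer.java
jar cf flume-hbase-serializer.jar com/ncc/dlut/*.class
cp flume-hbase-serializer.jar "$FLUME_HOME/lib/"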

3. Configure Flume to perform the collection and storage.

The configuration file flume-hbase.properties is as follows:

 

############################################
#  flume-src-agent config
###########################################

#agent section
agent.sources = s
agent.channels = c
agent.sinks = r

#source section
#agent.sources.s.type = exec
#agent.sources.s.command = tail -f -n+1 /usr/local/test.log

agent.sources.s.type = spooldir
agent.sources.s.spoolDir = /usr/local/flume-hbase
agent.sources.s.fileHeader = true
agent.sources.s.batchSize = 100
agent.sources.s.channels = c


# Each sink's type must be defined
agent.sinks.r.type = asynchbase
agent.sinks.r.table = car_table
agent.sinks.r.columnFamily = lte
agent.sinks.r.batchSize = 100
agent.sinks.r.serializer = com.ncc.dlut.AsyncHbaseLTEEventSerializer
agent.sinks.r.serializer.columns = cid,time,pci,st,ed,ta,lng,lat

#Specify the channel the sink should use
agent.sinks.r.channel = c

# Each channel's type is defined.
agent.channels.c.type = memory
agent.channels.c.capacity = 1000
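
Two operational notes that the post does not spell out: the asynchbase sink requires the target table and column family to already exist in HBase (it will not create them), and the agent name passed on the command line must match the "agent" prefix used in the properties file. A minimal sketch of the remaining steps, assuming flume-hbase.properties has been copied into Flume's conf directory:

# in the HBase shell: create the table and column family referenced above
create 'car_table', 'lte'

# start the Flume agent
flume-ng agent --conf $FLUME_HOME/conf --conf-file $FLUME_HOME/conf/flume-hbase.properties --name agent -Dflume.root.logger=INFO,console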

 

Reference:

Big Data Technology Applications (1): Using Flume + HBase to Collect and Store Log Data

 
