Flume 与 Hive 的简单结合

1. 日志格式 

#Software: Microsoft Internet Information Services 6.0
#Version: 1.0
#Date: 2014-01-03 00:00:34
#Fields: date time s-sitename s-ip cs-method cs-uri-stem cs-uri-query s-port cs-username c-ip cs(User-Agent) sc-status sc-substatus sc-win32-status 
2014-01-03 00:00:34 W3SVC1 2001:da8:7007:102::244 GET /skin6/film_sort.asp id=10 80 - 2001:da8:7007:f07:ac50:d2b:f22d:5dec Mozilla/5.0+(Windows+NT+6.1;+Trident/7.0;+rv:11.0)+like+Gecko 200 0 0
2014-01-03 00:00:34 W3SVC1 2001:da8:7007:102::244 GET /news.asp - 80 - 2001:da8:7007:f07:ac50:d2b:f22d:5dec Mozilla/5.0+(Windows+NT+6.1;+Trident/7.0;+rv:11.0)+like+Gecko 200 0 0
2014-01-03 00:00:34 W3SVC1 2001:da8:7007:102::244 GET /UploadFile/20131028231421.jpg - 80 - 2001:da8:7007:f07:ac50:d2b:f22d:5dec Mozilla/5.0+(Windows+NT+6.1;+Trident/7.0;+rv:11.0)+like+Gecko 200 0 0

2. 建立对应的 Hive 表:

-- exmovielog: external Hive table over the IIS access-log lines stored under /movielog.
--
-- Columns follow the order of the "#Fields:" header of the log sample, except that the
-- separate "date" and "time" fields are stored together in the single log_date column.
-- Fields within a line are separated by a backtick (`).
-- The table is EXTERNAL, so dropping it leaves the files under /movielog in place.
-- The column name user_agen (for cs(User-Agent)) is kept exactly as originally declared.
CREATE EXTERNAL TABLE IF NOT EXISTS exmovielog (
    log_date        TIMESTAMP,
    s_sitename      STRING,
    s_ip            STRING,
    cs_method       STRING,
    cs_uri_stem     STRING,
    cs_uri_query    STRING,
    s_port          INT,
    cs_username     STRING,
    c_ip            STRING,
    user_agen       STRING,
    sc_status       INT,
    sc_substatus    INT,
    sc_win32_status INT
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '`'
LOCATION '/movielog';

3. Flume 配置文件

  

# Components of agent "agent1": one source, one channel and one sink.
agent1.sources  = source1
agent1.sinks    = sink1
agent1.channels = channel1


# channel1: file-backed channel that buffers events on local disk
# between source1 and sink1.
agent1.channels.channel1.type = file
# Upper bound on the number of events held in the channel.
agent1.channels.channel1.capacity = 10000
# On-disk locations for the event data and for the checkpoint.
agent1.channels.channel1.dataDirs = /home/hadoop_admin/flumeTemp/fchannel/spool/data
agent1.channels.channel1.checkpointDir = /home/hadoop_admin/flumeTemp/fchannel/spool/checkpoint

# source1: spooling-directory source. It reads the log files placed in spoolDir
# and puts their lines onto channel1.
agent1.sources.source1.type = spooldir
agent1.sources.source1.channels = channel1
agent1.sources.source1.spoolDir = /home/hadoop_admin/movielog
# Input files are decoded as GBK.
agent1.sources.source1.inputCharset = GBK
# Add a header with the originating file name to every event.
agent1.sources.source1.fileHeader = true
# Delete each file once it has been completely ingested.
agent1.sources.source1.deletePolicy = immediate
agent1.sources.source1.batchSize = 1000

# Interceptor chain, applied in the order listed:
#   i1              - drop the '#' header lines of the IIS log
#   search-replace1 - temporarily join the leading "date time" pair as "dateTtime"
#   search-replace2 - turn every remaining whitespace separator into a backtick
#   search-replace3 - restore the leading "dateTtime" to "date time"
# The result is a backtick-delimited line whose first field is "yyyy-MM-dd HH:mm:ss".
# Backslashes in the patterns are doubled because this is a Java properties file.
agent1.sources.source1.interceptors = i1 search-replace1 search-replace2 search-replace3

# i1: keep only events whose line does not start with '#'.
# Matching events are kept because excludeEvents defaults to false.
agent1.sources.source1.interceptors.i1.type = regex_filter
agent1.sources.source1.interceptors.i1.regex = ^[^#].*$
# Alternative formulation: match the '#' lines and exclude them instead.
# agent1.sources.source1.interceptors.i1.excludeEvents = true
# agent1.sources.source1.interceptors.i1.regex = ^#

# search-replace1: protect the space between the leading date and time from the
# delimiter replacement below by swapping it for 'T'.
# Anchored with ^ so that only the timestamp at the start of the line is touched.
agent1.sources.source1.interceptors.search-replace1.type = search_replace
agent1.sources.source1.interceptors.search-replace1.searchPattern = ^(\\d\\d\\d\\d-\\d\\d-\\d\\d)\\s(\\d\\d:\\d\\d:\\d\\d)
agent1.sources.source1.interceptors.search-replace1.replaceString = $1T$2

# search-replace2: change the field separator from whitespace to a backtick.
agent1.sources.source1.interceptors.search-replace2.type = search_replace
agent1.sources.source1.interceptors.search-replace2.searchPattern = \\s
agent1.sources.source1.interceptors.search-replace2.replaceString = `

# search-replace3: undo search-replace1, putting the space back between date and time.
# Anchored with ^ so that an ISO-8601 timestamp appearing later in the line
# (for example inside the URI query) is left unchanged.
agent1.sources.source1.interceptors.search-replace3.type = search_replace
agent1.sources.source1.interceptors.search-replace3.searchPattern = ^(\\d\\d\\d\\d-\\d\\d-\\d\\d)T(\\d\\d:\\d\\d:\\d\\d)
agent1.sources.source1.interceptors.search-replace3.replaceString = $1 $2



# sink1: HDFS sink that drains channel1 into hdfs://master:9000/movielog
# as plain text files.
agent1.sinks.sink1.type = hdfs
agent1.sinks.sink1.channel = channel1
agent1.sinks.sink1.hdfs.path = hdfs://master:9000/movielog

# Write the raw event bodies as text (no SequenceFile wrapping, no compression).
agent1.sinks.sink1.hdfs.fileType = DataStream
agent1.sinks.sink1.hdfs.writeFormat = Text

# File rolling: only the size trigger is active (67108864 bytes = 64 MiB).
# The time-based and event-count triggers and the idle-file close are disabled with 0.
# NOTE(review): with these settings a file smaller than 64 MiB stays open (in-use
# suffix .tmp) until the agent stops - confirm that this is acceptable for readers
# of /movielog.
agent1.sinks.sink1.hdfs.rollSize = 67108864
agent1.sinks.sink1.hdfs.rollCount = 0
agent1.sinks.sink1.hdfs.rollInterval = 0
agent1.sinks.sink1.hdfs.idleTimeout = 0

# Events written per flush to HDFS, and the timeout (ms) for HDFS operations.
agent1.sinks.sink1.hdfs.batchSize = 1000
agent1.sinks.sink1.hdfs.callTimeout = 3000

 

posted @ 2016-05-03 11:03  Amei1314  阅读(1578)  评论(0)  编辑  收藏  举报