flume

flume简介

Flume是一个分布式、可靠、高可用的海量日志聚合系统,支持在系统中定制各类数据发送方,用于收集数据;同时,Flume提供对数据进行简单处理,并写到各种数据接收方(可定制)的能力。

flume架构:

flume部署

 安装flume:

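root@s100:~/Downloads# tar  xf  apache-flume-1.8.0-bin.tar.gz  -C  /soft/
#解压得到的目录名为apache-flume-1.8.0-bin,创建软链接/soft/flume,使其与下文的FLUME_HOME保持一致
root@s100:~/Downloads# ln  -s  /soft/apache-flume-1.8.0-bin  /soft/flume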
#修改环境变量
root@s100:/soft# vim  /etc/environment 

JAVA_HOME=/soft/jdk
HADOOP_HOME=/soft/hadoop
ZOOKEEPER_HOME=/soft/zk
HIVE_HOME=/soft/hive
FLUME_HOME=/soft/flume
PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/soft/jdk/bin:/soft/hadoop/bin:/soft/hadoop/sbin:/soft/zk/bin:/soft/hive/bin:/soft/flume/bin"
root@s100:/soft# source  /etc/environment

修改flume环境变量的配置文件,配置jdk路径

root@s100:/soft/flume/conf# cp  flume-env.sh.template flume-env.sh
root@s100:/soft/flume/conf# vim flume-env.sh
export JAVA_HOME=/soft/jdk

根据数据采集的需求配置采集方案,采集方案参考官方配置文档:http://flume.apache.org/FlumeUserGuide.html

  先在flume的conf目录下新建一个文件,这个文件作为flume的采集方案:

root@s100:/soft/flume/conf# vim  netcat-logger.conf

# 定义这个agent中各组件的名字
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# 描述和配置source组件:r1
a1.sources.r1.type = netcat
a1.sources.r1.bind = 192.168.1.100
a1.sources.r1.port = 44444

# 描述和配置sink组件:k1
a1.sinks.k1.type = logger

# 描述和配置channel组件,此处使用的是内存缓存的方式
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100

# 描述和配置source  channel   sink之间的连接关系
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1

指定采集方案配置文件,在相应的节点上启动flume agent

root@s100:~# flume-ng  agent -c /soft/flume/conf -f  /soft/flume/conf/netcat-logger.conf -n  a1 -Dflume.root.logger=INFO,console 
#-c conf   指定flume自身的配置文件所在目录
#-f conf/netcat-logger.conf  指定我们所描述的采集方案
#-n a1  指定我们这个agent的名字

#客户端通过telnet连接flume agent
root@s102:~/hadoop/dfs# telnet  s100 44444
Escape character is '^]'.
how are  you
#这时候flume agent采集到的event数据如下:
.NetcatSource.start(NetcatSource.java:166)] Created serverSocket:sun.nio.ch.ServerSocketChannelImpl[/192.168.1.100:44444]
2018-05-12 14:22:34,641 (SinkRunner-PollingRunner-DefaultSinkProcessor) [INFO - org.apache.flume.sink.LoggerSink.process(LoggerSink.java:95)] Event: { headers:{} body: 68 6F 77 20 61 72 65 20 20 79 6F 75 0D          how are  you. }

采集案例

采集案例1

  采集需求:某服务器的某特定目录下,会不断产生新的文件,每当有新文件出现,就需要把文件采集到HDFS中去

  创建采集方案文件flume_dir.conf

root@s100:/soft/flume/conf# vim   flume_dir.conf 

agent1.sources = source1
agent1.sinks = sink1
agent1.channels = channel1

# 配置source组件
agent1.sources.source1.type = spooldir
agent1.sources.source1.spoolDir = /root/logs/
agent1.sources.source1.fileHeader = false
# 配置sink组件
agent1.sinks.sink1.type = hdfs
agent1.sinks.sink1.hdfs.path =/flume-dir/%y-%m-%d/%H-%M
agent1.sinks.sink1.hdfs.filePrefix = access_log
agent1.sinks.sink1.hdfs.maxOpenFiles = 5000
agent1.sinks.sink1.hdfs.batchSize= 100
agent1.sinks.sink1.hdfs.fileType = DataStream
agent1.sinks.sink1.hdfs.writeFormat =Text
agent1.sinks.sink1.hdfs.rollSize = 102400
agent1.sinks.sink1.hdfs.rollCount = 1000000
agent1.sinks.sink1.hdfs.rollInterval = 60
agent1.sinks.sink1.hdfs.round = true
agent1.sinks.sink1.hdfs.roundValue = 10
agent1.sinks.sink1.hdfs.roundUnit = minute
agent1.sinks.sink1.hdfs.useLocalTimeStamp = true
# Use a channel which buffers events in memory
agent1.channels.channel1.type = memory
agent1.channels.channel1.keep-alive = 120
agent1.channels.channel1.capacity = 500000
agent1.channels.channel1.transactionCapacity = 600

# Bind the source and sink to the channel
agent1.sources.source1.channels = channel1
agent1.sinks.sink1.channel = channel1

  启动flume,测试向/root/logs中拷贝数据,查看hadoop服务目录的变化

#启动flume采集数据
root@s100:/soft/flume/conf# flume-ng  agent -c /soft/flume/conf -f  /soft/flume/conf/flume_dir.conf -n  agent1
#向/root/logs中拷贝文件
root@s100:~# cp  helloworld.txt logs/

 

采集案例2:

  采集需求:比如业务系统使用log4j生成的日志,日志内容不断增加,需要把追加到日志文件中的数据实时采集到hdfs

  创建采集方案文件flume_logs.conf

agent1.sources = source1
agent1.sinks = sink1
agent1.channels = channel1

# Describe/configure tail -F source1
agent1.sources.source1.type = exec
agent1.sources.source1.command = tail -F /root/logs/test_log
agent1.sources.source1.channels = channel1

#configure host for source
agent1.sources.source1.interceptors = i1
agent1.sources.source1.interceptors.i1.type = host
agent1.sources.source1.interceptors.i1.hostHeader = hostname

# Describe sink1
agent1.sinks.sink1.type = hdfs
agent1.sinks.sink1.hdfs.path = /weblog/flume-collection/%y-%m-%d/%H-%M
agent1.sinks.sink1.hdfs.filePrefix = access_log
agent1.sinks.sink1.hdfs.maxOpenFiles = 5000
agent1.sinks.sink1.hdfs.batchSize= 100
agent1.sinks.sink1.hdfs.fileType = DataStream
agent1.sinks.sink1.hdfs.writeFormat =Text
agent1.sinks.sink1.hdfs.rollSize = 102400
agent1.sinks.sink1.hdfs.rollCount = 1000000
agent1.sinks.sink1.hdfs.rollInterval = 60
agent1.sinks.sink1.hdfs.round = true
agent1.sinks.sink1.hdfs.roundValue = 10
agent1.sinks.sink1.hdfs.roundUnit = minute
agent1.sinks.sink1.hdfs.useLocalTimeStamp = true

# Use a channel which buffers events in memory
agent1.channels.channel1.type = memory
agent1.channels.channel1.keep-alive = 120
agent1.channels.channel1.capacity = 500000
agent1.channels.channel1.transactionCapacity = 600

# Bind the source and sink to the channel
agent1.sources.source1.channels = channel1
agent1.sinks.sink1.channel = channel1

  启动flume,测试向/root/logs/test_log文件中写入数据,查看hadoop服务目录的变化

root@s100:/soft/flume/conf# flume-ng  agent -c /soft/flume/conf -f  /soft/flume/conf/flume_logs.conf -n  agent1 
#向test_log文件中不停地写入数据
root@s100:/soft/flume/conf# while  true
> do 
> echo  1111111 >> /root/logs/test_log
> sleep 0.5
> done

 采集案例3:

  采集需求:比如业务系统使用log4j生成的日志,日志内容不断增加,需要把追加到日志文件中的数据实时采集到kafka中

  创建采集方案文件flume_kafka

root@s100:/soft/flume/conf# vim  flume_kafka 

a1.sources = r1
a1.channels = c1
a1.sinks = k1
a1.sources.r1.type = exec
a1.sources.r1.command = tail -F /root/logs/tokafka.log
a1.sources.r1.channels = c1

a1.channels.c1.type=memory
a1.channels.c1.capacity=10000
a1.channels.c1.transactionCapacity=100

a1.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
a1.sinks.k1.topic = orderMq
a1.sinks.k1.brokerList = s100:9092
a1.sinks.k1.requiredAcks = 1
a1.sinks.k1.batchSize = 20
a1.sinks.k1.channel = c1

  启动flume客户端

root@s100:/soft/flume/conf# flume-ng  agent -c /soft/flume/conf -f  /soft/flume/conf/flume_kafka -n a1

  启动kafka consumer

root@s102:~# kafka-console-consumer.sh  --zookeeper  s101:2181   --topic  orderMq 

  向日志文件中循环写入数据

root@s100:~# for((i=0;i<=500000;i++));do echo "message-"+$i >> /root/logs/tokafka.log;done

  在kafka consumer端接收的消息效果如下:

root@s102:~# kafka-console-consumer.sh  --zookeeper  s101:2181   --topic  orderMq 
message-+147058
message-+147062
message-+147066
message-+147070
message-+147074
message-+147078
message-+147082
message-+147086
message-+147090
message-+147094
..........
posted @ 2018-05-12 12:40  goser  阅读(175)  评论(0)    收藏  举报