6. Flume Configuration and Usage
1. Download
Official site: https://flume.apache.org/download.html
Huawei Cloud mirror: https://repo.huaweicloud.com/apache/flume/
2. Installation
# Extract into /usr/local
sudo tar -zxf ~/下载/apache-flume-1.9.0-bin.tar.gz -C /usr/local
cd /usr/local/
# Rename the directory to flume
sudo mv ./apache-flume-1.9.0-bin ./flume
# Give the hadoop user ownership of the directory
sudo chown -R hadoop ./flume
3. Configure Environment Variables
vim ~/.bashrc
#Flume
export FLUME_HOME=/usr/local/flume
export PATH=$FLUME_HOME/bin:$PATH
source ~/.bashrc
4. Verification
Check the Flume version:
flume-ng version
5. Testing Flume
- Monitor a directory and print the incoming data to the console.
- Name the configuration file spoolingtest.conf; it can be placed in /home/hadoop/flume_test/.
- Configuration file
# First, name the agent a1
# Name the source, channel, and sink respectively
a1.sources = r1
a1.channels = c1
a1.sinks = k1
# Configure the source, channel, and sink
# Configure the source
# Set the source type to spooldir, which watches a directory for new files
# Since different components can share property names, each setting is written as
# <agent name>.sources.<component name>.<property> = <value>
a1.sources.r1.type = spooldir
a1.sources.r1.spoolDir = /home/hadoop/flume_test/data_test
a1.sources.r1.fileSuffix = .ok
a1.sources.r1.fileHeader = true
# Attach an interceptor named i1 to the source r1
a1.sources.r1.interceptors = i1
# A timestamp interceptor would insert the processing time (in milliseconds) into the event header
# a1.sources.r1.interceptors.i1.type = timestamp
# Set the interceptor type to regex_filter, which filters events by regular expression
a1.sources.r1.interceptors.i1.type = regex_filter
# Configure the regular expression
a1.sources.r1.interceptors.i1.regex = \\d{3,6}
# excludeEvents = true drops matching events and lets non-matching events through
a1.sources.r1.interceptors.i1.excludeEvents = true
# Configure the sink
# A logger sink prints the collected data directly to the console
a1.sinks.k1.type = logger
# Configure the channel
# A memory channel buffers events in memory
a1.channels.c1.type = memory
# Assemble the agent
# Point the source's channels property at c1
a1.sources.r1.channels = c1
# Point the sink's channel property at c1
a1.sinks.k1.channel = c1
- Start the agent
flume-ng agent -n a1 -f ./spoolingtest.conf -Dflume.root.logger=DEBUG,console
- Create the /home/hadoop/flume_test/data_test directory
mkdir /home/hadoop/flume_test/data_test
- Create a file in /home/hadoop/flume_test/data_test, add some content, and watch the log output of the Flume process (see the sketch below)
# Add some content to test.txt
vim /home/hadoop/flume_test/data_test/test.txt
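A quick way to exercise the regex_filter interceptor is to drop a complete new file into the spool directory rather than editing one in place, since the spooldir source ingests whole files. A minimal sketch, assuming the agent above is already running (the file name new_data.txt is only an example):
# A line of 3-6 digits matches \d{3,6} and is dropped (excludeEvents = true);
# the other line passes through and is printed by the logger sink
echo "12345" > /home/hadoop/flume_test/data_test/new_data.txt
echo "hello flume" >> /home/hadoop/flume_test/data_test/new_data.txt
# After processing, the file is renamed with the configured .ok suffix
ls /home/hadoop/flume_test/data_test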
6. Teaching Content
1. Test
a1.sources = s1
a1.sinks = k1
a1.channels = c1
# Configure the source
a1.sources.s1.type = spooldir
a1.sources.s1.spoolDir = /hadoop_class/Flume_Class/flumetest
# Configure the channel
a1.channels.c1.type = memory
# Configure the sink
a1.sinks.k1.type = logger
# Bind the source to its channel
a1.sources.s1.channels = c1
# Bind the sink to its channel
a1.sinks.k1.channel = c1
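A minimal sketch of how this test could be run, assuming the configuration above is saved as test1.conf in the current directory (the file name is an assumption):
# The spooldir source requires the watched directory to exist
mkdir -p /hadoop_class/Flume_Class/flumetest
# Start the agent; new files placed in the directory are printed by the logger sink
flume-ng agent -n a1 -f ./test1.conf -Dflume.root.logger=INFO,console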
2. Recording logs to HDFS
First, the log-generating script time_append.sh:
#!/bin/bash
# Append the current time to the log file once per second
while true
do
echo $(date)
echo $(date) >> /home/hadoop/hadoop_flume/out
sleep 1
done
a1.sources = s1
a1.sinks = k1
a1.channels = c1
# Configure the source
a1.sources.s1.type = exec
# tail -F vs -f: -F follows the file name while -f follows the file descriptor. When the .out
# file fills up it is rotated to .out.1 and a new .out file is created, so -F is needed here.
a1.sources.s1.command = tail -F /home/hadoop/hadoop_flume/out
# Configure the channel
a1.channels.c1.type = memory
# Configure the sink
a1.sinks.k1.type = hdfs
a1.sinks.k1.hdfs.path = /hadoop_flume/%Y-%m-%d/%H%M
# Directory rolling: a directory is created per time bucket, so after a minute a new one is
# needed; enable rounding of the timestamp used in the path
a1.sinks.k1.hdfs.round = true
a1.sinks.k1.hdfs.roundValue = 1
a1.sinks.k1.hdfs.roundUnit = minute
a1.sinks.k1.hdfs.useLocalTimeStamp = true
a1.sinks.k1.hdfs.filePrefix = yefei
a1.sinks.k1.hdfs.fileSuffix = log
# File rolling (the file rolls as soon as any one of the three conditions below is met)
# Roll after this many seconds; 0 disables time-based rolling (it does not mean roll every 0 seconds)
a1.sinks.k1.hdfs.rollInterval = 10
# Roll when the file reaches this size (in bytes)
a1.sinks.k1.hdfs.rollSize = 1024
# Roll after this many events have been written
a1.sinks.k1.hdfs.rollCount = 10
# DataStream means uncompressed plain text
a1.sinks.k1.hdfs.fileType = DataStream
# Bind the source to its channel
a1.sources.s1.channels = c1
# Bind the sink to its channel
a1.sinks.k1.channel = c1
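One possible way to run this example, assuming the configuration above is saved as exec_to_hdfs.conf and time_append.sh sits in the current directory (both file names are assumptions):
# Generate the log file that tail -F follows
mkdir -p /home/hadoop/hadoop_flume
bash time_append.sh &
# Start the agent; output lands in HDFS under /hadoop_flume/<date>/<hour-minute>/
flume-ng agent -n a1 -f ./exec_to_hdfs.conf -Dflume.root.logger=INFO,console
# In another terminal, inspect the rolled files
hdfs dfs -ls -R /hadoop_flume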
3. Recording logs to HDFS (variant)
The same time_append.sh script and agent configuration as in the previous example, with only the tailed file and the HDFS output path changed:
# Tail the log file under the course directory
a1.sources.s1.command = tail -F /home/yefei/hadoop_class/Flume_Class/out
# Write the rolled files under /hadoop_class/flume_class in HDFS
a1.sinks.k1.hdfs.path = /hadoop_class/flume_class/%Y-%m-%d/%H%M
4. Flume to HBase
a1.sources = r1
a1.sinks = k1
a1.channels = c1
#Describe/configure the source
a1.sources.r1.type = exec
a1.sources.r1.command = tail -F /home/hadoop/hadoop_flume/flumetest3/data.txt
#Describe the sink
a1.sinks.k1.type = hbase
a1.sinks.k1.table = flume_hbase_student
a1.sinks.k1.columnFamily = info
#a1.sinks.k1.serializer = org.apache.flume.sink.hbase.RegexHbaseEventSerializer
#a1.sinks.k1.serializer.regex=(.*?)\\s(.*?)\\s(\\d{1,3})
#a1.sinks.k1.serializer.colNames=name,sex,age
#Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
#Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
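The HBase sink writes into an existing table, so flume_hbase_student with column family info (the names used in the configuration above) must be created before starting the agent. A minimal sketch using the HBase shell:
# Create the target table with the column family the sink writes to
echo "create 'flume_hbase_student','info'" | hbase shell
# After the agent has processed some data, verify that rows arrived
echo "scan 'flume_hbase_student', {LIMIT => 10}" | hbase shell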
5. Flume to Hive
# Define the agent name and the names of the source, channel, and sink
agent3.sources = source3
agent3.channels = channel3
agent3.sinks = sink3
# Define the source
agent3.sources.source3.type = spooldir
agent3.sources.source3.spoolDir = /home/hadoop/hadoop_flume/flumetest4/msg
agent3.sources.source3.fileHeader=false
# Define an interceptor that adds a timestamp to each event
agent3.sources.source3.interceptors = i1
agent3.sources.source3.interceptors.i1.type=timestamp
# HiveServer2 and the Hive metastore service must be running
# Use a file channel (events are buffered on disk)
agent3.channels.channel3.type = file
# Path of the file channel's checkpoint files
agent3.channels.channel3.checkpointDir=/home/hadoop/hadoop_flume/flumetest4/temp/point
# Path of the file channel's data files
agent3.channels.channel3.dataDirs=/home/hadoop/hadoop_flume/flumetest4/temp
# Define the sink
agent3.sinks.sink3.type = hive
# Hive metastore thrift URI (the metastore's default port is 9083, not the HDFS port 9000)
agent3.sinks.sink3.hive.metastore = thrift://localhost:9083
agent3.sinks.sink3.hive.database = flume_hive
agent3.sinks.sink3.hive.table = flume_hive
agent3.sinks.sink3.hive.partition = %y-%m-%d-%H-%M
agent3.sinks.sink3.useLocalTimeStamp = false
agent3.sinks.sink3.round = true
agent3.sinks.sink3.roundValue = 10
agent3.sinks.sink3.roundUnit = minute
agent3.sinks.sink3.serializer = DELIMITED
agent3.sinks.sink3.serializer.delimiter = ","
agent3.sinks.sink3.serializer.serdeSeparator = ','
agent3.sinks.sink3.serializer.fieldnames = nid,name,phone
agent3.sinks.sink3.batchSize = 90
# Assemble the source, channel, and sink
agent3.sources.source3.channels = channel3
agent3.sinks.sink3.channel = channel3
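The Hive sink streams into an existing transactional table, so flume_hive.flume_hive has to be created up front as a bucketed ORC table with transactions enabled. A sketch of the DDL run via the hive CLI, where the partition column name time_tag and the bucket count are assumptions; only the database, table, and field names come from the configuration above:
hive -e "
CREATE DATABASE IF NOT EXISTS flume_hive;
CREATE TABLE IF NOT EXISTS flume_hive.flume_hive (nid STRING, name STRING, phone STRING)
PARTITIONED BY (time_tag STRING)
CLUSTERED BY (nid) INTO 2 BUCKETS
STORED AS ORC
TBLPROPERTIES ('transactional'='true');
"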
6. Flume to HDFS (example)
1. Write the Flume configuration file and create the corresponding directories.
Configuration file:
a1.sources = r1
a1.sinks = k1
a1.channels = c1
#Describe/configure the source
a1.sources.r1.type = spooldir
a1.sources.r1.spoolDir = /home/hadoop/Student_class/student_flume/msg
#Use a channel which buffers events in memory
a1.channels.c1.type = file
a1.channels.c1.checkpointDir = /home/hadoop/Student_class/student_flume/temp/point
a1.channels.c1.dataDirs = /home/hadoop/Student_class/student_flume/temp
#Describe the sink
a1.sinks.k1.type = hdfs
a1.sinks.k1.hdfs.path = hdfs://localhost:9000/student_class/student
a1.sinks.k1.hdfs.filePrefix = fangqiujian_
a1.sinks.k1.hdfs.fileSuffix = .csv
a1.sinks.k1.hdfs.fileType = DataStream
a1.sinks.k1.hdfs.writeFormat = Text
a1.sinks.k1.hdfs.rollInterval = 3600
a1.sinks.k1.hdfs.rollSize = 134217728
a1.sinks.k1.hdfs.rollCount = 0
#Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
# Run command
# flume-ng agent --conf conf --conf-file student.conf --name a1 -Dflume.root.logger=INFO,console
2. Create the directories Flume needs.
3. Put the files that should be uploaded to HDFS in a dedicated directory.
4. Open a terminal where the .conf file is located and run the Flume command.
Command:
flume-ng agent --conf conf --conf-file student.conf --name a1 -Dflume.root.logger=INFO,console
5. Check in HDFS whether the upload succeeded (see the sketch below).
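A sketch of the verification step, using the HDFS path from the configuration above:
# List the files Flume rolled into HDFS and peek at one of them
hdfs dfs -ls /student_class/student
hdfs dfs -cat /student_class/student/fangqiujian_*.csv | head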
7. Garbled files when Flume collects data to HDFS
hdfs.fileType controls the type of file generated; the default is SequenceFile.
DataStream produces plain text instead.
Solution
Add one line to the HDFS sink:
a1.sinks.k1.hdfs.fileType = DataStream
8. Practical Usage
1. spoolingToHDFS.conf
- Configuration file
# Name the agent a
# Name the source component r1
a.sources = r1
# Name the sink component k1
a.sinks = k1
# Name the channel component c1
a.channels = c1
# Configure the spooldir source
a.sources.r1.type = spooldir
a.sources.r1.spoolDir = /root/data
a.sources.r1.fileHeader = true
a.sources.r1.interceptors = i1
a.sources.r1.interceptors.i1.type = timestamp
# Specify the sink type
a.sinks.k1.type = hdfs
a.sinks.k1.hdfs.path = /flume/data/dir1
# File name prefix
a.sinks.k1.hdfs.filePrefix = student
# Roll the file after this much data has been written (bytes)
a.sinks.k1.hdfs.rollSize = 102400
# Roll the file after this many events
a.sinks.k1.hdfs.rollCount = 1000
# DataStream file type: write events through as-is
a.sinks.k1.hdfs.fileType = DataStream
# Output format: text
a.sinks.k1.hdfs.writeFormat = text
# File name suffix
a.sinks.k1.hdfs.fileSuffix = .txt
# Configure the channel
a.channels.c1.type = memory
a.channels.c1.capacity = 1000
# How many events the sink takes from the channel per transaction
a.channels.c1.transactionCapacity = 100
# Assemble
a.sources.r1.channels = c1
a.sinks.k1.channel = c1
- Prepare some data in the /root/data/ directory
The Zen of Python, by Tim Peters
Beautiful is better than ugly.
Explicit is better than implicit.
Simple is better than complex.
Complex is better than complicated.
Flat is better than nested.
Sparse is better than dense.
Readability counts.
Special cases aren't special enough to break the rules.
Although practicality beats purity.
Errors should never pass silently.
Unless explicitly silenced.
In the face of ambiguity, refuse the temptation to guess.
There should be one-- and preferably only one --obvious way to do it.
Although that way may not be obvious at first unless you're Dutch.
Now is better than never.
Although never is often better than *right* now.
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do more of those!
- Start the agent
flume-ng agent -n a -f ./spoolingToHDFS.conf -Dflume.root.logger=DEBUG,console
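To confirm the sink wrote the data, listing the target directory from the configuration above is enough (a sketch):
# Rolled files carry the student prefix and .txt suffix set in the sink
hdfs dfs -ls /flume/data/dir1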
2.hbaseLogToHDFS
- Configuration file
# Name the agent a
# Name the source component r1
a.sources = r1
# Name the sink component k1
a.sinks = k1
# Name the channel component c1
a.channels = c1
# Configure the exec source
a.sources.r1.type = exec
a.sources.r1.command = tail -f /usr/local/soft/hbase-1.4.6/logs/hbase-root-master-master.log
# Specify the sink type
a.sinks.k1.type = hdfs
a.sinks.k1.hdfs.path = /flume/data/dir2
# File name prefix
a.sinks.k1.hdfs.filePrefix = hbaselog
# Roll the file after this much data has been written (bytes)
a.sinks.k1.hdfs.rollSize = 102400
# Roll the file after this many events
a.sinks.k1.hdfs.rollCount = 1000
# DataStream file type: write events through as-is
a.sinks.k1.hdfs.fileType = DataStream
# Output format: text
a.sinks.k1.hdfs.writeFormat = text
# File name suffix
a.sinks.k1.hdfs.fileSuffix = .txt
# Configure the channel
a.channels.c1.type = memory
a.channels.c1.capacity = 1000
# How many events the sink takes from the channel per transaction
a.channels.c1.transactionCapacity = 100
# Assemble
a.sources.r1.channels = c1
a.sinks.k1.channel = c1
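The notes do not include a startup command for this agent; assuming the configuration is saved as hbaseLogToHDFS.conf (the file name matches the section title but is an assumption), it can be started like the previous example:
flume-ng agent -n a -f ./hbaseLogToHDFS.conf -Dflume.root.logger=DEBUG,console
# Then check the output directory
hdfs dfs -ls /flume/data/dir2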
3.hbaselogToHBase
- Create the log table in HBase
create 'log','cf1'
- Configuration file
# Name the agent a
# Name the source component r1
a.sources = r1
# Name the sink component k1
a.sinks = k1
# Name the channel component c1
a.channels = c1
# Configure the exec source
a.sources.r1.type = exec
a.sources.r1.command = cat /usr/local/soft/hbase-1.4.6/logs/hbase-root-master-master.log
# Specify the sink type
a.sinks.k1.type = hbase
a.sinks.k1.table = log
a.sinks.k1.columnFamily = cf1
# Configure the channel
a.channels.c1.type = memory
a.channels.c1.capacity = 100000
# How many events the sink takes from the channel per transaction
a.channels.c1.transactionCapacity = 100
# Assemble
a.sources.r1.channels = c1
a.sinks.k1.channel = c1
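Assuming the configuration is saved as hbaselogToHBase.conf (an assumed file name), start the agent and then scan the table to see the ingested log lines:
flume-ng agent -n a -f ./hbaselogToHBase.conf -Dflume.root.logger=DEBUG,console
# In another terminal, verify the rows written into the log table
echo "scan 'log', {LIMIT => 10}" | hbase shell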
4.netcatLogger
Listen on a port and send data to it with telnet
- Install telnet
yum install telnet
- Configuration file
# Name the agent a
# Name the source component r1
a.sources = r1
# Name the sink component k1
a.sinks = k1
# Name the channel component c1
a.channels = c1
# Configure the netcat source
a.sources.r1.type = netcat
a.sources.r1.bind = 0.0.0.0
a.sources.r1.port = 8888
# Specify the sink type
a.sinks.k1.type = logger
# Configure the channel
a.channels.c1.type = memory
a.channels.c1.capacity = 1000
# How many events the sink takes from the channel per transaction
a.channels.c1.transactionCapacity = 100
# Assemble
a.sources.r1.channels = c1
a.sinks.k1.channel = c1
- Startup
- Start the agent first:
flume-ng agent -n a -f ./netcatToLogger.conf -Dflume.root.logger=DEBUG,console
- Then connect with telnet:
telnet master 8888
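If telnet is not installed, nc can drive the same test (a sketch; each line sent becomes one event printed by the logger sink):
echo "hello flume" | nc master 8888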
5.httpToLogger
- Configuration file
# Name the agent a
# Name the source component r1
a.sources = r1
# Name the sink component k1
a.sinks = k1
# Name the channel component c1
a.channels = c1
# Configure the http source
a.sources.r1.type = http
a.sources.r1.port = 6666
# Specify the sink type
a.sinks.k1.type = logger
# Configure the channel
a.channels.c1.type = memory
a.channels.c1.capacity = 1000
# How many events the sink takes from the channel per transaction
a.channels.c1.transactionCapacity = 100
# Assemble
a.sources.r1.channels = c1
a.sinks.k1.channel = c1
- Startup
- Start the agent first:
flume-ng agent -n a -f ./httpToLogger.conf -Dflume.root.logger=DEBUG,console
- Then send an HTTP request with curl:
curl -X POST -d '[{ "headers" :{"a" : "a1","b" : "b1"},"body" : "hello~http~flume~"}]' http://master:6666
