6. Flume Configuration and Usage
1. Download
Official site: https://flume.apache.org/download.html
Huawei Cloud mirror: https://repo.huaweicloud.com/apache/flume/
2. Installation
# Extract into /usr/local
sudo tar -zxf ~/下载/apache-flume-1.9.0-bin.tar.gz -C /usr/local
cd /usr/local/
# Rename the directory to flume
sudo mv ./apache-flume-1.9.0-bin ./flume
# Give the hadoop user ownership of the directory
sudo chown -R hadoop ./flume
3. Configure Environment Variables
vim ~/.bashrc
#Flume
export FLUME_HOME=/usr/local/flume
export PATH=$FLUME_HOME/bin:$PATH
source ~/.bashrc
4. Verification
Check the Flume version:
flume-ng version
5. Testing Flume
- Monitor a directory and print the incoming data to the console.
- Name the configuration file spoolingtest.conf; it can be placed in /home/hadoop/flume_test/.
- Configuration file
# First, name the agent a1
# Name the source, channel, and sink respectively
a1.sources = r1
a1.channels = c1
a1.sinks = k1
# Configure the source, channel, and sink
# Configure the source
# Set the source type to spooldir, which watches a directory for new files
# Since different components can share property names, each setting is written as
# <agent name>.sources.<component name>.<property> = <value>
a1.sources.r1.type = spooldir
a1.sources.r1.spoolDir = /home/hadoop/flume_test/data_test
a1.sources.r1.fileSuffix = .ok
a1.sources.r1.fileHeader = true
# Attach an interceptor named i1 to the source r1
a1.sources.r1.interceptors = i1
# A timestamp interceptor would insert the processing time (in milliseconds) into the event header
# a1.sources.r1.interceptors.i1.type = timestamp
# Set the interceptor type to regex_filter, which filters events by regular expression
a1.sources.r1.interceptors.i1.type = regex_filter
# Configure the regular expression
a1.sources.r1.interceptors.i1.regex = \\d{3,6}
# excludeEvents = true drops matching events and lets non-matching events through
a1.sources.r1.interceptors.i1.excludeEvents = true
# Configure the sink
# A logger sink prints the collected data directly to the console
a1.sinks.k1.type = logger
# Configure the channel
# A memory channel buffers events in memory
a1.channels.c1.type = memory
# Assemble the agent
# Point the source's channels property at c1
a1.sources.r1.channels = c1
# Point the sink's channel property at c1
a1.sinks.k1.channel = c1
- Start the agent
flume-ng agent -n a1 -f ./spoolingtest.conf -Dflume.root.logger=DEBUG,console
- Create the /home/hadoop/flume_test/data_test directory
mkdir /home/hadoop/flume_test/data_test
- Create a file in /home/hadoop/flume_test/data_test, add some content, and watch the log output of the Flume process (see the sketch below)
# Add some content to test.txt
vim /home/hadoop/flume_test/data_test/test.txt
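A quick way to exercise the regex_filter interceptor is to drop a complete new file into the spool directory rather than editing one in place, since the spooldir source ingests whole files. A minimal sketch, assuming the agent above is already running (the file name new_data.txt is only an example):
# A line of 3-6 digits matches \d{3,6} and is dropped (excludeEvents = true);
# the other line passes through and is printed by the logger sink
echo "12345" > /home/hadoop/flume_test/data_test/new_data.txt
echo "hello flume" >> /home/hadoop/flume_test/data_test/new_data.txt
# After processing, the file is renamed with the configured .ok suffix
ls /home/hadoop/flume_test/data_test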
6. Teaching Content
1. Test
a1.sources = s1
a1.sinks = k1
a1.channels = c1
# Configure the source
a1.sources.s1.type = spooldir
a1.sources.s1.spoolDir = /hadoop_class/Flume_Class/flumetest
# Configure the channel
a1.channels.c1.type = memory
# Configure the sink
a1.sinks.k1.type = logger
# Bind the source to its channel
a1.sources.s1.channels = c1
# Bind the sink to its channel
a1.sinks.k1.channel = c1
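A minimal sketch of how this test could be run, assuming the configuration above is saved as test1.conf in the current directory (the file name is an assumption):
# The spooldir source requires the watched directory to exist
mkdir -p /hadoop_class/Flume_Class/flumetest
# Start the agent; new files placed in the directory are printed by the logger sink
flume-ng agent -n a1 -f ./test1.conf -Dflume.root.logger=INFO,console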
2. Recording logs to HDFS
First, the log-generating script time_append.sh:
#!/bin/bash
# Append the current time to the log file once per second
while true
do
echo $(date)
echo $(date) >> /home/hadoop/hadoop_flume/out
sleep 1
done
a1.sources = s1
a1.sinks = k1
a1.channels = c1
# Configure the source
a1.sources.s1.type = exec
# tail -F vs -f: -F follows the file name while -f follows the file descriptor. When the .out
# file fills up it is rotated to .out.1 and a new .out file is created, so -F is needed here.
a1.sources.s1.command = tail -F /home/hadoop/hadoop_flume/out
# Configure the channel
a1.channels.c1.type = memory
# Configure the sink
a1.sinks.k1.type = hdfs
a1.sinks.k1.hdfs.path = /hadoop_flume/%Y-%m-%d/%H%M
# Directory rolling: a directory is created per time bucket, so after a minute a new one is
# needed; enable rounding of the timestamp used in the path
a1.sinks.k1.hdfs.round = true
a1.sinks.k1.hdfs.roundValue = 1
a1.sinks.k1.hdfs.roundUnit = minute
a1.sinks.k1.hdfs.useLocalTimeStamp = true
a1.sinks.k1.hdfs.filePrefix = yefei
a1.sinks.k1.hdfs.fileSuffix = log
# File rolling (the file rolls as soon as any one of the three conditions below is met)
# Roll after this many seconds; 0 disables time-based rolling (it does not mean roll every 0 seconds)
a1.sinks.k1.hdfs.rollInterval = 10
# Roll when the file reaches this size (in bytes)
a1.sinks.k1.hdfs.rollSize = 1024
# Roll after this many events have been written
a1.sinks.k1.hdfs.rollCount = 10
# DataStream means uncompressed plain text
a1.sinks.k1.hdfs.fileType = DataStream
# Bind the source to its channel
a1.sources.s1.channels = c1
# Bind the sink to its channel
a1.sinks.k1.channel = c1
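One possible way to run this example, assuming the configuration above is saved as exec_to_hdfs.conf and time_append.sh sits in the current directory (both file names are assumptions):
# Generate the log file that tail -F follows
mkdir -p /home/hadoop/hadoop_flume
bash time_append.sh &
# Start the agent; output lands in HDFS under /hadoop_flume/<date>/<hour-minute>/
flume-ng agent -n a1 -f ./exec_to_hdfs.conf -Dflume.root.logger=INFO,console
# In another terminal, inspect the rolled files
hdfs dfs -ls -R /hadoop_flume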
3. Recording logs to HDFS (variant)
The same time_append.sh script and agent configuration as in the previous example, with only the tailed file and the HDFS output path changed:
# Tail the log file under the course directory
a1.sources.s1.command = tail -F /home/yefei/hadoop_class/Flume_Class/out
# Write the rolled files under /hadoop_class/flume_class in HDFS
a1.sinks.k1.hdfs.path = /hadoop_class/flume_class/%Y-%m-%d/%H%M
4. Flume to HBase
a1.sources = r1
a1.sinks = k1
a1.channels = c1
#Describe/configure the source
a1.sources.r1.type = exec
a1.sources.r1.command = tail -F /home/hadoop/hadoop_flume/flumetest3/data.txt
#Describe the sink
a1.sinks.k1.type = hbase
a1.sinks.k1.table = flume_hbase_student
a1.sinks.k1.columnFamily = info
#a1.sinks.k1.serializer = org.apache.flume.sink.hbase.RegexHbaseEventSerializer
#a1.sinks.k1.serializer.regex=(.*?)\\s(.*?)\\s(\\d{1,3})
#a1.sinks.k1.serializer.colNames=name,sex,age
#Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
#Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
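The HBase sink writes into an existing table, so flume_hbase_student with column family info (the names used in the configuration above) must be created before starting the agent. A minimal sketch using the HBase shell:
# Create the target table with the column family the sink writes to
echo "create 'flume_hbase_student','info'" | hbase shell
# After the agent has processed some data, verify that rows arrived
echo "scan 'flume_hbase_student', {LIMIT => 10}" | hbase shell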
5. Flume to Hive
# Define the agent name and the names of the source, channel, and sink
agent3.sources = source3
agent3.channels = channel3
agent3.sinks = sink3
# Define the source
agent3.sources.source3.type = spooldir
agent3.sources.source3.spoolDir = /home/hadoop/hadoop_flume/flumetest4/msg
agent3.sources.source3.fileHeader=false
# Define an interceptor that adds a timestamp to each event
agent3.sources.source3.interceptors = i1
agent3.sources.source3.interceptors.i1.type=timestamp
# HiveServer2 and the Hive metastore service must be running
# Use a file channel (events are buffered on disk)
agent3.channels.channel3.type = file
# Path of the file channel's checkpoint files
agent3.channels.channel3.checkpointDir=/home/hadoop/hadoop_flume/flumetest4/temp/point
# Path of the file channel's data files
agent3.channels.channel3.dataDirs=/home/hadoop/hadoop_flume/flumetest4/temp
# Define the sink
agent3.sinks.sink3.type = hive
# Hive metastore thrift URI (the metastore's default port is 9083, not the HDFS port 9000)
agent3.sinks.sink3.hive.metastore = thrift://localhost:9083
agent3.sinks.sink3.hive.database = flume_hive
agent3.sinks.sink3.hive.table = flume_hive
agent3.sinks.sink3.hive.partition = %y-%m-%d-%H-%M
agent3.sinks.sink3.useLocalTimeStamp = false
agent3.sinks.sink3.round = true
agent3.sinks.sink3.roundValue = 10
agent3.sinks.sink3.roundUnit = minute
agent3.sinks.sink3.serializer = DELIMITED
agent3.sinks.sink3.serializer.delimiter = ","
agent3.sinks.sink3.serializer.serdeSeparator = ','
agent3.sinks.sink3.serializer.fieldnames = nid,name,phone
agent3.sinks.sink3.batchSize = 90
# Assemble the source, channel, and sink
agent3.sources.source3.channels = channel3
agent3.sinks.sink3.channel = channel3
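The Hive sink streams into an existing transactional table, so flume_hive.flume_hive has to be created up front as a bucketed ORC table with transactions enabled. A sketch of the DDL run via the hive CLI, where the partition column name time_tag and the bucket count are assumptions; only the database, table, and field names come from the configuration above:
hive -e "
CREATE DATABASE IF NOT EXISTS flume_hive;
CREATE TABLE IF NOT EXISTS flume_hive.flume_hive (nid STRING, name STRING, phone STRING)
PARTITIONED BY (time_tag STRING)
CLUSTERED BY (nid) INTO 2 BUCKETS
STORED AS ORC
TBLPROPERTIES ('transactional'='true');
"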
6. Flume to HDFS (example)
1. Write the Flume configuration file and create the corresponding directories.
Configuration file:
a1.sources = r1
a1.sinks = k1
a1.channels = c1
#Describe/configure the source
a1.sources.r1.type = spooldir
a1.sources.r1.spoolDir = /home/hadoop/Student_class/student_flume/msg
#Use a channel which buffers events in memory
a1.channels.c1.type = file
a1.channels.c1.checkpointDir = /home/hadoop/Student_class/student_flume/temp/point
a1.channels.c1.dataDirs = /home/hadoop/Student_class/student_flume/temp
#Describe the sink
a1.sinks.k1.type = hdfs
a1.sinks.k1.hdfs.path = hdfs://localhost:9000/student_class/student
a1.sinks.k1.hdfs.filePrefix = fangqiujian_
a1.sinks.k1.hdfs.fileSuffix = .csv
a1.sinks.k1.hdfs.fileType = DataStream
a1.sinks.k1.hdfs.writeFormat = Text
a1.sinks.k1.hdfs.rollInterval = 3600
a1.sinks.k1.hdfs.rollSize = 134217728
a1.sinks.k1.hdfs.rollCount = 0
#Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
# Run command
# flume-ng agent --conf conf --conf-file student.conf --name a1 -Dflume.root.logger=INFO,console
2. Create the directories Flume needs.
3. Put the files that should be uploaded to HDFS in a dedicated directory.
4. Open a terminal where the .conf file is located and run the Flume command.
Command:
flume-ng agent --conf conf --conf-file student.conf --name a1 -Dflume.root.logger=INFO,console
5. Check in HDFS whether the upload succeeded (see the sketch below).
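A sketch of the verification step, using the HDFS path from the configuration above:
# List the files Flume rolled into HDFS and peek at one of them
hdfs dfs -ls /student_class/student
hdfs dfs -cat /student_class/student/fangqiujian_*.csv | head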
7. Garbled files when Flume collects data to HDFS
hdfs.fileType controls the type of file generated; the default is SequenceFile.
DataStream produces plain text instead.
Solution
Add one line to the HDFS sink:
a1.sinks.k1.hdfs.fileType = DataStream
8. Practical Usage
1. spoolingToHDFS.conf
- Configuration file
# Name the agent a
# Name the source component r1
a.sources = r1
# Name the sink component k1
a.sinks = k1
# Name the channel component c1
a.channels = c1
# Configure the spooldir source
a.sources.r1.type = spooldir
a.sources.r1.spoolDir = /root/data
a.sources.r1.fileHeader = true
a.sources.r1.interceptors = i1
a.sources.r1.interceptors.i1.type = timestamp
# Specify the sink type
a.sinks.k1.type = hdfs
a.sinks.k1.hdfs.path = /flume/data/dir1
# File name prefix
a.sinks.k1.hdfs.filePrefix = student
# Roll the file after this much data has been written (bytes)
a.sinks.k1.hdfs.rollSize = 102400
# Roll the file after this many events
a.sinks.k1.hdfs.rollCount = 1000
# DataStream file type: write events through as-is
a.sinks.k1.hdfs.fileType = DataStream
# Output format: text
a.sinks.k1.hdfs.writeFormat = text
# File name suffix
a.sinks.k1.hdfs.fileSuffix = .txt
# Configure the channel
a.channels.c1.type = memory
a.channels.c1.capacity = 1000
# How many events the sink takes from the channel per transaction
a.channels.c1.transactionCapacity = 100
# Assemble
a.sources.r1.channels = c1
a.sinks.k1.channel = c1
- Prepare some data in the /root/data/ directory
The Zen of Python, by Tim Peters
Beautiful is better than ugly.
Explicit is better than implicit.
Simple is better than complex.
Complex is better than complicated.
Flat is better than nested.
Sparse is better than dense.
Readability counts.
Special cases aren't special enough to break the rules.
Although practicality beats purity.
Errors should never pass silently.
Unless explicitly silenced.
In the face of ambiguity, refuse the temptation to guess.
There should be one-- and preferably only one --obvious way to do it.
Although that way may not be obvious at first unless you're Dutch.
Now is better than never.
Although never is often better than *right* now.
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do more of those!
- Start the agent
flume-ng agent -n a -f ./spoolingToHDFS.conf -Dflume.root.logger=DEBUG,console
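To confirm the sink wrote the data, listing the target directory from the configuration above is enough (a sketch):
# Rolled files carry the student prefix and .txt suffix set in the sink
hdfs dfs -ls /flume/data/dir1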
2.hbaseLogToHDFS
- Configuration file
# Name the agent a
# Name the source component r1
a.sources = r1
# Name the sink component k1
a.sinks = k1
# Name the channel component c1
a.channels = c1
# Configure the exec source
a.sources.r1.type = exec
a.sources.r1.command = tail -f /usr/local/soft/hbase-1.4.6/logs/hbase-root-master-master.log
# Specify the sink type
a.sinks.k1.type = hdfs
a.sinks.k1.hdfs.path = /flume/data/dir2
# File name prefix
a.sinks.k1.hdfs.filePrefix = hbaselog
# Roll the file after this much data has been written (bytes)
a.sinks.k1.hdfs.rollSize = 102400
# Roll the file after this many events
a.sinks.k1.hdfs.rollCount = 1000
# DataStream file type: write events through as-is
a.sinks.k1.hdfs.fileType = DataStream
# Output format: text
a.sinks.k1.hdfs.writeFormat = text
# File name suffix
a.sinks.k1.hdfs.fileSuffix = .txt
# Configure the channel
a.channels.c1.type = memory
a.channels.c1.capacity = 1000
# How many events the sink takes from the channel per transaction
a.channels.c1.transactionCapacity = 100
# Assemble
a.sources.r1.channels = c1
a.sinks.k1.channel = c1
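The notes do not include a startup command for this agent; assuming the configuration is saved as hbaseLogToHDFS.conf (the file name matches the section title but is an assumption), it can be started like the previous example:
flume-ng agent -n a -f ./hbaseLogToHDFS.conf -Dflume.root.logger=DEBUG,console
# Then check the output directory
hdfs dfs -ls /flume/data/dir2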
3.hbaselogToHBase
- Create the log table in HBase
create 'log','cf1'
- Configuration file
# Name the agent a
# Name the source component r1
a.sources = r1
# Name the sink component k1
a.sinks = k1
# Name the channel component c1
a.channels = c1
# Configure the exec source
a.sources.r1.type = exec
a.sources.r1.command = cat /usr/local/soft/hbase-1.4.6/logs/hbase-root-master-master.log
# Specify the sink type
a.sinks.k1.type = hbase
a.sinks.k1.table = log
a.sinks.k1.columnFamily = cf1
# Configure the channel
a.channels.c1.type = memory
a.channels.c1.capacity = 100000
# How many events the sink takes from the channel per transaction
a.channels.c1.transactionCapacity = 100
# Assemble
a.sources.r1.channels = c1
a.sinks.k1.channel = c1
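Assuming the configuration is saved as hbaselogToHBase.conf (an assumed file name), start the agent and then scan the table to see the ingested log lines:
flume-ng agent -n a -f ./hbaselogToHBase.conf -Dflume.root.logger=DEBUG,console
# In another terminal, verify the rows written into the log table
echo "scan 'log', {LIMIT => 10}" | hbase shell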
4.netcatLogger
Listen on a port and send data to it with telnet
- Install telnet
yum install telnet
- Configuration file
# Name the agent a
# Name the source component r1
a.sources = r1
# Name the sink component k1
a.sinks = k1
# Name the channel component c1
a.channels = c1
# Configure the netcat source
a.sources.r1.type = netcat
a.sources.r1.bind = 0.0.0.0
a.sources.r1.port = 8888
# Specify the sink type
a.sinks.k1.type = logger
# Configure the channel
a.channels.c1.type = memory
a.channels.c1.capacity = 1000
# How many events the sink takes from the channel per transaction
a.channels.c1.transactionCapacity = 100
# Assemble
a.sources.r1.channels = c1
a.sinks.k1.channel = c1
- Startup
- Start the agent first:
flume-ng agent -n a -f ./netcatToLogger.conf -Dflume.root.logger=DEBUG,console
- Then connect with telnet:
telnet master 8888
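If telnet is not installed, nc can drive the same test (a sketch; each line sent becomes one event printed by the logger sink):
echo "hello flume" | nc master 8888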
5.httpToLogger
- Configuration file
# Name the agent a
# Name the source component r1
a.sources = r1
# Name the sink component k1
a.sinks = k1
# Name the channel component c1
a.channels = c1
# Configure the http source
a.sources.r1.type = http
a.sources.r1.port = 6666
# Specify the sink type
a.sinks.k1.type = logger
# Configure the channel
a.channels.c1.type = memory
a.channels.c1.capacity = 1000
# How many events the sink takes from the channel per transaction
a.channels.c1.transactionCapacity = 100
# Assemble
a.sources.r1.channels = c1
a.sinks.k1.channel = c1
- Startup
- Start the agent first:
flume-ng agent -n a -f ./httpToLogger.conf -Dflume.root.logger=DEBUG,console
- Then send an HTTP request with curl:
curl -X POST -d '[{ "headers" :{"a" : "a1","b" : "b1"},"body" : "hello~http~flume~"}]' http://master:6666
