Real-Time Computing Cluster Setup (ZooKeeper & Kafka & Flume & MySQL & Flink)
# 1. Deploy the ZooKeeper Cluster
## 1.1 Download ZooKeeper
```
Download: Baidu Cloud Drive/Common Downloads/zookeeper/linux/xxx
```
## 1.2 Deploy and Configure ZooKeeper
```shell
tar -zxvf xxx.tar.gz -C /opt/hadoop
cd /opt/hadoop/apache-zookeeper-3.8.4-bin/conf
vim zoo_sample.cfg
---
## Append the cluster members
server.1=master:2888:3888
server.2=worker01:2888:3888
server.3=worker02:2888:3888
---
mv zoo_sample.cfg zoo.cfg
# Distribute zoo.cfg to every node
scp zoo.cfg root@worker01:/opt/hadoop/apache-zookeeper-3.8.4-bin/conf
scp zoo.cfg root@worker02:/opt/hadoop/apache-zookeeper-3.8.4-bin/conf
# Create the myid file (the default dataDir in zoo.cfg is /tmp/zookeeper); run the matching line on each node
mkdir -p /tmp/zookeeper
echo 1 > /tmp/zookeeper/myid   ## on master
echo 2 > /tmp/zookeeper/myid   ## on worker01
echo 3 > /tmp/zookeeper/myid   ## on worker02
```
## 1.3 Start ZooKeeper
```shell
cd /opt/hadoop/apache-zookeeper-3.8.4-bin/bin
./zkServer.sh start ## run on every node
## Check whether startup succeeded
jps ## look for a QuorumPeerMain process
```
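To confirm the ensemble actually formed a quorum (not just that the processes exist), `zkServer.sh status` can be run on each node; one node should report itself as leader and the other two as followers. A minimal check, assuming the install path used above:
```shell
cd /opt/hadoop/apache-zookeeper-3.8.4-bin/bin
./zkServer.sh status   ## expect "Mode: leader" on one node and "Mode: follower" on the other two
```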
# 2. Deploy the Kafka Cluster
## 2.1 Download Kafka
```
Download: Baidu Cloud Drive/Common Downloads/kafka/linux/xxx
```
## 2.2 Deploy and Configure Kafka
```shell
tar -zxvf xxx.tar.gz -C /opt/hadoop
vim kafka_2.13-3.4.0/config/server.properties
---
# broker.id must be set on every node
broker.id=1 ## must be unique for each broker
# listeners must be set on every node
listeners=PLAINTEXT://master:9092 ## use each node's own address
# zookeeper.connect must be set on every node
zookeeper.connect=master:2181,worker01:2181,worker02:2181
---
```
## 2.3 Start Kafka
```shell
cd /opt/hadoop/kafka_2.13-3.4.0
./bin/kafka-server-start.sh ./config/server.properties
## Run in the background instead
./bin/kafka-server-start.sh -daemon ./config/server.properties
## Check for a Kafka process
jps
```
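Before creating topics, it may be worth confirming that all three brokers registered with ZooKeeper. A quick check, assuming the hostnames used above:
```shell
cd /opt/hadoop/kafka_2.13-3.4.0
## Brokers register ephemeral nodes under /brokers/ids; with broker.id 1/2/3 expect [1, 2, 3]
./bin/zookeeper-shell.sh master:2181 ls /brokers/ids
## The topic list should also respond without errors (it is empty at this point)
./bin/kafka-topics.sh --list --bootstrap-server master:9092
```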
## 2.4 Create a Producer
```shell
cd kafka_2.13-3.4.0/bin
./kafka-topics.sh --create --bootstrap-server 192.168.0.103:9092,192.168.0.104:9092,192.168.0.105:9092 --replication-factor 1 --partitions 1 --topic real_time
```
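The command above creates the topic; the actual producer in this pipeline is the Flume agent from section 3. For a quick manual test before Flume is wired up, records can be pushed with the console producer (the sample line is purely illustrative and follows the CSV layout expected by the Flink table later on):
```shell
./kafka-console-producer.sh --bootstrap-server 192.168.0.103:9092 --topic real_time
## Type one CSV record per line, for example:
## 1,zhangsan,25,M,G001,19.9,S01,online,13800000000,a@b.com,2024-01-01
```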
## 2.5 Create a Consumer
1)**From the command line**
```shell
./kafka-console-consumer.sh --topic real_time1 --bootstrap-server 192.168.0.104:9092,192.168.0.105:9092,192.168.0.106:9092 --consumer.config ../config/consumer.properties
---
## Contents of ../config/consumer.properties
# format: host1:port1,host2:port2 ...
bootstrap.servers=192.168.0.104:9092,192.168.0.105:9092,192.168.0.106:9092
# consumer group id
group.id=test-consumer-group
---
```
2)**With Flink SQL**
```java
// 1. Create the stream and table environments
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
// env.enableCheckpointing(5000, CheckpointingMode.EXACTLY_ONCE);
// env.getCheckpointConfig().setCheckpointStorage("file:/D/ckpt/");
// env.getCheckpointConfig().setCheckpointStorage("hdfs://xxx:8020/xx/ckpt/");
StreamTableEnvironment tenv = StreamTableEnvironment.create(env);
// 2. Create the source table that reads from Kafka
tenv.executeSql("CREATE TABLE kafka_source "+
    "(id STRING, use_rname STRING, age INT, gender STRING, goods_no STRING "+
    ", goods_price FLOAT, store_id STRING, shopping_type STRING, tel STRING "+
    ", email STRING, shopping_date DATE) "+
    "WITH ('connector'='kafka' "+
    ",'topic'='real_time1' "+
    ",'properties.bootstrap.servers'='192.168.0.104:9092' "+
    ",'properties.group.id'='test-consumer-group' "+
    ",'scan.startup.mode' = 'latest-offset' "+
    ",'value.format'='csv' ); " );
```
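To verify that the source table actually receives records, a continuous query can be run against it and printed to stdout; the aggregation below is only illustrative:
```java
// 3. Illustrative query: total sales per store, streamed to the console.
// TableResult.print() blocks and emits changelog rows as new data arrives.
tenv.executeSql(
    "SELECT store_id, SUM(goods_price) AS total_sales " +
    "FROM kafka_source GROUP BY store_id").print();
```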
3)**With the Flink DataStream API**
```java
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
import java.util.Properties;
public class KafkaConsumerExample {
    public static void main(String[] args) throws Exception {
        // Set up the execution environment
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Kafka consumer configuration
        Properties properties = new Properties();
        properties.setProperty("bootstrap.servers", "localhost:9092"); // Kafka broker address
        properties.setProperty("group.id", "test"); // consumer group id
        // Create the Kafka consumer (legacy FlinkKafkaConsumer API)
        FlinkKafkaConsumer<String> kafkaConsumer = new FlinkKafkaConsumer<>(
                "your-kafka-topic",       // Kafka topic name
                new SimpleStringSchema(), // value deserialization schema
                properties
        );
        // Read from Kafka and create a data stream
        DataStream<String> stream = env.addSource(kafkaConsumer);
        // Transformations such as map, filter, or reduce can be applied here
        // For example, print every message read from Kafka
        stream.print();
        // Execute the Flink job
        env.execute("Kafka Consumer Example");
    }
}
```
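The class above uses the legacy FlinkKafkaConsumer API. In newer Flink releases (including the 1.17.0 distribution used later in this document) the recommended replacement is the KafkaSource builder; a minimal sketch with the same hypothetical topic and addresses as above might look like this:
```java
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.connector.kafka.source.KafkaSource;
import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class KafkaSourceExample {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Build the source with the newer KafkaSource API
        KafkaSource<String> source = KafkaSource.<String>builder()
                .setBootstrapServers("192.168.0.104:9092")   // assumed broker address, as above
                .setTopics("real_time1")                     // topic used elsewhere in this document
                .setGroupId("test-consumer-group")
                .setStartingOffsets(OffsetsInitializer.latest())
                .setValueOnlyDeserializer(new SimpleStringSchema())
                .build();

        // fromSource replaces addSource for the unified source API
        DataStream<String> stream =
                env.fromSource(source, WatermarkStrategy.noWatermarks(), "kafka-source");
        stream.print();
        env.execute("Kafka Source Example");
    }
}
```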
# 3. Deploy Flume
## 3.1 Download Flume
```
Baidu Cloud Drive/Common Downloads/flume/linux/xxx
```
## 3.2 Deploy and Configure Flume
```shell
tar -zxvf xxx.tar.gz -C /opt/hadoop
cd /opt/hadoop/apache-flume-1.11.0-bin/conf
vim test.conf
---
# Note: adjust the two Kafka sink settings below to your environment:
#   client.sinks.sh1.kafka.topic=real_time
#   client.sinks.sh1.kafka.bootstrap.servers=broker1:9092,broker2:9092,broker3:9092
client.sources=s1
client.channels=c1
client.sinks=sh1
# the source configuration of s1
client.sources.s1.type=spooldir
# directory the data-generating (Python) script writes files into
client.sources.s1.spoolDir=/tmp/flume_spooldir
client.sources.s1.fileSuffix=.COMPLETED
client.sources.s1.deletePolicy=never
client.sources.s1.trackerDir=.flumespool
client.sources.s1.ignorePattern=^$
client.sources.s1.batchSize=1000
client.sources.s1.inputCharset=UTF-8
client.sources.s1.deserializer=LINE
client.sources.s1.selector.type=replicating
client.sources.s1.fileHeaderKey=file
client.sources.s1.fileHeader=false
client.sources.s1.basenameHeader=true
client.sources.s1.basenameHeaderKey=basename
client.sources.s1.deserializer.maxBatchLine=1
client.sources.s1.deserializer.maxLineLength=2048
client.sources.s1.channels=c1
# the channel configuration of c1
client.channels.c1.type=memory
client.channels.c1.capacity=10000
client.channels.c1.transactionCapacity=1000
client.channels.c1.channlefullcount=10
client.channels.c1.keep-alive=3
client.channels.c1.byteCapacityBufferPercentage=20
# the sink configuration of sh1
client.sinks.sh1.type=org.apache.flume.sink.kafka.KafkaSink
# adjust the topic to your environment
client.sinks.sh1.kafka.topic=real_time1
client.sinks.sh1.flumeBatchSize=1000
client.sinks.sh1.kafka.producer.type=sync
# adjust the broker list to your environment
client.sinks.sh1.kafka.bootstrap.servers=master:9092
client.sinks.sh1.kafka.security.protocol=PLAINTEXT
client.sinks.sh1.requiredAcks=0
client.sinks.sh1.channel=c1
---
```
## 3.3 Start Flume
```shell
# Start in the foreground
cd /opt/hadoop/apache-flume-1.11.0-bin/bin
./flume-ng agent --conf ../conf/ --conf-file ../conf/test.conf --name client -Dflume.root.logger=INFO,console
## Run in the background
nohup ./flume-ng agent --conf ../conf/ --conf-file ../conf/test.conf --name client -Dflume.root.logger=INFO,console &
## Check for an Application process
jps
```
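With the agent running, the path into Kafka can be smoke-tested by dropping a file into the spooling directory configured above and watching for it in the console consumer from section 2.5. The sample record is purely illustrative:
```shell
mkdir -p /tmp/flume_spooldir
## Write the file elsewhere first, then move it in, so the spooldir source never sees a partially written file
echo "1,zhangsan,25,M,G001,19.9,S01,online,13800000000,a@b.com,2024-01-01" > /tmp/demo.csv
mv /tmp/demo.csv /tmp/flume_spooldir/
## Flume renames the file to demo.csv.COMPLETED once it has been ingested
ls /tmp/flume_spooldir
```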
# 4. Deploy MySQL
Reference: https://www.cnblogs.com/fushiyi/articles/15956398.html
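After installing MySQL per the reference above, a database and table can be prepared for the results the Flink job will write. The names and schema below are purely hypothetical examples (nothing in this document fixes them), so adjust them to the actual job output:
```shell
mysql -uroot -p
---
CREATE DATABASE IF NOT EXISTS real_time DEFAULT CHARACTER SET utf8mb4;
CREATE TABLE IF NOT EXISTS real_time.store_sales (
  store_id    VARCHAR(32) PRIMARY KEY,
  total_sales DOUBLE
);
---
```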
# 5. Deploy Flink
## 5.1 Download Flink
```
Baidu Cloud Drive/Common Downloads/flink/linux/flink-1.13.6/flink-1.13.6-bin-scala_2.11.tgz
Baidu Cloud Drive/Common Downloads/flink/linux/flink-1.17.0/flink-1.17.0-bin-scala_2.12.tgz ## version currently in use
```
## 5.2 Deploy and Configure Flink
Detailed configuration reference: https://blog.csdn.net/mengxianglong123/article/details/123754502
Detailed configuration reference: https://blog.csdn.net/m0_52606060/article/details/131756522
### 5.2.1 Configure the Master Node Files
```shell
tar -zxvf flink-1.17.0-bin-scala_2.12.tgz -C /opt/hadoop
cd /opt/hadoop/flink-1.17.0
vim conf/flink-conf.yaml
---
# Address of the JobManager machine
jobmanager.rpc.address: master
# Bind address for the JobManager; works like a whitelist, 0.0.0.0 allows access from any machine
jobmanager.bind-host: 0.0.0.0
# Bind address for the TaskManager; works like a whitelist, 0.0.0.0 allows access from any machine
taskmanager.bind-host: 0.0.0.0
# Address of the TaskManager: on each worker set it to that machine's own address; on the JobManager machine keep the default localhost
taskmanager.host: worker01
# Bind address for the web UI; if this is missing or wrong, the UI is unreachable even though the cluster starts
rest.bind-address: 0.0.0.0
---
## Distribute flink-conf.yaml
scp conf/flink-conf.yaml root@worker01:/opt/hadoop/flink-1.17.0/conf
scp conf/flink-conf.yaml root@worker02:/opt/hadoop/flink-1.17.0/conf
vim conf/workers
---
# TaskManager hosts
master
worker01
worker02
---
vim conf/masters
---
# JobManager host and web UI port
master:8081
---
```
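Two other settings commonly tuned in flink-conf.yaml are the number of slots per TaskManager and the default parallelism; the values below are only illustrative:
```shell
vim conf/flink-conf.yaml
---
# Number of task slots offered by each TaskManager (illustrative value)
taskmanager.numberOfTaskSlots: 2
# Default parallelism used when a job does not set one (illustrative value)
parallelism.default: 2
---
```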
## 5.3 Start Flink
```shell
cd /opt/hadoop/flink-1.17.0
bin/start-cluster.sh
```
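With the cluster up, one of the example jobs bundled with the Flink distribution can be submitted to verify that the JobManager and TaskManagers are working together:
```shell
cd /opt/hadoop/flink-1.17.0
./bin/flink run examples/streaming/WordCount.jar
## The job should show up as FINISHED in the web UI afterwards
```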
## 5.4 Monitor Flink
Web UI: http://192.168.0.104:8081/#/overview
On the master node, `jps` should show StandaloneSessionClusterEntrypoint and TaskManagerRunner; on the worker nodes it should show TaskManagerRunner.