第一章 Local本地模式
集群规划:
|       | Hadoop102      | Hadoop103 | Hadoop104 |
| Spark | Worker、Master |           |           |
# 下载
[whboy@hadoop102 ]$ wget https://archive.apache.org/dist/spark/spark-4.0.0-preview2/spark-4.0.0-preview2-bin-hadoop3.tgz
# 解压
[whboy@hadoop102 ]$ tar -zxvf spark-4.0.0-preview2-bin-hadoop3.tgz -C /opt/module
[whboy@hadoop102 ]$ mv spark-4.0.0-preview2-bin-hadoop3 spark-4.0.0-local
# 提交测试案例
[whboy@hadoop102 spark-4.0.0-local]$ bin/spark-submit \
--class org.apache.spark.examples.SparkPi \
--master local \
./examples/jars/spark-examples_2.13-4.0.0-preview2.jar \
500
第二章 Standalone模式
集群规划:
|       | Hadoop102      | Hadoop103 | Hadoop104 |
| Spark | Worker、Master | Worker    | Worker    |
2.1 Spark安装
# 下载
[whboy@hadoop102 ]$ wget https://archive.apache.org/dist/spark/spark-4.0.0-preview2/spark-4.0.0-preview2-bin-hadoop3.tgz
# 解压
[whboy@hadoop102 ]$ tar -zxvf spark-4.0.0-preview2-bin-hadoop3.tgz -C /opt/module
[whboy@hadoop102 ]$ mv spark-4.0.0-preview2-bin-hadoop3 spark-4.0.0-standalone
# 配置环境变量
[whboy@hadoop102 ]$ sudo vim /etc/profile.d/my_env.sh
# SPARK_HOME
export SPARK_HOME=/opt/module/spark-4.0.0-standalone
export PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin
2.2 Spark配置文件
2.2.1 spark-env.sh
[whboy@hadoop102 spark-4.0.0-standalone]$ mv conf/spark-env.sh.template conf/spark-env.sh
[whboy@hadoop102 spark-4.0.0-standalone]$ vim conf/spark-env.sh
export JAVA_HOME=/opt/module/jdk21/
export SPARK_MASTER_HOST=hadoop102
export SPARK_MASTER_PORT=7077
2.2.2 workers
[whboy@hadoop102 spark-4.0.0-standalone]$ mv conf/workers.template conf/workers
[whboy@hadoop102 spark-4.0.0-standalone]$ vim conf/workers
hadoop102
hadoop103
hadoop104
2.3 配置历史服务
2.3.1 spark-env.sh
[whboy@hadoop102 spark-4.0.0-standalone]$ vim conf/spark-env.sh
export SPARK_HISTORY_OPTS="
-Dspark.history.ui.port=18080
-Dspark.history.fs.logDirectory=hdfs://hadoop102:8020/directory
-Dspark.history.retainedApplications=100"
#参数1含义:WEB UI访问的端口号为18080
#参数2含义:指定历史服务器日志存储路径
#参数3含义:指定保存Application历史记录的个数,如果超过这个值,旧的应用程序信息将被删除,这个是内存中的应用数,而不是页面上显示的应用数。
2.3.2 spark-defaults.conf
[whboy@hadoop102 spark-4.0.0-standalone]$ vim conf/spark-defaults.conf
spark.eventLog.enabled true
spark.eventLog.dir hdfs://hadoop102:8020/directory
spark.history.ui.port 18080
spark.history.fs.logDirectory hdfs://hadoop102:8020/directory
spark.yarn.historyServer.address hadoop102:18080
# 集群同步
[whboy@hadoop102 module]$ my_rsync_script.sh spark-4.0.0-standalone
2.4 启动集群
# 注意:需要启动hadoop集群,HDFS上的directory目录需要提前存在
[whboy@hadoop102 spark-4.0.0-standalone]$ start-dfs.sh
[whboy@hadoop102 spark-4.0.0-standalone]$ hadoop fs -mkdir /directory
# 启动spark集群和历史服务器
[whboy@hadoop102 spark-4.0.0-standalone]$ sbin/start-all.sh
[whboy@hadoop102 spark-4.0.0-standalone]$ sbin/start-history-server.sh
# 提交应用
[whboy@hadoop102 spark-4.0.0-standalone]$ bin/spark-submit \
--class org.apache.spark.examples.SparkPi \
--master spark://hadoop102:7077 \
./examples/jars/spark-examples_2.13-4.0.0-preview2.jar \
100
# 查看历史服务:http://hadoop102:18080
第四章 Yarn模式
4.1 Spark安装
# 下载
[whboy@hadoop102 ]$ wget https://archive.apache.org/dist/spark/spark-4.0.0-preview2/spark-4.0.0-preview2-bin-hadoop3.tgz
# 解压
[whboy@hadoop102 ]$ tar -zxvf spark-4.0.0-preview2-bin-hadoop3.tgz -C /opt/module
[whboy@hadoop102 ]$ mv spark-4.0.0-preview2-bin-hadoop3 spark-4.0.0-yarn
# 配置环境变量
[whboy@hadoop102 ]$ sudo vim /etc/profile.d/my_env.sh
# SPARK_HOME
export SPARK_HOME=/opt/module/spark-4.0.0-yarn
export PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin
4.2 Spark配置文件
4.2.1 spark-env.sh
[whboy@hadoop102 spark-4.0.0-yarn]$ vim conf/spark-env.sh
export JAVA_HOME=/opt/module/jdk-21.0.5
export YARN_CONF_DIR=/opt/module/hadoop-3.4.1/etc/hadoop
4.2.2 spark-defaults.conf
[whboy@hadoop102 spark-4.0.0-yarn]$ vim conf/spark-defaults.conf
spark.eventLog.enabled true
spark.eventLog.dir hdfs://hadoop102:8020/directory
4.4 Spark历史服务器
4.4.1 spark-env.sh
[whboy@hadoop102 spark-4.0.0-yarn]$ vim conf/spark-env.sh
# 历史服务器
export SPARK_HISTORY_OPTS="
-Dspark.history.ui.port=18080
-Dspark.history.fs.logDirectory=hdfs://hadoop102:8020/directory
-Dspark.history.retainedApplications=100"
4.4.2 spark-defaults.conf
[whboy@hadoop102 spark-4.0.0-yarn]$ vim conf/spark-defaults.conf
spark.eventLog.enabled true
spark.eventLog.dir hdfs://hadoop102:8020/directory
spark.history.ui.port 18080
spark.history.fs.logDirectory hdfs://hadoop102:8020/directory
spark.yarn.historyServer.address hadoop102:18080
# 启动spark历史服务
[whboy@hadoop102 spark-4.0.0-yarn]$ sbin/start-history-server.sh
4.5 提交作业
# 客户端提交
[whboy@hadoop102 spark-4.0.0-yarn]$ bin/spark-submit \
--class org.apache.spark.examples.SparkPi \
--master yarn \
--deploy-mode client \
./examples/jars/spark-examples_2.13-4.0.0-preview2.jar \
100
# 集群模式提交
[whboy@hadoop102 spark-4.0.0-yarn]$ bin/spark-submit \
--class org.apache.spark.examples.SparkPi \
--master yarn \
--deploy-mode cluster \
./examples/jars/spark-examples_2.13-4.0.0-preview2.jar \
100
# Web页面查看日志:http://hadoop102:18080/