Spark deployment examples
Official site
https://spark.apache.org/
# list of Spark docs by version
https://spark.apache.org/docs/
# docs for a specific version
https://spark.apache.org/docs/2.4.7/
# download packages
https://spark.apache.org/downloads.html
<!-- JDBC connection docs -->
https://spark.apache.org/docs/3.4.0/sql-distributed-sql-engine.html#running-the-thrift-jdbcodbc-server
Scala learning
Spark deployment (3 modes):
Spark single-node deployment
Spark standalone cluster deployment
Spark on YARN deployment
Start the SQL client and write SQL directly
bin/spark-sql
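The same can be done programmatically; a minimal Java sketch (class name and query are illustrative, assuming the spark-sql and spark-hive dependencies listed in the JDBC section below):
import org.apache.spark.sql.SparkSession;

public class SparkSqlDemo {
    public static void main(String[] args) {
        // build a session with Hive support, the programmatic counterpart of bin/spark-sql
        SparkSession spark = SparkSession.builder()
                .appName("spark-sql-demo")
                .enableHiveSupport()
                .getOrCreate();
        // run SQL directly, just like in the spark-sql shell
        spark.sql("show databases").show();
        spark.stop();
    }
}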
Start the Spark JDBC Iceberg service
<!-- start in YARN mode -->
spark/sbin/start-thriftserver.sh --master yarn
Start the Spark JDBC Thrift server
<!-- start the Spark JDBC Thrift server; the running process shows up as SparkSubmit -->
<!-- start in YARN mode -->
spark/sbin/start-thriftserver.sh --master yarn
<!-- set the listening port, default 10000 -->
export HIVE_SERVER2_THRIFT_PORT=<listening-port>
<!-- set the host/IP the Thrift service binds to -->
export HIVE_SERVER2_THRIFT_BIND_HOST=<listening-host>
# --master: the master URL, e.g. spark://host:port, mesos://host:port, yarn, or local
# --queue: YARN queue name (YARN mode only)
# --num-executors: number of executors
# spark.driver.memory: driver memory size
# --driver-cores: number of driver cores (cluster deploy mode only)
# --executor-memory: executor memory size (unnecessary if dynamic allocation is enabled)
# spark.yarn.executor.memoryOverhead: per-executor off-heap overhead, in MB
spark/sbin/start-thriftserver.sh \
  --master MASTER_URL \
  --queue queue_name \
  --num-executors NUM \
  --conf spark.driver.memory=40g \
  --driver-cores NUM \
  --executor-memory 6g \
  --conf spark.yarn.executor.memoryOverhead=2048
Connect to Spark with beeline
beeline> !connect jdbc:hive2://localhost:10000 username password
beeline -u jdbc:hive2://localhost:10000 -n username -p password
Connect with spark-sql (Iceberg catalog)
./spark-sql --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions \
--conf spark.sql.catalog.spark_iceberg=org.apache.iceberg.spark.SparkCatalog \
--conf spark.sql.catalog.spark_iceberg.type=hive \
--conf spark.sql.catalog.spark_iceberg.uri=thrift://CQA-L0668036:9083 \
--conf spark.sql.catalog.spark_iceberg.warehouse=hdfs://clusterHA/user/hive/iceberg_hive
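The same catalog can also be configured in code; a minimal Java sketch assuming the iceberg-spark-runtime jar is on the classpath (the catalog name, metastore URI, and warehouse path are copied from the command above):
import org.apache.spark.sql.SparkSession;

public class IcebergCatalogDemo {
    public static void main(String[] args) {
        // mirror the --conf flags of the spark-sql command above
        SparkSession spark = SparkSession.builder()
                .appName("iceberg-catalog-demo")
                .config("spark.sql.extensions",
                        "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
                .config("spark.sql.catalog.spark_iceberg", "org.apache.iceberg.spark.SparkCatalog")
                .config("spark.sql.catalog.spark_iceberg.type", "hive")
                .config("spark.sql.catalog.spark_iceberg.uri", "thrift://CQA-L0668036:9083")
                .config("spark.sql.catalog.spark_iceberg.warehouse", "hdfs://clusterHA/user/hive/iceberg_hive")
                .getOrCreate();
        // list namespaces in the Iceberg catalog to verify the connection
        spark.sql("show namespaces in spark_iceberg").show();
        spark.stop();
    }
}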
# submit the bundled SparkPi example in cluster deploy mode
./spark-submit --master yarn --deploy-mode cluster --class org.apache.spark.examples.SparkPi ../examples/jars/spark-examples_2.13-3.4.0.jar 100
# client deploy mode (the default)
./spark-submit --master yarn --class org.apache.spark.examples.SparkPi ../examples/jars/spark-examples_2.13-3.4.0.jar 100
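To submit your own job the same way, here is a minimal Java Pi estimator, a sketch modeled on the bundled SparkPi example (class name is illustrative; package it into a jar and pass it via --class):
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;

import java.util.ArrayList;
import java.util.List;

public class JavaPi {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().appName("JavaPi").getOrCreate();
        JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

        // number of partitions, like the "100" argument in the commands above
        int slices = args.length > 0 ? Integer.parseInt(args[0]) : 2;
        int n = 100000 * slices;
        List<Integer> samples = new ArrayList<>(n);
        for (int i = 0; i < n; i++) {
            samples.add(i);
        }

        // count random points that land inside the unit circle
        JavaRDD<Integer> dataSet = jsc.parallelize(samples, slices);
        long count = dataSet.filter(i -> {
            double x = Math.random() * 2 - 1;
            double y = Math.random() * 2 - 1;
            return x * x + y * y <= 1;
        }).count();

        System.out.println("Pi is roughly " + 4.0 * count / n);
        spark.stop();
    }
}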
Connect to Spark via JDBC
<!-- Spark SQL dependency -->
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_2.11</artifactId>
    <version>${spark.version}</version>
</dependency>
<!-- Spark Hive integration dependency -->
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-hive_2.11</artifactId>
    <version>${spark.version}</version>
</dependency>
<!-- Hive JDBC driver dependency -->
<dependency>
    <groupId>org.spark-project.hive</groupId>
    <artifactId>hive-jdbc</artifactId>
    <version>1.2.1.spark2</version>
</dependency>
// load the Hive JDBC driver
Class.forName("org.apache.hive.jdbc.HiveDriver");
// url: the beeline endpoint (ip = your server IP); username/password: the server login credentials
Connection conn = DriverManager.getConnection("jdbc:hive2://ip:10000", "username", "password");
PreparedStatement stmt = conn.prepareStatement("select name,password from user");
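Put together, a minimal runnable client; a sketch assuming the Thrift server listens on localhost:10000 and a user table with name/password columns exists:
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;

public class SparkJdbcDemo {
    public static void main(String[] args) throws Exception {
        // load the Hive JDBC driver
        Class.forName("org.apache.hive.jdbc.HiveDriver");
        // connect to the Spark Thrift server (host, credentials, and table are assumptions)
        try (Connection conn = DriverManager.getConnection(
                     "jdbc:hive2://localhost:10000", "username", "password");
             PreparedStatement stmt = conn.prepareStatement("select name,password from user");
             ResultSet rs = stmt.executeQuery()) {
            while (rs.next()) {
                System.out.println(rs.getString("name"));
            }
        }
    }
}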
Spark UI addresses
# Hadoop NameNode UI
http://node01:9870
# Hadoop ResourceManager UI
http://node01:8088
# Spark Master UI (standalone)
http://node01:8080
# Spark application (job) UI
http://node01:4040
# Spark History Server UI
http://node01:18080