Installation
mkdir /opt/spark && tar -zxf /root/spark-2.3.2-bin-hadoop2.7.tgz -C /opt/spark
Configure the services
# Some extra JARs must be placed under Spark's jars directory so the data sources below can be read through Spark's Python API
# HBase
mkdir /opt/spark/spark-2.3.2-bin-hadoop2.7/jars/hbase &&\
/bin/cp $HBASE_HOME/lib/hbase*.jar /opt/spark/spark-2.3.2-bin-hadoop2.7/jars/hbase &&\
/bin/cp $HBASE_HOME/lib/guava-*.jar /opt/spark/spark-2.3.2-bin-hadoop2.7/jars/hbase &&\
/bin/cp $HBASE_HOME/lib/htrace-core*.jar /opt/spark/spark-2.3.2-bin-hadoop2.7/jars/hbase &&\
/bin/cp $HBASE_HOME/lib/protobuf-java*.jar /opt/spark/spark-2.3.2-bin-hadoop2.7/jars/hbase &&\
/bin/cp $HBASE_HOME/lib/metrics-core-*.jar /opt/spark/spark-2.3.2-bin-hadoop2.7/jars/hbase
# MySQL
/bin/cp /root/mysql-connector-java-8.0.16/mysql-connector-java-8.0.16.jar /opt/spark/spark-2.3.2-bin-hadoop2.7/jars/
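# A minimal PySpark sketch of the JDBC read this connector enables; the host, database, table, and credentials are placeholders, not values from this cluster.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("mysql-jdbc-check").getOrCreate()
df = (spark.read.format("jdbc")
      .option("url", "jdbc:mysql://worker3:3306/testdb?useSSL=false")  # hypothetical database
      .option("driver", "com.mysql.cj.jdbc.Driver")                    # driver class shipped with connector 8.x
      .option("dbtable", "demo_table")                                 # hypothetical table
      .option("user", "root")
      .option("password", "your_password")
      .load())
df.show(5)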
# SHC
/bin/cp /root/shc-core-spark-2.3.2-hbase-2.0.1.jar /opt/spark/spark-2.3.2-bin-hadoop2.7/jars/
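# A minimal sketch of reading an HBase table through SHC from PySpark; the table "student" and its "info" column family are hypothetical, the catalog is illustrative only.
import json
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("shc-hbase-check").getOrCreate()
# SHC catalog: maps the HBase row key and the "info" column family to DataFrame columns.
catalog = json.dumps({
    "table": {"namespace": "default", "name": "student"},
    "rowkey": "key",
    "columns": {
        "id":   {"cf": "rowkey", "col": "key",  "type": "string"},
        "name": {"cf": "info",   "col": "name", "type": "string"}
    }
})
df = (spark.read
      .options(catalog=catalog)
      .format("org.apache.spark.sql.execution.datasources.hbase")
      .load())
df.show(5)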
# Sync the JARs to the worker nodes
scp -r /root/mysql-connector-java-8.0.16/mysql-connector-java-8.0.16.jar root@worker4:/opt/spark/spark-2.3.2-bin-hadoop2.7/jars/
scp -r /root/shc-core-spark-2.3.2-hbase-2.0.1.jar root@worker4:/opt/spark/spark-2.3.2-bin-hadoop2.7/jars/
scp -r /root/mysql-connector-java-8.0.16/mysql-connector-java-8.0.16.jar root@worker5:/opt/spark/spark-2.3.2-bin-hadoop2.7/jars/
scp -r /root/shc-core-spark-2.3.2-hbase-2.0.1.jar root@worker5:/opt/spark/spark-2.3.2-bin-hadoop2.7/jars/
# In YARN mode, the dependency JARs must be uploaded to the cluster (HDFS)
hadoop fs -mkdir -p /spark-yarn/jars
hdfs dfs -put /opt/spark/spark-2.3.2-bin-hadoop2.7/jars/* /spark-yarn/jars/
# Configure slaves
echo 'worker4
worker5' > /opt/spark/spark-2.3.2-bin-hadoop2.7/conf/slaves
# Configure the environment and conf files
cp /opt/spark/spark-2.3.2-bin-hadoop2.7/conf/spark-env.sh.template /opt/spark/spark-2.3.2-bin-hadoop2.7/conf/spark-env.sh &&\
echo '
export SPARK_DIST_CLASSPATH=$(hadoop classpath):$(hbase classpath)
export LD_LIBRARY_PATH=$HADOOP_HOME/lib/native
export SPARK_DAEMON_JAVA_OPTS="-Dspark.deploy.recoveryMode=ZOOKEEPER -Dspark.deploy.zookeeper.url=worker3:2181,worker4:2181,worker5:2181 -Dspark.deploy.zookeeper.dir=/spark"
' >> /opt/spark/spark-2.3.2-bin-hadoop2.7/conf/spark-env.sh
/bin/cp /opt/spark/spark-2.3.2-bin-hadoop2.7/conf/spark-defaults.conf.template /opt/spark/spark-2.3.2-bin-hadoop2.7/conf/spark-defaults.conf &&\
echo 'spark.yarn.jars=hdfs://NameNs/spark-yarn/jars/*.jar
spark.eventLog.enabled=false
' >> /opt/spark/spark-2.3.2-bin-hadoop2.7/conf/spark-defaults.conf
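# With the JARs in HDFS and spark.yarn.jars pointing at them, a trivial job is enough to confirm YARN mode works; this sketch assumes HADOOP_CONF_DIR is exported on the submitting node.
from pyspark.sql import SparkSession

# Executors launched on YARN should fetch the JARs referenced by spark.yarn.jars from HDFS.
spark = (SparkSession.builder
         .master("yarn")
         .appName("yarn-jars-smoke-test")
         .getOrCreate())
print(spark.range(1000).count())  # expect 1000
spark.stop()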
# Hive configuration file
cp /opt/hive/apache-hive-3.1.1-bin/conf/hive-site.xml /opt/spark/spark-2.3.2-bin-hadoop2.7/conf/hive-site.xml
scp /opt/hive/apache-hive-3.1.1-bin/conf/hive-site.xml root@worker4:/opt/spark/spark-2.3.2-bin-hadoop2.7/conf/
scp /opt/hive/apache-hive-3.1.1-bin/conf/hive-site.xml root@worker5:/opt/spark/spark-2.3.2-bin-hadoop2.7/conf/
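# With hive-site.xml under Spark's conf directory, SparkSession can use the Hive metastore; a minimal check, assuming the metastore service is already running:
from pyspark.sql import SparkSession

# enableHiveSupport() makes Spark pick up hive-site.xml and connect to the Hive metastore.
spark = (SparkSession.builder
         .appName("hive-metastore-check")
         .enableHiveSupport()
         .getOrCreate())
spark.sql("SHOW DATABASES").show()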
Start the services
# Spark-HA
[root@worker3]# /opt/spark/spark-2.3.2-bin-hadoop2.7/sbin/start-master.sh
[root@worker4]# /opt/spark/spark-2.3.2-bin-hadoop2.7/sbin/start-master.sh
[root@worker4]# /opt/spark/spark-2.3.2-bin-hadoop2.7/sbin/start-slave.sh spark://worker3:7077,worker4:7077
[root@worker5]# /opt/spark/spark-2.3.2-bin-hadoop2.7/sbin/start-slave.sh spark://worker3:7077,worker4:7077
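# To confirm which master is ALIVE and which is STANDBY, the standalone master's JSON endpoint can be polled (assuming the default web UI port 8080):
import json
from urllib.request import urlopen

# Each master's web UI exposes /json/; one node should report ALIVE, the other STANDBY.
for host in ("worker3", "worker4"):
    state = json.load(urlopen("http://%s:8080/json/" % host))
    print(host, state.get("status"))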
How it works
RDD, DataFrame, and Spark SQL
Spark Streaming and Structured Streaming
MLlib
GraphX