Hadoop-Spark Installation, How It Works, and Tuning

Installation

mkdir /opt/spark && tar -zxf /root/spark-2.3.2-bin-hadoop2.7.tgz -C /opt/spark

Configure the Services

# A few jars have to be copied into Spark's jars/ directory so these stores can be read through the Python API
# HBase
mkdir /opt/spark/spark-2.3.2-bin-hadoop2.7/jars/hbase &&\
/bin/cp $HBASE_HOME/lib/hbase*.jar /opt/spark/spark-2.3.2-bin-hadoop2.7/jars/hbase &&\
/bin/cp $HBASE_HOME/lib/guava-*.jar /opt/spark/spark-2.3.2-bin-hadoop2.7/jars/hbase &&\
/bin/cp $HBASE_HOME/lib/htrace-core*.jar /opt/spark/spark-2.3.2-bin-hadoop2.7/jars/hbase &&\
/bin/cp $HBASE_HOME/lib/protobuf-java*.jar /opt/spark/spark-2.3.2-bin-hadoop2.7/jars/hbase &&\
/bin/cp $HBASE_HOME/lib/metrics-core-*.jar /opt/spark/spark-2.3.2-bin-hadoop2.7/jars/hbase
# MySQL JDBC driver
/bin/cp /root/mysql-connector-java-8.0.16/mysql-connector-java-8.0.16.jar /opt/spark/spark-2.3.2-bin-hadoop2.7/jars/
# SHC (Spark-HBase Connector)
/bin/cp  /root/shc-core-spark-2.3.2-hbase-2.0.1.jar /opt/spark/spark-2.3.2-bin-hadoop2.7/jars/
# Sync the jars to the worker nodes
scp -r /root/mysql-connector-java-8.0.16/mysql-connector-java-8.0.16.jar root@worker4:/opt/spark/spark-2.3.2-bin-hadoop2.7/jars/
scp -r /root/shc-core-spark-2.3.2-hbase-2.0.1.jar root@worker4:/opt/spark/spark-2.3.2-bin-hadoop2.7/jars/
scp -r /root/mysql-connector-java-8.0.16/mysql-connector-java-8.0.16.jar root@worker5:/opt/spark/spark-2.3.2-bin-hadoop2.7/jars/
scp -r /root/shc-core-spark-2.3.2-hbase-2.0.1.jar root@worker5:/opt/spark/spark-2.3.2-bin-hadoop2.7/jars/
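
# With these jars in place, both stores can be read from the Python API. A minimal
# sketch, assuming a MySQL database on worker3 and an HBase table described by an SHC
# catalog -- host, database, table, column names and credentials below are placeholders:

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("jar-check").getOrCreate()

# Read a MySQL table through the bundled JDBC driver (URL/user/password are placeholders)
mysql_df = (spark.read.format("jdbc")
            .option("url", "jdbc:mysql://worker3:3306/testdb")
            .option("driver", "com.mysql.cj.jdbc.Driver")
            .option("dbtable", "orders")
            .option("user", "root")
            .option("password", "secret")
            .load())

# Read an HBase table through SHC; the catalog JSON maps HBase
# column families/qualifiers to DataFrame columns (example table)
catalog = """{
  "table": {"namespace": "default", "name": "orders"},
  "rowkey": "key",
  "columns": {
    "id":     {"cf": "rowkey", "col": "key",    "type": "string"},
    "amount": {"cf": "cf1",    "col": "amount", "type": "string"}
  }
}"""
hbase_df = (spark.read
            .options(catalog=catalog)
            .format("org.apache.spark.sql.execution.datasources.hbase")
            .load())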


# In YARN mode, the dependency jars must be uploaded to the cluster (HDFS)
hadoop fs -mkdir -p  /spark-yarn/jars
hdfs dfs -put /opt/spark/spark-2.3.2-bin-hadoop2.7/jars/* /spark-yarn/jars/

# Configure the slaves file
echo 'worker4
worker5' > /opt/spark/spark-2.3.2-bin-hadoop2.7/conf/slaves


# Configure the environment and conf files
cp /opt/spark/spark-2.3.2-bin-hadoop2.7/conf/spark-env.sh.template /opt/spark/spark-2.3.2-bin-hadoop2.7/conf/spark-env.sh &&\
echo '
export SPARK_DIST_CLASSPATH=$(hadoop classpath):$(hbase classpath)
export  LD_LIBRARY_PATH=$HADOOP_HOME/lib/native
export SPARK_DAEMON_JAVA_OPTS="-Dspark.deploy.recoveryMode=ZOOKEEPER -Dspark.deploy.zookeeper.url=worker3:2181,worker4:2181,worker5:2181 -Dspark.deploy.zookeeper.dir=/spark"
' >> /opt/spark/spark-2.3.2-bin-hadoop2.7/conf/spark-env.sh

/bin/cp /opt/spark/spark-2.3.2-bin-hadoop2.7/conf/spark-defaults.conf.template /opt/spark/spark-2.3.2-bin-hadoop2.7/conf/spark-defaults.conf &&\
echo 'spark.yarn.jars=hdfs://NameNs/spark-yarn/jars/*.jar
spark.eventLog.enabled=false
' >> /opt/spark/spark-2.3.2-bin-hadoop2.7/conf/spark-defaults.conf


# Hive configuration file (lets Spark SQL find the Hive metastore)
cp /opt/hive/apache-hive-3.1.1-bin/conf/hive-site.xml /opt/spark/spark-2.3.2-bin-hadoop2.7/conf/hive-site.xml
scp -r /opt/hive/apache-hive-3.1.1-bin/conf/hive-site.xml root@worker4:/opt/spark/spark-2.3.2-bin-hadoop2.7/conf/
scp -r /opt/hive/apache-hive-3.1.1-bin/conf/hive-site.xml root@worker5:/opt/spark/spark-2.3.2-bin-hadoop2.7/conf/
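
# With hive-site.xml under Spark's conf/, a SparkSession built with Hive support
# talks to the configured Hive metastore directly. A minimal sketch (the table
# name below is a placeholder):

from pyspark.sql import SparkSession

# enableHiveSupport() makes Spark use the metastore configured in hive-site.xml
spark = (SparkSession.builder
         .appName("hive-check")
         .enableHiveSupport()
         .getOrCreate())

spark.sql("SHOW DATABASES").show()
spark.sql("SELECT * FROM default.some_table LIMIT 10").show()  # placeholder table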

Start the Services

# Spark HA: a master on worker3 and a standby on worker4; workers register with both masters
[root@worker3]# /opt/spark/spark-2.3.2-bin-hadoop2.7/sbin/start-master.sh
[root@worker4]# /opt/spark/spark-2.3.2-bin-hadoop2.7/sbin/start-master.sh
[root@worker4]# /opt/spark/spark-2.3.2-bin-hadoop2.7/sbin/start-slave.sh spark://worker3:7077,worker4:7077
[root@worker5]# /opt/spark/spark-2.3.2-bin-hadoop2.7/sbin/start-slave.sh spark://worker3:7077,worker4:7077
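
# Once both masters and the workers are up, a session can list the two masters and
# follow whichever one ZooKeeper reports as active. A minimal sanity check, assuming
# PySpark is run from a cluster node:

from pyspark.sql import SparkSession

spark = (SparkSession.builder
         .master("spark://worker3:7077,worker4:7077")
         .appName("ha-sanity-check")
         .getOrCreate())

print(spark.range(100).count())  # should print 100 once executors are granted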

How It Works

RDD, DataFrame, and Spark SQL
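
# A minimal sketch of how the three layers relate: an RDD built from local data,
# converted to a DataFrame, and queried through Spark SQL (the sample records are
# made up for illustration):

from pyspark.sql import SparkSession, Row

spark = SparkSession.builder.appName("rdd-df-sql").getOrCreate()
sc = spark.sparkContext

# RDD: a low-level distributed collection with functional transformations
rdd = sc.parallelize([("alice", 30), ("bob", 25)]) \
        .map(lambda t: Row(name=t[0], age=t[1]))

# DataFrame: the same data with a schema, optimized by Catalyst
df = spark.createDataFrame(rdd)

# Spark SQL: register the DataFrame as a view and query it with SQL
df.createOrReplaceTempView("people")
spark.sql("SELECT name FROM people WHERE age > 26").show()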

Spark Streaming and Structured Streaming
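
# A minimal Structured Streaming sketch, assuming a text stream on a local socket
# (e.g. opened with `nc -lk 9999`): the stream is treated as an unbounded table and
# a running word count is kept:

from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split

spark = SparkSession.builder.appName("structured-streaming").getOrCreate()

# Read a socket source as an unbounded DataFrame
lines = (spark.readStream.format("socket")
         .option("host", "localhost")
         .option("port", 9999)
         .load())

# Word count over the stream, updated as new data arrives
words = lines.select(explode(split(lines.value, " ")).alias("word"))
counts = words.groupBy("word").count()

query = (counts.writeStream
         .outputMode("complete")
         .format("console")
         .start())
query.awaitTermination()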

MLlib
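
# A minimal sketch of the DataFrame-based MLlib API: assemble feature columns and
# fit a logistic regression inside a Pipeline (the toy data is made up for illustration):

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression

spark = SparkSession.builder.appName("mllib-demo").getOrCreate()

# Toy training data: two numeric features and a binary label
train = spark.createDataFrame(
    [(1.0, 0.5, 1.0), (0.2, 0.1, 0.0), (0.9, 0.8, 1.0), (0.1, 0.3, 0.0)],
    ["f1", "f2", "label"])

assembler = VectorAssembler(inputCols=["f1", "f2"], outputCol="features")
lr = LogisticRegression(featuresCol="features", labelCol="label")

model = Pipeline(stages=[assembler, lr]).fit(train)
model.transform(train).select("f1", "f2", "label", "prediction").show()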

GraphX
