build.sh:
#!/bin/bash
#
# -- Build Apache Spark Standalone Cluster Docker Images

# ----------------------------------------------------------------------------------------------------------------------
# -- Variables ---------------------------------------------------------------------------------------------------------
# ----------------------------------------------------------------------------------------------------------------------

BUILD_DATE="$(date -u +'%Y-%m-%d')"
SPARK_VERSION="3.5.4"
HADOOP_VERSION="3"
# DELTA_SPARK_VERSION="2.4.0"
# DELTALAKE_VERSION="0.10.0"
# JUPYTERLAB_VERSION="4.0.2"
# PANDAS_VERSION="2.0.1"
# Delta 3.x (the delta-spark artifact) pairs with Spark 3.5.x; the older delta-core 2.4 line only supports Spark 3.4.
DELTA_PACKAGE_VERSION="delta-spark_2.12:3.3.0"
# SPARK_VERSION_MAJOR=${SPARK_VERSION:0:1}
SPARK_XML_PACKAGE_VERSION="spark-xml_2.12:0.16.0"
# SPARKSQL_MAGIC_VERSION="0.0.3"
# KAFKA_PYTHON_VERSION="2.0.2"

# ----------------------------------------------------------------------------------------------------------------------
# -- Functions ---------------------------------------------------------------------------------------------------------
# ----------------------------------------------------------------------------------------------------------------------

function cleanContainers() {
    container="$(docker ps -a | grep 'jupyterlab' | awk '{print $1}')"
    docker stop "${container}"
    docker rm "${container}"

    container="$(docker ps -a | grep 'spark-worker' -m 1 | awk '{print $1}')"
    while [ -n "${container}" ]; do
        docker stop "${container}"
        docker rm "${container}"
        container="$(docker ps -a | grep 'spark-worker' -m 1 | awk '{print $1}')"
    done

    container="$(docker ps -a | grep 'spark-master' | awk '{print $1}')"
    docker stop "${container}"
    docker rm "${container}"

    container="$(docker ps -a | grep 'spark-base' | awk '{print $1}')"
    docker stop "${container}"
    docker rm "${container}"

    container="$(docker ps -a | grep 'base' | awk '{print $1}')"
    docker stop "${container}"
    docker rm "${container}"
}

function cleanImages() {
    docker rmi -f "$(docker images | grep -m 1 'jupyterlab' | awk '{print $3}')"
    docker rmi -f "$(docker images | grep -m 1 'spark-worker' | awk '{print $3}')"
    docker rmi -f "$(docker images | grep -m 1 'spark-master' | awk '{print $3}')"
    docker rmi -f "$(docker images | grep -m 1 'spark-base' | awk '{print $3}')"
    docker rmi -f "$(docker images | grep -m 1 'base' | awk '{print $3}')"
}

function cleanVolume() {
    docker volume rm "distributed-file-system"
}

function buildImages() {
    docker build \
        --build-arg build_date="${BUILD_DATE}" \
        -f docker/base/Dockerfile \
        -t base:latest .

    docker build \
        --build-arg build_date="${BUILD_DATE}" \
        --build-arg spark_version="${SPARK_VERSION}" \
        --build-arg hadoop_version="${HADOOP_VERSION}" \
        --build-arg delta_package_version="${DELTA_PACKAGE_VERSION}" \
        --build-arg spark_xml_package_version="${SPARK_XML_PACKAGE_VERSION}" \
        -f docker/spark-base/Dockerfile \
        -t spark-base:${SPARK_VERSION} .

    docker build \
        --build-arg build_date="${BUILD_DATE}" \
        --build-arg spark_version="${SPARK_VERSION}" \
        -f docker/spark-master/Dockerfile \
        -t spark-master:${SPARK_VERSION} .

    docker build \
        --build-arg build_date="${BUILD_DATE}" \
        --build-arg spark_version="${SPARK_VERSION}" \
        -f docker/spark-worker/Dockerfile \
        -t spark-worker:${SPARK_VERSION} .

    docker build \
        --build-arg build_date="${BUILD_DATE}" \
        --build-arg spark_version="${SPARK_VERSION}" \
        -f docker/jupyterlab/Dockerfile \
        -t jupyterlab:spark-${SPARK_VERSION} .
}

# ----------------------------------------------------------------------------------------------------------------------
# -- Main --------------------------------------------------------------------------------------------------------------
# ----------------------------------------------------------------------------------------------------------------------

cleanContainers;
cleanImages;
cleanVolume;
buildImages;
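The script is meant to be run from the repository root, next to the docker/ directory referenced by the -f flags. A minimal invocation:

chmod +x build.sh
./build.sh

Note that the clean* functions are deliberately unguarded: on a first run the docker stop/rm/rmi calls simply fail on their empty matches, and since the script does not set -e, it proceeds straight to buildImages.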
base/Dockerfile:
ARG java_image_tag=17-jre
FROM eclipse-temurin:${java_image_tag}

# -- Layer: Image Metadata

ARG build_date

LABEL org.label-schema.build-date=${build_date}
LABEL org.label-schema.description="Data Engineering with Apache Spark and Delta Lake Cookbook - Cluster base image"
LABEL org.label-schema.schema-version="1.0"

# -- Layer: OS + Python + Scala

ARG shared_workspace=/opt/workspace

RUN mkdir -p ${shared_workspace}/data
RUN mkdir -p /usr/share/man/man1
RUN apt-get update -y
RUN apt-get install -y python3 python3-dev python3-venv
RUN apt-get install -y curl r-base netcat-traditional build-essential manpages-dev
RUN apt-get clean
RUN rm -rf /var/lib/apt/lists/*
RUN python3 -m venv /opt/myenv
RUN /opt/myenv/bin/pip install --no-cache-dir --upgrade pip

# These libraries are installed unpinned; pin their versions here (see the
# commented-out version variables in build.sh) if reproducible builds matter.
RUN /opt/myenv/bin/pip install --quiet --no-cache-dir delta-spark
RUN /opt/myenv/bin/pip install --quiet --no-cache-dir deltalake
RUN /opt/myenv/bin/pip install --quiet --no-cache-dir pandas

ENV SCALA_HOME="/usr/bin/scala"
ENV PATH=/opt/myenv/bin:${PATH}:${SCALA_HOME}/bin
ENV SHARED_WORKSPACE=${shared_workspace}

# -- Runtime

VOLUME ${shared_workspace}
CMD ["bash"]
Successfully built.
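A quick smoke test of the image (base:latest is the tag assigned in build.sh; pip resolves to the /opt/myenv virtual environment because it sits first on PATH):

docker run --rm base:latest python3 --version
docker run --rm base:latest pip show delta-spark deltalake pandas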
spark-base/Dockerfile:
FROM base

# -- Layer: Image Metadata

ARG build_date
ARG delta_package_version
ARG spark_xml_package_version

LABEL org.label-schema.build-date=${build_date}
LABEL org.label-schema.description="Data Engineering with Apache Spark and Delta Lake Cookbook - Spark base image"
LABEL org.label-schema.schema-version="1.0"

# -- Layer: Apache Spark

ARG spark_version
ARG hadoop_version

RUN curl https://archive.apache.org/dist/spark/spark-${spark_version}/spark-${spark_version}-bin-hadoop${hadoop_version}.tgz -o spark.tgz
RUN tar -xf spark.tgz
RUN mv spark-${spark_version}-bin-hadoop${hadoop_version} /usr/bin/
RUN echo "alias pyspark=/usr/bin/spark-${spark_version}-bin-hadoop${hadoop_version}/bin/pyspark" >> ~/.bashrc
RUN echo "alias spark-shell=/usr/bin/spark-${spark_version}-bin-hadoop${hadoop_version}/bin/spark-shell" >> ~/.bashrc
RUN mkdir /usr/bin/spark-${spark_version}-bin-hadoop${hadoop_version}/logs
RUN rm spark.tgz

ENV SPARK_HOME=/usr/bin/spark-${spark_version}-bin-hadoop${hadoop_version}
ENV SPARK_MASTER_HOST=spark-master
ENV SPARK_MASTER_PORT=7077
ENV PYSPARK_PYTHON=python3

# -- Runtime

WORKDIR ${SPARK_HOME}

USER root
ARG NBuser=NBuser
ARG GROUP=NBuser

RUN groupadd -r ${GROUP} && useradd -r -m -g ${GROUP} ${NBuser}
RUN chown -R "${NBuser}":"${GROUP}" /home/"${NBuser}"/
RUN chown -R "${NBuser}":"${GROUP}" "${SPARK_HOME}"
RUN chown -R "${NBuser}":"${GROUP}" "${SHARED_WORKSPACE}"

USER ${NBuser}

# Pre-fetch the Delta, spark-xml, and Kafka connector jars into NBuser's ivy cache.
RUN ${SPARK_HOME}/bin/spark-shell --packages io.delta:${delta_package_version} \
    --conf "spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension" \
    --conf "spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog"
RUN ${SPARK_HOME}/bin/spark-shell --packages com.databricks:${spark_xml_package_version}
# Keep the Kafka connector in lock-step with the installed Spark version.
RUN ${SPARK_HOME}/bin/spark-shell --packages org.apache.spark:spark-sql-kafka-0-10_2.12:${spark_version}
Successfully built.
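To verify the Spark layer, spark-submit reports its version, and the connector jars pre-fetched by the spark-shell RUN steps should be sitting in NBuser's ivy cache (~/.ivy2/jars is Spark's default retrieval directory for --packages):

docker run --rm spark-base:3.5.4 bin/spark-submit --version
docker run --rm spark-base:3.5.4 ls /home/NBuser/.ivy2/jars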
spark-master/Dockerfile:
ARG spark_version
FROM spark-base:${spark_version}

# -- Layer: Image Metadata

ARG build_date

LABEL org.label-schema.build-date=${build_date}
LABEL org.label-schema.description="Spark master image"
LABEL org.label-schema.schema-version="1.0"

# -- Runtime

EXPOSE 8080 7077
CMD bin/spark-class org.apache.spark.deploy.master.Master >> logs/spark-master.out
Successfully built.
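The master image can be exercised on its own before composing the full cluster; its web UI then answers on http://localhost:8080:

docker run --rm -p 8080:8080 -p 7077:7077 --name spark-master spark-master:3.5.4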
spark-worker/Dockerfile:
ARG spark_version
FROM spark-base:${spark_version}

# -- Layer: Image Metadata

ARG build_date

LABEL org.label-schema.build-date=${build_date}
LABEL org.label-schema.description="Spark worker image"
LABEL org.label-schema.schema-version="1.0"

# -- Runtime

EXPOSE 8081
CMD bin/spark-class org.apache.spark.deploy.worker.Worker spark://${SPARK_MASTER_HOST}:${SPARK_MASTER_PORT} >> logs/spark-worker.out
Successfully built.
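A worker only registers if it can resolve spark-master (the SPARK_MASTER_HOST default inherited from spark-base), so a standalone check needs a shared user-defined network; spark-net below is an arbitrary name for this sketch:

docker network create spark-net
docker run -d --network spark-net --name spark-master -p 8080:8080 spark-master:3.5.4
docker run -d --network spark-net --name spark-worker-1 -p 8081:8081 spark-worker:3.5.4

The worker should show up as ALIVE on the master UI within a few seconds.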
jupyterlab/Dockerfile:
FROM base

# -- Layer: Image Metadata

ARG build_date

LABEL org.label-schema.build-date=${build_date}
LABEL org.label-schema.name="Data Engineering with Apache Spark and Delta Lake Cookbook - JupyterLab Image"
LABEL org.label-schema.description="JupyterLab image"

# -- Layer: Notebooks and data

# ADD docker/jupyterlab/kafka-producer.py /

# -- Layer: JupyterLab + Python kernel for PySpark

ARG spark_version

RUN pip install --no-cache-dir wget
RUN pip install --no-cache-dir pyspark==${spark_version}
RUN pip install --no-cache-dir jupyterlab
RUN pip install --no-cache-dir sparksql-magic
RUN pip install --no-cache-dir kafka-python

EXPOSE 8888

WORKDIR ${SHARED_WORKSPACE}

# COPY docker/jupyterlab/00-first.py /root/.ipython/profile_default/startup/00-first.py

CMD jupyter lab --ip=0.0.0.0 --port=8888 --no-browser --allow-root --NotebookApp.token=
Successfully built.
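Run standalone, the image serves JupyterLab without an auth token (the empty --NotebookApp.token= disables it, which is only sensible on a local machine):

docker run --rm -p 8888:8888 jupyterlab:spark-3.5.4

then browse to http://localhost:8888.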
docker-compose.yml:
volumes:
shared-workspace:
name: "distributed-file-system"
driver: local
driver_opts:
o: bind
type: none
      device: ${PWD}/docker_volumes  # the local driver needs an absolute path; Compose expands ${PWD} at up time
services:
zookeeper:
image: docker.io/bitnami/zookeeper:3.8.2
container_name: zookeeper
ports:
- "2181:2181"
volumes:
- shared-workspace:/opt/workspace
environment:
- ALLOW_ANONYMOUS_LOGIN=yes
kafka:
image: docker.io/bitnami/kafka:3.5.1
container_name: kafka
ports:
- "9092:9092"
environment:
- BITNAMI_DEBUG=yes
- KAFKA_BROKER_ID=1
- KAFKA_ENABLE_KRAFT=false
- KAFKA_CFG_LISTENERS=PLAINTEXT://kafka:9092
- KAFKA_CFG_ADVERTISED_LISTENERS=PLAINTEXT://kafka:9092
- KAFKA_CFG_ZOOKEEPER_CONNECT=zookeeper:2181
- KAFKA_CFG_INTER_BROKER_LISTENER_NAME=PLAINTEXT
- KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP=PLAINTEXT:PLAINTEXT
- ALLOW_PLAINTEXT_LISTENER=yes
depends_on:
- zookeeper
jupyterlab:
image: jupyterlab:spark-3.5.4
container_name: jupyterlab
ports:
- 8888:8888
- 4040:4040
volumes:
- shared-workspace:/opt/workspace
spark-master:
image: spark-master:3.5.4
container_name: spark-master
ports:
- 8080:8080
- 7077:7077
volumes:
- shared-workspace:/opt/workspace
spark-worker-1:
image: spark-worker:3.5.4
container_name: spark-worker-1
environment:
- SPARK_WORKER_CORES=1
- SPARK_WORKER_MEMORY=512m
ports:
- 8081:8081
volumes:
- shared-workspace:/opt/workspace
depends_on:
- spark-master
spark-worker-2:
image: spark-worker:3.5.4
container_name: spark-worker-2
environment:
- SPARK_WORKER_CORES=1
- SPARK_WORKER_MEMORY=512m
ports:
- 8082:8081
volumes:
- shared-workspace:/opt/workspace
depends_on:
- spark-master
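With all five images built, the stack comes up via Compose. The bind-mounted host directory named in driver_opts must exist before the first up:

mkdir -p docker_volumes
docker compose up -d
docker compose ps

JupyterLab is then reachable on http://localhost:8888, the Spark master UI on http://localhost:8080, and the two worker UIs on ports 8081 and 8082.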
