Reading Hive data with PySpark 3.0.0 and importing it into ArangoDB with ArangoClient
A quick note on using PySpark to read Hive data and load it into ArangoDB.
import os
import findspark

# Point Spark at the local installation and at the conda Python used by both driver and executors
os.environ['HADOOP_USER_NAME'] = 'zhisan'
os.environ['SPARK_HOME'] = '/opt/spark'
os.environ['JAVA_HOME'] = '/opt/java'
os.environ['PYTHONPATH'] = '/home/zhisan/apps/conda/miniconda3/envs/py10/bin/python'
os.environ['PYSPARK_DRIVER_PYTHON'] = '/home/zhisan/apps/conda/miniconda3/envs/py10/bin/python'
os.environ['PYSPARK_PYTHON'] = '/home/zhisan/apps/conda/miniconda3/envs/py10/bin/python'
findspark.init()

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, concat, lit
from arango import ArangoClient
# Run on YARN in client mode with Hive support enabled
spark = SparkSession.builder \
    .enableHiveSupport() \
    .appName("sparkSql") \
    .config("spark.master", "yarn") \
    .config("spark.submit.deployMode", "client") \
    .config("spark.driver.maxResultSize", "4G") \
    .getOrCreate()
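A quick way to confirm that Hive support is actually wired up before going further (an optional smoke test; gs.erp_rel_email_e is the table queried later in this note):

# List Hive databases and preview the source table through the metastore
spark.sql("SHOW DATABASES").show()
spark.sql("SELECT * FROM gs.erp_rel_email_e LIMIT 5").show(truncate=False)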
# Get a handle to an Arango collection
def get_arango_collection(db_name, collection_name):
    client = ArangoClient(hosts="http://xxx:8529", request_timeout=60 * 60 * 2)
    db = client.db(db_name, username="xxx", password="xxx")
    collection = db.collection(collection_name)
    return collection
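The helper above assumes the collection already exists. If it might not, python-arango can create it first; edge=True matters here because the documents built below carry _from/_to fields. A minimal sketch with the same placeholder host and credentials (ensure_edge_collection is not part of the original script):

def ensure_edge_collection(db_name, collection_name):
    client = ArangoClient(hosts="http://xxx:8529")
    db = client.db(db_name, username="xxx", password="xxx")
    # Create the edge collection only if it is missing
    if not db.has_collection(collection_name):
        db.create_collection(collection_name, edge=True)
    return db.collection(collection_name)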
# Bulk-write one batch of documents into an Arango collection
def write_to_arangodb(db_name, collection_name, documents):
    collection = get_arango_collection(db_name, collection_name)
    # foreachPartition passes an iterator, so materialize it before the bulk import
    collection.import_bulk(list(documents))
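If the job is re-run against a collection that already holds data, import_bulk's on_duplicate and halt_on_error arguments (both part of python-arango) control what happens on key collisions; this variant is only a sketch of one reasonable choice, not the original code:

def write_to_arangodb_idempotent(db_name, collection_name, documents):
    collection = get_arango_collection(db_name, collection_name)
    # Replace documents whose _key already exists and keep importing on per-document errors
    collection.import_bulk(list(documents), on_duplicate="replace", halt_on_error=False)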
# Target Arango database/collection and source Hive table
db_name = "gs_enterprise_graph"
collection_name = "erp_rel_email_e"
hive_db = "gs"
hive_table = "erp_rel_email_e"

# Empty the target collection before the full reload
collection = get_arango_collection(db_name, collection_name)
collection.truncate()
hive_df = spark.sql(f"SELECT * FROM {hive_db}.{hive_table}")

# Build the Arango edge fields; string columns must be joined with concat(), "+" would produce nulls
hive_df = hive_df \
    .withColumn("_id", concat(lit(f"{collection_name}/"), col("object_key"))) \
    .withColumn("_key", col("object_key")) \
    .withColumn("_from", col("from_key")) \
    .withColumn("_to", col("to_key"))
# Convert each Row to a plain dict and bulk-import the data one Spark partition at a time
df_dict = hive_df.rdd.map(lambda row: row.asDict())
df_dict.foreachPartition(lambda partition: write_to_arangodb(db_name, collection_name, partition))
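Finally, a rough completeness check; collection.count() is a python-arango call, and comparing it with the DataFrame row count is just a sanity test rather than part of the original job:

# The two numbers should match after a full, clean import
print("arango:", collection.count(), "hive:", hive_df.count())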
