
Reading Hive data with pyspark 3.0.0 and importing it into Arango with ArangoClient

A quick note on using pyspark to read Hive data and load it into Arango.

import pyspark
import os
import findspark

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, concat
from arango import ArangoClient

findspark.init()

os.environ['HADOOP_USER_NAME']='zhisan'
os.environ['SPARK_HOME']='/opt/spark'
os.environ['JAVA_HOME']='/opt/java'
os.environ['PYTHONPATH']='/home/zhisan/apps/conda/miniconda3/envs/py10/bin/python'
os.environ['PYSPARK_DRIVER_PYTHON']='/home/zhisan/apps/conda/miniconda3/envs/py10/bin/python'
os.environ['PYSPARK_PYTHON']='/home/zhisan/apps/conda/miniconda3/envs/py10/bin/python'


spark = SparkSession.builder \
    .enableHiveSupport() \
    .appName("sparkSql") \
    .config("spark.master", "yarn") \
    .config("spark.submit.deployMode", "client") \
    .config("spark.driver.maxResultSize", "4G") \
    .getOrCreate()
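
# Optional sanity check (a sketch, not part of the original script): confirm that
# the session can actually reach the Hive metastore before running the real query.
# spark.sql("SHOW DATABASES").show()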

# Get a handle to the target Arango collection.
# The client is created inside the function so it can be called both on the
# driver and on Spark executors (inside foreachPartition).
def get_arango_collection(db_name, collection_name):
    client = ArangoClient(hosts="http://xxx:8529", request_timeout=60*60*2)
    db = client.db(db_name, username="xxx", password="xxx")
    collection = db.collection(collection_name)
    return collection

# Bulk-write one batch of documents into Arango
def write_to_arangodb(db_name, collection_name, document_dict):
    collection = get_arango_collection(db_name, collection_name)
    collection.import_bulk(document_dict)
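
# Note: import_bulk also takes tuning options not used above, e.g. (a sketch):
#   collection.import_bulk(document_dict, on_duplicate="replace", halt_on_error=False)
# "on_duplicate" controls what happens when a document with the same _key already
# exists, and halt_on_error=False lets the rest of the batch continue past bad rows.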

# Target Arango database/collection and source Hive database/table
db_name = "gs_enterprise_graph"
collection_name = "erp_rel_email_e"
hive_db = "gs"
hive_table = "erp_rel_email_e"

# Empty the target collection before the full reload
collection = get_arango_collection(db_name, collection_name)
collection.truncate()

hive_df = spark.sql(f"SELECT * FROM {hive_db}.{hive_table}")
# Build the Arango system fields from the Hive columns
# (string concatenation needs concat(); "+" would be treated as numeric addition)
hive_df = hive_df \
    .withColumn("_id", concat(lit(f"{collection_name}/"), col("object_key"))) \
    .withColumn("_key", col("object_key")) \
    .withColumn("_from", col("from_key")) \
    .withColumn("_to", col("to_key"))
df_dict = hive_df.rdd.map(lambda row: row.asDict())

df_dict.foreachPartition(lambda partition: write_to_arangodb(db_name, collection_name, partition))
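
As a quick sanity check once the job finishes (a sketch, assuming the same host and credentials as above), the document count in the collection can be compared with the number of rows read from Hive:

# Compare what landed in Arango with what was read from Hive
collection = get_arango_collection(db_name, collection_name)
print("documents in Arango:", collection.count())
print("rows read from Hive:", hive_df.count())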
