Spark 编写自定义业务代码,提交到远程 YARN 集群上运行(Spark on YARN)

1、Scala代码编译打包

package scala
import org.apache.spark.sql.SparkSession
import scala.math.random

object excutpi {

  /**
   * Estimates Pi with a Monte Carlo simulation distributed over Spark.
   *
   * args(0) (optional): number of RDD partitions ("slices"); defaults to 1000.
   * Prints the estimate to stdout and stops the session before returning.
   */
  def main(args: Array[String]): Unit = {
    // Submit with:
    // spark-submit --class scala.excutpi --master yarn --queue root.default --deploy-mode client sparkhiveproject-1.0-SNAPSHOT.jar
    val session = SparkSession.builder
      .appName("Spark Pi")
      // NOTE: uncomment for local testing; remove again before submitting to YARN.
      //.master("local[*]")
      .getOrCreate()

    val partitions = args.headOption.map(_.toInt).getOrElse(1000)
    // 100k samples per partition, capped so the total fits in an Int.
    val samples = math.min(100000L * partitions, Int.MaxValue).toInt

    // Throw random darts at the [-1, 1] x [-1, 1] square and count
    // how many land inside the unit circle.
    val hits = session.sparkContext
      .parallelize(1 until samples, partitions)
      .map { _ =>
        val px = random * 2 - 1
        val py = random * 2 - 1
        if (px * px + py * py <= 1) 1 else 0
      }
      .reduce(_ + _)

    // `1 until samples` produces (samples - 1) points, hence the denominator.
    println(s"Pi is roughly ${4.0 * hits / (samples - 1)}")

    session.stop()
  }
}

  

package scala

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

object localSparkToHive {

  /**
   * Connects Spark to a remote Hive metastore and runs a few demo queries
   * against the `test_db` database.
   *
   * Fixes over the original:
   *  - removed the trailing `;` from the windowed SELECT: Spark's programmatic
   *    `spark.sql(...)` parser rejects a trailing semicolon with a ParseException;
   *  - dropped the discarded `.count()` on `USE test_db` (USE returns an empty
   *    DataFrame, so counting it only burned a Spark job for nothing);
   *  - rebuilt the SQL as a triple-quoted stripMargin string instead of
   *    `+`-concatenation with embedded `\n` escapes.
   */
  def main(args: Array[String]): Unit = {

    val spark: SparkSession = SparkSession.builder()
      .appName("Hive Integration")
      //.master("local[*]")  // enable only for local testing
      //.config("spark.sql.warehouse.dir", "hdfs://hadoop01:9000/hivedata") // map the hostname to the remote IP in the Windows hosts file
      .config("hive.metastore.uris", "thrift://hadoop01:9083") // remote metastore, see hive-site.xml: hive.metastore.uris
      // Enable Hive ACID (transactional table) support.
      .config("hive.support.concurrency", "true")
      .config("hive.txn.manager", "org.apache.hadoop.hive.ql.lockmgr.DbTxnManager")
      .config("spark.sql.storeAssignmentPolicy", "LEGACY")
      .config("spark.debug.maxToStringFields", "200")
      .enableHiveSupport()
      .getOrCreate()

    spark.sql("SHOW DATABASES").show()
    // Switch the current database; the returned DataFrame is empty, so there
    // is nothing useful to count or show.
    spark.sql("USE test_db")
    // spark.sql("show tables").show()
    // spark.sql("SELECT * FROM test_db.room1 where address like '%剑河%' ").show(10)

    // Per-gender count via a window function. NOTE: no trailing semicolon —
    // spark.sql() rejects it, unlike the spark-sql CLI.
    spark.sql(
      """SELECT T.sname, T.ctfid, T.gender,
        |       count(*) OVER (PARTITION BY T.gender) AS FM_CNT
        |  FROM test_db.room3 T
        | WHERE T.address LIKE '%上海%' AND instr(T.ctfid, '310') > 0
        | ORDER BY FM_CNT DESC LIMIT 100""".stripMargin).show()

    // Further examples, kept for reference:
    // spark.sql("select t.gender,count(*) as cnt from test_db.room3 t group by t.gender").show()
    // spark.sql("select a.gender,count(*) as cnt from test_db.room2 a " +
    //   "where length(TRIM(a.gender))>0 group by a.gender order by cnt desc").show()

    // ACID settings can also be applied per-session:
    // spark.sql("SET hive.support.concurrency=true")
    // spark.sql("SET hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager")
    // spark.sql("update test_db.employees set salary = 100.0 where id = 1").show()
    // spark.sql("select * from test_db.employees").show()

    spark.close()
  }
}

2、在 Linux 上用 spark-submit 把打包后的代码提交到 YARN 分布式集群运行

 spark-submit --class scala.excutpi  --master yarn --queue root.default --deploy-mode client sparkhiveproject-1.0-SNAPSHOT.jar 10000

spark-submit --class scala.localSparkToHive  --master yarn --queue root.default --deploy-mode client sparkhiveproject-1.0-SNAPSHOT.jar

  

posted @ 2025-12-27 21:07  ARYOUOK  阅读(6)  评论(0)    收藏  举报