Spark 编写自定义业务代码,提交到远程 YARN 集群上运行(Spark on YARN)
1、Scala代码编译打包
package scala

import org.apache.spark.sql.SparkSession
import scala.math.random

/**
 * Estimates Pi with a Monte-Carlo simulation, intended for submission to a
 * YARN cluster, e.g.:
 *
 *   spark-submit --class scala.excutpi --master yarn --queue root.dafault
 *     --deploy-mode client D:\hadoop\workCode\sparkhiveproject\target\sparkhiveproject-1.0-SNAPSHOT.jar
 *
 * Optional first argument: number of RDD partitions (defaults to 1000).
 */
object excutpi {

  def main(args: Array[String]): Unit = {
    // The master URL is supplied by spark-submit; for local testing add
    // .master("local[*]") before getOrCreate().
    val spark = SparkSession.builder
      .appName("Spark Pi")
      .getOrCreate()

    // Partition count from the CLI, falling back to 1000.
    val partitions = if (args.isEmpty) 1000 else args(0).toInt

    // Total sample count, capped so it still fits in an Int.
    val samples = math.min(100000L * partitions, Int.MaxValue).toInt

    // Throw random darts at the 2x2 square centred on the origin and count
    // how many land inside the unit circle.
    val hits = spark.sparkContext
      .parallelize(1 until samples, partitions)
      .map { _ =>
        val px = random * 2 - 1
        val py = random * 2 - 1
        if (px * px + py * py <= 1) 1 else 0
      }
      .reduce(_ + _)

    // `1 until samples` yields (samples - 1) points, hence the divisor.
    println(s"Pi is roughly ${4.0 * hits / (samples - 1)}")

    spark.stop()
  }
}
package scala

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

/**
 * Connects Spark SQL to a remote Hive metastore and runs sample queries
 * against database `test_db`.
 *
 * Submit with:
 *   spark-submit --class scala.localSparkToHive --master yarn --deploy-mode client <jar>
 */
object localSparkToHive {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .appName("Hive Integration")
      // For local runs uncomment .master("local[*]") and map the hostname
      // `hadoop01` to the remote IP in the Windows hosts file.
      //.master("local[*]")
      //.config("spark.sql.warehouse.dir", "hdfs://hadoop01:9000/hivedata")
      // Must match hive.metastore.uris in the cluster's hive-site.xml.
      .config("hive.metastore.uris", "thrift://hadoop01:9083")
      // Enable Hive ACID (transactional table) support.
      .config("hive.support.concurrency", "true")
      .config("hive.txn.manager", "org.apache.hadoop.hive.ql.lockmgr.DbTxnManager")
      .config("spark.sql.storeAssignmentPolicy", "LEGACY")
      .config("spark.debug.maxToStringFields", "200")
      .enableHiveSupport()
      .getOrCreate()

    // Query examples
    // val df = spark.sql("show tables")
    // df.printSchema()
    spark.sql("SHOW DATABASES").show()
    // USE returns an empty DataFrame; count() merely forces execution.
    spark.sql("USE test_db").count()
    // spark.sql("show tables").show()
    // spark.sql("SELECT * FROM test_db.room1 where address like '%剑河%' ").show(10)

    // NOTE: spark.sql() parses exactly ONE statement — the original query ended
    // with a trailing ';' which makes the SQL parser throw a ParseException.
    // The semicolon has been removed; the statement text is otherwise unchanged.
    spark.sql("\nSELECT T.sname,T.ctfid,T.gender,\n count(*) OVER(PARTITION BY T.gender) AS FM_CNT\n " +
      " FROM test_db.room3 T WHERE T.address like \"%上海%\" AND instr(T.ctfid,'310')>0 \nORDER BY FM_CNT DESC LIMIT 100").show()

    // df.createTempView("room2")
    // spark.sql("select * from room2_gender_out").show()
    // spark.sql("select t.gender,count(*) as cnt from test_db.room3 t group by t.gender").show()
    // spark.sql("select a.gender,count(*) as cnt from test_db.room2 a " +
    //   "where length(TRIM(a.gender))>0 group by a.gender order by cnt desc").show()

    // ACID settings can also be applied per-session instead of at build time:
    // spark.sql("SET hive.support.concurrency=true")
    // spark.sql("SET hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager")
    // spark.sql("update test_db.employees set salary = 100.0 where id = 1").show();
    // spark.sql("select * from test_db.employees").show();

    spark.close()
  }
}
2、Linux spark 把 打包代码 逻辑 提交到 yarn分布式集群进行运行
spark-submit --class scala.excutpi --master yarn --queue root.dafault --deploy-mode client sparkhiveproject-1.0-SNAPSHOT.jar 10000
spark-submit --class scala.localSparkToHive --master yarn --queue root.dafault --deploy-mode client sparkhiveproject-1.0-SNAPSHOT.jar
(注意:队列名 root.dafault 如为笔误,应改为默认队列 root.default)
自动化学习。

浙公网安备 33010602011771号