Spark Basics
Spark RDD and DataFrame
The main difference between DataFrame and RDD is that a DataFrame carries schema metadata: every column of the two-dimensional table it represents has a name and a type. This lets Spark SQL see into the structure of the data, so it can apply targeted optimizations both to the underlying data source and to the transformations performed on the DataFrame, ultimately delivering a large improvement in runtime efficiency.
With a plain RDD, Spark Core has no way of knowing the internal structure of the stored elements, so it can only perform simple, generic pipeline optimizations at the stage level. Under the hood a DataFrame is a distributed dataset built on RDDs; the key difference is that an RDD carries no schema, while every row (Row) of a DataFrame is described by one:
DataFrame = RDD[Row] + schema
package com.shujia.spark.sql

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SparkSession}

object Demo3DFAndRDD {
  def main(args: Array[String]): Unit = {
    // Entry point for Spark SQL
    val spark: SparkSession = SparkSession
      .builder()
      .master("local")
      .appName("spark")
      .getOrCreate()

    // Get the SparkContext from the session; entry point for RDD code
    val sc: SparkContext = spark.sparkContext

    val linesRDD: RDD[String] = sc.textFile("data/students.txt")
    val stuRDD: RDD[(String, String)] = linesRDD.map(line => {
      val fields: Array[String] = line.split(",")
      (fields(0), fields(1))
    })

    // Import the implicit conversions (needed for toDF)
    import spark.implicits._

    // RDD -> DataFrame
    val stuDF: DataFrame = stuRDD.toDF("id", "name")
    stuDF.show()

    // DataFrame -> RDD
    val rowRDD: RDD[Row] = stuDF.rdd
    val reRDD: RDD[(String, String)] = rowRDD.map(row => {
      // Look up fields by column name
      val id: String = row.getAs("id")
      val name: String = row.getAs("name")
      (id, name)
    })
    reRDD.foreach(println)
  }
}
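Because the schema travels with the DataFrame, it can be inspected directly; the RDD[Row] recovered from it exposes no such metadata. A minimal sketch, assuming the same spark session and stuRDD as above (the Student case class and stuDF2 are illustrative additions, not part of the original example):

// Print the column names and types that toDF("id", "name") assigned
stuDF.printSchema()
// root
//  |-- id: string (nullable = true)
//  |-- name: string (nullable = true)

// The schema is also an ordinary StructType value that can be inspected in code
stuDF.schema.fields.foreach(f => println(s"${f.name}: ${f.dataType}"))

// With a case class defined at the top level (outside main, so Spark can derive
// an encoder), toDF() infers the column names by reflection instead
case class Student(id: String, name: String)
val stuDF2: DataFrame = stuRDD.map(t => Student(t._1, t._2)).toDF()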
Phoenix SQL
-- Rebuild a secondary index (run inside the Phoenix SQL client or via psql.py)
ALTER INDEX IF EXISTS idx_bdh_energy_hid2 ON "hs_spin"."dwd_energy_platforms" REBUILD;

# Run a SQL file with psql.py: pass the ZooKeeper quorum, then the script path
./bin/psql.py 192.168.187.128,192.168.187.129,192.168.187.130:2181 user.sql

# Equivalent ways to run the index script: from the install dir, or by absolute path
cd /opt/phoenix-4.9.0-cdh5.9.1
./bin/psql.py hadoop362,hadoop363,hadoop364:2181 mesql/my_index.sql
/opt/phoenix-4.9.0-cdh5.9.1/bin/psql.py hadoop362,hadoop363,hadoop364:2181 mesql/my_index.sql
/opt/phoenix-4.9.0-cdh5.9.1/bin/psql.py hadoop362,hadoop363,hadoop364:2181 /opt/phoenix-4.9.0-cdh5.9.1/mesql/my_index.sql
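The contents of mesql/my_index.sql are not reproduced here. A file passed to psql.py is simply a sequence of Phoenix SQL statements, so a rebuild script along these lines would fit the usage shown; treat this as a sketch that just replays the ALTER INDEX statement from above:

-- my_index.sql: rebuild the secondary index so it reflects recent writes
ALTER INDEX IF EXISTS idx_bdh_energy_hid2 ON "hs_spin"."dwd_energy_platforms" REBUILD;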
Hive SQL Shell Script
#!/bin/bash
APP=dwd
hive=/usr/bin/hive
jobname="DWD_p_min_yield_val_bak"

# Use the first argument as the partition date; default to yesterday
if [ -n "$1" ]; then
    do_date=$1
else
    do_date=$(date -d "-1 day" +%F)
fi

# Rebuild the Phoenix index before loading
/opt/phoenix-4.9.0-cdh5.9.1/bin/psql.py hadoop362,hadoop363,hadoop364:2181 /opt/phoenix-4.9.0-cdh5.9.1/mesql/my_index.sql

sql="
set mapred.job.name=$jobname;
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
insert into dwd.p_min_yield_val partition(year,month,day,producer_id)
select key, id, name, host_id, model_id, dev_group, path_id, staff_id, staff_name, class_type, variety, yarn_count, class_order, class_order_alias, efficiency, factor, output_m, output_kg, output_add_m, output_add_kg, htime, crt, online, current_shift
,SUBSTR(htime,1,4) year,SUBSTR(htime,1,7) month,SUBSTR(htime,1,10) day,producer_id from hs_spin.ext_min_yield_val where to_date(htime)='$do_date';
"
echo "$sql"
$hive -e "$sql" >> /opt/soft/hive/job/dwd/log/log.$do_date
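The script takes an optional date argument and falls back to yesterday when none is given. A usage sketch, assuming it is saved as dwd_p_min_yield_val.sh (the filename is an assumption; it is not given above):

# Backfill a specific day's partitions (hypothetical script name)
sh dwd_p_min_yield_val.sh 2023-06-01
# Daily run: no argument, processes yesterday's data
sh dwd_p_min_yield_val.sh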