Spark Basics

Spark RDD and DataFrame

The main difference between a DataFrame and an RDD is that a DataFrame carries schema metadata: every column of the two-dimensional table it represents has a name and a type. This lets Spark SQL see more of the data's structure, so it can apply targeted optimizations both to the data sources behind a DataFrame and to the transformations applied on top of it, substantially improving runtime efficiency.

With an RDD, Spark Core has no way of knowing the internal structure of the stored elements, so it can only perform simple, generic pipeline optimizations at the stage level. Underneath, a DataFrame is still a distributed dataset built on RDDs; the key difference is that an RDD carries no schema, while every row (Row) of a DataFrame conforms to one.

DataFrame = RDD[Row] + schema
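This equation can be taken literally: the programmatic API builds a DataFrame by pairing an RDD[Row] with an explicit StructType. A minimal sketch (the object name, sample rows, and column names are made up for illustration):

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.sql.types.{StringType, StructField, StructType}

object Demo2RowAndSchema {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder().master("local").appName("spark").getOrCreate()

    //an RDD of untyped rows (illustrative sample data)
    val rowRDD: RDD[Row] = spark.sparkContext.parallelize(Seq(Row("1001", "zhangsan"), Row("1002", "lisi")))

    //the schema: a name and a type for every column
    val schema: StructType = StructType(Seq(
      StructField("id", StringType),
      StructField("name", StringType)
    ))

    //DataFrame = RDD[Row] + schema, assembled explicitly
    val stuDF: DataFrame = spark.createDataFrame(rowRDD, schema)
    stuDF.show()
  }
}

The full demo below shows the same relationship in both directions, converting an RDD to a DataFrame with toDF and back with .rdd: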

package com.shujia.spark.sql
 
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
 
object Demo3DFAndRDD {
  def main(args: Array[String]): Unit = {
    /**
     * entry point for Spark SQL
     */
    val spark: SparkSession = SparkSession
      .builder()
      .master("local")
      .appName("spark")
      .getOrCreate()
 
    //get the SparkContext from the session; entry point for the RDD API
    val sc: SparkContext = spark.sparkContext
    val linesRDD: RDD[String] = sc.textFile("data/students.txt")
    val stuRDD: RDD[(String, String)] = linesRDD.map(lines => {
      val strings: Array[String] = lines.split(",")
      (strings(0), strings(1))
    })
 
    //import implicit conversions (enables toDF on an RDD)
    import spark.implicits._
 
    //RDD => DataFrame
    val stuDF: DataFrame = stuRDD.toDF("id", "name")
    stuDF.show()
 
    //DataFrame => RDD
    val rowRDD: RDD[Row] = stuDF.rdd
    val reRDD: RDD[(String, String)] = rowRDD.map(row => {
      //fetch the fields by column name
      val id: String = row.getAs("id")
      val name: String = row.getAs("name")
      (id, name)
    })
    reRDD.foreach(println)
 
  }
 
}
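Inspecting the schema of stuDF confirms the metadata described above; both columns come back as strings because the source RDD held (String, String) pairs:

stuDF.printSchema()
//root
// |-- id: string (nullable = true)
// |-- name: string (nullable = true)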

Phoenix SQL

-- rebuild the secondary index after the base table has been reloaded
ALTER INDEX IF EXISTS idx_bdh_energy_hid2 ON "hs_spin"."dwd_energy_platforms" REBUILD;

# run a SQL file with psql.py against the ZooKeeper quorum
./bin/psql.py 192.168.187.128,192.168.187.129,192.168.187.130:2181 user.sql

# from the Phoenix install directory /opt/phoenix-4.9.0-cdh5.9.1,
# relative and absolute paths are equivalent:
./bin/psql.py hadoop362,hadoop363,hadoop364:2181 mesql/my_index.sql
/opt/phoenix-4.9.0-cdh5.9.1/bin/psql.py hadoop362,hadoop363,hadoop364:2181 /opt/phoenix-4.9.0-cdh5.9.1/mesql/my_index.sql
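psql.py is the command-line route; the same statement can also be issued through Phoenix's JDBC driver. A minimal sketch in Scala, assuming the phoenix-client jar is on the classpath and the same ZooKeeper quorum as above (the object name is made up):

import java.sql.{Connection, DriverManager, Statement}

object RebuildPhoenixIndex {
  def main(args: Array[String]): Unit = {
    //Phoenix JDBC URL format: jdbc:phoenix:<zookeeper quorum>:<zk port>
    val conn: Connection = DriverManager.getConnection("jdbc:phoenix:hadoop362,hadoop363,hadoop364:2181")
    try {
      val stmt: Statement = conn.createStatement()
      stmt.execute("""ALTER INDEX IF EXISTS idx_bdh_energy_hid2 ON "hs_spin"."dwd_energy_platforms" REBUILD""")
      stmt.close()
    } finally {
      conn.close()
    }
  }
}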




Hive SQL shell script

#!/bin/bash
APP=dwd
hive=/usr/bin/hive
jobname="DWD_p_min_yield_val_bak"
# take the run date from the first argument, defaulting to yesterday
if [ -n "$1" ] ;then
    do_date=$1
else
    do_date=`date -d "-1 day" +%F`
fi

# refresh the Phoenix indexes before loading
/opt/phoenix-4.9.0-cdh5.9.1/bin/psql.py hadoop362,hadoop363,hadoop364:2181 /opt/phoenix-4.9.0-cdh5.9.1/mesql/my_index.sql


sql="
set mapred.job.name=$jobname;
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
insert into dwd.p_min_yield_val partition(year,month,day,producer_id)
select key, id, name, host_id, model_id, dev_group, path_id, staff_id, staff_name, class_type, variety, yarn_count, class_order, class_order_alias, efficiency, factor, output_m, output_kg, output_add_m, output_add_kg, htime, crt, online, current_shift
,SUBSTR(htime,1,4) year,SUBSTR(htime,1,7) month,SUBSTR(htime,1,10) day,producer_id from hs_spin.ext_min_yield_val where to_date(htime)='$do_date';
"
echo "$sql"

$hive -e "$sql" >> /opt/soft/hive/job/dwd/log/log.$do_date
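For reference, the same dynamic-partition insert can be driven from Spark instead of the hive CLI, since a Hive-enabled SparkSession accepts the identical SET options and SQL. A hedged sketch, assuming the tables above are registered in the shared metastore (the object name is made up):

import org.apache.spark.sql.SparkSession

object DwdMinYieldValJob {
  def main(args: Array[String]): Unit = {
    //same default as the shell script: yesterday, formatted yyyy-MM-dd
    val doDate: String = if (args.nonEmpty) args(0) else java.time.LocalDate.now().minusDays(1).toString

    val spark: SparkSession = SparkSession.builder()
      .appName("DWD_p_min_yield_val_bak")
      .enableHiveSupport()
      .getOrCreate()

    spark.sql("set hive.exec.dynamic.partition=true")
    spark.sql("set hive.exec.dynamic.partition.mode=nonstrict")
    spark.sql(
      s"""insert into dwd.p_min_yield_val partition(year,month,day,producer_id)
         |select key, id, name, host_id, model_id, dev_group, path_id, staff_id, staff_name,
         |class_type, variety, yarn_count, class_order, class_order_alias, efficiency, factor,
         |output_m, output_kg, output_add_m, output_add_kg, htime, crt, online, current_shift,
         |SUBSTR(htime,1,4) year, SUBSTR(htime,1,7) month, SUBSTR(htime,1,10) day, producer_id
         |from hs_spin.ext_min_yield_val where to_date(htime)='$doDate'""".stripMargin)
  }
}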

