Parquet supports schema merging:
1. It applies to multiple Parquet files whose schemas are mutually compatible and can therefore be merged.
2. Two ways to enable automatic merging (a setConf sketch follows this list):
1) Set the data source option mergeSchema to true when reading Parquet files.
2) Set spark.sql.parquet.mergeSchema to true via SQLContext.setConf().
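As a minimal sketch of option 2), assuming Spark 1.x and a placeholder path /tmp/students: the flag is set once on the SQLContext, and every subsequent Parquet read then merges schemas without a per-read option.

import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object MergeConf {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setAppName("MergeConf").setMaster("local[*]"))
    val sqlContext = new SQLContext(sc)
    // Global switch: equivalent to option("mergeSchema", "true") on every read
    sqlContext.setConf("spark.sql.parquet.mergeSchema", "true")
    // Placeholder path; any directory of Parquet files with compatible schemas works
    sqlContext.read.parquet("/tmp/students").printSchema()
  }
}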
package day02
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}
object Merge {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("Merge").setMaster("local[*]")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    // Implicit conversions, needed for toDF
    import sqlContext.implicits._
    val studentsWithNameAge = Array(("JIEK", 19), ("TOM", 34))
    val studentsWithNameAgeDF =
      sc.parallelize(studentsWithNameAge, 2).toDF("name", "age")
    // Append to a shared Parquet directory
    studentsWithNameAgeDF.write
      .mode("append").format("parquet")
      .save("hdfs://master:9000/students")
    val studentsWithNameGrade = Array(("MERRAY", "A"), ("MATH", "B"))
    val studentsWithNameGradeDF =
      sc.parallelize(studentsWithNameGrade, 2).toDF("name", "grade")
    // Append to the same Parquet directory
    studentsWithNameGradeDF.write
      .mode("append").format("parquet")
      .save("hdfs://master:9000/students")
    // The first DataFrame's schema (name, age) differs from the
    // second's (name, grade), so merge the schemas when reading
    val students = sqlContext.read.option("mergeSchema", "true")
      .parquet("hdfs://master:9000/students")
    students.show()
    students.printSchema()
    /*
    +------+----+-----+
    |  name| age|grade|
    +------+----+-----+
    |MERRAY|null|    A|
    |  JIEK|  19| null|
    |  MATH|null|    B|
    |   TOM|  34| null|
    +------+----+-----+
    root
     |-- name: string (nullable = true)
     |-- age: integer (nullable = true)
     |-- grade: string (nullable = true)
    */
  }
}
Spark SQL on Hive
The difference between saveAsTable and registerTempTable (a sketch follows this list):
1. Spark can persist data into Hive tables: saveAsTable writes a DataFrame to a Hive table, creating the table metadata and also materializing the data physically in the table.
2. registerTempTable only registers a temporary table; nothing is materialized, and the table is lost when the Spark application restarts.
3. Copy hive-site.xml into spark/conf, and the MySQL connector JAR into spark/jars.
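As a minimal sketch of the contrast in points 1 and 2, using the Spark 1.x API (the table names students_tmp and students_persisted are assumptions for illustration):

import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}

object TempVsTable {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setAppName("TempVsTable").setMaster("local[*]"))
    val hiveContext = new HiveContext(sc)
    import hiveContext.implicits._
    val df = sc.parallelize(Seq(("JIEK", 19), ("TOM", 34))).toDF("name", "age")
    // registerTempTable: metadata only, visible to this application, lost on restart
    df.registerTempTable("students_tmp")
    hiveContext.sql("select * from students_tmp where age > 20").show()
    // saveAsTable: metadata plus data materialized in the Hive warehouse,
    // still queryable after the application restarts
    df.write.saveAsTable("students_persisted")
  }
}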
package day02
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}
object HiveSparkSql {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("hive-sql")
      .setMaster("local[*]")
    val sc = new SparkContext(conf)
    // HiveContext is constructed from the SparkContext
    val hiveContext = new HiveContext(sc)
    hiveContext.sql("drop table if exists student_infos")
    hiveContext.sql(
      "create table if not exists student_infos(name string, age int) " +
        "row format delimited fields terminated by '\\t'")
    hiveContext.sql(
      "load data local inpath '/home/hadoop/student_infos.txt' " +
        "into table student_infos")
    hiveContext.sql("drop table if exists student_scores")
    hiveContext.sql(
      "create table if not exists student_scores(name string, score int) " +
        "row format delimited fields terminated by '\\t'")
    hiveContext.sql(
      "load data local inpath '/home/hadoop/student_scores.txt' " +
        "into table student_scores")
    // Join the two tables and keep students scoring at least 80
    val goodStudentDf = hiveContext.sql(
      "select si.name, si.age, ss.score from student_infos si " +
        "join student_scores ss on si.name = ss.name " +
        "where ss.score >= 80")
    hiveContext.sql("drop table if exists good_student_infos")
    goodStudentDf.write.mode(SaveMode.Overwrite).saveAsTable("good_student_infos")
    val goodStudentRows = hiveContext.table("good_student_infos").collect()
    for (goodStudentRow <- goodStudentRows) {
      println(goodStudentRow)
    }
  }
}
Several ways to read a JDBC data source from Scala (a write-back sketch follows the example)
package Day3
import java.util.Properties
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}
object Jdbc {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[4]").setAppName("JDBC")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    val lowerBound = 1      // lower bound of the partition column
    val upperBound = 100000 // upper bound of the partition column
    val numPartitions = 5   // number of partitions
    val url = "jdbc:mysql://localhost:3306/my_db?user=root&password=x"
    val prop = new Properties()
    // Variant 1: Properties plus an explicit partition column, bounds, and partition
    // count. The partition column must be numeric, so partition on age rather than
    // name; the bounds must be known up front, which is awkward for very large tables.
    val df = sqlContext.read.jdbc(
      url, "student_info", "age",
      lowerBound, upperBound, numPartitions, prop)
    // Variant 2: each predicate string becomes the WHERE clause of one partition
    val predicates = Array[String]("age < 19")
    val df1 = sqlContext.read.jdbc(url, "student_info", predicates, prop)
    // Variant 3: format(...).load(); format also supports json, orc, etc.
    // The options map is equivalent to the jdbc(...) calls above;
    // dbtable names the table to read
    val df2 = sqlContext.read.format("jdbc").options(
      Map("url" -> "jdbc:mysql://localhost:3306/my_db?user=root&password=x",
        "dbtable" -> "student_info")).load()
    // collect() brings the rows to the driver so println runs locally
    df.collect().foreach(println)
    /*
    [Len,18] [Jieaak,14] [Qang,57]
    */
    df1.collect().foreach(println)
    /*
    [Len,18] [Jieaak,14]
    */
    df2.collect().foreach(println)
    /*
    [Len,18] [Jieaak,14] [Qang,57]
    */
  }
}
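The examples above all read over JDBC; writing a DataFrame back is symmetric via DataFrameWriter.jdbc. A minimal sketch, assuming the same my_db database and a hypothetical target table student_info_copy:

import java.util.Properties
import org.apache.spark.sql.{SQLContext, SaveMode}
import org.apache.spark.{SparkConf, SparkContext}

object JdbcWrite {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setMaster("local[4]").setAppName("JDBC-write"))
    val sqlContext = new SQLContext(sc)
    val url = "jdbc:mysql://localhost:3306/my_db?user=root&password=x"
    val prop = new Properties()
    // Read the source table, then write it back to a new table over JDBC
    val df = sqlContext.read.jdbc(url, "student_info", prop)
    df.write.mode(SaveMode.Overwrite)
      .jdbc(url, "student_info_copy", prop) // hypothetical target table
  }
}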