Spark SQL实战
一、程序
package sparklearning

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext
import org.apache.spark.storage.StorageLevel

/**
 * Demo job: loads user and trade records from space-separated text files,
 * registers them as temp tables, and reports statistics for trades in 2016.
 */
object OnLineTradeStatistics {

  // Schema of on_line_trade_user.txt (one user per line, space-separated).
  // NOTE: "provice" is a typo for "province"; kept as-is because it becomes
  // the DataFrame column name and renaming would change the "user" table schema.
  case class User(userID: String, gender: String, age: Int, registerDate: String, provice: String, career: String)

  // Schema of on_line_trade_detail.txt (one trade per line, space-separated).
  case class TradeDetail(tradeID: String, tradeDate: String, productID: Int, amount: Int, userID: String)

  def main(args: Array[String]): Unit = {

    // Silence noisy framework logging so program output is readable.
    Logger.getLogger("org.apache.hadoop").setLevel(Level.ERROR)
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)

    // Local-mode Spark application setup (old SQLContext API, pre-SparkSession).
    val conf = new SparkConf().setAppName("On Line Trade Data").setMaster("local")
    val ctx = new SparkContext(conf)
    val sqlCtx = new SQLContext(ctx)
    import sqlCtx.implicits._

    // Read user file: RDD[String] -> RDD[User] -> DataFrame, then register
    // as temp table "user" and cache (serialized, memory-only) for reuse.
    val userDF = ctx.textFile("/home/hadoop/data/on_line_trade_user.txt")
      .map(_.split(" "))
      .map(u => User(u(0), u(1), u(2).toInt, u(3), u(4), u(5)))
      .toDF()
    userDF.registerTempTable("user")
    userDF.persist(StorageLevel.MEMORY_ONLY_SER)

    // Same pipeline for the trade detail file -> temp table "trade".
    val tradeDF = ctx.textFile("/home/hadoop/data/on_line_trade_detail.txt")
      .map(_.split(" "))
      .map(u => TradeDetail(u(0), u(1), u(2).toInt, u(3).toInt, u(4)))
      .toDF()
    tradeDF.registerTempTable("trade")
    tradeDF.persist(StorageLevel.MEMORY_ONLY_SER)

    // BUG FIX: the original computed a row count but printed it with the
    // label "2016 total money", conflating trade count with monetary total.
    // Report the count under an accurate label...
    val countOfTrade2016 = sqlCtx.sql("SELECT * FROM trade where tradeDate like '2016%'").count()
    println("2016 total trades: " + countOfTrade2016)

    // ...and compute the actual total money (sum of amount) for 2016,
    // which is what the original message intended to report.
    val moneyOfTrade2016 = sqlCtx.sql("SELECT SUM(amount) FROM trade where tradeDate like '2016%'").first().get(0)
    println("2016 total money: " + moneyOfTrade2016)

    // Release the SparkContext (the original leaked it on exit).
    ctx.stop()
  }
}
二、结果
（注：原文此处未给出程序的实际运行输出；运行后应打印 2016 年交易统计结果。）
当神已无能为力,那便是魔渡众生