瞌睡中的葡萄虎

博客园 首页 新随笔 联系 订阅 管理

在最新的master分支上官方提供了Spark JDBC外部数据源的实现,先尝为快。

通过spark-shell测试

import org.apache.spark.sql.SQLContext  
val sqlContext  = new SQLContext(sc)
import sqlContext._

val TBLS_JDBC_DDL = s"""
|CREATE TEMPORARY TABLE spark_tbls
|USING org.apache.spark.sql.jdbc
|OPTIONS (
|  url    'jdbc:mysql://hadoop000:3306/hive?user=root&password=root',
|  dbtable     'TBLS'
|)""".stripMargin

sqlContext.sql(TBLS_JDBC_DDL)

 

指定列查询:

sql("SELECT * FROM spark_tbls").collect.foreach(println)
[1,1423100397,1,0,spark,0,1,page_views,MANAGED_TABLE,A,D] [6,1423116106,1,0,spark,0,6,order_created,MANAGED_TABLE,B,E] [7,1423116131,1,0,spark,0,7,test_load1,MANAGED_TABLE,C,F] [8,1423116145,1,0,spark,0,8,order_picked,MANAGED_TABLE,null,null] [9,1423116160,1,0,spark,0,9,order_shipped,MANAGED_TABLE,null,null] [10,1423116168,1,0,spark,0,10,order_received,MANAGED_TABLE,null,null] [11,1423116179,1,0,spark,0,11,order_cancelled,MANAGED_TABLE,null,null] [12,1423116193,1,0,spark,0,12,order_tracking,MANAGED_TABLE,null,null] [13,1423116248,1,0,spark,0,13,order_tracking_join,MANAGED_TABLE,null,null] [14,1423116298,1,0,spark,0,14,click_log,MANAGED_TABLE,null,null] [15,1423116316,1,0,spark,0,15,ad_list,MANAGED_TABLE,null,null][16,1423116324,1,0,spark,0,16,ad_list_string,MANAGED_TABLE,null,null] [17,1423116338,1,0,spark,0,17,cookie_cats,MANAGED_TABLE,null,null]

 

查询表中指定列:

sql("SELECT TBL_ID,TBL_NAME,TBL_TYPE FROM spark_tbls").collect.foreach(println)
[1,page_views,MANAGED_TABLE] [6,order_created,MANAGED_TABLE] [7,test_load1,MANAGED_TABLE] [8,order_picked,MANAGED_TABLE] [9,order_shipped,MANAGED_TABLE] [10,order_received,MANAGED_TABLE] [11,order_cancelled,MANAGED_TABLE] [12,order_tracking,MANAGED_TABLE] [13,order_tracking_join,MANAGED_TABLE] [14,click_log,MANAGED_TABLE] [15,ad_list,MANAGED_TABLE] [16,ad_list_string,MANAGED_TABLE] [17,cookie_cats,MANAGED_TABLE]

 

指定查询条件查询:

sql("SELECT TBL_ID,TBL_NAME,TBL_TYPE FROM spark_tbls WHERE TBL_ID = 1").collect.foreach(println)
[1,page_views,MANAGED_TABLE]

sql("SELECT TBL_ID,TBL_NAME,TBL_TYPE FROM spark_tbls WHERE TBL_ID < 7").collect.foreach(println)
[1,page_views,MANAGED_TABLE]
[6,order_created,MANAGED_TABLE]

sql("SELECT TBL_ID,TBL_NAME,TBL_TYPE FROM spark_tbls WHERE TBL_ID <= 7").collect.foreach(println)
[1,page_views,MANAGED_TABLE]
[6,order_created,MANAGED_TABLE]
[7,test_load1,MANAGED_TABLE]

sql("SELECT TBL_ID,TBL_NAME,TBL_TYPE FROM spark_tbls WHERE TBL_ID > 7").collect.foreach(println)
[8,order_picked,MANAGED_TABLE]
[9,order_shipped,MANAGED_TABLE]
[10,order_received,MANAGED_TABLE]
[11,order_cancelled,MANAGED_TABLE]
[12,order_tracking,MANAGED_TABLE]
[13,order_tracking_join,MANAGED_TABLE]
[14,click_log,MANAGED_TABLE]
[15,ad_list,MANAGED_TABLE]
[16,ad_list_string,MANAGED_TABLE]
[17,cookie_cats,MANAGED_TABLE]

sql("SELECT TBL_ID,TBL_NAME,TBL_TYPE FROM spark_tbls WHERE TBL_ID >= 7").collect.foreach(println)
[7,test_load1,MANAGED_TABLE]
[8,order_picked,MANAGED_TABLE]
[9,order_shipped,MANAGED_TABLE]
[10,order_received,MANAGED_TABLE]
[11,order_cancelled,MANAGED_TABLE]
[12,order_tracking,MANAGED_TABLE]
[13,order_tracking_join,MANAGED_TABLE]
[14,click_log,MANAGED_TABLE]
[15,ad_list,MANAGED_TABLE]
[16,ad_list_string,MANAGED_TABLE]
[17,cookie_cats,MANAGED_TABLE]

sql("SELECT TBL_ID,TBL_NAME,TBL_TYPE,VIEW_EXPANDED_TEXT FROM spark_tbls WHERE VIEW_EXPANDED_TEXT IS NULL").collect.foreach(println)
[8,order_picked,MANAGED_TABLE,null]
[9,order_shipped,MANAGED_TABLE,null]
[10,order_received,MANAGED_TABLE,null]
[11,order_cancelled,MANAGED_TABLE,null]
[12,order_tracking,MANAGED_TABLE,null]
[13,order_tracking_join,MANAGED_TABLE,null]
[14,click_log,MANAGED_TABLE,null]
[15,ad_list,MANAGED_TABLE,null]
[16,ad_list_string,MANAGED_TABLE,null]
[17,cookie_cats,MANAGED_TABLE,null]

sql("SELECT TBL_ID,TBL_NAME,TBL_TYPE,VIEW_EXPANDED_TEXT FROM spark_tbls WHERE VIEW_EXPANDED_TEXT IS NOT NULL").collect.foreach(println)
[1,page_views,MANAGED_TABLE,A]
[6,order_created,MANAGED_TABLE,B]
[7,test_load1,MANAGED_TABLE,C]

sql("SELECT TBL_ID,TBL_NAME,TBL_TYPE,VIEW_EXPANDED_TEXT FROM spark_tbls WHERE TBL_ID>=7 AND TBL_ID <=10").collect.foreach(println)
[7,test_load1,MANAGED_TABLE,C]
[8,order_picked,MANAGED_TABLE,null]
[9,order_shipped,MANAGED_TABLE,null]
[10,order_received,MANAGED_TABLE,null]

 

多partition并行执行: 可以通过http://hadoop000:4040/jobs/的tasks数查看

val TBLS_PARTS_JDBC_DDL = s"""
|CREATE TEMPORARY TABLE spark_tbls_parts
|USING org.apache.spark.sql.jdbc
|OPTIONS (
|  url    'jdbc:mysql://hadoop000:3306/hive?user=root&password=root',
|  dbtable     'TBLS',
|  partitionColumn 'TBL_ID', 
|  lowerBound '1', 
|  upperBound '50', 
|  numPartitions '3'
|)""".stripMargin

sqlContext.sql(TBLS_PARTS_JDBC_DDL)
sql("SELECT TBL_ID,TBL_NAME,TBL_TYPE,VIEW_EXPANDED_TEXT FROM spark_tbls_parts WHERE VIEW_EXPANDED_TEXT IS NULL").collect.foreach(println)
[8,order_picked,MANAGED_TABLE,null] [9,order_shipped,MANAGED_TABLE,null] [10,order_received,MANAGED_TABLE,null] [11,order_cancelled,MANAGED_TABLE,null] [12,order_tracking,MANAGED_TABLE,null] [13,order_tracking_join,MANAGED_TABLE,null] [14,click_log,MANAGED_TABLE,null] [15,ad_list,MANAGED_TABLE,null] [16,ad_list_string,MANAGED_TABLE,null] [17,cookie_cats,MANAGED_TABLE,null] [21,emp,MANAGED_TABLE,null] [22,dept,MANAGED_TABLE,null]

 

多表关联查询:

val COLUMNS_V2_JDBC_DDL = s"""
|CREATE TEMPORARY TABLE spark_column_v2
|USING org.apache.spark.sql.jdbc
|OPTIONS (
|  url    'jdbc:mysql://hadoop000:3306/hive?user=root&password=root',
|  dbtable     'COLUMNS_V2'
|)""".stripMargin

sqlContext.sql(COLUMNS_V2_JDBC_DDL)
sql("SELECT CD_ID, COLUMN_NAME FROM spark_column_v2").collect.foreach(println)
[1,city_id] [1,end_user_id] [1,ip] [1,referer] [1,session_id] [1,track_time] [1,url] [6,event_time] [6,ordernumber] [7,id] [7,name] [8,event_time] [8,ordernumber] [9,event_time] [9,ordernumber] [10,event_time] [10,ordernumber] [11,event_time] [11,ordernumber] [12,order_cancelled_ts] [12,order_created_ts] [12,order_picked_ts] [12,order_received_ts] [12,order_shipped_ts] [12,ordernumber] [13,order_cancelled_ts] [13,order_created_ts] [13,order_picked_ts] [13,order_received_ts] [13,order_shipped_ts] [13,ordernumber] [14,ad_id] [14,cookie_id] [14,ts] [15,ad_id] [15,catalogs] [15,url] [16,ad_id] [16,catalogs] [16,url] [17,catalog] [17,cookie_id] [17,weight] [21,comm] [21,deptno] [21,empno] [21,ename] [21,hiredate] [21,job] [21,mgr] [21,sal] [22,deptno] [22,dname] [22,loc] sql("SELECT a.TBL_ID, a.TBL_NAME, a.TBL_TYPE, b.CD_ID, b.COLUMN_NAME FROM spark_tbls a join spark_column_v2 b on a.TBL_ID = b.CD_ID WHERE a.TBL_ID = 1").collect.foreach(println)
[1,page_views,MANAGED_TABLE,1,city_id] [1,page_views,MANAGED_TABLE,1,end_user_id] [1,page_views,MANAGED_TABLE,1,ip] [1,page_views,MANAGED_TABLE,1,referer] [1,page_views,MANAGED_TABLE,1,session_id] [1,page_views,MANAGED_TABLE,1,track_time] [1,page_views,MANAGED_TABLE,1,url] sql("SELECT a.TBL_ID, COUNT(b.CD_ID) FROM spark_tbls a join spark_column_v2 b on a.TBL_ID = b.CD_ID GROUP BY a.TBL_ID").collect.foreach(println)
[1,7] [6,2] [7,2] [8,2] [9,2] [10,2] [11,2] [12,6] [13,6] [14,3] [15,3] [16,3] [17,3] [21,8] [22,3]

 

通过spark-sql测试

CREATE TEMPORARY TABLE spark_tbls
USING org.apache.spark.sql.jdbc
OPTIONS (
url    'jdbc:mysql://hadoop000:3306/hive?user=root&password=root',
dbtable     'TBLS'
);
SELECT * FROM spark_tbls;

CREATE TEMPORARY TABLE spark_tbls_parts USING org.apache.spark.sql.jdbc OPTIONS ( url 'jdbc:mysql://hadoop000:3306/hive?user=root&password=root', dbtable 'TBLS', partitionColumn 'TBL_ID', lowerBound '1', upperBound '50', numPartitions '3' ); SELECT * FROM spark_tbls_parts;
CREATE TEMPORARY TABLE spark_column_v2 USING org.apache.spark.sql.jdbc OPTIONS ( url 'jdbc:mysql://hadoop000:3306/hive?user=root&password=root', dbtable 'COLUMNS_V2' ); select * from spark_column_v2; SELECT a.TBL_ID, a.TBL_NAME, a.TBL_TYPE, b.CD_ID, b.COLUMN_NAME FROM spark_tbls a join spark_column_v2 b on a.TBL_ID = b.CD_ID WHERE a.TBL_ID = 1

 

posted on 2015-02-05 15:24  瞌睡中的葡萄虎  阅读(1391)  评论(0编辑  收藏  举报