R通过RJDBC连接Hive相关

连接方式1

	require("DBI")
	require("rJava")
	require("RJDBC")
	
	username <- '******'
	password <- '******'
	
	# .jclassLoader()$setDebug(1L)
	cp <- dir("/opt/cloudera/parcels/CDH/jars", full.names = TRUE)
	.jinit(classpath = cp) 
	options( java.parameters = "-Xmx8g" )
	drv <- JDBC("org.apache.hive.jdbc.HiveDriver", "/opt/cloudera/parcels/CDH/lib/hive/lib/hive-jdbc.jar", identifier.quote="`")
	# make sure disconnect the connection after the query
	conn <- dbConnect(drv, "jdbc:hive2://172.156.188.28:10000/default", username, password)
	options(width = 160)
	
	show_databases <- dbGetQuery(conn, 'select * from library limit 10')
	head(show_databases)
	
	library(knitr)
	kable(show_databases)
	
	dbDisconnect(conn)

	### http://172.16.18.2/

连接方式2

	library("DBI")
	library("rJava")
	library("RJDBC")
	
	username <- '******'
	password <- '******'

	for(l in list.files('/opt/cloudera/parcels/CDH/jars/', full.names = TRUE)){ .jaddClassPath(l)}
	
	drv <- JDBC("org.apache.hive.jdbc.HiveDriver", "/opt/cloudera/parcels/CDH/lib/hive/lib/hive-jdbc-standalone.jar")
	
	for(l in list.files('/opt/org/apache/hadoop/2.7.3/share/hadoop/common/', full.names = TRUE)){ .jaddClassPath(l)}
	
	conn <- dbConnect(drv, "jdbc:hive2://172.156.188.28:10000/default", username, password)
	
	library(knitr)
	kable(show_databases) # table's meta data, for gollum wiki
	
	## Get data from hive
	show_databases <- dbGetQuery(conn, 'select * from library limit 10')
	
	head(show_databases)
	
	library(knitr)
	kable(show_databases)
	
	dbDisconnect(conn)

连接方式-函数

数据量太大问题

Have you tired to make increase the memory to 4g instead of 1g. I mean set options for the java parameters to be -Xmx4g instead of -Xmx1g

Simply change the java.parameters to 8GB before load RJDBC package:

	# options(java.parameters = "-Xmx8048m")
	# library("RJDBC")

You can increase the memory as below and restart your Rstudio and it worked for me.

	# memory.limit(size=10000000000024)

函数

	get_hive_data <- function(oUsername, oPassword, oSql){
	  
	  # library(DBI)
	  # library(rJava)
	  # library(RJDBC)
	  options( java.parameters = "-Xmx8048m" )
	  require(DBI)
	  require(rJava)
	  require(RJDBC)
	  username <- oUsername
	  password <- oPassword
	  cp <- dir("/opt/cloudera/parcels/CDH/jars", full.names = TRUE)
	  .jinit(classpath = cp) 
	  # options( java.parameters = "-Xmx8g" )
	
	  drv <- JDBC("org.apache.hive.jdbc.HiveDriver", "/opt/cloudera/parcels/CDH/lib/hive/lib/hive-jdbc.jar", identifier.quote="`")
	  # make sure disconnect the connection after the query
	  conn <- dbConnect(drv, "jdbc:hive2://172.156.188.28:10000/default", username, password)
	  options(width = 160)
	  result <- dbGetQuery(conn, oSql)
	  dbDisconnect(conn)
	  return(result)
	  
	}
	
	oUsername <- '******'
	oPassword <- '******'
	oSql <- 'select * from library limit 1000000'
	# oSql <- 'select count(*) from library'
	test_data <- get_hive_data(oUsername, oPassword, oSql)
	dim(test_data)
	head(test_data)
	
	library(knitr)
	kable(test_data)
posted @ 2018-04-19 15:16  银河统计  阅读(783)  评论(0编辑  收藏  举报