Phoenix secondary index: rebuilding an index with REBUILD
-- Syntax
ALTER INDEX IF EXISTS my_idx ON my_table REBUILD;
-- Example
ALTER INDEX IF EXISTS idx_bdh_energy_hid2 ON "hs_spin"."dwd_energy_platforms" REBUILD;
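The same rebuild can also be triggered from code through the Phoenix JDBC driver. A minimal sketch, assuming the same ZooKeeper quorum and znode as the JDBC example later in this post:

import java.sql.DriverManager

object RebuildIndex {
  def main(args: Array[String]): Unit = {
    Class.forName("org.apache.phoenix.jdbc.PhoenixDriver")
    // assumed connection string; adjust the ZooKeeper quorum and znode to your cluster
    val url = "jdbc:phoenix:z-001,z-002,z-003:2181:/hbase-unsecure"
    val conn = DriverManager.getConnection(url)
    val stmt = conn.createStatement()
    try {
      // issue the same REBUILD statement as shown above
      stmt.execute("ALTER INDEX IF EXISTS my_idx ON my_table REBUILD")
    } finally {
      stmt.close()
      conn.close()
    }
  }
}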
Writing Spark results to Phoenix
POM dependencies
<dependency>
    <groupId>org.apache.phoenix</groupId>
    <artifactId>phoenix-spark</artifactId>
    <version>${phoenix.version}</version>
    <scope>provided</scope>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.phoenix/phoenix-spark -->
<dependency>
    <groupId>org.apache.phoenix</groupId>
    <artifactId>phoenix-spark</artifactId>
    <version>4.9.0-HBase-1.2</version>
</dependency>
Code
import org.apache.spark.sql.SparkSession
import org.apache.phoenix.spark._

object SparkToPhoenix {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("SparkToPhoenix")
      .master("local")
      .getOrCreate()

    // Read the source data and cast column1 to string
    val data = spark.read.parquet("path/to/parquet/file")
    val processedData = data.select("column1", "column2")
      .withColumn("column1", data("column1").cast("string"))

    // Write the result to an existing Phoenix table
    processedData.write
      .format("org.apache.phoenix.spark")
      .mode("overwrite")
      .option("table", "your_phoenix_table")
      .option("zkUrl", "your_zookeeper_url")
      .save()

    // Read a Phoenix table back as a DataFrame
    val df = spark.read
      .format("org.apache.phoenix.spark")
      .option("table", "my_table")
      .option("zkUrl", "localhost:2181")
      .load()
    df.show()

    spark.stop()
  }
}
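Note that the target Phoenix table has to exist before the write (the second example further below makes the same point). A minimal sketch that creates it over JDBC; the table name, columns and ZooKeeper URL are placeholders, not taken from the original code:

import java.sql.DriverManager

object CreatePhoenixTable {
  def main(args: Array[String]): Unit = {
    // placeholder connection string and schema; align them with the Spark job above
    val conn = DriverManager.getConnection("jdbc:phoenix:your_zookeeper_url")
    val stmt = conn.createStatement()
    try {
      stmt.executeUpdate(
        """CREATE TABLE IF NOT EXISTS your_phoenix_table (
          |  "COLUMN1" VARCHAR PRIMARY KEY,
          |  "COLUMN2" VARCHAR
          |)""".stripMargin)
    } finally {
      stmt.close()
      conn.close()
    }
  }
}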
Case 1
Index 1
import org.apache.commons.lang.ArrayUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.hbase.util.Bytes;

import java.util.Map;

/**
 * Builds the rowkey for the HBase index table by concatenating the key fields,
 * separated by a 0x00 byte, and appending the rowkey of the main table.
 */
public class RowKeyIndex {

    public static byte[] rowKeyIndex(Map<String, String> dataMap, String rowKey) {
        byte[] addFour;
        // 0x00 is used as the field separator
        byte transformChar = (byte) 0;
        byte[] transformChars = new byte[]{transformChar};

        byte[] modelIDs = Bytes.toBytes(dataMap.get("modelID"));
        byte[] addOne = ArrayUtils.addAll(modelIDs, transformChars);

        byte[] deviceIDs = Bytes.toBytes(dataMap.get("deviceID"));
        byte[] addTwo = ArrayUtils.addAll(deviceIDs, transformChars);
        byte[] oneAddTwo = ArrayUtils.addAll(addOne, addTwo);

        byte[] deviceTimes = Bytes.toBytes(dataMap.get("deviceTime"));
        byte[] addThree = ArrayUtils.addAll(deviceTimes, transformChars);

        if (StringUtils.isBlank(dataMap.get("processState"))) {
            addFour = ArrayUtils.addAll(null, transformChars);
        } else {
            byte[] processStates = Bytes.toBytes(dataMap.get("processState"));
            addFour = ArrayUtils.addAll(processStates, transformChars);
        }
        byte[] threeAddFour = ArrayUtils.addAll(addThree, addFour);

        byte[] oneToFour = ArrayUtils.addAll(oneAddTwo, threeAddFour);
        // finally append the main table rowkey so that the index row points back to it
        byte[] rowKeyBytes = Bytes.toBytes(rowKey);
        return ArrayUtils.addAll(oneToFour, rowKeyBytes);
    }
}
Index 2
// Build the index rowkey with the RowKeyIndex class
byte[] rowKeyIndex = RowKeyIndex.rowKeyIndex(dataMap, rowKey);
// Create the Put for the index table
Put putIndex = new Put(rowKeyIndex);
// Write into the HBase index table (baseService and tableNameAndDataMapIndex come from the surrounding service code)
Result resultIndex = baseService.insertData(key, tableNameAndDataMapIndex.get(key));
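The snippet above only builds the Put and then delegates to a service class that is not shown. A minimal sketch of the same idea with the plain HBase client API, calling the Java RowKeyIndex class from Scala; the quorum, index table name, column family and sample values are assumptions:

import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Put}
import org.apache.hadoop.hbase.util.Bytes
import scala.collection.JavaConverters._

object IndexTableWriter {
  def main(args: Array[String]): Unit = {
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", "z-001,z-002,z-003") // assumed quorum

    val connection = ConnectionFactory.createConnection(conf)
    val table = connection.getTable(TableName.valueOf("my_index_table")) // hypothetical index table
    try {
      // build the composite rowkey exactly as RowKeyIndex does
      val dataMap = Map(
        "modelID"      -> "M001",
        "deviceID"     -> "D001",
        "deviceTime"   -> "20240101120000",
        "processState" -> "1").asJava
      val mainRowKey = "mainTableRowKey" // hypothetical rowkey of the main table row
      val rowKeyIndex = RowKeyIndex.rowKeyIndex(dataMap, mainRowKey)

      // the index row carries its information in the rowkey; store a small cell pointing back to the main row
      val put = new Put(rowKeyIndex)
      put.addColumn(Bytes.toBytes("f"), Bytes.toBytes("rk"), Bytes.toBytes(mainRowKey))
      table.put(put)
    } finally {
      table.close()
      connection.close()
    }
  }
}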
Index 3
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.Properties;

public class PhoenixJdbcQuery {
    public static void main(String[] args) {
        Properties props = new Properties();
        props.setProperty("phoenix.query.timeoutMs", "1200000");
        props.setProperty("hbase.rpc.timeout", "1200000");
        props.setProperty("hbase.client.scanner.timeout.period", "1200000");
        try {
            Class.forName("org.apache.phoenix.jdbc.PhoenixDriver");
            // ZooKeeper address: one or more hosts, given as hostnames or IPs
            String url = "jdbc:phoenix:z-001,z-002,z-003:2181:/hbase-unsecure";
            Connection conn = DriverManager.getConnection(url, props);
            System.out.println(conn);
            Statement statement = conn.createStatement();
            String sql = "select count(1) from test";
            long time = System.currentTimeMillis();
            ResultSet rs = statement.executeQuery(sql);
            while (rs.next()) {
                int count = rs.getInt(1);
                System.out.println("row count is " + count);
            }
            long timeUsed = System.currentTimeMillis() - time;
            System.out.println("time " + timeUsed + " ms");
            // Close the resources
            rs.close();
            statement.close();
            conn.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
Phoenix brings standard SQL and the JDBC API to the NoSQL database HBase, together with full ACID transaction support. Queries over small data sets return in milliseconds, and even over tens of millions of rows they still complete in seconds.
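As a quick illustration of the JDBC write path, a minimal sketch of an UPSERT with an explicit commit; only the table name comes from the query example above, its column layout (id, name) is an assumption:

import java.sql.DriverManager

object PhoenixUpsert {
  def main(args: Array[String]): Unit = {
    Class.forName("org.apache.phoenix.jdbc.PhoenixDriver")
    val conn = DriverManager.getConnection("jdbc:phoenix:z-001,z-002,z-003:2181:/hbase-unsecure")
    // assumed schema: CREATE TABLE test (id BIGINT PRIMARY KEY, name VARCHAR)
    val ps = conn.prepareStatement("UPSERT INTO test (id, name) VALUES (?, ?)")
    try {
      ps.setLong(1, 1L)
      ps.setString(2, "phoenix")
      ps.executeUpdate()
      conn.commit() // Phoenix connections do not auto-commit by default
    } finally {
      ps.close()
      conn.close()
    }
  }
}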
Writing data from Spark to Phoenix
// log4j2 is configured with fully asynchronous logging here, which additionally requires the disruptor dependency
import org.apache.hadoop.conf.Configuration
import org.apache.spark.sql.SparkSession

object PhoenixTest {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("phoenix-test").getOrCreate()

    // First way to read:
    // val df = spark.read.format("org.apache.phoenix.spark")
    //   .option("table", "TEST_YUAN")
    //   .option("zkUrl", "10.31.1.123,10.31.1.124,10.31.1.125:2181")
    //   .load()
    // // Column names are case-insensitive, values are case-sensitive
    // val df2 = df.filter("mycolumn like 'Hell%'")
    // df2.show()

    val configuration = new Configuration()
    configuration.set("hbase.zookeeper.quorum", "10.31.1.123,10.31.1.124,10.31.1.125:2181")
    // configuration.set("mapreduce.output.fileoutputformat.outputdir", "/tmp")
    // configuration.set("mapred.output.dir", "E:/hbase")

    // Second way to read
    import org.apache.phoenix.spark._
    val df = spark.sqlContext.phoenixTableAsDataFrame("TEST_YUAN", Array("ROW", "FAMM", "NAME"), conf = configuration)
    df.show()

    // DDL for the target table:
    // create table "TEST_YUAN22"("ROW" VARCHAR PRIMARY KEY,"BASE"."FAMM" VARCHAR,"BASE"."NAME" VARCHAR);

    // First way to write:
    // java.lang.IllegalArgumentException: Can not create a Path from an empty string. Downgrading Spark from 2.2.0 to 2.1.1 resolves it.
    // The error can also be ignored, because the data is written even when it is thrown; the cleanest workaround is to wrap the call in a try.
    // The target table must already exist.
    // df.write
    //   .format("org.apache.phoenix.spark")
    //   .mode("overwrite")
    //   .option("table", "TEST_YUAN22")
    //   .option("zkUrl", "10.31.1.123,10.31.1.124,10.31.1.125:2181")
    //   .save()

    // Second way to write
    df.saveToPhoenix(Map("table" -> "TEST_YUAN22", "zkUrl" -> "10.31.1.123,10.31.1.124,10.31.1.125:2181"))
  }
}
// val df = spark.read
// .format("org.apache.phoenix.spark")
// .option("table", tb1)
//// .option("zkUrl", "localhost:2181")
// .option("zkUrl", "hadoop362,hadoop363,hadoop364:2181")
// .load()
// val df = spark.sqlContext.load(
// "org.apache.phoenix.spark",
// Map("table" -> "TABLE1", "zkUrl" -> "hadoop362,hadoop363,hadoop364:2181")
// )
// df.printSchema()
// df.show(false)
package com.spark.core.phoenix

import org.apache.spark.sql.SparkSession

object ReadFromPhoenix {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local").appName("test").getOrCreate()

    // Load a Phoenix table as a DataFrame through the DataSource API
    val df = spark.read
      .format("org.apache.phoenix.spark")
      .option("table", "testdb.student")
      .option("zkUrl", "localhost:2181")
      .load()
    df.show()

    import org.apache.phoenix.spark._

    // Load a Phoenix table as a DataFrame with phoenixTableAsDataFrame()
    val df2 = spark.sqlContext.phoenixTableAsDataFrame(
      table = "testdb.student",
      columns = Seq("ID", "NAME", "SEX", "PHONE", "EMAIL"),
      predicate = Some("SEX = 1"), // an optional WHERE condition
      zkUrl = Some("localhost:2181"))
    df2.show()

    // Load a Phoenix table as an RDD with phoenixTableAsRDD()
    val rdd = spark.sparkContext.phoenixTableAsRDD(
      table = "testdb.student",
      columns = Seq("ID", "NAME", "SEX", "PHONE", "EMAIL"),
      zkUrl = Some("localhost:2181"))
    rdd.collect().foreach(println)
  }
}
package com.spark.core.phoenix

import org.apache.spark.sql.SparkSession

// Schema of the student table (matching the columns used below)
case class student(ID: String, NAME: String, SEX: Int, PHONE: String, EMAIL: String)

object WriteToPhoenix {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local").appName("test").getOrCreate()

    val rdd = spark.sparkContext.parallelize(Array(
      student("2021120604", "小明", 1, "13783782988", "123456789@email.com"),
      student("2021120605", "小红", 0, "13786564688", "888@email.com")))

    import spark.implicits._
    val df = rdd.toDF("ID", "NAME", "SEX", "PHONE", "EMAIL")

    // Save the DataFrame to a Phoenix table through the DataSource API
    df.write
      .format("org.apache.phoenix.spark")
      .mode("overwrite") // only the overwrite save mode is supported
      .option("table", "testdb.student")
      .option("zkUrl", "localhost:2181")
      .save()

    import org.apache.phoenix.spark._

    // Save the DataFrame to a Phoenix table with saveToPhoenix()
    df.saveToPhoenix(Map("table" -> "testdb.student", "zkUrl" -> "localhost:2181"))

    // Save the RDD to a Phoenix table with saveToPhoenix()
    rdd.saveToPhoenix(
      tableName = "testdb.student",
      cols = Seq("ID", "NAME", "SEX", "PHONE", "EMAIL"),
      zkUrl = Some("localhost:2181"))
  }
}
// Fragment, e.g. for spark-shell, where `spark` (a SparkSession) is already in scope
import org.apache.hadoop.conf.Configuration
import org.apache.phoenix.spark._

val tb1 = "\"dws\".\"dws_energy_platforms\""
val configuration = new Configuration()
configuration.set("hbase.zookeeper.quorum", "hadoop362,hadoop363,hadoop364:2181")

val df2 = spark.sqlContext.phoenixTableAsDataFrame(tb1, Array("ROW", "hkey"), conf = configuration)
df2.printSchema()
df2.show()
POM dependencies 2
<dependency>
    <groupId>org.apache.phoenix</groupId>
    <artifactId>phoenix-core</artifactId>
    <version>${phoenix.version}</version>
    <scope>provided</scope>
</dependency>
<dependency>
    <groupId>org.apache.phoenix</groupId>
    <artifactId>phoenix-spark</artifactId>
    <version>${phoenix.version}</version>
    <scope>provided</scope>
</dependency>
References
http://xueai8.com/course/228/article
https://blog.51cto.com/u_16175453/6660495
https://blog.csdn.net/qq_31557939/article/details/126053490
https://blog.csdn.net/qq_16094777/article/details/70738694
http://www.taodudu.cc/news/show-6534968.html?action=onClick
https://www.jianshu.com/p/f336f7e5f31b
https://blog.csdn.net/penker_zhao/article/details/108361732
https://blog.csdn.net/weixin_45034316/article/details/121603081
https://blog.csdn.net/tototuzuoquan/article/details/88655354
In the first step, the Hive table is a managed table; if that Hive table is dropped, the corresponding HBase table is deleted as well.
https://www.jianshu.com/p/721379437b82
When a row is inserted into the HBase main table, a copy is inserted into the corresponding index table at the same time.
https://blog.csdn.net/qq_35207086/article/details/121141086
https://blog.csdn.net/weixin_42814075/article/details/95494086
https://blog.csdn.net/liu137612432/article/details/108716843
https://blog.csdn.net/u012551524/article/details/79568675
https://blog.csdn.net/yuanhaiwn/article/details/81913017
https://blog.csdn.net/u012551524/article/details/79575632
https://www.cnblogs.com/dinghong-jo/p/7873646.html
