用starrocks实现冷热数据分层

Starrocks 冷热分区验证

  1. 基于分区实现冷热

    1. 建表
    CREATE DATABASE IF NOT EXISTS quickstart;
    
    USE quickstart;
    
    DROP TABLE table_range;
    CREATE TABLE table_range
    (
        k1 DATE,
        k2 INT,
        k3 SMALLINT,
        v1 VARCHAR(2048),
        v2 DATETIME DEFAULT "2014-02-04 15:36:00"
    )
    ENGINE=olap
    DUPLICATE KEY(k1, k2, k3)
    PARTITION BY RANGE (k1)
    (
        PARTITION p1 VALUES LESS THAN ("2014-01-01"),
        PARTITION p2 VALUES LESS THAN ("2014-06-01"),
        PARTITION p3 VALUES LESS THAN ("2014-12-01")
    )
    DISTRIBUTED BY HASH(k2)
    PROPERTIES(
        "storage_medium" = "SSD", 
        "storage_cooldown_time" = "2025-08-19 00:00:00"
    );
    
    1. 生成分区数据
    ## 生成p1分区数据 (k1 < '2014-01-01')
    INSERT INTO table_range (k1, k2, k3, v1)
    SELECT 
        DATE_ADD('2013-01-01', INTERVAL FLOOR(RAND() * 365) DAY) AS k1,
        FLOOR(RAND() * 10000) AS k2,
        FLOOR(RAND() * 100) AS k3,
        CONCAT('p1_data_', FLOOR(RAND() * 100000)) AS v1
    FROM 
        (SELECT 1 UNION ALL SELECT 2 UNION ALL SELECT 3 UNION ALL SELECT 4 UNION ALL SELECT 5) t1,
        (SELECT 1 UNION ALL SELECT 2 UNION ALL SELECT 3 UNION ALL SELECT 4 UNION ALL SELECT 5) t2,
        (SELECT 1 UNION ALL SELECT 2 UNION ALL SELECT 3 UNION ALL SELECT 4 UNION ALL SELECT 5) t3;
    
    ## 生成p2分区数据 ('2014-01-01' <= k1 < '2014-06-01')
    INSERT INTO table_range (k1, k2, k3, v1)
    SELECT 
        DATE_ADD('2014-01-01', INTERVAL FLOOR(RAND() * 150) DAY) AS k1,
        FLOOR(RAND() * 10000) AS k2,
        FLOOR(RAND() * 100) AS k3,
        CONCAT('p2_data_', FLOOR(RAND() * 100000)) AS v1
    FROM 
        (SELECT 1 UNION ALL SELECT 2 UNION ALL SELECT 3 UNION ALL SELECT 4 UNION ALL SELECT 5) t1,
        (SELECT 1 UNION ALL SELECT 2 UNION ALL SELECT 3 UNION ALL SELECT 4 UNION ALL SELECT 5) t2,
        (SELECT 1 UNION ALL SELECT 2 UNION ALL SELECT 3 UNION ALL SELECT 4 UNION ALL SELECT 5) t3;
    
    ## 生成p3分区数据 ('2014-06-01' <= k1 < '2014-12-01')
    INSERT INTO table_range (k1, k2, k3, v1)
    SELECT 
        DATE_ADD('2014-06-01', INTERVAL FLOOR(RAND() * 180) DAY) AS k1,
        FLOOR(RAND() * 10000) AS k2,
        FLOOR(RAND() * 100) AS k3,
        CONCAT('p3_data_', FLOOR(RAND() * 100000)) AS v1
    FROM 
        (SELECT 1 UNION ALL SELECT 2 UNION ALL SELECT 3 UNION ALL SELECT 4 UNION ALL SELECT 5) t1,
        (SELECT 1 UNION ALL SELECT 2 UNION ALL SELECT 3 UNION ALL SELECT 4 UNION ALL SELECT 5) t2,
        (SELECT 1 UNION ALL SELECT 2 UNION ALL SELECT 3 UNION ALL SELECT 4 UNION ALL SELECT 5) t3;
    
    1. 查看分区冷热结果
    SELECT * FROM information_schema.tables t where TABLE_NAME = 'table_range'  ;
    SELECT * FROM information_schema.tables_config tc  where TABLE_NAME = 'table_range'  ;
    -- {"compression":"LZ4","fast_schema_evolution":"true","replicated_storage":"true","replication_num":"1","storage_medium":"SSD"}
    SELECT * FROM information_schema.`columns` c where TABLE_NAME = 'table_range'  ;
    -- partition info
    SELECT * FROM information_schema.partitions p;
    SELECT * FROM information_schema.partitions_meta pm where TABLE_NAME = 'table_range' limit 100 ;
    -- tablets info
    SELECT PARTITION_ID , TABLET_ID , NUM_ROW , DATA_SIZE , CREATE_TIME , DATA_DIR , SHARD_ID , MEDIUM_TYPE  FROM information_schema.be_tablets where TABLE_ID = 10006 ;
    SELECT * FROM information_schema.fe_tablet_schedules where TABLE_ID = 10006;
    

    注意:CREATE TABLE | StarRocks

    1. 冷热策略前提是存储支持多种存储介质,按时间进行分区

    2. 降冷是分区粒度,而不是分桶粒度

    3. 冷数据转热数据可以通过修改分区存储来改变

    固定时间触发storage_cooldown_time:

    {"compression":"LZ4","fast_schema_evolution":"true","replicated_storage":"true","replication_num":"1","storage_medium":"SSD"}
    

    触发前:

    触发后:

    冷数据修改分区配置转热数据:

    动态触发

    {"compression":"LZ4","fast_schema_evolution":"true","replicated_storage":"true","replication_num":"1","storage_cooldown_ttl":"1 hours","storage_medium":"SSD"}
    

    触发前:

    触发后:

  2. 基于物化视图实现冷热

使用物化视图加速数据湖查询 | StarRocks

CREATE MATERIALIZED VIEW mv_4_1 
DISTRIBUTED BY HASH(lo_orderdate)
PARTITION BY LO_ORDERDATE
REFRESH ASYNC EVERY(INTERVAL 1 DAY) 
AS 
SELECT lo_orderkey, lo_orderdate, lo_revenue
FROM hive.ssb_1g_csv.lineorder
WHERE lo_orderdate<=current_date()
AND lo_orderdate>=date_add(current_date(), INTERVAL -4 DAY);

通过 “预计算 + 定期刷新”,自动同步原表 hive.ssb_1g_csv.lineorder 中满足条件的数据(最近 4 天的订单),并以物化视图的形式存储。可以看作是一种 “跨系统的冷热数据分层管理”,通过结合外部存储(Hive)和 StarRocks 自身存储实现的 “逻辑 + 物理” 混合分层。

不是原生的冷热分离,但这种方式达到了与冷热分离相同的业务目标:

  • 对高频访问的热数据(如最近 30 天),通过 StarRocks 物化视图(SSD 存储)提供毫秒级查询性能;

  • 对低频访问的冷数据(如 30 天前),保留在 Hive(HDD 存储),降低总体存储成本;

  • 用户查询时,可通过 StarRocks 自动路由(若开启查询重写),热数据查物化视图,冷数据查 Hive 外表,无需感知底层存储差异。

    • 查询改写配置:enable_query_rewrite、query_rewrite_consistency、force_external_table_query_rewrite、transparent_mv_rewrite_mode配置参数

    • 查询改写可通过mv_rewrite_staleness_second 配置控制一定程度上的数据不一致

  1. 存算分离的冷热

CREATE STORAGE VOLUME | StarRocks

CREATE TABLE | StarRocks

基于minio创建存算分离(参考:基于 MinIO 部署 | StarRocks),此时存算分离只是缓存数据和存储分离,和物化视图类似间接实现了冷热分离(参考:存算分离 | StarRocks

实践sql:

DROP STORAGE VOLUME IF EXISTS s3_ssc;
CREATE STORAGE VOLUME IF NOT EXISTS s3_ssc
TYPE = S3
LOCATIONS = ('s3://starrocks/data/')
PROPERTIES
("enabled" = "true",
"aws.s3.region" = "cn-north-1",
"aws.s3.endpoint" = "http://10.0.70.134:31807",
"aws.s3.access_key" = "lPwTcNeCBq2Jx1HklM5i",
"aws.s3.secret_key" = "P3wYLtiQKqCsIjjNAzJrHOysrqXq1ZTatAiJXUIM"
)

SET s3_ssc AS DEFAULT STORAGE VOLUME;

SELECT * FROM information_schema.partitions_meta pm where TABLE_NAME = 'table_range2' limit 100 ;

分区效果

posted @ 2025-08-20 16:19  WonderfulDom  阅读(2)  评论(0)    收藏  举报