Clickhouse 物化视图与投影

参考文章

增量物化视图

数据聚合

示例1

表结构:

CREATE TABLE votes
(
    `Id` UInt32,
    `PostId` Int32,
    `VoteTypeId` UInt8,
    `CreationDate` DateTime64(3, 'UTC'),
    `UserId` Int32,
    `BountyAmount` UInt8
)
ENGINE = MergeTree
ORDER BY (VoteTypeId, CreationDate, PostId)

INSERT INTO votes SELECT * FROM s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/stackoverflow/parquet/votes/*.parquet')

0 rows in set. Elapsed: 29.359 sec. Processed 238.98 million rows, 2.13 GB (8.14 million rows/s., 72.45 MB/s.)

原始查询:

SELECT toStartOfDay(CreationDate) AS day,
       countIf(VoteTypeId = 2) AS UpVotes,
       countIf(VoteTypeId = 3) AS DownVotes
FROM votes
GROUP BY day
ORDER BY day ASC
LIMIT 10

物化视图

-- 使用了 SummingMergeTree
CREATE TABLE up_down_votes_per_day
(
  `Day` Date,
  `UpVotes` UInt32,
  `DownVotes` UInt32
)
ENGINE = SummingMergeTree
-- 重要提示
ORDER BY Day

CREATE MATERIALIZED VIEW up_down_votes_per_day_mv TO up_down_votes_per_day AS
SELECT toStartOfDay(CreationDate)::Date AS Day,
       countIf(VoteTypeId = 2) AS UpVotes,
       countIf(VoteTypeId = 3) AS DownVotes
FROM votes
-- 重要提示
GROUP BY Day

重要提示: ORDER BY = GROUP BY
在大多数情况下,物化视图转换的 GROUP BY 子句中使用的列,应该与目标表中使用的 ORDER BY 子句保持一致,如果使用 SummingMergeTree 或 AggregatingMergeTree 表引擎。这些引擎依赖于 ORDER BY 列在后台合并操作中合并具有相同值的行。GROUP BY 和 ORDER BY 列之间的不对齐可能导致查询性能低效、合并不理想,甚至数据不一致。

再次查询:

SELECT
        Day,
        UpVotes,
        DownVotes
FROM up_down_votes_per_day
FINAL
ORDER BY Day ASC
LIMIT 10

示例2

原始查询:

SELECT
        toStartOfDay(CreationDate) AS Day,
        quantile(0.999)(Score) AS Score_99th,
        avg(CommentCount) AS AvgCommentCount
FROM posts
GROUP BY Day
ORDER BY Day DESC
LIMIT 10

物化视图:

CREATE TABLE posts_null AS posts ENGINE = Null

-- 目标表:使用了 AggregatingMergeTree 与 AggregateFunction
CREATE TABLE post_stats_per_day
(
  `Day` Date,
  `Score_quantiles` AggregateFunction(quantile(0.999), Int32),
  `AvgCommentCount` AggregateFunction(avg, UInt8)
)
ENGINE = AggregatingMergeTree
ORDER BY Day

-- 物化视图
-- 在聚合函数的末尾附加了后缀 State。这确保了返回函数的聚合状态,而不是最终结果。它将包含额外信息,以允许此部分状态与其他状态合并。
-- 例如,在平均情况下,这将包括列的计数和总和。
CREATE MATERIALIZED VIEW post_stats_mv TO post_stats_per_day AS
       SELECT toStartOfDay(CreationDate) AS Day,
       quantileState(0.999)(Score) AS Score_quantiles,
       avgState(CommentCount) AS AvgCommentCount
FROM posts_null
GROUP BY Day

INSERT INTO posts_null SELECT * FROM posts

查询:

-- 在聚合函数的末尾附加了后缀 Merge(因为列存储的是部分聚合状态)。
-- 请注意,我们在这里使用 GROUP BY 而不使用 FINAL。
SELECT
        Day,
        quantileMerge(0.999)(Score_quantiles),
        avgMerge(AvgCommentCount)
FROM post_stats_per_day
GROUP BY Day
ORDER BY Day DESC
LIMIT 10

过滤与转换

示例:

CREATE MATERIALIZED VIEW posts_mv TO posts AS
        SELECT * EXCEPT Tags, arrayFilter(t -> (t != ''), splitByChar('|', Tags)) as Tags FROM posts_null

注:通过该物化视图,数据插入 posts_null 表后,会自动执行该 ETL 逻辑,将转换后的数据保存到 posts 表中。

辅助索引表

通过额外的物化视图表构建一个“辅助索引表”,然后用子查询来缩小主表扫描范围。

表结构:

CREATE TABLE comments
(
    `Id` UInt32,
    `PostId` UInt32,
    `Score` UInt16,
    `Text` String,
    `CreationDate` DateTime64(3, 'UTC'),
    `UserId` Int32,
    `UserDisplayName` LowCardinality(String)
)
ENGINE = MergeTree
ORDER BY PostId

原始查询:

-- 由于 UserId 非表 comments 的主索引,所以按 UserId 查询效果不高
SELECT avg(Score)
FROM comments
WHERE UserId = 8592047

物化视图:

-- 建立一个辅助索引表,使用 UserId 作为主索引
CREATE TABLE comments_posts_users (
  PostId UInt32,
  UserId Int32
) ENGINE = MergeTree ORDER BY UserId

CREATE TABLE comments_null AS comments
ENGINE = Null

CREATE MATERIALIZED VIEW comments_posts_users_mv TO comments_posts_users AS
SELECT PostId, UserId FROM comments_null

INSERT INTO comments_null SELECT * FROM comments

查询:

-- 使用子查询加速查询
SELECT avg(Score)
FROM comments
WHERE PostId IN (
        SELECT PostId
        FROM comments_posts_users
        WHERE UserId = 8592047
) AND UserId = 8592047

联表 Join

增量物化视图完全支持 JOIN 操作。不过有一些限制:

  • 增量物化视图仅在源表(查询中的最左侧表)插入时触发,任何对右侧表的更新将不会回溯更新物化视图。
  • 物化视图执行时,只会使用新插入的数据替换源表。

示例3

表结构:

CREATE TABLE badges
(
    Id       UInt32,
    UserId   Int32,
    Name     LowCardinality(String),
    Date     DateTime64(3, 'UTC'),
    Class    Enum8('Gold' = 1, 'Silver' = 2, 'Bronze' = 3),
    TagBased Bool
)
    ENGINE = MergeTree
        ORDER BY UserId

CREATE TABLE users
(
    Id             Int32,
    Reputation     UInt32,
    CreationDate   DateTime64(3, 'UTC'),
    DisplayName    LowCardinality(String),
    LastAccessDate DateTime64(3, 'UTC'),
    Location       LowCardinality(String),
    Views          UInt32,
    UpVotes        UInt32,
    DownVotes      UInt32
)
    ENGINE = MergeTree
        ORDER BY Id;

方案1:

CREATE TABLE daily_badges_by_user
(
    Day         Date,
    UserId      Int32,
    DisplayName LowCardinality(String),
    Gold        UInt32,
    Silver      UInt32,
    Bronze      UInt32
)
    ENGINE = SummingMergeTree
        ORDER BY (DisplayName, UserId, Day);

CREATE MATERIALIZED VIEW daily_badges_by_user_mv TO daily_badges_by_user AS
SELECT toDate(Date)              AS Day,
       b.UserId,
       u.DisplayName,
       countIf(Class = 'Gold')   AS Gold,
       countIf(Class = 'Silver') AS Silver,
       countIf(Class = 'Bronze') AS Bronze
FROM badges AS b
         LEFT JOIN users AS u ON b.UserId = u.Id
GROUP BY Day, b.UserId, u.DisplayName;

说明:

  • Join 整表影响写入性能。
    users 是一个可能非常大的维度表,每次 badges 有新数据写入时,物化视图需要实时做 join。ClickHouse 的物化视图是 同步触发的,意味着写入 badges 的同时必须完成 join → 这会让写入性能大幅下降。

方案2:

CREATE MATERIALIZED VIEW daily_badges_by_user_mv TO daily_badges_by_user
AS
SELECT toDate(Date)              AS Day,
       b.UserId,
       u.DisplayName,
       countIf(Class = 'Gold')   AS Gold,
       countIf(Class = 'Silver') AS Silver,
       countIf(Class = 'Bronze') AS Bronze
FROM badges AS b
         LEFT JOIN
     (
         SELECT Id,
                DisplayName
         FROM users
         WHERE Id IN (SELECT UserId
                      FROM badges)
         ) AS u ON b.UserId = u.Id
GROUP BY Day,
         b.UserId,
         u.DisplayName;

说明:

  • 只 Join 部分表数据
    子查询 WHERE Id IN (SELECT UserId FROM badges) 只取出 badges 里出现过的 UserId,大大减少了需要参与 join 的 users 记录数。相当于提前做了 用户维度裁剪,降低了 join 的开销。

物化视图与联合

示例4

表结构:

CREATE TABLE stackoverflow.comments
(
    `Id` UInt32,
    `PostId` UInt32,
    `Score` UInt16,
    `Text` String,
    `CreationDate` DateTime64(3, 'UTC'),
    `UserId` Int32,
    `UserDisplayName` LowCardinality(String)
)
ENGINE = MergeTree
ORDER BY CreationDate

CREATE TABLE stackoverflow.badges
(
    `Id` UInt32,
    `UserId` Int32,
    `Name` LowCardinality(String),
    `Date` DateTime64(3, 'UTC'),
    `Class` Enum8('Gold' = 1, 'Silver' = 2, 'Bronze' = 3),
    `TagBased` Bool
)
ENGINE = MergeTree
ORDER BY UserId

原始查询:

-- 显示每个用户的最后活动
SELECT
 UserId,
 argMax(description, event_time) AS last_description,
 argMax(activity_type, event_time) AS activity_type,
    max(event_time) AS last_activity
FROM
(
    SELECT
 UserId,
 CreationDate AS event_time,
        Text AS description,
        'comment' AS activity_type
    FROM stackoverflow.comments
    UNION ALL
    SELECT
 UserId,
        Date AS event_time,
        Name AS description,
        'badge' AS activity_type
    FROM stackoverflow.badges
)
GROUP BY UserId
ORDER BY last_activity DESC
LIMIT 10

注:该视图只会触发对 comments 表的插入

物化视图:

-- 目标表:使用了 AggregatingMergeTree 与 AggregateFunction
CREATE TABLE user_activity
(
    `UserId` String,
    `last_description` AggregateFunction(argMax, String, DateTime64(3, 'UTC')),
    `activity_type` AggregateFunction(argMax, String, DateTime64(3, 'UTC')),
    `last_activity` SimpleAggregateFunction(max, DateTime64(3, 'UTC'))
)
ENGINE = AggregatingMergeTree
ORDER BY UserId

-- 物化视图1(数据插入 comments 时触发)
CREATE MATERIALIZED VIEW comment_activity_mv TO user_activity AS
SELECT
 UserId,
 argMaxState(Text, CreationDate) AS last_description,
 argMaxState('comment', CreationDate) AS activity_type,
 max(CreationDate) AS last_activity
FROM stackoverflow.comments
GROUP BY UserId;

-- 物化视图2(数据插入 badges 时触发)
CREATE MATERIALIZED VIEW badges_activity_mv TO user_activity AS
SELECT
 UserId,
 argMaxState(Name, Date) AS last_description,
 argMaxState('badge', Date) AS activity_type,
 max(Date) AS last_activity
FROM stackoverflow.badges
GROUP BY UserId;

说明:

  • 当物化视图需要对两张表的插入都触发执行的时候,可以创建两个物化视图并指向同一张目标表。

可刷新物化视图

使用 APPEND 添加新行

普通的可刷新视图用于保存最新的记录。APPEND 可以保留刷新时刻的记录。

投影

加速筛选条件为非主键列的查询

示例 5

查询:

-- 由于我们要对 tip_amount 进行过滤,而 ORDER BY 中并不包含 tip_amount ,因此 ClickHouse 必须进行全表扫描。
SELECT
  tip_amount,
  trip_id,
  dateDiff('minutes', pickup_datetime, dropoff_datetime) AS trip_duration_min
FROM nyc_taxi.trips 
WHERE tip_amount > 200 AND trip_duration_min > 0
ORDER BY tip_amount, trip_id ASC

投影:

-- 创建投影,添加新的主键列,再次查询便很快了。
ALTER TABLE nyc_taxi.trips
ADD PROJECTION prj_tip_amount
(
    SELECT *
    ORDER BY tip_amount, dateDiff('minutes', pickup_datetime, dropoff_datetime)
)

ALTER TABLE nyc.trips MATERIALIZE PROJECTION prj_tip_amount

加速分组的查询

示例 6

查询:

SELECT
    county,
    avg(price)
FROM uk.uk_price_paid
GROUP BY county
ORDER BY avg(price) DESC
LIMIT 3

投影:

ALTER TABLE uk.uk_price_paid_with_projections
  (ADD PROJECTION prj_gby_county
  (
    SELECT
        county,
        avg(price)
    GROUP BY county
  ));

ALTER TABLE uk.uk_price_paid_with_projections
  (MATERIALIZE PROJECTION prj_gby_county)
SETTINGS mutations_sync = 1;

如果投影中使用了 GROUP BY 子句,如 prj_gby_county 中的子句 上述投影,则(隐藏)表的底层存储引擎 变成 AggregatingMergeTree 并将所有聚合函数转换为 聚合函数 。这可确保正确的增量数据聚合。

posted on 2025-08-21 17:22  Lemo_wd  阅读(21)  评论(0)    收藏  举报

导航