Clickhouse 物化视图与投影
增量物化视图
数据聚合
示例1
表结构:
CREATE TABLE votes
(
`Id` UInt32,
`PostId` Int32,
`VoteTypeId` UInt8,
`CreationDate` DateTime64(3, 'UTC'),
`UserId` Int32,
`BountyAmount` UInt8
)
ENGINE = MergeTree
ORDER BY (VoteTypeId, CreationDate, PostId)
INSERT INTO votes SELECT * FROM s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/stackoverflow/parquet/votes/*.parquet')
0 rows in set. Elapsed: 29.359 sec. Processed 238.98 million rows, 2.13 GB (8.14 million rows/s., 72.45 MB/s.)
原始查询:
SELECT toStartOfDay(CreationDate) AS day,
countIf(VoteTypeId = 2) AS UpVotes,
countIf(VoteTypeId = 3) AS DownVotes
FROM votes
GROUP BY day
ORDER BY day ASC
LIMIT 10
物化视图
-- 使用了 SummingMergeTree
CREATE TABLE up_down_votes_per_day
(
`Day` Date,
`UpVotes` UInt32,
`DownVotes` UInt32
)
ENGINE = SummingMergeTree
-- 重要提示
ORDER BY Day
CREATE MATERIALIZED VIEW up_down_votes_per_day_mv TO up_down_votes_per_day AS
SELECT toStartOfDay(CreationDate)::Date AS Day,
countIf(VoteTypeId = 2) AS UpVotes,
countIf(VoteTypeId = 3) AS DownVotes
FROM votes
-- 重要提示
GROUP BY Day
重要提示: ORDER BY = GROUP BY
在大多数情况下,物化视图转换的 GROUP BY 子句中使用的列,应该与目标表中使用的 ORDER BY 子句保持一致,如果使用 SummingMergeTree 或 AggregatingMergeTree 表引擎。这些引擎依赖于 ORDER BY 列在后台合并操作中合并具有相同值的行。GROUP BY 和 ORDER BY 列之间的不对齐可能导致查询性能低效、合并不理想,甚至数据不一致。
再次查询:
SELECT
Day,
UpVotes,
DownVotes
FROM up_down_votes_per_day
FINAL
ORDER BY Day ASC
LIMIT 10
示例2
原始查询:
SELECT
toStartOfDay(CreationDate) AS Day,
quantile(0.999)(Score) AS Score_99th,
avg(CommentCount) AS AvgCommentCount
FROM posts
GROUP BY Day
ORDER BY Day DESC
LIMIT 10
物化视图:
CREATE TABLE posts_null AS posts ENGINE = Null
-- 目标表:使用了 AggregatingMergeTree 与 AggregateFunction
CREATE TABLE post_stats_per_day
(
`Day` Date,
`Score_quantiles` AggregateFunction(quantile(0.999), Int32),
`AvgCommentCount` AggregateFunction(avg, UInt8)
)
ENGINE = AggregatingMergeTree
ORDER BY Day
-- 物化视图
-- 在聚合函数的末尾附加了后缀 State。这确保了返回函数的聚合状态,而不是最终结果。它将包含额外信息,以允许此部分状态与其他状态合并。
-- 例如,在平均情况下,这将包括列的计数和总和。
CREATE MATERIALIZED VIEW post_stats_mv TO post_stats_per_day AS
SELECT toStartOfDay(CreationDate) AS Day,
quantileState(0.999)(Score) AS Score_quantiles,
avgState(CommentCount) AS AvgCommentCount
FROM posts_null
GROUP BY Day
INSERT INTO posts_null SELECT * FROM posts
查询:
-- 在聚合函数的末尾附加了后缀 Merge(因为列存储的是部分聚合状态)。
-- 请注意,我们在这里使用 GROUP BY 而不使用 FINAL。
SELECT
Day,
quantileMerge(0.999)(Score_quantiles),
avgMerge(AvgCommentCount)
FROM post_stats_per_day
GROUP BY Day
ORDER BY Day DESC
LIMIT 10
过滤与转换
示例:
CREATE MATERIALIZED VIEW posts_mv TO posts AS
SELECT * EXCEPT Tags, arrayFilter(t -> (t != ''), splitByChar('|', Tags)) as Tags FROM posts_null
注:通过该物化视图,数据插入 posts_null 表后,会自动执行该 ETL 逻辑,将转换后的数据保存到 posts 表中。
辅助索引表
通过额外的物化视图表构建一个“辅助索引表”,然后用子查询来缩小主表扫描范围。
表结构:
CREATE TABLE comments
(
`Id` UInt32,
`PostId` UInt32,
`Score` UInt16,
`Text` String,
`CreationDate` DateTime64(3, 'UTC'),
`UserId` Int32,
`UserDisplayName` LowCardinality(String)
)
ENGINE = MergeTree
ORDER BY PostId
原始查询:
-- 由于 UserId 非表 comments 的主索引,所以按 UserId 查询效果不高
SELECT avg(Score)
FROM comments
WHERE UserId = 8592047
物化视图:
-- 建立一个辅助索引表,使用 UserId 作为主索引
CREATE TABLE comments_posts_users (
PostId UInt32,
UserId Int32
) ENGINE = MergeTree ORDER BY UserId
CREATE TABLE comments_null AS comments
ENGINE = Null
CREATE MATERIALIZED VIEW comments_posts_users_mv TO comments_posts_users AS
SELECT PostId, UserId FROM comments_null
INSERT INTO comments_null SELECT * FROM comments
查询:
-- 使用子查询加速查询
SELECT avg(Score)
FROM comments
WHERE PostId IN (
SELECT PostId
FROM comments_posts_users
WHERE UserId = 8592047
) AND UserId = 8592047
联表 Join
增量物化视图完全支持 JOIN 操作。不过有一些限制:
- 增量物化视图仅在源表(查询中的最左侧表)插入时触发,任何对右侧表的更新将不会回溯更新物化视图。
- 物化视图执行时,只会使用新插入的数据替换源表。
示例3
表结构:
CREATE TABLE badges
(
Id UInt32,
UserId Int32,
Name LowCardinality(String),
Date DateTime64(3, 'UTC'),
Class Enum8('Gold' = 1, 'Silver' = 2, 'Bronze' = 3),
TagBased Bool
)
ENGINE = MergeTree
ORDER BY UserId
CREATE TABLE users
(
Id Int32,
Reputation UInt32,
CreationDate DateTime64(3, 'UTC'),
DisplayName LowCardinality(String),
LastAccessDate DateTime64(3, 'UTC'),
Location LowCardinality(String),
Views UInt32,
UpVotes UInt32,
DownVotes UInt32
)
ENGINE = MergeTree
ORDER BY Id;
方案1:
CREATE TABLE daily_badges_by_user
(
Day Date,
UserId Int32,
DisplayName LowCardinality(String),
Gold UInt32,
Silver UInt32,
Bronze UInt32
)
ENGINE = SummingMergeTree
ORDER BY (DisplayName, UserId, Day);
CREATE MATERIALIZED VIEW daily_badges_by_user_mv TO daily_badges_by_user AS
SELECT toDate(Date) AS Day,
b.UserId,
u.DisplayName,
countIf(Class = 'Gold') AS Gold,
countIf(Class = 'Silver') AS Silver,
countIf(Class = 'Bronze') AS Bronze
FROM badges AS b
LEFT JOIN users AS u ON b.UserId = u.Id
GROUP BY Day, b.UserId, u.DisplayName;
说明:
- Join 整表影响写入性能。
users 是一个可能非常大的维度表,每次 badges 有新数据写入时,物化视图需要实时做 join。ClickHouse 的物化视图是 同步触发的,意味着写入 badges 的同时必须完成 join → 这会让写入性能大幅下降。
方案2:
CREATE MATERIALIZED VIEW daily_badges_by_user_mv TO daily_badges_by_user
AS
SELECT toDate(Date) AS Day,
b.UserId,
u.DisplayName,
countIf(Class = 'Gold') AS Gold,
countIf(Class = 'Silver') AS Silver,
countIf(Class = 'Bronze') AS Bronze
FROM badges AS b
LEFT JOIN
(
SELECT Id,
DisplayName
FROM users
WHERE Id IN (SELECT UserId
FROM badges)
) AS u ON b.UserId = u.Id
GROUP BY Day,
b.UserId,
u.DisplayName;
说明:
- 只 Join 部分表数据
子查询 WHERE Id IN (SELECT UserId FROM badges) 只取出 badges 里出现过的 UserId,大大减少了需要参与 join 的 users 记录数。相当于提前做了 用户维度裁剪,降低了 join 的开销。
物化视图与联合
示例4
表结构:
CREATE TABLE stackoverflow.comments
(
`Id` UInt32,
`PostId` UInt32,
`Score` UInt16,
`Text` String,
`CreationDate` DateTime64(3, 'UTC'),
`UserId` Int32,
`UserDisplayName` LowCardinality(String)
)
ENGINE = MergeTree
ORDER BY CreationDate
CREATE TABLE stackoverflow.badges
(
`Id` UInt32,
`UserId` Int32,
`Name` LowCardinality(String),
`Date` DateTime64(3, 'UTC'),
`Class` Enum8('Gold' = 1, 'Silver' = 2, 'Bronze' = 3),
`TagBased` Bool
)
ENGINE = MergeTree
ORDER BY UserId
原始查询:
-- 显示每个用户的最后活动
SELECT
UserId,
argMax(description, event_time) AS last_description,
argMax(activity_type, event_time) AS activity_type,
max(event_time) AS last_activity
FROM
(
SELECT
UserId,
CreationDate AS event_time,
Text AS description,
'comment' AS activity_type
FROM stackoverflow.comments
UNION ALL
SELECT
UserId,
Date AS event_time,
Name AS description,
'badge' AS activity_type
FROM stackoverflow.badges
)
GROUP BY UserId
ORDER BY last_activity DESC
LIMIT 10
注:该视图只会触发对 comments 表的插入
物化视图:
-- 目标表:使用了 AggregatingMergeTree 与 AggregateFunction
CREATE TABLE user_activity
(
`UserId` String,
`last_description` AggregateFunction(argMax, String, DateTime64(3, 'UTC')),
`activity_type` AggregateFunction(argMax, String, DateTime64(3, 'UTC')),
`last_activity` SimpleAggregateFunction(max, DateTime64(3, 'UTC'))
)
ENGINE = AggregatingMergeTree
ORDER BY UserId
-- 物化视图1(数据插入 comments 时触发)
CREATE MATERIALIZED VIEW comment_activity_mv TO user_activity AS
SELECT
UserId,
argMaxState(Text, CreationDate) AS last_description,
argMaxState('comment', CreationDate) AS activity_type,
max(CreationDate) AS last_activity
FROM stackoverflow.comments
GROUP BY UserId;
-- 物化视图2(数据插入 badges 时触发)
CREATE MATERIALIZED VIEW badges_activity_mv TO user_activity AS
SELECT
UserId,
argMaxState(Name, Date) AS last_description,
argMaxState('badge', Date) AS activity_type,
max(Date) AS last_activity
FROM stackoverflow.badges
GROUP BY UserId;
说明:
- 当物化视图需要对两张表的插入都触发执行的时候,可以创建两个物化视图并指向同一张目标表。
可刷新物化视图
使用 APPEND 添加新行
普通的可刷新视图用于保存最新的记录。APPEND 可以保留刷新时刻的记录。
投影
加速筛选条件为非主键列的查询
示例 5
查询:
-- 由于我们要对 tip_amount 进行过滤,而 ORDER BY 中并不包含 tip_amount ,因此 ClickHouse 必须进行全表扫描。
SELECT
tip_amount,
trip_id,
dateDiff('minutes', pickup_datetime, dropoff_datetime) AS trip_duration_min
FROM nyc_taxi.trips
WHERE tip_amount > 200 AND trip_duration_min > 0
ORDER BY tip_amount, trip_id ASC
投影:
-- 创建投影,添加新的主键列,再次查询便很快了。
ALTER TABLE nyc_taxi.trips
ADD PROJECTION prj_tip_amount
(
SELECT *
ORDER BY tip_amount, dateDiff('minutes', pickup_datetime, dropoff_datetime)
)
ALTER TABLE nyc.trips MATERIALIZE PROJECTION prj_tip_amount
加速分组的查询
示例 6
查询:
SELECT
county,
avg(price)
FROM uk.uk_price_paid
GROUP BY county
ORDER BY avg(price) DESC
LIMIT 3
投影:
ALTER TABLE uk.uk_price_paid_with_projections
(ADD PROJECTION prj_gby_county
(
SELECT
county,
avg(price)
GROUP BY county
));
ALTER TABLE uk.uk_price_paid_with_projections
(MATERIALIZE PROJECTION prj_gby_county)
SETTINGS mutations_sync = 1;
如果投影中使用了 GROUP BY 子句,如 prj_gby_county 中的子句 上述投影,则(隐藏)表的底层存储引擎 变成 AggregatingMergeTree 并将所有聚合函数转换为 聚合函数 。这可确保正确的增量数据聚合。