ClickHouse 案例学习 —— 使用 ClickHouse 探索 GitHub 统计信息
来源 使用 ClickHouse 探索 GitHub 统计信息
统计数量与大小
SELECT formatReadableQuantity(sum(rows)) AS rows,
formatReadableSize(sum(data_compressed_bytes)) AS compressedBytes,
formatReadableSize(sum(data_uncompressed_bytes)) AS uncompressedBytes
FROM system.parts
WHERE database = 'github' AND table = 'events';
自动分组
-- ClickHouse 会自动识别 SELECT 子句中的非聚合列(在这个例子中是 event_type),然后对这些列进行分组
SELECT event_type, count() AS count, formatReadableQuantity(count)
FROM github.events
GROUP BY ALL
字符串分割函数
-- 按字符串分割
select splitByString('::', 'a::b::c')
-- 按空白字符分割
select splitByWhitespace('hello world test')
-- 按正则分割
select splitByRegexp('[,;]', 'a,b;c')
日期转换函数
-- 转成天
SELECT toDate(created_at) AS day, count() AS dailyCount
FROM github.events
WHERE event_type = 'WatchEvent' AND repo_name = 'deepseek-ai/DeepSeek-R1'
GROUP BY ALL
ORDER BY day;
-- 转成周
SELECT toDayOfWeek(created_at) AS year, event_type, count() AS stars
FROM github.events
GROUP BY ALL
ORDER BY year ASC;
请参考 处理日期和时间的函数
例如:toStartOfYear
窗口函数
-- 窗口函数在聚合完成后执行,可以引用聚合的结果
-- 每个类型的star数量的排名
SELECT toDayOfWeek(created_at) AS dayOfWeek, event_type,
count() AS stars,
row_number() OVER(PARTITION BY event_type ORDER BY stars DESC) AS rank
FROM github.events
WHERE event_type IN (
'ForkEvent', 'IssuesEvent', 'WatchEvent', 'PushEvent', 'PullRequestEvent'
)
GROUP BY ALL
ORDER BY dayOfWeek ASC
-- 每个类型中当前star数量与最大star数量的相对大小
WITH eventTypeStars AS (
SELECT toDayOfWeek(created_at) AS dayOfWeek, event_type,
count() AS stars,
max(stars) OVER(PARTITION BY event_type) AS maxDay
FROM github.events
WHERE event_type IN (
'ForkEvent', 'IssuesEvent', 'WatchEvent', 'PushEvent', 'PullRequestEvent'
)
GROUP BY ALL
ORDER BY dayOfWeek ASC
)
SELECT *, stars/maxDay AS ratio
FROM eventTypeStars;
视图
物化视图
-- SummingMergeTree 表
CREATE TABLE repo_stars (
repo_name String,
count UInt32
)
ENGINE = SummingMergeTree
ORDER BY (repo_name)
-- 视图
CREATE MATERIALIZED VIEW github.repo_stars_mv
TO github.repo_stars AS
SELECT repo_name, count() as count
FROM github.events
WHERE event_type = 'WatchEvent'
GROUP BY repo_name
-- 查询(由于后台 merge,需要 sum(count)
SELECT repo_name
FROM github.repo_stars
GROUP BY repo_name
ORDER BY sum(count) DESC
LIMIT 1000
可刷新的物化视图
-- MergeTree 表
CREATE TABLE github.top_repos
(
`repo_name` String,
`stars` UInt64
)
ENGINE = MergeTree
ORDER BY stars;
-- 视图
CREATE MATERIALIZED VIEW github.top_repos_mv
REFRESH EVERY 30 MINUTES TO github.top_repos
(
`repo_name` String,
`stars` Int64
)
AS SELECT
repo_name,
sum(count) AS stars
FROM github.repo_events_per_day
WHERE event_type = 'WatchEvent'
GROUP BY ALL;
-- 查询
SELECT repo_name
FROM github.top_repos
ORDER BY stars DESC
LIMIT 1000
投影
在 ClickHouse 中,如果筛选条件与排序键中较早的列相匹配,查询速度会更快。通过给原始表添加投影,可以很方便地添加不同的排序方式,从而加速其查询。
-- 原始表
CREATE TABLE actors_per_repo(
`actor_login` LowCardinality(String),
`repo_name` String,
`pushes` UInt32,
`issues` UInt32,
`stars` UInt32
)
ENGINE = SummingMergeTree
ORDER BY (repo_name, actor_login);
-- 设置
ALTER TABLE github.actors_per_repo
MODIFY SETTING deduplicate_merge_projection_mode='rebuild';
-- 添加投影(根据 actor_login 排序)
ALTER TABLE github.actors_per_repo
ADD PROJECTION actor_login_order (
SELECT *
ORDER BY actor_login
);
ALTER TABLE github.actors_per_repo
MATERIALIZE PROJECTION actor_login_order;
-- 再次查询
SELECT actor_login, splitByChar('/', repo_name)[2] as project, sum(pushes) as pushes, sum(issues) as issues, sum(stars) as stars
FROM github.actors_per_repo
WHERE actor_login = 'alexey-milovidov'
GROUP BY actor_login, project
ORDER BY pushes DESC
LIMIT 5;