ClickHouse 案例学习 —— 使用 ClickHouse 探索 GitHub 统计信息

来源 使用 ClickHouse 探索 GitHub 统计信息

统计数量与大小

SELECT formatReadableQuantity(sum(rows)) AS rows,
       formatReadableSize(sum(data_compressed_bytes)) AS compressedBytes,
       formatReadableSize(sum(data_uncompressed_bytes)) AS uncompressedBytes
FROM system.parts
WHERE database = 'github' AND table = 'events';

自动分组

-- ClickHouse 会自动识别 SELECT 子句中的非聚合列(在这个例子中是 event_type),然后对这些列进行分组
SELECT event_type, count() AS count, formatReadableQuantity(count)
FROM github.events
GROUP BY ALL

字符串分割函数

-- 按字符串分割
select splitByString('::', 'a::b::c')

-- 按空白字符分割
select splitByWhitespace('hello world test') 

-- 按正则分割
select splitByRegexp('[,;]', 'a,b;c') 

日期转换函数

-- 转成天
SELECT toDate(created_at) AS day, count() AS dailyCount
FROM github.events 
WHERE event_type = 'WatchEvent' AND repo_name = 'deepseek-ai/DeepSeek-R1'
GROUP BY ALL
ORDER BY day;
-- 转成周
SELECT toDayOfWeek(created_at) AS year, event_type, count() AS stars
FROM github.events
GROUP BY ALL
ORDER BY year ASC;

请参考 处理日期和时间的函数
例如:toStartOfYear

窗口函数

-- 窗口函数在聚合完成后执行,可以引用聚合的结果

-- 每个类型的star数量的排名
SELECT toDayOfWeek(created_at) AS dayOfWeek, event_type, 
         count() AS stars, 
         row_number() OVER(PARTITION BY event_type ORDER BY stars DESC) AS rank
  FROM github.events
  WHERE event_type IN (
    'ForkEvent', 'IssuesEvent', 'WatchEvent', 'PushEvent', 'PullRequestEvent'
  )  
  GROUP BY ALL
  ORDER BY dayOfWeek ASC
-- 每个类型中当前star数量与最大star数量的相对大小
WITH eventTypeStars AS (
  SELECT toDayOfWeek(created_at) AS dayOfWeek, event_type, 
         count() AS stars, 
         max(stars) OVER(PARTITION BY event_type) AS maxDay
  FROM github.events
  WHERE event_type IN (
    'ForkEvent', 'IssuesEvent', 'WatchEvent', 'PushEvent', 'PullRequestEvent'
  )  
  GROUP BY ALL
  ORDER BY dayOfWeek ASC
)

SELECT *, stars/maxDay AS ratio
FROM eventTypeStars;

视图

物化视图

-- SummingMergeTree 表
CREATE TABLE repo_stars (
  repo_name String,
  count UInt32
)
ENGINE = SummingMergeTree 
ORDER BY (repo_name)

-- 视图
CREATE MATERIALIZED VIEW github.repo_stars_mv
TO github.repo_stars AS 
SELECT repo_name, count() as count 
FROM github.events
WHERE event_type = 'WatchEvent' 
GROUP BY repo_name

-- 查询(由于后台 merge,需要  sum(count) 
SELECT repo_name
	FROM github.repo_stars
	GROUP BY repo_name
	ORDER BY sum(count) DESC
	LIMIT 1000

可刷新的物化视图

-- MergeTree 表
CREATE TABLE github.top_repos
(
	`repo_name` String,
	`stars` UInt64
)
ENGINE = MergeTree
ORDER BY stars;

-- 视图
CREATE MATERIALIZED VIEW github.top_repos_mv
REFRESH EVERY 30 MINUTES TO github.top_repos
(
	`repo_name` String,
	`stars` Int64
)
AS SELECT
	repo_name,
	sum(count) AS stars
FROM github.repo_events_per_day
WHERE event_type = 'WatchEvent'
GROUP BY ALL;

-- 查询
SELECT repo_name
	FROM github.top_repos
	ORDER BY stars DESC
	LIMIT 1000

投影

在 ClickHouse 中,如果筛选条件与排序键中较早的列相匹配,查询速度会更快。通过给原始表添加投影,可以很方便地添加不同的排序方式,从而加速其查询。

-- 原始表
CREATE TABLE actors_per_repo(	
  `actor_login` LowCardinality(String),
  `repo_name` String,
  `pushes` UInt32,
  `issues` UInt32,
  `stars` UInt32
)
ENGINE = SummingMergeTree
ORDER BY (repo_name, actor_login);

-- 设置
ALTER TABLE github.actors_per_repo  
MODIFY SETTING deduplicate_merge_projection_mode='rebuild';

-- 添加投影(根据 actor_login 排序)
ALTER TABLE github.actors_per_repo 
ADD PROJECTION actor_login_order (
  SELECT * 
  ORDER BY actor_login
);
ALTER TABLE github.actors_per_repo 
MATERIALIZE PROJECTION actor_login_order;

-- 再次查询
SELECT actor_login, splitByChar('/', repo_name)[2] as project, sum(pushes) as pushes, sum(issues) as issues, sum(stars) as stars
FROM github.actors_per_repo
WHERE actor_login = 'alexey-milovidov'
GROUP BY actor_login, project
ORDER BY pushes DESC
LIMIT 5;

posted on 2025-06-09 14:02  Lemo_wd  阅读(20)  评论(0)    收藏  举报

导航