postgresql-cluster
description
-
cluster a table according to an index
如果对某张表中的某些数据访问多于其它数据,可以考虑使用CLUSTER,因为如果索引标识出第一匹配行所在的存储页,所有其它行也可能已经在同一个存储页里了,这样便节省了磁盘访问的时间,加速了查询。 -
更新表后需要重做analyze
-
建议在cluster 之前将 maintenance_work_mem 设置为一个相当大的值
-
根据指定索引,重新排列表数据顺序,该表仍然是根本上无序的 heap 数据。
试验
-- 实验环境:CentOS 7 + PG 11.1
-- 创建实验表,索引
create table test (id int, val numeric);
create index on test(id);
create index on test(val);
-- 插入实验数据
insert into test select generate_series(1,10000000),random();
-- 表信息
\d test
Table "public.test"
Column | Type | Collation | Nullable | Default
--------+---------+-----------+----------+---------
id | integer | | |
val | numeric | | |
Indexes:
"test_id_idx" btree (id)
"test_val_idx" btree (val)
-- 表文件
select pg_relation_filepath('test'::regclass);
pg_relation_filepath
----------------------
base/16385/64278
(1 row)
-- 索引文件
select pg_relation_filepath('test_id_idx'::regclass);
pg_relation_filepath
----------------------
base/16385/64280
(1 row)
-- 索引文件
select pg_relation_filepath('test_val_idx'::regclass);
pg_relation_filepath
----------------------
base/16385/64281
(1 row)
- 查看列的离散程度,值越接近0,表示越离散,越接近1,表示存储比较有顺序
- 说明目前表是根据 id 有序存储的。
select correlation from pg_stats where tablename='test' and attname='id';
correlation
-------------
1
select correlation from pg_stats where tablename='test' and attname='val';
correlation
-------------
0.00781794
(1 row)
-- 收缩表,分析表,收集统计信息。
vacuum analyze test;
-- 最小的ID的存储位置
select ctid,id,val from test where id=(select min(id) from test);
ctid | id | val
-------+----+-------------------
(0,1) | 1 | 0.419486843980849
(1 row)
-- 最小的 VAL 的存储位置
select ctid,id,val from test where val=(select min(val) from test);
ctid | id | val
------------+--------+------------------------
(4361,127) | 806540 | 0.00000218348577618599
(1 row)
-- 根据 val 列上的索引。做cluster。
cluster test USING test_val_idx;
-- 查看最小 id 的存储
select ctid,id,val from test where id=(select min(id) from test);
ctid | id | val
----------+----+-------------------
(3192,7) | 1 | 0.589303761254996
(1 row)
-- 查看最小 val 的存储
select ctid,id,val from test where val=(select min(val) from test);
ctid | id | val
-------+--------+------------------------
(0,1) | 806540 | 0.00000218348577618599
(1 row)
-- 收缩表,分析表,收集统计信息。
vacuum analyze test;
- 查看列的离散程度,值越接近0,表示越离散,越接近1,表示存储比较有顺序
- 说明目前表是根据 val 有序存储的。
select correlation from pg_stats where tablename='test' and attname='val';
correlation
-------------
1
(1 row)
select correlation from pg_stats where tablename='test' and attname='id';
correlation
-------------
0.00133778
(1 row)
-- 查询表,索引文件,都变了。说明cluster重建表和索引。
select pg_relation_filepath('test'::regclass);
pg_relation_filepath
----------------------
base/16385/64283
(1 row)
select pg_relation_filepath('test_id_idx'::regclass);
pg_relation_filepath
----------------------
base/16385/64289
(1 row)
select pg_relation_filepath('test_val_idx'::regclass);
pg_relation_filepath
----------------------
base/16385/64290
(1 row)
-- 表锁
-- session 1
mytest=# begin;
BEGIN
mytest=# cluster test USING test_id_idx ;
CLUSTER
-- session 2
mytest=# select * from test limit 1;
等待......
-- session 3
select pid,locktype,database,relation,granted,mode,b.relname from pg_locks a,pg_class b where a.relation=b.oid;
pid | locktype | database | relation | granted | mode | relname
-------+----------+----------+----------+---------+---------------------+-----------------------------------
22891 | relation | 16385 | 3455 | t | AccessShareLock | pg_class_tblspc_relfilenode_index
22891 | relation | 16385 | 2663 | t | AccessShareLock | pg_class_relname_nsp_index
22891 | relation | 16385 | 2662 | t | AccessShareLock | pg_class_oid_index
22891 | relation | 16385 | 1259 | t | AccessShareLock | pg_class
22891 | relation | 16385 | 11645 | t | AccessShareLock | pg_locks
21609 | relation | 16385 | 2679 | t | AccessShareLock | pg_index_indexrelid_index
21609 | relation | 16385 | 2678 | t | AccessShareLock | pg_index_indrelid_index
21609 | relation | 16385 | 2610 | t | AccessShareLock | pg_index
21609 | relation | 16385 | 3455 | t | AccessShareLock | pg_class_tblspc_relfilenode_index
21609 | relation | 16385 | 2663 | t | AccessShareLock | pg_class_relname_nsp_index
21609 | relation | 16385 | 2662 | t | AccessShareLock | pg_class_oid_index
21609 | relation | 16385 | 2685 | t | AccessShareLock | pg_namespace_oid_index
21609 | relation | 16385 | 2684 | t | AccessShareLock | pg_namespace_nspname_index
21609 | relation | 16385 | 2615 | t | AccessShareLock | pg_namespace
21609 | relation | 16385 | 1259 | t | AccessShareLock | pg_class
21609 | relation | 16385 | 64276 | t | AccessShareLock | test_val_idx
21609 | relation | 16385 | 64276 | t | AccessExclusiveLock | test_val_idx
21609 | relation | 16385 | 64272 | t | AccessExclusiveLock | pg_toast_64269
21609 | relation | 16385 | 64275 | t | AccessShareLock | test_id_idx
21609 | relation | 16385 | 64275 | t | AccessExclusiveLock | test_id_idx
21609 | relation | 16385 | 64269 | t | ShareLock | test
21609 | relation | 16385 | 64269 | t | AccessExclusiveLock | test
22829 | relation | 16385 | 64269 | f | AccessShareLock | test
(23 rows)
————————————————
试验2
test=# CREATE TABLE t_test AS SELECT *
FROM generate_series(1, 10000000);
SELECT 10000000
test=# CREATE TABLE t_random AS SELECT *
FROM t_test
ORDER BY random();
SELECT 10000000
test=# CREATE INDEX idx_test ON t_test (generate_series);
CREATE INDEX
Time: 3699.416 ms (00:03.699)
test=# CREATE INDEX idx_random ON t_random (generate_series);
CREATE INDEX
Time: 5084.823 ms (00:05.085)
顺序表读取
- 11.785毫秒
- 所需的8k块数为138
- “shared hit”表示所有数据都来自内存
test=# explain (analyze, buffers) SELECT *
FROM t_test
WHERE generate_series BETWEEN 1000 AND 50000;
QUERY PLAN
---------------------------------------------------------------------------------------------------------------------------------
Index Only Scan using idx_test on t_test (cost=0.43..1362.62 rows=43909 width=4) (actual time=0.017..7.810 rows=49001 loops=1)
Index Cond: ((generate_series >= 1000) AND (generate_series <= 50000))
Heap Fetches: 0
Buffers: shared hit=138
Planning Time: 0.149 ms
Execution Time: 11.785 ms
(6 rows)
随机表读取
- 13.4毫秒
- 18799块
test=# explain (analyze, buffers) SELECT *
FROM t_random
WHERE generate_series BETWEEN 1000 AND 50000;
QUERY PLAN
-------------------------------------------------------------------------------------------------------------------------------------
Index Only Scan using idx_random on t_random (cost=0.43..1665.17 rows=53637 width=4) (actual time=0.013..9.892 rows=49001 loops=1)
Index Cond: ((generate_series >= 1000) AND (generate_series <= 50000))
Heap Fetches: 0
Buffers: shared hit=18799
Planning Time: 0.102 ms
Execution Time: 13.386 ms
(6 rows)
使用cluster
test=# CLUSTER t_random USING idx_random;
CLUSTER
test=# ANALYZE;
ANALYZE
- 读取随机表 355块
test=# explain (analyze, buffers) SELECT *
FROM t_random
WHERE generate_series BETWEEN 1000 AND 50000;
QUERY PLAN
--------------------------------------------------------------------------------------------------------------------------------------
Index Only Scan using idx_random on t_random (cost=0.43..1807.12 rows=50884 width=4) (actual time=0.012..11.737 rows=49001 loops=1)
Index Cond: ((generate_series >= 1000) AND (generate_series <= 50000))
Heap Fetches: 49001
Buffers: shared hit=355
Planning Time: 0.220 ms
Execution Time: 15.267 ms
(6 rows)
结论
- 如果数据比较静态,则CLUSTER特别有用
好好学习天天向上
本文来自博客园,作者:{dyy},转载请注明原文链接:{https://www.cnblogs.com/ddlearning/}