postgresql-cluster

description

  • cluster a table according to an index
    如果对某张表中的某些数据访问多于其它数据,可以考虑使用CLUSTER,因为如果索引标识出第一匹配行所在的存储页,所有其它行也可能已经在同一个存储页里了,这样便节省了磁盘访问的时间,加速了查询。

  • 更新表后需要重做analyze

  • 建议在cluster 之前将 maintenance_work_mem 设置为一个相当大的值

  • 根据指定索引,重新排列表数据顺序,该表仍然是根本上无序的 heap 数据。

试验

-- 实验环境:CentOS 7 + PG 11.1
-- 创建实验表,索引
create table test (id int, val numeric);
create index on test(id);
create index on test(val);
 
-- 插入实验数据
insert into test select generate_series(1,10000000),random();
 
-- 表信息
\d test
                Table "public.test"
 Column |  Type   | Collation | Nullable | Default
--------+---------+-----------+----------+---------
 id     | integer |           |          |
 val    | numeric |           |          |
Indexes:
    "test_id_idx" btree (id)
    "test_val_idx" btree (val)
 
-- 表文件
select pg_relation_filepath('test'::regclass);
 pg_relation_filepath
----------------------
 base/16385/64278
(1 row)
-- 索引文件
select pg_relation_filepath('test_id_idx'::regclass);
 pg_relation_filepath
----------------------
 base/16385/64280
(1 row)
-- 索引文件
select pg_relation_filepath('test_val_idx'::regclass);
 pg_relation_filepath
----------------------
 base/16385/64281
(1 row)
 
  • 查看列的离散程度,值越接近0,表示越离散,越接近1,表示存储比较有顺序
  • 说明目前表是根据 id 有序存储的。
select correlation from pg_stats where tablename='test' and attname='id';
 correlation
-------------
           1
 
select correlation from pg_stats where tablename='test' and attname='val';
 correlation
-------------
  0.00781794
(1 row)
 
-- 收缩表,分析表,收集统计信息。
vacuum analyze test;
 
-- 最小的ID的存储位置
select ctid,id,val from test where id=(select min(id) from test);
 ctid  | id |        val
-------+----+-------------------
 (0,1) |  1 | 0.419486843980849
(1 row)
 
-- 最小的 VAL 的存储位置
select ctid,id,val from test where val=(select min(val) from test);
    ctid    |   id   |          val
------------+--------+------------------------
 (4361,127) | 806540 | 0.00000218348577618599
(1 row)
 
 
-- 根据 val 列上的索引。做cluster。
cluster test USING test_val_idx;
 
-- 查看最小 id 的存储
select ctid,id,val from test where id=(select min(id) from test);
   ctid   | id |        val
----------+----+-------------------
 (3192,7) |  1 | 0.589303761254996
(1 row)
 
-- 查看最小 val 的存储
select ctid,id,val from test where val=(select min(val) from test);
 ctid  |   id   |          val
-------+--------+------------------------
 (0,1) | 806540 | 0.00000218348577618599
(1 row)
 
-- 收缩表,分析表,收集统计信息。
vacuum analyze test;
  • 查看列的离散程度,值越接近0,表示越离散,越接近1,表示存储比较有顺序
  • 说明目前表是根据 val 有序存储的。
select correlation from pg_stats where tablename='test' and attname='val';
 correlation
-------------
           1
(1 row)
 
select correlation from pg_stats where tablename='test' and attname='id';
 correlation
-------------
  0.00133778
(1 row)
 
-- 查询表,索引文件,都变了。说明cluster重建表和索引。
select pg_relation_filepath('test'::regclass);
 pg_relation_filepath
----------------------
 base/16385/64283
(1 row)
 
select pg_relation_filepath('test_id_idx'::regclass);
 pg_relation_filepath
----------------------
 base/16385/64289
(1 row)
 
select pg_relation_filepath('test_val_idx'::regclass);
 pg_relation_filepath
----------------------
 base/16385/64290
(1 row)
 
-- 表锁
-- session 1
mytest=# begin;
BEGIN
mytest=# cluster test USING test_id_idx ;
CLUSTER
 
-- session 2
mytest=# select * from test limit 1;
等待......
 
-- session 3
select pid,locktype,database,relation,granted,mode,b.relname from pg_locks a,pg_class b where a.relation=b.oid;
  pid  | locktype | database | relation | granted |        mode         |              relname
-------+----------+----------+----------+---------+---------------------+-----------------------------------
 22891 | relation |    16385 |     3455 | t       | AccessShareLock     | pg_class_tblspc_relfilenode_index
 22891 | relation |    16385 |     2663 | t       | AccessShareLock     | pg_class_relname_nsp_index
 22891 | relation |    16385 |     2662 | t       | AccessShareLock     | pg_class_oid_index
 22891 | relation |    16385 |     1259 | t       | AccessShareLock     | pg_class
 22891 | relation |    16385 |    11645 | t       | AccessShareLock     | pg_locks
 21609 | relation |    16385 |     2679 | t       | AccessShareLock     | pg_index_indexrelid_index
 21609 | relation |    16385 |     2678 | t       | AccessShareLock     | pg_index_indrelid_index
 21609 | relation |    16385 |     2610 | t       | AccessShareLock     | pg_index
 21609 | relation |    16385 |     3455 | t       | AccessShareLock     | pg_class_tblspc_relfilenode_index
 21609 | relation |    16385 |     2663 | t       | AccessShareLock     | pg_class_relname_nsp_index
 21609 | relation |    16385 |     2662 | t       | AccessShareLock     | pg_class_oid_index
 21609 | relation |    16385 |     2685 | t       | AccessShareLock     | pg_namespace_oid_index
 21609 | relation |    16385 |     2684 | t       | AccessShareLock     | pg_namespace_nspname_index
 21609 | relation |    16385 |     2615 | t       | AccessShareLock     | pg_namespace
 21609 | relation |    16385 |     1259 | t       | AccessShareLock     | pg_class
 21609 | relation |    16385 |    64276 | t       | AccessShareLock     | test_val_idx
 21609 | relation |    16385 |    64276 | t       | AccessExclusiveLock | test_val_idx
 21609 | relation |    16385 |    64272 | t       | AccessExclusiveLock | pg_toast_64269
 21609 | relation |    16385 |    64275 | t       | AccessShareLock     | test_id_idx
 21609 | relation |    16385 |    64275 | t       | AccessExclusiveLock | test_id_idx
 21609 | relation |    16385 |    64269 | t       | ShareLock           | test
 21609 | relation |    16385 |    64269 | t       | AccessExclusiveLock | test
 22829 | relation |    16385 |    64269 | f       | AccessShareLock     | test
(23 rows)
————————————————

试验2

test=# CREATE TABLE t_test AS SELECT *
       FROM generate_series(1, 10000000);
SELECT 10000000
test=# CREATE TABLE t_random AS SELECT *
       FROM t_test
       ORDER BY random();
SELECT 10000000

test=# CREATE INDEX idx_test ON t_test (generate_series);
CREATE INDEX
Time: 3699.416 ms (00:03.699)
test=# CREATE INDEX idx_random ON t_random (generate_series);
CREATE INDEX
Time: 5084.823 ms (00:05.085)

顺序表读取

  • 11.785毫秒
  • 所需的8k块数为138
  • “shared hit”表示所有数据都来自内存
test=# explain (analyze, buffers) SELECT *
   FROM   t_test
   WHERE   generate_series BETWEEN 1000 AND 50000;
                                                          QUERY PLAN                                                            
---------------------------------------------------------------------------------------------------------------------------------
Index Only Scan using idx_test on t_test (cost=0.43..1362.62 rows=43909 width=4) (actual time=0.017..7.810 rows=49001 loops=1)
  Index Cond: ((generate_series >= 1000) AND (generate_series <= 50000))
  Heap Fetches: 0
  Buffers: shared hit=138
Planning Time: 0.149 ms
Execution Time: 11.785 ms
(6 rows)

随机表读取

  • 13.4毫秒
  • 18799块
test=# explain (analyze, buffers) SELECT *
   FROM   t_random
   WHERE   generate_series BETWEEN 1000 AND 50000;
                                                            QUERY PLAN                                                              
-------------------------------------------------------------------------------------------------------------------------------------
Index Only Scan using idx_random on t_random (cost=0.43..1665.17 rows=53637 width=4) (actual time=0.013..9.892 rows=49001 loops=1)
  Index Cond: ((generate_series >= 1000) AND (generate_series <= 50000))
  Heap Fetches: 0
  Buffers: shared hit=18799
Planning Time: 0.102 ms
Execution Time: 13.386 ms
(6 rows)

使用cluster

test=# CLUSTER t_random USING idx_random;
CLUSTER
test=# ANALYZE;
ANALYZE
  • 读取随机表 355块
test=# explain (analyze, buffers) SELECT *
   FROM   t_random
   WHERE   generate_series BETWEEN 1000 AND 50000;
                                                            QUERY PLAN                                                              
--------------------------------------------------------------------------------------------------------------------------------------
Index Only Scan using idx_random on t_random (cost=0.43..1807.12 rows=50884 width=4) (actual time=0.012..11.737 rows=49001 loops=1)
  Index Cond: ((generate_series >= 1000) AND (generate_series <= 50000))
  Heap Fetches: 49001
  Buffers: shared hit=355
Planning Time: 0.220 ms
Execution Time: 15.267 ms
(6 rows)

结论

  • 如果数据比较静态,则CLUSTER特别有用

好好学习天天向上

posted @ 2021-08-08 22:02  y_dou  阅读(385)  评论(0)    收藏  举报