PostgreSQL

1. 创建自增主键

create table test_increment(

id bigserial primary key,

name varchar(255)

)

insert into test_increment(name) values('wuchen'),('waahh'),('eefa');

2. 导入文本

copy ry_total from '/home/wuchen/excelAndSheetNames.csv' with delimiter '  ';

copy ry_total from '/home/wuchen/123.csv' with csv;

3. 创建内存表

步骤1：内存中创建一个512M的tmpfs文件系统，并挂载到/home/postgres/ramdisk/下
sudo mount tmpfs /home/postgres/ramdisk/ -t tmpfs -o size=512m

步骤2：更改拥有者为Postgres
sudo chown postgres:postgres /home/postgres/ -R

步骤3：创建Ram的表空间
create tablespace ramdis location '/home/postgres/ramdisk';

步骤4：创建一个表，并指定表空间到Ram Disk的表空间
create table ram (id serial, name varchar(40), age int) TABLESPACE ramdis;

4. 时间戳转日期

select to_timestamp(1585152000)--2020-03-26 00:00:00

select to_timestamp(1585238399)--2020-03-26 23:59:59

--日期转时间戳

select extract(epoch from now())

select extract(epoch from cast('2020-03-26 00:00:00' as timestamp))

5. 工具入库，效率极高

./pg_bulkload -i /home/wuchen/CIP_STAT_SPACETIME_WZ_12.bcp -O TMP_WZ -l /home/postgres/wz.log -P /home/postgres/t_bad.txt -o "TYPE=CSV" -o $'DELIMITER=\t' -d postgres -U postgres -p 5431

6. 查看磁盘空间大小

6.1 查看所有表所占磁盘空间大小

select pg_database.datname, pg_size_pretty(pg_database_size(pg_database.datname)) AS size from pg_database;

6.2 查看每个表所占用磁盘空间大小

SELECT table_schema || '.' || table_name AS table_full_name, pg_total_relation_size('"' || table_schema || '"."' || table_name || '"')AS size FROM information_schema.tables order by 2 desc;

7. awk打印某一列之后所有列

more vertica.log | grep "Starting" | awk -F' ' '{for(i=3;i<=10;i++){$i="";$NF=""};print $0}'| head -10

grep 'Executing' vertica.sql|awk -F' ' '{for(i=3;i<=9;i++){$i=""};print $0}' > no_10.sql

grep 'Begin' vertica.sql|awk -F' ' '{for(i=3;i<=9;i++){$i=""};print $0}' > no_10_1.sql

grep 'Starting' vertica.sql|awk -F' ' '{for(i=3;i<=10;i++){$i="";$NF=""};print $0}' > no_11.sql

8. 内置分区表建分区

create table car2_history partition of car2 for values from (minvalue) to (1546272000)

create table car2_201901 partition of car2 for values from (1546272000) to (1548950400)

...

create table car2_202203 partition of car2 for values from (1646064000) to (1648742400)

create table car2_future partition of car2 for values from (1648742400) to (maxvalue)

9. 多核并行计算相关参数

--> vim /data/pgsql/12/postgresql.conf

max_worker_processes = 128       # 控制整个数据库集群同时开启多少work process，必须设置

max_parallel_maintenance_workers = 30  # 创建索引用到的并发数

max_parallel_workers_per_gather = 30   # 控制一个并行EXEC NODE最多能开启多少个并行处理单元，同时参考标记参数parallel_workers,根据表的大小计算需要开多少并行处理单元，取最小值。

--> 重新加载配置文件

pg_ctl reload

show max_parallel_workers_per_gather;

--> 设置表的并行度为29

alter table car set (parallel_workers = 29);

postgres=# explain analyze select count(*) from car;

                                 QUERY PLAN                                 

\----------------------------------------------------------------------------------------------------------------------------------------------

Finalize Aggregate (cost=1341555.70..1341555.71 rows=1 width=8) (actual time=1544.933..1544.934 rows=1 loops=1)

  -> Gather (cost=1341555.62..1341555.63 rows=29 width=8) (actual time=1543.712..1650.497 rows=30 loops=1)

     Workers Planned: 29

     Workers Launched: 29

     -> Partial Aggregate (cost=1341555.62..1341555.63 rows=1 width=8) (actual time=1504.593..1504.593 rows=1 loops=30)

        -> Parallel Seq Scan on car (cost=0.00..1332503.90 rows=3620690 width=0) (actual time=0.023..1108.201 rows=3500000 loops=30)

Planning Time: 0.091 ms

JIT:

  Functions: 62

  Options: Inlining true, Optimization true, Expressions true, Deforming true

  Timing: Generation 16.837 ms, Inlining 2398.887 ms, Optimization 281.337 ms, Emission 322.743 ms, Total 3019.804 ms

Execution Time: 1651.498 ms

(12 rows)

10. PG生成序列

select r,r||'_tans' from generate_series(1,10) r;

11. 全文检索

建立rum索引： "idx_zn_json_vector" rum (to_tsvector('zhcfg'::regconfig, content_json))

标准化：postgres=# select to_tsvector('zhcfg','是138号');

to_tsvector

-----------------------

'138':2 '号':3 '是':1

create index idx_remark_gin on nb_app_adm_per_base_s using gin (replace(array_to_string_immutable(remark,','),',','') gin_trgm_ops);

12. 查询表数据量

select relname,reltuples::bigint from pg_class where relkind = 'r' and relnamespace=(select oid from pg_namespace where nspname='public') and relname like 'nb_tab_auth_%' order by reltuples desc;

select sum(cnt) from (select relname,reltuples::bigint as cnt from pg_class where relkind = 'r' and relnamespace=(select oid from pg_namespace where nspname='public') and relname like 'nb_tab_auth%' order by reltuples desc)t;

计算平均行长

select pg_relation_size('nb_tab_auth_1')::bigint/reltuples::bigint from pg_class where relkind = 'r' and relnamespace=(select oid from pg_namespace where nspname='public') and relname = 'nb_tab_auth_1';

13. 查询单表/索引所占存储

select pg_size_pretty(pg_relation_size('idx_remotectrl_gin'));

select pg_size_pretty(pg_relation_size('trgm_idx1'));

14. 索引创建

建立rum索引：create index idx_rum_wuchen_person on wuchen_person using rum(fh_all rum_tsvector_ops);

建立gin索引：create index idx_nb_app_adm_per_base_s on nb_app_adm_per_base_s using gin(md_id,idno,idtype,chname,chnamepy,nikename);

15. PG toast策略

plain 避免压缩和行外存储

extended 允许压缩和行外存储（当数据长度超过一个BLOCK的长度的四分之一（8K/4=2K）时会触发压缩）

externa 允许行外存储，不允许压缩

main 允许压缩，但不允许行外存储

16. 查看已安装插件 \dx

查看可用插件 select * from pg_available_extensions;

17. 查看PG分词器插件

\dFp

FP-hanlp分词器函数

select CL_Analyzer('text_fhik','query','123456张三李四');

select CL_Analyzer('text_fhik','query','南京市长江大桥');

select CL_Analyzer('text_fhik','query','杨进，这两天没起床');

select CL_Analyzer('text_fhik','query','唐山市气象台');

18. PG分词器函数

select to_tsvector('zhcfg','123456张三李四');

select to_tsvector('zhcfg','南京市长江大桥');

select to_tsvector('zhcfg','杨进，这两天没起床');

select to_tsvector('zhcfg','唐山市气象台');

19. 分区函数

（1）天分区

create or replace function create_range_partition_day(varchar,timestamp,timestamp,interval) returns void as $$

declare

  rec record;

begin

for rec in (select t as curmonth,t + $4 as nextmonth from generate_series($2,$3,$4) g(t)) loop

raise notice 'create table % partition of % for values from (%) to (%)',

quote_ident(lower($1)||'_'||to_char(rec.curmonth,'yyyyMMdd')),quote_ident(lower($1))

,FLOOR(EXTRACT(epoch from rec.curmonth::timestamptz)),FLOOR(EXTRACT(epoch from rec.nextmonth::timestamptz));

execute format('create table %s partition of %s for values from (%s) to (%s)',

quote_ident(lower($1)||'_'||to_char(rec.curmonth,'yyyyMMdd')),quote_ident(lower($1)),

FLOOR(EXTRACT(epoch from rec.curmonth::timestamptz)),

FLOOR(EXTRACT(epoch from rec.nextmonth::timestamptz)));

end loop;

execute format('create table %s_default partition of %s default',quote_ident(lower($1)),quote_ident(lower($1)));

end;

$$ language plpgsql;

 

create table car

(

  id bigserial not null,

  veh_plate_num varchar(100),

  bayo_no    varchar(50),

  trav_dire_code varchar(10),

  veh_lic_color_code int,

  veh_color_code char(1),

  pass_bayo_time int DEFAULT FLOOR(EXTRACT(epoch from now())) NOT NULL,

  bayo_addr_lon decimal(12,6),

  bayo_addr_lat decimal(12,6),

  json_all jsonb

) PARTITION BY RANGE (pass_bayo_time);

 

 

select create_range_partition_day('car','2022-01-01'::timestamp,'2022-03-01'::timestamp,interval '1 day');

 

（2）月分区

create or replace function create_range_partition_mon(varchar,timestamp,timestamp,interval) returns void as $$

declare

  rec record;

begin

   for rec in (select t as curmonth,t+interval '1 month' as nextmonth from generate_series($2,$3,$4) g(t)) loop

    raise notice 'create table % partition of % for values from (%) to (%)',

          quote_ident(lower($1)||'_'||to_char(rec.curmonth,'yyyyMM')),quote_ident(lower($1))

          ,FLOOR(EXTRACT(epoch from rec.curmonth::timestamptz)),FLOOR(EXTRACT(epoch from rec.nextmonth::timestamptz));

    execute format('create table %s partition of %s for values from (%s) to (%s)',

          quote_ident(lower($1)||'_'||to_char(rec.curmonth,'yyyyMM')),quote_ident(lower($1)),

          FLOOR(EXTRACT(epoch from rec.curmonth::timestamptz)),

          FLOOR(EXTRACT(epoch from rec.nextmonth::timestamptz)));

   end loop;

   execute format('create table %s_default partition of %s default',quote_ident(lower($1)),quote_ident(lower($1)));

end;

$$ language plpgsql;

 

select create_range_partition_mon('car','2022-01-01'::timestamp,'2022-03-01'::timestamp,interval '1 month');

20. 用pg_bulkload工具入库

pg_bulkload -i /home/wuchen/auth_test.bcp -O mid_auth -l /home/postgres/load.log -P /home/postgres/bad.log -o "TYPE=CSV" -o $'DELIMITER=\t' -d wuchen -U postgres -p 5432

-i 是文件路径

-O t 指定表名

21. 数组类型字段用法

21.1 精确查询

（1）remark @> ARRAY['刑蕊']::varchar[] （包含）

可以建立gin索引

postgres=# explain analyze select md_id, idno, idtype, chname, chnamepy, remark from nb_app_adm_per_base_s where remark @> ARRAY['刑蕊']::varchar[] or remark @> ARRAY['李.勇']::varchar[];

QUERY PLAN

----------------------------------------------------------------------------------------------------------------------------------------

Bitmap Heap Scan on nb_app_adm_per_base_s (cost=16.02..22.86 rows=2 width=200) (actual time=0.028..0.031 rows=2 loops=1)

Recheck Cond: ((remark @> '{刑蕊}'::character varying[]) OR (remark @> '{李文勇}'::character varying[]))

Heap Blocks: exact=2

-> BitmapOr (cost=16.02..16.02 rows=2 width=0) (actual time=0.021..0.021 rows=0 loops=1)

-> Bitmap Index Scan on idx_nb_app_adm_per_base_s (cost=0.00..8.01 rows=1 width=0) (actual time=0.013..0.013 rows=1 loops=1)

Index Cond: (remark @> '{刑蕊}'::character varying[])

-> Bitmap Index Scan on idx_nb_app_adm_per_base_s (cost=0.00..8.01 rows=1 width=0) (actual time=0.008..0.008 rows=1 loops=1)

Index Cond: (remark @> '{李文勇}'::character varying[])

Planning Time: 0.184 ms

Execution Time: 0.062 ms

(10 rows)

（2）'刑蕊'=ANY(remark)

全表扫描无法利用索引

postgres=# explain analyze select md_id, idno, idtype, chname, chnamepy, remark from nb_app_adm_per_base_s where '刑蕊'=ANY(remark) or '李文勇'=ANY(remark);

QUERY PLAN

------------------------------------------------------------------------------------------------------------------

Seq Scan on nb_app_adm_per_base_s (cost=0.00..56.15 rows=2 width=200) (actual time=0.148..0.227 rows=2 loops=1)

Filter: (('刑蕊'::text = ANY ((remark)::text[])) OR ('李文勇'::text = ANY ((remark)::text[])))

Rows Removed by Filter: 145

Planning Time: 0.160 ms

Execution Time: 0.246 ms

(5 rows)

21.2 模糊查询

全表扫描无法利用索引

explain analyze select md_id, idno, idtype, chname, chnamepy, remark,array_to_string(remark,',') from nb_app_adm_per_base_s where array_to_string(remark,',') like '%李文勇%';

QUERY PLAN

------------------------------------------------------------------------------------------------------------------

Seq Scan on nb_app_adm_per_base_s (cost=0.00..53.21 rows=1 width=232) (actual time=0.181..0.258 rows=1 loops=1)

Filter: (array_to_string(remark, ','::text) ~~ '%李文勇%'::text)

Rows Removed by Filter: 146

Planning Time: 0.072 ms

Execution Time: 0.278 ms

(5 rows)

22. 查询进程

select pid,datname,usename,application_name,client_addr,state,query from pg_stat_activity;

SELECT pg_cancel_backend(PID); ###取消一个正在执行的SQL；

SELECT pg_terminate_backend(PID); ###终止一个后台服务进程，同时释放此后台服务进程的资源。

23. psql命令

psql -U postgres -d testdb -c "create index idx_auth1_captime on nb_tab_auth1 using btree(capture_time);"

psql -U postgres -d testdb3 -f /home/wuchen/testdb.sql

24.查询database所在表空间

testdb3=# select datname,dattablespace from pg_database where datname = 'testdb3';

datname | dattablespace

---------+---------------

testdb3 | 1663

(1 row)

testdb3=# select oid,spcname from pg_tablespace where oid = 1663;

oid | spcname

------+------------

1663 | pg_default

(1 row)

查询索引所在表空间：

select * from pg_indexes where indexname='nb_tab_auth_23_capture_time_data_source_idx';

schemaname | tablename | indexname | tablespace | indexdef

------------+----------------+---------------------------------------------+------------+-----------------------------------------------------------------------------------

----------------------------------------

public | nb_tab_auth_23 | nb_tab_auth_23_capture_time_data_source_idx | tbs4 | CREATE INDEX nb_tab_auth_23_capture_time_data_source_idx ON public.nb_tab_auth_23

USING btree (capture_time, data_source)

25. 获取表结构字段

--PG

psql -U postgres -d postgres -c "select * from nb_app_adm_per_base_s2 where 1<>1" | grep '|'| tr '|' ','

--F-Pointer

sh sql.sh -c "select * from nb_app_adm_per_base_s2 where 1<>1" | grep '|'| tr '|' ','

26. 查看参数配置

select name,setting,unit,category,short_desc from pg_settings where name like '%time%';

27. 利用开窗函数分页

第一页:
select T.chname,T.max_age,T.min_idno,T.maxid
from
(
select chname,max(age) as max_age,min(idno) as min_idno,max(id) as maxid
from test_person
where 1=1 and id>1
group by chname
order by chname limit 50 offset 0
) T
order by maxid;

应用程序记下最后一行的maxid，如:2999635 传给下面的分页sql

第二页:
select T.chname,T.max_age,T.min_idno,T.maxid
from
(
select chname,max(age) as max_age,min(idno) as min_idno,max(id) as maxid
from test_person
where 1=1 and id>2999635
group by chname
order by chname limit 50 offset 0
) T
order by maxid;

28. 查看函数

\dfa --aggregates

\dfn --normal

\dft --trigger

\dfw --window functions

-- 查看详细内容

\sf array_append

29. 打开慢查询日志

log_min_duration_statement = 5 # -1 is disabled, 0 logs all statements

log_directory = 'slowlog'

30. PG注释

--创建

comment on table dtat_container_job is '作业表';

comment on column dtat_container_job.job_id is '作业名';

 

--查询

SELECT 

a.attname as "字段名", 

col_description(a.attrelid,a.attnum) as "注释", 

concat_ws('',t.typname,SUBSTRING(format_type(a.atttypid,a.atttypmod) from '\(.*\)')) as "字段类型" 

FROM 

pg_class as c, 

pg_attribute as a, 

pg_type as t 

WHERE 

c.relname = 'dtat_container_job' 

and a.atttypid = t.oid 

and a.attrelid = c.oid 

and a.attnum>0;

31. 自定义分隔符

copy aa from '/home/digoal/aa.csv' with (delimiter E'\t');

可以使用类似这样的命令进行导入:copy target_table_name (field_1, field_2, field_3)from 'C:\sourceData.txt'with (FORMAT csv,DELIMITER ',',escape '',header true,quote '"',encoding 'UTF8')

这里, with后面括号中跟的是导入参数设置:format指定导入的文件格式为csv格式delimiter指定了字段之间的分隔符号位逗号escape指定了在引号中的转义字符为反斜杠，这样即使在引号字串中存在引号本身，也可以用该字符进行转义，变为一般的引号字符，而不是字段终结header true：指定文件中存在表头。如果没有的话，则设置为falsequote指定了以双引号作为字符串字段的引号，这样它会将双引号内的内容作为一个字段值来进行处理encoding指定了文件的编码格式为utf8, 如果是别的格式则修改为适当的编码格式.

32. 切换模式

set search_path = schema_name

33. 杀会话

SELECT pg_terminate_backend(PID);

34.PG gin索引结构

Pending list：当开启fastupdate时，优先将数据写入pending list，autovacuum异步合并到gin树，可以减少插入时的IO，但查询时需要查pending list和gin树，导致搜索性能降低；

1.创建分片表

create database wuchen;

\c wuchen

#在各节点数据库安装插件

create extension citus;

#在Master节点执行命令添加节点

select * from master_add_node('172.16.20.8',5432);

select * from master_add_node('172.16.20.9',5432);

select * from master_get_active_worker_nodes();

#设置会话级分片数

set citus.shard_count = 64;

#创建分片，id是分片字段

select create_distributed_table('nb_tab_auth', 'id');

#各worker统一安装分词器插件

35. 分布式表update

因为ctid（rowid）不唯一，需用主键更新

with T as(

select auth_account,row_to_json(nb_tab_auth)::jsonb as c

from nb_tab_auth

)

update nb_tab_auth

set json_content = T.c

from T

where T.auth_account=nb_tab_auth.auth_account;

36. 查看分片表的分片字段

select * from pg_dist_partition where logicalrelid='nb_tab_auth1'::regclass;

logicalrelid | partmethod | partkey | colocationid | repmode

--------------+------------+--------------------------------------------------------------------------------------+--------------+--------

nb_tab_auth1 | h | {VAR :varno 1 :varattno 1 :vartype 23 :vartypmod -1 :varcollid 0 :varlevelsup 0 :varnoold 1 :varoattno 1 :location -1} | 2 | s

(1 row)

------

partmethod h：hash分片

partkey varattno 1：第1个字段是分片字段

37. 在分片上单独执行SQL

在每个worker上批量执行SQL

SELECT * FROM run_command_on_workers('alter system set log_statement=$$mod$$');

SELECT pg_reload_conf();

在每个分片上批量执行SQL

select * from run_command_on_shards('nb_tab_auth','select count(*) from %s');

38. 分片数的确定

官方建议

(1) OLTP场景(带分片字段SQL)小负载（小于100GB）32；大负载64或128

(2) OLAP场景(不带分片字段SQL)为worker总core数的2倍或4倍

(3) 不带分片字段SQL，分片数越多CN负载越大；CN是瓶颈时(主键查询，分组结果很多的聚合查询)QPS越低

39. 如果要指定表空间建索引

要在每个 worker节点（数据节点），对每个分片表单独建索引，索引名称不能重复

select 'create index idx_'||tablename||'gin on '||tablename||' using gin(name) tablespace tbs0;' from pg_tables where tablename??~ '^[1][\d+]';

40. 集群入库速度：

(1)无索引条件（kafka-insert）每条数据约2K

（1CN+2worker）6000条/s

（1CN+5worker）分SSD盘12线程 6254条/s（IO瓶颈）

11574条/s（IO均衡）

(2)带三个索引入库（kafka-insert）每条数据约2K

（1CN+2worker）单SATA盘12线程 2739条/s

（1CN+5worker）分SSD盘12线程开始6017条/s，随数据量增多越来越慢

41. 连接池技术：pgbouncer

20.7 没连接池，CN 节点连接worker节点走的是连接池，20.8、20.9 上有连接池。据陈华军说性能更好，未测试

42. 分片字段的选择：

原则：

(1) 每个SQL中都带等值条件的字段（比如用户ID）

(2) Join关联的字段

(3) 高基数且值分布均匀的字段

(4) 日期通常不适合作为分片字段，容易形成热点

实测：

(1) 自增ID和时间戳做分片字段时，查询TPS差别不大，优于时间戳转换成YYYYMMDDHH格式的字段

(2) 自增ID缺点，不能通过worker节点入库，因为worker节点入库需要先确定分片

(3) 大数据了测试SQL中有分片字段效果较好

43. 分片表分盘

作用：可以防止单个盘IO爆满

-- 利用脚本生成移库语句

sh movetabspace.sh testdb3 > move_testdb3.sql

-- 分别执行语句

psql -d testdb3 -f move_testdb3.sql

44. 表亲和

缺省所有分片数，副本数，分片字段类型相同的分片表被归属为同一个亲和组

自定义亲和组可以减小今后分片迁移的粒度

select create_distributed_table(‘tb1’,‘id’, colocate_with=>‘none’); --生成新的亲和组

select create_distributed_table(‘tb2’,‘id’, colocate_with=>‘tb1’); --加入已有的亲和组

45. 分布式查询执行器

set citus.task_executor_type = 'task-tracker'

(1) adaptive执行器：（默认）自适应，对于涉及跨多个分片的聚合和关联查询，有最优的快速响应

--执行不带分片字段SQL时，CN对所有worker上的所有shard同时发起连接，并执行SQL收集结果。(原real_time执行器的增强版)

(2) task-tracker执行器：任务跟踪器，适合于长时间运行，复杂的查询

--执行不带分片字段SQL时，CN只和worker上的task-tracker进程交互，调用worker上的task_tracker_assign_task()函数将任务分配给task-tracker,然后轮询任务的完成状况，待任务结束后再开一个连接从worker取回结果

(3) real_time执行器：（不推荐）与自适应执行器有类似的目的，但不太灵活，会对worker节点造成更多的连接压力

--执行不带分片字段SQL时，CN对所有worker上的所有shard同时发起连接，并执行SQL收集结果

46. citus集群不支持功能点

(1) copy 导出的时候要从数据节点一张一张导出

(2) 无法查看union all 的执行计划

47. 从worker节点入库方案

《pg_get_dist_shard_placement.txt》

在数据库中创建函数，通过传入分片字段的值获取分片表名和节点IP，用程序实现worker节点入库

48.在主节点查询某个分片表在 work节点的记录数

select n.nodename,n.nodeport,sum(t.result::int4)

from (

select *

from run_command_on_shards('nb_tab_auth','select count(*) from %s')

) as t,

pg_dist_placement p,

pg_dist_node n

where t.shardid = p.shardid and p.groupid=n.groupid

group by n.nodename,n.nodeport;

49. PG全文索引插件

# btree_gin索引插件安装

create extension btree_gin;

# rum索引插件安装

create extension rum;

# pg_zhparser分词器安装

create extension zhparser;

CREATE TEXT SEARCH CONFIGURATION zhcfg (PARSER = zhparser);

ALTER TEXT SEARCH CONFIGURATION zhcfg ADD MAPPING FOR a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z WITH simple;

postgresql.conf 中可配置

--------------------------------------------------------------------------------------------

#忽略所有的标点等特殊符号:

zhparser.punctuation_ignore = f

闲散文字自动以二字分词法聚合:

zhparser.seg_with_duality = f

#将词典全部加载到内存里:

zhparser.dict_in_memory = f

短词复合:

zhparser.multi_short = t

#散字二元复合:

zhparser.multi_duality = t

#重要单字复合:

zhparser.multi_zmain = t

#全部单字复合:

zhparser.multi_zall = f

------------------------------------------------------------------------------------------

/*会话级别

set zhparser.multi_short = t ;

set zhparser.multi_duality = t;

set zhparser.multi_zmain = t ;

50. 索引存储对比

Indexes:

"idx_gin_nb_app_adm_per_base_s_105592" gin (chname)

"idx_trgm_gin_nb_app_adm_per_base_s_105592" gin (chname gin_trgm_ops)

"idx_trgm_gist_nb_app_adm_per_base_s_105592" gist (chname gist_trgm_ops)

Tablespace: "tb1"

Access method: heap

Options: parallel_workers=20

50.1 普通gin索引

testdb=# select pg_size_pretty(pg_relation_size('idx_gin_nb_app_adm_per_base_s_105592'));

pg_size_pretty

----------------

9024 kB

(1 row)

50.2 pg_trgm插件的gin索引

testdb=# select pg_size_pretty(pg_relation_size('idx_trgm_gin_nb_app_adm_per_base_s_105592'));

pg_size_pretty

----------------

23 MB

(1 row)

50.3 pg_trgm插件的gist索引

testdb=# select pg_size_pretty(pg_relation_size('idx_trgm_gist_nb_app_adm_per_base_s_105592'));

pg_size_pretty

----------------

54 MB

(1 row)

51. 表达式索引

-- 场景：

在应用程序中，特别是使用PostgreSQL多值列（数组、全文检索、JSON）类型的业务中，除了多值列有查询需求，单值列也有查询需求。

那么当两种查询并行存在时，数据库会根据COST选择单列或多列组合索引。但是性能并不能达到极致，只是简化用户的索引设计而已。

-- 原理：

本文提到的方法，适合于单值列与多值列混合查询的场景，实际上我们用到了PostgreSQL的UDF和表达式索引的功能。

UDF将单值列和多值列合并，变成一个新的多值列。

表达式索引，针对这个UDF来构建。目的是将内部的BITMAP合并这个部分化解掉，仅使用一颗倒排树，并且这颗倒排树包含了单值列和多值列的值。

-- 1.创建一个UDF，将auth_account、jcontent值合并。

-- （1）整型单值列+多值列/分词

create or replace function gen_newarr(int, anyarray) returns text[] as $$

declare

res text[] := '{}';

x int;

begin

foreach x in array $2 loop

res := array_append(res, $1||'_'||x);

end loop;

return res;

end;

$$ language plpgsql strict immutable;

-- （2）字符型单值列+多值列/分词

create or replace function gen_newarr(varchar, anyarray) returns text[] as $$

declare

res text[] := '{}';

x varchar;

begin

foreach x in array $2 loop

res := array_append(res, $1||'_'||x);

end loop;

return res;

end;

$$ language plpgsql strict immutable;

-- 2.测试语法

select array_to_tsvector(gen_newarr(auth_account,tsvector_to_array(to_tsvector('zhcfg',jcontent))::text[])) from nb_tab_auth_104824 limit 10;

-- 3.创建测试表

create table test_index(auth_account varchar(255),jcontent jsonb);

-- 4.导入测试数据

insert into test_index select auth_account,jcontent from nb_tab_auth_104824 limit 1000000;

-- 5.创建复合索引

create index idx_test_index_gin on test_index using gin ( array_to_tsvector(gen_newarr(auth_account, tsvector_to_array(to_tsvector('zhcfg',jcontent)))) );

Time: 447812.016 ms (07:27.812)

-- 6.查询表/索引的存储

testdb3=# select pg_size_pretty(pg_relation_size('test_index'));

pg_size_pretty

----------------

1553 MB

testdb3=# select pg_size_pretty(pg_relation_size('idx_test_index_gin'));

pg_size_pretty

----------------

4951 MB

(1 row)

-- 7.查询SQL

testdb3=# explain analyze select jcontent from test_index where array_to_tsvector(gen_newarr(auth_account,tsvector_to_array(to_tsvector('zhcfg',jcontent))::text[])) @@ tsquery '18841601440_18841601440';

QUERY PLAN

---------------------------------------------------------------------------------------------------------------------------------------------------------------------------

Bitmap Heap Scan on test_index (cost=74.75..20079.74 rows=5000 width=1395) (actual time=0.048..0.049 rows=1 loops=1)

Recheck Cond: (array_to_tsvector(gen_newarr(auth_account, tsvector_to_array(to_tsvector('zhcfg'::regconfig, jcontent)))) @@ '''18841601440_18841601440'''::tsquery)

Heap Blocks: exact=1

-> Bitmap Index Scan on idx_test_index_gin (cost=0.00..73.50 rows=5000 width=0) (actual time=0.037..0.037 rows=1 loops=1)

Index Cond: (array_to_tsvector(gen_newarr(auth_account, tsvector_to_array(to_tsvector('zhcfg'::regconfig, jcontent)))) @@ '''18841601440_18841601440'''::tsquery)

Planning Time: 0.157 ms

Execution Time: 0.113 ms

(7 rows)

Time: 1.212 ms

/普通索引/

create table test_index2(auth_account varchar(255),jcontent jsonb);

insert into test_index2 select auth_account,jcontent from nb_tab_auth_104824 limit 1000000;

create index idx1_test_index2_gin on test_index using gin (auth_account);

Time: 4910.510 ms (00:04.911)

create index idx2_test_index2_gin on test_index using gin (to_tsvector('zhcfg',jcontent));

Time: 176472.011 ms (02:56.472)

testdb3=# select pg_size_pretty(pg_relation_size('idx1_test_index2_gin'));

pg_size_pretty

----------------

65 MB

(1 row)

testdb3=# select pg_size_pretty(pg_relation_size('idx2_test_index2_gin'));

pg_size_pretty

----------------

536 MB

(1 row)

-- 查询SQL

testdb3=# explain analyze select jcontent from test_index2 where to_tsvector('zhcfg',jcontent) @@ tsquery '18841601440' and auth_account = '18841601440';

QUERY PLAN

----------------------------------------------------------------------------------------------------------------------------------------------

Gather (cost=0.00..251850.01 rows=1 width=1396) (actual time=1.129..221.907 rows=1 loops=1)

Workers Planned: 5

Workers Launched: 5

-> Parallel Seq Scan on test_index2 (cost=0.00..251850.01 rows=1 width=1396) (actual time=143.866..178.349 rows=0 loops=6)

Filter: (((auth_account)::text = '18841601440'::text) AND (to_tsvector('zhcfg'::regconfig, jcontent) @@ '''18841601440'''::tsquery))

Rows Removed by Filter: 166666

Planning Time: 0.264 ms

Execution Time: 221.959 ms

(8 rows)

Time: 223.013 ms

-- 总结

1.表达式索引提升性能100~200倍

2.表达式索引比单独索引存储膨胀8倍

3.表达式索引创建时间是单独索引2.5倍

4.表达式索引查询条件只能用and

52. 权限控制

52.1 列权限

1.创建用户，（create role 与 create user 命令是等效的，只是create user默认是login的，而前者默认是非login的）

postgres=# create role wuchen login nosuperuser nocreatedb password 'wuchen';

--等同于create user wuchen password 'wuchen';

2.赋予查询一列的权限

postgres=# grant select (time) on nb_tab_auth to wuchen;

3.收回权限

postgres=# revoke select (time) on nb_tab_auth from wuchen;

4.查询该列正常

postgres=> select time from nb_tab_auth limit 1;

time

------------

1513576860

(1 row)

5.查询其他列报错

postgres=> select time,userid from nb_tab_auth limit 1;

ERROR: permission denied for table nb_tab_auth

6.系统表pg_attribute.attacl 控制权限（通过直接修改字段attacl即可控制权限）

postgres=> select attname,attacl from pg_attribute where attrelid =(select oid from pg_class where relname ='wx_contacts_group_id_seq') and attname in ('time','userid');

attname | attacl

---------+---------------------

time | {wuchen=r/postgres}

userid |

(2 rows)

52.2 表级权限

1）在建库时指定用户test 为 owner可以赋予test整个库的权限：

create database container owner test;

2）对于已经存在库，且库里有表的情况，需单独对表进行赋权：

alter database container owner to test;

alter table wx_msg owner to test;

ps：用下面的语句刷

tools psql -d container -c "\dt" | awk -F'|' '{ print "alter table " $2 " owner to test;"}' > alter.sql

3）给予其他用户以表的权限：

grant all on database container to ace ;

4）对于已经建好的表，需要单独对表进行赋权：

grant all on wx_call_contact_detail to ace;

ps：用下面的语句刷

tools psql -d container -c "\dt" | awk -F'|' '{ print "grant all on " $2 " to ace;"}' > alter2.sql

5）查看单表的权限：

container=# select relname,relacl from pg_class where relname ='wx_register';

relname | relacl

-------------+--------------------------------------

wx_register | {test=arwdDxt/test,ace=arwdDxt/test}

简化命令：

\dp wx_register

参数说明：

角色名=xxxx -- 被授予给一个角色的特权

=xxxx -- 被授予给 PUBLIC 的特权

r -- SELECT ("读")

w -- UPDATE ("写")

a -- INSERT ("追加")

d -- DELETE

D -- TRUNCATE

x -- REFERENCES

t -- TRIGGER

X -- EXECUTE

U -- USAGE

C -- CREATE

c -- CONNECT

T -- TEMPORARY

arwdDxt -- ALL PRIVILEGES （对于表，对其他对象会变化）

* -- 用于前述特权的授权选项

6）查看用户拥有的权限

container=# select * from information_schema.role_table_grants where grantee='ace';

---------+---------+---------------+--------------+---------------------------+----------------+--------------+----------------

53. 德哥解答

53.1 pg 可以使用hint指定走某个索引吗？

• 用 pg_hint_plan 插件

• 为什么要问这个问题?

• 优化器没有选择合适索引

• 为什么优化器没有选择合适索引

• 统计信息维度不够、统计信息不准确、算子参数不准确

• 有哪些优化器? 分别解决什么问题?

• 自动: cbo, geqo, aqo

• 人为: srplan, hint

• 自动化趋势不可逆转, 未来一定是自动化的天下.

53.2 pg 针对分表要新增一个字段，需要对每个分表编辑吗？

• 不需要, 直接操作主表, PS: 所有DDL都有排他锁, 注意防止雪崩(后面会讲).

53.3 pg 在保持性能的前提下，最多能存多少条数据呀？

• 为什么要问这个问题?

• 一定是没有遇到过性能瓶颈, 否则不就知道了么, 还有啥疑问

• 答案

• 摩尔定律+安迪比尔定律

• 大了之后还需要考虑的问题:

• 备份

• 时间点恢复

• 52C 384GB 16TB SSD

• PG 12 TPCB 100亿

• RO: 100万量级qps

• RW: 50万量级qps

53.4 pg 建议在多少条记录时分库/分表？

• 寻址上限

ctid(pageid, offset)->pageid 32位->pagesize(2k-32k) ->最大128TB每分区

• 存储上限

表空间->目录->文件系统->卷->块设备

• 性能上限

垃圾回收, table/process, index/multiprocess, (更新模型, 瓶颈与 IO|CPU 匹配)

建索引, 支持并行, 10亿记录, 创建索引252秒

freeze, 32bit xid, 只写(一次性freeze)、更改(版本变化后需要freeze)

逻辑备份, 快照, 垃圾回收oldest位点, DDL互斥

rewrite table, 某些情况需要重写表(vacuum full, 更改字段类型导致内部存储值发生变化, 老版本加字段默认值)

• 更新多的表：16GB or 1亿 / 分区

• 更新少的表：64GB / 分区

• 分库/建只读实例：热数据>内存/2 , 写入瓶颈, 查询瓶颈

53.5 pg delete记录后水位线会下降吗？

• 索引, 复用空间, 不回收, 不降水位.

• HEAP, 复用空间, 无有效记录的空页为什么不能从文件系统回收空间?

试想索引如何检索记录? 当记录在HEAP末尾页, 不在末尾页时有什么区别?末尾连续空页可以直接回收, 其他空页无法直接回收

• 水位没降下来怎么办?

表未来还要不要继续写入?

vacuum full, pg_repack , (rewrite, 前提保留足够存储空间)

53.6 pg 一般tps/qps能达到多少？

• 任何没有说明环境、场景、测试方法的性能指标都没有参考价值

• 答案

• 52C 384GB 16TB SSD

• PG 12 TPCB 100亿

• RO: 100万量级qps（read only）

• RW: 50万量级qps

53.7 pg 的统计信息收集是按照什么规则？

• 为什么要问这个问题?

• 一定是遇到过统计信息不准、未开启, 引起的SQL执行计划不准的问题

• 答案

• autovacuum_analyze_scale_factor

• autovacuum_analyze_threshold

• table | global level set.

• 开启自动收集->计数器->监工发现需要统计的表->分配工人干活->更新统计信息

53.8 pg 支持json/jsonb吗？

• PG: 无敌的sqljson功能

• sql 2016的sql/json标准有15条， PG 支持14道标准

• PG (12 14/15)，

• oracle(18c 11/15),

• mysql(8.0.4 5/15),

• sqlserver(2017 2/15) 。

• 索引支持

• jsonpath搜索语法

https://github.com/digoal/blog/blob/master/202010/20201013_01.md

53.9 pg 索引类型，除了btree还有哪些，可以

建立btree-gist联合索引吗？

• 《[直播]为什么饿了么网上订餐不会凉凉 & 牛顿发现万有引力有关?》

• https://github.com/digoal/blog/blob/master/202010/20201018_01.md

• 《[直播]为什么打车和宇宙大爆炸有关?》

• https://github.com/digoal/blog/blob/master/202009/20200926_02.md

53.10 canceling statement due to conflict withrecovery什么情况，怎么处理？

• https://github.com/digoal/blog/blob/master/202005/20200518_01.md

• 答案

• rw instance -> redo -> readonly instance -> startup process replay redo -> conflict with query -> recovery等query -> 等待是有限度的 -> cancel query

• replay的redo里面包含什么信息时, 会和query冲突?

• 最常见的情况: vacuum某些tuple version, 与query快照的xid相冲突

• 从节点调大replay等待时长, 主节点设置延迟回收, 从节点设置query feedback

• 可能导致主节点vacuum出现无用功, 或者膨胀

• 其他: 删除表空间, 锁冲突, pinned buffer, 死锁等.

53.11 在频繁更新和删除的系统中，如何比较好地避免表和索引膨胀从而引起的sql效率降低？

• 开启自动垃圾回收, autovacuum

• 配置足够多工人, autovacuum_max_workers

• 配置足够频繁的监测周期, autovacuum_naptime

• 配置足够小的触发阈值, autovacuum_vacuum_scale_factor,autovacuum_vacuum_threshold

• 避免工人频繁休息, autovacuum_vacuum_cost_delay, autovacuum_vacuum_cost_limit

• 避免工人做无用功, oldest xid snapshot. standby feedback, vacuum defer,old_snapshot_threshold, long query, long xact, long 2pc.

• 避免超大单分区, 因为单个分区单个vacuum工人为之服务, 无法并行

• 避免重复扫描索引, autovacuum_work_mem, 单个表分区的垃圾记录数*17字节不要超出autovacuum_work_mem

53.12 pg 大表加字段及默认值会锁表吗？哪些版本能很好地解决这种情况？

• PG 11及以后的版本, 加自动和默认值不需要rewrite table.

• 其他版本加字段不含默认值不需要rewrite table, 包含默认值需要rewrite table.

• 锁不锁关系不大, 关键是:

• 多大的锁, 排他, 与任何其他锁都会发生冲突

• 锁影响多长时间

• 取决于整个过程要多久

• 未持有锁, 等待中, 会不会与其他会话发生冲突

• 会, 大多数人栽在这里.

• 如何避免雪崩

• 执行DDL前, 设置锁请求超时, 然后再执行DDL

53.13 pg 大表更改字段类型会锁表吗？

• 一切DDL都会锁表

• 影响有多大, 取决于锁时长

• 数据内部存储未变化, 不需要rewrite table, 仅修改元数据

• 数据内部存储发生变化, 需要rewrite table

53.14 pg 想一次性对表（包括将来新建的表）赋权该如何操作？

• 为什么要问这个问题?

• mysql用户的问题? https://www.postgresql.org/docs/12/sql-alterdefaultprivileges.html

• https://www.postgresql.org/docs/12/sql-grant.html

• 答案

• GRANT { { SELECT | INSERT | UPDATE | DELETE | TRUNCATE | REFERENCES | TRIGGER }

• [, ...] | ALL [ PRIVILEGES ] }

• ON { [ TABLE ] table_name [, ...]

• | ALL TABLES IN SCHEMA schema_name [, ...] }

• TO role_specification [, ...] [ WITH GRANT OPTION ]

• ALTER DEFAULT PRIVILEGES

• [ FOR { ROLE | USER } target_role [, ...] ]

• [ IN SCHEMA schema_name [, ...] ]

• abbreviated_grant_or_revoke

• where abbreviated_grant_or_revoke is one of:

• GRANT { { SELECT | INSERT | UPDATE | DELETE | TRUNCATE | REFERENCES | TRIGGER }

• [, ...] | ALL [ PRIVILEGES ] }

• ON TABLES

• TO { [ GROUP ] role_name | PUBLIC } [, ...] [ WITH GRANT OPTION ]

53.15 ERROR: index row requires 8600 bytes,maximum size is 8191，怎么办？

• 为什么要问这个问题?

• 设计问题? 未限制字段长度. 然后报错让DBA去解决?

• 答案

• hash index

• hash value长度固定

• function index

• func(col), 查询时也使用func(col)

• partial index

• index on tbl (col) where col_length < 8192

• index on tbl (1) where col_length >= 8192

• select x on tbl where ... and col_length < 8192 union all ... and col_length >= 8192

53.16 current transaction is aborted, commands ignored until end of transaction block，这是啥，怎么办？

• begin;...错误; end; 结束事务后再发起新事务

• 使用自动事务提交

• 或者看看你用的驱动有没有相关参数、相关模块可以自动结束事务.

53.17 mysql不支持位图存储、 PG支持位图存储吗？

• 什么情况适合用位图索引?

• 数据量大, 唯一值集合小, 数据变更少, 按单值、组合值查询count?

• 答案

• 8.2支持过, 后来从代码中去掉了, 取而代之的: GIN、Bloom、更通用.

• btree: 等值、范围、排序、唯一约束

• hash: 等值

• gin, 数组包含、相交; 全文检索; 模糊查询; 正则匹配; JSON搜索; 相似查询; 任意字段组合等值搜索;

• gist: R tree, RD tree通用自定义平衡树; 地理信息搜索、距离排序; 全文检索; 多维向量距离排序; 排他约束

• spgist: 通用自定义非平衡树; quad tree, k-d tree, radix tree; 空间搜索; 排他约束;

• brin: 时序区间搜索; 线性相关存储数据搜索;

• bloom: 任意字段组合等值、不等过滤;

• rum: 全文检索; 文本相似; 数组相似;

• zombodb: ElasticSearch扩展引擎索引; (数据存PG, 索引在ES)

• pase: 阿里云PG专供: 多维向量距离排序; 图像识别; 相似

54. PG坏块处理

1.PG库查询报错：invalid page in block 18432 of relation base/26115/31469

其中26115是dboid、31469是reloid

执行sql：select oid,relname from pg_class where oid = 31469;

（1）如果对象是索引，则删除重建

（2）如果对象是表，则跳过坏块，将正常的数据导出后重新导入，具体方法如下：

a）在postgresql.conf中添加一行：zero_damaged_pages = 1

b）重启数据库

c）执行copy 表 to 文件，导出没有坏块的文件

d）truncate表

e）执行copy 表 from 文件，导入没有坏块的文件

风险：如果是表，坏块对应的数据丢失

https://www.cnblogs.com/xibuhaohao/p/11328891.html

t ↩︎

posted @ 2021-10-02 20:13 舍得先森阅读(563) 评论(0) 收藏举报

刷新页面返回顶部

舍得先森

PostgreSQL

1. 创建自增主键

2. 导入文本

3. 创建内存表

4. 时间戳转日期

5. 工具入库，效率极高

6. 查看磁盘空间大小

6.1 查看所有表所占磁盘空间大小

6.2 查看每个表所占用磁盘空间大小

7. awk打印某一列之后所有列

8. 内置分区表建分区

9. 多核并行计算相关参数

10. PG生成序列

11. 全文检索

12. 查询表数据量

13. 查询单表/索引所占存储

14. 索引创建

15. PG toast策略

16. 查看已安装插件 \dx

17. 查看PG分词器插件

18. PG分词器函数

19. 分区函数

20. 用pg_bulkload工具入库

21. 数组类型字段用法

21.1 精确查询

21.2 模糊查询

22. 查询进程

23. psql命令

24.查询database所在表空间

25. 获取表结构字段

26. 查看参数配置

27. 利用开窗函数分页

28. 查看函数

29. 打开慢查询日志

30. PG注释

31. 自定义分隔符

32. 切换模式

33. 杀会话

34.PG gin索引结构

35. 分布式表update

36. 查看分片表的分片字段

37. 在分片上单独执行SQL

38. 分片数的确定

39. 如果要指定表空间建索引

40. 集群入库速度：

41. 连接池技术：pgbouncer

42. 分片字段的选择：

43. 分片表分盘

44. 表亲和

45. 分布式查询执行器

46. citus集群不支持功能点

47. 从worker节点入库方案

48.在主节点查询某个分片表在 work节点的记录数

49. PG全文索引插件

闲散文字自动以二字分词法聚合:

短词复合:

50. 索引存储对比

50.1 普通gin索引

50.2 pg_trgm插件的gin索引

50.3 pg_trgm插件的gist索引

51. 表达式索引

52. 权限控制

52.1 列权限

52.2 表级权限

1）在建库时指定用户test 为 owner可以赋予test整个库的权限：

2）对于已经存在库，且库里有表的情况，需单独对表进行赋权：

3）给予其他用户以表的权限：

4）对于已经建好的表，需要单独对表进行赋权：

5）查看单表的权限：

6）查看用户拥有的权限

53. 德哥解答

53.1 pg 可以使用hint指定走某个索引吗？

• 用 pg_hint_plan 插件

53.2 pg 针对分表要新增一个字段， 需要对每个分表编辑吗？

53.3 pg 在保持性能的前提下， 最多能存多少条数据呀？

• 摩尔定律+安迪比尔定律

53.4 pg 建议在多少条记录时分库/分表？

• 寻址上限

• 存储上限

53.2 pg 针对分表要新增一个字段，需要对每个分表编辑吗？

53.3 pg 在保持性能的前提下，最多能存多少条数据呀？

53.9 pg 索引类型，除了btree还有哪些，可以

53.10 canceling statement due to conflict withrecovery什么情况，怎么处理？

53.11 在频繁更新和删除的系统中，如何比较好地避免表和索引膨胀从而引起的sql效率降低？

53.12 pg 大表加字段及默认值会锁表吗？哪些版本能很好地解决这种情况？

53.14 pg 想一次性对表（包括将来新建的表）赋权该如何操作？

53.15 ERROR: index row requires 8600 bytes,maximum size is 8191，怎么办？

53.16 current transaction is aborted, commands ignored until end of transaction block，这是啥，怎么办？