postgresql/lightdb的核心数据结构

Datum和领域数据类型、Pointer类型

/*
 * A Datum contains either a value of a pass-by-value type or a pointer to a
 * value of a pass-by-reference type.  Therefore, we require:
 *
 * sizeof(Datum) == sizeof(void *) == 4 or 8
 *
 * (The line above is a size requirement written in math notation, not a C
 * expression: both sizes must be equal, and equal to either 4 or 8 bytes.)
 *
 * The macros below and the analogous macros for other types should be used to
 * convert between a Datum and the appropriate C type.
 *
 * Datum is an alias for uintptr_t, i.e. an unsigned integer type, so every
 * cast to Datum is an ordinary conversion to an unsigned integer.
 */
typedef uintptr_t Datum;

/*
 * A NullableDatum is used in places where both a Datum and its nullness needs
 * to be stored. This can be more efficient than storing datums and nullness
 * in separate arrays, due to better spatial locality, even if more space may
 * be wasted due to padding.
 *
 * The FIELDNO_* constants defined inside the struct body give the zero-based
 * position of the member that follows them (value is member 0, isnull is
 * member 1).  They must be kept in step with the member order.
 * NOTE(review): presumably consumed by code that addresses members by index
 * rather than by name -- confirm against the users of these constants.
 */
typedef struct NullableDatum
{
    /* zero-based index of "value" within this struct */
#define FIELDNO_NULLABLE_DATUM_DATUM 0
    Datum        value;
    /* zero-based index of "isnull" within this struct */
#define FIELDNO_NULLABLE_DATUM_ISNULL 1
    bool        isnull;
    /* due to alignment padding this could be used for flags for free */
} NullableDatum;

/*
 * Size of a Datum, expressed as an alias for SIZEOF_VOID_P (which is defined
 * outside this excerpt).  This mirrors the requirement that a Datum and a
 * void pointer have the same size.
 */
#define SIZEOF_DATUM SIZEOF_VOID_P

/*
 * DatumGetBool
 *        Returns boolean value of a datum.
 *
 * Note: any nonzero value will be considered true.
 *
 * The argument is compared against zero, so the result is the outcome of
 * that comparison cast to bool.  X is evaluated exactly once.
 */
#define DatumGetBool(X) ((bool) ((X) != 0))

/*
 * BoolGetDatum
 *        Returns datum representation for a boolean.
 *
 * Note: any nonzero value will be considered true.
 *
 * The result is normalized: it is the Datum 1 for any nonzero X and the
 * Datum 0 otherwise, never some other nonzero value.  X is evaluated
 * exactly once.
 */
#define BoolGetDatum(X) ((Datum) ((X) ? 1 : 0))

/*
 * DatumGetChar
 *        Returns character value of a datum.
 *
 * This is a plain cast: the Datum is narrowed to char with no masking or
 * range check.
 */
#define DatumGetChar(X) ((char) (X))

/*
 * CharGetDatum
 *        Returns datum representation for a character.
 *
 * This is a plain cast to Datum.  Because Datum is an unsigned type, a
 * negative X is converted by the usual unsigned-conversion rule and does not
 * stay confined to the low-order byte.
 */
#define CharGetDatum(X) ((Datum) (X))

/*
 * Int8GetDatum
 *        Returns datum representation for an 8-bit integer.
 *
 * Plain cast to Datum, same expansion as CharGetDatum.  No DatumGetInt8
 * counterpart appears in this excerpt.
 */
#define Int8GetDatum(X) ((Datum) (X))

/*
 * DatumGetUInt8
 *        Returns 8-bit unsigned integer value of a datum.
 *
 * Plain cast to uint8; no masking or range check is performed.
 */

#define DatumGetUInt8(X) ((uint8) (X))

/*
 * UInt8GetDatum
 *        Returns datum representation for an 8-bit unsigned integer.
 *
 * Plain cast to Datum.
 */
#define UInt8GetDatum(X) ((Datum) (X))

/*
 * DatumGetInt16
 *        Returns 16-bit integer value of a datum.
 *
 * Plain cast to int16: the Datum is narrowed with no masking or range check.
 */
#define DatumGetInt16(X) ((int16) (X))

/*
 * Int16GetDatum
 *        Returns datum representation for a 16-bit integer.
 *
 * Plain cast to Datum.  Because Datum is an unsigned type, a negative X is
 * converted by the usual unsigned-conversion rule and does not stay confined
 * to the low-order 16 bits.
 */
#define Int16GetDatum(X) ((Datum) (X))

/*
 * DatumGetUInt16
 *        Returns 16-bit unsigned integer value of a datum.
 *
 * Plain cast to uint16; no masking or range check is performed.
 */
#define DatumGetUInt16(X) ((uint16) (X))

/*
 * UInt16GetDatum
 *        Returns datum representation for a 16-bit unsigned integer.
 *
 * Plain cast to Datum.
 */
#define UInt16GetDatum(X) ((Datum) (X))

/*
 * DatumGetInt32
 *        Returns 32-bit integer value of a datum.
 *
 * Plain cast to int32: the Datum is narrowed (where Datum is wider) with no
 * masking or range check.
 */
#define DatumGetInt32(X) ((int32) (X))

/*
 * Int32GetDatum
 *        Returns datum representation for a 32-bit integer.
 *
 * Plain cast to Datum.  Because Datum is an unsigned type, a negative X is
 * converted by the usual unsigned-conversion rule.
 */
#define Int32GetDatum(X) ((Datum) (X))

/*
 * DatumGetUInt32
 *        Returns 32-bit unsigned integer value of a datum.
 *
 * Plain cast to uint32; no masking or range check is performed.
 */

#define DatumGetUInt32(X) ((uint32) (X))

/*
 * UInt32GetDatum
 *        Returns datum representation for a 32-bit unsigned integer.
 *
 * Plain cast to Datum.
 */
#define UInt32GetDatum(X) ((Datum) (X))

/*
 * DatumGetObjectId
 *        Returns object identifier value of a datum.
 *
 * Plain cast to Oid (a type declared outside this excerpt).
 */

#define DatumGetObjectId(X) ((Oid) (X))

/*
 * ObjectIdGetDatum
 *        Returns datum representation for an object identifier.
 *
 * Plain cast to Datum.
 */
#define ObjectIdGetDatum(X) ((Datum) (X))

/*
 * DatumGetTransactionId
 *        Returns transaction identifier value of a datum.
 *
 * Plain cast to TransactionId (a type declared outside this excerpt).
 */
#define DatumGetTransactionId(X) ((TransactionId) (X))

/*
 * TransactionIdGetDatum
 *        Returns datum representation for a transaction identifier.
 *
 * Plain cast to Datum.
 */
#define TransactionIdGetDatum(X) ((Datum) (X))

/*
 * MultiXactIdGetDatum
 *        Returns datum representation for a multixact identifier.
 *
 * Plain cast to Datum.  No DatumGetMultiXactId counterpart appears in this
 * excerpt.
 */
#define MultiXactIdGetDatum(X) ((Datum) (X))

/*
 * DatumGetCommandId
 *        Returns command identifier value of a datum.
 *
 * Plain cast to CommandId (a type declared outside this excerpt).
 */

#define DatumGetCommandId(X) ((CommandId) (X))

/*
 * CommandIdGetDatum
 *        Returns datum representation for a command identifier.
 *
 * Plain cast to Datum.
 */
#define CommandIdGetDatum(X) ((Datum) (X))

/*
 * DatumGetPointer
 *        Returns pointer value of a datum.
 *
 * Plain cast of the integer Datum to Pointer (a type declared outside this
 * excerpt).
 */
#define DatumGetPointer(X) ((Pointer) (X))

/*
 * PointerGetDatum
 *        Returns datum representation for a pointer.
 *
 * Plain cast of the pointer to the integer Datum type; only the address is
 * stored, nothing is copied.
 */
#define PointerGetDatum(X) ((Datum) (X))

/*
 * DatumGetCString
 *        Returns C string (null-terminated string) value of a datum.
 *
 * Note: C string is not a full-fledged Postgres type at present,
 * but type input functions use this conversion for their inputs.
 *
 * Implemented as DatumGetPointer followed by a cast to char *.
 */
#define DatumGetCString(X) ((char *) DatumGetPointer(X))

/*
 * CStringGetDatum
 *        Returns datum representation for a C string (null-terminated string).
 *
 * Note: C string is not a full-fledged Postgres type at present,
 * but type output functions use this conversion for their outputs.
 * Note: CString is pass-by-reference; caller must ensure the pointed-to
 * value has adequate lifetime.
 *
 * Expands directly to PointerGetDatum(X).
 */
#define CStringGetDatum(X) PointerGetDatum(X)

/*
 * DatumGetName
 *        Returns name value of a datum.
 *
 * Implemented as DatumGetPointer followed by a cast to Name (a type
 * declared outside this excerpt).
 */
#define DatumGetName(X) ((Name) DatumGetPointer(X))

/*
 * NameGetDatum
 *        Returns datum representation for a name.
 *
 * Note: Name is pass-by-reference; caller must ensure the pointed-to
 * value has adequate lifetime.
 *
 * X is dereferenced and handed to NameStr (defined outside this excerpt);
 * the Datum is built from NameStr's result, not from X itself.
 * NOTE(review): DatumGetName casts the stored pointer straight back to Name,
 * so this presumably relies on NameStr yielding the address of the start of
 * the Name object -- confirm against NameStr's definition.
 */
#define NameGetDatum(X) CStringGetDatum(NameStr(*(X)))

/*
 * DatumGetInt64
 *        Returns 64-bit integer value of a datum.
 *
 * Note: this macro hides whether int64 is pass by value or by reference.
 */

 

哈希(utils/hash/dynahash.c,hashfn.h,便利APIhsearch.h)

  在pg内核中,大量使用了hash以便快速搜索。如catcache,portal,operator等。

  哈希创建(属性),遍历,hashcode计算函数。

列表List/ListCell

  跟c++/java的List是一样的,底层数组。接口和宏在pg_list.h,是由于早期有一部分是使用Lisp开发的,所以用c重写后,仍然保留了下来。正统的是dlist,也就是链表。大部分宏的命名规范为xxx1,xxx2,xxx3,代表1、2、3个参数。参数重载也是类似。实现在src/backend/nodes/list.c中。

  PostgreSQL stores information about SQL queries in structures called nodes. Nodes are generic containers that have a type field and then a type-specific data section. Nodes are usually placed in Lists. A List is a container with an elem element, and a next field that points to the next List. These List structures are chained together in a forward linked list. In this way, a chain of Lists can contain an unlimited number of Node elements, and each Node can contain any data type. These are used extensively in the parser, optimizer, and executor to store requests and data.(注:这段英文描述的是PG13之前基于链表的List实现;从PG13开始List底层改为数组,即上文所说的“底层数组”。)

  访问listcell的宏,如下:

除了第一个直接获取指针、int、oid值外,其他直接基于list的复合操作,减少代码重复。
/*
 * ListCell payload accessors.
 *
 * lfirst, lfirst_int and lfirst_oid read the ptr_value, int_value and
 * oid_value member of the cell "lc" directly.  lfirst_node additionally
 * passes the pointer payload through castNode(type, ...).
 *
 * Each definition must sit on its own line: a preprocessor directive is only
 * recognized at the start of a line, and a macro body runs to the end of
 * that line.
 */
#define lfirst(lc)              ((lc)->ptr_value)
#define lfirst_int(lc)          ((lc)->int_value)
#define lfirst_oid(lc)          ((lc)->oid_value)
#define lfirst_node(type,lc)    castNode(type, lfirst(lc))

/*
 * Positional accessors on a list "l".
 *
 * Each one locates a cell with list_head, list_second_cell, list_third_cell,
 * list_fourth_cell or list_tail and then applies the matching lfirst variant,
 * so the pointer / int / oid / node logic lives only in the four macros above.
 */
#define linitial(l)             lfirst(list_head(l))
#define linitial_int(l)         lfirst_int(list_head(l))
#define linitial_oid(l)         lfirst_oid(list_head(l))
#define linitial_node(type,l)   castNode(type, linitial(l))

#define lsecond(l)              lfirst(list_second_cell(l))
#define lsecond_int(l)          lfirst_int(list_second_cell(l))
#define lsecond_oid(l)          lfirst_oid(list_second_cell(l))
#define lsecond_node(type,l)    castNode(type, lsecond(l))

#define lthird(l)               lfirst(list_third_cell(l))
#define lthird_int(l)           lfirst_int(list_third_cell(l))
#define lthird_oid(l)           lfirst_oid(list_third_cell(l))
#define lthird_node(type,l)     castNode(type, lthird(l))

#define lfourth(l)              lfirst(list_fourth_cell(l))
#define lfourth_int(l)          lfirst_int(list_fourth_cell(l))
#define lfourth_oid(l)          lfirst_oid(list_fourth_cell(l))
#define lfourth_node(type,l)    castNode(type, lfourth(l))

#define llast(l)                lfirst(list_tail(l))
#define llast_int(l)            lfirst_int(list_tail(l))
#define llast_oid(l)            lfirst_oid(list_tail(l))
#define llast_node(type,l)      castNode(type, llast(l))

 

链表(dlist)

  就是普通的链表实现。实现在ilist.h。那pg中什么时候用List,什么时候单/双链表?StartAutovacuumWorker中就使用了链表。目前来看,绝大部分情况下还是列表比较多。

   

内存上下文(MemoryContext,mcxt.c)

  通过palloc/pfree分配和释放。palloc分配的内存归属于当前内存上下文(CurrentMemoryContext),在该上下文被重置或删除时(例如事务结束时清理事务级上下文)会自动释放,避免了内存泄露。理解AllocSet context的设计和实现是关键。

  最重要的是理解内存层次,计算内存消耗(https://github.com/MasaoFujii/pg_cheat_funcs,pg14内置了内存上下文管理函数),各种out of memory的排查(一般是真的内存不足了,跟Linux内核的几个参数有关系,详见https://www.cnblogs.com/zhjh256/p/15424236.html)。

  掌握全局上下文,理解用户上下文和事务上下文的分配与管理机制。

palloc和palloc0的区别?palloc0在分配之后会把这块内存清零(实现上通过MemSetAligned完成),而palloc返回的内存内容是未初始化的。

 

轻量级锁(LWLock)/barrier

  修改各种内存共享数据,典型的是共享内存段中的hash、全局变量以及数组如PGPROC等。几乎到处都会使用,比spin要更慢,因为不是忙等,会有唤醒间隔。一般用mutex实现,在pg中体现为lwlock,在oracle中体现为latch。

 

latch(数据库用的更多),底层是pg事件通知(包括进程之间的,等待某个事件如i/o发生或从节点同步返回等)的可靠实现承载对象(具体通知有信号方式、poll或epoll或管道,但这些可能存在遗漏或重发、误发,所以latch是可靠保障)

/*
 * Latch structure should be treated as opaque and only accessed through
 * the public functions. It is defined here to allow embedding Latches as
 * part of bigger structs.
 *
 * Field notes -- NOTE(review): inferred from names and types only; confirm
 * against latch.c before relying on them:
 *   is_set    - flag of type sig_atomic_t; presumably nonzero while the
 *               latch is set
 *   is_shared - presumably distinguishes a shared latch from a
 *               process-local one
 *   owner_pid - presumably the PID of the process that owns the latch
 *   event     - HANDLE member that exists only when WIN32 is defined
 */
typedef struct Latch
{
    sig_atomic_t is_set;
    bool        is_shared;
    int            owner_pid;
#ifdef WIN32
    HANDLE        event;
#endif
} Latch;
// 等待事件通常发生于某个socket/文件或进程之上,在初始化进程和处理完成之后设置各种事件
typedef struct WaitEvent { int pos; /* position in the event data structure */ uint32 events; /* triggered events */ pgsocket fd; /* socket fd associated with event */ void *user_data; /* pointer provided in AddWaitEventToSet */ #ifdef WIN32 bool reset; /* Is reset of the event required? */ #endif } WaitEvent; /* forward declaration to avoid exposing latch.c implementation details */ typedef struct WaitEventSet WaitEventSet; /* * prototypes for functions in latch.c */ extern void InitializeLatchSupport(void); extern void InitLatch(Latch *latch); extern void InitSharedLatch(Latch *latch); extern void OwnLatch(Latch *latch); extern void DisownLatch(Latch *latch); extern void SetLatch(Latch *latch); extern void ResetLatch(Latch *latch); extern WaitEventSet *CreateWaitEventSet(MemoryContext context, int nevents); extern void FreeWaitEventSet(WaitEventSet *set); extern int AddWaitEventToSet(WaitEventSet *set, uint32 events, pgsocket fd, Latch *latch, void *user_data); extern void ModifyWaitEvent(WaitEventSet *set, int pos, uint32 events, Latch *latch); extern int WaitEventSetWait(WaitEventSet *set, long timeout, WaitEvent *occurred_events, int nevents, uint32 wait_event_info); extern int WaitLatch(Latch *latch, int wakeEvents, long timeout, uint32 wait_event_info); extern int WaitLatchOrSocket(Latch *latch, int wakeEvents, pgsocket sock, long timeout, uint32 wait_event_info); /* * Unix implementation uses SIGUSR1 for inter-process signaling. * Win32 doesn't need this. */ #ifndef WIN32 extern void latch_sigusr1_handler(void);

  与起mutex作用的spinlock相比,latch通常知道设置者和拥有者。比如并行执行中gather/worker通信用的shm_mq,其收/发双方就用到了latch;等待sql请求可读也是基于latch。而mutex通常不需要知道等待者和被等待者。这样latch和mutex的适用场景就比较清晰了。所以pg中的latch结构应用更加广泛,自旋锁则通常可以假设等待时间很短、实现也比较聚焦。

signal

  signal是一种技术实现手段,latch是一种逻辑同步概念,两者本质上不在同一个维度,不能直接对比。

transaction(xact,详见src/backend/access/transam/README)

  理解事务实现的三层体系(SQL层,逻辑事务层(StartTransactionCommand,实现在xact.c),事务实现细节层(low-level,主要是各种内核数据结构的设置和清理))。

  理解各种事务状态:TransactionState表示,事务状态详细信息保存在数据结构TransactionStateData中,事务状态转换图。

  各种类型的事务ID(虚拟,全局,本地,真正),各种事务ID及其比较。

  事务快照(TransactionSnapshot)、目录快照(CatalogSnapshot)、历史快照(HistoricSnapshot),注册各种快照(RegisterSnapshot)。

 

relation/RelationData(rel.h/table.c/tableam.c)

  访问catalog和访问用户表是两个逻辑,因为catalog的使用上下文非常明确,所以就尽可能多的硬编码用于优化性能。比如类型都是可以强制类型转换的,如Form_pg_database   pgdatabase = (Form_pg_database) GETSTRUCT(tup);。甚至一些后台逻辑都是硬编码的,如pgstat.c/pgstat_vacuum_stat()。

  table是用户层,relation是内部,tableAPI调用relationAPI。

  RelationData是运行时缓存relcache的元素类型,非常重要。

bitmapset

定义如下:

/*
 * Bitmapset: a word count followed by that many bitmapword elements, stored
 * inline as a flexible array member.  The storage for a Bitmapset must
 * therefore be sized to hold nwords elements of the words array.
 */
typedef struct Bitmapset
{
    int            nwords;            /* number of words in array */
    bitmapword    words[FLEXIBLE_ARRAY_MEMBER];    /* really [nwords] */
} Bitmapset;

 

relids与bitmapset的关系?bitmapset底层是一个bitmapword(按平台为32位或64位的整型)数组,所以跟dynahash一样,是为了比较速度更快:做的是按位(bitwise)比较而非字符比较。RelOptInfo中的Relids就是用bitmapset来表达的,主要是为了各种匹配的时候性能更高,但可读性不咋地。outfuncs.c里面包含了outBitmapset,可以文本化地显示bitmapset内容。

typedef Bitmapset *Relids;

其中每个relid的值是range table数组中的索引(下标)。

 

scankey

  任何一个访问和过滤谓词都是通过scankey实现的,运行时判断,所以成本是比较高的。

   pg系统目录的初始化过程可以参见https://zhuanlan.zhihu.com/p/623283855,非常详细

  cache和catalog/catcache(catcache.c,Low-level catalog cache definitions)/relcache(Relation descriptor cache definitions,relcache.h,pg_table/pg_stat_relation相关的缓存,每个表一行)/syscache(syscache.c,System catalog cache definitions,lsyscache.h是其API友好版,一般用户调用lsyscache中的函数,通过SearchSysCacheN获取,通过ReleaseSysCache释放,但不能修改返回的记录)

  catcache和syscache的差异。lsyscache.c调用syscache.c,syscache.c(catalog以及被标记为catalog的用户表都是syscache加载的,通过systable_* scan APIs访问,https://www.postgresql.org/docs/current/logicaldecoding-output-plugin.html#LOGICALDECODING-CAPABILITIES)调用catcache.c。

  

   syscache泛指系统表,实现上主要分为catcache和relcache,也包括其它一些特殊用途的如表空间。

  注意可淘汰和被淘汰的状态差异。ReleaseSysCache只是减少计数器。CatCacheRemoveCTup负责真正销毁并释放内存。

  relcache.h包括获取和relation也就是表相关的任何信息,包括根据relid获取所有的索引,分区明细等。

  catalog与pg_catalog、lt_catalog系统表的关系及initdb初始化

The values in catalog/pg_* include files are always in sync with what gets put into
the database catalogs by virtue of being the definition of the structure and contents of
the system catalog tables. The .bki files used when initdb command sets up a new
empty database cluster are generated from these .h files by genbki.pl script.

Another often used include directory is catalog/ which gives you the initial (and by
convention constant) part of most system tables so you do not need to look up things
like type identifier for int4 data type, but can use its pre-defined value
INT4OID directly. As of PostgreSQL 9.2, there are 79 constants for type IDs defined in
catalog/pg_type.h.

   可以参见https://amitlan.com/2019/06/14/caches-inval.html。

pgproc/myproc/mypgxact,xid/xmin

 https://vdocuments.net/inside-postgres-shared-memory-with-enterprisedb.html?page=16

pg_stat_activity(pgstat.c)

 

catalog缓存条目的获取与释放

todo

大量的辅助函数、宏

  在pg中,针对上述核心基础结构的操作提供了大量的辅助函数和宏。包括很多的状态访问,简单的状态转换,加锁、解锁,成员修改等。你会发现很多基础软件、框架、库中都是这个模式,包括java的,但是业务应用系统却很难应用。为什么呢?因为应用是定制化的,很多应用甚至几乎没有什么用户,甚至运行不了多久,所以根本不需要付出这么高的维护成本。前三个版本的软件你会发现基本上不需要,第四个版本开始才会需要,因为这些领域对象被大量用户访问。所以是有了可维护性的要求(有维护的需求)才会有领域对象辅助函数的需求,否则就不需要了。

难点

  基础之外的增值才是核心,以table为例,分区、TOAST、继承、vacuum属性(甚至为vacuum定制实现)、unlogged、表空间、特殊字段类型、压缩、存储优化、FDW、TAM等等。核心的table存储功能是基础,但是同质化会很严重,通常核心和外围的投入五五开,而且需要花更多的时间跟踪和管理。

 
posted @ 2023-06-04 00:01  zhjh256  阅读(55)  评论(0编辑  收藏  举报