PG存储介质管理器（SMGR）——磁盘管理器：postgresql-8.4.1/src/backend/storage/md

　　磁盘管理器是SMGR的一种具体实现，它对外提供了管理磁盘介质的接口，其主要实现在文件md.c中。磁盘管理器并非对磁盘上的文件直接进行操作，而是通过VFD机制进行文件操作。在PG中，凡是要对存储在磁盘中的表进行磁盘操作（如打开/关闭、读/写等），都是必须与磁盘管理器打交道，由它来统一处理。对文件的操作是通过磁盘文件描述符（MdfdVec）进行的。MdfdVec的各个字段意义如下：

mdfd_vfd：该文件所对应的VFD
mdfd_segno：由于较大的表文件会被切分成多个文件（可以称为段），因此用这个字段表示当前这个文件是表文件的第几段
mdfd_chain：指向同一表文件下一段的MdfdVec，通过这个字段可以把表文件的各段链接起来，形成链表

通过mdfd_chain链表使得大于操作系统文件大小（通常为2GB）限制的表文件更易被支持，因为我们可以将表文件切割成段，使每段小于2GB

/* Descriptor for one open segment file of a relation; segments of the same relation are linked into a list. */
1 typedef struct _MdfdVec{
2     File mdfd_vfd;  // VFD number of the disk file backing this segment
3     BlockNumber mdfd_segno;  // segment number of this file within the relation file
4     struct _MdfdVec* mdfd_chain;  // next segment of the same relation file (NULL does not imply no further segment exists; it may simply not be open yet)
5 } MdfdVec;

需要注意的是mdfd_chain为空并不一定表示表仅有一段，可能其他段没有打开。对于一个大表的操作，往往是逐段打开的。在PG中，使用结构体SmgrRelation来表示一个被打开的表文件，该结构中保存了该表文件对应的MdfdVec链表的头部。所有的SmgrRelation被组织成一个Hash表，该Hash表仅对后台进程可见，这样可以通过Hash表快速找到表对应的MdfdVec链表。在md.c中仅提供了读写文件的接口函数，其具体的实现则在文件fd.c中。

　　磁盘存储管理器在自己的描述符池中跟踪打开的文件描述符。这样做是为了在那些有文件大小限制（通常为2GB）的操作系统上更容易地支持大的关系表。为了做到这一点，我们将关系表分成若干“段”文件，每个段文件都小于操作系统的文件大小限制。段大小由pg_config.h中的配置常量RELSEG_SIZE设置。在磁盘上，关系表必须由连续编号的段文件组成：

　　--零个或多个完整段，每段恰好包含RELSEG_SIZE个块

　　--恰好一个部分段，其大小（以块计）满足0<=size<RELSEG_SIZE

　　--（可选）任意数量的非活动段，其大小为0个块。

完整段和部分段统称为“活动”段。非活动段是那些曾经包含数据但由于mdtruncate()操作而当前不需要的段。将它们保留为零大小而不是取消链接的原因是，其他后端和/或bgwriter可能持有对此类段的打开文件引用。如果关系表在mdtruncate()之后再次展开，则停用的段将再次变为活动状态，重要的是，这样的文件引用仍然有效，否则数据可能会被写入段文件未链接的旧副本中，该副本最终将消失。

　　存储在SMgrRelation缓存中的文件描述符指针（md_fd field）只是MdfdVec对象列表的头，每个段一个。但请注意md_fd指针可以为NULL，表示关系表未打开。还要注意，mdfd_chain==NULL并不一定意味着关系在此段之后没有另一段；我们可能只是还没有打开下一段。（无论如何，我们不能将“所有段都在链中”作为不变项，因为另一个后端可以在我们不查找时扩展关系表）但是，我们不会为非活动段创建链条目；一旦找到部分段，我们就假定任何后续段都是非活动的。所有MdfdVec对象都在MdCxt内存上下文中。

API

forknum 除了关系表的主数据段文件外，PG还会使用相同的文件号创建额外的文件（称为fork），并用forknum加以区分。目前支持的额外文件包括用于存储表内部空闲空间信息的文件（Free Space Map）以及可见性映射文件（Visibility Map）等。

/* Forward declarations of the file-local (static) helper routines listed below. */
1 static MdfdVec *mdopen(SMgrRelation reln, ForkNumber forknum, ExtensionBehavior behavior);
2 static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg);
3 static void register_unlink(RelFileNode rnode);
4 static MdfdVec *_fdvec_alloc(void);
5 static MdfdVec *_mdfd_openseg(SMgrRelation reln, ForkNumber forkno, BlockNumber segno, int oflags);
6 static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forkno, BlockNumber blkno, bool isTemp, ExtensionBehavior behavior);
7 static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg);

　　mdopen打开指定的关系表，当有多个segment时，只打开第一个。如果第一个segment不存在，则根据behavior参数或者调用ereport报错，或者返回NULL。We treat EXTENSION_CREATE the same as EXTENSION_FAIL; EXTENSION_CREATE means it's OK to extend an existing relation, not to invent one out of whole cloth.

 /* mdopen: open the first segment of a relation fork and cache it in reln->md_fd[forknum].
  * Returns the cached MdfdVec when the fork is already open. When the file cannot be
  * opened, returns NULL if behavior is EXTENSION_RETURN_NULL and FILE_POSSIBLY_DELETED(errno)
  * holds; in every other failure case it raises ERROR. */
 1 static MdfdVec * mdopen(SMgrRelation reln, ForkNumber forknum, ExtensionBehavior behavior) {
 2     MdfdVec    *mdfd;
 3     char       *path;
 4     File        fd;
 5     /* No work if already open */
 6     if (reln->md_fd[forknum])
 7         return reln->md_fd[forknum];
 8     path = relpath(reln->smgr_rnode, forknum); // path of the physical file backing this relation fork
 9     fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
10 
11     if (fd < 0) {
12         /*During bootstrap, there are cases where a system relation will be accessed (by internal backend processes) before the bootstrap script nominally creates it.  Therefore, accept mdopen() as a substitute for mdcreate() in bootstrap mode only. (See mdcreate) */
13         if (IsBootstrapProcessingMode())
14             fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
15         if (fd < 0) {
16             if (behavior == EXTENSION_RETURN_NULL &&
17                 FILE_POSSIBLY_DELETED(errno)) {
18                 pfree(path);
19                 return NULL;
20             }
21             ereport(ERROR, (errcode_for_file_access(),
22                      errmsg("could not open relation %s: %m", path)));
23         }
24     }
25     pfree(path);
26 
27     reln->md_fd[forknum] = mdfd = _fdvec_alloc();
28     mdfd->mdfd_vfd = fd;
29     mdfd->mdfd_segno = 0;
30     mdfd->mdfd_chain = NULL;
31     Assert(_mdnblocks(reln, forknum, mdfd) <= ((BlockNumber) RELSEG_SIZE));
32     return mdfd;
33 }

对于查找关系表物理文件的路径请参见源代码catalog.c

　　_fdvec_alloc函数创建一个MdfdVec对象

/* _fdvec_alloc: allocate an MdfdVec in the MdCxt memory context; the fields are left for the caller to fill in. */
1 static MdfdVec * _fdvec_alloc(void) {
2     return (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(MdfdVec));
3 }

　　_mdfd_openseg函数打开关系表中指定的segment文件，并创建一个MdfdVec对象

 /* _mdfd_openseg: open segment number segno of a relation fork and wrap it in a new MdfdVec.
  * Segment 0 uses the bare relation path; later segments use "<path>.<segno>".
  * oflags is OR-ed into the open flags (callers pass O_CREAT or 0).
  * Returns NULL, without raising an error, if the file cannot be opened. */
 1 static MdfdVec * _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno, int oflags) {
 2     MdfdVec    *v;
 3     int    fd;
 4     char       *path, *fullpath;
 5 
 6     path = relpath(reln->smgr_rnode, forknum);
 7     if (segno > 0) {
 8         /* be sure we have enough space for the '.segno' */
 9         fullpath = (char *) palloc(strlen(path) + 12);
10         sprintf(fullpath, "%s.%u", path, segno);
11         pfree(path);
12     } else
13         fullpath = path;
14 
15     /* open the file */
16     fd = PathNameOpenFile(fullpath, O_RDWR | PG_BINARY | oflags, 0600);
17     pfree(fullpath);
18     if (fd < 0)
19         return NULL;
20 
21     /* allocate an mdfdvec entry for it */
22     v = _fdvec_alloc();
23     /* fill the entry */
24     v->mdfd_vfd = fd;
25     v->mdfd_segno = segno;
26     v->mdfd_chain = NULL;
27     Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
28     /* all done */
29     return v;
30 }

　　_mdfd_getseg函数找到指定数据块所在的segment。如果segment不存在，函数根据behavior参数的取值，或者调用ereport报错，或者返回NULL，或者创建该segment。isTemp仅在EXTENSION_CREATE的情况下需要保证正确。如果mdfd_chain为NULL，则在调用者要求时（比如mdextend函数）创建新segment；在WAL恢复过程中也会创建segment文件，这会允许replaying WAL data that has a write into a high-numbered segment of a relation that was later deleted。

 /* _mdfd_getseg: return the MdfdVec of the segment that contains block blkno, opening
  * (and linking into the chain) every segment from the first one up to the target.
  * A missing segment is created when behavior is EXTENSION_CREATE or InRecovery is set;
  * otherwise the function returns NULL (EXTENSION_RETURN_NULL with FILE_POSSIBLY_DELETED(errno))
  * or raises ERROR. isTemp is only passed on to mdextend. */
 1 static MdfdVec * _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, bool isTemp, ExtensionBehavior behavior) {
 2     MdfdVec    *v = mdopen(reln, forknum, behavior);
 3     BlockNumber targetseg;
 4     BlockNumber nextsegno;
 5     if (!v)
 6         return NULL;            /* only possible if EXTENSION_RETURN_NULL */
 7     targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
 8     for (nextsegno = 1; nextsegno <= targetseg; nextsegno++){
 9         Assert(nextsegno == v->mdfd_segno + 1);
10         if (v->mdfd_chain == NULL) {
11             if (behavior == EXTENSION_CREATE || InRecovery){
12                 if (_mdnblocks(reln, forknum, v) < RELSEG_SIZE){
13                     char       *zerobuf = palloc0(BLCKSZ);
14                     mdextend(reln, forknum, nextsegno * ((BlockNumber) RELSEG_SIZE) - 1, zerobuf, isTemp);  /* write a zeroed block at the last block position of the current segment before moving on */
15                     pfree(zerobuf);
16                 }
17                 v->mdfd_chain = _mdfd_openseg(reln, forknum, nextsegno, O_CREAT);
18             }else {
19                 /* We won't create segment if not existent */
20                 v->mdfd_chain = _mdfd_openseg(reln, forknum, nextsegno, 0);
21             }
22             if (v->mdfd_chain == NULL) {
23                 if (behavior == EXTENSION_RETURN_NULL &&
24                     FILE_POSSIBLY_DELETED(errno))
25                     return NULL;
26                 ereport(ERROR,
27                         (errcode_for_file_access(),
28                          errmsg("could not open segment %u of relation %s (target block %u): %m",
29                                 nextsegno,
30                                 relpath(reln->smgr_rnode, forknum),
31                                 blkno)));
32             }
33         }
34         v = v->mdfd_chain;
35     }
36     return v;
37 }

　　_mdnblocks函数获取磁盘文件中数据块的数量

 /* _mdnblocks: return the number of whole blocks in a single segment file, computed by
  * seeking to its end; raises ERROR if the seek fails. reln and forknum are used only
  * to build the error message. */
 1 static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) {
 2     off_t        len;
 3     len = FileSeek(seg->mdfd_vfd, 0L, SEEK_END);
 4     if (len < 0)
 5         ereport(ERROR,
 6                 (errcode_for_file_access(),
 7              errmsg("could not seek to end of segment %u of relation %s: %m",
 8                     seg->mdfd_segno, relpath(reln->smgr_rnode, forknum))));
 9     /* note that this calculation will ignore any partial block at EOF */
10     return (BlockNumber) (len / BLCKSZ);
11 }

mdinit为磁盘存储管理器初始化私有状态

　　先调用AllocSetContextCreate函数在TopMemoryContext内存上下文上为磁盘管理模块创建内存上下文，MdCxt是md.c所有函数的内存上下文（static MemoryContext MdCxt）。接下来，如果处于standalone模式（即不在postmaster之下运行），或者是postmaster的bootstrap-mode子进程(a startup or bgwriter process)，则创建pending-operations HASH表。

 /* mdinit: set up the storage manager's private state. Creates the MdCxt memory context
  * under TopMemoryContext and, when the condition below holds, the pending-operations
  * hash table (allocated in MdCxt) together with an empty pendingUnlinks list. */
 1 void mdinit(void) {
 2     MdCxt = AllocSetContextCreate(TopMemoryContext,"MdSmgr",ALLOCSET_DEFAULT_MINSIZE,ALLOCSET_DEFAULT_INITSIZE,ALLOCSET_DEFAULT_MAXSIZE);
 3 
 4     /*
 5      * Create pending-operations hashtable if we need it.  Currently, we need
 6      * it if we are standalone (not under a postmaster) OR if we are a
 7      * bootstrap-mode subprocess of a postmaster (that is, a startup or
 8      * bgwriter process).
 9      */
10     if (!IsUnderPostmaster || IsBootstrapProcessingMode()) {
11         HASHCTL        hash_ctl;
12         MemSet(&hash_ctl, 0, sizeof(hash_ctl));
13         hash_ctl.keysize = sizeof(PendingOperationTag);
14         hash_ctl.entrysize = sizeof(PendingOperationEntry);
15         hash_ctl.hash = tag_hash;
16         hash_ctl.hcxt = MdCxt;
17         pendingOpsTable = hash_create("Pending Ops Table",
18                                       100L,
19                                       &hash_ctl,
20                                    HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
21         pendingUnlinks = NIL;
22     }
23 }

mdexists判断物理文件是否存在

　　对于有待删除的滞留文件，这将返回true(this will return true for lingering files, with pending deletions)。步骤是先关闭文件，以确保我们注意到自打开该文件以来fork是否已取消链接。

/* mdexists: report whether the physical file of the given fork exists, by closing any
 * cached descriptor and trying to reopen it with EXTENSION_RETURN_NULL. */
1 bool
2 mdexists(SMgrRelation reln, ForkNumber forkNum)
3 {
4     /* Close it first, to ensure that we notice if the fork has been unlinked since we opened it. */
5     mdclose(reln, forkNum);
6     return (mdopen(reln, forkNum, EXTENSION_RETURN_NULL) != NULL);
7 }

mdclose关闭指定关系表（如果还没有关闭）

　　步骤是先判别该forknum所指定的表文件是否已经关闭，没有关闭，先置NULL，然后循环进行下面的操作，如果mdfd_vfd即vfd句柄有效就调用FileClose关闭该vfd句柄引用的文件，释放MdfdVec结构，循环处理MdfdVec链表。

 /* mdclose: close every open segment of the given fork and free its MdfdVec chain.
  * Does nothing if the fork is not open. */
 1 void mdclose(SMgrRelation reln, ForkNumber forknum) {
 2     MdfdVec    *v = reln->md_fd[forknum];
 3     /* No work if already closed */
 4     if (v == NULL)
 5         return;
 6     reln->md_fd[forknum] = NULL;    /* prevent dangling pointer after error */
 7     while (v != NULL) {
 8         MdfdVec    *ov = v;
 9         /* if not closed already */
10         if (v->mdfd_vfd >= 0)
11             FileClose(v->mdfd_vfd);
12         /* Now free vector */
13         v = v->mdfd_chain;  /* fetch the successor before ov is freed */
14         pfree(ov);
15     }
16 }

mdcreate在磁盘上创建新的关系表

　　如果isRedo为true，关系表已经存在是没关系的。

 /* mdcreate: create the first segment file of a relation fork on disk and record it in
  * reln->md_fd[forkNum]. If exclusive creation fails and isRedo is set or we are in
  * bootstrap mode, an existing file is opened instead. Raises ERROR (with the errno of
  * the create attempt) when neither succeeds. */
 1 void mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo) {
 2     char       *path;
 3     File        fd;
 4     if (isRedo && reln->md_fd[forkNum] != NULL)
 5         return;    /* created and opened already... */
 6     Assert(reln->md_fd[forkNum] == NULL);
 7 
 8     path = relpath(reln->smgr_rnode, forkNum);
 9     fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
10     if (fd < 0) {
11         int            save_errno = errno;
12 
13         /* During bootstrap, there are cases where a system relation will be accessed (by internal backend processes) before the bootstrap script nominally creates it.  Therefore, allow the file to exist already, even if isRedo is not set.    (See also mdopen) */
14         if (isRedo || IsBootstrapProcessingMode())
15             fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
16         if (fd < 0) {
17             /* be sure to report the error reported by create, not open */
18             errno = save_errno;
19             ereport(ERROR, (errcode_for_file_access(),
20                      errmsg("could not create relation %s: %m", path)));
21         }
22     }
23     pfree(path);
24     reln->md_fd[forkNum] = _fdvec_alloc();
25     reln->md_fd[forkNum]->mdfd_vfd = fd;
26     reln->md_fd[forkNum]->mdfd_segno = 0;
27     reln->md_fd[forkNum]->mdfd_chain = NULL;
28 }

mdread从关系表读取指定数据块

　　主要是调用nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ)函数，将数据写入buffer所指向的缓冲区。如果nbytes不等于BLCKSZ且nbytes大于等于0，是short read情况，表明读取正好处于文件EOF或超过了EOF，或读了在文件EOF处的partial block。通常这是一个错误。上层不应该尝试读不存在的块。如果zero_damaged_pages处于ON或者处于InRecovery，buffer内存应该直接返回全零。

 /* mdread: read block blocknum of the given fork into buffer (BLCKSZ bytes).
  * Raises ERROR on seek or read failure. A short read (0 <= nbytes < BLCKSZ) zero-fills
  * the buffer when zero_damaged_pages or InRecovery is set, and is an ERROR otherwise. */
 1 void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer) {
 2     off_t        seekpos;
 3     int            nbytes;
 4     MdfdVec    *v;
 5     TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum,blocknum,reln->smgr_rnode.spcNode,reln->smgr_rnode.dbNode,reln->smgr_rnode.relNode);
 6 
 7     v = _mdfd_getseg(reln, forknum, blocknum, false, EXTENSION_FAIL);
 8     seekpos = (off_t) BLCKSZ *(blocknum % ((BlockNumber) RELSEG_SIZE));  /* byte offset of the block inside its segment */
 9     Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
10     if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
11         ereport(ERROR,(errcode_for_file_access(),
12                  errmsg("could not seek to block %u of relation %s: %m",
13                         blocknum, relpath(reln->smgr_rnode, forknum))));
14 
15     nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ);
16     TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum,
17                                        reln->smgr_rnode.spcNode,
18                                        reln->smgr_rnode.dbNode,
19                                        reln->smgr_rnode.relNode,
20                                        nbytes,
21                                        BLCKSZ);
22     if (nbytes != BLCKSZ) {
23         if (nbytes < 0)
24             ereport(ERROR,
25                     (errcode_for_file_access(),
26                      errmsg("could not read block %u of relation %s: %m",
27                             blocknum, relpath(reln->smgr_rnode, forknum))));
28         if (zero_damaged_pages || InRecovery)
29             MemSet(buffer, 0, BLCKSZ);
30         else
31             ereport(ERROR,
32                     (errcode(ERRCODE_DATA_CORRUPTED),
33                      errmsg("could not read block %u of relation %s: read only %d of %d bytes",
34                             blocknum, relpath(reln->smgr_rnode, forknum),
35                             nbytes, BLCKSZ)));
36     }
37 }

mdwrite在合适的位置写supplied数据块

　　该函数只用于更新关系表已经存在的块

 /* mdwrite: write buffer (BLCKSZ bytes) to block blocknum of the given fork.
  * The segment is looked up with EXTENSION_FAIL, so no segment is created here.
  * Raises ERROR on seek failure, write failure or short write. Unless isTemp is set,
  * the segment is registered as needing fsync. */
 1 void mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool isTemp) {
 2     off_t        seekpos;
 3     int            nbytes;
 4     MdfdVec    *v;
 5     TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum, reln->smgr_rnode.spcNode, reln->smgr_rnode.dbNode, reln->smgr_rnode.relNode);
 6        v = _mdfd_getseg(reln, forknum, blocknum, isTemp, EXTENSION_FAIL);
 7     seekpos = (off_t) BLCKSZ *(blocknum % ((BlockNumber) RELSEG_SIZE));
 8     if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
 9         ereport(ERROR,
10                 (errcode_for_file_access(),
11                  errmsg("could not seek to block %u of relation %s: %m",
12                         blocknum, relpath(reln->smgr_rnode, forknum))));
13     nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ);  
14     TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum, reln->smgr_rnode.spcNode, reln->smgr_rnode.dbNode, reln->smgr_rnode.relNode, nbytes, BLCKSZ);
15     if (nbytes != BLCKSZ) {
16         if (nbytes < 0)
17             ereport(ERROR,
18                     (errcode_for_file_access(),
19                      errmsg("could not write block %u of relation %s: %m",
20                             blocknum, relpath(reln->smgr_rnode, forknum))));
21         /* short write: complain appropriately */
22         ereport(ERROR,
23                 (errcode(ERRCODE_DISK_FULL),
24                  errmsg("could not write block %u of relation %s: wrote only %d of %d bytes",
25                         blocknum,
26                         relpath(reln->smgr_rnode, forknum),
27                         nbytes, BLCKSZ),
28                  errhint("Check free disk space.")));
29     }
30 
31     if (!isTemp)
32         register_dirty_segment(reln, forknum, v);
33 }
34

mdprefetch函数发起对指定块的异步读

 /* mdprefetch: ask for block blocknum of the given fork to be prefetched via FilePrefetch.
  * The body is compiled only when USE_PREFETCH is defined; otherwise the function is empty.
  * The result of FilePrefetch is deliberately discarded. */
 1 void mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) {
 2 #ifdef USE_PREFETCH
 3     off_t        seekpos;
 4     MdfdVec    *v;
 5     v = _mdfd_getseg(reln, forknum, blocknum, false, EXTENSION_FAIL);
 6     seekpos = (off_t) BLCKSZ *(blocknum % ((BlockNumber) RELSEG_SIZE));
 7     Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
 8     (void) FilePrefetch(v->mdfd_vfd, seekpos, BLCKSZ);
 9 #endif   /* USE_PREFETCH */
10 }

mdextend函数将数据块加入指定关系表

 /* mdextend: write buffer (BLCKSZ bytes) as block blocknum of the given fork, creating
  * segment files as needed (the segment is looked up with EXTENSION_CREATE).
  * Raises ERROR if blocknum is InvalidBlockNumber, or on seek failure, write failure or
  * short write. Unless isTemp is set, the segment is registered as needing fsync. */
 1 void mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool isTemp) {
 2     off_t        seekpos;
 3     int            nbytes;
 4     MdfdVec    *v;
 5     /* This assert is too expensive to have on normally ... */
 6 #ifdef CHECK_WRITE_VS_EXTEND
 7     Assert(blocknum >= mdnblocks(reln, forknum));
 8 #endif
 9 
10     /* If a relation manages to grow to 2^32-1 blocks, refuse to extend it any more --- we mustn't create a block whose number actually is InvalidBlockNumber. */
11     if (blocknum == InvalidBlockNumber)
12         ereport(ERROR,
13                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
14                  errmsg("cannot extend relation %s beyond %u blocks",
15                         relpath(reln->smgr_rnode, forknum),
16                         InvalidBlockNumber)));
17 
18     v = _mdfd_getseg(reln, forknum, blocknum, isTemp, EXTENSION_CREATE);
19     seekpos = (off_t) BLCKSZ *(blocknum % ((BlockNumber) RELSEG_SIZE));  /* byte offset of the block inside its segment */
20     Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
21     if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
22         ereport(ERROR,
23                 (errcode_for_file_access(),
24                  errmsg("could not seek to block %u of relation %s: %m",
25                         blocknum,
26                         relpath(reln->smgr_rnode, forknum))));
27     if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ){
28         if (nbytes < 0)
29             ereport(ERROR,
30                     (errcode_for_file_access(),
31                      errmsg("could not extend relation %s: %m",
32                             relpath(reln->smgr_rnode, forknum)),
33                      errhint("Check free disk space.")));
34         /* short write: complain appropriately */
35         ereport(ERROR,
36                 (errcode(ERRCODE_DISK_FULL),
37                  errmsg("could not extend relation %s: wrote only %d of %d bytes at block %u",
38                         relpath(reln->smgr_rnode, forknum),
39                         nbytes, BLCKSZ, blocknum),
40                  errhint("Check free disk space.")));
41     }
42     if (!isTemp)
43         register_dirty_segment(reln, forknum, v);
44 
45     Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
46 }

mdtruncate将关系表截断为指定数量的数据块

 /* mdtruncate: shrink the given fork to nblocks blocks.
  * A request larger than the current size is silently ignored when InRecovery is set and
  * is an ERROR otherwise; a request equal to the current size is a no-op.
  * Segments wholly beyond the new end are truncated to zero length (not deleted) and their
  * MdfdVec entries freed; the segment containing the new end is truncated to the right length.
  * Unless isTemp is set, every truncated segment is registered as needing fsync. */
 1 void mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks,  bool isTemp) {
 2     MdfdVec    *v;
 3     BlockNumber curnblk;
 4     BlockNumber priorblocks;
 5     /* NOTE: mdnblocks makes sure we have opened all active segments, so that truncation loop will get them all! */
 6     curnblk = mdnblocks(reln, forknum);
 7     if (nblocks > curnblk) {
 8         /* Bogus request ... but no complaint if InRecovery */
 9         if (InRecovery)
10             return;
11         ereport(ERROR,
12                 (errmsg("could not truncate relation %s to %u blocks: it's only %u blocks now",
13                         relpath(reln->smgr_rnode, forknum),
14                         nblocks, curnblk)));
15     }
16     if (nblocks == curnblk)
17         return;                    /* no work */
18 
19     v = mdopen(reln, forknum, EXTENSION_FAIL);
20     priorblocks = 0;
21     while (v != NULL){
22         MdfdVec    *ov = v;
23         if (priorblocks > nblocks){
24             /* This segment is no longer active (and has already been unlinked from the mdfd_chain). We truncate the file, but do not delete it, for reasons explained in the header comments. */
25             if (FileTruncate(v->mdfd_vfd, 0) < 0)
26                 ereport(ERROR,
27                         (errcode_for_file_access(),
28                     errmsg("could not truncate relation %s to %u blocks: %m",
29                            relpath(reln->smgr_rnode, forknum),
30                            nblocks)));
31             if (!isTemp)
32                 register_dirty_segment(reln, forknum, v);
33             v = v->mdfd_chain;
34             Assert(ov != reln->md_fd[forknum]); /* we never drop the 1st
35                                                  * segment */
36             pfree(ov);
37         } else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks){
38             /* This is the last segment we want to keep. Truncate the file to the right length, and clear chain link that points to any remaining segments (which we shall zap). NOTE: if nblocks is exactly a multiple K of RELSEG_SIZE, we will truncate the K+1st segment to 0 length but keep it. This adheres to the invariant given in the header comments. */
39             BlockNumber lastsegblocks = nblocks - priorblocks;
40             if (FileTruncate(v->mdfd_vfd, (off_t) lastsegblocks * BLCKSZ) < 0)
41                 ereport(ERROR,
42                         (errcode_for_file_access(),
43                     errmsg("could not truncate relation %s to %u blocks: %m",
44                            relpath(reln->smgr_rnode, forknum),
45                            nblocks)));
46             if (!isTemp)
47                 register_dirty_segment(reln, forknum, v);
48             v = v->mdfd_chain;
49             ov->mdfd_chain = NULL;
50         }else{
51             /* We still need this segment and 0 or more blocks beyond it, so nothing to do here. */
52             v = v->mdfd_chain;
53         }
54         priorblocks += RELSEG_SIZE;  /* number of blocks covered by the segments visited so far */
55     }
56 }

mdnblocks获取存储在关系表的数据块的数量

 /* mdnblocks: return the total number of blocks stored in the given fork.
  * Skips to the last segment already in the chain, then keeps advancing while the current
  * segment holds exactly RELSEG_SIZE blocks, opening (with O_CREAT) the next segment when
  * it is not yet in the chain. Raises FATAL if a segment exceeds RELSEG_SIZE blocks and
  * ERROR if a following segment cannot be opened. */
 1 BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum) {
 2     MdfdVec    *v = mdopen(reln, forknum, EXTENSION_FAIL);
 3     BlockNumber nblocks;
 4     BlockNumber segno = 0;
 5     while (v->mdfd_chain != NULL) {
 6         segno++;
 7         v = v->mdfd_chain;
 8     }
 9     for (;;)
10     {
11         nblocks = _mdnblocks(reln, forknum, v);
12         if (nblocks > ((BlockNumber) RELSEG_SIZE))
13             elog(FATAL, "segment too big");
14         if (nblocks < ((BlockNumber) RELSEG_SIZE))
15             return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks;
16         /* If segment is exactly RELSEG_SIZE, advance to next one. */
17         segno++;
18 
19         if (v->mdfd_chain == NULL) {
20             /* Because we pass O_CREAT, we will create the next segment (with zero length) immediately, if the last segment is of length RELSEG_SIZE.  While perhaps not strictly necessary, this keeps the logic simple. */
21             v->mdfd_chain = _mdfd_openseg(reln, forknum, segno, O_CREAT);
22             if (v->mdfd_chain == NULL)
23                 ereport(ERROR,
24                         (errcode_for_file_access(),
25                        errmsg("could not open segment %u of relation %s: %m",
26                               segno,
27                               relpath(reln->smgr_rnode, forknum))));
28         }
29         v = v->mdfd_chain;
30     }
31 }

mdunlink取消链接关系表

　　事实上，不会立即unlink关系表的第一个segment文件，而是将其长度truncate为零，并登记一个在下次checkpoint之后再unlink它的请求。其余segment文件可以立即unlink。留下空文件是为了防止relfilenode在下次checkpoint之前被复用，从而避免如下场景造成数据丢失：我们删除一个关系表（We delete a relation (and commit, and actually remove its file)），随后创建新关系表（We create a new relation, which by chance gets the same relfilenode as the just-deleted one (OIDs must've wrapped around for that to happen)），并在下一个checkpoint发生前崩溃。

　　在replay过程中，删除文件然后再次创建该文件(which is fine if the contents of the file were repopulated by subsequent WAL entries)。But if we didn't WAL-log insertions, but instead relied on fsyncing the file after populating it (as for instance CLUSTER and CREATE INDEX do), the contents of the file would be lost forever. By leaving the empty file until after the next checkpoint, we prevent reassignment of the relfilenode number until it's safe, because relfilenode assignment skips over any existing file.如果isRedo为true，关系表已经删除是可以的。我们应该删除该文件而不是之后发出请求，因为在redo过程中，不可能创建冲突的关系表。

 /* mdunlink: remove the files of a relation fork.
  * The first segment is unlinked outright when isRedo is set or the fork is not the main
  * fork; otherwise it is only truncated to zero length and a deferred unlink is registered.
  * If that step succeeds, segments 1, 2, ... are unlinked until an unlink fails.
  * Failures are reported as WARNING, except a missing first segment (ENOENT) during redo
  * and the expected ENOENT after the last segment, which are silent. */
 1 void mdunlink(RelFileNode rnode, ForkNumber forkNum, bool isRedo)
 2 {
 3     char       *path;
 4     int    ret;
 5     /* We have to clean out any pending fsync requests for the doomed relation, else the next mdsync() will fail. */ // drop the pending sync requests registered for this relation
 6     ForgetRelationFsyncRequests(rnode, forkNum);
 7     path = relpath(rnode, forkNum);
 8     /* Delete or truncate the first segment. */
 9     if (isRedo || forkNum != MAIN_FORKNUM)
10         ret = unlink(path);
11     else{
12         /* truncate(2) would be easier here, but Windows hasn't got it */
13         int            fd;
14         fd = BasicOpenFile(path, O_RDWR | PG_BINARY, 0);
15         if (fd >= 0){
16             int            save_errno;
17             ret = ftruncate(fd, 0);
18             save_errno = errno;  /* preserve ftruncate's errno across close() */
19             close(fd);
20             errno = save_errno;
21         }
22         else
23             ret = -1;
24     }
25     if (ret < 0) {
26         if (!isRedo || errno != ENOENT)
27             ereport(WARNING,
28                     (errcode_for_file_access(),
29                      errmsg("could not remove relation %s: %m", path)));
30     }
31         /* Delete any additional segments. */
32     else {
33         char       *segpath = (char *) palloc(strlen(path) + 12);
34         BlockNumber segno;
35         /*Note that because we loop until getting ENOENT, we will correctly remove all inactive segments as well as active ones.*/
36         for (segno = 1;; segno++){
37             sprintf(segpath, "%s.%u", path, segno);
38             if (unlink(segpath) < 0){
39                 /* ENOENT is expected after the last segment... */
40                 if (errno != ENOENT)
41                     ereport(WARNING,
42                             (errcode_for_file_access(),
43                      errmsg("could not remove segment %u of relation %s: %m",
44                             segno, path)));
45                 break;
46             }
47         }
48         pfree(segpath);
49     }
50     pfree(path);
51     /* Register request to unlink first segment later */
52     if (!isRedo && forkNum == MAIN_FORKNUM)
53         register_unlink(rnode);
54 }

和BgWriter后台写进程相关的函数

在某些情况下（当前是standalone backend和bgwriter进程），我们跟踪未完成的fsync操作：我们需要记住自上一个检查点以来已写入的所有关系表段relation segments，以便在完成下一个检查点之前将它们向下同步到磁盘。该哈希表记住待处理的操作。我们主要将哈希表用作消除重复请求的便捷方法。

我们使用类似的机制来记住不再需要的文件，这些文件可以在下一个检查点之后删除，但是我们使用链接列表而不是哈希表，因为我们不希望有任何重复的请求。

（常规后端regular backend不会在本地跟踪待处理的操作，而是将其转发给bgwriter。）

SetForwardFsyncRequests

在archive recovery下，依靠bgwriter执行fsyncs，但是我们已经在startup过程中已经创建了pendingOpsTable了，需要调用该函数丢弃local pendingOpsTable，这样subsequent请求才会推送给bgwriter。

/* SetForwardFsyncRequests: run mdsync() for anything queued in the local pendingOpsTable
 * (if there is one), then clear the pendingOpsTable pointer so that later requests take
 * the forwarding path instead of the local one. */
1 void SetForwardFsyncRequests(void) {
2     /* Perform any pending ops we may have queued up */
3     if (pendingOpsTable)
4         mdsync();
5     pendingOpsTable = NULL;
6 }

ForgetRelationFsyncRequests

　　forget关系表的任何fsyncs：如果pendingOpsTable不为NULL，调用RememberFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC)；否则如果是在postmaster之下运行，则通知bgwriter，如果向队列中加入请求消息失败，sleep后再次尝试。

 /* ForgetRelationFsyncRequests: cancel pending fsync requests for one relation fork.
  * Handled locally when a pendingOpsTable exists; otherwise, under a postmaster, the
  * request is forwarded, retrying every 10 ms until it is accepted. */
 1 void ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum) {
 2     if (pendingOpsTable) {
 3         /* standalone backend or startup process: fsync state is local */
 4         RememberFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC);
 5     }else if (IsUnderPostmaster){
 6         while (!ForwardFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC))
 7             pg_usleep(10000L);    /* 10 msec seems a good number */
 8 
 9         /* Note we don't wait for the bgwriter to actually absorb the revoke message; see mdsync() for the implications. */
10     }
11 }

RememberFsyncRequest

　　bgwriter处理fsync请求的回调函数。在bgwriter下次检查点到来前，将大多数fsync请求放入本地哈希表。UNLINK请求放入链表。

FORGET_RELATION_FSYNC means to cancel pending fsyncs for a relation

FORGET_DATABASE_FSYNC means to cancel pending fsyncs for a whole database

UNLINK_RELATION_REQUEST is a request to delete the file after the next checkpoint.

/*
 * RememberFsyncRequest -- record or cancel a pending operation locally.
 *
 * Requires pendingOpsTable to exist. The segno argument doubles as a request code:
 *   FORGET_RELATION_FSYNC   - mark every queued fsync entry of (rnode, forknum) canceled
 *   FORGET_DATABASE_FSYNC   - mark every queued fsync entry of rnode.dbNode canceled and
 *                             delete that database's entries from pendingUnlinks
 *   UNLINK_RELATION_REQUEST - append an unlink request for rnode to pendingUnlinks
 *   anything else           - a real segment number: enter an fsync request for it
 */
void
RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
{
    Assert(pendingOpsTable);

    if (segno == FORGET_RELATION_FSYNC)
    {
        /* Remove any pending requests for the entire relation */
        HASH_SEQ_STATUS hstat;
        PendingOperationEntry *entry;

        hash_seq_init(&hstat, pendingOpsTable);
        while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
        {
            if (RelFileNodeEquals(entry->tag.rnode, rnode) &&
                entry->tag.forknum == forknum)
            {
                /* Okay, cancel this entry */
                entry->canceled = true;
            }
        }
    }
    else if (segno == FORGET_DATABASE_FSYNC)
    {
        /* Remove any pending requests for the entire database */
        HASH_SEQ_STATUS hstat;
        PendingOperationEntry *entry;
        ListCell   *cell,
                   *prev,
                   *next;

        /* Remove fsync requests */
        hash_seq_init(&hstat, pendingOpsTable);
        while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
        {
            if (entry->tag.rnode.dbNode == rnode.dbNode)
            {
                /* Okay, cancel this entry */
                entry->canceled = true;
            }
        }

        /* Remove unlink requests */
        prev = NULL;
        for (cell = list_head(pendingUnlinks); cell; cell = next)
        {
            PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);

            /* fetch the successor first: the current cell may be deleted below */
            next = lnext(cell);
            if (entry->rnode.dbNode == rnode.dbNode)
            {
                pendingUnlinks = list_delete_cell(pendingUnlinks, cell, prev);
                pfree(entry);
            }
            else
                prev = cell;
        }
    }
    else if (segno == UNLINK_RELATION_REQUEST)
    {
        /* Unlink request: put it in the linked list */
        /* allocate the entry and the list cell in MdCxt, then restore the caller's context */
        MemoryContext oldcxt = MemoryContextSwitchTo(MdCxt);
        PendingUnlinkEntry *entry;

        entry = palloc(sizeof(PendingUnlinkEntry));
        entry->rnode = rnode;
        entry->cycle_ctr = mdckpt_cycle_ctr;

        pendingUnlinks = lappend(pendingUnlinks, entry);

        MemoryContextSwitchTo(oldcxt);
    }
    else
    {
        /* Normal case: enter a request to fsync this segment */
        PendingOperationTag key;
        PendingOperationEntry *entry;
        bool        found;

        /* ensure any pad bytes in the hash key are zeroed */
        MemSet(&key, 0, sizeof(key));
        key.rnode = rnode;
        key.forknum = forknum;
        key.segno = segno;

        entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
                                                      &key,
                                                      HASH_ENTER,
                                                      &found);
        /* if new or previously canceled entry, initialize it */
        if (!found || entry->canceled)
        {
            entry->canceled = false;
            entry->cycle_ctr = mdsync_cycle_ctr;
        }
    }
}

ForgetDatabaseFsyncRequests

　　取消指定数据库的所有待处理fsync请求和unlink请求

 /* ForgetDatabaseFsyncRequests: issue a FORGET_DATABASE_FSYNC request for database dbid.
  * Only the dbNode field of the RelFileNode carries information; the other fields are zero.
  * Handled locally when a pendingOpsTable exists; otherwise, under a postmaster, the
  * request is forwarded, retrying every 10 ms until it is accepted. */
 1 void ForgetDatabaseFsyncRequests(Oid dbid) {
 2     RelFileNode rnode;
 3     rnode.dbNode = dbid;
 4     rnode.spcNode = 0;
 5     rnode.relNode = 0;
 6     if (pendingOpsTable){
 7         /* standalone backend or startup process: fsync state is local */
 8         RememberFsyncRequest(rnode, InvalidForkNumber, FORGET_DATABASE_FSYNC);
 9     }else if (IsUnderPostmaster){
10         /* see notes in ForgetRelationFsyncRequests */
11         while (!ForwardFsyncRequest(rnode, InvalidForkNumber,
12                                     FORGET_DATABASE_FSYNC))
13             pg_usleep(10000L);    /* 10 msec seems a good number */
14     }
15 }

register_dirty_segment

　　将关系表段标记为需要fsync：如果有pendingOpsTable，就向其中添加一个条目，以供mdsync后续处理；否则，转交给background writer进程。如果转交失败，就直接在返回前进行fsync操作。

 /* register_dirty_segment: mark a segment as needing fsync.
  * With a local pendingOpsTable the request is queued there. Otherwise it is forwarded;
  * if forwarding fails, the segment is fsync'ed right here, raising ERROR on failure. */
 1 static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) {
 2     if (pendingOpsTable) {
 3         /* push it into local pending-ops table */
 4         RememberFsyncRequest(reln->smgr_rnode, forknum, seg->mdfd_segno);
 5     }else{
 6         if (ForwardFsyncRequest(reln->smgr_rnode, forknum, seg->mdfd_segno))
 7             return;                /* passed it off successfully */
 8 
 9         if (FileSync(seg->mdfd_vfd) < 0)
10             ereport(ERROR,
11                     (errcode_for_file_access(),
12                      errmsg("could not fsync segment %u of relation %s: %m",
13                             seg->mdfd_segno,
14                             relpath(reln->smgr_rnode, forknum))));
15     }
16 }

register_unlink

调度一个文件，使其在下一个检查点完成之后被删除（文件不能立即删除，实际的删除动作由mdpostckpt完成）

 1 static void register_unlink(RelFileNode rnode) {
 2     if (pendingOpsTable) {
 3         /* push it into local pending-ops table */
 4         RememberFsyncRequest(rnode, MAIN_FORKNUM, UNLINK_RELATION_REQUEST);
 5     } else {
 6         /* Notify the bgwriter about it.  If we fail to queue the request message, we have to sleep and try again, because we can't simply delete the file now.  Ugly, but hopefully won't happen often. XXX should we just leave the file orphaned instead? */
 7         Assert(IsUnderPostmaster);
 8         while (!ForwardFsyncRequest(rnode, MAIN_FORKNUM,
 9                                     UNLINK_RELATION_REQUEST))
10             pg_usleep(10000L);    /* 10 msec seems a good number */
11     }
12 }

mdsync：将此前的写操作同步到稳定存储（stable storage）

　　该函数只能在checkpoints过程中调用，并且该进程已经创建了pendingOpsTable。很重要的函数，后续在bgwriter节详细讲解。

 1 void mdsync(void) {
 2     static bool mdsync_in_progress = false;
 3     HASH_SEQ_STATUS hstat;
 4     PendingOperationEntry *entry;
 5     int            absorb_counter;
 6     if (!pendingOpsTable)
 7         elog(ERROR, "cannot sync without a pendingOpsTable");
 8     AbsorbFsyncRequests();
 9     if (mdsync_in_progress)
10     {
11         /* prior try failed, so update any stale cycle_ctr values */
12         hash_seq_init(&hstat, pendingOpsTable);
13         while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
14         {
15             entry->cycle_ctr = mdsync_cycle_ctr;
16         }
17     }
18     mdsync_cycle_ctr++;
19     mdsync_in_progress = true;
20     absorb_counter = FSYNCS_PER_ABSORB;
21     hash_seq_init(&hstat, pendingOpsTable);
22     while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL){
23         if (entry->cycle_ctr == mdsync_cycle_ctr)
24             continue;
25         if (enableFsync && !entry->canceled)
26         {
27             int            failures;
28             if (--absorb_counter <= 0)
29             {
30                 AbsorbFsyncRequests();
31                 absorb_counter = FSYNCS_PER_ABSORB;
32             }
33             for (failures = 0;; failures++)        /* loop exits at "break" */
34             {
35                 SMgrRelation reln;
36                 MdfdVec    *seg;
37                 char       *path;
38             reln = smgropen(entry->tag.rnode);
39                 seg = _mdfd_getseg(reln, entry->tag.forknum,
40                               entry->tag.segno * ((BlockNumber) RELSEG_SIZE),
41                                    false, EXTENSION_RETURN_NULL);
42                 if (seg != NULL &&
43                     FileSync(seg->mdfd_vfd) >= 0)
44                     break;        /* success; break out of retry loop */
45                 path = relpath(entry->tag.rnode, entry->tag.forknum);
46                 if (!FILE_POSSIBLY_DELETED(errno) ||
47                     failures > 0)
48                     ereport(ERROR,
49                             (errcode_for_file_access(),
50                       errmsg("could not fsync segment %u of relation %s: %m",
51                              entry->tag.segno, path)));
52                 else
53                     ereport(DEBUG1,
54                             (errcode_for_file_access(),
55                              errmsg("could not fsync segment %u of relation %s but retrying: %m",
56                                     entry->tag.segno, path)));
57                 pfree(path);
58                 AbsorbFsyncRequests();
59                 absorb_counter = FSYNCS_PER_ABSORB;        /* might as well... */
60                 if (entry->canceled)
61                     break;
62             }                    /* end retry loop */
63         }
64 
65         if (hash_search(pendingOpsTable, &entry->tag,
66                         HASH_REMOVE, NULL) == NULL)
67             elog(ERROR, "pendingOpsTable corrupted");
68     }                            /* end loop over hashtable entries */
69     /* Flag successful completion of mdsync */
70     mdsync_in_progress = false;
71 }

mdimmedsync

　　将关系表立刻同步到存储中

 1 void mdimmedsync(SMgrRelation reln, ForkNumber forknum) {
 2     MdfdVec    *v;
 3     BlockNumber curnblk;
 4     /* NOTE: mdnblocks makes sure we have opened all active segments, so that fsync loop will get them all! */
 5     curnblk = mdnblocks(reln, forknum);
 6     v = mdopen(reln, forknum, EXTENSION_FAIL);
 7     while (v != NULL){
 8         if (FileSync(v->mdfd_vfd) < 0)
 9             ereport(ERROR,
10                     (errcode_for_file_access(),
11                      errmsg("could not fsync segment %u of relation %s: %m",
12                             v->mdfd_segno,
13                             relpath(reln->smgr_rnode, forknum))));
14         v = v->mdfd_chain;
15     }
16 }

和检查点相关函数

mdpreckpt函数完成检查点之前的准备工作。为了区分在本次检查点开始之前到达的unlink（删除文件）请求与在检查点进行期间到达的请求，这里使用了一个与fsync请求类似的循环计数器（cycle counter），并在本函数中将其递增。本函数必须在确定检查点的重做点（redo point）之前调用，这样才能保证不会过早地删除文件。注意：在这里不能做任何以“本次检查点一定会完成”为前提的事情。

 1 void mdpreckpt(void) {
 2     ListCell   *cell;
 3     /* In case the prior checkpoint wasn't completed, stamp all entries in the list with the current cycle counter.  Anything that's in the list at the start of checkpoint can surely be deleted after the checkpoint is finished, regardless of when the request was made.  */
 4     foreach(cell, pendingUnlinks) {
 5         PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);
 6 
 7         entry->cycle_ctr = mdckpt_cycle_ctr;
 8     }
 9 
10     /* Any unlink requests arriving after this point will be assigned the next cycle counter, and won't be unlinked until next checkpoint. */
11     mdckpt_cycle_ctr++;
12 }

mdpostckpt函数完成检查点之后的处理工作：删除此前登记的、尚未删除的遗留（lingering）文件

 1 void mdpostckpt(void) {
 2     while (pendingUnlinks != NIL) {
 3         PendingUnlinkEntry *entry = (PendingUnlinkEntry *) linitial(pendingUnlinks);
 4         char       *path;
 5         /* New entries are appended to the end, so if the entry is new we've reached the end of old entries. */
 6         if (entry->cycle_ctr == mdckpt_cycle_ctr)
 7             break;
 8         /* Else assert we haven't missed it */
 9         Assert((CycleCtr) (entry->cycle_ctr + 1) == mdckpt_cycle_ctr);
10         /* Unlink the file */
11         path = relpath(entry->rnode, MAIN_FORKNUM);
12         if (unlink(path) < 0) {
13             /* There's a race condition, when the database is dropped at the same time that we process the pending unlink requests. If the DROP DATABASE deletes the file before we do, we will get ENOENT here. rmtree() also has to ignore ENOENT errors, to deal with the possibility that we delete the file first. */
14             if (errno != ENOENT)
15                 ereport(WARNING,
16                         (errcode_for_file_access(),
17                          errmsg("could not remove relation %s: %m", path)));
18         }
19         pfree(path);
20         pendingUnlinks = list_delete_first(pendingUnlinks);
21         pfree(entry);
22     }
23 }

主要数据结构

smgr.c维护了一张存储SMgrRelation对象的表，这些对象本质上是缓存的文件句柄。SMgrRelation由smgropen()创建，由smgrclose()销毁。这两个操作都不涉及I/O，它们只是创建或销毁一个哈希表（hashtable）条目。

typedef SMgrRelationData *SMgrRelation;

 1 typedef struct SMgrRelationData
 2 {
 3     /* rnode is the hashtable lookup key, so it must be first! */
 4     RelFileNode smgr_rnode;        /* relation physical identifier */
 5 
 6     /* pointer to owning pointer, or NULL if none */
 7     struct SMgrRelationData **smgr_owner;
 8 
 9     /* additional public fields may someday exist here */
10 
11     /*
12      * Fields below here are intended to be private to smgr.c and its
13      * submodules.    Do not touch them from elsewhere.
14      */
15     int            smgr_which;        /* storage manager selector */
16 
17     /* for md.c; NULL for forks that are not open */
18     struct _MdfdVec *md_fd[MAX_FORKNUM + 1];
19 } SMgrRelationData;

posted @ 2020-12-09 12:58 肥叔菌阅读(1192) 评论(0) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部

肥叔菌

PG存储介质管理器（SMGR）——磁盘管理器：postgresql-8.4.1/src/backend/storage/md

API

和BgWriter后台写进程相关的函数

和检查点相关函数

主要数据结构

公告