page cache 与free

我们经常用free查看服务器的内存使用情况，而free中的输出却有些让人困惑，如下：

先看看各个数字的意义以及如何计算得到：

free命令输出的第二行(Mem)：这行分别显示了物理内存的总量(total)、已使用的(used)、空闲的(free)、共享的(shared)、buffers(buffer的大小)、cached(cache的大小)。其中total、free、buffers、cached这几个字段是从/proc/meminfo中获取的，而used = total - free。shared列已经过时，忽略(见参考)。

free命令输出的第三行(-/+ buffers/cache)：

它显示的第一个值(used)：210236，这个值表示系统本身使用的内存总量，即除去buffer/cache，等于Mem行used列 - Mem行buffers列 - Mem行cached列。

它显示的第二个值(free)：814956，这个值表示系统当前可用内存，它等于Mem行total列 - 本行(-/+ buffers/cache)的used列，也等于Mem行free列 + Mem行buffers列 + Mem行cached列。

free命令输出的第四行(Swap) 这行显示交换内存的总量、已使用量、空闲量。

我们都知道free是从/proc/meminfo中读取相关的数据的。

下面是/proc/meminfo的实现：

/*
 * Excerpt of the /proc/meminfo read handler: samples system memory
 * statistics and formats them into `page` as "Tag:  value kB" lines.
 *
 * The "......" lines mark parts of the function that this excerpt omits
 * (the remaining format lines/arguments and the tail of the function), so
 * the use of start/off/count/eof/data, committed, allowed and vmi is not
 * visible here.
 */
static int meminfo_read_proc(char *page, char **start, off_t off,
                 int count, int *eof, void *data)
{
    struct sysinfo i;
    int len;
    unsigned long committed;
    unsigned long allowed;
    struct vmalloc_info vmi;
    long cached;    /* signed, so a negative difference can be clamped below */

/*
 * display in kilobytes.
 * K() converts a count of pages into kB by shifting left by
 * (PAGE_SHIFT - 10).
 */
#define K(x) ((x) << (PAGE_SHIFT - 10))
    si_meminfo(&i);
    si_swapinfo(&i);
    committed = atomic_read(&vm_committed_space);
    allowed = ((totalram_pages - hugetlb_total_pages())
        * sysctl_overcommit_ratio / 100) + total_swap_pages;

    /*
     * "Cached" is the NR_FILE_PAGES counter minus the swap-cache pages and
     * minus the block-device pages already reported as "Buffers"
     * (i.bufferram). The three values are read separately, so the
     * difference is clamped at zero.
     */
    cached = global_page_state(NR_FILE_PAGES) -
            total_swapcache_pages - i.bufferram;
    if (cached < 0)
        cached = 0;

    get_vmalloc_info(&vmi);

    /*
     * Tagged format, for easy grepping and expansion.
     */
    len = sprintf(page,
        "MemTotal:     %8lu kB\n"
        "MemFree:      %8lu kB\n"
        "Buffers:      %8lu kB\n"
        "Cached:       %8lu kB\n"
        "SwapCached:   %8lu kB\n"

        ......

        K(i.totalram),
        K(i.freeram),
        K(i.bufferram),
        K(cached),
        K(total_swapcache_pages),
        
        ......

#undef K
}     



/*
 * System statistics record. In this file si_meminfo() fills totalram,
 * sharedram, freeram, bufferram, totalhigh and freehigh with page counts
 * and sets mem_unit to PAGE_SIZE; meminfo_read_proc() then converts those
 * page counts to kB for output.
 */
struct sysinfo {
    long uptime;            /* Seconds since boot */
    unsigned long loads[3];        /* 1, 5, and 15 minute load averages */
    unsigned long totalram;        /* Total usable main memory size */
    unsigned long freeram;        /* Available memory size */
    unsigned long sharedram;    /* Amount of shared memory */
    unsigned long bufferram;    /* Memory used by buffers */
    unsigned long totalswap;    /* Total swap space size */
    unsigned long freeswap;        /* swap space still available */
    unsigned short procs;        /* Number of current processes */
    unsigned short pad;        /* explicit padding for m68k */
    unsigned long totalhigh;    /* Total high memory size */
    unsigned long freehigh;        /* Available high memory size */
    unsigned int mem_unit;        /* Memory unit size in bytes */
    char _f[20-2*sizeof(long)-sizeof(int)];    /* Padding: libc5 uses this.. */
};

图中，Buffers对应sysinfo.bufferram，内核中以页框为单位，通过宏K转化成以KB为单位输出。

/*
 * Fill the memory-related fields of *val.
 *
 * All sizes are stored as page counts; mem_unit is set to PAGE_SIZE so a
 * consumer can convert them to bytes. sharedram is always reported as 0.
 */
void si_meminfo(struct sysinfo *val)
{
    unsigned long free_pages;
    unsigned long blockdev_pages;

    /* total number of RAM pages */
    val->totalram = totalram_pages;
    val->sharedram = 0;

    /* pages currently on the free lists */
    free_pages = global_page_state(NR_FREE_PAGES);
    val->freeram = free_pages;

    /* pages held by the block devices' own mappings ("Buffers") */
    blockdev_pages = nr_blockdev_pages();
    val->bufferram = blockdev_pages;

    /* high-memory totals */
    val->totalhigh = totalhigh_pages;
    val->freehigh = nr_free_highpages();

    val->mem_unit = PAGE_SIZE;
}

/*
 * Return the number of pages held in the mappings of all block devices.
 *
 * Walks the all_bdevs list under bdev_lock and sums nrpages of each
 * device inode's mapping. Pages of regular files are not included,
 * because only block-device inodes are visited.
 */
long nr_blockdev_pages(void)
{
    long total = 0;
    struct block_device *dev;

    spin_lock(&bdev_lock);
    list_for_each_entry(dev, &all_bdevs, bd_list) {
        struct address_space *mapping = dev->bd_inode->i_mapping;

        total += mapping->nrpages;
    }
    spin_unlock(&bdev_lock);

    return total;
}

nr_blockdev_pages计算块设备使用的页框数：它遍历所有块设备，将各块设备使用的页框数相加，其中不包含普通文件使用的页框数。

cached = global_page_state(NR_FILE_PAGES) - total_swapcache_pages - i.bufferram;

/*
 * Read the global VM statistics counter selected by `item`.
 *
 * The counter is read as a signed long. On CONFIG_SMP builds a negative
 * reading is reported as 0 (presumably the counter can dip below zero
 * transiently there — TODO confirm against the counter update code).
 */
static inline unsigned long global_page_state(enum zone_stat_item item)
{
    long count = atomic_long_read(&vm_stat[item]);

#ifdef CONFIG_SMP
    return count < 0 ? 0 : count;
#else
    return count;
#endif
}

Cached的大小为内核总的page cache(NR_FILE_PAGES)减去swap cache和块设备占用的页框数量，也就是说，Cached即为普通文件占用的page cache。在较早的2.6内核中，函数add_to_page_cache和__add_to_swap_cache都会通过调用pagecache_acct对内核变量nr_pagecache进行累加；在上面代码所对应的版本中，这一计数由NR_FILE_PAGES统计项承担。前者对应page cache，内核读块设备和普通文件时使用；后者对应swap cache，内核读交换分区时使用。

Page cache(页面缓存)

在Linux系统中，为了加快文件的读写，内核提供了页面缓存(page cache)。为了加快对块设备的读写，内核中还提供了buffer cache作为缓存。在2.4内核中，这两者是分开的。这样就造成了双缓冲，因为文件读写最后还是转化为对块设备的读写。在2.6中，buffer cache合并到page cache中，对应的页面叫作buffer page。当进行文件读写时，如果文件在磁盘上的存储块是连续的，那么文件在page cache中对应的页是普通的page；如果文件在磁盘上的数据块是不连续的，或者是设备文件，那么文件在page cache中对应的页是buffer page。buffer page与普通的page相比，每个页多了几个buffer_head结构体(个数视块的大小而定)。此外，如果对单独的块（如超级块）直接进行读写，对应的page cache中的页也是buffer page。这两种页面虽然形式略有不同，但是最终它们的数据都会被封装成bio结构体，提交到通用块设备驱动层，统一进行I/O调度。

/**
 * Block buffer head descriptor: describes one block-sized buffer and
 * ties it to a page and to a block on a block device.
 */
struct buffer_head {
    /* Buffer state bitmap, e.g. BH_Uptodate */
    unsigned long b_state;        /* buffer state bitmap (see above) */
    /* Next buffer head belonging to the same page */
    struct buffer_head *b_this_page;/* circular list of page's buffers */
    /* Page holding this buffer's data (set by set_bh_page()) */
    struct page *b_page;        /* the page this bh is mapped to */

    /* Block number of the buffer */
    sector_t b_blocknr;        /* start block number */
    /* Block length */
    size_t b_size;            /* size of mapping */
    /* Pointer to the buffer's data in memory */
    char *b_data;            /* pointer to data within the page */

    /* Backing block device */
    struct block_device *b_bdev;
    /* Callback invoked by the kernel when the I/O operation completes */
    bh_end_io_t *b_end_io;        /* I/O completion */
    /* Reserved pointer for b_end_io; typically used by journaling filesystems */
     void *b_private;        /* reserved for b_end_io */
    struct list_head b_assoc_buffers; /* associated with another mapping */
    /* Address space this buffer belongs to */
    struct address_space *b_assoc_map;    /* mapping this buffer is
                           associated with */
    /* Usage counter */
    atomic_t b_count;        /* users using this buffer_head */
};

在kernel2.6之后，buffer_head没有别的作用，主要用来保持页框与块设备中数据块的映射关系。

Buffer page(缓冲页)

如果内核需要单独访问一个块，就会涉及到buffer page，并会检查对应的buffer head。

内核创建buffer page的两种常见情况：

(1)当读或者写一个文件页的数据块不相邻时。发生这种情况是因为文件系统为文件分配了非连续的块，或者文件有洞。具体请参见block_read_full_page(fs/buffer.c)函数:

/**
 * Read a whole page from the block device, one block-sized buffer at a
 * time.
 *
 * @page:      page to fill; must already be locked (BUG otherwise)
 * @get_block: callback that maps a file block of the inode to its
 *             location on disk
 *
 * Stage one walks the page's buffers, maps those that are unmapped,
 * zero-fills the ones that stay unmapped, and collects every buffer that
 * still needs reading into arr[]. If nothing needs reading, the page is
 * marked up to date (unless an error was recorded) and unlocked here.
 * Otherwise the collected buffers are locked and submitted for reading;
 * that path does not unlock the page in this function (presumably the
 * read-completion handler does — TODO confirm).
 *
 * Always returns 0; a get_block() failure is recorded with SetPageError().
 */
int block_read_full_page(struct page *page, get_block_t *get_block)
{
    struct inode *inode = page->mapping->host;
    sector_t iblock, lblock;
    struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
    unsigned int blocksize;
    int nr, i;
    int fully_mapped = 1;

    BUG_ON(!PageLocked(page));  
    blocksize = 1 << inode->i_blkbits;
    if (!page_has_buffers(page))/* no buffers attached yet: create a set of empty ones */
        create_empty_buffers(page, blocksize, 0);
    /* First buffer attached to the page */
    head = page_buffers(page);

    /*
     * iblock: file block number of the first block in this page.
     * lblock: file size rounded up to whole blocks, i.e. the first block
     * number past the end of the file.
     */
    iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
    lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
    bh = head;
    nr = 0;
    i = 0;

    /*
     * Stage one: walk every buffer of the page. Note that `continue` in a
     * do-while jumps to the loop condition, so i, iblock and bh still
     * advance for skipped buffers.
     */
    do {
        if (buffer_uptodate(bh))/* buffer already matches the device: nothing to do */
            continue;

        if (!buffer_mapped(bh)) {/* not mapped to a disk block yet */
            int err = 0;

            fully_mapped = 0;
            if (iblock < lblock) {/* block lies within the file size: look up its mapping */
                WARN_ON(bh->b_size != blocksize);
                /* Ask for the on-disk location of this file block */
                err = get_block(inode, iblock, bh, 0);
                if (err)
                    SetPageError(page);
            }
            if (!buffer_mapped(bh)) {/* still unmapped (sparse block): fill this part of the page with zeros */
                zero_user_page(page, i * blocksize, blocksize,
                        KM_USER0);
                if (!err)
                    set_buffer_uptodate(bh);
                continue;
            }
            /*
             * get_block() might have updated the buffer
             * synchronously
             */
            if (buffer_uptodate(bh))/* get_block() brought the buffer up to date: move on to the next one */
                continue;
        }
        /* Buffer is mapped but its contents are stale: queue it for reading */
        arr[nr++] = bh;
    } while (i++, iblock++, (bh = bh->b_this_page) != head);

    if (fully_mapped)
        SetPageMappedToDisk(page);

    if (!nr) {/* no buffer needs reading */
        /*
         * All buffers are uptodate - we can set the page uptodate
         * as well. But not if get_block() returned an error.
         */
        if (!PageError(page))/* mark the page up to date, then leave */
            SetPageUptodate(page);
        unlock_page(page);
        return 0;
    }

    /* Stage two: lock the buffers */
    for (i = 0; i < nr; i++) {/* lock each queued buffer and mark it for async read */
        bh = arr[i];
        lock_buffer(bh);
        mark_buffer_async_read(bh);
    }

    /*
     * Stage 3: start the IO.  Check for uptodateness
     * inside the buffer lock in case another process reading
     * the underlying blockdev brought it uptodate (the sct fix).
     */
    for (i = 0; i < nr; i++) {/* walk every buffer that was queued for reading */
        bh = arr[i];
        if (buffer_uptodate(bh))/* someone else read it in before we took the lock: just complete it */
            end_buffer_async_read(bh, 1);
        else
            submit_bh(READ, bh);/* submit the read request */
    }
    return 0;
}

这里使用buffer head，主要是通过buffer head建立页框与数据块的映射关系。因为页面中的数据在磁盘上不是连续的，而页框描述符struct page的字段又不足以表达这种信息。

该函数会调用create_empty_buffers来创建一组全新的缓冲区，并与page关联起来

/**
 * Create a fresh set of buffer heads for a page and attach them to it.
 *
 * @page:      page that receives the buffers
 * @blocksize: size of each buffer in bytes
 * @b_state:   state bits OR-ed into every new buffer head
 *
 * alloc_page_buffers() hands back a NULL-terminated chain; this function
 * closes it into a ring, copies the page's dirty/uptodate state onto the
 * buffers, and attaches the ring to the page under the mapping's
 * private_lock.
 */
void create_empty_buffers(struct page *page,
            unsigned long blocksize, unsigned long b_state)
{
    struct buffer_head *first, *cur, *last;

    /* Allocate as many buffer heads as the page needs; get the first one */
    first = alloc_page_buffers(page, blocksize, 1);

    /* Apply the requested state bits and remember the final element */
    cur = first;
    do {
        last = cur;
        cur->b_state |= b_state;
        cur = cur->b_this_page;
    } while (cur);

    /* Close the chain so that it becomes circular */
    last->b_this_page = first;

    /* Mirror the page's state onto its buffers, then attach them */
    spin_lock(&page->mapping->private_lock);
    if (PageUptodate(page) || PageDirty(page)) {
        cur = first;
        do {
            if (PageDirty(page))
                set_buffer_dirty(cur);
            if (PageUptodate(page))
                set_buffer_uptodate(cur);
            cur = cur->b_this_page;
        } while (cur != first);
    }
    attach_page_buffers(page, first);
    spin_unlock(&page->mapping->private_lock);
}

create_empty_buffers调用alloc_page_buffers来创建一组buffer head链表，但还不是循环链表：

/*
 * Allocate the buffer heads needed to cover one page with buffers of
 * `size` bytes and return them as a NULL-terminated chain linked through
 * b_this_page (not yet circular).
 *
 * The "......" line marks code omitted from this excerpt: the no_grow
 * failure path, and with it any use of the try_again label and of the
 * `retry` argument, is not shown here.
 */
struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
        int retry)
{
    struct buffer_head *bh, *head;
    long offset;    /* signed so the loop can stop once it drops below 0 */

try_again:
    head = NULL;
    offset = PAGE_SIZE;
    /*
     * Walk the page from its last block down to offset 0. Each new buffer
     * head is pushed onto the front of the chain, so the chain ends up in
     * ascending offset order with `head` describing offset 0.
     */
    while ((offset -= size) >= 0) {
        bh = alloc_buffer_head(GFP_NOFS);
        if (!bh)
            goto no_grow;

        bh->b_bdev = NULL;
        bh->b_this_page = head;
        bh->b_blocknr = -1;    /* no block number assigned yet */
        head = bh;

        bh->b_state = 0;
        atomic_set(&bh->b_count, 0);
        bh->b_private = NULL;
        bh->b_size = size;

        /* Link the buffer to its page */
        set_bh_page(bh, page, offset);

        init_buffer(bh, NULL, NULL);
    }
    return head;

......

}

alloc_page_buffers调用set_bh_page来设置b_data.

/*
 * Bind a buffer head to `page` and point b_data at byte `offset` within
 * that page. `offset` must be smaller than PAGE_SIZE (BUG otherwise).
 */
void set_bh_page(struct buffer_head *bh,
        struct page *page, unsigned long offset)
{
    bh->b_page = page;
    BUG_ON(offset >= PAGE_SIZE);

    if (!PageHighMem(page)) {
        /* Ordinary page: b_data is the page's address plus the offset */
        bh->b_data = page_address(page) + offset;
        return;
    }

    /*
     * Highmem page: store only the offset, as a pointer based at 0.
     * This catches illegal uses and preserves the offset.
     */
    bh->b_data = (char *)(0 + offset);
}

(2)访问一个单独的磁盘块(比如，读超级块或者索引节点块时)。参见ext2_fill_super(fs/ext2/super.c)，该函数在安装ext2文件系统时调用。

Buffer page和buffer head的关系：

由代码可知，每个buffer_head对应磁盘上的一个block。page cache中的一个页(buffer page)由N（N = PAGE_SIZE/BLOCK_SIZE）个buffer_head来描述。

posted @ 2017-01-25 09:54 penghan 阅读(1458) 评论(0) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部

penghan

page cache 与free

公告