Linux内存管理 (16)内存规整

专题：Linux内存管理专题

关键词：内存规整、页面迁移、pageblock、MIGRATE_TYPES。

内存碎片的产生：伙伴系统以页为单位进行管理，经过大量申请释放，造成大量离散且不连续的页面。这时就产生了很多碎片。

内存规整也即内存碎片整理，内存碎片也是以页面为单位的。实现基础是内存页面按照可移动性进行分组。内存规整的实现基础是页面迁移。

Linux内核以pageblock为单位来管理页的迁移属性。

为什么需要内存规整？

有些情况下，物理设备需要大段连续物理内存。虽然此时空闲内存足够，但是由于无法找到连续的物理内存，仍然造成内存分配失败。

1. 内存规整的触发

下面是内存页面分配，以及分配失败之后采取的措施，以便促成分配成功。

可以看出采取的措施，越来越重。首先采用kswapd来进行页面回收，然后尝试页面规整、直接页面回收，最后是OOM杀死进程来获取更多内存空间。

alloc_pages-------------------------------------页面分配的入口
  ->__alloc_pages_nodemask
    ->get_page_from_freelist--------------------直接从zonelist的空闲列表中分配页面
    ->__alloc_pages_slowpath--------------------在初次尝试分配失败后，进入slowpath路径分配页面
      ->wake_all_kswapds------------------------唤醒kswapd内核线程进行页面回收
      ->get_page_from_freelist------------------kswapd页面回收后再次进行页面分配
      ->__alloc_pages_direct_compact------------进行页面规整，然后进行页面分配
      ->__alloc_pages_direct_reclaim------------直接页面回收，然后进行页面分配
      ->__alloc_pages_may_oom-------------------尝试触发OOM

另一条路径是在kswapd的balance_pgdat中会判断是否需要进行内存规整。

kswapd
  ->balance_pgdat-------------------------------遍历内存节点的zone，判断是否处于平衡状态即WMARK_HIGH。
    ->compact_pgdat-----------------------------针对整个内存节点进行内存规整

其中compact_pgdat->__compact_pgdat->compact_zone，最终的实现和__alloc_pages_direct_compact调用compact_zone一样。

1.1 内存规整相关节点

内存规整相关有两个节点，compact_memory用于触发内存规整；extfrag_threshold影响内核决策是采用内存规整还是直接回收来满足大内存分配。

节点入口代码：

static struct ctl_table vm_table[] = {
...
#ifdef CONFIG_COMPACTION
    {
        .procname    = "compact_memory",
        .data        = &sysctl_compact_memory,
        .maxlen        = sizeof(int),
        .mode        = 0200,
        .proc_handler    = sysctl_compaction_handler,
    },
    {
        .procname    = "extfrag_threshold",
        .data        = &sysctl_extfrag_threshold,
        .maxlen        = sizeof(int),
        .mode        = 0644,
        .proc_handler    = sysctl_extfrag_handler,
        .extra1        = &min_extfrag_threshold,
        .extra2        = &max_extfrag_threshold,
    },

#endif /* CONFIG_COMPACTION */
...
    { }
}

1.1.1 /proc/sys/vm/compact_memory

打开compaction Tracepoint：echo 1 > /sys/kernel/debug/tracing/events/compaction/enable

触发内存规整：sysctl -w vm.compact_memory=1

查看Tracepoint：cat /sys/kernel/debug/tracing/trace

1.1.2 /proc/sys/vm/extfrag_threshold

在compact_zone中调用函数compaction_suitable->__compaction_suitable进行判断是否进行内存规整。

和extfrag_threshold相关部分如下，如果当前fragindex不超过sysctl_extfrag_threshold，则不会继续进行内存规整。

所以这个参数越小越倾向于进行内存规整，越大越不容易进行内存规整。

static unsigned long __compaction_suitable(struct zone *zone, int order,
                    int alloc_flags, int classzone_idx)
{
...
    fragindex = fragmentation_index(zone, order);
    if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
        return COMPACT_NOT_SUITABLE_ZONE;

    return COMPACT_CONTINUE;
}

设置extfrag_threshold：sysctl -w vm.extfrag_threshold=500

1.1.3 其它Debug信息

/sys/kernel/debug/extfrag/extfrag_index

/sys/kernel/debug/extfrag/unusable_index

2. 内存规整实现

在进入细节前，先看看内存规整函数框架。

__alloc_pages_direct_compact
  ->try_to_compact_pages-----------------直接内存规整来满足高阶分配需求
    ->compact_zone_order-----------------遍历zonelist对每个zone进行规整
      ->compact_zone---------------------对zone进行规整
        ->compaction_suitable------------检查是否继续规整，COMPACT_PARTIAL/COMPACT_SKIPPED都跳过。
        ->compact_finished---------------在while中判断是否可以停止内存规整
        ->isolate_migratepages-----------查找可以迁移页面
        ->migrate_pages------------------进行页面迁移操作
  ->get_page_from_freelist---------------在规整完成后进行页面分配操作

__alloc_pages_direct_compact首先执行规整操作，然后进行页面分配。

static struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
        int alloc_flags, const struct alloc_context *ac,
        enum migrate_mode mode, int *contended_compaction,
        bool *deferred_compaction)
{
    unsigned long compact_result;
    struct page *page;

    if (!order)-----------------------------------------------------------------order为0情况，不用进行内存规整。
        return NULL;

    current->flags |= PF_MEMALLOC;
    compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,-----进行内存规整，当前进程会置PF_MEMALLOC，避免页面迁移时发生死锁。
                        mode, contended_compaction);
    current->flags &= ~PF_MEMALLOC;

    switch (compact_result) {
    case COMPACT_DEFERRED:
        *deferred_compaction = true;
        /* fall-through */
    case COMPACT_SKIPPED:
        return NULL;
    default:
        break;
    }
...
    page = get_page_from_freelist(gfp_mask, order,-----------------------------进行内存分配
                    alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
...
    count_vm_event(COMPACTFAIL);

    cond_resched();

    return NULL;
}

try_to_compact_pages执行内存规整，以pageblock为单位，选择pageblock中可迁移页面。

unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
            int alloc_flags, const struct alloc_context *ac,
            enum migrate_mode mode, int *contended)
{
    int may_enter_fs = gfp_mask & __GFP_FS;
    int may_perform_io = gfp_mask & __GFP_IO;
    struct zoneref *z;
    struct zone *zone;
    int rc = COMPACT_DEFERRED;
    int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */

    *contended = COMPACT_CONTENDED_NONE;

    /* Check if the GFP flags allow compaction */
    if (!order || !may_enter_fs || !may_perform_io)
        return COMPACT_SKIPPED;

    trace_mm_compaction_try_to_compact_pages(order, gfp_mask, mode);

    /* Compact each zone in the list */
    for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,-----------根据掩码遍历特定zone
                                ac->nodemask) {
        int status;
        int zone_contended;

        if (compaction_deferred(zone, order))
            continue;

        status = compact_zone_order(zone, order, gfp_mask, mode,-----------------------针对特定zone进行规整
                &zone_contended, alloc_flags,
                ac->classzone_idx);
        rc = max(status, rc);
        /*
         * It takes at least one zone that wasn't lock contended
         * to clear all_zones_contended.
         */
        all_zones_contended &= zone_contended;

        /* If a normal allocation would succeed, stop compacting */
        if (zone_watermark_ok(zone, order, low_wmark_pages(zone),
                    ac->classzone_idx, alloc_flags)) {--------------------------------当前zone水位是否高于WMARK_LOW，如果是则退出当前循环。
            /*
             * We think the allocation will succeed in this zone,
             * but it is not certain, hence the false. The caller
             * will repeat this with true if allocation indeed
             * succeeds in this zone.
             */
            compaction_defer_reset(zone, order, false);
            /*
             * It is possible that async compaction aborted due to
             * need_resched() and the watermarks were ok thanks to
             * somebody else freeing memory. The allocation can
             * however still fail so we better signal the
             * need_resched() contention anyway (this will not
             * prevent the allocation attempt).
             */
            if (zone_contended == COMPACT_CONTENDED_SCHED)
                *contended = COMPACT_CONTENDED_SCHED;

            goto break_loop;
        }
...
        continue;
break_loop:
        /*
         * We might not have tried all the zones, so  be conservative
         * and assume they are not all lock contended.
         */
        all_zones_contended = 0;
        break;
    }

    /*
     * If at least one zone wasn't deferred or skipped, we report if all
     * zones that were tried were lock contended.
     */
    if (rc > COMPACT_SKIPPED && all_zones_contended)
        *contended = COMPACT_CONTENDED_LOCK;

    return rc;
}

compact_zone_order调用compact_zone，最主要的就是将参数填入struct compact_control结构体，然后和zone一起作为参数传递给compact_zone。

struct compact_control数据结构记录了被迁移的页面，以及规整过程中迁移到的页面列表。

static unsigned long compact_zone_order(struct zone *zone, int order,
        gfp_t gfp_mask, enum migrate_mode mode, int *contended,
        int alloc_flags, int classzone_idx)
{
    unsigned long ret;
    struct compact_control cc = {
        .nr_freepages = 0,
        .nr_migratepages = 0,
        .order = order,------------------------------------------需要规整的页面阶数
        .gfp_mask = gfp_mask,------------------------------------页面规整的页面掩码
        .zone = zone,
        .mode = mode,--------------------------------------------页面规整模式-同步、异步
        .alloc_flags = alloc_flags,
        .classzone_idx = classzone_idx,
    };
    INIT_LIST_HEAD(&cc.freepages);-------------------------------初始化迁移目的地的链表
    INIT_LIST_HEAD(&cc.migratepages);----------------------------初始化将要迁移页面链表

    ret = compact_zone(zone, &cc);

    VM_BUG_ON(!list_empty(&cc.freepages));
    VM_BUG_ON(!list_empty(&cc.migratepages));

    *contended = cc.contended;
    return ret;
}

static int compact_zone(struct zone *zone, struct compact_control *cc)
{
    int ret;
    unsigned long start_pfn = zone->zone_start_pfn;
    unsigned long end_pfn = zone_end_pfn(zone);
    const int migratetype = gfpflags_to_migratetype(cc->gfp_mask);
    const bool sync = cc->mode != MIGRATE_ASYNC;
    unsigned long last_migrated_pfn = 0;

    ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
                            cc->classzone_idx);-------------------------------根据当前zone水位来判断是否需要进行内存规整，COMPACT_CONTINUE表示可以做内存规整。
    switch (ret) {
    case COMPACT_PARTIAL:
    case COMPACT_SKIPPED:
        /* Compaction is likely to fail */
        return ret;
    case COMPACT_CONTINUE:
        /* Fall through to compaction */
        ;
    }

    /*
     * Clear pageblock skip if there were failures recently and compaction
     * is about to be retried after being deferred. kswapd does not do
     * this reset as it'll reset the cached information when going to sleep.
     */
    if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
        __reset_isolation_suitable(zone);

    /*
     * Setup to move all movable pages to the end of the zone. Used cached
     * information on where the scanners should start but check that it
     * is initialised by ensuring the values are within zone boundaries.
     */
    cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];-----------------表示从zone的开始页面开始扫描和查找哪些页面可以被迁移。
    cc->free_pfn = zone->compact_cached_free_pfn;-----------------------------从zone末端开始扫描和查找哪些空闲的页面可以用作迁移页面的目的地。
    if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {-----------------下面对free_pfn和migrate_pfn进行范围限制。
        cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
        zone->compact_cached_free_pfn = cc->free_pfn;
    }
    if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
        cc->migrate_pfn = start_pfn;
        zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
        zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
    }

    trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
                cc->free_pfn, end_pfn, sync);

    migrate_prep_local();

    while ((ret = compact_finished(zone, cc, migratetype)) ==
                        COMPACT_CONTINUE) {-----------------------------------while中从zone开头扫描查找合适的迁移页面，然后尝试迁移到zone末端空闲页面中，直到zone处于低水位WMARK_LOW之上。
        int err;
        unsigned long isolate_start_pfn = cc->migrate_pfn;

        switch (isolate_migratepages(zone, cc)) {-----------------------------用于扫描和查找合适迁移的页，从zone头部开始找起，查找步长以pageblock_nr_pages为单位。
        case ISOLATE_ABORT:
            ret = COMPACT_PARTIAL;
            putback_movable_pages(&cc->migratepages);
            cc->nr_migratepages = 0;
            goto out;
        case ISOLATE_NONE:
            /*
             * We haven't isolated and migrated anything, but
             * there might still be unflushed migrations from
             * previous cc->order aligned block.
             */
            goto check_drain;
        case ISOLATE_SUCCESS:
            ;
        }

        err = migrate_pages(&cc->migratepages, compaction_alloc,--------------migrate_pages是页面迁移核心函数，从cc->migratepages中摘取页，然后尝试去迁移。
                compaction_free, (unsigned long)cc, cc->mode,
                MR_COMPACTION);

        trace_mm_compaction_migratepages(cc->nr_migratepages, err,
                            &cc->migratepages);

        /* All pages were either migrated or will be released */
        cc->nr_migratepages = 0;
        if (err) {------------------------------------------------------------没处理成功的页面会放回到合适的LRU链表中。
            putback_movable_pages(&cc->migratepages);
            /*
             * migrate_pages() may return -ENOMEM when scanners meet
             * and we want compact_finished() to detect it
             */
            if (err == -ENOMEM && cc->free_pfn > cc->migrate_pfn) {
                ret = COMPACT_PARTIAL;
                goto out;
            }
        }
...
    }

out:
...
    trace_mm_compaction_end(start_pfn, cc->migrate_pfn,
                cc->free_pfn, end_pfn, sync, ret);

    return ret;
}

compaction_suitable根据当前zone水位决定是否需要继续内存规整，主要工作由__compaction_suitable进行处理。

主要依据zone低水位和extfrag_threshold两个参数进行判断。

unsigned long compaction_suitable(struct zone *zone, int order,
                    int alloc_flags, int classzone_idx)
{
    unsigned long ret;

    ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx);
    trace_mm_compaction_suitable(zone, order, ret);
    if (ret == COMPACT_NOT_SUITABLE_ZONE)
        ret = COMPACT_SKIPPED;

    return ret;
}

static unsigned long __compaction_suitable(struct zone *zone, int order,
                    int alloc_flags, int classzone_idx)
{
    int fragindex;
    unsigned long watermark;

    /*
     * order == -1 is expected when compacting via
     * /proc/sys/vm/compact_memory
     */
    if (order == -1)
        return COMPACT_CONTINUE;

    watermark = low_wmark_pages(zone);
    /*
     * If watermarks for high-order allocation are already met, there
     * should be no need for compaction at all.
     */
    if (zone_watermark_ok(zone, order, watermark, classzone_idx,
                                alloc_flags))--------------------------------------COMPACT_PARTIAL：如果满足低水位，则不需要进行内存规整。
        return COMPACT_PARTIAL;

    /*
     * Watermarks for order-0 must be met for compaction. Note the 2UL.
     * This is because during migration, copies of pages need to be
     * allocated and for a short time, the footprint is higher
     */
    watermark += (2UL << order);---------------------------------------------------增加水位高度为watermark+(2<<order)。
    if (!zone_watermark_ok(zone, 0, watermark, classzone_idx, alloc_flags))--------COMPACT_SKIPPED：如果达不到新水位，说明当前zone中空闲页面很少，不适合作内存规整，跳过此zone。
        return COMPACT_SKIPPED;

    /*
     * fragmentation index determines if allocation failures are due to
     * low memory or external fragmentation
     *
     * index of -1000 would imply allocations might succeed depending on
     * watermarks, but we already failed the high-order watermark check
     * index towards 0 implies failure is due to lack of memory
     * index towards 1000 implies failure is due to fragmentation
     *
     * Only compact if a failure would be due to fragmentation.
     */
    fragindex = fragmentation_index(zone, order);
    if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)-----------------由extfrag_threshold控制的内存规整流程
        return COMPACT_NOT_SUITABLE_ZONE;

    return COMPACT_CONTINUE;
}

compact_finished判断内存规整流程是否可以结束，结束的条件有两个：

一是cc->migrate_pfn和cc->free_pfn两个指针相遇；二是以order为条件判断当前zone的水位在低水位之上。

static int compact_finished(struct zone *zone, struct compact_control *cc,
                const int migratetype)
{
    int ret;

    ret = __compact_finished(zone, cc, migratetype);
    trace_mm_compaction_finished(zone, cc->order, ret);
    if (ret == COMPACT_NO_SUITABLE_PAGE)
        ret = COMPACT_CONTINUE;

    return ret;
}

static int __compact_finished(struct zone *zone, struct compact_control *cc,
                const int migratetype)
{
    unsigned int order;
    unsigned long watermark;

    if (cc->contended || fatal_signal_pending(current))
        return COMPACT_PARTIAL;

    /* Compaction run completes if the migrate and free scanner meet */
    if (cc->free_pfn <= cc->migrate_pfn) {-----------------------------------------扫描可迁移页面和空闲页面，从zone的头尾向中间运行。当两者相遇，可以停止规整。
        /* Let the next compaction start anew. */
        zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
        zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
        zone->compact_cached_free_pfn = zone_end_pfn(zone);

        /*
         * Mark that the PG_migrate_skip information should be cleared
         * by kswapd when it goes to sleep. kswapd does not set the
         * flag itself as the decision to be clear should be directly
         * based on an allocation request.
         */
        if (!current_is_kswapd())
            zone->compact_blockskip_flush = true;

        return COMPACT_COMPLETE;--------------------------------------------------停止内存规整
    }

    /*
     * order == -1 is expected when compacting via
     * /proc/sys/vm/compact_memory
     */
    if (cc->order == -1)----------------------------------------------------------order为-1表示强制执行内存规整，继续内存规整
        return COMPACT_CONTINUE;

    /* Compaction run is not finished if the watermark is not met */
    watermark = low_wmark_pages(zone);

    if (!zone_watermark_ok(zone, cc->order, watermark, cc->classzone_idx,
                            cc->alloc_flags))--------------------------------------不满足低水位条件，继续内存规整。
        return COMPACT_CONTINUE;

    /* Direct compactor: Is a suitable page free? */
    for (order = cc->order; order < MAX_ORDER; order++) {
        struct free_area *area = &zone->free_area[order];

        /* Job done if page is free of the right migratetype */
        if (!list_empty(&area->free_list[migratetype]))----------------------------对应迁移类型的空闲链表非空，说明已有合适的空闲页面可供分配，停止内存规整。
            return COMPACT_PARTIAL;

        /* Job done if allocation would set block type */
        if (order >= pageblock_order && area->nr_free)
            return COMPACT_PARTIAL;
    }

    return COMPACT_NO_SUITABLE_PAGE;
}

isolate_migratepages扫描并寻找zone中可迁移页面，结果会添加到cc->migratepages链表中。

扫描的一个重要参数是页的迁移属性，参考MIGRATE_TYPES有详细解释。

Linux内核以pageblock为单位来管理页的迁移属性，一个pageblock大小为4MB（以4KB页面为例），即2^10个页面。

pageblock_nr_pages即为1024个页面。

static isolate_migrate_t isolate_migratepages(struct zone *zone,
                    struct compact_control *cc)
{
    unsigned long low_pfn, end_pfn;
    struct page *page;
    const isolate_mode_t isolate_mode =
        (cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0);

    /*
     * Start at where we last stopped, or beginning of the zone as
     * initialized by compact_zone()
     */
    low_pfn = cc->migrate_pfn;

    /* Only scan within a pageblock boundary */
    end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages);

    /*
     * Iterate over whole pageblocks until we find the first suitable.
     * Do not cross the free scanner.
     */
    for (; end_pfn <= cc->free_pfn;---------------------------------------从cc->migrate_pfn开始以pageblock_nr_pages为步长向zone尾部进行扫描。
            low_pfn = end_pfn, end_pfn += pageblock_nr_pages) {

        /*
         * This can potentially iterate a massively long zone with
         * many pageblocks unsuitable, so periodically check if we
         * need to schedule, or even abort async compaction.
         */
        if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
                        && compact_should_abort(cc))
            break;

        page = pageblock_pfn_to_page(low_pfn, end_pfn, zone);
        if (!page)
            continue;

        /* If isolation recently failed, do not retry */
        if (!isolation_suitable(cc, page))
            continue;

        /*
         * For async compaction, also only scan in MOVABLE blocks.
         * Async compaction is optimistic to see if the minimum amount
         * of work satisfies the allocation.
         */
        if (cc->mode == MIGRATE_ASYNC &&
            !migrate_async_suitable(get_pageblock_migratetype(page)))----migrate_async_suitable判断pageblock是否是MIGRATE_MOVABLE和MIGRATE_CMA两种类型，这两种类型可以迁移。
            continue;

        /* Perform the isolation */
        low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn,
                                isolate_mode);---------------------------扫描并分离pageblock中适合迁移的页面。

        if (!low_pfn || cc->contended) {
            acct_isolated(zone, cc);
            return ISOLATE_ABORT;
        }

        /*
         * Either we isolated something and proceed with migration. Or
         * we failed and compact_zone should decide if we should
         * continue or not.
         */
        break;
    }

    acct_isolated(zone, cc);
    /*
     * Record where migration scanner will be restarted. If we end up in
     * the same pageblock as the free scanner, make the scanners fully
     * meet so that compact_finished() terminates compaction.
     */
    cc->migrate_pfn = (end_pfn <= cc->free_pfn) ? low_pfn : cc->free_pfn;

    return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
}

compaction_alloc()从zone的末尾开始查找空闲页面，并把空闲页面添加到cc->freepages链表中。然后从cc->freepages中摘除页面，返回给migrate_pages作为迁移使用。

compaction_free是规整失败的处理函数，将空闲页面返回给cc->freepages。

static struct page *compaction_alloc(struct page *migratepage,
                    unsigned long data,
                    int **result)
{
    struct compact_control *cc = (struct compact_control *)data;
    struct page *freepage;

    /*
     * Isolate free pages if necessary, and if we are not aborting due to
     * contention.
     */
    if (list_empty(&cc->freepages)) {
        if (!cc->contended)
            isolate_freepages(cc);--------------------------------------查找可以用来作为迁移目的页面

        if (list_empty(&cc->freepages))---------------------------------如果没有页面可被用来作为迁移目的页面，返回NULL。
            return NULL;
    }

    freepage = list_entry(cc->freepages.next, struct page, lru);
    list_del(&freepage->lru);-------------------------------------------将空闲页面从cc->freepages中摘除。
    cc->nr_freepages--;

    return freepage;----------------------------------------------------找到可以被用作迁移目的的页面
}

static void compaction_free(struct page *page, unsigned long data)
{
    struct compact_control *cc = (struct compact_control *)data;

    list_add(&page->lru, &cc->freepages);-------------------------------失败情况下，将页面放回cc->freepages。
    cc->nr_freepages++;
}

posted on 2018-02-10 14:16 ArnoldLu 阅读(9362) 评论(0) 收藏举报

刷新页面返回顶部

Arnold Lu@南京