Linux内核情景分析的alloc_pages
NUMA结构的alloc_pages
/* ==================== mm/numa.c 43 43 ==================== */
#ifdef CONFIG_DISCONTIGMEM

/* ==================== mm/numa.c 91 128 ==================== */
/*
 * This can be refined. Currently, tries to do round robin, instead
 * should do concentratic circle search, starting from current node.
 */
/*
 * alloc_pages - allocate a block of 2^order pages, trying every node.
 * @gfp_mask: allocation policy; alloc_pages_pgdat() uses it as the index
 *            into a node's node_zonelists array
 * @order:    size of the requested block, as a power of two (2^order pages)
 *
 * Returns the page obtained from the first node on which
 * alloc_pages_pgdat() succeeds, NULL when order >= MAX_ORDER, and 0 when
 * every node on the list has been tried without success.
 */
struct page * alloc_pages(int gfp_mask, unsigned long order)
{
	struct page *ret = 0;
	pg_data_t *start, *temp;
#ifndef CONFIG_NUMA
	unsigned long flags;
	static pg_data_t *next = 0;
#endif

	if (order >= MAX_ORDER)
		return NULL;
#ifdef CONFIG_NUMA
	/*
	 * NUMA: start from the current CPU's node. Per the annotation, the
	 * NODE_DATA() macro looks up that node's pg_data_t.
	 */
	temp = NODE_DATA(numa_node_id());
#else
	/*
	 * Not NUMA: round robin. The static 'next' remembers where the next
	 * call should start; it is read and advanced under node_lock.
	 */
	spin_lock_irqsave(&node_lock, flags);
	if (!next) next = pgdat_list;
	temp = next;
	next = next->node_next;
	spin_unlock_irqrestore(&node_lock, flags);
#endif
	/*
	 * Two loops: the first runs from the starting node to the end of the
	 * node list, the second from the head of the list up to (but not
	 * including) the starting node. Together they visit every node until
	 * an allocation succeeds.
	 */
	start = temp;
	while (temp) {
		if ((ret = alloc_pages_pgdat(temp, gfp_mask, order)))
			return(ret);
		temp = temp->node_next;
	}
	temp = pgdat_list;
	while (temp != start) {
		if ((ret = alloc_pages_pgdat(temp, gfp_mask, order)))
			return(ret);
		temp = temp->node_next;
	}
	return(0);
}

#endif /* CONFIG_DISCONTIGMEM: the matching #endif is outside the quoted
	* ranges; it is added here only so the excerpt is balanced */
alloc_pages_pgdat试图分配所需页面,是__alloc_pages的封装
85 static struct page * alloc_pages_pgdat(pg_data_t *pgdat, int gfp_mask,86 unsigned long order)87 { //node_zonelist决定分配策略数组88 return __alloc_pages(pgdat->node_zonelists + gfp_mask, order);89 }
与UMA的alloc_pages()相比较:UMA只有一个节点,即contig_page_data。UMA与NUMA共同使用__alloc_pages()。
==================== include/linux/mm.h 343 352 ====================343 #ifndef CONFIG_DISCONTIGMEM//只有这个无定义,才使用uma的__alloc_pages344 static inline struct page * alloc_pages(int gfp_mask, unsigned long order)83345 {346 /*347 * Gets optimized away by the compiler.348 */349 if (order >= MAX_ORDER)350 return NULL;351 return __alloc_pages(contig_page_data.node_zonelists+(gfp_mask), order);352 }
下面查看__alloc_pages():
如果只分配一个页面(order为0),而且允许等待分配完成(__GFP_WAIT),又不是用于内存管理自身的目的(当前进程未设置PF_MEMALLOC),就把direct_reclaim设置为1,表示可以直接从相应管理区的不活跃干净页面缓冲队列中回收页面。如果发现空闲页面短缺,则唤醒kswapd或bdflush这两个内核线程之一,试图腾出一些页面。
[alloc_pages()>__alloc_pages()]270 /*271 * This is the 'heart' of the zoned buddy allocator:272 */273 struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order)274 {275 zone_t **zone;276 int direct_reclaim = 0;277 unsigned int gfp_mask = zonelist->gfp_mask;//获取具体的分配策略278 struct page * page;279280 /*281 * Allocations put pressure on the VM subsystem.282 */283 memory_pressure++;//表示内存管理所承受的压力,分配++,归还--284285 /*286 * (If anyone calls gfp from interrupts nonatomically then it287 * will sooner or later tripped up by a schedule().)288 *289 * We are falling back to lower-level zones if allocation290 * in a higher zone fails.291 */292293 /*294 如果只分配一个页面,而且要等待完成分配,又不适用于管理的目的把direct_reclaim设置为1,表示可以从相应的管理区的不活跃干净页面缓冲队列中回收296 */297 if (order == 0 && (gfp_mask & __GFP_WAIT) &&298 !(current->flags & PF_MEMALLOC))299 direct_reclaim = 1;300301 /*302 * If we are about to get low on free pages and we also have303 * an inactive page shortage, wake up kswapd.84发现空闲页面短缺,唤醒以下2个进程,试图腾出一些页面出来304 */305 if (inactive_shortage() > inactive_target / 2 && free_shortage())306 wakeup_kswapd(0);307 /*308 * If we are about to get low on free pages and cleaning309 * the inactive_dirty pages would fix the situation,310 * wake up bdflush.311 */312 else if (free_shortage() && nr_inactive_dirty_pages > free_shortage()313 && nr_inactive_dirty_pages >= freepages.high)314 wakeup_bdflush(0);315
继续看__alloc_pages()的代码
//如果管理区的空闲页面不低于pages_low,就调用rmqueue()分配,分配成功直接返回
//否则,如果空闲页面已低于pages_min,且有进程(内核线程kreclaimd)在等待队列kreclaimd_wait中睡眠,就把它唤醒,用于回收一些页面,备用
==================== mm/page_alloc.c 316 340 ====================[alloc_pages()>__alloc_pages()]316 try_again:317 /*318 * First, see if we have any zones with lots of free memory.319 *320 * We allocate free memory first because it doesn't contain321 * any data ... DUH!322 */323 zone = zonelist->zones;//获取管理区指针324 for (;;) {325 zone_t *z = *(zone++);//管理区326 if (!z)327 break;328 if (!z->size)329 BUG();330//如果管理区的空闲页面大于其最低标准331 if (z->free_pages >= z->pages_low) {332 page = rmqueue(z, order);//分配内存,接下来分析此函数333 if (page)334 return page;335 }//否则有进程(内核线程kreclaimd)在等待队列睡眠,把它唤醒,用于回收一些页面,备用else if (z->free_pages < z->pages_min &&336 waitqueue_active(&kreclaimd_wait)) {85337 wake_up_interruptible(&kreclaimd_wait);338 }339 }340
[alloc_pages()>__alloc_pages()>rmqueue()]172 static struct page * rmqueue(zone_t *zone, unsigned long order)173 {174 free_area_t * area = zone->free_area + order;//获取其数组对应的元素175 unsigned long curr_order = order;176 struct list_head *head, *curr;177 unsigned long flags;178 struct page *page;179180 spin_lock_irqsave(&zone->lock, flags);//相应管理区加锁181 do {182 head = &area->free_list;//头183 curr = memlist_next(head);//头的下一个节点184185 if (curr != head) {//不等于空,说明有物理页块186 unsigned int index;187//从非空队列中取出第一个结构page元素188 page = memlist_entry(curr, struct page, list);189 if (BAD_RANGE(zone,page))190 BUG();191 memlist_del(curr);//删除队列中的元素192 index = (page - mem_map) - zone->offset;//偏移193 MARK_USED(index, curr_order, area);//将相应位图设置为1194 zone->free_pages -= 1 << order;195//分配成功,把大块剩余的部分分解为小块,链入相应的队列196 page = expand(zone, page, index, order, curr_order, area);197 spin_unlock_irqrestore(&zone->lock, flags);198199 set_page_count(page, 1);200 if (BAD_RANGE(zone,page))201 BUG();202 DEBUG_ADD_PAGE203 return page;204 }205 curr_order++;206 area++;86207 } while (curr_order < MAX_ORDER);208 spin_unlock_irqrestore(&zone->lock, flags);209210 return NULL;211 }
[alloc_pages()>__alloc_pages()>rmqueue()>expand()]/*low表示所需块大小,high表示实际大小*/150 static inline struct page * expand (zone_t *zone, struct page *page,151 unsigned long index, int low, int high, free_area_t * area)152 {153 unsigned long size = 1 << high;154155 while (high > low) {156 if (BAD_RANGE(zone,page))157 BUG();158 area--;159 high--;160 size >>= 1;//每次减少2的n次方161 memlist_add_head(&(page)->list, &(area)->free_list);162 MARK_USED(index, high, area);//标记位图//处理更低一档的空闲块队列163 index += size;164 page += size;165 }166 if (BAD_RANGE(zone,page))167 BUG();168 return page;169 }
就这样,rmqueue()沿着空闲队列逐级向上(向更大的页面块)扫描,直到分配成功或者全部失败。如果失败,则__alloc_pages()通过for循环
转向下一个管理区(按照分配策略),直到成功。
要是给定的分配策略中的所有页面管理区都失败,那就只能加大力度再试试.要么降低对页面的水位要求
要么把缓冲在管理区的不活跃干净页面也给考虑进去
[alloc_pages()>__alloc_pages()]341 /*342 * Try to allocate a page from a zone with a HIGH343 * amount of free + inactive_clean pages.344 *345 * If there is a lot of activity, inactive_target346 * will be high and we'll have a good chance of347 * finding a page using the HIGH limit.348 *///先用page_high,如果不行再用page_low349 page = __alloc_pages_limit(zonelist, order, PAGES_HIGH, direct_reclaim);350 if (page)351 return page;352353 /*354 * Then try to allocate a page from a zone with more355 * than zone->pages_low free + inactive_clean pages.356 *357 * When the working set is very large and VM activity358 * is low, we're most likely to have our allocation359 * succeed here.360 */361 page = __alloc_pages_limit(zonelist, order, PAGES_LOW, direct_reclaim);362 if (page)363 return page;364
[alloc_pages()>__alloc_pages()>__alloc_pages_limit()]213 #define PAGES_MIN 0214 #define PAGES_LOW 1215 #define PAGES_HIGH 288216217 /*218 * This function does the dirty work for __alloc_pages219 * and is separated out to keep the code size smaller.220 * (suggested by Davem at 1:30 AM, typed by Rik at 6 AM)221 */222 static struct page * __alloc_pages_limit(zonelist_t *zonelist,223 unsigned long order, int limit, int direct_reclaim)224 {225 zone_t **zone = zonelist->zones;226227 for (;;) {228 zone_t *z = *(zone++);229 unsigned long water_mark;230231 if (!z)232 break;233 if (!z->size)234 BUG();235236 /*237 * We allocate if the number of free + inactive_clean238 * pages is above the watermark.239 */240 switch (limit) {241 default:242 case PAGES_MIN://通过分配策略,改变水位243 water_mark = z->pages_min;244 break;245 case PAGES_LOW:246 water_mark = z->pages_low;247 break;248 case PAGES_HIGH:249 water_mark = z->pages_high;250 }251//如果空闲页面+干净回收页面大于最低水位252 if (z->free_pages + z->inactive_clean_pages > water_mark) {253 struct page *page = NULL;254 /* 如果空闲页面小于最低水位+8,那就回收. */255 if (direct_reclaim && z->free_pages < z->pages_min + 8)256 page = reclaim_page(z);//把inactive_clean_list队列回收页面257 /* If that fails, fall back to rmqueue. */258 if (!page)259 page = rmqueue(z, order);260 if (page)261 return page;262 }263 }26489265 /* Found nothing. */266 return NULL;267 }
如果还是不行,那就说明管理区的页面很短缺了
[alloc_pages()>__alloc_pages()]365 /*366 * OK, none of the zones on our zonelist has lots367 * of pages free.368 *369 * We wake up kswapd, in the hope that kswapd will370 * resolve this situation before memory gets tight.371 *372 * We also yield the CPU, because that:373 * - gives kswapd a chance to do something374 * - slows down allocations, in particular the375 * allocations from the fast allocator that's376 * causing the problems ...377 * - ... which minimises the impact the "bad guys"378 * have on the rest of the system379 * - if we don't have __GFP_IO set, kswapd may be380 * able to free some memory we can't free ourselves381 */382 wakeup_kswapd(0);//唤醒内核线程,想办法换出一些页面383 if (gfp_mask & __GFP_WAIT) {//要求必须获取页面,分配不到时等待,那就让系统再调用一次(目的为了调度kswapd线程)//以此获取一些页面384 __set_current_state(TASK_RUNNING);385 current->policy |= SCHED_YIELD;386 schedule();387 }388389 /*390 * After waking up kswapd, we try to allocate a page391 * from any zone which isn't critical yet.392 *393 * Kswapd should, in most situations, bring the situation394 * back to normal in no time.395 *//*如果不允许等待,那就用pages_min再调用一次__alloc_pages_limit*/396 page = __alloc_pages_limit(zonelist, order, PAGES_MIN, direct_reclaim);397 if (page)398 return page;399
要是再失败,那就看是谁在要求分配内存页面。如果是kswapd,它本身就是内存分配的"工作者",
是为了更好地分配页面而工作的,比一般进程更重要,这类进程的PF_MEMALLOC标志位为1。不过我们先看一般进程,
即PF_MEMALLOC标志位为0时的策略。
==================== mm/page_alloc.c 400 477 ====================[alloc_pages()>__alloc_pages()]400 /*401 * Damn, we didn't succeed.402 *403 * This can be due to 2 reasons:404 * - we're doing a higher-order allocation405 * --> move pages to the free list until we succeed406 * - we're /really/ tight on memory407 * --> wait on the kswapd waitqueue until memory is freed408 */409 if (!(current->flags & PF_MEMALLOC)) {410 /*411 * Are we dealing with a higher order allocation?412 *413 * Move pages from the inactive_clean to the free list414 * in the hope of creating a large, physically contiguous415 * piece of free memory.416 */417 if (order > 0 && (gfp_mask & __GFP_WAIT)) {418 zone = zonelist->zones;419 /* First, clean some dirty pages. */420 current->flags |= PF_MEMALLOC;421 page_launder(gfp_mask, 1);//把脏页洗干净(页面的定期换出)422 current->flags &= ~PF_MEMALLOC;423 for (;;) {424 zone_t *z = *(zone++);//通过一个for循环把干净页面等待队列的页面回收425 if (!z)426 break;427 if (!z->size)428 continue;//是否有干净页面429 while (z->inactive_clean_pages) {430 struct page * page;431 /* Move one page to the free list. */432 page = reclaim_page(z);//回收干净页面等待队列433 if (!page)434 break;91435 __free_page(page);//通过__free_page释放页面的同时,把空闲页面拼接成大的页面块436 /* Try if the allocation succeeds. */437 page = rmqueue(z, order);//试图再次请求成功438 if (page)439 return page;440 }441 }442 }443 /*444 * When we arrive here, we are really tight on memory.445 *446 * We wake up kswapd and sleep until kswapd wakes us447 * up again. After that we loop back to the start.448 *449 * We have to do this because something else might eat450 * the memory kswapd frees for us and we need to be451 * reliable. 
Note that we don't loop back for higher452 * order allocations since it is possible that kswapd453 * simply cannot free a large enough contiguous area454 * of memory *ever*.455 *//*如果依旧失败,而且必须要求分配到页面,那就等待,进程睡眠*/456 if ((gfp_mask & (__GFP_WAIT|__GFP_IO)) == (__GFP_WAIT|__GFP_IO)) {457 wakeup_kswapd(1);//唤醒kswaped,要求分配页面进程睡眠,等待kswapd完成一轮运行再唤醒需要页面的进程458 memory_pressure++;459 if (!order)//如果要求分配的是1个页面,跳到try_again460 goto try_again;461 /*462 * If __GFP_IO isn't set, we can't wait on kswapd because463 * kswapd just might need some IO locks /we/ are holding ...464 *465 * SUBTLE: The scheduling point above makes sure that466 * kswapd does get the chance to free memory we can't467 * free ourselves...468 */469 } else if (gfp_mask & __GFP_WAIT) {470 try_to_free_pages(gfp_mask);//另外一种方案...直接调用此函数获取页面(本来就是kswaped函数调用的)471 memory_pressure++;472 if (!order)473 goto try_again;474 }475476 }477
最后的办法了
[alloc_pages()>__alloc_pages()]478 /*479 * Final phase: allocate anything we can!480 *481 * Higher order allocations, GFP_ATOMIC allocations and482 * recursive allocations (PF_MEMALLOC) end up here.483 *484 * Only recursive allocations can use the very last pages485 * in the system, otherwise it would be just too easy to486 * deadlock the system...487 */488 zone = zonelist->zones;489 for (;;) {490 zone_t *z = *(zone++);491 struct page * page = NULL;492 if (!z)493 break;494 if (!z->size)495 BUG();496497 /*498 * SUBTLE: direct_reclaim is only possible if the task499 * becomes PF_MEMALLOC while looping above. This will500 * happen when the OOM killer selects this task for501 * instant execution...93502 */503 if (direct_reclaim) {504 page = reclaim_page(z);505 if (page)506 return page;507 }508509 /* XXX: is pages_min/4 a good amount to reserve for this? */510 if (z->free_pages < z->pages_min / 4 &&511 !(current->flags & PF_MEMALLOC))512 continue;513 page = rmqueue(z, order);514 if (page)515 return page;516 }517518 /* No luck.. */519 printk(KERN_ERR "__alloc_pages: %lu-order allocation failed.\n", order);520 return NULL;521 }
如果这都失败,那就系统一定出现了问题了
节点->页面短缺->唤醒内核线程(kswapd或bdflush),试图腾出页面->
开始遍历每个管理区->一旦管理区的空闲页面不低于pages_low,那就调用rmqueue进行分配;否则,若空闲页面低于pages_min,就把kreclaimd线程唤醒,回收页面
rmqueue分析->如果失败,换一个管理区(按照分配策略),如果全部失败->降低页面的水位要求,把不活跃干净的页面考虑进来
->调用__alloc_pages_limit->如果空闲页面小于pages_min+8,那就直接从不活跃干净页面队列中回收一个页面->失败,唤醒内核线程kswapd,腾出页面
->依旧失败,把脏页面洗干净,再回收干净页面,获取页面->依旧失败,再次唤醒kswapd(或直接调用try_to_free_pages)腾出页面,依旧失败->只要管理区的空闲页面不低于pages_min的1/4,就尝试分配->
依旧不能,系统出了问题

浙公网安备 33010602011771号