page_alloc.c revision c2f29ea111e3344ed48257c2a142c3db514e1529
/*
 *  linux/mm/page_alloc.c
 *
 *  Manages the free list, the system allocates free pages here.
 *  Note that kmalloc() lives in slab.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
 *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
 *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
 *  Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
 *          (lots of bits borrowed from Ingo Molnar & Andrew Morton)
 */

#include <linux/config.h>
#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/compiler.h>
#include <linux/module.h>
#include <linux/suspend.h>
#include <linux/pagevec.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/notifier.h>
#include <linux/topology.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/nodemask.h>
#include <linux/vmalloc.h>

#include <asm/tlbflush.h>
#include "internal.h"

/*
 * MCD - HACK: Find somewhere to initialize this EARLY, or make this
 * initializer cleaner
 */
nodemask_t node_online_map = { { [0] = 1UL } };
EXPORT_SYMBOL(node_online_map);
nodemask_t node_possible_map = NODE_MASK_ALL;
EXPORT_SYMBOL(node_possible_map);
struct pglist_data *pgdat_list;
unsigned long totalram_pages;
unsigned long totalhigh_pages;
long nr_swap_pages;

/*
 * results with 256, 32 in the lowmem_reserve sysctl:
 *	1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
 *	1G machine -> (16M dma, 784M normal, 224M high)
 *	NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
 *	HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
 *	HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
 */
int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 };

EXPORT_SYMBOL(totalram_pages);
EXPORT_SYMBOL(nr_swap_pages);

/*
 * Used by page_zone() to look up the address of the struct zone whose
 * id is encoded in the upper bits of page->flags
 */
struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)];
EXPORT_SYMBOL(zone_table);

#ifdef CONFIG_NUMA
static struct per_cpu_pageset
	pageset_table[MAX_NR_ZONES*MAX_NUMNODES*NR_CPUS] __initdata;
#endif

static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
int min_free_kbytes = 1024;

unsigned long __initdata nr_kernel_pages;
unsigned long __initdata nr_all_pages;

/*
 * Temporary debugging check for pages not lying within a given zone.
87 */ 88static int bad_range(struct zone *zone, struct page *page) 89{ 90 if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages) 91 return 1; 92 if (page_to_pfn(page) < zone->zone_start_pfn) 93 return 1; 94#ifdef CONFIG_HOLES_IN_ZONE 95 if (!pfn_valid(page_to_pfn(page))) 96 return 1; 97#endif 98 if (zone != page_zone(page)) 99 return 1; 100 return 0; 101} 102 103static void bad_page(const char *function, struct page *page) 104{ 105 printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n", 106 function, current->comm, page); 107 printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", 108 (int)(2*sizeof(page_flags_t)), (unsigned long)page->flags, 109 page->mapping, page_mapcount(page), page_count(page)); 110 printk(KERN_EMERG "Backtrace:\n"); 111 dump_stack(); 112 printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"); 113 page->flags &= ~(1 << PG_private | 114 1 << PG_locked | 115 1 << PG_lru | 116 1 << PG_active | 117 1 << PG_dirty | 118 1 << PG_swapcache | 119 1 << PG_writeback); 120 set_page_count(page, 0); 121 reset_page_mapcount(page); 122 page->mapping = NULL; 123 tainted |= TAINT_BAD_PAGE; 124} 125 126#ifndef CONFIG_HUGETLB_PAGE 127#define prep_compound_page(page, order) do { } while (0) 128#define destroy_compound_page(page, order) do { } while (0) 129#else 130/* 131 * Higher-order pages are called "compound pages". They are structured thusly: 132 * 133 * The first PAGE_SIZE page is called the "head page". 134 * 135 * The remaining PAGE_SIZE pages are called "tail pages". 136 * 137 * All pages have PG_compound set. All pages have their ->private pointing at 138 * the head page (even the head page has this). 139 * 140 * The first tail page's ->mapping, if non-zero, holds the address of the 141 * compound page's put_page() function. 142 * 143 * The order of the allocation is stored in the first tail page's ->index 144 * This is only for debug at present. This usage means that zero-order pages 145 * may not be compound. 146 */ 147static void prep_compound_page(struct page *page, unsigned long order) 148{ 149 int i; 150 int nr_pages = 1 << order; 151 152 page[1].mapping = NULL; 153 page[1].index = order; 154 for (i = 0; i < nr_pages; i++) { 155 struct page *p = page + i; 156 157 SetPageCompound(p); 158 p->private = (unsigned long)page; 159 } 160} 161 162static void destroy_compound_page(struct page *page, unsigned long order) 163{ 164 int i; 165 int nr_pages = 1 << order; 166 167 if (!PageCompound(page)) 168 return; 169 170 if (page[1].index != order) 171 bad_page(__FUNCTION__, page); 172 173 for (i = 0; i < nr_pages; i++) { 174 struct page *p = page + i; 175 176 if (!PageCompound(p)) 177 bad_page(__FUNCTION__, page); 178 if (p->private != (unsigned long)page) 179 bad_page(__FUNCTION__, page); 180 ClearPageCompound(p); 181 } 182} 183#endif /* CONFIG_HUGETLB_PAGE */ 184 185/* 186 * function for dealing with page's order in buddy system. 187 * zone->lock is already acquired when we use these. 188 * So, we don't need atomic page->flags operations here. 
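 *
 * Illustration (editor's addition, not in the original source): the order
 * is stashed in page->private and PG_private marks it as valid, roughly:
 *
 *	set_page_order(page, 3);	// page heads a free 2^3-page block
 *	if (PagePrivate(page))
 *		order = page_order(page);	// returns 3
 *	rmv_page_order(page);		// clears PG_private and page->private
 *
 * All of this assumes zone->lock is held, as noted above.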
 */
static inline unsigned long page_order(struct page *page)
{
	return page->private;
}

static inline void set_page_order(struct page *page, int order)
{
	page->private = order;
	__SetPagePrivate(page);
}

static inline void rmv_page_order(struct page *page)
{
	__ClearPagePrivate(page);
	page->private = 0;
}

/*
 * Locate the struct page for both the matching buddy in our
 * pair (buddy1) and the combined order O+1 page they form (page).
 *
 * 1) Any buddy B1 will have an order O twin B2 which satisfies
 * the following equation:
 *     B2 = B1 ^ (1 << O)
 * For example, if the starting buddy (buddy2) is #8 its order
 * 1 buddy is #10:
 *     B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
 *
 * 2) Any buddy B will have an order O+1 parent P which
 * satisfies the following equation:
 *     P = B & ~(1 << O)
 *
 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
 */
static inline struct page *
__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
{
	unsigned long buddy_idx = page_idx ^ (1 << order);

	return page + (buddy_idx - page_idx);
}

static inline unsigned long
__find_combined_index(unsigned long page_idx, unsigned int order)
{
	return (page_idx & ~(1 << order));
}

/*
 * This function checks whether a page is free and is the buddy
 * we can coalesce with: a page and its buddy can be coalesced if
 * (a) the buddy is free &&
 * (b) the buddy is on the buddy system &&
 * (c) a page and its buddy have the same order.
 * For recording a page's order, we use page->private and PG_private.
 */
static inline int page_is_buddy(struct page *page, int order)
{
	if (PagePrivate(page)           &&
	    (page_order(page) == order) &&
	    !PageReserved(page)         &&
	    page_count(page) == 0)
		return 1;
	return 0;
}

/*
 * Freeing function for a buddy system allocator.
 *
 * The concept of a buddy system is to maintain direct-mapped table
 * (containing bit values) for memory blocks of various "orders".
 * The bottom level table contains the map for the smallest allocatable
 * units of memory (here, pages), and each level above it describes
 * pairs of units from the levels below, hence, "buddies".
 * At a high level, all that happens here is marking the table entry
 * at the bottom level available, and propagating the changes upward
 * as necessary, plus some accounting needed to play nicely with other
 * parts of the VM system.
 * At each level, we keep a list of pages which are heads of continuous
 * free pages of length (1 << order), marked with PG_private.  A page's
 * order is recorded in the page->private field.
 * So when we are allocating or freeing one, we can derive the state of the
 * other.  That is, if we allocate a small block, and both were
 * free, the remainder of the region must be split into blocks.
 * If a block is freed, and its buddy is also free, then this
 * triggers coalescing into a block of larger size.
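 *
 * Worked example (added for clarity; not in the original comment):
 * freeing page frame #8 at order 1.  Its buddy is 8 ^ (1 << 1) = 10; if
 * frame 10 is free, has PG_private set and page_order() == 1, the pair
 * merges into the order-2 block starting at 8 & ~(1 << 1) = 8.  The loop
 * then retries at order 2 with buddy 8 ^ (1 << 2) = 12, and so on up to
 * MAX_ORDER-1.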
275 * 276 * -- wli 277 */ 278 279static inline void __free_pages_bulk (struct page *page, 280 struct zone *zone, unsigned int order) 281{ 282 unsigned long page_idx; 283 int order_size = 1 << order; 284 285 if (unlikely(order)) 286 destroy_compound_page(page, order); 287 288 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 289 290 BUG_ON(page_idx & (order_size - 1)); 291 BUG_ON(bad_range(zone, page)); 292 293 zone->free_pages += order_size; 294 while (order < MAX_ORDER-1) { 295 unsigned long combined_idx; 296 struct free_area *area; 297 struct page *buddy; 298 299 combined_idx = __find_combined_index(page_idx, order); 300 buddy = __page_find_buddy(page, page_idx, order); 301 302 if (bad_range(zone, buddy)) 303 break; 304 if (!page_is_buddy(buddy, order)) 305 break; /* Move the buddy up one level. */ 306 list_del(&buddy->lru); 307 area = zone->free_area + order; 308 area->nr_free--; 309 rmv_page_order(buddy); 310 page = page + (combined_idx - page_idx); 311 page_idx = combined_idx; 312 order++; 313 } 314 set_page_order(page, order); 315 list_add(&page->lru, &zone->free_area[order].free_list); 316 zone->free_area[order].nr_free++; 317} 318 319static inline void free_pages_check(const char *function, struct page *page) 320{ 321 if ( page_mapcount(page) || 322 page->mapping != NULL || 323 page_count(page) != 0 || 324 (page->flags & ( 325 1 << PG_lru | 326 1 << PG_private | 327 1 << PG_locked | 328 1 << PG_active | 329 1 << PG_reclaim | 330 1 << PG_slab | 331 1 << PG_swapcache | 332 1 << PG_writeback ))) 333 bad_page(function, page); 334 if (PageDirty(page)) 335 ClearPageDirty(page); 336} 337 338/* 339 * Frees a list of pages. 340 * Assumes all pages on list are in same zone, and of same order. 341 * count is the number of pages to free, or 0 for all on the list. 342 * 343 * If the zone was previously in an "all pages pinned" state then look to 344 * see if this freeing clears that state. 345 * 346 * And clear the zone's pages_scanned counter, to hold off the "all pages are 347 * pinned" detection logic. 348 */ 349static int 350free_pages_bulk(struct zone *zone, int count, 351 struct list_head *list, unsigned int order) 352{ 353 unsigned long flags; 354 struct page *page = NULL; 355 int ret = 0; 356 357 spin_lock_irqsave(&zone->lock, flags); 358 zone->all_unreclaimable = 0; 359 zone->pages_scanned = 0; 360 while (!list_empty(list) && count--) { 361 page = list_entry(list->prev, struct page, lru); 362 /* have to delete it as __free_pages_bulk list manipulates */ 363 list_del(&page->lru); 364 __free_pages_bulk(page, zone, order); 365 ret++; 366 } 367 spin_unlock_irqrestore(&zone->lock, flags); 368 return ret; 369} 370 371void __free_pages_ok(struct page *page, unsigned int order) 372{ 373 LIST_HEAD(list); 374 int i; 375 376 arch_free_page(page, order); 377 378 mod_page_state(pgfree, 1 << order); 379 380#ifndef CONFIG_MMU 381 if (order > 0) 382 for (i = 1 ; i < (1 << order) ; ++i) 383 __put_page(page + i); 384#endif 385 386 for (i = 0 ; i < (1 << order) ; ++i) 387 free_pages_check(__FUNCTION__, page + i); 388 list_add(&page->lru, &list); 389 kernel_map_pages(page, 1<<order, 0); 390 free_pages_bulk(page_zone(page), 1, &list, order); 391} 392 393 394/* 395 * The order of subdivision here is critical for the IO subsystem. 396 * Please do not alter this order without good reasons and regression 397 * testing. Specifically, as large blocks of memory are subdivided, 398 * the order in which smaller blocks are delivered depends on the order 399 * they're subdivided in this function. 
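 *
 * Illustrative example (editor's addition): if an order-0 page is
 * requested and only an order-3 block is free, __rmqueue() takes the
 * order-3 block and expand() gives back the unused halves -- the upper
 * order-2 half, then an order-1 quarter, then a single order-0 page go
 * onto their respective free lists, and the remaining order-0 page at
 * the start of the original block is returned to the caller.
 *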
This is the primary factor 400 * influencing the order in which pages are delivered to the IO 401 * subsystem according to empirical testing, and this is also justified 402 * by considering the behavior of a buddy system containing a single 403 * large block of memory acted on by a series of small allocations. 404 * This behavior is a critical factor in sglist merging's success. 405 * 406 * -- wli 407 */ 408static inline struct page * 409expand(struct zone *zone, struct page *page, 410 int low, int high, struct free_area *area) 411{ 412 unsigned long size = 1 << high; 413 414 while (high > low) { 415 area--; 416 high--; 417 size >>= 1; 418 BUG_ON(bad_range(zone, &page[size])); 419 list_add(&page[size].lru, &area->free_list); 420 area->nr_free++; 421 set_page_order(&page[size], high); 422 } 423 return page; 424} 425 426void set_page_refs(struct page *page, int order) 427{ 428#ifdef CONFIG_MMU 429 set_page_count(page, 1); 430#else 431 int i; 432 433 /* 434 * We need to reference all the pages for this order, otherwise if 435 * anyone accesses one of the pages with (get/put) it will be freed. 436 * - eg: access_process_vm() 437 */ 438 for (i = 0; i < (1 << order); i++) 439 set_page_count(page + i, 1); 440#endif /* CONFIG_MMU */ 441} 442 443/* 444 * This page is about to be returned from the page allocator 445 */ 446static void prep_new_page(struct page *page, int order) 447{ 448 if (page->mapping || page_mapcount(page) || 449 (page->flags & ( 450 1 << PG_private | 451 1 << PG_locked | 452 1 << PG_lru | 453 1 << PG_active | 454 1 << PG_dirty | 455 1 << PG_reclaim | 456 1 << PG_swapcache | 457 1 << PG_writeback ))) 458 bad_page(__FUNCTION__, page); 459 460 page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 461 1 << PG_referenced | 1 << PG_arch_1 | 462 1 << PG_checked | 1 << PG_mappedtodisk); 463 page->private = 0; 464 set_page_refs(page, order); 465 kernel_map_pages(page, 1 << order, 1); 466} 467 468/* 469 * Do the hard work of removing an element from the buddy allocator. 470 * Call me with the zone->lock already held. 471 */ 472static struct page *__rmqueue(struct zone *zone, unsigned int order) 473{ 474 struct free_area * area; 475 unsigned int current_order; 476 struct page *page; 477 478 for (current_order = order; current_order < MAX_ORDER; ++current_order) { 479 area = zone->free_area + current_order; 480 if (list_empty(&area->free_list)) 481 continue; 482 483 page = list_entry(area->free_list.next, struct page, lru); 484 list_del(&page->lru); 485 rmv_page_order(page); 486 area->nr_free--; 487 zone->free_pages -= 1UL << order; 488 return expand(zone, page, order, current_order, area); 489 } 490 491 return NULL; 492} 493 494/* 495 * Obtain a specified number of elements from the buddy allocator, all under 496 * a single hold of the lock, for efficiency. Add them to the supplied list. 497 * Returns the number of new pages which were placed at *list. 
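 *
 * For example (editor's illustration, not in the original comment), the
 * per-cpu hot/cold lists below refill themselves with
 * rmqueue_bulk(zone, 0, pcp->batch, &pcp->list), grabbing pcp->batch
 * order-0 pages under a single acquisition of zone->lock.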
498 */ 499static int rmqueue_bulk(struct zone *zone, unsigned int order, 500 unsigned long count, struct list_head *list) 501{ 502 unsigned long flags; 503 int i; 504 int allocated = 0; 505 struct page *page; 506 507 spin_lock_irqsave(&zone->lock, flags); 508 for (i = 0; i < count; ++i) { 509 page = __rmqueue(zone, order); 510 if (page == NULL) 511 break; 512 allocated++; 513 list_add_tail(&page->lru, list); 514 } 515 spin_unlock_irqrestore(&zone->lock, flags); 516 return allocated; 517} 518 519#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) 520static void __drain_pages(unsigned int cpu) 521{ 522 struct zone *zone; 523 int i; 524 525 for_each_zone(zone) { 526 struct per_cpu_pageset *pset; 527 528 pset = zone_pcp(zone, cpu); 529 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { 530 struct per_cpu_pages *pcp; 531 532 pcp = &pset->pcp[i]; 533 pcp->count -= free_pages_bulk(zone, pcp->count, 534 &pcp->list, 0); 535 } 536 } 537} 538#endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */ 539 540#ifdef CONFIG_PM 541 542void mark_free_pages(struct zone *zone) 543{ 544 unsigned long zone_pfn, flags; 545 int order; 546 struct list_head *curr; 547 548 if (!zone->spanned_pages) 549 return; 550 551 spin_lock_irqsave(&zone->lock, flags); 552 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) 553 ClearPageNosaveFree(pfn_to_page(zone_pfn + zone->zone_start_pfn)); 554 555 for (order = MAX_ORDER - 1; order >= 0; --order) 556 list_for_each(curr, &zone->free_area[order].free_list) { 557 unsigned long start_pfn, i; 558 559 start_pfn = page_to_pfn(list_entry(curr, struct page, lru)); 560 561 for (i=0; i < (1<<order); i++) 562 SetPageNosaveFree(pfn_to_page(start_pfn+i)); 563 } 564 spin_unlock_irqrestore(&zone->lock, flags); 565} 566 567/* 568 * Spill all of this CPU's per-cpu pages back into the buddy allocator. 
569 */ 570void drain_local_pages(void) 571{ 572 unsigned long flags; 573 574 local_irq_save(flags); 575 __drain_pages(smp_processor_id()); 576 local_irq_restore(flags); 577} 578#endif /* CONFIG_PM */ 579 580static void zone_statistics(struct zonelist *zonelist, struct zone *z) 581{ 582#ifdef CONFIG_NUMA 583 unsigned long flags; 584 int cpu; 585 pg_data_t *pg = z->zone_pgdat; 586 pg_data_t *orig = zonelist->zones[0]->zone_pgdat; 587 struct per_cpu_pageset *p; 588 589 local_irq_save(flags); 590 cpu = smp_processor_id(); 591 p = zone_pcp(z,cpu); 592 if (pg == orig) { 593 p->numa_hit++; 594 } else { 595 p->numa_miss++; 596 zone_pcp(zonelist->zones[0], cpu)->numa_foreign++; 597 } 598 if (pg == NODE_DATA(numa_node_id())) 599 p->local_node++; 600 else 601 p->other_node++; 602 local_irq_restore(flags); 603#endif 604} 605 606/* 607 * Free a 0-order page 608 */ 609static void FASTCALL(free_hot_cold_page(struct page *page, int cold)); 610static void fastcall free_hot_cold_page(struct page *page, int cold) 611{ 612 struct zone *zone = page_zone(page); 613 struct per_cpu_pages *pcp; 614 unsigned long flags; 615 616 arch_free_page(page, 0); 617 618 kernel_map_pages(page, 1, 0); 619 inc_page_state(pgfree); 620 if (PageAnon(page)) 621 page->mapping = NULL; 622 free_pages_check(__FUNCTION__, page); 623 pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; 624 local_irq_save(flags); 625 if (pcp->count >= pcp->high) 626 pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); 627 list_add(&page->lru, &pcp->list); 628 pcp->count++; 629 local_irq_restore(flags); 630 put_cpu(); 631} 632 633void fastcall free_hot_page(struct page *page) 634{ 635 free_hot_cold_page(page, 0); 636} 637 638void fastcall free_cold_page(struct page *page) 639{ 640 free_hot_cold_page(page, 1); 641} 642 643static inline void prep_zero_page(struct page *page, int order, unsigned int __nocast gfp_flags) 644{ 645 int i; 646 647 BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); 648 for(i = 0; i < (1 << order); i++) 649 clear_highpage(page + i); 650} 651 652/* 653 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But 654 * we cheat by calling it from here, in the order > 0 path. Saves a branch 655 * or two. 656 */ 657static struct page * 658buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags) 659{ 660 unsigned long flags; 661 struct page *page = NULL; 662 int cold = !!(gfp_flags & __GFP_COLD); 663 664 if (order == 0) { 665 struct per_cpu_pages *pcp; 666 667 pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; 668 local_irq_save(flags); 669 if (pcp->count <= pcp->low) 670 pcp->count += rmqueue_bulk(zone, 0, 671 pcp->batch, &pcp->list); 672 if (pcp->count) { 673 page = list_entry(pcp->list.next, struct page, lru); 674 list_del(&page->lru); 675 pcp->count--; 676 } 677 local_irq_restore(flags); 678 put_cpu(); 679 } 680 681 if (page == NULL) { 682 spin_lock_irqsave(&zone->lock, flags); 683 page = __rmqueue(zone, order); 684 spin_unlock_irqrestore(&zone->lock, flags); 685 } 686 687 if (page != NULL) { 688 BUG_ON(bad_range(zone, page)); 689 mod_page_state_zone(zone, pgalloc, 1 << order); 690 prep_new_page(page, order); 691 692 if (gfp_flags & __GFP_ZERO) 693 prep_zero_page(page, order, gfp_flags); 694 695 if (order && (gfp_flags & __GFP_COMP)) 696 prep_compound_page(page, order); 697 } 698 return page; 699} 700 701/* 702 * Return 1 if free pages are above 'mark'. This takes into account the order 703 * of the allocation. 
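 *
 * Worked example (editor's addition): with mark = 64, a zero
 * lowmem_reserve for the class zone, and an order-2 request, the zone
 * must have more than 64 free pages overall (the 4 requested pages are
 * already discounted), then more than 32 free pages once order-0 pages
 * are excluded, then more than 16 once order-0 and order-1 pages are
 * excluded.  gfp_high halves the mark and can_try_harder lowers it by a
 * further quarter before these checks.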
704 */ 705int zone_watermark_ok(struct zone *z, int order, unsigned long mark, 706 int classzone_idx, int can_try_harder, int gfp_high) 707{ 708 /* free_pages my go negative - that's OK */ 709 long min = mark, free_pages = z->free_pages - (1 << order) + 1; 710 int o; 711 712 if (gfp_high) 713 min -= min / 2; 714 if (can_try_harder) 715 min -= min / 4; 716 717 if (free_pages <= min + z->lowmem_reserve[classzone_idx]) 718 return 0; 719 for (o = 0; o < order; o++) { 720 /* At the next order, this order's pages become unavailable */ 721 free_pages -= z->free_area[o].nr_free << o; 722 723 /* Require fewer higher order pages to be free */ 724 min >>= 1; 725 726 if (free_pages <= min) 727 return 0; 728 } 729 return 1; 730} 731 732static inline int 733should_reclaim_zone(struct zone *z, unsigned int gfp_mask) 734{ 735 if (!z->reclaim_pages) 736 return 0; 737 if (gfp_mask & __GFP_NORECLAIM) 738 return 0; 739 return 1; 740} 741 742/* 743 * This is the 'heart' of the zoned buddy allocator. 744 */ 745struct page * fastcall 746__alloc_pages(unsigned int __nocast gfp_mask, unsigned int order, 747 struct zonelist *zonelist) 748{ 749 const int wait = gfp_mask & __GFP_WAIT; 750 struct zone **zones, *z; 751 struct page *page; 752 struct reclaim_state reclaim_state; 753 struct task_struct *p = current; 754 int i; 755 int classzone_idx; 756 int do_retry; 757 int can_try_harder; 758 int did_some_progress; 759 760 might_sleep_if(wait); 761 762 /* 763 * The caller may dip into page reserves a bit more if the caller 764 * cannot run direct reclaim, or is the caller has realtime scheduling 765 * policy 766 */ 767 can_try_harder = (unlikely(rt_task(p)) && !in_interrupt()) || !wait; 768 769 zones = zonelist->zones; /* the list of zones suitable for gfp_mask */ 770 771 if (unlikely(zones[0] == NULL)) { 772 /* Should this ever happen?? */ 773 return NULL; 774 } 775 776 classzone_idx = zone_idx(zones[0]); 777 778restart: 779 /* Go through the zonelist once, looking for a zone with enough free */ 780 for (i = 0; (z = zones[i]) != NULL; i++) { 781 int do_reclaim = should_reclaim_zone(z, gfp_mask); 782 783 if (!cpuset_zone_allowed(z)) 784 continue; 785 786 /* 787 * If the zone is to attempt early page reclaim then this loop 788 * will try to reclaim pages and check the watermark a second 789 * time before giving up and falling back to the next zone. 790 */ 791zone_reclaim_retry: 792 if (!zone_watermark_ok(z, order, z->pages_low, 793 classzone_idx, 0, 0)) { 794 if (!do_reclaim) 795 continue; 796 else { 797 zone_reclaim(z, gfp_mask, order); 798 /* Only try reclaim once */ 799 do_reclaim = 0; 800 goto zone_reclaim_retry; 801 } 802 } 803 804 page = buffered_rmqueue(z, order, gfp_mask); 805 if (page) 806 goto got_pg; 807 } 808 809 for (i = 0; (z = zones[i]) != NULL; i++) 810 wakeup_kswapd(z, order); 811 812 /* 813 * Go through the zonelist again. Let __GFP_HIGH and allocations 814 * coming from realtime tasks to go deeper into reserves 815 * 816 * This is the last chance, in general, before the goto nopage. 817 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. 818 */ 819 for (i = 0; (z = zones[i]) != NULL; i++) { 820 if (!zone_watermark_ok(z, order, z->pages_min, 821 classzone_idx, can_try_harder, 822 gfp_mask & __GFP_HIGH)) 823 continue; 824 825 if (wait && !cpuset_zone_allowed(z)) 826 continue; 827 828 page = buffered_rmqueue(z, order, gfp_mask); 829 if (page) 830 goto got_pg; 831 } 832 833 /* This allocation should allow future memory freeing. 
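	 * (Editor's summary of the surrounding fallback logic, for
	 * clarity:) by this point the zonelist has been scanned twice --
	 * once against pages_low, then, after waking kswapd, against
	 * pages_min with the __GFP_HIGH / can_try_harder boosts.  The
	 * code below lets PF_MEMALLOC or TIF_MEMDIE callers ignore the
	 * watermarks entirely, sends !__GFP_WAIT callers to nopage, and
	 * otherwise falls into direct reclaim and, failing that, the OOM
	 * killer before retrying.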
*/ 834 835 if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) 836 && !in_interrupt()) { 837 if (!(gfp_mask & __GFP_NOMEMALLOC)) { 838 /* go through the zonelist yet again, ignoring mins */ 839 for (i = 0; (z = zones[i]) != NULL; i++) { 840 if (!cpuset_zone_allowed(z)) 841 continue; 842 page = buffered_rmqueue(z, order, gfp_mask); 843 if (page) 844 goto got_pg; 845 } 846 } 847 goto nopage; 848 } 849 850 /* Atomic allocations - we can't balance anything */ 851 if (!wait) 852 goto nopage; 853 854rebalance: 855 cond_resched(); 856 857 /* We now go into synchronous reclaim */ 858 p->flags |= PF_MEMALLOC; 859 reclaim_state.reclaimed_slab = 0; 860 p->reclaim_state = &reclaim_state; 861 862 did_some_progress = try_to_free_pages(zones, gfp_mask); 863 864 p->reclaim_state = NULL; 865 p->flags &= ~PF_MEMALLOC; 866 867 cond_resched(); 868 869 if (likely(did_some_progress)) { 870 /* 871 * Go through the zonelist yet one more time, keep 872 * very high watermark here, this is only to catch 873 * a parallel oom killing, we must fail if we're still 874 * under heavy pressure. 875 */ 876 for (i = 0; (z = zones[i]) != NULL; i++) { 877 if (!zone_watermark_ok(z, order, z->pages_min, 878 classzone_idx, can_try_harder, 879 gfp_mask & __GFP_HIGH)) 880 continue; 881 882 if (!cpuset_zone_allowed(z)) 883 continue; 884 885 page = buffered_rmqueue(z, order, gfp_mask); 886 if (page) 887 goto got_pg; 888 } 889 } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { 890 /* 891 * Go through the zonelist yet one more time, keep 892 * very high watermark here, this is only to catch 893 * a parallel oom killing, we must fail if we're still 894 * under heavy pressure. 895 */ 896 for (i = 0; (z = zones[i]) != NULL; i++) { 897 if (!zone_watermark_ok(z, order, z->pages_high, 898 classzone_idx, 0, 0)) 899 continue; 900 901 if (!cpuset_zone_allowed(z)) 902 continue; 903 904 page = buffered_rmqueue(z, order, gfp_mask); 905 if (page) 906 goto got_pg; 907 } 908 909 out_of_memory(gfp_mask); 910 goto restart; 911 } 912 913 /* 914 * Don't let big-order allocations loop unless the caller explicitly 915 * requests that. Wait for some write requests to complete then retry. 916 * 917 * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order 918 * <= 3, but that may not be true in other implementations. 919 */ 920 do_retry = 0; 921 if (!(gfp_mask & __GFP_NORETRY)) { 922 if ((order <= 3) || (gfp_mask & __GFP_REPEAT)) 923 do_retry = 1; 924 if (gfp_mask & __GFP_NOFAIL) 925 do_retry = 1; 926 } 927 if (do_retry) { 928 blk_congestion_wait(WRITE, HZ/50); 929 goto rebalance; 930 } 931 932nopage: 933 if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { 934 printk(KERN_WARNING "%s: page allocation failure." 935 " order:%d, mode:0x%x\n", 936 p->comm, order, gfp_mask); 937 dump_stack(); 938 } 939 return NULL; 940got_pg: 941 zone_statistics(zonelist, z); 942 return page; 943} 944 945EXPORT_SYMBOL(__alloc_pages); 946 947/* 948 * Common helper functions. 
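 *
 * Typical usage (editor's example, not from the original source):
 *
 *	unsigned long buf = __get_free_pages(GFP_KERNEL, 1);
 *	if (buf) {
 *		... use the two pages at 'buf' ...
 *		free_pages(buf, 1);
 *	}
 *
 * get_zeroed_page(GFP_KERNEL) is the order-0, pre-zeroed variant.  None
 * of these helpers may be used with __GFP_HIGHMEM, since they hand back
 * kernel virtual addresses rather than struct pages.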
949 */ 950fastcall unsigned long __get_free_pages(unsigned int __nocast gfp_mask, unsigned int order) 951{ 952 struct page * page; 953 page = alloc_pages(gfp_mask, order); 954 if (!page) 955 return 0; 956 return (unsigned long) page_address(page); 957} 958 959EXPORT_SYMBOL(__get_free_pages); 960 961fastcall unsigned long get_zeroed_page(unsigned int __nocast gfp_mask) 962{ 963 struct page * page; 964 965 /* 966 * get_zeroed_page() returns a 32-bit address, which cannot represent 967 * a highmem page 968 */ 969 BUG_ON(gfp_mask & __GFP_HIGHMEM); 970 971 page = alloc_pages(gfp_mask | __GFP_ZERO, 0); 972 if (page) 973 return (unsigned long) page_address(page); 974 return 0; 975} 976 977EXPORT_SYMBOL(get_zeroed_page); 978 979void __pagevec_free(struct pagevec *pvec) 980{ 981 int i = pagevec_count(pvec); 982 983 while (--i >= 0) 984 free_hot_cold_page(pvec->pages[i], pvec->cold); 985} 986 987fastcall void __free_pages(struct page *page, unsigned int order) 988{ 989 if (!PageReserved(page) && put_page_testzero(page)) { 990 if (order == 0) 991 free_hot_page(page); 992 else 993 __free_pages_ok(page, order); 994 } 995} 996 997EXPORT_SYMBOL(__free_pages); 998 999fastcall void free_pages(unsigned long addr, unsigned int order) 1000{ 1001 if (addr != 0) { 1002 BUG_ON(!virt_addr_valid((void *)addr)); 1003 __free_pages(virt_to_page((void *)addr), order); 1004 } 1005} 1006 1007EXPORT_SYMBOL(free_pages); 1008 1009/* 1010 * Total amount of free (allocatable) RAM: 1011 */ 1012unsigned int nr_free_pages(void) 1013{ 1014 unsigned int sum = 0; 1015 struct zone *zone; 1016 1017 for_each_zone(zone) 1018 sum += zone->free_pages; 1019 1020 return sum; 1021} 1022 1023EXPORT_SYMBOL(nr_free_pages); 1024 1025#ifdef CONFIG_NUMA 1026unsigned int nr_free_pages_pgdat(pg_data_t *pgdat) 1027{ 1028 unsigned int i, sum = 0; 1029 1030 for (i = 0; i < MAX_NR_ZONES; i++) 1031 sum += pgdat->node_zones[i].free_pages; 1032 1033 return sum; 1034} 1035#endif 1036 1037static unsigned int nr_free_zone_pages(int offset) 1038{ 1039 pg_data_t *pgdat; 1040 unsigned int sum = 0; 1041 1042 for_each_pgdat(pgdat) { 1043 struct zonelist *zonelist = pgdat->node_zonelists + offset; 1044 struct zone **zonep = zonelist->zones; 1045 struct zone *zone; 1046 1047 for (zone = *zonep++; zone; zone = *zonep++) { 1048 unsigned long size = zone->present_pages; 1049 unsigned long high = zone->pages_high; 1050 if (size > high) 1051 sum += size - high; 1052 } 1053 } 1054 1055 return sum; 1056} 1057 1058/* 1059 * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL 1060 */ 1061unsigned int nr_free_buffer_pages(void) 1062{ 1063 return nr_free_zone_pages(GFP_USER & GFP_ZONEMASK); 1064} 1065 1066/* 1067 * Amount of free RAM allocatable within all zones 1068 */ 1069unsigned int nr_free_pagecache_pages(void) 1070{ 1071 return nr_free_zone_pages(GFP_HIGHUSER & GFP_ZONEMASK); 1072} 1073 1074#ifdef CONFIG_HIGHMEM 1075unsigned int nr_free_highpages (void) 1076{ 1077 pg_data_t *pgdat; 1078 unsigned int pages = 0; 1079 1080 for_each_pgdat(pgdat) 1081 pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages; 1082 1083 return pages; 1084} 1085#endif 1086 1087#ifdef CONFIG_NUMA 1088static void show_node(struct zone *zone) 1089{ 1090 printk("Node %d ", zone->zone_pgdat->node_id); 1091} 1092#else 1093#define show_node(zone) do { } while (0) 1094#endif 1095 1096/* 1097 * Accumulate the page_state information across all CPUs. 1098 * The result is unavoidably approximate - it can change 1099 * during and after execution of this function. 
1100 */ 1101static DEFINE_PER_CPU(struct page_state, page_states) = {0}; 1102 1103atomic_t nr_pagecache = ATOMIC_INIT(0); 1104EXPORT_SYMBOL(nr_pagecache); 1105#ifdef CONFIG_SMP 1106DEFINE_PER_CPU(long, nr_pagecache_local) = 0; 1107#endif 1108 1109void __get_page_state(struct page_state *ret, int nr) 1110{ 1111 int cpu = 0; 1112 1113 memset(ret, 0, sizeof(*ret)); 1114 1115 cpu = first_cpu(cpu_online_map); 1116 while (cpu < NR_CPUS) { 1117 unsigned long *in, *out, off; 1118 1119 in = (unsigned long *)&per_cpu(page_states, cpu); 1120 1121 cpu = next_cpu(cpu, cpu_online_map); 1122 1123 if (cpu < NR_CPUS) 1124 prefetch(&per_cpu(page_states, cpu)); 1125 1126 out = (unsigned long *)ret; 1127 for (off = 0; off < nr; off++) 1128 *out++ += *in++; 1129 } 1130} 1131 1132void get_page_state(struct page_state *ret) 1133{ 1134 int nr; 1135 1136 nr = offsetof(struct page_state, GET_PAGE_STATE_LAST); 1137 nr /= sizeof(unsigned long); 1138 1139 __get_page_state(ret, nr + 1); 1140} 1141 1142void get_full_page_state(struct page_state *ret) 1143{ 1144 __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long)); 1145} 1146 1147unsigned long __read_page_state(unsigned long offset) 1148{ 1149 unsigned long ret = 0; 1150 int cpu; 1151 1152 for_each_online_cpu(cpu) { 1153 unsigned long in; 1154 1155 in = (unsigned long)&per_cpu(page_states, cpu) + offset; 1156 ret += *((unsigned long *)in); 1157 } 1158 return ret; 1159} 1160 1161void __mod_page_state(unsigned long offset, unsigned long delta) 1162{ 1163 unsigned long flags; 1164 void* ptr; 1165 1166 local_irq_save(flags); 1167 ptr = &__get_cpu_var(page_states); 1168 *(unsigned long*)(ptr + offset) += delta; 1169 local_irq_restore(flags); 1170} 1171 1172EXPORT_SYMBOL(__mod_page_state); 1173 1174void __get_zone_counts(unsigned long *active, unsigned long *inactive, 1175 unsigned long *free, struct pglist_data *pgdat) 1176{ 1177 struct zone *zones = pgdat->node_zones; 1178 int i; 1179 1180 *active = 0; 1181 *inactive = 0; 1182 *free = 0; 1183 for (i = 0; i < MAX_NR_ZONES; i++) { 1184 *active += zones[i].nr_active; 1185 *inactive += zones[i].nr_inactive; 1186 *free += zones[i].free_pages; 1187 } 1188} 1189 1190void get_zone_counts(unsigned long *active, 1191 unsigned long *inactive, unsigned long *free) 1192{ 1193 struct pglist_data *pgdat; 1194 1195 *active = 0; 1196 *inactive = 0; 1197 *free = 0; 1198 for_each_pgdat(pgdat) { 1199 unsigned long l, m, n; 1200 __get_zone_counts(&l, &m, &n, pgdat); 1201 *active += l; 1202 *inactive += m; 1203 *free += n; 1204 } 1205} 1206 1207void si_meminfo(struct sysinfo *val) 1208{ 1209 val->totalram = totalram_pages; 1210 val->sharedram = 0; 1211 val->freeram = nr_free_pages(); 1212 val->bufferram = nr_blockdev_pages(); 1213#ifdef CONFIG_HIGHMEM 1214 val->totalhigh = totalhigh_pages; 1215 val->freehigh = nr_free_highpages(); 1216#else 1217 val->totalhigh = 0; 1218 val->freehigh = 0; 1219#endif 1220 val->mem_unit = PAGE_SIZE; 1221} 1222 1223EXPORT_SYMBOL(si_meminfo); 1224 1225#ifdef CONFIG_NUMA 1226void si_meminfo_node(struct sysinfo *val, int nid) 1227{ 1228 pg_data_t *pgdat = NODE_DATA(nid); 1229 1230 val->totalram = pgdat->node_present_pages; 1231 val->freeram = nr_free_pages_pgdat(pgdat); 1232 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; 1233 val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages; 1234 val->mem_unit = PAGE_SIZE; 1235} 1236#endif 1237 1238#define K(x) ((x) << (PAGE_SHIFT-10)) 1239 1240/* 1241 * Show free area list (used inside shift_scroll-lock stuff) 1242 * We also calculate the percentage 
fragmentation. We do this by counting the 1243 * memory on each free list with the exception of the first item on the list. 1244 */ 1245void show_free_areas(void) 1246{ 1247 struct page_state ps; 1248 int cpu, temperature; 1249 unsigned long active; 1250 unsigned long inactive; 1251 unsigned long free; 1252 struct zone *zone; 1253 1254 for_each_zone(zone) { 1255 show_node(zone); 1256 printk("%s per-cpu:", zone->name); 1257 1258 if (!zone->present_pages) { 1259 printk(" empty\n"); 1260 continue; 1261 } else 1262 printk("\n"); 1263 1264 for (cpu = 0; cpu < NR_CPUS; ++cpu) { 1265 struct per_cpu_pageset *pageset; 1266 1267 if (!cpu_possible(cpu)) 1268 continue; 1269 1270 pageset = zone_pcp(zone, cpu); 1271 1272 for (temperature = 0; temperature < 2; temperature++) 1273 printk("cpu %d %s: low %d, high %d, batch %d\n", 1274 cpu, 1275 temperature ? "cold" : "hot", 1276 pageset->pcp[temperature].low, 1277 pageset->pcp[temperature].high, 1278 pageset->pcp[temperature].batch); 1279 } 1280 } 1281 1282 get_page_state(&ps); 1283 get_zone_counts(&active, &inactive, &free); 1284 1285 printk("\nFree pages: %11ukB (%ukB HighMem)\n", 1286 K(nr_free_pages()), 1287 K(nr_free_highpages())); 1288 1289 printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu " 1290 "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n", 1291 active, 1292 inactive, 1293 ps.nr_dirty, 1294 ps.nr_writeback, 1295 ps.nr_unstable, 1296 nr_free_pages(), 1297 ps.nr_slab, 1298 ps.nr_mapped, 1299 ps.nr_page_table_pages); 1300 1301 for_each_zone(zone) { 1302 int i; 1303 1304 show_node(zone); 1305 printk("%s" 1306 " free:%lukB" 1307 " min:%lukB" 1308 " low:%lukB" 1309 " high:%lukB" 1310 " active:%lukB" 1311 " inactive:%lukB" 1312 " present:%lukB" 1313 " pages_scanned:%lu" 1314 " all_unreclaimable? %s" 1315 "\n", 1316 zone->name, 1317 K(zone->free_pages), 1318 K(zone->pages_min), 1319 K(zone->pages_low), 1320 K(zone->pages_high), 1321 K(zone->nr_active), 1322 K(zone->nr_inactive), 1323 K(zone->present_pages), 1324 zone->pages_scanned, 1325 (zone->all_unreclaimable ? "yes" : "no") 1326 ); 1327 printk("lowmem_reserve[]:"); 1328 for (i = 0; i < MAX_NR_ZONES; i++) 1329 printk(" %lu", zone->lowmem_reserve[i]); 1330 printk("\n"); 1331 } 1332 1333 for_each_zone(zone) { 1334 unsigned long nr, flags, order, total = 0; 1335 1336 show_node(zone); 1337 printk("%s: ", zone->name); 1338 if (!zone->present_pages) { 1339 printk("empty\n"); 1340 continue; 1341 } 1342 1343 spin_lock_irqsave(&zone->lock, flags); 1344 for (order = 0; order < MAX_ORDER; order++) { 1345 nr = zone->free_area[order].nr_free; 1346 total += nr << order; 1347 printk("%lu*%lukB ", nr, K(1UL) << order); 1348 } 1349 spin_unlock_irqrestore(&zone->lock, flags); 1350 printk("= %lukB\n", K(total)); 1351 } 1352 1353 show_swap_cache_info(); 1354} 1355 1356/* 1357 * Builds allocation fallback zone lists. 
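 *
 * For instance (editor's note), on a single node with all three zones
 * populated, the fallback list built for a __GFP_HIGHMEM allocation is
 * HighMem -> Normal -> DMA, a plain kernel allocation gets
 * Normal -> DMA, and a __GFP_DMA allocation gets DMA only; the switch
 * below achieves this by deliberately falling through from the
 * requested zone type to the smaller zones beneath it.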
1358 */ 1359static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k) 1360{ 1361 switch (k) { 1362 struct zone *zone; 1363 default: 1364 BUG(); 1365 case ZONE_HIGHMEM: 1366 zone = pgdat->node_zones + ZONE_HIGHMEM; 1367 if (zone->present_pages) { 1368#ifndef CONFIG_HIGHMEM 1369 BUG(); 1370#endif 1371 zonelist->zones[j++] = zone; 1372 } 1373 case ZONE_NORMAL: 1374 zone = pgdat->node_zones + ZONE_NORMAL; 1375 if (zone->present_pages) 1376 zonelist->zones[j++] = zone; 1377 case ZONE_DMA: 1378 zone = pgdat->node_zones + ZONE_DMA; 1379 if (zone->present_pages) 1380 zonelist->zones[j++] = zone; 1381 } 1382 1383 return j; 1384} 1385 1386#ifdef CONFIG_NUMA 1387#define MAX_NODE_LOAD (num_online_nodes()) 1388static int __initdata node_load[MAX_NUMNODES]; 1389/** 1390 * find_next_best_node - find the next node that should appear in a given node's fallback list 1391 * @node: node whose fallback list we're appending 1392 * @used_node_mask: nodemask_t of already used nodes 1393 * 1394 * We use a number of factors to determine which is the next node that should 1395 * appear on a given node's fallback list. The node should not have appeared 1396 * already in @node's fallback list, and it should be the next closest node 1397 * according to the distance array (which contains arbitrary distance values 1398 * from each node to each node in the system), and should also prefer nodes 1399 * with no CPUs, since presumably they'll have very little allocation pressure 1400 * on them otherwise. 1401 * It returns -1 if no node is found. 1402 */ 1403static int __init find_next_best_node(int node, nodemask_t *used_node_mask) 1404{ 1405 int i, n, val; 1406 int min_val = INT_MAX; 1407 int best_node = -1; 1408 1409 for_each_online_node(i) { 1410 cpumask_t tmp; 1411 1412 /* Start from local node */ 1413 n = (node+i) % num_online_nodes(); 1414 1415 /* Don't want a node to appear more than once */ 1416 if (node_isset(n, *used_node_mask)) 1417 continue; 1418 1419 /* Use the local node if we haven't already */ 1420 if (!node_isset(node, *used_node_mask)) { 1421 best_node = node; 1422 break; 1423 } 1424 1425 /* Use the distance array to find the distance */ 1426 val = node_distance(node, n); 1427 1428 /* Give preference to headless and unused nodes */ 1429 tmp = node_to_cpumask(n); 1430 if (!cpus_empty(tmp)) 1431 val += PENALTY_FOR_NODE_WITH_CPUS; 1432 1433 /* Slight preference for less loaded node */ 1434 val *= (MAX_NODE_LOAD*MAX_NUMNODES); 1435 val += node_load[n]; 1436 1437 if (val < min_val) { 1438 min_val = val; 1439 best_node = n; 1440 } 1441 } 1442 1443 if (best_node >= 0) 1444 node_set(best_node, *used_node_mask); 1445 1446 return best_node; 1447} 1448 1449static void __init build_zonelists(pg_data_t *pgdat) 1450{ 1451 int i, j, k, node, local_node; 1452 int prev_node, load; 1453 struct zonelist *zonelist; 1454 nodemask_t used_mask; 1455 1456 /* initialize zonelists */ 1457 for (i = 0; i < GFP_ZONETYPES; i++) { 1458 zonelist = pgdat->node_zonelists + i; 1459 zonelist->zones[0] = NULL; 1460 } 1461 1462 /* NUMA-aware ordering of nodes */ 1463 local_node = pgdat->node_id; 1464 load = num_online_nodes(); 1465 prev_node = local_node; 1466 nodes_clear(used_mask); 1467 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { 1468 /* 1469 * We don't want to pressure a particular node. 1470 * So adding penalty to the first node in same 1471 * distance group to make it round-robin. 
1472 */ 1473 if (node_distance(local_node, node) != 1474 node_distance(local_node, prev_node)) 1475 node_load[node] += load; 1476 prev_node = node; 1477 load--; 1478 for (i = 0; i < GFP_ZONETYPES; i++) { 1479 zonelist = pgdat->node_zonelists + i; 1480 for (j = 0; zonelist->zones[j] != NULL; j++); 1481 1482 k = ZONE_NORMAL; 1483 if (i & __GFP_HIGHMEM) 1484 k = ZONE_HIGHMEM; 1485 if (i & __GFP_DMA) 1486 k = ZONE_DMA; 1487 1488 j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); 1489 zonelist->zones[j] = NULL; 1490 } 1491 } 1492} 1493 1494#else /* CONFIG_NUMA */ 1495 1496static void __init build_zonelists(pg_data_t *pgdat) 1497{ 1498 int i, j, k, node, local_node; 1499 1500 local_node = pgdat->node_id; 1501 for (i = 0; i < GFP_ZONETYPES; i++) { 1502 struct zonelist *zonelist; 1503 1504 zonelist = pgdat->node_zonelists + i; 1505 1506 j = 0; 1507 k = ZONE_NORMAL; 1508 if (i & __GFP_HIGHMEM) 1509 k = ZONE_HIGHMEM; 1510 if (i & __GFP_DMA) 1511 k = ZONE_DMA; 1512 1513 j = build_zonelists_node(pgdat, zonelist, j, k); 1514 /* 1515 * Now we build the zonelist so that it contains the zones 1516 * of all the other nodes. 1517 * We don't want to pressure a particular node, so when 1518 * building the zones for node N, we make sure that the 1519 * zones coming right after the local ones are those from 1520 * node N+1 (modulo N) 1521 */ 1522 for (node = local_node + 1; node < MAX_NUMNODES; node++) { 1523 if (!node_online(node)) 1524 continue; 1525 j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); 1526 } 1527 for (node = 0; node < local_node; node++) { 1528 if (!node_online(node)) 1529 continue; 1530 j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); 1531 } 1532 1533 zonelist->zones[j] = NULL; 1534 } 1535} 1536 1537#endif /* CONFIG_NUMA */ 1538 1539void __init build_all_zonelists(void) 1540{ 1541 int i; 1542 1543 for_each_online_node(i) 1544 build_zonelists(NODE_DATA(i)); 1545 printk("Built %i zonelists\n", num_online_nodes()); 1546 cpuset_init_current_mems_allowed(); 1547} 1548 1549/* 1550 * Helper functions to size the waitqueue hash table. 1551 * Essentially these want to choose hash table sizes sufficiently 1552 * large so that collisions trying to wait on pages are rare. 1553 * But in fact, the number of active page waitqueues on typical 1554 * systems is ridiculously low, less than 200. So this is even 1555 * conservative, even though it seems large. 1556 * 1557 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to 1558 * waitqueues, i.e. the size of the waitq table given the number of pages. 1559 */ 1560#define PAGES_PER_WAITQUEUE 256 1561 1562static inline unsigned long wait_table_size(unsigned long pages) 1563{ 1564 unsigned long size = 1; 1565 1566 pages /= PAGES_PER_WAITQUEUE; 1567 1568 while (size < pages) 1569 size <<= 1; 1570 1571 /* 1572 * Once we have dozens or even hundreds of threads sleeping 1573 * on IO we've got bigger problems than wait queue collision. 1574 * Limit the size of the wait table to a reasonable size. 1575 */ 1576 size = min(size, 4096UL); 1577 1578 return max(size, 4UL); 1579} 1580 1581/* 1582 * This is an integer logarithm so that shifts can be used later 1583 * to extract the more random high bits from the multiplicative 1584 * hash function before the remainder is taken. 
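 *
 * Example (editor's addition): a zone spanning 2^20 pages gets a wait
 * table of 1048576 / PAGES_PER_WAITQUEUE = 4096 entries from
 * wait_table_size(), and wait_table_bits(4096) = ffz(~4096) = 12, so the
 * hashed page address is folded down to a table index with a 12-bit
 * shift and mask.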
1585 */ 1586static inline unsigned long wait_table_bits(unsigned long size) 1587{ 1588 return ffz(~size); 1589} 1590 1591#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) 1592 1593static void __init calculate_zone_totalpages(struct pglist_data *pgdat, 1594 unsigned long *zones_size, unsigned long *zholes_size) 1595{ 1596 unsigned long realtotalpages, totalpages = 0; 1597 int i; 1598 1599 for (i = 0; i < MAX_NR_ZONES; i++) 1600 totalpages += zones_size[i]; 1601 pgdat->node_spanned_pages = totalpages; 1602 1603 realtotalpages = totalpages; 1604 if (zholes_size) 1605 for (i = 0; i < MAX_NR_ZONES; i++) 1606 realtotalpages -= zholes_size[i]; 1607 pgdat->node_present_pages = realtotalpages; 1608 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); 1609} 1610 1611 1612/* 1613 * Initially all pages are reserved - free ones are freed 1614 * up by free_all_bootmem() once the early boot process is 1615 * done. Non-atomic initialization, single-pass. 1616 */ 1617void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone, 1618 unsigned long start_pfn) 1619{ 1620 struct page *start = pfn_to_page(start_pfn); 1621 struct page *page; 1622 1623 for (page = start; page < (start + size); page++) { 1624 set_page_zone(page, NODEZONE(nid, zone)); 1625 set_page_count(page, 0); 1626 reset_page_mapcount(page); 1627 SetPageReserved(page); 1628 INIT_LIST_HEAD(&page->lru); 1629#ifdef WANT_PAGE_VIRTUAL 1630 /* The shift won't overflow because ZONE_NORMAL is below 4G. */ 1631 if (!is_highmem_idx(zone)) 1632 set_page_address(page, __va(start_pfn << PAGE_SHIFT)); 1633#endif 1634 start_pfn++; 1635 } 1636} 1637 1638void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone, 1639 unsigned long size) 1640{ 1641 int order; 1642 for (order = 0; order < MAX_ORDER ; order++) { 1643 INIT_LIST_HEAD(&zone->free_area[order].free_list); 1644 zone->free_area[order].nr_free = 0; 1645 } 1646} 1647 1648#ifndef __HAVE_ARCH_MEMMAP_INIT 1649#define memmap_init(size, nid, zone, start_pfn) \ 1650 memmap_init_zone((size), (nid), (zone), (start_pfn)) 1651#endif 1652 1653static int __devinit zone_batchsize(struct zone *zone) 1654{ 1655 int batch; 1656 1657 /* 1658 * The per-cpu-pages pools are set to around 1000th of the 1659 * size of the zone. But no more than 1/4 of a meg - there's 1660 * no point in going beyond the size of L2 cache. 1661 * 1662 * OK, so we don't know how big the cache is. So guess. 1663 */ 1664 batch = zone->present_pages / 1024; 1665 if (batch * PAGE_SIZE > 256 * 1024) 1666 batch = (256 * 1024) / PAGE_SIZE; 1667 batch /= 4; /* We effectively *= 4 below */ 1668 if (batch < 1) 1669 batch = 1; 1670 1671 /* 1672 * Clamp the batch to a 2^n - 1 value. Having a power 1673 * of 2 value was found to be more likely to have 1674 * suboptimal cache aliasing properties in some cases. 1675 * 1676 * For example if 2 tasks are alternately allocating 1677 * batches of pages, one task can end up with a lot 1678 * of pages of one half of the possible page colors 1679 * and the other with pages of the other colors. 1680 */ 1681 batch = (1 << fls(batch + batch/2)) - 1; 1682 return batch; 1683} 1684 1685#ifdef CONFIG_NUMA 1686/* 1687 * Dynamicaly allocate memory for the 1688 * per cpu pageset array in struct zone. 
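 *
 * (Editor's note:) for each online zone this allocates the pageset on
 * the node that the cpu belongs to and seeds both lists from
 * zone_batchsize(): hot pages get low = 2*batch, high = 6*batch; cold
 * pages get low = 0, high = 2*batch; both refill and drain batch pages
 * at a time.  As an example, assuming 4KB pages, a 512MB zone has
 * 131072 pages, so batch = 131072/1024 = 128, clamped to 64 by the
 * 256KB cap, divided by 4 to 16, then rounded to (1 << fls(24)) - 1 = 31.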
1689 */ 1690static int __devinit process_zones(int cpu) 1691{ 1692 struct zone *zone, *dzone; 1693 int i; 1694 1695 for_each_zone(zone) { 1696 struct per_cpu_pageset *npageset = NULL; 1697 1698 npageset = kmalloc_node(sizeof(struct per_cpu_pageset), 1699 GFP_KERNEL, cpu_to_node(cpu)); 1700 if (!npageset) { 1701 zone->pageset[cpu] = NULL; 1702 goto bad; 1703 } 1704 1705 if (zone->pageset[cpu]) { 1706 memcpy(npageset, zone->pageset[cpu], 1707 sizeof(struct per_cpu_pageset)); 1708 1709 /* Relocate lists */ 1710 for (i = 0; i < 2; i++) { 1711 INIT_LIST_HEAD(&npageset->pcp[i].list); 1712 list_splice(&zone->pageset[cpu]->pcp[i].list, 1713 &npageset->pcp[i].list); 1714 } 1715 } else { 1716 struct per_cpu_pages *pcp; 1717 unsigned long batch; 1718 1719 batch = zone_batchsize(zone); 1720 1721 pcp = &npageset->pcp[0]; /* hot */ 1722 pcp->count = 0; 1723 pcp->low = 2 * batch; 1724 pcp->high = 6 * batch; 1725 pcp->batch = 1 * batch; 1726 INIT_LIST_HEAD(&pcp->list); 1727 1728 pcp = &npageset->pcp[1]; /* cold*/ 1729 pcp->count = 0; 1730 pcp->low = 0; 1731 pcp->high = 2 * batch; 1732 pcp->batch = 1 * batch; 1733 INIT_LIST_HEAD(&pcp->list); 1734 } 1735 zone->pageset[cpu] = npageset; 1736 } 1737 1738 return 0; 1739bad: 1740 for_each_zone(dzone) { 1741 if (dzone == zone) 1742 break; 1743 kfree(dzone->pageset[cpu]); 1744 dzone->pageset[cpu] = NULL; 1745 } 1746 return -ENOMEM; 1747} 1748 1749static inline void free_zone_pagesets(int cpu) 1750{ 1751#ifdef CONFIG_NUMA 1752 struct zone *zone; 1753 1754 for_each_zone(zone) { 1755 struct per_cpu_pageset *pset = zone_pcp(zone, cpu); 1756 1757 zone_pcp(zone, cpu) = NULL; 1758 kfree(pset); 1759 } 1760#endif 1761} 1762 1763static int __devinit pageset_cpuup_callback(struct notifier_block *nfb, 1764 unsigned long action, 1765 void *hcpu) 1766{ 1767 int cpu = (long)hcpu; 1768 int ret = NOTIFY_OK; 1769 1770 switch (action) { 1771 case CPU_UP_PREPARE: 1772 if (process_zones(cpu)) 1773 ret = NOTIFY_BAD; 1774 break; 1775#ifdef CONFIG_HOTPLUG_CPU 1776 case CPU_DEAD: 1777 free_zone_pagesets(cpu); 1778 break; 1779#endif 1780 default: 1781 break; 1782 } 1783 return ret; 1784} 1785 1786static struct notifier_block pageset_notifier = 1787 { &pageset_cpuup_callback, NULL, 0 }; 1788 1789void __init setup_per_cpu_pageset() 1790{ 1791 int err; 1792 1793 /* Initialize per_cpu_pageset for cpu 0. 
1794 * A cpuup callback will do this for every cpu 1795 * as it comes online 1796 */ 1797 err = process_zones(smp_processor_id()); 1798 BUG_ON(err); 1799 register_cpu_notifier(&pageset_notifier); 1800} 1801 1802#endif 1803 1804/* 1805 * Set up the zone data structures: 1806 * - mark all pages reserved 1807 * - mark all memory queues empty 1808 * - clear the memory bitmaps 1809 */ 1810static void __init free_area_init_core(struct pglist_data *pgdat, 1811 unsigned long *zones_size, unsigned long *zholes_size) 1812{ 1813 unsigned long i, j; 1814 const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1); 1815 int cpu, nid = pgdat->node_id; 1816 unsigned long zone_start_pfn = pgdat->node_start_pfn; 1817 1818 pgdat->nr_zones = 0; 1819 init_waitqueue_head(&pgdat->kswapd_wait); 1820 pgdat->kswapd_max_order = 0; 1821 1822 for (j = 0; j < MAX_NR_ZONES; j++) { 1823 struct zone *zone = pgdat->node_zones + j; 1824 unsigned long size, realsize; 1825 unsigned long batch; 1826 1827 zone_table[NODEZONE(nid, j)] = zone; 1828 realsize = size = zones_size[j]; 1829 if (zholes_size) 1830 realsize -= zholes_size[j]; 1831 1832 if (j == ZONE_DMA || j == ZONE_NORMAL) 1833 nr_kernel_pages += realsize; 1834 nr_all_pages += realsize; 1835 1836 zone->spanned_pages = size; 1837 zone->present_pages = realsize; 1838 zone->name = zone_names[j]; 1839 spin_lock_init(&zone->lock); 1840 spin_lock_init(&zone->lru_lock); 1841 zone->zone_pgdat = pgdat; 1842 zone->free_pages = 0; 1843 1844 zone->temp_priority = zone->prev_priority = DEF_PRIORITY; 1845 1846 batch = zone_batchsize(zone); 1847 1848 for (cpu = 0; cpu < NR_CPUS; cpu++) { 1849 struct per_cpu_pages *pcp; 1850#ifdef CONFIG_NUMA 1851 struct per_cpu_pageset *pgset; 1852 pgset = &pageset_table[nid*MAX_NR_ZONES*NR_CPUS + 1853 (j * NR_CPUS) + cpu]; 1854 1855 zone->pageset[cpu] = pgset; 1856#else 1857 struct per_cpu_pageset *pgset = zone_pcp(zone, cpu); 1858#endif 1859 1860 pcp = &pgset->pcp[0]; /* hot */ 1861 pcp->count = 0; 1862 pcp->low = 2 * batch; 1863 pcp->high = 6 * batch; 1864 pcp->batch = 1 * batch; 1865 INIT_LIST_HEAD(&pcp->list); 1866 1867 pcp = &pgset->pcp[1]; /* cold */ 1868 pcp->count = 0; 1869 pcp->low = 0; 1870 pcp->high = 2 * batch; 1871 pcp->batch = 1 * batch; 1872 INIT_LIST_HEAD(&pcp->list); 1873 } 1874 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", 1875 zone_names[j], realsize, batch); 1876 INIT_LIST_HEAD(&zone->active_list); 1877 INIT_LIST_HEAD(&zone->inactive_list); 1878 zone->nr_scan_active = 0; 1879 zone->nr_scan_inactive = 0; 1880 zone->nr_active = 0; 1881 zone->nr_inactive = 0; 1882 atomic_set(&zone->reclaim_in_progress, -1); 1883 if (!size) 1884 continue; 1885 1886 /* 1887 * The per-page waitqueue mechanism uses hashed waitqueues 1888 * per zone. 
1889 */ 1890 zone->wait_table_size = wait_table_size(size); 1891 zone->wait_table_bits = 1892 wait_table_bits(zone->wait_table_size); 1893 zone->wait_table = (wait_queue_head_t *) 1894 alloc_bootmem_node(pgdat, zone->wait_table_size 1895 * sizeof(wait_queue_head_t)); 1896 1897 for(i = 0; i < zone->wait_table_size; ++i) 1898 init_waitqueue_head(zone->wait_table + i); 1899 1900 pgdat->nr_zones = j+1; 1901 1902 zone->zone_mem_map = pfn_to_page(zone_start_pfn); 1903 zone->zone_start_pfn = zone_start_pfn; 1904 1905 if ((zone_start_pfn) & (zone_required_alignment-1)) 1906 printk(KERN_CRIT "BUG: wrong zone alignment, it will crash\n"); 1907 1908 memmap_init(size, nid, j, zone_start_pfn); 1909 1910 zone_start_pfn += size; 1911 1912 zone_init_free_lists(pgdat, zone, zone->spanned_pages); 1913 } 1914} 1915 1916static void __init alloc_node_mem_map(struct pglist_data *pgdat) 1917{ 1918 unsigned long size; 1919 1920 /* Skip empty nodes */ 1921 if (!pgdat->node_spanned_pages) 1922 return; 1923 1924 /* ia64 gets its own node_mem_map, before this, without bootmem */ 1925 if (!pgdat->node_mem_map) { 1926 size = (pgdat->node_spanned_pages + 1) * sizeof(struct page); 1927 pgdat->node_mem_map = alloc_bootmem_node(pgdat, size); 1928 } 1929#ifndef CONFIG_DISCONTIGMEM 1930 /* 1931 * With no DISCONTIG, the global mem_map is just set as node 0's 1932 */ 1933 if (pgdat == NODE_DATA(0)) 1934 mem_map = NODE_DATA(0)->node_mem_map; 1935#endif 1936} 1937 1938void __init free_area_init_node(int nid, struct pglist_data *pgdat, 1939 unsigned long *zones_size, unsigned long node_start_pfn, 1940 unsigned long *zholes_size) 1941{ 1942 pgdat->node_id = nid; 1943 pgdat->node_start_pfn = node_start_pfn; 1944 calculate_zone_totalpages(pgdat, zones_size, zholes_size); 1945 1946 alloc_node_mem_map(pgdat); 1947 1948 free_area_init_core(pgdat, zones_size, zholes_size); 1949} 1950 1951#ifndef CONFIG_DISCONTIGMEM 1952static bootmem_data_t contig_bootmem_data; 1953struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; 1954 1955EXPORT_SYMBOL(contig_page_data); 1956 1957void __init free_area_init(unsigned long *zones_size) 1958{ 1959 free_area_init_node(0, &contig_page_data, zones_size, 1960 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 1961} 1962#endif 1963 1964#ifdef CONFIG_PROC_FS 1965 1966#include <linux/seq_file.h> 1967 1968static void *frag_start(struct seq_file *m, loff_t *pos) 1969{ 1970 pg_data_t *pgdat; 1971 loff_t node = *pos; 1972 1973 for (pgdat = pgdat_list; pgdat && node; pgdat = pgdat->pgdat_next) 1974 --node; 1975 1976 return pgdat; 1977} 1978 1979static void *frag_next(struct seq_file *m, void *arg, loff_t *pos) 1980{ 1981 pg_data_t *pgdat = (pg_data_t *)arg; 1982 1983 (*pos)++; 1984 return pgdat->pgdat_next; 1985} 1986 1987static void frag_stop(struct seq_file *m, void *arg) 1988{ 1989} 1990 1991/* 1992 * This walks the free areas for each zone. 
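 *
 * (Editor's note:) this backs the buddyinfo proc file; each output line
 * prints nr_free for orders 0..MAX_ORDER-1, e.g. (made-up numbers):
 *
 *	Node 0, zone   Normal    210     65     28      7      3 ...
 *
 * so the fragmentation of a zone's free memory can be read at a glance.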
1993 */ 1994static int frag_show(struct seq_file *m, void *arg) 1995{ 1996 pg_data_t *pgdat = (pg_data_t *)arg; 1997 struct zone *zone; 1998 struct zone *node_zones = pgdat->node_zones; 1999 unsigned long flags; 2000 int order; 2001 2002 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { 2003 if (!zone->present_pages) 2004 continue; 2005 2006 spin_lock_irqsave(&zone->lock, flags); 2007 seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); 2008 for (order = 0; order < MAX_ORDER; ++order) 2009 seq_printf(m, "%6lu ", zone->free_area[order].nr_free); 2010 spin_unlock_irqrestore(&zone->lock, flags); 2011 seq_putc(m, '\n'); 2012 } 2013 return 0; 2014} 2015 2016struct seq_operations fragmentation_op = { 2017 .start = frag_start, 2018 .next = frag_next, 2019 .stop = frag_stop, 2020 .show = frag_show, 2021}; 2022 2023/* 2024 * Output information about zones in @pgdat. 2025 */ 2026static int zoneinfo_show(struct seq_file *m, void *arg) 2027{ 2028 pg_data_t *pgdat = arg; 2029 struct zone *zone; 2030 struct zone *node_zones = pgdat->node_zones; 2031 unsigned long flags; 2032 2033 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) { 2034 int i; 2035 2036 if (!zone->present_pages) 2037 continue; 2038 2039 spin_lock_irqsave(&zone->lock, flags); 2040 seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name); 2041 seq_printf(m, 2042 "\n pages free %lu" 2043 "\n min %lu" 2044 "\n low %lu" 2045 "\n high %lu" 2046 "\n active %lu" 2047 "\n inactive %lu" 2048 "\n scanned %lu (a: %lu i: %lu)" 2049 "\n spanned %lu" 2050 "\n present %lu", 2051 zone->free_pages, 2052 zone->pages_min, 2053 zone->pages_low, 2054 zone->pages_high, 2055 zone->nr_active, 2056 zone->nr_inactive, 2057 zone->pages_scanned, 2058 zone->nr_scan_active, zone->nr_scan_inactive, 2059 zone->spanned_pages, 2060 zone->present_pages); 2061 seq_printf(m, 2062 "\n protection: (%lu", 2063 zone->lowmem_reserve[0]); 2064 for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) 2065 seq_printf(m, ", %lu", zone->lowmem_reserve[i]); 2066 seq_printf(m, 2067 ")" 2068 "\n pagesets"); 2069 for (i = 0; i < ARRAY_SIZE(zone->pageset); i++) { 2070 struct per_cpu_pageset *pageset; 2071 int j; 2072 2073 pageset = zone_pcp(zone, i); 2074 for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { 2075 if (pageset->pcp[j].count) 2076 break; 2077 } 2078 if (j == ARRAY_SIZE(pageset->pcp)) 2079 continue; 2080 for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { 2081 seq_printf(m, 2082 "\n cpu: %i pcp: %i" 2083 "\n count: %i" 2084 "\n low: %i" 2085 "\n high: %i" 2086 "\n batch: %i", 2087 i, j, 2088 pageset->pcp[j].count, 2089 pageset->pcp[j].low, 2090 pageset->pcp[j].high, 2091 pageset->pcp[j].batch); 2092 } 2093#ifdef CONFIG_NUMA 2094 seq_printf(m, 2095 "\n numa_hit: %lu" 2096 "\n numa_miss: %lu" 2097 "\n numa_foreign: %lu" 2098 "\n interleave_hit: %lu" 2099 "\n local_node: %lu" 2100 "\n other_node: %lu", 2101 pageset->numa_hit, 2102 pageset->numa_miss, 2103 pageset->numa_foreign, 2104 pageset->interleave_hit, 2105 pageset->local_node, 2106 pageset->other_node); 2107#endif 2108 } 2109 seq_printf(m, 2110 "\n all_unreclaimable: %u" 2111 "\n prev_priority: %i" 2112 "\n temp_priority: %i" 2113 "\n start_pfn: %lu", 2114 zone->all_unreclaimable, 2115 zone->prev_priority, 2116 zone->temp_priority, 2117 zone->zone_start_pfn); 2118 spin_unlock_irqrestore(&zone->lock, flags); 2119 seq_putc(m, '\n'); 2120 } 2121 return 0; 2122} 2123 2124struct seq_operations zoneinfo_op = { 2125 .start = frag_start, /* iterate over all zones. 

static char *vmstat_text[] = {
	"nr_dirty",
	"nr_writeback",
	"nr_unstable",
	"nr_page_table_pages",
	"nr_mapped",
	"nr_slab",

	"pgpgin",
	"pgpgout",
	"pswpin",
	"pswpout",
	"pgalloc_high",

	"pgalloc_normal",
	"pgalloc_dma",
	"pgfree",
	"pgactivate",
	"pgdeactivate",

	"pgfault",
	"pgmajfault",
	"pgrefill_high",
	"pgrefill_normal",
	"pgrefill_dma",

	"pgsteal_high",
	"pgsteal_normal",
	"pgsteal_dma",
	"pgscan_kswapd_high",
	"pgscan_kswapd_normal",

	"pgscan_kswapd_dma",
	"pgscan_direct_high",
	"pgscan_direct_normal",
	"pgscan_direct_dma",
	"pginodesteal",

	"slabs_scanned",
	"kswapd_steal",
	"kswapd_inodesteal",
	"pageoutrun",
	"allocstall",

	"pgrotated",
	"nr_bounce",
};

static void *vmstat_start(struct seq_file *m, loff_t *pos)
{
	struct page_state *ps;

	if (*pos >= ARRAY_SIZE(vmstat_text))
		return NULL;

	ps = kmalloc(sizeof(*ps), GFP_KERNEL);
	m->private = ps;
	if (!ps)
		return ERR_PTR(-ENOMEM);
	get_full_page_state(ps);
	ps->pgpgin /= 2;		/* sectors -> kbytes */
	ps->pgpgout /= 2;
	return (unsigned long *)ps + *pos;
}

static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
{
	(*pos)++;
	if (*pos >= ARRAY_SIZE(vmstat_text))
		return NULL;
	return (unsigned long *)m->private + *pos;
}

static int vmstat_show(struct seq_file *m, void *arg)
{
	unsigned long *l = arg;
	unsigned long off = l - (unsigned long *)m->private;

	seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
	return 0;
}

static void vmstat_stop(struct seq_file *m, void *arg)
{
	kfree(m->private);
	m->private = NULL;
}

struct seq_operations vmstat_op = {
	.start	= vmstat_start,
	.next	= vmstat_next,
	.stop	= vmstat_stop,
	.show	= vmstat_show,
};

#endif /* CONFIG_PROC_FS */
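
/*
 * Editor's sketch (not part of this file): the three seq_operations above
 * (fragmentation_op, zoneinfo_op, vmstat_op) are consumed elsewhere; the
 * /proc entries are created in fs/proc/proc_misc.c, which opens them via
 * seq_open().  A rough illustration of that pattern, not the verbatim
 * kernel code:
 */
#if 0	/* example only */
extern struct seq_operations fragmentation_op;

static int fragmentation_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &fragmentation_op);
}

static struct file_operations fragmentation_file_operations = {
	.open		= fragmentation_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};
/* create_proc_entry("buddyinfo", S_IRUGO, NULL), then install the fops. */
#endif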

#ifdef CONFIG_HOTPLUG_CPU
static int page_alloc_cpu_notify(struct notifier_block *self,
				 unsigned long action, void *hcpu)
{
	int cpu = (unsigned long)hcpu;
	long *count;
	unsigned long *src, *dest;

	if (action == CPU_DEAD) {
		int i;

		/* Drain local pagecache count. */
		count = &per_cpu(nr_pagecache_local, cpu);
		atomic_add(*count, &nr_pagecache);
		*count = 0;
		local_irq_disable();
		__drain_pages(cpu);

		/* Add dead cpu's page_states to our own. */
		dest = (unsigned long *)&__get_cpu_var(page_states);
		src = (unsigned long *)&per_cpu(page_states, cpu);

		for (i = 0; i < sizeof(struct page_state)/sizeof(unsigned long);
				i++) {
			dest[i] += src[i];
			src[i] = 0;
		}

		local_irq_enable();
	}
	return NOTIFY_OK;
}
#endif /* CONFIG_HOTPLUG_CPU */

void __init page_alloc_init(void)
{
	hotcpu_notifier(page_alloc_cpu_notify, 0);
}

/*
 * setup_per_zone_lowmem_reserve - called whenever
 *	sysctl_lowmem_reserve_ratio changes.  Ensures that each zone
 *	has a correct pages reserved value, so an adequate number of
 *	pages are left in the zone after a successful __alloc_pages().
 */
static void setup_per_zone_lowmem_reserve(void)
{
	struct pglist_data *pgdat;
	int j, idx;

	for_each_pgdat(pgdat) {
		for (j = 0; j < MAX_NR_ZONES; j++) {
			struct zone *zone = pgdat->node_zones + j;
			unsigned long present_pages = zone->present_pages;

			zone->lowmem_reserve[j] = 0;

			for (idx = j-1; idx >= 0; idx--) {
				struct zone *lower_zone;

				if (sysctl_lowmem_reserve_ratio[idx] < 1)
					sysctl_lowmem_reserve_ratio[idx] = 1;

				lower_zone = pgdat->node_zones + idx;
				lower_zone->lowmem_reserve[j] = present_pages /
					sysctl_lowmem_reserve_ratio[idx];
				present_pages += lower_zone->present_pages;
			}
		}
	}
}

/*
 * setup_per_zone_pages_min - called when min_free_kbytes changes.  Ensures
 *	that the pages_{min,low,high} values for each zone are set correctly
 *	with respect to min_free_kbytes.
 */
static void setup_per_zone_pages_min(void)
{
	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
	unsigned long lowmem_pages = 0;
	struct zone *zone;
	unsigned long flags;

	/* Calculate total number of !ZONE_HIGHMEM pages */
	for_each_zone(zone) {
		if (!is_highmem(zone))
			lowmem_pages += zone->present_pages;
	}

	for_each_zone(zone) {
		spin_lock_irqsave(&zone->lru_lock, flags);
		if (is_highmem(zone)) {
			/*
			 * Often, highmem doesn't need to reserve any pages.
			 * But the pages_min/low/high values are also used for
			 * batching up page reclaim activity so we need a
			 * decent value here.
			 */
			int min_pages;

			min_pages = zone->present_pages / 1024;
			if (min_pages < SWAP_CLUSTER_MAX)
				min_pages = SWAP_CLUSTER_MAX;
			if (min_pages > 128)
				min_pages = 128;
			zone->pages_min = min_pages;
		} else {
			/*
			 * If it's a lowmem zone, reserve a number of pages
			 * proportional to the zone's size.
			 */
			zone->pages_min = (pages_min * zone->present_pages) /
					  lowmem_pages;
		}

		/*
		 * When interpreting these watermarks, just keep in mind that:
		 * zone->pages_min == (zone->pages_min * 4) / 4;
		 */
		zone->pages_low = (zone->pages_min * 5) / 4;
		zone->pages_high = (zone->pages_min * 6) / 4;
		spin_unlock_irqrestore(&zone->lru_lock, flags);
	}
}
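
/*
 * Worked example (editor's illustration, hypothetical sizes): with
 * min_free_kbytes = 1024 and 4K pages, pages_min above is 1024 >> 2 = 256
 * pages in total.  On a machine whose only lowmem zones are a 4096-page
 * ZONE_DMA and a 126976-page ZONE_NORMAL (131072 lowmem pages in all):
 *
 *	DMA:    pages_min = 256 *   4096 / 131072 =   8
 *	Normal: pages_min = 256 * 126976 / 131072 = 248
 *
 * and for each zone pages_low = pages_min * 5/4 and
 * pages_high = pages_min * 6/4, so the Normal zone ends up with
 * low = 310 and high = 372 pages.
 */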

/*
 * Initialise min_free_kbytes.
 *
 * For small machines we want it small (128k min).  For large machines
 * we want it large (64MB max).  But it is not linear, because network
 * bandwidth does not increase linearly with machine size.  We use
 *
 *	min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
 *	min_free_kbytes = sqrt(lowmem_kbytes * 16)
 *
 * which yields
 *
 * 16MB:	512k
 * 32MB:	724k
 * 64MB:	1024k
 * 128MB:	1448k
 * 256MB:	2048k
 * 512MB:	2896k
 * 1024MB:	4096k
 * 2048MB:	5792k
 * 4096MB:	8192k
 * 8192MB:	11584k
 * 16384MB:	16384k
 */
static int __init init_per_zone_pages_min(void)
{
	unsigned long lowmem_kbytes;

	lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);

	min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
	if (min_free_kbytes < 128)
		min_free_kbytes = 128;
	if (min_free_kbytes > 65536)
		min_free_kbytes = 65536;
	setup_per_zone_pages_min();
	setup_per_zone_lowmem_reserve();
	return 0;
}
module_init(init_per_zone_pages_min)

/*
 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
 *	that we can call two helper functions whenever min_free_kbytes
 *	changes.
 */
int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
{
	proc_dointvec(table, write, file, buffer, length, ppos);
	setup_per_zone_pages_min();
	return 0;
}

/*
 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
 *	proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
 *	whenever sysctl_lowmem_reserve_ratio changes.
 *
 * The reserve ratio has no relation to the pages_min watermarks; it is
 * only meaningful in relation to the boot-time zone sizes.
 */
int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
{
	proc_dointvec_minmax(table, write, file, buffer, length, ppos);
	setup_per_zone_lowmem_reserve();
	return 0;
}

__initdata int hashdist = HASHDIST_DEFAULT;

#ifdef CONFIG_NUMA
static int __init set_hashdist(char *str)
{
	if (!str)
		return 0;
	hashdist = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("hashdist=", set_hashdist);
#endif
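
/*
 * Editor's sketch (not part of this file): the two sysctl handlers above
 * are reached through /proc/sys/vm/min_free_kbytes and
 * /proc/sys/vm/lowmem_reserve_ratio, e.g. "echo 2048 >
 * /proc/sys/vm/min_free_kbytes" re-runs setup_per_zone_pages_min().  The
 * hookup lives in kernel/sysctl.c; roughly, with the field values here
 * being illustrative rather than verbatim:
 */
#if 0	/* example only */
	{
		.procname	= "min_free_kbytes",
		.data		= &min_free_kbytes,
		.maxlen		= sizeof(min_free_kbytes),
		.mode		= 0644,
		.proc_handler	= &min_free_kbytes_sysctl_handler,
	},
	{
		.procname	= "lowmem_reserve_ratio",
		.data		= &sysctl_lowmem_reserve_ratio,
		.maxlen		= sizeof(sysctl_lowmem_reserve_ratio),
		.mode		= 0644,
		.proc_handler	= &lowmem_reserve_ratio_sysctl_handler,
	},
#endif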

/*
 * allocate a large system hash table from bootmem
 * - it is assumed that the hash table must contain an exact power-of-2
 *   quantity of entries
 * - limit is the number of hash buckets, not the total allocation size
 */
void *__init alloc_large_system_hash(const char *tablename,
				     unsigned long bucketsize,
				     unsigned long numentries,
				     int scale,
				     int flags,
				     unsigned int *_hash_shift,
				     unsigned int *_hash_mask,
				     unsigned long limit)
{
	unsigned long long max = limit;
	unsigned long log2qty, size;
	void *table = NULL;

	/* allow the kernel cmdline to have a say */
	if (!numentries) {
		/* round applicable memory size up to nearest megabyte */
		numentries = (flags & HASH_HIGHMEM) ? nr_all_pages : nr_kernel_pages;
		numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
		numentries >>= 20 - PAGE_SHIFT;
		numentries <<= 20 - PAGE_SHIFT;

		/* limit to 1 bucket per 2^scale bytes of low memory */
		if (scale > PAGE_SHIFT)
			numentries >>= (scale - PAGE_SHIFT);
		else
			numentries <<= (PAGE_SHIFT - scale);
	}
	/* rounded up to nearest power of 2 in size */
	numentries = 1UL << (long_log2(numentries) + 1);

	/* limit allocation size to 1/16 total memory by default */
	if (max == 0) {
		max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
		do_div(max, bucketsize);
	}

	if (numentries > max)
		numentries = max;

	log2qty = long_log2(numentries);

	do {
		size = bucketsize << log2qty;
		if (flags & HASH_EARLY)
			table = alloc_bootmem(size);
		else if (hashdist)
			table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
		else {
			unsigned long order;
			for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++)
				;
			table = (void *) __get_free_pages(GFP_ATOMIC, order);
		}
	} while (!table && size > PAGE_SIZE && --log2qty);

	if (!table)
		panic("Failed to allocate %s hash table\n", tablename);

	printk("%s hash table entries: %d (order: %d, %lu bytes)\n",
	       tablename,
	       (1U << log2qty),
	       long_log2(size) - PAGE_SHIFT,
	       size);

	if (_hash_shift)
		*_hash_shift = log2qty;
	if (_hash_mask)
		*_hash_mask = (1 << log2qty) - 1;

	return table;
}
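
/*
 * Editor's sketch (not part of this file): how a typical early caller sizes
 * its table with alloc_large_system_hash().  Modelled on the dentry hash
 * setup in fs/dcache.c; the exact parameter values used there may differ,
 * so treat the numbers below as illustrative only.
 */
#if 0	/* example only */
	dentry_hashtable =
		alloc_large_system_hash("Dentry cache",
					sizeof(struct hlist_head),
					dhash_entries,	/* from "dhash_entries=", else 0 */
					13,		/* about one bucket per 8K of lowmem */
					HASH_EARLY,	/* allocate from bootmem */
					&d_hash_shift,
					&d_hash_mask,
					0);		/* no explicit bucket limit */
#endif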