page_alloc.c revision c6a57e19e464db118dc4ab9cfe9e9748c6d630a0
1/* 2 * linux/mm/page_alloc.c 3 * 4 * Manages the free list, the system allocates free pages here. 5 * Note that kmalloc() lives in slab.c 6 * 7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 8 * Swap reorganised 29.12.95, Stephen Tweedie 9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton) 15 */ 16 17#include <linux/config.h> 18#include <linux/stddef.h> 19#include <linux/mm.h> 20#include <linux/swap.h> 21#include <linux/interrupt.h> 22#include <linux/pagemap.h> 23#include <linux/bootmem.h> 24#include <linux/compiler.h> 25#include <linux/kernel.h> 26#include <linux/module.h> 27#include <linux/suspend.h> 28#include <linux/pagevec.h> 29#include <linux/blkdev.h> 30#include <linux/slab.h> 31#include <linux/notifier.h> 32#include <linux/topology.h> 33#include <linux/sysctl.h> 34#include <linux/cpu.h> 35#include <linux/cpuset.h> 36#include <linux/nodemask.h> 37#include <linux/vmalloc.h> 38 39#include <asm/tlbflush.h> 40#include "internal.h" 41 42/* 43 * MCD - HACK: Find somewhere to initialize this EARLY, or make this 44 * initializer cleaner 45 */ 46nodemask_t node_online_map __read_mostly = { { [0] = 1UL } }; 47EXPORT_SYMBOL(node_online_map); 48nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; 49EXPORT_SYMBOL(node_possible_map); 50struct pglist_data *pgdat_list __read_mostly; 51unsigned long totalram_pages __read_mostly; 52unsigned long totalhigh_pages __read_mostly; 53long nr_swap_pages; 54 55/* 56 * results with 256, 32 in the lowmem_reserve sysctl: 57 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) 58 * 1G machine -> (16M dma, 784M normal, 224M high) 59 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA 60 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL 61 * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA 62 */ 63int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 }; 64 65EXPORT_SYMBOL(totalram_pages); 66EXPORT_SYMBOL(nr_swap_pages); 67 68/* 69 * Used by page_zone() to look up the address of the struct zone whose 70 * id is encoded in the upper bits of page->flags 71 */ 72struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly; 73EXPORT_SYMBOL(zone_table); 74 75static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; 76int min_free_kbytes = 1024; 77 78unsigned long __initdata nr_kernel_pages; 79unsigned long __initdata nr_all_pages; 80 81static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 82{ 83 if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages) 84 return 1; 85 if (page_to_pfn(page) < zone->zone_start_pfn) 86 return 1; 87 88 return 0; 89} 90 91static int page_is_consistent(struct zone *zone, struct page *page) 92{ 93#ifdef CONFIG_HOLES_IN_ZONE 94 if (!pfn_valid(page_to_pfn(page))) 95 return 0; 96#endif 97 if (zone != page_zone(page)) 98 return 0; 99 100 return 1; 101} 102/* 103 * Temporary debugging check for pages not lying within a given zone. 104 */ 105static int bad_range(struct zone *zone, struct page *page) 106{ 107 if (page_outside_zone_boundaries(zone, page)) 108 return 1; 109 if (!page_is_consistent(zone, page)) 110 return 1; 111 112 return 0; 113} 114 115static void bad_page(const char *function, struct page *page) 116{ 117 printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n", 118 function, current->comm, page); 119 printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", 120 (int)(2*sizeof(page_flags_t)), (unsigned long)page->flags, 121 page->mapping, page_mapcount(page), page_count(page)); 122 printk(KERN_EMERG "Backtrace:\n"); 123 dump_stack(); 124 printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"); 125 page->flags &= ~(1 << PG_lru | 126 1 << PG_private | 127 1 << PG_locked | 128 1 << PG_active | 129 1 << PG_dirty | 130 1 << PG_reclaim | 131 1 << PG_slab | 132 1 << PG_swapcache | 133 1 << PG_writeback | 134 1 << PG_reserved ); 135 set_page_count(page, 0); 136 reset_page_mapcount(page); 137 page->mapping = NULL; 138 add_taint(TAINT_BAD_PAGE); 139} 140 141#ifndef CONFIG_HUGETLB_PAGE 142#define prep_compound_page(page, order) do { } while (0) 143#define destroy_compound_page(page, order) do { } while (0) 144#else 145/* 146 * Higher-order pages are called "compound pages". They are structured thusly: 147 * 148 * The first PAGE_SIZE page is called the "head page". 149 * 150 * The remaining PAGE_SIZE pages are called "tail pages". 151 * 152 * All pages have PG_compound set. All pages have their ->private pointing at 153 * the head page (even the head page has this). 154 * 155 * The first tail page's ->mapping, if non-zero, holds the address of the 156 * compound page's put_page() function. 157 * 158 * The order of the allocation is stored in the first tail page's ->index 159 * This is only for debug at present. This usage means that zero-order pages 160 * may not be compound. 161 */ 162static void prep_compound_page(struct page *page, unsigned long order) 163{ 164 int i; 165 int nr_pages = 1 << order; 166 167 page[1].mapping = NULL; 168 page[1].index = order; 169 for (i = 0; i < nr_pages; i++) { 170 struct page *p = page + i; 171 172 SetPageCompound(p); 173 set_page_private(p, (unsigned long)page); 174 } 175} 176 177static void destroy_compound_page(struct page *page, unsigned long order) 178{ 179 int i; 180 int nr_pages = 1 << order; 181 182 if (!PageCompound(page)) 183 return; 184 185 if (page[1].index != order) 186 bad_page(__FUNCTION__, page); 187 188 for (i = 0; i < nr_pages; i++) { 189 struct page *p = page + i; 190 191 if (!PageCompound(p)) 192 bad_page(__FUNCTION__, page); 193 if (page_private(p) != (unsigned long)page) 194 bad_page(__FUNCTION__, page); 195 ClearPageCompound(p); 196 } 197} 198#endif /* CONFIG_HUGETLB_PAGE */ 199 200/* 201 * function for dealing with page's order in buddy system. 202 * zone->lock is already acquired when we use these. 203 * So, we don't need atomic page->flags operations here. 204 */ 205static inline unsigned long page_order(struct page *page) { 206 return page_private(page); 207} 208 209static inline void set_page_order(struct page *page, int order) { 210 set_page_private(page, order); 211 __SetPagePrivate(page); 212} 213 214static inline void rmv_page_order(struct page *page) 215{ 216 __ClearPagePrivate(page); 217 set_page_private(page, 0); 218} 219 220/* 221 * Locate the struct page for both the matching buddy in our 222 * pair (buddy1) and the combined O(n+1) page they form (page). 223 * 224 * 1) Any buddy B1 will have an order O twin B2 which satisfies 225 * the following equation: 226 * B2 = B1 ^ (1 << O) 227 * For example, if the starting buddy (buddy2) is #8 its order 228 * 1 buddy is #10: 229 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 230 * 231 * 2) Any buddy B will have an order O+1 parent P which 232 * satisfies the following equation: 233 * P = B & ~(1 << O) 234 * 235 * Assumption: *_mem_map is contigious at least up to MAX_ORDER 236 */ 237static inline struct page * 238__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order) 239{ 240 unsigned long buddy_idx = page_idx ^ (1 << order); 241 242 return page + (buddy_idx - page_idx); 243} 244 245static inline unsigned long 246__find_combined_index(unsigned long page_idx, unsigned int order) 247{ 248 return (page_idx & ~(1 << order)); 249} 250 251/* 252 * This function checks whether a page is free && is the buddy 253 * we can do coalesce a page and its buddy if 254 * (a) the buddy is free && 255 * (b) the buddy is on the buddy system && 256 * (c) a page and its buddy have the same order. 257 * for recording page's order, we use page_private(page) and PG_private. 258 * 259 */ 260static inline int page_is_buddy(struct page *page, int order) 261{ 262 if (PagePrivate(page) && 263 (page_order(page) == order) && 264 page_count(page) == 0) 265 return 1; 266 return 0; 267} 268 269/* 270 * Freeing function for a buddy system allocator. 271 * 272 * The concept of a buddy system is to maintain direct-mapped table 273 * (containing bit values) for memory blocks of various "orders". 274 * The bottom level table contains the map for the smallest allocatable 275 * units of memory (here, pages), and each level above it describes 276 * pairs of units from the levels below, hence, "buddies". 277 * At a high level, all that happens here is marking the table entry 278 * at the bottom level available, and propagating the changes upward 279 * as necessary, plus some accounting needed to play nicely with other 280 * parts of the VM system. 281 * At each level, we keep a list of pages, which are heads of continuous 282 * free pages of length of (1 << order) and marked with PG_Private.Page's 283 * order is recorded in page_private(page) field. 284 * So when we are allocating or freeing one, we can derive the state of the 285 * other. That is, if we allocate a small block, and both were 286 * free, the remainder of the region must be split into blocks. 287 * If a block is freed, and its buddy is also free, then this 288 * triggers coalescing into a block of larger size. 289 * 290 * -- wli 291 */ 292 293static inline void __free_pages_bulk (struct page *page, 294 struct zone *zone, unsigned int order) 295{ 296 unsigned long page_idx; 297 int order_size = 1 << order; 298 299 if (unlikely(order)) 300 destroy_compound_page(page, order); 301 302 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 303 304 BUG_ON(page_idx & (order_size - 1)); 305 BUG_ON(bad_range(zone, page)); 306 307 zone->free_pages += order_size; 308 while (order < MAX_ORDER-1) { 309 unsigned long combined_idx; 310 struct free_area *area; 311 struct page *buddy; 312 313 combined_idx = __find_combined_index(page_idx, order); 314 buddy = __page_find_buddy(page, page_idx, order); 315 316 if (bad_range(zone, buddy)) 317 break; 318 if (!page_is_buddy(buddy, order)) 319 break; /* Move the buddy up one level. */ 320 list_del(&buddy->lru); 321 area = zone->free_area + order; 322 area->nr_free--; 323 rmv_page_order(buddy); 324 page = page + (combined_idx - page_idx); 325 page_idx = combined_idx; 326 order++; 327 } 328 set_page_order(page, order); 329 list_add(&page->lru, &zone->free_area[order].free_list); 330 zone->free_area[order].nr_free++; 331} 332 333static inline void free_pages_check(const char *function, struct page *page) 334{ 335 if ( page_mapcount(page) || 336 page->mapping != NULL || 337 page_count(page) != 0 || 338 (page->flags & ( 339 1 << PG_lru | 340 1 << PG_private | 341 1 << PG_locked | 342 1 << PG_active | 343 1 << PG_reclaim | 344 1 << PG_slab | 345 1 << PG_swapcache | 346 1 << PG_writeback | 347 1 << PG_reserved ))) 348 bad_page(function, page); 349 if (PageDirty(page)) 350 __ClearPageDirty(page); 351} 352 353/* 354 * Frees a list of pages. 355 * Assumes all pages on list are in same zone, and of same order. 356 * count is the number of pages to free. 357 * 358 * If the zone was previously in an "all pages pinned" state then look to 359 * see if this freeing clears that state. 360 * 361 * And clear the zone's pages_scanned counter, to hold off the "all pages are 362 * pinned" detection logic. 363 */ 364static int 365free_pages_bulk(struct zone *zone, int count, 366 struct list_head *list, unsigned int order) 367{ 368 unsigned long flags; 369 struct page *page = NULL; 370 int ret = 0; 371 372 spin_lock_irqsave(&zone->lock, flags); 373 zone->all_unreclaimable = 0; 374 zone->pages_scanned = 0; 375 while (!list_empty(list) && count--) { 376 page = list_entry(list->prev, struct page, lru); 377 /* have to delete it as __free_pages_bulk list manipulates */ 378 list_del(&page->lru); 379 __free_pages_bulk(page, zone, order); 380 ret++; 381 } 382 spin_unlock_irqrestore(&zone->lock, flags); 383 return ret; 384} 385 386void __free_pages_ok(struct page *page, unsigned int order) 387{ 388 LIST_HEAD(list); 389 int i; 390 391 arch_free_page(page, order); 392 393 mod_page_state(pgfree, 1 << order); 394 395#ifndef CONFIG_MMU 396 if (order > 0) 397 for (i = 1 ; i < (1 << order) ; ++i) 398 __put_page(page + i); 399#endif 400 401 for (i = 0 ; i < (1 << order) ; ++i) 402 free_pages_check(__FUNCTION__, page + i); 403 list_add(&page->lru, &list); 404 kernel_map_pages(page, 1<<order, 0); 405 free_pages_bulk(page_zone(page), 1, &list, order); 406} 407 408 409/* 410 * The order of subdivision here is critical for the IO subsystem. 411 * Please do not alter this order without good reasons and regression 412 * testing. Specifically, as large blocks of memory are subdivided, 413 * the order in which smaller blocks are delivered depends on the order 414 * they're subdivided in this function. This is the primary factor 415 * influencing the order in which pages are delivered to the IO 416 * subsystem according to empirical testing, and this is also justified 417 * by considering the behavior of a buddy system containing a single 418 * large block of memory acted on by a series of small allocations. 419 * This behavior is a critical factor in sglist merging's success. 420 * 421 * -- wli 422 */ 423static inline struct page * 424expand(struct zone *zone, struct page *page, 425 int low, int high, struct free_area *area) 426{ 427 unsigned long size = 1 << high; 428 429 while (high > low) { 430 area--; 431 high--; 432 size >>= 1; 433 BUG_ON(bad_range(zone, &page[size])); 434 list_add(&page[size].lru, &area->free_list); 435 area->nr_free++; 436 set_page_order(&page[size], high); 437 } 438 return page; 439} 440 441void set_page_refs(struct page *page, int order) 442{ 443#ifdef CONFIG_MMU 444 set_page_count(page, 1); 445#else 446 int i; 447 448 /* 449 * We need to reference all the pages for this order, otherwise if 450 * anyone accesses one of the pages with (get/put) it will be freed. 451 * - eg: access_process_vm() 452 */ 453 for (i = 0; i < (1 << order); i++) 454 set_page_count(page + i, 1); 455#endif /* CONFIG_MMU */ 456} 457 458/* 459 * This page is about to be returned from the page allocator 460 */ 461static void prep_new_page(struct page *page, int order) 462{ 463 if ( page_mapcount(page) || 464 page->mapping != NULL || 465 page_count(page) != 0 || 466 (page->flags & ( 467 1 << PG_lru | 468 1 << PG_private | 469 1 << PG_locked | 470 1 << PG_active | 471 1 << PG_dirty | 472 1 << PG_reclaim | 473 1 << PG_slab | 474 1 << PG_swapcache | 475 1 << PG_writeback | 476 1 << PG_reserved ))) 477 bad_page(__FUNCTION__, page); 478 479 page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 480 1 << PG_referenced | 1 << PG_arch_1 | 481 1 << PG_checked | 1 << PG_mappedtodisk); 482 set_page_private(page, 0); 483 set_page_refs(page, order); 484 kernel_map_pages(page, 1 << order, 1); 485} 486 487/* 488 * Do the hard work of removing an element from the buddy allocator. 489 * Call me with the zone->lock already held. 490 */ 491static struct page *__rmqueue(struct zone *zone, unsigned int order) 492{ 493 struct free_area * area; 494 unsigned int current_order; 495 struct page *page; 496 497 for (current_order = order; current_order < MAX_ORDER; ++current_order) { 498 area = zone->free_area + current_order; 499 if (list_empty(&area->free_list)) 500 continue; 501 502 page = list_entry(area->free_list.next, struct page, lru); 503 list_del(&page->lru); 504 rmv_page_order(page); 505 area->nr_free--; 506 zone->free_pages -= 1UL << order; 507 return expand(zone, page, order, current_order, area); 508 } 509 510 return NULL; 511} 512 513/* 514 * Obtain a specified number of elements from the buddy allocator, all under 515 * a single hold of the lock, for efficiency. Add them to the supplied list. 516 * Returns the number of new pages which were placed at *list. 517 */ 518static int rmqueue_bulk(struct zone *zone, unsigned int order, 519 unsigned long count, struct list_head *list) 520{ 521 unsigned long flags; 522 int i; 523 int allocated = 0; 524 struct page *page; 525 526 spin_lock_irqsave(&zone->lock, flags); 527 for (i = 0; i < count; ++i) { 528 page = __rmqueue(zone, order); 529 if (page == NULL) 530 break; 531 allocated++; 532 list_add_tail(&page->lru, list); 533 } 534 spin_unlock_irqrestore(&zone->lock, flags); 535 return allocated; 536} 537 538#ifdef CONFIG_NUMA 539/* Called from the slab reaper to drain remote pagesets */ 540void drain_remote_pages(void) 541{ 542 struct zone *zone; 543 int i; 544 unsigned long flags; 545 546 local_irq_save(flags); 547 for_each_zone(zone) { 548 struct per_cpu_pageset *pset; 549 550 /* Do not drain local pagesets */ 551 if (zone->zone_pgdat->node_id == numa_node_id()) 552 continue; 553 554 pset = zone->pageset[smp_processor_id()]; 555 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { 556 struct per_cpu_pages *pcp; 557 558 pcp = &pset->pcp[i]; 559 if (pcp->count) 560 pcp->count -= free_pages_bulk(zone, pcp->count, 561 &pcp->list, 0); 562 } 563 } 564 local_irq_restore(flags); 565} 566#endif 567 568#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) 569static void __drain_pages(unsigned int cpu) 570{ 571 struct zone *zone; 572 int i; 573 574 for_each_zone(zone) { 575 struct per_cpu_pageset *pset; 576 577 pset = zone_pcp(zone, cpu); 578 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { 579 struct per_cpu_pages *pcp; 580 581 pcp = &pset->pcp[i]; 582 pcp->count -= free_pages_bulk(zone, pcp->count, 583 &pcp->list, 0); 584 } 585 } 586} 587#endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */ 588 589#ifdef CONFIG_PM 590 591void mark_free_pages(struct zone *zone) 592{ 593 unsigned long zone_pfn, flags; 594 int order; 595 struct list_head *curr; 596 597 if (!zone->spanned_pages) 598 return; 599 600 spin_lock_irqsave(&zone->lock, flags); 601 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) 602 ClearPageNosaveFree(pfn_to_page(zone_pfn + zone->zone_start_pfn)); 603 604 for (order = MAX_ORDER - 1; order >= 0; --order) 605 list_for_each(curr, &zone->free_area[order].free_list) { 606 unsigned long start_pfn, i; 607 608 start_pfn = page_to_pfn(list_entry(curr, struct page, lru)); 609 610 for (i=0; i < (1<<order); i++) 611 SetPageNosaveFree(pfn_to_page(start_pfn+i)); 612 } 613 spin_unlock_irqrestore(&zone->lock, flags); 614} 615 616/* 617 * Spill all of this CPU's per-cpu pages back into the buddy allocator. 618 */ 619void drain_local_pages(void) 620{ 621 unsigned long flags; 622 623 local_irq_save(flags); 624 __drain_pages(smp_processor_id()); 625 local_irq_restore(flags); 626} 627#endif /* CONFIG_PM */ 628 629static void zone_statistics(struct zonelist *zonelist, struct zone *z) 630{ 631#ifdef CONFIG_NUMA 632 unsigned long flags; 633 int cpu; 634 pg_data_t *pg = z->zone_pgdat; 635 pg_data_t *orig = zonelist->zones[0]->zone_pgdat; 636 struct per_cpu_pageset *p; 637 638 local_irq_save(flags); 639 cpu = smp_processor_id(); 640 p = zone_pcp(z,cpu); 641 if (pg == orig) { 642 p->numa_hit++; 643 } else { 644 p->numa_miss++; 645 zone_pcp(zonelist->zones[0], cpu)->numa_foreign++; 646 } 647 if (pg == NODE_DATA(numa_node_id())) 648 p->local_node++; 649 else 650 p->other_node++; 651 local_irq_restore(flags); 652#endif 653} 654 655/* 656 * Free a 0-order page 657 */ 658static void FASTCALL(free_hot_cold_page(struct page *page, int cold)); 659static void fastcall free_hot_cold_page(struct page *page, int cold) 660{ 661 struct zone *zone = page_zone(page); 662 struct per_cpu_pages *pcp; 663 unsigned long flags; 664 665 arch_free_page(page, 0); 666 667 kernel_map_pages(page, 1, 0); 668 inc_page_state(pgfree); 669 if (PageAnon(page)) 670 page->mapping = NULL; 671 free_pages_check(__FUNCTION__, page); 672 pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; 673 local_irq_save(flags); 674 list_add(&page->lru, &pcp->list); 675 pcp->count++; 676 if (pcp->count >= pcp->high) 677 pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); 678 local_irq_restore(flags); 679 put_cpu(); 680} 681 682void fastcall free_hot_page(struct page *page) 683{ 684 free_hot_cold_page(page, 0); 685} 686 687void fastcall free_cold_page(struct page *page) 688{ 689 free_hot_cold_page(page, 1); 690} 691 692static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) 693{ 694 int i; 695 696 BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); 697 for(i = 0; i < (1 << order); i++) 698 clear_highpage(page + i); 699} 700 701/* 702 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But 703 * we cheat by calling it from here, in the order > 0 path. Saves a branch 704 * or two. 705 */ 706static struct page * 707buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags) 708{ 709 unsigned long flags; 710 struct page *page = NULL; 711 int cold = !!(gfp_flags & __GFP_COLD); 712 713 if (order == 0) { 714 struct per_cpu_pages *pcp; 715 716 pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; 717 local_irq_save(flags); 718 if (pcp->count <= pcp->low) 719 pcp->count += rmqueue_bulk(zone, 0, 720 pcp->batch, &pcp->list); 721 if (pcp->count) { 722 page = list_entry(pcp->list.next, struct page, lru); 723 list_del(&page->lru); 724 pcp->count--; 725 } 726 local_irq_restore(flags); 727 put_cpu(); 728 } 729 730 if (page == NULL) { 731 spin_lock_irqsave(&zone->lock, flags); 732 page = __rmqueue(zone, order); 733 spin_unlock_irqrestore(&zone->lock, flags); 734 } 735 736 if (page != NULL) { 737 BUG_ON(bad_range(zone, page)); 738 mod_page_state_zone(zone, pgalloc, 1 << order); 739 prep_new_page(page, order); 740 741 if (gfp_flags & __GFP_ZERO) 742 prep_zero_page(page, order, gfp_flags); 743 744 if (order && (gfp_flags & __GFP_COMP)) 745 prep_compound_page(page, order); 746 } 747 return page; 748} 749 750/* 751 * Return 1 if free pages are above 'mark'. This takes into account the order 752 * of the allocation. 753 */ 754int zone_watermark_ok(struct zone *z, int order, unsigned long mark, 755 int classzone_idx, int can_try_harder, gfp_t gfp_high) 756{ 757 /* free_pages my go negative - that's OK */ 758 long min = mark, free_pages = z->free_pages - (1 << order) + 1; 759 int o; 760 761 if (gfp_high) 762 min -= min / 2; 763 if (can_try_harder) 764 min -= min / 4; 765 766 if (free_pages <= min + z->lowmem_reserve[classzone_idx]) 767 return 0; 768 for (o = 0; o < order; o++) { 769 /* At the next order, this order's pages become unavailable */ 770 free_pages -= z->free_area[o].nr_free << o; 771 772 /* Require fewer higher order pages to be free */ 773 min >>= 1; 774 775 if (free_pages <= min) 776 return 0; 777 } 778 return 1; 779} 780 781static inline int 782should_reclaim_zone(struct zone *z, gfp_t gfp_mask) 783{ 784 if (!z->reclaim_pages) 785 return 0; 786 if (gfp_mask & __GFP_NORECLAIM) 787 return 0; 788 return 1; 789} 790 791/* 792 * This is the 'heart' of the zoned buddy allocator. 793 */ 794struct page * fastcall 795__alloc_pages(gfp_t gfp_mask, unsigned int order, 796 struct zonelist *zonelist) 797{ 798 const gfp_t wait = gfp_mask & __GFP_WAIT; 799 struct zone **zones, *z; 800 struct page *page; 801 struct reclaim_state reclaim_state; 802 struct task_struct *p = current; 803 int i; 804 int classzone_idx; 805 int do_retry; 806 int can_try_harder; 807 int did_some_progress; 808 809 might_sleep_if(wait); 810 811 /* 812 * The caller may dip into page reserves a bit more if the caller 813 * cannot run direct reclaim, or is the caller has realtime scheduling 814 * policy 815 */ 816 can_try_harder = (unlikely(rt_task(p)) && !in_interrupt()) || !wait; 817 818 zones = zonelist->zones; /* the list of zones suitable for gfp_mask */ 819 820 if (unlikely(zones[0] == NULL)) { 821 /* Should this ever happen?? */ 822 return NULL; 823 } 824 825 classzone_idx = zone_idx(zones[0]); 826 827restart: 828 /* 829 * Go through the zonelist once, looking for a zone with enough free. 830 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 831 */ 832 for (i = 0; (z = zones[i]) != NULL; i++) { 833 int do_reclaim = should_reclaim_zone(z, gfp_mask); 834 835 if (!cpuset_zone_allowed(z, __GFP_HARDWALL)) 836 continue; 837 838 /* 839 * If the zone is to attempt early page reclaim then this loop 840 * will try to reclaim pages and check the watermark a second 841 * time before giving up and falling back to the next zone. 842 */ 843zone_reclaim_retry: 844 if (!zone_watermark_ok(z, order, z->pages_low, 845 classzone_idx, 0, 0)) { 846 if (!do_reclaim) 847 continue; 848 else { 849 zone_reclaim(z, gfp_mask, order); 850 /* Only try reclaim once */ 851 do_reclaim = 0; 852 goto zone_reclaim_retry; 853 } 854 } 855 856 page = buffered_rmqueue(z, order, gfp_mask); 857 if (page) 858 goto got_pg; 859 } 860 861 for (i = 0; (z = zones[i]) != NULL; i++) 862 wakeup_kswapd(z, order); 863 864 /* 865 * Go through the zonelist again. Let __GFP_HIGH and allocations 866 * coming from realtime tasks to go deeper into reserves 867 * 868 * This is the last chance, in general, before the goto nopage. 869 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. 870 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 871 */ 872 for (i = 0; (z = zones[i]) != NULL; i++) { 873 if (!zone_watermark_ok(z, order, z->pages_min, 874 classzone_idx, can_try_harder, 875 gfp_mask & __GFP_HIGH)) 876 continue; 877 878 if (wait && !cpuset_zone_allowed(z, gfp_mask)) 879 continue; 880 881 page = buffered_rmqueue(z, order, gfp_mask); 882 if (page) 883 goto got_pg; 884 } 885 886 /* This allocation should allow future memory freeing. */ 887 888 if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) 889 && !in_interrupt()) { 890 if (!(gfp_mask & __GFP_NOMEMALLOC)) { 891 /* go through the zonelist yet again, ignoring mins */ 892 for (i = 0; (z = zones[i]) != NULL; i++) { 893 if (!cpuset_zone_allowed(z, gfp_mask)) 894 continue; 895 page = buffered_rmqueue(z, order, gfp_mask); 896 if (page) 897 goto got_pg; 898 } 899 } 900 goto nopage; 901 } 902 903 /* Atomic allocations - we can't balance anything */ 904 if (!wait) 905 goto nopage; 906 907rebalance: 908 cond_resched(); 909 910 /* We now go into synchronous reclaim */ 911 p->flags |= PF_MEMALLOC; 912 reclaim_state.reclaimed_slab = 0; 913 p->reclaim_state = &reclaim_state; 914 915 did_some_progress = try_to_free_pages(zones, gfp_mask); 916 917 p->reclaim_state = NULL; 918 p->flags &= ~PF_MEMALLOC; 919 920 cond_resched(); 921 922 if (likely(did_some_progress)) { 923 for (i = 0; (z = zones[i]) != NULL; i++) { 924 if (!zone_watermark_ok(z, order, z->pages_min, 925 classzone_idx, can_try_harder, 926 gfp_mask & __GFP_HIGH)) 927 continue; 928 929 if (!cpuset_zone_allowed(z, gfp_mask)) 930 continue; 931 932 page = buffered_rmqueue(z, order, gfp_mask); 933 if (page) 934 goto got_pg; 935 } 936 } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { 937 /* 938 * Go through the zonelist yet one more time, keep 939 * very high watermark here, this is only to catch 940 * a parallel oom killing, we must fail if we're still 941 * under heavy pressure. 942 */ 943 for (i = 0; (z = zones[i]) != NULL; i++) { 944 if (!zone_watermark_ok(z, order, z->pages_high, 945 classzone_idx, 0, 0)) 946 continue; 947 948 if (!cpuset_zone_allowed(z, __GFP_HARDWALL)) 949 continue; 950 951 page = buffered_rmqueue(z, order, gfp_mask); 952 if (page) 953 goto got_pg; 954 } 955 956 out_of_memory(gfp_mask, order); 957 goto restart; 958 } 959 960 /* 961 * Don't let big-order allocations loop unless the caller explicitly 962 * requests that. Wait for some write requests to complete then retry. 963 * 964 * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order 965 * <= 3, but that may not be true in other implementations. 966 */ 967 do_retry = 0; 968 if (!(gfp_mask & __GFP_NORETRY)) { 969 if ((order <= 3) || (gfp_mask & __GFP_REPEAT)) 970 do_retry = 1; 971 if (gfp_mask & __GFP_NOFAIL) 972 do_retry = 1; 973 } 974 if (do_retry) { 975 blk_congestion_wait(WRITE, HZ/50); 976 goto rebalance; 977 } 978 979nopage: 980 if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { 981 printk(KERN_WARNING "%s: page allocation failure." 982 " order:%d, mode:0x%x\n", 983 p->comm, order, gfp_mask); 984 dump_stack(); 985 show_mem(); 986 } 987 return NULL; 988got_pg: 989 zone_statistics(zonelist, z); 990 return page; 991} 992 993EXPORT_SYMBOL(__alloc_pages); 994 995/* 996 * Common helper functions. 997 */ 998fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) 999{ 1000 struct page * page; 1001 page = alloc_pages(gfp_mask, order); 1002 if (!page) 1003 return 0; 1004 return (unsigned long) page_address(page); 1005} 1006 1007EXPORT_SYMBOL(__get_free_pages); 1008 1009fastcall unsigned long get_zeroed_page(gfp_t gfp_mask) 1010{ 1011 struct page * page; 1012 1013 /* 1014 * get_zeroed_page() returns a 32-bit address, which cannot represent 1015 * a highmem page 1016 */ 1017 BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); 1018 1019 page = alloc_pages(gfp_mask | __GFP_ZERO, 0); 1020 if (page) 1021 return (unsigned long) page_address(page); 1022 return 0; 1023} 1024 1025EXPORT_SYMBOL(get_zeroed_page); 1026 1027void __pagevec_free(struct pagevec *pvec) 1028{ 1029 int i = pagevec_count(pvec); 1030 1031 while (--i >= 0) 1032 free_hot_cold_page(pvec->pages[i], pvec->cold); 1033} 1034 1035fastcall void __free_pages(struct page *page, unsigned int order) 1036{ 1037 if (put_page_testzero(page)) { 1038 if (order == 0) 1039 free_hot_page(page); 1040 else 1041 __free_pages_ok(page, order); 1042 } 1043} 1044 1045EXPORT_SYMBOL(__free_pages); 1046 1047fastcall void free_pages(unsigned long addr, unsigned int order) 1048{ 1049 if (addr != 0) { 1050 BUG_ON(!virt_addr_valid((void *)addr)); 1051 __free_pages(virt_to_page((void *)addr), order); 1052 } 1053} 1054 1055EXPORT_SYMBOL(free_pages); 1056 1057/* 1058 * Total amount of free (allocatable) RAM: 1059 */ 1060unsigned int nr_free_pages(void) 1061{ 1062 unsigned int sum = 0; 1063 struct zone *zone; 1064 1065 for_each_zone(zone) 1066 sum += zone->free_pages; 1067 1068 return sum; 1069} 1070 1071EXPORT_SYMBOL(nr_free_pages); 1072 1073#ifdef CONFIG_NUMA 1074unsigned int nr_free_pages_pgdat(pg_data_t *pgdat) 1075{ 1076 unsigned int i, sum = 0; 1077 1078 for (i = 0; i < MAX_NR_ZONES; i++) 1079 sum += pgdat->node_zones[i].free_pages; 1080 1081 return sum; 1082} 1083#endif 1084 1085static unsigned int nr_free_zone_pages(int offset) 1086{ 1087 /* Just pick one node, since fallback list is circular */ 1088 pg_data_t *pgdat = NODE_DATA(numa_node_id()); 1089 unsigned int sum = 0; 1090 1091 struct zonelist *zonelist = pgdat->node_zonelists + offset; 1092 struct zone **zonep = zonelist->zones; 1093 struct zone *zone; 1094 1095 for (zone = *zonep++; zone; zone = *zonep++) { 1096 unsigned long size = zone->present_pages; 1097 unsigned long high = zone->pages_high; 1098 if (size > high) 1099 sum += size - high; 1100 } 1101 1102 return sum; 1103} 1104 1105/* 1106 * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL 1107 */ 1108unsigned int nr_free_buffer_pages(void) 1109{ 1110 return nr_free_zone_pages(gfp_zone(GFP_USER)); 1111} 1112 1113/* 1114 * Amount of free RAM allocatable within all zones 1115 */ 1116unsigned int nr_free_pagecache_pages(void) 1117{ 1118 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER)); 1119} 1120 1121#ifdef CONFIG_HIGHMEM 1122unsigned int nr_free_highpages (void) 1123{ 1124 pg_data_t *pgdat; 1125 unsigned int pages = 0; 1126 1127 for_each_pgdat(pgdat) 1128 pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages; 1129 1130 return pages; 1131} 1132#endif 1133 1134#ifdef CONFIG_NUMA 1135static void show_node(struct zone *zone) 1136{ 1137 printk("Node %d ", zone->zone_pgdat->node_id); 1138} 1139#else 1140#define show_node(zone) do { } while (0) 1141#endif 1142 1143/* 1144 * Accumulate the page_state information across all CPUs. 1145 * The result is unavoidably approximate - it can change 1146 * during and after execution of this function. 1147 */ 1148static DEFINE_PER_CPU(struct page_state, page_states) = {0}; 1149 1150atomic_t nr_pagecache = ATOMIC_INIT(0); 1151EXPORT_SYMBOL(nr_pagecache); 1152#ifdef CONFIG_SMP 1153DEFINE_PER_CPU(long, nr_pagecache_local) = 0; 1154#endif 1155 1156void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) 1157{ 1158 int cpu = 0; 1159 1160 memset(ret, 0, sizeof(*ret)); 1161 cpus_and(*cpumask, *cpumask, cpu_online_map); 1162 1163 cpu = first_cpu(*cpumask); 1164 while (cpu < NR_CPUS) { 1165 unsigned long *in, *out, off; 1166 1167 in = (unsigned long *)&per_cpu(page_states, cpu); 1168 1169 cpu = next_cpu(cpu, *cpumask); 1170 1171 if (cpu < NR_CPUS) 1172 prefetch(&per_cpu(page_states, cpu)); 1173 1174 out = (unsigned long *)ret; 1175 for (off = 0; off < nr; off++) 1176 *out++ += *in++; 1177 } 1178} 1179 1180void get_page_state_node(struct page_state *ret, int node) 1181{ 1182 int nr; 1183 cpumask_t mask = node_to_cpumask(node); 1184 1185 nr = offsetof(struct page_state, GET_PAGE_STATE_LAST); 1186 nr /= sizeof(unsigned long); 1187 1188 __get_page_state(ret, nr+1, &mask); 1189} 1190 1191void get_page_state(struct page_state *ret) 1192{ 1193 int nr; 1194 cpumask_t mask = CPU_MASK_ALL; 1195 1196 nr = offsetof(struct page_state, GET_PAGE_STATE_LAST); 1197 nr /= sizeof(unsigned long); 1198 1199 __get_page_state(ret, nr + 1, &mask); 1200} 1201 1202void get_full_page_state(struct page_state *ret) 1203{ 1204 cpumask_t mask = CPU_MASK_ALL; 1205 1206 __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask); 1207} 1208 1209unsigned long __read_page_state(unsigned long offset) 1210{ 1211 unsigned long ret = 0; 1212 int cpu; 1213 1214 for_each_online_cpu(cpu) { 1215 unsigned long in; 1216 1217 in = (unsigned long)&per_cpu(page_states, cpu) + offset; 1218 ret += *((unsigned long *)in); 1219 } 1220 return ret; 1221} 1222 1223void __mod_page_state(unsigned long offset, unsigned long delta) 1224{ 1225 unsigned long flags; 1226 void* ptr; 1227 1228 local_irq_save(flags); 1229 ptr = &__get_cpu_var(page_states); 1230 *(unsigned long*)(ptr + offset) += delta; 1231 local_irq_restore(flags); 1232} 1233 1234EXPORT_SYMBOL(__mod_page_state); 1235 1236void __get_zone_counts(unsigned long *active, unsigned long *inactive, 1237 unsigned long *free, struct pglist_data *pgdat) 1238{ 1239 struct zone *zones = pgdat->node_zones; 1240 int i; 1241 1242 *active = 0; 1243 *inactive = 0; 1244 *free = 0; 1245 for (i = 0; i < MAX_NR_ZONES; i++) { 1246 *active += zones[i].nr_active; 1247 *inactive += zones[i].nr_inactive; 1248 *free += zones[i].free_pages; 1249 } 1250} 1251 1252void get_zone_counts(unsigned long *active, 1253 unsigned long *inactive, unsigned long *free) 1254{ 1255 struct pglist_data *pgdat; 1256 1257 *active = 0; 1258 *inactive = 0; 1259 *free = 0; 1260 for_each_pgdat(pgdat) { 1261 unsigned long l, m, n; 1262 __get_zone_counts(&l, &m, &n, pgdat); 1263 *active += l; 1264 *inactive += m; 1265 *free += n; 1266 } 1267} 1268 1269void si_meminfo(struct sysinfo *val) 1270{ 1271 val->totalram = totalram_pages; 1272 val->sharedram = 0; 1273 val->freeram = nr_free_pages(); 1274 val->bufferram = nr_blockdev_pages(); 1275#ifdef CONFIG_HIGHMEM 1276 val->totalhigh = totalhigh_pages; 1277 val->freehigh = nr_free_highpages(); 1278#else 1279 val->totalhigh = 0; 1280 val->freehigh = 0; 1281#endif 1282 val->mem_unit = PAGE_SIZE; 1283} 1284 1285EXPORT_SYMBOL(si_meminfo); 1286 1287#ifdef CONFIG_NUMA 1288void si_meminfo_node(struct sysinfo *val, int nid) 1289{ 1290 pg_data_t *pgdat = NODE_DATA(nid); 1291 1292 val->totalram = pgdat->node_present_pages; 1293 val->freeram = nr_free_pages_pgdat(pgdat); 1294 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; 1295 val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages; 1296 val->mem_unit = PAGE_SIZE; 1297} 1298#endif 1299 1300#define K(x) ((x) << (PAGE_SHIFT-10)) 1301 1302/* 1303 * Show free area list (used inside shift_scroll-lock stuff) 1304 * We also calculate the percentage fragmentation. We do this by counting the 1305 * memory on each free list with the exception of the first item on the list. 1306 */ 1307void show_free_areas(void) 1308{ 1309 struct page_state ps; 1310 int cpu, temperature; 1311 unsigned long active; 1312 unsigned long inactive; 1313 unsigned long free; 1314 struct zone *zone; 1315 1316 for_each_zone(zone) { 1317 show_node(zone); 1318 printk("%s per-cpu:", zone->name); 1319 1320 if (!zone->present_pages) { 1321 printk(" empty\n"); 1322 continue; 1323 } else 1324 printk("\n"); 1325 1326 for (cpu = 0; cpu < NR_CPUS; ++cpu) { 1327 struct per_cpu_pageset *pageset; 1328 1329 if (!cpu_possible(cpu)) 1330 continue; 1331 1332 pageset = zone_pcp(zone, cpu); 1333 1334 for (temperature = 0; temperature < 2; temperature++) 1335 printk("cpu %d %s: low %d, high %d, batch %d used:%d\n", 1336 cpu, 1337 temperature ? "cold" : "hot", 1338 pageset->pcp[temperature].low, 1339 pageset->pcp[temperature].high, 1340 pageset->pcp[temperature].batch, 1341 pageset->pcp[temperature].count); 1342 } 1343 } 1344 1345 get_page_state(&ps); 1346 get_zone_counts(&active, &inactive, &free); 1347 1348 printk("Free pages: %11ukB (%ukB HighMem)\n", 1349 K(nr_free_pages()), 1350 K(nr_free_highpages())); 1351 1352 printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu " 1353 "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n", 1354 active, 1355 inactive, 1356 ps.nr_dirty, 1357 ps.nr_writeback, 1358 ps.nr_unstable, 1359 nr_free_pages(), 1360 ps.nr_slab, 1361 ps.nr_mapped, 1362 ps.nr_page_table_pages); 1363 1364 for_each_zone(zone) { 1365 int i; 1366 1367 show_node(zone); 1368 printk("%s" 1369 " free:%lukB" 1370 " min:%lukB" 1371 " low:%lukB" 1372 " high:%lukB" 1373 " active:%lukB" 1374 " inactive:%lukB" 1375 " present:%lukB" 1376 " pages_scanned:%lu" 1377 " all_unreclaimable? %s" 1378 "\n", 1379 zone->name, 1380 K(zone->free_pages), 1381 K(zone->pages_min), 1382 K(zone->pages_low), 1383 K(zone->pages_high), 1384 K(zone->nr_active), 1385 K(zone->nr_inactive), 1386 K(zone->present_pages), 1387 zone->pages_scanned, 1388 (zone->all_unreclaimable ? "yes" : "no") 1389 ); 1390 printk("lowmem_reserve[]:"); 1391 for (i = 0; i < MAX_NR_ZONES; i++) 1392 printk(" %lu", zone->lowmem_reserve[i]); 1393 printk("\n"); 1394 } 1395 1396 for_each_zone(zone) { 1397 unsigned long nr, flags, order, total = 0; 1398 1399 show_node(zone); 1400 printk("%s: ", zone->name); 1401 if (!zone->present_pages) { 1402 printk("empty\n"); 1403 continue; 1404 } 1405 1406 spin_lock_irqsave(&zone->lock, flags); 1407 for (order = 0; order < MAX_ORDER; order++) { 1408 nr = zone->free_area[order].nr_free; 1409 total += nr << order; 1410 printk("%lu*%lukB ", nr, K(1UL) << order); 1411 } 1412 spin_unlock_irqrestore(&zone->lock, flags); 1413 printk("= %lukB\n", K(total)); 1414 } 1415 1416 show_swap_cache_info(); 1417} 1418 1419/* 1420 * Builds allocation fallback zone lists. 1421 */ 1422static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k) 1423{ 1424 switch (k) { 1425 struct zone *zone; 1426 default: 1427 BUG(); 1428 case ZONE_HIGHMEM: 1429 zone = pgdat->node_zones + ZONE_HIGHMEM; 1430 if (zone->present_pages) { 1431#ifndef CONFIG_HIGHMEM 1432 BUG(); 1433#endif 1434 zonelist->zones[j++] = zone; 1435 } 1436 case ZONE_NORMAL: 1437 zone = pgdat->node_zones + ZONE_NORMAL; 1438 if (zone->present_pages) 1439 zonelist->zones[j++] = zone; 1440 case ZONE_DMA: 1441 zone = pgdat->node_zones + ZONE_DMA; 1442 if (zone->present_pages) 1443 zonelist->zones[j++] = zone; 1444 } 1445 1446 return j; 1447} 1448 1449static inline int highest_zone(int zone_bits) 1450{ 1451 int res = ZONE_NORMAL; 1452 if (zone_bits & (__force int)__GFP_HIGHMEM) 1453 res = ZONE_HIGHMEM; 1454 if (zone_bits & (__force int)__GFP_DMA) 1455 res = ZONE_DMA; 1456 return res; 1457} 1458 1459#ifdef CONFIG_NUMA 1460#define MAX_NODE_LOAD (num_online_nodes()) 1461static int __initdata node_load[MAX_NUMNODES]; 1462/** 1463 * find_next_best_node - find the next node that should appear in a given node's fallback list 1464 * @node: node whose fallback list we're appending 1465 * @used_node_mask: nodemask_t of already used nodes 1466 * 1467 * We use a number of factors to determine which is the next node that should 1468 * appear on a given node's fallback list. The node should not have appeared 1469 * already in @node's fallback list, and it should be the next closest node 1470 * according to the distance array (which contains arbitrary distance values 1471 * from each node to each node in the system), and should also prefer nodes 1472 * with no CPUs, since presumably they'll have very little allocation pressure 1473 * on them otherwise. 1474 * It returns -1 if no node is found. 1475 */ 1476static int __init find_next_best_node(int node, nodemask_t *used_node_mask) 1477{ 1478 int i, n, val; 1479 int min_val = INT_MAX; 1480 int best_node = -1; 1481 1482 for_each_online_node(i) { 1483 cpumask_t tmp; 1484 1485 /* Start from local node */ 1486 n = (node+i) % num_online_nodes(); 1487 1488 /* Don't want a node to appear more than once */ 1489 if (node_isset(n, *used_node_mask)) 1490 continue; 1491 1492 /* Use the local node if we haven't already */ 1493 if (!node_isset(node, *used_node_mask)) { 1494 best_node = node; 1495 break; 1496 } 1497 1498 /* Use the distance array to find the distance */ 1499 val = node_distance(node, n); 1500 1501 /* Give preference to headless and unused nodes */ 1502 tmp = node_to_cpumask(n); 1503 if (!cpus_empty(tmp)) 1504 val += PENALTY_FOR_NODE_WITH_CPUS; 1505 1506 /* Slight preference for less loaded node */ 1507 val *= (MAX_NODE_LOAD*MAX_NUMNODES); 1508 val += node_load[n]; 1509 1510 if (val < min_val) { 1511 min_val = val; 1512 best_node = n; 1513 } 1514 } 1515 1516 if (best_node >= 0) 1517 node_set(best_node, *used_node_mask); 1518 1519 return best_node; 1520} 1521 1522static void __init build_zonelists(pg_data_t *pgdat) 1523{ 1524 int i, j, k, node, local_node; 1525 int prev_node, load; 1526 struct zonelist *zonelist; 1527 nodemask_t used_mask; 1528 1529 /* initialize zonelists */ 1530 for (i = 0; i < GFP_ZONETYPES; i++) { 1531 zonelist = pgdat->node_zonelists + i; 1532 zonelist->zones[0] = NULL; 1533 } 1534 1535 /* NUMA-aware ordering of nodes */ 1536 local_node = pgdat->node_id; 1537 load = num_online_nodes(); 1538 prev_node = local_node; 1539 nodes_clear(used_mask); 1540 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { 1541 /* 1542 * We don't want to pressure a particular node. 1543 * So adding penalty to the first node in same 1544 * distance group to make it round-robin. 1545 */ 1546 if (node_distance(local_node, node) != 1547 node_distance(local_node, prev_node)) 1548 node_load[node] += load; 1549 prev_node = node; 1550 load--; 1551 for (i = 0; i < GFP_ZONETYPES; i++) { 1552 zonelist = pgdat->node_zonelists + i; 1553 for (j = 0; zonelist->zones[j] != NULL; j++); 1554 1555 k = highest_zone(i); 1556 1557 j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); 1558 zonelist->zones[j] = NULL; 1559 } 1560 } 1561} 1562 1563#else /* CONFIG_NUMA */ 1564 1565static void __init build_zonelists(pg_data_t *pgdat) 1566{ 1567 int i, j, k, node, local_node; 1568 1569 local_node = pgdat->node_id; 1570 for (i = 0; i < GFP_ZONETYPES; i++) { 1571 struct zonelist *zonelist; 1572 1573 zonelist = pgdat->node_zonelists + i; 1574 1575 j = 0; 1576 k = highest_zone(i); 1577 j = build_zonelists_node(pgdat, zonelist, j, k); 1578 /* 1579 * Now we build the zonelist so that it contains the zones 1580 * of all the other nodes. 1581 * We don't want to pressure a particular node, so when 1582 * building the zones for node N, we make sure that the 1583 * zones coming right after the local ones are those from 1584 * node N+1 (modulo N) 1585 */ 1586 for (node = local_node + 1; node < MAX_NUMNODES; node++) { 1587 if (!node_online(node)) 1588 continue; 1589 j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); 1590 } 1591 for (node = 0; node < local_node; node++) { 1592 if (!node_online(node)) 1593 continue; 1594 j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); 1595 } 1596 1597 zonelist->zones[j] = NULL; 1598 } 1599} 1600 1601#endif /* CONFIG_NUMA */ 1602 1603void __init build_all_zonelists(void) 1604{ 1605 int i; 1606 1607 for_each_online_node(i) 1608 build_zonelists(NODE_DATA(i)); 1609 printk("Built %i zonelists\n", num_online_nodes()); 1610 cpuset_init_current_mems_allowed(); 1611} 1612 1613/* 1614 * Helper functions to size the waitqueue hash table. 1615 * Essentially these want to choose hash table sizes sufficiently 1616 * large so that collisions trying to wait on pages are rare. 1617 * But in fact, the number of active page waitqueues on typical 1618 * systems is ridiculously low, less than 200. So this is even 1619 * conservative, even though it seems large. 1620 * 1621 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to 1622 * waitqueues, i.e. the size of the waitq table given the number of pages. 1623 */ 1624#define PAGES_PER_WAITQUEUE 256 1625 1626static inline unsigned long wait_table_size(unsigned long pages) 1627{ 1628 unsigned long size = 1; 1629 1630 pages /= PAGES_PER_WAITQUEUE; 1631 1632 while (size < pages) 1633 size <<= 1; 1634 1635 /* 1636 * Once we have dozens or even hundreds of threads sleeping 1637 * on IO we've got bigger problems than wait queue collision. 1638 * Limit the size of the wait table to a reasonable size. 1639 */ 1640 size = min(size, 4096UL); 1641 1642 return max(size, 4UL); 1643} 1644 1645/* 1646 * This is an integer logarithm so that shifts can be used later 1647 * to extract the more random high bits from the multiplicative 1648 * hash function before the remainder is taken. 1649 */ 1650static inline unsigned long wait_table_bits(unsigned long size) 1651{ 1652 return ffz(~size); 1653} 1654 1655#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) 1656 1657static void __init calculate_zone_totalpages(struct pglist_data *pgdat, 1658 unsigned long *zones_size, unsigned long *zholes_size) 1659{ 1660 unsigned long realtotalpages, totalpages = 0; 1661 int i; 1662 1663 for (i = 0; i < MAX_NR_ZONES; i++) 1664 totalpages += zones_size[i]; 1665 pgdat->node_spanned_pages = totalpages; 1666 1667 realtotalpages = totalpages; 1668 if (zholes_size) 1669 for (i = 0; i < MAX_NR_ZONES; i++) 1670 realtotalpages -= zholes_size[i]; 1671 pgdat->node_present_pages = realtotalpages; 1672 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); 1673} 1674 1675 1676/* 1677 * Initially all pages are reserved - free ones are freed 1678 * up by free_all_bootmem() once the early boot process is 1679 * done. Non-atomic initialization, single-pass. 1680 */ 1681void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone, 1682 unsigned long start_pfn) 1683{ 1684 struct page *page; 1685 unsigned long end_pfn = start_pfn + size; 1686 unsigned long pfn; 1687 1688 for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) { 1689 if (!early_pfn_valid(pfn)) 1690 continue; 1691 if (!early_pfn_in_nid(pfn, nid)) 1692 continue; 1693 page = pfn_to_page(pfn); 1694 set_page_links(page, zone, nid, pfn); 1695 set_page_count(page, 1); 1696 reset_page_mapcount(page); 1697 SetPageReserved(page); 1698 INIT_LIST_HEAD(&page->lru); 1699#ifdef WANT_PAGE_VIRTUAL 1700 /* The shift won't overflow because ZONE_NORMAL is below 4G. */ 1701 if (!is_highmem_idx(zone)) 1702 set_page_address(page, __va(pfn << PAGE_SHIFT)); 1703#endif 1704 } 1705} 1706 1707void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone, 1708 unsigned long size) 1709{ 1710 int order; 1711 for (order = 0; order < MAX_ORDER ; order++) { 1712 INIT_LIST_HEAD(&zone->free_area[order].free_list); 1713 zone->free_area[order].nr_free = 0; 1714 } 1715} 1716 1717#define ZONETABLE_INDEX(x, zone_nr) ((x << ZONES_SHIFT) | zone_nr) 1718void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn, 1719 unsigned long size) 1720{ 1721 unsigned long snum = pfn_to_section_nr(pfn); 1722 unsigned long end = pfn_to_section_nr(pfn + size); 1723 1724 if (FLAGS_HAS_NODE) 1725 zone_table[ZONETABLE_INDEX(nid, zid)] = zone; 1726 else 1727 for (; snum <= end; snum++) 1728 zone_table[ZONETABLE_INDEX(snum, zid)] = zone; 1729} 1730 1731#ifndef __HAVE_ARCH_MEMMAP_INIT 1732#define memmap_init(size, nid, zone, start_pfn) \ 1733 memmap_init_zone((size), (nid), (zone), (start_pfn)) 1734#endif 1735 1736static int __devinit zone_batchsize(struct zone *zone) 1737{ 1738 int batch; 1739 1740 /* 1741 * The per-cpu-pages pools are set to around 1000th of the 1742 * size of the zone. But no more than 1/2 of a meg. 1743 * 1744 * OK, so we don't know how big the cache is. So guess. 1745 */ 1746 batch = zone->present_pages / 1024; 1747 if (batch * PAGE_SIZE > 512 * 1024) 1748 batch = (512 * 1024) / PAGE_SIZE; 1749 batch /= 4; /* We effectively *= 4 below */ 1750 if (batch < 1) 1751 batch = 1; 1752 1753 /* 1754 * We will be trying to allcoate bigger chunks of contiguous 1755 * memory of the order of fls(batch). This should result in 1756 * better cache coloring. 1757 * 1758 * A sanity check also to ensure that batch is still in limits. 1759 */ 1760 batch = (1 << fls(batch + batch/2)); 1761 1762 if (fls(batch) >= (PAGE_SHIFT + MAX_ORDER - 2)) 1763 batch = PAGE_SHIFT + ((MAX_ORDER - 1 - PAGE_SHIFT)/2); 1764 1765 return batch; 1766} 1767 1768inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 1769{ 1770 struct per_cpu_pages *pcp; 1771 1772 memset(p, 0, sizeof(*p)); 1773 1774 pcp = &p->pcp[0]; /* hot */ 1775 pcp->count = 0; 1776 pcp->low = 0; 1777 pcp->high = 6 * batch; 1778 pcp->batch = max(1UL, 1 * batch); 1779 INIT_LIST_HEAD(&pcp->list); 1780 1781 pcp = &p->pcp[1]; /* cold*/ 1782 pcp->count = 0; 1783 pcp->low = 0; 1784 pcp->high = 2 * batch; 1785 pcp->batch = max(1UL, batch/2); 1786 INIT_LIST_HEAD(&pcp->list); 1787} 1788 1789#ifdef CONFIG_NUMA 1790/* 1791 * Boot pageset table. One per cpu which is going to be used for all 1792 * zones and all nodes. The parameters will be set in such a way 1793 * that an item put on a list will immediately be handed over to 1794 * the buddy list. This is safe since pageset manipulation is done 1795 * with interrupts disabled. 1796 * 1797 * Some NUMA counter updates may also be caught by the boot pagesets. 1798 * 1799 * The boot_pagesets must be kept even after bootup is complete for 1800 * unused processors and/or zones. They do play a role for bootstrapping 1801 * hotplugged processors. 1802 * 1803 * zoneinfo_show() and maybe other functions do 1804 * not check if the processor is online before following the pageset pointer. 1805 * Other parts of the kernel may not check if the zone is available. 1806 */ 1807static struct per_cpu_pageset 1808 boot_pageset[NR_CPUS]; 1809 1810/* 1811 * Dynamically allocate memory for the 1812 * per cpu pageset array in struct zone. 1813 */ 1814static int __devinit process_zones(int cpu) 1815{ 1816 struct zone *zone, *dzone; 1817 1818 for_each_zone(zone) { 1819 1820 zone->pageset[cpu] = kmalloc_node(sizeof(struct per_cpu_pageset), 1821 GFP_KERNEL, cpu_to_node(cpu)); 1822 if (!zone->pageset[cpu]) 1823 goto bad; 1824 1825 setup_pageset(zone->pageset[cpu], zone_batchsize(zone)); 1826 } 1827 1828 return 0; 1829bad: 1830 for_each_zone(dzone) { 1831 if (dzone == zone) 1832 break; 1833 kfree(dzone->pageset[cpu]); 1834 dzone->pageset[cpu] = NULL; 1835 } 1836 return -ENOMEM; 1837} 1838 1839static inline void free_zone_pagesets(int cpu) 1840{ 1841#ifdef CONFIG_NUMA 1842 struct zone *zone; 1843 1844 for_each_zone(zone) { 1845 struct per_cpu_pageset *pset = zone_pcp(zone, cpu); 1846 1847 zone_pcp(zone, cpu) = NULL; 1848 kfree(pset); 1849 } 1850#endif 1851} 1852 1853static int __devinit pageset_cpuup_callback(struct notifier_block *nfb, 1854 unsigned long action, 1855 void *hcpu) 1856{ 1857 int cpu = (long)hcpu; 1858 int ret = NOTIFY_OK; 1859 1860 switch (action) { 1861 case CPU_UP_PREPARE: 1862 if (process_zones(cpu)) 1863 ret = NOTIFY_BAD; 1864 break; 1865#ifdef CONFIG_HOTPLUG_CPU 1866 case CPU_DEAD: 1867 free_zone_pagesets(cpu); 1868 break; 1869#endif 1870 default: 1871 break; 1872 } 1873 return ret; 1874} 1875 1876static struct notifier_block pageset_notifier = 1877 { &pageset_cpuup_callback, NULL, 0 }; 1878 1879void __init setup_per_cpu_pageset() 1880{ 1881 int err; 1882 1883 /* Initialize per_cpu_pageset for cpu 0. 1884 * A cpuup callback will do this for every cpu 1885 * as it comes online 1886 */ 1887 err = process_zones(smp_processor_id()); 1888 BUG_ON(err); 1889 register_cpu_notifier(&pageset_notifier); 1890} 1891 1892#endif 1893 1894static __devinit 1895void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) 1896{ 1897 int i; 1898 struct pglist_data *pgdat = zone->zone_pgdat; 1899 1900 /* 1901 * The per-page waitqueue mechanism uses hashed waitqueues 1902 * per zone. 1903 */ 1904 zone->wait_table_size = wait_table_size(zone_size_pages); 1905 zone->wait_table_bits = wait_table_bits(zone->wait_table_size); 1906 zone->wait_table = (wait_queue_head_t *) 1907 alloc_bootmem_node(pgdat, zone->wait_table_size 1908 * sizeof(wait_queue_head_t)); 1909 1910 for(i = 0; i < zone->wait_table_size; ++i) 1911 init_waitqueue_head(zone->wait_table + i); 1912} 1913 1914static __devinit void zone_pcp_init(struct zone *zone) 1915{ 1916 int cpu; 1917 unsigned long batch = zone_batchsize(zone); 1918 1919 for (cpu = 0; cpu < NR_CPUS; cpu++) { 1920#ifdef CONFIG_NUMA 1921 /* Early boot. Slab allocator not functional yet */ 1922 zone->pageset[cpu] = &boot_pageset[cpu]; 1923 setup_pageset(&boot_pageset[cpu],0); 1924#else 1925 setup_pageset(zone_pcp(zone,cpu), batch); 1926#endif 1927 } 1928 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", 1929 zone->name, zone->present_pages, batch); 1930} 1931 1932static __devinit void init_currently_empty_zone(struct zone *zone, 1933 unsigned long zone_start_pfn, unsigned long size) 1934{ 1935 struct pglist_data *pgdat = zone->zone_pgdat; 1936 1937 zone_wait_table_init(zone, size); 1938 pgdat->nr_zones = zone_idx(zone) + 1; 1939 1940 zone->zone_mem_map = pfn_to_page(zone_start_pfn); 1941 zone->zone_start_pfn = zone_start_pfn; 1942 1943 memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); 1944 1945 zone_init_free_lists(pgdat, zone, zone->spanned_pages); 1946} 1947 1948/* 1949 * Set up the zone data structures: 1950 * - mark all pages reserved 1951 * - mark all memory queues empty 1952 * - clear the memory bitmaps 1953 */ 1954static void __init free_area_init_core(struct pglist_data *pgdat, 1955 unsigned long *zones_size, unsigned long *zholes_size) 1956{ 1957 unsigned long j; 1958 int nid = pgdat->node_id; 1959 unsigned long zone_start_pfn = pgdat->node_start_pfn; 1960 1961 pgdat->nr_zones = 0; 1962 init_waitqueue_head(&pgdat->kswapd_wait); 1963 pgdat->kswapd_max_order = 0; 1964 1965 for (j = 0; j < MAX_NR_ZONES; j++) { 1966 struct zone *zone = pgdat->node_zones + j; 1967 unsigned long size, realsize; 1968 1969 realsize = size = zones_size[j]; 1970 if (zholes_size) 1971 realsize -= zholes_size[j]; 1972 1973 if (j == ZONE_DMA || j == ZONE_NORMAL) 1974 nr_kernel_pages += realsize; 1975 nr_all_pages += realsize; 1976 1977 zone->spanned_pages = size; 1978 zone->present_pages = realsize; 1979 zone->name = zone_names[j]; 1980 spin_lock_init(&zone->lock); 1981 spin_lock_init(&zone->lru_lock); 1982 zone->zone_pgdat = pgdat; 1983 zone->free_pages = 0; 1984 1985 zone->temp_priority = zone->prev_priority = DEF_PRIORITY; 1986 1987 zone_pcp_init(zone); 1988 INIT_LIST_HEAD(&zone->active_list); 1989 INIT_LIST_HEAD(&zone->inactive_list); 1990 zone->nr_scan_active = 0; 1991 zone->nr_scan_inactive = 0; 1992 zone->nr_active = 0; 1993 zone->nr_inactive = 0; 1994 atomic_set(&zone->reclaim_in_progress, 0); 1995 if (!size) 1996 continue; 1997 1998 zonetable_add(zone, nid, j, zone_start_pfn, size); 1999 init_currently_empty_zone(zone, zone_start_pfn, size); 2000 zone_start_pfn += size; 2001 } 2002} 2003 2004static void __init alloc_node_mem_map(struct pglist_data *pgdat) 2005{ 2006 /* Skip empty nodes */ 2007 if (!pgdat->node_spanned_pages) 2008 return; 2009 2010#ifdef CONFIG_FLAT_NODE_MEM_MAP 2011 /* ia64 gets its own node_mem_map, before this, without bootmem */ 2012 if (!pgdat->node_mem_map) { 2013 unsigned long size; 2014 struct page *map; 2015 2016 size = (pgdat->node_spanned_pages + 1) * sizeof(struct page); 2017 map = alloc_remap(pgdat->node_id, size); 2018 if (!map) 2019 map = alloc_bootmem_node(pgdat, size); 2020 pgdat->node_mem_map = map; 2021 } 2022#ifdef CONFIG_FLATMEM 2023 /* 2024 * With no DISCONTIG, the global mem_map is just set as node 0's 2025 */ 2026 if (pgdat == NODE_DATA(0)) 2027 mem_map = NODE_DATA(0)->node_mem_map; 2028#endif 2029#endif /* CONFIG_FLAT_NODE_MEM_MAP */ 2030} 2031 2032void __init free_area_init_node(int nid, struct pglist_data *pgdat, 2033 unsigned long *zones_size, unsigned long node_start_pfn, 2034 unsigned long *zholes_size) 2035{ 2036 pgdat->node_id = nid; 2037 pgdat->node_start_pfn = node_start_pfn; 2038 calculate_zone_totalpages(pgdat, zones_size, zholes_size); 2039 2040 alloc_node_mem_map(pgdat); 2041 2042 free_area_init_core(pgdat, zones_size, zholes_size); 2043} 2044 2045#ifndef CONFIG_NEED_MULTIPLE_NODES 2046static bootmem_data_t contig_bootmem_data; 2047struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; 2048 2049EXPORT_SYMBOL(contig_page_data); 2050#endif 2051 2052void __init free_area_init(unsigned long *zones_size) 2053{ 2054 free_area_init_node(0, NODE_DATA(0), zones_size, 2055 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 2056} 2057 2058#ifdef CONFIG_PROC_FS 2059 2060#include <linux/seq_file.h> 2061 2062static void *frag_start(struct seq_file *m, loff_t *pos) 2063{ 2064 pg_data_t *pgdat; 2065 loff_t node = *pos; 2066 2067 for (pgdat = pgdat_list; pgdat && node; pgdat = pgdat->pgdat_next) 2068 --node; 2069 2070 return pgdat; 2071} 2072 2073static void *frag_next(struct seq_file *m, void *arg, loff_t *pos) 2074{ 2075 pg_data_t *pgdat = (pg_data_t *)arg; 2076 2077 (*pos)++; 2078 return pgdat->pgdat_next; 2079} 2080 2081static void frag_stop(struct seq_file *m, void *arg) 2082{ 2083} 2084 2085/* 2086 * This walks the free areas for each zone. 2087 */ 2088static int frag_show(struct seq_file *m, void *arg) 2089{ 2090 pg_data_t *pgdat = (pg_data_t *)arg; 2091 struct zone *zone; 2092 struct zone *node_zones = pgdat->node_zones; 2093 unsigned long flags; 2094 int order; 2095 2096 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { 2097 if (!zone->present_pages) 2098 continue; 2099 2100 spin_lock_irqsave(&zone->lock, flags); 2101 seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); 2102 for (order = 0; order < MAX_ORDER; ++order) 2103 seq_printf(m, "%6lu ", zone->free_area[order].nr_free); 2104 spin_unlock_irqrestore(&zone->lock, flags); 2105 seq_putc(m, '\n'); 2106 } 2107 return 0; 2108} 2109 2110struct seq_operations fragmentation_op = { 2111 .start = frag_start, 2112 .next = frag_next, 2113 .stop = frag_stop, 2114 .show = frag_show, 2115}; 2116 2117/* 2118 * Output information about zones in @pgdat. 2119 */ 2120static int zoneinfo_show(struct seq_file *m, void *arg) 2121{ 2122 pg_data_t *pgdat = arg; 2123 struct zone *zone; 2124 struct zone *node_zones = pgdat->node_zones; 2125 unsigned long flags; 2126 2127 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) { 2128 int i; 2129 2130 if (!zone->present_pages) 2131 continue; 2132 2133 spin_lock_irqsave(&zone->lock, flags); 2134 seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name); 2135 seq_printf(m, 2136 "\n pages free %lu" 2137 "\n min %lu" 2138 "\n low %lu" 2139 "\n high %lu" 2140 "\n active %lu" 2141 "\n inactive %lu" 2142 "\n scanned %lu (a: %lu i: %lu)" 2143 "\n spanned %lu" 2144 "\n present %lu", 2145 zone->free_pages, 2146 zone->pages_min, 2147 zone->pages_low, 2148 zone->pages_high, 2149 zone->nr_active, 2150 zone->nr_inactive, 2151 zone->pages_scanned, 2152 zone->nr_scan_active, zone->nr_scan_inactive, 2153 zone->spanned_pages, 2154 zone->present_pages); 2155 seq_printf(m, 2156 "\n protection: (%lu", 2157 zone->lowmem_reserve[0]); 2158 for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) 2159 seq_printf(m, ", %lu", zone->lowmem_reserve[i]); 2160 seq_printf(m, 2161 ")" 2162 "\n pagesets"); 2163 for (i = 0; i < ARRAY_SIZE(zone->pageset); i++) { 2164 struct per_cpu_pageset *pageset; 2165 int j; 2166 2167 pageset = zone_pcp(zone, i); 2168 for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { 2169 if (pageset->pcp[j].count) 2170 break; 2171 } 2172 if (j == ARRAY_SIZE(pageset->pcp)) 2173 continue; 2174 for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { 2175 seq_printf(m, 2176 "\n cpu: %i pcp: %i" 2177 "\n count: %i" 2178 "\n low: %i" 2179 "\n high: %i" 2180 "\n batch: %i", 2181 i, j, 2182 pageset->pcp[j].count, 2183 pageset->pcp[j].low, 2184 pageset->pcp[j].high, 2185 pageset->pcp[j].batch); 2186 } 2187#ifdef CONFIG_NUMA 2188 seq_printf(m, 2189 "\n numa_hit: %lu" 2190 "\n numa_miss: %lu" 2191 "\n numa_foreign: %lu" 2192 "\n interleave_hit: %lu" 2193 "\n local_node: %lu" 2194 "\n other_node: %lu", 2195 pageset->numa_hit, 2196 pageset->numa_miss, 2197 pageset->numa_foreign, 2198 pageset->interleave_hit, 2199 pageset->local_node, 2200 pageset->other_node); 2201#endif 2202 } 2203 seq_printf(m, 2204 "\n all_unreclaimable: %u" 2205 "\n prev_priority: %i" 2206 "\n temp_priority: %i" 2207 "\n start_pfn: %lu", 2208 zone->all_unreclaimable, 2209 zone->prev_priority, 2210 zone->temp_priority, 2211 zone->zone_start_pfn); 2212 spin_unlock_irqrestore(&zone->lock, flags); 2213 seq_putc(m, '\n'); 2214 } 2215 return 0; 2216} 2217 2218struct seq_operations zoneinfo_op = { 2219 .start = frag_start, /* iterate over all zones. The same as in 2220 * fragmentation. */ 2221 .next = frag_next, 2222 .stop = frag_stop, 2223 .show = zoneinfo_show, 2224}; 2225 2226static char *vmstat_text[] = { 2227 "nr_dirty", 2228 "nr_writeback", 2229 "nr_unstable", 2230 "nr_page_table_pages", 2231 "nr_mapped", 2232 "nr_slab", 2233 2234 "pgpgin", 2235 "pgpgout", 2236 "pswpin", 2237 "pswpout", 2238 "pgalloc_high", 2239 2240 "pgalloc_normal", 2241 "pgalloc_dma", 2242 "pgfree", 2243 "pgactivate", 2244 "pgdeactivate", 2245 2246 "pgfault", 2247 "pgmajfault", 2248 "pgrefill_high", 2249 "pgrefill_normal", 2250 "pgrefill_dma", 2251 2252 "pgsteal_high", 2253 "pgsteal_normal", 2254 "pgsteal_dma", 2255 "pgscan_kswapd_high", 2256 "pgscan_kswapd_normal", 2257 2258 "pgscan_kswapd_dma", 2259 "pgscan_direct_high", 2260 "pgscan_direct_normal", 2261 "pgscan_direct_dma", 2262 "pginodesteal", 2263 2264 "slabs_scanned", 2265 "kswapd_steal", 2266 "kswapd_inodesteal", 2267 "pageoutrun", 2268 "allocstall", 2269 2270 "pgrotated", 2271 "nr_bounce", 2272}; 2273 2274static void *vmstat_start(struct seq_file *m, loff_t *pos) 2275{ 2276 struct page_state *ps; 2277 2278 if (*pos >= ARRAY_SIZE(vmstat_text)) 2279 return NULL; 2280 2281 ps = kmalloc(sizeof(*ps), GFP_KERNEL); 2282 m->private = ps; 2283 if (!ps) 2284 return ERR_PTR(-ENOMEM); 2285 get_full_page_state(ps); 2286 ps->pgpgin /= 2; /* sectors -> kbytes */ 2287 ps->pgpgout /= 2; 2288 return (unsigned long *)ps + *pos; 2289} 2290 2291static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos) 2292{ 2293 (*pos)++; 2294 if (*pos >= ARRAY_SIZE(vmstat_text)) 2295 return NULL; 2296 return (unsigned long *)m->private + *pos; 2297} 2298 2299static int vmstat_show(struct seq_file *m, void *arg) 2300{ 2301 unsigned long *l = arg; 2302 unsigned long off = l - (unsigned long *)m->private; 2303 2304 seq_printf(m, "%s %lu\n", vmstat_text[off], *l); 2305 return 0; 2306} 2307 2308static void vmstat_stop(struct seq_file *m, void *arg) 2309{ 2310 kfree(m->private); 2311 m->private = NULL; 2312} 2313 2314struct seq_operations vmstat_op = { 2315 .start = vmstat_start, 2316 .next = vmstat_next, 2317 .stop = vmstat_stop, 2318 .show = vmstat_show, 2319}; 2320 2321#endif /* CONFIG_PROC_FS */ 2322 2323#ifdef CONFIG_HOTPLUG_CPU 2324static int page_alloc_cpu_notify(struct notifier_block *self, 2325 unsigned long action, void *hcpu) 2326{ 2327 int cpu = (unsigned long)hcpu; 2328 long *count; 2329 unsigned long *src, *dest; 2330 2331 if (action == CPU_DEAD) { 2332 int i; 2333 2334 /* Drain local pagecache count. */ 2335 count = &per_cpu(nr_pagecache_local, cpu); 2336 atomic_add(*count, &nr_pagecache); 2337 *count = 0; 2338 local_irq_disable(); 2339 __drain_pages(cpu); 2340 2341 /* Add dead cpu's page_states to our own. */ 2342 dest = (unsigned long *)&__get_cpu_var(page_states); 2343 src = (unsigned long *)&per_cpu(page_states, cpu); 2344 2345 for (i = 0; i < sizeof(struct page_state)/sizeof(unsigned long); 2346 i++) { 2347 dest[i] += src[i]; 2348 src[i] = 0; 2349 } 2350 2351 local_irq_enable(); 2352 } 2353 return NOTIFY_OK; 2354} 2355#endif /* CONFIG_HOTPLUG_CPU */ 2356 2357void __init page_alloc_init(void) 2358{ 2359 hotcpu_notifier(page_alloc_cpu_notify, 0); 2360} 2361 2362/* 2363 * setup_per_zone_lowmem_reserve - called whenever 2364 * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone 2365 * has a correct pages reserved value, so an adequate number of 2366 * pages are left in the zone after a successful __alloc_pages(). 2367 */ 2368static void setup_per_zone_lowmem_reserve(void) 2369{ 2370 struct pglist_data *pgdat; 2371 int j, idx; 2372 2373 for_each_pgdat(pgdat) { 2374 for (j = 0; j < MAX_NR_ZONES; j++) { 2375 struct zone *zone = pgdat->node_zones + j; 2376 unsigned long present_pages = zone->present_pages; 2377 2378 zone->lowmem_reserve[j] = 0; 2379 2380 for (idx = j-1; idx >= 0; idx--) { 2381 struct zone *lower_zone; 2382 2383 if (sysctl_lowmem_reserve_ratio[idx] < 1) 2384 sysctl_lowmem_reserve_ratio[idx] = 1; 2385 2386 lower_zone = pgdat->node_zones + idx; 2387 lower_zone->lowmem_reserve[j] = present_pages / 2388 sysctl_lowmem_reserve_ratio[idx]; 2389 present_pages += lower_zone->present_pages; 2390 } 2391 } 2392 } 2393} 2394 2395/* 2396 * setup_per_zone_pages_min - called when min_free_kbytes changes. Ensures 2397 * that the pages_{min,low,high} values for each zone are set correctly 2398 * with respect to min_free_kbytes. 2399 */ 2400static void setup_per_zone_pages_min(void) 2401{ 2402 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 2403 unsigned long lowmem_pages = 0; 2404 struct zone *zone; 2405 unsigned long flags; 2406 2407 /* Calculate total number of !ZONE_HIGHMEM pages */ 2408 for_each_zone(zone) { 2409 if (!is_highmem(zone)) 2410 lowmem_pages += zone->present_pages; 2411 } 2412 2413 for_each_zone(zone) { 2414 spin_lock_irqsave(&zone->lru_lock, flags); 2415 if (is_highmem(zone)) { 2416 /* 2417 * Often, highmem doesn't need to reserve any pages. 2418 * But the pages_min/low/high values are also used for 2419 * batching up page reclaim activity so we need a 2420 * decent value here. 2421 */ 2422 int min_pages; 2423 2424 min_pages = zone->present_pages / 1024; 2425 if (min_pages < SWAP_CLUSTER_MAX) 2426 min_pages = SWAP_CLUSTER_MAX; 2427 if (min_pages > 128) 2428 min_pages = 128; 2429 zone->pages_min = min_pages; 2430 } else { 2431 /* if it's a lowmem zone, reserve a number of pages 2432 * proportionate to the zone's size. 2433 */ 2434 zone->pages_min = (pages_min * zone->present_pages) / 2435 lowmem_pages; 2436 } 2437 2438 /* 2439 * When interpreting these watermarks, just keep in mind that: 2440 * zone->pages_min == (zone->pages_min * 4) / 4; 2441 */ 2442 zone->pages_low = (zone->pages_min * 5) / 4; 2443 zone->pages_high = (zone->pages_min * 6) / 4; 2444 spin_unlock_irqrestore(&zone->lru_lock, flags); 2445 } 2446} 2447 2448/* 2449 * Initialise min_free_kbytes. 2450 * 2451 * For small machines we want it small (128k min). For large machines 2452 * we want it large (64MB max). But it is not linear, because network 2453 * bandwidth does not increase linearly with machine size. We use 2454 * 2455 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: 2456 * min_free_kbytes = sqrt(lowmem_kbytes * 16) 2457 * 2458 * which yields 2459 * 2460 * 16MB: 512k 2461 * 32MB: 724k 2462 * 64MB: 1024k 2463 * 128MB: 1448k 2464 * 256MB: 2048k 2465 * 512MB: 2896k 2466 * 1024MB: 4096k 2467 * 2048MB: 5792k 2468 * 4096MB: 8192k 2469 * 8192MB: 11584k 2470 * 16384MB: 16384k 2471 */ 2472static int __init init_per_zone_pages_min(void) 2473{ 2474 unsigned long lowmem_kbytes; 2475 2476 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); 2477 2478 min_free_kbytes = int_sqrt(lowmem_kbytes * 16); 2479 if (min_free_kbytes < 128) 2480 min_free_kbytes = 128; 2481 if (min_free_kbytes > 65536) 2482 min_free_kbytes = 65536; 2483 setup_per_zone_pages_min(); 2484 setup_per_zone_lowmem_reserve(); 2485 return 0; 2486} 2487module_init(init_per_zone_pages_min) 2488 2489/* 2490 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 2491 * that we can call two helper functions whenever min_free_kbytes 2492 * changes. 2493 */ 2494int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 2495 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 2496{ 2497 proc_dointvec(table, write, file, buffer, length, ppos); 2498 setup_per_zone_pages_min(); 2499 return 0; 2500} 2501 2502/* 2503 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around 2504 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() 2505 * whenever sysctl_lowmem_reserve_ratio changes. 2506 * 2507 * The reserve ratio obviously has absolutely no relation with the 2508 * pages_min watermarks. The lowmem reserve ratio can only make sense 2509 * if in function of the boot time zone sizes. 2510 */ 2511int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, 2512 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 2513{ 2514 proc_dointvec_minmax(table, write, file, buffer, length, ppos); 2515 setup_per_zone_lowmem_reserve(); 2516 return 0; 2517} 2518 2519__initdata int hashdist = HASHDIST_DEFAULT; 2520 2521#ifdef CONFIG_NUMA 2522static int __init set_hashdist(char *str) 2523{ 2524 if (!str) 2525 return 0; 2526 hashdist = simple_strtoul(str, &str, 0); 2527 return 1; 2528} 2529__setup("hashdist=", set_hashdist); 2530#endif 2531 2532/* 2533 * allocate a large system hash table from bootmem 2534 * - it is assumed that the hash table must contain an exact power-of-2 2535 * quantity of entries 2536 * - limit is the number of hash buckets, not the total allocation size 2537 */ 2538void *__init alloc_large_system_hash(const char *tablename, 2539 unsigned long bucketsize, 2540 unsigned long numentries, 2541 int scale, 2542 int flags, 2543 unsigned int *_hash_shift, 2544 unsigned int *_hash_mask, 2545 unsigned long limit) 2546{ 2547 unsigned long long max = limit; 2548 unsigned long log2qty, size; 2549 void *table = NULL; 2550 2551 /* allow the kernel cmdline to have a say */ 2552 if (!numentries) { 2553 /* round applicable memory size up to nearest megabyte */ 2554 numentries = (flags & HASH_HIGHMEM) ? nr_all_pages : nr_kernel_pages; 2555 numentries += (1UL << (20 - PAGE_SHIFT)) - 1; 2556 numentries >>= 20 - PAGE_SHIFT; 2557 numentries <<= 20 - PAGE_SHIFT; 2558 2559 /* limit to 1 bucket per 2^scale bytes of low memory */ 2560 if (scale > PAGE_SHIFT) 2561 numentries >>= (scale - PAGE_SHIFT); 2562 else 2563 numentries <<= (PAGE_SHIFT - scale); 2564 } 2565 /* rounded up to nearest power of 2 in size */ 2566 numentries = 1UL << (long_log2(numentries) + 1); 2567 2568 /* limit allocation size to 1/16 total memory by default */ 2569 if (max == 0) { 2570 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; 2571 do_div(max, bucketsize); 2572 } 2573 2574 if (numentries > max) 2575 numentries = max; 2576 2577 log2qty = long_log2(numentries); 2578 2579 do { 2580 size = bucketsize << log2qty; 2581 if (flags & HASH_EARLY) 2582 table = alloc_bootmem(size); 2583 else if (hashdist) 2584 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 2585 else { 2586 unsigned long order; 2587 for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++) 2588 ; 2589 table = (void*) __get_free_pages(GFP_ATOMIC, order); 2590 } 2591 } while (!table && size > PAGE_SIZE && --log2qty); 2592 2593 if (!table) 2594 panic("Failed to allocate %s hash table\n", tablename); 2595 2596 printk("%s hash table entries: %d (order: %d, %lu bytes)\n", 2597 tablename, 2598 (1U << log2qty), 2599 long_log2(size) - PAGE_SHIFT, 2600 size); 2601 2602 if (_hash_shift) 2603 *_hash_shift = log2qty; 2604 if (_hash_mask) 2605 *_hash_mask = (1 << log2qty) - 1; 2606 2607 return table; 2608} 2609