page_alloc.c revision 77a8a78834561398fb4cb1480afa7b0e80b1dd53
1/* 2 * linux/mm/page_alloc.c 3 * 4 * Manages the free list, the system allocates free pages here. 5 * Note that kmalloc() lives in slab.c 6 * 7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 8 * Swap reorganised 29.12.95, Stephen Tweedie 9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton) 15 */ 16 17#include <linux/config.h> 18#include <linux/stddef.h> 19#include <linux/mm.h> 20#include <linux/swap.h> 21#include <linux/interrupt.h> 22#include <linux/pagemap.h> 23#include <linux/bootmem.h> 24#include <linux/compiler.h> 25#include <linux/kernel.h> 26#include <linux/module.h> 27#include <linux/suspend.h> 28#include <linux/pagevec.h> 29#include <linux/blkdev.h> 30#include <linux/slab.h> 31#include <linux/notifier.h> 32#include <linux/topology.h> 33#include <linux/sysctl.h> 34#include <linux/cpu.h> 35#include <linux/cpuset.h> 36#include <linux/memory_hotplug.h> 37#include <linux/nodemask.h> 38#include <linux/vmalloc.h> 39 40#include <asm/tlbflush.h> 41#include "internal.h" 42 43/* 44 * MCD - HACK: Find somewhere to initialize this EARLY, or make this 45 * initializer cleaner 46 */ 47nodemask_t node_online_map __read_mostly = { { [0] = 1UL } }; 48EXPORT_SYMBOL(node_online_map); 49nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; 50EXPORT_SYMBOL(node_possible_map); 51struct pglist_data *pgdat_list __read_mostly; 52unsigned long totalram_pages __read_mostly; 53unsigned long totalhigh_pages __read_mostly; 54long nr_swap_pages; 55 56/* 57 * results with 256, 32 in the lowmem_reserve sysctl: 58 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) 59 * 1G machine -> (16M dma, 784M normal, 224M high) 60 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA 61 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL 62 * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA 63 * 64 * TBD: should special case ZONE_DMA32 machines here - in those we normally 65 * don't need any ZONE_NORMAL reservation 66 */ 67int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 256, 32 }; 68 69EXPORT_SYMBOL(totalram_pages); 70 71/* 72 * Used by page_zone() to look up the address of the struct zone whose 73 * id is encoded in the upper bits of page->flags 74 */ 75struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly; 76EXPORT_SYMBOL(zone_table); 77 78static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" }; 79int min_free_kbytes = 1024; 80 81unsigned long __initdata nr_kernel_pages; 82unsigned long __initdata nr_all_pages; 83 84static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 85{ 86 int ret = 0; 87 unsigned seq; 88 unsigned long pfn = page_to_pfn(page); 89 90 do { 91 seq = zone_span_seqbegin(zone); 92 if (pfn >= zone->zone_start_pfn + zone->spanned_pages) 93 ret = 1; 94 else if (pfn < zone->zone_start_pfn) 95 ret = 1; 96 } while (zone_span_seqretry(zone, seq)); 97 98 return ret; 99} 100 101static int page_is_consistent(struct zone *zone, struct page *page) 102{ 103#ifdef CONFIG_HOLES_IN_ZONE 104 if (!pfn_valid(page_to_pfn(page))) 105 return 0; 106#endif 107 if (zone != page_zone(page)) 108 return 0; 109 110 return 1; 111} 112/* 113 * Temporary debugging check for pages not lying within a given zone. 114 */ 115static int bad_range(struct zone *zone, struct page *page) 116{ 117 if (page_outside_zone_boundaries(zone, page)) 118 return 1; 119 if (!page_is_consistent(zone, page)) 120 return 1; 121 122 return 0; 123} 124 125static void bad_page(const char *function, struct page *page) 126{ 127 printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n", 128 function, current->comm, page); 129 printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", 130 (int)(2*sizeof(unsigned long)), (unsigned long)page->flags, 131 page->mapping, page_mapcount(page), page_count(page)); 132 printk(KERN_EMERG "Backtrace:\n"); 133 dump_stack(); 134 printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"); 135 page->flags &= ~(1 << PG_lru | 136 1 << PG_private | 137 1 << PG_locked | 138 1 << PG_active | 139 1 << PG_dirty | 140 1 << PG_reclaim | 141 1 << PG_slab | 142 1 << PG_swapcache | 143 1 << PG_writeback ); 144 set_page_count(page, 0); 145 reset_page_mapcount(page); 146 page->mapping = NULL; 147 add_taint(TAINT_BAD_PAGE); 148} 149 150/* 151 * Higher-order pages are called "compound pages". They are structured thusly: 152 * 153 * The first PAGE_SIZE page is called the "head page". 154 * 155 * The remaining PAGE_SIZE pages are called "tail pages". 156 * 157 * All pages have PG_compound set. All pages have their ->private pointing at 158 * the head page (even the head page has this). 159 * 160 * The first tail page's ->mapping, if non-zero, holds the address of the 161 * compound page's put_page() function. 162 * 163 * The order of the allocation is stored in the first tail page's ->index 164 * This is only for debug at present. This usage means that zero-order pages 165 * may not be compound. 166 */ 167static void prep_compound_page(struct page *page, unsigned long order) 168{ 169 int i; 170 int nr_pages = 1 << order; 171 172 page[1].mapping = NULL; 173 page[1].index = order; 174 for (i = 0; i < nr_pages; i++) { 175 struct page *p = page + i; 176 177 SetPageCompound(p); 178 set_page_private(p, (unsigned long)page); 179 } 180} 181 182static void destroy_compound_page(struct page *page, unsigned long order) 183{ 184 int i; 185 int nr_pages = 1 << order; 186 187 if (!PageCompound(page)) 188 return; 189 190 if (page[1].index != order) 191 bad_page(__FUNCTION__, page); 192 193 for (i = 0; i < nr_pages; i++) { 194 struct page *p = page + i; 195 196 if (!PageCompound(p)) 197 bad_page(__FUNCTION__, page); 198 if (page_private(p) != (unsigned long)page) 199 bad_page(__FUNCTION__, page); 200 ClearPageCompound(p); 201 } 202} 203 204/* 205 * function for dealing with page's order in buddy system. 206 * zone->lock is already acquired when we use these. 207 * So, we don't need atomic page->flags operations here. 208 */ 209static inline unsigned long page_order(struct page *page) { 210 return page_private(page); 211} 212 213static inline void set_page_order(struct page *page, int order) { 214 set_page_private(page, order); 215 __SetPagePrivate(page); 216} 217 218static inline void rmv_page_order(struct page *page) 219{ 220 __ClearPagePrivate(page); 221 set_page_private(page, 0); 222} 223 224/* 225 * Locate the struct page for both the matching buddy in our 226 * pair (buddy1) and the combined O(n+1) page they form (page). 227 * 228 * 1) Any buddy B1 will have an order O twin B2 which satisfies 229 * the following equation: 230 * B2 = B1 ^ (1 << O) 231 * For example, if the starting buddy (buddy2) is #8 its order 232 * 1 buddy is #10: 233 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 234 * 235 * 2) Any buddy B will have an order O+1 parent P which 236 * satisfies the following equation: 237 * P = B & ~(1 << O) 238 * 239 * Assumption: *_mem_map is contigious at least up to MAX_ORDER 240 */ 241static inline struct page * 242__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order) 243{ 244 unsigned long buddy_idx = page_idx ^ (1 << order); 245 246 return page + (buddy_idx - page_idx); 247} 248 249static inline unsigned long 250__find_combined_index(unsigned long page_idx, unsigned int order) 251{ 252 return (page_idx & ~(1 << order)); 253} 254 255/* 256 * This function checks whether a page is free && is the buddy 257 * we can do coalesce a page and its buddy if 258 * (a) the buddy is free && 259 * (b) the buddy is on the buddy system && 260 * (c) a page and its buddy have the same order. 261 * for recording page's order, we use page_private(page) and PG_private. 262 * 263 */ 264static inline int page_is_buddy(struct page *page, int order) 265{ 266 if (PagePrivate(page) && 267 (page_order(page) == order) && 268 page_count(page) == 0) 269 return 1; 270 return 0; 271} 272 273/* 274 * Freeing function for a buddy system allocator. 275 * 276 * The concept of a buddy system is to maintain direct-mapped table 277 * (containing bit values) for memory blocks of various "orders". 278 * The bottom level table contains the map for the smallest allocatable 279 * units of memory (here, pages), and each level above it describes 280 * pairs of units from the levels below, hence, "buddies". 281 * At a high level, all that happens here is marking the table entry 282 * at the bottom level available, and propagating the changes upward 283 * as necessary, plus some accounting needed to play nicely with other 284 * parts of the VM system. 285 * At each level, we keep a list of pages, which are heads of continuous 286 * free pages of length of (1 << order) and marked with PG_Private.Page's 287 * order is recorded in page_private(page) field. 288 * So when we are allocating or freeing one, we can derive the state of the 289 * other. That is, if we allocate a small block, and both were 290 * free, the remainder of the region must be split into blocks. 291 * If a block is freed, and its buddy is also free, then this 292 * triggers coalescing into a block of larger size. 293 * 294 * -- wli 295 */ 296 297static inline void __free_pages_bulk (struct page *page, 298 struct zone *zone, unsigned int order) 299{ 300 unsigned long page_idx; 301 int order_size = 1 << order; 302 303 if (unlikely(order)) 304 destroy_compound_page(page, order); 305 306 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 307 308 BUG_ON(page_idx & (order_size - 1)); 309 BUG_ON(bad_range(zone, page)); 310 311 zone->free_pages += order_size; 312 while (order < MAX_ORDER-1) { 313 unsigned long combined_idx; 314 struct free_area *area; 315 struct page *buddy; 316 317 combined_idx = __find_combined_index(page_idx, order); 318 buddy = __page_find_buddy(page, page_idx, order); 319 320 if (bad_range(zone, buddy)) 321 break; 322 if (!page_is_buddy(buddy, order)) 323 break; /* Move the buddy up one level. */ 324 list_del(&buddy->lru); 325 area = zone->free_area + order; 326 area->nr_free--; 327 rmv_page_order(buddy); 328 page = page + (combined_idx - page_idx); 329 page_idx = combined_idx; 330 order++; 331 } 332 set_page_order(page, order); 333 list_add(&page->lru, &zone->free_area[order].free_list); 334 zone->free_area[order].nr_free++; 335} 336 337static inline int free_pages_check(const char *function, struct page *page) 338{ 339 if ( page_mapcount(page) || 340 page->mapping != NULL || 341 page_count(page) != 0 || 342 (page->flags & ( 343 1 << PG_lru | 344 1 << PG_private | 345 1 << PG_locked | 346 1 << PG_active | 347 1 << PG_reclaim | 348 1 << PG_slab | 349 1 << PG_swapcache | 350 1 << PG_writeback | 351 1 << PG_reserved ))) 352 bad_page(function, page); 353 if (PageDirty(page)) 354 __ClearPageDirty(page); 355 /* 356 * For now, we report if PG_reserved was found set, but do not 357 * clear it, and do not free the page. But we shall soon need 358 * to do more, for when the ZERO_PAGE count wraps negative. 359 */ 360 return PageReserved(page); 361} 362 363/* 364 * Frees a list of pages. 365 * Assumes all pages on list are in same zone, and of same order. 366 * count is the number of pages to free. 367 * 368 * If the zone was previously in an "all pages pinned" state then look to 369 * see if this freeing clears that state. 370 * 371 * And clear the zone's pages_scanned counter, to hold off the "all pages are 372 * pinned" detection logic. 373 */ 374static int 375free_pages_bulk(struct zone *zone, int count, 376 struct list_head *list, unsigned int order) 377{ 378 struct page *page = NULL; 379 int ret = 0; 380 381 spin_lock(&zone->lock); 382 zone->all_unreclaimable = 0; 383 zone->pages_scanned = 0; 384 while (!list_empty(list) && count--) { 385 page = list_entry(list->prev, struct page, lru); 386 /* have to delete it as __free_pages_bulk list manipulates */ 387 list_del(&page->lru); 388 __free_pages_bulk(page, zone, order); 389 ret++; 390 } 391 spin_unlock(&zone->lock); 392 return ret; 393} 394 395void __free_pages_ok(struct page *page, unsigned int order) 396{ 397 unsigned long flags; 398 LIST_HEAD(list); 399 int i; 400 int reserved = 0; 401 402 arch_free_page(page, order); 403 404#ifndef CONFIG_MMU 405 if (order > 0) 406 for (i = 1 ; i < (1 << order) ; ++i) 407 __put_page(page + i); 408#endif 409 410 for (i = 0 ; i < (1 << order) ; ++i) 411 reserved += free_pages_check(__FUNCTION__, page + i); 412 if (reserved) 413 return; 414 415 list_add(&page->lru, &list); 416 mod_page_state(pgfree, 1 << order); 417 kernel_map_pages(page, 1<<order, 0); 418 local_irq_save(flags); 419 free_pages_bulk(page_zone(page), 1, &list, order); 420 local_irq_restore(flags); 421} 422 423 424/* 425 * The order of subdivision here is critical for the IO subsystem. 426 * Please do not alter this order without good reasons and regression 427 * testing. Specifically, as large blocks of memory are subdivided, 428 * the order in which smaller blocks are delivered depends on the order 429 * they're subdivided in this function. This is the primary factor 430 * influencing the order in which pages are delivered to the IO 431 * subsystem according to empirical testing, and this is also justified 432 * by considering the behavior of a buddy system containing a single 433 * large block of memory acted on by a series of small allocations. 434 * This behavior is a critical factor in sglist merging's success. 435 * 436 * -- wli 437 */ 438static inline struct page * 439expand(struct zone *zone, struct page *page, 440 int low, int high, struct free_area *area) 441{ 442 unsigned long size = 1 << high; 443 444 while (high > low) { 445 area--; 446 high--; 447 size >>= 1; 448 BUG_ON(bad_range(zone, &page[size])); 449 list_add(&page[size].lru, &area->free_list); 450 area->nr_free++; 451 set_page_order(&page[size], high); 452 } 453 return page; 454} 455 456/* 457 * This page is about to be returned from the page allocator 458 */ 459static int prep_new_page(struct page *page, int order) 460{ 461 if ( page_mapcount(page) || 462 page->mapping != NULL || 463 page_count(page) != 0 || 464 (page->flags & ( 465 1 << PG_lru | 466 1 << PG_private | 467 1 << PG_locked | 468 1 << PG_active | 469 1 << PG_dirty | 470 1 << PG_reclaim | 471 1 << PG_slab | 472 1 << PG_swapcache | 473 1 << PG_writeback | 474 1 << PG_reserved ))) 475 bad_page(__FUNCTION__, page); 476 477 /* 478 * For now, we report if PG_reserved was found set, but do not 479 * clear it, and do not allocate the page: as a safety net. 480 */ 481 if (PageReserved(page)) 482 return 1; 483 484 page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 485 1 << PG_referenced | 1 << PG_arch_1 | 486 1 << PG_checked | 1 << PG_mappedtodisk); 487 set_page_private(page, 0); 488 set_page_refs(page, order); 489 kernel_map_pages(page, 1 << order, 1); 490 return 0; 491} 492 493/* 494 * Do the hard work of removing an element from the buddy allocator. 495 * Call me with the zone->lock already held. 496 */ 497static struct page *__rmqueue(struct zone *zone, unsigned int order) 498{ 499 struct free_area * area; 500 unsigned int current_order; 501 struct page *page; 502 503 for (current_order = order; current_order < MAX_ORDER; ++current_order) { 504 area = zone->free_area + current_order; 505 if (list_empty(&area->free_list)) 506 continue; 507 508 page = list_entry(area->free_list.next, struct page, lru); 509 list_del(&page->lru); 510 rmv_page_order(page); 511 area->nr_free--; 512 zone->free_pages -= 1UL << order; 513 return expand(zone, page, order, current_order, area); 514 } 515 516 return NULL; 517} 518 519/* 520 * Obtain a specified number of elements from the buddy allocator, all under 521 * a single hold of the lock, for efficiency. Add them to the supplied list. 522 * Returns the number of new pages which were placed at *list. 523 */ 524static int rmqueue_bulk(struct zone *zone, unsigned int order, 525 unsigned long count, struct list_head *list) 526{ 527 int i; 528 int allocated = 0; 529 struct page *page; 530 531 spin_lock(&zone->lock); 532 for (i = 0; i < count; ++i) { 533 page = __rmqueue(zone, order); 534 if (page == NULL) 535 break; 536 allocated++; 537 list_add_tail(&page->lru, list); 538 } 539 spin_unlock(&zone->lock); 540 return allocated; 541} 542 543#ifdef CONFIG_NUMA 544/* Called from the slab reaper to drain remote pagesets */ 545void drain_remote_pages(void) 546{ 547 struct zone *zone; 548 int i; 549 unsigned long flags; 550 551 local_irq_save(flags); 552 for_each_zone(zone) { 553 struct per_cpu_pageset *pset; 554 555 /* Do not drain local pagesets */ 556 if (zone->zone_pgdat->node_id == numa_node_id()) 557 continue; 558 559 pset = zone->pageset[smp_processor_id()]; 560 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { 561 struct per_cpu_pages *pcp; 562 563 pcp = &pset->pcp[i]; 564 if (pcp->count) 565 pcp->count -= free_pages_bulk(zone, pcp->count, 566 &pcp->list, 0); 567 } 568 } 569 local_irq_restore(flags); 570} 571#endif 572 573#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) 574static void __drain_pages(unsigned int cpu) 575{ 576 unsigned long flags; 577 struct zone *zone; 578 int i; 579 580 for_each_zone(zone) { 581 struct per_cpu_pageset *pset; 582 583 pset = zone_pcp(zone, cpu); 584 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { 585 struct per_cpu_pages *pcp; 586 587 pcp = &pset->pcp[i]; 588 local_irq_save(flags); 589 pcp->count -= free_pages_bulk(zone, pcp->count, 590 &pcp->list, 0); 591 local_irq_restore(flags); 592 } 593 } 594} 595#endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */ 596 597#ifdef CONFIG_PM 598 599void mark_free_pages(struct zone *zone) 600{ 601 unsigned long zone_pfn, flags; 602 int order; 603 struct list_head *curr; 604 605 if (!zone->spanned_pages) 606 return; 607 608 spin_lock_irqsave(&zone->lock, flags); 609 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) 610 ClearPageNosaveFree(pfn_to_page(zone_pfn + zone->zone_start_pfn)); 611 612 for (order = MAX_ORDER - 1; order >= 0; --order) 613 list_for_each(curr, &zone->free_area[order].free_list) { 614 unsigned long start_pfn, i; 615 616 start_pfn = page_to_pfn(list_entry(curr, struct page, lru)); 617 618 for (i=0; i < (1<<order); i++) 619 SetPageNosaveFree(pfn_to_page(start_pfn+i)); 620 } 621 spin_unlock_irqrestore(&zone->lock, flags); 622} 623 624/* 625 * Spill all of this CPU's per-cpu pages back into the buddy allocator. 626 */ 627void drain_local_pages(void) 628{ 629 unsigned long flags; 630 631 local_irq_save(flags); 632 __drain_pages(smp_processor_id()); 633 local_irq_restore(flags); 634} 635#endif /* CONFIG_PM */ 636 637static void zone_statistics(struct zonelist *zonelist, struct zone *z) 638{ 639#ifdef CONFIG_NUMA 640 unsigned long flags; 641 int cpu; 642 pg_data_t *pg = z->zone_pgdat; 643 pg_data_t *orig = zonelist->zones[0]->zone_pgdat; 644 struct per_cpu_pageset *p; 645 646 local_irq_save(flags); 647 cpu = smp_processor_id(); 648 p = zone_pcp(z,cpu); 649 if (pg == orig) { 650 p->numa_hit++; 651 } else { 652 p->numa_miss++; 653 zone_pcp(zonelist->zones[0], cpu)->numa_foreign++; 654 } 655 if (pg == NODE_DATA(numa_node_id())) 656 p->local_node++; 657 else 658 p->other_node++; 659 local_irq_restore(flags); 660#endif 661} 662 663/* 664 * Free a 0-order page 665 */ 666static void FASTCALL(free_hot_cold_page(struct page *page, int cold)); 667static void fastcall free_hot_cold_page(struct page *page, int cold) 668{ 669 struct zone *zone = page_zone(page); 670 struct per_cpu_pages *pcp; 671 unsigned long flags; 672 673 arch_free_page(page, 0); 674 675 if (PageAnon(page)) 676 page->mapping = NULL; 677 if (free_pages_check(__FUNCTION__, page)) 678 return; 679 680 inc_page_state(pgfree); 681 kernel_map_pages(page, 1, 0); 682 683 pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; 684 local_irq_save(flags); 685 list_add(&page->lru, &pcp->list); 686 pcp->count++; 687 if (pcp->count >= pcp->high) 688 pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); 689 local_irq_restore(flags); 690 put_cpu(); 691} 692 693void fastcall free_hot_page(struct page *page) 694{ 695 free_hot_cold_page(page, 0); 696} 697 698void fastcall free_cold_page(struct page *page) 699{ 700 free_hot_cold_page(page, 1); 701} 702 703static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) 704{ 705 int i; 706 707 BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); 708 for(i = 0; i < (1 << order); i++) 709 clear_highpage(page + i); 710} 711 712/* 713 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But 714 * we cheat by calling it from here, in the order > 0 path. Saves a branch 715 * or two. 716 */ 717static struct page * 718buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags) 719{ 720 unsigned long flags; 721 struct page *page; 722 int cold = !!(gfp_flags & __GFP_COLD); 723 724again: 725 if (order == 0) { 726 struct per_cpu_pages *pcp; 727 728 page = NULL; 729 pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; 730 local_irq_save(flags); 731 if (pcp->count <= pcp->low) 732 pcp->count += rmqueue_bulk(zone, 0, 733 pcp->batch, &pcp->list); 734 if (likely(pcp->count)) { 735 page = list_entry(pcp->list.next, struct page, lru); 736 list_del(&page->lru); 737 pcp->count--; 738 } 739 local_irq_restore(flags); 740 put_cpu(); 741 } else { 742 spin_lock_irqsave(&zone->lock, flags); 743 page = __rmqueue(zone, order); 744 spin_unlock_irqrestore(&zone->lock, flags); 745 } 746 747 if (page != NULL) { 748 BUG_ON(bad_range(zone, page)); 749 mod_page_state_zone(zone, pgalloc, 1 << order); 750 if (prep_new_page(page, order)) 751 goto again; 752 753 if (gfp_flags & __GFP_ZERO) 754 prep_zero_page(page, order, gfp_flags); 755 756 if (order && (gfp_flags & __GFP_COMP)) 757 prep_compound_page(page, order); 758 } 759 return page; 760} 761 762#define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ 763#define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */ 764#define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */ 765#define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */ 766#define ALLOC_HARDER 0x10 /* try to alloc harder */ 767#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ 768#define ALLOC_CPUSET 0x40 /* check for correct cpuset */ 769 770/* 771 * Return 1 if free pages are above 'mark'. This takes into account the order 772 * of the allocation. 773 */ 774int zone_watermark_ok(struct zone *z, int order, unsigned long mark, 775 int classzone_idx, int alloc_flags) 776{ 777 /* free_pages my go negative - that's OK */ 778 long min = mark, free_pages = z->free_pages - (1 << order) + 1; 779 int o; 780 781 if (alloc_flags & ALLOC_HIGH) 782 min -= min / 2; 783 if (alloc_flags & ALLOC_HARDER) 784 min -= min / 4; 785 786 if (free_pages <= min + z->lowmem_reserve[classzone_idx]) 787 return 0; 788 for (o = 0; o < order; o++) { 789 /* At the next order, this order's pages become unavailable */ 790 free_pages -= z->free_area[o].nr_free << o; 791 792 /* Require fewer higher order pages to be free */ 793 min >>= 1; 794 795 if (free_pages <= min) 796 return 0; 797 } 798 return 1; 799} 800 801/* 802 * get_page_from_freeliest goes through the zonelist trying to allocate 803 * a page. 804 */ 805static struct page * 806get_page_from_freelist(gfp_t gfp_mask, unsigned int order, 807 struct zonelist *zonelist, int alloc_flags) 808{ 809 struct zone **z = zonelist->zones; 810 struct page *page = NULL; 811 int classzone_idx = zone_idx(*z); 812 813 /* 814 * Go through the zonelist once, looking for a zone with enough free. 815 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 816 */ 817 do { 818 if ((alloc_flags & ALLOC_CPUSET) && 819 !cpuset_zone_allowed(*z, gfp_mask)) 820 continue; 821 822 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { 823 unsigned long mark; 824 if (alloc_flags & ALLOC_WMARK_MIN) 825 mark = (*z)->pages_min; 826 else if (alloc_flags & ALLOC_WMARK_LOW) 827 mark = (*z)->pages_low; 828 else 829 mark = (*z)->pages_high; 830 if (!zone_watermark_ok(*z, order, mark, 831 classzone_idx, alloc_flags)) 832 continue; 833 } 834 835 page = buffered_rmqueue(*z, order, gfp_mask); 836 if (page) { 837 zone_statistics(zonelist, *z); 838 break; 839 } 840 } while (*(++z) != NULL); 841 return page; 842} 843 844/* 845 * This is the 'heart' of the zoned buddy allocator. 846 */ 847struct page * fastcall 848__alloc_pages(gfp_t gfp_mask, unsigned int order, 849 struct zonelist *zonelist) 850{ 851 const gfp_t wait = gfp_mask & __GFP_WAIT; 852 struct zone **z; 853 struct page *page; 854 struct reclaim_state reclaim_state; 855 struct task_struct *p = current; 856 int do_retry; 857 int alloc_flags; 858 int did_some_progress; 859 860 might_sleep_if(wait); 861 862restart: 863 z = zonelist->zones; /* the list of zones suitable for gfp_mask */ 864 865 if (unlikely(*z == NULL)) { 866 /* Should this ever happen?? */ 867 return NULL; 868 } 869 870 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, 871 zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET); 872 if (page) 873 goto got_pg; 874 875 do { 876 wakeup_kswapd(*z, order); 877 } while (*(++z)); 878 879 /* 880 * OK, we're below the kswapd watermark and have kicked background 881 * reclaim. Now things get more complex, so set up alloc_flags according 882 * to how we want to proceed. 883 * 884 * The caller may dip into page reserves a bit more if the caller 885 * cannot run direct reclaim, or if the caller has realtime scheduling 886 * policy. 887 */ 888 alloc_flags = ALLOC_WMARK_MIN; 889 if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait) 890 alloc_flags |= ALLOC_HARDER; 891 if (gfp_mask & __GFP_HIGH) 892 alloc_flags |= ALLOC_HIGH; 893 alloc_flags |= ALLOC_CPUSET; 894 895 /* 896 * Go through the zonelist again. Let __GFP_HIGH and allocations 897 * coming from realtime tasks go deeper into reserves. 898 * 899 * This is the last chance, in general, before the goto nopage. 900 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. 901 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 902 */ 903 page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags); 904 if (page) 905 goto got_pg; 906 907 /* This allocation should allow future memory freeing. */ 908 909 if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) 910 && !in_interrupt()) { 911 if (!(gfp_mask & __GFP_NOMEMALLOC)) { 912nofail_alloc: 913 /* go through the zonelist yet again, ignoring mins */ 914 page = get_page_from_freelist(gfp_mask, order, 915 zonelist, ALLOC_NO_WATERMARKS); 916 if (page) 917 goto got_pg; 918 if (gfp_mask & __GFP_NOFAIL) { 919 blk_congestion_wait(WRITE, HZ/50); 920 goto nofail_alloc; 921 } 922 } 923 goto nopage; 924 } 925 926 /* Atomic allocations - we can't balance anything */ 927 if (!wait) 928 goto nopage; 929 930rebalance: 931 cond_resched(); 932 933 /* We now go into synchronous reclaim */ 934 p->flags |= PF_MEMALLOC; 935 reclaim_state.reclaimed_slab = 0; 936 p->reclaim_state = &reclaim_state; 937 938 did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask); 939 940 p->reclaim_state = NULL; 941 p->flags &= ~PF_MEMALLOC; 942 943 cond_resched(); 944 945 if (likely(did_some_progress)) { 946 page = get_page_from_freelist(gfp_mask, order, 947 zonelist, alloc_flags); 948 if (page) 949 goto got_pg; 950 } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { 951 /* 952 * Go through the zonelist yet one more time, keep 953 * very high watermark here, this is only to catch 954 * a parallel oom killing, we must fail if we're still 955 * under heavy pressure. 956 */ 957 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, 958 zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET); 959 if (page) 960 goto got_pg; 961 962 out_of_memory(gfp_mask, order); 963 goto restart; 964 } 965 966 /* 967 * Don't let big-order allocations loop unless the caller explicitly 968 * requests that. Wait for some write requests to complete then retry. 969 * 970 * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order 971 * <= 3, but that may not be true in other implementations. 972 */ 973 do_retry = 0; 974 if (!(gfp_mask & __GFP_NORETRY)) { 975 if ((order <= 3) || (gfp_mask & __GFP_REPEAT)) 976 do_retry = 1; 977 if (gfp_mask & __GFP_NOFAIL) 978 do_retry = 1; 979 } 980 if (do_retry) { 981 blk_congestion_wait(WRITE, HZ/50); 982 goto rebalance; 983 } 984 985nopage: 986 if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { 987 printk(KERN_WARNING "%s: page allocation failure." 988 " order:%d, mode:0x%x\n", 989 p->comm, order, gfp_mask); 990 dump_stack(); 991 show_mem(); 992 } 993got_pg: 994 return page; 995} 996 997EXPORT_SYMBOL(__alloc_pages); 998 999/* 1000 * Common helper functions. 1001 */ 1002fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) 1003{ 1004 struct page * page; 1005 page = alloc_pages(gfp_mask, order); 1006 if (!page) 1007 return 0; 1008 return (unsigned long) page_address(page); 1009} 1010 1011EXPORT_SYMBOL(__get_free_pages); 1012 1013fastcall unsigned long get_zeroed_page(gfp_t gfp_mask) 1014{ 1015 struct page * page; 1016 1017 /* 1018 * get_zeroed_page() returns a 32-bit address, which cannot represent 1019 * a highmem page 1020 */ 1021 BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); 1022 1023 page = alloc_pages(gfp_mask | __GFP_ZERO, 0); 1024 if (page) 1025 return (unsigned long) page_address(page); 1026 return 0; 1027} 1028 1029EXPORT_SYMBOL(get_zeroed_page); 1030 1031void __pagevec_free(struct pagevec *pvec) 1032{ 1033 int i = pagevec_count(pvec); 1034 1035 while (--i >= 0) 1036 free_hot_cold_page(pvec->pages[i], pvec->cold); 1037} 1038 1039fastcall void __free_pages(struct page *page, unsigned int order) 1040{ 1041 if (put_page_testzero(page)) { 1042 if (order == 0) 1043 free_hot_page(page); 1044 else 1045 __free_pages_ok(page, order); 1046 } 1047} 1048 1049EXPORT_SYMBOL(__free_pages); 1050 1051fastcall void free_pages(unsigned long addr, unsigned int order) 1052{ 1053 if (addr != 0) { 1054 BUG_ON(!virt_addr_valid((void *)addr)); 1055 __free_pages(virt_to_page((void *)addr), order); 1056 } 1057} 1058 1059EXPORT_SYMBOL(free_pages); 1060 1061/* 1062 * Total amount of free (allocatable) RAM: 1063 */ 1064unsigned int nr_free_pages(void) 1065{ 1066 unsigned int sum = 0; 1067 struct zone *zone; 1068 1069 for_each_zone(zone) 1070 sum += zone->free_pages; 1071 1072 return sum; 1073} 1074 1075EXPORT_SYMBOL(nr_free_pages); 1076 1077#ifdef CONFIG_NUMA 1078unsigned int nr_free_pages_pgdat(pg_data_t *pgdat) 1079{ 1080 unsigned int i, sum = 0; 1081 1082 for (i = 0; i < MAX_NR_ZONES; i++) 1083 sum += pgdat->node_zones[i].free_pages; 1084 1085 return sum; 1086} 1087#endif 1088 1089static unsigned int nr_free_zone_pages(int offset) 1090{ 1091 /* Just pick one node, since fallback list is circular */ 1092 pg_data_t *pgdat = NODE_DATA(numa_node_id()); 1093 unsigned int sum = 0; 1094 1095 struct zonelist *zonelist = pgdat->node_zonelists + offset; 1096 struct zone **zonep = zonelist->zones; 1097 struct zone *zone; 1098 1099 for (zone = *zonep++; zone; zone = *zonep++) { 1100 unsigned long size = zone->present_pages; 1101 unsigned long high = zone->pages_high; 1102 if (size > high) 1103 sum += size - high; 1104 } 1105 1106 return sum; 1107} 1108 1109/* 1110 * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL 1111 */ 1112unsigned int nr_free_buffer_pages(void) 1113{ 1114 return nr_free_zone_pages(gfp_zone(GFP_USER)); 1115} 1116 1117/* 1118 * Amount of free RAM allocatable within all zones 1119 */ 1120unsigned int nr_free_pagecache_pages(void) 1121{ 1122 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER)); 1123} 1124 1125#ifdef CONFIG_HIGHMEM 1126unsigned int nr_free_highpages (void) 1127{ 1128 pg_data_t *pgdat; 1129 unsigned int pages = 0; 1130 1131 for_each_pgdat(pgdat) 1132 pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages; 1133 1134 return pages; 1135} 1136#endif 1137 1138#ifdef CONFIG_NUMA 1139static void show_node(struct zone *zone) 1140{ 1141 printk("Node %d ", zone->zone_pgdat->node_id); 1142} 1143#else 1144#define show_node(zone) do { } while (0) 1145#endif 1146 1147/* 1148 * Accumulate the page_state information across all CPUs. 1149 * The result is unavoidably approximate - it can change 1150 * during and after execution of this function. 1151 */ 1152static DEFINE_PER_CPU(struct page_state, page_states) = {0}; 1153 1154atomic_t nr_pagecache = ATOMIC_INIT(0); 1155EXPORT_SYMBOL(nr_pagecache); 1156#ifdef CONFIG_SMP 1157DEFINE_PER_CPU(long, nr_pagecache_local) = 0; 1158#endif 1159 1160void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) 1161{ 1162 int cpu = 0; 1163 1164 memset(ret, 0, sizeof(*ret)); 1165 cpus_and(*cpumask, *cpumask, cpu_online_map); 1166 1167 cpu = first_cpu(*cpumask); 1168 while (cpu < NR_CPUS) { 1169 unsigned long *in, *out, off; 1170 1171 in = (unsigned long *)&per_cpu(page_states, cpu); 1172 1173 cpu = next_cpu(cpu, *cpumask); 1174 1175 if (cpu < NR_CPUS) 1176 prefetch(&per_cpu(page_states, cpu)); 1177 1178 out = (unsigned long *)ret; 1179 for (off = 0; off < nr; off++) 1180 *out++ += *in++; 1181 } 1182} 1183 1184void get_page_state_node(struct page_state *ret, int node) 1185{ 1186 int nr; 1187 cpumask_t mask = node_to_cpumask(node); 1188 1189 nr = offsetof(struct page_state, GET_PAGE_STATE_LAST); 1190 nr /= sizeof(unsigned long); 1191 1192 __get_page_state(ret, nr+1, &mask); 1193} 1194 1195void get_page_state(struct page_state *ret) 1196{ 1197 int nr; 1198 cpumask_t mask = CPU_MASK_ALL; 1199 1200 nr = offsetof(struct page_state, GET_PAGE_STATE_LAST); 1201 nr /= sizeof(unsigned long); 1202 1203 __get_page_state(ret, nr + 1, &mask); 1204} 1205 1206void get_full_page_state(struct page_state *ret) 1207{ 1208 cpumask_t mask = CPU_MASK_ALL; 1209 1210 __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask); 1211} 1212 1213unsigned long __read_page_state(unsigned long offset) 1214{ 1215 unsigned long ret = 0; 1216 int cpu; 1217 1218 for_each_online_cpu(cpu) { 1219 unsigned long in; 1220 1221 in = (unsigned long)&per_cpu(page_states, cpu) + offset; 1222 ret += *((unsigned long *)in); 1223 } 1224 return ret; 1225} 1226 1227void __mod_page_state(unsigned long offset, unsigned long delta) 1228{ 1229 unsigned long flags; 1230 void* ptr; 1231 1232 local_irq_save(flags); 1233 ptr = &__get_cpu_var(page_states); 1234 *(unsigned long*)(ptr + offset) += delta; 1235 local_irq_restore(flags); 1236} 1237 1238EXPORT_SYMBOL(__mod_page_state); 1239 1240void __get_zone_counts(unsigned long *active, unsigned long *inactive, 1241 unsigned long *free, struct pglist_data *pgdat) 1242{ 1243 struct zone *zones = pgdat->node_zones; 1244 int i; 1245 1246 *active = 0; 1247 *inactive = 0; 1248 *free = 0; 1249 for (i = 0; i < MAX_NR_ZONES; i++) { 1250 *active += zones[i].nr_active; 1251 *inactive += zones[i].nr_inactive; 1252 *free += zones[i].free_pages; 1253 } 1254} 1255 1256void get_zone_counts(unsigned long *active, 1257 unsigned long *inactive, unsigned long *free) 1258{ 1259 struct pglist_data *pgdat; 1260 1261 *active = 0; 1262 *inactive = 0; 1263 *free = 0; 1264 for_each_pgdat(pgdat) { 1265 unsigned long l, m, n; 1266 __get_zone_counts(&l, &m, &n, pgdat); 1267 *active += l; 1268 *inactive += m; 1269 *free += n; 1270 } 1271} 1272 1273void si_meminfo(struct sysinfo *val) 1274{ 1275 val->totalram = totalram_pages; 1276 val->sharedram = 0; 1277 val->freeram = nr_free_pages(); 1278 val->bufferram = nr_blockdev_pages(); 1279#ifdef CONFIG_HIGHMEM 1280 val->totalhigh = totalhigh_pages; 1281 val->freehigh = nr_free_highpages(); 1282#else 1283 val->totalhigh = 0; 1284 val->freehigh = 0; 1285#endif 1286 val->mem_unit = PAGE_SIZE; 1287} 1288 1289EXPORT_SYMBOL(si_meminfo); 1290 1291#ifdef CONFIG_NUMA 1292void si_meminfo_node(struct sysinfo *val, int nid) 1293{ 1294 pg_data_t *pgdat = NODE_DATA(nid); 1295 1296 val->totalram = pgdat->node_present_pages; 1297 val->freeram = nr_free_pages_pgdat(pgdat); 1298 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; 1299 val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages; 1300 val->mem_unit = PAGE_SIZE; 1301} 1302#endif 1303 1304#define K(x) ((x) << (PAGE_SHIFT-10)) 1305 1306/* 1307 * Show free area list (used inside shift_scroll-lock stuff) 1308 * We also calculate the percentage fragmentation. We do this by counting the 1309 * memory on each free list with the exception of the first item on the list. 1310 */ 1311void show_free_areas(void) 1312{ 1313 struct page_state ps; 1314 int cpu, temperature; 1315 unsigned long active; 1316 unsigned long inactive; 1317 unsigned long free; 1318 struct zone *zone; 1319 1320 for_each_zone(zone) { 1321 show_node(zone); 1322 printk("%s per-cpu:", zone->name); 1323 1324 if (!zone->present_pages) { 1325 printk(" empty\n"); 1326 continue; 1327 } else 1328 printk("\n"); 1329 1330 for_each_online_cpu(cpu) { 1331 struct per_cpu_pageset *pageset; 1332 1333 pageset = zone_pcp(zone, cpu); 1334 1335 for (temperature = 0; temperature < 2; temperature++) 1336 printk("cpu %d %s: low %d, high %d, batch %d used:%d\n", 1337 cpu, 1338 temperature ? "cold" : "hot", 1339 pageset->pcp[temperature].low, 1340 pageset->pcp[temperature].high, 1341 pageset->pcp[temperature].batch, 1342 pageset->pcp[temperature].count); 1343 } 1344 } 1345 1346 get_page_state(&ps); 1347 get_zone_counts(&active, &inactive, &free); 1348 1349 printk("Free pages: %11ukB (%ukB HighMem)\n", 1350 K(nr_free_pages()), 1351 K(nr_free_highpages())); 1352 1353 printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu " 1354 "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n", 1355 active, 1356 inactive, 1357 ps.nr_dirty, 1358 ps.nr_writeback, 1359 ps.nr_unstable, 1360 nr_free_pages(), 1361 ps.nr_slab, 1362 ps.nr_mapped, 1363 ps.nr_page_table_pages); 1364 1365 for_each_zone(zone) { 1366 int i; 1367 1368 show_node(zone); 1369 printk("%s" 1370 " free:%lukB" 1371 " min:%lukB" 1372 " low:%lukB" 1373 " high:%lukB" 1374 " active:%lukB" 1375 " inactive:%lukB" 1376 " present:%lukB" 1377 " pages_scanned:%lu" 1378 " all_unreclaimable? %s" 1379 "\n", 1380 zone->name, 1381 K(zone->free_pages), 1382 K(zone->pages_min), 1383 K(zone->pages_low), 1384 K(zone->pages_high), 1385 K(zone->nr_active), 1386 K(zone->nr_inactive), 1387 K(zone->present_pages), 1388 zone->pages_scanned, 1389 (zone->all_unreclaimable ? "yes" : "no") 1390 ); 1391 printk("lowmem_reserve[]:"); 1392 for (i = 0; i < MAX_NR_ZONES; i++) 1393 printk(" %lu", zone->lowmem_reserve[i]); 1394 printk("\n"); 1395 } 1396 1397 for_each_zone(zone) { 1398 unsigned long nr, flags, order, total = 0; 1399 1400 show_node(zone); 1401 printk("%s: ", zone->name); 1402 if (!zone->present_pages) { 1403 printk("empty\n"); 1404 continue; 1405 } 1406 1407 spin_lock_irqsave(&zone->lock, flags); 1408 for (order = 0; order < MAX_ORDER; order++) { 1409 nr = zone->free_area[order].nr_free; 1410 total += nr << order; 1411 printk("%lu*%lukB ", nr, K(1UL) << order); 1412 } 1413 spin_unlock_irqrestore(&zone->lock, flags); 1414 printk("= %lukB\n", K(total)); 1415 } 1416 1417 show_swap_cache_info(); 1418} 1419 1420/* 1421 * Builds allocation fallback zone lists. 1422 */ 1423static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k) 1424{ 1425 switch (k) { 1426 struct zone *zone; 1427 default: 1428 BUG(); 1429 case ZONE_HIGHMEM: 1430 zone = pgdat->node_zones + ZONE_HIGHMEM; 1431 if (zone->present_pages) { 1432#ifndef CONFIG_HIGHMEM 1433 BUG(); 1434#endif 1435 zonelist->zones[j++] = zone; 1436 } 1437 case ZONE_NORMAL: 1438 zone = pgdat->node_zones + ZONE_NORMAL; 1439 if (zone->present_pages) 1440 zonelist->zones[j++] = zone; 1441 case ZONE_DMA32: 1442 zone = pgdat->node_zones + ZONE_DMA32; 1443 if (zone->present_pages) 1444 zonelist->zones[j++] = zone; 1445 case ZONE_DMA: 1446 zone = pgdat->node_zones + ZONE_DMA; 1447 if (zone->present_pages) 1448 zonelist->zones[j++] = zone; 1449 } 1450 1451 return j; 1452} 1453 1454static inline int highest_zone(int zone_bits) 1455{ 1456 int res = ZONE_NORMAL; 1457 if (zone_bits & (__force int)__GFP_HIGHMEM) 1458 res = ZONE_HIGHMEM; 1459 if (zone_bits & (__force int)__GFP_DMA32) 1460 res = ZONE_DMA32; 1461 if (zone_bits & (__force int)__GFP_DMA) 1462 res = ZONE_DMA; 1463 return res; 1464} 1465 1466#ifdef CONFIG_NUMA 1467#define MAX_NODE_LOAD (num_online_nodes()) 1468static int __initdata node_load[MAX_NUMNODES]; 1469/** 1470 * find_next_best_node - find the next node that should appear in a given node's fallback list 1471 * @node: node whose fallback list we're appending 1472 * @used_node_mask: nodemask_t of already used nodes 1473 * 1474 * We use a number of factors to determine which is the next node that should 1475 * appear on a given node's fallback list. The node should not have appeared 1476 * already in @node's fallback list, and it should be the next closest node 1477 * according to the distance array (which contains arbitrary distance values 1478 * from each node to each node in the system), and should also prefer nodes 1479 * with no CPUs, since presumably they'll have very little allocation pressure 1480 * on them otherwise. 1481 * It returns -1 if no node is found. 1482 */ 1483static int __init find_next_best_node(int node, nodemask_t *used_node_mask) 1484{ 1485 int i, n, val; 1486 int min_val = INT_MAX; 1487 int best_node = -1; 1488 1489 for_each_online_node(i) { 1490 cpumask_t tmp; 1491 1492 /* Start from local node */ 1493 n = (node+i) % num_online_nodes(); 1494 1495 /* Don't want a node to appear more than once */ 1496 if (node_isset(n, *used_node_mask)) 1497 continue; 1498 1499 /* Use the local node if we haven't already */ 1500 if (!node_isset(node, *used_node_mask)) { 1501 best_node = node; 1502 break; 1503 } 1504 1505 /* Use the distance array to find the distance */ 1506 val = node_distance(node, n); 1507 1508 /* Give preference to headless and unused nodes */ 1509 tmp = node_to_cpumask(n); 1510 if (!cpus_empty(tmp)) 1511 val += PENALTY_FOR_NODE_WITH_CPUS; 1512 1513 /* Slight preference for less loaded node */ 1514 val *= (MAX_NODE_LOAD*MAX_NUMNODES); 1515 val += node_load[n]; 1516 1517 if (val < min_val) { 1518 min_val = val; 1519 best_node = n; 1520 } 1521 } 1522 1523 if (best_node >= 0) 1524 node_set(best_node, *used_node_mask); 1525 1526 return best_node; 1527} 1528 1529static void __init build_zonelists(pg_data_t *pgdat) 1530{ 1531 int i, j, k, node, local_node; 1532 int prev_node, load; 1533 struct zonelist *zonelist; 1534 nodemask_t used_mask; 1535 1536 /* initialize zonelists */ 1537 for (i = 0; i < GFP_ZONETYPES; i++) { 1538 zonelist = pgdat->node_zonelists + i; 1539 zonelist->zones[0] = NULL; 1540 } 1541 1542 /* NUMA-aware ordering of nodes */ 1543 local_node = pgdat->node_id; 1544 load = num_online_nodes(); 1545 prev_node = local_node; 1546 nodes_clear(used_mask); 1547 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { 1548 /* 1549 * We don't want to pressure a particular node. 1550 * So adding penalty to the first node in same 1551 * distance group to make it round-robin. 1552 */ 1553 if (node_distance(local_node, node) != 1554 node_distance(local_node, prev_node)) 1555 node_load[node] += load; 1556 prev_node = node; 1557 load--; 1558 for (i = 0; i < GFP_ZONETYPES; i++) { 1559 zonelist = pgdat->node_zonelists + i; 1560 for (j = 0; zonelist->zones[j] != NULL; j++); 1561 1562 k = highest_zone(i); 1563 1564 j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); 1565 zonelist->zones[j] = NULL; 1566 } 1567 } 1568} 1569 1570#else /* CONFIG_NUMA */ 1571 1572static void __init build_zonelists(pg_data_t *pgdat) 1573{ 1574 int i, j, k, node, local_node; 1575 1576 local_node = pgdat->node_id; 1577 for (i = 0; i < GFP_ZONETYPES; i++) { 1578 struct zonelist *zonelist; 1579 1580 zonelist = pgdat->node_zonelists + i; 1581 1582 j = 0; 1583 k = highest_zone(i); 1584 j = build_zonelists_node(pgdat, zonelist, j, k); 1585 /* 1586 * Now we build the zonelist so that it contains the zones 1587 * of all the other nodes. 1588 * We don't want to pressure a particular node, so when 1589 * building the zones for node N, we make sure that the 1590 * zones coming right after the local ones are those from 1591 * node N+1 (modulo N) 1592 */ 1593 for (node = local_node + 1; node < MAX_NUMNODES; node++) { 1594 if (!node_online(node)) 1595 continue; 1596 j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); 1597 } 1598 for (node = 0; node < local_node; node++) { 1599 if (!node_online(node)) 1600 continue; 1601 j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); 1602 } 1603 1604 zonelist->zones[j] = NULL; 1605 } 1606} 1607 1608#endif /* CONFIG_NUMA */ 1609 1610void __init build_all_zonelists(void) 1611{ 1612 int i; 1613 1614 for_each_online_node(i) 1615 build_zonelists(NODE_DATA(i)); 1616 printk("Built %i zonelists\n", num_online_nodes()); 1617 cpuset_init_current_mems_allowed(); 1618} 1619 1620/* 1621 * Helper functions to size the waitqueue hash table. 1622 * Essentially these want to choose hash table sizes sufficiently 1623 * large so that collisions trying to wait on pages are rare. 1624 * But in fact, the number of active page waitqueues on typical 1625 * systems is ridiculously low, less than 200. So this is even 1626 * conservative, even though it seems large. 1627 * 1628 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to 1629 * waitqueues, i.e. the size of the waitq table given the number of pages. 1630 */ 1631#define PAGES_PER_WAITQUEUE 256 1632 1633static inline unsigned long wait_table_size(unsigned long pages) 1634{ 1635 unsigned long size = 1; 1636 1637 pages /= PAGES_PER_WAITQUEUE; 1638 1639 while (size < pages) 1640 size <<= 1; 1641 1642 /* 1643 * Once we have dozens or even hundreds of threads sleeping 1644 * on IO we've got bigger problems than wait queue collision. 1645 * Limit the size of the wait table to a reasonable size. 1646 */ 1647 size = min(size, 4096UL); 1648 1649 return max(size, 4UL); 1650} 1651 1652/* 1653 * This is an integer logarithm so that shifts can be used later 1654 * to extract the more random high bits from the multiplicative 1655 * hash function before the remainder is taken. 1656 */ 1657static inline unsigned long wait_table_bits(unsigned long size) 1658{ 1659 return ffz(~size); 1660} 1661 1662#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) 1663 1664static void __init calculate_zone_totalpages(struct pglist_data *pgdat, 1665 unsigned long *zones_size, unsigned long *zholes_size) 1666{ 1667 unsigned long realtotalpages, totalpages = 0; 1668 int i; 1669 1670 for (i = 0; i < MAX_NR_ZONES; i++) 1671 totalpages += zones_size[i]; 1672 pgdat->node_spanned_pages = totalpages; 1673 1674 realtotalpages = totalpages; 1675 if (zholes_size) 1676 for (i = 0; i < MAX_NR_ZONES; i++) 1677 realtotalpages -= zholes_size[i]; 1678 pgdat->node_present_pages = realtotalpages; 1679 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); 1680} 1681 1682 1683/* 1684 * Initially all pages are reserved - free ones are freed 1685 * up by free_all_bootmem() once the early boot process is 1686 * done. Non-atomic initialization, single-pass. 1687 */ 1688void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone, 1689 unsigned long start_pfn) 1690{ 1691 struct page *page; 1692 unsigned long end_pfn = start_pfn + size; 1693 unsigned long pfn; 1694 1695 for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) { 1696 if (!early_pfn_valid(pfn)) 1697 continue; 1698 page = pfn_to_page(pfn); 1699 set_page_links(page, zone, nid, pfn); 1700 set_page_count(page, 1); 1701 reset_page_mapcount(page); 1702 SetPageReserved(page); 1703 INIT_LIST_HEAD(&page->lru); 1704#ifdef WANT_PAGE_VIRTUAL 1705 /* The shift won't overflow because ZONE_NORMAL is below 4G. */ 1706 if (!is_highmem_idx(zone)) 1707 set_page_address(page, __va(pfn << PAGE_SHIFT)); 1708#endif 1709 } 1710} 1711 1712void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone, 1713 unsigned long size) 1714{ 1715 int order; 1716 for (order = 0; order < MAX_ORDER ; order++) { 1717 INIT_LIST_HEAD(&zone->free_area[order].free_list); 1718 zone->free_area[order].nr_free = 0; 1719 } 1720} 1721 1722#define ZONETABLE_INDEX(x, zone_nr) ((x << ZONES_SHIFT) | zone_nr) 1723void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn, 1724 unsigned long size) 1725{ 1726 unsigned long snum = pfn_to_section_nr(pfn); 1727 unsigned long end = pfn_to_section_nr(pfn + size); 1728 1729 if (FLAGS_HAS_NODE) 1730 zone_table[ZONETABLE_INDEX(nid, zid)] = zone; 1731 else 1732 for (; snum <= end; snum++) 1733 zone_table[ZONETABLE_INDEX(snum, zid)] = zone; 1734} 1735 1736#ifndef __HAVE_ARCH_MEMMAP_INIT 1737#define memmap_init(size, nid, zone, start_pfn) \ 1738 memmap_init_zone((size), (nid), (zone), (start_pfn)) 1739#endif 1740 1741static int __devinit zone_batchsize(struct zone *zone) 1742{ 1743 int batch; 1744 1745 /* 1746 * The per-cpu-pages pools are set to around 1000th of the 1747 * size of the zone. But no more than 1/2 of a meg. 1748 * 1749 * OK, so we don't know how big the cache is. So guess. 1750 */ 1751 batch = zone->present_pages / 1024; 1752 if (batch * PAGE_SIZE > 512 * 1024) 1753 batch = (512 * 1024) / PAGE_SIZE; 1754 batch /= 4; /* We effectively *= 4 below */ 1755 if (batch < 1) 1756 batch = 1; 1757 1758 /* 1759 * Clamp the batch to a 2^n - 1 value. Having a power 1760 * of 2 value was found to be more likely to have 1761 * suboptimal cache aliasing properties in some cases. 1762 * 1763 * For example if 2 tasks are alternately allocating 1764 * batches of pages, one task can end up with a lot 1765 * of pages of one half of the possible page colors 1766 * and the other with pages of the other colors. 1767 */ 1768 batch = (1 << (fls(batch + batch/2)-1)) - 1; 1769 1770 return batch; 1771} 1772 1773inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 1774{ 1775 struct per_cpu_pages *pcp; 1776 1777 memset(p, 0, sizeof(*p)); 1778 1779 pcp = &p->pcp[0]; /* hot */ 1780 pcp->count = 0; 1781 pcp->low = 0; 1782 pcp->high = 6 * batch; 1783 pcp->batch = max(1UL, 1 * batch); 1784 INIT_LIST_HEAD(&pcp->list); 1785 1786 pcp = &p->pcp[1]; /* cold*/ 1787 pcp->count = 0; 1788 pcp->low = 0; 1789 pcp->high = 2 * batch; 1790 pcp->batch = max(1UL, batch/2); 1791 INIT_LIST_HEAD(&pcp->list); 1792} 1793 1794#ifdef CONFIG_NUMA 1795/* 1796 * Boot pageset table. One per cpu which is going to be used for all 1797 * zones and all nodes. The parameters will be set in such a way 1798 * that an item put on a list will immediately be handed over to 1799 * the buddy list. This is safe since pageset manipulation is done 1800 * with interrupts disabled. 1801 * 1802 * Some NUMA counter updates may also be caught by the boot pagesets. 1803 * 1804 * The boot_pagesets must be kept even after bootup is complete for 1805 * unused processors and/or zones. They do play a role for bootstrapping 1806 * hotplugged processors. 1807 * 1808 * zoneinfo_show() and maybe other functions do 1809 * not check if the processor is online before following the pageset pointer. 1810 * Other parts of the kernel may not check if the zone is available. 1811 */ 1812static struct per_cpu_pageset 1813 boot_pageset[NR_CPUS]; 1814 1815/* 1816 * Dynamically allocate memory for the 1817 * per cpu pageset array in struct zone. 1818 */ 1819static int __devinit process_zones(int cpu) 1820{ 1821 struct zone *zone, *dzone; 1822 1823 for_each_zone(zone) { 1824 1825 zone->pageset[cpu] = kmalloc_node(sizeof(struct per_cpu_pageset), 1826 GFP_KERNEL, cpu_to_node(cpu)); 1827 if (!zone->pageset[cpu]) 1828 goto bad; 1829 1830 setup_pageset(zone->pageset[cpu], zone_batchsize(zone)); 1831 } 1832 1833 return 0; 1834bad: 1835 for_each_zone(dzone) { 1836 if (dzone == zone) 1837 break; 1838 kfree(dzone->pageset[cpu]); 1839 dzone->pageset[cpu] = NULL; 1840 } 1841 return -ENOMEM; 1842} 1843 1844static inline void free_zone_pagesets(int cpu) 1845{ 1846#ifdef CONFIG_NUMA 1847 struct zone *zone; 1848 1849 for_each_zone(zone) { 1850 struct per_cpu_pageset *pset = zone_pcp(zone, cpu); 1851 1852 zone_pcp(zone, cpu) = NULL; 1853 kfree(pset); 1854 } 1855#endif 1856} 1857 1858static int __devinit pageset_cpuup_callback(struct notifier_block *nfb, 1859 unsigned long action, 1860 void *hcpu) 1861{ 1862 int cpu = (long)hcpu; 1863 int ret = NOTIFY_OK; 1864 1865 switch (action) { 1866 case CPU_UP_PREPARE: 1867 if (process_zones(cpu)) 1868 ret = NOTIFY_BAD; 1869 break; 1870 case CPU_UP_CANCELED: 1871 case CPU_DEAD: 1872 free_zone_pagesets(cpu); 1873 break; 1874 default: 1875 break; 1876 } 1877 return ret; 1878} 1879 1880static struct notifier_block pageset_notifier = 1881 { &pageset_cpuup_callback, NULL, 0 }; 1882 1883void __init setup_per_cpu_pageset(void) 1884{ 1885 int err; 1886 1887 /* Initialize per_cpu_pageset for cpu 0. 1888 * A cpuup callback will do this for every cpu 1889 * as it comes online 1890 */ 1891 err = process_zones(smp_processor_id()); 1892 BUG_ON(err); 1893 register_cpu_notifier(&pageset_notifier); 1894} 1895 1896#endif 1897 1898static __devinit 1899void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) 1900{ 1901 int i; 1902 struct pglist_data *pgdat = zone->zone_pgdat; 1903 1904 /* 1905 * The per-page waitqueue mechanism uses hashed waitqueues 1906 * per zone. 1907 */ 1908 zone->wait_table_size = wait_table_size(zone_size_pages); 1909 zone->wait_table_bits = wait_table_bits(zone->wait_table_size); 1910 zone->wait_table = (wait_queue_head_t *) 1911 alloc_bootmem_node(pgdat, zone->wait_table_size 1912 * sizeof(wait_queue_head_t)); 1913 1914 for(i = 0; i < zone->wait_table_size; ++i) 1915 init_waitqueue_head(zone->wait_table + i); 1916} 1917 1918static __devinit void zone_pcp_init(struct zone *zone) 1919{ 1920 int cpu; 1921 unsigned long batch = zone_batchsize(zone); 1922 1923 for (cpu = 0; cpu < NR_CPUS; cpu++) { 1924#ifdef CONFIG_NUMA 1925 /* Early boot. Slab allocator not functional yet */ 1926 zone->pageset[cpu] = &boot_pageset[cpu]; 1927 setup_pageset(&boot_pageset[cpu],0); 1928#else 1929 setup_pageset(zone_pcp(zone,cpu), batch); 1930#endif 1931 } 1932 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", 1933 zone->name, zone->present_pages, batch); 1934} 1935 1936static __devinit void init_currently_empty_zone(struct zone *zone, 1937 unsigned long zone_start_pfn, unsigned long size) 1938{ 1939 struct pglist_data *pgdat = zone->zone_pgdat; 1940 1941 zone_wait_table_init(zone, size); 1942 pgdat->nr_zones = zone_idx(zone) + 1; 1943 1944 zone->zone_mem_map = pfn_to_page(zone_start_pfn); 1945 zone->zone_start_pfn = zone_start_pfn; 1946 1947 memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); 1948 1949 zone_init_free_lists(pgdat, zone, zone->spanned_pages); 1950} 1951 1952/* 1953 * Set up the zone data structures: 1954 * - mark all pages reserved 1955 * - mark all memory queues empty 1956 * - clear the memory bitmaps 1957 */ 1958static void __init free_area_init_core(struct pglist_data *pgdat, 1959 unsigned long *zones_size, unsigned long *zholes_size) 1960{ 1961 unsigned long j; 1962 int nid = pgdat->node_id; 1963 unsigned long zone_start_pfn = pgdat->node_start_pfn; 1964 1965 pgdat_resize_init(pgdat); 1966 pgdat->nr_zones = 0; 1967 init_waitqueue_head(&pgdat->kswapd_wait); 1968 pgdat->kswapd_max_order = 0; 1969 1970 for (j = 0; j < MAX_NR_ZONES; j++) { 1971 struct zone *zone = pgdat->node_zones + j; 1972 unsigned long size, realsize; 1973 1974 realsize = size = zones_size[j]; 1975 if (zholes_size) 1976 realsize -= zholes_size[j]; 1977 1978 if (j < ZONE_HIGHMEM) 1979 nr_kernel_pages += realsize; 1980 nr_all_pages += realsize; 1981 1982 zone->spanned_pages = size; 1983 zone->present_pages = realsize; 1984 zone->name = zone_names[j]; 1985 spin_lock_init(&zone->lock); 1986 spin_lock_init(&zone->lru_lock); 1987 zone_seqlock_init(zone); 1988 zone->zone_pgdat = pgdat; 1989 zone->free_pages = 0; 1990 1991 zone->temp_priority = zone->prev_priority = DEF_PRIORITY; 1992 1993 zone_pcp_init(zone); 1994 INIT_LIST_HEAD(&zone->active_list); 1995 INIT_LIST_HEAD(&zone->inactive_list); 1996 zone->nr_scan_active = 0; 1997 zone->nr_scan_inactive = 0; 1998 zone->nr_active = 0; 1999 zone->nr_inactive = 0; 2000 atomic_set(&zone->reclaim_in_progress, 0); 2001 if (!size) 2002 continue; 2003 2004 zonetable_add(zone, nid, j, zone_start_pfn, size); 2005 init_currently_empty_zone(zone, zone_start_pfn, size); 2006 zone_start_pfn += size; 2007 } 2008} 2009 2010static void __init alloc_node_mem_map(struct pglist_data *pgdat) 2011{ 2012 /* Skip empty nodes */ 2013 if (!pgdat->node_spanned_pages) 2014 return; 2015 2016#ifdef CONFIG_FLAT_NODE_MEM_MAP 2017 /* ia64 gets its own node_mem_map, before this, without bootmem */ 2018 if (!pgdat->node_mem_map) { 2019 unsigned long size; 2020 struct page *map; 2021 2022 size = (pgdat->node_spanned_pages + 1) * sizeof(struct page); 2023 map = alloc_remap(pgdat->node_id, size); 2024 if (!map) 2025 map = alloc_bootmem_node(pgdat, size); 2026 pgdat->node_mem_map = map; 2027 } 2028#ifdef CONFIG_FLATMEM 2029 /* 2030 * With no DISCONTIG, the global mem_map is just set as node 0's 2031 */ 2032 if (pgdat == NODE_DATA(0)) 2033 mem_map = NODE_DATA(0)->node_mem_map; 2034#endif 2035#endif /* CONFIG_FLAT_NODE_MEM_MAP */ 2036} 2037 2038void __init free_area_init_node(int nid, struct pglist_data *pgdat, 2039 unsigned long *zones_size, unsigned long node_start_pfn, 2040 unsigned long *zholes_size) 2041{ 2042 pgdat->node_id = nid; 2043 pgdat->node_start_pfn = node_start_pfn; 2044 calculate_zone_totalpages(pgdat, zones_size, zholes_size); 2045 2046 alloc_node_mem_map(pgdat); 2047 2048 free_area_init_core(pgdat, zones_size, zholes_size); 2049} 2050 2051#ifndef CONFIG_NEED_MULTIPLE_NODES 2052static bootmem_data_t contig_bootmem_data; 2053struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; 2054 2055EXPORT_SYMBOL(contig_page_data); 2056#endif 2057 2058void __init free_area_init(unsigned long *zones_size) 2059{ 2060 free_area_init_node(0, NODE_DATA(0), zones_size, 2061 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 2062} 2063 2064#ifdef CONFIG_PROC_FS 2065 2066#include <linux/seq_file.h> 2067 2068static void *frag_start(struct seq_file *m, loff_t *pos) 2069{ 2070 pg_data_t *pgdat; 2071 loff_t node = *pos; 2072 2073 for (pgdat = pgdat_list; pgdat && node; pgdat = pgdat->pgdat_next) 2074 --node; 2075 2076 return pgdat; 2077} 2078 2079static void *frag_next(struct seq_file *m, void *arg, loff_t *pos) 2080{ 2081 pg_data_t *pgdat = (pg_data_t *)arg; 2082 2083 (*pos)++; 2084 return pgdat->pgdat_next; 2085} 2086 2087static void frag_stop(struct seq_file *m, void *arg) 2088{ 2089} 2090 2091/* 2092 * This walks the free areas for each zone. 2093 */ 2094static int frag_show(struct seq_file *m, void *arg) 2095{ 2096 pg_data_t *pgdat = (pg_data_t *)arg; 2097 struct zone *zone; 2098 struct zone *node_zones = pgdat->node_zones; 2099 unsigned long flags; 2100 int order; 2101 2102 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { 2103 if (!zone->present_pages) 2104 continue; 2105 2106 spin_lock_irqsave(&zone->lock, flags); 2107 seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); 2108 for (order = 0; order < MAX_ORDER; ++order) 2109 seq_printf(m, "%6lu ", zone->free_area[order].nr_free); 2110 spin_unlock_irqrestore(&zone->lock, flags); 2111 seq_putc(m, '\n'); 2112 } 2113 return 0; 2114} 2115 2116struct seq_operations fragmentation_op = { 2117 .start = frag_start, 2118 .next = frag_next, 2119 .stop = frag_stop, 2120 .show = frag_show, 2121}; 2122 2123/* 2124 * Output information about zones in @pgdat. 2125 */ 2126static int zoneinfo_show(struct seq_file *m, void *arg) 2127{ 2128 pg_data_t *pgdat = arg; 2129 struct zone *zone; 2130 struct zone *node_zones = pgdat->node_zones; 2131 unsigned long flags; 2132 2133 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) { 2134 int i; 2135 2136 if (!zone->present_pages) 2137 continue; 2138 2139 spin_lock_irqsave(&zone->lock, flags); 2140 seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name); 2141 seq_printf(m, 2142 "\n pages free %lu" 2143 "\n min %lu" 2144 "\n low %lu" 2145 "\n high %lu" 2146 "\n active %lu" 2147 "\n inactive %lu" 2148 "\n scanned %lu (a: %lu i: %lu)" 2149 "\n spanned %lu" 2150 "\n present %lu", 2151 zone->free_pages, 2152 zone->pages_min, 2153 zone->pages_low, 2154 zone->pages_high, 2155 zone->nr_active, 2156 zone->nr_inactive, 2157 zone->pages_scanned, 2158 zone->nr_scan_active, zone->nr_scan_inactive, 2159 zone->spanned_pages, 2160 zone->present_pages); 2161 seq_printf(m, 2162 "\n protection: (%lu", 2163 zone->lowmem_reserve[0]); 2164 for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) 2165 seq_printf(m, ", %lu", zone->lowmem_reserve[i]); 2166 seq_printf(m, 2167 ")" 2168 "\n pagesets"); 2169 for (i = 0; i < ARRAY_SIZE(zone->pageset); i++) { 2170 struct per_cpu_pageset *pageset; 2171 int j; 2172 2173 pageset = zone_pcp(zone, i); 2174 for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { 2175 if (pageset->pcp[j].count) 2176 break; 2177 } 2178 if (j == ARRAY_SIZE(pageset->pcp)) 2179 continue; 2180 for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { 2181 seq_printf(m, 2182 "\n cpu: %i pcp: %i" 2183 "\n count: %i" 2184 "\n low: %i" 2185 "\n high: %i" 2186 "\n batch: %i", 2187 i, j, 2188 pageset->pcp[j].count, 2189 pageset->pcp[j].low, 2190 pageset->pcp[j].high, 2191 pageset->pcp[j].batch); 2192 } 2193#ifdef CONFIG_NUMA 2194 seq_printf(m, 2195 "\n numa_hit: %lu" 2196 "\n numa_miss: %lu" 2197 "\n numa_foreign: %lu" 2198 "\n interleave_hit: %lu" 2199 "\n local_node: %lu" 2200 "\n other_node: %lu", 2201 pageset->numa_hit, 2202 pageset->numa_miss, 2203 pageset->numa_foreign, 2204 pageset->interleave_hit, 2205 pageset->local_node, 2206 pageset->other_node); 2207#endif 2208 } 2209 seq_printf(m, 2210 "\n all_unreclaimable: %u" 2211 "\n prev_priority: %i" 2212 "\n temp_priority: %i" 2213 "\n start_pfn: %lu", 2214 zone->all_unreclaimable, 2215 zone->prev_priority, 2216 zone->temp_priority, 2217 zone->zone_start_pfn); 2218 spin_unlock_irqrestore(&zone->lock, flags); 2219 seq_putc(m, '\n'); 2220 } 2221 return 0; 2222} 2223 2224struct seq_operations zoneinfo_op = { 2225 .start = frag_start, /* iterate over all zones. The same as in 2226 * fragmentation. */ 2227 .next = frag_next, 2228 .stop = frag_stop, 2229 .show = zoneinfo_show, 2230}; 2231 2232static char *vmstat_text[] = { 2233 "nr_dirty", 2234 "nr_writeback", 2235 "nr_unstable", 2236 "nr_page_table_pages", 2237 "nr_mapped", 2238 "nr_slab", 2239 2240 "pgpgin", 2241 "pgpgout", 2242 "pswpin", 2243 "pswpout", 2244 "pgalloc_high", 2245 2246 "pgalloc_normal", 2247 "pgalloc_dma", 2248 "pgfree", 2249 "pgactivate", 2250 "pgdeactivate", 2251 2252 "pgfault", 2253 "pgmajfault", 2254 "pgrefill_high", 2255 "pgrefill_normal", 2256 "pgrefill_dma", 2257 2258 "pgsteal_high", 2259 "pgsteal_normal", 2260 "pgsteal_dma", 2261 "pgscan_kswapd_high", 2262 "pgscan_kswapd_normal", 2263 2264 "pgscan_kswapd_dma", 2265 "pgscan_direct_high", 2266 "pgscan_direct_normal", 2267 "pgscan_direct_dma", 2268 "pginodesteal", 2269 2270 "slabs_scanned", 2271 "kswapd_steal", 2272 "kswapd_inodesteal", 2273 "pageoutrun", 2274 "allocstall", 2275 2276 "pgrotated", 2277 "nr_bounce", 2278}; 2279 2280static void *vmstat_start(struct seq_file *m, loff_t *pos) 2281{ 2282 struct page_state *ps; 2283 2284 if (*pos >= ARRAY_SIZE(vmstat_text)) 2285 return NULL; 2286 2287 ps = kmalloc(sizeof(*ps), GFP_KERNEL); 2288 m->private = ps; 2289 if (!ps) 2290 return ERR_PTR(-ENOMEM); 2291 get_full_page_state(ps); 2292 ps->pgpgin /= 2; /* sectors -> kbytes */ 2293 ps->pgpgout /= 2; 2294 return (unsigned long *)ps + *pos; 2295} 2296 2297static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos) 2298{ 2299 (*pos)++; 2300 if (*pos >= ARRAY_SIZE(vmstat_text)) 2301 return NULL; 2302 return (unsigned long *)m->private + *pos; 2303} 2304 2305static int vmstat_show(struct seq_file *m, void *arg) 2306{ 2307 unsigned long *l = arg; 2308 unsigned long off = l - (unsigned long *)m->private; 2309 2310 seq_printf(m, "%s %lu\n", vmstat_text[off], *l); 2311 return 0; 2312} 2313 2314static void vmstat_stop(struct seq_file *m, void *arg) 2315{ 2316 kfree(m->private); 2317 m->private = NULL; 2318} 2319 2320struct seq_operations vmstat_op = { 2321 .start = vmstat_start, 2322 .next = vmstat_next, 2323 .stop = vmstat_stop, 2324 .show = vmstat_show, 2325}; 2326 2327#endif /* CONFIG_PROC_FS */ 2328 2329#ifdef CONFIG_HOTPLUG_CPU 2330static int page_alloc_cpu_notify(struct notifier_block *self, 2331 unsigned long action, void *hcpu) 2332{ 2333 int cpu = (unsigned long)hcpu; 2334 long *count; 2335 unsigned long *src, *dest; 2336 2337 if (action == CPU_DEAD) { 2338 int i; 2339 2340 /* Drain local pagecache count. */ 2341 count = &per_cpu(nr_pagecache_local, cpu); 2342 atomic_add(*count, &nr_pagecache); 2343 *count = 0; 2344 local_irq_disable(); 2345 __drain_pages(cpu); 2346 2347 /* Add dead cpu's page_states to our own. */ 2348 dest = (unsigned long *)&__get_cpu_var(page_states); 2349 src = (unsigned long *)&per_cpu(page_states, cpu); 2350 2351 for (i = 0; i < sizeof(struct page_state)/sizeof(unsigned long); 2352 i++) { 2353 dest[i] += src[i]; 2354 src[i] = 0; 2355 } 2356 2357 local_irq_enable(); 2358 } 2359 return NOTIFY_OK; 2360} 2361#endif /* CONFIG_HOTPLUG_CPU */ 2362 2363void __init page_alloc_init(void) 2364{ 2365 hotcpu_notifier(page_alloc_cpu_notify, 0); 2366} 2367 2368/* 2369 * setup_per_zone_lowmem_reserve - called whenever 2370 * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone 2371 * has a correct pages reserved value, so an adequate number of 2372 * pages are left in the zone after a successful __alloc_pages(). 2373 */ 2374static void setup_per_zone_lowmem_reserve(void) 2375{ 2376 struct pglist_data *pgdat; 2377 int j, idx; 2378 2379 for_each_pgdat(pgdat) { 2380 for (j = 0; j < MAX_NR_ZONES; j++) { 2381 struct zone *zone = pgdat->node_zones + j; 2382 unsigned long present_pages = zone->present_pages; 2383 2384 zone->lowmem_reserve[j] = 0; 2385 2386 for (idx = j-1; idx >= 0; idx--) { 2387 struct zone *lower_zone; 2388 2389 if (sysctl_lowmem_reserve_ratio[idx] < 1) 2390 sysctl_lowmem_reserve_ratio[idx] = 1; 2391 2392 lower_zone = pgdat->node_zones + idx; 2393 lower_zone->lowmem_reserve[j] = present_pages / 2394 sysctl_lowmem_reserve_ratio[idx]; 2395 present_pages += lower_zone->present_pages; 2396 } 2397 } 2398 } 2399} 2400 2401/* 2402 * setup_per_zone_pages_min - called when min_free_kbytes changes. Ensures 2403 * that the pages_{min,low,high} values for each zone are set correctly 2404 * with respect to min_free_kbytes. 2405 */ 2406void setup_per_zone_pages_min(void) 2407{ 2408 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 2409 unsigned long lowmem_pages = 0; 2410 struct zone *zone; 2411 unsigned long flags; 2412 2413 /* Calculate total number of !ZONE_HIGHMEM pages */ 2414 for_each_zone(zone) { 2415 if (!is_highmem(zone)) 2416 lowmem_pages += zone->present_pages; 2417 } 2418 2419 for_each_zone(zone) { 2420 unsigned long tmp; 2421 spin_lock_irqsave(&zone->lru_lock, flags); 2422 tmp = (pages_min * zone->present_pages) / lowmem_pages; 2423 if (is_highmem(zone)) { 2424 /* 2425 * __GFP_HIGH and PF_MEMALLOC allocations usually don't 2426 * need highmem pages, so cap pages_min to a small 2427 * value here. 2428 * 2429 * The (pages_high-pages_low) and (pages_low-pages_min) 2430 * deltas controls asynch page reclaim, and so should 2431 * not be capped for highmem. 2432 */ 2433 int min_pages; 2434 2435 min_pages = zone->present_pages / 1024; 2436 if (min_pages < SWAP_CLUSTER_MAX) 2437 min_pages = SWAP_CLUSTER_MAX; 2438 if (min_pages > 128) 2439 min_pages = 128; 2440 zone->pages_min = min_pages; 2441 } else { 2442 /* 2443 * If it's a lowmem zone, reserve a number of pages 2444 * proportionate to the zone's size. 2445 */ 2446 zone->pages_min = tmp; 2447 } 2448 2449 zone->pages_low = zone->pages_min + tmp / 4; 2450 zone->pages_high = zone->pages_min + tmp / 2; 2451 spin_unlock_irqrestore(&zone->lru_lock, flags); 2452 } 2453} 2454 2455/* 2456 * Initialise min_free_kbytes. 2457 * 2458 * For small machines we want it small (128k min). For large machines 2459 * we want it large (64MB max). But it is not linear, because network 2460 * bandwidth does not increase linearly with machine size. We use 2461 * 2462 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: 2463 * min_free_kbytes = sqrt(lowmem_kbytes * 16) 2464 * 2465 * which yields 2466 * 2467 * 16MB: 512k 2468 * 32MB: 724k 2469 * 64MB: 1024k 2470 * 128MB: 1448k 2471 * 256MB: 2048k 2472 * 512MB: 2896k 2473 * 1024MB: 4096k 2474 * 2048MB: 5792k 2475 * 4096MB: 8192k 2476 * 8192MB: 11584k 2477 * 16384MB: 16384k 2478 */ 2479static int __init init_per_zone_pages_min(void) 2480{ 2481 unsigned long lowmem_kbytes; 2482 2483 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); 2484 2485 min_free_kbytes = int_sqrt(lowmem_kbytes * 16); 2486 if (min_free_kbytes < 128) 2487 min_free_kbytes = 128; 2488 if (min_free_kbytes > 65536) 2489 min_free_kbytes = 65536; 2490 setup_per_zone_pages_min(); 2491 setup_per_zone_lowmem_reserve(); 2492 return 0; 2493} 2494module_init(init_per_zone_pages_min) 2495 2496/* 2497 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 2498 * that we can call two helper functions whenever min_free_kbytes 2499 * changes. 2500 */ 2501int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 2502 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 2503{ 2504 proc_dointvec(table, write, file, buffer, length, ppos); 2505 setup_per_zone_pages_min(); 2506 return 0; 2507} 2508 2509/* 2510 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around 2511 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() 2512 * whenever sysctl_lowmem_reserve_ratio changes. 2513 * 2514 * The reserve ratio obviously has absolutely no relation with the 2515 * pages_min watermarks. The lowmem reserve ratio can only make sense 2516 * if in function of the boot time zone sizes. 2517 */ 2518int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, 2519 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 2520{ 2521 proc_dointvec_minmax(table, write, file, buffer, length, ppos); 2522 setup_per_zone_lowmem_reserve(); 2523 return 0; 2524} 2525 2526__initdata int hashdist = HASHDIST_DEFAULT; 2527 2528#ifdef CONFIG_NUMA 2529static int __init set_hashdist(char *str) 2530{ 2531 if (!str) 2532 return 0; 2533 hashdist = simple_strtoul(str, &str, 0); 2534 return 1; 2535} 2536__setup("hashdist=", set_hashdist); 2537#endif 2538 2539/* 2540 * allocate a large system hash table from bootmem 2541 * - it is assumed that the hash table must contain an exact power-of-2 2542 * quantity of entries 2543 * - limit is the number of hash buckets, not the total allocation size 2544 */ 2545void *__init alloc_large_system_hash(const char *tablename, 2546 unsigned long bucketsize, 2547 unsigned long numentries, 2548 int scale, 2549 int flags, 2550 unsigned int *_hash_shift, 2551 unsigned int *_hash_mask, 2552 unsigned long limit) 2553{ 2554 unsigned long long max = limit; 2555 unsigned long log2qty, size; 2556 void *table = NULL; 2557 2558 /* allow the kernel cmdline to have a say */ 2559 if (!numentries) { 2560 /* round applicable memory size up to nearest megabyte */ 2561 numentries = (flags & HASH_HIGHMEM) ? nr_all_pages : nr_kernel_pages; 2562 numentries += (1UL << (20 - PAGE_SHIFT)) - 1; 2563 numentries >>= 20 - PAGE_SHIFT; 2564 numentries <<= 20 - PAGE_SHIFT; 2565 2566 /* limit to 1 bucket per 2^scale bytes of low memory */ 2567 if (scale > PAGE_SHIFT) 2568 numentries >>= (scale - PAGE_SHIFT); 2569 else 2570 numentries <<= (PAGE_SHIFT - scale); 2571 } 2572 /* rounded up to nearest power of 2 in size */ 2573 numentries = 1UL << (long_log2(numentries) + 1); 2574 2575 /* limit allocation size to 1/16 total memory by default */ 2576 if (max == 0) { 2577 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; 2578 do_div(max, bucketsize); 2579 } 2580 2581 if (numentries > max) 2582 numentries = max; 2583 2584 log2qty = long_log2(numentries); 2585 2586 do { 2587 size = bucketsize << log2qty; 2588 if (flags & HASH_EARLY) 2589 table = alloc_bootmem(size); 2590 else if (hashdist) 2591 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 2592 else { 2593 unsigned long order; 2594 for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++) 2595 ; 2596 table = (void*) __get_free_pages(GFP_ATOMIC, order); 2597 } 2598 } while (!table && size > PAGE_SIZE && --log2qty); 2599 2600 if (!table) 2601 panic("Failed to allocate %s hash table\n", tablename); 2602 2603 printk("%s hash table entries: %d (order: %d, %lu bytes)\n", 2604 tablename, 2605 (1U << log2qty), 2606 long_log2(size) - PAGE_SHIFT, 2607 size); 2608 2609 if (_hash_shift) 2610 *_hash_shift = log2qty; 2611 if (_hash_mask) 2612 *_hash_mask = (1 << log2qty) - 1; 2613 2614 return table; 2615} 2616