page_alloc.c revision 08677214e318297f228237be0042aac754f48f1d
1/*
2 *  linux/mm/page_alloc.c
3 *
4 *  Manages the free list; the system allocates free pages here.
5 *  Note that kmalloc() lives in slab.c
6 *
7 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
8 *  Swap reorganised 29.12.95, Stephen Tweedie
9 *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
10 *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
11 *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
12 *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
13 *  Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
14 *          (lots of bits borrowed from Ingo Molnar & Andrew Morton)
15 */
16
17#include <linux/stddef.h>
18#include <linux/mm.h>
19#include <linux/swap.h>
20#include <linux/interrupt.h>
21#include <linux/pagemap.h>
22#include <linux/jiffies.h>
23#include <linux/bootmem.h>
24#include <linux/compiler.h>
25#include <linux/kernel.h>
26#include <linux/kmemcheck.h>
27#include <linux/module.h>
28#include <linux/suspend.h>
29#include <linux/pagevec.h>
30#include <linux/blkdev.h>
31#include <linux/slab.h>
32#include <linux/oom.h>
33#include <linux/notifier.h>
34#include <linux/topology.h>
35#include <linux/sysctl.h>
36#include <linux/cpu.h>
37#include <linux/cpuset.h>
38#include <linux/memory_hotplug.h>
39#include <linux/nodemask.h>
40#include <linux/vmalloc.h>
41#include <linux/mempolicy.h>
42#include <linux/stop_machine.h>
43#include <linux/sort.h>
44#include <linux/pfn.h>
45#include <linux/backing-dev.h>
46#include <linux/fault-inject.h>
47#include <linux/page-isolation.h>
48#include <linux/page_cgroup.h>
49#include <linux/debugobjects.h>
50#include <linux/kmemleak.h>
51#include <linux/memory.h>
52#include <trace/events/kmem.h>
53
54#include <asm/tlbflush.h>
55#include <asm/div64.h>
56#include "internal.h"
57
58/*
59 * Array of node states.
60 */
61nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
62	[N_POSSIBLE] = NODE_MASK_ALL,
63	[N_ONLINE] = { { [0] = 1UL } },
64#ifndef CONFIG_NUMA
65	[N_NORMAL_MEMORY] = { { [0] = 1UL } },
66#ifdef CONFIG_HIGHMEM
67	[N_HIGH_MEMORY] = { { [0] = 1UL } },
68#endif
69	[N_CPU] = { { [0] = 1UL } },
70#endif	/* NUMA */
71};
72EXPORT_SYMBOL(node_states);
73
74unsigned long totalram_pages __read_mostly;
75unsigned long totalreserve_pages __read_mostly;
76int percpu_pagelist_fraction;
77gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
78
79#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
80int pageblock_order __read_mostly;
81#endif
82
83static void __free_pages_ok(struct page *page, unsigned int order);
84
85/*
86 * results with 256, 32 in the lowmem_reserve sysctl:
87 *	1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
88 *	1G machine -> (16M dma, 784M normal, 224M high)
89 *	NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
90 *	HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
91 *	HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
92 *
93 * TBD: should special case ZONE_DMA32 machines here - in those we normally
94 * don't need any ZONE_NORMAL reservation
95 */
96int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
97#ifdef CONFIG_ZONE_DMA
98	 256,
99#endif
100#ifdef CONFIG_ZONE_DMA32
101	 256,
102#endif
103#ifdef CONFIG_HIGHMEM
104	 32,
105#endif
106	 32,
107};
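/*
 * Worked illustration of the numbers quoted above (assuming the default
 * 256/32 ratios on the 1G split of 16M dma / 784M normal / 224M high):
 * a HIGHMEM allocation reserves 224M/32 = 7M of ZONE_NORMAL and
 * (784M+224M)/256 ~= 4M of ZONE_DMA, while a NORMAL allocation reserves
 * only 784M/256 ~= 3M of ZONE_DMA.  Roughly, the reserve kept in zone i
 * against an allocation aimed at zone j is the combined size of zones
 * i+1..j divided by sysctl_lowmem_reserve_ratio[i].
 */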
108
109EXPORT_SYMBOL(totalram_pages);
110
111static char * const zone_names[MAX_NR_ZONES] = {
112#ifdef CONFIG_ZONE_DMA
113	 "DMA",
114#endif
115#ifdef CONFIG_ZONE_DMA32
116	 "DMA32",
117#endif
118	 "Normal",
119#ifdef CONFIG_HIGHMEM
120	 "HighMem",
121#endif
122	 "Movable",
123};
124
125int min_free_kbytes = 1024;
126
127static unsigned long __meminitdata nr_kernel_pages;
128static unsigned long __meminitdata nr_all_pages;
129static unsigned long __meminitdata dma_reserve;
130
131#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
132  /*
133   * MAX_ACTIVE_REGIONS determines the maximum number of distinct
134   * ranges of memory (RAM) that may be registered with add_active_range().
135   * Ranges passed to add_active_range() will be merged if possible
136   * so the number of times add_active_range() can be called is
137   * related to the number of nodes and the number of holes
138   */
139  #ifdef CONFIG_MAX_ACTIVE_REGIONS
140    /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */
141    #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS
142  #else
143    #if MAX_NUMNODES >= 32
144      /* If there can be many nodes, allow up to 50 holes per node */
145      #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50)
146    #else
147      /* By default, allow up to 256 distinct regions */
148      #define MAX_ACTIVE_REGIONS 256
149    #endif
150  #endif
151
152  static struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS];
153  static int __meminitdata nr_nodemap_entries;
154  static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
155  static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
156  static unsigned long __initdata required_kernelcore;
157  static unsigned long __initdata required_movablecore;
158  static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
159
160  /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
161  int movable_zone;
162  EXPORT_SYMBOL(movable_zone);
163#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
164
165#if MAX_NUMNODES > 1
166int nr_node_ids __read_mostly = MAX_NUMNODES;
167int nr_online_nodes __read_mostly = 1;
168EXPORT_SYMBOL(nr_node_ids);
169EXPORT_SYMBOL(nr_online_nodes);
170#endif
171
172int page_group_by_mobility_disabled __read_mostly;
173
174static void set_pageblock_migratetype(struct page *page, int migratetype)
175{
176
177	if (unlikely(page_group_by_mobility_disabled))
178		migratetype = MIGRATE_UNMOVABLE;
179
180	set_pageblock_flags_group(page, (unsigned long)migratetype,
181					PB_migrate, PB_migrate_end);
182}
183
184bool oom_killer_disabled __read_mostly;
185
186#ifdef CONFIG_DEBUG_VM
187static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
188{
189	int ret = 0;
190	unsigned seq;
191	unsigned long pfn = page_to_pfn(page);
192
193	do {
194		seq = zone_span_seqbegin(zone);
195		if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
196			ret = 1;
197		else if (pfn < zone->zone_start_pfn)
198			ret = 1;
199	} while (zone_span_seqretry(zone, seq));
200
201	return ret;
202}
203
204static int page_is_consistent(struct zone *zone, struct page *page)
205{
206	if (!pfn_valid_within(page_to_pfn(page)))
207		return 0;
208	if (zone != page_zone(page))
209		return 0;
210
211	return 1;
212}
213/*
214 * Temporary debugging check for pages not lying within a given zone.
215 */
216static int bad_range(struct zone *zone, struct page *page)
217{
218	if (page_outside_zone_boundaries(zone, page))
219		return 1;
220	if (!page_is_consistent(zone, page))
221		return 1;
222
223	return 0;
224}
225#else
226static inline int bad_range(struct zone *zone, struct page *page)
227{
228	return 0;
229}
230#endif
231
232static void bad_page(struct page *page)
233{
234	static unsigned long resume;
235	static unsigned long nr_shown;
236	static unsigned long nr_unshown;
237
238	/* Don't complain about poisoned pages */
239	if (PageHWPoison(page)) {
240		__ClearPageBuddy(page);
241		return;
242	}
243
244	/*
245	 * Allow a burst of 60 reports, then keep quiet for that minute;
246	 * or allow a steady drip of one report per second.
247	 */
248	if (nr_shown == 60) {
249		if (time_before(jiffies, resume)) {
250			nr_unshown++;
251			goto out;
252		}
253		if (nr_unshown) {
254			printk(KERN_ALERT
255			      "BUG: Bad page state: %lu messages suppressed\n",
256				nr_unshown);
257			nr_unshown = 0;
258		}
259		nr_shown = 0;
260	}
261	if (nr_shown++ == 0)
262		resume = jiffies + 60 * HZ;
263
264	printk(KERN_ALERT "BUG: Bad page state in process %s  pfn:%05lx\n",
265		current->comm, page_to_pfn(page));
266	printk(KERN_ALERT
267		"page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n",
268		page, (void *)page->flags, page_count(page),
269		page_mapcount(page), page->mapping, page->index);
270
271	dump_stack();
272out:
273	/* Leave bad fields for debug, except PageBuddy could make trouble */
274	__ClearPageBuddy(page);
275	add_taint(TAINT_BAD_PAGE);
276}
277
278/*
279 * Higher-order pages are called "compound pages".  They are structured thusly:
280 *
281 * The first PAGE_SIZE page is called the "head page".
282 *
283 * The remaining PAGE_SIZE pages are called "tail pages".
284 *
285 * All pages have PG_compound set.  All tail pages have their ->first_page
286 * pointing at the head page.
287 *
288 * The first tail page's ->lru.next holds the address of the compound page's
289 * put_page() function.  Its ->lru.prev holds the order of allocation.
290 * This usage means that zero-order pages may not be compound.
291 */
292
293static void free_compound_page(struct page *page)
294{
295	__free_pages_ok(page, compound_order(page));
296}
297
298void prep_compound_page(struct page *page, unsigned long order)
299{
300	int i;
301	int nr_pages = 1 << order;
302
303	set_compound_page_dtor(page, free_compound_page);
304	set_compound_order(page, order);
305	__SetPageHead(page);
306	for (i = 1; i < nr_pages; i++) {
307		struct page *p = page + i;
308
309		__SetPageTail(p);
310		p->first_page = page;
311	}
312}
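/*
 * Illustrative sketch (not part of the original source): given any page
 * p of a compound allocation prepared by prep_compound_page() above, the
 * head page and the order can be recovered from the fields set here:
 *
 *	struct page *head = PageTail(p) ? p->first_page : p;
 *	unsigned long order = compound_order(head);
 *
 * destroy_compound_page() below checks exactly these fields when the
 * compound page is torn down.
 */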
313
314static int destroy_compound_page(struct page *page, unsigned long order)
315{
316	int i;
317	int nr_pages = 1 << order;
318	int bad = 0;
319
320	if (unlikely(compound_order(page) != order) ||
321	    unlikely(!PageHead(page))) {
322		bad_page(page);
323		bad++;
324	}
325
326	__ClearPageHead(page);
327
328	for (i = 1; i < nr_pages; i++) {
329		struct page *p = page + i;
330
331		if (unlikely(!PageTail(p) || (p->first_page != page))) {
332			bad_page(page);
333			bad++;
334		}
335		__ClearPageTail(p);
336	}
337
338	return bad;
339}
340
341static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
342{
343	int i;
344
345	/*
346	 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
347	 * and __GFP_HIGHMEM from hard or soft interrupt context.
348	 */
349	VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
350	for (i = 0; i < (1 << order); i++)
351		clear_highpage(page + i);
352}
353
354static inline void set_page_order(struct page *page, int order)
355{
356	set_page_private(page, order);
357	__SetPageBuddy(page);
358}
359
360static inline void rmv_page_order(struct page *page)
361{
362	__ClearPageBuddy(page);
363	set_page_private(page, 0);
364}
365
366/*
367 * Locate the struct page for both the matching buddy in our
368 * pair (buddy1) and the combined order O+1 page they form (page).
369 *
370 * 1) Any buddy B1 will have an order O twin B2 which satisfies
371 * the following equation:
372 *     B2 = B1 ^ (1 << O)
373 * For example, if the starting buddy (B1) is #8, its order-1
374 * buddy (B2) is #10:
375 *     B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
376 *
377 * 2) Any buddy B will have an order O+1 parent P which
378 * satisfies the following equation:
379 *     P = B & ~(1 << O)
380 *
381 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
382 */
383static inline struct page *
384__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
385{
386	unsigned long buddy_idx = page_idx ^ (1 << order);
387
388	return page + (buddy_idx - page_idx);
389}
390
391static inline unsigned long
392__find_combined_index(unsigned long page_idx, unsigned int order)
393{
394	return (page_idx & ~(1 << order));
395}
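/*
 * Worked illustration of the two helpers above for page_idx = 8:
 *	order 0: buddy_idx = 8 ^ 1 = 9,  combined_idx = 8 & ~1 = 8
 *	order 1: buddy_idx = 8 ^ 2 = 10, combined_idx = 8 & ~2 = 8
 *	order 2: buddy_idx = 8 ^ 4 = 12, combined_idx = 8 & ~4 = 8
 * so page #8 keeps the same combined index at every order until a buddy
 * is found not to be free.
 */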
396
397/*
398 * This function checks whether a page is free && is the buddy that
399 * we can coalesce with. A page and its buddy can be coalesced if
400 * (a) the buddy is not in a hole &&
401 * (b) the buddy is in the buddy system &&
402 * (c) a page and its buddy have the same order &&
403 * (d) a page and its buddy are in the same zone.
404 *
405 * For recording whether a page is in the buddy system, we use PG_buddy.
406 * Setting, clearing, and testing PG_buddy is serialized by zone->lock.
407 *
408 * For recording page's order, we use page_private(page).
409 */
410static inline int page_is_buddy(struct page *page, struct page *buddy,
411								int order)
412{
413	if (!pfn_valid_within(page_to_pfn(buddy)))
414		return 0;
415
416	if (page_zone_id(page) != page_zone_id(buddy))
417		return 0;
418
419	if (PageBuddy(buddy) && page_order(buddy) == order) {
420		VM_BUG_ON(page_count(buddy) != 0);
421		return 1;
422	}
423	return 0;
424}
425
426/*
427 * Freeing function for a buddy system allocator.
428 *
429 * The concept of a buddy system is to maintain direct-mapped table
430 * (containing bit values) for memory blocks of various "orders".
431 * The bottom level table contains the map for the smallest allocatable
432 * units of memory (here, pages), and each level above it describes
433 * pairs of units from the levels below, hence, "buddies".
434 * At a high level, all that happens here is marking the table entry
435 * at the bottom level available, and propagating the changes upward
436 * as necessary, plus some accounting needed to play nicely with other
437 * parts of the VM system.
438 * At each level, we keep a list of pages, which are heads of contiguous
439 * free pages of length (1 << order) and marked with PG_buddy. The page's
440 * order is recorded in the page_private(page) field.
441 * So when we are allocating or freeing one, we can derive the state of the
442 * other.  That is, if we allocate a small block, and both were
443 * free, the remainder of the region must be split into blocks.
444 * If a block is freed, and its buddy is also free, then this
445 * triggers coalescing into a block of larger size.
446 *
447 * -- wli
448 */
449
450static inline void __free_one_page(struct page *page,
451		struct zone *zone, unsigned int order,
452		int migratetype)
453{
454	unsigned long page_idx;
455
456	if (unlikely(PageCompound(page)))
457		if (unlikely(destroy_compound_page(page, order)))
458			return;
459
460	VM_BUG_ON(migratetype == -1);
461
462	page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
463
464	VM_BUG_ON(page_idx & ((1 << order) - 1));
465	VM_BUG_ON(bad_range(zone, page));
466
467	while (order < MAX_ORDER-1) {
468		unsigned long combined_idx;
469		struct page *buddy;
470
471		buddy = __page_find_buddy(page, page_idx, order);
472		if (!page_is_buddy(page, buddy, order))
473			break;
474
475		/* Our buddy is free, merge with it and move up one order. */
476		list_del(&buddy->lru);
477		zone->free_area[order].nr_free--;
478		rmv_page_order(buddy);
479		combined_idx = __find_combined_index(page_idx, order);
480		page = page + (combined_idx - page_idx);
481		page_idx = combined_idx;
482		order++;
483	}
484	set_page_order(page, order);
485	list_add(&page->lru,
486		&zone->free_area[order].free_list[migratetype]);
487	zone->free_area[order].nr_free++;
488}
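/*
 * Worked illustration of the merge loop above (hypothetical free-list
 * state): freeing the order-0 page at page_idx 10 while page 11 is a
 * free order-0 block and pages 8-9 form a free order-1 block proceeds as
 *	order 0: buddy_idx = 10 ^ 1 = 11, free -> merge, combined_idx = 10
 *	order 1: buddy_idx = 10 ^ 2 = 8,  free -> merge, combined_idx = 8
 *	order 2: buddy_idx = 8 ^ 4 = 12,  not free -> stop
 * and puts a single order-2 block at index 8 on the free list.
 */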
489
490/*
491 * free_page_mlock() -- clean up attempts to free an mlocked() page.
492 * Page should not be on lru, so no need to fix that up.
493 * free_pages_check() will verify...
494 */
495static inline void free_page_mlock(struct page *page)
496{
497	__dec_zone_page_state(page, NR_MLOCK);
498	__count_vm_event(UNEVICTABLE_MLOCKFREED);
499}
500
501static inline int free_pages_check(struct page *page)
502{
503	if (unlikely(page_mapcount(page) |
504		(page->mapping != NULL)  |
505		(atomic_read(&page->_count) != 0) |
506		(page->flags & PAGE_FLAGS_CHECK_AT_FREE))) {
507		bad_page(page);
508		return 1;
509	}
510	if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
511		page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
512	return 0;
513}
514
515/*
516 * Frees a number of pages from the PCP lists
517 * Assumes all pages on list are in same zone, and of same order.
518 * count is the number of pages to free.
519 *
520 * If the zone was previously in an "all pages pinned" state then look to
521 * see if this freeing clears that state.
522 *
523 * And clear the zone's pages_scanned counter, to hold off the "all pages are
524 * pinned" detection logic.
525 */
526static void free_pcppages_bulk(struct zone *zone, int count,
527					struct per_cpu_pages *pcp)
528{
529	int migratetype = 0;
530	int batch_free = 0;
531
532	spin_lock(&zone->lock);
533	zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
534	zone->pages_scanned = 0;
535
536	__mod_zone_page_state(zone, NR_FREE_PAGES, count);
537	while (count) {
538		struct page *page;
539		struct list_head *list;
540
541		/*
542		 * Remove pages from lists in a round-robin fashion. A
543		 * batch_free count is maintained that is incremented when an
544		 * empty list is encountered.  This is so more pages are freed
545		 * off fuller lists instead of spinning excessively around empty
546		 * lists
547		 */
548		do {
549			batch_free++;
550			if (++migratetype == MIGRATE_PCPTYPES)
551				migratetype = 0;
552			list = &pcp->lists[migratetype];
553		} while (list_empty(list));
554
555		do {
556			page = list_entry(list->prev, struct page, lru);
557			/* must delete as __free_one_page manipulates the list */
558			list_del(&page->lru);
559			/* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
560			__free_one_page(page, zone, 0, page_private(page));
561			trace_mm_page_pcpu_drain(page, 0, page_private(page));
562		} while (--count && --batch_free && !list_empty(list));
563	}
564	spin_unlock(&zone->lock);
565}
566
567static void free_one_page(struct zone *zone, struct page *page, int order,
568				int migratetype)
569{
570	spin_lock(&zone->lock);
571	zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
572	zone->pages_scanned = 0;
573
574	__mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
575	__free_one_page(page, zone, order, migratetype);
576	spin_unlock(&zone->lock);
577}
578
579static void __free_pages_ok(struct page *page, unsigned int order)
580{
581	unsigned long flags;
582	int i;
583	int bad = 0;
584	int wasMlocked = __TestClearPageMlocked(page);
585
586	kmemcheck_free_shadow(page, order);
587
588	for (i = 0 ; i < (1 << order) ; ++i)
589		bad += free_pages_check(page + i);
590	if (bad)
591		return;
592
593	if (!PageHighMem(page)) {
594		debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
595		debug_check_no_obj_freed(page_address(page),
596					   PAGE_SIZE << order);
597	}
598	arch_free_page(page, order);
599	kernel_map_pages(page, 1 << order, 0);
600
601	local_irq_save(flags);
602	if (unlikely(wasMlocked))
603		free_page_mlock(page);
604	__count_vm_events(PGFREE, 1 << order);
605	free_one_page(page_zone(page), page, order,
606					get_pageblock_migratetype(page));
607	local_irq_restore(flags);
608}
609
610/*
611 * permit the bootmem allocator to evade page validation on high-order frees
612 */
613void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
614{
615	if (order == 0) {
616		__ClearPageReserved(page);
617		set_page_count(page, 0);
618		set_page_refcounted(page);
619		__free_page(page);
620	} else {
621		int loop;
622
623		prefetchw(page);
624		for (loop = 0; loop < BITS_PER_LONG; loop++) {
625			struct page *p = &page[loop];
626
627			if (loop + 1 < BITS_PER_LONG)
628				prefetchw(p + 1);
629			__ClearPageReserved(p);
630			set_page_count(p, 0);
631		}
632
633		set_page_refcounted(page);
634		__free_pages(page, order);
635	}
636}
637
638
639/*
640 * The order of subdivision here is critical for the IO subsystem.
641 * Please do not alter this order without good reasons and regression
642 * testing. Specifically, as large blocks of memory are subdivided,
643 * the order in which smaller blocks are delivered depends on the order
644 * they're subdivided in this function. This is the primary factor
645 * influencing the order in which pages are delivered to the IO
646 * subsystem according to empirical testing, and this is also justified
647 * by considering the behavior of a buddy system containing a single
648 * large block of memory acted on by a series of small allocations.
649 * This behavior is a critical factor in sglist merging's success.
650 *
651 * -- wli
652 */
653static inline void expand(struct zone *zone, struct page *page,
654	int low, int high, struct free_area *area,
655	int migratetype)
656{
657	unsigned long size = 1 << high;
658
659	while (high > low) {
660		area--;
661		high--;
662		size >>= 1;
663		VM_BUG_ON(bad_range(zone, &page[size]));
664		list_add(&page[size].lru, &area->free_list[migratetype]);
665		area->nr_free++;
666		set_page_order(&page[size], high);
667	}
668}
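/*
 * Worked illustration of expand(): satisfying an order-0 request
 * (low = 0) from an order-3 block (high = 3) runs the loop three times,
 * returning
 *	page[4] to the free list as an order-2 block,
 *	page[2] as an order-1 block,
 *	page[1] as an order-0 block,
 * and leaves page[0] as the order-0 page handed to the caller.
 */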
669
670/*
671 * This page is about to be returned from the page allocator
672 */
673static inline int check_new_page(struct page *page)
674{
675	if (unlikely(page_mapcount(page) |
676		(page->mapping != NULL)  |
677		(atomic_read(&page->_count) != 0)  |
678		(page->flags & PAGE_FLAGS_CHECK_AT_PREP))) {
679		bad_page(page);
680		return 1;
681	}
682	return 0;
683}
684
685static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
686{
687	int i;
688
689	for (i = 0; i < (1 << order); i++) {
690		struct page *p = page + i;
691		if (unlikely(check_new_page(p)))
692			return 1;
693	}
694
695	set_page_private(page, 0);
696	set_page_refcounted(page);
697
698	arch_alloc_page(page, order);
699	kernel_map_pages(page, 1 << order, 1);
700
701	if (gfp_flags & __GFP_ZERO)
702		prep_zero_page(page, order, gfp_flags);
703
704	if (order && (gfp_flags & __GFP_COMP))
705		prep_compound_page(page, order);
706
707	return 0;
708}
709
710/*
711 * Go through the free lists for the given migratetype and remove
712 * the smallest available page from the freelists
713 */
714static inline
715struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
716						int migratetype)
717{
718	unsigned int current_order;
719	struct free_area * area;
720	struct page *page;
721
722	/* Find a page of the appropriate size in the preferred list */
723	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
724		area = &(zone->free_area[current_order]);
725		if (list_empty(&area->free_list[migratetype]))
726			continue;
727
728		page = list_entry(area->free_list[migratetype].next,
729							struct page, lru);
730		list_del(&page->lru);
731		rmv_page_order(page);
732		area->nr_free--;
733		expand(zone, page, order, current_order, area, migratetype);
734		return page;
735	}
736
737	return NULL;
738}
739
740
741/*
742 * This array describes the order in which free lists are fallen back on
743 * when the free lists for the desired migrate type are depleted
744 */
745static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = {
746	[MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,   MIGRATE_RESERVE },
747	[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_RESERVE },
748	[MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
749	[MIGRATE_RESERVE]     = { MIGRATE_RESERVE,     MIGRATE_RESERVE,   MIGRATE_RESERVE }, /* Never used */
750};
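/*
 * Example: a MIGRATE_UNMOVABLE request whose own free lists are empty is
 * retried by __rmqueue_fallback() below against MIGRATE_RECLAIMABLE and
 * then MIGRATE_MOVABLE; MIGRATE_RESERVE is skipped there and only used
 * as the last resort in __rmqueue().
 */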
751
752/*
753 * Move the free pages in a range to the free lists of the requested type.
754 * Note that start_page and end_page may not be aligned on a pageblock
755 * boundary. If alignment is required, use move_freepages_block()
756 */
757static int move_freepages(struct zone *zone,
758			  struct page *start_page, struct page *end_page,
759			  int migratetype)
760{
761	struct page *page;
762	unsigned long order;
763	int pages_moved = 0;
764
765#ifndef CONFIG_HOLES_IN_ZONE
766	/*
767	 * page_zone is not safe to call in this context when
768	 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
769	 * anyway as we check zone boundaries in move_freepages_block().
770	 * Remove at a later date when no bug reports exist related to
771	 * grouping pages by mobility
772	 */
773	BUG_ON(page_zone(start_page) != page_zone(end_page));
774#endif
775
776	for (page = start_page; page <= end_page;) {
777		/* Make sure we are not inadvertently changing nodes */
778		VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone));
779
780		if (!pfn_valid_within(page_to_pfn(page))) {
781			page++;
782			continue;
783		}
784
785		if (!PageBuddy(page)) {
786			page++;
787			continue;
788		}
789
790		order = page_order(page);
791		list_del(&page->lru);
792		list_add(&page->lru,
793			&zone->free_area[order].free_list[migratetype]);
794		page += 1 << order;
795		pages_moved += 1 << order;
796	}
797
798	return pages_moved;
799}
800
801static int move_freepages_block(struct zone *zone, struct page *page,
802				int migratetype)
803{
804	unsigned long start_pfn, end_pfn;
805	struct page *start_page, *end_page;
806
807	start_pfn = page_to_pfn(page);
808	start_pfn = start_pfn & ~(pageblock_nr_pages-1);
809	start_page = pfn_to_page(start_pfn);
810	end_page = start_page + pageblock_nr_pages - 1;
811	end_pfn = start_pfn + pageblock_nr_pages - 1;
812
813	/* Do not cross zone boundaries */
814	if (start_pfn < zone->zone_start_pfn)
815		start_page = page;
816	if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages)
817		return 0;
818
819	return move_freepages(zone, start_page, end_page, migratetype);
820}
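/*
 * Worked illustration (hypothetical pageblock size of 1024 pages): for a
 * page at pfn 5000, start_pfn is rounded down to 5000 & ~1023 = 4096 and
 * the block spans pfns 4096-5119.  If the block runs past the end of the
 * zone, move_freepages_block() returns 0 without moving anything.
 */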
821
822static void change_pageblock_range(struct page *pageblock_page,
823					int start_order, int migratetype)
824{
825	int nr_pageblocks = 1 << (start_order - pageblock_order);
826
827	while (nr_pageblocks--) {
828		set_pageblock_migratetype(pageblock_page, migratetype);
829		pageblock_page += pageblock_nr_pages;
830	}
831}
832
833/* Remove an element from the buddy allocator from the fallback list */
834static inline struct page *
835__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
836{
837	struct free_area * area;
838	int current_order;
839	struct page *page;
840	int migratetype, i;
841
842	/* Find the largest possible block of pages in the other list */
843	for (current_order = MAX_ORDER-1; current_order >= order;
844						--current_order) {
845		for (i = 0; i < MIGRATE_TYPES - 1; i++) {
846			migratetype = fallbacks[start_migratetype][i];
847
848			/* MIGRATE_RESERVE handled later if necessary */
849			if (migratetype == MIGRATE_RESERVE)
850				continue;
851
852			area = &(zone->free_area[current_order]);
853			if (list_empty(&area->free_list[migratetype]))
854				continue;
855
856			page = list_entry(area->free_list[migratetype].next,
857					struct page, lru);
858			area->nr_free--;
859
860			/*
861			 * If breaking a large block of pages, move all free
862			 * pages to the preferred allocation list. If falling
863			 * back for a reclaimable kernel allocation, be more
864			 * aggressive about taking ownership of free pages
865			 */
866			if (unlikely(current_order >= (pageblock_order >> 1)) ||
867					start_migratetype == MIGRATE_RECLAIMABLE ||
868					page_group_by_mobility_disabled) {
869				unsigned long pages;
870				pages = move_freepages_block(zone, page,
871								start_migratetype);
872
873				/* Claim the whole block if over half of it is free */
874				if (pages >= (1 << (pageblock_order-1)) ||
875						page_group_by_mobility_disabled)
876					set_pageblock_migratetype(page,
877								start_migratetype);
878
879				migratetype = start_migratetype;
880			}
881
882			/* Remove the page from the freelists */
883			list_del(&page->lru);
884			rmv_page_order(page);
885
886			/* Take ownership for orders >= pageblock_order */
887			if (current_order >= pageblock_order)
888				change_pageblock_range(page, current_order,
889							start_migratetype);
890
891			expand(zone, page, order, current_order, area, migratetype);
892
893			trace_mm_page_alloc_extfrag(page, order, current_order,
894				start_migratetype, migratetype);
895
896			return page;
897		}
898	}
899
900	return NULL;
901}
902
903/*
904 * Do the hard work of removing an element from the buddy allocator.
905 * Call me with the zone->lock already held.
906 */
907static struct page *__rmqueue(struct zone *zone, unsigned int order,
908						int migratetype)
909{
910	struct page *page;
911
912retry_reserve:
913	page = __rmqueue_smallest(zone, order, migratetype);
914
915	if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
916		page = __rmqueue_fallback(zone, order, migratetype);
917
918		/*
919		 * Use MIGRATE_RESERVE rather than fail an allocation. goto
920		 * is used because __rmqueue_smallest is an inline function
921		 * and we want just one call site
922		 */
923		if (!page) {
924			migratetype = MIGRATE_RESERVE;
925			goto retry_reserve;
926		}
927	}
928
929	trace_mm_page_alloc_zone_locked(page, order, migratetype);
930	return page;
931}
932
933/*
934 * Obtain a specified number of elements from the buddy allocator, all under
935 * a single hold of the lock, for efficiency.  Add them to the supplied list.
936 * Returns the number of new pages which were placed at *list.
937 */
938static int rmqueue_bulk(struct zone *zone, unsigned int order,
939			unsigned long count, struct list_head *list,
940			int migratetype, int cold)
941{
942	int i;
943
944	spin_lock(&zone->lock);
945	for (i = 0; i < count; ++i) {
946		struct page *page = __rmqueue(zone, order, migratetype);
947		if (unlikely(page == NULL))
948			break;
949
950		/*
951		 * Split buddy pages returned by expand() are received here
952		 * in physical page order. The page is added to the caller's
953		 * list and the list head then moves forward. From the caller's
954		 * perspective, the linked list is ordered by page number under
955		 * some conditions. This is useful for IO devices that can
956		 * merge IO requests if the physical pages are ordered
957		 * properly.
958		 */
959		if (likely(cold == 0))
960			list_add(&page->lru, list);
961		else
962			list_add_tail(&page->lru, list);
963		set_page_private(page, migratetype);
964		list = &page->lru;
965	}
966	__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
967	spin_unlock(&zone->lock);
968	return i;
969}
970
971#ifdef CONFIG_NUMA
972/*
973 * Called from the vmstat counter updater to drain pagesets of this
974 * currently executing processor on remote nodes after they have
975 * expired.
976 *
977 * Note that this function must be called with the thread pinned to
978 * a single processor.
979 */
980void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
981{
982	unsigned long flags;
983	int to_drain;
984
985	local_irq_save(flags);
986	if (pcp->count >= pcp->batch)
987		to_drain = pcp->batch;
988	else
989		to_drain = pcp->count;
990	free_pcppages_bulk(zone, to_drain, pcp);
991	pcp->count -= to_drain;
992	local_irq_restore(flags);
993}
994#endif
995
996/*
997 * Drain pages of the indicated processor.
998 *
999 * The processor must either be the current processor and the
1000 * thread pinned to the current processor or a processor that
1001 * is not online.
1002 */
1003static void drain_pages(unsigned int cpu)
1004{
1005	unsigned long flags;
1006	struct zone *zone;
1007
1008	for_each_populated_zone(zone) {
1009		struct per_cpu_pageset *pset;
1010		struct per_cpu_pages *pcp;
1011
1012		pset = zone_pcp(zone, cpu);
1013
1014		pcp = &pset->pcp;
1015		local_irq_save(flags);
1016		free_pcppages_bulk(zone, pcp->count, pcp);
1017		pcp->count = 0;
1018		local_irq_restore(flags);
1019	}
1020}
1021
1022/*
1023 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
1024 */
1025void drain_local_pages(void *arg)
1026{
1027	drain_pages(smp_processor_id());
1028}
1029
1030/*
1031 * Spill all the per-cpu pages from all CPUs back into the buddy allocator
1032 */
1033void drain_all_pages(void)
1034{
1035	on_each_cpu(drain_local_pages, NULL, 1);
1036}
1037
1038#ifdef CONFIG_HIBERNATION
1039
1040void mark_free_pages(struct zone *zone)
1041{
1042	unsigned long pfn, max_zone_pfn;
1043	unsigned long flags;
1044	int order, t;
1045	struct list_head *curr;
1046
1047	if (!zone->spanned_pages)
1048		return;
1049
1050	spin_lock_irqsave(&zone->lock, flags);
1051
1052	max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
1053	for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1054		if (pfn_valid(pfn)) {
1055			struct page *page = pfn_to_page(pfn);
1056
1057			if (!swsusp_page_is_forbidden(page))
1058				swsusp_unset_page_free(page);
1059		}
1060
1061	for_each_migratetype_order(order, t) {
1062		list_for_each(curr, &zone->free_area[order].free_list[t]) {
1063			unsigned long i;
1064
1065			pfn = page_to_pfn(list_entry(curr, struct page, lru));
1066			for (i = 0; i < (1UL << order); i++)
1067				swsusp_set_page_free(pfn_to_page(pfn + i));
1068		}
1069	}
1070	spin_unlock_irqrestore(&zone->lock, flags);
1071}
1072#endif /* CONFIG_HIBERNATION */
1073
1074/*
1075 * Free a 0-order page
1076 */
1077static void free_hot_cold_page(struct page *page, int cold)
1078{
1079	struct zone *zone = page_zone(page);
1080	struct per_cpu_pages *pcp;
1081	unsigned long flags;
1082	int migratetype;
1083	int wasMlocked = __TestClearPageMlocked(page);
1084
1085	kmemcheck_free_shadow(page, 0);
1086
1087	if (PageAnon(page))
1088		page->mapping = NULL;
1089	if (free_pages_check(page))
1090		return;
1091
1092	if (!PageHighMem(page)) {
1093		debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
1094		debug_check_no_obj_freed(page_address(page), PAGE_SIZE);
1095	}
1096	arch_free_page(page, 0);
1097	kernel_map_pages(page, 1, 0);
1098
1099	pcp = &zone_pcp(zone, get_cpu())->pcp;
1100	migratetype = get_pageblock_migratetype(page);
1101	set_page_private(page, migratetype);
1102	local_irq_save(flags);
1103	if (unlikely(wasMlocked))
1104		free_page_mlock(page);
1105	__count_vm_event(PGFREE);
1106
1107	/*
1108	 * We only track unmovable, reclaimable and movable on pcp lists.
1109	 * Free ISOLATE pages back to the allocator because they are being
1110	 * offlined but treat RESERVE as movable pages so we can get those
1111	 * areas back if necessary. Otherwise, we may have to free
1112	 * excessively into the page allocator
1113	 */
1114	if (migratetype >= MIGRATE_PCPTYPES) {
1115		if (unlikely(migratetype == MIGRATE_ISOLATE)) {
1116			free_one_page(zone, page, 0, migratetype);
1117			goto out;
1118		}
1119		migratetype = MIGRATE_MOVABLE;
1120	}
1121
1122	if (cold)
1123		list_add_tail(&page->lru, &pcp->lists[migratetype]);
1124	else
1125		list_add(&page->lru, &pcp->lists[migratetype]);
1126	pcp->count++;
1127	if (pcp->count >= pcp->high) {
1128		free_pcppages_bulk(zone, pcp->batch, pcp);
1129		pcp->count -= pcp->batch;
1130	}
1131
1132out:
1133	local_irq_restore(flags);
1134	put_cpu();
1135}
1136
1137void free_hot_page(struct page *page)
1138{
1139	trace_mm_page_free_direct(page, 0);
1140	free_hot_cold_page(page, 0);
1141}
1142
1143/*
1144 * split_page takes a non-compound higher-order page, and splits it into
1145 * n (1<<order) sub-pages: page[0..n-1]
1146 * Each sub-page must be freed individually.
1147 *
1148 * Note: this is probably too low level an operation for use in drivers.
1149 * Please consult with lkml before using this in your driver.
1150 */
1151void split_page(struct page *page, unsigned int order)
1152{
1153	int i;
1154
1155	VM_BUG_ON(PageCompound(page));
1156	VM_BUG_ON(!page_count(page));
1157
1158#ifdef CONFIG_KMEMCHECK
1159	/*
1160	 * Split shadow pages too, because free(page[0]) would
1161	 * otherwise free the whole shadow.
1162	 */
1163	if (kmemcheck_page_is_tracked(page))
1164		split_page(virt_to_page(page[0].shadow), order);
1165#endif
1166
1167	for (i = 1; i < (1 << order); i++)
1168		set_page_refcounted(page + i);
1169}
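/*
 * Usage sketch (illustration only, error handling omitted): a caller
 * that wants four physically contiguous pages but needs to free them one
 * at a time might do
 *
 *	struct page *page = alloc_pages(GFP_KERNEL, 2);
 *	int i;
 *
 *	split_page(page, 2);
 *	...
 *	for (i = 0; i < 4; i++)
 *		__free_page(page + i);
 *
 * in line with the "freed individually" rule stated above.
 */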
1170
1171/*
1172 * Really, prep_compound_page() should be called from __rmqueue_bulk().  But
1173 * we cheat by calling it from here, in the order > 0 path.  Saves a branch
1174 * or two.
1175 */
1176static inline
1177struct page *buffered_rmqueue(struct zone *preferred_zone,
1178			struct zone *zone, int order, gfp_t gfp_flags,
1179			int migratetype)
1180{
1181	unsigned long flags;
1182	struct page *page;
1183	int cold = !!(gfp_flags & __GFP_COLD);
1184	int cpu;
1185
1186again:
1187	cpu  = get_cpu();
1188	if (likely(order == 0)) {
1189		struct per_cpu_pages *pcp;
1190		struct list_head *list;
1191
1192		pcp = &zone_pcp(zone, cpu)->pcp;
1193		list = &pcp->lists[migratetype];
1194		local_irq_save(flags);
1195		if (list_empty(list)) {
1196			pcp->count += rmqueue_bulk(zone, 0,
1197					pcp->batch, list,
1198					migratetype, cold);
1199			if (unlikely(list_empty(list)))
1200				goto failed;
1201		}
1202
1203		if (cold)
1204			page = list_entry(list->prev, struct page, lru);
1205		else
1206			page = list_entry(list->next, struct page, lru);
1207
1208		list_del(&page->lru);
1209		pcp->count--;
1210	} else {
1211		if (unlikely(gfp_flags & __GFP_NOFAIL)) {
1212			/*
1213			 * __GFP_NOFAIL is not to be used in new code.
1214			 *
1215			 * All __GFP_NOFAIL callers should be fixed so that they
1216			 * properly detect and handle allocation failures.
1217			 *
1218			 * We most definitely don't want callers attempting to
1219			 * allocate greater than order-1 page units with
1220			 * __GFP_NOFAIL.
1221			 */
1222			WARN_ON_ONCE(order > 1);
1223		}
1224		spin_lock_irqsave(&zone->lock, flags);
1225		page = __rmqueue(zone, order, migratetype);
1226		spin_unlock(&zone->lock);
1227		if (!page)
1228			goto failed;
1229		__mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
1230	}
1231
1232	__count_zone_vm_events(PGALLOC, zone, 1 << order);
1233	zone_statistics(preferred_zone, zone);
1234	local_irq_restore(flags);
1235	put_cpu();
1236
1237	VM_BUG_ON(bad_range(zone, page));
1238	if (prep_new_page(page, order, gfp_flags))
1239		goto again;
1240	return page;
1241
1242failed:
1243	local_irq_restore(flags);
1244	put_cpu();
1245	return NULL;
1246}
1247
1248/* The ALLOC_WMARK bits are used as an index to zone->watermark */
1249#define ALLOC_WMARK_MIN		WMARK_MIN
1250#define ALLOC_WMARK_LOW		WMARK_LOW
1251#define ALLOC_WMARK_HIGH	WMARK_HIGH
1252#define ALLOC_NO_WATERMARKS	0x04 /* don't check watermarks at all */
1253
1254/* Mask to get the watermark bits */
1255#define ALLOC_WMARK_MASK	(ALLOC_NO_WATERMARKS-1)
1256
1257#define ALLOC_HARDER		0x10 /* try to alloc harder */
1258#define ALLOC_HIGH		0x20 /* __GFP_HIGH set */
1259#define ALLOC_CPUSET		0x40 /* check for correct cpuset */
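/*
 * Example: the low bits double as an index, so get_page_from_freelist()
 * below fetches the threshold to test with
 *
 *	mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
 *
 * while ALLOC_HARDER, ALLOC_HIGH and ALLOC_CPUSET live above the mask
 * and do not disturb that index.
 */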
1260
1261#ifdef CONFIG_FAIL_PAGE_ALLOC
1262
1263static struct fail_page_alloc_attr {
1264	struct fault_attr attr;
1265
1266	u32 ignore_gfp_highmem;
1267	u32 ignore_gfp_wait;
1268	u32 min_order;
1269
1270#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
1271
1272	struct dentry *ignore_gfp_highmem_file;
1273	struct dentry *ignore_gfp_wait_file;
1274	struct dentry *min_order_file;
1275
1276#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
1277
1278} fail_page_alloc = {
1279	.attr = FAULT_ATTR_INITIALIZER,
1280	.ignore_gfp_wait = 1,
1281	.ignore_gfp_highmem = 1,
1282	.min_order = 1,
1283};
1284
1285static int __init setup_fail_page_alloc(char *str)
1286{
1287	return setup_fault_attr(&fail_page_alloc.attr, str);
1288}
1289__setup("fail_page_alloc=", setup_fail_page_alloc);
1290
1291static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1292{
1293	if (order < fail_page_alloc.min_order)
1294		return 0;
1295	if (gfp_mask & __GFP_NOFAIL)
1296		return 0;
1297	if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
1298		return 0;
1299	if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
1300		return 0;
1301
1302	return should_fail(&fail_page_alloc.attr, 1 << order);
1303}
1304
1305#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
1306
1307static int __init fail_page_alloc_debugfs(void)
1308{
1309	mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
1310	struct dentry *dir;
1311	int err;
1312
1313	err = init_fault_attr_dentries(&fail_page_alloc.attr,
1314				       "fail_page_alloc");
1315	if (err)
1316		return err;
1317	dir = fail_page_alloc.attr.dentries.dir;
1318
1319	fail_page_alloc.ignore_gfp_wait_file =
1320		debugfs_create_bool("ignore-gfp-wait", mode, dir,
1321				      &fail_page_alloc.ignore_gfp_wait);
1322
1323	fail_page_alloc.ignore_gfp_highmem_file =
1324		debugfs_create_bool("ignore-gfp-highmem", mode, dir,
1325				      &fail_page_alloc.ignore_gfp_highmem);
1326	fail_page_alloc.min_order_file =
1327		debugfs_create_u32("min-order", mode, dir,
1328				   &fail_page_alloc.min_order);
1329
1330	if (!fail_page_alloc.ignore_gfp_wait_file ||
1331            !fail_page_alloc.ignore_gfp_highmem_file ||
1332            !fail_page_alloc.min_order_file) {
1333		err = -ENOMEM;
1334		debugfs_remove(fail_page_alloc.ignore_gfp_wait_file);
1335		debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file);
1336		debugfs_remove(fail_page_alloc.min_order_file);
1337		cleanup_fault_attr_dentries(&fail_page_alloc.attr);
1338	}
1339
1340	return err;
1341}
1342
1343late_initcall(fail_page_alloc_debugfs);
1344
1345#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
1346
1347#else /* CONFIG_FAIL_PAGE_ALLOC */
1348
1349static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1350{
1351	return 0;
1352}
1353
1354#endif /* CONFIG_FAIL_PAGE_ALLOC */
1355
1356/*
1357 * Return 1 if free pages are above 'mark'. This takes into account the order
1358 * of the allocation.
1359 */
1360int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1361		      int classzone_idx, int alloc_flags)
1362{
1363	/* free_pages may go negative - that's OK */
1364	long min = mark;
1365	long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1;
1366	int o;
1367
1368	if (alloc_flags & ALLOC_HIGH)
1369		min -= min / 2;
1370	if (alloc_flags & ALLOC_HARDER)
1371		min -= min / 4;
1372
1373	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
1374		return 0;
1375	for (o = 0; o < order; o++) {
1376		/* At the next order, this order's pages become unavailable */
1377		free_pages -= z->free_area[o].nr_free << o;
1378
1379		/* Require fewer higher order pages to be free */
1380		min >>= 1;
1381
1382		if (free_pages <= min)
1383			return 0;
1384	}
1385	return 1;
1386}
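/*
 * Worked illustration (hypothetical numbers, lowmem reserve taken as 0):
 * an order-2 check against mark = 64 on a zone with 1000 free pages, 600
 * of them in order-0 blocks and 100 order-1 blocks, starts from
 * free_pages = 1000 - 4 + 1 = 997 and then
 *	o = 0: 997 - 600     = 397 > 32 (min halved each step)
 *	o = 1: 397 - 100 * 2 = 197 > 16
 * so the watermark is met and 1 is returned.
 */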
1387
1388#ifdef CONFIG_NUMA
1389/*
1390 * zlc_setup - Setup for "zonelist cache".  Uses cached zone data to
1391 * skip over zones that are not allowed by the cpuset, or that have
1392 * been recently (in the last second) found to be nearly full.  See further
1393 * comments in mmzone.h.  Reduces cache footprint of zonelist scans
1394 * that have to skip over a lot of full or unallowed zones.
1395 *
1396 * If the zonelist cache is present in the passed in zonelist, then
1397 * returns a pointer to the allowed node mask (either the current
1398 * task's mems_allowed, or node_states[N_HIGH_MEMORY].)
1399 *
1400 * If the zonelist cache is not available for this zonelist, does
1401 * nothing and returns NULL.
1402 *
1403 * If the fullzones BITMAP in the zonelist cache is stale (more than
1404 * a second since last zap'd) then we zap it out (clear its bits.)
1405 *
1406 * We hold off even calling zlc_setup, until after we've checked the
1407 * first zone in the zonelist, on the theory that most allocations will
1408 * be satisfied from that first zone, so best to examine that zone as
1409 * quickly as we can.
1410 */
1411static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1412{
1413	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
1414	nodemask_t *allowednodes;	/* zonelist_cache approximation */
1415
1416	zlc = zonelist->zlcache_ptr;
1417	if (!zlc)
1418		return NULL;
1419
1420	if (time_after(jiffies, zlc->last_full_zap + HZ)) {
1421		bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1422		zlc->last_full_zap = jiffies;
1423	}
1424
1425	allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
1426					&cpuset_current_mems_allowed :
1427					&node_states[N_HIGH_MEMORY];
1428	return allowednodes;
1429}
1430
1431/*
1432 * Given 'z' scanning a zonelist, run a couple of quick checks to see
1433 * if it is worth looking at further for free memory:
1434 *  1) Check that the zone isn't thought to be full (doesn't have its
1435 *     bit set in the zonelist_cache fullzones BITMAP).
1436 *  2) Check that the zone's node (obtained from the zonelist_cache
1437 *     z_to_n[] mapping) is allowed in the passed in allowednodes mask.
1438 * Return true (non-zero) if zone is worth looking at further, or
1439 * else return false (zero) if it is not.
1440 *
1441 * This check -ignores- the distinction between various watermarks,
1442 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ...  If a zone is
1443 * found to be full for any variation of these watermarks, it will
1444 * be considered full for up to one second by all requests, unless
1445 * we are so low on memory on all allowed nodes that we are forced
1446 * into the second scan of the zonelist.
1447 *
1448 * In the second scan we ignore this zonelist cache and exactly
1449 * apply the watermarks to all zones, even if it is slower to do so.
1450 * We are low on memory in the second scan, and should leave no stone
1451 * unturned looking for a free page.
1452 */
1453static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1454						nodemask_t *allowednodes)
1455{
1456	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
1457	int i;				/* index of *z in zonelist zones */
1458	int n;				/* node that zone *z is on */
1459
1460	zlc = zonelist->zlcache_ptr;
1461	if (!zlc)
1462		return 1;
1463
1464	i = z - zonelist->_zonerefs;
1465	n = zlc->z_to_n[i];
1466
1467	/* This zone is worth trying if it is allowed but not full */
1468	return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
1469}
1470
1471/*
1472 * Given 'z' scanning a zonelist, set the corresponding bit in
1473 * zlc->fullzones, so that subsequent attempts to allocate a page
1474 * from that zone don't waste time re-examining it.
1475 */
1476static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1477{
1478	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
1479	int i;				/* index of *z in zonelist zones */
1480
1481	zlc = zonelist->zlcache_ptr;
1482	if (!zlc)
1483		return;
1484
1485	i = z - zonelist->_zonerefs;
1486
1487	set_bit(i, zlc->fullzones);
1488}
1489
1490#else	/* CONFIG_NUMA */
1491
1492static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1493{
1494	return NULL;
1495}
1496
1497static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1498				nodemask_t *allowednodes)
1499{
1500	return 1;
1501}
1502
1503static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1504{
1505}
1506#endif	/* CONFIG_NUMA */
1507
1508/*
1509 * get_page_from_freelist goes through the zonelist trying to allocate
1510 * a page.
1511 */
1512static struct page *
1513get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
1514		struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
1515		struct zone *preferred_zone, int migratetype)
1516{
1517	struct zoneref *z;
1518	struct page *page = NULL;
1519	int classzone_idx;
1520	struct zone *zone;
1521	nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
1522	int zlc_active = 0;		/* set if using zonelist_cache */
1523	int did_zlc_setup = 0;		/* just call zlc_setup() one time */
1524
1525	classzone_idx = zone_idx(preferred_zone);
1526zonelist_scan:
1527	/*
1528	 * Scan zonelist, looking for a zone with enough free.
1529	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1530	 */
1531	for_each_zone_zonelist_nodemask(zone, z, zonelist,
1532						high_zoneidx, nodemask) {
1533		if (NUMA_BUILD && zlc_active &&
1534			!zlc_zone_worth_trying(zonelist, z, allowednodes))
1535				continue;
1536		if ((alloc_flags & ALLOC_CPUSET) &&
1537			!cpuset_zone_allowed_softwall(zone, gfp_mask))
1538				goto try_next_zone;
1539
1540		BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
1541		if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
1542			unsigned long mark;
1543			int ret;
1544
1545			mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
1546			if (zone_watermark_ok(zone, order, mark,
1547				    classzone_idx, alloc_flags))
1548				goto try_this_zone;
1549
1550			if (zone_reclaim_mode == 0)
1551				goto this_zone_full;
1552
1553			ret = zone_reclaim(zone, gfp_mask, order);
1554			switch (ret) {
1555			case ZONE_RECLAIM_NOSCAN:
1556				/* did not scan */
1557				goto try_next_zone;
1558			case ZONE_RECLAIM_FULL:
1559				/* scanned but unreclaimable */
1560				goto this_zone_full;
1561			default:
1562				/* did we reclaim enough */
1563				if (!zone_watermark_ok(zone, order, mark,
1564						classzone_idx, alloc_flags))
1565					goto this_zone_full;
1566			}
1567		}
1568
1569try_this_zone:
1570		page = buffered_rmqueue(preferred_zone, zone, order,
1571						gfp_mask, migratetype);
1572		if (page)
1573			break;
1574this_zone_full:
1575		if (NUMA_BUILD)
1576			zlc_mark_zone_full(zonelist, z);
1577try_next_zone:
1578		if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
1579			/*
1580			 * we do zlc_setup after the first zone is tried but only
1581			 * if there are multiple nodes to make it worthwhile
1582			 */
1583			allowednodes = zlc_setup(zonelist, alloc_flags);
1584			zlc_active = 1;
1585			did_zlc_setup = 1;
1586		}
1587	}
1588
1589	if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
1590		/* Disable zlc cache for second zonelist scan */
1591		zlc_active = 0;
1592		goto zonelist_scan;
1593	}
1594	return page;
1595}
1596
1597static inline int
1598should_alloc_retry(gfp_t gfp_mask, unsigned int order,
1599				unsigned long pages_reclaimed)
1600{
1601	/* Do not loop if specifically requested */
1602	if (gfp_mask & __GFP_NORETRY)
1603		return 0;
1604
1605	/*
1606	 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
1607	 * means __GFP_NOFAIL, but that may not be true in other
1608	 * implementations.
1609	 */
1610	if (order <= PAGE_ALLOC_COSTLY_ORDER)
1611		return 1;
1612
1613	/*
1614	 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
1615	 * specified, then we retry until we no longer reclaim any pages
1616	 * (above), or we've reclaimed an order of pages at least as
1617	 * large as the allocation's order. In both cases, if the
1618	 * allocation still fails, we stop retrying.
1619	 */
1620	if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
1621		return 1;
1622
1623	/*
1624	 * Don't let big-order allocations loop unless the caller
1625	 * explicitly requests that.
1626	 */
1627	if (gfp_mask & __GFP_NOFAIL)
1628		return 1;
1629
1630	return 0;
1631}
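/*
 * Example (assuming the usual PAGE_ALLOC_COSTLY_ORDER of 3, so the
 * unconditional retry above does not apply): an order-4 request with
 * __GFP_REPEAT keeps being retried only while pages_reclaimed < 16
 * (1 << 4); after that the function returns 0 and the slow path gives
 * up, unless __GFP_NOFAIL is also set.
 */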
1632
1633static inline struct page *
1634__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
1635	struct zonelist *zonelist, enum zone_type high_zoneidx,
1636	nodemask_t *nodemask, struct zone *preferred_zone,
1637	int migratetype)
1638{
1639	struct page *page;
1640
1641	/* Acquire the OOM killer lock for the zones in zonelist */
1642	if (!try_set_zone_oom(zonelist, gfp_mask)) {
1643		schedule_timeout_uninterruptible(1);
1644		return NULL;
1645	}
1646
1647	/*
1648	 * Go through the zonelist yet one more time, keep very high watermark
1649	 * here, this is only to catch a parallel oom killing, we must fail if
1650	 * we're still under heavy pressure.
1651	 */
1652	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
1653		order, zonelist, high_zoneidx,
1654		ALLOC_WMARK_HIGH|ALLOC_CPUSET,
1655		preferred_zone, migratetype);
1656	if (page)
1657		goto out;
1658
1659	if (!(gfp_mask & __GFP_NOFAIL)) {
1660		/* The OOM killer will not help higher order allocs */
1661		if (order > PAGE_ALLOC_COSTLY_ORDER)
1662			goto out;
1663		/*
1664		 * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
1665		 * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
1666		 * The caller should handle page allocation failure by itself if
1667		 * it specifies __GFP_THISNODE.
1668		 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER.
1669		 */
1670		if (gfp_mask & __GFP_THISNODE)
1671			goto out;
1672	}
1673	/* Exhausted what can be done so it's blamo time */
1674	out_of_memory(zonelist, gfp_mask, order, nodemask);
1675
1676out:
1677	clear_zonelist_oom(zonelist, gfp_mask);
1678	return page;
1679}
1680
1681/* The really slow allocator path where we enter direct reclaim */
1682static inline struct page *
1683__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
1684	struct zonelist *zonelist, enum zone_type high_zoneidx,
1685	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
1686	int migratetype, unsigned long *did_some_progress)
1687{
1688	struct page *page = NULL;
1689	struct reclaim_state reclaim_state;
1690	struct task_struct *p = current;
1691
1692	cond_resched();
1693
1694	/* We now go into synchronous reclaim */
1695	cpuset_memory_pressure_bump();
1696	p->flags |= PF_MEMALLOC;
1697	lockdep_set_current_reclaim_state(gfp_mask);
1698	reclaim_state.reclaimed_slab = 0;
1699	p->reclaim_state = &reclaim_state;
1700
1701	*did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
1702
1703	p->reclaim_state = NULL;
1704	lockdep_clear_current_reclaim_state();
1705	p->flags &= ~PF_MEMALLOC;
1706
1707	cond_resched();
1708
1709	if (order != 0)
1710		drain_all_pages();
1711
1712	if (likely(*did_some_progress))
1713		page = get_page_from_freelist(gfp_mask, nodemask, order,
1714					zonelist, high_zoneidx,
1715					alloc_flags, preferred_zone,
1716					migratetype);
1717	return page;
1718}
1719
1720/*
1721 * This is called in the allocator slow-path if the allocation request is of
1722 * sufficient urgency to ignore watermarks and take other desperate measures
1723 */
1724static inline struct page *
1725__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
1726	struct zonelist *zonelist, enum zone_type high_zoneidx,
1727	nodemask_t *nodemask, struct zone *preferred_zone,
1728	int migratetype)
1729{
1730	struct page *page;
1731
1732	do {
1733		page = get_page_from_freelist(gfp_mask, nodemask, order,
1734			zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
1735			preferred_zone, migratetype);
1736
1737		if (!page && gfp_mask & __GFP_NOFAIL)
1738			congestion_wait(BLK_RW_ASYNC, HZ/50);
1739	} while (!page && (gfp_mask & __GFP_NOFAIL));
1740
1741	return page;
1742}
1743
1744static inline
1745void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
1746						enum zone_type high_zoneidx)
1747{
1748	struct zoneref *z;
1749	struct zone *zone;
1750
1751	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
1752		wakeup_kswapd(zone, order);
1753}
1754
1755static inline int
1756gfp_to_alloc_flags(gfp_t gfp_mask)
1757{
1758	struct task_struct *p = current;
1759	int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
1760	const gfp_t wait = gfp_mask & __GFP_WAIT;
1761
1762	/* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
1763	BUILD_BUG_ON(__GFP_HIGH != ALLOC_HIGH);
1764
1765	/*
1766	 * The caller may dip into page reserves a bit more if the caller
1767	 * cannot run direct reclaim, or if the caller has realtime scheduling
1768	 * policy or is asking for __GFP_HIGH memory.  GFP_ATOMIC requests will
1769	 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
1770	 */
1771	alloc_flags |= (gfp_mask & __GFP_HIGH);
1772
1773	if (!wait) {
1774		alloc_flags |= ALLOC_HARDER;
1775		/*
1776		 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
1777		 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1778		 */
1779		alloc_flags &= ~ALLOC_CPUSET;
1780	} else if (unlikely(rt_task(p)) && !in_interrupt())
1781		alloc_flags |= ALLOC_HARDER;
1782
1783	if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
1784		if (!in_interrupt() &&
1785		    ((p->flags & PF_MEMALLOC) ||
1786		     unlikely(test_thread_flag(TIF_MEMDIE))))
1787			alloc_flags |= ALLOC_NO_WATERMARKS;
1788	}
1789
1790	return alloc_flags;
1791}
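/*
 * Example: an atomic allocation (__GFP_HIGH set, __GFP_WAIT clear) comes
 * out of gfp_to_alloc_flags() as ALLOC_WMARK_MIN | ALLOC_HIGH |
 * ALLOC_HARDER with ALLOC_CPUSET cleared, matching the GFP_ATOMIC note
 * in the comment above, while a plain GFP_KERNEL request from a normal
 * task (no PF_MEMALLOC, not being OOM killed) keeps just
 * ALLOC_WMARK_MIN | ALLOC_CPUSET.
 */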
1792
1793static inline struct page *
1794__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
1795	struct zonelist *zonelist, enum zone_type high_zoneidx,
1796	nodemask_t *nodemask, struct zone *preferred_zone,
1797	int migratetype)
1798{
1799	const gfp_t wait = gfp_mask & __GFP_WAIT;
1800	struct page *page = NULL;
1801	int alloc_flags;
1802	unsigned long pages_reclaimed = 0;
1803	unsigned long did_some_progress;
1804	struct task_struct *p = current;
1805
1806	/*
1807	 * In the slowpath, we sanity check order to avoid ever trying to
1808	 * reclaim >= MAX_ORDER areas which will never succeed. Callers may
1809	 * be using allocators in order of preference for an area that is
1810	 * too large.
1811	 */
1812	if (order >= MAX_ORDER) {
1813		WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
1814		return NULL;
1815	}
1816
1817	/*
1818	 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
1819	 * __GFP_NOWARN set) should not cause reclaim since the subsystem
1820 * (e.g. slab) using GFP_THISNODE may choose to trigger reclaim
1821	 * using a larger set of nodes after it has established that the
1822 * allowed per-node queues are empty and that nodes are
1823 * overallocated.
1824	 */
1825	if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
1826		goto nopage;
1827
1828restart:
1829	wake_all_kswapd(order, zonelist, high_zoneidx);
1830
1831	/*
1832	 * OK, we're below the kswapd watermark and have kicked background
1833	 * reclaim. Now things get more complex, so set up alloc_flags according
1834	 * to how we want to proceed.
1835	 */
1836	alloc_flags = gfp_to_alloc_flags(gfp_mask);
1837
1838	/* This is the last chance, in general, before the goto nopage. */
1839	page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
1840			high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
1841			preferred_zone, migratetype);
1842	if (page)
1843		goto got_pg;
1844
1845rebalance:
1846	/* Allocate without watermarks if the context allows */
1847	if (alloc_flags & ALLOC_NO_WATERMARKS) {
1848		page = __alloc_pages_high_priority(gfp_mask, order,
1849				zonelist, high_zoneidx, nodemask,
1850				preferred_zone, migratetype);
1851		if (page)
1852			goto got_pg;
1853	}
1854
1855	/* Atomic allocations - we can't balance anything */
1856	if (!wait)
1857		goto nopage;
1858
1859	/* Avoid recursion of direct reclaim */
1860	if (p->flags & PF_MEMALLOC)
1861		goto nopage;
1862
1863	/* Avoid allocations with no watermarks from looping endlessly */
1864	if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
1865		goto nopage;
1866
1867	/* Try direct reclaim and then allocating */
1868	page = __alloc_pages_direct_reclaim(gfp_mask, order,
1869					zonelist, high_zoneidx,
1870					nodemask,
1871					alloc_flags, preferred_zone,
1872					migratetype, &did_some_progress);
1873	if (page)
1874		goto got_pg;
1875
1876	/*
1877	 * If we failed to make any progress reclaiming, then we are
1878	 * running out of options and have to consider going OOM
1879	 */
1880	if (!did_some_progress) {
1881		if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
1882			if (oom_killer_disabled)
1883				goto nopage;
1884			page = __alloc_pages_may_oom(gfp_mask, order,
1885					zonelist, high_zoneidx,
1886					nodemask, preferred_zone,
1887					migratetype);
1888			if (page)
1889				goto got_pg;
1890
1891			/*
1892			 * The OOM killer does not trigger for high-order
1893			 * allocations without __GFP_NOFAIL, so if no progress is
1894			 * being made, there are no other options and retrying is
1895			 * unlikely to help.
1896			 */
1897			if (order > PAGE_ALLOC_COSTLY_ORDER &&
1898						!(gfp_mask & __GFP_NOFAIL))
1899				goto nopage;
1900
1901			goto restart;
1902		}
1903	}
1904
1905	/* Check if we should retry the allocation */
1906	pages_reclaimed += did_some_progress;
1907	if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
1908		/* Wait for some write requests to complete then retry */
1909		congestion_wait(BLK_RW_ASYNC, HZ/50);
1910		goto rebalance;
1911	}
1912
1913nopage:
1914	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
1915		printk(KERN_WARNING "%s: page allocation failure."
1916			" order:%d, mode:0x%x\n",
1917			p->comm, order, gfp_mask);
1918		dump_stack();
1919		show_mem();
1920	}
1921	return page;
1922got_pg:
1923	if (kmemcheck_enabled)
1924		kmemcheck_pagealloc_alloc(page, order, gfp_mask);
1925	return page;
1926
1927}
1928
1929/*
1930 * This is the 'heart' of the zoned buddy allocator.
1931 */
1932struct page *
1933__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
1934			struct zonelist *zonelist, nodemask_t *nodemask)
1935{
1936	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
1937	struct zone *preferred_zone;
1938	struct page *page;
1939	int migratetype = allocflags_to_migratetype(gfp_mask);
1940
1941	gfp_mask &= gfp_allowed_mask;
1942
1943	lockdep_trace_alloc(gfp_mask);
1944
1945	might_sleep_if(gfp_mask & __GFP_WAIT);
1946
1947	if (should_fail_alloc_page(gfp_mask, order))
1948		return NULL;
1949
1950	/*
1951	 * Check the zones suitable for the gfp_mask contain at least one
1952	 * valid zone. It's possible to have an empty zonelist as a result
1953	 * of GFP_THISNODE and a memoryless node
1954	 */
1955	if (unlikely(!zonelist->_zonerefs->zone))
1956		return NULL;
1957
1958	/* The preferred zone is used for statistics later */
1959	first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone);
1960	if (!preferred_zone)
1961		return NULL;
1962
1963	/* First allocation attempt */
1964	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
1965			zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,
1966			preferred_zone, migratetype);
1967	if (unlikely(!page))
1968		page = __alloc_pages_slowpath(gfp_mask, order,
1969				zonelist, high_zoneidx, nodemask,
1970				preferred_zone, migratetype);
1971
1972	trace_mm_page_alloc(page, order, gfp_mask, migratetype);
1973	return page;
1974}
1975EXPORT_SYMBOL(__alloc_pages_nodemask);
1976
1977/*
1978 * Common helper functions.
1979 */
1980unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
1981{
1982	struct page *page;
1983
1984	/*
1985	 * __get_free_pages() returns a directly mapped kernel virtual address,
1986	 * which cannot represent a highmem page
1987	 */
1988	VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
1989
1990	page = alloc_pages(gfp_mask, order);
1991	if (!page)
1992		return 0;
1993	return (unsigned long) page_address(page);
1994}
1995EXPORT_SYMBOL(__get_free_pages);
1996
1997unsigned long get_zeroed_page(gfp_t gfp_mask)
1998{
1999	return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
2000}
2001EXPORT_SYMBOL(get_zeroed_page);
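
/*
 * Illustrative sketch, not part of the build: the usual pairing of these
 * helpers.  A hypothetical caller grabs two zeroed, physically contiguous
 * pages and hands them back with free_pages() using the same order.
 */
#if 0
static int free_pages_usage_example(void)
{
	unsigned long buf = __get_free_pages(GFP_KERNEL | __GFP_ZERO, 1);

	if (!buf)
		return -ENOMEM;
	/* ... use the 2 * PAGE_SIZE buffer at 'buf' ... */
	free_pages(buf, 1);
	return 0;
}
#endif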
2002
2003void __pagevec_free(struct pagevec *pvec)
2004{
2005	int i = pagevec_count(pvec);
2006
2007	while (--i >= 0) {
2008		trace_mm_pagevec_free(pvec->pages[i], pvec->cold);
2009		free_hot_cold_page(pvec->pages[i], pvec->cold);
2010	}
2011}
2012
2013void __free_pages(struct page *page, unsigned int order)
2014{
2015	if (put_page_testzero(page)) {
2016		trace_mm_page_free_direct(page, order);
2017		if (order == 0)
2018			free_hot_page(page);
2019		else
2020			__free_pages_ok(page, order);
2021	}
2022}
2023
2024EXPORT_SYMBOL(__free_pages);
2025
2026void free_pages(unsigned long addr, unsigned int order)
2027{
2028	if (addr != 0) {
2029		VM_BUG_ON(!virt_addr_valid((void *)addr));
2030		__free_pages(virt_to_page((void *)addr), order);
2031	}
2032}
2033
2034EXPORT_SYMBOL(free_pages);
2035
2036/**
2037 * alloc_pages_exact - allocate an exact number of physically-contiguous pages.
2038 * @size: the number of bytes to allocate
2039 * @gfp_mask: GFP flags for the allocation
2040 *
2041 * This function is similar to alloc_pages(), except that it allocates the
2042 * minimum number of pages to satisfy the request.  alloc_pages() can only
2043 * allocate memory in power-of-two pages.
2044 *
2045 * This function is also limited by MAX_ORDER.
2046 *
2047 * Memory allocated by this function must be released by free_pages_exact().
2048 */
2049void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
2050{
2051	unsigned int order = get_order(size);
2052	unsigned long addr;
2053
2054	addr = __get_free_pages(gfp_mask, order);
2055	if (addr) {
2056		unsigned long alloc_end = addr + (PAGE_SIZE << order);
2057		unsigned long used = addr + PAGE_ALIGN(size);
2058
2059		split_page(virt_to_page((void *)addr), order);
2060		while (used < alloc_end) {
2061			free_page(used);
2062			used += PAGE_SIZE;
2063		}
2064	}
2065
2066	return (void *)addr;
2067}
2068EXPORT_SYMBOL(alloc_pages_exact);
2069
2070/**
2071 * free_pages_exact - release memory allocated via alloc_pages_exact()
2072 * @virt: the value returned by alloc_pages_exact.
2073 * @size: size of allocation, same value as passed to alloc_pages_exact().
2074 *
2075 * Release the memory allocated by a previous call to alloc_pages_exact.
2076 */
2077void free_pages_exact(void *virt, size_t size)
2078{
2079	unsigned long addr = (unsigned long)virt;
2080	unsigned long end = addr + PAGE_ALIGN(size);
2081
2082	while (addr < end) {
2083		free_page(addr);
2084		addr += PAGE_SIZE;
2085	}
2086}
2087EXPORT_SYMBOL(free_pages_exact);
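
/*
 * Illustrative sketch, not part of the build: a hypothetical caller that
 * needs a physically contiguous buffer which is not a power-of-two number
 * of pages.  For three pages, alloc_pages_exact() allocates an order-2
 * block internally and gives the fourth page straight back, so only the
 * requested three pages stay allocated.
 */
#if 0
static int alloc_pages_exact_example(void)
{
	void *buf = alloc_pages_exact(3 * PAGE_SIZE, GFP_KERNEL);

	if (!buf)
		return -ENOMEM;
	/* ... use the three-page buffer ... */
	free_pages_exact(buf, 3 * PAGE_SIZE);
	return 0;
}
#endif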
2088
2089static unsigned int nr_free_zone_pages(int offset)
2090{
2091	struct zoneref *z;
2092	struct zone *zone;
2093
2094	/* Just pick one node, since fallback list is circular */
2095	unsigned int sum = 0;
2096
2097	struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
2098
2099	for_each_zone_zonelist(zone, z, zonelist, offset) {
2100		unsigned long size = zone->present_pages;
2101		unsigned long high = high_wmark_pages(zone);
2102		if (size > high)
2103			sum += size - high;
2104	}
2105
2106	return sum;
2107}
2108
2109/*
2110 * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
2111 */
2112unsigned int nr_free_buffer_pages(void)
2113{
2114	return nr_free_zone_pages(gfp_zone(GFP_USER));
2115}
2116EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
2117
2118/*
2119 * Amount of free RAM allocatable within all zones
2120 */
2121unsigned int nr_free_pagecache_pages(void)
2122{
2123	return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
2124}
2125
2126static inline void show_node(struct zone *zone)
2127{
2128	if (NUMA_BUILD)
2129		printk("Node %d ", zone_to_nid(zone));
2130}
2131
2132void si_meminfo(struct sysinfo *val)
2133{
2134	val->totalram = totalram_pages;
2135	val->sharedram = 0;
2136	val->freeram = global_page_state(NR_FREE_PAGES);
2137	val->bufferram = nr_blockdev_pages();
2138	val->totalhigh = totalhigh_pages;
2139	val->freehigh = nr_free_highpages();
2140	val->mem_unit = PAGE_SIZE;
2141}
2142
2143EXPORT_SYMBOL(si_meminfo);
2144
2145#ifdef CONFIG_NUMA
2146void si_meminfo_node(struct sysinfo *val, int nid)
2147{
2148	pg_data_t *pgdat = NODE_DATA(nid);
2149
2150	val->totalram = pgdat->node_present_pages;
2151	val->freeram = node_page_state(nid, NR_FREE_PAGES);
2152#ifdef CONFIG_HIGHMEM
2153	val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
2154	val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
2155			NR_FREE_PAGES);
2156#else
2157	val->totalhigh = 0;
2158	val->freehigh = 0;
2159#endif
2160	val->mem_unit = PAGE_SIZE;
2161}
2162#endif
2163
2164#define K(x) ((x) << (PAGE_SHIFT-10))
2165
2166/*
2167 * Show free area list (used inside shift_scroll-lock stuff)
2168 * We also calculate the percentage fragmentation. We do this by counting the
2169 * memory on each free list with the exception of the first item on the list.
2170 */
2171void show_free_areas(void)
2172{
2173	int cpu;
2174	struct zone *zone;
2175
2176	for_each_populated_zone(zone) {
2177		show_node(zone);
2178		printk("%s per-cpu:\n", zone->name);
2179
2180		for_each_online_cpu(cpu) {
2181			struct per_cpu_pageset *pageset;
2182
2183			pageset = zone_pcp(zone, cpu);
2184
2185			printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
2186			       cpu, pageset->pcp.high,
2187			       pageset->pcp.batch, pageset->pcp.count);
2188		}
2189	}
2190
2191	printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
2192		" active_file:%lu inactive_file:%lu isolated_file:%lu\n"
2193		" unevictable:%lu"
2194		" dirty:%lu writeback:%lu unstable:%lu\n"
2195		" free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
2196		" mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n",
2197		global_page_state(NR_ACTIVE_ANON),
2198		global_page_state(NR_INACTIVE_ANON),
2199		global_page_state(NR_ISOLATED_ANON),
2200		global_page_state(NR_ACTIVE_FILE),
2201		global_page_state(NR_INACTIVE_FILE),
2202		global_page_state(NR_ISOLATED_FILE),
2203		global_page_state(NR_UNEVICTABLE),
2204		global_page_state(NR_FILE_DIRTY),
2205		global_page_state(NR_WRITEBACK),
2206		global_page_state(NR_UNSTABLE_NFS),
2207		global_page_state(NR_FREE_PAGES),
2208		global_page_state(NR_SLAB_RECLAIMABLE),
2209		global_page_state(NR_SLAB_UNRECLAIMABLE),
2210		global_page_state(NR_FILE_MAPPED),
2211		global_page_state(NR_SHMEM),
2212		global_page_state(NR_PAGETABLE),
2213		global_page_state(NR_BOUNCE));
2214
2215	for_each_populated_zone(zone) {
2216		int i;
2217
2218		show_node(zone);
2219		printk("%s"
2220			" free:%lukB"
2221			" min:%lukB"
2222			" low:%lukB"
2223			" high:%lukB"
2224			" active_anon:%lukB"
2225			" inactive_anon:%lukB"
2226			" active_file:%lukB"
2227			" inactive_file:%lukB"
2228			" unevictable:%lukB"
2229			" isolated(anon):%lukB"
2230			" isolated(file):%lukB"
2231			" present:%lukB"
2232			" mlocked:%lukB"
2233			" dirty:%lukB"
2234			" writeback:%lukB"
2235			" mapped:%lukB"
2236			" shmem:%lukB"
2237			" slab_reclaimable:%lukB"
2238			" slab_unreclaimable:%lukB"
2239			" kernel_stack:%lukB"
2240			" pagetables:%lukB"
2241			" unstable:%lukB"
2242			" bounce:%lukB"
2243			" writeback_tmp:%lukB"
2244			" pages_scanned:%lu"
2245			" all_unreclaimable? %s"
2246			"\n",
2247			zone->name,
2248			K(zone_page_state(zone, NR_FREE_PAGES)),
2249			K(min_wmark_pages(zone)),
2250			K(low_wmark_pages(zone)),
2251			K(high_wmark_pages(zone)),
2252			K(zone_page_state(zone, NR_ACTIVE_ANON)),
2253			K(zone_page_state(zone, NR_INACTIVE_ANON)),
2254			K(zone_page_state(zone, NR_ACTIVE_FILE)),
2255			K(zone_page_state(zone, NR_INACTIVE_FILE)),
2256			K(zone_page_state(zone, NR_UNEVICTABLE)),
2257			K(zone_page_state(zone, NR_ISOLATED_ANON)),
2258			K(zone_page_state(zone, NR_ISOLATED_FILE)),
2259			K(zone->present_pages),
2260			K(zone_page_state(zone, NR_MLOCK)),
2261			K(zone_page_state(zone, NR_FILE_DIRTY)),
2262			K(zone_page_state(zone, NR_WRITEBACK)),
2263			K(zone_page_state(zone, NR_FILE_MAPPED)),
2264			K(zone_page_state(zone, NR_SHMEM)),
2265			K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
2266			K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
2267			zone_page_state(zone, NR_KERNEL_STACK) *
2268				THREAD_SIZE / 1024,
2269			K(zone_page_state(zone, NR_PAGETABLE)),
2270			K(zone_page_state(zone, NR_UNSTABLE_NFS)),
2271			K(zone_page_state(zone, NR_BOUNCE)),
2272			K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
2273			zone->pages_scanned,
2274			(zone_is_all_unreclaimable(zone) ? "yes" : "no")
2275			);
2276		printk("lowmem_reserve[]:");
2277		for (i = 0; i < MAX_NR_ZONES; i++)
2278			printk(" %lu", zone->lowmem_reserve[i]);
2279		printk("\n");
2280	}
2281
2282	for_each_populated_zone(zone) {
2283		unsigned long nr[MAX_ORDER], flags, order, total = 0;
2284
2285		show_node(zone);
2286		printk("%s: ", zone->name);
2287
2288		spin_lock_irqsave(&zone->lock, flags);
2289		for (order = 0; order < MAX_ORDER; order++) {
2290			nr[order] = zone->free_area[order].nr_free;
2291			total += nr[order] << order;
2292		}
2293		spin_unlock_irqrestore(&zone->lock, flags);
2294		for (order = 0; order < MAX_ORDER; order++)
2295			printk("%lu*%lukB ", nr[order], K(1UL) << order);
2296		printk("= %lukB\n", K(total));
2297	}
2298
2299	printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES));
2300
2301	show_swap_cache_info();
2302}
2303
2304static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
2305{
2306	zoneref->zone = zone;
2307	zoneref->zone_idx = zone_idx(zone);
2308}
2309
2310/*
2311 * Builds allocation fallback zone lists.
2312 *
2313 * Add all populated zones of a node to the zonelist.
2314 */
2315static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
2316				int nr_zones, enum zone_type zone_type)
2317{
2318	struct zone *zone;
2319
2320	BUG_ON(zone_type >= MAX_NR_ZONES);
2321	zone_type++;
2322
2323	do {
2324		zone_type--;
2325		zone = pgdat->node_zones + zone_type;
2326		if (populated_zone(zone)) {
2327			zoneref_set_zone(zone,
2328				&zonelist->_zonerefs[nr_zones++]);
2329			check_highest_zone(zone_type);
2330		}
2331
2332	} while (zone_type);
2333	return nr_zones;
2334}
2335
2336
2337/*
2338 *  zonelist_order:
2339 *  0 = automatic detection of better ordering.
2340 *  1 = order by ([node] distance, -zonetype)
2341 *  2 = order by (-zonetype, [node] distance)
2342 *
2343 *  If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
2344 *  the same zonelist. So only NUMA can configure this param.
2345 */
2346#define ZONELIST_ORDER_DEFAULT  0
2347#define ZONELIST_ORDER_NODE     1
2348#define ZONELIST_ORDER_ZONE     2
2349
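/*
 * Illustrative sketch, not part of the build: for a hypothetical two-node
 * machine where node 0 has Normal and DMA zones and node 1 has only a
 * Normal zone, node 0's fallback zonelist would come out roughly as
 * follows under the two orderings.
 */
#if 0
static const char *example_node_order[] = {	/* ZONELIST_ORDER_NODE */
	"Node0/Normal", "Node0/DMA", "Node1/Normal",
};
static const char *example_zone_order[] = {	/* ZONELIST_ORDER_ZONE */
	"Node0/Normal", "Node1/Normal", "Node0/DMA",
};
#endif
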
2350/* zonelist order in the kernel.
2351 * set_zonelist_order() will set this to NODE or ZONE.
2352 */
2353static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
2354static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};
2355
2356
2357#ifdef CONFIG_NUMA
2358/* The value the user specified, set via boot option or sysctl */
2359static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
2360/* string for sysctl */
2361#define NUMA_ZONELIST_ORDER_LEN	16
2362char numa_zonelist_order[NUMA_ZONELIST_ORDER_LEN] = "default";
2363
2364/*
2365 * Interface to configure zonelist ordering.
2366 * Command line option "numa_zonelist_order"
2367 *	= "[dD]efault"	- default, automatic configuration
2368 *	= "[nN]ode"	- order by node locality, then by zone within node
2369 *	= "[zZ]one"	- order by zone, then by locality within zone
2370 */
2371
2372static int __parse_numa_zonelist_order(char *s)
2373{
2374	if (*s == 'd' || *s == 'D') {
2375		user_zonelist_order = ZONELIST_ORDER_DEFAULT;
2376	} else if (*s == 'n' || *s == 'N') {
2377		user_zonelist_order = ZONELIST_ORDER_NODE;
2378	} else if (*s == 'z' || *s == 'Z') {
2379		user_zonelist_order = ZONELIST_ORDER_ZONE;
2380	} else {
2381		printk(KERN_WARNING
2382			"Ignoring invalid numa_zonelist_order value:  "
2383			"%s\n", s);
2384		return -EINVAL;
2385	}
2386	return 0;
2387}
2388
2389static __init int setup_numa_zonelist_order(char *s)
2390{
2391	if (s)
2392		return __parse_numa_zonelist_order(s);
2393	return 0;
2394}
2395early_param("numa_zonelist_order", setup_numa_zonelist_order);
2396
2397/*
2398 * sysctl handler for numa_zonelist_order
2399 */
2400int numa_zonelist_order_handler(ctl_table *table, int write,
2401		void __user *buffer, size_t *length,
2402		loff_t *ppos)
2403{
2404	char saved_string[NUMA_ZONELIST_ORDER_LEN];
2405	int ret;
2406	static DEFINE_MUTEX(zl_order_mutex);
2407
2408	mutex_lock(&zl_order_mutex);
2409	if (write)
2410		strcpy(saved_string, (char*)table->data);
2411	ret = proc_dostring(table, write, buffer, length, ppos);
2412	if (ret)
2413		goto out;
2414	if (write) {
2415		int oldval = user_zonelist_order;
2416		if (__parse_numa_zonelist_order((char*)table->data)) {
2417			/*
2418			 * bogus value.  restore saved string
2419			 */
2420			strncpy((char*)table->data, saved_string,
2421				NUMA_ZONELIST_ORDER_LEN);
2422			user_zonelist_order = oldval;
2423		} else if (oldval != user_zonelist_order)
2424			build_all_zonelists();
2425	}
2426out:
2427	mutex_unlock(&zl_order_mutex);
2428	return ret;
2429}
2430
2431
2432#define MAX_NODE_LOAD (nr_online_nodes)
2433static int node_load[MAX_NUMNODES];
2434
2435/**
2436 * find_next_best_node - find the next node that should appear in a given node's fallback list
2437 * @node: node whose fallback list we're appending
2438 * @used_node_mask: nodemask_t of already used nodes
2439 *
2440 * We use a number of factors to determine which is the next node that should
2441 * appear on a given node's fallback list.  The node should not have appeared
2442 * already in @node's fallback list, and it should be the next closest node
2443 * according to the distance array (which contains arbitrary distance values
2444 * from each node to each node in the system); we should also prefer nodes
2445 * with no CPUs, since presumably they'll have very little allocation pressure
2446 * on them otherwise.
2447 * It returns -1 if no node is found.
2448 */
2449static int find_next_best_node(int node, nodemask_t *used_node_mask)
2450{
2451	int n, val;
2452	int min_val = INT_MAX;
2453	int best_node = -1;
2454	const struct cpumask *tmp = cpumask_of_node(0);
2455
2456	/* Use the local node if we haven't already */
2457	if (!node_isset(node, *used_node_mask)) {
2458		node_set(node, *used_node_mask);
2459		return node;
2460	}
2461
2462	for_each_node_state(n, N_HIGH_MEMORY) {
2463
2464		/* Don't want a node to appear more than once */
2465		if (node_isset(n, *used_node_mask))
2466			continue;
2467
2468		/* Use the distance array to find the distance */
2469		val = node_distance(node, n);
2470
2471		/* Penalize nodes under us ("prefer the next node") */
2472		val += (n < node);
2473
2474		/* Give preference to headless and unused nodes */
2475		tmp = cpumask_of_node(n);
2476		if (!cpumask_empty(tmp))
2477			val += PENALTY_FOR_NODE_WITH_CPUS;
2478
2479		/* Slight preference for less loaded node */
2480		val *= (MAX_NODE_LOAD*MAX_NUMNODES);
2481		val += node_load[n];
2482
2483		if (val < min_val) {
2484			min_val = val;
2485			best_node = n;
2486		}
2487	}
2488
2489	if (best_node >= 0)
2490		node_set(best_node, *used_node_mask);
2491
2492	return best_node;
2493}
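
/*
 * Illustrative worked example: starting from node 0 with nodes 1 and 2
 * still unused at distances 20 and 40 and with equal node_load, node 1
 * scores lower because the distance is multiplied by the large scaling
 * factor before the load is added, so it is picked first.  A headless
 * node at the same distance as a node with CPUs scores lower still,
 * since PENALTY_FOR_NODE_WITH_CPUS is only added for the latter.
 */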
2494
2495
2496/*
2497 * Build zonelists ordered by node and zones within node.
2498 * This results in maximum locality--normal zone overflows into local
2499 * DMA zone, if any--but risks exhausting DMA zone.
2500 */
2501static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
2502{
2503	int j;
2504	struct zonelist *zonelist;
2505
2506	zonelist = &pgdat->node_zonelists[0];
2507	for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
2508		;
2509	j = build_zonelists_node(NODE_DATA(node), zonelist, j,
2510							MAX_NR_ZONES - 1);
2511	zonelist->_zonerefs[j].zone = NULL;
2512	zonelist->_zonerefs[j].zone_idx = 0;
2513}
2514
2515/*
2516 * Build gfp_thisnode zonelists
2517 */
2518static void build_thisnode_zonelists(pg_data_t *pgdat)
2519{
2520	int j;
2521	struct zonelist *zonelist;
2522
2523	zonelist = &pgdat->node_zonelists[1];
2524	j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
2525	zonelist->_zonerefs[j].zone = NULL;
2526	zonelist->_zonerefs[j].zone_idx = 0;
2527}
2528
2529/*
2530 * Build zonelists ordered by zone and nodes within zones.
2531 * This results in conserving DMA zone[s] until all Normal memory is
2532 * exhausted, but results in overflowing to a remote node while memory
2533 * may still exist in the local DMA zone.
2534 */
2535static int node_order[MAX_NUMNODES];
2536
2537static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
2538{
2539	int pos, j, node;
2540	int zone_type;		/* needs to be signed */
2541	struct zone *z;
2542	struct zonelist *zonelist;
2543
2544	zonelist = &pgdat->node_zonelists[0];
2545	pos = 0;
2546	for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
2547		for (j = 0; j < nr_nodes; j++) {
2548			node = node_order[j];
2549			z = &NODE_DATA(node)->node_zones[zone_type];
2550			if (populated_zone(z)) {
2551				zoneref_set_zone(z,
2552					&zonelist->_zonerefs[pos++]);
2553				check_highest_zone(zone_type);
2554			}
2555		}
2556	}
2557	zonelist->_zonerefs[pos].zone = NULL;
2558	zonelist->_zonerefs[pos].zone_idx = 0;
2559}
2560
2561static int default_zonelist_order(void)
2562{
2563	int nid, zone_type;
2564	unsigned long low_kmem_size,total_size;
2565	struct zone *z;
2566	int average_size;
2567	/*
2568	 * ZONE_DMA and ZONE_DMA32 can be very small areas in the system.
2569	 * If they are really small and used heavily, the system can fall
2570	 * into OOM very easily.
2571	 * This function detects the ZONE_DMA/DMA32 size and configures the zone order.
2572	 */
2573	/* Is there a ZONE_NORMAL? (e.g. ppc has only a DMA zone.) */
2574	low_kmem_size = 0;
2575	total_size = 0;
2576	for_each_online_node(nid) {
2577		for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
2578			z = &NODE_DATA(nid)->node_zones[zone_type];
2579			if (populated_zone(z)) {
2580				if (zone_type < ZONE_NORMAL)
2581					low_kmem_size += z->present_pages;
2582				total_size += z->present_pages;
2583			}
2584		}
2585	}
2586	if (!low_kmem_size ||  /* there is no DMA area. */
2587	    low_kmem_size > total_size/2) /* DMA/DMA32 is big. */
2588		return ZONELIST_ORDER_NODE;
2589	/*
2590	 * Look into each node's config.
2591	 * If there is a node where DMA/DMA32 memory makes up a large share
2592	 * of its local memory, NODE_ORDER may be suitable.
2593	 */
2594	average_size = total_size /
2595				(nodes_weight(node_states[N_HIGH_MEMORY]) + 1);
2596	for_each_online_node(nid) {
2597		low_kmem_size = 0;
2598		total_size = 0;
2599		for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
2600			z = &NODE_DATA(nid)->node_zones[zone_type];
2601			if (populated_zone(z)) {
2602				if (zone_type < ZONE_NORMAL)
2603					low_kmem_size += z->present_pages;
2604				total_size += z->present_pages;
2605			}
2606		}
2607		if (low_kmem_size &&
2608		    total_size > average_size && /* ignore small node */
2609		    low_kmem_size > total_size * 70/100)
2610			return ZONELIST_ORDER_NODE;
2611	}
2612	return ZONELIST_ORDER_ZONE;
2613}
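
/*
 * Illustrative worked example: on a hypothetical box with 512MB of
 * ZONE_DMA32 out of 768MB total, low_kmem_size exceeds total_size/2 and
 * the first check above already selects ZONELIST_ORDER_NODE.  A box with
 * only a 16MB ZONE_DMA out of 4GB falls through to the per-node check
 * and normally ends up with ZONELIST_ORDER_ZONE.
 */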
2614
2615static void set_zonelist_order(void)
2616{
2617	if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
2618		current_zonelist_order = default_zonelist_order();
2619	else
2620		current_zonelist_order = user_zonelist_order;
2621}
2622
2623static void build_zonelists(pg_data_t *pgdat)
2624{
2625	int j, node, load;
2626	enum zone_type i;
2627	nodemask_t used_mask;
2628	int local_node, prev_node;
2629	struct zonelist *zonelist;
2630	int order = current_zonelist_order;
2631
2632	/* initialize zonelists */
2633	for (i = 0; i < MAX_ZONELISTS; i++) {
2634		zonelist = pgdat->node_zonelists + i;
2635		zonelist->_zonerefs[0].zone = NULL;
2636		zonelist->_zonerefs[0].zone_idx = 0;
2637	}
2638
2639	/* NUMA-aware ordering of nodes */
2640	local_node = pgdat->node_id;
2641	load = nr_online_nodes;
2642	prev_node = local_node;
2643	nodes_clear(used_mask);
2644
2645	memset(node_order, 0, sizeof(node_order));
2646	j = 0;
2647
2648	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
2649		int distance = node_distance(local_node, node);
2650
2651		/*
2652		 * If another node is sufficiently far away then it is better
2653		 * to reclaim pages in a zone before going off node.
2654		 */
2655		if (distance > RECLAIM_DISTANCE)
2656			zone_reclaim_mode = 1;
2657
2658		/*
2659		 * We don't want to pressure a particular node.
2660		 * So we add a penalty to the first node in the same
2661		 * distance group to make it round-robin.
2662		 */
2663		if (distance != node_distance(local_node, prev_node))
2664			node_load[node] = load;
2665
2666		prev_node = node;
2667		load--;
2668		if (order == ZONELIST_ORDER_NODE)
2669			build_zonelists_in_node_order(pgdat, node);
2670		else
2671			node_order[j++] = node;	/* remember order */
2672	}
2673
2674	if (order == ZONELIST_ORDER_ZONE) {
2675		/* calculate node order -- i.e., DMA last! */
2676		build_zonelists_in_zone_order(pgdat, j);
2677	}
2678
2679	build_thisnode_zonelists(pgdat);
2680}
2681
2682/* Construct the zonelist performance cache - see mmzone.h for details */
2683static void build_zonelist_cache(pg_data_t *pgdat)
2684{
2685	struct zonelist *zonelist;
2686	struct zonelist_cache *zlc;
2687	struct zoneref *z;
2688
2689	zonelist = &pgdat->node_zonelists[0];
2690	zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
2691	bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
2692	for (z = zonelist->_zonerefs; z->zone; z++)
2693		zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
2694}
2695
2696
2697#else	/* CONFIG_NUMA */
2698
2699static void set_zonelist_order(void)
2700{
2701	current_zonelist_order = ZONELIST_ORDER_ZONE;
2702}
2703
2704static void build_zonelists(pg_data_t *pgdat)
2705{
2706	int node, local_node;
2707	enum zone_type j;
2708	struct zonelist *zonelist;
2709
2710	local_node = pgdat->node_id;
2711
2712	zonelist = &pgdat->node_zonelists[0];
2713	j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
2714
2715	/*
2716	 * Now we build the zonelist so that it contains the zones
2717	 * of all the other nodes.
2718	 * We don't want to pressure a particular node, so when
2719	 * building the zones for node N, we make sure that the
2720	 * zones coming right after the local ones are those from
2721	 * node N+1 (wrapping back around to node 0)
2722	 */
2723	for (node = local_node + 1; node < MAX_NUMNODES; node++) {
2724		if (!node_online(node))
2725			continue;
2726		j = build_zonelists_node(NODE_DATA(node), zonelist, j,
2727							MAX_NR_ZONES - 1);
2728	}
2729	for (node = 0; node < local_node; node++) {
2730		if (!node_online(node))
2731			continue;
2732		j = build_zonelists_node(NODE_DATA(node), zonelist, j,
2733							MAX_NR_ZONES - 1);
2734	}
2735
2736	zonelist->_zonerefs[j].zone = NULL;
2737	zonelist->_zonerefs[j].zone_idx = 0;
2738}
2739
2740/* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
2741static void build_zonelist_cache(pg_data_t *pgdat)
2742{
2743	pgdat->node_zonelists[0].zlcache_ptr = NULL;
2744}
2745
2746#endif	/* CONFIG_NUMA */
2747
2748/* The int return value exists only to match the signature stop_machine() expects */
2749static int __build_all_zonelists(void *dummy)
2750{
2751	int nid;
2752
2753#ifdef CONFIG_NUMA
2754	memset(node_load, 0, sizeof(node_load));
2755#endif
2756	for_each_online_node(nid) {
2757		pg_data_t *pgdat = NODE_DATA(nid);
2758
2759		build_zonelists(pgdat);
2760		build_zonelist_cache(pgdat);
2761	}
2762	return 0;
2763}
2764
2765void build_all_zonelists(void)
2766{
2767	set_zonelist_order();
2768
2769	if (system_state == SYSTEM_BOOTING) {
2770		__build_all_zonelists(NULL);
2771		mminit_verify_zonelist();
2772		cpuset_init_current_mems_allowed();
2773	} else {
2774		/* We have to stop all cpus to guarantee there is no user
2775		   of the zonelists */
2776		stop_machine(__build_all_zonelists, NULL, NULL);
2777		/* cpuset refresh routine should be here */
2778	}
2779	vm_total_pages = nr_free_pagecache_pages();
2780	/*
2781	 * Disable grouping by mobility if the number of pages in the
2782	 * system is too low to allow the mechanism to work. It would be
2783	 * more accurate, but expensive to check per-zone. This check is
2784	 * made on memory-hotadd so a system can start with mobility
2785	 * disabled and enable it later
2786	 */
2787	if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
2788		page_group_by_mobility_disabled = 1;
2789	else
2790		page_group_by_mobility_disabled = 0;
2791
2792	printk("Built %i zonelists in %s order, mobility grouping %s.  "
2793		"Total pages: %ld\n",
2794			nr_online_nodes,
2795			zonelist_order_name[current_zonelist_order],
2796			page_group_by_mobility_disabled ? "off" : "on",
2797			vm_total_pages);
2798#ifdef CONFIG_NUMA
2799	printk("Policy zone: %s\n", zone_names[policy_zone]);
2800#endif
2801}
2802
2803/*
2804 * Helper functions to size the waitqueue hash table.
2805 * Essentially these want to choose hash table sizes sufficiently
2806 * large so that collisions trying to wait on pages are rare.
2807 * But in fact, the number of active page waitqueues on typical
2808 * systems is ridiculously low, less than 200. So this is still
2809 * conservative, even though it seems large.
2810 *
2811 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
2812 * waitqueues, i.e. the size of the waitq table given the number of pages.
2813 */
2814#define PAGES_PER_WAITQUEUE	256
2815
2816#ifndef CONFIG_MEMORY_HOTPLUG
2817static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
2818{
2819	unsigned long size = 1;
2820
2821	pages /= PAGES_PER_WAITQUEUE;
2822
2823	while (size < pages)
2824		size <<= 1;
2825
2826	/*
2827	 * Once we have dozens or even hundreds of threads sleeping
2828	 * on IO we've got bigger problems than wait queue collision.
2829	 * Limit the size of the wait table to a reasonable size.
2830	 */
2831	size = min(size, 4096UL);
2832
2833	return max(size, 4UL);
2834}
2835#else
2836/*
2837 * A zone's size might be changed by hot-add, so it is not possible to determine
2838 * a suitable size for its wait_table.  So we use the maximum size now.
2839 *
2840 * The max wait table size = 4096 x sizeof(wait_queue_head_t).   ie:
2841 *
2842 *    i386 (preemption config)    : 4096 x 16 = 64Kbyte.
2843 *    ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
2844 *    ia64, x86-64 (preemption)   : 4096 x 24 = 96Kbyte.
2845 *
2846 * The maximum entries are prepared when a zone's memory is (512K + 256) pages
2847 * or more by the traditional way. (See above).  It equals:
2848 *
2849 *    i386, x86-64, powerpc(4K page size) : =  ( 2G + 1M)byte.
2850 *    ia64(16K page size)                 : =  ( 8G + 4M)byte.
2851 *    powerpc (64K page size)             : =  (32G +16M)byte.
2852 */
2853static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
2854{
2855	return 4096UL;
2856}
2857#endif
2858
2859/*
2860 * This is an integer logarithm so that shifts can be used later
2861 * to extract the more random high bits from the multiplicative
2862 * hash function before the remainder is taken.
2863 */
2864static inline unsigned long wait_table_bits(unsigned long size)
2865{
2866	return ffz(~size);
2867}
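
/*
 * Illustrative worked example: a 1GB zone of 4KB pages (262144 pages)
 * gets 262144 / PAGES_PER_WAITQUEUE = 1024 hashed waitqueues when memory
 * hotplug is off (the hotplug variant simply takes the 4096 maximum up
 * front), and wait_table_bits(1024) is then 10, since 1024 == 1 << 10.
 */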
2868
2869#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
2870
2871/*
2872 * Mark a number of pageblocks as MIGRATE_RESERVE. The number
2873 * of blocks reserved is based on min_wmark_pages(zone). The memory within
2874 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
2875 * higher will lead to a bigger reserve which will get freed as contiguous
2876 * blocks as reclaim kicks in
2877 */
2878static void setup_zone_migrate_reserve(struct zone *zone)
2879{
2880	unsigned long start_pfn, pfn, end_pfn;
2881	struct page *page;
2882	unsigned long block_migratetype;
2883	int reserve;
2884
2885	/* Get the start pfn, end pfn and the number of blocks to reserve */
2886	start_pfn = zone->zone_start_pfn;
2887	end_pfn = start_pfn + zone->spanned_pages;
2888	reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
2889							pageblock_order;
2890
2891	/*
2892	 * Reserve blocks are generally in place to help high-order atomic
2893	 * allocations that are short-lived. A min_free_kbytes value that
2894	 * would result in more than 2 reserve blocks for atomic allocations
2895	 * is assumed to be in place to help anti-fragmentation for the
2896	 * future allocation of hugepages at runtime.
2897	 */
2898	reserve = min(2, reserve);
2899
2900	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
2901		if (!pfn_valid(pfn))
2902			continue;
2903		page = pfn_to_page(pfn);
2904
2905		/* Watch out for overlapping nodes */
2906		if (page_to_nid(page) != zone_to_nid(zone))
2907			continue;
2908
2909		/* Blocks with reserved pages will never be freed, skip them. */
2910		if (PageReserved(page))
2911			continue;
2912
2913		block_migratetype = get_pageblock_migratetype(page);
2914
2915		/* If this block is reserved, account for it */
2916		if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) {
2917			reserve--;
2918			continue;
2919		}
2920
2921		/* Suitable for reserving if this block is movable */
2922		if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) {
2923			set_pageblock_migratetype(page, MIGRATE_RESERVE);
2924			move_freepages_block(zone, page, MIGRATE_RESERVE);
2925			reserve--;
2926			continue;
2927		}
2928
2929		/*
2930		 * If the reserve is met and this is a previously reserved block,
2931		 * take it back
2932		 */
2933		if (block_migratetype == MIGRATE_RESERVE) {
2934			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
2935			move_freepages_block(zone, page, MIGRATE_MOVABLE);
2936		}
2937	}
2938}
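
/*
 * Illustrative worked example (hypothetical configuration): with 2MB
 * pageblocks (512 pages of 4KB) and min_wmark_pages(zone) == 1200, the
 * roundup gives 1536 pages, i.e. 3 pageblocks, which the min(2, reserve)
 * clamp above then caps at 2 MIGRATE_RESERVE blocks for the zone.
 */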
2939
2940/*
2941 * Initially all pages are reserved - free ones are freed
2942 * up by free_all_bootmem() once the early boot process is
2943 * done. Non-atomic initialization, single-pass.
2944 */
2945void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
2946		unsigned long start_pfn, enum memmap_context context)
2947{
2948	struct page *page;
2949	unsigned long end_pfn = start_pfn + size;
2950	unsigned long pfn;
2951	struct zone *z;
2952
2953	if (highest_memmap_pfn < end_pfn - 1)
2954		highest_memmap_pfn = end_pfn - 1;
2955
2956	z = &NODE_DATA(nid)->node_zones[zone];
2957	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
2958		/*
2959		 * There can be holes in boot-time mem_map[]s
2960		 * handed to this function.  They do not
2961		 * exist on hotplugged memory.
2962		 */
2963		if (context == MEMMAP_EARLY) {
2964			if (!early_pfn_valid(pfn))
2965				continue;
2966			if (!early_pfn_in_nid(pfn, nid))
2967				continue;
2968		}
2969		page = pfn_to_page(pfn);
2970		set_page_links(page, zone, nid, pfn);
2971		mminit_verify_page_links(page, zone, nid, pfn);
2972		init_page_count(page);
2973		reset_page_mapcount(page);
2974		SetPageReserved(page);
2975		/*
2976		 * Mark the block movable so that blocks are reserved for
2977		 * movable at startup. This will force kernel allocations
2978		 * to reserve their blocks rather than leaking throughout
2979		 * the address space during boot when many long-lived
2980		 * kernel allocations are made. Later some blocks near
2981		 * the start are marked MIGRATE_RESERVE by
2982		 * setup_zone_migrate_reserve()
2983		 *
2984		 * The bitmap is created for the zone's valid pfn range, but the
2985		 * memmap can be created for invalid pages (for alignment), so
2986		 * check here that we do not call set_pageblock_migratetype()
2987		 * against a pfn outside the zone.
2988		 */
2989		if ((z->zone_start_pfn <= pfn)
2990		    && (pfn < z->zone_start_pfn + z->spanned_pages)
2991		    && !(pfn & (pageblock_nr_pages - 1)))
2992			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
2993
2994		INIT_LIST_HEAD(&page->lru);
2995#ifdef WANT_PAGE_VIRTUAL
2996		/* The shift won't overflow because ZONE_NORMAL is below 4G. */
2997		if (!is_highmem_idx(zone))
2998			set_page_address(page, __va(pfn << PAGE_SHIFT));
2999#endif
3000	}
3001}
3002
3003static void __meminit zone_init_free_lists(struct zone *zone)
3004{
3005	int order, t;
3006	for_each_migratetype_order(order, t) {
3007		INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
3008		zone->free_area[order].nr_free = 0;
3009	}
3010}
3011
3012#ifndef __HAVE_ARCH_MEMMAP_INIT
3013#define memmap_init(size, nid, zone, start_pfn) \
3014	memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
3015#endif
3016
3017static int zone_batchsize(struct zone *zone)
3018{
3019#ifdef CONFIG_MMU
3020	int batch;
3021
3022	/*
3023	 * The per-cpu-pages pools are set to around 1000th of the
3024	 * size of the zone.  But no more than 1/2 of a meg.
3025	 *
3026	 * OK, so we don't know how big the cache is.  So guess.
3027	 */
3028	batch = zone->present_pages / 1024;
3029	if (batch * PAGE_SIZE > 512 * 1024)
3030		batch = (512 * 1024) / PAGE_SIZE;
3031	batch /= 4;		/* We effectively *= 4 below */
3032	if (batch < 1)
3033		batch = 1;
3034
3035	/*
3036	 * Clamp the batch to a 2^n - 1 value. Having a power
3037	 * of 2 value was found to be more likely to have
3038	 * suboptimal cache aliasing properties in some cases.
3039	 *
3040	 * For example if 2 tasks are alternately allocating
3041	 * batches of pages, one task can end up with a lot
3042	 * of pages of one half of the possible page colors
3043	 * and the other with pages of the other colors.
3044	 */
3045	batch = rounddown_pow_of_two(batch + batch/2) - 1;
3046
3047	return batch;
3048
3049#else
3050	/* The deferral and batching of frees should be suppressed under NOMMU
3051	 * conditions.
3052	 *
3053	 * The problem is that NOMMU needs to be able to allocate large chunks
3054	 * of contiguous memory as there's no hardware page translation to
3055	 * assemble apparent contiguous memory from discontiguous pages.
3056	 *
3057	 * Queueing large contiguous runs of pages for batching, however,
3058	 * causes the pages to actually be freed in smaller chunks.  As there
3059	 * can be a significant delay between the individual batches being
3060	 * recycled, this leads to the once large chunks of space being
3061	 * fragmented and becoming unavailable for high-order allocations.
3062	 */
3063	return 0;
3064#endif
3065}
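
/*
 * Illustrative worked example (CONFIG_MMU): for a 1GB zone of 4KB pages
 * (262144 present pages), 262144 / 1024 = 256 pages, which is more than
 * 512KB, so batch is first clamped to 128, becomes 32 after the /= 4,
 * and ends up as rounddown_pow_of_two(32 + 16) - 1 = 31.
 */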
3066
3067static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
3068{
3069	struct per_cpu_pages *pcp;
3070	int migratetype;
3071
3072	memset(p, 0, sizeof(*p));
3073
3074	pcp = &p->pcp;
3075	pcp->count = 0;
3076	pcp->high = 6 * batch;
3077	pcp->batch = max(1UL, 1 * batch);
3078	for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
3079		INIT_LIST_HEAD(&pcp->lists[migratetype]);
3080}
3081
3082/*
3083 * setup_pagelist_highmark() sets the high-water mark of the hot per-cpu
3084 * page list in pageset p to the value high.
3085 */
3086
3087static void setup_pagelist_highmark(struct per_cpu_pageset *p,
3088				unsigned long high)
3089{
3090	struct per_cpu_pages *pcp;
3091
3092	pcp = &p->pcp;
3093	pcp->high = high;
3094	pcp->batch = max(1UL, high/4);
3095	if ((high/4) > (PAGE_SHIFT * 8))
3096		pcp->batch = PAGE_SHIFT * 8;
3097}
3098
3099
3100#ifdef CONFIG_NUMA
3101/*
3102 * Boot pageset table. One per cpu which is going to be used for all
3103 * zones and all nodes. The parameters will be set in such a way
3104 * that an item put on a list will immediately be handed over to
3105 * the buddy list. This is safe since pageset manipulation is done
3106 * with interrupts disabled.
3107 *
3108 * Some NUMA counter updates may also be caught by the boot pagesets.
3109 *
3110 * The boot_pagesets must be kept even after bootup is complete for
3111 * unused processors and/or zones. They do play a role for bootstrapping
3112 * hotplugged processors.
3113 *
3114 * zoneinfo_show() and maybe other functions do
3115 * not check if the processor is online before following the pageset pointer.
3116 * Other parts of the kernel may not check if the zone is available.
3117 */
3118static struct per_cpu_pageset boot_pageset[NR_CPUS];
3119
3120/*
3121 * Dynamically allocate memory for the
3122 * per cpu pageset array in struct zone.
3123 */
3124static int __cpuinit process_zones(int cpu)
3125{
3126	struct zone *zone, *dzone;
3127	int node = cpu_to_node(cpu);
3128
3129	node_set_state(node, N_CPU);	/* this node has a cpu */
3130
3131	for_each_populated_zone(zone) {
3132		zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
3133					 GFP_KERNEL, node);
3134		if (!zone_pcp(zone, cpu))
3135			goto bad;
3136
3137		setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
3138
3139		if (percpu_pagelist_fraction)
3140			setup_pagelist_highmark(zone_pcp(zone, cpu),
3141			    (zone->present_pages / percpu_pagelist_fraction));
3142	}
3143
3144	return 0;
3145bad:
3146	for_each_zone(dzone) {
3147		if (!populated_zone(dzone))
3148			continue;
3149		if (dzone == zone)
3150			break;
3151		kfree(zone_pcp(dzone, cpu));
3152		zone_pcp(dzone, cpu) = &boot_pageset[cpu];
3153	}
3154	return -ENOMEM;
3155}
3156
3157static inline void free_zone_pagesets(int cpu)
3158{
3159	struct zone *zone;
3160
3161	for_each_zone(zone) {
3162		struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
3163
3164		/* Free per_cpu_pageset if it is slab allocated */
3165		if (pset != &boot_pageset[cpu])
3166			kfree(pset);
3167		zone_pcp(zone, cpu) = &boot_pageset[cpu];
3168	}
3169}
3170
3171static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
3172		unsigned long action,
3173		void *hcpu)
3174{
3175	int cpu = (long)hcpu;
3176	int ret = NOTIFY_OK;
3177
3178	switch (action) {
3179	case CPU_UP_PREPARE:
3180	case CPU_UP_PREPARE_FROZEN:
3181		if (process_zones(cpu))
3182			ret = NOTIFY_BAD;
3183		break;
3184	case CPU_UP_CANCELED:
3185	case CPU_UP_CANCELED_FROZEN:
3186	case CPU_DEAD:
3187	case CPU_DEAD_FROZEN:
3188		free_zone_pagesets(cpu);
3189		break;
3190	default:
3191		break;
3192	}
3193	return ret;
3194}
3195
3196static struct notifier_block __cpuinitdata pageset_notifier =
3197	{ &pageset_cpuup_callback, NULL, 0 };
3198
3199void __init setup_per_cpu_pageset(void)
3200{
3201	int err;
3202
3203	/* Initialize per_cpu_pageset for cpu 0.
3204	 * A cpuup callback will do this for every cpu
3205	 * as it comes online
3206	 */
3207	err = process_zones(smp_processor_id());
3208	BUG_ON(err);
3209	register_cpu_notifier(&pageset_notifier);
3210}
3211
3212#endif
3213
3214static noinline __init_refok
3215int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
3216{
3217	int i;
3218	struct pglist_data *pgdat = zone->zone_pgdat;
3219	size_t alloc_size;
3220
3221	/*
3222	 * The per-page waitqueue mechanism uses hashed waitqueues
3223	 * per zone.
3224	 */
3225	zone->wait_table_hash_nr_entries =
3226		 wait_table_hash_nr_entries(zone_size_pages);
3227	zone->wait_table_bits =
3228		wait_table_bits(zone->wait_table_hash_nr_entries);
3229	alloc_size = zone->wait_table_hash_nr_entries
3230					* sizeof(wait_queue_head_t);
3231
3232	if (!slab_is_available()) {
3233		zone->wait_table = (wait_queue_head_t *)
3234			alloc_bootmem_node(pgdat, alloc_size);
3235	} else {
3236		/*
3237		 * This case means that a zone whose size was 0 gets new memory
3238		 * via memory hot-add.
3239		 * But it may be the case that a new node was hot-added.  In
3240		 * this case vmalloc() will not be able to use this new node's
3241		 * memory - this wait_table must be initialized to use this new
3242		 * node itself as well.
3243		 * To use this new node's memory, further consideration will be
3244		 * necessary.
3245		 */
3246		zone->wait_table = vmalloc(alloc_size);
3247	}
3248	if (!zone->wait_table)
3249		return -ENOMEM;
3250
3251	for(i = 0; i < zone->wait_table_hash_nr_entries; ++i)
3252		init_waitqueue_head(zone->wait_table + i);
3253
3254	return 0;
3255}
3256
3257static int __zone_pcp_update(void *data)
3258{
3259	struct zone *zone = data;
3260	int cpu;
3261	unsigned long batch = zone_batchsize(zone), flags;
3262
3263	for (cpu = 0; cpu < NR_CPUS; cpu++) {
3264		struct per_cpu_pageset *pset;
3265		struct per_cpu_pages *pcp;
3266
3267		pset = zone_pcp(zone, cpu);
3268		pcp = &pset->pcp;
3269
3270		local_irq_save(flags);
3271		free_pcppages_bulk(zone, pcp->count, pcp);
3272		setup_pageset(pset, batch);
3273		local_irq_restore(flags);
3274	}
3275	return 0;
3276}
3277
3278void zone_pcp_update(struct zone *zone)
3279{
3280	stop_machine(__zone_pcp_update, zone, NULL);
3281}
3282
3283static __meminit void zone_pcp_init(struct zone *zone)
3284{
3285	int cpu;
3286	unsigned long batch = zone_batchsize(zone);
3287
3288	for (cpu = 0; cpu < NR_CPUS; cpu++) {
3289#ifdef CONFIG_NUMA
3290		/* Early boot. Slab allocator not functional yet */
3291		zone_pcp(zone, cpu) = &boot_pageset[cpu];
3292		setup_pageset(&boot_pageset[cpu], 0);
3293#else
3294		setup_pageset(zone_pcp(zone, cpu), batch);
3295#endif
3296	}
3297	if (zone->present_pages)
3298		printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
3299			zone->name, zone->present_pages, batch);
3300}
3301
3302__meminit int init_currently_empty_zone(struct zone *zone,
3303					unsigned long zone_start_pfn,
3304					unsigned long size,
3305					enum memmap_context context)
3306{
3307	struct pglist_data *pgdat = zone->zone_pgdat;
3308	int ret;
3309	ret = zone_wait_table_init(zone, size);
3310	if (ret)
3311		return ret;
3312	pgdat->nr_zones = zone_idx(zone) + 1;
3313
3314	zone->zone_start_pfn = zone_start_pfn;
3315
3316	mminit_dprintk(MMINIT_TRACE, "memmap_init",
3317			"Initialising map node %d zone %lu pfns %lu -> %lu\n",
3318			pgdat->node_id,
3319			(unsigned long)zone_idx(zone),
3320			zone_start_pfn, (zone_start_pfn + size));
3321
3322	zone_init_free_lists(zone);
3323
3324	return 0;
3325}
3326
3327#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
3328/*
3329 * Basic iterator support. Return the first range of PFNs for a node
3330 * Note: nid == MAX_NUMNODES returns first region regardless of node
3331 */
3332static int __meminit first_active_region_index_in_nid(int nid)
3333{
3334	int i;
3335
3336	for (i = 0; i < nr_nodemap_entries; i++)
3337		if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
3338			return i;
3339
3340	return -1;
3341}
3342
3343/*
3344 * Basic iterator support. Return the next active range of PFNs for a node
3345 * Note: nid == MAX_NUMNODES returns next region regardless of node
3346 */
3347static int __meminit next_active_region_index_in_nid(int index, int nid)
3348{
3349	for (index = index + 1; index < nr_nodemap_entries; index++)
3350		if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
3351			return index;
3352
3353	return -1;
3354}
3355
3356#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
3357/*
3358 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
3359 * Architectures may implement their own version but if add_active_range()
3360 * was used and there are no special requirements, this is a convenient
3361 * alternative
3362 */
3363int __meminit __early_pfn_to_nid(unsigned long pfn)
3364{
3365	int i;
3366
3367	for (i = 0; i < nr_nodemap_entries; i++) {
3368		unsigned long start_pfn = early_node_map[i].start_pfn;
3369		unsigned long end_pfn = early_node_map[i].end_pfn;
3370
3371		if (start_pfn <= pfn && pfn < end_pfn)
3372			return early_node_map[i].nid;
3373	}
3374	/* This is a memory hole */
3375	return -1;
3376}
3377#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
3378
3379int __meminit early_pfn_to_nid(unsigned long pfn)
3380{
3381	int nid;
3382
3383	nid = __early_pfn_to_nid(pfn);
3384	if (nid >= 0)
3385		return nid;
3386	/* The pfn falls in a memory hole; fall back to node 0 */
3387	return 0;
3388}
3389
3390#ifdef CONFIG_NODES_SPAN_OTHER_NODES
3391bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
3392{
3393	int nid;
3394
3395	nid = __early_pfn_to_nid(pfn);
3396	if (nid >= 0 && nid != node)
3397		return false;
3398	return true;
3399}
3400#endif
3401
3402/* Basic iterator support to walk early_node_map[] */
3403#define for_each_active_range_index_in_nid(i, nid) \
3404	for (i = first_active_region_index_in_nid(nid); i != -1; \
3405				i = next_active_region_index_in_nid(i, nid))
3406
3407/**
3408 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
3409 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
3410 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
3411 *
3412 * If an architecture guarantees that all ranges registered with
3413 * add_active_ranges() contain no holes and may be freed, this
3414 * function may be used instead of calling free_bootmem() manually.
3415 */
3416void __init free_bootmem_with_active_regions(int nid,
3417						unsigned long max_low_pfn)
3418{
3419	int i;
3420
3421	for_each_active_range_index_in_nid(i, nid) {
3422		unsigned long size_pages = 0;
3423		unsigned long end_pfn = early_node_map[i].end_pfn;
3424
3425		if (early_node_map[i].start_pfn >= max_low_pfn)
3426			continue;
3427
3428		if (end_pfn > max_low_pfn)
3429			end_pfn = max_low_pfn;
3430
3431		size_pages = end_pfn - early_node_map[i].start_pfn;
3432		free_bootmem_node(NODE_DATA(early_node_map[i].nid),
3433				PFN_PHYS(early_node_map[i].start_pfn),
3434				size_pages << PAGE_SHIFT);
3435	}
3436}
3437
3438int __init add_from_early_node_map(struct range *range, int az,
3439				   int nr_range, int nid)
3440{
3441	int i;
3442	u64 start, end;
3443
3444	/* need to go over early_node_map to find out good range for node */
3445	for_each_active_range_index_in_nid(i, nid) {
3446		start = early_node_map[i].start_pfn;
3447		end = early_node_map[i].end_pfn;
3448		nr_range = add_range(range, az, nr_range, start, end);
3449	}
3450	return nr_range;
3451}
3452
3453void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
3454					u64 goal, u64 limit)
3455{
3456	int i;
3457	void *ptr;
3458
3459	/* need to go over early_node_map to find out good range for node */
3460	for_each_active_range_index_in_nid(i, nid) {
3461		u64 addr;
3462		u64 ei_start, ei_last;
3463
3464		ei_last = early_node_map[i].end_pfn;
3465		ei_last <<= PAGE_SHIFT;
3466		ei_start = early_node_map[i].start_pfn;
3467		ei_start <<= PAGE_SHIFT;
3468		addr = find_early_area(ei_start, ei_last,
3469					 goal, limit, size, align);
3470
3471		if (addr == -1ULL)
3472			continue;
3473
3474#if 0
3475		printk(KERN_DEBUG "alloc (nid=%d %llx - %llx) (%llx - %llx) %llx %llx => %llx\n",
3476				nid,
3477				ei_start, ei_last, goal, limit, size,
3478				align, addr);
3479#endif
3480
3481		ptr = phys_to_virt(addr);
3482		memset(ptr, 0, size);
3483		reserve_early_without_check(addr, addr + size, "BOOTMEM");
3484		return ptr;
3485	}
3486
3487	return NULL;
3488}
3489
3490
3491void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data)
3492{
3493	int i;
3494	int ret;
3495
3496	for_each_active_range_index_in_nid(i, nid) {
3497		ret = work_fn(early_node_map[i].start_pfn,
3498			      early_node_map[i].end_pfn, data);
3499		if (ret)
3500			break;
3501	}
3502}
3503/**
3504 * sparse_memory_present_with_active_regions - Call memory_present for each active range
3505 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
3506 *
3507 * If an architecture guarantees that all ranges registered with
3508 * add_active_ranges() contain no holes and may be freed, this
3509 * function may be used instead of calling memory_present() manually.
3510 */
3511void __init sparse_memory_present_with_active_regions(int nid)
3512{
3513	int i;
3514
3515	for_each_active_range_index_in_nid(i, nid)
3516		memory_present(early_node_map[i].nid,
3517				early_node_map[i].start_pfn,
3518				early_node_map[i].end_pfn);
3519}
3520
3521/**
3522 * get_pfn_range_for_nid - Return the start and end page frames for a node
3523 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
3524 * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
3525 * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
3526 *
3527 * It returns the start and end page frame of a node based on information
3528 * provided by an arch calling add_active_range(). If called for a node
3529 * with no available memory, a warning is printed and the start and end
3530 * PFNs will be 0.
3531 */
3532void __meminit get_pfn_range_for_nid(unsigned int nid,
3533			unsigned long *start_pfn, unsigned long *end_pfn)
3534{
3535	int i;
3536	*start_pfn = -1UL;
3537	*end_pfn = 0;
3538
3539	for_each_active_range_index_in_nid(i, nid) {
3540		*start_pfn = min(*start_pfn, early_node_map[i].start_pfn);
3541		*end_pfn = max(*end_pfn, early_node_map[i].end_pfn);
3542	}
3543
3544	if (*start_pfn == -1UL)
3545		*start_pfn = 0;
3546}
3547
3548/*
3549 * This finds a zone that can be used for ZONE_MOVABLE pages. The
3550 * assumption is made that zones within a node are ordered by monotonically
3551 * increasing memory address so that the "highest" populated zone is used
3552 */
3553static void __init find_usable_zone_for_movable(void)
3554{
3555	int zone_index;
3556	for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
3557		if (zone_index == ZONE_MOVABLE)
3558			continue;
3559
3560		if (arch_zone_highest_possible_pfn[zone_index] >
3561				arch_zone_lowest_possible_pfn[zone_index])
3562			break;
3563	}
3564
3565	VM_BUG_ON(zone_index == -1);
3566	movable_zone = zone_index;
3567}
3568
3569/*
3570 * The zone ranges provided by the architecture do not include ZONE_MOVABLE
3571 * because it is sized independently of the architecture. Unlike the other zones,
3572 * the starting point for ZONE_MOVABLE is not fixed. It may be different
3573 * in each node depending on the size of each node and how evenly kernelcore
3574 * is distributed. This helper function adjusts the zone ranges
3575 * provided by the architecture for a given node by using the end of the
3576 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
3577 * zones within a node are ordered by monotonically increasing memory address.
3578 */
3579static void __meminit adjust_zone_range_for_zone_movable(int nid,
3580					unsigned long zone_type,
3581					unsigned long node_start_pfn,
3582					unsigned long node_end_pfn,
3583					unsigned long *zone_start_pfn,
3584					unsigned long *zone_end_pfn)
3585{
3586	/* Only adjust if ZONE_MOVABLE is on this node */
3587	if (zone_movable_pfn[nid]) {
3588		/* Size ZONE_MOVABLE */
3589		if (zone_type == ZONE_MOVABLE) {
3590			*zone_start_pfn = zone_movable_pfn[nid];
3591			*zone_end_pfn = min(node_end_pfn,
3592				arch_zone_highest_possible_pfn[movable_zone]);
3593
3594		/* Adjust for ZONE_MOVABLE starting within this range */
3595		} else if (*zone_start_pfn < zone_movable_pfn[nid] &&
3596				*zone_end_pfn > zone_movable_pfn[nid]) {
3597			*zone_end_pfn = zone_movable_pfn[nid];
3598
3599		/* Check if this whole range is within ZONE_MOVABLE */
3600		} else if (*zone_start_pfn >= zone_movable_pfn[nid])
3601			*zone_start_pfn = *zone_end_pfn;
3602	}
3603}
3604
3605/*
3606 * Return the number of pages a zone spans in a node, including holes
3607 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
3608 */
3609static unsigned long __meminit zone_spanned_pages_in_node(int nid,
3610					unsigned long zone_type,
3611					unsigned long *ignored)
3612{
3613	unsigned long node_start_pfn, node_end_pfn;
3614	unsigned long zone_start_pfn, zone_end_pfn;
3615
3616	/* Get the start and end of the node and zone */
3617	get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
3618	zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
3619	zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
3620	adjust_zone_range_for_zone_movable(nid, zone_type,
3621				node_start_pfn, node_end_pfn,
3622				&zone_start_pfn, &zone_end_pfn);
3623
3624	/* Check that this node has pages within the zone's required range */
3625	if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
3626		return 0;
3627
3628	/* Move the zone boundaries inside the node if necessary */
3629	zone_end_pfn = min(zone_end_pfn, node_end_pfn);
3630	zone_start_pfn = max(zone_start_pfn, node_start_pfn);
3631
3632	/* Return the spanned pages */
3633	return zone_end_pfn - zone_start_pfn;
3634}
3635
3636/*
3637 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
3638 * then all holes in the requested range will be accounted for.
3639 */
3640unsigned long __meminit __absent_pages_in_range(int nid,
3641				unsigned long range_start_pfn,
3642				unsigned long range_end_pfn)
3643{
3644	int i = 0;
3645	unsigned long prev_end_pfn = 0, hole_pages = 0;
3646	unsigned long start_pfn;
3647
3648	/* Find the end_pfn of the first active range of pfns in the node */
3649	i = first_active_region_index_in_nid(nid);
3650	if (i == -1)
3651		return 0;
3652
3653	prev_end_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
3654
3655	/* Account for ranges before physical memory on this node */
3656	if (early_node_map[i].start_pfn > range_start_pfn)
3657		hole_pages = prev_end_pfn - range_start_pfn;
3658
3659	/* Find all holes for the zone within the node */
3660	for (; i != -1; i = next_active_region_index_in_nid(i, nid)) {
3661
3662		/* No need to continue if prev_end_pfn is outside the zone */
3663		if (prev_end_pfn >= range_end_pfn)
3664			break;
3665
3666		/* Make sure the end of the zone is not within the hole */
3667		start_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
3668		prev_end_pfn = max(prev_end_pfn, range_start_pfn);
3669
3670		/* Update the hole size count and move on */
3671		if (start_pfn > range_start_pfn) {
3672			BUG_ON(prev_end_pfn > start_pfn);
3673			hole_pages += start_pfn - prev_end_pfn;
3674		}
3675		prev_end_pfn = early_node_map[i].end_pfn;
3676	}
3677
3678	/* Account for ranges past physical memory on this node */
3679	if (range_end_pfn > prev_end_pfn)
3680		hole_pages += range_end_pfn -
3681				max(range_start_pfn, prev_end_pfn);
3682
3683	return hole_pages;
3684}
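
/*
 * Illustrative example (hypothetical layout): with active ranges [100, 200)
 * and [300, 400) on a node, __absent_pages_in_range(nid, 0, 500) counts
 * 100 pages before the first range, 100 pages for the gap [200, 300) and
 * 100 pages past the last range, i.e. 300 absent pages in total.
 */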
3685
3686/**
3687 * absent_pages_in_range - Return number of page frames in holes within a range
3688 * @start_pfn: The start PFN to start searching for holes
3689 * @end_pfn: The end PFN to stop searching for holes
3690 *
3691 * It returns the number of page frames in memory holes within a range.
3692 */
3693unsigned long __init absent_pages_in_range(unsigned long start_pfn,
3694							unsigned long end_pfn)
3695{
3696	return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
3697}
3698
3699/* Return the number of page frames in holes in a zone on a node */
3700static unsigned long __meminit zone_absent_pages_in_node(int nid,
3701					unsigned long zone_type,
3702					unsigned long *ignored)
3703{
3704	unsigned long node_start_pfn, node_end_pfn;
3705	unsigned long zone_start_pfn, zone_end_pfn;
3706
3707	get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
3708	zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type],
3709							node_start_pfn);
3710	zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type],
3711							node_end_pfn);
3712
3713	adjust_zone_range_for_zone_movable(nid, zone_type,
3714			node_start_pfn, node_end_pfn,
3715			&zone_start_pfn, &zone_end_pfn);
3716	return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
3717}
3718
3719#else
3720static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
3721					unsigned long zone_type,
3722					unsigned long *zones_size)
3723{
3724	return zones_size[zone_type];
3725}
3726
3727static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
3728						unsigned long zone_type,
3729						unsigned long *zholes_size)
3730{
3731	if (!zholes_size)
3732		return 0;
3733
3734	return zholes_size[zone_type];
3735}
3736
3737#endif
3738
3739static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
3740		unsigned long *zones_size, unsigned long *zholes_size)
3741{
3742	unsigned long realtotalpages, totalpages = 0;
3743	enum zone_type i;
3744
3745	for (i = 0; i < MAX_NR_ZONES; i++)
3746		totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
3747								zones_size);
3748	pgdat->node_spanned_pages = totalpages;
3749
3750	realtotalpages = totalpages;
3751	for (i = 0; i < MAX_NR_ZONES; i++)
3752		realtotalpages -=
3753			zone_absent_pages_in_node(pgdat->node_id, i,
3754								zholes_size);
3755	pgdat->node_present_pages = realtotalpages;
3756	printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
3757							realtotalpages);
3758}
3759
3760#ifndef CONFIG_SPARSEMEM
3761/*
3762 * Calculate the size of the zone->pageblock_flags bitmap, rounded to an
3763 * unsigned long. Start by rounding zonesize up to a multiple of
3764 * pageblock_nr_pages, then use NR_PAGEBLOCK_BITS worth of bits per
3765 * pageblock, round the result in bits up to the nearest long, and
3766 * return it in bytes.
3767 */
3768static unsigned long __init usemap_size(unsigned long zonesize)
3769{
3770	unsigned long usemapsize;
3771
3772	usemapsize = roundup(zonesize, pageblock_nr_pages);
3773	usemapsize = usemapsize >> pageblock_order;
3774	usemapsize *= NR_PAGEBLOCK_BITS;
3775	usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
3776
3777	return usemapsize / 8;
3778}
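
/*
 * Worked example (illustrative values): assuming 4KiB pages and
 * pageblock_order == 9 (512-page pageblocks), a 1GiB zone spans 262144
 * pages, i.e. 512 pageblocks. That needs 512 * NR_PAGEBLOCK_BITS bits,
 * rounded up to a whole number of unsigned longs and returned as a byte
 * count, which comes to a few hundred bytes per gigabyte of zone.
 */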
3779
3780static void __init setup_usemap(struct pglist_data *pgdat,
3781				struct zone *zone, unsigned long zonesize)
3782{
3783	unsigned long usemapsize = usemap_size(zonesize);
3784	zone->pageblock_flags = NULL;
3785	if (usemapsize)
3786		zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize);
3787}
3788#else
3789static inline void setup_usemap(struct pglist_data *pgdat,
3790				struct zone *zone, unsigned long zonesize) {}
3791#endif /* CONFIG_SPARSEMEM */
3792
3793#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
3794
3795/* Return a sensible default order for the pageblock size. */
3796static inline int pageblock_default_order(void)
3797{
3798	if (HPAGE_SHIFT > PAGE_SHIFT)
3799		return HUGETLB_PAGE_ORDER;
3800
3801	return MAX_ORDER-1;
3802}
3803
3804/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
3805static inline void __init set_pageblock_order(unsigned int order)
3806{
3807	/* Check that pageblock_nr_pages has not already been setup */
3808	if (pageblock_order)
3809		return;
3810
3811	/*
3812	 * Assume the largest contiguous order of interest is a huge page.
3813	 * This value may be variable depending on boot parameters on IA64
3814	 */
3815	pageblock_order = order;
3816}
3817#else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
3818
3819/*
3820 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
3821 * and pageblock_default_order() are unused as pageblock_order is set
3822 * at compile-time. See include/linux/pageblock-flags.h for the values of
3823 * pageblock_order based on the kernel config
3824 */
3825static inline int pageblock_default_order(unsigned int order)
3826{
3827	return MAX_ORDER-1;
3828}
3829#define set_pageblock_order(x)	do {} while (0)
3830
3831#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
3832
3833/*
3834 * Set up the zone data structures:
3835 *   - mark all pages reserved
3836 *   - mark all memory queues empty
3837 *   - clear the memory bitmaps
3838 */
3839static void __paginginit free_area_init_core(struct pglist_data *pgdat,
3840		unsigned long *zones_size, unsigned long *zholes_size)
3841{
3842	enum zone_type j;
3843	int nid = pgdat->node_id;
3844	unsigned long zone_start_pfn = pgdat->node_start_pfn;
3845	int ret;
3846
3847	pgdat_resize_init(pgdat);
3848	pgdat->nr_zones = 0;
3849	init_waitqueue_head(&pgdat->kswapd_wait);
3850	pgdat->kswapd_max_order = 0;
3851	pgdat_page_cgroup_init(pgdat);
3852
3853	for (j = 0; j < MAX_NR_ZONES; j++) {
3854		struct zone *zone = pgdat->node_zones + j;
3855		unsigned long size, realsize, memmap_pages;
3856		enum lru_list l;
3857
3858		size = zone_spanned_pages_in_node(nid, j, zones_size);
3859		realsize = size - zone_absent_pages_in_node(nid, j,
3860								zholes_size);
3861
3862		/*
3863		 * Adjust realsize so that it accounts for how much memory
3864		 * is used by this zone for memmap. This affects the watermark
3865		 * and per-cpu initialisations
3866		 */
3867		memmap_pages =
3868			PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
3869		if (realsize >= memmap_pages) {
3870			realsize -= memmap_pages;
3871			if (memmap_pages)
3872				printk(KERN_DEBUG
3873				       "  %s zone: %lu pages used for memmap\n",
3874				       zone_names[j], memmap_pages);
3875		} else
3876			printk(KERN_WARNING
3877				"  %s zone: %lu pages exceeds realsize %lu\n",
3878				zone_names[j], memmap_pages, realsize);
3879
3880		/* Account for reserved pages */
3881		if (j == 0 && realsize > dma_reserve) {
3882			realsize -= dma_reserve;
3883			printk(KERN_DEBUG "  %s zone: %lu pages reserved\n",
3884					zone_names[0], dma_reserve);
3885		}
3886
3887		if (!is_highmem_idx(j))
3888			nr_kernel_pages += realsize;
3889		nr_all_pages += realsize;
3890
3891		zone->spanned_pages = size;
3892		zone->present_pages = realsize;
3893#ifdef CONFIG_NUMA
3894		zone->node = nid;
3895		zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
3896						/ 100;
3897		zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
3898#endif
3899		zone->name = zone_names[j];
3900		spin_lock_init(&zone->lock);
3901		spin_lock_init(&zone->lru_lock);
3902		zone_seqlock_init(zone);
3903		zone->zone_pgdat = pgdat;
3904
3905		zone->prev_priority = DEF_PRIORITY;
3906
3907		zone_pcp_init(zone);
3908		for_each_lru(l) {
3909			INIT_LIST_HEAD(&zone->lru[l].list);
3910			zone->reclaim_stat.nr_saved_scan[l] = 0;
3911		}
3912		zone->reclaim_stat.recent_rotated[0] = 0;
3913		zone->reclaim_stat.recent_rotated[1] = 0;
3914		zone->reclaim_stat.recent_scanned[0] = 0;
3915		zone->reclaim_stat.recent_scanned[1] = 0;
3916		zap_zone_vm_stats(zone);
3917		zone->flags = 0;
3918		if (!size)
3919			continue;
3920
3921		set_pageblock_order(pageblock_default_order());
3922		setup_usemap(pgdat, zone, size);
3923		ret = init_currently_empty_zone(zone, zone_start_pfn,
3924						size, MEMMAP_EARLY);
3925		BUG_ON(ret);
3926		memmap_init(size, nid, j, zone_start_pfn);
3927		zone_start_pfn += size;
3928	}
3929}
3930
3931static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
3932{
3933	/* Skip empty nodes */
3934	if (!pgdat->node_spanned_pages)
3935		return;
3936
3937#ifdef CONFIG_FLAT_NODE_MEM_MAP
3938	/* ia64 gets its own node_mem_map, before this, without bootmem */
3939	if (!pgdat->node_mem_map) {
3940		unsigned long size, start, end;
3941		struct page *map;
3942
3943		/*
3944		 * The zone's endpoints aren't required to be MAX_ORDER
3945		 * aligned, but the node_mem_map endpoints must be, in order
3946		 * for the buddy allocator to function correctly.
3947		 */
3948		start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
3949		end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
3950		end = ALIGN(end, MAX_ORDER_NR_PAGES);
3951		size =  (end - start) * sizeof(struct page);
3952		map = alloc_remap(pgdat->node_id, size);
3953		if (!map)
3954			map = alloc_bootmem_node(pgdat, size);
3955		pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
3956	}
3957#ifndef CONFIG_NEED_MULTIPLE_NODES
3958	/*
3959	 * With no DISCONTIG, the global mem_map is just set as node 0's
3960	 */
3961	if (pgdat == NODE_DATA(0)) {
3962		mem_map = NODE_DATA(0)->node_mem_map;
3963#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
3964		if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
3965			mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
3966#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
3967	}
3968#endif
3969#endif /* CONFIG_FLAT_NODE_MEM_MAP */
3970}
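
/*
 * Rough sizing note (illustrative): with 4KiB pages and a struct page of a
 * few dozen bytes, the flat node_mem_map costs on the order of 1% of the
 * node's memory. The MAX_ORDER_NR_PAGES alignment above adds at most one
 * extra buddy-sized block's worth of struct pages at each end of the node.
 */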
3971
3972void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
3973		unsigned long node_start_pfn, unsigned long *zholes_size)
3974{
3975	pg_data_t *pgdat = NODE_DATA(nid);
3976
3977	pgdat->node_id = nid;
3978	pgdat->node_start_pfn = node_start_pfn;
3979	calculate_node_totalpages(pgdat, zones_size, zholes_size);
3980
3981	alloc_node_mem_map(pgdat);
3982#ifdef CONFIG_FLAT_NODE_MEM_MAP
3983	printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
3984		nid, (unsigned long)pgdat,
3985		(unsigned long)pgdat->node_mem_map);
3986#endif
3987
3988	free_area_init_core(pgdat, zones_size, zholes_size);
3989}
3990
3991#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
3992
3993#if MAX_NUMNODES > 1
3994/*
3995 * Figure out the number of possible node ids.
3996 */
3997static void __init setup_nr_node_ids(void)
3998{
3999	unsigned int node;
4000	unsigned int highest = 0;
4001
4002	for_each_node_mask(node, node_possible_map)
4003		highest = node;
4004	nr_node_ids = highest + 1;
4005}
4006#else
4007static inline void setup_nr_node_ids(void)
4008{
4009}
4010#endif
4011
4012/**
4013 * add_active_range - Register a range of PFNs backed by physical memory
4014 * @nid: The node ID the range resides on
4015 * @start_pfn: The start PFN of the available physical memory
4016 * @end_pfn: The end PFN of the available physical memory
4017 *
4018 * These ranges are stored in an early_node_map[] and later used by
4019 * free_area_init_nodes() to calculate zone sizes and holes. If the
4020 * range spans a memory hole, it is up to the architecture to ensure
4021 * the memory is not freed by the bootmem allocator. If possible
4022 * the range being registered will be merged with existing ranges.
4023 */
4024void __init add_active_range(unsigned int nid, unsigned long start_pfn,
4025						unsigned long end_pfn)
4026{
4027	int i;
4028
4029	mminit_dprintk(MMINIT_TRACE, "memory_register",
4030			"Entering add_active_range(%d, %#lx, %#lx) "
4031			"%d entries of %d used\n",
4032			nid, start_pfn, end_pfn,
4033			nr_nodemap_entries, MAX_ACTIVE_REGIONS);
4034
4035	mminit_validate_memmodel_limits(&start_pfn, &end_pfn);
4036
4037	/* Merge with existing active regions if possible */
4038	for (i = 0; i < nr_nodemap_entries; i++) {
4039		if (early_node_map[i].nid != nid)
4040			continue;
4041
4042		/* Skip if an existing region covers this new one */
4043		if (start_pfn >= early_node_map[i].start_pfn &&
4044				end_pfn <= early_node_map[i].end_pfn)
4045			return;
4046
4047		/* Merge forward if suitable */
4048		if (start_pfn <= early_node_map[i].end_pfn &&
4049				end_pfn > early_node_map[i].end_pfn) {
4050			early_node_map[i].end_pfn = end_pfn;
4051			return;
4052		}
4053
4054		/* Merge backward if suitable */
4055		if (start_pfn < early_node_map[i].start_pfn &&
4056				end_pfn >= early_node_map[i].start_pfn) {
4057			early_node_map[i].start_pfn = start_pfn;
4058			return;
4059		}
4060	}
4061
4062	/* Check that early_node_map is large enough */
4063	if (i >= MAX_ACTIVE_REGIONS) {
4064		printk(KERN_CRIT "More than %d memory regions, truncating\n",
4065							MAX_ACTIVE_REGIONS);
4066		return;
4067	}
4068
4069	early_node_map[i].nid = nid;
4070	early_node_map[i].start_pfn = start_pfn;
4071	early_node_map[i].end_pfn = end_pfn;
4072	nr_nodemap_entries = i + 1;
4073}
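
/*
 * Usage sketch (hypothetical PFNs): an arch registering two adjoining
 * chunks of node 0, e.g.
 *
 *	add_active_range(0, 0x000, 0x100);
 *	add_active_range(0, 0x100, 0x200);
 *
 * ends up with a single early_node_map[] entry covering [0x000, 0x200),
 * because the second call merges forward with the first entry.
 */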
4074
4075/**
4076 * remove_active_range - Shrink an existing registered range of PFNs
4077 * @nid: The node id the range is on that should be shrunk
4078 * @start_pfn: The new start PFN of the range
4079 * @end_pfn: The new end PFN of the range
4080 *
4081 * i386 with NUMA uses alloc_remap() to store a node_mem_map on a local node.
4082 * The map is kept near the end of the physical page range that has already been
4083 * registered. This function allows an arch to shrink an existing registered
4084 * range.
4085 */
4086void __init remove_active_range(unsigned int nid, unsigned long start_pfn,
4087				unsigned long end_pfn)
4088{
4089	int i, j;
4090	int removed = 0;
4091
4092	printk(KERN_DEBUG "remove_active_range (%d, %lu, %lu)\n",
4093			  nid, start_pfn, end_pfn);
4094
4095	/* Find the old active region end and shrink */
4096	for_each_active_range_index_in_nid(i, nid) {
4097		if (early_node_map[i].start_pfn >= start_pfn &&
4098		    early_node_map[i].end_pfn <= end_pfn) {
4099			/* clear it */
4100			early_node_map[i].start_pfn = 0;
4101			early_node_map[i].end_pfn = 0;
4102			removed = 1;
4103			continue;
4104		}
4105		if (early_node_map[i].start_pfn < start_pfn &&
4106		    early_node_map[i].end_pfn > start_pfn) {
4107			unsigned long temp_end_pfn = early_node_map[i].end_pfn;
4108			early_node_map[i].end_pfn = start_pfn;
4109			if (temp_end_pfn > end_pfn)
4110				add_active_range(nid, end_pfn, temp_end_pfn);
4111			continue;
4112		}
4113		if (early_node_map[i].start_pfn >= start_pfn &&
4114		    early_node_map[i].end_pfn > end_pfn &&
4115		    early_node_map[i].start_pfn < end_pfn) {
4116			early_node_map[i].start_pfn = end_pfn;
4117			continue;
4118		}
4119	}
4120
4121	if (!removed)
4122		return;
4123
4124	/* remove the blank ones */
4125	for (i = nr_nodemap_entries - 1; i > 0; i--) {
4126		if (early_node_map[i].nid != nid)
4127			continue;
4128		if (early_node_map[i].end_pfn)
4129			continue;
4130		/* we found it, get rid of it */
4131		for (j = i; j < nr_nodemap_entries - 1; j++)
4132			memcpy(&early_node_map[j], &early_node_map[j+1],
4133				sizeof(early_node_map[j]));
4134		j = nr_nodemap_entries - 1;
4135		memset(&early_node_map[j], 0, sizeof(early_node_map[j]));
4136		nr_nodemap_entries--;
4137	}
4138}
4139
4140/**
4141 * remove_all_active_ranges - Remove all currently registered regions
4142 *
4143 * During discovery, it may be found that a table like SRAT is invalid
4144 * and an alternative discovery method must be used. This function removes
4145 * all currently registered regions.
4146 */
4147void __init remove_all_active_ranges(void)
4148{
4149	memset(early_node_map, 0, sizeof(early_node_map));
4150	nr_nodemap_entries = 0;
4151}
4152
4153/* Compare two active node_active_regions */
4154static int __init cmp_node_active_region(const void *a, const void *b)
4155{
4156	struct node_active_region *arange = (struct node_active_region *)a;
4157	struct node_active_region *brange = (struct node_active_region *)b;
4158
4159	/* Done this way to avoid overflows */
4160	if (arange->start_pfn > brange->start_pfn)
4161		return 1;
4162	if (arange->start_pfn < brange->start_pfn)
4163		return -1;
4164
4165	return 0;
4166}
4167
4168/* sort the node_map by start_pfn */
4169void __init sort_node_map(void)
4170{
4171	sort(early_node_map, (size_t)nr_nodemap_entries,
4172			sizeof(struct node_active_region),
4173			cmp_node_active_region, NULL);
4174}
4175
4176/* Find the lowest pfn for a node */
4177static unsigned long __init find_min_pfn_for_node(int nid)
4178{
4179	int i;
4180	unsigned long min_pfn = ULONG_MAX;
4181
4182	/* Assuming a sorted map, the first range found has the starting pfn */
4183	for_each_active_range_index_in_nid(i, nid)
4184		min_pfn = min(min_pfn, early_node_map[i].start_pfn);
4185
4186	if (min_pfn == ULONG_MAX) {
4187		printk(KERN_WARNING
4188			"Could not find start_pfn for node %d\n", nid);
4189		return 0;
4190	}
4191
4192	return min_pfn;
4193}
4194
4195/**
4196 * find_min_pfn_with_active_regions - Find the minimum PFN registered
4197 *
4198 * It returns the minimum PFN based on information provided via
4199 * add_active_range().
4200 */
4201unsigned long __init find_min_pfn_with_active_regions(void)
4202{
4203	return find_min_pfn_for_node(MAX_NUMNODES);
4204}
4205
4206/*
4207 * early_calculate_totalpages()
4208 * Sum pages in active regions for movable zone.
4209 * Populate N_HIGH_MEMORY for calculating usable_nodes.
4210 */
4211static unsigned long __init early_calculate_totalpages(void)
4212{
4213	int i;
4214	unsigned long totalpages = 0;
4215
4216	for (i = 0; i < nr_nodemap_entries; i++) {
4217		unsigned long pages = early_node_map[i].end_pfn -
4218						early_node_map[i].start_pfn;
4219		totalpages += pages;
4220		if (pages)
4221			node_set_state(early_node_map[i].nid, N_HIGH_MEMORY);
4222	}
4223	return totalpages;
4224}
4225
4226/*
4227 * Find the PFN the Movable zone begins in each node. Kernel memory
4228 * is spread evenly between nodes as long as the nodes have enough
4229 * memory. When they don't, some nodes will have more kernelcore than
4230 * others
4231 */
4232static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
4233{
4234	int i, nid;
4235	unsigned long usable_startpfn;
4236	unsigned long kernelcore_node, kernelcore_remaining;
4237	/* save the state before borrowing the nodemask */
4238	nodemask_t saved_node_state = node_states[N_HIGH_MEMORY];
4239	unsigned long totalpages = early_calculate_totalpages();
4240	int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);
4241
4242	/*
4243	 * If movablecore was specified, calculate what size of
4244	 * kernelcore that corresponds so that memory usable for
4245	 * any allocation type is evenly spread. If both kernelcore
4246	 * and movablecore are specified, then the value of kernelcore
4247	 * will be used for required_kernelcore if it's greater than
4248	 * what movablecore would have allowed.
4249	 */
4250	if (required_movablecore) {
4251		unsigned long corepages;
4252
4253		/*
4254		 * Round-up so that ZONE_MOVABLE is at least as large as what
4255		 * was requested by the user
4256		 */
4257		required_movablecore =
4258			roundup(required_movablecore, MAX_ORDER_NR_PAGES);
4259		corepages = totalpages - required_movablecore;
4260
4261		required_kernelcore = max(required_kernelcore, corepages);
4262	}
4263
4264	/* If kernelcore was not specified, there is no ZONE_MOVABLE */
4265	if (!required_kernelcore)
4266		goto out;
4267
4268	/* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
4269	find_usable_zone_for_movable();
4270	usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
4271
4272restart:
4273	/* Spread kernelcore memory as evenly as possible throughout nodes */
4274	kernelcore_node = required_kernelcore / usable_nodes;
4275	for_each_node_state(nid, N_HIGH_MEMORY) {
4276		/*
4277		 * Recalculate kernelcore_node if the division per node
4278		 * now exceeds what is necessary to satisfy the requested
4279		 * amount of memory for the kernel
4280		 */
4281		if (required_kernelcore < kernelcore_node)
4282			kernelcore_node = required_kernelcore / usable_nodes;
4283
4284		/*
4285		 * As the map is walked, we track how much memory is usable
4286		 * by the kernel using kernelcore_remaining. When it is
4287		 * 0, the rest of the node is usable by ZONE_MOVABLE
4288		 */
4289		kernelcore_remaining = kernelcore_node;
4290
4291		/* Go through each range of PFNs within this node */
4292		for_each_active_range_index_in_nid(i, nid) {
4293			unsigned long start_pfn, end_pfn;
4294			unsigned long size_pages;
4295
4296			start_pfn = max(early_node_map[i].start_pfn,
4297						zone_movable_pfn[nid]);
4298			end_pfn = early_node_map[i].end_pfn;
4299			if (start_pfn >= end_pfn)
4300				continue;
4301
4302			/* Account for what is only usable for kernelcore */
4303			if (start_pfn < usable_startpfn) {
4304				unsigned long kernel_pages;
4305				kernel_pages = min(end_pfn, usable_startpfn)
4306								- start_pfn;
4307
4308				kernelcore_remaining -= min(kernel_pages,
4309							kernelcore_remaining);
4310				required_kernelcore -= min(kernel_pages,
4311							required_kernelcore);
4312
4313				/* Continue if range is now fully accounted */
4314				if (end_pfn <= usable_startpfn) {
4315
4316					/*
4317					 * Push zone_movable_pfn to the end so
4318					 * that if we have to rebalance
4319					 * kernelcore across nodes, we will
4320					 * not double account here
4321					 */
4322					zone_movable_pfn[nid] = end_pfn;
4323					continue;
4324				}
4325				start_pfn = usable_startpfn;
4326			}
4327
4328			/*
4329			 * The usable PFN range for ZONE_MOVABLE is from
4330			 * start_pfn->end_pfn. Calculate size_pages as the
4331			 * number of pages used as kernelcore
4332			 */
4333			size_pages = end_pfn - start_pfn;
4334			if (size_pages > kernelcore_remaining)
4335				size_pages = kernelcore_remaining;
4336			zone_movable_pfn[nid] = start_pfn + size_pages;
4337
4338			/*
4339			 * Some kernelcore has been met, update counts and
4340			 * break if the kernelcore for this node has been
4341			 * satisfied
4342			 */
4343			required_kernelcore -= min(required_kernelcore,
4344								size_pages);
4345			kernelcore_remaining -= size_pages;
4346			if (!kernelcore_remaining)
4347				break;
4348		}
4349	}
4350
4351	/*
4352	 * If there is still required_kernelcore, we do another pass with one
4353	 * less node in the count. This will push zone_movable_pfn[nid] further
4354	 * along on the nodes that still have memory until kernelcore is
4355	 * satisfied
4356	 */
4357	usable_nodes--;
4358	if (usable_nodes && required_kernelcore > usable_nodes)
4359		goto restart;
4360
4361	/* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
4362	for (nid = 0; nid < MAX_NUMNODES; nid++)
4363		zone_movable_pfn[nid] =
4364			roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
4365
4366out:
4367	/* restore the node_state */
4368	node_states[N_HIGH_MEMORY] = saved_node_state;
4369}
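
/*
 * Illustrative example (hypothetical sizes): with two nodes of 4GB each and
 * kernelcore=4G, each node is asked for roughly 2GB of kernelcore, so
 * zone_movable_pfn[] on each node ends up about 2GB in and the upper half
 * of every node becomes ZONE_MOVABLE. If one node is too small to supply
 * its share, the restart pass spreads the shortfall over the remaining
 * nodes.
 */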
4370
4371/* Any regular memory on that node ? */
4372static void check_for_regular_memory(pg_data_t *pgdat)
4373{
4374#ifdef CONFIG_HIGHMEM
4375	enum zone_type zone_type;
4376
4377	for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) {
4378		struct zone *zone = &pgdat->node_zones[zone_type];
4379		if (zone->present_pages)
4380			node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY);
4381	}
4382#endif
4383}
4384
4385/**
4386 * free_area_init_nodes - Initialise all pg_data_t and zone data
4387 * @max_zone_pfn: an array of max PFNs for each zone
4388 *
4389 * This will call free_area_init_node() for each active node in the system.
4390 * Using the page ranges provided by add_active_range(), the size of each
4391 * zone in each node and their holes are calculated. If the maximum PFNs
4392 * of two adjacent zones match, it is assumed that the higher zone is empty.
4393 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
4394 * that ZONE_DMA32 has no pages. It is also assumed that a zone
4395 * starts where the previous one ended. For example, ZONE_DMA32 starts
4396 * at arch_max_dma_pfn.
4397 */
4398void __init free_area_init_nodes(unsigned long *max_zone_pfn)
4399{
4400	unsigned long nid;
4401	int i;
4402
4403	/* Sort early_node_map as initialisation assumes it is sorted */
4404	sort_node_map();
4405
4406	/* Record where the zone boundaries are */
4407	memset(arch_zone_lowest_possible_pfn, 0,
4408				sizeof(arch_zone_lowest_possible_pfn));
4409	memset(arch_zone_highest_possible_pfn, 0,
4410				sizeof(arch_zone_highest_possible_pfn));
4411	arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
4412	arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
4413	for (i = 1; i < MAX_NR_ZONES; i++) {
4414		if (i == ZONE_MOVABLE)
4415			continue;
4416		arch_zone_lowest_possible_pfn[i] =
4417			arch_zone_highest_possible_pfn[i-1];
4418		arch_zone_highest_possible_pfn[i] =
4419			max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
4420	}
4421	arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
4422	arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
4423
4424	/* Find the PFNs that ZONE_MOVABLE begins at in each node */
4425	memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
4426	find_zone_movable_pfns_for_nodes(zone_movable_pfn);
4427
4428	/* Print out the zone ranges */
4429	printk("Zone PFN ranges:\n");
4430	for (i = 0; i < MAX_NR_ZONES; i++) {
4431		if (i == ZONE_MOVABLE)
4432			continue;
4433		printk("  %-8s %0#10lx -> %0#10lx\n",
4434				zone_names[i],
4435				arch_zone_lowest_possible_pfn[i],
4436				arch_zone_highest_possible_pfn[i]);
4437	}
4438
4439	/* Print out the PFNs ZONE_MOVABLE begins at in each node */
4440	printk("Movable zone start PFN for each node\n");
4441	for (i = 0; i < MAX_NUMNODES; i++) {
4442		if (zone_movable_pfn[i])
4443			printk("  Node %d: %lu\n", i, zone_movable_pfn[i]);
4444	}
4445
4446	/* Print out the early_node_map[] */
4447	printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
4448	for (i = 0; i < nr_nodemap_entries; i++)
4449		printk("  %3d: %0#10lx -> %0#10lx\n", early_node_map[i].nid,
4450						early_node_map[i].start_pfn,
4451						early_node_map[i].end_pfn);
4452
4453	/* Initialise every node */
4454	mminit_verify_pageflags_layout();
4455	setup_nr_node_ids();
4456	for_each_online_node(nid) {
4457		pg_data_t *pgdat = NODE_DATA(nid);
4458		free_area_init_node(nid, NULL,
4459				find_min_pfn_for_node(nid), NULL);
4460
4461		/* Any memory on that node */
4462		if (pgdat->node_present_pages)
4463			node_set_state(nid, N_HIGH_MEMORY);
4464		check_for_regular_memory(pgdat);
4465	}
4466}
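
/*
 * Illustrative example (hypothetical PFNs, assuming a two-zone DMA/Normal
 * configuration): an arch passing max_zone_pfn = { 0x1000, 0x40000 } gets
 * ZONE_DMA spanning [min_pfn, 0x1000) and ZONE_NORMAL spanning
 * [0x1000, 0x40000); ZONE_MOVABLE is then carved out of the top of the
 * highest usable zone on each node according to zone_movable_pfn[].
 */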
4467
4468static int __init cmdline_parse_core(char *p, unsigned long *core)
4469{
4470	unsigned long long coremem;
4471	if (!p)
4472		return -EINVAL;
4473
4474	coremem = memparse(p, &p);
4475	*core = coremem >> PAGE_SHIFT;
4476
4477	/* Paranoid check that UL is enough for the coremem value */
4478	WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
4479
4480	return 0;
4481}
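
/*
 * Example (illustrative): booting with "kernelcore=512M" makes memparse()
 * return 512MB in bytes, so the core value becomes 512M >> PAGE_SHIFT,
 * i.e. 131072 pages with 4KiB pages.
 */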
4482
4483/*
4484 * kernelcore=size sets the amount of memory for use for allocations that
4485 * cannot be reclaimed or migrated.
4486 */
4487static int __init cmdline_parse_kernelcore(char *p)
4488{
4489	return cmdline_parse_core(p, &required_kernelcore);
4490}
4491
4492/*
4493 * movablecore=size sets the amount of memory for use for allocations that
4494 * can be reclaimed or migrated.
4495 */
4496static int __init cmdline_parse_movablecore(char *p)
4497{
4498	return cmdline_parse_core(p, &required_movablecore);
4499}
4500
4501early_param("kernelcore", cmdline_parse_kernelcore);
4502early_param("movablecore", cmdline_parse_movablecore);
4503
4504#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
4505
4506/**
4507 * set_dma_reserve - set the specified number of pages reserved in the first zone
4508 * @new_dma_reserve: The number of pages to mark reserved
4509 *
4510 * The per-cpu batchsize and zone watermarks are determined by present_pages.
4511 * In the DMA zone, a significant percentage may be consumed by kernel image
4512 * and other unfreeable allocations which can skew the watermarks badly. This
4513 * function may optionally be used to account for unfreeable pages in the
4514 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
4515 * smaller per-cpu batchsize.
4516 */
4517void __init set_dma_reserve(unsigned long new_dma_reserve)
4518{
4519	dma_reserve = new_dma_reserve;
4520}
4521
4522#ifndef CONFIG_NEED_MULTIPLE_NODES
4523struct pglist_data __refdata contig_page_data = {
4524#ifndef CONFIG_NO_BOOTMEM
4525 .bdata = &bootmem_node_data[0]
4526#endif
4527 };
4528EXPORT_SYMBOL(contig_page_data);
4529#endif
4530
4531void __init free_area_init(unsigned long *zones_size)
4532{
4533	free_area_init_node(0, zones_size,
4534			__pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
4535}
4536
4537static int page_alloc_cpu_notify(struct notifier_block *self,
4538				 unsigned long action, void *hcpu)
4539{
4540	int cpu = (unsigned long)hcpu;
4541
4542	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
4543		drain_pages(cpu);
4544
4545		/*
4546		 * Spill the event counters of the dead processor
4547		 * into the current processors event counters.
4548		 * This artificially elevates the count of the current
4549		 * processor.
4550		 */
4551		vm_events_fold_cpu(cpu);
4552
4553		/*
4554		 * Zero the differential counters of the dead processor
4555		 * so that the vm statistics are consistent.
4556		 *
4557		 * This is only okay since the processor is dead and cannot
4558		 * race with what we are doing.
4559		 */
4560		refresh_cpu_vm_stats(cpu);
4561	}
4562	return NOTIFY_OK;
4563}
4564
4565void __init page_alloc_init(void)
4566{
4567	hotcpu_notifier(page_alloc_cpu_notify, 0);
4568}
4569
4570/*
4571 * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
4572 *	or min_free_kbytes changes.
4573 */
4574static void calculate_totalreserve_pages(void)
4575{
4576	struct pglist_data *pgdat;
4577	unsigned long reserve_pages = 0;
4578	enum zone_type i, j;
4579
4580	for_each_online_pgdat(pgdat) {
4581		for (i = 0; i < MAX_NR_ZONES; i++) {
4582			struct zone *zone = pgdat->node_zones + i;
4583			unsigned long max = 0;
4584
4585			/* Find valid and maximum lowmem_reserve in the zone */
4586			for (j = i; j < MAX_NR_ZONES; j++) {
4587				if (zone->lowmem_reserve[j] > max)
4588					max = zone->lowmem_reserve[j];
4589			}
4590
4591			/* we treat the high watermark as reserved pages. */
4592			max += high_wmark_pages(zone);
4593
4594			if (max > zone->present_pages)
4595				max = zone->present_pages;
4596			reserve_pages += max;
4597		}
4598	}
4599	totalreserve_pages = reserve_pages;
4600}
4601
4602/*
4603 * setup_per_zone_lowmem_reserve - called whenever
4604 *	sysctl_lowmem_reserve_ratio changes.  Ensures that each zone
4605 *	has a correct pages reserved value, so an adequate number of
4606 *	pages are left in the zone after a successful __alloc_pages().
4607 */
4608static void setup_per_zone_lowmem_reserve(void)
4609{
4610	struct pglist_data *pgdat;
4611	enum zone_type j, idx;
4612
4613	for_each_online_pgdat(pgdat) {
4614		for (j = 0; j < MAX_NR_ZONES; j++) {
4615			struct zone *zone = pgdat->node_zones + j;
4616			unsigned long present_pages = zone->present_pages;
4617
4618			zone->lowmem_reserve[j] = 0;
4619
4620			idx = j;
4621			while (idx) {
4622				struct zone *lower_zone;
4623
4624				idx--;
4625
4626				if (sysctl_lowmem_reserve_ratio[idx] < 1)
4627					sysctl_lowmem_reserve_ratio[idx] = 1;
4628
4629				lower_zone = pgdat->node_zones + idx;
4630				lower_zone->lowmem_reserve[j] = present_pages /
4631					sysctl_lowmem_reserve_ratio[idx];
4632				present_pages += lower_zone->present_pages;
4633			}
4634		}
4635	}
4636
4637	/* update totalreserve_pages */
4638	calculate_totalreserve_pages();
4639}
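
/*
 * Worked example (illustrative, assuming only ZONE_DMA and ZONE_NORMAL):
 * if ZONE_NORMAL has N present pages and the sysctl_lowmem_reserve_ratio
 * entry for ZONE_DMA is 256, ZONE_DMA's lowmem_reserve[] entry for
 * ZONE_NORMAL allocations becomes N/256 pages. With more zones, each lower
 * zone divides the accumulated present pages of everything above it by its
 * own ratio.
 */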
4640
4641/**
4642 * setup_per_zone_wmarks - called when min_free_kbytes changes
4643 * or when memory is hot-{added|removed}
4644 *
4645 * Ensures that the watermark[min,low,high] values for each zone are set
4646 * correctly with respect to min_free_kbytes.
4647 */
4648void setup_per_zone_wmarks(void)
4649{
4650	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
4651	unsigned long lowmem_pages = 0;
4652	struct zone *zone;
4653	unsigned long flags;
4654
4655	/* Calculate total number of !ZONE_HIGHMEM pages */
4656	for_each_zone(zone) {
4657		if (!is_highmem(zone))
4658			lowmem_pages += zone->present_pages;
4659	}
4660
4661	for_each_zone(zone) {
4662		u64 tmp;
4663
4664		spin_lock_irqsave(&zone->lock, flags);
4665		tmp = (u64)pages_min * zone->present_pages;
4666		do_div(tmp, lowmem_pages);
4667		if (is_highmem(zone)) {
4668			/*
4669			 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
4670			 * need highmem pages, so cap pages_min to a small
4671			 * value here.
4672			 *
4673			 * The (WMARK_HIGH - WMARK_LOW) and (WMARK_LOW - WMARK_MIN)
4674			 * deltas control async page reclaim, and so should
4675			 * not be capped for highmem.
4676			 */
4677			int min_pages;
4678
4679			min_pages = zone->present_pages / 1024;
4680			if (min_pages < SWAP_CLUSTER_MAX)
4681				min_pages = SWAP_CLUSTER_MAX;
4682			if (min_pages > 128)
4683				min_pages = 128;
4684			zone->watermark[WMARK_MIN] = min_pages;
4685		} else {
4686			/*
4687			 * If it's a lowmem zone, reserve a number of pages
4688			 * proportionate to the zone's size.
4689			 */
4690			zone->watermark[WMARK_MIN] = tmp;
4691		}
4692
4693		zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) + (tmp >> 2);
4694		zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
4695		setup_zone_migrate_reserve(zone);
4696		spin_unlock_irqrestore(&zone->lock, flags);
4697	}
4698
4699	/* update totalreserve_pages */
4700	calculate_totalreserve_pages();
4701}
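
/*
 * Worked example (illustrative): with min_free_kbytes of 4096k and 4KiB
 * pages, pages_min is 1024. A lowmem zone holding half of all lowmem pages
 * gets tmp = 512, so WMARK_MIN = 512, WMARK_LOW = 512 + 128 = 640 and
 * WMARK_HIGH = 512 + 256 = 768 pages.
 */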
4702
4703/*
4704 * The inactive anon list should be small enough that the VM never has to
4705 * do too much work, but large enough that each inactive page has a chance
4706 * to be referenced again before it is swapped out.
4707 *
4708 * The inactive_anon ratio is the target ratio of ACTIVE_ANON to
4709 * INACTIVE_ANON pages on this zone's LRU, maintained by the
4710 * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of
4711 * the anonymous pages are kept on the inactive list.
4712 *
4713 * total     target    max
4714 * memory    ratio     inactive anon
4715 * -------------------------------------
4716 *   10MB       1         5MB
4717 *  100MB       1        50MB
4718 *    1GB       3       250MB
4719 *   10GB      10       0.9GB
4720 *  100GB      31         3GB
4721 *    1TB     101        10GB
4722 *   10TB     320        32GB
4723 */
4724void calculate_zone_inactive_ratio(struct zone *zone)
4725{
4726	unsigned int gb, ratio;
4727
4728	/* Zone size in gigabytes */
4729	gb = zone->present_pages >> (30 - PAGE_SHIFT);
4730	if (gb)
4731		ratio = int_sqrt(10 * gb);
4732	else
4733		ratio = 1;
4734
4735	zone->inactive_ratio = ratio;
4736}
4737
4738static void __init setup_per_zone_inactive_ratio(void)
4739{
4740	struct zone *zone;
4741
4742	for_each_zone(zone)
4743		calculate_zone_inactive_ratio(zone);
4744}
4745
4746/*
4747 * Initialise min_free_kbytes.
4748 *
4749 * For small machines we want it small (128k min).  For large machines
4750 * we want it large (64MB max).  But it is not linear, because network
4751 * bandwidth does not increase linearly with machine size.  We use
4752 *
4753 * 	min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
4754 *	min_free_kbytes = sqrt(lowmem_kbytes * 16)
4755 *
4756 * which yields
4757 *
4758 * 16MB:	512k
4759 * 32MB:	724k
4760 * 64MB:	1024k
4761 * 128MB:	1448k
4762 * 256MB:	2048k
4763 * 512MB:	2896k
4764 * 1024MB:	4096k
4765 * 2048MB:	5792k
4766 * 4096MB:	8192k
4767 * 8192MB:	11584k
4768 * 16384MB:	16384k
4769 */
4770static int __init init_per_zone_wmark_min(void)
4771{
4772	unsigned long lowmem_kbytes;
4773
4774	lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
4775
4776	min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
4777	if (min_free_kbytes < 128)
4778		min_free_kbytes = 128;
4779	if (min_free_kbytes > 65536)
4780		min_free_kbytes = 65536;
4781	setup_per_zone_wmarks();
4782	setup_per_zone_lowmem_reserve();
4783	setup_per_zone_inactive_ratio();
4784	return 0;
4785}
4786module_init(init_per_zone_wmark_min)
4787
4788/*
4789 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
4790 *	that we can call two helper functions whenever min_free_kbytes
4791 *	changes.
4792 */
4793int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
4794	void __user *buffer, size_t *length, loff_t *ppos)
4795{
4796	proc_dointvec(table, write, buffer, length, ppos);
4797	if (write)
4798		setup_per_zone_wmarks();
4799	return 0;
4800}
4801
4802#ifdef CONFIG_NUMA
4803int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
4804	void __user *buffer, size_t *length, loff_t *ppos)
4805{
4806	struct zone *zone;
4807	int rc;
4808
4809	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
4810	if (rc)
4811		return rc;
4812
4813	for_each_zone(zone)
4814		zone->min_unmapped_pages = (zone->present_pages *
4815				sysctl_min_unmapped_ratio) / 100;
4816	return 0;
4817}
4818
4819int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
4820	void __user *buffer, size_t *length, loff_t *ppos)
4821{
4822	struct zone *zone;
4823	int rc;
4824
4825	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
4826	if (rc)
4827		return rc;
4828
4829	for_each_zone(zone)
4830		zone->min_slab_pages = (zone->present_pages *
4831				sysctl_min_slab_ratio) / 100;
4832	return 0;
4833}
4834#endif
4835
4836/*
4837 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
4838 *	proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
4839 *	whenever sysctl_lowmem_reserve_ratio changes.
4840 *
4841 * The reserve ratio has no relation to the minimum watermarks. The
4842 * lowmem reserve ratio only makes sense as a function of the boot-time
4843 * zone sizes.
4844 */
4845int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
4846	void __user *buffer, size_t *length, loff_t *ppos)
4847{
4848	proc_dointvec_minmax(table, write, buffer, length, ppos);
4849	setup_per_zone_lowmem_reserve();
4850	return 0;
4851}
4852
4853/*
4854 * percpu_pagelist_fraction - changes the pcp->high for each zone on each
4855 * cpu.  It is the fraction of total pages in each zone that a hot per-cpu
4856 * pagelist can hold before it gets flushed back to the buddy allocator.
4857 */
4858
4859int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
4860	void __user *buffer, size_t *length, loff_t *ppos)
4861{
4862	struct zone *zone;
4863	unsigned int cpu;
4864	int ret;
4865
4866	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
4867	if (!write || (ret == -EINVAL))
4868		return ret;
4869	for_each_populated_zone(zone) {
4870		for_each_online_cpu(cpu) {
4871			unsigned long  high;
4872			high = zone->present_pages / percpu_pagelist_fraction;
4873			setup_pagelist_highmark(zone_pcp(zone, cpu), high);
4874		}
4875	}
4876	return 0;
4877}
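
/*
 * Example (illustrative): writing 8 to percpu_pagelist_fraction sets each
 * zone's pcp->high to present_pages/8, so a 1GB zone of 4KiB pages would
 * allow up to 32768 pages on each CPU's hot list before they are flushed
 * back to the buddy lists.
 */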
4878
4879int hashdist = HASHDIST_DEFAULT;
4880
4881#ifdef CONFIG_NUMA
4882static int __init set_hashdist(char *str)
4883{
4884	if (!str)
4885		return 0;
4886	hashdist = simple_strtoul(str, &str, 0);
4887	return 1;
4888}
4889__setup("hashdist=", set_hashdist);
4890#endif
4891
4892/*
4893 * allocate a large system hash table from bootmem
4894 * - it is assumed that the hash table must contain an exact power-of-2
4895 *   quantity of entries
4896 * - limit is the number of hash buckets, not the total allocation size
4897 */
4898void *__init alloc_large_system_hash(const char *tablename,
4899				     unsigned long bucketsize,
4900				     unsigned long numentries,
4901				     int scale,
4902				     int flags,
4903				     unsigned int *_hash_shift,
4904				     unsigned int *_hash_mask,
4905				     unsigned long limit)
4906{
4907	unsigned long long max = limit;
4908	unsigned long log2qty, size;
4909	void *table = NULL;
4910
4911	/* allow the kernel cmdline to have a say */
4912	if (!numentries) {
4913		/* round applicable memory size up to nearest megabyte */
4914		numentries = nr_kernel_pages;
4915		numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
4916		numentries >>= 20 - PAGE_SHIFT;
4917		numentries <<= 20 - PAGE_SHIFT;
4918
4919		/* limit to 1 bucket per 2^scale bytes of low memory */
4920		if (scale > PAGE_SHIFT)
4921			numentries >>= (scale - PAGE_SHIFT);
4922		else
4923			numentries <<= (PAGE_SHIFT - scale);
4924
4925		/* Make sure we've got at least a 0-order allocation.. */
4926		if (unlikely(flags & HASH_SMALL)) {
4927			/* Makes no sense without HASH_EARLY */
4928			WARN_ON(!(flags & HASH_EARLY));
4929			if (!(numentries >> *_hash_shift)) {
4930				numentries = 1UL << *_hash_shift;
4931				BUG_ON(!numentries);
4932			}
4933		} else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
4934			numentries = PAGE_SIZE / bucketsize;
4935	}
4936	numentries = roundup_pow_of_two(numentries);
4937
4938	/* limit allocation size to 1/16 total memory by default */
4939	if (max == 0) {
4940		max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
4941		do_div(max, bucketsize);
4942	}
4943
4944	if (numentries > max)
4945		numentries = max;
4946
4947	log2qty = ilog2(numentries);
4948
4949	do {
4950		size = bucketsize << log2qty;
4951		if (flags & HASH_EARLY)
4952			table = alloc_bootmem_nopanic(size);
4953		else if (hashdist)
4954			table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
4955		else {
4956			/*
4957			 * If bucketsize is not a power of two, we may free
4958			 * some pages at the end of the hash table, which
4959			 * alloc_pages_exact() does automatically.
4960			 */
4961			if (get_order(size) < MAX_ORDER) {
4962				table = alloc_pages_exact(size, GFP_ATOMIC);
4963				kmemleak_alloc(table, size, 1, GFP_ATOMIC);
4964			}
4965		}
4966	} while (!table && size > PAGE_SIZE && --log2qty);
4967
4968	if (!table)
4969		panic("Failed to allocate %s hash table\n", tablename);
4970
4971	printk(KERN_INFO "%s hash table entries: %d (order: %d, %lu bytes)\n",
4972	       tablename,
4973	       (1U << log2qty),
4974	       ilog2(size) - PAGE_SHIFT,
4975	       size);
4976
4977	if (_hash_shift)
4978		*_hash_shift = log2qty;
4979	if (_hash_mask)
4980		*_hash_mask = (1 << log2qty) - 1;
4981
4982	return table;
4983}
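
/*
 * Worked example (illustrative): for a table with 16-byte buckets, a scale
 * of 14 and no explicit numentries or limit, on a machine whose
 * nr_kernel_pages corresponds to roughly 1GB, numentries becomes about
 * 1GB >> 14 = 65536 buckets (already a power of two), so log2qty is 16 and
 * the allocation is 1MB.
 */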
4984
4985/* Return a pointer to the bitmap storing bits affecting a block of pages */
4986static inline unsigned long *get_pageblock_bitmap(struct zone *zone,
4987							unsigned long pfn)
4988{
4989#ifdef CONFIG_SPARSEMEM
4990	return __pfn_to_section(pfn)->pageblock_flags;
4991#else
4992	return zone->pageblock_flags;
4993#endif /* CONFIG_SPARSEMEM */
4994}
4995
4996static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
4997{
4998#ifdef CONFIG_SPARSEMEM
4999	pfn &= (PAGES_PER_SECTION-1);
5000	return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
5001#else
5002	pfn = pfn - zone->zone_start_pfn;
5003	return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
5004#endif /* CONFIG_SPARSEMEM */
5005}
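
/*
 * Example (illustrative): with pageblock_order == 9, a pfn 2048 pages into
 * the section (or zone, in the non-SPARSEMEM case) lies in pageblock 4, so
 * its flags start at bit index 4 * NR_PAGEBLOCK_BITS of the bitmap returned
 * by get_pageblock_bitmap().
 */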
5006
5007/**
5008 * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages
5009 * @page: The page within the block of interest
5010 * @start_bitidx: The first bit of interest to retrieve
5011 * @end_bitidx: The last bit of interest
5012 * returns pageblock_bits flags
5013 */
5014unsigned long get_pageblock_flags_group(struct page *page,
5015					int start_bitidx, int end_bitidx)
5016{
5017	struct zone *zone;
5018	unsigned long *bitmap;
5019	unsigned long pfn, bitidx;
5020	unsigned long flags = 0;
5021	unsigned long value = 1;
5022
5023	zone = page_zone(page);
5024	pfn = page_to_pfn(page);
5025	bitmap = get_pageblock_bitmap(zone, pfn);
5026	bitidx = pfn_to_bitidx(zone, pfn);
5027
5028	for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
5029		if (test_bit(bitidx + start_bitidx, bitmap))
5030			flags |= value;
5031
5032	return flags;
5033}
5034
5035/**
5036 * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages
5037 * @page: The page within the block of interest
5038 * @start_bitidx: The first bit of interest
5039 * @end_bitidx: The last bit of interest
5040 * @flags: The flags to set
5041 */
5042void set_pageblock_flags_group(struct page *page, unsigned long flags,
5043					int start_bitidx, int end_bitidx)
5044{
5045	struct zone *zone;
5046	unsigned long *bitmap;
5047	unsigned long pfn, bitidx;
5048	unsigned long value = 1;
5049
5050	zone = page_zone(page);
5051	pfn = page_to_pfn(page);
5052	bitmap = get_pageblock_bitmap(zone, pfn);
5053	bitidx = pfn_to_bitidx(zone, pfn);
5054	VM_BUG_ON(pfn < zone->zone_start_pfn);
5055	VM_BUG_ON(pfn >= zone->zone_start_pfn + zone->spanned_pages);
5056
5057	for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
5058		if (flags & value)
5059			__set_bit(bitidx + start_bitidx, bitmap);
5060		else
5061			__clear_bit(bitidx + start_bitidx, bitmap);
5062}
5063
5064/*
5065 * This is designed as a helper function; please see page_isolation.c as well.
5066 * It sets/clears a pageblock's type to ISOLATE.
5067 * The page allocator never allocates memory from an ISOLATE pageblock.
5068 */
5069
5070int set_migratetype_isolate(struct page *page)
5071{
5072	struct zone *zone;
5073	struct page *curr_page;
5074	unsigned long flags, pfn, iter;
5075	unsigned long immobile = 0;
5076	struct memory_isolate_notify arg;
5077	int notifier_ret;
5078	int ret = -EBUSY;
5079	int zone_idx;
5080
5081	zone = page_zone(page);
5082	zone_idx = zone_idx(zone);
5083
5084	spin_lock_irqsave(&zone->lock, flags);
5085	if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE ||
5086	    zone_idx == ZONE_MOVABLE) {
5087		ret = 0;
5088		goto out;
5089	}
5090
5091	pfn = page_to_pfn(page);
5092	arg.start_pfn = pfn;
5093	arg.nr_pages = pageblock_nr_pages;
5094	arg.pages_found = 0;
5095
5096	/*
5097	 * It may be possible to isolate a pageblock even if the
5098	 * migratetype is not MIGRATE_MOVABLE. The memory isolation
5099	 * notifier chain is used by balloon drivers to return the
5100	 * number of pages in a range that are held by the balloon
5101	 * driver to shrink memory. If all the pages are accounted for
5102	 * by balloons, are free, or on the LRU, isolation can continue.
5103	 * Later, for example, when the memory hotplug notifier runs, these
5104	 * pages reported as "can be isolated" should be isolated (freed)
5105	 * by the balloon driver through the memory notifier chain.
5106	 */
5107	notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
5108	notifier_ret = notifier_to_errno(notifier_ret);
5109	if (notifier_ret || !arg.pages_found)
5110		goto out;
5111
5112	for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++) {
5113		if (!pfn_valid_within(iter))
5114			continue;
5115
5116		curr_page = pfn_to_page(iter);
5117		if (!page_count(curr_page) || PageLRU(curr_page))
5118			continue;
5119
5120		immobile++;
5121	}
5122
5123	if (arg.pages_found == immobile)
5124		ret = 0;
5125
5126out:
5127	if (!ret) {
5128		set_pageblock_migratetype(page, MIGRATE_ISOLATE);
5129		move_freepages_block(zone, page, MIGRATE_ISOLATE);
5130	}
5131
5132	spin_unlock_irqrestore(&zone->lock, flags);
5133	if (!ret)
5134		drain_all_pages();
5135	return ret;
5136}
5137
5138void unset_migratetype_isolate(struct page *page)
5139{
5140	struct zone *zone;
5141	unsigned long flags;
5142	zone = page_zone(page);
5143	spin_lock_irqsave(&zone->lock, flags);
5144	if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
5145		goto out;
5146	set_pageblock_migratetype(page, MIGRATE_MOVABLE);
5147	move_freepages_block(zone, page, MIGRATE_MOVABLE);
5148out:
5149	spin_unlock_irqrestore(&zone->lock, flags);
5150}
5151
5152#ifdef CONFIG_MEMORY_HOTREMOVE
5153/*
5154 * All pages in the range must be isolated before calling this.
5155 */
5156void
5157__offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
5158{
5159	struct page *page;
5160	struct zone *zone;
5161	int order, i;
5162	unsigned long pfn;
5163	unsigned long flags;
5164	/* find the first valid pfn */
5165	for (pfn = start_pfn; pfn < end_pfn; pfn++)
5166		if (pfn_valid(pfn))
5167			break;
5168	if (pfn == end_pfn)
5169		return;
5170	zone = page_zone(pfn_to_page(pfn));
5171	spin_lock_irqsave(&zone->lock, flags);
5172	pfn = start_pfn;
5173	while (pfn < end_pfn) {
5174		if (!pfn_valid(pfn)) {
5175			pfn++;
5176			continue;
5177		}
5178		page = pfn_to_page(pfn);
5179		BUG_ON(page_count(page));
5180		BUG_ON(!PageBuddy(page));
5181		order = page_order(page);
5182#ifdef CONFIG_DEBUG_VM
5183		printk(KERN_INFO "remove from free list %lx %d %lx\n",
5184		       pfn, 1 << order, end_pfn);
5185#endif
5186		list_del(&page->lru);
5187		rmv_page_order(page);
5188		zone->free_area[order].nr_free--;
5189		__mod_zone_page_state(zone, NR_FREE_PAGES,
5190				      - (1UL << order));
5191		for (i = 0; i < (1 << order); i++)
5192			SetPageReserved((page+i));
5193		pfn += (1 << order);
5194	}
5195	spin_unlock_irqrestore(&zone->lock, flags);
5196}
5197#endif
5198
5199#ifdef CONFIG_MEMORY_FAILURE
5200bool is_free_buddy_page(struct page *page)
5201{
5202	struct zone *zone = page_zone(page);
5203	unsigned long pfn = page_to_pfn(page);
5204	unsigned long flags;
5205	int order;
5206
5207	spin_lock_irqsave(&zone->lock, flags);
5208	for (order = 0; order < MAX_ORDER; order++) {
5209		struct page *page_head = page - (pfn & ((1 << order) - 1));
5210
5211		if (PageBuddy(page_head) && page_order(page_head) >= order)
5212			break;
5213	}
5214	spin_unlock_irqrestore(&zone->lock, flags);
5215
5216	return order < MAX_ORDER;
5217}
5218#endif
5219