page_alloc.c revision fd39fc8561be33065306bdac0e30414e1e8ac8e1
1/*
2 *  linux/mm/page_alloc.c
3 *
4 *  Manages the free list, the system allocates free pages here.
5 *  Note that kmalloc() lives in slab.c
6 *
7 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
8 *  Swap reorganised 29.12.95, Stephen Tweedie
9 *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
10 *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
11 *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
12 *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
13 *  Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
14 *          (lots of bits borrowed from Ingo Molnar & Andrew Morton)
15 */
16
17#include <linux/config.h>
18#include <linux/stddef.h>
19#include <linux/mm.h>
20#include <linux/swap.h>
21#include <linux/interrupt.h>
22#include <linux/pagemap.h>
23#include <linux/bootmem.h>
24#include <linux/compiler.h>
25#include <linux/kernel.h>
26#include <linux/module.h>
27#include <linux/suspend.h>
28#include <linux/pagevec.h>
29#include <linux/blkdev.h>
30#include <linux/slab.h>
31#include <linux/notifier.h>
32#include <linux/topology.h>
33#include <linux/sysctl.h>
34#include <linux/cpu.h>
35#include <linux/cpuset.h>
36#include <linux/memory_hotplug.h>
37#include <linux/nodemask.h>
38#include <linux/vmalloc.h>
39#include <linux/mempolicy.h>
40#include <linux/stop_machine.h>
41
42#include <asm/tlbflush.h>
43#include <asm/div64.h>
44#include "internal.h"
45
46/*
47 * MCD - HACK: Find somewhere to initialize this EARLY, or make this
48 * initializer cleaner
49 */
50nodemask_t node_online_map __read_mostly = { { [0] = 1UL } };
51EXPORT_SYMBOL(node_online_map);
52nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL;
53EXPORT_SYMBOL(node_possible_map);
54unsigned long totalram_pages __read_mostly;
55unsigned long totalhigh_pages __read_mostly;
56unsigned long totalreserve_pages __read_mostly;
57long nr_swap_pages;
58int percpu_pagelist_fraction;
59
60static void __free_pages_ok(struct page *page, unsigned int order);
61
62/*
63 * results with 256, 32 in the lowmem_reserve sysctl:
64 *	1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
65 *	1G machine -> (16M dma, 784M normal, 224M high)
66 *	NORMAL allocation will leave 784M/256 of ram reserved in ZONE_DMA
67 *	HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
68 *	HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
69 *
70 * TBD: should special case ZONE_DMA32 machines here - in those we normally
71 * don't need any ZONE_NORMAL reservation
72 */
73int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 256, 32 };
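/*
 * Worked example (illustrative arithmetic only): with the default ratios
 * above, a NORMAL allocation on the 1G machine keeps 784M/256 ~= 3M of
 * ZONE_DMA reserved, while a HIGHMEM allocation keeps 224M/32 = 7M of
 * ZONE_NORMAL and (224M+784M)/256 ~= 4M of ZONE_DMA reserved.
 */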
74
75EXPORT_SYMBOL(totalram_pages);
76
77/*
78 * Used by page_zone() to look up the address of the struct zone whose
79 * id is encoded in the upper bits of page->flags
80 */
81struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly;
82EXPORT_SYMBOL(zone_table);
83
84static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" };
85int min_free_kbytes = 1024;
86
87unsigned long __meminitdata nr_kernel_pages;
88unsigned long __meminitdata nr_all_pages;
89
90#ifdef CONFIG_DEBUG_VM
91static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
92{
93	int ret = 0;
94	unsigned seq;
95	unsigned long pfn = page_to_pfn(page);
96
97	do {
98		seq = zone_span_seqbegin(zone);
99		if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
100			ret = 1;
101		else if (pfn < zone->zone_start_pfn)
102			ret = 1;
103	} while (zone_span_seqretry(zone, seq));
104
105	return ret;
106}
107
108static int page_is_consistent(struct zone *zone, struct page *page)
109{
110#ifdef CONFIG_HOLES_IN_ZONE
111	if (!pfn_valid(page_to_pfn(page)))
112		return 0;
113#endif
114	if (zone != page_zone(page))
115		return 0;
116
117	return 1;
118}
119/*
120 * Temporary debugging check for pages not lying within a given zone.
121 */
122static int bad_range(struct zone *zone, struct page *page)
123{
124	if (page_outside_zone_boundaries(zone, page))
125		return 1;
126	if (!page_is_consistent(zone, page))
127		return 1;
128
129	return 0;
130}
131
132#else
133static inline int bad_range(struct zone *zone, struct page *page)
134{
135	return 0;
136}
137#endif
138
139static void bad_page(struct page *page)
140{
141	printk(KERN_EMERG "Bad page state in process '%s'\n"
142		KERN_EMERG "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n"
143		KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
144		KERN_EMERG "Backtrace:\n",
145		current->comm, page, (int)(2*sizeof(unsigned long)),
146		(unsigned long)page->flags, page->mapping,
147		page_mapcount(page), page_count(page));
148	dump_stack();
149	page->flags &= ~(1 << PG_lru	|
150			1 << PG_private |
151			1 << PG_locked	|
152			1 << PG_active	|
153			1 << PG_dirty	|
154			1 << PG_reclaim |
155			1 << PG_slab    |
156			1 << PG_swapcache |
157			1 << PG_writeback |
158			1 << PG_buddy );
159	set_page_count(page, 0);
160	reset_page_mapcount(page);
161	page->mapping = NULL;
162	add_taint(TAINT_BAD_PAGE);
163}
164
165/*
166 * Higher-order pages are called "compound pages".  They are structured thusly:
167 *
168 * The first PAGE_SIZE page is called the "head page".
169 *
170 * The remaining PAGE_SIZE pages are called "tail pages".
171 *
172 * All pages have PG_compound set.  All pages have their ->private pointing at
173 * the head page (even the head page has this).
174 *
175 * The first tail page's ->lru.next holds the address of the compound page's
176 * destructor, free_compound_page().  Its ->lru.prev holds the order of allocation.
177 * This usage means that zero-order pages may not be compound.
178 */
179
180static void free_compound_page(struct page *page)
181{
182	__free_pages_ok(page, (unsigned long)page[1].lru.prev);
183}
184
185static void prep_compound_page(struct page *page, unsigned long order)
186{
187	int i;
188	int nr_pages = 1 << order;
189
190	page[1].lru.next = (void *)free_compound_page;	/* set dtor */
191	page[1].lru.prev = (void *)order;
192	for (i = 0; i < nr_pages; i++) {
193		struct page *p = page + i;
194
195		__SetPageCompound(p);
196		set_page_private(p, (unsigned long)page);
197	}
198}
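/*
 * Illustrative layout: after prep_compound_page(page, 2), page[0..3] all
 * have PG_compound set and page_private() pointing back at page[0], while
 * page[1].lru.next carries the destructor and page[1].lru.prev the order (2).
 */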
199
200static void destroy_compound_page(struct page *page, unsigned long order)
201{
202	int i;
203	int nr_pages = 1 << order;
204
205	if (unlikely((unsigned long)page[1].lru.prev != order))
206		bad_page(page);
207
208	for (i = 0; i < nr_pages; i++) {
209		struct page *p = page + i;
210
211		if (unlikely(!PageCompound(p) |
212				(page_private(p) != (unsigned long)page)))
213			bad_page(page);
214		__ClearPageCompound(p);
215	}
216}
217
218static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
219{
220	int i;
221
222	BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);
223	/*
224	 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
225	 * and __GFP_HIGHMEM from hard or soft interrupt context.
226	 */
227	BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
228	for (i = 0; i < (1 << order); i++)
229		clear_highpage(page + i);
230}
231
232/*
233 * function for dealing with page's order in buddy system.
234 * zone->lock is already acquired when we use these.
235 * So, we don't need atomic page->flags operations here.
236 */
237static inline unsigned long page_order(struct page *page)
238{
239	return page_private(page);
240}
241
242static inline void set_page_order(struct page *page, int order)
243{
244	set_page_private(page, order);
245	__SetPageBuddy(page);
246}
247
248static inline void rmv_page_order(struct page *page)
249{
250	__ClearPageBuddy(page);
251	set_page_private(page, 0);
252}
253
254/*
255 * Locate the struct page for both the matching buddy in our
256 * pair (buddy1) and the combined order O+1 page they form (page).
257 *
258 * 1) Any buddy B1 will have an order O twin B2 which satisfies
259 * the following equation:
260 *     B2 = B1 ^ (1 << O)
261 * For example, if the starting buddy (B1) is #8, its order-1
262 * buddy (B2) is #10:
263 *     B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
264 *
265 * 2) Any buddy B will have an order O+1 parent P which
266 * satisfies the following equation:
267 *     P = B & ~(1 << O)
268 *
269 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
270 */
271static inline struct page *
272__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
273{
274	unsigned long buddy_idx = page_idx ^ (1 << order);
275
276	return page + (buddy_idx - page_idx);
277}
278
279static inline unsigned long
280__find_combined_index(unsigned long page_idx, unsigned int order)
281{
282	return (page_idx & ~(1 << order));
283}
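/*
 * Worked example (illustrative indices): for page_idx 8 at order 1,
 * __page_find_buddy() points at index 8 ^ (1 << 1) = 10 and
 * __find_combined_index() gives 8 & ~(1 << 1) = 8, i.e. the merged
 * order-2 block starts at index 8.
 */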
284
285/*
286 * This function checks whether a page is free && is the buddy.
287 * We can coalesce a page and its buddy if
288 * (a) the buddy is not in a hole &&
289 * (b) the buddy is in the buddy system &&
290 * (c) a page and its buddy have the same order &&
291 * (d) a page and its buddy are in the same zone.
292 *
293 * For recording whether a page is in the buddy system, we use PG_buddy.
294 * Setting, clearing, and testing PG_buddy is serialized by zone->lock.
295 *
296 * For recording page's order, we use page_private(page).
297 */
298static inline int page_is_buddy(struct page *page, struct page *buddy,
299								int order)
300{
301#ifdef CONFIG_HOLES_IN_ZONE
302	if (!pfn_valid(page_to_pfn(buddy)))
303		return 0;
304#endif
305
306	if (page_zone_id(page) != page_zone_id(buddy))
307		return 0;
308
309	if (PageBuddy(buddy) && page_order(buddy) == order) {
310		BUG_ON(page_count(buddy) != 0);
311		return 1;
312	}
313	return 0;
314}
315
316/*
317 * Freeing function for a buddy system allocator.
318 *
319 * The concept of a buddy system is to maintain a direct-mapped table
320 * (containing bit values) for memory blocks of various "orders".
321 * The bottom level table contains the map for the smallest allocatable
322 * units of memory (here, pages), and each level above it describes
323 * pairs of units from the levels below, hence, "buddies".
324 * At a high level, all that happens here is marking the table entry
325 * at the bottom level available, and propagating the changes upward
326 * as necessary, plus some accounting needed to play nicely with other
327 * parts of the VM system.
328 * At each level, we keep a list of pages, which are heads of contiguous
329 * runs of free pages of length (1 << order), marked with PG_buddy. A page's
330 * order is recorded in the page_private(page) field.
331 * So when we are allocating or freeing one, we can derive the state of the
332 * other.  That is, if we allocate a small block, and both were
333 * free, the remainder of the region must be split into blocks.
334 * If a block is freed, and its buddy is also free, then this
335 * triggers coalescing into a block of larger size.
336 *
337 * -- wli
338 */
339
340static inline void __free_one_page(struct page *page,
341		struct zone *zone, unsigned int order)
342{
343	unsigned long page_idx;
344	int order_size = 1 << order;
345
346	if (unlikely(PageCompound(page)))
347		destroy_compound_page(page, order);
348
349	page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
350
351	BUG_ON(page_idx & (order_size - 1));
352	BUG_ON(bad_range(zone, page));
353
354	zone->free_pages += order_size;
355	while (order < MAX_ORDER-1) {
356		unsigned long combined_idx;
357		struct free_area *area;
358		struct page *buddy;
359
360		buddy = __page_find_buddy(page, page_idx, order);
361		if (!page_is_buddy(page, buddy, order))
362			break;		/* Move the buddy up one level. */
363
364		list_del(&buddy->lru);
365		area = zone->free_area + order;
366		area->nr_free--;
367		rmv_page_order(buddy);
368		combined_idx = __find_combined_index(page_idx, order);
369		page = page + (combined_idx - page_idx);
370		page_idx = combined_idx;
371		order++;
372	}
373	set_page_order(page, order);
374	list_add(&page->lru, &zone->free_area[order].free_list);
375	zone->free_area[order].nr_free++;
376}
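/*
 * Illustrative walk-through of the merge loop above: freeing an order-0
 * page at index 10 whose buddy (index 11) is free pulls the buddy off the
 * order-0 list and forms an order-1 block at index 10; if index 8 holds a
 * free order-1 block as well, another pass merges them into an order-2
 * block at index 8 before it goes onto the free list.
 */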
377
378static inline int free_pages_check(struct page *page)
379{
380	if (unlikely(page_mapcount(page) |
381		(page->mapping != NULL)  |
382		(page_count(page) != 0)  |
383		(page->flags & (
384			1 << PG_lru	|
385			1 << PG_private |
386			1 << PG_locked	|
387			1 << PG_active	|
388			1 << PG_reclaim	|
389			1 << PG_slab	|
390			1 << PG_swapcache |
391			1 << PG_writeback |
392			1 << PG_reserved |
393			1 << PG_buddy ))))
394		bad_page(page);
395	if (PageDirty(page))
396		__ClearPageDirty(page);
397	/*
398	 * For now, we report if PG_reserved was found set, but do not
399	 * clear it, and do not free the page.  But we shall soon need
400	 * to do more, for when the ZERO_PAGE count wraps negative.
401	 */
402	return PageReserved(page);
403}
404
405/*
406 * Frees a list of pages.
407 * Assumes all pages on list are in same zone, and of same order.
408 * count is the number of pages to free.
409 *
410 * If the zone was previously in an "all pages pinned" state then look to
411 * see if this freeing clears that state.
412 *
413 * And clear the zone's pages_scanned counter, to hold off the "all pages are
414 * pinned" detection logic.
415 */
416static void free_pages_bulk(struct zone *zone, int count,
417					struct list_head *list, int order)
418{
419	spin_lock(&zone->lock);
420	zone->all_unreclaimable = 0;
421	zone->pages_scanned = 0;
422	while (count--) {
423		struct page *page;
424
425		BUG_ON(list_empty(list));
426		page = list_entry(list->prev, struct page, lru);
427		/* have to delete it as __free_one_page list manipulates */
428		list_del(&page->lru);
429		__free_one_page(page, zone, order);
430	}
431	spin_unlock(&zone->lock);
432}
433
434static void free_one_page(struct zone *zone, struct page *page, int order)
435{
436	LIST_HEAD(list);
437	list_add(&page->lru, &list);
438	free_pages_bulk(zone, 1, &list, order);
439}
440
441static void __free_pages_ok(struct page *page, unsigned int order)
442{
443	unsigned long flags;
444	int i;
445	int reserved = 0;
446
447	arch_free_page(page, order);
448	if (!PageHighMem(page))
449		debug_check_no_locks_freed(page_address(page),
450					   PAGE_SIZE<<order);
451
452	for (i = 0 ; i < (1 << order) ; ++i)
453		reserved += free_pages_check(page + i);
454	if (reserved)
455		return;
456
457	kernel_map_pages(page, 1 << order, 0);
458	local_irq_save(flags);
459	__mod_page_state(pgfree, 1 << order);
460	free_one_page(page_zone(page), page, order);
461	local_irq_restore(flags);
462}
463
464/*
465 * permit the bootmem allocator to evade page validation on high-order frees
466 */
467void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
468{
469	if (order == 0) {
470		__ClearPageReserved(page);
471		set_page_count(page, 0);
472		set_page_refcounted(page);
473		__free_page(page);
474	} else {
475		int loop;
476
477		prefetchw(page);
478		for (loop = 0; loop < BITS_PER_LONG; loop++) {
479			struct page *p = &page[loop];
480
481			if (loop + 1 < BITS_PER_LONG)
482				prefetchw(p + 1);
483			__ClearPageReserved(p);
484			set_page_count(p, 0);
485		}
486
487		set_page_refcounted(page);
488		__free_pages(page, order);
489	}
490}
491
492
493/*
494 * The order of subdivision here is critical for the IO subsystem.
495 * Please do not alter this order without good reasons and regression
496 * testing. Specifically, as large blocks of memory are subdivided,
497 * the order in which smaller blocks are delivered depends on the order
498 * they're subdivided in this function. This is the primary factor
499 * influencing the order in which pages are delivered to the IO
500 * subsystem according to empirical testing, and this is also justified
501 * by considering the behavior of a buddy system containing a single
502 * large block of memory acted on by a series of small allocations.
503 * This behavior is a critical factor in sglist merging's success.
504 *
505 * -- wli
506 */
507static inline void expand(struct zone *zone, struct page *page,
508 	int low, int high, struct free_area *area)
509{
510	unsigned long size = 1 << high;
511
512	while (high > low) {
513		area--;
514		high--;
515		size >>= 1;
516		BUG_ON(bad_range(zone, &page[size]));
517		list_add(&page[size].lru, &area->free_list);
518		area->nr_free++;
519		set_page_order(&page[size], high);
520	}
521}
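/*
 * Illustrative split: expand(zone, page, 0, 3, area) on an order-3 block
 * leaves page[0] for the caller and queues page[4] on the order-2 free
 * list, page[2] on the order-1 list and page[1] on the order-0 list.
 */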
522
523/*
524 * This page is about to be returned from the page allocator
525 */
526static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
527{
528	if (unlikely(page_mapcount(page) |
529		(page->mapping != NULL)  |
530		(page_count(page) != 0)  |
531		(page->flags & (
532			1 << PG_lru	|
533			1 << PG_private	|
534			1 << PG_locked	|
535			1 << PG_active	|
536			1 << PG_dirty	|
537			1 << PG_reclaim	|
538			1 << PG_slab    |
539			1 << PG_swapcache |
540			1 << PG_writeback |
541			1 << PG_reserved |
542			1 << PG_buddy ))))
543		bad_page(page);
544
545	/*
546	 * For now, we report if PG_reserved was found set, but do not
547	 * clear it, and do not allocate the page: as a safety net.
548	 */
549	if (PageReserved(page))
550		return 1;
551
552	page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
553			1 << PG_referenced | 1 << PG_arch_1 |
554			1 << PG_checked | 1 << PG_mappedtodisk);
555	set_page_private(page, 0);
556	set_page_refcounted(page);
557	kernel_map_pages(page, 1 << order, 1);
558
559	if (gfp_flags & __GFP_ZERO)
560		prep_zero_page(page, order, gfp_flags);
561
562	if (order && (gfp_flags & __GFP_COMP))
563		prep_compound_page(page, order);
564
565	return 0;
566}
567
568/*
569 * Do the hard work of removing an element from the buddy allocator.
570 * Call me with the zone->lock already held.
571 */
572static struct page *__rmqueue(struct zone *zone, unsigned int order)
573{
574	struct free_area * area;
575	unsigned int current_order;
576	struct page *page;
577
578	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
579		area = zone->free_area + current_order;
580		if (list_empty(&area->free_list))
581			continue;
582
583		page = list_entry(area->free_list.next, struct page, lru);
584		list_del(&page->lru);
585		rmv_page_order(page);
586		area->nr_free--;
587		zone->free_pages -= 1UL << order;
588		expand(zone, page, order, current_order, area);
589		return page;
590	}
591
592	return NULL;
593}
594
595/*
596 * Obtain a specified number of elements from the buddy allocator, all under
597 * a single hold of the lock, for efficiency.  Add them to the supplied list.
598 * Returns the number of new pages which were placed at *list.
599 */
600static int rmqueue_bulk(struct zone *zone, unsigned int order,
601			unsigned long count, struct list_head *list)
602{
603	int i;
604
605	spin_lock(&zone->lock);
606	for (i = 0; i < count; ++i) {
607		struct page *page = __rmqueue(zone, order);
608		if (unlikely(page == NULL))
609			break;
610		list_add_tail(&page->lru, list);
611	}
612	spin_unlock(&zone->lock);
613	return i;
614}
615
616#ifdef CONFIG_NUMA
617/*
618 * Called from the slab reaper to drain pagesets on a particular node that
619 * belong to the currently executing processor.
620 * Note that this function must be called with the thread pinned to
621 * a single processor.
622 */
623void drain_node_pages(int nodeid)
624{
625	int i, z;
626	unsigned long flags;
627
628	for (z = 0; z < MAX_NR_ZONES; z++) {
629		struct zone *zone = NODE_DATA(nodeid)->node_zones + z;
630		struct per_cpu_pageset *pset;
631
632		pset = zone_pcp(zone, smp_processor_id());
633		for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
634			struct per_cpu_pages *pcp;
635
636			pcp = &pset->pcp[i];
637			if (pcp->count) {
638				local_irq_save(flags);
639				free_pages_bulk(zone, pcp->count, &pcp->list, 0);
640				pcp->count = 0;
641				local_irq_restore(flags);
642			}
643		}
644	}
645}
646#endif
647
648#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
649static void __drain_pages(unsigned int cpu)
650{
651	unsigned long flags;
652	struct zone *zone;
653	int i;
654
655	for_each_zone(zone) {
656		struct per_cpu_pageset *pset;
657
658		pset = zone_pcp(zone, cpu);
659		for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
660			struct per_cpu_pages *pcp;
661
662			pcp = &pset->pcp[i];
663			local_irq_save(flags);
664			free_pages_bulk(zone, pcp->count, &pcp->list, 0);
665			pcp->count = 0;
666			local_irq_restore(flags);
667		}
668	}
669}
670#endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */
671
672#ifdef CONFIG_PM
673
674void mark_free_pages(struct zone *zone)
675{
676	unsigned long zone_pfn, flags;
677	int order;
678	struct list_head *curr;
679
680	if (!zone->spanned_pages)
681		return;
682
683	spin_lock_irqsave(&zone->lock, flags);
684	for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
685		ClearPageNosaveFree(pfn_to_page(zone_pfn + zone->zone_start_pfn));
686
687	for (order = MAX_ORDER - 1; order >= 0; --order)
688		list_for_each(curr, &zone->free_area[order].free_list) {
689			unsigned long start_pfn, i;
690
691			start_pfn = page_to_pfn(list_entry(curr, struct page, lru));
692
693			for (i=0; i < (1<<order); i++)
694				SetPageNosaveFree(pfn_to_page(start_pfn+i));
695	}
696	spin_unlock_irqrestore(&zone->lock, flags);
697}
698
699/*
700 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
701 */
702void drain_local_pages(void)
703{
704	unsigned long flags;
705
706	local_irq_save(flags);
707	__drain_pages(smp_processor_id());
708	local_irq_restore(flags);
709}
710#endif /* CONFIG_PM */
711
712static void zone_statistics(struct zonelist *zonelist, struct zone *z, int cpu)
713{
714#ifdef CONFIG_NUMA
715	pg_data_t *pg = z->zone_pgdat;
716	pg_data_t *orig = zonelist->zones[0]->zone_pgdat;
717	struct per_cpu_pageset *p;
718
719	p = zone_pcp(z, cpu);
720	if (pg == orig) {
721		p->numa_hit++;
722	} else {
723		p->numa_miss++;
724		zone_pcp(zonelist->zones[0], cpu)->numa_foreign++;
725	}
726	if (pg == NODE_DATA(numa_node_id()))
727		p->local_node++;
728	else
729		p->other_node++;
730#endif
731}
732
733/*
734 * Free a 0-order page
735 */
736static void fastcall free_hot_cold_page(struct page *page, int cold)
737{
738	struct zone *zone = page_zone(page);
739	struct per_cpu_pages *pcp;
740	unsigned long flags;
741
742	arch_free_page(page, 0);
743
744	if (PageAnon(page))
745		page->mapping = NULL;
746	if (free_pages_check(page))
747		return;
748
749	kernel_map_pages(page, 1, 0);
750
751	pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
752	local_irq_save(flags);
753	__inc_page_state(pgfree);
754	list_add(&page->lru, &pcp->list);
755	pcp->count++;
756	if (pcp->count >= pcp->high) {
757		free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
758		pcp->count -= pcp->batch;
759	}
760	local_irq_restore(flags);
761	put_cpu();
762}
763
764void fastcall free_hot_page(struct page *page)
765{
766	free_hot_cold_page(page, 0);
767}
768
769void fastcall free_cold_page(struct page *page)
770{
771	free_hot_cold_page(page, 1);
772}
773
774/*
775 * split_page takes a non-compound higher-order page, and splits it into
776 * n (1<<order) order-0 sub-pages: page[0..n-1].
777 * Each sub-page must be freed individually.
778 *
779 * Note: this is probably too low level an operation for use in drivers.
780 * Please consult with lkml before using this in your driver.
781 */
782void split_page(struct page *page, unsigned int order)
783{
784	int i;
785
786	BUG_ON(PageCompound(page));
787	BUG_ON(!page_count(page));
788	for (i = 1; i < (1 << order); i++)
789		set_page_refcounted(page + i);
790}
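/*
 * Usage sketch (hypothetical caller): after allocating an order-2 page,
 * split_page(page, 2) lets the caller hand out page, page + 1, page + 2
 * and page + 3 separately and free each one later with __free_page().
 */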
791
792/*
793 * Really, prep_compound_page() should be called from __rmqueue_bulk().  But
794 * we cheat by calling it from here, in the order > 0 path.  Saves a branch
795 * or two.
796 */
797static struct page *buffered_rmqueue(struct zonelist *zonelist,
798			struct zone *zone, int order, gfp_t gfp_flags)
799{
800	unsigned long flags;
801	struct page *page;
802	int cold = !!(gfp_flags & __GFP_COLD);
803	int cpu;
804
805again:
806	cpu  = get_cpu();
807	if (likely(order == 0)) {
808		struct per_cpu_pages *pcp;
809
810		pcp = &zone_pcp(zone, cpu)->pcp[cold];
811		local_irq_save(flags);
812		if (!pcp->count) {
813			pcp->count += rmqueue_bulk(zone, 0,
814						pcp->batch, &pcp->list);
815			if (unlikely(!pcp->count))
816				goto failed;
817		}
818		page = list_entry(pcp->list.next, struct page, lru);
819		list_del(&page->lru);
820		pcp->count--;
821	} else {
822		spin_lock_irqsave(&zone->lock, flags);
823		page = __rmqueue(zone, order);
824		spin_unlock(&zone->lock);
825		if (!page)
826			goto failed;
827	}
828
829	__mod_page_state_zone(zone, pgalloc, 1 << order);
830	zone_statistics(zonelist, zone, cpu);
831	local_irq_restore(flags);
832	put_cpu();
833
834	BUG_ON(bad_range(zone, page));
835	if (prep_new_page(page, order, gfp_flags))
836		goto again;
837	return page;
838
839failed:
840	local_irq_restore(flags);
841	put_cpu();
842	return NULL;
843}
844
845#define ALLOC_NO_WATERMARKS	0x01 /* don't check watermarks at all */
846#define ALLOC_WMARK_MIN		0x02 /* use pages_min watermark */
847#define ALLOC_WMARK_LOW		0x04 /* use pages_low watermark */
848#define ALLOC_WMARK_HIGH	0x08 /* use pages_high watermark */
849#define ALLOC_HARDER		0x10 /* try to alloc harder */
850#define ALLOC_HIGH		0x20 /* __GFP_HIGH set */
851#define ALLOC_CPUSET		0x40 /* check for correct cpuset */
852
853/*
854 * Return 1 if free pages are above 'mark'. This takes into account the order
855 * of the allocation.
856 */
857int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
858		      int classzone_idx, int alloc_flags)
859{
860	/* free_pages may go negative - that's OK */
861	long min = mark, free_pages = z->free_pages - (1 << order) + 1;
862	int o;
863
864	if (alloc_flags & ALLOC_HIGH)
865		min -= min / 2;
866	if (alloc_flags & ALLOC_HARDER)
867		min -= min / 4;
868
869	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
870		return 0;
871	for (o = 0; o < order; o++) {
872		/* At the next order, this order's pages become unavailable */
873		free_pages -= z->free_area[o].nr_free << o;
874
875		/* Require fewer higher order pages to be free */
876		min >>= 1;
877
878		if (free_pages <= min)
879			return 0;
880	}
881	return 1;
882}
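/*
 * Worked example (illustrative numbers): an order-2 request against
 * mark 128 with no lowmem_reserve and no ALLOC_HIGH/ALLOC_HARDER, in a
 * zone with 200 free pages, 40 free order-0 blocks and 20 free order-1
 * blocks, passes: 197 > 128, then 157 > 64 once order-0 pages are
 * discounted, then 117 > 32 once order-1 pages are discounted.
 */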
883
884/*
885 * get_page_from_freelist goes through the zonelist trying to allocate
886 * a page.
887 */
888static struct page *
889get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
890		struct zonelist *zonelist, int alloc_flags)
891{
892	struct zone **z = zonelist->zones;
893	struct page *page = NULL;
894	int classzone_idx = zone_idx(*z);
895
896	/*
897	 * Go through the zonelist once, looking for a zone with enough free.
898	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
899	 */
900	do {
901		if ((alloc_flags & ALLOC_CPUSET) &&
902				!cpuset_zone_allowed(*z, gfp_mask))
903			continue;
904
905		if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
906			unsigned long mark;
907			if (alloc_flags & ALLOC_WMARK_MIN)
908				mark = (*z)->pages_min;
909			else if (alloc_flags & ALLOC_WMARK_LOW)
910				mark = (*z)->pages_low;
911			else
912				mark = (*z)->pages_high;
913			if (!zone_watermark_ok(*z, order, mark,
914				    classzone_idx, alloc_flags))
915				if (!zone_reclaim_mode ||
916				    !zone_reclaim(*z, gfp_mask, order))
917					continue;
918		}
919
920		page = buffered_rmqueue(zonelist, *z, order, gfp_mask);
921		if (page) {
922			break;
923		}
924	} while (*(++z) != NULL);
925	return page;
926}
927
928/*
929 * This is the 'heart' of the zoned buddy allocator.
930 */
931struct page * fastcall
932__alloc_pages(gfp_t gfp_mask, unsigned int order,
933		struct zonelist *zonelist)
934{
935	const gfp_t wait = gfp_mask & __GFP_WAIT;
936	struct zone **z;
937	struct page *page;
938	struct reclaim_state reclaim_state;
939	struct task_struct *p = current;
940	int do_retry;
941	int alloc_flags;
942	int did_some_progress;
943
944	might_sleep_if(wait);
945
946restart:
947	z = zonelist->zones;  /* the list of zones suitable for gfp_mask */
948
949	if (unlikely(*z == NULL)) {
950		/* Should this ever happen?? */
951		return NULL;
952	}
953
954	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
955				zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET);
956	if (page)
957		goto got_pg;
958
959	do {
960		wakeup_kswapd(*z, order);
961	} while (*(++z));
962
963	/*
964	 * OK, we're below the kswapd watermark and have kicked background
965	 * reclaim. Now things get more complex, so set up alloc_flags according
966	 * to how we want to proceed.
967	 *
968	 * The caller may dip into page reserves a bit more if the caller
969	 * cannot run direct reclaim, or if the caller has realtime scheduling
970	 * policy or is asking for __GFP_HIGH memory.  GFP_ATOMIC requests will
971	 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
972	 */
973	alloc_flags = ALLOC_WMARK_MIN;
974	if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
975		alloc_flags |= ALLOC_HARDER;
976	if (gfp_mask & __GFP_HIGH)
977		alloc_flags |= ALLOC_HIGH;
978	if (wait)
979		alloc_flags |= ALLOC_CPUSET;
980
981	/*
982	 * Go through the zonelist again. Let __GFP_HIGH and allocations
983	 * coming from realtime tasks go deeper into reserves.
984	 *
985	 * This is the last chance, in general, before the goto nopage.
986	 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
987	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
988	 */
989	page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags);
990	if (page)
991		goto got_pg;
992
993	/* This allocation should allow future memory freeing. */
994
995	if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
996			&& !in_interrupt()) {
997		if (!(gfp_mask & __GFP_NOMEMALLOC)) {
998nofail_alloc:
999			/* go through the zonelist yet again, ignoring mins */
1000			page = get_page_from_freelist(gfp_mask, order,
1001				zonelist, ALLOC_NO_WATERMARKS);
1002			if (page)
1003				goto got_pg;
1004			if (gfp_mask & __GFP_NOFAIL) {
1005				blk_congestion_wait(WRITE, HZ/50);
1006				goto nofail_alloc;
1007			}
1008		}
1009		goto nopage;
1010	}
1011
1012	/* Atomic allocations - we can't balance anything */
1013	if (!wait)
1014		goto nopage;
1015
1016rebalance:
1017	cond_resched();
1018
1019	/* We now go into synchronous reclaim */
1020	cpuset_memory_pressure_bump();
1021	p->flags |= PF_MEMALLOC;
1022	reclaim_state.reclaimed_slab = 0;
1023	p->reclaim_state = &reclaim_state;
1024
1025	did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask);
1026
1027	p->reclaim_state = NULL;
1028	p->flags &= ~PF_MEMALLOC;
1029
1030	cond_resched();
1031
1032	if (likely(did_some_progress)) {
1033		page = get_page_from_freelist(gfp_mask, order,
1034						zonelist, alloc_flags);
1035		if (page)
1036			goto got_pg;
1037	} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
1038		/*
1039		 * Go through the zonelist yet one more time, keeping a
1040		 * very high watermark here. This is only to catch a
1041		 * parallel OOM kill; we must fail if we're still
1042		 * under heavy pressure.
1043		 */
1044		page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
1045				zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET);
1046		if (page)
1047			goto got_pg;
1048
1049		out_of_memory(zonelist, gfp_mask, order);
1050		goto restart;
1051	}
1052
1053	/*
1054	 * Don't let big-order allocations loop unless the caller explicitly
1055	 * requests that.  Wait for some write requests to complete then retry.
1056	 *
1057	 * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order
1058	 * <= 3, but that may not be true in other implementations.
1059	 */
1060	do_retry = 0;
1061	if (!(gfp_mask & __GFP_NORETRY)) {
1062		if ((order <= 3) || (gfp_mask & __GFP_REPEAT))
1063			do_retry = 1;
1064		if (gfp_mask & __GFP_NOFAIL)
1065			do_retry = 1;
1066	}
1067	if (do_retry) {
1068		blk_congestion_wait(WRITE, HZ/50);
1069		goto rebalance;
1070	}
1071
1072nopage:
1073	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
1074		printk(KERN_WARNING "%s: page allocation failure."
1075			" order:%d, mode:0x%x\n",
1076			p->comm, order, gfp_mask);
1077		dump_stack();
1078		show_mem();
1079	}
1080got_pg:
1081	return page;
1082}
1083
1084EXPORT_SYMBOL(__alloc_pages);
1085
1086/*
1087 * Common helper functions.
1088 */
1089fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
1090{
1091	struct page * page;
1092	page = alloc_pages(gfp_mask, order);
1093	if (!page)
1094		return 0;
1095	return (unsigned long) page_address(page);
1096}
1097
1098EXPORT_SYMBOL(__get_free_pages);
1099
1100fastcall unsigned long get_zeroed_page(gfp_t gfp_mask)
1101{
1102	struct page * page;
1103
1104	/*
1105	 * get_zeroed_page() returns a directly mapped kernel address, which
1106	 * cannot represent a highmem page
1107	 */
1108	BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
1109
1110	page = alloc_pages(gfp_mask | __GFP_ZERO, 0);
1111	if (page)
1112		return (unsigned long) page_address(page);
1113	return 0;
1114}
1115
1116EXPORT_SYMBOL(get_zeroed_page);
1117
1118void __pagevec_free(struct pagevec *pvec)
1119{
1120	int i = pagevec_count(pvec);
1121
1122	while (--i >= 0)
1123		free_hot_cold_page(pvec->pages[i], pvec->cold);
1124}
1125
1126fastcall void __free_pages(struct page *page, unsigned int order)
1127{
1128	if (put_page_testzero(page)) {
1129		if (order == 0)
1130			free_hot_page(page);
1131		else
1132			__free_pages_ok(page, order);
1133	}
1134}
1135
1136EXPORT_SYMBOL(__free_pages);
1137
1138fastcall void free_pages(unsigned long addr, unsigned int order)
1139{
1140	if (addr != 0) {
1141		BUG_ON(!virt_addr_valid((void *)addr));
1142		__free_pages(virt_to_page((void *)addr), order);
1143	}
1144}
1145
1146EXPORT_SYMBOL(free_pages);
1147
1148/*
1149 * Total amount of free (allocatable) RAM:
1150 */
1151unsigned int nr_free_pages(void)
1152{
1153	unsigned int sum = 0;
1154	struct zone *zone;
1155
1156	for_each_zone(zone)
1157		sum += zone->free_pages;
1158
1159	return sum;
1160}
1161
1162EXPORT_SYMBOL(nr_free_pages);
1163
1164#ifdef CONFIG_NUMA
1165unsigned int nr_free_pages_pgdat(pg_data_t *pgdat)
1166{
1167	unsigned int i, sum = 0;
1168
1169	for (i = 0; i < MAX_NR_ZONES; i++)
1170		sum += pgdat->node_zones[i].free_pages;
1171
1172	return sum;
1173}
1174#endif
1175
1176static unsigned int nr_free_zone_pages(int offset)
1177{
1178	/* Just pick one node, since fallback list is circular */
1179	pg_data_t *pgdat = NODE_DATA(numa_node_id());
1180	unsigned int sum = 0;
1181
1182	struct zonelist *zonelist = pgdat->node_zonelists + offset;
1183	struct zone **zonep = zonelist->zones;
1184	struct zone *zone;
1185
1186	for (zone = *zonep++; zone; zone = *zonep++) {
1187		unsigned long size = zone->present_pages;
1188		unsigned long high = zone->pages_high;
1189		if (size > high)
1190			sum += size - high;
1191	}
1192
1193	return sum;
1194}
1195
1196/*
1197 * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
1198 */
1199unsigned int nr_free_buffer_pages(void)
1200{
1201	return nr_free_zone_pages(gfp_zone(GFP_USER));
1202}
1203
1204/*
1205 * Amount of free RAM allocatable within all zones
1206 */
1207unsigned int nr_free_pagecache_pages(void)
1208{
1209	return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER));
1210}
1211
1212#ifdef CONFIG_HIGHMEM
1213unsigned int nr_free_highpages (void)
1214{
1215	pg_data_t *pgdat;
1216	unsigned int pages = 0;
1217
1218	for_each_online_pgdat(pgdat)
1219		pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
1220
1221	return pages;
1222}
1223#endif
1224
1225#ifdef CONFIG_NUMA
1226static void show_node(struct zone *zone)
1227{
1228	printk("Node %d ", zone->zone_pgdat->node_id);
1229}
1230#else
1231#define show_node(zone)	do { } while (0)
1232#endif
1233
1234void si_meminfo(struct sysinfo *val)
1235{
1236	val->totalram = totalram_pages;
1237	val->sharedram = 0;
1238	val->freeram = nr_free_pages();
1239	val->bufferram = nr_blockdev_pages();
1240#ifdef CONFIG_HIGHMEM
1241	val->totalhigh = totalhigh_pages;
1242	val->freehigh = nr_free_highpages();
1243#else
1244	val->totalhigh = 0;
1245	val->freehigh = 0;
1246#endif
1247	val->mem_unit = PAGE_SIZE;
1248}
1249
1250EXPORT_SYMBOL(si_meminfo);
1251
1252#ifdef CONFIG_NUMA
1253void si_meminfo_node(struct sysinfo *val, int nid)
1254{
1255	pg_data_t *pgdat = NODE_DATA(nid);
1256
1257	val->totalram = pgdat->node_present_pages;
1258	val->freeram = nr_free_pages_pgdat(pgdat);
1259	val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
1260	val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages;
1261	val->mem_unit = PAGE_SIZE;
1262}
1263#endif
1264
1265#define K(x) ((x) << (PAGE_SHIFT-10))
1266
1267/*
1268 * Show free area list (used inside shift_scroll-lock stuff)
1269 * We also calculate the percentage fragmentation. We do this by counting the
1270 * memory on each free list with the exception of the first item on the list.
1271 */
1272void show_free_areas(void)
1273{
1274	int cpu, temperature;
1275	unsigned long active;
1276	unsigned long inactive;
1277	unsigned long free;
1278	struct zone *zone;
1279
1280	for_each_zone(zone) {
1281		show_node(zone);
1282		printk("%s per-cpu:", zone->name);
1283
1284		if (!populated_zone(zone)) {
1285			printk(" empty\n");
1286			continue;
1287		} else
1288			printk("\n");
1289
1290		for_each_online_cpu(cpu) {
1291			struct per_cpu_pageset *pageset;
1292
1293			pageset = zone_pcp(zone, cpu);
1294
1295			for (temperature = 0; temperature < 2; temperature++)
1296				printk("cpu %d %s: high %d, batch %d used:%d\n",
1297					cpu,
1298					temperature ? "cold" : "hot",
1299					pageset->pcp[temperature].high,
1300					pageset->pcp[temperature].batch,
1301					pageset->pcp[temperature].count);
1302		}
1303	}
1304
1305	get_zone_counts(&active, &inactive, &free);
1306
1307	printk("Free pages: %11ukB (%ukB HighMem)\n",
1308		K(nr_free_pages()),
1309		K(nr_free_highpages()));
1310
1311	printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu "
1312		"unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n",
1313		active,
1314		inactive,
1315		global_page_state(NR_FILE_DIRTY),
1316		global_page_state(NR_WRITEBACK),
1317		global_page_state(NR_UNSTABLE_NFS),
1318		nr_free_pages(),
1319		global_page_state(NR_SLAB),
1320		global_page_state(NR_FILE_MAPPED),
1321		global_page_state(NR_PAGETABLE));
1322
1323	for_each_zone(zone) {
1324		int i;
1325
1326		show_node(zone);
1327		printk("%s"
1328			" free:%lukB"
1329			" min:%lukB"
1330			" low:%lukB"
1331			" high:%lukB"
1332			" active:%lukB"
1333			" inactive:%lukB"
1334			" present:%lukB"
1335			" pages_scanned:%lu"
1336			" all_unreclaimable? %s"
1337			"\n",
1338			zone->name,
1339			K(zone->free_pages),
1340			K(zone->pages_min),
1341			K(zone->pages_low),
1342			K(zone->pages_high),
1343			K(zone->nr_active),
1344			K(zone->nr_inactive),
1345			K(zone->present_pages),
1346			zone->pages_scanned,
1347			(zone->all_unreclaimable ? "yes" : "no")
1348			);
1349		printk("lowmem_reserve[]:");
1350		for (i = 0; i < MAX_NR_ZONES; i++)
1351			printk(" %lu", zone->lowmem_reserve[i]);
1352		printk("\n");
1353	}
1354
1355	for_each_zone(zone) {
1356 		unsigned long nr[MAX_ORDER], flags, order, total = 0;
1357
1358		show_node(zone);
1359		printk("%s: ", zone->name);
1360		if (!populated_zone(zone)) {
1361			printk("empty\n");
1362			continue;
1363		}
1364
1365		spin_lock_irqsave(&zone->lock, flags);
1366		for (order = 0; order < MAX_ORDER; order++) {
1367			nr[order] = zone->free_area[order].nr_free;
1368			total += nr[order] << order;
1369		}
1370		spin_unlock_irqrestore(&zone->lock, flags);
1371		for (order = 0; order < MAX_ORDER; order++)
1372			printk("%lu*%lukB ", nr[order], K(1UL) << order);
1373		printk("= %lukB\n", K(total));
1374	}
1375
1376	show_swap_cache_info();
1377}
1378
1379/*
1380 * Builds allocation fallback zone lists.
1381 *
1382 * Add all populated zones of a node to the zonelist.
1383 */
1384static int __meminit build_zonelists_node(pg_data_t *pgdat,
1385			struct zonelist *zonelist, int nr_zones, int zone_type)
1386{
1387	struct zone *zone;
1388
1389	BUG_ON(zone_type > ZONE_HIGHMEM);
1390
1391	do {
1392		zone = pgdat->node_zones + zone_type;
1393		if (populated_zone(zone)) {
1394#ifndef CONFIG_HIGHMEM
1395			BUG_ON(zone_type > ZONE_NORMAL);
1396#endif
1397			zonelist->zones[nr_zones++] = zone;
1398			check_highest_zone(zone_type);
1399		}
1400		zone_type--;
1401
1402	} while (zone_type >= 0);
1403	return nr_zones;
1404}
1405
1406static inline int highest_zone(int zone_bits)
1407{
1408	int res = ZONE_NORMAL;
1409	if (zone_bits & (__force int)__GFP_HIGHMEM)
1410		res = ZONE_HIGHMEM;
1411	if (zone_bits & (__force int)__GFP_DMA32)
1412		res = ZONE_DMA32;
1413	if (zone_bits & (__force int)__GFP_DMA)
1414		res = ZONE_DMA;
1415	return res;
1416}
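/*
 * Note that the later tests win when several zone modifiers are set, so
 * __GFP_DMA takes precedence over __GFP_DMA32, which in turn takes
 * precedence over __GFP_HIGHMEM.
 */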
1417
1418#ifdef CONFIG_NUMA
1419#define MAX_NODE_LOAD (num_online_nodes())
1420static int __meminitdata node_load[MAX_NUMNODES];
1421/**
1422 * find_next_best_node - find the next node that should appear in a given node's fallback list
1423 * @node: node whose fallback list we're appending
1424 * @used_node_mask: nodemask_t of already used nodes
1425 *
1426 * We use a number of factors to determine which is the next node that should
1427 * appear on a given node's fallback list.  The node should not have appeared
1428 * already in @node's fallback list, and it should be the next closest node
1429 * according to the distance array (which contains arbitrary distance values
1430 * from each node to each node in the system), and should also prefer nodes
1431 * with no CPUs, since presumably they'll have very little allocation pressure
1432 * on them otherwise.
1433 * It returns -1 if no node is found.
1434 */
1435static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask)
1436{
1437	int n, val;
1438	int min_val = INT_MAX;
1439	int best_node = -1;
1440
1441	/* Use the local node if we haven't already */
1442	if (!node_isset(node, *used_node_mask)) {
1443		node_set(node, *used_node_mask);
1444		return node;
1445	}
1446
1447	for_each_online_node(n) {
1448		cpumask_t tmp;
1449
1450		/* Don't want a node to appear more than once */
1451		if (node_isset(n, *used_node_mask))
1452			continue;
1453
1454		/* Use the distance array to find the distance */
1455		val = node_distance(node, n);
1456
1457		/* Penalize nodes under us ("prefer the next node") */
1458		val += (n < node);
1459
1460		/* Give preference to headless and unused nodes */
1461		tmp = node_to_cpumask(n);
1462		if (!cpus_empty(tmp))
1463			val += PENALTY_FOR_NODE_WITH_CPUS;
1464
1465		/* Slight preference for less loaded node */
1466		val *= (MAX_NODE_LOAD*MAX_NUMNODES);
1467		val += node_load[n];
1468
1469		if (val < min_val) {
1470			min_val = val;
1471			best_node = n;
1472		}
1473	}
1474
1475	if (best_node >= 0)
1476		node_set(best_node, *used_node_mask);
1477
1478	return best_node;
1479}
1480
1481static void __meminit build_zonelists(pg_data_t *pgdat)
1482{
1483	int i, j, k, node, local_node;
1484	int prev_node, load;
1485	struct zonelist *zonelist;
1486	nodemask_t used_mask;
1487
1488	/* initialize zonelists */
1489	for (i = 0; i < GFP_ZONETYPES; i++) {
1490		zonelist = pgdat->node_zonelists + i;
1491		zonelist->zones[0] = NULL;
1492	}
1493
1494	/* NUMA-aware ordering of nodes */
1495	local_node = pgdat->node_id;
1496	load = num_online_nodes();
1497	prev_node = local_node;
1498	nodes_clear(used_mask);
1499	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
1500		int distance = node_distance(local_node, node);
1501
1502		/*
1503		 * If another node is sufficiently far away then it is better
1504		 * to reclaim pages in a zone before going off node.
1505		 */
1506		if (distance > RECLAIM_DISTANCE)
1507			zone_reclaim_mode = 1;
1508
1509		/*
1510		 * We don't want to pressure a particular node, so add a
1511		 * penalty to the first node in the same distance group
1512		 * to make the ordering round-robin.
1513		 */
1514
1515		if (distance != node_distance(local_node, prev_node))
1516			node_load[node] += load;
1517		prev_node = node;
1518		load--;
1519		for (i = 0; i < GFP_ZONETYPES; i++) {
1520			zonelist = pgdat->node_zonelists + i;
1521			for (j = 0; zonelist->zones[j] != NULL; j++);
1522
1523			k = highest_zone(i);
1524
1525	 		j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
1526			zonelist->zones[j] = NULL;
1527		}
1528	}
1529}
1530
1531#else	/* CONFIG_NUMA */
1532
1533static void __meminit build_zonelists(pg_data_t *pgdat)
1534{
1535	int i, j, k, node, local_node;
1536
1537	local_node = pgdat->node_id;
1538	for (i = 0; i < GFP_ZONETYPES; i++) {
1539		struct zonelist *zonelist;
1540
1541		zonelist = pgdat->node_zonelists + i;
1542
1543		j = 0;
1544		k = highest_zone(i);
1545 		j = build_zonelists_node(pgdat, zonelist, j, k);
1546 		/*
1547 		 * Now we build the zonelist so that it contains the zones
1548 		 * of all the other nodes.
1549 		 * We don't want to pressure a particular node, so when
1550 		 * building the zones for node N, we make sure that the
1551 		 * zones coming right after the local ones are those from
1552 		 * node N+1 (modulo the number of nodes)
1553 		 */
1554		for (node = local_node + 1; node < MAX_NUMNODES; node++) {
1555			if (!node_online(node))
1556				continue;
1557			j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
1558		}
1559		for (node = 0; node < local_node; node++) {
1560			if (!node_online(node))
1561				continue;
1562			j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
1563		}
1564
1565		zonelist->zones[j] = NULL;
1566	}
1567}
1568
1569#endif	/* CONFIG_NUMA */
1570
1571/* the return value is an int just to satisfy stop_machine_run() */
1572static int __meminit __build_all_zonelists(void *dummy)
1573{
1574	int nid;
1575	for_each_online_node(nid)
1576		build_zonelists(NODE_DATA(nid));
1577	return 0;
1578}
1579
1580void __meminit build_all_zonelists(void)
1581{
1582	if (system_state == SYSTEM_BOOTING) {
1583		__build_all_zonelists(0);
1584		cpuset_init_current_mems_allowed();
1585	} else {
1586		/* we have to stop all cpus to guarantee there is no user
1587		   of the zonelists */
1588		stop_machine_run(__build_all_zonelists, NULL, NR_CPUS);
1589		/* cpuset refresh routine should be here */
1590	}
1591	vm_total_pages = nr_free_pagecache_pages();
1592	printk("Built %i zonelists.  Total pages: %ld\n",
1593			num_online_nodes(), vm_total_pages);
1594}
1595
1596/*
1597 * Helper functions to size the waitqueue hash table.
1598 * Essentially these want to choose hash table sizes sufficiently
1599 * large so that collisions trying to wait on pages are rare.
1600 * But in fact, the number of active page waitqueues on typical
1601 * systems is ridiculously low, less than 200. So this is even
1602 * conservative, even though it seems large.
1603 *
1604 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
1605 * waitqueues, i.e. the size of the waitq table given the number of pages.
1606 */
1607#define PAGES_PER_WAITQUEUE	256
1608
1609#ifndef CONFIG_MEMORY_HOTPLUG
1610static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
1611{
1612	unsigned long size = 1;
1613
1614	pages /= PAGES_PER_WAITQUEUE;
1615
1616	while (size < pages)
1617		size <<= 1;
1618
1619	/*
1620	 * Once we have dozens or even hundreds of threads sleeping
1621	 * on IO we've got bigger problems than wait queue collision.
1622	 * Limit the size of the wait table to a reasonable size.
1623	 */
1624	size = min(size, 4096UL);
1625
1626	return max(size, 4UL);
1627}
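/*
 * Illustrative sizing: a 1GB zone of 4KB pages spans 262144 pages;
 * 262144 / PAGES_PER_WAITQUEUE = 1024, already a power of two and inside
 * the [4, 4096] clamp, so the zone gets 1024 wait queues.
 */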
1628#else
1629/*
1630 * A zone's size might be changed by hot-add, so it is not possible to determine
1631 * a suitable size for its wait_table.  So we use the maximum size now.
1632 *
1633 * The max wait table size = 4096 x sizeof(wait_queue_head_t).   ie:
1634 *
1635 *    i386 (preemption config)    : 4096 x 16 = 64Kbyte.
1636 *    ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
1637 *    ia64, x86-64 (preemption)   : 4096 x 24 = 96Kbyte.
1638 *
1639 * The traditional sizing above reaches this maximum once a zone spans
1640 * (512K + 256) pages or more, which corresponds to:
1641 *
1642 *    i386, x86-64, powerpc(4K page size) : =  ( 2G + 1M)byte.
1643 *    ia64(16K page size)                 : =  ( 8G + 4M)byte.
1644 *    powerpc (64K page size)             : =  (32G +16M)byte.
1645 */
1646static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
1647{
1648	return 4096UL;
1649}
1650#endif
1651
1652/*
1653 * This is an integer logarithm so that shifts can be used later
1654 * to extract the more random high bits from the multiplicative
1655 * hash function before the remainder is taken.
1656 */
1657static inline unsigned long wait_table_bits(unsigned long size)
1658{
1659	return ffz(~size);
1660}
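/*
 * For example, a 1024-entry table gives wait_table_bits() == 10, since
 * ffz(~1024) finds the lowest clear bit of ~1024, which is bit 10.
 */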
1661
1662#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
1663
1664static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
1665		unsigned long *zones_size, unsigned long *zholes_size)
1666{
1667	unsigned long realtotalpages, totalpages = 0;
1668	int i;
1669
1670	for (i = 0; i < MAX_NR_ZONES; i++)
1671		totalpages += zones_size[i];
1672	pgdat->node_spanned_pages = totalpages;
1673
1674	realtotalpages = totalpages;
1675	if (zholes_size)
1676		for (i = 0; i < MAX_NR_ZONES; i++)
1677			realtotalpages -= zholes_size[i];
1678	pgdat->node_present_pages = realtotalpages;
1679	printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
1680}
1681
1682
1683/*
1684 * Initially all pages are reserved - free ones are freed
1685 * up by free_all_bootmem() once the early boot process is
1686 * done. Non-atomic initialization, single-pass.
1687 */
1688void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
1689		unsigned long start_pfn)
1690{
1691	struct page *page;
1692	unsigned long end_pfn = start_pfn + size;
1693	unsigned long pfn;
1694
1695	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
1696		if (!early_pfn_valid(pfn))
1697			continue;
1698		page = pfn_to_page(pfn);
1699		set_page_links(page, zone, nid, pfn);
1700		init_page_count(page);
1701		reset_page_mapcount(page);
1702		SetPageReserved(page);
1703		INIT_LIST_HEAD(&page->lru);
1704#ifdef WANT_PAGE_VIRTUAL
1705		/* The shift won't overflow because ZONE_NORMAL is below 4G. */
1706		if (!is_highmem_idx(zone))
1707			set_page_address(page, __va(pfn << PAGE_SHIFT));
1708#endif
1709	}
1710}
1711
1712void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
1713				unsigned long size)
1714{
1715	int order;
1716	for (order = 0; order < MAX_ORDER ; order++) {
1717		INIT_LIST_HEAD(&zone->free_area[order].free_list);
1718		zone->free_area[order].nr_free = 0;
1719	}
1720}
1721
1722#define ZONETABLE_INDEX(x, zone_nr)	((x << ZONES_SHIFT) | zone_nr)
1723void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
1724		unsigned long size)
1725{
1726	unsigned long snum = pfn_to_section_nr(pfn);
1727	unsigned long end = pfn_to_section_nr(pfn + size);
1728
1729	if (FLAGS_HAS_NODE)
1730		zone_table[ZONETABLE_INDEX(nid, zid)] = zone;
1731	else
1732		for (; snum <= end; snum++)
1733			zone_table[ZONETABLE_INDEX(snum, zid)] = zone;
1734}
1735
1736#ifndef __HAVE_ARCH_MEMMAP_INIT
1737#define memmap_init(size, nid, zone, start_pfn) \
1738	memmap_init_zone((size), (nid), (zone), (start_pfn))
1739#endif
1740
1741static int __cpuinit zone_batchsize(struct zone *zone)
1742{
1743	int batch;
1744
1745	/*
1746	 * The per-cpu-pages pools are set to around 1/1000th of the
1747	 * size of the zone, but no more than half a megabyte.
1748	 *
1749	 * OK, so we don't know how big the cache is.  So guess.
1750	 */
1751	batch = zone->present_pages / 1024;
1752	if (batch * PAGE_SIZE > 512 * 1024)
1753		batch = (512 * 1024) / PAGE_SIZE;
1754	batch /= 4;		/* We effectively *= 4 below */
1755	if (batch < 1)
1756		batch = 1;
1757
1758	/*
1759	 * Clamp the batch to a 2^n - 1 value. Having a power
1760	 * of 2 value was found to be more likely to have
1761	 * suboptimal cache aliasing properties in some cases.
1762	 *
1763	 * For example if 2 tasks are alternately allocating
1764	 * batches of pages, one task can end up with a lot
1765	 * of pages of one half of the possible page colors
1766	 * and the other with pages of the other colors.
1767	 */
1768	batch = (1 << (fls(batch + batch/2)-1)) - 1;
1769
1770	return batch;
1771}
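/*
 * Worked example (illustrative): a zone of 262144 4KB pages (1GB) gives
 * 262144/1024 = 256, capped to 512KB/4KB = 128, divided by 4 to 32, and
 * the final clamp to a 2^n - 1 value then yields a batch of 31.
 */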
1772
1773inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
1774{
1775	struct per_cpu_pages *pcp;
1776
1777	memset(p, 0, sizeof(*p));
1778
1779	pcp = &p->pcp[0];		/* hot */
1780	pcp->count = 0;
1781	pcp->high = 6 * batch;
1782	pcp->batch = max(1UL, 1 * batch);
1783	INIT_LIST_HEAD(&pcp->list);
1784
1785	pcp = &p->pcp[1];		/* cold*/
1786	pcp->count = 0;
1787	pcp->high = 2 * batch;
1788	pcp->batch = max(1UL, batch/2);
1789	INIT_LIST_HEAD(&pcp->list);
1790}
1791
1792/*
1793 * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
1794 * to the value high for the pageset p.
1795 */
1796
1797static void setup_pagelist_highmark(struct per_cpu_pageset *p,
1798				unsigned long high)
1799{
1800	struct per_cpu_pages *pcp;
1801
1802	pcp = &p->pcp[0]; /* hot list */
1803	pcp->high = high;
1804	pcp->batch = max(1UL, high/4);
1805	if ((high/4) > (PAGE_SHIFT * 8))
1806		pcp->batch = PAGE_SHIFT * 8;
1807}
1808
1809
1810#ifdef CONFIG_NUMA
1811/*
1812 * Boot pageset table. One per cpu which is going to be used for all
1813 * zones and all nodes. The parameters will be set in such a way
1814 * that an item put on a list will immediately be handed over to
1815 * the buddy list. This is safe since pageset manipulation is done
1816 * with interrupts disabled.
1817 *
1818 * Some NUMA counter updates may also be caught by the boot pagesets.
1819 *
1820 * The boot_pagesets must be kept even after bootup is complete for
1821 * unused processors and/or zones. They do play a role for bootstrapping
1822 * hotplugged processors.
1823 *
1824 * zoneinfo_show() and maybe other functions do
1825 * not check if the processor is online before following the pageset pointer.
1826 * Other parts of the kernel may not check if the zone is available.
1827 */
1828static struct per_cpu_pageset boot_pageset[NR_CPUS];
1829
1830/*
1831 * Dynamically allocate memory for the
1832 * per cpu pageset array in struct zone.
1833 */
1834static int __cpuinit process_zones(int cpu)
1835{
1836	struct zone *zone, *dzone;
1837
1838	for_each_zone(zone) {
1839
1840		zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
1841					 GFP_KERNEL, cpu_to_node(cpu));
1842		if (!zone_pcp(zone, cpu))
1843			goto bad;
1844
1845		setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
1846
1847		if (percpu_pagelist_fraction)
1848			setup_pagelist_highmark(zone_pcp(zone, cpu),
1849			 	(zone->present_pages / percpu_pagelist_fraction));
1850	}
1851
1852	return 0;
1853bad:
1854	for_each_zone(dzone) {
1855		if (dzone == zone)
1856			break;
1857		kfree(zone_pcp(dzone, cpu));
1858		zone_pcp(dzone, cpu) = NULL;
1859	}
1860	return -ENOMEM;
1861}
1862
1863static inline void free_zone_pagesets(int cpu)
1864{
1865	struct zone *zone;
1866
1867	for_each_zone(zone) {
1868		struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
1869
1870		zone_pcp(zone, cpu) = NULL;
1871		kfree(pset);
1872	}
1873}
1874
1875static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
1876		unsigned long action,
1877		void *hcpu)
1878{
1879	int cpu = (long)hcpu;
1880	int ret = NOTIFY_OK;
1881
1882	switch (action) {
1883		case CPU_UP_PREPARE:
1884			if (process_zones(cpu))
1885				ret = NOTIFY_BAD;
1886			break;
1887		case CPU_UP_CANCELED:
1888		case CPU_DEAD:
1889			free_zone_pagesets(cpu);
1890			break;
1891		default:
1892			break;
1893	}
1894	return ret;
1895}
1896
1897static struct notifier_block __cpuinitdata pageset_notifier =
1898	{ &pageset_cpuup_callback, NULL, 0 };
1899
1900void __init setup_per_cpu_pageset(void)
1901{
1902	int err;
1903
1904	/* Initialize per_cpu_pageset for cpu 0.
1905	 * A cpuup callback will do this for every cpu
1906	 * as it comes online
1907	 */
1908	err = process_zones(smp_processor_id());
1909	BUG_ON(err);
1910	register_cpu_notifier(&pageset_notifier);
1911}
1912
1913#endif
1914
1915static __meminit
1916int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
1917{
1918	int i;
1919	struct pglist_data *pgdat = zone->zone_pgdat;
1920	size_t alloc_size;
1921
1922	/*
1923	 * The per-page waitqueue mechanism uses hashed waitqueues
1924	 * per zone.
1925	 */
1926	zone->wait_table_hash_nr_entries =
1927		 wait_table_hash_nr_entries(zone_size_pages);
1928	zone->wait_table_bits =
1929		wait_table_bits(zone->wait_table_hash_nr_entries);
1930	alloc_size = zone->wait_table_hash_nr_entries
1931					* sizeof(wait_queue_head_t);
1932
1933 	if (system_state == SYSTEM_BOOTING) {
1934		zone->wait_table = (wait_queue_head_t *)
1935			alloc_bootmem_node(pgdat, alloc_size);
1936	} else {
1937		/*
1938		 * This case means that a zone whose size was 0 gets new memory
1939		 * via memory hot-add.
1940		 * But it may also be that a whole new node was hot-added.  In
1941		 * that case vmalloc() cannot yet allocate from the new node's
1942		 * memory, even though ideally this wait_table would live on the
1943		 * new node itself.
1944		 * Making it use the new node's memory will need further
1945		 * consideration.
1946		 */
1947		zone->wait_table = (wait_queue_head_t *)vmalloc(alloc_size);
1948	}
1949	if (!zone->wait_table)
1950		return -ENOMEM;
1951
1952	for (i = 0; i < zone->wait_table_hash_nr_entries; ++i)
1953		init_waitqueue_head(zone->wait_table + i);
1954
1955	return 0;
1956}
1957
1958static __meminit void zone_pcp_init(struct zone *zone)
1959{
1960	int cpu;
1961	unsigned long batch = zone_batchsize(zone);
1962
1963	for (cpu = 0; cpu < NR_CPUS; cpu++) {
1964#ifdef CONFIG_NUMA
1965		/* Early boot. Slab allocator not functional yet */
1966		zone_pcp(zone, cpu) = &boot_pageset[cpu];
1967		setup_pageset(&boot_pageset[cpu], 0);
1968#else
1969		setup_pageset(zone_pcp(zone, cpu), batch);
1970#endif
1971	}
1972	if (zone->present_pages)
1973		printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
1974			zone->name, zone->present_pages, batch);
1975}
1976
1977__meminit int init_currently_empty_zone(struct zone *zone,
1978					unsigned long zone_start_pfn,
1979					unsigned long size)
1980{
1981	struct pglist_data *pgdat = zone->zone_pgdat;
1982	int ret;
1983	ret = zone_wait_table_init(zone, size);
1984	if (ret)
1985		return ret;
1986	pgdat->nr_zones = zone_idx(zone) + 1;
1987
1988	zone->zone_start_pfn = zone_start_pfn;
1989
1990	memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
1991
1992	zone_init_free_lists(pgdat, zone, zone->spanned_pages);
1993
1994	return 0;
1995}
1996
1997/*
1998 * Set up the zone data structures:
1999 *   - mark all pages reserved
2000 *   - mark all memory queues empty
2001 *   - clear the memory bitmaps
2002 */
2003static void __meminit free_area_init_core(struct pglist_data *pgdat,
2004		unsigned long *zones_size, unsigned long *zholes_size)
2005{
2006	unsigned long j;
2007	int nid = pgdat->node_id;
2008	unsigned long zone_start_pfn = pgdat->node_start_pfn;
2009	int ret;
2010
2011	pgdat_resize_init(pgdat);
2012	pgdat->nr_zones = 0;
2013	init_waitqueue_head(&pgdat->kswapd_wait);
2014	pgdat->kswapd_max_order = 0;
2015
2016	for (j = 0; j < MAX_NR_ZONES; j++) {
2017		struct zone *zone = pgdat->node_zones + j;
2018		unsigned long size, realsize;
2019
2020		realsize = size = zones_size[j];
2021		if (zholes_size)
2022			realsize -= zholes_size[j];
2023
2024		if (j < ZONE_HIGHMEM)
2025			nr_kernel_pages += realsize;
2026		nr_all_pages += realsize;
2027
2028		zone->spanned_pages = size;
2029		zone->present_pages = realsize;
2030		zone->name = zone_names[j];
2031		spin_lock_init(&zone->lock);
2032		spin_lock_init(&zone->lru_lock);
2033		zone_seqlock_init(zone);
2034		zone->zone_pgdat = pgdat;
2035		zone->free_pages = 0;
2036
2037		zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
2038
2039		zone_pcp_init(zone);
2040		INIT_LIST_HEAD(&zone->active_list);
2041		INIT_LIST_HEAD(&zone->inactive_list);
2042		zone->nr_scan_active = 0;
2043		zone->nr_scan_inactive = 0;
2044		zone->nr_active = 0;
2045		zone->nr_inactive = 0;
2046		zap_zone_vm_stats(zone);
2047		atomic_set(&zone->reclaim_in_progress, 0);
2048		if (!size)
2049			continue;
2050
2051		zonetable_add(zone, nid, j, zone_start_pfn, size);
2052		ret = init_currently_empty_zone(zone, zone_start_pfn, size);
2053		BUG_ON(ret);
2054		zone_start_pfn += size;
2055	}
2056}
2057
2058static void __init alloc_node_mem_map(struct pglist_data *pgdat)
2059{
2060	/* Skip empty nodes */
2061	if (!pgdat->node_spanned_pages)
2062		return;
2063
2064#ifdef CONFIG_FLAT_NODE_MEM_MAP
2065	/* ia64 gets its own node_mem_map, before this, without bootmem */
2066	if (!pgdat->node_mem_map) {
2067		unsigned long size, start, end;
2068		struct page *map;
2069
2070		/*
2071		 * The zone's endpoints aren't required to be MAX_ORDER
2072		 * aligned, but the node_mem_map endpoints must be, in order
2073		 * for the buddy allocator to function correctly.
2074		 */
2075		start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
2076		end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
2077		end = ALIGN(end, MAX_ORDER_NR_PAGES);
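		/*
		 * Example (assuming MAX_ORDER_NR_PAGES == 1024): a node
		 * spanning pfns 0x10300-0x20100 is widened to 0x10000-0x20400
		 * so every buddy of a page in the node has a mem_map entry.
		 */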
2078		size =  (end - start) * sizeof(struct page);
2079		map = alloc_remap(pgdat->node_id, size);
2080		if (!map)
2081			map = alloc_bootmem_node(pgdat, size);
2082		pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
2083	}
2084#ifdef CONFIG_FLATMEM
2085	/*
2086	 * With no DISCONTIG, the global mem_map is just set as node 0's
2087	 */
2088	if (pgdat == NODE_DATA(0))
2089		mem_map = NODE_DATA(0)->node_mem_map;
2090#endif
2091#endif /* CONFIG_FLAT_NODE_MEM_MAP */
2092}
2093
2094void __meminit free_area_init_node(int nid, struct pglist_data *pgdat,
2095		unsigned long *zones_size, unsigned long node_start_pfn,
2096		unsigned long *zholes_size)
2097{
2098	pgdat->node_id = nid;
2099	pgdat->node_start_pfn = node_start_pfn;
2100	calculate_zone_totalpages(pgdat, zones_size, zholes_size);
2101
2102	alloc_node_mem_map(pgdat);
2103
2104	free_area_init_core(pgdat, zones_size, zholes_size);
2105}
2106
2107#ifndef CONFIG_NEED_MULTIPLE_NODES
2108static bootmem_data_t contig_bootmem_data;
2109struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
2110
2111EXPORT_SYMBOL(contig_page_data);
2112#endif
2113
2114void __init free_area_init(unsigned long *zones_size)
2115{
2116	free_area_init_node(0, NODE_DATA(0), zones_size,
2117			__pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
2118}
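
#if 0
/*
 * Illustrative sketch only, not part of this file: roughly how a simple
 * UMA architecture might hand its zone sizes to free_area_init().  The
 * example_* pfn variables are hypothetical; real code derives them from
 * the firmware memory map.
 */
void __init example_paging_init(void)
{
	unsigned long zones_size[MAX_NR_ZONES] = { 0, };

	/* pages below the ISA DMA limit */
	zones_size[ZONE_DMA] = example_dma_pfn;
	/* directly mapped lowmem */
	zones_size[ZONE_NORMAL] = example_low_pfn - example_dma_pfn;
#ifdef CONFIG_HIGHMEM
	/* everything above the lowmem limit */
	zones_size[ZONE_HIGHMEM] = example_end_pfn - example_low_pfn;
#endif
	free_area_init(zones_size);
}
#endif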
2119
2120#ifdef CONFIG_HOTPLUG_CPU
2121static int page_alloc_cpu_notify(struct notifier_block *self,
2122				 unsigned long action, void *hcpu)
2123{
2124	int cpu = (unsigned long)hcpu;
2125	unsigned long *src, *dest;
2126
2127	if (action == CPU_DEAD) {
2128		int i;
2129
2130		local_irq_disable();
2131		__drain_pages(cpu);
2132
2133		/* Add dead cpu's page_states to our own. */
2134		dest = (unsigned long *)&__get_cpu_var(page_states);
2135		src = (unsigned long *)&per_cpu(page_states, cpu);
2136
2137		for (i = 0; i < sizeof(struct page_state)/sizeof(unsigned long);
2138				i++) {
2139			dest[i] += src[i];
2140			src[i] = 0;
2141		}
2142
2143		local_irq_enable();
2144		refresh_cpu_vm_stats(cpu);
2145	}
2146	return NOTIFY_OK;
2147}
2148#endif /* CONFIG_HOTPLUG_CPU */
2149
2150void __init page_alloc_init(void)
2151{
2152	hotcpu_notifier(page_alloc_cpu_notify, 0);
2153}
2154
2155/*
2156 * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
2157 *	or min_free_kbytes changes.
2158 */
2159static void calculate_totalreserve_pages(void)
2160{
2161	struct pglist_data *pgdat;
2162	unsigned long reserve_pages = 0;
2163	int i, j;
2164
2165	for_each_online_pgdat(pgdat) {
2166		for (i = 0; i < MAX_NR_ZONES; i++) {
2167			struct zone *zone = pgdat->node_zones + i;
2168			unsigned long max = 0;
2169
2170			/* Find valid and maximum lowmem_reserve in the zone */
2171			for (j = i; j < MAX_NR_ZONES; j++) {
2172				if (zone->lowmem_reserve[j] > max)
2173					max = zone->lowmem_reserve[j];
2174			}
2175
2176			/* we treat pages_high as reserved pages. */
2177			max += zone->pages_high;
2178
2179			if (max > zone->present_pages)
2180				max = zone->present_pages;
2181			reserve_pages += max;
2182		}
2183	}
2184	totalreserve_pages = reserve_pages;
2185}
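
/*
 * Example: a zone with present_pages = 100000, pages_high = 300 and a
 * largest lowmem_reserve[] entry of 500 contributes min(500 + 300, 100000)
 * = 800 pages to totalreserve_pages.
 */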
2186
2187/*
2188 * setup_per_zone_lowmem_reserve - called whenever
2189 *	sysctl_lowmem_reserve_ratio changes.  Ensures that each zone
2190 *	has a correct lowmem_reserve value, so an adequate number of
2191 *	pages are left in the zone after a successful __alloc_pages().
2192 */
2193static void setup_per_zone_lowmem_reserve(void)
2194{
2195	struct pglist_data *pgdat;
2196	int j, idx;
2197
2198	for_each_online_pgdat(pgdat) {
2199		for (j = 0; j < MAX_NR_ZONES; j++) {
2200			struct zone *zone = pgdat->node_zones + j;
2201			unsigned long present_pages = zone->present_pages;
2202
2203			zone->lowmem_reserve[j] = 0;
2204
2205			for (idx = j-1; idx >= 0; idx--) {
2206				struct zone *lower_zone;
2207
2208				if (sysctl_lowmem_reserve_ratio[idx] < 1)
2209					sysctl_lowmem_reserve_ratio[idx] = 1;
2210
2211				lower_zone = pgdat->node_zones + idx;
2212				lower_zone->lowmem_reserve[j] = present_pages /
2213					sysctl_lowmem_reserve_ratio[idx];
2214				present_pages += lower_zone->present_pages;
2215			}
2216		}
2217	}
2218
2219	/* update totalreserve_pages */
2220	calculate_totalreserve_pages();
2221}
2222
2223/*
2224 * setup_per_zone_pages_min - called when min_free_kbytes changes.  Ensures
2225 *	that the pages_{min,low,high} values for each zone are set correctly
2226 *	with respect to min_free_kbytes.
2227 */
2228void setup_per_zone_pages_min(void)
2229{
2230	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
2231	unsigned long lowmem_pages = 0;
2232	struct zone *zone;
2233	unsigned long flags;
2234
2235	/* Calculate total number of !ZONE_HIGHMEM pages */
2236	for_each_zone(zone) {
2237		if (!is_highmem(zone))
2238			lowmem_pages += zone->present_pages;
2239	}
2240
2241	for_each_zone(zone) {
2242		u64 tmp;
2243
2244		spin_lock_irqsave(&zone->lru_lock, flags);
2245		tmp = (u64)pages_min * zone->present_pages;
2246		do_div(tmp, lowmem_pages);
2247		if (is_highmem(zone)) {
2248			/*
2249			 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
2250			 * need highmem pages, so cap pages_min to a small
2251			 * value here.
2252			 *
2253			 * The (pages_high-pages_low) and (pages_low-pages_min)
2254			 * deltas control async page reclaim, and so should
2255			 * not be capped for highmem.
2256			 */
2257			int min_pages;
2258
2259			min_pages = zone->present_pages / 1024;
2260			if (min_pages < SWAP_CLUSTER_MAX)
2261				min_pages = SWAP_CLUSTER_MAX;
2262			if (min_pages > 128)
2263				min_pages = 128;
2264			zone->pages_min = min_pages;
2265		} else {
2266			/*
2267			 * If it's a lowmem zone, reserve a number of pages
2268			 * proportionate to the zone's size.
2269			 */
2270			zone->pages_min = tmp;
2271		}
2272
2273		zone->pages_low   = zone->pages_min + (tmp >> 2);
2274		zone->pages_high  = zone->pages_min + (tmp >> 1);
2275		spin_unlock_irqrestore(&zone->lru_lock, flags);
2276	}
2277
2278	/* update totalreserve_pages */
2279	calculate_totalreserve_pages();
2280}
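
/*
 * Example: with min_free_kbytes = 1024 and 4K pages, pages_min totals
 * 1024 >> 2 = 256 pages spread over the lowmem zones.  A lowmem zone
 * holding half of all lowmem gets tmp = 128, hence pages_min = 128,
 * pages_low = 128 + 32 = 160 and pages_high = 128 + 64 = 192.
 */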
2281
2282/*
2283 * Initialise min_free_kbytes.
2284 *
2285 * For small machines we want it small (128k min).  For large machines
2286 * we want it large (64MB max).  But it is not linear, because network
2287 * bandwidth does not increase linearly with machine size.  We use
2288 *
2289 * 	min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
2290 *	min_free_kbytes = sqrt(lowmem_kbytes * 16)
2291 *
2292 * which yields
2293 *
2294 * 16MB:	512k
2295 * 32MB:	724k
2296 * 64MB:	1024k
2297 * 128MB:	1448k
2298 * 256MB:	2048k
2299 * 512MB:	2896k
2300 * 1024MB:	4096k
2301 * 2048MB:	5792k
2302 * 4096MB:	8192k
2303 * 8192MB:	11584k
2304 * 16384MB:	16384k
2305 */
2306static int __init init_per_zone_pages_min(void)
2307{
2308	unsigned long lowmem_kbytes;
2309
2310	lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
2311
2312	min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
2313	if (min_free_kbytes < 128)
2314		min_free_kbytes = 128;
2315	if (min_free_kbytes > 65536)
2316		min_free_kbytes = 65536;
2317	setup_per_zone_pages_min();
2318	setup_per_zone_lowmem_reserve();
2319	return 0;
2320}
2321module_init(init_per_zone_pages_min)
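
/*
 * Example: 1GB of lowmem gives lowmem_kbytes = 1048576, so
 * int_sqrt(1048576 * 16) = int_sqrt(16777216) = 4096, matching the
 * 1024MB row in the table above.
 */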
2322
2323/*
2324 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
2325 *	that we can call two helper functions whenever min_free_kbytes
2326 *	that we can call setup_per_zone_pages_min() whenever min_free_kbytes
2327 *	changes.
2328int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
2329	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
2330{
2331	proc_dointvec(table, write, file, buffer, length, ppos);
2332	setup_per_zone_pages_min();
2333	return 0;
2334}
2335
2336/*
2337 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
2338 *	proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
2339 *	whenever sysctl_lowmem_reserve_ratio changes.
2340 *
2341 * The reserve ratio obviously has absolutely no relation with the
2342 * pages_min watermarks. The lowmem reserve ratio can only make sense
2343 * if in function of the boot time zone sizes.
2344 */
2345int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
2346	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
2347{
2348	proc_dointvec_minmax(table, write, file, buffer, length, ppos);
2349	setup_per_zone_lowmem_reserve();
2350	return 0;
2351}
2352
2353/*
2354 * percpu_pagelist_fraction - changes pcp->high for each zone on each
2355 * cpu.  It is the fraction of a zone's total pages that a hot per-cpu
2356 * pagelist may hold before pages are flushed back to the buddy allocator.
2357 */
2358
2359int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
2360	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
2361{
2362	struct zone *zone;
2363	unsigned int cpu;
2364	int ret;
2365
2366	ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
2367	if (!write || (ret == -EINVAL))
2368		return ret;
2369	for_each_zone(zone) {
2370		for_each_online_cpu(cpu) {
2371			unsigned long  high;
2372			high = zone->present_pages / percpu_pagelist_fraction;
2373			setup_pagelist_highmark(zone_pcp(zone, cpu), high);
2374		}
2375	}
2376	return 0;
2377}
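
/*
 * Example: writing 8 to this sysctl on a zone of 262144 pages (1GB of 4K
 * pages) sets each online cpu's hot pagelist high watermark for that zone
 * to 262144 / 8 = 32768 pages.
 */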
2378
2379__initdata int hashdist = HASHDIST_DEFAULT;
2380
2381#ifdef CONFIG_NUMA
2382static int __init set_hashdist(char *str)
2383{
2384	if (!str)
2385		return 0;
2386	hashdist = simple_strtoul(str, &str, 0);
2387	return 1;
2388}
2389__setup("hashdist=", set_hashdist);
2390#endif
2391
2392/*
2393 * allocate a large system hash table from bootmem (or, after early boot,
2394 *   from vmalloc or the page allocator)
2395 * - the returned table contains an exact power-of-2 quantity of entries
2396 * - limit is the number of hash buckets, not the total allocation size
2397 */
2398void *__init alloc_large_system_hash(const char *tablename,
2399				     unsigned long bucketsize,
2400				     unsigned long numentries,
2401				     int scale,
2402				     int flags,
2403				     unsigned int *_hash_shift,
2404				     unsigned int *_hash_mask,
2405				     unsigned long limit)
2406{
2407	unsigned long long max = limit;
2408	unsigned long log2qty, size;
2409	void *table = NULL;
2410
2411	/* allow the kernel cmdline to have a say */
2412	if (!numentries) {
2413		/* round applicable memory size up to nearest megabyte */
2414		numentries = (flags & HASH_HIGHMEM) ? nr_all_pages : nr_kernel_pages;
2415		numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
2416		numentries >>= 20 - PAGE_SHIFT;
2417		numentries <<= 20 - PAGE_SHIFT;
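		/*
		 * i.e. numentries is rounded up to a multiple of
		 * 2^(20 - PAGE_SHIFT) pages (256 pages with 4K pages).
		 */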
2418
2419		/* limit to 1 bucket per 2^scale bytes of low memory */
2420		if (scale > PAGE_SHIFT)
2421			numentries >>= (scale - PAGE_SHIFT);
2422		else
2423			numentries <<= (PAGE_SHIFT - scale);
2424	}
2425	numentries = roundup_pow_of_two(numentries);
2426
2427	/* limit allocation size to 1/16 total memory by default */
2428	if (max == 0) {
2429		max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
2430		do_div(max, bucketsize);
2431	}
2432
2433	if (numentries > max)
2434		numentries = max;
2435
2436	log2qty = long_log2(numentries);
2437
2438	do {
2439		size = bucketsize << log2qty;
2440		if (flags & HASH_EARLY)
2441			table = alloc_bootmem(size);
2442		else if (hashdist)
2443			table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
2444		else {
2445			unsigned long order;
2446			for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++)
2447				;
2448			table = (void*) __get_free_pages(GFP_ATOMIC, order);
2449		}
2450	} while (!table && size > PAGE_SIZE && --log2qty);
2451
2452	if (!table)
2453		panic("Failed to allocate %s hash table\n", tablename);
2454
2455	printk("%s hash table entries: %d (order: %d, %lu bytes)\n",
2456	       tablename,
2457	       (1U << log2qty),
2458	       long_log2(size) - PAGE_SHIFT,
2459	       size);
2460
2461	if (_hash_shift)
2462		*_hash_shift = log2qty;
2463	if (_hash_mask)
2464		*_hash_mask = (1 << log2qty) - 1;
2465
2466	return table;
2467}
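
#if 0
/*
 * Illustrative sketch only, not part of this file: how an early-boot
 * subsystem might size a hash table with alloc_large_system_hash().  The
 * example_* identifiers are hypothetical.
 */
static struct hlist_head *example_table;
static unsigned int example_shift;
static unsigned int example_mask;

void __init example_hash_init(void)
{
	example_table = alloc_large_system_hash("Example-cache",
						sizeof(struct hlist_head),
						0,		/* size from memory */
						14,		/* 1 bucket per 16KB of lowmem */
						HASH_EARLY,	/* allocate from bootmem */
						&example_shift,
						&example_mask,
						0);		/* no explicit limit */
}
#endif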
2468
2469#ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE
2470struct page *pfn_to_page(unsigned long pfn)
2471{
2472	return __pfn_to_page(pfn);
2473}
2474unsigned long page_to_pfn(struct page *page)
2475{
2476	return __page_to_pfn(page);
2477}
2478EXPORT_SYMBOL(pfn_to_page);
2479EXPORT_SYMBOL(page_to_pfn);
2480#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */
2481