page_alloc.c revision 2244b95a7bcf8d24196f8a3a44187ba5dfff754c
1/*
2 *  linux/mm/page_alloc.c
3 *
4 *  Manages the free list; the system allocates free pages here.
5 *  Note that kmalloc() lives in slab.c
6 *
7 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
8 *  Swap reorganised 29.12.95, Stephen Tweedie
9 *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
10 *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
11 *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
12 *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
13 *  Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
14 *          (lots of bits borrowed from Ingo Molnar & Andrew Morton)
15 */
16
17#include <linux/config.h>
18#include <linux/stddef.h>
19#include <linux/mm.h>
20#include <linux/swap.h>
21#include <linux/interrupt.h>
22#include <linux/pagemap.h>
23#include <linux/bootmem.h>
24#include <linux/compiler.h>
25#include <linux/kernel.h>
26#include <linux/module.h>
27#include <linux/suspend.h>
28#include <linux/pagevec.h>
29#include <linux/blkdev.h>
30#include <linux/slab.h>
31#include <linux/notifier.h>
32#include <linux/topology.h>
33#include <linux/sysctl.h>
34#include <linux/cpu.h>
35#include <linux/cpuset.h>
36#include <linux/memory_hotplug.h>
37#include <linux/nodemask.h>
38#include <linux/vmalloc.h>
39#include <linux/mempolicy.h>
40#include <linux/stop_machine.h>
41
42#include <asm/tlbflush.h>
43#include <asm/div64.h>
44#include "internal.h"
45
46/*
47 * MCD - HACK: Find somewhere to initialize this EARLY, or make this
48 * initializer cleaner
49 */
50nodemask_t node_online_map __read_mostly = { { [0] = 1UL } };
51EXPORT_SYMBOL(node_online_map);
52nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL;
53EXPORT_SYMBOL(node_possible_map);
54unsigned long totalram_pages __read_mostly;
55unsigned long totalhigh_pages __read_mostly;
56unsigned long totalreserve_pages __read_mostly;
57long nr_swap_pages;
58int percpu_pagelist_fraction;
59
60static void __free_pages_ok(struct page *page, unsigned int order);
61
62/*
63 * results with 256, 32 in the lowmem_reserve sysctl:
64 *	1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
65 *	1G machine -> (16M dma, 784M normal, 224M high)
66 *	NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
67 *	HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
68 *	HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
69 *
70 * TBD: should special case ZONE_DMA32 machines here - in those we normally
71 * don't need any ZONE_NORMAL reservation
72 */
73int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 256, 32 };
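
/*
 * For illustration, carrying out the arithmetic above with the default
 * ratios { 256, 256, 32 } on the 1G split (16M dma, 784M normal,
 * 224M high): a HIGHMEM allocation may not consume roughly the last
 * (784M+224M)/256 ~= 4M of ZONE_DMA nor the last 224M/32 = 7M of
 * ZONE_NORMAL, and a NORMAL allocation may not consume roughly the
 * last 784M/256 ~= 3M of ZONE_DMA.
 */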
74
75EXPORT_SYMBOL(totalram_pages);
76
77/*
78 * Used by page_zone() to look up the address of the struct zone whose
79 * id is encoded in the upper bits of page->flags
80 */
81struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly;
82EXPORT_SYMBOL(zone_table);
83
84static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" };
85int min_free_kbytes = 1024;
86
87unsigned long __meminitdata nr_kernel_pages;
88unsigned long __meminitdata nr_all_pages;
89
90#ifdef CONFIG_DEBUG_VM
91static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
92{
93	int ret = 0;
94	unsigned seq;
95	unsigned long pfn = page_to_pfn(page);
96
97	do {
98		seq = zone_span_seqbegin(zone);
99		if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
100			ret = 1;
101		else if (pfn < zone->zone_start_pfn)
102			ret = 1;
103	} while (zone_span_seqretry(zone, seq));
104
105	return ret;
106}
107
108static int page_is_consistent(struct zone *zone, struct page *page)
109{
110#ifdef CONFIG_HOLES_IN_ZONE
111	if (!pfn_valid(page_to_pfn(page)))
112		return 0;
113#endif
114	if (zone != page_zone(page))
115		return 0;
116
117	return 1;
118}
119/*
120 * Temporary debugging check for pages not lying within a given zone.
121 */
122static int bad_range(struct zone *zone, struct page *page)
123{
124	if (page_outside_zone_boundaries(zone, page))
125		return 1;
126	if (!page_is_consistent(zone, page))
127		return 1;
128
129	return 0;
130}
131
132#else
133static inline int bad_range(struct zone *zone, struct page *page)
134{
135	return 0;
136}
137#endif
138
139static void bad_page(struct page *page)
140{
141	printk(KERN_EMERG "Bad page state in process '%s'\n"
142		KERN_EMERG "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n"
143		KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
144		KERN_EMERG "Backtrace:\n",
145		current->comm, page, (int)(2*sizeof(unsigned long)),
146		(unsigned long)page->flags, page->mapping,
147		page_mapcount(page), page_count(page));
148	dump_stack();
149	page->flags &= ~(1 << PG_lru	|
150			1 << PG_private |
151			1 << PG_locked	|
152			1 << PG_active	|
153			1 << PG_dirty	|
154			1 << PG_reclaim |
155			1 << PG_slab    |
156			1 << PG_swapcache |
157			1 << PG_writeback |
158			1 << PG_buddy );
159	set_page_count(page, 0);
160	reset_page_mapcount(page);
161	page->mapping = NULL;
162	add_taint(TAINT_BAD_PAGE);
163}
164
165/*
166 * Higher-order pages are called "compound pages".  They are structured thusly:
167 *
168 * The first PAGE_SIZE page is called the "head page".
169 *
170 * The remaining PAGE_SIZE pages are called "tail pages".
171 *
172 * All pages have PG_compound set.  All pages have their ->private pointing at
173 * the head page (even the head page has this).
174 *
175 * The first tail page's ->lru.next holds the address of the compound page's
176 * destructor (free_compound_page()).  Its ->lru.prev holds the order of allocation.
177 * This usage means that zero-order pages may not be compound.
178 */
179
180static void free_compound_page(struct page *page)
181{
182	__free_pages_ok(page, (unsigned long)page[1].lru.prev);
183}
184
185static void prep_compound_page(struct page *page, unsigned long order)
186{
187	int i;
188	int nr_pages = 1 << order;
189
190	page[1].lru.next = (void *)free_compound_page;	/* set dtor */
191	page[1].lru.prev = (void *)order;
192	for (i = 0; i < nr_pages; i++) {
193		struct page *p = page + i;
194
195		__SetPageCompound(p);
196		set_page_private(p, (unsigned long)page);
197	}
198}
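
/*
 * For illustration, after prep_compound_page(page, 2) the four pages
 * are left in the following state (assumed example, order 2):
 *
 *   page[0..3]: PG_compound set, page_private() == (unsigned long)page
 *   page[1].lru.next == free_compound_page   (the destructor)
 *   page[1].lru.prev == (void *)2            (the order)
 *
 * destroy_compound_page() below checks for and undoes exactly this.
 */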
199
200static void destroy_compound_page(struct page *page, unsigned long order)
201{
202	int i;
203	int nr_pages = 1 << order;
204
205	if (unlikely((unsigned long)page[1].lru.prev != order))
206		bad_page(page);
207
208	for (i = 0; i < nr_pages; i++) {
209		struct page *p = page + i;
210
211		if (unlikely(!PageCompound(p) |
212				(page_private(p) != (unsigned long)page)))
213			bad_page(page);
214		__ClearPageCompound(p);
215	}
216}
217
218static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
219{
220	int i;
221
222	BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);
223	/*
224	 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
225	 * and __GFP_HIGHMEM from hard or soft interrupt context.
226	 */
227	BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
228	for (i = 0; i < (1 << order); i++)
229		clear_highpage(page + i);
230}
231
232/*
233 * function for dealing with page's order in buddy system.
234 * zone->lock is already acquired when we use these.
235 * So, we don't need atomic page->flags operations here.
236 */
237static inline unsigned long page_order(struct page *page)
238{
239	return page_private(page);
240}
241
242static inline void set_page_order(struct page *page, int order)
243{
244	set_page_private(page, order);
245	__SetPageBuddy(page);
246}
247
248static inline void rmv_page_order(struct page *page)
249{
250	__ClearPageBuddy(page);
251	set_page_private(page, 0);
252}
253
254/*
255 * Locate the struct page for both the matching buddy in our
256 * pair (buddy1) and the combined order O+1 page they form (page).
257 *
258 * 1) Any buddy B1 will have an order O twin B2 which satisfies
259 * the following equation:
260 *     B2 = B1 ^ (1 << O)
261 * For example, if the starting buddy (B1) is #8, its order-1
262 * buddy (B2) is #10:
263 *     B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
264 *
265 * 2) Any buddy B will have an order O+1 parent P which
266 * satisfies the following equation:
267 *     P = B & ~(1 << O)
268 *
269 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
270 */
271static inline struct page *
272__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
273{
274	unsigned long buddy_idx = page_idx ^ (1 << order);
275
276	return page + (buddy_idx - page_idx);
277}
278
279static inline unsigned long
280__find_combined_index(unsigned long page_idx, unsigned int order)
281{
282	return (page_idx & ~(1 << order));
283}
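
/*
 * Worked example for the two helpers above, assuming order 1: for the
 * block at page_idx 8, __page_find_buddy() yields buddy_idx =
 * 8 ^ (1 << 1) = 10, and if that buddy can be merged,
 * __find_combined_index() yields 8 & ~(1 << 1) = 8, i.e. the combined
 * order-2 block again starts at index 8.
 */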
284
285/*
286 * This function checks whether a page is free && is the buddy
287 * of another page.  We can coalesce a page and its buddy if
288 * (a) the buddy is not in a hole &&
289 * (b) the buddy is in the buddy system &&
290 * (c) a page and its buddy have the same order &&
291 * (d) a page and its buddy are in the same zone.
292 *
293 * For recording whether a page is in the buddy system, we use PG_buddy.
294 * Setting, clearing, and testing PG_buddy is serialized by zone->lock.
295 *
296 * For recording page's order, we use page_private(page).
297 */
298static inline int page_is_buddy(struct page *page, struct page *buddy,
299								int order)
300{
301#ifdef CONFIG_HOLES_IN_ZONE
302	if (!pfn_valid(page_to_pfn(buddy)))
303		return 0;
304#endif
305
306	if (page_zone_id(page) != page_zone_id(buddy))
307		return 0;
308
309	if (PageBuddy(buddy) && page_order(buddy) == order) {
310		BUG_ON(page_count(buddy) != 0);
311		return 1;
312	}
313	return 0;
314}
315
316/*
317 * Freeing function for a buddy system allocator.
318 *
319 * The concept of a buddy system is to maintain a direct-mapped table
320 * (containing bit values) for memory blocks of various "orders".
321 * The bottom level table contains the map for the smallest allocatable
322 * units of memory (here, pages), and each level above it describes
323 * pairs of units from the levels below, hence, "buddies".
324 * At a high level, all that happens here is marking the table entry
325 * at the bottom level available, and propagating the changes upward
326 * as necessary, plus some accounting needed to play nicely with other
327 * parts of the VM system.
328 * At each level, we keep a list of pages, which are heads of contiguous
329 * free pages of length (1 << order) and marked with PG_buddy. The page's
330 * order is recorded in the page_private(page) field.
331 * So when we are allocating or freeing one, we can derive the state of the
332 * other.  That is, if we allocate a small block, and both were
333 * free, the remainder of the region must be split into blocks.
334 * If a block is freed, and its buddy is also free, then this
335 * triggers coalescing into a block of larger size.
336 *
337 * -- wli
338 */
339
340static inline void __free_one_page(struct page *page,
341		struct zone *zone, unsigned int order)
342{
343	unsigned long page_idx;
344	int order_size = 1 << order;
345
346	if (unlikely(PageCompound(page)))
347		destroy_compound_page(page, order);
348
349	page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
350
351	BUG_ON(page_idx & (order_size - 1));
352	BUG_ON(bad_range(zone, page));
353
354	zone->free_pages += order_size;
355	while (order < MAX_ORDER-1) {
356		unsigned long combined_idx;
357		struct free_area *area;
358		struct page *buddy;
359
360		buddy = __page_find_buddy(page, page_idx, order);
361		if (!page_is_buddy(page, buddy, order))
362			break;		/* Move the buddy up one level. */
363
364		list_del(&buddy->lru);
365		area = zone->free_area + order;
366		area->nr_free--;
367		rmv_page_order(buddy);
368		combined_idx = __find_combined_index(page_idx, order);
369		page = page + (combined_idx - page_idx);
370		page_idx = combined_idx;
371		order++;
372	}
373	set_page_order(page, order);
374	list_add(&page->lru, &zone->free_area[order].free_list);
375	zone->free_area[order].nr_free++;
376}
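
/*
 * Illustrative walk-through of the loop above: freeing an order-0 page
 * at page_idx 9 whose buddy (idx 8) is a free order-0 page removes
 * idx 8 from free_area[0] and merges the pair into an order-1 block at
 * idx 8; if idx 10-11 is also a free order-1 block, a second pass
 * merges again into an order-2 block at idx 8 before it is listed.
 */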
377
378static inline int free_pages_check(struct page *page)
379{
380	if (unlikely(page_mapcount(page) |
381		(page->mapping != NULL)  |
382		(page_count(page) != 0)  |
383		(page->flags & (
384			1 << PG_lru	|
385			1 << PG_private |
386			1 << PG_locked	|
387			1 << PG_active	|
388			1 << PG_reclaim	|
389			1 << PG_slab	|
390			1 << PG_swapcache |
391			1 << PG_writeback |
392			1 << PG_reserved |
393			1 << PG_buddy ))))
394		bad_page(page);
395	if (PageDirty(page))
396		__ClearPageDirty(page);
397	/*
398	 * For now, we report if PG_reserved was found set, but do not
399	 * clear it, and do not free the page.  But we shall soon need
400	 * to do more, for when the ZERO_PAGE count wraps negative.
401	 */
402	return PageReserved(page);
403}
404
405/*
406 * Frees a list of pages.
407 * Assumes all pages on list are in same zone, and of same order.
408 * count is the number of pages to free.
409 *
410 * If the zone was previously in an "all pages pinned" state then look to
411 * see if this freeing clears that state.
412 *
413 * And clear the zone's pages_scanned counter, to hold off the "all pages are
414 * pinned" detection logic.
415 */
416static void free_pages_bulk(struct zone *zone, int count,
417					struct list_head *list, int order)
418{
419	spin_lock(&zone->lock);
420	zone->all_unreclaimable = 0;
421	zone->pages_scanned = 0;
422	while (count--) {
423		struct page *page;
424
425		BUG_ON(list_empty(list));
426		page = list_entry(list->prev, struct page, lru);
427		/* have to delete it, as __free_one_page() manipulates the list */
428		list_del(&page->lru);
429		__free_one_page(page, zone, order);
430	}
431	spin_unlock(&zone->lock);
432}
433
434static void free_one_page(struct zone *zone, struct page *page, int order)
435{
436	LIST_HEAD(list);
437	list_add(&page->lru, &list);
438	free_pages_bulk(zone, 1, &list, order);
439}
440
441static void __free_pages_ok(struct page *page, unsigned int order)
442{
443	unsigned long flags;
444	int i;
445	int reserved = 0;
446
447	arch_free_page(page, order);
448	if (!PageHighMem(page))
449		debug_check_no_locks_freed(page_address(page),
450					   PAGE_SIZE<<order);
451
452	for (i = 0 ; i < (1 << order) ; ++i)
453		reserved += free_pages_check(page + i);
454	if (reserved)
455		return;
456
457	kernel_map_pages(page, 1 << order, 0);
458	local_irq_save(flags);
459	__mod_page_state(pgfree, 1 << order);
460	free_one_page(page_zone(page), page, order);
461	local_irq_restore(flags);
462}
463
464/*
465 * permit the bootmem allocator to evade page validation on high-order frees
466 */
467void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
468{
469	if (order == 0) {
470		__ClearPageReserved(page);
471		set_page_count(page, 0);
472		set_page_refcounted(page);
473		__free_page(page);
474	} else {
475		int loop;
476
477		prefetchw(page);
478		for (loop = 0; loop < BITS_PER_LONG; loop++) {
479			struct page *p = &page[loop];
480
481			if (loop + 1 < BITS_PER_LONG)
482				prefetchw(p + 1);
483			__ClearPageReserved(p);
484			set_page_count(p, 0);
485		}
486
487		set_page_refcounted(page);
488		__free_pages(page, order);
489	}
490}
491
492
493/*
494 * The order of subdivision here is critical for the IO subsystem.
495 * Please do not alter this order without good reasons and regression
496 * testing. Specifically, as large blocks of memory are subdivided,
497 * the order in which smaller blocks are delivered depends on the order
498 * they're subdivided in this function. This is the primary factor
499 * influencing the order in which pages are delivered to the IO
500 * subsystem according to empirical testing, and this is also justified
501 * by considering the behavior of a buddy system containing a single
502 * large block of memory acted on by a series of small allocations.
503 * This behavior is a critical factor in sglist merging's success.
504 *
505 * -- wli
506 */
507static inline void expand(struct zone *zone, struct page *page,
508 	int low, int high, struct free_area *area)
509{
510	unsigned long size = 1 << high;
511
512	while (high > low) {
513		area--;
514		high--;
515		size >>= 1;
516		BUG_ON(bad_range(zone, &page[size]));
517		list_add(&page[size].lru, &area->free_list);
518		area->nr_free++;
519		set_page_order(&page[size], high);
520	}
521}
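
/*
 * Illustrative example: satisfying an order-0 request from an order-3
 * block calls expand(zone, page, 0, 3, area), which peels off and
 * re-lists the upper halves as one order-2, one order-1 and one
 * order-0 buddy (page[4], page[2], page[1]), leaving page itself as
 * the order-0 allocation.
 */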
522
523/*
524 * This page is about to be returned from the page allocator
525 */
526static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
527{
528	if (unlikely(page_mapcount(page) |
529		(page->mapping != NULL)  |
530		(page_count(page) != 0)  |
531		(page->flags & (
532			1 << PG_lru	|
533			1 << PG_private	|
534			1 << PG_locked	|
535			1 << PG_active	|
536			1 << PG_dirty	|
537			1 << PG_reclaim	|
538			1 << PG_slab    |
539			1 << PG_swapcache |
540			1 << PG_writeback |
541			1 << PG_reserved |
542			1 << PG_buddy ))))
543		bad_page(page);
544
545	/*
546	 * For now, we report if PG_reserved was found set, but do not
547	 * clear it, and do not allocate the page: as a safety net.
548	 */
549	if (PageReserved(page))
550		return 1;
551
552	page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
553			1 << PG_referenced | 1 << PG_arch_1 |
554			1 << PG_checked | 1 << PG_mappedtodisk);
555	set_page_private(page, 0);
556	set_page_refcounted(page);
557	kernel_map_pages(page, 1 << order, 1);
558
559	if (gfp_flags & __GFP_ZERO)
560		prep_zero_page(page, order, gfp_flags);
561
562	if (order && (gfp_flags & __GFP_COMP))
563		prep_compound_page(page, order);
564
565	return 0;
566}
567
568/*
569 * Do the hard work of removing an element from the buddy allocator.
570 * Call me with the zone->lock already held.
571 */
572static struct page *__rmqueue(struct zone *zone, unsigned int order)
573{
574	struct free_area * area;
575	unsigned int current_order;
576	struct page *page;
577
578	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
579		area = zone->free_area + current_order;
580		if (list_empty(&area->free_list))
581			continue;
582
583		page = list_entry(area->free_list.next, struct page, lru);
584		list_del(&page->lru);
585		rmv_page_order(page);
586		area->nr_free--;
587		zone->free_pages -= 1UL << order;
588		expand(zone, page, order, current_order, area);
589		return page;
590	}
591
592	return NULL;
593}
594
595/*
596 * Obtain a specified number of elements from the buddy allocator, all under
597 * a single hold of the lock, for efficiency.  Add them to the supplied list.
598 * Returns the number of new pages which were placed at *list.
599 */
600static int rmqueue_bulk(struct zone *zone, unsigned int order,
601			unsigned long count, struct list_head *list)
602{
603	int i;
604
605	spin_lock(&zone->lock);
606	for (i = 0; i < count; ++i) {
607		struct page *page = __rmqueue(zone, order);
608		if (unlikely(page == NULL))
609			break;
610		list_add_tail(&page->lru, list);
611	}
612	spin_unlock(&zone->lock);
613	return i;
614}
615
616#ifdef CONFIG_NUMA
617/*
618 * Called from the slab reaper to drain pagesets on a particular node that
619 * belong to the currently executing processor.
620 * Note that this function must be called with the thread pinned to
621 * a single processor.
622 */
623void drain_node_pages(int nodeid)
624{
625	int i, z;
626	unsigned long flags;
627
628	for (z = 0; z < MAX_NR_ZONES; z++) {
629		struct zone *zone = NODE_DATA(nodeid)->node_zones + z;
630		struct per_cpu_pageset *pset;
631
632		pset = zone_pcp(zone, smp_processor_id());
633		for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
634			struct per_cpu_pages *pcp;
635
636			pcp = &pset->pcp[i];
637			if (pcp->count) {
638				local_irq_save(flags);
639				free_pages_bulk(zone, pcp->count, &pcp->list, 0);
640				pcp->count = 0;
641				local_irq_restore(flags);
642			}
643		}
644	}
645}
646#endif
647
648#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
649static void __drain_pages(unsigned int cpu)
650{
651	unsigned long flags;
652	struct zone *zone;
653	int i;
654
655	for_each_zone(zone) {
656		struct per_cpu_pageset *pset;
657
658		pset = zone_pcp(zone, cpu);
659		for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
660			struct per_cpu_pages *pcp;
661
662			pcp = &pset->pcp[i];
663			local_irq_save(flags);
664			free_pages_bulk(zone, pcp->count, &pcp->list, 0);
665			pcp->count = 0;
666			local_irq_restore(flags);
667		}
668	}
669}
670#endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */
671
672#ifdef CONFIG_PM
673
674void mark_free_pages(struct zone *zone)
675{
676	unsigned long zone_pfn, flags;
677	int order;
678	struct list_head *curr;
679
680	if (!zone->spanned_pages)
681		return;
682
683	spin_lock_irqsave(&zone->lock, flags);
684	for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
685		ClearPageNosaveFree(pfn_to_page(zone_pfn + zone->zone_start_pfn));
686
687	for (order = MAX_ORDER - 1; order >= 0; --order)
688		list_for_each(curr, &zone->free_area[order].free_list) {
689			unsigned long start_pfn, i;
690
691			start_pfn = page_to_pfn(list_entry(curr, struct page, lru));
692
693			for (i=0; i < (1<<order); i++)
694				SetPageNosaveFree(pfn_to_page(start_pfn+i));
695	}
696	spin_unlock_irqrestore(&zone->lock, flags);
697}
698
699/*
700 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
701 */
702void drain_local_pages(void)
703{
704	unsigned long flags;
705
706	local_irq_save(flags);
707	__drain_pages(smp_processor_id());
708	local_irq_restore(flags);
709}
710#endif /* CONFIG_PM */
711
712static void zone_statistics(struct zonelist *zonelist, struct zone *z, int cpu)
713{
714#ifdef CONFIG_NUMA
715	pg_data_t *pg = z->zone_pgdat;
716	pg_data_t *orig = zonelist->zones[0]->zone_pgdat;
717	struct per_cpu_pageset *p;
718
719	p = zone_pcp(z, cpu);
720	if (pg == orig) {
721		p->numa_hit++;
722	} else {
723		p->numa_miss++;
724		zone_pcp(zonelist->zones[0], cpu)->numa_foreign++;
725	}
726	if (pg == NODE_DATA(numa_node_id()))
727		p->local_node++;
728	else
729		p->other_node++;
730#endif
731}
732
733/*
734 * Free a 0-order page
735 */
736static void fastcall free_hot_cold_page(struct page *page, int cold)
737{
738	struct zone *zone = page_zone(page);
739	struct per_cpu_pages *pcp;
740	unsigned long flags;
741
742	arch_free_page(page, 0);
743
744	if (PageAnon(page))
745		page->mapping = NULL;
746	if (free_pages_check(page))
747		return;
748
749	kernel_map_pages(page, 1, 0);
750
751	pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
752	local_irq_save(flags);
753	__inc_page_state(pgfree);
754	list_add(&page->lru, &pcp->list);
755	pcp->count++;
756	if (pcp->count >= pcp->high) {
757		free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
758		pcp->count -= pcp->batch;
759	}
760	local_irq_restore(flags);
761	put_cpu();
762}
763
764void fastcall free_hot_page(struct page *page)
765{
766	free_hot_cold_page(page, 0);
767}
768
769void fastcall free_cold_page(struct page *page)
770{
771	free_hot_cold_page(page, 1);
772}
773
774/*
775 * split_page takes a non-compound higher-order page, and splits it into
776 * n (1<<order) sub-pages: page[0..n-1]
777 * Each sub-page must be freed individually.
778 *
779 * Note: this is probably too low level an operation for use in drivers.
780 * Please consult with lkml before using this in your driver.
781 */
782void split_page(struct page *page, unsigned int order)
783{
784	int i;
785
786	BUG_ON(PageCompound(page));
787	BUG_ON(!page_count(page));
788	for (i = 1; i < (1 << order); i++)
789		set_page_refcounted(page + i);
790}
791
792/*
793 * Really, prep_compound_page() should be called from __rmqueue_bulk().  But
794 * we cheat by calling it from here, in the order > 0 path.  Saves a branch
795 * or two.
796 */
797static struct page *buffered_rmqueue(struct zonelist *zonelist,
798			struct zone *zone, int order, gfp_t gfp_flags)
799{
800	unsigned long flags;
801	struct page *page;
802	int cold = !!(gfp_flags & __GFP_COLD);
803	int cpu;
804
805again:
806	cpu  = get_cpu();
807	if (likely(order == 0)) {
808		struct per_cpu_pages *pcp;
809
810		pcp = &zone_pcp(zone, cpu)->pcp[cold];
811		local_irq_save(flags);
812		if (!pcp->count) {
813			pcp->count += rmqueue_bulk(zone, 0,
814						pcp->batch, &pcp->list);
815			if (unlikely(!pcp->count))
816				goto failed;
817		}
818		page = list_entry(pcp->list.next, struct page, lru);
819		list_del(&page->lru);
820		pcp->count--;
821	} else {
822		spin_lock_irqsave(&zone->lock, flags);
823		page = __rmqueue(zone, order);
824		spin_unlock(&zone->lock);
825		if (!page)
826			goto failed;
827	}
828
829	__mod_page_state_zone(zone, pgalloc, 1 << order);
830	zone_statistics(zonelist, zone, cpu);
831	local_irq_restore(flags);
832	put_cpu();
833
834	BUG_ON(bad_range(zone, page));
835	if (prep_new_page(page, order, gfp_flags))
836		goto again;
837	return page;
838
839failed:
840	local_irq_restore(flags);
841	put_cpu();
842	return NULL;
843}
844
845#define ALLOC_NO_WATERMARKS	0x01 /* don't check watermarks at all */
846#define ALLOC_WMARK_MIN		0x02 /* use pages_min watermark */
847#define ALLOC_WMARK_LOW		0x04 /* use pages_low watermark */
848#define ALLOC_WMARK_HIGH	0x08 /* use pages_high watermark */
849#define ALLOC_HARDER		0x10 /* try to alloc harder */
850#define ALLOC_HIGH		0x20 /* __GFP_HIGH set */
851#define ALLOC_CPUSET		0x40 /* check for correct cpuset */
852
853/*
854 * Return 1 if free pages are above 'mark'. This takes into account the order
855 * of the allocation.
856 */
857int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
858		      int classzone_idx, int alloc_flags)
859{
860	/* free_pages may go negative - that's OK */
861	long min = mark, free_pages = z->free_pages - (1 << order) + 1;
862	int o;
863
864	if (alloc_flags & ALLOC_HIGH)
865		min -= min / 2;
866	if (alloc_flags & ALLOC_HARDER)
867		min -= min / 4;
868
869	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
870		return 0;
871	for (o = 0; o < order; o++) {
872		/* At the next order, this order's pages become unavailable */
873		free_pages -= z->free_area[o].nr_free << o;
874
875		/* Require fewer higher order pages to be free */
876		min >>= 1;
877
878		if (free_pages <= min)
879			return 0;
880	}
881	return 1;
882}
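
/*
 * Worked example, assuming mark = 1024, neither ALLOC_HIGH nor
 * ALLOC_HARDER, and an empty lowmem_reserve: an order-2 request
 * passes only if the zone has more than 1024 free pages overall
 * (counting the pages of the request itself as already gone), more
 * than 512 free pages left in blocks of order >= 1, and more than
 * 256 free pages left in blocks of order >= 2.
 */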
883
884/*
885 * get_page_from_freelist goes through the zonelist trying to allocate
886 * a page.
887 */
888static struct page *
889get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
890		struct zonelist *zonelist, int alloc_flags)
891{
892	struct zone **z = zonelist->zones;
893	struct page *page = NULL;
894	int classzone_idx = zone_idx(*z);
895
896	/*
897	 * Go through the zonelist once, looking for a zone with enough free.
898	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
899	 */
900	do {
901		if ((alloc_flags & ALLOC_CPUSET) &&
902				!cpuset_zone_allowed(*z, gfp_mask))
903			continue;
904
905		if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
906			unsigned long mark;
907			if (alloc_flags & ALLOC_WMARK_MIN)
908				mark = (*z)->pages_min;
909			else if (alloc_flags & ALLOC_WMARK_LOW)
910				mark = (*z)->pages_low;
911			else
912				mark = (*z)->pages_high;
913			if (!zone_watermark_ok(*z, order, mark,
914				    classzone_idx, alloc_flags))
915				if (!zone_reclaim_mode ||
916				    !zone_reclaim(*z, gfp_mask, order))
917					continue;
918		}
919
920		page = buffered_rmqueue(zonelist, *z, order, gfp_mask);
921		if (page) {
922			break;
923		}
924	} while (*(++z) != NULL);
925	return page;
926}
927
928/*
929 * This is the 'heart' of the zoned buddy allocator.
930 */
931struct page * fastcall
932__alloc_pages(gfp_t gfp_mask, unsigned int order,
933		struct zonelist *zonelist)
934{
935	const gfp_t wait = gfp_mask & __GFP_WAIT;
936	struct zone **z;
937	struct page *page;
938	struct reclaim_state reclaim_state;
939	struct task_struct *p = current;
940	int do_retry;
941	int alloc_flags;
942	int did_some_progress;
943
944	might_sleep_if(wait);
945
946restart:
947	z = zonelist->zones;  /* the list of zones suitable for gfp_mask */
948
949	if (unlikely(*z == NULL)) {
950		/* Should this ever happen?? */
951		return NULL;
952	}
953
954	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
955				zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET);
956	if (page)
957		goto got_pg;
958
959	do {
960		wakeup_kswapd(*z, order);
961	} while (*(++z));
962
963	/*
964	 * OK, we're below the kswapd watermark and have kicked background
965	 * reclaim. Now things get more complex, so set up alloc_flags according
966	 * to how we want to proceed.
967	 *
968	 * The caller may dip into page reserves a bit more if the caller
969	 * cannot run direct reclaim, or if the caller has realtime scheduling
970	 * policy or is asking for __GFP_HIGH memory.  GFP_ATOMIC requests will
971	 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
972	 */
973	alloc_flags = ALLOC_WMARK_MIN;
974	if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
975		alloc_flags |= ALLOC_HARDER;
976	if (gfp_mask & __GFP_HIGH)
977		alloc_flags |= ALLOC_HIGH;
978	if (wait)
979		alloc_flags |= ALLOC_CPUSET;
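
	/*
	 * For illustration: a GFP_ATOMIC request (__GFP_HIGH set, __GFP_WAIT
	 * clear) reaches this point with ALLOC_WMARK_MIN | ALLOC_HARDER |
	 * ALLOC_HIGH, so it may dig below pages_min but may not sleep.
	 */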
980
981	/*
982	 * Go through the zonelist again. Let __GFP_HIGH and allocations
983	 * coming from realtime tasks go deeper into reserves.
984	 *
985	 * This is the last chance, in general, before the goto nopage.
986	 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
987	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
988	 */
989	page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags);
990	if (page)
991		goto got_pg;
992
993	/* This allocation should allow future memory freeing. */
994
995	if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
996			&& !in_interrupt()) {
997		if (!(gfp_mask & __GFP_NOMEMALLOC)) {
998nofail_alloc:
999			/* go through the zonelist yet again, ignoring mins */
1000			page = get_page_from_freelist(gfp_mask, order,
1001				zonelist, ALLOC_NO_WATERMARKS);
1002			if (page)
1003				goto got_pg;
1004			if (gfp_mask & __GFP_NOFAIL) {
1005				blk_congestion_wait(WRITE, HZ/50);
1006				goto nofail_alloc;
1007			}
1008		}
1009		goto nopage;
1010	}
1011
1012	/* Atomic allocations - we can't balance anything */
1013	if (!wait)
1014		goto nopage;
1015
1016rebalance:
1017	cond_resched();
1018
1019	/* We now go into synchronous reclaim */
1020	cpuset_memory_pressure_bump();
1021	p->flags |= PF_MEMALLOC;
1022	reclaim_state.reclaimed_slab = 0;
1023	p->reclaim_state = &reclaim_state;
1024
1025	did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask);
1026
1027	p->reclaim_state = NULL;
1028	p->flags &= ~PF_MEMALLOC;
1029
1030	cond_resched();
1031
1032	if (likely(did_some_progress)) {
1033		page = get_page_from_freelist(gfp_mask, order,
1034						zonelist, alloc_flags);
1035		if (page)
1036			goto got_pg;
1037	} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
1038		/*
1039		 * Go through the zonelist yet one more time, keep
1040		 * very high watermark here, this is only to catch
1041		 * a parallel oom killing, we must fail if we're still
1042		 * under heavy pressure.
1043		 */
1044		page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
1045				zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET);
1046		if (page)
1047			goto got_pg;
1048
1049		out_of_memory(zonelist, gfp_mask, order);
1050		goto restart;
1051	}
1052
1053	/*
1054	 * Don't let big-order allocations loop unless the caller explicitly
1055	 * requests that.  Wait for some write requests to complete then retry.
1056	 *
1057	 * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order
1058	 * <= 3, but that may not be true in other implementations.
1059	 */
1060	do_retry = 0;
1061	if (!(gfp_mask & __GFP_NORETRY)) {
1062		if ((order <= 3) || (gfp_mask & __GFP_REPEAT))
1063			do_retry = 1;
1064		if (gfp_mask & __GFP_NOFAIL)
1065			do_retry = 1;
1066	}
1067	if (do_retry) {
1068		blk_congestion_wait(WRITE, HZ/50);
1069		goto rebalance;
1070	}
1071
1072nopage:
1073	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
1074		printk(KERN_WARNING "%s: page allocation failure."
1075			" order:%d, mode:0x%x\n",
1076			p->comm, order, gfp_mask);
1077		dump_stack();
1078		show_mem();
1079	}
1080got_pg:
1081	return page;
1082}
1083
1084EXPORT_SYMBOL(__alloc_pages);
1085
1086/*
1087 * Common helper functions.
1088 */
1089fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
1090{
1091	struct page * page;
1092	page = alloc_pages(gfp_mask, order);
1093	if (!page)
1094		return 0;
1095	return (unsigned long) page_address(page);
1096}
1097
1098EXPORT_SYMBOL(__get_free_pages);
1099
1100fastcall unsigned long get_zeroed_page(gfp_t gfp_mask)
1101{
1102	struct page * page;
1103
1104	/*
1105	 * get_zeroed_page() returns a directly mapped kernel address, which
1106	 * cannot represent a highmem page
1107	 */
1108	BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
1109
1110	page = alloc_pages(gfp_mask | __GFP_ZERO, 0);
1111	if (page)
1112		return (unsigned long) page_address(page);
1113	return 0;
1114}
1115
1116EXPORT_SYMBOL(get_zeroed_page);
1117
1118void __pagevec_free(struct pagevec *pvec)
1119{
1120	int i = pagevec_count(pvec);
1121
1122	while (--i >= 0)
1123		free_hot_cold_page(pvec->pages[i], pvec->cold);
1124}
1125
1126fastcall void __free_pages(struct page *page, unsigned int order)
1127{
1128	if (put_page_testzero(page)) {
1129		if (order == 0)
1130			free_hot_page(page);
1131		else
1132			__free_pages_ok(page, order);
1133	}
1134}
1135
1136EXPORT_SYMBOL(__free_pages);
1137
1138fastcall void free_pages(unsigned long addr, unsigned int order)
1139{
1140	if (addr != 0) {
1141		BUG_ON(!virt_addr_valid((void *)addr));
1142		__free_pages(virt_to_page((void *)addr), order);
1143	}
1144}
1145
1146EXPORT_SYMBOL(free_pages);
1147
1148/*
1149 * Total amount of free (allocatable) RAM:
1150 */
1151unsigned int nr_free_pages(void)
1152{
1153	unsigned int sum = 0;
1154	struct zone *zone;
1155
1156	for_each_zone(zone)
1157		sum += zone->free_pages;
1158
1159	return sum;
1160}
1161
1162EXPORT_SYMBOL(nr_free_pages);
1163
1164#ifdef CONFIG_NUMA
1165unsigned int nr_free_pages_pgdat(pg_data_t *pgdat)
1166{
1167	unsigned int i, sum = 0;
1168
1169	for (i = 0; i < MAX_NR_ZONES; i++)
1170		sum += pgdat->node_zones[i].free_pages;
1171
1172	return sum;
1173}
1174#endif
1175
1176static unsigned int nr_free_zone_pages(int offset)
1177{
1178	/* Just pick one node, since fallback list is circular */
1179	pg_data_t *pgdat = NODE_DATA(numa_node_id());
1180	unsigned int sum = 0;
1181
1182	struct zonelist *zonelist = pgdat->node_zonelists + offset;
1183	struct zone **zonep = zonelist->zones;
1184	struct zone *zone;
1185
1186	for (zone = *zonep++; zone; zone = *zonep++) {
1187		unsigned long size = zone->present_pages;
1188		unsigned long high = zone->pages_high;
1189		if (size > high)
1190			sum += size - high;
1191	}
1192
1193	return sum;
1194}
1195
1196/*
1197 * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
1198 */
1199unsigned int nr_free_buffer_pages(void)
1200{
1201	return nr_free_zone_pages(gfp_zone(GFP_USER));
1202}
1203
1204/*
1205 * Amount of free RAM allocatable within all zones
1206 */
1207unsigned int nr_free_pagecache_pages(void)
1208{
1209	return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER));
1210}
1211
1212#ifdef CONFIG_HIGHMEM
1213unsigned int nr_free_highpages (void)
1214{
1215	pg_data_t *pgdat;
1216	unsigned int pages = 0;
1217
1218	for_each_online_pgdat(pgdat)
1219		pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
1220
1221	return pages;
1222}
1223#endif
1224
1225#ifdef CONFIG_NUMA
1226static void show_node(struct zone *zone)
1227{
1228	printk("Node %d ", zone->zone_pgdat->node_id);
1229}
1230#else
1231#define show_node(zone)	do { } while (0)
1232#endif
1233
1234void si_meminfo(struct sysinfo *val)
1235{
1236	val->totalram = totalram_pages;
1237	val->sharedram = 0;
1238	val->freeram = nr_free_pages();
1239	val->bufferram = nr_blockdev_pages();
1240#ifdef CONFIG_HIGHMEM
1241	val->totalhigh = totalhigh_pages;
1242	val->freehigh = nr_free_highpages();
1243#else
1244	val->totalhigh = 0;
1245	val->freehigh = 0;
1246#endif
1247	val->mem_unit = PAGE_SIZE;
1248}
1249
1250EXPORT_SYMBOL(si_meminfo);
1251
1252#ifdef CONFIG_NUMA
1253void si_meminfo_node(struct sysinfo *val, int nid)
1254{
1255	pg_data_t *pgdat = NODE_DATA(nid);
1256
1257	val->totalram = pgdat->node_present_pages;
1258	val->freeram = nr_free_pages_pgdat(pgdat);
1259	val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
1260	val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages;
1261	val->mem_unit = PAGE_SIZE;
1262}
1263#endif
1264
1265#define K(x) ((x) << (PAGE_SHIFT-10))
1266
1267/*
1268 * Show free area list (used inside shift_scroll-lock stuff)
1269 * We also calculate the percentage fragmentation. We do this by counting the
1270 * memory on each free list with the exception of the first item on the list.
1271 */
1272void show_free_areas(void)
1273{
1274	struct page_state ps;
1275	int cpu, temperature;
1276	unsigned long active;
1277	unsigned long inactive;
1278	unsigned long free;
1279	struct zone *zone;
1280
1281	for_each_zone(zone) {
1282		show_node(zone);
1283		printk("%s per-cpu:", zone->name);
1284
1285		if (!populated_zone(zone)) {
1286			printk(" empty\n");
1287			continue;
1288		} else
1289			printk("\n");
1290
1291		for_each_online_cpu(cpu) {
1292			struct per_cpu_pageset *pageset;
1293
1294			pageset = zone_pcp(zone, cpu);
1295
1296			for (temperature = 0; temperature < 2; temperature++)
1297				printk("cpu %d %s: high %d, batch %d used:%d\n",
1298					cpu,
1299					temperature ? "cold" : "hot",
1300					pageset->pcp[temperature].high,
1301					pageset->pcp[temperature].batch,
1302					pageset->pcp[temperature].count);
1303		}
1304	}
1305
1306	get_page_state(&ps);
1307	get_zone_counts(&active, &inactive, &free);
1308
1309	printk("Free pages: %11ukB (%ukB HighMem)\n",
1310		K(nr_free_pages()),
1311		K(nr_free_highpages()));
1312
1313	printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu "
1314		"unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n",
1315		active,
1316		inactive,
1317		ps.nr_dirty,
1318		ps.nr_writeback,
1319		ps.nr_unstable,
1320		nr_free_pages(),
1321		ps.nr_slab,
1322		ps.nr_mapped,
1323		ps.nr_page_table_pages);
1324
1325	for_each_zone(zone) {
1326		int i;
1327
1328		show_node(zone);
1329		printk("%s"
1330			" free:%lukB"
1331			" min:%lukB"
1332			" low:%lukB"
1333			" high:%lukB"
1334			" active:%lukB"
1335			" inactive:%lukB"
1336			" present:%lukB"
1337			" pages_scanned:%lu"
1338			" all_unreclaimable? %s"
1339			"\n",
1340			zone->name,
1341			K(zone->free_pages),
1342			K(zone->pages_min),
1343			K(zone->pages_low),
1344			K(zone->pages_high),
1345			K(zone->nr_active),
1346			K(zone->nr_inactive),
1347			K(zone->present_pages),
1348			zone->pages_scanned,
1349			(zone->all_unreclaimable ? "yes" : "no")
1350			);
1351		printk("lowmem_reserve[]:");
1352		for (i = 0; i < MAX_NR_ZONES; i++)
1353			printk(" %lu", zone->lowmem_reserve[i]);
1354		printk("\n");
1355	}
1356
1357	for_each_zone(zone) {
1358 		unsigned long nr[MAX_ORDER], flags, order, total = 0;
1359
1360		show_node(zone);
1361		printk("%s: ", zone->name);
1362		if (!populated_zone(zone)) {
1363			printk("empty\n");
1364			continue;
1365		}
1366
1367		spin_lock_irqsave(&zone->lock, flags);
1368		for (order = 0; order < MAX_ORDER; order++) {
1369			nr[order] = zone->free_area[order].nr_free;
1370			total += nr[order] << order;
1371		}
1372		spin_unlock_irqrestore(&zone->lock, flags);
1373		for (order = 0; order < MAX_ORDER; order++)
1374			printk("%lu*%lukB ", nr[order], K(1UL) << order);
1375		printk("= %lukB\n", K(total));
1376	}
1377
1378	show_swap_cache_info();
1379}
1380
1381/*
1382 * Builds allocation fallback zone lists.
1383 *
1384 * Add all populated zones of a node to the zonelist.
1385 */
1386static int __meminit build_zonelists_node(pg_data_t *pgdat,
1387			struct zonelist *zonelist, int nr_zones, int zone_type)
1388{
1389	struct zone *zone;
1390
1391	BUG_ON(zone_type > ZONE_HIGHMEM);
1392
1393	do {
1394		zone = pgdat->node_zones + zone_type;
1395		if (populated_zone(zone)) {
1396#ifndef CONFIG_HIGHMEM
1397			BUG_ON(zone_type > ZONE_NORMAL);
1398#endif
1399			zonelist->zones[nr_zones++] = zone;
1400			check_highest_zone(zone_type);
1401		}
1402		zone_type--;
1403
1404	} while (zone_type >= 0);
1405	return nr_zones;
1406}
1407
1408static inline int highest_zone(int zone_bits)
1409{
1410	int res = ZONE_NORMAL;
1411	if (zone_bits & (__force int)__GFP_HIGHMEM)
1412		res = ZONE_HIGHMEM;
1413	if (zone_bits & (__force int)__GFP_DMA32)
1414		res = ZONE_DMA32;
1415	if (zone_bits & (__force int)__GFP_DMA)
1416		res = ZONE_DMA;
1417	return res;
1418}
1419
1420#ifdef CONFIG_NUMA
1421#define MAX_NODE_LOAD (num_online_nodes())
1422static int __meminitdata node_load[MAX_NUMNODES];
1423/**
1424 * find_next_best_node - find the next node that should appear in a given node's fallback list
1425 * @node: node whose fallback list we're appending
1426 * @used_node_mask: nodemask_t of already used nodes
1427 *
1428 * We use a number of factors to determine which is the next node that should
1429 * appear on a given node's fallback list.  The node should not have appeared
1430 * already in @node's fallback list, and it should be the next closest node
1431 * according to the distance array (which contains arbitrary distance values
1432 * from each node to each node in the system), and should also prefer nodes
1433 * with no CPUs, since presumably they'll have very little allocation pressure
1434 * on them otherwise.
1435 * It returns -1 if no node is found.
1436 */
1437static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask)
1438{
1439	int n, val;
1440	int min_val = INT_MAX;
1441	int best_node = -1;
1442
1443	/* Use the local node if we haven't already */
1444	if (!node_isset(node, *used_node_mask)) {
1445		node_set(node, *used_node_mask);
1446		return node;
1447	}
1448
1449	for_each_online_node(n) {
1450		cpumask_t tmp;
1451
1452		/* Don't want a node to appear more than once */
1453		if (node_isset(n, *used_node_mask))
1454			continue;
1455
1456		/* Use the distance array to find the distance */
1457		val = node_distance(node, n);
1458
1459		/* Penalize nodes under us ("prefer the next node") */
1460		val += (n < node);
1461
1462		/* Give preference to headless and unused nodes */
1463		tmp = node_to_cpumask(n);
1464		if (!cpus_empty(tmp))
1465			val += PENALTY_FOR_NODE_WITH_CPUS;
1466
1467		/* Slight preference for less loaded node */
1468		val *= (MAX_NODE_LOAD*MAX_NUMNODES);
1469		val += node_load[n];
1470
1471		if (val < min_val) {
1472			min_val = val;
1473			best_node = n;
1474		}
1475	}
1476
1477	if (best_node >= 0)
1478		node_set(best_node, *used_node_mask);
1479
1480	return best_node;
1481}
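
/*
 * Roughly speaking: the distance (plus the small CPU and ordering
 * penalties) is scaled up by MAX_NODE_LOAD * MAX_NUMNODES before
 * node_load[] is added, so node_load[] mainly acts as a tie-breaker
 * between candidate nodes that sit at the same distance from @node.
 */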
1482
1483static void __meminit build_zonelists(pg_data_t *pgdat)
1484{
1485	int i, j, k, node, local_node;
1486	int prev_node, load;
1487	struct zonelist *zonelist;
1488	nodemask_t used_mask;
1489
1490	/* initialize zonelists */
1491	for (i = 0; i < GFP_ZONETYPES; i++) {
1492		zonelist = pgdat->node_zonelists + i;
1493		zonelist->zones[0] = NULL;
1494	}
1495
1496	/* NUMA-aware ordering of nodes */
1497	local_node = pgdat->node_id;
1498	load = num_online_nodes();
1499	prev_node = local_node;
1500	nodes_clear(used_mask);
1501	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
1502		int distance = node_distance(local_node, node);
1503
1504		/*
1505		 * If another node is sufficiently far away then it is better
1506		 * to reclaim pages in a zone before going off node.
1507		 */
1508		if (distance > RECLAIM_DISTANCE)
1509			zone_reclaim_mode = 1;
1510
1511		/*
1512		 * We don't want to pressure a particular node.
1513		 * So add a penalty to the first node in the same
1514		 * distance group, to make the selection round-robin.
1515		 */
1516
1517		if (distance != node_distance(local_node, prev_node))
1518			node_load[node] += load;
1519		prev_node = node;
1520		load--;
1521		for (i = 0; i < GFP_ZONETYPES; i++) {
1522			zonelist = pgdat->node_zonelists + i;
1523			for (j = 0; zonelist->zones[j] != NULL; j++);
1524
1525			k = highest_zone(i);
1526
1527	 		j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
1528			zonelist->zones[j] = NULL;
1529		}
1530	}
1531}
1532
1533#else	/* CONFIG_NUMA */
1534
1535static void __meminit build_zonelists(pg_data_t *pgdat)
1536{
1537	int i, j, k, node, local_node;
1538
1539	local_node = pgdat->node_id;
1540	for (i = 0; i < GFP_ZONETYPES; i++) {
1541		struct zonelist *zonelist;
1542
1543		zonelist = pgdat->node_zonelists + i;
1544
1545		j = 0;
1546		k = highest_zone(i);
1547 		j = build_zonelists_node(pgdat, zonelist, j, k);
1548 		/*
1549 		 * Now we build the zonelist so that it contains the zones
1550 		 * of all the other nodes.
1551 		 * We don't want to pressure a particular node, so when
1552 		 * building the zones for node N, we make sure that the
1553 		 * zones coming right after the local ones are those from
1554 		 * node N+1 (modulo the number of nodes)
1555 		 */
1556		for (node = local_node + 1; node < MAX_NUMNODES; node++) {
1557			if (!node_online(node))
1558				continue;
1559			j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
1560		}
1561		for (node = 0; node < local_node; node++) {
1562			if (!node_online(node))
1563				continue;
1564			j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
1565		}
1566
1567		zonelist->zones[j] = NULL;
1568	}
1569}
1570
1571#endif	/* CONFIG_NUMA */
1572
1573/* The return type is int just to match what stop_machine_run() expects */
1574static int __meminit __build_all_zonelists(void *dummy)
1575{
1576	int nid;
1577	for_each_online_node(nid)
1578		build_zonelists(NODE_DATA(nid));
1579	return 0;
1580}
1581
1582void __meminit build_all_zonelists(void)
1583{
1584	if (system_state == SYSTEM_BOOTING) {
1585		__build_all_zonelists(0);
1586		cpuset_init_current_mems_allowed();
1587	} else {
1588		/* we have to stop all cpus to guarantee there is no user
1589		   of zonelist */
1590		stop_machine_run(__build_all_zonelists, NULL, NR_CPUS);
1591		/* cpuset refresh routine should be here */
1592	}
1593	vm_total_pages = nr_free_pagecache_pages();
1594	printk("Built %i zonelists.  Total pages: %ld\n",
1595			num_online_nodes(), vm_total_pages);
1596}
1597
1598/*
1599 * Helper functions to size the waitqueue hash table.
1600 * Essentially these want to choose hash table sizes sufficiently
1601 * large so that collisions trying to wait on pages are rare.
1602 * But in fact, the number of active page waitqueues on typical
1603 * systems is ridiculously low, less than 200. So this is even
1604 * conservative, even though it seems large.
1605 *
1606 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
1607 * waitqueues, i.e. the size of the waitq table given the number of pages.
1608 */
1609#define PAGES_PER_WAITQUEUE	256
1610
1611#ifndef CONFIG_MEMORY_HOTPLUG
1612static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
1613{
1614	unsigned long size = 1;
1615
1616	pages /= PAGES_PER_WAITQUEUE;
1617
1618	while (size < pages)
1619		size <<= 1;
1620
1621	/*
1622	 * Once we have dozens or even hundreds of threads sleeping
1623	 * on IO we've got bigger problems than wait queue collision.
1624	 * Limit the size of the wait table to a reasonable size.
1625	 */
1626	size = min(size, 4096UL);
1627
1628	return max(size, 4UL);
1629}
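
/*
 * Worked example for the sizing above, assuming a 4GB zone with 4K
 * pages (1048576 pages): 1048576 / PAGES_PER_WAITQUEUE = 4096, so the
 * table gets 4096 entries (the cap) and wait_table_bits() below
 * returns 12.
 */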
1630#else
1631/*
1632 * A zone's size might be changed by hot-add, so it is not possible to determine
1633 * a suitable size for its wait_table.  So we use the maximum size now.
1634 *
1635 * The max wait table size = 4096 x sizeof(wait_queue_head_t).   ie:
1636 *
1637 *    i386 (preemption config)    : 4096 x 16 = 64Kbyte.
1638 *    ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
1639 *    ia64, x86-64 (preemption)   : 4096 x 24 = 96Kbyte.
1640 *
1641 * The maximum number of entries is used once a zone's memory reaches
1642 * (512K + 256) pages or more under the traditional sizing above.  That equals:
1643 *
1644 *    i386, x86-64, powerpc(4K page size) : =  ( 2G + 1M)byte.
1645 *    ia64(16K page size)                 : =  ( 8G + 4M)byte.
1646 *    powerpc (64K page size)             : =  (32G +16M)byte.
1647 */
1648static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
1649{
1650	return 4096UL;
1651}
1652#endif
1653
1654/*
1655 * This is an integer logarithm so that shifts can be used later
1656 * to extract the more random high bits from the multiplicative
1657 * hash function before the remainder is taken.
1658 */
1659static inline unsigned long wait_table_bits(unsigned long size)
1660{
1661	return ffz(~size);
1662}
1663
1664#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
1665
1666static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
1667		unsigned long *zones_size, unsigned long *zholes_size)
1668{
1669	unsigned long realtotalpages, totalpages = 0;
1670	int i;
1671
1672	for (i = 0; i < MAX_NR_ZONES; i++)
1673		totalpages += zones_size[i];
1674	pgdat->node_spanned_pages = totalpages;
1675
1676	realtotalpages = totalpages;
1677	if (zholes_size)
1678		for (i = 0; i < MAX_NR_ZONES; i++)
1679			realtotalpages -= zholes_size[i];
1680	pgdat->node_present_pages = realtotalpages;
1681	printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
1682}
1683
1684
1685/*
1686 * Initially all pages are reserved - free ones are freed
1687 * up by free_all_bootmem() once the early boot process is
1688 * done. Non-atomic initialization, single-pass.
1689 */
1690void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
1691		unsigned long start_pfn)
1692{
1693	struct page *page;
1694	unsigned long end_pfn = start_pfn + size;
1695	unsigned long pfn;
1696
1697	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
1698		if (!early_pfn_valid(pfn))
1699			continue;
1700		page = pfn_to_page(pfn);
1701		set_page_links(page, zone, nid, pfn);
1702		init_page_count(page);
1703		reset_page_mapcount(page);
1704		SetPageReserved(page);
1705		INIT_LIST_HEAD(&page->lru);
1706#ifdef WANT_PAGE_VIRTUAL
1707		/* The shift won't overflow because ZONE_NORMAL is below 4G. */
1708		if (!is_highmem_idx(zone))
1709			set_page_address(page, __va(pfn << PAGE_SHIFT));
1710#endif
1711	}
1712}
1713
1714void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
1715				unsigned long size)
1716{
1717	int order;
1718	for (order = 0; order < MAX_ORDER ; order++) {
1719		INIT_LIST_HEAD(&zone->free_area[order].free_list);
1720		zone->free_area[order].nr_free = 0;
1721	}
1722}
1723
1724#define ZONETABLE_INDEX(x, zone_nr)	((x << ZONES_SHIFT) | zone_nr)
1725void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
1726		unsigned long size)
1727{
1728	unsigned long snum = pfn_to_section_nr(pfn);
1729	unsigned long end = pfn_to_section_nr(pfn + size);
1730
1731	if (FLAGS_HAS_NODE)
1732		zone_table[ZONETABLE_INDEX(nid, zid)] = zone;
1733	else
1734		for (; snum <= end; snum++)
1735			zone_table[ZONETABLE_INDEX(snum, zid)] = zone;
1736}
1737
1738#ifndef __HAVE_ARCH_MEMMAP_INIT
1739#define memmap_init(size, nid, zone, start_pfn) \
1740	memmap_init_zone((size), (nid), (zone), (start_pfn))
1741#endif
1742
1743static int __cpuinit zone_batchsize(struct zone *zone)
1744{
1745	int batch;
1746
1747	/*
1748	 * The per-cpu-pages pools are set to around 1/1000th of the
1749	 * size of the zone.  But no more than 1/2 of a meg.
1750	 *
1751	 * OK, so we don't know how big the cache is.  So guess.
1752	 */
1753	batch = zone->present_pages / 1024;
1754	if (batch * PAGE_SIZE > 512 * 1024)
1755		batch = (512 * 1024) / PAGE_SIZE;
1756	batch /= 4;		/* We effectively *= 4 below */
1757	if (batch < 1)
1758		batch = 1;
1759
1760	/*
1761	 * Clamp the batch to a 2^n - 1 value. Having a power
1762	 * of 2 value was found to be more likely to have
1763	 * suboptimal cache aliasing properties in some cases.
1764	 *
1765	 * For example if 2 tasks are alternately allocating
1766	 * batches of pages, one task can end up with a lot
1767	 * of pages of one half of the possible page colors
1768	 * and the other with pages of the other colors.
1769	 */
1770	batch = (1 << (fls(batch + batch/2)-1)) - 1;
1771
1772	return batch;
1773}
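
/*
 * Worked example, assuming a zone of 131072 4K pages (512MB):
 * 131072 / 1024 = 128, which stays under the 512KB cap; dividing by 4
 * gives 32, and rounding down to (2^n - 1) yields a batch of 31.
 * setup_pageset() below then uses a hot-list high watermark of
 * 6 * 31 = 186 pages.
 */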
1774
1775inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
1776{
1777	struct per_cpu_pages *pcp;
1778
1779	memset(p, 0, sizeof(*p));
1780
1781	pcp = &p->pcp[0];		/* hot */
1782	pcp->count = 0;
1783	pcp->high = 6 * batch;
1784	pcp->batch = max(1UL, 1 * batch);
1785	INIT_LIST_HEAD(&pcp->list);
1786
1787	pcp = &p->pcp[1];		/* cold */
1788	pcp->count = 0;
1789	pcp->high = 2 * batch;
1790	pcp->batch = max(1UL, batch/2);
1791	INIT_LIST_HEAD(&pcp->list);
1792}
1793
1794/*
1795 * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
1796 * to the value high for the pageset p.
1797 */
1798
1799static void setup_pagelist_highmark(struct per_cpu_pageset *p,
1800				unsigned long high)
1801{
1802	struct per_cpu_pages *pcp;
1803
1804	pcp = &p->pcp[0]; /* hot list */
1805	pcp->high = high;
1806	pcp->batch = max(1UL, high/4);
1807	if ((high/4) > (PAGE_SHIFT * 8))
1808		pcp->batch = PAGE_SHIFT * 8;
1809}
1810
1811
1812#ifdef CONFIG_NUMA
1813/*
1814 * Boot pageset table. One per cpu which is going to be used for all
1815 * zones and all nodes. The parameters will be set in such a way
1816 * that an item put on a list will immediately be handed over to
1817 * the buddy list. This is safe since pageset manipulation is done
1818 * with interrupts disabled.
1819 *
1820 * Some NUMA counter updates may also be caught by the boot pagesets.
1821 *
1822 * The boot_pagesets must be kept even after bootup is complete for
1823 * unused processors and/or zones. They do play a role for bootstrapping
1824 * hotplugged processors.
1825 *
1826 * zoneinfo_show() and maybe other functions do
1827 * not check if the processor is online before following the pageset pointer.
1828 * Other parts of the kernel may not check if the zone is available.
1829 */
1830static struct per_cpu_pageset boot_pageset[NR_CPUS];
1831
1832/*
1833 * Dynamically allocate memory for the
1834 * per cpu pageset array in struct zone.
1835 */
1836static int __cpuinit process_zones(int cpu)
1837{
1838	struct zone *zone, *dzone;
1839
1840	for_each_zone(zone) {
1841
1842		zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
1843					 GFP_KERNEL, cpu_to_node(cpu));
1844		if (!zone_pcp(zone, cpu))
1845			goto bad;
1846
1847		setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
1848
1849		if (percpu_pagelist_fraction)
1850			setup_pagelist_highmark(zone_pcp(zone, cpu),
1851			 	(zone->present_pages / percpu_pagelist_fraction));
1852	}
1853
1854	return 0;
1855bad:
1856	for_each_zone(dzone) {
1857		if (dzone == zone)
1858			break;
1859		kfree(zone_pcp(dzone, cpu));
1860		zone_pcp(dzone, cpu) = NULL;
1861	}
1862	return -ENOMEM;
1863}
1864
1865static inline void free_zone_pagesets(int cpu)
1866{
1867	struct zone *zone;
1868
1869	for_each_zone(zone) {
1870		struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
1871
1872		zone_pcp(zone, cpu) = NULL;
1873		kfree(pset);
1874	}
1875}
1876
1877static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
1878		unsigned long action,
1879		void *hcpu)
1880{
1881	int cpu = (long)hcpu;
1882	int ret = NOTIFY_OK;
1883
1884	switch (action) {
1885		case CPU_UP_PREPARE:
1886			if (process_zones(cpu))
1887				ret = NOTIFY_BAD;
1888			break;
1889		case CPU_UP_CANCELED:
1890		case CPU_DEAD:
1891			free_zone_pagesets(cpu);
1892			break;
1893		default:
1894			break;
1895	}
1896	return ret;
1897}
1898
1899static struct notifier_block __cpuinitdata pageset_notifier =
1900	{ &pageset_cpuup_callback, NULL, 0 };
1901
1902void __init setup_per_cpu_pageset(void)
1903{
1904	int err;
1905
1906	/* Initialize per_cpu_pageset for cpu 0.
1907	 * A cpuup callback will do this for every cpu
1908	 * as it comes online
1909	 */
1910	err = process_zones(smp_processor_id());
1911	BUG_ON(err);
1912	register_cpu_notifier(&pageset_notifier);
1913}
1914
1915#endif
1916
1917static __meminit
1918int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
1919{
1920	int i;
1921	struct pglist_data *pgdat = zone->zone_pgdat;
1922	size_t alloc_size;
1923
1924	/*
1925	 * The per-page waitqueue mechanism uses hashed waitqueues
1926	 * per zone.
1927	 */
1928	zone->wait_table_hash_nr_entries =
1929		 wait_table_hash_nr_entries(zone_size_pages);
1930	zone->wait_table_bits =
1931		wait_table_bits(zone->wait_table_hash_nr_entries);
1932	alloc_size = zone->wait_table_hash_nr_entries
1933					* sizeof(wait_queue_head_t);
1934
1935 	if (system_state == SYSTEM_BOOTING) {
1936		zone->wait_table = (wait_queue_head_t *)
1937			alloc_bootmem_node(pgdat, alloc_size);
1938	} else {
1939		/*
1940		 * This case means that a zone whose size was 0 gets new memory
1941		 * via memory hot-add.
1942		 * It may also be that an entire new node was hot-added.  In
1943		 * that case vmalloc() cannot yet allocate from the new node's
1944		 * memory, even though this wait_table would ideally live on
1945		 * the new node itself.
1946		 * Making use of the new node's own memory here will require
1947		 * further work.
1948		 */
1949		zone->wait_table = (wait_queue_head_t *)vmalloc(alloc_size);
1950	}
1951	if (!zone->wait_table)
1952		return -ENOMEM;
1953
1954	for(i = 0; i < zone->wait_table_hash_nr_entries; ++i)
1955		init_waitqueue_head(zone->wait_table + i);
1956
1957	return 0;
1958}
1959
1960static __meminit void zone_pcp_init(struct zone *zone)
1961{
1962	int cpu;
1963	unsigned long batch = zone_batchsize(zone);
1964
1965	for (cpu = 0; cpu < NR_CPUS; cpu++) {
1966#ifdef CONFIG_NUMA
1967		/* Early boot. Slab allocator not functional yet */
1968		zone_pcp(zone, cpu) = &boot_pageset[cpu];
1969		setup_pageset(&boot_pageset[cpu],0);
1970#else
1971		setup_pageset(zone_pcp(zone,cpu), batch);
1972#endif
1973	}
1974	if (zone->present_pages)
1975		printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
1976			zone->name, zone->present_pages, batch);
1977}
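/*
 * On NUMA kernels every zone/cpu slot initially shares the static
 * boot_pageset with a batch of 0 (nothing is batched before the slab
 * allocator exists); on non-NUMA kernels the pagesets are embedded in
 * struct zone and get the computed batch size straight away.
 */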
1978
1979__meminit int init_currently_empty_zone(struct zone *zone,
1980					unsigned long zone_start_pfn,
1981					unsigned long size)
1982{
1983	struct pglist_data *pgdat = zone->zone_pgdat;
1984	int ret;
1985	ret = zone_wait_table_init(zone, size);
1986	if (ret)
1987		return ret;
1988	pgdat->nr_zones = zone_idx(zone) + 1;
1989
1990	zone->zone_start_pfn = zone_start_pfn;
1991
1992	memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
1993
1994	zone_init_free_lists(pgdat, zone, zone->spanned_pages);
1995
1996	return 0;
1997}
1998
1999/*
2000 * Set up the zone data structures:
2001 *   - mark all pages reserved
2002 *   - mark all memory queues empty
2003 *   - clear the memory bitmaps
2004 */
2005static void __meminit free_area_init_core(struct pglist_data *pgdat,
2006		unsigned long *zones_size, unsigned long *zholes_size)
2007{
2008	unsigned long j;
2009	int nid = pgdat->node_id;
2010	unsigned long zone_start_pfn = pgdat->node_start_pfn;
2011	int ret;
2012
2013	pgdat_resize_init(pgdat);
2014	pgdat->nr_zones = 0;
2015	init_waitqueue_head(&pgdat->kswapd_wait);
2016	pgdat->kswapd_max_order = 0;
2017
2018	for (j = 0; j < MAX_NR_ZONES; j++) {
2019		struct zone *zone = pgdat->node_zones + j;
2020		unsigned long size, realsize;
2021
2022		realsize = size = zones_size[j];
2023		if (zholes_size)
2024			realsize -= zholes_size[j];
2025
2026		if (j < ZONE_HIGHMEM)
2027			nr_kernel_pages += realsize;
2028		nr_all_pages += realsize;
2029
2030		zone->spanned_pages = size;
2031		zone->present_pages = realsize;
2032		zone->name = zone_names[j];
2033		spin_lock_init(&zone->lock);
2034		spin_lock_init(&zone->lru_lock);
2035		zone_seqlock_init(zone);
2036		zone->zone_pgdat = pgdat;
2037		zone->free_pages = 0;
2038
2039		zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
2040
2041		zone_pcp_init(zone);
2042		INIT_LIST_HEAD(&zone->active_list);
2043		INIT_LIST_HEAD(&zone->inactive_list);
2044		zone->nr_scan_active = 0;
2045		zone->nr_scan_inactive = 0;
2046		zone->nr_active = 0;
2047		zone->nr_inactive = 0;
2048		zap_zone_vm_stats(zone);
2049		atomic_set(&zone->reclaim_in_progress, 0);
2050		if (!size)
2051			continue;
2052
2053		zonetable_add(zone, nid, j, zone_start_pfn, size);
2054		ret = init_currently_empty_zone(zone, zone_start_pfn, size);
2055		BUG_ON(ret);
2056		zone_start_pfn += size;
2057	}
2058}
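/*
 * Of the steps listed above, "mark all pages reserved" happens inside
 * memmap_init(), which initialises each struct page in the zone's
 * memmap, while zone_init_free_lists() leaves every buddy free list
 * empty until pages are released to the allocator later in boot.
 */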
2059
2060static void __init alloc_node_mem_map(struct pglist_data *pgdat)
2061{
2062	/* Skip empty nodes */
2063	if (!pgdat->node_spanned_pages)
2064		return;
2065
2066#ifdef CONFIG_FLAT_NODE_MEM_MAP
2067	/* ia64 gets its own node_mem_map, before this, without bootmem */
2068	if (!pgdat->node_mem_map) {
2069		unsigned long size, start, end;
2070		struct page *map;
2071
2072		/*
2073		 * The zone's endpoints aren't required to be MAX_ORDER
2074		 * aligned, but the node_mem_map endpoints must be MAX_ORDER
2075		 * aligned for the buddy allocator to function correctly.
2076		 */
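		/*
		 * For illustration (values hypothetical, assuming 4K pages and
		 * MAX_ORDER_NR_PAGES = 1024): a node spanning pfns 0x1100 -
		 * 0x4f00 gets start = 0x1000 and end = 0x5000, so the map
		 * covers 0x4000 struct pages even though the node spans fewer.
		 */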
2077		start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
2078		end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
2079		end = ALIGN(end, MAX_ORDER_NR_PAGES);
2080		size =  (end - start) * sizeof(struct page);
2081		map = alloc_remap(pgdat->node_id, size);
2082		if (!map)
2083			map = alloc_bootmem_node(pgdat, size);
2084		pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
2085	}
2086#ifdef CONFIG_FLATMEM
2087	/*
2088	 * With FLATMEM (no DISCONTIG), the global mem_map is just node 0's
2089	 */
2090	if (pgdat == NODE_DATA(0))
2091		mem_map = NODE_DATA(0)->node_mem_map;
2092#endif
2093#endif /* CONFIG_FLAT_NODE_MEM_MAP */
2094}
2095
2096void __meminit free_area_init_node(int nid, struct pglist_data *pgdat,
2097		unsigned long *zones_size, unsigned long node_start_pfn,
2098		unsigned long *zholes_size)
2099{
2100	pgdat->node_id = nid;
2101	pgdat->node_start_pfn = node_start_pfn;
2102	calculate_zone_totalpages(pgdat, zones_size, zholes_size);
2103
2104	alloc_node_mem_map(pgdat);
2105
2106	free_area_init_core(pgdat, zones_size, zholes_size);
2107}
2108
2109#ifndef CONFIG_NEED_MULTIPLE_NODES
2110static bootmem_data_t contig_bootmem_data;
2111struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
2112
2113EXPORT_SYMBOL(contig_page_data);
2114#endif
2115
2116void __init free_area_init(unsigned long *zones_size)
2117{
2118	free_area_init_node(0, NODE_DATA(0), zones_size,
2119			__pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
2120}
2121
2122#ifdef CONFIG_HOTPLUG_CPU
2123static int page_alloc_cpu_notify(struct notifier_block *self,
2124				 unsigned long action, void *hcpu)
2125{
2126	int cpu = (unsigned long)hcpu;
2127	long *count;
2128	unsigned long *src, *dest;
2129
2130	if (action == CPU_DEAD) {
2131		int i;
2132
2133		/* Drain local pagecache count. */
2134		count = &per_cpu(nr_pagecache_local, cpu);
2135		atomic_add(*count, &nr_pagecache);
2136		*count = 0;
2137		local_irq_disable();
2138		__drain_pages(cpu);
2139
2140		/* Add dead cpu's page_states to our own. */
2141		dest = (unsigned long *)&__get_cpu_var(page_states);
2142		src = (unsigned long *)&per_cpu(page_states, cpu);
2143
2144		for (i = 0; i < sizeof(struct page_state)/sizeof(unsigned long);
2145				i++) {
2146			dest[i] += src[i];
2147			src[i] = 0;
2148		}
2149
2150		local_irq_enable();
2151		refresh_cpu_vm_stats(cpu);
2152	}
2153	return NOTIFY_OK;
2154}
2155#endif /* CONFIG_HOTPLUG_CPU */
2156
2157void __init page_alloc_init(void)
2158{
2159	hotcpu_notifier(page_alloc_cpu_notify, 0);
2160}
2161
2162/*
2163 * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
2164 *	or min_free_kbytes changes.
2165 */
2166static void calculate_totalreserve_pages(void)
2167{
2168	struct pglist_data *pgdat;
2169	unsigned long reserve_pages = 0;
2170	int i, j;
2171
2172	for_each_online_pgdat(pgdat) {
2173		for (i = 0; i < MAX_NR_ZONES; i++) {
2174			struct zone *zone = pgdat->node_zones + i;
2175			unsigned long max = 0;
2176
2177			/* Find valid and maximum lowmem_reserve in the zone */
2178			for (j = i; j < MAX_NR_ZONES; j++) {
2179				if (zone->lowmem_reserve[j] > max)
2180					max = zone->lowmem_reserve[j];
2181			}
2182
2183			/* we treat pages_high as reserved pages. */
2184			max += zone->pages_high;
2185
2186			if (max > zone->present_pages)
2187				max = zone->present_pages;
2188			reserve_pages += max;
2189		}
2190	}
2191	totalreserve_pages = reserve_pages;
2192}
2193
2194/*
2195 * setup_per_zone_lowmem_reserve - called whenever
2196 *	sysctl_lowmem_reserve_ratio changes.  Ensures that each zone
2197 *	has a correct pages reserved value, so an adequate number of
2198 *	pages are left in the zone after a successful __alloc_pages().
2199 */
2200static void setup_per_zone_lowmem_reserve(void)
2201{
2202	struct pglist_data *pgdat;
2203	int j, idx;
2204
2205	for_each_online_pgdat(pgdat) {
2206		for (j = 0; j < MAX_NR_ZONES; j++) {
2207			struct zone *zone = pgdat->node_zones + j;
2208			unsigned long present_pages = zone->present_pages;
2209
2210			zone->lowmem_reserve[j] = 0;
2211
2212			for (idx = j-1; idx >= 0; idx--) {
2213				struct zone *lower_zone;
2214
2215				if (sysctl_lowmem_reserve_ratio[idx] < 1)
2216					sysctl_lowmem_reserve_ratio[idx] = 1;
2217
2218				lower_zone = pgdat->node_zones + idx;
2219				lower_zone->lowmem_reserve[j] = present_pages /
2220					sysctl_lowmem_reserve_ratio[idx];
2221				present_pages += lower_zone->present_pages;
2222			}
2223		}
2224	}
2225
2226	/* update totalreserve_pages */
2227	calculate_totalreserve_pages();
2228}
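/*
 * Sketch of the inner loop above: for a given zone j, idx walks the
 * lower zones from j-1 down to 0.  Each lower zone is charged
 * present_pages / sysctl_lowmem_reserve_ratio[idx], where present_pages
 * has by then accumulated the sizes of every zone from idx+1 up to j,
 * so lower zones are charged against progressively larger page totals.
 */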
2229
2230/*
2231 * setup_per_zone_pages_min - called when min_free_kbytes changes.  Ensures
2232 *	that the pages_{min,low,high} values for each zone are set correctly
2233 *	with respect to min_free_kbytes.
2234 */
2235void setup_per_zone_pages_min(void)
2236{
2237	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
2238	unsigned long lowmem_pages = 0;
2239	struct zone *zone;
2240	unsigned long flags;
2241
2242	/* Calculate total number of !ZONE_HIGHMEM pages */
2243	for_each_zone(zone) {
2244		if (!is_highmem(zone))
2245			lowmem_pages += zone->present_pages;
2246	}
2247
2248	for_each_zone(zone) {
2249		u64 tmp;
2250
2251		spin_lock_irqsave(&zone->lru_lock, flags);
2252		tmp = (u64)pages_min * zone->present_pages;
2253		do_div(tmp, lowmem_pages);
2254		if (is_highmem(zone)) {
2255			/*
2256			 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
2257			 * need highmem pages, so cap pages_min to a small
2258			 * value here.
2259			 *
2260			 * The (pages_high-pages_low) and (pages_low-pages_min)
2261			 * deltas control asynchronous page reclaim, and so should
2262			 * not be capped for highmem.
2263			 */
2264			int min_pages;
2265
2266			min_pages = zone->present_pages / 1024;
2267			if (min_pages < SWAP_CLUSTER_MAX)
2268				min_pages = SWAP_CLUSTER_MAX;
2269			if (min_pages > 128)
2270				min_pages = 128;
2271			zone->pages_min = min_pages;
2272		} else {
2273			/*
2274			 * If it's a lowmem zone, reserve a number of pages
2275			 * proportionate to the zone's size.
2276			 */
2277			zone->pages_min = tmp;
2278		}
2279
2280		zone->pages_low   = zone->pages_min + (tmp >> 2);
2281		zone->pages_high  = zone->pages_min + (tmp >> 1);
2282		spin_unlock_irqrestore(&zone->lru_lock, flags);
2283	}
2284
2285	/* update totalreserve_pages */
2286	calculate_totalreserve_pages();
2287}
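/*
 * Worked example (assuming 4K pages and min_free_kbytes = 1024):
 * pages_min is 1024 >> 2 = 256 pages in total.  A lowmem zone holding
 * half of all lowmem gets tmp = 128, hence pages_min = 128,
 * pages_low = 128 + 32 = 160 and pages_high = 128 + 64 = 192.  A
 * highmem zone instead has pages_min clamped between SWAP_CLUSTER_MAX
 * and 128, with pages_low/pages_high adding the uncapped tmp/4 and
 * tmp/2 on top of that clamped value.
 */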
2288
2289/*
2290 * Initialise min_free_kbytes.
2291 *
2292 * For small machines we want it small (128k min).  For large machines
2293 * we want it large (64MB max).  But it is not linear, because network
2294 * bandwidth does not increase linearly with machine size.  We use
2295 *
2296 * 	min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
2297 *	min_free_kbytes = sqrt(lowmem_kbytes * 16)
2298 *
2299 * which yields
2300 *
2301 * 16MB:	512k
2302 * 32MB:	724k
2303 * 64MB:	1024k
2304 * 128MB:	1448k
2305 * 256MB:	2048k
2306 * 512MB:	2896k
2307 * 1024MB:	4096k
2308 * 2048MB:	5792k
2309 * 4096MB:	8192k
2310 * 8192MB:	11584k
2311 * 16384MB:	16384k
2312 */
2313static int __init init_per_zone_pages_min(void)
2314{
2315	unsigned long lowmem_kbytes;
2316
2317	lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
2318
2319	min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
2320	if (min_free_kbytes < 128)
2321		min_free_kbytes = 128;
2322	if (min_free_kbytes > 65536)
2323		min_free_kbytes = 65536;
2324	setup_per_zone_pages_min();
2325	setup_per_zone_lowmem_reserve();
2326	return 0;
2327}
2328module_init(init_per_zone_pages_min)
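/*
 * Sanity check of one row of the table above: with 1024MB of lowmem,
 * lowmem_kbytes = 1048576 and int_sqrt(1048576 * 16) = int_sqrt(16777216)
 * = 4096, i.e. min_free_kbytes = 4096k as listed.
 */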
2329
2330/*
2331 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
2332 *	that we can call two helper functions whenever min_free_kbytes
2333 *	changes.
2334 */
2335int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
2336	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
2337{
2338	proc_dointvec(table, write, file, buffer, length, ppos);
2339	setup_per_zone_pages_min();
2340	return 0;
2341}
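/*
 * This handler sits behind writes to /proc/sys/vm/min_free_kbytes
 * (wired up in the vm sysctl table in kernel/sysctl.c).
 */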
2342
2343/*
2344 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
2345 *	proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
2346 *	whenever sysctl_lowmem_reserve_ratio changes.
2347 *
2348 * The reserve ratio has no direct relation to the pages_min
2349 * watermarks; the lowmem reserve ratio only makes sense as a
2350 * function of the boot-time zone sizes.
2351 */
2352int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
2353	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
2354{
2355	proc_dointvec_minmax(table, write, file, buffer, length, ppos);
2356	setup_per_zone_lowmem_reserve();
2357	return 0;
2358}
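/*
 * Likewise exposed as /proc/sys/vm/lowmem_reserve_ratio; writing e.g.
 * "256 256 32" restores the defaults declared near the top of this file.
 */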
2359
2360/*
2361 * percpu_pagelist_fraction - changes the pcp->high for each zone on each
2362 * cpu.  It is the fraction of a zone's total pages that a hot per-cpu
2363 * pagelist may hold before it is flushed back to the buddy allocator.
2364 */
2365
2366int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
2367	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
2368{
2369	struct zone *zone;
2370	unsigned int cpu;
2371	int ret;
2372
2373	ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
2374	if (!write || (ret == -EINVAL))
2375		return ret;
2376	for_each_zone(zone) {
2377		for_each_online_cpu(cpu) {
2378			unsigned long  high;
2379			high = zone->present_pages / percpu_pagelist_fraction;
2380			setup_pagelist_highmark(zone_pcp(zone, cpu), high);
2381		}
2382	}
2383	return 0;
2384}
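/*
 * Usage sketch: writing 8 to /proc/sys/vm/percpu_pagelist_fraction caps
 * each hot per-cpu list at present_pages/8 for every zone and online
 * cpu; cpus brought up later pick the value up in process_zones().
 */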
2385
2386__initdata int hashdist = HASHDIST_DEFAULT;
2387
2388#ifdef CONFIG_NUMA
2389static int __init set_hashdist(char *str)
2390{
2391	if (!str)
2392		return 0;
2393	hashdist = simple_strtoul(str, &str, 0);
2394	return 1;
2395}
2396__setup("hashdist=", set_hashdist);
2397#endif
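/*
 * When hashdist is non-zero, alloc_large_system_hash() below prefers
 * __vmalloc() over one physically contiguous allocation, so the hash
 * pages can be spread across NUMA nodes.
 */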
2398
2399/*
2400 * allocate a large system hash table from bootmem
2401 * - it is assumed that the hash table must contain an exact power-of-2
2402 *   quantity of entries
2403 * - limit is the number of hash buckets, not the total allocation size
2404 */
2405void *__init alloc_large_system_hash(const char *tablename,
2406				     unsigned long bucketsize,
2407				     unsigned long numentries,
2408				     int scale,
2409				     int flags,
2410				     unsigned int *_hash_shift,
2411				     unsigned int *_hash_mask,
2412				     unsigned long limit)
2413{
2414	unsigned long long max = limit;
2415	unsigned long log2qty, size;
2416	void *table = NULL;
2417
2418	/* allow the kernel cmdline to have a say */
2419	if (!numentries) {
2420		/* round applicable memory size up to nearest megabyte */
2421		numentries = (flags & HASH_HIGHMEM) ? nr_all_pages : nr_kernel_pages;
2422		numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
2423		numentries >>= 20 - PAGE_SHIFT;
2424		numentries <<= 20 - PAGE_SHIFT;
2425
2426		/* limit to 1 bucket per 2^scale bytes of low memory */
2427		if (scale > PAGE_SHIFT)
2428			numentries >>= (scale - PAGE_SHIFT);
2429		else
2430			numentries <<= (PAGE_SHIFT - scale);
2431	}
2432	numentries = roundup_pow_of_two(numentries);
2433
2434	/* limit allocation size to 1/16 total memory by default */
2435	if (max == 0) {
2436		max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
2437		do_div(max, bucketsize);
2438	}
2439
2440	if (numentries > max)
2441		numentries = max;
2442
2443	log2qty = long_log2(numentries);
2444
2445	do {
2446		size = bucketsize << log2qty;
2447		if (flags & HASH_EARLY)
2448			table = alloc_bootmem(size);
2449		else if (hashdist)
2450			table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
2451		else {
2452			unsigned long order;
2453			for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++)
2454				;
2455			table = (void*) __get_free_pages(GFP_ATOMIC, order);
2456		}
2457	} while (!table && size > PAGE_SIZE && --log2qty);
2458
2459	if (!table)
2460		panic("Failed to allocate %s hash table\n", tablename);
2461
2462	printk("%s hash table entries: %d (order: %d, %lu bytes)\n",
2463	       tablename,
2464	       (1U << log2qty),
2465	       long_log2(size) - PAGE_SHIFT,
2466	       size);
2467
2468	if (_hash_shift)
2469		*_hash_shift = log2qty;
2470	if (_hash_mask)
2471		*_hash_mask = (1 << log2qty) - 1;
2472
2473	return table;
2474}
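/*
 * Sizing example (figures for illustration only): with nr_all_pages =
 * 2^20 (4GB of 4K pages), HASH_HIGHMEM set, scale = 14 and a 16-byte
 * bucket, numentries becomes 2^20 >> (14 - 12) = 2^18 buckets, well
 * under the default max of 256MB / 16 = 2^24, so the table is
 * 16 << 18 = 4MB, i.e. an order-10 allocation when neither the bootmem
 * nor the vmalloc path is taken.
 */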
2475
2476#ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE
2477struct page *pfn_to_page(unsigned long pfn)
2478{
2479	return __pfn_to_page(pfn);
2480}
2481unsigned long page_to_pfn(struct page *page)
2482{
2483	return __page_to_pfn(page);
2484}
2485EXPORT_SYMBOL(pfn_to_page);
2486EXPORT_SYMBOL(page_to_pfn);
2487#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */
2488