vmstat.c revision 8f32f7e5ac2ed11b0659b6b55af926f3d58ffd9d
/*
 *  linux/mm/vmstat.c
 *
 *  Manages VM statistics
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  zoned VM statistics
 *  Copyright (C) 2006 Silicon Graphics, Inc.,
 *		Christoph Lameter <christoph@lameter.com>
 */
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/module.h>
#include <linux/cpu.h>
#include <linux/vmstat.h>
#include <linux/sched.h>

#ifdef CONFIG_VM_EVENT_COUNTERS
DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
EXPORT_PER_CPU_SYMBOL(vm_event_states);

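/*
 * Sum the per cpu event counters of all cpus in @cpumask into @ret,
 * which must have room for NR_VM_EVENT_ITEMS entries. The caller is
 * responsible for keeping the cpus in @cpumask from going away (see
 * all_vm_events() below).
 */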
static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask)
{
	int cpu;
	int i;

	memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));

	for_each_cpu_mask_nr(cpu, *cpumask) {
		struct vm_event_state *this = &per_cpu(vm_event_states, cpu);

		for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
			ret[i] += this->event[i];
	}
}

/*
 * Accumulate the vm event counters across all CPUs.
 * The result is unavoidably approximate - it can change
 * during and after execution of this function.
 */
void all_vm_events(unsigned long *ret)
{
	get_online_cpus();
	sum_vm_events(ret, &cpu_online_map);
	put_online_cpus();
}
EXPORT_SYMBOL_GPL(all_vm_events);

#ifdef CONFIG_HOTPLUG
/*
 * Fold the foreign cpu events into our own.
 *
 * This is adding to the events on one processor
 * but keeps the global counts constant.
 */
void vm_events_fold_cpu(int cpu)
{
	struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
	int i;

	for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
		count_vm_events(i, fold_state->event[i]);
		fold_state->event[i] = 0;
	}
}
#endif /* CONFIG_HOTPLUG */

#endif /* CONFIG_VM_EVENT_COUNTERS */

/*
 * Manage combined zone based / global counters
 *
 * vm_stat contains the global counters
 */
atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
EXPORT_SYMBOL(vm_stat);

#ifdef CONFIG_SMP

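/*
 * Calculate how far a per cpu counter may drift from the zone counter
 * before the differential is folded back (the pageset stat_threshold).
 */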
static int calculate_threshold(struct zone *zone)
{
	int threshold;
	int mem;	/* memory in 128 MB units */

	/*
	 * The threshold scales with the number of processors and the amount
	 * of memory per zone. More memory means that we can defer updates for
	 * longer, more processors could lead to more contention.
	 * fls() is used to have a cheap way of logarithmic scaling.
	 *
	 * Some sample thresholds:
	 *
	 * Threshold	Processors	(fls)	Zonesize	1+fls(mem)
	 * ------------------------------------------------------------------
	 * 8		1		1	0.9-1 GB	4
	 * 16		2		2	0.9-1 GB	4
	 * 20		2		2	1-2 GB		5
	 * 24		2		2	2-4 GB		6
	 * 28		2		2	4-8 GB		7
	 * 32		2		2	8-16 GB		8
	 * 4		2		2	<128M		1
	 * 30		4		3	2-4 GB		5
	 * 48		4		3	8-16 GB		8
	 * 32		8		4	1-2 GB		4
	 * 32		8		4	0.9-1GB		4
	 * 10		16		5	<128M		1
	 * 40		16		5	900M		4
	 * 70		64		7	2-4 GB		5
	 * 84		64		7	4-8 GB		6
	 * 108		512		9	4-8 GB		6
	 * 125		1024		10	8-16 GB		8
	 * 125		1024		10	16-32 GB	9
	 */

	mem = zone->present_pages >> (27 - PAGE_SHIFT);

	threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));

	/*
	 * Maximum threshold is 125
	 */
	threshold = min(125, threshold);

	return threshold;
}

/*
 * Refresh the thresholds for each zone.
 */
static void refresh_zone_stat_thresholds(void)
{
	struct zone *zone;
	int cpu;
	int threshold;

	for_each_zone(zone) {

		if (!zone->present_pages)
			continue;

		threshold = calculate_threshold(zone);

		for_each_online_cpu(cpu)
			zone_pcp(zone, cpu)->stat_threshold = threshold;
	}
}

/*
 * For use when we know that interrupts are disabled.
 */
void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
				int delta)
{
	struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
	s8 *p = pcp->vm_stat_diff + item;
	long x;

	x = delta + *p;

	if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) {
		zone_page_state_add(x, zone, item);
		x = 0;
	}
	*p = x;
}
EXPORT_SYMBOL(__mod_zone_page_state);

/*
 * For an unknown interrupt state
 */
void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
					int delta)
{
	unsigned long flags;

	local_irq_save(flags);
	__mod_zone_page_state(zone, item, delta);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(mod_zone_page_state);
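
/*
 * Illustrative use only (not taken from a real caller): code that has
 * already disabled interrupts and wants to account for a batch of
 * pages in one step could do
 *
 *	__mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages);
 *
 * while mod_zone_page_state() would be used when the interrupt state
 * is unknown.
 */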

/*
 * Optimized increment and decrement functions.
 *
 * These are only for a single page and therefore can take a struct page *
 * argument instead of struct zone *. This allows the inclusion of the code
 * generated for page_zone(page) into the optimized functions.
 *
 * No overflow check is necessary and therefore the differential can be
 * incremented or decremented in place which may allow the compilers to
 * generate better code.
 * The increment or decrement is known and therefore one boundary check can
 * be omitted.
 *
 * NOTE: These functions are very performance sensitive. Change only
 * with care.
 *
 * Some processors have inc/dec instructions that are atomic vs an interrupt.
 * However, the code must first determine the differential location in a zone
 * based on the processor number and then inc/dec the counter. There is no
 * guarantee without disabling preemption that the processor will not change
 * in between and therefore the atomicity vs. interrupt cannot be exploited
 * in a useful way here.
 */
void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
	struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
	s8 *p = pcp->vm_stat_diff + item;

	(*p)++;

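	/*
	 * Once the local differential exceeds the threshold, fold it into
	 * the zone counter and leave the per cpu counter at -overstep so
	 * that roughly another threshold/2 increments can be absorbed
	 * before the zone counter needs to be touched again.
	 */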
	if (unlikely(*p > pcp->stat_threshold)) {
		int overstep = pcp->stat_threshold / 2;

		zone_page_state_add(*p + overstep, zone, item);
		*p = -overstep;
	}
}

void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
	__inc_zone_state(page_zone(page), item);
}
EXPORT_SYMBOL(__inc_zone_page_state);

void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
{
	struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
	s8 *p = pcp->vm_stat_diff + item;

	(*p)--;

	if (unlikely(*p < -pcp->stat_threshold)) {
		int overstep = pcp->stat_threshold / 2;

		zone_page_state_add(*p - overstep, zone, item);
		*p = overstep;
	}
}

void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
	__dec_zone_state(page_zone(page), item);
}
EXPORT_SYMBOL(__dec_zone_page_state);

void inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
	unsigned long flags;

	local_irq_save(flags);
	__inc_zone_state(zone, item);
	local_irq_restore(flags);
}

void inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
	unsigned long flags;
	struct zone *zone;

	zone = page_zone(page);
	local_irq_save(flags);
	__inc_zone_state(zone, item);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(inc_zone_page_state);

void dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
	unsigned long flags;

	local_irq_save(flags);
	__dec_zone_page_state(page, item);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(dec_zone_page_state);

/*
 * Update the zone counters for one cpu.
 *
 * The cpu specified must be either the current cpu or a processor that
 * is not online. If it is the current cpu then the execution thread must
 * be pinned to the current cpu.
 *
 * Note that refresh_cpu_vm_stats strives to only access
 * node local memory. The per cpu pagesets on remote zones are placed
 * in the memory local to the processor using that pageset. So the
 * loop over all zones will access a series of cachelines local to
 * the processor.
 *
 * The call to zone_page_state_add updates the cachelines with the
 * statistics in the remote zone struct as well as the global cachelines
 * with the global counters. These could cause remote node cache line
 * bouncing and will have to be only done when necessary.
 */
void refresh_cpu_vm_stats(int cpu)
{
	struct zone *zone;
	int i;
	int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };

	for_each_zone(zone) {
		struct per_cpu_pageset *p;

		if (!populated_zone(zone))
			continue;

		p = zone_pcp(zone, cpu);

		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
			if (p->vm_stat_diff[i]) {
				unsigned long flags;
				int v;

				local_irq_save(flags);
				v = p->vm_stat_diff[i];
				p->vm_stat_diff[i] = 0;
				local_irq_restore(flags);
				atomic_long_add(v, &zone->vm_stat[i]);
				global_diff[i] += v;
#ifdef CONFIG_NUMA
				/* 3 seconds idle till flush */
				p->expire = 3;
#endif
			}
		cond_resched();
#ifdef CONFIG_NUMA
		/*
		 * Deal with draining the remote pageset of this
		 * processor.
		 *
		 * Check if there are pages remaining in this pageset;
		 * if not then there is nothing to expire.
		 */
		if (!p->expire || !p->pcp.count)
			continue;

		/*
		 * We never drain zones local to this processor.
		 */
		if (zone_to_nid(zone) == numa_node_id()) {
			p->expire = 0;
			continue;
		}

		p->expire--;
		if (p->expire)
			continue;

		if (p->pcp.count)
			drain_zone_pages(zone, &p->pcp);
#endif
	}

	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
		if (global_diff[i])
			atomic_long_add(global_diff[i], &vm_stat[i]);
}

#endif

#ifdef CONFIG_NUMA
/*
 * preferred_zone = the zone that was preferred for this allocation
 * z		  = the zone from which the allocation actually occurred
 *
 * Must be called with interrupts disabled.
 */
void zone_statistics(struct zone *preferred_zone, struct zone *z)
{
	if (z->zone_pgdat == preferred_zone->zone_pgdat) {
		__inc_zone_state(z, NUMA_HIT);
	} else {
		__inc_zone_state(z, NUMA_MISS);
		__inc_zone_state(preferred_zone, NUMA_FOREIGN);
	}
	if (z->node == numa_node_id())
		__inc_zone_state(z, NUMA_LOCAL);
	else
		__inc_zone_state(z, NUMA_OTHER);
}
#endif

#ifdef CONFIG_PROC_FS
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static char * const migratetype_names[MIGRATE_TYPES] = {
	"Unmovable",
	"Reclaimable",
	"Movable",
	"Reserve",
	"Isolate",
};

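/*
 * seq_file iterators shared by the /proc/buddyinfo, /proc/pagetypeinfo
 * and /proc/zoneinfo interfaces: frag_start() positions the iterator at
 * the online node selected by *pos, frag_next() advances to the next
 * online node and frag_stop() has nothing to clean up.
 */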
static void *frag_start(struct seq_file *m, loff_t *pos)
{
	pg_data_t *pgdat;
	loff_t node = *pos;
	for (pgdat = first_online_pgdat();
	     pgdat && node;
	     pgdat = next_online_pgdat(pgdat))
		--node;

	return pgdat;
}

static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
{
	pg_data_t *pgdat = (pg_data_t *)arg;

	(*pos)++;
	return next_online_pgdat(pgdat);
}

static void frag_stop(struct seq_file *m, void *arg)
{
}

/* Walk all the zones in a node and print using a callback */
static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
		void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
{
	struct zone *zone;
	struct zone *node_zones = pgdat->node_zones;
	unsigned long flags;

	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
		if (!populated_zone(zone))
			continue;

		spin_lock_irqsave(&zone->lock, flags);
		print(m, pgdat, zone);
		spin_unlock_irqrestore(&zone->lock, flags);
	}
}

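/*
 * Print one /proc/buddyinfo line for a zone: the number of free blocks
 * of each order in that zone.
 */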
static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
						struct zone *zone)
{
	int order;

	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
	for (order = 0; order < MAX_ORDER; ++order)
		seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
	seq_putc(m, '\n');
}

/*
 * This walks the free areas for each zone.
 */
static int frag_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = (pg_data_t *)arg;
	walk_zones_in_node(m, pgdat, frag_show_print);
	return 0;
}

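/*
 * Walk the free lists of one zone and print, for every migratetype,
 * the number of free blocks at each order.
 */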
static void pagetypeinfo_showfree_print(struct seq_file *m,
					pg_data_t *pgdat, struct zone *zone)
{
	int order, mtype;

	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) {
		seq_printf(m, "Node %4d, zone %8s, type %12s ",
					pgdat->node_id,
					zone->name,
					migratetype_names[mtype]);
		for (order = 0; order < MAX_ORDER; ++order) {
			unsigned long freecount = 0;
			struct free_area *area;
			struct list_head *curr;

			area = &(zone->free_area[order]);

			list_for_each(curr, &area->free_list[mtype])
				freecount++;
			seq_printf(m, "%6lu ", freecount);
		}
		seq_putc(m, '\n');
	}
}

/* Print out the free pages at each order for each migratetype */
static int pagetypeinfo_showfree(struct seq_file *m, void *arg)
{
	int order;
	pg_data_t *pgdat = (pg_data_t *)arg;

	/* Print header */
	seq_printf(m, "%-43s ", "Free pages count per migrate type at order");
	for (order = 0; order < MAX_ORDER; ++order)
		seq_printf(m, "%6d ", order);
	seq_putc(m, '\n');

	walk_zones_in_node(m, pgdat, pagetypeinfo_showfree_print);

	return 0;
}

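/*
 * Count the pageblocks of each migratetype in a zone by stepping
 * through it in pageblock_nr_pages increments and reading each
 * block's migratetype.
 */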
static void pagetypeinfo_showblockcount_print(struct seq_file *m,
					pg_data_t *pgdat, struct zone *zone)
{
	int mtype;
	unsigned long pfn;
	unsigned long start_pfn = zone->zone_start_pfn;
	unsigned long end_pfn = start_pfn + zone->spanned_pages;
	unsigned long count[MIGRATE_TYPES] = { 0, };

	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
		struct page *page;

		if (!pfn_valid(pfn))
			continue;

		page = pfn_to_page(pfn);
#ifdef CONFIG_ARCH_FLATMEM_HAS_HOLES
		/*
		 * Ordinarily, memory holes in flatmem still have a valid
		 * memmap for the PFN range. However, an architecture for
		 * embedded systems (e.g. ARM) can free up the memmap backing
		 * holes to save memory on the assumption the memmap is
		 * never used. The page_zone linkages are then broken even
		 * though pfn_valid() returns true. Skip the page if the
		 * linkages are broken. Even if this test passed, the impact
		 * is that the counters for the movable type are off but
		 * fragmentation monitoring is likely meaningless on small
		 * systems.
		 */
		if (page_zone(page) != zone)
			continue;
#endif
		mtype = get_pageblock_migratetype(page);

		if (mtype < MIGRATE_TYPES)
			count[mtype]++;
	}

	/* Print counts */
	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
		seq_printf(m, "%12lu ", count[mtype]);
	seq_putc(m, '\n');
}

/* Print out the number of pageblocks for each migratetype */
static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
{
	int mtype;
	pg_data_t *pgdat = (pg_data_t *)arg;

	seq_printf(m, "\n%-23s", "Number of blocks type ");
	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
		seq_printf(m, "%12s ", migratetype_names[mtype]);
	seq_putc(m, '\n');
	walk_zones_in_node(m, pgdat, pagetypeinfo_showblockcount_print);

	return 0;
}

/*
 * This prints out statistics in relation to grouping pages by mobility.
 * It is expensive to collect so do not constantly read the file.
 */
static int pagetypeinfo_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = (pg_data_t *)arg;

	/* check memoryless node */
	if (!node_state(pgdat->node_id, N_HIGH_MEMORY))
		return 0;

	seq_printf(m, "Page block order: %d\n", pageblock_order);
	seq_printf(m, "Pages per block:  %lu\n", pageblock_nr_pages);
	seq_putc(m, '\n');
	pagetypeinfo_showfree(m, pgdat);
	pagetypeinfo_showblockcount(m, pgdat);

	return 0;
}

static const struct seq_operations fragmentation_op = {
	.start	= frag_start,
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= frag_show,
};

static int fragmentation_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &fragmentation_op);
}

static const struct file_operations fragmentation_file_operations = {
	.open		= fragmentation_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};

const struct seq_operations pagetypeinfo_op = {
	.start	= frag_start,
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= pagetypeinfo_show,
};

#ifdef CONFIG_ZONE_DMA
#define TEXT_FOR_DMA(xx) xx "_dma",
#else
#define TEXT_FOR_DMA(xx)
#endif

#ifdef CONFIG_ZONE_DMA32
#define TEXT_FOR_DMA32(xx) xx "_dma32",
#else
#define TEXT_FOR_DMA32(xx)
#endif

#ifdef CONFIG_HIGHMEM
#define TEXT_FOR_HIGHMEM(xx) xx "_high",
#else
#define TEXT_FOR_HIGHMEM(xx)
#endif

#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
					TEXT_FOR_HIGHMEM(xx) xx "_movable",

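/*
 * Names for the counters exported via /proc/vmstat and /proc/zoneinfo.
 * The first NR_VM_ZONE_STAT_ITEMS entries must match the order of
 * enum zone_stat_item; the remainder must match enum vm_event_item.
 */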
static const char * const vmstat_text[] = {
	/* Zoned VM counters */
	"nr_free_pages",
	"nr_inactive_anon",
	"nr_active_anon",
	"nr_inactive_file",
	"nr_active_file",
#ifdef CONFIG_UNEVICTABLE_LRU
	"nr_unevictable",
	"nr_mlock",
#endif
	"nr_anon_pages",
	"nr_mapped",
	"nr_file_pages",
	"nr_dirty",
	"nr_writeback",
	"nr_slab_reclaimable",
	"nr_slab_unreclaimable",
	"nr_page_table_pages",
	"nr_unstable",
	"nr_bounce",
	"nr_vmscan_write",
	"nr_writeback_temp",

#ifdef CONFIG_NUMA
	"numa_hit",
	"numa_miss",
	"numa_foreign",
	"numa_interleave",
	"numa_local",
	"numa_other",
#endif

#ifdef CONFIG_VM_EVENT_COUNTERS
	"pgpgin",
	"pgpgout",
	"pswpin",
	"pswpout",

	TEXTS_FOR_ZONES("pgalloc")

	"pgfree",
	"pgactivate",
	"pgdeactivate",

	"pgfault",
	"pgmajfault",

	TEXTS_FOR_ZONES("pgrefill")
	TEXTS_FOR_ZONES("pgsteal")
	TEXTS_FOR_ZONES("pgscan_kswapd")
	TEXTS_FOR_ZONES("pgscan_direct")

	"pginodesteal",
	"slabs_scanned",
	"kswapd_steal",
	"kswapd_inodesteal",
	"pageoutrun",
	"allocstall",

	"pgrotated",
#ifdef CONFIG_HUGETLB_PAGE
	"htlb_buddy_alloc_success",
	"htlb_buddy_alloc_fail",
#endif
#ifdef CONFIG_UNEVICTABLE_LRU
	"unevictable_pgs_culled",
	"unevictable_pgs_scanned",
	"unevictable_pgs_rescued",
	"unevictable_pgs_mlocked",
	"unevictable_pgs_munlocked",
	"unevictable_pgs_cleared",
	"unevictable_pgs_stranded",
	"unevictable_pgs_mlockfreed",
#endif
#endif
};

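/*
 * Emit the /proc/zoneinfo section for one zone: watermarks, scan
 * counts, per zone counters, lowmem protection and the per cpu
 * pageset state.
 */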
static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
							struct zone *zone)
{
	int i;
	seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
	seq_printf(m,
		   "\n  pages free     %lu"
		   "\n        min      %lu"
		   "\n        low      %lu"
		   "\n        high     %lu"
		   "\n        scanned  %lu (aa: %lu ia: %lu af: %lu if: %lu)"
		   "\n        spanned  %lu"
		   "\n        present  %lu",
		   zone_page_state(zone, NR_FREE_PAGES),
		   zone->pages_min,
		   zone->pages_low,
		   zone->pages_high,
		   zone->pages_scanned,
		   zone->lru[LRU_ACTIVE_ANON].nr_scan,
		   zone->lru[LRU_INACTIVE_ANON].nr_scan,
		   zone->lru[LRU_ACTIVE_FILE].nr_scan,
		   zone->lru[LRU_INACTIVE_FILE].nr_scan,
		   zone->spanned_pages,
		   zone->present_pages);

	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
		seq_printf(m, "\n    %-12s %lu", vmstat_text[i],
				zone_page_state(zone, i));

	seq_printf(m,
		   "\n        protection: (%lu",
		   zone->lowmem_reserve[0]);
	for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
		seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
	seq_printf(m,
		   ")"
		   "\n  pagesets");
	for_each_online_cpu(i) {
		struct per_cpu_pageset *pageset;

		pageset = zone_pcp(zone, i);
		seq_printf(m,
			   "\n    cpu: %i"
			   "\n              count: %i"
			   "\n              high:  %i"
			   "\n              batch: %i",
			   i,
			   pageset->pcp.count,
			   pageset->pcp.high,
			   pageset->pcp.batch);
#ifdef CONFIG_SMP
		seq_printf(m, "\n  vm stats threshold: %d",
				pageset->stat_threshold);
#endif
	}
	seq_printf(m,
		   "\n  all_unreclaimable: %u"
		   "\n  prev_priority:     %i"
		   "\n  start_pfn:         %lu"
		   "\n  inactive_ratio:    %u",
		   zone_is_all_unreclaimable(zone),
		   zone->prev_priority,
		   zone->zone_start_pfn,
		   zone->inactive_ratio);
	seq_putc(m, '\n');
}

/*
 * Output information about zones in @pgdat.
 */
static int zoneinfo_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = (pg_data_t *)arg;
	walk_zones_in_node(m, pgdat, zoneinfo_show_print);
	return 0;
}

const struct seq_operations zoneinfo_op = {
	.start	= frag_start, /* iterate over all zones. The same as in
			       * fragmentation. */
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= zoneinfo_show,
};

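/*
 * /proc/vmstat is a seq_file over a snapshot that vmstat_start()
 * allocates: the NR_VM_ZONE_STAT_ITEMS global counters followed, when
 * CONFIG_VM_EVENT_COUNTERS is set, by the summed event counters.
 * vmstat_show() prints one "name value" pair per position using
 * vmstat_text[] and vmstat_stop() frees the snapshot.
 */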
static void *vmstat_start(struct seq_file *m, loff_t *pos)
{
	unsigned long *v;
#ifdef CONFIG_VM_EVENT_COUNTERS
	unsigned long *e;
#endif
	int i;

	if (*pos >= ARRAY_SIZE(vmstat_text))
		return NULL;

#ifdef CONFIG_VM_EVENT_COUNTERS
	v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long)
			+ sizeof(struct vm_event_state), GFP_KERNEL);
#else
	v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long),
			GFP_KERNEL);
#endif
	m->private = v;
	if (!v)
		return ERR_PTR(-ENOMEM);
	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
		v[i] = global_page_state(i);
#ifdef CONFIG_VM_EVENT_COUNTERS
	e = v + NR_VM_ZONE_STAT_ITEMS;
	all_vm_events(e);
	e[PGPGIN] /= 2;		/* sectors -> kbytes */
	e[PGPGOUT] /= 2;
#endif
	return v + *pos;
}

static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
{
	(*pos)++;
	if (*pos >= ARRAY_SIZE(vmstat_text))
		return NULL;
	return (unsigned long *)m->private + *pos;
}

static int vmstat_show(struct seq_file *m, void *arg)
{
	unsigned long *l = arg;
	unsigned long off = l - (unsigned long *)m->private;

	seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
	return 0;
}

static void vmstat_stop(struct seq_file *m, void *arg)
{
	kfree(m->private);
	m->private = NULL;
}

const struct seq_operations vmstat_op = {
	.start	= vmstat_start,
	.next	= vmstat_next,
	.stop	= vmstat_stop,
	.show	= vmstat_show,
};

#endif /* CONFIG_PROC_FS */

#ifdef CONFIG_SMP
static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
int sysctl_stat_interval __read_mostly = HZ;

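/*
 * Fold the per cpu differentials of this processor back into the zone
 * and global counters, then requeue the (deferrable) work so that the
 * update repeats every sysctl_stat_interval ticks.
 */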
static void vmstat_update(struct work_struct *w)
{
	refresh_cpu_vm_stats(smp_processor_id());
	schedule_delayed_work(&__get_cpu_var(vmstat_work),
		sysctl_stat_interval);
}

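/*
 * Kick off the periodic update for one cpu. The initial delay of
 * HZ + cpu staggers the timers so the cpus do not all fold their
 * differentials in the same tick.
 */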
static void __cpuinit start_cpu_timer(int cpu)
{
	struct delayed_work *vmstat_work = &per_cpu(vmstat_work, cpu);

	INIT_DELAYED_WORK_DEFERRABLE(vmstat_work, vmstat_update);
	schedule_delayed_work_on(cpu, vmstat_work, HZ + cpu);
}

/*
 * Use the cpu notifier to ensure that the thresholds are recalculated
 * when necessary.
 */
static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
		unsigned long action,
		void *hcpu)
{
	long cpu = (long)hcpu;

	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		start_cpu_timer(cpu);
		break;
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		cancel_rearming_delayed_work(&per_cpu(vmstat_work, cpu));
		per_cpu(vmstat_work, cpu).work.func = NULL;
		break;
	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
		start_cpu_timer(cpu);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		refresh_zone_stat_thresholds();
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block __cpuinitdata vmstat_notifier =
	{ &vmstat_cpuup_callback, NULL, 0 };
#endif

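/*
 * Boot time setup: compute the initial stat thresholds, register the
 * cpu hotplug notifier, start the per cpu update timers and create
 * /proc/buddyinfo (each step depending on the configuration above).
 */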
static int __init setup_vmstat(void)
{
#ifdef CONFIG_SMP
	int cpu;

	refresh_zone_stat_thresholds();
	register_cpu_notifier(&vmstat_notifier);

	for_each_online_cpu(cpu)
		start_cpu_timer(cpu);
#endif
#ifdef CONFIG_PROC_FS
	proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
#endif
	return 0;
}
module_init(setup_vmstat)
