vmstat.c revision 4f98a2fee8acdb4ac84545df98cccecfd130f8db
/*
 *  linux/mm/vmstat.c
 *
 *  Manages VM statistics
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  zoned VM statistics
 *  Copyright (C) 2006 Silicon Graphics, Inc.,
 *		Christoph Lameter <christoph@lameter.com>
 */

#include <linux/mm.h>
#include <linux/err.h>
#include <linux/module.h>
#include <linux/cpu.h>
#include <linux/vmstat.h>
#include <linux/sched.h>

#ifdef CONFIG_VM_EVENT_COUNTERS
DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
EXPORT_PER_CPU_SYMBOL(vm_event_states);

static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask)
{
	int cpu;
	int i;

	memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));

	for_each_cpu_mask_nr(cpu, *cpumask) {
		struct vm_event_state *this = &per_cpu(vm_event_states, cpu);

		for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
			ret[i] += this->event[i];
	}
}

/*
 * Accumulate the vm event counters across all CPUs.
 * The result is unavoidably approximate - it can change
 * during and after execution of this function.
 */
void all_vm_events(unsigned long *ret)
{
	get_online_cpus();
	sum_vm_events(ret, &cpu_online_map);
	put_online_cpus();
}
EXPORT_SYMBOL_GPL(all_vm_events);
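
/*
 * Example (illustrative sketch): a caller that wants a point-in-time
 * snapshot of the event counters might do
 *
 *	unsigned long events[NR_VM_EVENT_ITEMS];
 *
 *	all_vm_events(events);
 *	printk(KERN_DEBUG "page faults so far: %lu\n", events[PGFAULT]);
 *
 * As noted above, the snapshot is only approximate because other CPUs
 * keep updating their per-cpu counters while it is taken.
 */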

#ifdef CONFIG_HOTPLUG
/*
 * Fold the foreign cpu events into our own.
 *
 * This adds to the events on the current processor
 * but keeps the global counts constant.
 */
void vm_events_fold_cpu(int cpu)
{
	struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
	int i;

	for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
		count_vm_events(i, fold_state->event[i]);
		fold_state->event[i] = 0;
	}
}
#endif /* CONFIG_HOTPLUG */

#endif /* CONFIG_VM_EVENT_COUNTERS */

/*
 * Manage combined zone based / global counters
 *
 * vm_stat contains the global counters
 */
atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
EXPORT_SYMBOL(vm_stat);

#ifdef CONFIG_SMP

static int calculate_threshold(struct zone *zone)
{
	int threshold;
	int mem;	/* memory in 128 MB units */

	/*
	 * The threshold scales with the number of processors and the amount
	 * of memory per zone. More memory means that we can defer updates for
	 * longer; more processors could lead to more contention.
	 * fls() is used to have a cheap way of logarithmic scaling.
	 *
	 * Some sample thresholds:
	 *
	 * Threshold	Processors	(fls)	Zonesize	fls(mem+1)
	 * ------------------------------------------------------------------
	 * 8		1		1	0.9-1 GB	4
	 * 16		2		2	0.9-1 GB	4
	 * 20		2		2	1-2 GB		5
	 * 24		2		2	2-4 GB		6
	 * 28		2		2	4-8 GB		7
	 * 32		2		2	8-16 GB		8
	 * 4		2		2	<128M		1
	 * 30		4		3	2-4 GB		5
	 * 48		4		3	8-16 GB		8
	 * 32		8		4	1-2 GB		4
	 * 32		8		4	0.9-1GB		4
	 * 10		16		5	<128M		1
	 * 40		16		5	900M		4
	 * 70		64		7	2-4 GB		5
	 * 84		64		7	4-8 GB		6
	 * 108		512		9	4-8 GB		6
	 * 125		1024		10	8-16 GB		8
	 * 125		1024		10	16-32 GB	9
	 */

	mem = zone->present_pages >> (27 - PAGE_SHIFT);

	threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));

	/*
	 * Maximum threshold is 125
	 */
	threshold = min(125, threshold);

	return threshold;
}
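
/*
 * Worked example (illustrative): on a 2-CPU machine with a 4 GB zone and
 * 4 KB pages, mem = 4 GB >> 27 = 32 units of 128 MB, so
 *
 *	threshold = 2 * fls(2) * (1 + fls(32)) = 2 * 2 * 7 = 28
 *
 * which matches the "4-8 GB" row for two processors in the table above.
 */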

/*
 * Refresh the thresholds for each zone.
 */
static void refresh_zone_stat_thresholds(void)
{
	struct zone *zone;
	int cpu;
	int threshold;

	for_each_zone(zone) {

		if (!zone->present_pages)
			continue;

		threshold = calculate_threshold(zone);

		for_each_online_cpu(cpu)
			zone_pcp(zone, cpu)->stat_threshold = threshold;
	}
}

/*
 * For use when we know that interrupts are disabled.
 */
void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
				int delta)
{
	struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
	s8 *p = pcp->vm_stat_diff + item;
	long x;

	x = delta + *p;

	if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) {
		zone_page_state_add(x, zone, item);
		x = 0;
	}
	*p = x;
}
EXPORT_SYMBOL(__mod_zone_page_state);

/*
 * For an unknown interrupt state
 */
void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
					int delta)
{
	unsigned long flags;

	local_irq_save(flags);
	__mod_zone_page_state(zone, item, delta);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(mod_zone_page_state);
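
/*
 * Example (illustrative sketch): code that has already disabled interrupts
 * can use the __ variant directly, e.g. to account nr_pages new file pages
 * in a zone:
 *
 *	__mod_zone_page_state(page_zone(page), NR_FILE_PAGES, nr_pages);
 *
 * From a context where the interrupt state is unknown, use
 * mod_zone_page_state() instead, which brackets the update with
 * local_irq_save()/local_irq_restore(). Here "page" and "nr_pages" are
 * placeholders for whatever the caller is accounting.
 */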

/*
 * Optimized increment and decrement functions.
 *
 * These are only for a single page and therefore can take a struct page *
 * argument instead of struct zone *. This allows the inclusion of the code
 * generated for page_zone(page) into the optimized functions.
 *
 * No overflow check is necessary and therefore the differential can be
 * incremented or decremented in place, which may allow the compiler to
 * generate better code.
 * The increment or decrement is known and therefore one boundary check can
 * be omitted.
 *
 * NOTE: These functions are very performance sensitive. Change only
 * with care.
 *
 * Some processors have inc/dec instructions that are atomic vs an interrupt.
 * However, the code must first determine the differential location in a zone
 * based on the processor number and then inc/dec the counter. There is no
 * guarantee without disabling preemption that the processor will not change
 * in between and therefore the atomicity vs. interrupt cannot be exploited
 * in a useful way here.
 */
void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
	struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
	s8 *p = pcp->vm_stat_diff + item;

	(*p)++;

	if (unlikely(*p > pcp->stat_threshold)) {
		int overstep = pcp->stat_threshold / 2;

		zone_page_state_add(*p + overstep, zone, item);
		*p = -overstep;
	}
}

void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
	__inc_zone_state(page_zone(page), item);
}
EXPORT_SYMBOL(__inc_zone_page_state);

void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
{
	struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
	s8 *p = pcp->vm_stat_diff + item;

	(*p)--;

	if (unlikely(*p < -pcp->stat_threshold)) {
		int overstep = pcp->stat_threshold / 2;

		zone_page_state_add(*p - overstep, zone, item);
		*p = overstep;
	}
}

void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
	__dec_zone_state(page_zone(page), item);
}
EXPORT_SYMBOL(__dec_zone_page_state);
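
/*
 * Worked example (illustrative): with stat_threshold = 32, overstep = 16.
 * When the per-cpu differential climbs to 33, __inc_zone_state() folds
 * 33 + 16 = 49 into the zone (and global) counter and sets the differential
 * to -16. The sum of the two is still the true value of 33, but the next
 * fold is now roughly 1.5 thresholds of increments away instead of one,
 * which reduces the fold frequency for monotonically rising counters.
 */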

void inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
	unsigned long flags;

	local_irq_save(flags);
	__inc_zone_state(zone, item);
	local_irq_restore(flags);
}

void inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
	unsigned long flags;
	struct zone *zone;

	zone = page_zone(page);
	local_irq_save(flags);
	__inc_zone_state(zone, item);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(inc_zone_page_state);

void dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
	unsigned long flags;

	local_irq_save(flags);
	__dec_zone_page_state(page, item);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(dec_zone_page_state);

/*
 * Update the zone counters for one cpu.
 *
 * The cpu specified must be either the current cpu or a processor that
 * is not online. If it is the current cpu then the execution thread must
 * be pinned to the current cpu.
 *
 * Note that refresh_cpu_vm_stats strives to only access
 * node local memory. The per cpu pagesets on remote zones are placed
 * in the memory local to the processor using that pageset. So the
 * loop over all zones will access a series of cachelines local to
 * the processor.
 *
 * The call to zone_page_state_add updates the cachelines with the
 * statistics in the remote zone struct as well as the global cachelines
 * with the global counters. These could cause remote node cache line
 * bouncing and so should only be done when necessary.
 */
void refresh_cpu_vm_stats(int cpu)
{
	struct zone *zone;
	int i;
	int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };

	for_each_zone(zone) {
		struct per_cpu_pageset *p;

		if (!populated_zone(zone))
			continue;

		p = zone_pcp(zone, cpu);

		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
			if (p->vm_stat_diff[i]) {
				unsigned long flags;
				int v;

				local_irq_save(flags);
				v = p->vm_stat_diff[i];
				p->vm_stat_diff[i] = 0;
				local_irq_restore(flags);
				atomic_long_add(v, &zone->vm_stat[i]);
				global_diff[i] += v;
#ifdef CONFIG_NUMA
				/* 3 seconds idle till flush */
				p->expire = 3;
#endif
			}
		cond_resched();
#ifdef CONFIG_NUMA
		/*
		 * Deal with draining the remote pageset of this
		 * processor
		 *
		 * Check if there are pages remaining in this pageset;
		 * if not then there is nothing to expire.
		 */
		if (!p->expire || !p->pcp.count)
			continue;

		/*
		 * We never drain zones local to this processor.
		 */
		if (zone_to_nid(zone) == numa_node_id()) {
			p->expire = 0;
			continue;
		}

		p->expire--;
		if (p->expire)
			continue;

		if (p->pcp.count)
			drain_zone_pages(zone, &p->pcp);
#endif
	}

	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
		if (global_diff[i])
			atomic_long_add(global_diff[i], &vm_stat[i]);
}
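
/*
 * Note on the expire logic above: p->expire is re-armed to 3 whenever a
 * remote pageset shows counter activity and is counted down on every pass
 * in which that pageset still holds cached pages. Once it reaches zero,
 * drain_zone_pages() hands the pages back to the remote zone. With the
 * default one-second stat interval this gives roughly the "3 seconds idle
 * till flush" behaviour mentioned above.
 */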

#endif

#ifdef CONFIG_NUMA
/*
 * preferred_zone = the zone that the allocator would prefer to use
 * z              = the zone from which the allocation actually occurred.
 *
 * Must be called with interrupts disabled.
 */
void zone_statistics(struct zone *preferred_zone, struct zone *z)
{
	if (z->zone_pgdat == preferred_zone->zone_pgdat) {
		__inc_zone_state(z, NUMA_HIT);
	} else {
		__inc_zone_state(z, NUMA_MISS);
		__inc_zone_state(preferred_zone, NUMA_FOREIGN);
	}
	if (z->node == numa_node_id())
		__inc_zone_state(z, NUMA_LOCAL);
	else
		__inc_zone_state(z, NUMA_OTHER);
}
#endif
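
/*
 * Example (illustrative): a task running on node 0 prefers a node 0 zone
 * but the page is actually taken from a node 1 zone. The node 1 zone is
 * charged NUMA_MISS and NUMA_OTHER (it is not the local node), while the
 * preferred node 0 zone is charged NUMA_FOREIGN. Had the page come from
 * the preferred node instead, the allocating zone would have been credited
 * with NUMA_HIT and NUMA_LOCAL.
 */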

#ifdef CONFIG_PROC_FS

#include <linux/seq_file.h>

static char * const migratetype_names[MIGRATE_TYPES] = {
	"Unmovable",
	"Reclaimable",
	"Movable",
	"Reserve",
	"Isolate",
};

static void *frag_start(struct seq_file *m, loff_t *pos)
{
	pg_data_t *pgdat;
	loff_t node = *pos;
	for (pgdat = first_online_pgdat();
	     pgdat && node;
	     pgdat = next_online_pgdat(pgdat))
		--node;

	return pgdat;
}

static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
{
	pg_data_t *pgdat = (pg_data_t *)arg;

	(*pos)++;
	return next_online_pgdat(pgdat);
}

static void frag_stop(struct seq_file *m, void *arg)
{
}

/* Walk all the zones in a node and print using a callback */
static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
		void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
{
	struct zone *zone;
	struct zone *node_zones = pgdat->node_zones;
	unsigned long flags;

	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
		if (!populated_zone(zone))
			continue;

		spin_lock_irqsave(&zone->lock, flags);
		print(m, pgdat, zone);
		spin_unlock_irqrestore(&zone->lock, flags);
	}
}

static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
						struct zone *zone)
{
	int order;

	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
	for (order = 0; order < MAX_ORDER; ++order)
		seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
	seq_putc(m, '\n');
}

/*
 * This walks the free areas for each zone.
 */
static int frag_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = (pg_data_t *)arg;
	walk_zones_in_node(m, pgdat, frag_show_print);
	return 0;
}
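
/*
 * Sample output (illustrative, counts made up): via fragmentation_op this
 * backs /proc/buddyinfo, one line per populated zone giving the number of
 * free blocks at each order:
 *
 *	Node 0, zone   Normal     42     31     17      9      5      3      2      1      1      0      1
 */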

static void pagetypeinfo_showfree_print(struct seq_file *m,
					pg_data_t *pgdat, struct zone *zone)
{
	int order, mtype;

	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) {
		seq_printf(m, "Node %4d, zone %8s, type %12s ",
					pgdat->node_id,
					zone->name,
					migratetype_names[mtype]);
		for (order = 0; order < MAX_ORDER; ++order) {
			unsigned long freecount = 0;
			struct free_area *area;
			struct list_head *curr;

			area = &(zone->free_area[order]);

			list_for_each(curr, &area->free_list[mtype])
				freecount++;
			seq_printf(m, "%6lu ", freecount);
		}
		seq_putc(m, '\n');
	}
}

/* Print out the free pages at each order for each migratetype */
static int pagetypeinfo_showfree(struct seq_file *m, void *arg)
{
	int order;
	pg_data_t *pgdat = (pg_data_t *)arg;

	/* Print header */
	seq_printf(m, "%-43s ", "Free pages count per migrate type at order");
	for (order = 0; order < MAX_ORDER; ++order)
		seq_printf(m, "%6d ", order);
	seq_putc(m, '\n');

	walk_zones_in_node(m, pgdat, pagetypeinfo_showfree_print);

	return 0;
}

static void pagetypeinfo_showblockcount_print(struct seq_file *m,
					pg_data_t *pgdat, struct zone *zone)
{
	int mtype;
	unsigned long pfn;
	unsigned long start_pfn = zone->zone_start_pfn;
	unsigned long end_pfn = start_pfn + zone->spanned_pages;
	unsigned long count[MIGRATE_TYPES] = { 0, };

	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
		struct page *page;

		if (!pfn_valid(pfn))
			continue;

		page = pfn_to_page(pfn);
#ifdef CONFIG_ARCH_FLATMEM_HAS_HOLES
		/*
		 * Ordinarily, memory holes in flatmem still have a valid
		 * memmap for the PFN range. However, an architecture for
		 * embedded systems (e.g. ARM) can free up the memmap backing
		 * holes to save memory on the assumption the memmap is
		 * never used. The page_zone linkages are then broken even
		 * though pfn_valid() returns true. Skip the page if the
		 * linkages are broken. Even when this test passes, the
		 * impact is only that the counters for the movable type are
		 * off, and fragmentation monitoring is likely meaningless
		 * on small systems anyway.
		 */
		if (page_zone(page) != zone)
			continue;
#endif
		mtype = get_pageblock_migratetype(page);

		if (mtype < MIGRATE_TYPES)
			count[mtype]++;
	}

	/* Print counts */
	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
		seq_printf(m, "%12lu ", count[mtype]);
	seq_putc(m, '\n');
}

/* Print out the number of pageblocks for each migratetype */
static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
{
	int mtype;
	pg_data_t *pgdat = (pg_data_t *)arg;

	seq_printf(m, "\n%-23s", "Number of blocks type ");
	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
		seq_printf(m, "%12s ", migratetype_names[mtype]);
	seq_putc(m, '\n');
	walk_zones_in_node(m, pgdat, pagetypeinfo_showblockcount_print);

	return 0;
}

/*
 * This prints out statistics in relation to grouping pages by mobility.
 * It is expensive to collect so do not constantly read the file.
 */
static int pagetypeinfo_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = (pg_data_t *)arg;

	/* check memoryless node */
	if (!node_state(pgdat->node_id, N_HIGH_MEMORY))
		return 0;

	seq_printf(m, "Page block order: %d\n", pageblock_order);
	seq_printf(m, "Pages per block:  %lu\n", pageblock_nr_pages);
	seq_putc(m, '\n');
	pagetypeinfo_showfree(m, pgdat);
	pagetypeinfo_showblockcount(m, pgdat);

	return 0;
}

const struct seq_operations fragmentation_op = {
	.start	= frag_start,
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= frag_show,
};

const struct seq_operations pagetypeinfo_op = {
	.start	= frag_start,
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= pagetypeinfo_show,
};

#ifdef CONFIG_ZONE_DMA
#define TEXT_FOR_DMA(xx) xx "_dma",
#else
#define TEXT_FOR_DMA(xx)
#endif

#ifdef CONFIG_ZONE_DMA32
#define TEXT_FOR_DMA32(xx) xx "_dma32",
#else
#define TEXT_FOR_DMA32(xx)
#endif

#ifdef CONFIG_HIGHMEM
#define TEXT_FOR_HIGHMEM(xx) xx "_high",
#else
#define TEXT_FOR_HIGHMEM(xx)
#endif

#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
					TEXT_FOR_HIGHMEM(xx) xx "_movable",

static const char * const vmstat_text[] = {
	/* Zoned VM counters */
	"nr_free_pages",
	"nr_inactive_anon",
	"nr_active_anon",
	"nr_inactive_file",
	"nr_active_file",
	"nr_anon_pages",
	"nr_mapped",
	"nr_file_pages",
	"nr_dirty",
	"nr_writeback",
	"nr_slab_reclaimable",
	"nr_slab_unreclaimable",
	"nr_page_table_pages",
	"nr_unstable",
	"nr_bounce",
	"nr_vmscan_write",
	"nr_writeback_temp",

#ifdef CONFIG_NUMA
	"numa_hit",
	"numa_miss",
	"numa_foreign",
	"numa_interleave",
	"numa_local",
	"numa_other",
#endif

#ifdef CONFIG_VM_EVENT_COUNTERS
	"pgpgin",
	"pgpgout",
	"pswpin",
	"pswpout",

	TEXTS_FOR_ZONES("pgalloc")

	"pgfree",
	"pgactivate",
	"pgdeactivate",

	"pgfault",
	"pgmajfault",

	TEXTS_FOR_ZONES("pgrefill")
	TEXTS_FOR_ZONES("pgsteal")
	TEXTS_FOR_ZONES("pgscan_kswapd")
	TEXTS_FOR_ZONES("pgscan_direct")

	"pginodesteal",
	"slabs_scanned",
	"kswapd_steal",
	"kswapd_inodesteal",
	"pageoutrun",
	"allocstall",

	"pgrotated",
#ifdef CONFIG_HUGETLB_PAGE
	"htlb_buddy_alloc_success",
	"htlb_buddy_alloc_fail",
#endif
#endif
};

static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
							struct zone *zone)
{
	int i;
	seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
	seq_printf(m,
		   "\n  pages free     %lu"
		   "\n        min      %lu"
		   "\n        low      %lu"
		   "\n        high     %lu"
		   "\n        scanned  %lu (aa: %lu ia: %lu af: %lu if: %lu)"
		   "\n        spanned  %lu"
		   "\n        present  %lu",
		   zone_page_state(zone, NR_FREE_PAGES),
		   zone->pages_min,
		   zone->pages_low,
		   zone->pages_high,
		   zone->pages_scanned,
		   zone->lru[LRU_ACTIVE_ANON].nr_scan,
		   zone->lru[LRU_INACTIVE_ANON].nr_scan,
		   zone->lru[LRU_ACTIVE_FILE].nr_scan,
		   zone->lru[LRU_INACTIVE_FILE].nr_scan,
		   zone->spanned_pages,
		   zone->present_pages);

	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
		seq_printf(m, "\n    %-12s %lu", vmstat_text[i],
				zone_page_state(zone, i));

	seq_printf(m,
		   "\n        protection: (%lu",
		   zone->lowmem_reserve[0]);
	for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
		seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
	seq_printf(m,
		   ")"
		   "\n  pagesets");
	for_each_online_cpu(i) {
		struct per_cpu_pageset *pageset;

		pageset = zone_pcp(zone, i);
		seq_printf(m,
			   "\n    cpu: %i"
			   "\n              count: %i"
			   "\n              high:  %i"
			   "\n              batch: %i",
			   i,
			   pageset->pcp.count,
			   pageset->pcp.high,
			   pageset->pcp.batch);
#ifdef CONFIG_SMP
		seq_printf(m, "\n  vm stats threshold: %d",
				pageset->stat_threshold);
#endif
	}
	seq_printf(m,
		   "\n  all_unreclaimable: %u"
		   "\n  prev_priority:     %i"
		   "\n  start_pfn:         %lu",
		   zone_is_all_unreclaimable(zone),
		   zone->prev_priority,
		   zone->zone_start_pfn);
	seq_putc(m, '\n');
}

/*
 * Output information about zones in @pgdat.
 */
static int zoneinfo_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = (pg_data_t *)arg;
	walk_zones_in_node(m, pgdat, zoneinfo_show_print);
	return 0;
}

const struct seq_operations zoneinfo_op = {
	.start	= frag_start, /* iterate over all zones. The same as in
			       * fragmentation. */
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= zoneinfo_show,
};

static void *vmstat_start(struct seq_file *m, loff_t *pos)
{
	unsigned long *v;
#ifdef CONFIG_VM_EVENT_COUNTERS
	unsigned long *e;
#endif
	int i;

	if (*pos >= ARRAY_SIZE(vmstat_text))
		return NULL;

#ifdef CONFIG_VM_EVENT_COUNTERS
	v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long)
			+ sizeof(struct vm_event_state), GFP_KERNEL);
#else
	v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long),
			GFP_KERNEL);
#endif
	m->private = v;
	if (!v)
		return ERR_PTR(-ENOMEM);
	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
		v[i] = global_page_state(i);
#ifdef CONFIG_VM_EVENT_COUNTERS
	e = v + NR_VM_ZONE_STAT_ITEMS;
	all_vm_events(e);
	e[PGPGIN] /= 2;		/* sectors -> kbytes */
	e[PGPGOUT] /= 2;
#endif
	return v + *pos;
}

static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
{
	(*pos)++;
	if (*pos >= ARRAY_SIZE(vmstat_text))
		return NULL;
	return (unsigned long *)m->private + *pos;
}

static int vmstat_show(struct seq_file *m, void *arg)
{
	unsigned long *l = arg;
	unsigned long off = l - (unsigned long *)m->private;

	seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
	return 0;
}

static void vmstat_stop(struct seq_file *m, void *arg)
{
	kfree(m->private);
	m->private = NULL;
}

const struct seq_operations vmstat_op = {
	.start	= vmstat_start,
	.next	= vmstat_next,
	.stop	= vmstat_stop,
	.show	= vmstat_show,
};
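
/*
 * Sample output (illustrative, values made up): vmstat_op backs
 * /proc/vmstat, which prints one "name value" pair per line in the order
 * of vmstat_text[], e.g.
 *
 *	nr_free_pages 81925
 *	nr_inactive_anon 4126
 *	...
 *	pgfault 1063215
 */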

#endif /* CONFIG_PROC_FS */

#ifdef CONFIG_SMP
static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
int sysctl_stat_interval __read_mostly = HZ;

static void vmstat_update(struct work_struct *w)
{
	refresh_cpu_vm_stats(smp_processor_id());
	schedule_delayed_work(&__get_cpu_var(vmstat_work),
		sysctl_stat_interval);
}

static void __cpuinit start_cpu_timer(int cpu)
{
	struct delayed_work *vmstat_work = &per_cpu(vmstat_work, cpu);

	INIT_DELAYED_WORK_DEFERRABLE(vmstat_work, vmstat_update);
	schedule_delayed_work_on(cpu, vmstat_work, HZ + cpu);
}
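
/*
 * Note: the initial delay of HZ + cpu jiffies staggers the per-cpu vmstat
 * timers by one jiffy per CPU so they do not all fire in the same tick;
 * after that each work item reschedules itself every sysctl_stat_interval
 * (one second by default).
 */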

/*
 * Use the cpu notifier to ensure that the thresholds are recalculated
 * when necessary.
 */
static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
		unsigned long action,
		void *hcpu)
{
	long cpu = (long)hcpu;

	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		start_cpu_timer(cpu);
		break;
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		cancel_rearming_delayed_work(&per_cpu(vmstat_work, cpu));
		per_cpu(vmstat_work, cpu).work.func = NULL;
		break;
	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
		start_cpu_timer(cpu);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		refresh_zone_stat_thresholds();
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block __cpuinitdata vmstat_notifier =
	{ &vmstat_cpuup_callback, NULL, 0 };

static int __init setup_vmstat(void)
{
	int cpu;

	refresh_zone_stat_thresholds();
	register_cpu_notifier(&vmstat_notifier);

	for_each_online_cpu(cpu)
		start_cpu_timer(cpu);
	return 0;
}
module_init(setup_vmstat)
#endif