vmstat.c revision bab1846a0582f627f5ec22aa2dc5f4f3e82e8176
/*
 *  linux/mm/vmstat.c
 *
 *  Manages VM statistics
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  zoned VM statistics
 *  Copyright (C) 2006 Silicon Graphics, Inc.,
 *		Christoph Lameter <christoph@lameter.com>
 */

#include <linux/config.h>
#include <linux/mm.h>
#include <linux/module.h>

/*
 * Accumulate the page_state information across all CPUs.
 * The result is unavoidably approximate - it can change
 * during and after execution of this function.
 */
DEFINE_PER_CPU(struct page_state, page_states) = {0};

static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask)
{
	unsigned cpu;

	memset(ret, 0, nr * sizeof(unsigned long));
	cpus_and(*cpumask, *cpumask, cpu_online_map);

	for_each_cpu_mask(cpu, *cpumask) {
		unsigned long *in;
		unsigned long *out;
		unsigned off;
		unsigned next_cpu;

		in = (unsigned long *)&per_cpu(page_states, cpu);

		next_cpu = next_cpu(cpu, *cpumask);
		if (likely(next_cpu < NR_CPUS))
			prefetch(&per_cpu(page_states, next_cpu));

		out = (unsigned long *)ret;
		for (off = 0; off < nr; off++)
			*out++ += *in++;
	}
}

void get_full_page_state(struct page_state *ret)
{
	cpumask_t mask = CPU_MASK_ALL;

	__get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask);
}
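
/*
 * Illustrative sketch, not part of this file: a minimal single-threaded
 * userspace model of the accumulation done by __get_page_state() above.
 * Each per-CPU page_state is treated as a flat array of unsigned long and
 * the first "nr" fields are summed into the caller's buffer.  All names
 * below (model_page_state, MODEL_NR_CPUS, ...) are hypothetical, and the
 * block is compiled out.
 */
#if 0
#include <stdio.h>
#include <string.h>

#define MODEL_NR_CPUS 4

struct model_page_state {
	unsigned long pgpgin;
	unsigned long pgpgout;
	unsigned long pgfault;
};

static struct model_page_state model_states[MODEL_NR_CPUS];

static void model_get_page_state(struct model_page_state *ret, int nr)
{
	int cpu, off;

	memset(ret, 0, nr * sizeof(unsigned long));
	for (cpu = 0; cpu < MODEL_NR_CPUS; cpu++) {
		unsigned long *in = (unsigned long *)&model_states[cpu];
		unsigned long *out = (unsigned long *)ret;

		for (off = 0; off < nr; off++)
			*out++ += *in++;
	}
}

int main(void)
{
	struct model_page_state total;
	int cpu;

	for (cpu = 0; cpu < MODEL_NR_CPUS; cpu++) {
		model_states[cpu].pgpgin = cpu + 1;
		model_states[cpu].pgfault = 10 * (cpu + 1);
	}
	model_get_page_state(&total, sizeof(total) / sizeof(unsigned long));
	printf("pgpgin=%lu pgfault=%lu\n", total.pgpgin, total.pgfault);
	return 0;
}
#endif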

void __mod_page_state_offset(unsigned long offset, unsigned long delta)
{
	void *ptr;

	ptr = &__get_cpu_var(page_states);
	*(unsigned long *)(ptr + offset) += delta;
}
EXPORT_SYMBOL(__mod_page_state_offset);

void mod_page_state_offset(unsigned long offset, unsigned long delta)
{
	unsigned long flags;
	void *ptr;

	local_irq_save(flags);
	ptr = &__get_cpu_var(page_states);
	*(unsigned long *)(ptr + offset) += delta;
	local_irq_restore(flags);
}
EXPORT_SYMBOL(mod_page_state_offset);

void __get_zone_counts(unsigned long *active, unsigned long *inactive,
			unsigned long *free, struct pglist_data *pgdat)
{
	struct zone *zones = pgdat->node_zones;
	int i;

	*active = 0;
	*inactive = 0;
	*free = 0;
	for (i = 0; i < MAX_NR_ZONES; i++) {
		*active += zones[i].nr_active;
		*inactive += zones[i].nr_inactive;
		*free += zones[i].free_pages;
	}
}

void get_zone_counts(unsigned long *active,
		unsigned long *inactive, unsigned long *free)
{
	struct pglist_data *pgdat;

	*active = 0;
	*inactive = 0;
	*free = 0;
	for_each_online_pgdat(pgdat) {
		unsigned long l, m, n;
		__get_zone_counts(&l, &m, &n, pgdat);
		*active += l;
		*inactive += m;
		*free += n;
	}
}

/*
 * Manage combined zone based / global counters
 *
 * vm_stat contains the global counters
 */
atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
EXPORT_SYMBOL(vm_stat);

#ifdef CONFIG_SMP

#define STAT_THRESHOLD 32

/*
 * Determine pointer to currently valid differential byte given a zone and
 * the item number.
 *
 * Preemption must be off
 */
static inline s8 *diff_pointer(struct zone *zone, enum zone_stat_item item)
{
	return &zone_pcp(zone, smp_processor_id())->vm_stat_diff[item];
}

/*
 * For use when we know that interrupts are disabled.
 */
void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
				int delta)
{
	s8 *p;
	long x;

	p = diff_pointer(zone, item);
	x = delta + *p;

	if (unlikely(x > STAT_THRESHOLD || x < -STAT_THRESHOLD)) {
		zone_page_state_add(x, zone, item);
		x = 0;
	}

	*p = x;
}
EXPORT_SYMBOL(__mod_zone_page_state);
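
/*
 * Illustrative sketch, not part of this file: a minimal single-threaded
 * userspace model of the differential scheme used by
 * __mod_zone_page_state() above.  Small deltas accumulate in a per-CPU
 * s8 and are only folded into the shared counter once they cross
 * STAT_THRESHOLD, which keeps most updates off the shared counter.
 * All names below are hypothetical and the block is compiled out.
 */
#if 0
#include <stdio.h>

#define MODEL_THRESHOLD 32

static long model_global;	/* stands in for vm_stat[item] */
static signed char model_diff;	/* stands in for vm_stat_diff[item] */

static void model_mod_state(int delta)
{
	long x = delta + model_diff;

	if (x > MODEL_THRESHOLD || x < -MODEL_THRESHOLD) {
		model_global += x;	/* fold into the global counter */
		x = 0;
	}
	model_diff = x;
}

int main(void)
{
	int i;

	for (i = 0; i < 100; i++)
		model_mod_state(1);
	/* the global only reflects folded batches; the rest is pending */
	printf("global=%ld pending=%d approx=%ld\n",
	       model_global, model_diff, model_global + model_diff);
	return 0;
}
#endif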

/*
 * For an unknown interrupt state
 */
void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
					int delta)
{
	unsigned long flags;

	local_irq_save(flags);
	__mod_zone_page_state(zone, item, delta);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(mod_zone_page_state);

/*
 * Optimized increment and decrement functions.
 *
 * These are only for a single page and therefore can take a struct page *
 * argument instead of struct zone *. This allows the inclusion of the code
 * generated for page_zone(page) into the optimized functions.
 *
 * No overflow check is necessary and therefore the differential can be
 * incremented or decremented in place which may allow the compilers to
 * generate better code.
 *
 * The increment or decrement is known and therefore one boundary check can
 * be omitted.
 *
 * Some processors have inc/dec instructions that are atomic vs an interrupt.
 * However, the code must first determine the differential location in a zone
 * based on the processor number and then inc/dec the counter. There is no
 * guarantee without disabling preemption that the processor will not change
 * in between and therefore the atomicity vs. interrupt cannot be exploited
 * in a useful way here.
 */
void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
	struct zone *zone = page_zone(page);
	s8 *p = diff_pointer(zone, item);

	(*p)++;

	if (unlikely(*p > STAT_THRESHOLD)) {
		zone_page_state_add(*p, zone, item);
		*p = 0;
	}
}
EXPORT_SYMBOL(__inc_zone_page_state);

void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
	struct zone *zone = page_zone(page);
	s8 *p = diff_pointer(zone, item);

	(*p)--;

	if (unlikely(*p < -STAT_THRESHOLD)) {
		zone_page_state_add(*p, zone, item);
		*p = 0;
	}
}
EXPORT_SYMBOL(__dec_zone_page_state);

void inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
	unsigned long flags;
	struct zone *zone;
	s8 *p;

	zone = page_zone(page);
	local_irq_save(flags);
	p = diff_pointer(zone, item);

	(*p)++;

	if (unlikely(*p > STAT_THRESHOLD)) {
		zone_page_state_add(*p, zone, item);
		*p = 0;
	}
	local_irq_restore(flags);
}
EXPORT_SYMBOL(inc_zone_page_state);

void dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
	unsigned long flags;
	struct zone *zone;
	s8 *p;

	zone = page_zone(page);
	local_irq_save(flags);
	p = diff_pointer(zone, item);

	(*p)--;

	if (unlikely(*p < -STAT_THRESHOLD)) {
		zone_page_state_add(*p, zone, item);
		*p = 0;
	}
	local_irq_restore(flags);
}
EXPORT_SYMBOL(dec_zone_page_state);

/*
 * Update the zone counters for one cpu.
 */
void refresh_cpu_vm_stats(int cpu)
{
	struct zone *zone;
	int i;
	unsigned long flags;

	for_each_zone(zone) {
		struct per_cpu_pageset *pcp;

		pcp = zone_pcp(zone, cpu);

		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
			if (pcp->vm_stat_diff[i]) {
				local_irq_save(flags);
				zone_page_state_add(pcp->vm_stat_diff[i],
					zone, i);
				pcp->vm_stat_diff[i] = 0;
				local_irq_restore(flags);
			}
	}
}

static void __refresh_cpu_vm_stats(void *dummy)
{
	refresh_cpu_vm_stats(smp_processor_id());
}

/*
 * Consolidate all counters.
 *
 * Note that the result becomes less inaccurate, but is still inaccurate,
 * if other CPUs are allowed to keep updating counters concurrently.
 */
void refresh_vm_stats(void)
{
	on_each_cpu(__refresh_cpu_vm_stats, NULL, 0, 1);
}
EXPORT_SYMBOL(refresh_vm_stats);

#endif

#ifdef CONFIG_PROC_FS

#include <linux/seq_file.h>

static void *frag_start(struct seq_file *m, loff_t *pos)
{
	pg_data_t *pgdat;
	loff_t node = *pos;
	for (pgdat = first_online_pgdat();
	     pgdat && node;
	     pgdat = next_online_pgdat(pgdat))
		--node;

	return pgdat;
}

static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
{
	pg_data_t *pgdat = (pg_data_t *)arg;

	(*pos)++;
	return next_online_pgdat(pgdat);
}

static void frag_stop(struct seq_file *m, void *arg)
{
}

/*
 * This walks the free areas for each zone.
 */
static int frag_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = (pg_data_t *)arg;
	struct zone *zone;
	struct zone *node_zones = pgdat->node_zones;
	unsigned long flags;
	int order;

	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
		if (!populated_zone(zone))
			continue;

		spin_lock_irqsave(&zone->lock, flags);
		seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
		for (order = 0; order < MAX_ORDER; ++order)
			seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
		spin_unlock_irqrestore(&zone->lock, flags);
		seq_putc(m, '\n');
	}
	return 0;
}

struct seq_operations fragmentation_op = {
	.start	= frag_start,
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= frag_show,
};
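
/*
 * Illustrative sketch, not part of this file: a minimal userspace model of
 * how the seq_file core drives the start/next/stop/show callbacks wired up
 * in fragmentation_op above.  ->start positions the iterator at *pos,
 * ->show emits one record, ->next advances, and ->stop ends the pass.
 * This is simplified (the real seq_file core may call ->start again each
 * time its buffer fills); the node list and output are made up for
 * illustration and the block is compiled out.
 */
#if 0
#include <stdio.h>

#define MODEL_NODES 2

struct model_ops {
	void *(*start)(long *pos);
	void *(*next)(void *v, long *pos);
	void (*stop)(void *v);
	int (*show)(void *v);
};

static int model_node_ids[MODEL_NODES] = { 0, 1 };

static void *model_start(long *pos)
{
	return *pos < MODEL_NODES ? &model_node_ids[*pos] : NULL;
}

static void *model_next(void *v, long *pos)
{
	(*pos)++;
	return *pos < MODEL_NODES ? &model_node_ids[*pos] : NULL;
}

static void model_stop(void *v)
{
}

static int model_show(void *v)
{
	printf("Node %d, zone ...\n", *(int *)v);
	return 0;
}

static struct model_ops ops = {
	.start	= model_start,
	.next	= model_next,
	.stop	= model_stop,
	.show	= model_show,
};

int main(void)
{
	long pos = 0;
	void *v;

	for (v = ops.start(&pos); v; v = ops.next(v, &pos))
		ops.show(v);
	ops.stop(v);
	return 0;
}
#endif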

static char *vmstat_text[] = {
	/* Zoned VM counters */
	"nr_anon_pages",
	"nr_mapped",
	"nr_file_pages",
	"nr_slab",
	"nr_page_table_pages",
	"nr_dirty",
	"nr_writeback",
	"nr_unstable",
	"nr_bounce",

	/* Event counters */
	"pgpgin",
	"pgpgout",
	"pswpin",
	"pswpout",

	"pgalloc_high",
	"pgalloc_normal",
	"pgalloc_dma32",
	"pgalloc_dma",

	"pgfree",
	"pgactivate",
	"pgdeactivate",

	"pgfault",
	"pgmajfault",

	"pgrefill_high",
	"pgrefill_normal",
	"pgrefill_dma32",
	"pgrefill_dma",

	"pgsteal_high",
	"pgsteal_normal",
	"pgsteal_dma32",
	"pgsteal_dma",

	"pgscan_kswapd_high",
	"pgscan_kswapd_normal",
	"pgscan_kswapd_dma32",
	"pgscan_kswapd_dma",

	"pgscan_direct_high",
	"pgscan_direct_normal",
	"pgscan_direct_dma32",
	"pgscan_direct_dma",

	"pginodesteal",
	"slabs_scanned",
	"kswapd_steal",
	"kswapd_inodesteal",
	"pageoutrun",
	"allocstall",

	"pgrotated",
};

/*
 * Output information about zones in @pgdat.
 */
static int zoneinfo_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = arg;
	struct zone *zone;
	struct zone *node_zones = pgdat->node_zones;
	unsigned long flags;

	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) {
		int i;

		if (!populated_zone(zone))
			continue;

		spin_lock_irqsave(&zone->lock, flags);
		seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
		seq_printf(m,
			   "\n  pages free     %lu"
			   "\n        min      %lu"
			   "\n        low      %lu"
			   "\n        high     %lu"
			   "\n        active   %lu"
			   "\n        inactive %lu"
			   "\n        scanned  %lu (a: %lu i: %lu)"
			   "\n        spanned  %lu"
			   "\n        present  %lu",
			   zone->free_pages,
			   zone->pages_min,
			   zone->pages_low,
			   zone->pages_high,
			   zone->nr_active,
			   zone->nr_inactive,
			   zone->pages_scanned,
			   zone->nr_scan_active, zone->nr_scan_inactive,
			   zone->spanned_pages,
			   zone->present_pages);

		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
			seq_printf(m, "\n    %-12s %lu", vmstat_text[i],
					zone_page_state(zone, i));

		seq_printf(m,
			   "\n        protection: (%lu",
			   zone->lowmem_reserve[0]);
		for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
			seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
		seq_printf(m,
			   ")"
			   "\n  pagesets");
		for_each_online_cpu(i) {
			struct per_cpu_pageset *pageset;
			int j;

			pageset = zone_pcp(zone, i);
			for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
				if (pageset->pcp[j].count)
					break;
			}
			if (j == ARRAY_SIZE(pageset->pcp))
				continue;
			for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
				seq_printf(m,
					   "\n    cpu: %i pcp: %i"
					   "\n              count: %i"
					   "\n              high:  %i"
					   "\n              batch: %i",
					   i, j,
					   pageset->pcp[j].count,
					   pageset->pcp[j].high,
					   pageset->pcp[j].batch);
			}
#ifdef CONFIG_NUMA
			seq_printf(m,
				   "\n            numa_hit:       %lu"
				   "\n            numa_miss:      %lu"
				   "\n            numa_foreign:   %lu"
				   "\n            interleave_hit: %lu"
				   "\n            local_node:     %lu"
				   "\n            other_node:     %lu",
				   pageset->numa_hit,
				   pageset->numa_miss,
				   pageset->numa_foreign,
				   pageset->interleave_hit,
				   pageset->local_node,
				   pageset->other_node);
#endif
		}
		seq_printf(m,
			   "\n  all_unreclaimable: %u"
			   "\n  prev_priority:     %i"
			   "\n  temp_priority:     %i"
			   "\n  start_pfn:         %lu",
			   zone->all_unreclaimable,
			   zone->prev_priority,
			   zone->temp_priority,
			   zone->zone_start_pfn);
		spin_unlock_irqrestore(&zone->lock, flags);
		seq_putc(m, '\n');
	}
	return 0;
}

struct seq_operations zoneinfo_op = {
	.start	= frag_start, /* iterate over all zones. The same as in
			       * fragmentation. */
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= zoneinfo_show,
};

static void *vmstat_start(struct seq_file *m, loff_t *pos)
{
	unsigned long *v;
	struct page_state *ps;
	int i;

	if (*pos >= ARRAY_SIZE(vmstat_text))
		return NULL;

	v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long)
			+ sizeof(*ps), GFP_KERNEL);
	m->private = v;
	if (!v)
		return ERR_PTR(-ENOMEM);
	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
		v[i] = global_page_state(i);
	ps = (struct page_state *)(v + NR_VM_ZONE_STAT_ITEMS);
	get_full_page_state(ps);
	ps->pgpgin /= 2;		/* sectors -> kbytes */
	ps->pgpgout /= 2;
	return v + *pos;
}

static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
{
	(*pos)++;
	if (*pos >= ARRAY_SIZE(vmstat_text))
		return NULL;
	return (unsigned long *)m->private + *pos;
}

static int vmstat_show(struct seq_file *m, void *arg)
{
	unsigned long *l = arg;
	unsigned long off = l - (unsigned long *)m->private;

	seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
	return 0;
}

static void vmstat_stop(struct seq_file *m, void *arg)
{
	kfree(m->private);
	m->private = NULL;
}

struct seq_operations vmstat_op = {
	.start	= vmstat_start,
	.next	= vmstat_next,
	.stop	= vmstat_stop,
	.show	= vmstat_show,
};
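
/*
 * Illustrative sketch, not part of this file: a minimal userspace model of
 * how vmstat_start()/vmstat_show() above cooperate.  ->start allocates one
 * flat array holding a snapshot of all counters, and each ->show call
 * prints the name/value pair at the current offset into that array.
 * Names and values below are made up for illustration and the block is
 * compiled out.
 */
#if 0
#include <stdio.h>
#include <stdlib.h>

static const char *model_text[] = { "nr_dirty", "nr_writeback", "pgfault" };
#define MODEL_ITEMS (sizeof(model_text) / sizeof(model_text[0]))

static unsigned long *model_start(long pos)
{
	unsigned long *v;
	unsigned int i;

	if (pos >= (long)MODEL_ITEMS)
		return NULL;
	v = malloc(MODEL_ITEMS * sizeof(unsigned long));
	if (!v)
		return NULL;
	for (i = 0; i < MODEL_ITEMS; i++)
		v[i] = 100 * i;		/* fake snapshot values */
	return v + pos;
}

static void model_show(unsigned long *base, unsigned long *l)
{
	printf("%s %lu\n", model_text[l - base], *l);
}

int main(void)
{
	long pos = 0;
	unsigned long *v = model_start(pos);
	unsigned long *base = v;

	for (; v && pos < (long)MODEL_ITEMS; v = base + ++pos)
		model_show(base, v);
	free(base);
	return 0;
}
#endif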

#endif /* CONFIG_PROC_FS */
