page_cgroup.c revision 6b3ae58efca06623c197fd6d91ded4aa3a8fe039
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/bootmem.h>
#include <linux/bit_spinlock.h>
#include <linux/page_cgroup.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/memory.h>
#include <linux/vmalloc.h>
#include <linux/cgroup.h>
#include <linux/swapops.h>
#include <linux/kmemleak.h>

static void __meminit init_page_cgroup(struct page_cgroup *pc, unsigned long id)
{
	pc->flags = 0;
	set_page_cgroup_array_id(pc, id);
	pc->mem_cgroup = NULL;
	INIT_LIST_HEAD(&pc->lru);
}
static unsigned long total_usage;

#if !defined(CONFIG_SPARSEMEM)


void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
{
	pgdat->node_page_cgroup = NULL;
}

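/*
 * With FLATMEM, each node carries a single page_cgroup array spanning the
 * node's pfn range, so a lookup is simply an index into that array.
 * Roughly (illustrative restatement of the code below, not extra behaviour):
 *
 *	pc = NODE_DATA(nid)->node_page_cgroup + (pfn - node_start_pfn);
 */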
struct page_cgroup *lookup_page_cgroup(struct page *page)
{
	unsigned long pfn = page_to_pfn(page);
	unsigned long offset;
	struct page_cgroup *base;

	base = NODE_DATA(page_to_nid(page))->node_page_cgroup;
	if (unlikely(!base))
		return NULL;

	offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn;
	return base + offset;
}

struct page *lookup_cgroup_page(struct page_cgroup *pc)
{
	unsigned long pfn;
	struct page *page;
	pg_data_t *pgdat;

	pgdat = NODE_DATA(page_cgroup_array_id(pc));
	pfn = pc - pgdat->node_page_cgroup + pgdat->node_start_pfn;
	page = pfn_to_page(pfn);
	VM_BUG_ON(pc != lookup_page_cgroup(page));
	return page;
}

static int __init alloc_node_page_cgroup(int nid)
{
	struct page_cgroup *base, *pc;
	unsigned long table_size;
	unsigned long start_pfn, nr_pages, index;

	start_pfn = NODE_DATA(nid)->node_start_pfn;
	nr_pages = NODE_DATA(nid)->node_spanned_pages;

	if (!nr_pages)
		return 0;

	table_size = sizeof(struct page_cgroup) * nr_pages;

	base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
	if (!base)
		return -ENOMEM;
	for (index = 0; index < nr_pages; index++) {
		pc = base + index;
		init_page_cgroup(pc, nid);
	}
	NODE_DATA(nid)->node_page_cgroup = base;
	total_usage += table_size;
	return 0;
}

void __init page_cgroup_init_flatmem(void)
{

	int nid, fail;

	if (mem_cgroup_disabled())
		return;

	for_each_online_node(nid) {
		fail = alloc_node_page_cgroup(nid);
		if (fail)
			goto fail;
	}
	printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
	printk(KERN_INFO "please try 'cgroup_disable=memory' option if you"
	" don't want memory cgroups\n");
	return;
fail:
	printk(KERN_CRIT "allocation of page_cgroup failed.\n");
	printk(KERN_CRIT "please try 'cgroup_disable=memory' boot option\n");
	panic("Out of memory");
}

#else /* CONFIG_SPARSEMEM */

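/*
 * With SPARSEMEM, every mem_section has its own page_cgroup array.  The
 * pointer kept in section->page_cgroup is biased by the section's start
 * pfn (see init_section_page_cgroup() below), so the lookup can add the
 * absolute pfn directly instead of recomputing an offset.
 */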
struct page_cgroup *lookup_page_cgroup(struct page *page)
{
	unsigned long pfn = page_to_pfn(page);
	struct mem_section *section = __pfn_to_section(pfn);

	if (!section->page_cgroup)
		return NULL;
	return section->page_cgroup + pfn;
}

struct page *lookup_cgroup_page(struct page_cgroup *pc)
{
	struct mem_section *section;
	struct page *page;
	unsigned long nr;

	nr = page_cgroup_array_id(pc);
	section = __nr_to_section(nr);
	page = pfn_to_page(pc - section->page_cgroup);
	VM_BUG_ON(pc != lookup_page_cgroup(page));
	return page;
}

/* __alloc_bootmem...() is protected by !slab_available() */
static int __init_refok init_section_page_cgroup(unsigned long pfn)
{
	struct page_cgroup *base, *pc;
	struct mem_section *section;
	unsigned long table_size;
	unsigned long nr;
	int nid, index;

	nr = pfn_to_section_nr(pfn);
	section = __nr_to_section(nr);

	if (section->page_cgroup)
		return 0;

	nid = page_to_nid(pfn_to_page(pfn));
	table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
	VM_BUG_ON(!slab_is_available());
	if (node_state(nid, N_HIGH_MEMORY)) {
		base = kmalloc_node(table_size,
				    GFP_KERNEL | __GFP_NOWARN, nid);
		if (!base)
			base = vmalloc_node(table_size, nid);
	} else {
		base = kmalloc(table_size, GFP_KERNEL | __GFP_NOWARN);
		if (!base)
			base = vmalloc(table_size);
	}
	/*
	 * The value stored in section->page_cgroup is (base - pfn)
	 * and it does not point to the memory block allocated above,
	 * causing kmemleak false positives.
	 */
	kmemleak_not_leak(base);

	if (!base) {
		printk(KERN_ERR "page cgroup allocation failure\n");
		return -ENOMEM;
	}

	for (index = 0; index < PAGES_PER_SECTION; index++) {
		pc = base + index;
		init_page_cgroup(pc, nr);
	}

	section->page_cgroup = base - pfn;
	total_usage += table_size;
	return 0;
}
#ifdef CONFIG_MEMORY_HOTPLUG
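/*
 * Memory hotplug: a section's page_cgroup array is allocated when the
 * memory goes online (MEM_GOING_ONLINE) and freed again when it is taken
 * offline (MEM_OFFLINE); see page_cgroup_callback() below.
 */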
void __free_page_cgroup(unsigned long pfn)
{
	struct mem_section *ms;
	struct page_cgroup *base;

	ms = __pfn_to_section(pfn);
	if (!ms || !ms->page_cgroup)
		return;
	base = ms->page_cgroup + pfn;
	if (is_vmalloc_addr(base)) {
		vfree(base);
		ms->page_cgroup = NULL;
	} else {
		struct page *page = virt_to_page(base);
		if (!PageReserved(page)) { /* Is bootmem ? */
			kfree(base);
			ms->page_cgroup = NULL;
		}
	}
}

int __meminit online_page_cgroup(unsigned long start_pfn,
			unsigned long nr_pages,
			int nid)
{
	unsigned long start, end, pfn;
	int fail = 0;

	start = start_pfn & ~(PAGES_PER_SECTION - 1);
	end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);

	for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
		if (!pfn_present(pfn))
			continue;
		fail = init_section_page_cgroup(pfn);
	}
	if (!fail)
		return 0;

	/* rollback */
	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
		__free_page_cgroup(pfn);

	return -ENOMEM;
}

int __meminit offline_page_cgroup(unsigned long start_pfn,
		unsigned long nr_pages, int nid)
{
	unsigned long start, end, pfn;

	start = start_pfn & ~(PAGES_PER_SECTION - 1);
	end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);

	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
		__free_page_cgroup(pfn);
	return 0;
}

static int __meminit page_cgroup_callback(struct notifier_block *self,
			       unsigned long action, void *arg)
{
	struct memory_notify *mn = arg;
	int ret = 0;
	switch (action) {
	case MEM_GOING_ONLINE:
		ret = online_page_cgroup(mn->start_pfn,
				   mn->nr_pages, mn->status_change_nid);
		break;
	case MEM_OFFLINE:
		offline_page_cgroup(mn->start_pfn,
				mn->nr_pages, mn->status_change_nid);
		break;
	case MEM_CANCEL_ONLINE:
	case MEM_GOING_OFFLINE:
		break;
	case MEM_ONLINE:
	case MEM_CANCEL_OFFLINE:
		break;
	}

	return notifier_from_errno(ret);
}

#endif

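/*
 * Boot-time setup for the SPARSEMEM case: walk every present section up to
 * max_pfn, allocate its page_cgroup array, and register the memory-hotplug
 * notifier so that sections added later are covered as well.
 */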
void __init page_cgroup_init(void)
{
	unsigned long pfn;
	int fail = 0;

	if (mem_cgroup_disabled())
		return;

	for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) {
		if (!pfn_present(pfn))
			continue;
		fail = init_section_page_cgroup(pfn);
	}
	if (fail) {
		printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
		panic("Out of memory");
	} else {
		hotplug_memory_notifier(page_cgroup_callback, 0);
	}
	printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
	printk(KERN_INFO "please try 'cgroup_disable=memory' option if you don't"
	" want memory cgroups\n");
}

void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
{
	return;
}

#endif


#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP

static DEFINE_MUTEX(swap_cgroup_mutex);
struct swap_cgroup_ctrl {
	struct page **map;
	unsigned long length;
	spinlock_t	lock;
};

struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];

struct swap_cgroup {
	unsigned short		id;
};
#define SC_PER_PAGE	(PAGE_SIZE/sizeof(struct swap_cgroup))
#define SC_POS_MASK	(SC_PER_PAGE - 1)
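
/*
 * A swap entry's offset is split into a map-page index and a position on
 * that page (illustrative restatement of the callers below):
 *
 *	idx = offset / SC_PER_PAGE;	(which page in ctrl->map[])
 *	pos = offset & SC_POS_MASK;	(which swap_cgroup on that page)
 *
 * For example, with 4096-byte pages and a 2-byte struct swap_cgroup,
 * SC_PER_PAGE is 2048; the exact value depends on the architecture's
 * PAGE_SIZE.
 */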

/*
 * SwapCgroup implements "lookup" and "exchange" operations.
 * In typical usage, swap_cgroup is accessed via memcg's charge/uncharge
 * operations against SwapCache. At swap_free(), it is accessed directly
 * from swap.
 *
 * This means:
 *  - there is no race in "exchange" when the entry is reached via SwapCache,
 *    because the SwapCache (and its swp_entry) is under lock.
 *  - when called via swap_free(), there is no other user of the entry, so
 *    again there is no race.
 * Concurrent updates to entries that share a map page are serialized by
 * ctrl->lock.
 *
 * TODO: we can push these buffers out to HIGHMEM.
 */
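
/*
 * Typical usage (illustrative sketch; the actual callers live in
 * mm/memcontrol.c): swap_cgroup_record() remembers the owning cgroup's id
 * when an entry starts being used, lookup_swap_cgroup() retrieves it when
 * the entry is charged back or freed, and swap_cgroup_cmpxchg() replaces
 * it when charges are moved between cgroups.
 */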

/*
 * allocate buffer for swap_cgroup.
 */
static int swap_cgroup_prepare(int type)
{
	struct page *page;
	struct swap_cgroup_ctrl *ctrl;
	unsigned long idx, max;

	ctrl = &swap_cgroup_ctrl[type];

	for (idx = 0; idx < ctrl->length; idx++) {
		page = alloc_page(GFP_KERNEL | __GFP_ZERO);
		if (!page)
			goto not_enough_page;
		ctrl->map[idx] = page;
	}
	return 0;
not_enough_page:
	max = idx;
	for (idx = 0; idx < max; idx++)
		__free_page(ctrl->map[idx]);

	return -ENOMEM;
}

/**
 * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
 * @ent: swap entry to be cmpxchged
 * @old: old id
 * @new: new id
 *
 * Returns old id on success, 0 on failure.
 * (No mem_cgroup uses 0 as its id.)
 */
unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
					unsigned short old, unsigned short new)
{
	int type = swp_type(ent);
	unsigned long offset = swp_offset(ent);
	unsigned long idx = offset / SC_PER_PAGE;
	unsigned long pos = offset & SC_POS_MASK;
	struct swap_cgroup_ctrl *ctrl;
	struct page *mappage;
	struct swap_cgroup *sc;
	unsigned long flags;
	unsigned short retval;

	ctrl = &swap_cgroup_ctrl[type];

	mappage = ctrl->map[idx];
	sc = page_address(mappage);
	sc += pos;
	spin_lock_irqsave(&ctrl->lock, flags);
	retval = sc->id;
	if (retval == old)
		sc->id = new;
	else
		retval = 0;
	spin_unlock_irqrestore(&ctrl->lock, flags);
	return retval;
}

/**
 * swap_cgroup_record - record mem_cgroup for this swp_entry.
 * @ent: swap entry to be recorded into
 * @id: mem_cgroup's css id to be recorded
 *
 * Returns the id previously recorded for this entry (which may be 0).
 */
unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
{
	int type = swp_type(ent);
	unsigned long offset = swp_offset(ent);
	unsigned long idx = offset / SC_PER_PAGE;
	unsigned long pos = offset & SC_POS_MASK;
	struct swap_cgroup_ctrl *ctrl;
	struct page *mappage;
	struct swap_cgroup *sc;
	unsigned short old;
	unsigned long flags;

	ctrl = &swap_cgroup_ctrl[type];

	mappage = ctrl->map[idx];
	sc = page_address(mappage);
	sc += pos;
	spin_lock_irqsave(&ctrl->lock, flags);
	old = sc->id;
	sc->id = id;
	spin_unlock_irqrestore(&ctrl->lock, flags);

	return old;
}

/**
 * lookup_swap_cgroup - lookup mem_cgroup tied to swap entry
 * @ent: swap entry to be looked up.
 *
 * Returns the CSS ID of the mem_cgroup on success, 0 on failure
 * (0 is an invalid ID).
 */
unsigned short lookup_swap_cgroup(swp_entry_t ent)
{
	int type = swp_type(ent);
	unsigned long offset = swp_offset(ent);
	unsigned long idx = offset / SC_PER_PAGE;
	unsigned long pos = offset & SC_POS_MASK;
	struct swap_cgroup_ctrl *ctrl;
	struct page *mappage;
	struct swap_cgroup *sc;
	unsigned short ret;

	ctrl = &swap_cgroup_ctrl[type];
	mappage = ctrl->map[idx];
	sc = page_address(mappage);
	sc += pos;
	ret = sc->id;
	return ret;
}

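/*
 * swapon/swapoff hooks: swap_cgroup_swapon() sizes ctrl->map[] so every
 * possible offset (up to max_pages) has a backing page of swap_cgroup
 * entries, then swap_cgroup_prepare() allocates those pages;
 * swap_cgroup_swapoff() frees the pages and the map array again.
 */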
int swap_cgroup_swapon(int type, unsigned long max_pages)
{
	void *array;
	unsigned long array_size;
	unsigned long length;
	struct swap_cgroup_ctrl *ctrl;

	if (!do_swap_account)
		return 0;

	length = ((max_pages/SC_PER_PAGE) + 1);
	array_size = length * sizeof(void *);

	array = vmalloc(array_size);
	if (!array)
		goto nomem;

	memset(array, 0, array_size);
	ctrl = &swap_cgroup_ctrl[type];
	mutex_lock(&swap_cgroup_mutex);
	ctrl->length = length;
	ctrl->map = array;
	spin_lock_init(&ctrl->lock);
	if (swap_cgroup_prepare(type)) {
		/* memory shortage */
		ctrl->map = NULL;
		ctrl->length = 0;
		vfree(array);
		mutex_unlock(&swap_cgroup_mutex);
		goto nomem;
	}
	mutex_unlock(&swap_cgroup_mutex);

	return 0;
nomem:
	printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
	printk(KERN_INFO
		"swap_cgroup can be disabled by noswapaccount boot option\n");
	return -ENOMEM;
}

void swap_cgroup_swapoff(int type)
{
	int i;
	struct swap_cgroup_ctrl *ctrl;

	if (!do_swap_account)
		return;

	mutex_lock(&swap_cgroup_mutex);
	ctrl = &swap_cgroup_ctrl[type];
	if (ctrl->map) {
		for (i = 0; i < ctrl->length; i++) {
			struct page *page = ctrl->map[i];
			if (page)
				__free_page(page);
		}
		vfree(ctrl->map);
		ctrl->map = NULL;
		ctrl->length = 0;
	}
	mutex_unlock(&swap_cgroup_mutex);
}

#endif