memcontrol.c revision c772be939e078afd2505ede7d596a30f8f61de95
1/* memcontrol.c - Memory Controller
2 *
3 * Copyright IBM Corporation, 2007
4 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
5 *
6 * Copyright 2007 OpenVZ SWsoft Inc
7 * Author: Pavel Emelianov <xemul@openvz.org>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17 * GNU General Public License for more details.
18 */
19
20#include <linux/res_counter.h>
21#include <linux/memcontrol.h>
22#include <linux/cgroup.h>
23#include <linux/mm.h>
24#include <linux/pagemap.h>
25#include <linux/smp.h>
26#include <linux/page-flags.h>
27#include <linux/backing-dev.h>
28#include <linux/bit_spinlock.h>
29#include <linux/rcupdate.h>
30#include <linux/mutex.h>
31#include <linux/slab.h>
32#include <linux/swap.h>
33#include <linux/spinlock.h>
34#include <linux/fs.h>
35#include <linux/seq_file.h>
36#include <linux/vmalloc.h>
37#include <linux/mm_inline.h>
38#include <linux/page_cgroup.h>
39#include "internal.h"
40
41#include <asm/uaccess.h>
42
43struct cgroup_subsys mem_cgroup_subsys __read_mostly;
44#define MEM_CGROUP_RECLAIM_RETRIES	5
45
46#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
47/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
48int do_swap_account __read_mostly;
49static int really_do_swap_account __initdata = 1; /* to remember the boot option */
50#else
51#define do_swap_account		(0)
52#endif
53
54
55/*
56 * Statistics for memory cgroup.
57 */
58enum mem_cgroup_stat_index {
59	/*
60	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
61	 */
62	MEM_CGROUP_STAT_CACHE, 	   /* # of pages charged as cache */
63	MEM_CGROUP_STAT_RSS,	   /* # of pages charged as rss */
64	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
65	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */
66
67	MEM_CGROUP_STAT_NSTATS,
68};
69
70struct mem_cgroup_stat_cpu {
71	s64 count[MEM_CGROUP_STAT_NSTATS];
72} ____cacheline_aligned_in_smp;
73
74struct mem_cgroup_stat {
75	struct mem_cgroup_stat_cpu cpustat[0];
76};
77
78/*
79 * For accounting with irqs disabled, there is no need to increment the preempt count.
80 */
81static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat,
82		enum mem_cgroup_stat_index idx, int val)
83{
84	stat->count[idx] += val;
85}
86
87static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
88		enum mem_cgroup_stat_index idx)
89{
90	int cpu;
91	s64 ret = 0;
92	for_each_possible_cpu(cpu)
93		ret += stat->cpustat[cpu].count[idx];
94	return ret;
95}
96
97/*
98 * per-zone information in memory controller.
99 */
100struct mem_cgroup_per_zone {
101	/*
102	 * spin_lock to protect the per cgroup LRU
103	 */
104	struct list_head	lists[NR_LRU_LISTS];
105	unsigned long		count[NR_LRU_LISTS];
106
107	struct zone_reclaim_stat reclaim_stat;
108};
109/* Macro for accessing counter */
110#define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])
111
112struct mem_cgroup_per_node {
113	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
114};
115
116struct mem_cgroup_lru_info {
117	struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
118};
119
120/*
121 * The memory controller data structure. The memory controller controls both
122 * page cache and RSS per cgroup. We would eventually like to provide
123 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
124 * to help the administrator determine what knobs to tune.
125 *
126 * TODO: Add a water mark for the memory controller. Reclaim will begin when
127 * we hit the water mark. Maybe even add a low water mark, such that
128 * no reclaim occurs from a cgroup at its low water mark; this is
129 * a feature that will be implemented much later in the future.
130 */
131struct mem_cgroup {
132	struct cgroup_subsys_state css;
133	/*
134	 * the counter to account for memory usage
135	 */
136	struct res_counter res;
137	/*
138	 * the counter to account for mem+swap usage.
139	 */
140	struct res_counter memsw;
141	/*
142	 * Per cgroup active and inactive list, similar to the
143	 * per zone LRU lists.
144	 */
145	struct mem_cgroup_lru_info info;
146
147	/*
148	 * protects the reclaim-related members below.
149	 */
150	spinlock_t reclaim_param_lock;
151
152	int	prev_priority;	/* for recording reclaim priority */
153
154	/*
155	 * While reclaiming in a hierarchy, we cache the last child we
156	 * reclaimed from. Protected by cgroup_lock()
157	 */
158	struct mem_cgroup *last_scanned_child;
159	/*
160	 * Should the accounting and control be hierarchical, per subtree?
161	 */
162	bool use_hierarchy;
163	unsigned long	last_oom_jiffies;
164	int		obsolete;
165	atomic_t	refcnt;
166
167	unsigned int	swappiness;
168
169	/*
170	 * statistics. This must be placed at the end of memcg.
171	 */
172	struct mem_cgroup_stat stat;
173};
174
175enum charge_type {
176	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
177	MEM_CGROUP_CHARGE_TYPE_MAPPED,
178	MEM_CGROUP_CHARGE_TYPE_SHMEM,	/* used by page migration of shmem */
179	MEM_CGROUP_CHARGE_TYPE_FORCE,	/* used by force_empty */
180	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
181	NR_CHARGE_TYPE,
182};
183
184/* used only in this file (for readability) */
185#define PCGF_CACHE	(1UL << PCG_CACHE)
186#define PCGF_USED	(1UL << PCG_USED)
187#define PCGF_LOCK	(1UL << PCG_LOCK)
188static const unsigned long
189pcg_default_flags[NR_CHARGE_TYPE] = {
190	PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */
191	PCGF_USED | PCGF_LOCK, /* Anon */
192	PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
193	0, /* FORCE */
194};
195
196/* for encoding cft->private value on file */
197#define _MEM			(0)
198#define _MEMSWAP		(1)
199#define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
200#define MEMFILE_TYPE(val)	(((val) >> 16) & 0xffff)
201#define MEMFILE_ATTR(val)	((val) & 0xffff)
202
203static void mem_cgroup_get(struct mem_cgroup *mem);
204static void mem_cgroup_put(struct mem_cgroup *mem);
205
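/*
 * Update the per-cpu statistics for a charge (charge == true) or uncharge:
 * adjust the cache or rss counter depending on the page type and count
 * the event as a pgpgin or pgpgout.
 */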
206static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
207					 struct page_cgroup *pc,
208					 bool charge)
209{
210	int val = (charge)? 1 : -1;
211	struct mem_cgroup_stat *stat = &mem->stat;
212	struct mem_cgroup_stat_cpu *cpustat;
213	int cpu = get_cpu();
214
215	cpustat = &stat->cpustat[cpu];
216	if (PageCgroupCache(pc))
217		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
218	else
219		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val);
220
221	if (charge)
222		__mem_cgroup_stat_add_safe(cpustat,
223				MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
224	else
225		__mem_cgroup_stat_add_safe(cpustat,
226				MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
227	put_cpu();
228}
229
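/* Look up a memcg's per-zone info for the given node/zone pair. */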
230static struct mem_cgroup_per_zone *
231mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
232{
233	return &mem->info.nodeinfo[nid]->zoneinfo[zid];
234}
235
236static struct mem_cgroup_per_zone *
237page_cgroup_zoneinfo(struct page_cgroup *pc)
238{
239	struct mem_cgroup *mem = pc->mem_cgroup;
240	int nid = page_cgroup_nid(pc);
241	int zid = page_cgroup_zid(pc);
242
243	if (!mem)
244		return NULL;
245
246	return mem_cgroup_zoneinfo(mem, nid, zid);
247}
248
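/* Sum one LRU statistic over all online nodes and all zones of a memcg. */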
249static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem,
250					enum lru_list idx)
251{
252	int nid, zid;
253	struct mem_cgroup_per_zone *mz;
254	u64 total = 0;
255
256	for_each_online_node(nid)
257		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
258			mz = mem_cgroup_zoneinfo(mem, nid, zid);
259			total += MEM_CGROUP_ZSTAT(mz, idx);
260		}
261	return total;
262}
263
264static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
265{
266	return container_of(cgroup_subsys_state(cont,
267				mem_cgroup_subsys_id), struct mem_cgroup,
268				css);
269}
270
271struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
272{
273	/*
274	 * mm_update_next_owner() may clear mm->owner to NULL
275	 * if it races with swapoff, page migration, etc.
276	 * So this can be called with p == NULL.
277	 */
278	if (unlikely(!p))
279		return NULL;
280
281	return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
282				struct mem_cgroup, css);
283}
284
285/*
286 * The following LRU functions are allowed to be used without PCG_LOCK.
287 * They are called by the global LRU routines independently of memcg.
288 * What we have to take care of here is the validity of pc->mem_cgroup.
289 *
290 * pc->mem_cgroup changes when
291 * 1. charging
292 * 2. moving the account
293 * In the typical case, "charge" is done before add-to-lru. The exception is
294 * SwapCache, which is added to the LRU before it is charged.
295 * If the PCG_USED bit is not set, the page_cgroup is not added to this private LRU.
296 * When moving the account, the page is not on the LRU; it has been isolated.
297 */
298
299void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
300{
301	struct page_cgroup *pc;
302	struct mem_cgroup *mem;
303	struct mem_cgroup_per_zone *mz;
304
305	if (mem_cgroup_disabled())
306		return;
307	pc = lookup_page_cgroup(page);
308	/* can happen while we handle swapcache. */
309	if (list_empty(&pc->lru))
310		return;
311	mz = page_cgroup_zoneinfo(pc);
312	mem = pc->mem_cgroup;
313	MEM_CGROUP_ZSTAT(mz, lru) -= 1;
314	list_del_init(&pc->lru);
315	return;
316}
317
318void mem_cgroup_del_lru(struct page *page)
319{
320	mem_cgroup_del_lru_list(page, page_lru(page));
321}
322
323void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
324{
325	struct mem_cgroup_per_zone *mz;
326	struct page_cgroup *pc;
327
328	if (mem_cgroup_disabled())
329		return;
330
331	pc = lookup_page_cgroup(page);
332	smp_rmb();
333	/* unused page is not rotated. */
334	if (!PageCgroupUsed(pc))
335		return;
336	mz = page_cgroup_zoneinfo(pc);
337	list_move(&pc->lru, &mz->lists[lru]);
338}
339
340void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
341{
342	struct page_cgroup *pc;
343	struct mem_cgroup_per_zone *mz;
344
345	if (mem_cgroup_disabled())
346		return;
347	pc = lookup_page_cgroup(page);
348	/* barrier to sync with "charge" */
349	smp_rmb();
350	if (!PageCgroupUsed(pc))
351		return;
352
353	mz = page_cgroup_zoneinfo(pc);
354	MEM_CGROUP_ZSTAT(mz, lru) += 1;
355	list_add(&pc->lru, &mz->lists[lru]);
356}
357/*
358 * To add swapcache to the LRU. Be careful when calling this function:
359 * zone->lru_lock must not be held and irqs must not be disabled.
360 */
361static void mem_cgroup_lru_fixup(struct page *page)
362{
363	if (!isolate_lru_page(page))
364		putback_lru_page(page);
365}
366
367void mem_cgroup_move_lists(struct page *page,
368			   enum lru_list from, enum lru_list to)
369{
370	if (mem_cgroup_disabled())
371		return;
372	mem_cgroup_del_lru_list(page, from);
373	mem_cgroup_add_lru_list(page, to);
374}
375
376int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
377{
378	int ret;
379
380	task_lock(task);
381	ret = task->mm && mm_match_cgroup(task->mm, mem);
382	task_unlock(task);
383	return ret;
384}
385
386/*
387 * Calculate mapped_ratio under memory controller. This will be used in
388 * vmscan.c for determining whether we have to reclaim mapped pages.
389 */
390int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
391{
392	long total, rss;
393
394	/*
395	 * usage is recorded in bytes. But, here, we assume the number of
396	 * physical pages can be represented by "long" on any arch.
397	 */
398	total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L;
399	rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
400	return (int)((rss * 100L) / total);
401}
402
403/*
404 * prev_priority control...this will be used in memory reclaim path.
405 */
406int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
407{
408	int prev_priority;
409
410	spin_lock(&mem->reclaim_param_lock);
411	prev_priority = mem->prev_priority;
412	spin_unlock(&mem->reclaim_param_lock);
413
414	return prev_priority;
415}
416
417void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
418{
419	spin_lock(&mem->reclaim_param_lock);
420	if (priority < mem->prev_priority)
421		mem->prev_priority = priority;
422	spin_unlock(&mem->reclaim_param_lock);
423}
424
425void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
426{
427	spin_lock(&mem->reclaim_param_lock);
428	mem->prev_priority = priority;
429	spin_unlock(&mem->reclaim_param_lock);
430}
431
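/*
 * Allowed active:inactive anon ratio for this memcg: 1 for small groups,
 * growing roughly as sqrt(10 * size-in-GB), matching the global
 * inactive_ratio heuristic.
 */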
432static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
433{
434	unsigned long active;
435	unsigned long inactive;
436	unsigned long gb;
437	unsigned long inactive_ratio;
438
439	inactive = mem_cgroup_get_all_zonestat(memcg, LRU_INACTIVE_ANON);
440	active = mem_cgroup_get_all_zonestat(memcg, LRU_ACTIVE_ANON);
441
442	gb = (inactive + active) >> (30 - PAGE_SHIFT);
443	if (gb)
444		inactive_ratio = int_sqrt(10 * gb);
445	else
446		inactive_ratio = 1;
447
448	if (present_pages) {
449		present_pages[0] = inactive;
450		present_pages[1] = active;
451	}
452
453	return inactive_ratio;
454}
455
456int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
457{
458	unsigned long active;
459	unsigned long inactive;
460	unsigned long present_pages[2];
461	unsigned long inactive_ratio;
462
463	inactive_ratio = calc_inactive_ratio(memcg, present_pages);
464
465	inactive = present_pages[0];
466	active = present_pages[1];
467
468	if (inactive * inactive_ratio < active)
469		return 1;
470
471	return 0;
472}
473
474unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
475				       struct zone *zone,
476				       enum lru_list lru)
477{
478	int nid = zone->zone_pgdat->node_id;
479	int zid = zone_idx(zone);
480	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
481
482	return MEM_CGROUP_ZSTAT(mz, lru);
483}
484
485struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
486						      struct zone *zone)
487{
488	int nid = zone->zone_pgdat->node_id;
489	int zid = zone_idx(zone);
490	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
491
492	return &mz->reclaim_stat;
493}
494
495struct zone_reclaim_stat *
496mem_cgroup_get_reclaim_stat_from_page(struct page *page)
497{
498	struct page_cgroup *pc;
499	struct mem_cgroup_per_zone *mz;
500
501	if (mem_cgroup_disabled())
502		return NULL;
503
504	pc = lookup_page_cgroup(page);
505	mz = page_cgroup_zoneinfo(pc);
506	if (!mz)
507		return NULL;
508
509	return &mz->reclaim_stat;
510}
511
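/*
 * Scan this memcg's private LRU list for the given zone and isolate up to
 * nr_to_scan pages onto @dst; the memcg counterpart of isolate_lru_pages().
 */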
512unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
513					struct list_head *dst,
514					unsigned long *scanned, int order,
515					int mode, struct zone *z,
516					struct mem_cgroup *mem_cont,
517					int active, int file)
518{
519	unsigned long nr_taken = 0;
520	struct page *page;
521	unsigned long scan;
522	LIST_HEAD(pc_list);
523	struct list_head *src;
524	struct page_cgroup *pc, *tmp;
525	int nid = z->zone_pgdat->node_id;
526	int zid = zone_idx(z);
527	struct mem_cgroup_per_zone *mz;
528	int lru = LRU_FILE * !!file + !!active;
529
530	BUG_ON(!mem_cont);
531	mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
532	src = &mz->lists[lru];
533
534	scan = 0;
535	list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
536		if (scan >= nr_to_scan)
537			break;
538
539		page = pc->page;
540		if (unlikely(!PageCgroupUsed(pc)))
541			continue;
542		if (unlikely(!PageLRU(page)))
543			continue;
544
545		scan++;
546		if (__isolate_lru_page(page, mode, file) == 0) {
547			list_move(&page->lru, dst);
548			nr_taken++;
549		}
550	}
551
552	*scanned = scan;
553	return nr_taken;
554}
555
556#define mem_cgroup_from_res_counter(counter, member)	\
557	container_of(counter, struct mem_cgroup, member)
558
559/*
560 * This routine finds the DFS walk successor. This routine should be
561 * called with cgroup_mutex held
562 */
563static struct mem_cgroup *
564mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem)
565{
566	struct cgroup *cgroup, *curr_cgroup, *root_cgroup;
567
568	curr_cgroup = curr->css.cgroup;
569	root_cgroup = root_mem->css.cgroup;
570
571	if (!list_empty(&curr_cgroup->children)) {
572		/*
573		 * Walk down to children
574		 */
575		mem_cgroup_put(curr);
576		cgroup = list_entry(curr_cgroup->children.next,
577						struct cgroup, sibling);
578		curr = mem_cgroup_from_cont(cgroup);
579		mem_cgroup_get(curr);
580		goto done;
581	}
582
583visit_parent:
584	if (curr_cgroup == root_cgroup) {
585		mem_cgroup_put(curr);
586		curr = root_mem;
587		mem_cgroup_get(curr);
588		goto done;
589	}
590
591	/*
592	 * Goto next sibling
593	 */
594	if (curr_cgroup->sibling.next != &curr_cgroup->parent->children) {
595		mem_cgroup_put(curr);
596		cgroup = list_entry(curr_cgroup->sibling.next, struct cgroup,
597						sibling);
598		curr = mem_cgroup_from_cont(cgroup);
599		mem_cgroup_get(curr);
600		goto done;
601	}
602
603	/*
604	 * Go up to next parent and next parent's sibling if need be
605	 */
606	curr_cgroup = curr_cgroup->parent;
607	goto visit_parent;
608
609done:
610	root_mem->last_scanned_child = curr;
611	return curr;
612}
613
614/*
615 * Visit the first child (need not be the first child as per the ordering
616 * of the cgroup list, since we track last_scanned_child) of @root_mem and use
617 * that to reclaim free pages from.
618 */
619static struct mem_cgroup *
620mem_cgroup_get_first_node(struct mem_cgroup *root_mem)
621{
622	struct cgroup *cgroup;
623	struct mem_cgroup *ret;
624	bool obsolete = (root_mem->last_scanned_child &&
625				root_mem->last_scanned_child->obsolete);
626
627	/*
628	 * Scan all children under the mem_cgroup mem
629	 * Scan all children under the mem_cgroup root_mem
630	cgroup_lock();
631	if (list_empty(&root_mem->css.cgroup->children)) {
632		ret = root_mem;
633		goto done;
634	}
635
636	if (!root_mem->last_scanned_child || obsolete) {
637
638		if (obsolete)
639			mem_cgroup_put(root_mem->last_scanned_child);
640
641		cgroup = list_first_entry(&root_mem->css.cgroup->children,
642				struct cgroup, sibling);
643		ret = mem_cgroup_from_cont(cgroup);
644		mem_cgroup_get(ret);
645	} else
646		ret = mem_cgroup_get_next_node(root_mem->last_scanned_child,
647						root_mem);
648
649done:
650	root_mem->last_scanned_child = ret;
651	cgroup_unlock();
652	return ret;
653}
654
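/*
 * True if usage is below the limit: both res and memsw are checked when
 * swap accounting is enabled, otherwise only res.
 */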
655static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
656{
657	if (do_swap_account) {
658		if (res_counter_check_under_limit(&mem->res) &&
659			res_counter_check_under_limit(&mem->memsw))
660			return true;
661	} else
662		if (res_counter_check_under_limit(&mem->res))
663			return true;
664	return false;
665}
666
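/* The root cgroup uses the global vm_swappiness; children use their own value. */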
667static unsigned int get_swappiness(struct mem_cgroup *memcg)
668{
669	struct cgroup *cgrp = memcg->css.cgroup;
670	unsigned int swappiness;
671
672	/* root ? */
673	if (cgrp->parent == NULL)
674		return vm_swappiness;
675
676	spin_lock(&memcg->reclaim_param_lock);
677	swappiness = memcg->swappiness;
678	spin_unlock(&memcg->reclaim_param_lock);
679
680	return swappiness;
681}
682
683/*
684 * Dance down the hierarchy if needed to reclaim memory. We remember the
685 * last child we reclaimed from, so that we don't end up penalizing
686 * one child extensively based on its position in the children list.
687 *
688 * root_mem is the original ancestor that we've been reclaiming from.
689 */
690static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
691						gfp_t gfp_mask, bool noswap)
692{
693	struct mem_cgroup *next_mem;
694	int ret = 0;
695
696	/*
697	 * Reclaim unconditionally and don't check for return value.
698	 * We need to reclaim in the current group and down the tree.
699	 * One might think about checking for children before reclaiming,
700	 * but there might be left over accounting, even after children
701	 * have left.
702	 */
703	ret = try_to_free_mem_cgroup_pages(root_mem, gfp_mask, noswap,
704					   get_swappiness(root_mem));
705	if (mem_cgroup_check_under_limit(root_mem))
706		return 0;
707	if (!root_mem->use_hierarchy)
708		return ret;
709
710	next_mem = mem_cgroup_get_first_node(root_mem);
711
712	while (next_mem != root_mem) {
713		if (next_mem->obsolete) {
714			mem_cgroup_put(next_mem);
715			cgroup_lock();
716			next_mem = mem_cgroup_get_first_node(root_mem);
717			cgroup_unlock();
718			continue;
719		}
720		ret = try_to_free_mem_cgroup_pages(next_mem, gfp_mask, noswap,
721						   get_swappiness(next_mem));
722		if (mem_cgroup_check_under_limit(root_mem))
723			return 0;
724		cgroup_lock();
725		next_mem = mem_cgroup_get_next_node(next_mem, root_mem);
726		cgroup_unlock();
727	}
728	return ret;
729}
730
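/*
 * True if the memcg of @task's mm invoked the OOM killer within the last
 * HZ/10 jiffies (last_oom_jiffies is set in __mem_cgroup_try_charge()).
 */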
731bool mem_cgroup_oom_called(struct task_struct *task)
732{
733	bool ret = false;
734	struct mem_cgroup *mem;
735	struct mm_struct *mm;
736
737	rcu_read_lock();
738	mm = task->mm;
739	if (!mm)
740		mm = &init_mm;
741	mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
742	if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10))
743		ret = true;
744	rcu_read_unlock();
745	return ret;
746}
747/*
748 * Unlike the exported interface, an "oom" parameter is added. If oom==true,
749 * the OOM killer can be invoked.
750 */
751static int __mem_cgroup_try_charge(struct mm_struct *mm,
752			gfp_t gfp_mask, struct mem_cgroup **memcg,
753			bool oom)
754{
755	struct mem_cgroup *mem, *mem_over_limit;
756	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
757	struct res_counter *fail_res;
758
759	if (unlikely(test_thread_flag(TIF_MEMDIE))) {
760		/* Don't account this! */
761		*memcg = NULL;
762		return 0;
763	}
764
765	/*
766	 * We always charge the cgroup the mm_struct belongs to.
767	 * The mm_struct's mem_cgroup changes on task migration if the
768	 * thread group leader migrates. It's possible that mm is not
769	 * set, if so charge the init_mm (happens for pagecache usage).
770	 */
771	if (likely(!*memcg)) {
772		rcu_read_lock();
773		mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
774		if (unlikely(!mem)) {
775			rcu_read_unlock();
776			return 0;
777		}
778		/*
779		 * For every charge from the cgroup, increment reference count
780		 */
781		css_get(&mem->css);
782		*memcg = mem;
783		rcu_read_unlock();
784	} else {
785		mem = *memcg;
786		css_get(&mem->css);
787	}
788
789	while (1) {
790		int ret;
791		bool noswap = false;
792
793		ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res);
794		if (likely(!ret)) {
795			if (!do_swap_account)
796				break;
797			ret = res_counter_charge(&mem->memsw, PAGE_SIZE,
798							&fail_res);
799			if (likely(!ret))
800				break;
801			/* mem+swap counter fails */
802			res_counter_uncharge(&mem->res, PAGE_SIZE);
803			noswap = true;
804			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
805									memsw);
806		} else
807			/* mem counter fails */
808			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
809									res);
810
811		if (!(gfp_mask & __GFP_WAIT))
812			goto nomem;
813
814		ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask,
815							noswap);
816
817		/*
818		 * try_to_free_mem_cgroup_pages() might not give us a full
819		 * picture of reclaim. Some pages are reclaimed and might be
820		 * moved to swap cache or just unmapped from the cgroup.
821		 * Check the limit again to see if the reclaim reduced the
822		 * current usage of the cgroup before giving up
823		 *
824		 */
825		if (mem_cgroup_check_under_limit(mem_over_limit))
826			continue;
827
828		if (!nr_retries--) {
829			if (oom) {
830				mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
831				mem_over_limit->last_oom_jiffies = jiffies;
832			}
833			goto nomem;
834		}
835	}
836	return 0;
837nomem:
838	css_put(&mem->css);
839	return -ENOMEM;
840}
841
842/**
843 * mem_cgroup_try_charge - get charge of PAGE_SIZE.
844 * @mm: the mm_struct which is charged against (when *memcg is NULL).
845 * @gfp_mask: gfp_mask for reclaim.
846 * @memcg: a pointer to memory cgroup which is charged against.
847 *
848 * Charge against the memory cgroup pointed to by *memcg. If *memcg == NULL,
849 * the memory cgroup is looked up from @mm and stored in *memcg.
850 *
851 * Returns 0 on success, -ENOMEM on failure.
852 * This call can invoke OOM-Killer.
853 */
854
855int mem_cgroup_try_charge(struct mm_struct *mm,
856			  gfp_t mask, struct mem_cgroup **memcg)
857{
858	return __mem_cgroup_try_charge(mm, mask, memcg, true);
859}
860
861/*
862 * Commit a charge obtained by mem_cgroup_try_charge() and make the page_cgroup
863 * enter the USED state. If it is already USED, uncharge and return.
864 */
865
866static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
867				     struct page_cgroup *pc,
868				     enum charge_type ctype)
869{
870	/* try_charge() can return NULL in *memcg; handle that case here. */
871	if (!mem)
872		return;
873
874	lock_page_cgroup(pc);
875	if (unlikely(PageCgroupUsed(pc))) {
876		unlock_page_cgroup(pc);
877		res_counter_uncharge(&mem->res, PAGE_SIZE);
878		if (do_swap_account)
879			res_counter_uncharge(&mem->memsw, PAGE_SIZE);
880		css_put(&mem->css);
881		return;
882	}
883	pc->mem_cgroup = mem;
884	smp_wmb();
885	pc->flags = pcg_default_flags[ctype];
886
887	mem_cgroup_charge_statistics(mem, pc, true);
888
889	unlock_page_cgroup(pc);
890}
891
892/**
893 * mem_cgroup_move_account - move account of the page
894 * @pc:	page_cgroup of the page.
895 * @from: mem_cgroup which the page is moved from.
896 * @to:	mem_cgroup which the page is moved to. @from != @to.
897 *
898 * The caller must confirm the following:
899 * - the page is not on the LRU (isolate_lru_page() is useful.)
900 *
901 * Returns 0 on success,
902 * returns -EBUSY when the lock is busy or "pc" is unstable.
903 *
904 * This function does "uncharge" from the old cgroup but doesn't do "charge" to
905 * the new cgroup. That should be done by the caller.
906 */
907
908static int mem_cgroup_move_account(struct page_cgroup *pc,
909	struct mem_cgroup *from, struct mem_cgroup *to)
910{
911	struct mem_cgroup_per_zone *from_mz, *to_mz;
912	int nid, zid;
913	int ret = -EBUSY;
914
915	VM_BUG_ON(from == to);
916	VM_BUG_ON(PageLRU(pc->page));
917
918	nid = page_cgroup_nid(pc);
919	zid = page_cgroup_zid(pc);
920	from_mz =  mem_cgroup_zoneinfo(from, nid, zid);
921	to_mz =  mem_cgroup_zoneinfo(to, nid, zid);
922
923	if (!trylock_page_cgroup(pc))
924		return ret;
925
926	if (!PageCgroupUsed(pc))
927		goto out;
928
929	if (pc->mem_cgroup != from)
930		goto out;
931
932	css_put(&from->css);
933	res_counter_uncharge(&from->res, PAGE_SIZE);
934	mem_cgroup_charge_statistics(from, pc, false);
935	if (do_swap_account)
936		res_counter_uncharge(&from->memsw, PAGE_SIZE);
937	pc->mem_cgroup = to;
938	mem_cgroup_charge_statistics(to, pc, true);
939	css_get(&to->css);
940	ret = 0;
941out:
942	unlock_page_cgroup(pc);
943	return ret;
944}
945
946/*
947 * move charges to its parent.
948 */
949
950static int mem_cgroup_move_parent(struct page_cgroup *pc,
951				  struct mem_cgroup *child,
952				  gfp_t gfp_mask)
953{
954	struct page *page = pc->page;
955	struct cgroup *cg = child->css.cgroup;
956	struct cgroup *pcg = cg->parent;
957	struct mem_cgroup *parent;
958	int ret;
959
960	/* Is ROOT ? */
961	if (!pcg)
962		return -EINVAL;
963
964
965	parent = mem_cgroup_from_cont(pcg);
966
967
968	ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
969	if (ret || !parent)
970		return ret;
971
972	if (!get_page_unless_zero(page))
973		return -EBUSY;
974
975	ret = isolate_lru_page(page);
976
977	if (ret)
978		goto cancel;
979
980	ret = mem_cgroup_move_account(pc, child, parent);
981
982	/* drop the extra refcnt taken by try_charge() (move_account took one itself) */
983	css_put(&parent->css);
984	putback_lru_page(page);
985	if (!ret) {
986		put_page(page);
987		return 0;
988	}
989	/* uncharge if move fails */
990cancel:
991	res_counter_uncharge(&parent->res, PAGE_SIZE);
992	if (do_swap_account)
993		res_counter_uncharge(&parent->memsw, PAGE_SIZE);
994	put_page(page);
995	return ret;
996}
997
998/*
999 * Charge the memory controller for page usage.
1000 * Return
1001 * 0 if the charge was successful
1002 * < 0 if the cgroup is over its limit
1003 */
1004static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
1005				gfp_t gfp_mask, enum charge_type ctype,
1006				struct mem_cgroup *memcg)
1007{
1008	struct mem_cgroup *mem;
1009	struct page_cgroup *pc;
1010	int ret;
1011
1012	pc = lookup_page_cgroup(page);
1013	/* can happen at boot */
1014	if (unlikely(!pc))
1015		return 0;
1016	prefetchw(pc);
1017
1018	mem = memcg;
1019	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
1020	if (ret || !mem)
1021		return ret;
1022
1023	__mem_cgroup_commit_charge(mem, pc, ctype);
1024	return 0;
1025}
1026
1027int mem_cgroup_newpage_charge(struct page *page,
1028			      struct mm_struct *mm, gfp_t gfp_mask)
1029{
1030	if (mem_cgroup_disabled())
1031		return 0;
1032	if (PageCompound(page))
1033		return 0;
1034	/*
1035	 * If already mapped, we don't have to account.
1036	 * If it is page cache, page->mapping has an address_space.
1037	 * But page->mapping may hold a stale anon_vma pointer;
1038	 * detect that with the PageAnon() check. A newly-mapped anon
1039	 * page's page->mapping is NULL.
1040	 */
1041	if (page_mapped(page) || (page->mapping && !PageAnon(page)))
1042		return 0;
1043	if (unlikely(!mm))
1044		mm = &init_mm;
1045	return mem_cgroup_charge_common(page, mm, gfp_mask,
1046				MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
1047}
1048
1049int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
1050				gfp_t gfp_mask)
1051{
1052	if (mem_cgroup_disabled())
1053		return 0;
1054	if (PageCompound(page))
1055		return 0;
1056	/*
1057	 * Corner case handling. This is usually called from
1058	 * add_to_page_cache(). But some filesystems (shmem) precharge the page
1059	 * before calling it and call add_to_page_cache() with GFP_NOWAIT.
1060	 *
1061	 * In the GFP_NOWAIT case, the page may be pre-charged before calling
1062	 * add_to_page_cache() (see shmem.c). Check it here and avoid charging
1063	 * twice. (It works, but at a slightly larger cost.)
1064	 */
1065	if (!(gfp_mask & __GFP_WAIT)) {
1066		struct page_cgroup *pc;
1067
1068
1069		pc = lookup_page_cgroup(page);
1070		if (!pc)
1071			return 0;
1072		lock_page_cgroup(pc);
1073		if (PageCgroupUsed(pc)) {
1074			unlock_page_cgroup(pc);
1075			return 0;
1076		}
1077		unlock_page_cgroup(pc);
1078	}
1079
1080	if (unlikely(!mm))
1081		mm = &init_mm;
1082
1083	if (page_is_file_cache(page))
1084		return mem_cgroup_charge_common(page, mm, gfp_mask,
1085				MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
1086	else
1087		return mem_cgroup_charge_common(page, mm, gfp_mask,
1088				MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL);
1089}
1090
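/*
 * Charge a page being swapped in. With swap accounting, prefer the memcg
 * recorded in swap_cgroup for this entry; otherwise charge the memcg of
 * the faulting mm.
 */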
1091int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
1092				 struct page *page,
1093				 gfp_t mask, struct mem_cgroup **ptr)
1094{
1095	struct mem_cgroup *mem;
1096	swp_entry_t     ent;
1097
1098	if (mem_cgroup_disabled())
1099		return 0;
1100
1101	if (!do_swap_account)
1102		goto charge_cur_mm;
1103
1104	/*
1105	 * A racing thread's fault, or swapoff, may have already updated
1106	 * the pte, and even removed page from swap cache: return success
1107	 * to go on to do_swap_page()'s pte_same() test, which should fail.
1108	 */
1109	if (!PageSwapCache(page))
1110		return 0;
1111
1112	ent.val = page_private(page);
1113
1114	mem = lookup_swap_cgroup(ent);
1115	if (!mem || mem->obsolete)
1116		goto charge_cur_mm;
1117	*ptr = mem;
1118	return __mem_cgroup_try_charge(NULL, mask, ptr, true);
1119charge_cur_mm:
1120	if (unlikely(!mm))
1121		mm = &init_mm;
1122	return __mem_cgroup_try_charge(mm, mask, ptr, true);
1123}
1124
1125#ifdef CONFIG_SWAP
1126
1127int mem_cgroup_cache_charge_swapin(struct page *page,
1128			struct mm_struct *mm, gfp_t mask, bool locked)
1129{
1130	int ret = 0;
1131
1132	if (mem_cgroup_disabled())
1133		return 0;
1134	if (unlikely(!mm))
1135		mm = &init_mm;
1136	if (!locked)
1137		lock_page(page);
1138	/*
1139	 * If not locked, the page can be dropped from SwapCache before
1140	 * we reach here.
1141	 */
1142	if (PageSwapCache(page)) {
1143		struct mem_cgroup *mem = NULL;
1144		swp_entry_t ent;
1145
1146		ent.val = page_private(page);
1147		if (do_swap_account) {
1148			mem = lookup_swap_cgroup(ent);
1149			if (mem && mem->obsolete)
1150				mem = NULL;
1151			if (mem)
1152				mm = NULL;
1153		}
1154		ret = mem_cgroup_charge_common(page, mm, mask,
1155				MEM_CGROUP_CHARGE_TYPE_SHMEM, mem);
1156
1157		if (!ret && do_swap_account) {
1158			/* avoid double counting */
1159			mem = swap_cgroup_record(ent, NULL);
1160			if (mem) {
1161				res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1162				mem_cgroup_put(mem);
1163			}
1164		}
1165	}
1166	if (!locked)
1167		unlock_page(page);
1168	/* add this page(page_cgroup) to the LRU we want. */
1169	mem_cgroup_lru_fixup(page);
1170
1171	return ret;
1172}
1173#endif
1174
1175void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
1176{
1177	struct page_cgroup *pc;
1178
1179	if (mem_cgroup_disabled())
1180		return;
1181	if (!ptr)
1182		return;
1183	pc = lookup_page_cgroup(page);
1184	__mem_cgroup_commit_charge(ptr, pc, MEM_CGROUP_CHARGE_TYPE_MAPPED);
1185	/*
1186	 * Now the swap entry is in memory. This means this page may be
1187	 * counted both as mem and swap, i.e., double counted.
1188	 * Fix it by uncharging from memsw. This SwapCache is stable
1189	 * because we're still under lock_page().
1190	 */
1191	if (do_swap_account) {
1192		swp_entry_t ent = {.val = page_private(page)};
1193		struct mem_cgroup *memcg;
1194		memcg = swap_cgroup_record(ent, NULL);
1195		if (memcg) {
1196			/* If memcg is obsolete, memcg can be != ptr */
1197			res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
1198			mem_cgroup_put(memcg);
1199		}
1200
1201	}
1202	/* add this page(page_cgroup) to the LRU we want. */
1203	mem_cgroup_lru_fixup(page);
1204}
1205
1206void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
1207{
1208	if (mem_cgroup_disabled())
1209		return;
1210	if (!mem)
1211		return;
1212	res_counter_uncharge(&mem->res, PAGE_SIZE);
1213	if (do_swap_account)
1214		res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1215	css_put(&mem->css);
1216}
1217
1218
1219/*
1220 * uncharge if !page_mapped(page)
1221 */
1222static struct mem_cgroup *
1223__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
1224{
1225	struct page_cgroup *pc;
1226	struct mem_cgroup *mem = NULL;
1227	struct mem_cgroup_per_zone *mz;
1228
1229	if (mem_cgroup_disabled())
1230		return NULL;
1231
1232	if (PageSwapCache(page))
1233		return NULL;
1234
1235	/*
1236	 * Check if our page_cgroup is valid
1237	 */
1238	pc = lookup_page_cgroup(page);
1239	if (unlikely(!pc || !PageCgroupUsed(pc)))
1240		return NULL;
1241
1242	lock_page_cgroup(pc);
1243
1244	mem = pc->mem_cgroup;
1245
1246	if (!PageCgroupUsed(pc))
1247		goto unlock_out;
1248
1249	switch (ctype) {
1250	case MEM_CGROUP_CHARGE_TYPE_MAPPED:
1251		if (page_mapped(page))
1252			goto unlock_out;
1253		break;
1254	case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
1255		if (!PageAnon(page)) {	/* Shared memory */
1256			if (page->mapping && !page_is_file_cache(page))
1257				goto unlock_out;
1258		} else if (page_mapped(page)) /* Anon */
1259				goto unlock_out;
1260		break;
1261	default:
1262		break;
1263	}
1264
1265	res_counter_uncharge(&mem->res, PAGE_SIZE);
1266	if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
1267		res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1268
1269	mem_cgroup_charge_statistics(mem, pc, false);
1270	ClearPageCgroupUsed(pc);
1271
1272	mz = page_cgroup_zoneinfo(pc);
1273	unlock_page_cgroup(pc);
1274
1275	/* at swapout, this memcg will be accessed to record to swap */
1276	if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
1277		css_put(&mem->css);
1278
1279	return mem;
1280
1281unlock_out:
1282	unlock_page_cgroup(pc);
1283	return NULL;
1284}
1285
1286void mem_cgroup_uncharge_page(struct page *page)
1287{
1288	/* early check. */
1289	if (page_mapped(page))
1290		return;
1291	if (page->mapping && !PageAnon(page))
1292		return;
1293	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
1294}
1295
1296void mem_cgroup_uncharge_cache_page(struct page *page)
1297{
1298	VM_BUG_ON(page_mapped(page));
1299	VM_BUG_ON(page->mapping);
1300	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
1301}
1302
1303/*
1304 * Called from __delete_from_swap_cache(); drops the "page" account.
1305 * The memcg information is recorded in the swap_cgroup of "ent".
1306 */
1307void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent)
1308{
1309	struct mem_cgroup *memcg;
1310
1311	memcg = __mem_cgroup_uncharge_common(page,
1312					MEM_CGROUP_CHARGE_TYPE_SWAPOUT);
1313	/* record memcg information */
1314	if (do_swap_account && memcg) {
1315		swap_cgroup_record(ent, memcg);
1316		mem_cgroup_get(memcg);
1317	}
1318	if (memcg)
1319		css_put(&memcg->css);
1320}
1321
1322#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
1323/*
1324 * Called from swap_entry_free(). Removes the record in swap_cgroup and
1325 * uncharges the "memsw" account.
1326 */
1327void mem_cgroup_uncharge_swap(swp_entry_t ent)
1328{
1329	struct mem_cgroup *memcg;
1330
1331	if (!do_swap_account)
1332		return;
1333
1334	memcg = swap_cgroup_record(ent, NULL);
1335	if (memcg) {
1336		res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
1337		mem_cgroup_put(memcg);
1338	}
1339}
1340#endif
1341
1342/*
1343 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
1344 * page belongs to.
1345 */
1346int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
1347{
1348	struct page_cgroup *pc;
1349	struct mem_cgroup *mem = NULL;
1350	int ret = 0;
1351
1352	if (mem_cgroup_disabled())
1353		return 0;
1354
1355	pc = lookup_page_cgroup(page);
1356	lock_page_cgroup(pc);
1357	if (PageCgroupUsed(pc)) {
1358		mem = pc->mem_cgroup;
1359		css_get(&mem->css);
1360	}
1361	unlock_page_cgroup(pc);
1362
1363	if (mem) {
1364		ret = mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem);
1365		css_put(&mem->css);
1366	}
1367	*ptr = mem;
1368	return ret;
1369}
1370
1371/* remove the redundant charge if migration failed */
1372void mem_cgroup_end_migration(struct mem_cgroup *mem,
1373		struct page *oldpage, struct page *newpage)
1374{
1375	struct page *target, *unused;
1376	struct page_cgroup *pc;
1377	enum charge_type ctype;
1378
1379	if (!mem)
1380		return;
1381
1382	/* at migration success, oldpage->mapping is NULL. */
1383	if (oldpage->mapping) {
1384		target = oldpage;
1385		unused = NULL;
1386	} else {
1387		target = newpage;
1388		unused = oldpage;
1389	}
1390
1391	if (PageAnon(target))
1392		ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
1393	else if (page_is_file_cache(target))
1394		ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
1395	else
1396		ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
1397
1398	/* unused page is not on radix-tree now. */
1399	if (unused)
1400		__mem_cgroup_uncharge_common(unused, ctype);
1401
1402	pc = lookup_page_cgroup(target);
1403	/*
1404	 * __mem_cgroup_commit_charge() checks the PCG_USED bit of the page_cgroup,
1405	 * so double-counting is effectively avoided.
1406	 */
1407	__mem_cgroup_commit_charge(mem, pc, ctype);
1408
1409	/*
1410	 * Both oldpage and newpage are still under lock_page(),
1411	 * so we don't have to care about races in the radix-tree.
1412	 * But we do have to be careful about whether this page is mapped.
1413	 *
1414	 * There is a case for !page_mapped(): at the start of
1415	 * migration, oldpage was mapped, but now it has been zapped.
1416	 * We know the *target* page is not freed/reused under us.
1417	 * mem_cgroup_uncharge_page() does all necessary checks.
1418	 */
1419	if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
1420		mem_cgroup_uncharge_page(target);
1421}
1422
1423/*
1424 * A call to try to shrink memory usage under the specified resource controller.
1425 * This is typically used for page reclaim on shmem, to reduce the side
1426 * effects of page allocation from shmem, which is used by some mem_cgroups.
1427 */
1428int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
1429{
1430	struct mem_cgroup *mem;
1431	int progress = 0;
1432	int retry = MEM_CGROUP_RECLAIM_RETRIES;
1433
1434	if (mem_cgroup_disabled())
1435		return 0;
1436	if (!mm)
1437		return 0;
1438
1439	rcu_read_lock();
1440	mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
1441	if (unlikely(!mem)) {
1442		rcu_read_unlock();
1443		return 0;
1444	}
1445	css_get(&mem->css);
1446	rcu_read_unlock();
1447
1448	do {
1449		progress = try_to_free_mem_cgroup_pages(mem, gfp_mask, true,
1450							get_swappiness(mem));
1451		progress += mem_cgroup_check_under_limit(mem);
1452	} while (!progress && --retry);
1453
1454	css_put(&mem->css);
1455	if (!retry)
1456		return -ENOMEM;
1457	return 0;
1458}
1459
1460static DEFINE_MUTEX(set_limit_mutex);
1461
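/*
 * Set a new memory limit, reclaiming from the cgroup while its usage is
 * above the new value. Gives up after a few reclaim attempts that make
 * no progress, or on a pending signal.
 */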
1462static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
1463				unsigned long long val)
1464{
1465
1466	int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
1467	int progress;
1468	u64 memswlimit;
1469	int ret = 0;
1470
1471	while (retry_count) {
1472		if (signal_pending(current)) {
1473			ret = -EINTR;
1474			break;
1475		}
1476		/*
1477		 * Rather than hiding all of this in some function, do it in
1478		 * an open-coded manner so it is clear what really happens.
1479		 * We have to guarantee mem->res.limit < mem->memsw.limit.
1480		 */
1481		mutex_lock(&set_limit_mutex);
1482		memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1483		if (memswlimit < val) {
1484			ret = -EINVAL;
1485			mutex_unlock(&set_limit_mutex);
1486			break;
1487		}
1488		ret = res_counter_set_limit(&memcg->res, val);
1489		mutex_unlock(&set_limit_mutex);
1490
1491		if (!ret)
1492			break;
1493
1494		progress = try_to_free_mem_cgroup_pages(memcg,
1495							GFP_KERNEL,
1496							false,
1497							get_swappiness(memcg));
1498		if (!progress)
			retry_count--;
1499	}
1500
1501	return ret;
1502}
1503
1504int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
1505				unsigned long long val)
1506{
1507	int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
1508	u64 memlimit, oldusage, curusage;
1509	int ret;
1510
1511	if (!do_swap_account)
1512		return -EINVAL;
1513
1514	while (retry_count) {
1515		if (signal_pending(current)) {
1516			ret = -EINTR;
1517			break;
1518		}
1519		/*
1520		 * Rather than hiding all of this in some function, do it in
1521		 * an open-coded manner so it is clear what really happens.
1522		 * We have to guarantee mem->res.limit < mem->memsw.limit.
1523		 */
1524		mutex_lock(&set_limit_mutex);
1525		memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
1526		if (memlimit > val) {
1527			ret = -EINVAL;
1528			mutex_unlock(&set_limit_mutex);
1529			break;
1530		}
1531		ret = res_counter_set_limit(&memcg->memsw, val);
1532		mutex_unlock(&set_limit_mutex);
1533
1534		if (!ret)
1535			break;
1536
1537		oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
1538		try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL, true,
1539					     get_swappiness(memcg));
1540		curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
1541		if (curusage >= oldusage)
1542			retry_count--;
1543	}
1544	return ret;
1545}
1546
1547/*
1548 * This routine traverses the page_cgroups on the given list and drops them all.
1549 * *And* this routine doesn't reclaim the pages themselves, it just removes the page_cgroups.
1550 */
1551static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
1552				int node, int zid, enum lru_list lru)
1553{
1554	struct zone *zone;
1555	struct mem_cgroup_per_zone *mz;
1556	struct page_cgroup *pc, *busy;
1557	unsigned long flags, loop;
1558	struct list_head *list;
1559	int ret = 0;
1560
1561	zone = &NODE_DATA(node)->node_zones[zid];
1562	mz = mem_cgroup_zoneinfo(mem, node, zid);
1563	list = &mz->lists[lru];
1564
1565	loop = MEM_CGROUP_ZSTAT(mz, lru);
1566	/* give some margin against EBUSY etc...*/
1567	loop += 256;
1568	busy = NULL;
1569	while (loop--) {
1570		ret = 0;
1571		spin_lock_irqsave(&zone->lru_lock, flags);
1572		if (list_empty(list)) {
1573			spin_unlock_irqrestore(&zone->lru_lock, flags);
1574			break;
1575		}
1576		pc = list_entry(list->prev, struct page_cgroup, lru);
1577		if (busy == pc) {
1578			list_move(&pc->lru, list);
1579			busy = NULL;
1580			spin_unlock_irqrestore(&zone->lru_lock, flags);
1581			continue;
1582		}
1583		spin_unlock_irqrestore(&zone->lru_lock, flags);
1584
1585		ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL);
1586		if (ret == -ENOMEM)
1587			break;
1588
1589		if (ret == -EBUSY || ret == -EINVAL) {
1590			/* found lock contention or "pc" is obsolete. */
1591			busy = pc;
1592			cond_resched();
1593		} else
1594			busy = NULL;
1595	}
1596
1597	if (!ret && !list_empty(list))
1598		return -EBUSY;
1599	return ret;
1600}
1601
1602/*
1603 * Make the mem_cgroup's charge 0 if there is no task.
1604 * This enables deleting this mem_cgroup.
1605 */
1606static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
1607{
1608	int ret;
1609	int node, zid, shrink;
1610	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
1611	struct cgroup *cgrp = mem->css.cgroup;
1612
1613	css_get(&mem->css);
1614
1615	shrink = 0;
1616	/* should free all ? */
1617	if (free_all)
1618		goto try_to_free;
1619move_account:
1620	while (mem->res.usage > 0) {
1621		ret = -EBUSY;
1622		if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
1623			goto out;
1624		ret = -EINTR;
1625		if (signal_pending(current))
1626			goto out;
1627		/* This is for making all *used* pages to be on LRU. */
1628		lru_add_drain_all();
1629		ret = 0;
1630		for_each_node_state(node, N_POSSIBLE) {
1631			for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
1632				enum lru_list l;
1633				for_each_lru(l) {
1634					ret = mem_cgroup_force_empty_list(mem,
1635							node, zid, l);
1636					if (ret)
1637						break;
1638				}
1639			}
1640			if (ret)
1641				break;
1642		}
1643		/* it seems parent cgroup doesn't have enough mem */
1644		if (ret == -ENOMEM)
1645			goto try_to_free;
1646		cond_resched();
1647	}
1648	ret = 0;
1649out:
1650	css_put(&mem->css);
1651	return ret;
1652
1653try_to_free:
1654	/* returns EBUSY if there is a task or if we come here twice. */
1655	if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
1656		ret = -EBUSY;
1657		goto out;
1658	}
1659	/* we call try-to-free pages to make this cgroup empty */
1660	lru_add_drain_all();
1661	/* try to free all pages in this cgroup */
1662	shrink = 1;
1663	while (nr_retries && mem->res.usage > 0) {
1664		int progress;
1665
1666		if (signal_pending(current)) {
1667			ret = -EINTR;
1668			goto out;
1669		}
1670		progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
1671						false, get_swappiness(mem));
1672		if (!progress) {
1673			nr_retries--;
1674			/* maybe some writeback is necessary */
1675			congestion_wait(WRITE, HZ/10);
1676		}
1677
1678	}
1679	lru_add_drain();
1680	/* try move_account...there may be some *locked* pages. */
1681	if (mem->res.usage)
1682		goto move_account;
1683	ret = 0;
1684	goto out;
1685}
1686
1687int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
1688{
1689	return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
1690}
1691
1692
1693static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)
1694{
1695	return mem_cgroup_from_cont(cont)->use_hierarchy;
1696}
1697
1698static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
1699					u64 val)
1700{
1701	int retval = 0;
1702	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
1703	struct cgroup *parent = cont->parent;
1704	struct mem_cgroup *parent_mem = NULL;
1705
1706	if (parent)
1707		parent_mem = mem_cgroup_from_cont(parent);
1708
1709	cgroup_lock();
1710	/*
1711	 * If the parent's use_hierarchy is set, we can't make any modifications
1712	 * in the child subtrees. If it is unset, then the change can
1713	 * occur, provided the current cgroup has no children.
1714	 *
1715	 * For the root cgroup, parent_mem is NULL; we allow the value to be
1716	 * set if there are no children.
1717	 */
1718	if ((!parent_mem || !parent_mem->use_hierarchy) &&
1719				(val == 1 || val == 0)) {
1720		if (list_empty(&cont->children))
1721			mem->use_hierarchy = val;
1722		else
1723			retval = -EBUSY;
1724	} else
1725		retval = -EINVAL;
1726	cgroup_unlock();
1727
1728	return retval;
1729}
1730
1731static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
1732{
1733	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
1734	u64 val = 0;
1735	int type, name;
1736
1737	type = MEMFILE_TYPE(cft->private);
1738	name = MEMFILE_ATTR(cft->private);
1739	switch (type) {
1740	case _MEM:
1741		val = res_counter_read_u64(&mem->res, name);
1742		break;
1743	case _MEMSWAP:
1744		if (do_swap_account)
1745			val = res_counter_read_u64(&mem->memsw, name);
1746		break;
1747	default:
1748		BUG();
1749		break;
1750	}
1751	return val;
1752}
1753/*
1754 * The user of this function is...
1755 * RES_LIMIT.
1756 */
1757static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
1758			    const char *buffer)
1759{
1760	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
1761	int type, name;
1762	unsigned long long val;
1763	int ret;
1764
1765	type = MEMFILE_TYPE(cft->private);
1766	name = MEMFILE_ATTR(cft->private);
1767	switch (name) {
1768	case RES_LIMIT:
1769		/* This function does all necessary parse...reuse it */
1770		ret = res_counter_memparse_write_strategy(buffer, &val);
1771		if (ret)
1772			break;
1773		if (type == _MEM)
1774			ret = mem_cgroup_resize_limit(memcg, val);
1775		else
1776			ret = mem_cgroup_resize_memsw_limit(memcg, val);
1777		break;
1778	default:
1779		ret = -EINVAL; /* should be BUG() ? */
1780		break;
1781	}
1782	return ret;
1783}
1784
1785static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
1786{
1787	struct mem_cgroup *mem;
1788	int type, name;
1789
1790	mem = mem_cgroup_from_cont(cont);
1791	type = MEMFILE_TYPE(event);
1792	name = MEMFILE_ATTR(event);
1793	switch (name) {
1794	case RES_MAX_USAGE:
1795		if (type == _MEM)
1796			res_counter_reset_max(&mem->res);
1797		else
1798			res_counter_reset_max(&mem->memsw);
1799		break;
1800	case RES_FAILCNT:
1801		if (type == _MEM)
1802			res_counter_reset_failcnt(&mem->res);
1803		else
1804			res_counter_reset_failcnt(&mem->memsw);
1805		break;
1806	}
1807	return 0;
1808}
1809
1810static const struct mem_cgroup_stat_desc {
1811	const char *msg;
1812	u64 unit;
1813} mem_cgroup_stat_desc[] = {
1814	[MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, },
1815	[MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, },
1816	[MEM_CGROUP_STAT_PGPGIN_COUNT] = {"pgpgin", 1, },
1817	[MEM_CGROUP_STAT_PGPGOUT_COUNT] = {"pgpgout", 1, },
1818};
1819
1820static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
1821				 struct cgroup_map_cb *cb)
1822{
1823	struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
1824	struct mem_cgroup_stat *stat = &mem_cont->stat;
1825	int i;
1826
1827	for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) {
1828		s64 val;
1829
1830		val = mem_cgroup_read_stat(stat, i);
1831		val *= mem_cgroup_stat_desc[i].unit;
1832		cb->fill(cb, mem_cgroup_stat_desc[i].msg, val);
1833	}
1834	/* showing # of active pages */
1835	{
1836		unsigned long active_anon, inactive_anon;
1837		unsigned long active_file, inactive_file;
1838		unsigned long unevictable;
1839
1840		inactive_anon = mem_cgroup_get_all_zonestat(mem_cont,
1841						LRU_INACTIVE_ANON);
1842		active_anon = mem_cgroup_get_all_zonestat(mem_cont,
1843						LRU_ACTIVE_ANON);
1844		inactive_file = mem_cgroup_get_all_zonestat(mem_cont,
1845						LRU_INACTIVE_FILE);
1846		active_file = mem_cgroup_get_all_zonestat(mem_cont,
1847						LRU_ACTIVE_FILE);
1848		unevictable = mem_cgroup_get_all_zonestat(mem_cont,
1849							LRU_UNEVICTABLE);
1850
1851		cb->fill(cb, "active_anon", (active_anon) * PAGE_SIZE);
1852		cb->fill(cb, "inactive_anon", (inactive_anon) * PAGE_SIZE);
1853		cb->fill(cb, "active_file", (active_file) * PAGE_SIZE);
1854		cb->fill(cb, "inactive_file", (inactive_file) * PAGE_SIZE);
1855		cb->fill(cb, "unevictable", unevictable * PAGE_SIZE);
1856
1857	}
1858
1859#ifdef CONFIG_DEBUG_VM
1860	cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));
1861
1862	{
1863		int nid, zid;
1864		struct mem_cgroup_per_zone *mz;
1865		unsigned long recent_rotated[2] = {0, 0};
1866		unsigned long recent_scanned[2] = {0, 0};
1867
1868		for_each_online_node(nid)
1869			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1870				mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
1871
1872				recent_rotated[0] +=
1873					mz->reclaim_stat.recent_rotated[0];
1874				recent_rotated[1] +=
1875					mz->reclaim_stat.recent_rotated[1];
1876				recent_scanned[0] +=
1877					mz->reclaim_stat.recent_scanned[0];
1878				recent_scanned[1] +=
1879					mz->reclaim_stat.recent_scanned[1];
1880			}
1881		cb->fill(cb, "recent_rotated_anon", recent_rotated[0]);
1882		cb->fill(cb, "recent_rotated_file", recent_rotated[1]);
1883		cb->fill(cb, "recent_scanned_anon", recent_scanned[0]);
1884		cb->fill(cb, "recent_scanned_file", recent_scanned[1]);
1885	}
1886#endif
1887
1888	return 0;
1889}
1890
1891static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
1892{
1893	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
1894
1895	return get_swappiness(memcg);
1896}
1897
1898static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
1899				       u64 val)
1900{
1901	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
1902	struct mem_cgroup *parent;
1903	if (val > 100)
1904		return -EINVAL;
1905
1906	if (cgrp->parent == NULL)
1907		return -EINVAL;
1908
1909	parent = mem_cgroup_from_cont(cgrp->parent);
1910	/* If under hierarchy, only empty-root can set this value */
1911	if ((parent->use_hierarchy) ||
1912	    (memcg->use_hierarchy && !list_empty(&cgrp->children)))
1913		return -EINVAL;
1914
1915	spin_lock(&memcg->reclaim_param_lock);
1916	memcg->swappiness = val;
1917	spin_unlock(&memcg->reclaim_param_lock);
1918
1919	return 0;
1920}
1921
1922
1923static struct cftype mem_cgroup_files[] = {
1924	{
1925		.name = "usage_in_bytes",
1926		.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
1927		.read_u64 = mem_cgroup_read,
1928	},
1929	{
1930		.name = "max_usage_in_bytes",
1931		.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
1932		.trigger = mem_cgroup_reset,
1933		.read_u64 = mem_cgroup_read,
1934	},
1935	{
1936		.name = "limit_in_bytes",
1937		.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
1938		.write_string = mem_cgroup_write,
1939		.read_u64 = mem_cgroup_read,
1940	},
1941	{
1942		.name = "failcnt",
1943		.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
1944		.trigger = mem_cgroup_reset,
1945		.read_u64 = mem_cgroup_read,
1946	},
1947	{
1948		.name = "stat",
1949		.read_map = mem_control_stat_show,
1950	},
1951	{
1952		.name = "force_empty",
1953		.trigger = mem_cgroup_force_empty_write,
1954	},
1955	{
1956		.name = "use_hierarchy",
1957		.write_u64 = mem_cgroup_hierarchy_write,
1958		.read_u64 = mem_cgroup_hierarchy_read,
1959	},
1960	{
1961		.name = "swappiness",
1962		.read_u64 = mem_cgroup_swappiness_read,
1963		.write_u64 = mem_cgroup_swappiness_write,
1964	},
1965};
1966
1967#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
1968static struct cftype memsw_cgroup_files[] = {
1969	{
1970		.name = "memsw.usage_in_bytes",
1971		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
1972		.read_u64 = mem_cgroup_read,
1973	},
1974	{
1975		.name = "memsw.max_usage_in_bytes",
1976		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
1977		.trigger = mem_cgroup_reset,
1978		.read_u64 = mem_cgroup_read,
1979	},
1980	{
1981		.name = "memsw.limit_in_bytes",
1982		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
1983		.write_string = mem_cgroup_write,
1984		.read_u64 = mem_cgroup_read,
1985	},
1986	{
1987		.name = "memsw.failcnt",
1988		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
1989		.trigger = mem_cgroup_reset,
1990		.read_u64 = mem_cgroup_read,
1991	},
1992};
1993
1994static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
1995{
1996	if (!do_swap_account)
1997		return 0;
1998	return cgroup_add_files(cont, ss, memsw_cgroup_files,
1999				ARRAY_SIZE(memsw_cgroup_files));
2000};
2001#else
2002static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
2003{
2004	return 0;
2005}
2006#endif
2007
2008static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
2009{
2010	struct mem_cgroup_per_node *pn;
2011	struct mem_cgroup_per_zone *mz;
2012	enum lru_list l;
2013	int zone, tmp = node;
2014	/*
2015	 * This routine is called against all possible nodes.
2016	 * But it's a BUG to call kmalloc() against an offline node.
2017	 *
2018	 * TODO: this routine can waste much memory for nodes which will
2019	 *       never be onlined. It's better to use memory hotplug callback
2020	 *       function.
2021	 */
2022	if (!node_state(node, N_NORMAL_MEMORY))
2023		tmp = -1;
2024	pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
2025	if (!pn)
2026		return 1;
2027
2028	mem->info.nodeinfo[node] = pn;
2029	memset(pn, 0, sizeof(*pn));
2030
2031	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
2032		mz = &pn->zoneinfo[zone];
2033		for_each_lru(l)
2034			INIT_LIST_HEAD(&mz->lists[l]);
2035	}
2036	return 0;
2037}
2038
2039static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
2040{
2041	kfree(mem->info.nodeinfo[node]);
2042}
2043
2044static int mem_cgroup_size(void)
2045{
2046	int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu);
2047	return sizeof(struct mem_cgroup) + cpustat_size;
2048}
2049
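/*
 * The mem_cgroup and its per-cpu statistics are allocated as one block:
 * kmalloc() when it fits within a page, vmalloc() otherwise.
 */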
2050static struct mem_cgroup *mem_cgroup_alloc(void)
2051{
2052	struct mem_cgroup *mem;
2053	int size = mem_cgroup_size();
2054
2055	if (size < PAGE_SIZE)
2056		mem = kmalloc(size, GFP_KERNEL);
2057	else
2058		mem = vmalloc(size);
2059
2060	if (mem)
2061		memset(mem, 0, size);
2062	return mem;
2063}
2064
2065/*
2066 * When a mem_cgroup is being destroyed, references from swap_cgroup can remain.
2067 * (scanning all entries at force_empty is too costly...)
2068 *
2069 * Instead of clearing all references at force_empty, we remember
2070 * the number of references from swap_cgroup and free the mem_cgroup when
2071 * it goes down to 0.
2072 *
2073 * When the mem_cgroup is destroyed, mem->obsolete is set, and swap entries
2074 * which point to this memcg will be ignored at swapin.
2075 *
2076 * Removal of cgroup itself succeeds regardless of refs from swap.
2077 */
2078
2079static void mem_cgroup_free(struct mem_cgroup *mem)
2080{
2081	int node;
2082
2083	if (atomic_read(&mem->refcnt) > 0)
2084		return;
2085
2086
2087	for_each_node_state(node, N_POSSIBLE)
2088		free_mem_cgroup_per_zone_info(mem, node);
2089
2090	if (mem_cgroup_size() < PAGE_SIZE)
2091		kfree(mem);
2092	else
2093		vfree(mem);
2094}
2095
2096static void mem_cgroup_get(struct mem_cgroup *mem)
2097{
2098	atomic_inc(&mem->refcnt);
2099}
2100
2101static void mem_cgroup_put(struct mem_cgroup *mem)
2102{
2103	if (atomic_dec_and_test(&mem->refcnt)) {
2104		if (!mem->obsolete)
2105			return;
2106		mem_cgroup_free(mem);
2107	}
2108}
2109
2110
2111#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
2112static void __init enable_swap_cgroup(void)
2113{
2114	if (!mem_cgroup_disabled() && really_do_swap_account)
2115		do_swap_account = 1;
2116}
2117#else
2118static void __init enable_swap_cgroup(void)
2119{
2120}
2121#endif
2122
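/*
 * The cgroup "create" callback: allocate the mem_cgroup and its per-node
 * info. With use_hierarchy, the new res_counters are parented to the
 * parent's counters so that charges propagate up the hierarchy.
 */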
2123static struct cgroup_subsys_state *
2124mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
2125{
2126	struct mem_cgroup *mem, *parent;
2127	int node;
2128
2129	mem = mem_cgroup_alloc();
2130	if (!mem)
2131		return ERR_PTR(-ENOMEM);
2132
2133	for_each_node_state(node, N_POSSIBLE)
2134		if (alloc_mem_cgroup_per_zone_info(mem, node))
2135			goto free_out;
2136	/* root ? */
2137	if (cont->parent == NULL) {
2138		enable_swap_cgroup();
2139		parent = NULL;
2140	} else {
2141		parent = mem_cgroup_from_cont(cont->parent);
2142		mem->use_hierarchy = parent->use_hierarchy;
2143	}
2144
2145	if (parent && parent->use_hierarchy) {
2146		res_counter_init(&mem->res, &parent->res);
2147		res_counter_init(&mem->memsw, &parent->memsw);
2148	} else {
2149		res_counter_init(&mem->res, NULL);
2150		res_counter_init(&mem->memsw, NULL);
2151	}
2152	mem->last_scanned_child = NULL;
2153	spin_lock_init(&mem->reclaim_param_lock);
2154
2155	if (parent)
2156		mem->swappiness = get_swappiness(parent);
2157
2158	return &mem->css;
2159free_out:
2160	for_each_node_state(node, N_POSSIBLE)
2161		free_mem_cgroup_per_zone_info(mem, node);
2162	mem_cgroup_free(mem);
2163	return ERR_PTR(-ENOMEM);
2164}
2165
2166static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
2167					struct cgroup *cont)
2168{
2169	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
2170	mem->obsolete = 1;
2171	mem_cgroup_force_empty(mem, false);
2172}
2173
2174static void mem_cgroup_destroy(struct cgroup_subsys *ss,
2175				struct cgroup *cont)
2176{
2177	mem_cgroup_free(mem_cgroup_from_cont(cont));
2178}
2179
2180static int mem_cgroup_populate(struct cgroup_subsys *ss,
2181				struct cgroup *cont)
2182{
2183	int ret;
2184
2185	ret = cgroup_add_files(cont, ss, mem_cgroup_files,
2186				ARRAY_SIZE(mem_cgroup_files));
2187
2188	if (!ret)
2189		ret = register_memsw_files(cont, ss);
2190	return ret;
2191}
2192
2193static void mem_cgroup_move_task(struct cgroup_subsys *ss,
2194				struct cgroup *cont,
2195				struct cgroup *old_cont,
2196				struct task_struct *p)
2197{
2198	/*
2199	 * FIXME: It's better to move charges of this process from old
2200	 * memcg to new memcg. But it's just on the TODO list for now.
2201	 */
2202}
2203
2204struct cgroup_subsys mem_cgroup_subsys = {
2205	.name = "memory",
2206	.subsys_id = mem_cgroup_subsys_id,
2207	.create = mem_cgroup_create,
2208	.pre_destroy = mem_cgroup_pre_destroy,
2209	.destroy = mem_cgroup_destroy,
2210	.populate = mem_cgroup_populate,
2211	.attach = mem_cgroup_move_task,
2212	.early_init = 0,
2213};
2214
2215#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
2216
2217static int __init disable_swap_account(char *s)
2218{
2219	really_do_swap_account = 0;
2220	return 1;
2221}
2222__setup("noswapaccount", disable_swap_account);
2223#endif
2224