memcontrol.c revision 14797e2363c2b2f1ce139fd1c5a215e4e05aa1d9
1/* memcontrol.c - Memory Controller
2 *
3 * Copyright IBM Corporation, 2007
4 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
5 *
6 * Copyright 2007 OpenVZ SWsoft Inc
7 * Author: Pavel Emelianov <xemul@openvz.org>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17 * GNU General Public License for more details.
18 */
19
20#include <linux/res_counter.h>
21#include <linux/memcontrol.h>
22#include <linux/cgroup.h>
23#include <linux/mm.h>
24#include <linux/pagemap.h>
25#include <linux/smp.h>
26#include <linux/page-flags.h>
27#include <linux/backing-dev.h>
28#include <linux/bit_spinlock.h>
29#include <linux/rcupdate.h>
30#include <linux/mutex.h>
31#include <linux/slab.h>
32#include <linux/swap.h>
33#include <linux/spinlock.h>
34#include <linux/fs.h>
35#include <linux/seq_file.h>
36#include <linux/vmalloc.h>
37#include <linux/mm_inline.h>
38#include <linux/page_cgroup.h>
39#include "internal.h"
40
41#include <asm/uaccess.h>
42
43struct cgroup_subsys mem_cgroup_subsys __read_mostly;
44#define MEM_CGROUP_RECLAIM_RETRIES	5
45
46#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
47/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
48int do_swap_account __read_mostly;
49static int really_do_swap_account __initdata = 1; /* to remember the boot option */
50#else
51#define do_swap_account		(0)
52#endif
53
54
55/*
56 * Statistics for memory cgroup.
57 */
58enum mem_cgroup_stat_index {
59	/*
60	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
61	 */
62	MEM_CGROUP_STAT_CACHE, 	   /* # of pages charged as cache */
63	MEM_CGROUP_STAT_RSS,	   /* # of pages charged as rss */
64	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
65	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */
66
67	MEM_CGROUP_STAT_NSTATS,
68};
69
70struct mem_cgroup_stat_cpu {
71	s64 count[MEM_CGROUP_STAT_NSTATS];
72} ____cacheline_aligned_in_smp;
73
74struct mem_cgroup_stat {
75	struct mem_cgroup_stat_cpu cpustat[0];
76};
77
78/*
79 * For accounting with irqs disabled; there is no need to increment the preempt count.
80 */
81static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat,
82		enum mem_cgroup_stat_index idx, int val)
83{
84	stat->count[idx] += val;
85}
86
87static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
88		enum mem_cgroup_stat_index idx)
89{
90	int cpu;
91	s64 ret = 0;
92	for_each_possible_cpu(cpu)
93		ret += stat->cpustat[cpu].count[idx];
94	return ret;
95}
96
97/*
98 * per-zone information in memory controller.
99 */
100struct mem_cgroup_per_zone {
101	/*
102	 * spin_lock to protect the per cgroup LRU
103	 */
104	struct list_head	lists[NR_LRU_LISTS];
105	unsigned long		count[NR_LRU_LISTS];
106};
107/* Macro for accessing counter */
108#define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])
109
110struct mem_cgroup_per_node {
111	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
112};
113
114struct mem_cgroup_lru_info {
115	struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
116};
117
118/*
119 * The memory controller data structure. The memory controller controls both
120 * page cache and RSS per cgroup. We would eventually like to provide
121 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
122 * to help the administrator determine what knobs to tune.
123 *
124 * TODO: Add a water mark for the memory controller. Reclaim will begin when
125 * we hit the water mark. Maybe even add a low water mark, such that
126 * no reclaim occurs from a cgroup at its low water mark; this is
127 * a feature that will be implemented much later in the future.
128 */
129struct mem_cgroup {
130	struct cgroup_subsys_state css;
131	/*
132	 * the counter to account for memory usage
133	 */
134	struct res_counter res;
135	/*
136	 * the counter to account for mem+swap usage.
137	 */
138	struct res_counter memsw;
139	/*
140	 * Per cgroup active and inactive list, similar to the
141	 * per zone LRU lists.
142	 */
143	struct mem_cgroup_lru_info info;
144
145	int	prev_priority;	/* for recording reclaim priority */
146
147	/*
148	 * While reclaiming in a hierarchy, we cache the last child we
149	 * reclaimed from. Protected by cgroup_lock()
150	 */
151	struct mem_cgroup *last_scanned_child;
152	/*
153	 * Should the accounting and control be hierarchical, per subtree?
154	 */
155	bool use_hierarchy;
156	unsigned long	last_oom_jiffies;
157	int		obsolete;
158	atomic_t	refcnt;
159
160	unsigned int inactive_ratio;
161
162	/*
163	 * statistics. This must be placed at the end of memcg.
164	 */
165	struct mem_cgroup_stat stat;
166};
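/*
 * Layout note: because "stat" is a flexible per-cpu array placed at the end,
 * the object is sized at allocation time rather than statically; see
 * mem_cgroup_size() and mem_cgroup_alloc() below. Roughly (illustrative):
 *
 *	size = sizeof(struct mem_cgroup)
 *		+ nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu);
 */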
167
168enum charge_type {
169	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
170	MEM_CGROUP_CHARGE_TYPE_MAPPED,
171	MEM_CGROUP_CHARGE_TYPE_SHMEM,	/* used by page migration of shmem */
172	MEM_CGROUP_CHARGE_TYPE_FORCE,	/* used by force_empty */
173	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
174	NR_CHARGE_TYPE,
175};
176
177/* local shorthands (for easier reading) */
178#define PCGF_CACHE	(1UL << PCG_CACHE)
179#define PCGF_USED	(1UL << PCG_USED)
180#define PCGF_LOCK	(1UL << PCG_LOCK)
181static const unsigned long
182pcg_default_flags[NR_CHARGE_TYPE] = {
183	PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */
184	PCGF_USED | PCGF_LOCK, /* Anon */
185	PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
186	0, /* FORCE */
187};
188
189
190/* for encoding cft->private value on file */
191#define _MEM			(0)
192#define _MEMSWAP		(1)
193#define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
194#define MEMFILE_TYPE(val)	(((val) >> 16) & 0xffff)
195#define MEMFILE_ATTR(val)	((val) & 0xffff)
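/*
 * Example (illustrative): a cftype->private value built with MEMFILE_PRIVATE()
 * round-trips through the decode macros,
 *
 *	val = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT);
 *	MEMFILE_TYPE(val) == _MEMSWAP;
 *	MEMFILE_ATTR(val) == RES_LIMIT;
 *
 * which is how mem_cgroup_read()/mem_cgroup_write() below tell the plain
 * memory files and the memsw.* files apart.
 */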
196
197static void mem_cgroup_get(struct mem_cgroup *mem);
198static void mem_cgroup_put(struct mem_cgroup *mem);
199
200static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
201					 struct page_cgroup *pc,
202					 bool charge)
203{
204	int val = (charge)? 1 : -1;
205	struct mem_cgroup_stat *stat = &mem->stat;
206	struct mem_cgroup_stat_cpu *cpustat;
207	int cpu = get_cpu();
208
209	cpustat = &stat->cpustat[cpu];
210	if (PageCgroupCache(pc))
211		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
212	else
213		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val);
214
215	if (charge)
216		__mem_cgroup_stat_add_safe(cpustat,
217				MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
218	else
219		__mem_cgroup_stat_add_safe(cpustat,
220				MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
221	put_cpu();
222}
223
224static struct mem_cgroup_per_zone *
225mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
226{
227	return &mem->info.nodeinfo[nid]->zoneinfo[zid];
228}
229
230static struct mem_cgroup_per_zone *
231page_cgroup_zoneinfo(struct page_cgroup *pc)
232{
233	struct mem_cgroup *mem = pc->mem_cgroup;
234	int nid = page_cgroup_nid(pc);
235	int zid = page_cgroup_zid(pc);
236
237	if (!mem)
238		return NULL;
239
240	return mem_cgroup_zoneinfo(mem, nid, zid);
241}
242
243static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem,
244					enum lru_list idx)
245{
246	int nid, zid;
247	struct mem_cgroup_per_zone *mz;
248	u64 total = 0;
249
250	for_each_online_node(nid)
251		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
252			mz = mem_cgroup_zoneinfo(mem, nid, zid);
253			total += MEM_CGROUP_ZSTAT(mz, idx);
254		}
255	return total;
256}
257
258static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
259{
260	return container_of(cgroup_subsys_state(cont,
261				mem_cgroup_subsys_id), struct mem_cgroup,
262				css);
263}
264
265struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
266{
267	/*
268	 * mm_update_next_owner() may clear mm->owner to NULL
269	 * if it races with swapoff, page migration, etc.
270	 * So this can be called with p == NULL.
271	 */
272	if (unlikely(!p))
273		return NULL;
274
275	return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
276				struct mem_cgroup, css);
277}
278
279/*
280 * Following LRU functions are allowed to be used without PCG_LOCK.
281 * Operations are called by the global LRU routines independently of memcg.
282 * What we have to take care of here is the validity of pc->mem_cgroup.
283 *
284 * Changes to pc->mem_cgroup happen when:
285 * 1. charge
286 * 2. moving account
287 * In the typical case, "charge" is done before add-to-lru. The exception is SwapCache.
288 * It is added to LRU before charge.
289 * If PCG_USED bit is not set, page_cgroup is not added to this private LRU.
290 * When moving account, the page is not on LRU. It's isolated.
291 */
292
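/*
 * Sketch of the usual ordering (illustrative): an anonymous page is charged
 * first (__mem_cgroup_try_charge() + __mem_cgroup_commit_charge(), which sets
 * PCG_USED) and only then added here via mem_cgroup_add_lru_list(). A
 * swapcache page is put on the global LRU before it is charged; until the
 * charge commits, PCG_USED is clear and the add/rotate helpers below simply
 * skip it, and mem_cgroup_lru_fixup() later re-inserts it on the proper
 * private LRU.
 */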
293void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
294{
295	struct page_cgroup *pc;
296	struct mem_cgroup *mem;
297	struct mem_cgroup_per_zone *mz;
298
299	if (mem_cgroup_disabled())
300		return;
301	pc = lookup_page_cgroup(page);
302	/* can happen while we handle swapcache. */
303	if (list_empty(&pc->lru))
304		return;
305	mz = page_cgroup_zoneinfo(pc);
306	mem = pc->mem_cgroup;
307	MEM_CGROUP_ZSTAT(mz, lru) -= 1;
308	list_del_init(&pc->lru);
309	return;
310}
311
312void mem_cgroup_del_lru(struct page *page)
313{
314	mem_cgroup_del_lru_list(page, page_lru(page));
315}
316
317void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
318{
319	struct mem_cgroup_per_zone *mz;
320	struct page_cgroup *pc;
321
322	if (mem_cgroup_disabled())
323		return;
324
325	pc = lookup_page_cgroup(page);
326	smp_rmb();
327	/* unused page is not rotated. */
328	if (!PageCgroupUsed(pc))
329		return;
330	mz = page_cgroup_zoneinfo(pc);
331	list_move(&pc->lru, &mz->lists[lru]);
332}
333
334void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
335{
336	struct page_cgroup *pc;
337	struct mem_cgroup_per_zone *mz;
338
339	if (mem_cgroup_disabled())
340		return;
341	pc = lookup_page_cgroup(page);
342	/* barrier to sync with "charge" */
343	smp_rmb();
344	if (!PageCgroupUsed(pc))
345		return;
346
347	mz = page_cgroup_zoneinfo(pc);
348	MEM_CGROUP_ZSTAT(mz, lru) += 1;
349	list_add(&pc->lru, &mz->lists[lru]);
350}
351/*
352 * To add swapcache to the LRU. Be careful with every call to this function:
353 * zone->lru_lock shouldn't be held and irq must not be disabled.
354 */
355static void mem_cgroup_lru_fixup(struct page *page)
356{
357	if (!isolate_lru_page(page))
358		putback_lru_page(page);
359}
360
361void mem_cgroup_move_lists(struct page *page,
362			   enum lru_list from, enum lru_list to)
363{
364	if (mem_cgroup_disabled())
365		return;
366	mem_cgroup_del_lru_list(page, from);
367	mem_cgroup_add_lru_list(page, to);
368}
369
370int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
371{
372	int ret;
373
374	task_lock(task);
375	ret = task->mm && mm_match_cgroup(task->mm, mem);
376	task_unlock(task);
377	return ret;
378}
379
380/*
381 * Calculate mapped_ratio under the memory controller. This will be used in
382 * vmscan.c for determining whether we have to reclaim mapped pages.
383 */
384int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
385{
386	long total, rss;
387
388	/*
389	 * usage is recorded in bytes. But, here, we assume the number of
390	 * physical pages can be represented by "long" on any arch.
391	 */
392	total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L;
393	rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
394	return (int)((rss * 100L) / total);
395}
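/*
 * Worked example (numbers are illustrative): with res.usage equivalent to
 * 400 pages and an RSS stat of 100 pages, total = 401 and this returns
 * (100 * 100) / 401 = 24, i.e. roughly a quarter of the charged memory
 * is mapped.
 */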
396
397/*
398 * prev_priority control... this will be used in the memory reclaim path.
399 */
400int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
401{
402	return mem->prev_priority;
403}
404
405void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
406{
407	if (priority < mem->prev_priority)
408		mem->prev_priority = priority;
409}
410
411void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
412{
413	mem->prev_priority = priority;
414}
415
416/*
417 * Calculate # of pages to be scanned in this priority/zone.
418 * See also vmscan.c
419 *
420 * priority starts from "DEF_PRIORITY" and is decremented in each loop.
421 * (see include/linux/mmzone.h)
422 */
423
424long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone,
425					int priority, enum lru_list lru)
426{
427	long nr_pages;
428	int nid = zone->zone_pgdat->node_id;
429	int zid = zone_idx(zone);
430	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);
431
432	nr_pages = MEM_CGROUP_ZSTAT(mz, lru);
433
434	return (nr_pages >> priority);
435}
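/*
 * Example (illustrative): with 4096 pages on this per-zone LRU, a scan at
 * DEF_PRIORITY (12) looks at 4096 >> 12 = 1 page, priority 6 looks at 64
 * pages, and priority 0 scans the whole list, mirroring how global reclaim
 * in vmscan.c scales its work.
 */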
436
437int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone)
438{
439	unsigned long active;
440	unsigned long inactive;
441
442	inactive = mem_cgroup_get_all_zonestat(memcg, LRU_INACTIVE_ANON);
443	active = mem_cgroup_get_all_zonestat(memcg, LRU_ACTIVE_ANON);
444
445	if (inactive * memcg->inactive_ratio < active)
446		return 1;
447
448	return 0;
449}
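/*
 * Example (illustrative): with inactive_ratio == 3 (roughly a 1GB limit, see
 * mem_cgroup_set_inactive_ratio() below), 600 active and 150 inactive anon
 * pages give 150 * 3 = 450 < 600, so the inactive list is considered low and
 * vmscan will favour deactivating active anon pages for this cgroup.
 */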
450
451unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
452					struct list_head *dst,
453					unsigned long *scanned, int order,
454					int mode, struct zone *z,
455					struct mem_cgroup *mem_cont,
456					int active, int file)
457{
458	unsigned long nr_taken = 0;
459	struct page *page;
460	unsigned long scan;
461	LIST_HEAD(pc_list);
462	struct list_head *src;
463	struct page_cgroup *pc, *tmp;
464	int nid = z->zone_pgdat->node_id;
465	int zid = zone_idx(z);
466	struct mem_cgroup_per_zone *mz;
467	int lru = LRU_FILE * !!file + !!active;
468
469	BUG_ON(!mem_cont);
470	mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
471	src = &mz->lists[lru];
472
473	scan = 0;
474	list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
475		if (scan >= nr_to_scan)
476			break;
477
478		page = pc->page;
479		if (unlikely(!PageCgroupUsed(pc)))
480			continue;
481		if (unlikely(!PageLRU(page)))
482			continue;
483
484		scan++;
485		if (__isolate_lru_page(page, mode, file) == 0) {
486			list_move(&page->lru, dst);
487			nr_taken++;
488		}
489	}
490
491	*scanned = scan;
492	return nr_taken;
493}
494
495#define mem_cgroup_from_res_counter(counter, member)	\
496	container_of(counter, struct mem_cgroup, member)
497
498/*
499 * This routine finds the DFS walk successor. It must be
500 * called with cgroup_mutex held.
501 */
502static struct mem_cgroup *
503mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem)
504{
505	struct cgroup *cgroup, *curr_cgroup, *root_cgroup;
506
507	curr_cgroup = curr->css.cgroup;
508	root_cgroup = root_mem->css.cgroup;
509
510	if (!list_empty(&curr_cgroup->children)) {
511		/*
512		 * Walk down to children
513		 */
514		mem_cgroup_put(curr);
515		cgroup = list_entry(curr_cgroup->children.next,
516						struct cgroup, sibling);
517		curr = mem_cgroup_from_cont(cgroup);
518		mem_cgroup_get(curr);
519		goto done;
520	}
521
522visit_parent:
523	if (curr_cgroup == root_cgroup) {
524		mem_cgroup_put(curr);
525		curr = root_mem;
526		mem_cgroup_get(curr);
527		goto done;
528	}
529
530	/*
531	 * Goto next sibling
532	 */
533	if (curr_cgroup->sibling.next != &curr_cgroup->parent->children) {
534		mem_cgroup_put(curr);
535		cgroup = list_entry(curr_cgroup->sibling.next, struct cgroup,
536						sibling);
537		curr = mem_cgroup_from_cont(cgroup);
538		mem_cgroup_get(curr);
539		goto done;
540	}
541
542	/*
543	 * Go up to next parent and next parent's sibling if need be
544	 */
545	curr_cgroup = curr_cgroup->parent;
546	goto visit_parent;
547
548done:
549	root_mem->last_scanned_child = curr;
550	return curr;
551}
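/*
 * Illustrative traversal order: for a hierarchy root -> {A -> {A1, A2}, B},
 * repeated calls starting from A yield A1, A2, B and finally root itself,
 * i.e. a pre-order DFS that wraps back to the root; that wrap-around is what
 * lets mem_cgroup_hierarchical_reclaim() stop when it sees root_mem again.
 */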
552
553/*
554 * Visit the first child (not necessarily the first child in the ordering
555 * of the cgroup list, since we track last_scanned_child) of @root_mem and
556 * use that to reclaim free pages from.
557 */
558static struct mem_cgroup *
559mem_cgroup_get_first_node(struct mem_cgroup *root_mem)
560{
561	struct cgroup *cgroup;
562	struct mem_cgroup *ret;
563	bool obsolete = (root_mem->last_scanned_child &&
564				root_mem->last_scanned_child->obsolete);
565
566	/*
567	 * Scan all children under the mem_cgroup mem
568	 */
569	cgroup_lock();
570	if (list_empty(&root_mem->css.cgroup->children)) {
571		ret = root_mem;
572		goto done;
573	}
574
575	if (!root_mem->last_scanned_child || obsolete) {
576
577		if (obsolete)
578			mem_cgroup_put(root_mem->last_scanned_child);
579
580		cgroup = list_first_entry(&root_mem->css.cgroup->children,
581				struct cgroup, sibling);
582		ret = mem_cgroup_from_cont(cgroup);
583		mem_cgroup_get(ret);
584	} else
585		ret = mem_cgroup_get_next_node(root_mem->last_scanned_child,
586						root_mem);
587
588done:
589	root_mem->last_scanned_child = ret;
590	cgroup_unlock();
591	return ret;
592}
593
594static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
595{
596	if (do_swap_account) {
597		if (res_counter_check_under_limit(&mem->res) &&
598			res_counter_check_under_limit(&mem->memsw))
599			return true;
600	} else
601		if (res_counter_check_under_limit(&mem->res))
602			return true;
603	return false;
604}
605
606/*
607 * Dance down the hierarchy if needed to reclaim memory. We remember the
608 * last child we reclaimed from, so that we don't end up penalizing
609 * one child extensively based on its position in the children list.
610 *
611 * root_mem is the original ancestor that we've been reclaiming from.
612 */
613static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
614						gfp_t gfp_mask, bool noswap)
615{
616	struct mem_cgroup *next_mem;
617	int ret = 0;
618
619	/*
620	 * Reclaim unconditionally and don't check for return value.
621	 * We need to reclaim in the current group and down the tree.
622	 * One might think about checking for children before reclaiming,
623	 * but there might be left over accounting, even after children
624	 * have left.
625	 */
626	ret = try_to_free_mem_cgroup_pages(root_mem, gfp_mask, noswap);
627	if (mem_cgroup_check_under_limit(root_mem))
628		return 0;
629	if (!root_mem->use_hierarchy)
630		return ret;
631
632	next_mem = mem_cgroup_get_first_node(root_mem);
633
634	while (next_mem != root_mem) {
635		if (next_mem->obsolete) {
636			mem_cgroup_put(next_mem);
637			cgroup_lock();
638			next_mem = mem_cgroup_get_first_node(root_mem);
639			cgroup_unlock();
640			continue;
641		}
642		ret = try_to_free_mem_cgroup_pages(next_mem, gfp_mask, noswap);
643		if (mem_cgroup_check_under_limit(root_mem))
644			return 0;
645		cgroup_lock();
646		next_mem = mem_cgroup_get_next_node(next_mem, root_mem);
647		cgroup_unlock();
648	}
649	return ret;
650}
651
652bool mem_cgroup_oom_called(struct task_struct *task)
653{
654	bool ret = false;
655	struct mem_cgroup *mem;
656	struct mm_struct *mm;
657
658	rcu_read_lock();
659	mm = task->mm;
660	if (!mm)
661		mm = &init_mm;
662	mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
663	if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10))
664		ret = true;
665	rcu_read_unlock();
666	return ret;
667}
668/*
669 * Unlike the exported interface, an "oom" parameter is added. If oom == true,
670 * the OOM killer can be invoked.
671 */
672static int __mem_cgroup_try_charge(struct mm_struct *mm,
673			gfp_t gfp_mask, struct mem_cgroup **memcg,
674			bool oom)
675{
676	struct mem_cgroup *mem, *mem_over_limit;
677	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
678	struct res_counter *fail_res;
679
680	if (unlikely(test_thread_flag(TIF_MEMDIE))) {
681		/* Don't account this! */
682		*memcg = NULL;
683		return 0;
684	}
685
686	/*
687	 * We always charge the cgroup the mm_struct belongs to.
688	 * The mm_struct's mem_cgroup changes on task migration if the
689	 * thread group leader migrates. It's possible that mm is not
690 * set; if so, charge the init_mm (this happens for pagecache usage).
691	 */
692	if (likely(!*memcg)) {
693		rcu_read_lock();
694		mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
695		if (unlikely(!mem)) {
696			rcu_read_unlock();
697			return 0;
698		}
699		/*
700		 * For every charge from the cgroup, increment reference count
701		 */
702		css_get(&mem->css);
703		*memcg = mem;
704		rcu_read_unlock();
705	} else {
706		mem = *memcg;
707		css_get(&mem->css);
708	}
709
710	while (1) {
711		int ret;
712		bool noswap = false;
713
714		ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res);
715		if (likely(!ret)) {
716			if (!do_swap_account)
717				break;
718			ret = res_counter_charge(&mem->memsw, PAGE_SIZE,
719							&fail_res);
720			if (likely(!ret))
721				break;
722			/* mem+swap counter fails */
723			res_counter_uncharge(&mem->res, PAGE_SIZE);
724			noswap = true;
725			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
726									memsw);
727		} else
728			/* mem counter fails */
729			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
730									res);
731
732		if (!(gfp_mask & __GFP_WAIT))
733			goto nomem;
734
735		ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask,
736							noswap);
737
738		/*
739		 * try_to_free_mem_cgroup_pages() might not give us a full
740		 * picture of reclaim. Some pages are reclaimed and might be
741		 * moved to swap cache or just unmapped from the cgroup.
742		 * Check the limit again to see if the reclaim reduced the
743		 * current usage of the cgroup before giving up
744		 *
745		 */
746		if (mem_cgroup_check_under_limit(mem_over_limit))
747			continue;
748
749		if (!nr_retries--) {
750			if (oom) {
751				mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
752				mem_over_limit->last_oom_jiffies = jiffies;
753			}
754			goto nomem;
755		}
756	}
757	return 0;
758nomem:
759	css_put(&mem->css);
760	return -ENOMEM;
761}
762
763/**
764 * mem_cgroup_try_charge - get a charge of PAGE_SIZE.
765 * @mm: the mm_struct to charge against (used when *memcg is NULL).
766 * @gfp_mask: gfp_mask for reclaim.
767 * @memcg: a pointer to the memory cgroup to charge against.
768 *
769 * Charge against the memory cgroup pointed to by *memcg. If *memcg == NULL,
770 * the memory cgroup is looked up from @mm and stored in *memcg.
771 *
772 * Returns 0 on success, -ENOMEM on failure.
773 * This call can invoke OOM-Killer.
774 */
775
776int mem_cgroup_try_charge(struct mm_struct *mm,
777			  gfp_t mask, struct mem_cgroup **memcg)
778{
779	return __mem_cgroup_try_charge(mm, mask, memcg, true);
780}
781
782/*
783 * commit a charge obtained by mem_cgroup_try_charge() and mark the page_cgroup
784 * USED. If it is already USED, uncharge and return.
785 */
786
787static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
788				     struct page_cgroup *pc,
789				     enum charge_type ctype)
790{
791	/* try_charge() can return with NULL in *memcg; handle that here. */
792	if (!mem)
793		return;
794
795	lock_page_cgroup(pc);
796	if (unlikely(PageCgroupUsed(pc))) {
797		unlock_page_cgroup(pc);
798		res_counter_uncharge(&mem->res, PAGE_SIZE);
799		if (do_swap_account)
800			res_counter_uncharge(&mem->memsw, PAGE_SIZE);
801		css_put(&mem->css);
802		return;
803	}
804	pc->mem_cgroup = mem;
805	smp_wmb();
806	pc->flags = pcg_default_flags[ctype];
807
808	mem_cgroup_charge_statistics(mem, pc, true);
809
810	unlock_page_cgroup(pc);
811}
812
813/**
814 * mem_cgroup_move_account - move account of the page
815 * @pc:	page_cgroup of the page.
816 * @from: mem_cgroup which the page is moved from.
817 * @to:	mem_cgroup which the page is moved to. @from != @to.
818 *
819 * The caller must confirm the following:
820 * - page is not on LRU (isolate_page() is useful.)
821 *
822 * Returns 0 on success,
823 * returns -EBUSY when the lock is busy or "pc" is unstable.
824 *
825 * This function does "uncharge" from the old cgroup but doesn't do "charge" to
826 * the new cgroup; that should be done by the caller.
827 */
828
829static int mem_cgroup_move_account(struct page_cgroup *pc,
830	struct mem_cgroup *from, struct mem_cgroup *to)
831{
832	struct mem_cgroup_per_zone *from_mz, *to_mz;
833	int nid, zid;
834	int ret = -EBUSY;
835
836	VM_BUG_ON(from == to);
837	VM_BUG_ON(PageLRU(pc->page));
838
839	nid = page_cgroup_nid(pc);
840	zid = page_cgroup_zid(pc);
841	from_mz =  mem_cgroup_zoneinfo(from, nid, zid);
842	to_mz =  mem_cgroup_zoneinfo(to, nid, zid);
843
844	if (!trylock_page_cgroup(pc))
845		return ret;
846
847	if (!PageCgroupUsed(pc))
848		goto out;
849
850	if (pc->mem_cgroup != from)
851		goto out;
852
853	css_put(&from->css);
854	res_counter_uncharge(&from->res, PAGE_SIZE);
855	mem_cgroup_charge_statistics(from, pc, false);
856	if (do_swap_account)
857		res_counter_uncharge(&from->memsw, PAGE_SIZE);
858	pc->mem_cgroup = to;
859	mem_cgroup_charge_statistics(to, pc, true);
860	css_get(&to->css);
861	ret = 0;
862out:
863	unlock_page_cgroup(pc);
864	return ret;
865}
866
867/*
868 * move charges to its parent.
869 */
870
871static int mem_cgroup_move_parent(struct page_cgroup *pc,
872				  struct mem_cgroup *child,
873				  gfp_t gfp_mask)
874{
875	struct page *page = pc->page;
876	struct cgroup *cg = child->css.cgroup;
877	struct cgroup *pcg = cg->parent;
878	struct mem_cgroup *parent;
879	int ret;
880
881	/* Is ROOT ? */
882	if (!pcg)
883		return -EINVAL;
884
885
886	parent = mem_cgroup_from_cont(pcg);
887
888
889	ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
890	if (ret || !parent)
891		return ret;
892
893	if (!get_page_unless_zero(page))
894		return -EBUSY;
895
896	ret = isolate_lru_page(page);
897
898	if (ret)
899		goto cancel;
900
901	ret = mem_cgroup_move_account(pc, child, parent);
902
903	/* drop the extra refcount taken by try_charge() (move_account incremented one) */
904	css_put(&parent->css);
905	putback_lru_page(page);
906	if (!ret) {
907		put_page(page);
908		return 0;
909	}
910	/* uncharge if move fails */
911cancel:
912	res_counter_uncharge(&parent->res, PAGE_SIZE);
913	if (do_swap_account)
914		res_counter_uncharge(&parent->memsw, PAGE_SIZE);
915	put_page(page);
916	return ret;
917}
918
919/*
920 * Charge the memory controller for page usage.
921 * Return
922 * 0 if the charge was successful
923 * < 0 if the cgroup is over its limit
924 */
925static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
926				gfp_t gfp_mask, enum charge_type ctype,
927				struct mem_cgroup *memcg)
928{
929	struct mem_cgroup *mem;
930	struct page_cgroup *pc;
931	int ret;
932
933	pc = lookup_page_cgroup(page);
934	/* can happen at boot */
935	if (unlikely(!pc))
936		return 0;
937	prefetchw(pc);
938
939	mem = memcg;
940	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
941	if (ret || !mem)
942		return ret;
943
944	__mem_cgroup_commit_charge(mem, pc, ctype);
945	return 0;
946}
947
948int mem_cgroup_newpage_charge(struct page *page,
949			      struct mm_struct *mm, gfp_t gfp_mask)
950{
951	if (mem_cgroup_disabled())
952		return 0;
953	if (PageCompound(page))
954		return 0;
955	/*
956	 * If already mapped, we don't have to account.
957	 * If page cache, page->mapping has address_space.
958	 * But page->mapping may hold a stale anon_vma pointer;
959	 * detect that with the PageAnon() check: a newly-mapped anon
960	 * page's page->mapping is NULL.
961	 */
962	if (page_mapped(page) || (page->mapping && !PageAnon(page)))
963		return 0;
964	if (unlikely(!mm))
965		mm = &init_mm;
966	return mem_cgroup_charge_common(page, mm, gfp_mask,
967				MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
968}
969
970int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
971				gfp_t gfp_mask)
972{
973	if (mem_cgroup_disabled())
974		return 0;
975	if (PageCompound(page))
976		return 0;
977	/*
978	 * Corner case handling. This is usually called from add_to_page_cache(),
979	 * but some filesystems (shmem) precharge the page before calling it
980	 * and then call add_to_page_cache() with GFP_NOWAIT.
981	 *
982	 * For the GFP_NOWAIT case, the page may be pre-charged before
983	 * add_to_page_cache() is called (see shmem.c); check that here to avoid
984	 * charging twice. (It works, but at a slightly larger cost.)
985	 */
986	if (!(gfp_mask & __GFP_WAIT)) {
987		struct page_cgroup *pc;
988
989
990		pc = lookup_page_cgroup(page);
991		if (!pc)
992			return 0;
993		lock_page_cgroup(pc);
994		if (PageCgroupUsed(pc)) {
995			unlock_page_cgroup(pc);
996			return 0;
997		}
998		unlock_page_cgroup(pc);
999	}
1000
1001	if (unlikely(!mm))
1002		mm = &init_mm;
1003
1004	if (page_is_file_cache(page))
1005		return mem_cgroup_charge_common(page, mm, gfp_mask,
1006				MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
1007	else
1008		return mem_cgroup_charge_common(page, mm, gfp_mask,
1009				MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL);
1010}
1011
1012int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
1013				 struct page *page,
1014				 gfp_t mask, struct mem_cgroup **ptr)
1015{
1016	struct mem_cgroup *mem;
1017	swp_entry_t     ent;
1018
1019	if (mem_cgroup_disabled())
1020		return 0;
1021
1022	if (!do_swap_account)
1023		goto charge_cur_mm;
1024
1025	/*
1026	 * A racing thread's fault, or swapoff, may have already updated
1027	 * the pte, and even removed page from swap cache: return success
1028	 * to go on to do_swap_page()'s pte_same() test, which should fail.
1029	 */
1030	if (!PageSwapCache(page))
1031		return 0;
1032
1033	ent.val = page_private(page);
1034
1035	mem = lookup_swap_cgroup(ent);
1036	if (!mem || mem->obsolete)
1037		goto charge_cur_mm;
1038	*ptr = mem;
1039	return __mem_cgroup_try_charge(NULL, mask, ptr, true);
1040charge_cur_mm:
1041	if (unlikely(!mm))
1042		mm = &init_mm;
1043	return __mem_cgroup_try_charge(mm, mask, ptr, true);
1044}
1045
1046#ifdef CONFIG_SWAP
1047
1048int mem_cgroup_cache_charge_swapin(struct page *page,
1049			struct mm_struct *mm, gfp_t mask, bool locked)
1050{
1051	int ret = 0;
1052
1053	if (mem_cgroup_disabled())
1054		return 0;
1055	if (unlikely(!mm))
1056		mm = &init_mm;
1057	if (!locked)
1058		lock_page(page);
1059	/*
1060	 * If not locked, the page can be dropped from the swap cache before
1061	 * we reach here.
1062	 */
1063	if (PageSwapCache(page)) {
1064		struct mem_cgroup *mem = NULL;
1065		swp_entry_t ent;
1066
1067		ent.val = page_private(page);
1068		if (do_swap_account) {
1069			mem = lookup_swap_cgroup(ent);
1070			if (mem && mem->obsolete)
1071				mem = NULL;
1072			if (mem)
1073				mm = NULL;
1074		}
1075		ret = mem_cgroup_charge_common(page, mm, mask,
1076				MEM_CGROUP_CHARGE_TYPE_SHMEM, mem);
1077
1078		if (!ret && do_swap_account) {
1079			/* avoid double counting */
1080			mem = swap_cgroup_record(ent, NULL);
1081			if (mem) {
1082				res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1083				mem_cgroup_put(mem);
1084			}
1085		}
1086	}
1087	if (!locked)
1088		unlock_page(page);
1089	/* add this page(page_cgroup) to the LRU we want. */
1090	mem_cgroup_lru_fixup(page);
1091
1092	return ret;
1093}
1094#endif
1095
1096void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
1097{
1098	struct page_cgroup *pc;
1099
1100	if (mem_cgroup_disabled())
1101		return;
1102	if (!ptr)
1103		return;
1104	pc = lookup_page_cgroup(page);
1105	__mem_cgroup_commit_charge(ptr, pc, MEM_CGROUP_CHARGE_TYPE_MAPPED);
1106	/*
1107	 * Now the swap entry is backed by memory. This means the page may be
1108	 * counted both as mem and swap: a double count.
1109	 * Fix it by uncharging memsw. The SwapCache is stable
1110	 * because we're still under lock_page().
1111	 */
1112	if (do_swap_account) {
1113		swp_entry_t ent = {.val = page_private(page)};
1114		struct mem_cgroup *memcg;
1115		memcg = swap_cgroup_record(ent, NULL);
1116		if (memcg) {
1117			/* If memcg is obsolete, memcg can be != ptr */
1118			res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
1119			mem_cgroup_put(memcg);
1120		}
1121
1122	}
1123	/* add this page(page_cgroup) to the LRU we want. */
1124	mem_cgroup_lru_fixup(page);
1125}
1126
1127void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
1128{
1129	if (mem_cgroup_disabled())
1130		return;
1131	if (!mem)
1132		return;
1133	res_counter_uncharge(&mem->res, PAGE_SIZE);
1134	if (do_swap_account)
1135		res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1136	css_put(&mem->css);
1137}
1138
1139
1140/*
1141 * uncharge if !page_mapped(page)
1142 */
1143static struct mem_cgroup *
1144__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
1145{
1146	struct page_cgroup *pc;
1147	struct mem_cgroup *mem = NULL;
1148	struct mem_cgroup_per_zone *mz;
1149
1150	if (mem_cgroup_disabled())
1151		return NULL;
1152
1153	if (PageSwapCache(page))
1154		return NULL;
1155
1156	/*
1157	 * Check if our page_cgroup is valid
1158	 */
1159	pc = lookup_page_cgroup(page);
1160	if (unlikely(!pc || !PageCgroupUsed(pc)))
1161		return NULL;
1162
1163	lock_page_cgroup(pc);
1164
1165	mem = pc->mem_cgroup;
1166
1167	if (!PageCgroupUsed(pc))
1168		goto unlock_out;
1169
1170	switch (ctype) {
1171	case MEM_CGROUP_CHARGE_TYPE_MAPPED:
1172		if (page_mapped(page))
1173			goto unlock_out;
1174		break;
1175	case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
1176		if (!PageAnon(page)) {	/* Shared memory */
1177			if (page->mapping && !page_is_file_cache(page))
1178				goto unlock_out;
1179		} else if (page_mapped(page)) /* Anon */
1180				goto unlock_out;
1181		break;
1182	default:
1183		break;
1184	}
1185
1186	res_counter_uncharge(&mem->res, PAGE_SIZE);
1187	if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
1188		res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1189
1190	mem_cgroup_charge_statistics(mem, pc, false);
1191	ClearPageCgroupUsed(pc);
1192
1193	mz = page_cgroup_zoneinfo(pc);
1194	unlock_page_cgroup(pc);
1195
1196	/* at swapout, this memcg will be accessed to record the swap entry */
1197	if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
1198		css_put(&mem->css);
1199
1200	return mem;
1201
1202unlock_out:
1203	unlock_page_cgroup(pc);
1204	return NULL;
1205}
1206
1207void mem_cgroup_uncharge_page(struct page *page)
1208{
1209	/* early check. */
1210	if (page_mapped(page))
1211		return;
1212	if (page->mapping && !PageAnon(page))
1213		return;
1214	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
1215}
1216
1217void mem_cgroup_uncharge_cache_page(struct page *page)
1218{
1219	VM_BUG_ON(page_mapped(page));
1220	VM_BUG_ON(page->mapping);
1221	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
1222}
1223
1224/*
1225 * called from __delete_from_swap_cache(); drops the "page" account.
1226 * The memcg information is recorded in the swap_cgroup of "ent".
1227 */
1228void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent)
1229{
1230	struct mem_cgroup *memcg;
1231
1232	memcg = __mem_cgroup_uncharge_common(page,
1233					MEM_CGROUP_CHARGE_TYPE_SWAPOUT);
1234	/* record memcg information */
1235	if (do_swap_account && memcg) {
1236		swap_cgroup_record(ent, memcg);
1237		mem_cgroup_get(memcg);
1238	}
1239	if (memcg)
1240		css_put(&memcg->css);
1241}
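/*
 * Sketch of the swap-out hand-off (illustrative): the SWAPOUT uncharge above
 * drops the page from mem->res but deliberately leaves mem->memsw charged;
 * the memcg is then recorded in the swap_cgroup for "ent" with an extra
 * reference (mem_cgroup_get()). Both the memsw charge and that reference are
 * finally released in mem_cgroup_uncharge_swap() when the swap entry itself
 * is freed.
 */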
1242
1243#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
1244/*
1245 * called from swap_entry_free(). Removes the record in swap_cgroup and
1246 * uncharges the "memsw" account.
1247 */
1248void mem_cgroup_uncharge_swap(swp_entry_t ent)
1249{
1250	struct mem_cgroup *memcg;
1251
1252	if (!do_swap_account)
1253		return;
1254
1255	memcg = swap_cgroup_record(ent, NULL);
1256	if (memcg) {
1257		res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
1258		mem_cgroup_put(memcg);
1259	}
1260}
1261#endif
1262
1263/*
1264 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
1265 * page belongs to.
1266 */
1267int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
1268{
1269	struct page_cgroup *pc;
1270	struct mem_cgroup *mem = NULL;
1271	int ret = 0;
1272
1273	if (mem_cgroup_disabled())
1274		return 0;
1275
1276	pc = lookup_page_cgroup(page);
1277	lock_page_cgroup(pc);
1278	if (PageCgroupUsed(pc)) {
1279		mem = pc->mem_cgroup;
1280		css_get(&mem->css);
1281	}
1282	unlock_page_cgroup(pc);
1283
1284	if (mem) {
1285		ret = mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem);
1286		css_put(&mem->css);
1287	}
1288	*ptr = mem;
1289	return ret;
1290}
1291
1292/* remove redundant charge if migration failed*/
1293void mem_cgroup_end_migration(struct mem_cgroup *mem,
1294		struct page *oldpage, struct page *newpage)
1295{
1296	struct page *target, *unused;
1297	struct page_cgroup *pc;
1298	enum charge_type ctype;
1299
1300	if (!mem)
1301		return;
1302
1303	/* at migration success, oldpage->mapping is NULL. */
1304	if (oldpage->mapping) {
1305		target = oldpage;
1306		unused = NULL;
1307	} else {
1308		target = newpage;
1309		unused = oldpage;
1310	}
1311
1312	if (PageAnon(target))
1313		ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
1314	else if (page_is_file_cache(target))
1315		ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
1316	else
1317		ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
1318
1319	/* unused page is not on radix-tree now. */
1320	if (unused)
1321		__mem_cgroup_uncharge_common(unused, ctype);
1322
1323	pc = lookup_page_cgroup(target);
1324	/*
1325	 * __mem_cgroup_commit_charge() checks the PCG_USED bit of page_cgroup.
1326	 * So, double-counting is effectively avoided.
1327	 */
1328	__mem_cgroup_commit_charge(mem, pc, ctype);
1329
1330	/*
1331	 * Both oldpage and newpage are still under lock_page(),
1332	 * so we don't have to worry about races in the radix-tree.
1333	 * But we do have to check whether this page is mapped or not.
1334	 *
1335	 * The !page_mapped() case can happen: at the start of
1336	 * migration, oldpage was mapped, but by now it may have been zapped.
1337	 * But we know *target* page is not freed/reused under us.
1338	 * mem_cgroup_uncharge_page() does all necessary checks.
1339	 */
1340	if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
1341		mem_cgroup_uncharge_page(target);
1342}
1343
1344/*
1345 * A call to try to shrink memory usage under the specified memory controller.
1346 * This is typically used to reclaim shmem pages, reducing the side effect
1347 * of page allocation from shmem, which is used by some mem_cgroup.
1348 */
1349int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
1350{
1351	struct mem_cgroup *mem;
1352	int progress = 0;
1353	int retry = MEM_CGROUP_RECLAIM_RETRIES;
1354
1355	if (mem_cgroup_disabled())
1356		return 0;
1357	if (!mm)
1358		return 0;
1359
1360	rcu_read_lock();
1361	mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
1362	if (unlikely(!mem)) {
1363		rcu_read_unlock();
1364		return 0;
1365	}
1366	css_get(&mem->css);
1367	rcu_read_unlock();
1368
1369	do {
1370		progress = try_to_free_mem_cgroup_pages(mem, gfp_mask, true);
1371		progress += mem_cgroup_check_under_limit(mem);
1372	} while (!progress && --retry);
1373
1374	css_put(&mem->css);
1375	if (!retry)
1376		return -ENOMEM;
1377	return 0;
1378}
1379
1380/*
1381 * The inactive anon list should be small enough that the VM never has to
1382 * do too much work, but large enough that each inactive page has a chance
1383 * to be referenced again before it is swapped out.
1384 *
1385 * This calculation is a straightforward port of
1386 * page_alloc.c::setup_per_zone_inactive_ratio(),
1387 * which describes it in more detail.
1388 */
1389static void mem_cgroup_set_inactive_ratio(struct mem_cgroup *memcg)
1390{
1391	unsigned int gb, ratio;
1392
1393	gb = res_counter_read_u64(&memcg->res, RES_LIMIT) >> 30;
1394	if (gb)
1395		ratio = int_sqrt(10 * gb);
1396	else
1397		ratio = 1;
1398
1399	memcg->inactive_ratio = ratio;
1400
1401}
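/*
 * Illustrative values: a 1GB limit gives gb = 1 and ratio = int_sqrt(10) = 3,
 * 10GB gives int_sqrt(100) = 10, 100GB gives int_sqrt(1000) = 31, and any
 * limit below 1GB (gb == 0) falls back to a ratio of 1, the same formula
 * used by page_alloc.c::setup_per_zone_inactive_ratio() for zones.
 */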
1402
1403static DEFINE_MUTEX(set_limit_mutex);
1404
1405static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
1406				unsigned long long val)
1407{
1408
1409	int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
1410	int progress;
1411	u64 memswlimit;
1412	int ret = 0;
1413
1414	while (retry_count) {
1415		if (signal_pending(current)) {
1416			ret = -EINTR;
1417			break;
1418		}
1419		/*
1420		 * Rather than hide all this in some function, do it in an
1421		 * open-coded manner so you can see what it really does.
1422		 * We have to guarantee mem->res.limit < mem->memsw.limit.
1423		 */
1424		mutex_lock(&set_limit_mutex);
1425		memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1426		if (memswlimit < val) {
1427			ret = -EINVAL;
1428			mutex_unlock(&set_limit_mutex);
1429			break;
1430		}
1431		ret = res_counter_set_limit(&memcg->res, val);
1432		mutex_unlock(&set_limit_mutex);
1433
1434		if (!ret)
1435			break;
1436
1437		progress = try_to_free_mem_cgroup_pages(memcg,
1438				GFP_KERNEL, false);
1439		if (!progress) retry_count--;
1440	}
1441
1442	if (!ret)
1443		mem_cgroup_set_inactive_ratio(memcg);
1444
1445	return ret;
1446}
1447
1448int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
1449				unsigned long long val)
1450{
1451	int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
1452	u64 memlimit, oldusage, curusage;
1453	int ret;
1454
1455	if (!do_swap_account)
1456		return -EINVAL;
1457
1458	while (retry_count) {
1459		if (signal_pending(current)) {
1460			ret = -EINTR;
1461			break;
1462		}
1463		/*
1464		 * Rather than hide all this in some function, do it in an
1465		 * open-coded manner so you can see what it really does.
1466		 * We have to guarantee mem->res.limit < mem->memsw.limit.
1467		 */
1468		mutex_lock(&set_limit_mutex);
1469		memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
1470		if (memlimit > val) {
1471			ret = -EINVAL;
1472			mutex_unlock(&set_limit_mutex);
1473			break;
1474		}
1475		ret = res_counter_set_limit(&memcg->memsw, val);
1476		mutex_unlock(&set_limit_mutex);
1477
1478		if (!ret)
1479			break;
1480
1481		oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
1482		try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL, true);
1483		curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
1484		if (curusage >= oldusage)
1485			retry_count--;
1486	}
1487	return ret;
1488}
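/*
 * Note on ordering (illustrative): because the two setters above insist that
 * the memory limit never exceed the mem+swap limit, userspace raising both
 * limits (via memory.limit_in_bytes and memory.memsw.limit_in_bytes) must
 * raise memsw.limit_in_bytes first and limit_in_bytes second, and lower them
 * in the opposite order; otherwise the write fails with -EINVAL.
 */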
1489
1490/*
1491 * This routine traverses the page_cgroups on the given list and drops them all.
1492 * *And* this routine doesn't reclaim the pages themselves, it just removes the page_cgroups.
1493 */
1494static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
1495				int node, int zid, enum lru_list lru)
1496{
1497	struct zone *zone;
1498	struct mem_cgroup_per_zone *mz;
1499	struct page_cgroup *pc, *busy;
1500	unsigned long flags, loop;
1501	struct list_head *list;
1502	int ret = 0;
1503
1504	zone = &NODE_DATA(node)->node_zones[zid];
1505	mz = mem_cgroup_zoneinfo(mem, node, zid);
1506	list = &mz->lists[lru];
1507
1508	loop = MEM_CGROUP_ZSTAT(mz, lru);
1509	/* give some margin against EBUSY etc...*/
1510	loop += 256;
1511	busy = NULL;
1512	while (loop--) {
1513		ret = 0;
1514		spin_lock_irqsave(&zone->lru_lock, flags);
1515		if (list_empty(list)) {
1516			spin_unlock_irqrestore(&zone->lru_lock, flags);
1517			break;
1518		}
1519		pc = list_entry(list->prev, struct page_cgroup, lru);
1520		if (busy == pc) {
1521			list_move(&pc->lru, list);
1522			busy = NULL;
1523			spin_unlock_irqrestore(&zone->lru_lock, flags);
1524			continue;
1525		}
1526		spin_unlock_irqrestore(&zone->lru_lock, flags);
1527
1528		ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL);
1529		if (ret == -ENOMEM)
1530			break;
1531
1532		if (ret == -EBUSY || ret == -EINVAL) {
1533			/* found lock contention or "pc" is obsolete. */
1534			busy = pc;
1535			cond_resched();
1536		} else
1537			busy = NULL;
1538	}
1539
1540	if (!ret && !list_empty(list))
1541		return -EBUSY;
1542	return ret;
1543}
1544
1545/*
1546 * make the mem_cgroup's charge 0 if there is no task.
1547 * This enables deleting this mem_cgroup.
1548 */
1549static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
1550{
1551	int ret;
1552	int node, zid, shrink;
1553	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
1554	struct cgroup *cgrp = mem->css.cgroup;
1555
1556	css_get(&mem->css);
1557
1558	shrink = 0;
1559	/* should free all ? */
1560	if (free_all)
1561		goto try_to_free;
1562move_account:
1563	while (mem->res.usage > 0) {
1564		ret = -EBUSY;
1565		if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
1566			goto out;
1567		ret = -EINTR;
1568		if (signal_pending(current))
1569			goto out;
1570		/* This is for making sure all *used* pages are on the LRU. */
1571		lru_add_drain_all();
1572		ret = 0;
1573		for_each_node_state(node, N_POSSIBLE) {
1574			for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
1575				enum lru_list l;
1576				for_each_lru(l) {
1577					ret = mem_cgroup_force_empty_list(mem,
1578							node, zid, l);
1579					if (ret)
1580						break;
1581				}
1582			}
1583			if (ret)
1584				break;
1585		}
1586		/* it seems parent cgroup doesn't have enough mem */
1587		if (ret == -ENOMEM)
1588			goto try_to_free;
1589		cond_resched();
1590	}
1591	ret = 0;
1592out:
1593	css_put(&mem->css);
1594	return ret;
1595
1596try_to_free:
1597	/* returns -EBUSY if there is a task or if we come here twice. */
1598	if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
1599		ret = -EBUSY;
1600		goto out;
1601	}
1602	/* we call try-to-free pages to make this cgroup empty */
1603	lru_add_drain_all();
1604	/* try to free all pages in this cgroup */
1605	shrink = 1;
1606	while (nr_retries && mem->res.usage > 0) {
1607		int progress;
1608
1609		if (signal_pending(current)) {
1610			ret = -EINTR;
1611			goto out;
1612		}
1613		progress = try_to_free_mem_cgroup_pages(mem,
1614						  GFP_KERNEL, false);
1615		if (!progress) {
1616			nr_retries--;
1617			/* maybe some writeback is necessary */
1618			congestion_wait(WRITE, HZ/10);
1619		}
1620
1621	}
1622	lru_add_drain();
1623	/* try move_account...there may be some *locked* pages. */
1624	if (mem->res.usage)
1625		goto move_account;
1626	ret = 0;
1627	goto out;
1628}
1629
1630int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
1631{
1632	return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
1633}
1634
1635
1636static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)
1637{
1638	return mem_cgroup_from_cont(cont)->use_hierarchy;
1639}
1640
1641static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
1642					u64 val)
1643{
1644	int retval = 0;
1645	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
1646	struct cgroup *parent = cont->parent;
1647	struct mem_cgroup *parent_mem = NULL;
1648
1649	if (parent)
1650		parent_mem = mem_cgroup_from_cont(parent);
1651
1652	cgroup_lock();
1653	/*
1654	 * If the parent's use_hierarchy is set, we can't make any modifications
1655	 * in the child subtrees. If it is unset, then the change can
1656	 * occur, provided the current cgroup has no children.
1657	 *
1658	 * For the root cgroup, parent_mem is NULL, so we allow the value to be
1659	 * set if there are no children.
1660	 */
1661	if ((!parent_mem || !parent_mem->use_hierarchy) &&
1662				(val == 1 || val == 0)) {
1663		if (list_empty(&cont->children))
1664			mem->use_hierarchy = val;
1665		else
1666			retval = -EBUSY;
1667	} else
1668		retval = -EINVAL;
1669	cgroup_unlock();
1670
1671	return retval;
1672}
1673
1674static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
1675{
1676	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
1677	u64 val = 0;
1678	int type, name;
1679
1680	type = MEMFILE_TYPE(cft->private);
1681	name = MEMFILE_ATTR(cft->private);
1682	switch (type) {
1683	case _MEM:
1684		val = res_counter_read_u64(&mem->res, name);
1685		break;
1686	case _MEMSWAP:
1687		if (do_swap_account)
1688			val = res_counter_read_u64(&mem->memsw, name);
1689		break;
1690	default:
1691		BUG();
1692		break;
1693	}
1694	return val;
1695}
1696/*
1697 * The user of this function is...
1698 * RES_LIMIT.
1699 */
1700static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
1701			    const char *buffer)
1702{
1703	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
1704	int type, name;
1705	unsigned long long val;
1706	int ret;
1707
1708	type = MEMFILE_TYPE(cft->private);
1709	name = MEMFILE_ATTR(cft->private);
1710	switch (name) {
1711	case RES_LIMIT:
1712		/* This function does all the necessary parsing... reuse it */
1713		ret = res_counter_memparse_write_strategy(buffer, &val);
1714		if (ret)
1715			break;
1716		if (type == _MEM)
1717			ret = mem_cgroup_resize_limit(memcg, val);
1718		else
1719			ret = mem_cgroup_resize_memsw_limit(memcg, val);
1720		break;
1721	default:
1722		ret = -EINVAL; /* should be BUG() ? */
1723		break;
1724	}
1725	return ret;
1726}
1727
1728static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
1729{
1730	struct mem_cgroup *mem;
1731	int type, name;
1732
1733	mem = mem_cgroup_from_cont(cont);
1734	type = MEMFILE_TYPE(event);
1735	name = MEMFILE_ATTR(event);
1736	switch (name) {
1737	case RES_MAX_USAGE:
1738		if (type == _MEM)
1739			res_counter_reset_max(&mem->res);
1740		else
1741			res_counter_reset_max(&mem->memsw);
1742		break;
1743	case RES_FAILCNT:
1744		if (type == _MEM)
1745			res_counter_reset_failcnt(&mem->res);
1746		else
1747			res_counter_reset_failcnt(&mem->memsw);
1748		break;
1749	}
1750	return 0;
1751}
1752
1753static const struct mem_cgroup_stat_desc {
1754	const char *msg;
1755	u64 unit;
1756} mem_cgroup_stat_desc[] = {
1757	[MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, },
1758	[MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, },
1759	[MEM_CGROUP_STAT_PGPGIN_COUNT] = {"pgpgin", 1, },
1760	[MEM_CGROUP_STAT_PGPGOUT_COUNT] = {"pgpgout", 1, },
1761};
1762
1763static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
1764				 struct cgroup_map_cb *cb)
1765{
1766	struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
1767	struct mem_cgroup_stat *stat = &mem_cont->stat;
1768	int i;
1769
1770	for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) {
1771		s64 val;
1772
1773		val = mem_cgroup_read_stat(stat, i);
1774		val *= mem_cgroup_stat_desc[i].unit;
1775		cb->fill(cb, mem_cgroup_stat_desc[i].msg, val);
1776	}
1777	/* showing # of active pages */
1778	{
1779		unsigned long active_anon, inactive_anon;
1780		unsigned long active_file, inactive_file;
1781		unsigned long unevictable;
1782
1783		inactive_anon = mem_cgroup_get_all_zonestat(mem_cont,
1784						LRU_INACTIVE_ANON);
1785		active_anon = mem_cgroup_get_all_zonestat(mem_cont,
1786						LRU_ACTIVE_ANON);
1787		inactive_file = mem_cgroup_get_all_zonestat(mem_cont,
1788						LRU_INACTIVE_FILE);
1789		active_file = mem_cgroup_get_all_zonestat(mem_cont,
1790						LRU_ACTIVE_FILE);
1791		unevictable = mem_cgroup_get_all_zonestat(mem_cont,
1792							LRU_UNEVICTABLE);
1793
1794		cb->fill(cb, "active_anon", (active_anon) * PAGE_SIZE);
1795		cb->fill(cb, "inactive_anon", (inactive_anon) * PAGE_SIZE);
1796		cb->fill(cb, "active_file", (active_file) * PAGE_SIZE);
1797		cb->fill(cb, "inactive_file", (inactive_file) * PAGE_SIZE);
1798		cb->fill(cb, "unevictable", unevictable * PAGE_SIZE);
1799
1800	}
1801	return 0;
1802}
1803
1804
1805static struct cftype mem_cgroup_files[] = {
1806	{
1807		.name = "usage_in_bytes",
1808		.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
1809		.read_u64 = mem_cgroup_read,
1810	},
1811	{
1812		.name = "max_usage_in_bytes",
1813		.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
1814		.trigger = mem_cgroup_reset,
1815		.read_u64 = mem_cgroup_read,
1816	},
1817	{
1818		.name = "limit_in_bytes",
1819		.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
1820		.write_string = mem_cgroup_write,
1821		.read_u64 = mem_cgroup_read,
1822	},
1823	{
1824		.name = "failcnt",
1825		.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
1826		.trigger = mem_cgroup_reset,
1827		.read_u64 = mem_cgroup_read,
1828	},
1829	{
1830		.name = "stat",
1831		.read_map = mem_control_stat_show,
1832	},
1833	{
1834		.name = "force_empty",
1835		.trigger = mem_cgroup_force_empty_write,
1836	},
1837	{
1838		.name = "use_hierarchy",
1839		.write_u64 = mem_cgroup_hierarchy_write,
1840		.read_u64 = mem_cgroup_hierarchy_read,
1841	},
1842};
1843
1844#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
1845static struct cftype memsw_cgroup_files[] = {
1846	{
1847		.name = "memsw.usage_in_bytes",
1848		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
1849		.read_u64 = mem_cgroup_read,
1850	},
1851	{
1852		.name = "memsw.max_usage_in_bytes",
1853		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
1854		.trigger = mem_cgroup_reset,
1855		.read_u64 = mem_cgroup_read,
1856	},
1857	{
1858		.name = "memsw.limit_in_bytes",
1859		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
1860		.write_string = mem_cgroup_write,
1861		.read_u64 = mem_cgroup_read,
1862	},
1863	{
1864		.name = "memsw.failcnt",
1865		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
1866		.trigger = mem_cgroup_reset,
1867		.read_u64 = mem_cgroup_read,
1868	},
1869};
1870
1871static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
1872{
1873	if (!do_swap_account)
1874		return 0;
1875	return cgroup_add_files(cont, ss, memsw_cgroup_files,
1876				ARRAY_SIZE(memsw_cgroup_files));
1877}
1878#else
1879static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
1880{
1881	return 0;
1882}
1883#endif
1884
1885static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
1886{
1887	struct mem_cgroup_per_node *pn;
1888	struct mem_cgroup_per_zone *mz;
1889	enum lru_list l;
1890	int zone, tmp = node;
1891	/*
1892	 * This routine is called for each possible node.
1893	 * But it's a BUG to call kmalloc() for an offline node.
1894	 *
1895	 * TODO: this routine can waste a lot of memory for nodes which will
1896	 *       never be onlined. It would be better to use a memory hotplug
1897	 *       callback function.
1898	 */
1899	if (!node_state(node, N_NORMAL_MEMORY))
1900		tmp = -1;
1901	pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
1902	if (!pn)
1903		return 1;
1904
1905	mem->info.nodeinfo[node] = pn;
1906	memset(pn, 0, sizeof(*pn));
1907
1908	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
1909		mz = &pn->zoneinfo[zone];
1910		for_each_lru(l)
1911			INIT_LIST_HEAD(&mz->lists[l]);
1912	}
1913	return 0;
1914}
1915
1916static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
1917{
1918	kfree(mem->info.nodeinfo[node]);
1919}
1920
1921static int mem_cgroup_size(void)
1922{
1923	int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu);
1924	return sizeof(struct mem_cgroup) + cpustat_size;
1925}
1926
1927static struct mem_cgroup *mem_cgroup_alloc(void)
1928{
1929	struct mem_cgroup *mem;
1930	int size = mem_cgroup_size();
1931
1932	if (size < PAGE_SIZE)
1933		mem = kmalloc(size, GFP_KERNEL);
1934	else
1935		mem = vmalloc(size);
1936
1937	if (mem)
1938		memset(mem, 0, size);
1939	return mem;
1940}
1941
1942/*
1943 * At destroying mem_cgroup, references from swap_cgroup can remain.
1944 * (scanning all at force_empty is too costly...)
1945 *
1946 * Instead of clearing all references at force_empty, we remember
1947 * the number of references from swap_cgroup and free the mem_cgroup when
1948 * it goes down to 0.
1949 *
1950 * When the mem_cgroup is destroyed, mem->obsolete is set to 1 and any
1951 * swap entry which points to this memcg is ignored at swapin.
1952 *
1953 * Removal of cgroup itself succeeds regardless of refs from swap.
1954 */
1955
1956static void mem_cgroup_free(struct mem_cgroup *mem)
1957{
1958	int node;
1959
1960	if (atomic_read(&mem->refcnt) > 0)
1961		return;
1962
1963
1964	for_each_node_state(node, N_POSSIBLE)
1965		free_mem_cgroup_per_zone_info(mem, node);
1966
1967	if (mem_cgroup_size() < PAGE_SIZE)
1968		kfree(mem);
1969	else
1970		vfree(mem);
1971}
1972
1973static void mem_cgroup_get(struct mem_cgroup *mem)
1974{
1975	atomic_inc(&mem->refcnt);
1976}
1977
1978static void mem_cgroup_put(struct mem_cgroup *mem)
1979{
1980	if (atomic_dec_and_test(&mem->refcnt)) {
1981		if (!mem->obsolete)
1982			return;
1983		mem_cgroup_free(mem);
1984	}
1985}
1986
1987
1988#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
1989static void __init enable_swap_cgroup(void)
1990{
1991	if (!mem_cgroup_disabled() && really_do_swap_account)
1992		do_swap_account = 1;
1993}
1994#else
1995static void __init enable_swap_cgroup(void)
1996{
1997}
1998#endif
1999
2000static struct cgroup_subsys_state *
2001mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
2002{
2003	struct mem_cgroup *mem, *parent;
2004	int node;
2005
2006	mem = mem_cgroup_alloc();
2007	if (!mem)
2008		return ERR_PTR(-ENOMEM);
2009
2010	for_each_node_state(node, N_POSSIBLE)
2011		if (alloc_mem_cgroup_per_zone_info(mem, node))
2012			goto free_out;
2013	/* root ? */
2014	if (cont->parent == NULL) {
2015		enable_swap_cgroup();
2016		parent = NULL;
2017	} else {
2018		parent = mem_cgroup_from_cont(cont->parent);
2019		mem->use_hierarchy = parent->use_hierarchy;
2020	}
2021
2022	if (parent && parent->use_hierarchy) {
2023		res_counter_init(&mem->res, &parent->res);
2024		res_counter_init(&mem->memsw, &parent->memsw);
2025	} else {
2026		res_counter_init(&mem->res, NULL);
2027		res_counter_init(&mem->memsw, NULL);
2028	}
2029	mem_cgroup_set_inactive_ratio(mem);
2030	mem->last_scanned_child = NULL;
2031
2032	return &mem->css;
2033free_out:
2034	for_each_node_state(node, N_POSSIBLE)
2035		free_mem_cgroup_per_zone_info(mem, node);
2036	mem_cgroup_free(mem);
2037	return ERR_PTR(-ENOMEM);
2038}
2039
2040static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
2041					struct cgroup *cont)
2042{
2043	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
2044	mem->obsolete = 1;
2045	mem_cgroup_force_empty(mem, false);
2046}
2047
2048static void mem_cgroup_destroy(struct cgroup_subsys *ss,
2049				struct cgroup *cont)
2050{
2051	mem_cgroup_free(mem_cgroup_from_cont(cont));
2052}
2053
2054static int mem_cgroup_populate(struct cgroup_subsys *ss,
2055				struct cgroup *cont)
2056{
2057	int ret;
2058
2059	ret = cgroup_add_files(cont, ss, mem_cgroup_files,
2060				ARRAY_SIZE(mem_cgroup_files));
2061
2062	if (!ret)
2063		ret = register_memsw_files(cont, ss);
2064	return ret;
2065}
2066
2067static void mem_cgroup_move_task(struct cgroup_subsys *ss,
2068				struct cgroup *cont,
2069				struct cgroup *old_cont,
2070				struct task_struct *p)
2071{
2072	/*
2073	 * FIXME: It would be better to move this task's charges from the old
2074	 * memcg to the new one, but that is still on the TODO list.
2075	 */
2076}
2077
2078struct cgroup_subsys mem_cgroup_subsys = {
2079	.name = "memory",
2080	.subsys_id = mem_cgroup_subsys_id,
2081	.create = mem_cgroup_create,
2082	.pre_destroy = mem_cgroup_pre_destroy,
2083	.destroy = mem_cgroup_destroy,
2084	.populate = mem_cgroup_populate,
2085	.attach = mem_cgroup_move_task,
2086	.early_init = 0,
2087};
2088
2089#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
2090
2091static int __init disable_swap_account(char *s)
2092{
2093	really_do_swap_account = 0;
2094	return 1;
2095}
2096__setup("noswapaccount", disable_swap_account);
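/*
 * Usage note (illustrative): booting with "noswapaccount" on the kernel
 * command line clears really_do_swap_account, so enable_swap_cgroup() leaves
 * do_swap_account at 0 and the memsw.* control files are never registered,
 * even when CONFIG_CGROUP_MEM_RES_CTLR_SWAP=y.
 */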
2097#endif
2098