memcontrol.c revision 8a9478ca7f4bcb8945cec7f95d52dae2d5e50cbd
1/* memcontrol.c - Memory Controller
2 *
3 * Copyright IBM Corporation, 2007
4 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
5 *
6 * Copyright 2007 OpenVZ SWsoft Inc
7 * Author: Pavel Emelianov <xemul@openvz.org>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17 * GNU General Public License for more details.
18 */
19
20#include <linux/res_counter.h>
21#include <linux/memcontrol.h>
22#include <linux/cgroup.h>
23#include <linux/mm.h>
24#include <linux/pagemap.h>
25#include <linux/smp.h>
26#include <linux/page-flags.h>
27#include <linux/backing-dev.h>
28#include <linux/bit_spinlock.h>
29#include <linux/rcupdate.h>
30#include <linux/limits.h>
31#include <linux/mutex.h>
32#include <linux/slab.h>
33#include <linux/swap.h>
34#include <linux/spinlock.h>
35#include <linux/fs.h>
36#include <linux/seq_file.h>
37#include <linux/vmalloc.h>
38#include <linux/mm_inline.h>
39#include <linux/page_cgroup.h>
40#include "internal.h"
41
42#include <asm/uaccess.h>
43
44struct cgroup_subsys mem_cgroup_subsys __read_mostly;
45#define MEM_CGROUP_RECLAIM_RETRIES	5
46
47#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
48/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
49int do_swap_account __read_mostly;
50static int really_do_swap_account __initdata = 1; /* for remembering the boot option */
51#else
52#define do_swap_account		(0)
53#endif
54
55static DEFINE_MUTEX(memcg_tasklist);	/* can be held under cgroup_mutex */
56
57/*
58 * Statistics for memory cgroup.
59 */
60enum mem_cgroup_stat_index {
61	/*
62	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
63	 */
64	MEM_CGROUP_STAT_CACHE, 	   /* # of pages charged as cache */
65	MEM_CGROUP_STAT_RSS,	   /* # of pages charged as anon rss */
66	MEM_CGROUP_STAT_MAPPED_FILE,  /* # of pages charged as file rss */
67	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
68	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */
69
70	MEM_CGROUP_STAT_NSTATS,
71};
72
73struct mem_cgroup_stat_cpu {
74	s64 count[MEM_CGROUP_STAT_NSTATS];
75} ____cacheline_aligned_in_smp;
76
77struct mem_cgroup_stat {
78	struct mem_cgroup_stat_cpu cpustat[0];
79};
80
81/*
82 * For accounting under irq disable, there is no need to increment the preempt count.
83 */
84static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat,
85		enum mem_cgroup_stat_index idx, int val)
86{
87	stat->count[idx] += val;
88}
89
90static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
91		enum mem_cgroup_stat_index idx)
92{
93	int cpu;
94	s64 ret = 0;
95	for_each_possible_cpu(cpu)
96		ret += stat->cpustat[cpu].count[idx];
97	return ret;
98}
99
100static s64 mem_cgroup_local_usage(struct mem_cgroup_stat *stat)
101{
102	s64 ret;
103
104	ret = mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_CACHE);
105	ret += mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_RSS);
106	return ret;
107}
108
109/*
110 * per-zone information in memory controller.
111 */
112struct mem_cgroup_per_zone {
113	/*
114	 * spin_lock to protect the per cgroup LRU
115	 */
116	struct list_head	lists[NR_LRU_LISTS];
117	unsigned long		count[NR_LRU_LISTS];
118
119	struct zone_reclaim_stat reclaim_stat;
120};
121/* Macro for accessing counter */
122#define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])
123
124struct mem_cgroup_per_node {
125	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
126};
127
128struct mem_cgroup_lru_info {
129	struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
130};
131
132/*
133 * The memory controller data structure. The memory controller controls both
134 * page cache and RSS per cgroup. We would eventually like to provide
135 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
136 * to help the administrator determine what knobs to tune.
137 *
138 * TODO: Add a water mark for the memory controller. Reclaim will begin when
139 * we hit the water mark. Maybe even add a low water mark, such that
140 * no reclaim occurs from a cgroup at its low water mark; this is
141 * a feature that will be implemented much later.
142 */
143struct mem_cgroup {
144	struct cgroup_subsys_state css;
145	/*
146	 * the counter to account for memory usage
147	 */
148	struct res_counter res;
149	/*
150	 * the counter to account for mem+swap usage.
151	 */
152	struct res_counter memsw;
153	/*
154	 * Per cgroup active and inactive list, similar to the
155	 * per zone LRU lists.
156	 */
157	struct mem_cgroup_lru_info info;
158
159	/*
160	 * Protects reclaim-related members.
161	 */
162	spinlock_t reclaim_param_lock;
163
164	int	prev_priority;	/* for recording reclaim priority */
165
166	/*
167	 * While reclaiming in a hierarchy, we cache the last child we
168	 * reclaimed from.
169	 */
170	int last_scanned_child;
171	/*
172	 * Should the accounting and control be hierarchical, per subtree?
173	 */
174	bool use_hierarchy;
175	unsigned long	last_oom_jiffies;
176	atomic_t	refcnt;
177
178	unsigned int	swappiness;
179
180	/*
181	 * statistics. This must be placed at the end of memcg.
182	 */
183	struct mem_cgroup_stat stat;
184};
185
186enum charge_type {
187	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
188	MEM_CGROUP_CHARGE_TYPE_MAPPED,
189	MEM_CGROUP_CHARGE_TYPE_SHMEM,	/* used by page migration of shmem */
190	MEM_CGROUP_CHARGE_TYPE_FORCE,	/* used by force_empty */
191	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
192	MEM_CGROUP_CHARGE_TYPE_DROP,	/* a page was unused swap cache */
193	NR_CHARGE_TYPE,
194};
195
196/* only for here (for easy reading.) */
197#define PCGF_CACHE	(1UL << PCG_CACHE)
198#define PCGF_USED	(1UL << PCG_USED)
199#define PCGF_LOCK	(1UL << PCG_LOCK)
200static const unsigned long
201pcg_default_flags[NR_CHARGE_TYPE] = {
202	PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */
203	PCGF_USED | PCGF_LOCK, /* Anon */
204	PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
205	0, /* FORCE */
206};
207
208/* for encoding cft->private value on file */
209#define _MEM			(0)
210#define _MEMSWAP		(1)
211#define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
212#define MEMFILE_TYPE(val)	(((val) >> 16) & 0xffff)
213#define MEMFILE_ATTR(val)	((val) & 0xffff)
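/*
 * Example: cft->private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT) packs the
 * counter type into the upper 16 bits and the res_counter attribute into the
 * lower 16 bits; MEMFILE_TYPE()/MEMFILE_ATTR() recover the two halves.
 */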
214
215static void mem_cgroup_get(struct mem_cgroup *mem);
216static void mem_cgroup_put(struct mem_cgroup *mem);
217static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
218
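/*
 * Update the per-cpu statistics of @mem for the page described by @pc:
 * the cache or rss counter and the pgpgin/pgpgout counters, depending
 * on whether this is a charge or an uncharge.
 */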
219static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
220					 struct page_cgroup *pc,
221					 bool charge)
222{
223	int val = (charge)? 1 : -1;
224	struct mem_cgroup_stat *stat = &mem->stat;
225	struct mem_cgroup_stat_cpu *cpustat;
226	int cpu = get_cpu();
227
228	cpustat = &stat->cpustat[cpu];
229	if (PageCgroupCache(pc))
230		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
231	else
232		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val);
233
234	if (charge)
235		__mem_cgroup_stat_add_safe(cpustat,
236				MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
237	else
238		__mem_cgroup_stat_add_safe(cpustat,
239				MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
240	put_cpu();
241}
242
243static struct mem_cgroup_per_zone *
244mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
245{
246	return &mem->info.nodeinfo[nid]->zoneinfo[zid];
247}
248
249static struct mem_cgroup_per_zone *
250page_cgroup_zoneinfo(struct page_cgroup *pc)
251{
252	struct mem_cgroup *mem = pc->mem_cgroup;
253	int nid = page_cgroup_nid(pc);
254	int zid = page_cgroup_zid(pc);
255
256	if (!mem)
257		return NULL;
258
259	return mem_cgroup_zoneinfo(mem, nid, zid);
260}
261
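/* Sum this memcg's per-zone LRU counter @idx over all online nodes and zones. */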
262static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
263					enum lru_list idx)
264{
265	int nid, zid;
266	struct mem_cgroup_per_zone *mz;
267	u64 total = 0;
268
269	for_each_online_node(nid)
270		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
271			mz = mem_cgroup_zoneinfo(mem, nid, zid);
272			total += MEM_CGROUP_ZSTAT(mz, idx);
273		}
274	return total;
275}
276
277static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
278{
279	return container_of(cgroup_subsys_state(cont,
280				mem_cgroup_subsys_id), struct mem_cgroup,
281				css);
282}
283
284struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
285{
286	/*
287	 * mm_update_next_owner() may clear mm->owner to NULL
288	 * if it races with swapoff, page migration, etc.
289	 * So this can be called with p == NULL.
290	 */
291	if (unlikely(!p))
292		return NULL;
293
294	return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
295				struct mem_cgroup, css);
296}
297
298static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
299{
300	struct mem_cgroup *mem = NULL;
301
302	if (!mm)
303		return NULL;
304	/*
305	 * Because we have no locks, mm->owner may be being moved to another
306	 * cgroup. We use css_tryget() here even though this looks
307	 * pessimistic (rather than adding locks here).
308	 */
309	rcu_read_lock();
310	do {
311		mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
312		if (unlikely(!mem))
313			break;
314	} while (!css_tryget(&mem->css));
315	rcu_read_unlock();
316	return mem;
317}
318
319/*
320 * Call the callback function against all cgroups under the hierarchy tree.
321 */
322static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
323			  int (*func)(struct mem_cgroup *, void *))
324{
325	int found, ret, nextid;
326	struct cgroup_subsys_state *css;
327	struct mem_cgroup *mem;
328
329	if (!root->use_hierarchy)
330		return (*func)(root, data);
331
332	nextid = 1;
333	do {
334		ret = 0;
335		mem = NULL;
336
337		rcu_read_lock();
338		css = css_get_next(&mem_cgroup_subsys, nextid, &root->css,
339				   &found);
340		if (css && css_tryget(css))
341			mem = container_of(css, struct mem_cgroup, css);
342		rcu_read_unlock();
343
344		if (mem) {
345			ret = (*func)(mem, data);
346			css_put(&mem->css);
347		}
348		nextid = found + 1;
349	} while (!ret && css);
350
351	return ret;
352}
353
354/*
355 * The following LRU functions are allowed to be used without PCG_LOCK.
356 * Operations are called by routines of the global LRU independently of memcg.
357 * What we have to take care of here is the validity of pc->mem_cgroup.
358 *
359 * Changes to pc->mem_cgroup happen when:
360 * 1. charge
361 * 2. moving account
362 * In the typical case, "charge" is done before add-to-lru. The exception is
363 * SwapCache; it is added to the LRU before being charged.
364 * If the PCG_USED bit is not set, the page_cgroup is not added to this private LRU.
365 * When moving an account, the page is not on the LRU; it is isolated.
366 */
367
368void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
369{
370	struct page_cgroup *pc;
371	struct mem_cgroup *mem;
372	struct mem_cgroup_per_zone *mz;
373
374	if (mem_cgroup_disabled())
375		return;
376	pc = lookup_page_cgroup(page);
377	/* can happen while we handle swapcache. */
378	if (list_empty(&pc->lru) || !pc->mem_cgroup)
379		return;
380	/*
381	 * We don't check PCG_USED bit. It's cleared when the "page" is finally
382	 * removed from global LRU.
383	 */
384	mz = page_cgroup_zoneinfo(pc);
385	mem = pc->mem_cgroup;
386	MEM_CGROUP_ZSTAT(mz, lru) -= 1;
387	list_del_init(&pc->lru);
388	return;
389}
390
391void mem_cgroup_del_lru(struct page *page)
392{
393	mem_cgroup_del_lru_list(page, page_lru(page));
394}
395
396void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
397{
398	struct mem_cgroup_per_zone *mz;
399	struct page_cgroup *pc;
400
401	if (mem_cgroup_disabled())
402		return;
403
404	pc = lookup_page_cgroup(page);
405	/*
406	 * Used bit is set without atomic ops but after smp_wmb().
407	 * For making pc->mem_cgroup visible, insert smp_rmb() here.
408	 */
409	smp_rmb();
410	/* unused page is not rotated. */
411	if (!PageCgroupUsed(pc))
412		return;
413	mz = page_cgroup_zoneinfo(pc);
414	list_move(&pc->lru, &mz->lists[lru]);
415}
416
417void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
418{
419	struct page_cgroup *pc;
420	struct mem_cgroup_per_zone *mz;
421
422	if (mem_cgroup_disabled())
423		return;
424	pc = lookup_page_cgroup(page);
425	/*
426	 * Used bit is set without atomic ops but after smp_wmb().
427	 * For making pc->mem_cgroup visible, insert smp_rmb() here.
428	 */
429	smp_rmb();
430	if (!PageCgroupUsed(pc))
431		return;
432
433	mz = page_cgroup_zoneinfo(pc);
434	MEM_CGROUP_ZSTAT(mz, lru) += 1;
435	list_add(&pc->lru, &mz->lists[lru]);
436}
437
438/*
439 * When handling SwapCache, pc->mem_cgroup may be changed while it's linked to
440 * the LRU because the page may be reused after it's fully uncharged (because of
441 * SwapCache behavior). To handle that, unlink the page_cgroup from the LRU when
442 * charging it again. This function is only used to charge SwapCache. It's done
443 * under lock_page() and it is expected that zone->lru_lock is never held.
444 */
445static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page)
446{
447	unsigned long flags;
448	struct zone *zone = page_zone(page);
449	struct page_cgroup *pc = lookup_page_cgroup(page);
450
451	spin_lock_irqsave(&zone->lru_lock, flags);
452	/*
453	 * Forget old LRU when this page_cgroup is *not* used. This Used bit
454	 * is guarded by lock_page() because the page is SwapCache.
455	 */
456	if (!PageCgroupUsed(pc))
457		mem_cgroup_del_lru_list(page, page_lru(page));
458	spin_unlock_irqrestore(&zone->lru_lock, flags);
459}
460
461static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page)
462{
463	unsigned long flags;
464	struct zone *zone = page_zone(page);
465	struct page_cgroup *pc = lookup_page_cgroup(page);
466
467	spin_lock_irqsave(&zone->lru_lock, flags);
468	/* link when the page is linked to LRU but page_cgroup isn't */
469	if (PageLRU(page) && list_empty(&pc->lru))
470		mem_cgroup_add_lru_list(page, page_lru(page));
471	spin_unlock_irqrestore(&zone->lru_lock, flags);
472}
473
474
475void mem_cgroup_move_lists(struct page *page,
476			   enum lru_list from, enum lru_list to)
477{
478	if (mem_cgroup_disabled())
479		return;
480	mem_cgroup_del_lru_list(page, from);
481	mem_cgroup_add_lru_list(page, to);
482}
483
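/*
 * Returns non-zero if @task's mm is charged to @mem itself or, with
 * use_hierarchy, to a memcg within @mem's subtree.
 */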
484int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
485{
486	int ret;
487	struct mem_cgroup *curr = NULL;
488
489	task_lock(task);
490	rcu_read_lock();
491	curr = try_get_mem_cgroup_from_mm(task->mm);
492	rcu_read_unlock();
493	task_unlock(task);
494	if (!curr)
495		return 0;
496	if (curr->use_hierarchy)
497		ret = css_is_ancestor(&curr->css, &mem->css);
498	else
499		ret = (curr == mem);
500	css_put(&curr->css);
501	return ret;
502}
503
504/*
505 * prev_priority control... this will be used in the memory reclaim path.
506 */
507int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
508{
509	int prev_priority;
510
511	spin_lock(&mem->reclaim_param_lock);
512	prev_priority = mem->prev_priority;
513	spin_unlock(&mem->reclaim_param_lock);
514
515	return prev_priority;
516}
517
518void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
519{
520	spin_lock(&mem->reclaim_param_lock);
521	if (priority < mem->prev_priority)
522		mem->prev_priority = priority;
523	spin_unlock(&mem->reclaim_param_lock);
524}
525
526void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
527{
528	spin_lock(&mem->reclaim_param_lock);
529	mem->prev_priority = priority;
530	spin_unlock(&mem->reclaim_param_lock);
531}
532
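/*
 * The inactive_ratio is roughly int_sqrt(10 * anon-size-in-GB), with a minimum
 * of 1: a cgroup with 1GB of anon gets a ratio of 3, so its inactive anon is
 * considered low when active > inactive * 3.
 */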
533static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
534{
535	unsigned long active;
536	unsigned long inactive;
537	unsigned long gb;
538	unsigned long inactive_ratio;
539
540	inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON);
541	active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON);
542
543	gb = (inactive + active) >> (30 - PAGE_SHIFT);
544	if (gb)
545		inactive_ratio = int_sqrt(10 * gb);
546	else
547		inactive_ratio = 1;
548
549	if (present_pages) {
550		present_pages[0] = inactive;
551		present_pages[1] = active;
552	}
553
554	return inactive_ratio;
555}
556
557int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
558{
559	unsigned long active;
560	unsigned long inactive;
561	unsigned long present_pages[2];
562	unsigned long inactive_ratio;
563
564	inactive_ratio = calc_inactive_ratio(memcg, present_pages);
565
566	inactive = present_pages[0];
567	active = present_pages[1];
568
569	if (inactive * inactive_ratio < active)
570		return 1;
571
572	return 0;
573}
574
575int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
576{
577	unsigned long active;
578	unsigned long inactive;
579
580	inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE);
581	active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE);
582
583	return (active > inactive);
584}
585
586unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
587				       struct zone *zone,
588				       enum lru_list lru)
589{
590	int nid = zone->zone_pgdat->node_id;
591	int zid = zone_idx(zone);
592	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
593
594	return MEM_CGROUP_ZSTAT(mz, lru);
595}
596
597struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
598						      struct zone *zone)
599{
600	int nid = zone->zone_pgdat->node_id;
601	int zid = zone_idx(zone);
602	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
603
604	return &mz->reclaim_stat;
605}
606
607struct zone_reclaim_stat *
608mem_cgroup_get_reclaim_stat_from_page(struct page *page)
609{
610	struct page_cgroup *pc;
611	struct mem_cgroup_per_zone *mz;
612
613	if (mem_cgroup_disabled())
614		return NULL;
615
616	pc = lookup_page_cgroup(page);
617	/*
618	 * Used bit is set without atomic ops but after smp_wmb().
619	 * For making pc->mem_cgroup visible, insert smp_rmb() here.
620	 */
621	smp_rmb();
622	if (!PageCgroupUsed(pc))
623		return NULL;
624
625	mz = page_cgroup_zoneinfo(pc);
626	if (!mz)
627		return NULL;
628
629	return &mz->reclaim_stat;
630}
631
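/*
 * Scan up to @nr_to_scan page_cgroups on this memcg's (zone, lru) list and
 * move the pages accepted by __isolate_lru_page() to @dst. Returns the number
 * of pages taken and stores the number scanned in *scanned.
 */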
632unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
633					struct list_head *dst,
634					unsigned long *scanned, int order,
635					int mode, struct zone *z,
636					struct mem_cgroup *mem_cont,
637					int active, int file)
638{
639	unsigned long nr_taken = 0;
640	struct page *page;
641	unsigned long scan;
642	LIST_HEAD(pc_list);
643	struct list_head *src;
644	struct page_cgroup *pc, *tmp;
645	int nid = z->zone_pgdat->node_id;
646	int zid = zone_idx(z);
647	struct mem_cgroup_per_zone *mz;
648	int lru = LRU_FILE * !!file + !!active;
649
650	BUG_ON(!mem_cont);
651	mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
652	src = &mz->lists[lru];
653
654	scan = 0;
655	list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
656		if (scan >= nr_to_scan)
657			break;
658
659		page = pc->page;
660		if (unlikely(!PageCgroupUsed(pc)))
661			continue;
662		if (unlikely(!PageLRU(page)))
663			continue;
664
665		scan++;
666		if (__isolate_lru_page(page, mode, file) == 0) {
667			list_move(&page->lru, dst);
668			nr_taken++;
669		}
670	}
671
672	*scanned = scan;
673	return nr_taken;
674}
675
676#define mem_cgroup_from_res_counter(counter, member)	\
677	container_of(counter, struct mem_cgroup, member)
678
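/*
 * Returns true when usage is below the memory limit (and, with swap
 * accounting enabled, also below the mem+swap limit).
 */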
679static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
680{
681	if (do_swap_account) {
682		if (res_counter_check_under_limit(&mem->res) &&
683			res_counter_check_under_limit(&mem->memsw))
684			return true;
685	} else
686		if (res_counter_check_under_limit(&mem->res))
687			return true;
688	return false;
689}
690
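/* Per-memcg swappiness; the root cgroup uses the global vm_swappiness. */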
691static unsigned int get_swappiness(struct mem_cgroup *memcg)
692{
693	struct cgroup *cgrp = memcg->css.cgroup;
694	unsigned int swappiness;
695
696	/* root ? */
697	if (cgrp->parent == NULL)
698		return vm_swappiness;
699
700	spin_lock(&memcg->reclaim_param_lock);
701	swappiness = memcg->swappiness;
702	spin_unlock(&memcg->reclaim_param_lock);
703
704	return swappiness;
705}
706
707static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
708{
709	int *val = data;
710	(*val)++;
711	return 0;
712}
713
714/**
715 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
716 * @memcg: The memory cgroup that went over limit
717 * @p: Task that is going to be killed
718 *
719 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
720 * enabled
721 */
722void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
723{
724	struct cgroup *task_cgrp;
725	struct cgroup *mem_cgrp;
726	/*
727	 * Need a buffer in BSS, can't rely on allocations. The code relies
728	 * on the assumption that OOM is serialized for memory controller.
729	 * If this assumption is broken, revisit this code.
730	 */
731	static char memcg_name[PATH_MAX];
732	int ret;
733
734	if (!memcg)
735		return;
736
737
738	rcu_read_lock();
739
740	mem_cgrp = memcg->css.cgroup;
741	task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
742
743	ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
744	if (ret < 0) {
745		/*
746		 * Unfortunately, we are unable to convert to a useful name,
747		 * but we'll still print out the usage information.
748		 */
749		rcu_read_unlock();
750		goto done;
751	}
752	rcu_read_unlock();
753
754	printk(KERN_INFO "Task in %s killed", memcg_name);
755
756	rcu_read_lock();
757	ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
758	if (ret < 0) {
759		rcu_read_unlock();
760		goto done;
761	}
762	rcu_read_unlock();
763
764	/*
765	 * Continues from above, so we don't need a KERN_ level
766	 */
767	printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
768done:
769
770	printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
771		res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
772		res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
773		res_counter_read_u64(&memcg->res, RES_FAILCNT));
774	printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
775		"failcnt %llu\n",
776		res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
777		res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
778		res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
779}
780
781/*
782 * This function returns the number of memcgs under the hierarchy tree. Returns
783 * 1 (self count) if there are no children.
784 */
785static int mem_cgroup_count_children(struct mem_cgroup *mem)
786{
787	int num = 0;
788	mem_cgroup_walk_tree(mem, &num, mem_cgroup_count_children_cb);
789	return num;
790}
791
792/*
793 * Visit the first child (need not be the first child as per the ordering
794 * of the cgroup list, since we track last_scanned_child) of @mem and use
795 * that to reclaim free pages from.
796 */
797static struct mem_cgroup *
798mem_cgroup_select_victim(struct mem_cgroup *root_mem)
799{
800	struct mem_cgroup *ret = NULL;
801	struct cgroup_subsys_state *css;
802	int nextid, found;
803
804	if (!root_mem->use_hierarchy) {
805		css_get(&root_mem->css);
806		ret = root_mem;
807	}
808
809	while (!ret) {
810		rcu_read_lock();
811		nextid = root_mem->last_scanned_child + 1;
812		css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css,
813				   &found);
814		if (css && css_tryget(css))
815			ret = container_of(css, struct mem_cgroup, css);
816
817		rcu_read_unlock();
818		/* Updates scanning parameter */
819		spin_lock(&root_mem->reclaim_param_lock);
820		if (!css) {
821			/* this means start scan from ID:1 */
822			root_mem->last_scanned_child = 0;
823		} else
824			root_mem->last_scanned_child = found;
825		spin_unlock(&root_mem->reclaim_param_lock);
826	}
827
828	return ret;
829}
830
831/*
832 * Scan the hierarchy if needed to reclaim memory. We remember the last child
833 * we reclaimed from, so that we don't end up penalizing one child extensively
834 * based on its position in the children list.
835 *
836 * root_mem is the original ancestor that we've been reclaiming from.
837 *
838 * We give up and return to the caller when we visit root_mem twice.
839 * (other groups can be removed while we're walking....)
840 *
841 * If shrink==true, this returns immediately to avoid freeing too much.
842 */
843static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
844				   gfp_t gfp_mask, bool noswap, bool shrink)
845{
846	struct mem_cgroup *victim;
847	int ret, total = 0;
848	int loop = 0;
849
850	while (loop < 2) {
851		victim = mem_cgroup_select_victim(root_mem);
852		if (victim == root_mem)
853			loop++;
854		if (!mem_cgroup_local_usage(&victim->stat)) {
855			/* this cgroup's local usage == 0 */
856			css_put(&victim->css);
857			continue;
858		}
859		/* we use swappiness of local cgroup */
860		ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, noswap,
861						   get_swappiness(victim));
862		css_put(&victim->css);
863		/*
864		 * When shrinking usage, we can't check whether we should stop here
865		 * or reclaim more; it depends on the caller. last_scanned_child
866		 * will be enough to keep fairness under the tree.
867		 */
868		if (shrink)
869			return ret;
870		total += ret;
871		if (mem_cgroup_check_under_limit(root_mem))
872			return 1 + total;
873	}
874	return total;
875}
876
877bool mem_cgroup_oom_called(struct task_struct *task)
878{
879	bool ret = false;
880	struct mem_cgroup *mem;
881	struct mm_struct *mm;
882
883	rcu_read_lock();
884	mm = task->mm;
885	if (!mm)
886		mm = &init_mm;
887	mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
888	if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10))
889		ret = true;
890	rcu_read_unlock();
891	return ret;
892}
893
894static int record_last_oom_cb(struct mem_cgroup *mem, void *data)
895{
896	mem->last_oom_jiffies = jiffies;
897	return 0;
898}
899
900static void record_last_oom(struct mem_cgroup *mem)
901{
902	mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb);
903}
904
905/*
906 * Currently used to update mapped file statistics, but the routine can be
907 * generalized to update other statistics as well.
908 */
909void mem_cgroup_update_mapped_file_stat(struct page *page, int val)
910{
911	struct mem_cgroup *mem;
912	struct mem_cgroup_stat *stat;
913	struct mem_cgroup_stat_cpu *cpustat;
914	int cpu;
915	struct page_cgroup *pc;
916
917	if (!page_is_file_cache(page))
918		return;
919
920	pc = lookup_page_cgroup(page);
921	if (unlikely(!pc))
922		return;
923
924	lock_page_cgroup(pc);
925	mem = pc->mem_cgroup;
926	if (!mem)
927		goto done;
928
929	if (!PageCgroupUsed(pc))
930		goto done;
931
932	/*
933	 * Preemption is already disabled, we don't need get_cpu()
934	 */
935	cpu = smp_processor_id();
936	stat = &mem->stat;
937	cpustat = &stat->cpustat[cpu];
938
939	__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, val);
940done:
941	unlock_page_cgroup(pc);
942}
943
944/*
945 * Unlike the exported interface, an "oom" parameter is added. If oom==true,
946 * the OOM killer can be invoked.
947 */
948static int __mem_cgroup_try_charge(struct mm_struct *mm,
949			gfp_t gfp_mask, struct mem_cgroup **memcg,
950			bool oom)
951{
952	struct mem_cgroup *mem, *mem_over_limit;
953	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
954	struct res_counter *fail_res;
955
956	if (unlikely(test_thread_flag(TIF_MEMDIE))) {
957		/* Don't account this! */
958		*memcg = NULL;
959		return 0;
960	}
961
962	/*
963	 * We always charge the cgroup the mm_struct belongs to.
964	 * The mm_struct's mem_cgroup changes on task migration if the
965	 * thread group leader migrates. It's possible that mm is not
966	 * set, if so charge the init_mm (happens for pagecache usage).
967	 */
968	mem = *memcg;
969	if (likely(!mem)) {
970		mem = try_get_mem_cgroup_from_mm(mm);
971		*memcg = mem;
972	} else {
973		css_get(&mem->css);
974	}
975	if (unlikely(!mem))
976		return 0;
977
978	VM_BUG_ON(css_is_removed(&mem->css));
979
980	while (1) {
981		int ret;
982		bool noswap = false;
983
984		ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res);
985		if (likely(!ret)) {
986			if (!do_swap_account)
987				break;
988			ret = res_counter_charge(&mem->memsw, PAGE_SIZE,
989							&fail_res);
990			if (likely(!ret))
991				break;
992			/* mem+swap counter fails */
993			res_counter_uncharge(&mem->res, PAGE_SIZE);
994			noswap = true;
995			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
996									memsw);
997		} else
998			/* mem counter fails */
999			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
1000									res);
1001
1002		if (!(gfp_mask & __GFP_WAIT))
1003			goto nomem;
1004
1005		ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask,
1006							noswap, false);
1007		if (ret)
1008			continue;
1009
1010		/*
1011		 * try_to_free_mem_cgroup_pages() might not give us a full
1012		 * picture of reclaim. Some pages are reclaimed and might be
1013		 * moved to swap cache or just unmapped from the cgroup.
1014		 * Check the limit again to see if the reclaim reduced the
1015		 * current usage of the cgroup before giving up
1016		 *
1017		 */
1018		if (mem_cgroup_check_under_limit(mem_over_limit))
1019			continue;
1020
1021		if (!nr_retries--) {
1022			if (oom) {
1023				mutex_lock(&memcg_tasklist);
1024				mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
1025				mutex_unlock(&memcg_tasklist);
1026				record_last_oom(mem_over_limit);
1027			}
1028			goto nomem;
1029		}
1030	}
1031	return 0;
1032nomem:
1033	css_put(&mem->css);
1034	return -ENOMEM;
1035}
1036
1037
1038/*
1039 * A helper function to get a mem_cgroup from its ID. Must be called under
1040 * rcu_read_lock(). The caller must check css_is_removed() or the like if
1041 * that is a concern. (Dropping a refcnt from swap can be called against a
1042 * removed memcg.)
1043 */
1044static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
1045{
1046	struct cgroup_subsys_state *css;
1047
1048	/* ID 0 is unused ID */
1049	if (!id)
1050		return NULL;
1051	css = css_lookup(&mem_cgroup_subsys, id);
1052	if (!css)
1053		return NULL;
1054	return container_of(css, struct mem_cgroup, css);
1055}
1056
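/*
 * Find the memcg a swap-cache page is charged to: from pc->mem_cgroup while
 * the page_cgroup is in use, otherwise from the swap_cgroup record of the
 * swap entry. A css reference is taken via css_tryget() on success.
 */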
1057static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page)
1058{
1059	struct mem_cgroup *mem;
1060	struct page_cgroup *pc;
1061	unsigned short id;
1062	swp_entry_t ent;
1063
1064	VM_BUG_ON(!PageLocked(page));
1065
1066	if (!PageSwapCache(page))
1067		return NULL;
1068
1069	pc = lookup_page_cgroup(page);
1070	lock_page_cgroup(pc);
1071	if (PageCgroupUsed(pc)) {
1072		mem = pc->mem_cgroup;
1073		if (mem && !css_tryget(&mem->css))
1074			mem = NULL;
1075	} else {
1076		ent.val = page_private(page);
1077		id = lookup_swap_cgroup(ent);
1078		rcu_read_lock();
1079		mem = mem_cgroup_lookup(id);
1080		if (mem && !css_tryget(&mem->css))
1081			mem = NULL;
1082		rcu_read_unlock();
1083	}
1084	unlock_page_cgroup(pc);
1085	return mem;
1086}
1087
1088/*
1089 * Commit a charge obtained by __mem_cgroup_try_charge() and move the page_cgroup
1090 * to the USED state. If it is already USED, uncharge and return.
1091 */
1092
1093static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
1094				     struct page_cgroup *pc,
1095				     enum charge_type ctype)
1096{
1097	/* try_charge() can return NULL to *memcg; that case is handled here. */
1098	if (!mem)
1099		return;
1100
1101	lock_page_cgroup(pc);
1102	if (unlikely(PageCgroupUsed(pc))) {
1103		unlock_page_cgroup(pc);
1104		res_counter_uncharge(&mem->res, PAGE_SIZE);
1105		if (do_swap_account)
1106			res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1107		css_put(&mem->css);
1108		return;
1109	}
1110	pc->mem_cgroup = mem;
1111	smp_wmb();
1112	pc->flags = pcg_default_flags[ctype];
1113
1114	mem_cgroup_charge_statistics(mem, pc, true);
1115
1116	unlock_page_cgroup(pc);
1117}
1118
1119/**
1120 * mem_cgroup_move_account - move account of the page
1121 * @pc:	page_cgroup of the page.
1122 * @from: mem_cgroup which the page is moved from.
1123 * @to:	mem_cgroup which the page is moved to. @from != @to.
1124 *
1125 * The caller must confirm the following:
1126 * - the page is not on the LRU (isolate_lru_page() is useful.)
1127 *
1128 * Returns 0 on success,
1129 * returns -EBUSY when the lock is busy or "pc" is unstable.
1130 *
1131 * This function does "uncharge" from old cgroup but doesn't do "charge" to
1132 * new cgroup. It should be done by a caller.
1133 */
1134
1135static int mem_cgroup_move_account(struct page_cgroup *pc,
1136	struct mem_cgroup *from, struct mem_cgroup *to)
1137{
1138	struct mem_cgroup_per_zone *from_mz, *to_mz;
1139	int nid, zid;
1140	int ret = -EBUSY;
1141	struct page *page;
1142	int cpu;
1143	struct mem_cgroup_stat *stat;
1144	struct mem_cgroup_stat_cpu *cpustat;
1145
1146	VM_BUG_ON(from == to);
1147	VM_BUG_ON(PageLRU(pc->page));
1148
1149	nid = page_cgroup_nid(pc);
1150	zid = page_cgroup_zid(pc);
1151	from_mz =  mem_cgroup_zoneinfo(from, nid, zid);
1152	to_mz =  mem_cgroup_zoneinfo(to, nid, zid);
1153
1154	if (!trylock_page_cgroup(pc))
1155		return ret;
1156
1157	if (!PageCgroupUsed(pc))
1158		goto out;
1159
1160	if (pc->mem_cgroup != from)
1161		goto out;
1162
1163	res_counter_uncharge(&from->res, PAGE_SIZE);
1164	mem_cgroup_charge_statistics(from, pc, false);
1165
1166	page = pc->page;
1167	if (page_is_file_cache(page) && page_mapped(page)) {
1168		cpu = smp_processor_id();
1169		/* Update mapped_file data for mem_cgroup "from" */
1170		stat = &from->stat;
1171		cpustat = &stat->cpustat[cpu];
1172		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE,
1173						-1);
1174
1175		/* Update mapped_file data for mem_cgroup "to" */
1176		stat = &to->stat;
1177		cpustat = &stat->cpustat[cpu];
1178		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE,
1179						1);
1180	}
1181
1182	if (do_swap_account)
1183		res_counter_uncharge(&from->memsw, PAGE_SIZE);
1184	css_put(&from->css);
1185
1186	css_get(&to->css);
1187	pc->mem_cgroup = to;
1188	mem_cgroup_charge_statistics(to, pc, true);
1189	ret = 0;
1190out:
1191	unlock_page_cgroup(pc);
1192	return ret;
1193}
1194
1195/*
1196 * move charges to its parent.
1197 */
1198
1199static int mem_cgroup_move_parent(struct page_cgroup *pc,
1200				  struct mem_cgroup *child,
1201				  gfp_t gfp_mask)
1202{
1203	struct page *page = pc->page;
1204	struct cgroup *cg = child->css.cgroup;
1205	struct cgroup *pcg = cg->parent;
1206	struct mem_cgroup *parent;
1207	int ret;
1208
1209	/* Is ROOT ? */
1210	if (!pcg)
1211		return -EINVAL;
1212
1213
1214	parent = mem_cgroup_from_cont(pcg);
1215
1216
1217	ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
1218	if (ret || !parent)
1219		return ret;
1220
1221	if (!get_page_unless_zero(page)) {
1222		ret = -EBUSY;
1223		goto uncharge;
1224	}
1225
1226	ret = isolate_lru_page(page);
1227
1228	if (ret)
1229		goto cancel;
1230
1231	ret = mem_cgroup_move_account(pc, child, parent);
1232
1233	putback_lru_page(page);
1234	if (!ret) {
1235		put_page(page);
1236		/* drop extra refcnt by try_charge() */
1237		css_put(&parent->css);
1238		return 0;
1239	}
1240
1241cancel:
1242	put_page(page);
1243uncharge:
1244	/* drop extra refcnt by try_charge() */
1245	css_put(&parent->css);
1246	/* uncharge if move fails */
1247	res_counter_uncharge(&parent->res, PAGE_SIZE);
1248	if (do_swap_account)
1249		res_counter_uncharge(&parent->memsw, PAGE_SIZE);
1250	return ret;
1251}
1252
1253/*
1254 * Charge the memory controller for page usage.
1255 * Return
1256 * 0 if the charge was successful
1257 * < 0 if the cgroup is over its limit
1258 */
1259static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
1260				gfp_t gfp_mask, enum charge_type ctype,
1261				struct mem_cgroup *memcg)
1262{
1263	struct mem_cgroup *mem;
1264	struct page_cgroup *pc;
1265	int ret;
1266
1267	pc = lookup_page_cgroup(page);
1268	/* can happen at boot */
1269	if (unlikely(!pc))
1270		return 0;
1271	prefetchw(pc);
1272
1273	mem = memcg;
1274	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
1275	if (ret || !mem)
1276		return ret;
1277
1278	__mem_cgroup_commit_charge(mem, pc, ctype);
1279	return 0;
1280}
1281
1282int mem_cgroup_newpage_charge(struct page *page,
1283			      struct mm_struct *mm, gfp_t gfp_mask)
1284{
1285	if (mem_cgroup_disabled())
1286		return 0;
1287	if (PageCompound(page))
1288		return 0;
1289	/*
1290	 * If already mapped, we don't have to account.
1291	 * If page cache, page->mapping has address_space.
1292	 * But page->mapping may have an out-of-use anon_vma pointer;
1293	 * detect it by a PageAnon() check. A newly-mapped anon page's page->mapping
1294	 * is NULL.
1295	 */
1296	if (page_mapped(page) || (page->mapping && !PageAnon(page)))
1297		return 0;
1298	if (unlikely(!mm))
1299		mm = &init_mm;
1300	return mem_cgroup_charge_common(page, mm, gfp_mask,
1301				MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
1302}
1303
1304static void
1305__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
1306					enum charge_type ctype);
1307
1308int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
1309				gfp_t gfp_mask)
1310{
1311	struct mem_cgroup *mem = NULL;
1312	int ret;
1313
1314	if (mem_cgroup_disabled())
1315		return 0;
1316	if (PageCompound(page))
1317		return 0;
1318	/*
1319	 * Corner case handling. This is usually called from add_to_page_cache(),
1320	 * but some filesystems (shmem) precharge this page before calling it
1321	 * and then call add_to_page_cache() with GFP_NOWAIT.
1322	 *
1323	 * In the GFP_NOWAIT case, the page may be pre-charged before calling
1324	 * add_to_page_cache() (see shmem.c); check it here and avoid charging
1325	 * twice. (It works but has to pay a slightly larger cost.)
1326	 * And when the page is SwapCache, it should take swap information
1327	 * into account. This is under lock_page() now.
1328	 */
1329	if (!(gfp_mask & __GFP_WAIT)) {
1330		struct page_cgroup *pc;
1331
1332
1333		pc = lookup_page_cgroup(page);
1334		if (!pc)
1335			return 0;
1336		lock_page_cgroup(pc);
1337		if (PageCgroupUsed(pc)) {
1338			unlock_page_cgroup(pc);
1339			return 0;
1340		}
1341		unlock_page_cgroup(pc);
1342	}
1343
1344	if (unlikely(!mm && !mem))
1345		mm = &init_mm;
1346
1347	if (page_is_file_cache(page))
1348		return mem_cgroup_charge_common(page, mm, gfp_mask,
1349				MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
1350
1351	/* shmem */
1352	if (PageSwapCache(page)) {
1353		ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
1354		if (!ret)
1355			__mem_cgroup_commit_charge_swapin(page, mem,
1356					MEM_CGROUP_CHARGE_TYPE_SHMEM);
1357	} else
1358		ret = mem_cgroup_charge_common(page, mm, gfp_mask,
1359					MEM_CGROUP_CHARGE_TYPE_SHMEM, mem);
1360
1361	return ret;
1362}
1363
1364/*
1365 * During swap-in (try_charge -> commit or cancel), the page is locked.
1366 * And when try_charge() successfully returns, one refcnt to the memcg, without
1367 * a struct page_cgroup, is acquired. This refcnt will be consumed by
1368 * "commit()" or removed by "cancel()".
1369 */
1370int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
1371				 struct page *page,
1372				 gfp_t mask, struct mem_cgroup **ptr)
1373{
1374	struct mem_cgroup *mem;
1375	int ret;
1376
1377	if (mem_cgroup_disabled())
1378		return 0;
1379
1380	if (!do_swap_account)
1381		goto charge_cur_mm;
1382	/*
1383	 * A racing thread's fault, or swapoff, may have already updated
1384	 * the pte, and even removed page from swap cache: return success
1385	 * to go on to do_swap_page()'s pte_same() test, which should fail.
1386	 */
1387	if (!PageSwapCache(page))
1388		return 0;
1389	mem = try_get_mem_cgroup_from_swapcache(page);
1390	if (!mem)
1391		goto charge_cur_mm;
1392	*ptr = mem;
1393	ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
1394	/* drop extra refcnt from tryget */
1395	css_put(&mem->css);
1396	return ret;
1397charge_cur_mm:
1398	if (unlikely(!mm))
1399		mm = &init_mm;
1400	return __mem_cgroup_try_charge(mm, mask, ptr, true);
1401}
1402
1403static void
1404__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
1405					enum charge_type ctype)
1406{
1407	struct page_cgroup *pc;
1408
1409	if (mem_cgroup_disabled())
1410		return;
1411	if (!ptr)
1412		return;
1413	pc = lookup_page_cgroup(page);
1414	mem_cgroup_lru_del_before_commit_swapcache(page);
1415	__mem_cgroup_commit_charge(ptr, pc, ctype);
1416	mem_cgroup_lru_add_after_commit_swapcache(page);
1417	/*
1418	 * Now the swap is in memory. This means this page may be
1419	 * counted both as mem and swap... a double count.
1420	 * Fix it by uncharging from memsw. Basically, this SwapCache is stable
1421	 * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page()
1422	 * may call delete_from_swap_cache() before reaching here.
1423	 */
1424	if (do_swap_account && PageSwapCache(page)) {
1425		swp_entry_t ent = {.val = page_private(page)};
1426		unsigned short id;
1427		struct mem_cgroup *memcg;
1428
1429		id = swap_cgroup_record(ent, 0);
1430		rcu_read_lock();
1431		memcg = mem_cgroup_lookup(id);
1432		if (memcg) {
1433			/*
1434			 * This recorded memcg can be obsolete one. So, avoid
1435			 * calling css_tryget
1436			 */
1437			res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
1438			mem_cgroup_put(memcg);
1439		}
1440		rcu_read_unlock();
1441	}
1442	/* add this page(page_cgroup) to the LRU we want. */
1443
1444}
1445
1446void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
1447{
1448	__mem_cgroup_commit_charge_swapin(page, ptr,
1449					MEM_CGROUP_CHARGE_TYPE_MAPPED);
1450}
1451
1452void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
1453{
1454	if (mem_cgroup_disabled())
1455		return;
1456	if (!mem)
1457		return;
1458	res_counter_uncharge(&mem->res, PAGE_SIZE);
1459	if (do_swap_account)
1460		res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1461	css_put(&mem->css);
1462}
1463
1464
1465/*
1466 * uncharge if !page_mapped(page)
1467 */
1468static struct mem_cgroup *
1469__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
1470{
1471	struct page_cgroup *pc;
1472	struct mem_cgroup *mem = NULL;
1473	struct mem_cgroup_per_zone *mz;
1474
1475	if (mem_cgroup_disabled())
1476		return NULL;
1477
1478	if (PageSwapCache(page))
1479		return NULL;
1480
1481	/*
1482	 * Check if our page_cgroup is valid
1483	 */
1484	pc = lookup_page_cgroup(page);
1485	if (unlikely(!pc || !PageCgroupUsed(pc)))
1486		return NULL;
1487
1488	lock_page_cgroup(pc);
1489
1490	mem = pc->mem_cgroup;
1491
1492	if (!PageCgroupUsed(pc))
1493		goto unlock_out;
1494
1495	switch (ctype) {
1496	case MEM_CGROUP_CHARGE_TYPE_MAPPED:
1497	case MEM_CGROUP_CHARGE_TYPE_DROP:
1498		if (page_mapped(page))
1499			goto unlock_out;
1500		break;
1501	case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
1502		if (!PageAnon(page)) {	/* Shared memory */
1503			if (page->mapping && !page_is_file_cache(page))
1504				goto unlock_out;
1505		} else if (page_mapped(page)) /* Anon */
1506				goto unlock_out;
1507		break;
1508	default:
1509		break;
1510	}
1511
1512	res_counter_uncharge(&mem->res, PAGE_SIZE);
1513	if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
1514		res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1515	mem_cgroup_charge_statistics(mem, pc, false);
1516
1517	ClearPageCgroupUsed(pc);
1518	/*
1519	 * pc->mem_cgroup is not cleared here. It will be accessed when the page is
1520	 * freed from the LRU. This is safe because an uncharged page is expected not
1521	 * to be reused (it is freed soon). The exception is SwapCache, which is
1522	 * handled by special functions.
1523	 */
1524
1525	mz = page_cgroup_zoneinfo(pc);
1526	unlock_page_cgroup(pc);
1527
1528	/* at swapout, this memcg will be accessed to record to swap */
1529	if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
1530		css_put(&mem->css);
1531
1532	return mem;
1533
1534unlock_out:
1535	unlock_page_cgroup(pc);
1536	return NULL;
1537}
1538
1539void mem_cgroup_uncharge_page(struct page *page)
1540{
1541	/* early check. */
1542	if (page_mapped(page))
1543		return;
1544	if (page->mapping && !PageAnon(page))
1545		return;
1546	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
1547}
1548
1549void mem_cgroup_uncharge_cache_page(struct page *page)
1550{
1551	VM_BUG_ON(page_mapped(page));
1552	VM_BUG_ON(page->mapping);
1553	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
1554}
1555
1556#ifdef CONFIG_SWAP
1557/*
1558 * Called after __delete_from_swap_cache(); drops the "page" account.
1559 * memcg information is recorded in the swap_cgroup of "ent".
1560 */
1561void
1562mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
1563{
1564	struct mem_cgroup *memcg;
1565	int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
1566
1567	if (!swapout) /* this was a swap cache but the swap is unused ! */
1568		ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
1569
1570	memcg = __mem_cgroup_uncharge_common(page, ctype);
1571
1572	/* record memcg information */
1573	if (do_swap_account && swapout && memcg) {
1574		swap_cgroup_record(ent, css_id(&memcg->css));
1575		mem_cgroup_get(memcg);
1576	}
1577	if (swapout && memcg)
1578		css_put(&memcg->css);
1579}
1580#endif
1581
1582#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
1583/*
1584 * Called from swap_entry_free(). Removes the record in swap_cgroup and
1585 * uncharges the "memsw" account.
1586 */
1587void mem_cgroup_uncharge_swap(swp_entry_t ent)
1588{
1589	struct mem_cgroup *memcg;
1590	unsigned short id;
1591
1592	if (!do_swap_account)
1593		return;
1594
1595	id = swap_cgroup_record(ent, 0);
1596	rcu_read_lock();
1597	memcg = mem_cgroup_lookup(id);
1598	if (memcg) {
1599		/*
1600		 * We uncharge this because the swap is freed.
1601		 * This memcg can be an obsolete one, so we avoid calling css_tryget().
1602		 */
1603		res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
1604		mem_cgroup_put(memcg);
1605	}
1606	rcu_read_unlock();
1607}
1608#endif
1609
1610/*
1611 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
1612 * page belongs to.
1613 */
1614int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
1615{
1616	struct page_cgroup *pc;
1617	struct mem_cgroup *mem = NULL;
1618	int ret = 0;
1619
1620	if (mem_cgroup_disabled())
1621		return 0;
1622
1623	pc = lookup_page_cgroup(page);
1624	lock_page_cgroup(pc);
1625	if (PageCgroupUsed(pc)) {
1626		mem = pc->mem_cgroup;
1627		css_get(&mem->css);
1628	}
1629	unlock_page_cgroup(pc);
1630
1631	if (mem) {
1632		ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);
1633		css_put(&mem->css);
1634	}
1635	*ptr = mem;
1636	return ret;
1637}
1638
1639/* remove the redundant charge if migration failed */
1640void mem_cgroup_end_migration(struct mem_cgroup *mem,
1641		struct page *oldpage, struct page *newpage)
1642{
1643	struct page *target, *unused;
1644	struct page_cgroup *pc;
1645	enum charge_type ctype;
1646
1647	if (!mem)
1648		return;
1649
1650	/* at migration success, oldpage->mapping is NULL. */
1651	if (oldpage->mapping) {
1652		target = oldpage;
1653		unused = NULL;
1654	} else {
1655		target = newpage;
1656		unused = oldpage;
1657	}
1658
1659	if (PageAnon(target))
1660		ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
1661	else if (page_is_file_cache(target))
1662		ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
1663	else
1664		ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
1665
1666	/* unused page is not on radix-tree now. */
1667	if (unused)
1668		__mem_cgroup_uncharge_common(unused, ctype);
1669
1670	pc = lookup_page_cgroup(target);
1671	/*
1672	 * __mem_cgroup_commit_charge() checks the PCG_USED bit of the page_cgroup.
1673	 * So, double-counting is effectively avoided.
1674	 */
1675	__mem_cgroup_commit_charge(mem, pc, ctype);
1676
1677	/*
1678	 * Both of oldpage and newpage are still under lock_page().
1679	 * Then, we don't have to care about race in radix-tree.
1680	 * But we have to be careful that this page is unmapped or not.
1681	 *
1682	 * There is a case for !page_mapped(). At the start of
1683	 * migration, oldpage was mapped. But now, it's zapped.
1684	 * But we know *target* page is not freed/reused under us.
1685	 * mem_cgroup_uncharge_page() does all necessary checks.
1686	 */
1687	if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
1688		mem_cgroup_uncharge_page(target);
1689}
1690
1691/*
1692 * A call to try to shrink memory usage on charge failure at shmem's swapin.
1693 * Calling hierarchical_reclaim is not enough because we should update
1694 * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM.
1695 * Moreover, considering the hierarchy, we should reclaim from the mem_over_limit,
1696 * not from the memcg which this page would be charged to.
1697 * try_charge_swapin does all of this work properly.
1698 */
1699int mem_cgroup_shmem_charge_fallback(struct page *page,
1700			    struct mm_struct *mm,
1701			    gfp_t gfp_mask)
1702{
1703	struct mem_cgroup *mem = NULL;
1704	int ret;
1705
1706	if (mem_cgroup_disabled())
1707		return 0;
1708
1709	ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
1710	if (!ret)
1711		mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */
1712
1713	return ret;
1714}
1715
1716static DEFINE_MUTEX(set_limit_mutex);
1717
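/*
 * Set a new memory limit: reclaim from the hierarchy and retry while usage is
 * still being reduced; a value above the current mem+swap limit is rejected
 * with -EINVAL.
 */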
1718static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
1719				unsigned long long val)
1720{
1721	int retry_count;
1722	int progress;
1723	u64 memswlimit;
1724	int ret = 0;
1725	int children = mem_cgroup_count_children(memcg);
1726	u64 curusage, oldusage;
1727
1728	/*
1729	 * For keeping hierarchical_reclaim simple, how long we should retry
1730	 * depends on the caller. We set our retry count to be a function
1731	 * of the number of children we should visit in this loop.
1732	 */
1733	retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;
1734
1735	oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
1736
1737	while (retry_count) {
1738		if (signal_pending(current)) {
1739			ret = -EINTR;
1740			break;
1741		}
1742		/*
1743		 * Rather than hiding all of this in some function, I do it in an
1744		 * open-coded manner so you can see what it really does.
1745		 * We have to guarantee mem->res.limit < mem->memsw.limit.
1746		 */
1747		mutex_lock(&set_limit_mutex);
1748		memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1749		if (memswlimit < val) {
1750			ret = -EINVAL;
1751			mutex_unlock(&set_limit_mutex);
1752			break;
1753		}
1754		ret = res_counter_set_limit(&memcg->res, val);
1755		mutex_unlock(&set_limit_mutex);
1756
1757		if (!ret)
1758			break;
1759
1760		progress = mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL,
1761						   false, true);
1762		curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
1763		/* Usage is reduced ? */
1764		if (curusage >= oldusage)
1765			retry_count--;
1766		else
1767			oldusage = curusage;
1768	}
1769
1770	return ret;
1771}
1772
1773static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
1774					unsigned long long val)
1775{
1776	int retry_count;
1777	u64 memlimit, oldusage, curusage;
1778	int children = mem_cgroup_count_children(memcg);
1779	int ret = -EBUSY;
1780
1781	/* see mem_cgroup_resize_limit() */
1782	retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
1783	oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
1784	while (retry_count) {
1785		if (signal_pending(current)) {
1786			ret = -EINTR;
1787			break;
1788		}
1789		/*
1790		 * Rather than hiding all of this in some function, I do it in an
1791		 * open-coded manner so you can see what it really does.
1792		 * We have to guarantee mem->res.limit < mem->memsw.limit.
1793		 */
1794		mutex_lock(&set_limit_mutex);
1795		memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
1796		if (memlimit > val) {
1797			ret = -EINVAL;
1798			mutex_unlock(&set_limit_mutex);
1799			break;
1800		}
1801		ret = res_counter_set_limit(&memcg->memsw, val);
1802		mutex_unlock(&set_limit_mutex);
1803
1804		if (!ret)
1805			break;
1806
1807		mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, true, true);
1808		curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
1809		/* Usage is reduced ? */
1810		if (curusage >= oldusage)
1811			retry_count--;
1812		else
1813			oldusage = curusage;
1814	}
1815	return ret;
1816}
1817
1818/*
1819 * This routine traverses the page_cgroups in the given list and drops them all.
1820 * *And* this routine doesn't reclaim the pages themselves; it just removes the page_cgroups.
1821 */
1822static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
1823				int node, int zid, enum lru_list lru)
1824{
1825	struct zone *zone;
1826	struct mem_cgroup_per_zone *mz;
1827	struct page_cgroup *pc, *busy;
1828	unsigned long flags, loop;
1829	struct list_head *list;
1830	int ret = 0;
1831
1832	zone = &NODE_DATA(node)->node_zones[zid];
1833	mz = mem_cgroup_zoneinfo(mem, node, zid);
1834	list = &mz->lists[lru];
1835
1836	loop = MEM_CGROUP_ZSTAT(mz, lru);
1837	/* give some margin against EBUSY etc...*/
1838	loop += 256;
1839	busy = NULL;
1840	while (loop--) {
1841		ret = 0;
1842		spin_lock_irqsave(&zone->lru_lock, flags);
1843		if (list_empty(list)) {
1844			spin_unlock_irqrestore(&zone->lru_lock, flags);
1845			break;
1846		}
1847		pc = list_entry(list->prev, struct page_cgroup, lru);
1848		if (busy == pc) {
1849			list_move(&pc->lru, list);
1850			busy = NULL;
1851			spin_unlock_irqrestore(&zone->lru_lock, flags);
1852			continue;
1853		}
1854		spin_unlock_irqrestore(&zone->lru_lock, flags);
1855
1856		ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL);
1857		if (ret == -ENOMEM)
1858			break;
1859
1860		if (ret == -EBUSY || ret == -EINVAL) {
1861			/* found lock contention or "pc" is obsolete. */
1862			busy = pc;
1863			cond_resched();
1864		} else
1865			busy = NULL;
1866	}
1867
1868	if (!ret && !list_empty(list))
1869		return -EBUSY;
1870	return ret;
1871}
1872
1873/*
1874 * Make the mem_cgroup's charge 0 if there is no task.
1875 * This enables deleting this mem_cgroup.
1876 */
1877static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
1878{
1879	int ret;
1880	int node, zid, shrink;
1881	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
1882	struct cgroup *cgrp = mem->css.cgroup;
1883
1884	css_get(&mem->css);
1885
1886	shrink = 0;
1887	/* should free all ? */
1888	if (free_all)
1889		goto try_to_free;
1890move_account:
1891	while (mem->res.usage > 0) {
1892		ret = -EBUSY;
1893		if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
1894			goto out;
1895		ret = -EINTR;
1896		if (signal_pending(current))
1897			goto out;
1898		/* This is for making all *used* pages to be on LRU. */
1899		lru_add_drain_all();
1900		ret = 0;
1901		for_each_node_state(node, N_HIGH_MEMORY) {
1902			for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
1903				enum lru_list l;
1904				for_each_lru(l) {
1905					ret = mem_cgroup_force_empty_list(mem,
1906							node, zid, l);
1907					if (ret)
1908						break;
1909				}
1910			}
1911			if (ret)
1912				break;
1913		}
1914		/* it seems the parent cgroup doesn't have enough memory */
1915		if (ret == -ENOMEM)
1916			goto try_to_free;
1917		cond_resched();
1918	}
1919	ret = 0;
1920out:
1921	css_put(&mem->css);
1922	return ret;
1923
1924try_to_free:
1925	/* returns EBUSY if there is a task or if we come here twice. */
1926	if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
1927		ret = -EBUSY;
1928		goto out;
1929	}
1930	/* we call try-to-free pages to make this cgroup empty */
1931	lru_add_drain_all();
1932	/* try to free all pages in this cgroup */
1933	shrink = 1;
1934	while (nr_retries && mem->res.usage > 0) {
1935		int progress;
1936
1937		if (signal_pending(current)) {
1938			ret = -EINTR;
1939			goto out;
1940		}
1941		progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
1942						false, get_swappiness(mem));
1943		if (!progress) {
1944			nr_retries--;
1945			/* maybe some writeback is necessary */
1946			congestion_wait(WRITE, HZ/10);
1947		}
1948
1949	}
1950	lru_add_drain();
1951	/* try move_account...there may be some *locked* pages. */
1952	if (mem->res.usage)
1953		goto move_account;
1954	ret = 0;
1955	goto out;
1956}
1957
1958int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
1959{
1960	return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
1961}
1962
1963
1964static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)
1965{
1966	return mem_cgroup_from_cont(cont)->use_hierarchy;
1967}
1968
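/*
 * Write handler for the use_hierarchy control file: accepts only 0 or 1, and
 * only while this cgroup has no children and no parent already imposes
 * hierarchy.
 */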
1969static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
1970					u64 val)
1971{
1972	int retval = 0;
1973	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
1974	struct cgroup *parent = cont->parent;
1975	struct mem_cgroup *parent_mem = NULL;
1976
1977	if (parent)
1978		parent_mem = mem_cgroup_from_cont(parent);
1979
1980	cgroup_lock();
1981	/*
1982	 * If the parent's use_hierarchy is set, we can't make any modifications
1983	 * in the child subtrees. If it is unset, then the change can
1984	 * occur, provided the current cgroup has no children.
1985	 *
1986	 * For the root cgroup, parent_mem is NULL, so we allow the value to be
1987	 * set if there are no children.
1988	 */
1989	if ((!parent_mem || !parent_mem->use_hierarchy) &&
1990				(val == 1 || val == 0)) {
1991		if (list_empty(&cont->children))
1992			mem->use_hierarchy = val;
1993		else
1994			retval = -EBUSY;
1995	} else
1996		retval = -EINVAL;
1997	cgroup_unlock();
1998
1999	return retval;
2000}
2001
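/*
 * Common read_u64 handler: cft->private encodes both the resource type
 * (_MEM or _MEMSWAP) and the res_counter member (RES_USAGE etc.) to read.
 */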
2002static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
2003{
2004	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
2005	u64 val = 0;
2006	int type, name;
2007
2008	type = MEMFILE_TYPE(cft->private);
2009	name = MEMFILE_ATTR(cft->private);
2010	switch (type) {
2011	case _MEM:
2012		val = res_counter_read_u64(&mem->res, name);
2013		break;
2014	case _MEMSWAP:
2015		val = res_counter_read_u64(&mem->memsw, name);
2016		break;
2017	default:
2018		BUG();
2019		break;
2020	}
2021	return val;
2022}
2023/*
2024 * The only user of this function is the
2025 * RES_LIMIT control file.
2026 */
2027static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
2028			    const char *buffer)
2029{
2030	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
2031	int type, name;
2032	unsigned long long val;
2033	int ret;
2034
2035	type = MEMFILE_TYPE(cft->private);
2036	name = MEMFILE_ATTR(cft->private);
2037	switch (name) {
2038	case RES_LIMIT:
2039		/* res_counter_memparse_write_strategy() does all the necessary parsing; reuse it */
2040		ret = res_counter_memparse_write_strategy(buffer, &val);
2041		if (ret)
2042			break;
2043		if (type == _MEM)
2044			ret = mem_cgroup_resize_limit(memcg, val);
2045		else
2046			ret = mem_cgroup_resize_memsw_limit(memcg, val);
2047		break;
2048	default:
2049		ret = -EINVAL; /* should be BUG() ? */
2050		break;
2051	}
2052	return ret;
2053}
2054
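/*
 * Compute the effective (hierarchical) limits for @memcg: the minimum
 * memory and memsw limits found while walking up the hierarchy.
 */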
2055static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
2056		unsigned long long *mem_limit, unsigned long long *memsw_limit)
2057{
2058	struct cgroup *cgroup;
2059	unsigned long long min_limit, min_memsw_limit, tmp;
2060
2061	min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
2062	min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
2063	cgroup = memcg->css.cgroup;
2064	if (!memcg->use_hierarchy)
2065		goto out;
2066
2067	while (cgroup->parent) {
2068		cgroup = cgroup->parent;
2069		memcg = mem_cgroup_from_cont(cgroup);
2070		if (!memcg->use_hierarchy)
2071			break;
2072		tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
2073		min_limit = min(min_limit, tmp);
2074		tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
2075		min_memsw_limit = min(min_memsw_limit, tmp);
2076	}
2077out:
2078	*mem_limit = min_limit;
2079	*memsw_limit = min_memsw_limit;
2080	return;
2081}
2082
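/*
 * Trigger handler for the (memsw.)max_usage_in_bytes and (memsw.)failcnt
 * control files: writing to them resets the corresponding counter.
 */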
2083static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
2084{
2085	struct mem_cgroup *mem;
2086	int type, name;
2087
2088	mem = mem_cgroup_from_cont(cont);
2089	type = MEMFILE_TYPE(event);
2090	name = MEMFILE_ATTR(event);
2091	switch (name) {
2092	case RES_MAX_USAGE:
2093		if (type == _MEM)
2094			res_counter_reset_max(&mem->res);
2095		else
2096			res_counter_reset_max(&mem->memsw);
2097		break;
2098	case RES_FAILCNT:
2099		if (type == _MEM)
2100			res_counter_reset_failcnt(&mem->res);
2101		else
2102			res_counter_reset_failcnt(&mem->memsw);
2103		break;
2104	}
2105	return 0;
2106}
2107
2108
2109/* For reading statistics (memory.stat) */
2110enum {
2111	MCS_CACHE,
2112	MCS_RSS,
2113	MCS_MAPPED_FILE,
2114	MCS_PGPGIN,
2115	MCS_PGPGOUT,
2116	MCS_INACTIVE_ANON,
2117	MCS_ACTIVE_ANON,
2118	MCS_INACTIVE_FILE,
2119	MCS_ACTIVE_FILE,
2120	MCS_UNEVICTABLE,
2121	NR_MCS_STAT,
2122};
2123
2124struct mcs_total_stat {
2125	s64 stat[NR_MCS_STAT];
2126};
2127
2128struct {
2129	char *local_name;
2130	char *total_name;
2131} memcg_stat_strings[NR_MCS_STAT] = {
2132	{"cache", "total_cache"},
2133	{"rss", "total_rss"},
2134	{"mapped_file", "total_mapped_file"},
2135	{"pgpgin", "total_pgpgin"},
2136	{"pgpgout", "total_pgpgout"},
2137	{"inactive_anon", "total_inactive_anon"},
2138	{"active_anon", "total_active_anon"},
2139	{"inactive_file", "total_inactive_file"},
2140	{"active_file", "total_active_file"},
2141	{"unevictable", "total_unevictable"}
2142};
2143
2144
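/*
 * Accumulate one cgroup's local counters into @data (a struct
 * mcs_total_stat); page counts are converted to bytes, the pgpgin/pgpgout
 * event counters are added as-is.
 */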
2145static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
2146{
2147	struct mcs_total_stat *s = data;
2148	s64 val;
2149
2150	/* per cpu stat */
2151	val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_CACHE);
2152	s->stat[MCS_CACHE] += val * PAGE_SIZE;
2153	val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
2154	s->stat[MCS_RSS] += val * PAGE_SIZE;
2155	val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_MAPPED_FILE);
2156	s->stat[MCS_MAPPED_FILE] += val * PAGE_SIZE;
2157	val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT);
2158	s->stat[MCS_PGPGIN] += val;
2159	val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT);
2160	s->stat[MCS_PGPGOUT] += val;
2161
2162	/* per zone stat */
2163	val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
2164	s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE;
2165	val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON);
2166	s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE;
2167	val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE);
2168	s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE;
2169	val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE);
2170	s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
2171	val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE);
2172	s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
2173	return 0;
2174}
2175
2176static void
2177mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
2178{
2179	mem_cgroup_walk_tree(mem, s, mem_cgroup_get_local_stat);
2180}
2181
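/*
 * read_map handler for memory.stat: local counters, hierarchical limits,
 * hierarchy-wide totals and (with CONFIG_DEBUG_VM) reclaim statistics.
 */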
2182static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
2183				 struct cgroup_map_cb *cb)
2184{
2185	struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
2186	struct mcs_total_stat mystat;
2187	int i;
2188
2189	memset(&mystat, 0, sizeof(mystat));
2190	mem_cgroup_get_local_stat(mem_cont, &mystat);
2191
2192	for (i = 0; i < NR_MCS_STAT; i++)
2193		cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]);
2194
2195	/* Hierarchical information */
2196	{
2197		unsigned long long limit, memsw_limit;
2198		memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit);
2199		cb->fill(cb, "hierarchical_memory_limit", limit);
2200		if (do_swap_account)
2201			cb->fill(cb, "hierarchical_memsw_limit", memsw_limit);
2202	}
2203
2204	memset(&mystat, 0, sizeof(mystat));
2205	mem_cgroup_get_total_stat(mem_cont, &mystat);
2206	for (i = 0; i < NR_MCS_STAT; i++)
2207		cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]);
2208
2209
2210#ifdef CONFIG_DEBUG_VM
2211	cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));
2212
2213	{
2214		int nid, zid;
2215		struct mem_cgroup_per_zone *mz;
2216		unsigned long recent_rotated[2] = {0, 0};
2217		unsigned long recent_scanned[2] = {0, 0};
2218
2219		for_each_online_node(nid)
2220			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
2221				mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
2222
2223				recent_rotated[0] +=
2224					mz->reclaim_stat.recent_rotated[0];
2225				recent_rotated[1] +=
2226					mz->reclaim_stat.recent_rotated[1];
2227				recent_scanned[0] +=
2228					mz->reclaim_stat.recent_scanned[0];
2229				recent_scanned[1] +=
2230					mz->reclaim_stat.recent_scanned[1];
2231			}
2232		cb->fill(cb, "recent_rotated_anon", recent_rotated[0]);
2233		cb->fill(cb, "recent_rotated_file", recent_rotated[1]);
2234		cb->fill(cb, "recent_scanned_anon", recent_scanned[0]);
2235		cb->fill(cb, "recent_scanned_file", recent_scanned[1]);
2236	}
2237#endif
2238
2239	return 0;
2240}
2241
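/* Read/write handlers for the memory.swappiness control file. */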
2242static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
2243{
2244	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
2245
2246	return get_swappiness(memcg);
2247}
2248
2249static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
2250				       u64 val)
2251{
2252	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
2253	struct mem_cgroup *parent;
2254
2255	if (val > 100)
2256		return -EINVAL;
2257
2258	if (cgrp->parent == NULL)
2259		return -EINVAL;
2260
2261	parent = mem_cgroup_from_cont(cgrp->parent);
2262
2263	cgroup_lock();
2264
2265	/* If under hierarchy, only the hierarchy root with no children can set this value */
2266	if ((parent->use_hierarchy) ||
2267	    (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
2268		cgroup_unlock();
2269		return -EINVAL;
2270	}
2271
2272	spin_lock(&memcg->reclaim_param_lock);
2273	memcg->swappiness = val;
2274	spin_unlock(&memcg->reclaim_param_lock);
2275
2276	cgroup_unlock();
2277
2278	return 0;
2279}
2280
2281
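/* Control files created for every memory cgroup. */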
2282static struct cftype mem_cgroup_files[] = {
2283	{
2284		.name = "usage_in_bytes",
2285		.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
2286		.read_u64 = mem_cgroup_read,
2287	},
2288	{
2289		.name = "max_usage_in_bytes",
2290		.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
2291		.trigger = mem_cgroup_reset,
2292		.read_u64 = mem_cgroup_read,
2293	},
2294	{
2295		.name = "limit_in_bytes",
2296		.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
2297		.write_string = mem_cgroup_write,
2298		.read_u64 = mem_cgroup_read,
2299	},
2300	{
2301		.name = "failcnt",
2302		.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
2303		.trigger = mem_cgroup_reset,
2304		.read_u64 = mem_cgroup_read,
2305	},
2306	{
2307		.name = "stat",
2308		.read_map = mem_control_stat_show,
2309	},
2310	{
2311		.name = "force_empty",
2312		.trigger = mem_cgroup_force_empty_write,
2313	},
2314	{
2315		.name = "use_hierarchy",
2316		.write_u64 = mem_cgroup_hierarchy_write,
2317		.read_u64 = mem_cgroup_hierarchy_read,
2318	},
2319	{
2320		.name = "swappiness",
2321		.read_u64 = mem_cgroup_swappiness_read,
2322		.write_u64 = mem_cgroup_swappiness_write,
2323	},
2324};
2325
2326#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
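/* Extra control files, registered only when swap accounting is enabled. */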
2327static struct cftype memsw_cgroup_files[] = {
2328	{
2329		.name = "memsw.usage_in_bytes",
2330		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
2331		.read_u64 = mem_cgroup_read,
2332	},
2333	{
2334		.name = "memsw.max_usage_in_bytes",
2335		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
2336		.trigger = mem_cgroup_reset,
2337		.read_u64 = mem_cgroup_read,
2338	},
2339	{
2340		.name = "memsw.limit_in_bytes",
2341		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
2342		.write_string = mem_cgroup_write,
2343		.read_u64 = mem_cgroup_read,
2344	},
2345	{
2346		.name = "memsw.failcnt",
2347		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
2348		.trigger = mem_cgroup_reset,
2349		.read_u64 = mem_cgroup_read,
2350	},
2351};
2352
2353static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
2354{
2355	if (!do_swap_account)
2356		return 0;
2357	return cgroup_add_files(cont, ss, memsw_cgroup_files,
2358				ARRAY_SIZE(memsw_cgroup_files));
2359}
2360#else
2361static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
2362{
2363	return 0;
2364}
2365#endif
2366
2367static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
2368{
2369	struct mem_cgroup_per_node *pn;
2370	struct mem_cgroup_per_zone *mz;
2371	enum lru_list l;
2372	int zone, tmp = node;
2373	/*
2374	 * This routine is called for each possible node.
2375	 * But it is a BUG to call kmalloc() for an offline node.
2376	 *
2377	 * TODO: this routine can waste a lot of memory for nodes which will
2378	 *       never be onlined. It would be better to use a memory hotplug
2379	 *       callback function.
2380	 */
2381	if (!node_state(node, N_NORMAL_MEMORY))
2382		tmp = -1;
2383	pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
2384	if (!pn)
2385		return 1;
2386
2387	mem->info.nodeinfo[node] = pn;
2388	memset(pn, 0, sizeof(*pn));
2389
2390	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
2391		mz = &pn->zoneinfo[zone];
2392		for_each_lru(l)
2393			INIT_LIST_HEAD(&mz->lists[l]);
2394	}
2395	return 0;
2396}
2397
2398static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
2399{
2400	kfree(mem->info.nodeinfo[node]);
2401}
2402
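/*
 * struct mem_cgroup is allocated together with a per-cpu statistics
 * array appended at its end, one entry per possible cpu id.
 */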
2403static int mem_cgroup_size(void)
2404{
2405	int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu);
2406	return sizeof(struct mem_cgroup) + cpustat_size;
2407}
2408
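/*
 * Allocate and zero a mem_cgroup: kmalloc() while the total size stays
 * below PAGE_SIZE, vmalloc() otherwise.
 */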
2409static struct mem_cgroup *mem_cgroup_alloc(void)
2410{
2411	struct mem_cgroup *mem;
2412	int size = mem_cgroup_size();
2413
2414	if (size < PAGE_SIZE)
2415		mem = kmalloc(size, GFP_KERNEL);
2416	else
2417		mem = vmalloc(size);
2418
2419	if (mem)
2420		memset(mem, 0, size);
2421	return mem;
2422}
2423
2424/*
2425 * When a mem_cgroup is destroyed, references from swap_cgroup can remain.
2426 * (Scanning them all at force_empty would be too costly...)
2427 *
2428 * Instead of clearing all references at force_empty, we remember
2429 * the number of references from swap_cgroup and free the mem_cgroup
2430 * when that count goes down to 0.
2431 *
2432 * Removal of the cgroup itself succeeds regardless of refs from swap.
2433 */
2434
2435static void __mem_cgroup_free(struct mem_cgroup *mem)
2436{
2437	int node;
2438
2439	free_css_id(&mem_cgroup_subsys, &mem->css);
2440
2441	for_each_node_state(node, N_POSSIBLE)
2442		free_mem_cgroup_per_zone_info(mem, node);
2443
2444	if (mem_cgroup_size() < PAGE_SIZE)
2445		kfree(mem);
2446	else
2447		vfree(mem);
2448}
2449
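/*
 * mem_cgroup_get()/mem_cgroup_put() manage the lifetime of the mem_cgroup
 * structure itself (see the comment above about swap_cgroup references),
 * separately from the css/cgroup reference counting.
 */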
2450static void mem_cgroup_get(struct mem_cgroup *mem)
2451{
2452	atomic_inc(&mem->refcnt);
2453}
2454
2455static void mem_cgroup_put(struct mem_cgroup *mem)
2456{
2457	if (atomic_dec_and_test(&mem->refcnt)) {
2458		struct mem_cgroup *parent = parent_mem_cgroup(mem);
2459		__mem_cgroup_free(mem);
2460		if (parent)
2461			mem_cgroup_put(parent);
2462	}
2463}
2464
2465/*
2466 * Returns the parent mem_cgroup in the memcg hierarchy when use_hierarchy is enabled.
2467 */
2468static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem)
2469{
2470	if (!mem->res.parent)
2471		return NULL;
2472	return mem_cgroup_from_res_counter(mem->res.parent, res);
2473}
2474
2475#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
2476static void __init enable_swap_cgroup(void)
2477{
2478	if (!mem_cgroup_disabled() && really_do_swap_account)
2479		do_swap_account = 1;
2480}
2481#else
2482static void __init enable_swap_cgroup(void)
2483{
2484}
2485#endif
2486
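/*
 * cgroup 'create' callback: allocate and initialise a new mem_cgroup.
 * The root cgroup enables swap accounting; a child created under a
 * hierarchical parent chains its res_counters to the parent's and takes
 * a reference on it (dropped again in mem_cgroup_put()).
 */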
2487static struct cgroup_subsys_state * __ref
2488mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
2489{
2490	struct mem_cgroup *mem, *parent;
2491	long error = -ENOMEM;
2492	int node;
2493
2494	mem = mem_cgroup_alloc();
2495	if (!mem)
2496		return ERR_PTR(error);
2497
2498	for_each_node_state(node, N_POSSIBLE)
2499		if (alloc_mem_cgroup_per_zone_info(mem, node))
2500			goto free_out;
2501	/* root cgroup? */
2502	if (cont->parent == NULL) {
2503		enable_swap_cgroup();
2504		parent = NULL;
2505	} else {
2506		parent = mem_cgroup_from_cont(cont->parent);
2507		mem->use_hierarchy = parent->use_hierarchy;
2508	}
2509
2510	if (parent && parent->use_hierarchy) {
2511		res_counter_init(&mem->res, &parent->res);
2512		res_counter_init(&mem->memsw, &parent->memsw);
2513		/*
2514		 * We increment the refcnt of the parent to ensure that we can
2515		 * safely access it during res_counter_charge/uncharge.
2516		 * This refcnt will be decremented when this mem_cgroup is
2517		 * freed (see mem_cgroup_put).
2518		 */
2519		mem_cgroup_get(parent);
2520	} else {
2521		res_counter_init(&mem->res, NULL);
2522		res_counter_init(&mem->memsw, NULL);
2523	}
2524	mem->last_scanned_child = 0;
2525	spin_lock_init(&mem->reclaim_param_lock);
2526
2527	if (parent)
2528		mem->swappiness = get_swappiness(parent);
2529	atomic_set(&mem->refcnt, 1);
2530	return &mem->css;
2531free_out:
2532	__mem_cgroup_free(mem);
2533	return ERR_PTR(error);
2534}
2535
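/*
 * cgroup 'pre_destroy' callback: empty the group (moving charges to the
 * parent or reclaiming them) so that rmdir can succeed.
 */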
2536static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
2537					struct cgroup *cont)
2538{
2539	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
2540
2541	return mem_cgroup_force_empty(mem, false);
2542}
2543
2544static void mem_cgroup_destroy(struct cgroup_subsys *ss,
2545				struct cgroup *cont)
2546{
2547	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
2548
2549	mem_cgroup_put(mem);
2550}
2551
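/* cgroup 'populate' callback: create the memory.* control files. */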
2552static int mem_cgroup_populate(struct cgroup_subsys *ss,
2553				struct cgroup *cont)
2554{
2555	int ret;
2556
2557	ret = cgroup_add_files(cont, ss, mem_cgroup_files,
2558				ARRAY_SIZE(mem_cgroup_files));
2559
2560	if (!ret)
2561		ret = register_memsw_files(cont, ss);
2562	return ret;
2563}
2564
2565static void mem_cgroup_move_task(struct cgroup_subsys *ss,
2566				struct cgroup *cont,
2567				struct cgroup *old_cont,
2568				struct task_struct *p)
2569{
2570	mutex_lock(&memcg_tasklist);
2571	/*
2572	 * FIXME: It would be better to move this process's charges from the
2573	 * old memcg to the new one, but that is still just on the TODO list.
2574	 */
2575	mutex_unlock(&memcg_tasklist);
2576}
2577
2578struct cgroup_subsys mem_cgroup_subsys = {
2579	.name = "memory",
2580	.subsys_id = mem_cgroup_subsys_id,
2581	.create = mem_cgroup_create,
2582	.pre_destroy = mem_cgroup_pre_destroy,
2583	.destroy = mem_cgroup_destroy,
2584	.populate = mem_cgroup_populate,
2585	.attach = mem_cgroup_move_task,
2586	.early_init = 0,
2587	.use_id = 1,
2588};
2589
2590#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
2591
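/* The "noswapaccount" boot option disables swap accounting. */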
2592static int __init disable_swap_account(char *s)
2593{
2594	really_do_swap_account = 0;
2595	return 1;
2596}
2597__setup("noswapaccount", disable_swap_account);
2598#endif
2599