memcontrol.c revision ab936cbcd02072a34b60d268f94440fd5cf1970b
1/* memcontrol.c - Memory Controller
2 *
3 * Copyright IBM Corporation, 2007
4 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
5 *
6 * Copyright 2007 OpenVZ SWsoft Inc
7 * Author: Pavel Emelianov <xemul@openvz.org>
8 *
9 * Memory thresholds
10 * Copyright (C) 2009 Nokia Corporation
11 * Author: Kirill A. Shutemov
12 *
13 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2 of the License, or
16 * (at your option) any later version.
17 *
18 * This program is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21 * GNU General Public License for more details.
22 */
23
24#include <linux/res_counter.h>
25#include <linux/memcontrol.h>
26#include <linux/cgroup.h>
27#include <linux/mm.h>
28#include <linux/hugetlb.h>
29#include <linux/pagemap.h>
30#include <linux/smp.h>
31#include <linux/page-flags.h>
32#include <linux/backing-dev.h>
33#include <linux/bit_spinlock.h>
34#include <linux/rcupdate.h>
35#include <linux/limits.h>
36#include <linux/export.h>
37#include <linux/mutex.h>
38#include <linux/rbtree.h>
39#include <linux/slab.h>
40#include <linux/swap.h>
41#include <linux/swapops.h>
42#include <linux/spinlock.h>
43#include <linux/eventfd.h>
44#include <linux/sort.h>
45#include <linux/fs.h>
46#include <linux/seq_file.h>
47#include <linux/vmalloc.h>
48#include <linux/mm_inline.h>
49#include <linux/page_cgroup.h>
50#include <linux/cpu.h>
51#include <linux/oom.h>
52#include "internal.h"
53#include <net/sock.h>
54#include <net/tcp_memcontrol.h>
55
56#include <asm/uaccess.h>
57
58#include <trace/events/vmscan.h>
59
60struct cgroup_subsys mem_cgroup_subsys __read_mostly;
61#define MEM_CGROUP_RECLAIM_RETRIES	5
62struct mem_cgroup *root_mem_cgroup __read_mostly;
63
64#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
65/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
66int do_swap_account __read_mostly;
67
68/* for remembering the boot option */
69#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED
70static int really_do_swap_account __initdata = 1;
71#else
72static int really_do_swap_account __initdata = 0;
73#endif
74
75#else
76#define do_swap_account		(0)
77#endif
78
79
80/*
81 * Statistics for memory cgroup.
82 */
83enum mem_cgroup_stat_index {
84	/*
85	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
86	 */
87	MEM_CGROUP_STAT_CACHE, 	   /* # of pages charged as cache */
88	MEM_CGROUP_STAT_RSS,	   /* # of pages charged as anon rss */
89	MEM_CGROUP_STAT_FILE_MAPPED,  /* # of mapped file pages */
90	MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
91	MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */
92	MEM_CGROUP_ON_MOVE,	/* someone is moving account between groups */
93	MEM_CGROUP_STAT_NSTATS,
94};
95
96enum mem_cgroup_events_index {
97	MEM_CGROUP_EVENTS_PGPGIN,	/* # of pages paged in */
98	MEM_CGROUP_EVENTS_PGPGOUT,	/* # of pages paged out */
99	MEM_CGROUP_EVENTS_COUNT,	/* # of pages paged in/out */
100	MEM_CGROUP_EVENTS_PGFAULT,	/* # of page-faults */
101	MEM_CGROUP_EVENTS_PGMAJFAULT,	/* # of major page-faults */
102	MEM_CGROUP_EVENTS_NSTATS,
103};
104/*
105 * The per-memcg event counter is incremented on every pagein/pageout. With THP,
106 * it is incremented by the number of pages. This counter is used to
107 * trigger some periodic events. This is straightforward and better
108 * than using jiffies etc. to handle periodic memcg events.
109 */
110enum mem_cgroup_events_target {
111	MEM_CGROUP_TARGET_THRESH,
112	MEM_CGROUP_TARGET_SOFTLIMIT,
113	MEM_CGROUP_TARGET_NUMAINFO,
114	MEM_CGROUP_NTARGETS,
115};
116#define THRESHOLDS_EVENTS_TARGET (128)
117#define SOFTLIMIT_EVENTS_TARGET (1024)
118#define NUMAINFO_EVENTS_TARGET	(1024)
119
120struct mem_cgroup_stat_cpu {
121	long count[MEM_CGROUP_STAT_NSTATS];
122	unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
123	unsigned long targets[MEM_CGROUP_NTARGETS];
124};
125
126/*
127 * per-zone information in memory controller.
128 */
129struct mem_cgroup_per_zone {
130	/*
131	 * spin_lock to protect the per cgroup LRU
132	 */
133	struct list_head	lists[NR_LRU_LISTS];
134	unsigned long		count[NR_LRU_LISTS];
135
136	struct zone_reclaim_stat reclaim_stat;
137	struct rb_node		tree_node;	/* RB tree node */
138	unsigned long long	usage_in_excess;/* Set to the value by which */
139						/* the soft limit is exceeded */
140	bool			on_tree;
141	struct mem_cgroup	*mem;		/* Back pointer, we cannot */
142						/* use container_of	   */
143};
144/* Macro for accessing counter */
145#define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])
146
147struct mem_cgroup_per_node {
148	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
149};
150
151struct mem_cgroup_lru_info {
152	struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
153};
154
155/*
156 * Cgroups above their limits are maintained in a RB-Tree, independent of
157 * their hierarchy representation
158 */
159
160struct mem_cgroup_tree_per_zone {
161	struct rb_root rb_root;
162	spinlock_t lock;
163};
164
165struct mem_cgroup_tree_per_node {
166	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
167};
168
169struct mem_cgroup_tree {
170	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
171};
172
173static struct mem_cgroup_tree soft_limit_tree __read_mostly;
174
175struct mem_cgroup_threshold {
176	struct eventfd_ctx *eventfd;
177	u64 threshold;
178};
179
180/* For threshold */
181struct mem_cgroup_threshold_ary {
182	/* Index of the threshold just below the current usage. */
183	int current_threshold;
184	/* Size of entries[] */
185	unsigned int size;
186	/* Array of thresholds */
187	struct mem_cgroup_threshold entries[0];
188};
189
190struct mem_cgroup_thresholds {
191	/* Primary thresholds array */
192	struct mem_cgroup_threshold_ary *primary;
193	/*
194	 * Spare threshold array.
195	 * This is needed to make mem_cgroup_unregister_event() "never fail".
196	 * It must be able to store at least primary->size - 1 entries.
197	 */
198	struct mem_cgroup_threshold_ary *spare;
199};
200
201/* for OOM */
202struct mem_cgroup_eventfd_list {
203	struct list_head list;
204	struct eventfd_ctx *eventfd;
205};
206
207static void mem_cgroup_threshold(struct mem_cgroup *memcg);
208static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
209
210/*
211 * The memory controller data structure. The memory controller controls both
212 * page cache and RSS per cgroup. We would eventually like to provide
213 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
214 * to help the administrator determine what knobs to tune.
215 *
216 * TODO: Add a water mark for the memory controller. Reclaim will begin when
217 * we hit the water mark. Maybe even add a low water mark, such that
218 * no reclaim occurs from a cgroup at its low water mark; this is
219 * a feature that will be implemented much later in the future.
220 */
221struct mem_cgroup {
222	struct cgroup_subsys_state css;
223	/*
224	 * the counter to account for memory usage
225	 */
226	struct res_counter res;
227	/*
228	 * the counter to account for mem+swap usage.
229	 */
230	struct res_counter memsw;
231	/*
232	 * Per cgroup active and inactive list, similar to the
233	 * per zone LRU lists.
234	 */
235	struct mem_cgroup_lru_info info;
236	/*
237	 * While reclaiming in a hierarchy, we cache the last child we
238	 * reclaimed from.
239	 */
240	int last_scanned_child;
241	int last_scanned_node;
242#if MAX_NUMNODES > 1
243	nodemask_t	scan_nodes;
244	atomic_t	numainfo_events;
245	atomic_t	numainfo_updating;
246#endif
247	/*
248	 * Should the accounting and control be hierarchical, per subtree?
249	 */
250	bool use_hierarchy;
251
252	bool		oom_lock;
253	atomic_t	under_oom;
254
255	atomic_t	refcnt;
256
257	int	swappiness;
258	/* OOM-Killer disable */
259	int		oom_kill_disable;
260
261	/* set when res.limit == memsw.limit */
262	bool		memsw_is_minimum;
263
264	/* protect arrays of thresholds */
265	struct mutex thresholds_lock;
266
267	/* thresholds for memory usage. RCU-protected */
268	struct mem_cgroup_thresholds thresholds;
269
270	/* thresholds for mem+swap usage. RCU-protected */
271	struct mem_cgroup_thresholds memsw_thresholds;
272
273	/* For oom notifier event fd */
274	struct list_head oom_notify;
275
276	/*
277	 * Should we move charges of a task when a task is moved into this
278	 * mem_cgroup? And what type of charges should we move?
279	 */
280	unsigned long 	move_charge_at_immigrate;
281	/*
282	 * percpu counter.
283	 */
284	struct mem_cgroup_stat_cpu *stat;
285	/*
286	 * used when a cpu is offlined or for other synchronization.
287	 * See mem_cgroup_read_stat().
288	 */
289	struct mem_cgroup_stat_cpu nocpu_base;
290	spinlock_t pcp_counter_lock;
291
292#ifdef CONFIG_INET
293	struct tcp_memcontrol tcp_mem;
294#endif
295};
296
297/* Stuffs for move charges at task migration. */
298/*
299 * Types of charges to be moved. "move_charge_at_immigrate" is treated as a
300 * left-shifted bitmap of these types.
301 */
302enum move_type {
303	MOVE_CHARGE_TYPE_ANON,	/* private anonymous page and swap of it */
304	MOVE_CHARGE_TYPE_FILE,	/* file page(including tmpfs) and swap of it */
305	NR_MOVE_TYPE,
306};
307
308/* "mc" and its members are protected by cgroup_mutex */
309static struct move_charge_struct {
310	spinlock_t	  lock; /* for from, to */
311	struct mem_cgroup *from;
312	struct mem_cgroup *to;
313	unsigned long precharge;
314	unsigned long moved_charge;
315	unsigned long moved_swap;
316	struct task_struct *moving_task;	/* a task moving charges */
317	wait_queue_head_t waitq;		/* a waitq for other context */
318} mc = {
319	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
320	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
321};
322
323static bool move_anon(void)
324{
325	return test_bit(MOVE_CHARGE_TYPE_ANON,
326					&mc.to->move_charge_at_immigrate);
327}
328
329static bool move_file(void)
330{
331	return test_bit(MOVE_CHARGE_TYPE_FILE,
332					&mc.to->move_charge_at_immigrate);
333}
334
335/*
336 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
337 * limit reclaim to prevent infinite loops, if they ever occur.
338 */
339#define	MEM_CGROUP_MAX_RECLAIM_LOOPS		(100)
340#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	(2)
341
342enum charge_type {
343	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
344	MEM_CGROUP_CHARGE_TYPE_MAPPED,
345	MEM_CGROUP_CHARGE_TYPE_SHMEM,	/* used by page migration of shmem */
346	MEM_CGROUP_CHARGE_TYPE_FORCE,	/* used by force_empty */
347	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
348	MEM_CGROUP_CHARGE_TYPE_DROP,	/* a page was unused swap cache */
349	NR_CHARGE_TYPE,
350};
351
352/* for encoding cft->private value on file */
353#define _MEM			(0)
354#define _MEMSWAP		(1)
355#define _OOM_TYPE		(2)
356#define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
357#define MEMFILE_TYPE(val)	(((val) >> 16) & 0xffff)
358#define MEMFILE_ATTR(val)	((val) & 0xffff)
359/* Used for OOM notifier */
360#define OOM_CONTROL		(0)
361
362/*
363 * Reclaim flags for mem_cgroup_hierarchical_reclaim
364 */
365#define MEM_CGROUP_RECLAIM_NOSWAP_BIT	0x0
366#define MEM_CGROUP_RECLAIM_NOSWAP	(1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
367#define MEM_CGROUP_RECLAIM_SHRINK_BIT	0x1
368#define MEM_CGROUP_RECLAIM_SHRINK	(1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
369#define MEM_CGROUP_RECLAIM_SOFT_BIT	0x2
370#define MEM_CGROUP_RECLAIM_SOFT		(1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
371
372static void mem_cgroup_get(struct mem_cgroup *memcg);
373static void mem_cgroup_put(struct mem_cgroup *memcg);
374
375/* Writing them here to avoid exposing memcg's inner layout */
376#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
377#ifdef CONFIG_INET
378#include <net/sock.h>
379#include <net/ip.h>
380
381static bool mem_cgroup_is_root(struct mem_cgroup *memcg);
382void sock_update_memcg(struct sock *sk)
383{
384	if (static_branch(&memcg_socket_limit_enabled)) {
385		struct mem_cgroup *memcg;
386
387		BUG_ON(!sk->sk_prot->proto_cgroup);
388
389		/* Socket cloning can throw us here with sk_cgrp already
390		 * filled. It won't, however, necessarily happen from
391		 * process context. So the test for root memcg given
392		 * the current task's memcg won't help us in this case.
393		 *
394		 * Respecting the original socket's memcg is a better
395		 * decision in this case.
396		 */
397		if (sk->sk_cgrp) {
398			BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));
399			mem_cgroup_get(sk->sk_cgrp->memcg);
400			return;
401		}
402
403		rcu_read_lock();
404		memcg = mem_cgroup_from_task(current);
405		if (!mem_cgroup_is_root(memcg)) {
406			mem_cgroup_get(memcg);
407			sk->sk_cgrp = sk->sk_prot->proto_cgroup(memcg);
408		}
409		rcu_read_unlock();
410	}
411}
412EXPORT_SYMBOL(sock_update_memcg);
413
414void sock_release_memcg(struct sock *sk)
415{
416	if (static_branch(&memcg_socket_limit_enabled) && sk->sk_cgrp) {
417		struct mem_cgroup *memcg;
418		WARN_ON(!sk->sk_cgrp->memcg);
419		memcg = sk->sk_cgrp->memcg;
420		mem_cgroup_put(memcg);
421	}
422}
423
424struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
425{
426	if (!memcg || mem_cgroup_is_root(memcg))
427		return NULL;
428
429	return &memcg->tcp_mem.cg_proto;
430}
431EXPORT_SYMBOL(tcp_proto_cgroup);
432#endif /* CONFIG_INET */
433#endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */
434
435static void drain_all_stock_async(struct mem_cgroup *memcg);
436
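/*
 * Return the per-node, per-zone LRU bookkeeping (lists, counts, reclaim
 * stats and soft-limit tree node) of @memcg for the given node and zone.
 */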
437static struct mem_cgroup_per_zone *
438mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
439{
440	return &memcg->info.nodeinfo[nid]->zoneinfo[zid];
441}
442
443struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
444{
445	return &memcg->css;
446}
447
448static struct mem_cgroup_per_zone *
449page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
450{
451	int nid = page_to_nid(page);
452	int zid = page_zonenum(page);
453
454	return mem_cgroup_zoneinfo(memcg, nid, zid);
455}
456
457static struct mem_cgroup_tree_per_zone *
458soft_limit_tree_node_zone(int nid, int zid)
459{
460	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
461}
462
463static struct mem_cgroup_tree_per_zone *
464soft_limit_tree_from_page(struct page *page)
465{
466	int nid = page_to_nid(page);
467	int zid = page_zonenum(page);
468
469	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
470}
471
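/*
 * Insert @mz into the soft-limit RB-tree @mctz, ordered by how far the
 * group's usage exceeds its soft limit. Caller must hold mctz->lock.
 */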
472static void
473__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
474				struct mem_cgroup_per_zone *mz,
475				struct mem_cgroup_tree_per_zone *mctz,
476				unsigned long long new_usage_in_excess)
477{
478	struct rb_node **p = &mctz->rb_root.rb_node;
479	struct rb_node *parent = NULL;
480	struct mem_cgroup_per_zone *mz_node;
481
482	if (mz->on_tree)
483		return;
484
485	mz->usage_in_excess = new_usage_in_excess;
486	if (!mz->usage_in_excess)
487		return;
488	while (*p) {
489		parent = *p;
490		mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
491					tree_node);
492		if (mz->usage_in_excess < mz_node->usage_in_excess)
493			p = &(*p)->rb_left;
494		/*
495		 * We can't avoid mem cgroups that are over their soft
496		 * limit by the same amount
497		 */
498		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
499			p = &(*p)->rb_right;
500	}
501	rb_link_node(&mz->tree_node, parent, p);
502	rb_insert_color(&mz->tree_node, &mctz->rb_root);
503	mz->on_tree = true;
504}
505
506static void
507__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
508				struct mem_cgroup_per_zone *mz,
509				struct mem_cgroup_tree_per_zone *mctz)
510{
511	if (!mz->on_tree)
512		return;
513	rb_erase(&mz->tree_node, &mctz->rb_root);
514	mz->on_tree = false;
515}
516
517static void
518mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
519				struct mem_cgroup_per_zone *mz,
520				struct mem_cgroup_tree_per_zone *mctz)
521{
522	spin_lock(&mctz->lock);
523	__mem_cgroup_remove_exceeded(memcg, mz, mctz);
524	spin_unlock(&mctz->lock);
525}
526
527
528static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
529{
530	unsigned long long excess;
531	struct mem_cgroup_per_zone *mz;
532	struct mem_cgroup_tree_per_zone *mctz;
533	int nid = page_to_nid(page);
534	int zid = page_zonenum(page);
535	mctz = soft_limit_tree_from_page(page);
536
537	/*
538	 * Necessary to update all ancestors when hierarchy is used,
539	 * because their event counter is not touched.
540	 */
541	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
542		mz = mem_cgroup_zoneinfo(memcg, nid, zid);
543		excess = res_counter_soft_limit_excess(&memcg->res);
544		/*
545		 * We have to update the tree if mz is on the RB-tree or
546		 * memcg is over its soft limit.
547		 */
548		if (excess || mz->on_tree) {
549			spin_lock(&mctz->lock);
550			/* if on-tree, remove it */
551			if (mz->on_tree)
552				__mem_cgroup_remove_exceeded(memcg, mz, mctz);
553			/*
554			 * Insert again. mz->usage_in_excess will be updated.
555			 * If excess is 0, no tree ops.
556			 */
557			__mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
558			spin_unlock(&mctz->lock);
559		}
560	}
561}
562
563static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
564{
565	int node, zone;
566	struct mem_cgroup_per_zone *mz;
567	struct mem_cgroup_tree_per_zone *mctz;
568
569	for_each_node_state(node, N_POSSIBLE) {
570		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
571			mz = mem_cgroup_zoneinfo(memcg, node, zone);
572			mctz = soft_limit_tree_node_zone(node, zone);
573			mem_cgroup_remove_exceeded(memcg, mz, mctz);
574		}
575	}
576}
577
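/*
 * Pick the mem_cgroup_per_zone that exceeds its soft limit the most, i.e. the
 * rightmost node in the RB-tree, or NULL if the tree is empty. The node is
 * removed from the tree and a reference is taken on its memcg's css.
 * Caller must hold mctz->lock.
 */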
578static struct mem_cgroup_per_zone *
579__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
580{
581	struct rb_node *rightmost = NULL;
582	struct mem_cgroup_per_zone *mz;
583
584retry:
585	mz = NULL;
586	rightmost = rb_last(&mctz->rb_root);
587	if (!rightmost)
588		goto done;		/* Nothing to reclaim from */
589
590	mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
591	/*
592	 * Remove the node now, but someone else can add it back;
593	 * we will add it back at the end of reclaim to its correct
594	 * position in the tree.
595	 */
596	__mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
597	if (!res_counter_soft_limit_excess(&mz->mem->res) ||
598		!css_tryget(&mz->mem->css))
599		goto retry;
600done:
601	return mz;
602}
603
604static struct mem_cgroup_per_zone *
605mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
606{
607	struct mem_cgroup_per_zone *mz;
608
609	spin_lock(&mctz->lock);
610	mz = __mem_cgroup_largest_soft_limit_node(mctz);
611	spin_unlock(&mctz->lock);
612	return mz;
613}
614
615/*
616 * Implementation Note: reading percpu statistics for memcg.
617 *
618 * Both vmstat[] and percpu_counter use thresholds and do periodic
619 * synchronization to implement a "quick" read. There is a trade-off between
620 * reading cost and precision of the value, so we may get a chance to
621 * implement a periodic synchronization of the counters in memcg as well.
622 *
623 * But this _read() function is used for the user interface now. The user
624 * accounts memory usage by memory cgroup and _always_ requires an exact
625 * value because of that accounting. Even if we provided a quick-and-fuzzy
626 * read, we would still have to visit all online cpus and sum them up. So,
627 * for now, unnecessary synchronization is not implemented (it is only
628 * implemented for cpu hotplug).
629 *
630 * If there are kernel-internal users which can make do with a not-exact
631 * value, and reading all cpu values becomes a performance bottleneck in some
632 * common workload, a threshold and synchronization as in vmstat[] should be implemented.
633 */
634static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
635				 enum mem_cgroup_stat_index idx)
636{
637	long val = 0;
638	int cpu;
639
640	get_online_cpus();
641	for_each_online_cpu(cpu)
642		val += per_cpu(memcg->stat->count[idx], cpu);
643#ifdef CONFIG_HOTPLUG_CPU
644	spin_lock(&memcg->pcp_counter_lock);
645	val += memcg->nocpu_base.count[idx];
646	spin_unlock(&memcg->pcp_counter_lock);
647#endif
648	put_online_cpus();
649	return val;
650}
651
652static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
653					 bool charge)
654{
655	int val = (charge) ? 1 : -1;
656	this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
657}
658
659void mem_cgroup_pgfault(struct mem_cgroup *memcg, int val)
660{
661	this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val);
662}
663
664void mem_cgroup_pgmajfault(struct mem_cgroup *memcg, int val)
665{
666	this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val);
667}
668
669static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
670					    enum mem_cgroup_events_index idx)
671{
672	unsigned long val = 0;
673	int cpu;
674
675	for_each_online_cpu(cpu)
676		val += per_cpu(memcg->stat->events[idx], cpu);
677#ifdef CONFIG_HOTPLUG_CPU
678	spin_lock(&memcg->pcp_counter_lock);
679	val += memcg->nocpu_base.events[idx];
680	spin_unlock(&memcg->pcp_counter_lock);
681#endif
682	return val;
683}
684
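/*
 * Update the per-cpu statistics for a (un)charge of @nr_pages pages:
 * @nr_pages is positive for a charge and negative for an uncharge.
 */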
685static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
686					 bool file, int nr_pages)
687{
688	preempt_disable();
689
690	if (file)
691		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
692				nr_pages);
693	else
694		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
695				nr_pages);
696
697	/* pagein of a big page is an event. So, ignore page size */
698	if (nr_pages > 0)
699		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
700	else {
701		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
702		nr_pages = -nr_pages; /* for event */
703	}
704
705	__this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages);
706
707	preempt_enable();
708}
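/*
 * Sum the per-zone LRU page counts of @memcg for the LRU lists selected
 * by @lru_mask on node @nid, zone @zid.
 */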
709
710unsigned long
711mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
712			unsigned int lru_mask)
713{
714	struct mem_cgroup_per_zone *mz;
715	enum lru_list l;
716	unsigned long ret = 0;
717
718	mz = mem_cgroup_zoneinfo(memcg, nid, zid);
719
720	for_each_lru(l) {
721		if (BIT(l) & lru_mask)
722			ret += MEM_CGROUP_ZSTAT(mz, l);
723	}
724	return ret;
725}
726
727static unsigned long
728mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
729			int nid, unsigned int lru_mask)
730{
731	u64 total = 0;
732	int zid;
733
734	for (zid = 0; zid < MAX_NR_ZONES; zid++)
735		total += mem_cgroup_zone_nr_lru_pages(memcg,
736						nid, zid, lru_mask);
737
738	return total;
739}
740
741static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
742			unsigned int lru_mask)
743{
744	int nid;
745	u64 total = 0;
746
747	for_each_node_state(nid, N_HIGH_MEMORY)
748		total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
749	return total;
750}
751
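/*
 * Return true when the event counter has passed the next target for @target.
 * Uses the same wrap-safe comparison as time_after(); called with preemption
 * disabled (see memcg_check_events()).
 */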
752static bool __memcg_event_check(struct mem_cgroup *memcg, int target)
753{
754	unsigned long val, next;
755
756	val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]);
757	next = __this_cpu_read(memcg->stat->targets[target]);
758	/* from time_after() in jiffies.h */
759	return ((long)next - (long)val < 0);
760}
761
762static void __mem_cgroup_target_update(struct mem_cgroup *memcg, int target)
763{
764	unsigned long val, next;
765
766	val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]);
767
768	switch (target) {
769	case MEM_CGROUP_TARGET_THRESH:
770		next = val + THRESHOLDS_EVENTS_TARGET;
771		break;
772	case MEM_CGROUP_TARGET_SOFTLIMIT:
773		next = val + SOFTLIMIT_EVENTS_TARGET;
774		break;
775	case MEM_CGROUP_TARGET_NUMAINFO:
776		next = val + NUMAINFO_EVENTS_TARGET;
777		break;
778	default:
779		return;
780	}
781
782	__this_cpu_write(memcg->stat->targets[target], next);
783}
784
785/*
786 * Check events in order.
787 *
788 */
789static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
790{
791	preempt_disable();
792	/* threshold event is triggered in finer grain than soft limit */
793	if (unlikely(__memcg_event_check(memcg, MEM_CGROUP_TARGET_THRESH))) {
794		mem_cgroup_threshold(memcg);
795		__mem_cgroup_target_update(memcg, MEM_CGROUP_TARGET_THRESH);
796		if (unlikely(__memcg_event_check(memcg,
797			     MEM_CGROUP_TARGET_SOFTLIMIT))) {
798			mem_cgroup_update_tree(memcg, page);
799			__mem_cgroup_target_update(memcg,
800						   MEM_CGROUP_TARGET_SOFTLIMIT);
801		}
802#if MAX_NUMNODES > 1
803		if (unlikely(__memcg_event_check(memcg,
804			MEM_CGROUP_TARGET_NUMAINFO))) {
805			atomic_inc(&memcg->numainfo_events);
806			__mem_cgroup_target_update(memcg,
807				MEM_CGROUP_TARGET_NUMAINFO);
808		}
809#endif
810	}
811	preempt_enable();
812}
813
814struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
815{
816	return container_of(cgroup_subsys_state(cont,
817				mem_cgroup_subsys_id), struct mem_cgroup,
818				css);
819}
820
821struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
822{
823	/*
824	 * mm_update_next_owner() may clear mm->owner to NULL
825	 * if it races with swapoff, page migration, etc.
826	 * So this can be called with p == NULL.
827	 */
828	if (unlikely(!p))
829		return NULL;
830
831	return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
832				struct mem_cgroup, css);
833}
834
835struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
836{
837	struct mem_cgroup *memcg = NULL;
838
839	if (!mm)
840		return NULL;
841	/*
842	 * Because we have no locks, mm->owner may be being moved to another
843	 * cgroup. We use css_tryget() here even if this looks
844	 * pessimistic (rather than adding locks here).
845	 */
846	rcu_read_lock();
847	do {
848		memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
849		if (unlikely(!memcg))
850			break;
851	} while (!css_tryget(&memcg->css));
852	rcu_read_unlock();
853	return memcg;
854}
855
856/* The caller has to guarantee "mem" exists before calling this */
857static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *memcg)
858{
859	struct cgroup_subsys_state *css;
860	int found;
861
862	if (!memcg) /* ROOT cgroup has the smallest ID */
863		return root_mem_cgroup; /*css_put/get against root is ignored*/
864	if (!memcg->use_hierarchy) {
865		if (css_tryget(&memcg->css))
866			return memcg;
867		return NULL;
868	}
869	rcu_read_lock();
870	/*
871	 * Search for the memory cgroup which has the smallest ID (>= 1)
872	 * under the given ROOT cgroup.
873	 */
874	css = css_get_next(&mem_cgroup_subsys, 1, &memcg->css, &found);
875	if (css && css_tryget(css))
876		memcg = container_of(css, struct mem_cgroup, css);
877	else
878		memcg = NULL;
879	rcu_read_unlock();
880	return memcg;
881}
882
883static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter,
884					struct mem_cgroup *root,
885					bool cond)
886{
887	int nextid = css_id(&iter->css) + 1;
888	int found;
889	int hierarchy_used;
890	struct cgroup_subsys_state *css;
891
892	hierarchy_used = iter->use_hierarchy;
893
894	css_put(&iter->css);
895	/* If no ROOT, walk all, ignore hierarchy */
896	if (!cond || (root && !hierarchy_used))
897		return NULL;
898
899	if (!root)
900		root = root_mem_cgroup;
901
902	do {
903		iter = NULL;
904		rcu_read_lock();
905
906		css = css_get_next(&mem_cgroup_subsys, nextid,
907				&root->css, &found);
908		if (css && css_tryget(css))
909			iter = container_of(css, struct mem_cgroup, css);
910		rcu_read_unlock();
911		/* If css is NULL, no more cgroups will be found */
912		nextid = found + 1;
913	} while (css && !iter);
914
915	return iter;
916}
917/*
918 * for_each_mem_cgroup_tree() visits all cgroups under the tree. Please
919 * be careful that breaking out of the loop is not allowed, because we hold
920 * a reference count. Instead, set "cond" to false and "continue" to exit the loop.
921 */
922#define for_each_mem_cgroup_tree_cond(iter, root, cond)	\
923	for (iter = mem_cgroup_start_loop(root);\
924	     iter != NULL;\
925	     iter = mem_cgroup_get_next(iter, root, cond))
926
927#define for_each_mem_cgroup_tree(iter, root) \
928	for_each_mem_cgroup_tree_cond(iter, root, true)
929
930#define for_each_mem_cgroup_all(iter) \
931	for_each_mem_cgroup_tree_cond(iter, NULL, true)
932
933
934static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
935{
936	return (memcg == root_mem_cgroup);
937}
938
939void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
940{
941	struct mem_cgroup *memcg;
942
943	if (!mm)
944		return;
945
946	rcu_read_lock();
947	memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
948	if (unlikely(!memcg))
949		goto out;
950
951	switch (idx) {
952	case PGMAJFAULT:
953		mem_cgroup_pgmajfault(memcg, 1);
954		break;
955	case PGFAULT:
956		mem_cgroup_pgfault(memcg, 1);
957		break;
958	default:
959		BUG();
960	}
961out:
962	rcu_read_unlock();
963}
964EXPORT_SYMBOL(mem_cgroup_count_vm_event);
965
966/*
967 * The following LRU functions are allowed to be used without PCG_LOCK.
968 * Operations are called by the global LRU routines independently from memcg.
969 * What we have to take care of here is the validity of pc->mem_cgroup.
970 *
971 * Changes to pc->mem_cgroup happen on
972 * 1. charge
973 * 2. moving account
974 * In the typical case, "charge" is done before add-to-lru. The exception is
975 * SwapCache, which is added to the LRU before being charged.
976 * If the PCG_USED bit is not set, the page_cgroup is not added to this private LRU.
977 * When moving an account, the page is not on the LRU. It's isolated.
978 */
979
980void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
981{
982	struct page_cgroup *pc;
983	struct mem_cgroup_per_zone *mz;
984
985	if (mem_cgroup_disabled())
986		return;
987	pc = lookup_page_cgroup(page);
988	/* can happen while we handle swapcache. */
989	if (!TestClearPageCgroupAcctLRU(pc))
990		return;
991	VM_BUG_ON(!pc->mem_cgroup);
992	/*
993	 * We don't check PCG_USED bit. It's cleared when the "page" is finally
994	 * removed from global LRU.
995	 */
996	mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
997	/* huge page split is done under lru_lock. so, we have no races. */
998	MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page);
999	if (mem_cgroup_is_root(pc->mem_cgroup))
1000		return;
1001	VM_BUG_ON(list_empty(&pc->lru));
1002	list_del_init(&pc->lru);
1003}
1004
1005void mem_cgroup_del_lru(struct page *page)
1006{
1007	mem_cgroup_del_lru_list(page, page_lru(page));
1008}
1009
1010/*
1011 * Writeback is about to end against a page which has been marked for immediate
1012 * reclaim.  If it still appears to be reclaimable, move it to the tail of the
1013 * inactive list.
1014 */
1015void mem_cgroup_rotate_reclaimable_page(struct page *page)
1016{
1017	struct mem_cgroup_per_zone *mz;
1018	struct page_cgroup *pc;
1019	enum lru_list lru = page_lru(page);
1020
1021	if (mem_cgroup_disabled())
1022		return;
1023
1024	pc = lookup_page_cgroup(page);
1025	/* unused or root page is not rotated. */
1026	if (!PageCgroupUsed(pc))
1027		return;
1028	/* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
1029	smp_rmb();
1030	if (mem_cgroup_is_root(pc->mem_cgroup))
1031		return;
1032	mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
1033	list_move_tail(&pc->lru, &mz->lists[lru]);
1034}
1035
1036void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
1037{
1038	struct mem_cgroup_per_zone *mz;
1039	struct page_cgroup *pc;
1040
1041	if (mem_cgroup_disabled())
1042		return;
1043
1044	pc = lookup_page_cgroup(page);
1045	/* unused or root page is not rotated. */
1046	if (!PageCgroupUsed(pc))
1047		return;
1048	/* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
1049	smp_rmb();
1050	if (mem_cgroup_is_root(pc->mem_cgroup))
1051		return;
1052	mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
1053	list_move(&pc->lru, &mz->lists[lru]);
1054}
1055
1056void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
1057{
1058	struct page_cgroup *pc;
1059	struct mem_cgroup_per_zone *mz;
1060
1061	if (mem_cgroup_disabled())
1062		return;
1063	pc = lookup_page_cgroup(page);
1064	VM_BUG_ON(PageCgroupAcctLRU(pc));
1065	/*
1066	 * putback:				charge:
1067	 * SetPageLRU				SetPageCgroupUsed
1068	 * smp_mb				smp_mb
1069	 * PageCgroupUsed && add to memcg LRU	PageLRU && add to memcg LRU
1070	 *
1071	 * Ensure that one of the two sides adds the page to the memcg
1072	 * LRU during a race.
1073	 */
1074	smp_mb();
1075	if (!PageCgroupUsed(pc))
1076		return;
1077	/* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
1078	smp_rmb();
1079	mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
1080	/* huge page split is done under lru_lock. so, we have no races. */
1081	MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page);
1082	SetPageCgroupAcctLRU(pc);
1083	if (mem_cgroup_is_root(pc->mem_cgroup))
1084		return;
1085	list_add(&pc->lru, &mz->lists[lru]);
1086}
1087
1088/*
1089 * When handling SwapCache and other FUSE stuff, pc->mem_cgroup may be changed
1090 * while it's linked to the LRU because the page may be reused after it's fully
1091 * uncharged. To handle that, unlink the page_cgroup from the LRU when charging it again.
1092 * This is done under lock_page and it is expected that zone->lru_lock is never held.
1093 */
1094static void mem_cgroup_lru_del_before_commit(struct page *page)
1095{
1096	unsigned long flags;
1097	struct zone *zone = page_zone(page);
1098	struct page_cgroup *pc = lookup_page_cgroup(page);
1099
1100	/*
1101	 * Doing this check without taking ->lru_lock seems wrong but this
1102	 * is safe, because if the page_cgroup's USED bit is unset, the page
1103	 * will not be added to any memcg's LRU. If the page_cgroup's USED bit is
1104	 * set, the commit after this will fail anyway.
1105	 * All of this charge/uncharge is done under some mutual exclusion,
1106	 * so we don't need to take care of changes in the USED bit.
1107	 */
1108	if (likely(!PageLRU(page)))
1109		return;
1110
1111	spin_lock_irqsave(&zone->lru_lock, flags);
1112	/*
1113	 * Forget old LRU when this page_cgroup is *not* used. This Used bit
1114	 * is guarded by lock_page() because the page is SwapCache.
1115	 */
1116	if (!PageCgroupUsed(pc))
1117		mem_cgroup_del_lru_list(page, page_lru(page));
1118	spin_unlock_irqrestore(&zone->lru_lock, flags);
1119}
1120
1121static void mem_cgroup_lru_add_after_commit(struct page *page)
1122{
1123	unsigned long flags;
1124	struct zone *zone = page_zone(page);
1125	struct page_cgroup *pc = lookup_page_cgroup(page);
1126	/*
1127	 * putback:				charge:
1128	 * SetPageLRU				SetPageCgroupUsed
1129	 * smp_mb				smp_mb
1130	 * PageCgroupUsed && add to memcg LRU	PageLRU && add to memcg LRU
1131	 *
1132	 * Ensure that one of the two sides adds the page to the memcg
1133	 * LRU during a race.
1134	 */
1135	smp_mb();
1136	/* take care of the case where the page is added to the LRU while we commit it */
1137	if (likely(!PageLRU(page)))
1138		return;
1139	spin_lock_irqsave(&zone->lru_lock, flags);
1140	/* link when the page is linked to LRU but page_cgroup isn't */
1141	if (PageLRU(page) && !PageCgroupAcctLRU(pc))
1142		mem_cgroup_add_lru_list(page, page_lru(page));
1143	spin_unlock_irqrestore(&zone->lru_lock, flags);
1144}
1145
1146
1147void mem_cgroup_move_lists(struct page *page,
1148			   enum lru_list from, enum lru_list to)
1149{
1150	if (mem_cgroup_disabled())
1151		return;
1152	mem_cgroup_del_lru_list(page, from);
1153	mem_cgroup_add_lru_list(page, to);
1154}
1155
1156/*
1157 * Checks whether given mem is same or in the root_mem_cgroup's
1158 * hierarchy subtree
1159 */
1160static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1161		struct mem_cgroup *memcg)
1162{
1163	if (root_memcg != memcg) {
1164		return (root_memcg->use_hierarchy &&
1165			css_is_ancestor(&memcg->css, &root_memcg->css));
1166	}
1167
1168	return true;
1169}
1170
1171int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
1172{
1173	int ret;
1174	struct mem_cgroup *curr = NULL;
1175	struct task_struct *p;
1176
1177	p = find_lock_task_mm(task);
1178	if (!p)
1179		return 0;
1180	curr = try_get_mem_cgroup_from_mm(p->mm);
1181	task_unlock(p);
1182	if (!curr)
1183		return 0;
1184	/*
1185	 * We should check use_hierarchy of "memcg", not "curr". Checking
1186	 * use_hierarchy of "curr" here would make this function return true if
1187	 * hierarchy is enabled in "curr" and "curr" is a child of "memcg" in the
1188	 * *cgroup* hierarchy (even if use_hierarchy is disabled in "memcg").
1189	 */
1190	ret = mem_cgroup_same_or_subtree(memcg, curr);
1191	css_put(&curr->css);
1192	return ret;
1193}
1194
1195int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone)
1196{
1197	unsigned long inactive_ratio;
1198	int nid = zone_to_nid(zone);
1199	int zid = zone_idx(zone);
1200	unsigned long inactive;
1201	unsigned long active;
1202	unsigned long gb;
1203
1204	inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
1205						BIT(LRU_INACTIVE_ANON));
1206	active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
1207					      BIT(LRU_ACTIVE_ANON));
1208
1209	gb = (inactive + active) >> (30 - PAGE_SHIFT);
1210	if (gb)
1211		inactive_ratio = int_sqrt(10 * gb);
1212	else
1213		inactive_ratio = 1;
1214
1215	return inactive * inactive_ratio < active;
1216}
1217
1218int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg, struct zone *zone)
1219{
1220	unsigned long active;
1221	unsigned long inactive;
1222	int zid = zone_idx(zone);
1223	int nid = zone_to_nid(zone);
1224
1225	inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
1226						BIT(LRU_INACTIVE_FILE));
1227	active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
1228					      BIT(LRU_ACTIVE_FILE));
1229
1230	return (active > inactive);
1231}
1232
1233struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
1234						      struct zone *zone)
1235{
1236	int nid = zone_to_nid(zone);
1237	int zid = zone_idx(zone);
1238	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
1239
1240	return &mz->reclaim_stat;
1241}
1242
1243struct zone_reclaim_stat *
1244mem_cgroup_get_reclaim_stat_from_page(struct page *page)
1245{
1246	struct page_cgroup *pc;
1247	struct mem_cgroup_per_zone *mz;
1248
1249	if (mem_cgroup_disabled())
1250		return NULL;
1251
1252	pc = lookup_page_cgroup(page);
1253	if (!PageCgroupUsed(pc))
1254		return NULL;
1255	/* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
1256	smp_rmb();
1257	mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
1258	return &mz->reclaim_stat;
1259}
1260
1261unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
1262					struct list_head *dst,
1263					unsigned long *scanned, int order,
1264					isolate_mode_t mode,
1265					struct zone *z,
1266					struct mem_cgroup *mem_cont,
1267					int active, int file)
1268{
1269	unsigned long nr_taken = 0;
1270	struct page *page;
1271	unsigned long scan;
1272	LIST_HEAD(pc_list);
1273	struct list_head *src;
1274	struct page_cgroup *pc, *tmp;
1275	int nid = zone_to_nid(z);
1276	int zid = zone_idx(z);
1277	struct mem_cgroup_per_zone *mz;
1278	int lru = LRU_FILE * file + active;
1279	int ret;
1280
1281	BUG_ON(!mem_cont);
1282	mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
1283	src = &mz->lists[lru];
1284
1285	scan = 0;
1286	list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
1287		if (scan >= nr_to_scan)
1288			break;
1289
1290		if (unlikely(!PageCgroupUsed(pc)))
1291			continue;
1292
1293		page = lookup_cgroup_page(pc);
1294
1295		if (unlikely(!PageLRU(page)))
1296			continue;
1297
1298		scan++;
1299		ret = __isolate_lru_page(page, mode, file);
1300		switch (ret) {
1301		case 0:
1302			list_move(&page->lru, dst);
1303			mem_cgroup_del_lru(page);
1304			nr_taken += hpage_nr_pages(page);
1305			break;
1306		case -EBUSY:
1307			/* we don't affect global LRU but rotate in our LRU */
1308			mem_cgroup_rotate_lru_list(page, page_lru(page));
1309			break;
1310		default:
1311			break;
1312		}
1313	}
1314
1315	*scanned = scan;
1316
1317	trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
1318				      0, 0, 0, mode);
1319
1320	return nr_taken;
1321}
1322
1323#define mem_cgroup_from_res_counter(counter, member)	\
1324	container_of(counter, struct mem_cgroup, member)
1325
1326/**
1327 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
1328 * @memcg: the memory cgroup
1329 *
1330 * Returns the maximum amount of memory @memcg can be charged with, in
1331 * pages.
1332 */
1333static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1334{
1335	unsigned long long margin;
1336
1337	margin = res_counter_margin(&memcg->res);
1338	if (do_swap_account)
1339		margin = min(margin, res_counter_margin(&memcg->memsw));
1340	return margin >> PAGE_SHIFT;
1341}
1342
1343int mem_cgroup_swappiness(struct mem_cgroup *memcg)
1344{
1345	struct cgroup *cgrp = memcg->css.cgroup;
1346
1347	/* root ? */
1348	if (cgrp->parent == NULL)
1349		return vm_swappiness;
1350
1351	return memcg->swappiness;
1352}
1353
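/*
 * Raise the per-cpu MEM_CGROUP_ON_MOVE counters so that statistics update
 * sites can tell a move_account() is in progress; paired with
 * mem_cgroup_end_move().
 */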
1354static void mem_cgroup_start_move(struct mem_cgroup *memcg)
1355{
1356	int cpu;
1357
1358	get_online_cpus();
1359	spin_lock(&memcg->pcp_counter_lock);
1360	for_each_online_cpu(cpu)
1361		per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
1362	memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1;
1363	spin_unlock(&memcg->pcp_counter_lock);
1364	put_online_cpus();
1365
1366	synchronize_rcu();
1367}
1368
1369static void mem_cgroup_end_move(struct mem_cgroup *memcg)
1370{
1371	int cpu;
1372
1373	if (!memcg)
1374		return;
1375	get_online_cpus();
1376	spin_lock(&memcg->pcp_counter_lock);
1377	for_each_online_cpu(cpu)
1378		per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
1379	memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1;
1380	spin_unlock(&memcg->pcp_counter_lock);
1381	put_online_cpus();
1382}
1383/*
1384 * Two routines for checking whether "mem" is under move_account() or not.
1385 *
1386 * mem_cgroup_stealed() - checks whether a cgroup is mc.from. This is used
1387 *			  to avoid a race in accounting. If true,
1388 *			  pc->mem_cgroup may be overwritten.
1389 *
1390 * mem_cgroup_under_move() - checks whether a cgroup is mc.from, mc.to, or
1391 *			  under the hierarchy of moving cgroups. This is for
1392 *			  waiting at high memory pressure caused by "move".
1393 */
1394
1395static bool mem_cgroup_stealed(struct mem_cgroup *memcg)
1396{
1397	VM_BUG_ON(!rcu_read_lock_held());
1398	return this_cpu_read(memcg->stat->count[MEM_CGROUP_ON_MOVE]) > 0;
1399}
1400
1401static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1402{
1403	struct mem_cgroup *from;
1404	struct mem_cgroup *to;
1405	bool ret = false;
1406	/*
1407	 * Unlike task_move routines, we access mc.to, mc.from not under
1408	 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
1409	 */
1410	spin_lock(&mc.lock);
1411	from = mc.from;
1412	to = mc.to;
1413	if (!from)
1414		goto unlock;
1415
1416	ret = mem_cgroup_same_or_subtree(memcg, from)
1417		|| mem_cgroup_same_or_subtree(memcg, to);
1418unlock:
1419	spin_unlock(&mc.lock);
1420	return ret;
1421}
1422
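/*
 * If a charge-moving operation involving @memcg is in flight, sleep until it
 * wakes us via mc.waitq. Returns true if we waited.
 */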
1423static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1424{
1425	if (mc.moving_task && current != mc.moving_task) {
1426		if (mem_cgroup_under_move(memcg)) {
1427			DEFINE_WAIT(wait);
1428			prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1429			/* moving charge context might have finished. */
1430			if (mc.moving_task)
1431				schedule();
1432			finish_wait(&mc.waitq, &wait);
1433			return true;
1434		}
1435	}
1436	return false;
1437}
1438
1439/**
1440 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
1441 * @memcg: The memory cgroup that went over limit
1442 * @p: Task that is going to be killed
1443 *
1444 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
1445 * enabled
1446 */
1447void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1448{
1449	struct cgroup *task_cgrp;
1450	struct cgroup *mem_cgrp;
1451	/*
1452	 * Need a buffer in BSS, can't rely on allocations. The code relies
1453	 * on the assumption that OOM is serialized for memory controller.
1454	 * If this assumption is broken, revisit this code.
1455	 */
1456	static char memcg_name[PATH_MAX];
1457	int ret;
1458
1459	if (!memcg || !p)
1460		return;
1461
1462
1463	rcu_read_lock();
1464
1465	mem_cgrp = memcg->css.cgroup;
1466	task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
1467
1468	ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
1469	if (ret < 0) {
1470		/*
1471		 * Unfortunately, we are unable to convert to a useful name,
1472		 * but we'll still print out the usage information.
1473		 */
1474		rcu_read_unlock();
1475		goto done;
1476	}
1477	rcu_read_unlock();
1478
1479	printk(KERN_INFO "Task in %s killed", memcg_name);
1480
1481	rcu_read_lock();
1482	ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
1483	if (ret < 0) {
1484		rcu_read_unlock();
1485		goto done;
1486	}
1487	rcu_read_unlock();
1488
1489	/*
1490	 * Continues from above, so we don't need a KERN_ level
1491	 */
1492	printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
1493done:
1494
1495	printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
1496		res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
1497		res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
1498		res_counter_read_u64(&memcg->res, RES_FAILCNT));
1499	printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
1500		"failcnt %llu\n",
1501		res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
1502		res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
1503		res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
1504}
1505
1506/*
1507 * This function returns the number of memcg under hierarchy tree. Returns
1508 * 1(self count) if no children.
1509 */
1510static int mem_cgroup_count_children(struct mem_cgroup *memcg)
1511{
1512	int num = 0;
1513	struct mem_cgroup *iter;
1514
1515	for_each_mem_cgroup_tree(iter, memcg)
1516		num++;
1517	return num;
1518}
1519
1520/*
1521 * Return the memory (and swap, if configured) limit for a memcg.
1522 */
1523u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1524{
1525	u64 limit;
1526	u64 memsw;
1527
1528	limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
1529	limit += total_swap_pages << PAGE_SHIFT;
1530
1531	memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1532	/*
1533	 * If memsw is finite and limits the amount of swap space available
1534	 * to this memcg, return that limit.
1535	 */
1536	return min(limit, memsw);
1537}
1538
1539/*
1540 * Visit the first child (need not be the first child as per the ordering
1541 * of the cgroup list, since we track last_scanned_child) of @root_memcg and use
1542 * that to reclaim free pages from.
1543 */
1544static struct mem_cgroup *
1545mem_cgroup_select_victim(struct mem_cgroup *root_memcg)
1546{
1547	struct mem_cgroup *ret = NULL;
1548	struct cgroup_subsys_state *css;
1549	int nextid, found;
1550
1551	if (!root_memcg->use_hierarchy) {
1552		css_get(&root_memcg->css);
1553		ret = root_memcg;
1554	}
1555
1556	while (!ret) {
1557		rcu_read_lock();
1558		nextid = root_memcg->last_scanned_child + 1;
1559		css = css_get_next(&mem_cgroup_subsys, nextid, &root_memcg->css,
1560				   &found);
1561		if (css && css_tryget(css))
1562			ret = container_of(css, struct mem_cgroup, css);
1563
1564		rcu_read_unlock();
1565		/* Updates scanning parameter */
1566		if (!css) {
1567			/* this means start scan from ID:1 */
1568			root_memcg->last_scanned_child = 0;
1569		} else
1570			root_memcg->last_scanned_child = found;
1571	}
1572
1573	return ret;
1574}
1575
1576/**
1577 * test_mem_cgroup_node_reclaimable
1578 * @memcg: the target memcg
1579 * @nid: the node ID to be checked.
1580 * @noswap: specify true here if the user wants file-only information.
1581 *
1582 * This function returns whether the specified memcg contains any
1583 * reclaimable pages on a node. Returns true if there are any reclaimable
1584 * pages in the node.
1585 */
1586static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
1587		int nid, bool noswap)
1588{
1589	if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
1590		return true;
1591	if (noswap || !total_swap_pages)
1592		return false;
1593	if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
1594		return true;
1595	return false;
1596
1597}
1598#if MAX_NUMNODES > 1
1599
1600/*
1601 * Always updating the nodemask is not very good - even if we have an empty
1602 * list or the wrong list here, we can start from some node and traverse all
1603 * nodes based on the zonelist. So update the list loosely once per 10 secs.
1604 *
1605 */
1606static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
1607{
1608	int nid;
1609	/*
1610	 * numainfo_events > 0 means there were at least NUMAINFO_EVENTS_TARGET
1611	 * pagein/pageout changes since the last update.
1612	 */
1613	if (!atomic_read(&memcg->numainfo_events))
1614		return;
1615	if (atomic_inc_return(&memcg->numainfo_updating) > 1)
1616		return;
1617
1618	/* make a nodemask where this memcg uses memory from */
1619	memcg->scan_nodes = node_states[N_HIGH_MEMORY];
1620
1621	for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
1622
1623		if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
1624			node_clear(nid, memcg->scan_nodes);
1625	}
1626
1627	atomic_set(&memcg->numainfo_events, 0);
1628	atomic_set(&memcg->numainfo_updating, 0);
1629}
1630
1631/*
1632 * Select a node to start reclaim from. Because all we need is to reduce the
1633 * usage counter, starting from anywhere is OK. Considering
1634 * memory reclaim from the current node, there are pros and cons.
1635 *
1636 * Freeing memory from the current node means freeing memory from a node which
1637 * we'll use or have used. So it may make the LRU bad. And if several threads
1638 * hit their limits, they will see contention on a node. But freeing from a remote
1639 * node means more cost for memory reclaim because of memory latency.
1640 *
1641 * For now, we use round-robin. A better algorithm is welcome.
1642 */
1643int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1644{
1645	int node;
1646
1647	mem_cgroup_may_update_nodemask(memcg);
1648	node = memcg->last_scanned_node;
1649
1650	node = next_node(node, memcg->scan_nodes);
1651	if (node == MAX_NUMNODES)
1652		node = first_node(memcg->scan_nodes);
1653	/*
1654	 * We call this when we hit the limit, not when pages are added to the LRU.
1655	 * No LRU may hold pages because all pages are UNEVICTABLE, or the
1656	 * memcg is too small and all pages are not on the LRU. In that case,
1657	 * we use the current node.
1658	 */
1659	if (unlikely(node == MAX_NUMNODES))
1660		node = numa_node_id();
1661
1662	memcg->last_scanned_node = node;
1663	return node;
1664}
1665
1666/*
1667 * Check all nodes for whether they contain reclaimable pages or not.
1668 * For a quick scan, we make use of scan_nodes. This allows us to skip
1669 * unused nodes. But scan_nodes is lazily updated and may not contain
1670 * enough new information, so we need to double check.
1671 */
1672bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1673{
1674	int nid;
1675
1676	/*
1677	 * quick check...making use of scan_node.
1678	 * We can skip unused nodes.
1679	 */
1680	if (!nodes_empty(memcg->scan_nodes)) {
1681		for (nid = first_node(memcg->scan_nodes);
1682		     nid < MAX_NUMNODES;
1683		     nid = next_node(nid, memcg->scan_nodes)) {
1684
1685			if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
1686				return true;
1687		}
1688	}
1689	/*
1690	 * Check rest of nodes.
1691	 */
1692	for_each_node_state(nid, N_HIGH_MEMORY) {
1693		if (node_isset(nid, memcg->scan_nodes))
1694			continue;
1695		if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
1696			return true;
1697	}
1698	return false;
1699}
1700
1701#else
1702int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1703{
1704	return 0;
1705}
1706
1707bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1708{
1709	return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
1710}
1711#endif
1712
1713/*
1714 * Scan the hierarchy if needed to reclaim memory. We remember the last child
1715 * we reclaimed from, so that we don't end up penalizing one child extensively
1716 * based on its position in the children list.
1717 *
1718 * root_memcg is the original ancestor that we've been reclaiming from.
1719 *
1720 * We give up and return to the caller when we visit root_memcg twice.
1721 * (other groups can be removed while we're walking....)
1722 *
1723 * If shrink==true, to avoid freeing too much, this returns immediately.
1724 */
1725static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg,
1726						struct zone *zone,
1727						gfp_t gfp_mask,
1728						unsigned long reclaim_options,
1729						unsigned long *total_scanned)
1730{
1731	struct mem_cgroup *victim;
1732	int ret, total = 0;
1733	int loop = 0;
1734	bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
1735	bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
1736	bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
1737	unsigned long excess;
1738	unsigned long nr_scanned;
1739
1740	excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
1741
1742	/* If memsw_is_minimum==1, swap-out is of no use. */
1743	if (!check_soft && !shrink && root_memcg->memsw_is_minimum)
1744		noswap = true;
1745
1746	while (1) {
1747		victim = mem_cgroup_select_victim(root_memcg);
1748		if (victim == root_memcg) {
1749			loop++;
1750			/*
1751			 * We are not draining per-cpu cached charges during
1752			 * soft limit reclaim because global reclaim doesn't
1753			 * care about charges. It tries to free some memory and
1754			 * draining charges will not give it any.
1755			 */
1756			if (!check_soft && loop >= 1)
1757				drain_all_stock_async(root_memcg);
1758			if (loop >= 2) {
1759				/*
1760				 * If we have not been able to reclaim
1761				 * anything, it might be because there are
1762				 * no reclaimable pages under this hierarchy.
1763				 */
1764				if (!check_soft || !total) {
1765					css_put(&victim->css);
1766					break;
1767				}
1768				/*
1769				 * We want to do more targeted reclaim.
1770				 * excess >> 2 is not too excessive, so we do not
1771				 * reclaim too much, nor too little, so we do not keep
1772				 * coming back to reclaim from this cgroup.
1773				 */
1774				if (total >= (excess >> 2) ||
1775					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
1776					css_put(&victim->css);
1777					break;
1778				}
1779			}
1780		}
1781		if (!mem_cgroup_reclaimable(victim, noswap)) {
1782			/* this cgroup's local usage == 0 */
1783			css_put(&victim->css);
1784			continue;
1785		}
1786		/* we use swappiness of local cgroup */
1787		if (check_soft) {
1788			ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
1789				noswap, zone, &nr_scanned);
1790			*total_scanned += nr_scanned;
1791		} else
1792			ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
1793						noswap);
1794		css_put(&victim->css);
1795		/*
1796		 * When shrinking usage, we can't check whether we should stop here or
1797		 * reclaim more. That depends on the callers. last_scanned_child
1798		 * will work well enough for keeping fairness under the tree.
1799		 */
1800		if (shrink)
1801			return ret;
1802		total += ret;
1803		if (check_soft) {
1804			if (!res_counter_soft_limit_excess(&root_memcg->res))
1805				return total;
1806		} else if (mem_cgroup_margin(root_memcg))
1807			return total;
1808	}
1809	return total;
1810}
1811
1812/*
1813 * Check OOM-Killer is already running under our hierarchy.
1814 * If someone is running, return false.
1815 * Has to be called with memcg_oom_lock
1816 */
1817static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
1818{
1819	struct mem_cgroup *iter, *failed = NULL;
1820	bool cond = true;
1821
1822	for_each_mem_cgroup_tree_cond(iter, memcg, cond) {
1823		if (iter->oom_lock) {
1824			/*
1825			 * this subtree of our hierarchy is already locked
1826			 * so we cannot acquire the lock.
1827			 */
1828			failed = iter;
1829			cond = false;
1830		} else
1831			iter->oom_lock = true;
1832	}
1833
1834	if (!failed)
1835		return true;
1836
1837	/*
1838	 * OK, we failed to lock the whole subtree so we have to clean up
1839	 * what we set up, up to the failing subtree.
1840	 */
1841	cond = true;
1842	for_each_mem_cgroup_tree_cond(iter, memcg, cond) {
1843		if (iter == failed) {
1844			cond = false;
1845			continue;
1846		}
1847		iter->oom_lock = false;
1848	}
1849	return false;
1850}
1851
1852/*
1853 * Has to be called with memcg_oom_lock
1854 */
1855static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
1856{
1857	struct mem_cgroup *iter;
1858
1859	for_each_mem_cgroup_tree(iter, memcg)
1860		iter->oom_lock = false;
1861	return 0;
1862}
1863
1864static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
1865{
1866	struct mem_cgroup *iter;
1867
1868	for_each_mem_cgroup_tree(iter, memcg)
1869		atomic_inc(&iter->under_oom);
1870}
1871
1872static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
1873{
1874	struct mem_cgroup *iter;
1875
1876	/*
1877	 * When a new child is created while the hierarchy is under oom,
1878	 * mem_cgroup_mark_under_oom() may not have been called for it, so its
1879	 * under_oom can still be zero. We have to use
1880	 * atomic_add_unless() here to avoid going negative.
1880	 */
1881	for_each_mem_cgroup_tree(iter, memcg)
1882		atomic_add_unless(&iter->under_oom, -1, 0);
1883}
1884
1885static DEFINE_SPINLOCK(memcg_oom_lock);
1886static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1887
1888struct oom_wait_info {
1889	struct mem_cgroup *mem;
1890	wait_queue_t	wait;
1891};
1892
1893static int memcg_oom_wake_function(wait_queue_t *wait,
1894	unsigned mode, int sync, void *arg)
1895{
1896	struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg,
1897			  *oom_wait_memcg;
1898	struct oom_wait_info *oom_wait_info;
1899
1900	oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1901	oom_wait_memcg = oom_wait_info->mem;
1902
1903	/*
1904	 * Both oom_wait_info->mem and wake_memcg are stable under us,
1905	 * so we can use css_is_ancestor() without worrying about RCU.
1906	 */
1907	if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg)
1908		&& !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg))
1909		return 0;
1910	return autoremove_wake_function(wait, mode, sync, arg);
1911}
1912
1913static void memcg_wakeup_oom(struct mem_cgroup *memcg)
1914{
1915	/* for filtering, pass "memcg" as argument. */
1916	__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
1917}
1918
1919static void memcg_oom_recover(struct mem_cgroup *memcg)
1920{
1921	if (memcg && atomic_read(&memcg->under_oom))
1922		memcg_wakeup_oom(memcg);
1923}
1924
1925/*
1926 * try to call OOM killer. returns false if we should exit memory-reclaim loop.
1927 */
1928bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask)
1929{
1930	struct oom_wait_info owait;
1931	bool locked, need_to_kill;
1932
1933	owait.mem = memcg;
1934	owait.wait.flags = 0;
1935	owait.wait.func = memcg_oom_wake_function;
1936	owait.wait.private = current;
1937	INIT_LIST_HEAD(&owait.wait.task_list);
1938	need_to_kill = true;
1939	mem_cgroup_mark_under_oom(memcg);
1940
1941	/* At first, try to OOM lock hierarchy under memcg.*/
1942	spin_lock(&memcg_oom_lock);
1943	locked = mem_cgroup_oom_lock(memcg);
1944	/*
1945	 * Even if signal_pending(), we can't quit charge() loop without
1946	 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
1947	 * under OOM is always welcome, so use TASK_KILLABLE here.
1948	 */
1949	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
1950	if (!locked || memcg->oom_kill_disable)
1951		need_to_kill = false;
1952	if (locked)
1953		mem_cgroup_oom_notify(memcg);
1954	spin_unlock(&memcg_oom_lock);
1955
1956	if (need_to_kill) {
1957		finish_wait(&memcg_oom_waitq, &owait.wait);
1958		mem_cgroup_out_of_memory(memcg, mask);
1959	} else {
1960		schedule();
1961		finish_wait(&memcg_oom_waitq, &owait.wait);
1962	}
1963	spin_lock(&memcg_oom_lock);
1964	if (locked)
1965		mem_cgroup_oom_unlock(memcg);
1966	memcg_wakeup_oom(memcg);
1967	spin_unlock(&memcg_oom_lock);
1968
1969	mem_cgroup_unmark_under_oom(memcg);
1970
1971	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
1972		return false;
1973	/* Give chance to dying process */
1974	schedule_timeout_uninterruptible(1);
1975	return true;
1976}
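/*
 * Note on how the return value is consumed (see mem_cgroup_do_charge()
 * below): a "false" return is translated into CHARGE_OOM_DIE, which makes
 * __mem_cgroup_try_charge() bypass the charge entirely; "true" becomes
 * CHARGE_RETRY and the charge is attempted again.
 */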
1977
1978/*
1979 * Currently used to update mapped file statistics, but the routine can be
1980 * generalized to update other statistics as well.
1981 *
1982 * Notes: Race condition
1983 *
1984 * We usually use page_cgroup_lock() for accessing page_cgroup members, but
1985 * it tends to be costly. Under certain conditions, we don't need
1986 * to do so _always_.
1987 *
1988 * Considering "charge", lock_page_cgroup() is not required because all
1989 * file-stat operations happen after a page is attached to the radix tree.
1990 * There is no race with "charge".
1991 *
1992 * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup
1993 * at "uncharge" intentionally. So, we always see a valid pc->mem_cgroup
1994 * even if we race with "uncharge". The statistics themselves are properly
1995 * handled via flags.
1996 *
1997 * Considering "move", this is the only case where we see a race. To keep
1998 * the race window small, we check the MEM_CGROUP_ON_MOVE percpu value and
1999 * detect whether a race is possible. If it is, we take a lock.
2000 */
2001
2002void mem_cgroup_update_page_stat(struct page *page,
2003				 enum mem_cgroup_page_stat_item idx, int val)
2004{
2005	struct mem_cgroup *memcg;
2006	struct page_cgroup *pc = lookup_page_cgroup(page);
2007	bool need_unlock = false;
2008	unsigned long uninitialized_var(flags);
2009
2010	if (unlikely(!pc))
2011		return;
2012
2013	rcu_read_lock();
2014	memcg = pc->mem_cgroup;
2015	if (unlikely(!memcg || !PageCgroupUsed(pc)))
2016		goto out;
2017	/* pc->mem_cgroup is unstable ? */
2018	if (unlikely(mem_cgroup_stealed(memcg)) || PageTransHuge(page)) {
2019		/* take a lock to stabilize access to pc->mem_cgroup */
2020		move_lock_page_cgroup(pc, &flags);
2021		need_unlock = true;
2022		memcg = pc->mem_cgroup;
2023		if (!memcg || !PageCgroupUsed(pc))
2024			goto out;
2025	}
2026
2027	switch (idx) {
2028	case MEMCG_NR_FILE_MAPPED:
2029		if (val > 0)
2030			SetPageCgroupFileMapped(pc);
2031		else if (!page_mapped(page))
2032			ClearPageCgroupFileMapped(pc);
2033		idx = MEM_CGROUP_STAT_FILE_MAPPED;
2034		break;
2035	default:
2036		BUG();
2037	}
2038
2039	this_cpu_add(memcg->stat->count[idx], val);
2040
2041out:
2042	if (unlikely(need_unlock))
2043		move_unlock_page_cgroup(pc, &flags);
2044	rcu_read_unlock();
2045	return;
2046}
2047EXPORT_SYMBOL(mem_cgroup_update_page_stat);
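/*
 * Illustrative use (a sketch of how callers outside this file are expected
 * to use this hook; the exact wrapper names live in the header): when a
 * file page gains a mapping, the rmap code does roughly
 *
 *	mem_cgroup_update_page_stat(page, MEMCG_NR_FILE_MAPPED, 1);
 *
 * and the mirror call with -1 when the mapping goes away. The
 * PageCgroupFileMapped flag above keeps the per-memcg counter consistent
 * even if the page changes groups in the meantime.
 */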
2048
2049/*
2050 * size of first charge trial. "32" comes from vmscan.c's magic value.
2051 * TODO: big machines may need a larger batch size.
2052 */
2053#define CHARGE_BATCH	32U
2054struct memcg_stock_pcp {
2055	struct mem_cgroup *cached; /* this is never the root cgroup */
2056	unsigned int nr_pages;
2057	struct work_struct work;
2058	unsigned long flags;
2059#define FLUSHING_CACHED_CHARGE	(0)
2060};
2061static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
2062static DEFINE_MUTEX(percpu_charge_mutex);
2063
2064/*
2065 * Try to consume stocked charge on this cpu. On success, one page is consumed
2066 * from the local stock and true is returned. If the stock is empty or holds
2067 * charges from a cgroup other than the current target, false is returned.
2068 * The stock will be refilled later.
2069 */
2070static bool consume_stock(struct mem_cgroup *memcg)
2071{
2072	struct memcg_stock_pcp *stock;
2073	bool ret = true;
2074
2075	stock = &get_cpu_var(memcg_stock);
2076	if (memcg == stock->cached && stock->nr_pages)
2077		stock->nr_pages--;
2078	else /* need to call res_counter_charge */
2079		ret = false;
2080	put_cpu_var(memcg_stock);
2081	return ret;
2082}
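/*
 * A typical fast-path use of the stock (see __mem_cgroup_try_charge()
 * below) is roughly
 *
 *	if (nr_pages == 1 && consume_stock(memcg))
 *		goto done;	/- charged from the per-cpu cache,
 *				   no res_counter round trip needed -/
 *
 * i.e. single-page charges can often be satisfied without touching the
 * shared res_counter at all.
 */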
2083
2084/*
2085 * Return the stock cached in percpu to the res_counter and reset the cached info.
2086 */
2087static void drain_stock(struct memcg_stock_pcp *stock)
2088{
2089	struct mem_cgroup *old = stock->cached;
2090
2091	if (stock->nr_pages) {
2092		unsigned long bytes = stock->nr_pages * PAGE_SIZE;
2093
2094		res_counter_uncharge(&old->res, bytes);
2095		if (do_swap_account)
2096			res_counter_uncharge(&old->memsw, bytes);
2097		stock->nr_pages = 0;
2098	}
2099	stock->cached = NULL;
2100}
2101
2102/*
2103 * This must be called with preemption disabled, or by
2104 * a thread which is pinned to the local cpu.
2105 */
2106static void drain_local_stock(struct work_struct *dummy)
2107{
2108	struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
2109	drain_stock(stock);
2110	clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2111}
2112
2113/*
2114 * Cache charges (nr_pages) taken from the res_counter in the local per-cpu
2115 * area. They will later be consumed by consume_stock().
2116 */
2117static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2118{
2119	struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
2120
2121	if (stock->cached != memcg) { /* reset if necessary */
2122		drain_stock(stock);
2123		stock->cached = memcg;
2124	}
2125	stock->nr_pages += nr_pages;
2126	put_cpu_var(memcg_stock);
2127}
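/*
 * The refill side of the same protocol (again, see __mem_cgroup_try_charge()
 * below): charges are taken in CHARGE_BATCH units and the surplus is
 * stocked, roughly
 *
 *	if (batch > nr_pages)
 *		refill_stock(memcg, batch - nr_pages);
 *
 * so later single-page charges on this cpu can hit the cached stock.
 */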
2128
2129/*
2130 * Drain all per-CPU charge caches for the given root_memcg and the subtree
2131 * of the hierarchy under it. The sync flag says whether we should block
2132 * until the work is done.
2133 */
2134static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
2135{
2136	int cpu, curcpu;
2137
2138	/* Notify other cpus that system-wide "drain" is running */
2139	get_online_cpus();
2140	curcpu = get_cpu();
2141	for_each_online_cpu(cpu) {
2142		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2143		struct mem_cgroup *memcg;
2144
2145		memcg = stock->cached;
2146		if (!memcg || !stock->nr_pages)
2147			continue;
2148		if (!mem_cgroup_same_or_subtree(root_memcg, memcg))
2149			continue;
2150		if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2151			if (cpu == curcpu)
2152				drain_local_stock(&stock->work);
2153			else
2154				schedule_work_on(cpu, &stock->work);
2155		}
2156	}
2157	put_cpu();
2158
2159	if (!sync)
2160		goto out;
2161
2162	for_each_online_cpu(cpu) {
2163		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2164		if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
2165			flush_work(&stock->work);
2166	}
2167out:
2168 	put_online_cpus();
2169}
2170
2171/*
2172 * Tries to drain stocked charges on other cpus. This function is asynchronous
2173 * and just schedules a work item per cpu to drain locally on that cpu. The
2174 * caller can expect some charges to come back to the res_counter later, but
2175 * cannot wait for that.
2176 */
2177static void drain_all_stock_async(struct mem_cgroup *root_memcg)
2178{
2179	/*
2180	 * If someone calls draining, avoid adding more kworker runs.
2181	 */
2182	if (!mutex_trylock(&percpu_charge_mutex))
2183		return;
2184	drain_all_stock(root_memcg, false);
2185	mutex_unlock(&percpu_charge_mutex);
2186}
2187
2188/* This is a synchronous drain interface. */
2189static void drain_all_stock_sync(struct mem_cgroup *root_memcg)
2190{
2191	/* called when force_empty is called */
2192	mutex_lock(&percpu_charge_mutex);
2193	drain_all_stock(root_memcg, true);
2194	mutex_unlock(&percpu_charge_mutex);
2195}
2196
2197/*
2198 * This function drains the percpu counter values from a dead cpu and
2199 * moves them into memcg->nocpu_base. Note that this function can be preempted.
2200 */
2201static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
2202{
2203	int i;
2204
2205	spin_lock(&memcg->pcp_counter_lock);
2206	for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) {
2207		long x = per_cpu(memcg->stat->count[i], cpu);
2208
2209		per_cpu(memcg->stat->count[i], cpu) = 0;
2210		memcg->nocpu_base.count[i] += x;
2211	}
2212	for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
2213		unsigned long x = per_cpu(memcg->stat->events[i], cpu);
2214
2215		per_cpu(memcg->stat->events[i], cpu) = 0;
2216		memcg->nocpu_base.events[i] += x;
2217	}
2218	/* need to clear ON_MOVE value, works as a kind of lock. */
2219	per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0;
2220	spin_unlock(&memcg->pcp_counter_lock);
2221}
2222
2223static void synchronize_mem_cgroup_on_move(struct mem_cgroup *memcg, int cpu)
2224{
2225	int idx = MEM_CGROUP_ON_MOVE;
2226
2227	spin_lock(&memcg->pcp_counter_lock);
2228	per_cpu(memcg->stat->count[idx], cpu) = memcg->nocpu_base.count[idx];
2229	spin_unlock(&memcg->pcp_counter_lock);
2230}
2231
2232static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
2233					unsigned long action,
2234					void *hcpu)
2235{
2236	int cpu = (unsigned long)hcpu;
2237	struct memcg_stock_pcp *stock;
2238	struct mem_cgroup *iter;
2239
2240	if ((action == CPU_ONLINE)) {
2241		for_each_mem_cgroup_all(iter)
2242			synchronize_mem_cgroup_on_move(iter, cpu);
2243		return NOTIFY_OK;
2244	}
2245
2246	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
2247		return NOTIFY_OK;
2248
2249	for_each_mem_cgroup_all(iter)
2250		mem_cgroup_drain_pcp_counter(iter, cpu);
2251
2252	stock = &per_cpu(memcg_stock, cpu);
2253	drain_stock(stock);
2254	return NOTIFY_OK;
2255}
2256
2257
2258/* See __mem_cgroup_try_charge() for details */
2259enum {
2260	CHARGE_OK,		/* success */
2261	CHARGE_RETRY,		/* need to retry but retry is not bad */
2262	CHARGE_NOMEM,		/* we can't do more. return -ENOMEM */
2263	CHARGE_WOULDBLOCK,	/* __GFP_WAIT wasn't set and not enough resources */
2264	CHARGE_OOM_DIE,		/* the current is killed because of OOM */
2265};
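/*
 * Rough mapping from these results to what __mem_cgroup_try_charge() does
 * with them (see the switch below): CHARGE_OK completes the charge,
 * CHARGE_RETRY drops back to a single-page charge and loops,
 * CHARGE_WOULDBLOCK ends in -ENOMEM, CHARGE_NOMEM does too unless the
 * caller allows OOM handling, and CHARGE_OOM_DIE bypasses the charge for
 * a task killed by the OOM killer.
 */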
2266
2267static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2268				unsigned int nr_pages, bool oom_check)
2269{
2270	unsigned long csize = nr_pages * PAGE_SIZE;
2271	struct mem_cgroup *mem_over_limit;
2272	struct res_counter *fail_res;
2273	unsigned long flags = 0;
2274	int ret;
2275
2276	ret = res_counter_charge(&memcg->res, csize, &fail_res);
2277
2278	if (likely(!ret)) {
2279		if (!do_swap_account)
2280			return CHARGE_OK;
2281		ret = res_counter_charge(&memcg->memsw, csize, &fail_res);
2282		if (likely(!ret))
2283			return CHARGE_OK;
2284
2285		res_counter_uncharge(&memcg->res, csize);
2286		mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
2287		flags |= MEM_CGROUP_RECLAIM_NOSWAP;
2288	} else
2289		mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
2290	/*
2291	 * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch
2292	 * of regular pages (CHARGE_BATCH), or a single regular page (1).
2293	 *
2294	 * Never reclaim on behalf of optional batching, retry with a
2295	 * single page instead.
2296	 */
2297	if (nr_pages == CHARGE_BATCH)
2298		return CHARGE_RETRY;
2299
2300	if (!(gfp_mask & __GFP_WAIT))
2301		return CHARGE_WOULDBLOCK;
2302
2303	ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
2304					      gfp_mask, flags, NULL);
2305	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2306		return CHARGE_RETRY;
2307	/*
2308	 * Even though the limit is exceeded at this point, reclaim
2309	 * may have been able to free some pages.  Retry the charge
2310	 * before killing the task.
2311	 *
2312	 * Only for regular pages, though: huge pages are rather
2313	 * unlikely to succeed so close to the limit, and we fall back
2314	 * to regular pages anyway in case of failure.
2315	 */
2316	if (nr_pages == 1 && ret)
2317		return CHARGE_RETRY;
2318
2319	/*
2320	 * At task move, charge accounts can be doubly counted. So, it's
2321	 * better to wait until the end of task_move if something is going on.
2322	 */
2323	if (mem_cgroup_wait_acct_move(mem_over_limit))
2324		return CHARGE_RETRY;
2325
2326	/* If we don't need to call the oom-killer at all, return immediately */
2327	if (!oom_check)
2328		return CHARGE_NOMEM;
2329	/* check OOM */
2330	if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask))
2331		return CHARGE_OOM_DIE;
2332
2333	return CHARGE_RETRY;
2334}
2335
2336/*
2337 * Unlike the exported interface, an "oom" parameter is added. If oom==true,
2338 * the OOM killer can be invoked.
2339 */
2340static int __mem_cgroup_try_charge(struct mm_struct *mm,
2341				   gfp_t gfp_mask,
2342				   unsigned int nr_pages,
2343				   struct mem_cgroup **ptr,
2344				   bool oom)
2345{
2346	unsigned int batch = max(CHARGE_BATCH, nr_pages);
2347	int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
2348	struct mem_cgroup *memcg = NULL;
2349	int ret;
2350
2351	/*
2352	 * Unlike the global VM's OOM kill, we are not under a system-level
2353	 * memory shortage. So, allow dying processes to proceed in addition to
2354	 * MEMDIE processes.
2355	 */
2356	if (unlikely(test_thread_flag(TIF_MEMDIE)
2357		     || fatal_signal_pending(current)))
2358		goto bypass;
2359
2360	/*
2361	 * We always charge the cgroup the mm_struct belongs to.
2362	 * The mm_struct's mem_cgroup changes on task migration if the
2363	 * thread group leader migrates. It's possible that mm is not
2364	 * set, if so charge the init_mm (happens for pagecache usage).
2365	 */
2366	if (!*ptr && !mm)
2367		goto bypass;
2368again:
2369	if (*ptr) { /* css should be a valid one */
2370		memcg = *ptr;
2371		VM_BUG_ON(css_is_removed(&memcg->css));
2372		if (mem_cgroup_is_root(memcg))
2373			goto done;
2374		if (nr_pages == 1 && consume_stock(memcg))
2375			goto done;
2376		css_get(&memcg->css);
2377	} else {
2378		struct task_struct *p;
2379
2380		rcu_read_lock();
2381		p = rcu_dereference(mm->owner);
2382		/*
2383		 * Because we don't have task_lock(), "p" can exit.
2384		 * In that case, "memcg" can point to root, or p can be NULL due
2385		 * to a race with swapoff. Then we have a small risk of
2386		 * mis-accounting. But such mis-accounting due to races can always
2387		 * happen because we don't hold cgroup_mutex(); that would be
2388		 * overkill, so we allow that small race here.
2389		 * (*) swapoff et al. will charge against the mm_struct, not the
2390		 * task_struct. So, mm->owner can be NULL.
2391		 */
2392		memcg = mem_cgroup_from_task(p);
2393		if (!memcg || mem_cgroup_is_root(memcg)) {
2394			rcu_read_unlock();
2395			goto done;
2396		}
2397		if (nr_pages == 1 && consume_stock(memcg)) {
2398			/*
2399			 * It seems dangerous to access memcg without css_get().
2400			 * But considering how consume_stock works, it's not
2401			 * necessary. If consume_stock succeeds, some charges
2402			 * from this memcg are cached on this cpu. So, we
2403			 * don't need to call css_get()/css_tryget() before
2404			 * calling consume_stock().
2405			 */
2406			rcu_read_unlock();
2407			goto done;
2408		}
2409		/* after here, we may be blocked. we need to get refcnt */
2410		if (!css_tryget(&memcg->css)) {
2411			rcu_read_unlock();
2412			goto again;
2413		}
2414		rcu_read_unlock();
2415	}
2416
2417	do {
2418		bool oom_check;
2419
2420		/* If killed, bypass charge */
2421		if (fatal_signal_pending(current)) {
2422			css_put(&memcg->css);
2423			goto bypass;
2424		}
2425
2426		oom_check = false;
2427		if (oom && !nr_oom_retries) {
2428			oom_check = true;
2429			nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
2430		}
2431
2432		ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, oom_check);
2433		switch (ret) {
2434		case CHARGE_OK:
2435			break;
2436		case CHARGE_RETRY: /* not in OOM situation but retry */
2437			batch = nr_pages;
2438			css_put(&memcg->css);
2439			memcg = NULL;
2440			goto again;
2441		case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
2442			css_put(&memcg->css);
2443			goto nomem;
2444		case CHARGE_NOMEM: /* OOM routine works */
2445			if (!oom) {
2446				css_put(&memcg->css);
2447				goto nomem;
2448			}
2449			/* If oom, we never return -ENOMEM */
2450			nr_oom_retries--;
2451			break;
2452		case CHARGE_OOM_DIE: /* Killed by OOM Killer */
2453			css_put(&memcg->css);
2454			goto bypass;
2455		}
2456	} while (ret != CHARGE_OK);
2457
2458	if (batch > nr_pages)
2459		refill_stock(memcg, batch - nr_pages);
2460	css_put(&memcg->css);
2461done:
2462	*ptr = memcg;
2463	return 0;
2464nomem:
2465	*ptr = NULL;
2466	return -ENOMEM;
2467bypass:
2468	*ptr = NULL;
2469	return 0;
2470}
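/*
 * Caller pattern (a sketch; see mem_cgroup_charge_common() and the swap-in
 * helpers below for the real thing):
 *
 *	memcg = NULL;
 *	ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
 *	if (ret || !memcg)
 *		return ret;	/- failed, or bypassed (root / killed task) -/
 *	__mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype);
 *	/- or __mem_cgroup_cancel_charge(memcg, nr_pages) on an error path -/
 */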
2471
2472 * Sometimes we have to undo a charge we got by try_charge().
2473 * This function is for that and does the uncharge of the charges
2474 * gotten by try_charge().
2475 * gotten by try_charge().
2476 */
2477static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
2478				       unsigned int nr_pages)
2479{
2480	if (!mem_cgroup_is_root(memcg)) {
2481		unsigned long bytes = nr_pages * PAGE_SIZE;
2482
2483		res_counter_uncharge(&memcg->res, bytes);
2484		if (do_swap_account)
2485			res_counter_uncharge(&memcg->memsw, bytes);
2486	}
2487}
2488
2489/*
2490 * A helper function to get a mem_cgroup from its ID. Must be called under
2491 * rcu_read_lock(). The caller must check css_is_removed() or similar if
2492 * that is a concern. (dropping a refcnt from swap can be called against a
2493 * removed memcg.)
2494 */
2495static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2496{
2497	struct cgroup_subsys_state *css;
2498
2499	/* ID 0 is unused ID */
2500	if (!id)
2501		return NULL;
2502	css = css_lookup(&mem_cgroup_subsys, id);
2503	if (!css)
2504		return NULL;
2505	return container_of(css, struct mem_cgroup, css);
2506}
2507
2508struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2509{
2510	struct mem_cgroup *memcg = NULL;
2511	struct page_cgroup *pc;
2512	unsigned short id;
2513	swp_entry_t ent;
2514
2515	VM_BUG_ON(!PageLocked(page));
2516
2517	pc = lookup_page_cgroup(page);
2518	lock_page_cgroup(pc);
2519	if (PageCgroupUsed(pc)) {
2520		memcg = pc->mem_cgroup;
2521		if (memcg && !css_tryget(&memcg->css))
2522			memcg = NULL;
2523	} else if (PageSwapCache(page)) {
2524		ent.val = page_private(page);
2525		id = lookup_swap_cgroup(ent);
2526		rcu_read_lock();
2527		memcg = mem_cgroup_lookup(id);
2528		if (memcg && !css_tryget(&memcg->css))
2529			memcg = NULL;
2530		rcu_read_unlock();
2531	}
2532	unlock_page_cgroup(pc);
2533	return memcg;
2534}
2535
2536static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2537				       struct page *page,
2538				       unsigned int nr_pages,
2539				       struct page_cgroup *pc,
2540				       enum charge_type ctype)
2541{
2542	lock_page_cgroup(pc);
2543	if (unlikely(PageCgroupUsed(pc))) {
2544		unlock_page_cgroup(pc);
2545		__mem_cgroup_cancel_charge(memcg, nr_pages);
2546		return;
2547	}
2548	/*
2549	 * we don't need lock_page_cgroup() for tail pages, because they are not
2550	 * accessed by any other context at this point.
2551	 */
2552	pc->mem_cgroup = memcg;
2553	/*
2554	 * We access a page_cgroup asynchronously without lock_page_cgroup().
2555	 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
2556	 * is accessed after testing USED bit. To make pc->mem_cgroup visible
2557	 * before USED bit, we need memory barrier here.
2558	 * See mem_cgroup_add_lru_list(), etc.
2559 	 */
2560	smp_wmb();
2561	switch (ctype) {
2562	case MEM_CGROUP_CHARGE_TYPE_CACHE:
2563	case MEM_CGROUP_CHARGE_TYPE_SHMEM:
2564		SetPageCgroupCache(pc);
2565		SetPageCgroupUsed(pc);
2566		break;
2567	case MEM_CGROUP_CHARGE_TYPE_MAPPED:
2568		ClearPageCgroupCache(pc);
2569		SetPageCgroupUsed(pc);
2570		break;
2571	default:
2572		break;
2573	}
2574
2575	mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), nr_pages);
2576	unlock_page_cgroup(pc);
2577	/*
2578	 * "charge_statistics" updated the event counter, so check it:
2579	 * insert the ancestor (and the ancestor's ancestors) into the softlimit
2580	 * RB-tree if they exceed their softlimit.
2581	 */
2582	memcg_check_events(memcg, page);
2583}
2584
2585#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2586
2587#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\
2588			(1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION))
2589/*
2590 * Because the tail page is not marked as "used", set it. We're under
2591 * zone->lru_lock, 'splitting on pmd' and compound_lock.
2592 */
2593void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail)
2594{
2595	struct page_cgroup *head_pc = lookup_page_cgroup(head);
2596	struct page_cgroup *tail_pc = lookup_page_cgroup(tail);
2597	unsigned long flags;
2598
2599	if (mem_cgroup_disabled())
2600		return;
2601	/*
2602	 * We have no races with charge/uncharge but will have races with
2603	 * page state accounting.
2604	 */
2605	move_lock_page_cgroup(head_pc, &flags);
2606
2607	tail_pc->mem_cgroup = head_pc->mem_cgroup;
2608	smp_wmb(); /* see __commit_charge() */
2609	if (PageCgroupAcctLRU(head_pc)) {
2610		enum lru_list lru;
2611		struct mem_cgroup_per_zone *mz;
2612
2613		/*
2614		 * LRU flags cannot be copied because we need to add the tail
2615		 * page to the LRU by the generic call, where our hook will be
2616		 * called. We hold lru_lock, so reduce the counter directly.
2617		 */
2618		lru = page_lru(head);
2619		mz = page_cgroup_zoneinfo(head_pc->mem_cgroup, head);
2620		MEM_CGROUP_ZSTAT(mz, lru) -= 1;
2621	}
2622	tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
2623	move_unlock_page_cgroup(head_pc, &flags);
2624}
2625#endif
2626
2627/**
2628 * mem_cgroup_move_account - move account of the page
2629 * @page: the page
2630 * @nr_pages: number of regular pages (>1 for huge pages)
2631 * @pc:	page_cgroup of the page.
2632 * @from: mem_cgroup which the page is moved from.
2633 * @to:	mem_cgroup which the page is moved to. @from != @to.
2634 * @uncharge: whether we should call uncharge and css_put against @from.
2635 *
2636 * The caller must confirm following.
2637 * - page is not on LRU (isolate_page() is useful.)
2638 * - compound_lock is held when nr_pages > 1
2639 *
2640 * This function doesn't do "charge" nor css_get to the new cgroup; that should
2641 * be done by the caller (__mem_cgroup_try_charge would be useful). If @uncharge
2642 * is true, this function does "uncharge" from the old cgroup; if @uncharge is
2643 * false it doesn't, and the caller should do the "uncharge" itself.
2644 */
2645static int mem_cgroup_move_account(struct page *page,
2646				   unsigned int nr_pages,
2647				   struct page_cgroup *pc,
2648				   struct mem_cgroup *from,
2649				   struct mem_cgroup *to,
2650				   bool uncharge)
2651{
2652	unsigned long flags;
2653	int ret;
2654
2655	VM_BUG_ON(from == to);
2656	VM_BUG_ON(PageLRU(page));
2657	/*
2658	 * The page is isolated from LRU. So, collapse function
2659	 * will not handle this page. But page splitting can happen.
2660	 * Do this check under compound_page_lock(). The caller should
2661	 * hold it.
2662	 */
2663	ret = -EBUSY;
2664	if (nr_pages > 1 && !PageTransHuge(page))
2665		goto out;
2666
2667	lock_page_cgroup(pc);
2668
2669	ret = -EINVAL;
2670	if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
2671		goto unlock;
2672
2673	move_lock_page_cgroup(pc, &flags);
2674
2675	if (PageCgroupFileMapped(pc)) {
2676		/* Update mapped_file data for mem_cgroup */
2677		preempt_disable();
2678		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
2679		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
2680		preempt_enable();
2681	}
2682	mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages);
2683	if (uncharge)
2684		/* This is not "cancel", but cancel_charge does all we need. */
2685		__mem_cgroup_cancel_charge(from, nr_pages);
2686
2687	/* caller should have done css_get */
2688	pc->mem_cgroup = to;
2689	mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages);
2690	/*
2691	 * We charge against "to", which may not have any tasks. Then "to"
2692	 * can be under rmdir(). But in the current implementation, the callers
2693	 * of this function are only force_empty() and move charge, so it's
2694	 * guaranteed that "to" is never removed. So, we don't check the rmdir
2695	 * status here.
2696	 */
2697	move_unlock_page_cgroup(pc, &flags);
2698	ret = 0;
2699unlock:
2700	unlock_page_cgroup(pc);
2701	/*
2702	 * check events
2703	 */
2704	memcg_check_events(to, page);
2705	memcg_check_events(from, page);
2706out:
2707	return ret;
2708}
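/*
 * Locking sketch for callers (taken from mem_cgroup_move_parent() below):
 * the page is isolated from the LRU first, and for a huge page the
 * compound lock is held around the move, roughly
 *
 *	if (nr_pages > 1)
 *		flags = compound_lock_irqsave(page);
 *	ret = mem_cgroup_move_account(page, nr_pages, pc, from, to, uncharge);
 *	if (nr_pages > 1)
 *		compound_unlock_irqrestore(page, flags);
 */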
2709
2710/*
2711 * move charges to its parent.
2712 */
2713
2714static int mem_cgroup_move_parent(struct page *page,
2715				  struct page_cgroup *pc,
2716				  struct mem_cgroup *child,
2717				  gfp_t gfp_mask)
2718{
2719	struct cgroup *cg = child->css.cgroup;
2720	struct cgroup *pcg = cg->parent;
2721	struct mem_cgroup *parent;
2722	unsigned int nr_pages;
2723	unsigned long uninitialized_var(flags);
2724	int ret;
2725
2726	/* Is ROOT ? */
2727	if (!pcg)
2728		return -EINVAL;
2729
2730	ret = -EBUSY;
2731	if (!get_page_unless_zero(page))
2732		goto out;
2733	if (isolate_lru_page(page))
2734		goto put;
2735
2736	nr_pages = hpage_nr_pages(page);
2737
2738	parent = mem_cgroup_from_cont(pcg);
2739	ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false);
2740	if (ret || !parent)
2741		goto put_back;
2742
2743	if (nr_pages > 1)
2744		flags = compound_lock_irqsave(page);
2745
2746	ret = mem_cgroup_move_account(page, nr_pages, pc, child, parent, true);
2747	if (ret)
2748		__mem_cgroup_cancel_charge(parent, nr_pages);
2749
2750	if (nr_pages > 1)
2751		compound_unlock_irqrestore(page, flags);
2752put_back:
2753	putback_lru_page(page);
2754put:
2755	put_page(page);
2756out:
2757	return ret;
2758}
2759
2760/*
2761 * Charge the memory controller for page usage.
2762 * Return
2763 * 0 if the charge was successful
2764 * < 0 if the cgroup is over its limit
2765 */
2766static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
2767				gfp_t gfp_mask, enum charge_type ctype)
2768{
2769	struct mem_cgroup *memcg = NULL;
2770	unsigned int nr_pages = 1;
2771	struct page_cgroup *pc;
2772	bool oom = true;
2773	int ret;
2774
2775	if (PageTransHuge(page)) {
2776		nr_pages <<= compound_order(page);
2777		VM_BUG_ON(!PageTransHuge(page));
2778		/*
2779		 * Never OOM-kill a process for a huge page.  The
2780		 * fault handler will fall back to regular pages.
2781		 */
2782		oom = false;
2783	}
2784
2785	pc = lookup_page_cgroup(page);
2786	BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */
2787
2788	ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
2789	if (ret || !memcg)
2790		return ret;
2791
2792	__mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype);
2793	return 0;
2794}
2795
2796int mem_cgroup_newpage_charge(struct page *page,
2797			      struct mm_struct *mm, gfp_t gfp_mask)
2798{
2799	if (mem_cgroup_disabled())
2800		return 0;
2801	/*
2802	 * If already mapped, we don't have to account.
2803	 * If page cache, page->mapping has address_space.
2804	 * But page->mapping may hold a stale anon_vma pointer;
2805	 * detect that with a PageAnon() check. A newly-mapped anon page's
2806	 * page->mapping is NULL.
2807  	 */
2808	if (page_mapped(page) || (page->mapping && !PageAnon(page)))
2809		return 0;
2810	if (unlikely(!mm))
2811		mm = &init_mm;
2812	return mem_cgroup_charge_common(page, mm, gfp_mask,
2813				MEM_CGROUP_CHARGE_TYPE_MAPPED);
2814}
2815
2816static void
2817__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2818					enum charge_type ctype);
2819
2820static void
2821__mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *memcg,
2822					enum charge_type ctype)
2823{
2824	struct page_cgroup *pc = lookup_page_cgroup(page);
2825	/*
2826	 * In some cases (SwapCache, FUSE's splice_buf->radixtree), the page
2827	 * is already on the LRU. It means the page may be on some other
2828	 * page_cgroup's LRU. Take care of it.
2829	 */
2830	mem_cgroup_lru_del_before_commit(page);
2831	__mem_cgroup_commit_charge(memcg, page, 1, pc, ctype);
2832	mem_cgroup_lru_add_after_commit(page);
2833	return;
2834}
2835
2836int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2837				gfp_t gfp_mask)
2838{
2839	struct mem_cgroup *memcg = NULL;
2840	int ret;
2841
2842	if (mem_cgroup_disabled())
2843		return 0;
2844	if (PageCompound(page))
2845		return 0;
2846
2847	if (unlikely(!mm))
2848		mm = &init_mm;
2849
2850	if (page_is_file_cache(page)) {
2851		ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &memcg, true);
2852		if (ret || !memcg)
2853			return ret;
2854
2855		/*
2856		 * FUSE reuses pages without going through the final
2857		 * put that would remove them from the LRU list, make
2858		 * sure that they get relinked properly.
2859		 */
2860		__mem_cgroup_commit_charge_lrucare(page, memcg,
2861					MEM_CGROUP_CHARGE_TYPE_CACHE);
2862		return ret;
2863	}
2864	/* shmem */
2865	if (PageSwapCache(page)) {
2866		ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg);
2867		if (!ret)
2868			__mem_cgroup_commit_charge_swapin(page, memcg,
2869					MEM_CGROUP_CHARGE_TYPE_SHMEM);
2870	} else
2871		ret = mem_cgroup_charge_common(page, mm, gfp_mask,
2872					MEM_CGROUP_CHARGE_TYPE_SHMEM);
2873
2874	return ret;
2875}
2876
2877/*
2878 * While swap-in, try_charge -> commit or cancel, the page is locked.
2879 * And when try_charge() successfully returns, one refcnt to memcg without
2880 * struct page_cgroup is acquired. This refcnt will be consumed by
2881 * "commit()" or removed by "cancel()"
2882 */
2883int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2884				 struct page *page,
2885				 gfp_t mask, struct mem_cgroup **ptr)
2886{
2887	struct mem_cgroup *memcg;
2888	int ret;
2889
2890	*ptr = NULL;
2891
2892	if (mem_cgroup_disabled())
2893		return 0;
2894
2895	if (!do_swap_account)
2896		goto charge_cur_mm;
2897	/*
2898	 * A racing thread's fault, or swapoff, may have already updated
2899	 * the pte, and even removed page from swap cache: in those cases
2900	 * do_swap_page()'s pte_same() test will fail; but there's also a
2901	 * KSM case which does need to charge the page.
2902	 */
2903	if (!PageSwapCache(page))
2904		goto charge_cur_mm;
2905	memcg = try_get_mem_cgroup_from_page(page);
2906	if (!memcg)
2907		goto charge_cur_mm;
2908	*ptr = memcg;
2909	ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true);
2910	css_put(&memcg->css);
2911	return ret;
2912charge_cur_mm:
2913	if (unlikely(!mm))
2914		mm = &init_mm;
2915	return __mem_cgroup_try_charge(mm, mask, 1, ptr, true);
2916}
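/*
 * The swap-in charge is therefore a three-step dance for the fault path
 * (a sketch, assuming the usual do_swap_page()-style caller):
 *
 *	ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg);
 *	...					/- map the page, or fail -/
 *	mem_cgroup_commit_charge_swapin(page, memcg);	/- on success -/
 *	mem_cgroup_cancel_charge_swapin(memcg);		/- on error -/
 *
 * commit/cancel consume the reference described in the comment above.
 */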
2917
2918static void
2919__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2920					enum charge_type ctype)
2921{
2922	if (mem_cgroup_disabled())
2923		return;
2924	if (!ptr)
2925		return;
2926	cgroup_exclude_rmdir(&ptr->css);
2927
2928	__mem_cgroup_commit_charge_lrucare(page, ptr, ctype);
2929	/*
2930	 * Now swap is on-memory. This means this page may be
2931	 * counted both as mem and swap....double count.
2932	 * Fix it by uncharging from memsw. Basically, this SwapCache is stable
2933	 * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page()
2934	 * may call delete_from_swap_cache() before we reach here.
2935	 */
2936	if (do_swap_account && PageSwapCache(page)) {
2937		swp_entry_t ent = {.val = page_private(page)};
2938		unsigned short id;
2939		struct mem_cgroup *memcg;
2940
2941		id = swap_cgroup_record(ent, 0);
2942		rcu_read_lock();
2943		memcg = mem_cgroup_lookup(id);
2944		if (memcg) {
2945			/*
2946			 * This recorded memcg can be an obsolete one. So, avoid
2947			 * calling css_tryget().
2948			 */
2949			if (!mem_cgroup_is_root(memcg))
2950				res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
2951			mem_cgroup_swap_statistics(memcg, false);
2952			mem_cgroup_put(memcg);
2953		}
2954		rcu_read_unlock();
2955	}
2956	/*
2957	 * At swapin, we may charge against a cgroup which has no tasks.
2958	 * So, rmdir()->pre_destroy() can be called while we do this charge.
2959	 * In that case, we need to call pre_destroy() again. check it here.
2960	 */
2961	cgroup_release_and_wakeup_rmdir(&ptr->css);
2962}
2963
2964void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
2965{
2966	__mem_cgroup_commit_charge_swapin(page, ptr,
2967					MEM_CGROUP_CHARGE_TYPE_MAPPED);
2968}
2969
2970void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
2971{
2972	if (mem_cgroup_disabled())
2973		return;
2974	if (!memcg)
2975		return;
2976	__mem_cgroup_cancel_charge(memcg, 1);
2977}
2978
2979static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
2980				   unsigned int nr_pages,
2981				   const enum charge_type ctype)
2982{
2983	struct memcg_batch_info *batch = NULL;
2984	bool uncharge_memsw = true;
2985
2986	/* If swapout, usage of swap doesn't decrease */
2987	if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
2988		uncharge_memsw = false;
2989
2990	batch = &current->memcg_batch;
2991	/*
2992	 * Usually, we do css_get() when we remember a memcg pointer.
2993	 * But in this case, we keep res->usage until end of a series of
2994	 * uncharges. Then, it's ok to ignore memcg's refcnt.
2995	 */
2996	if (!batch->memcg)
2997		batch->memcg = memcg;
2998	/*
2999	 * do_batch > 0 when unmapping pages or inode invalidate/truncate.
3000	 * In those cases, all pages freed continuously can be expected to be in
3001	 * the same cgroup and we have chance to coalesce uncharges.
3002	 * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
3003	 * because we want to do uncharge as soon as possible.
3004	 */
3005
3006	if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
3007		goto direct_uncharge;
3008
3009	if (nr_pages > 1)
3010		goto direct_uncharge;
3011
3012	/*
3013	 * In the typical case, batch->memcg == memcg. This means we can
3014	 * merge a series of uncharges into one res_counter uncharge.
3015	 * If not, we uncharge the res_counter one by one.
3016	 */
3017	if (batch->memcg != memcg)
3018		goto direct_uncharge;
3019	/* remember freed charge and uncharge it later */
3020	batch->nr_pages++;
3021	if (uncharge_memsw)
3022		batch->memsw_nr_pages++;
3023	return;
3024direct_uncharge:
3025	res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE);
3026	if (uncharge_memsw)
3027		res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE);
3028	if (unlikely(batch->memcg != memcg))
3029		memcg_oom_recover(memcg);
3030	return;
3031}
3032
3033/*
3034 * uncharge if !page_mapped(page)
3035 */
3036static struct mem_cgroup *
3037__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
3038{
3039	struct mem_cgroup *memcg = NULL;
3040	unsigned int nr_pages = 1;
3041	struct page_cgroup *pc;
3042
3043	if (mem_cgroup_disabled())
3044		return NULL;
3045
3046	if (PageSwapCache(page))
3047		return NULL;
3048
3049	if (PageTransHuge(page)) {
3050		nr_pages <<= compound_order(page);
3051		VM_BUG_ON(!PageTransHuge(page));
3052	}
3053	/*
3054	 * Check if our page_cgroup is valid
3055	 */
3056	pc = lookup_page_cgroup(page);
3057	if (unlikely(!pc || !PageCgroupUsed(pc)))
3058		return NULL;
3059
3060	lock_page_cgroup(pc);
3061
3062	memcg = pc->mem_cgroup;
3063
3064	if (!PageCgroupUsed(pc))
3065		goto unlock_out;
3066
3067	switch (ctype) {
3068	case MEM_CGROUP_CHARGE_TYPE_MAPPED:
3069	case MEM_CGROUP_CHARGE_TYPE_DROP:
3070		/* See mem_cgroup_prepare_migration() */
3071		if (page_mapped(page) || PageCgroupMigration(pc))
3072			goto unlock_out;
3073		break;
3074	case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
3075		if (!PageAnon(page)) {	/* Shared memory */
3076			if (page->mapping && !page_is_file_cache(page))
3077				goto unlock_out;
3078		} else if (page_mapped(page)) /* Anon */
3079				goto unlock_out;
3080		break;
3081	default:
3082		break;
3083	}
3084
3085	mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -nr_pages);
3086
3087	ClearPageCgroupUsed(pc);
3088	/*
3089	 * pc->mem_cgroup is not cleared here. It will be accessed when it's
3090	 * freed from LRU. This is safe because uncharged page is expected not
3091	 * to be reused (freed soon). Exception is SwapCache, it's handled by
3092	 * special functions.
3093	 */
3094
3095	unlock_page_cgroup(pc);
3096	/*
3097	 * even after unlock, we have memcg->res.usage here and this memcg
3098	 * will never be freed.
3099	 */
3100	memcg_check_events(memcg, page);
3101	if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
3102		mem_cgroup_swap_statistics(memcg, true);
3103		mem_cgroup_get(memcg);
3104	}
3105	if (!mem_cgroup_is_root(memcg))
3106		mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
3107
3108	return memcg;
3109
3110unlock_out:
3111	unlock_page_cgroup(pc);
3112	return NULL;
3113}
3114
3115void mem_cgroup_uncharge_page(struct page *page)
3116{
3117	/* early check. */
3118	if (page_mapped(page))
3119		return;
3120	if (page->mapping && !PageAnon(page))
3121		return;
3122	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
3123}
3124
3125void mem_cgroup_uncharge_cache_page(struct page *page)
3126{
3127	VM_BUG_ON(page_mapped(page));
3128	VM_BUG_ON(page->mapping);
3129	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
3130}
3131
3132/*
3133 * Batch_start/batch_end is called in unmap_page_range/invalidate/truncate.
3134 * In those cases, pages are freed continuously and we can expect them to
3135 * be in the same memcg. All these callers themselves limit the number of
3136 * pages freed at once, so uncharge_start/end() is called properly.
3137 * This may be called several (e.g. 2) times in a context, i.e. it can nest.
3138 */
3139
3140void mem_cgroup_uncharge_start(void)
3141{
3142	current->memcg_batch.do_batch++;
3143	/* We can do nest. */
3144	if (current->memcg_batch.do_batch == 1) {
3145		current->memcg_batch.memcg = NULL;
3146		current->memcg_batch.nr_pages = 0;
3147		current->memcg_batch.memsw_nr_pages = 0;
3148	}
3149}
3150
3151void mem_cgroup_uncharge_end(void)
3152{
3153	struct memcg_batch_info *batch = &current->memcg_batch;
3154
3155	if (!batch->do_batch)
3156		return;
3157
3158	batch->do_batch--;
3159	if (batch->do_batch) /* If stacked, do nothing. */
3160		return;
3161
3162	if (!batch->memcg)
3163		return;
3164	/*
3165	 * This "batch->memcg" is valid without any css_get/put etc...
3166	 * because we hide charges behind us.
3167	 */
3168	if (batch->nr_pages)
3169		res_counter_uncharge(&batch->memcg->res,
3170				     batch->nr_pages * PAGE_SIZE);
3171	if (batch->memsw_nr_pages)
3172		res_counter_uncharge(&batch->memcg->memsw,
3173				     batch->memsw_nr_pages * PAGE_SIZE);
3174	memcg_oom_recover(batch->memcg);
3175	/* forget this pointer (for sanity check) */
3176	batch->memcg = NULL;
3177}
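/*
 * Batching usage sketch (this is what the unmap/truncate paths mentioned
 * above effectively do):
 *
 *	mem_cgroup_uncharge_start();
 *	for each page being freed:
 *		mem_cgroup_uncharge_page(page);	/- or _cache_page() -/
 *	mem_cgroup_uncharge_end();
 *
 * The individual uncharges only accumulate counts in current->memcg_batch;
 * in the common single-memcg case the res_counter is then touched only
 * once, in mem_cgroup_uncharge_end().
 */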
3178
3179#ifdef CONFIG_SWAP
3180/*
3181 * Called after __delete_from_swap_cache() to drop the "page" account.
3182 * The memcg information is recorded in the swap_cgroup of "ent".
3183 */
3184void
3185mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
3186{
3187	struct mem_cgroup *memcg;
3188	int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
3189
3190	if (!swapout) /* this was a swap cache but the swap is unused ! */
3191		ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
3192
3193	memcg = __mem_cgroup_uncharge_common(page, ctype);
3194
3195	/*
3196	 * record memcg information,  if swapout && memcg != NULL,
3197	 * mem_cgroup_get() was called in uncharge().
3198	 */
3199	if (do_swap_account && swapout && memcg)
3200		swap_cgroup_record(ent, css_id(&memcg->css));
3201}
3202#endif
3203
3204#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
3205/*
3206 * called from swap_entry_free(). remove record in swap_cgroup and
3207 * uncharge "memsw" account.
3208 */
3209void mem_cgroup_uncharge_swap(swp_entry_t ent)
3210{
3211	struct mem_cgroup *memcg;
3212	unsigned short id;
3213
3214	if (!do_swap_account)
3215		return;
3216
3217	id = swap_cgroup_record(ent, 0);
3218	rcu_read_lock();
3219	memcg = mem_cgroup_lookup(id);
3220	if (memcg) {
3221		/*
3222		 * We uncharge this because swap is freed.
3223		 * This memcg can be an obsolete one. We avoid calling css_tryget().
3224		 */
3225		if (!mem_cgroup_is_root(memcg))
3226			res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
3227		mem_cgroup_swap_statistics(memcg, false);
3228		mem_cgroup_put(memcg);
3229	}
3230	rcu_read_unlock();
3231}
3232
3233/**
3234 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
3235 * @entry: swap entry to be moved
3236 * @from:  mem_cgroup which the entry is moved from
3237 * @to:  mem_cgroup which the entry is moved to
3238 * @need_fixup: whether we should fixup res_counters and refcounts.
3239 *
3240 * It succeeds only when the swap_cgroup's record for this entry is the same
3241 * as the mem_cgroup's id of @from.
3242 *
3243 * Returns 0 on success, -EINVAL on failure.
3244 *
3245 * The caller must have charged to @to, IOW, called res_counter_charge() about
3246 * both res and memsw, and called css_get().
3247 */
3248static int mem_cgroup_move_swap_account(swp_entry_t entry,
3249		struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
3250{
3251	unsigned short old_id, new_id;
3252
3253	old_id = css_id(&from->css);
3254	new_id = css_id(&to->css);
3255
3256	if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
3257		mem_cgroup_swap_statistics(from, false);
3258		mem_cgroup_swap_statistics(to, true);
3259		/*
3260		 * This function is only called from task migration context now.
3261		 * It postpones res_counter and refcount handling till the end
3262		 * of task migration(mem_cgroup_clear_mc()) for performance
3263		 * improvement. But we cannot postpone mem_cgroup_get(to)
3264		 * because if the process that has been moved to @to does
3265		 * swap-in, the refcount of @to might be decreased to 0.
3266		 */
3267		mem_cgroup_get(to);
3268		if (need_fixup) {
3269			if (!mem_cgroup_is_root(from))
3270				res_counter_uncharge(&from->memsw, PAGE_SIZE);
3271			mem_cgroup_put(from);
3272			/*
3273			 * we charged both to->res and to->memsw, so we should
3274			 * uncharge to->res.
3275			 */
3276			if (!mem_cgroup_is_root(to))
3277				res_counter_uncharge(&to->res, PAGE_SIZE);
3278		}
3279		return 0;
3280	}
3281	return -EINVAL;
3282}
3283#else
3284static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
3285		struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
3286{
3287	return -EINVAL;
3288}
3289#endif
3290
3291/*
3292 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
3293 * page belongs to.
3294 */
3295int mem_cgroup_prepare_migration(struct page *page,
3296	struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask)
3297{
3298	struct mem_cgroup *memcg = NULL;
3299	struct page_cgroup *pc;
3300	enum charge_type ctype;
3301	int ret = 0;
3302
3303	*ptr = NULL;
3304
3305	VM_BUG_ON(PageTransHuge(page));
3306	if (mem_cgroup_disabled())
3307		return 0;
3308
3309	pc = lookup_page_cgroup(page);
3310	lock_page_cgroup(pc);
3311	if (PageCgroupUsed(pc)) {
3312		memcg = pc->mem_cgroup;
3313		css_get(&memcg->css);
3314		/*
3315		 * At migrating an anonymous page, its mapcount goes down
3316		 * to 0 and uncharge() will be called. But, even if it's fully
3317		 * unmapped, migration may fail and this page has to be
3318		 * charged again. We set MIGRATION flag here and delay uncharge
3319		 * until end_migration() is called
3320		 *
3321		 * Corner Case Thinking
3322		 * A)
3323		 * When the old page was mapped as Anon and it's unmap-and-freed
3324		 * while migration was ongoing.
3325		 * If unmap finds the old page, uncharge() of it will be delayed
3326		 * until end_migration(). If unmap finds a new page, it's
3327		 * uncharged when it make mapcount to be 1->0. If unmap code
3328		 * uncharged when it makes the mapcount go from 1 to 0. If the unmap
3329		 * code finds a swap_migration_entry, the new page will not be mapped
3330		 * and end_migration() will find it (mapcount == 0).
3331		 * B)
3332		 * When the old page was mapped but migration fails, the kernel
3333		 * remaps it. A charge for it is kept by the MIGRATION flag even
3334		 * if the mapcount goes down to 0. We can remap it successfully
3335		 * without charging it again.
3336		 *
3337		 * C)
3338		 * The "old" page is under lock_page() until the end of
3339		 * migration, so, the old page itself will not be swapped-out.
3340		 * If the new page is swapped out before end_migration, our
3341		 * hook to usual swap-out path will catch the event.
3342		 */
3343		if (PageAnon(page))
3344			SetPageCgroupMigration(pc);
3345	}
3346	unlock_page_cgroup(pc);
3347	/*
3348	 * If the page is not charged at this point,
3349	 * we return here.
3350	 */
3351	if (!memcg)
3352		return 0;
3353
3354	*ptr = memcg;
3355	ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false);
3356	css_put(&memcg->css);/* drop extra refcnt */
3357	if (ret || *ptr == NULL) {
3358		if (PageAnon(page)) {
3359			lock_page_cgroup(pc);
3360			ClearPageCgroupMigration(pc);
3361			unlock_page_cgroup(pc);
3362			/*
3363			 * The old page may be fully unmapped while we kept it.
3364			 */
3365			mem_cgroup_uncharge_page(page);
3366		}
3367		return -ENOMEM;
3368	}
3369	/*
3370	 * We charge new page before it's used/mapped. So, even if unlock_page()
3371	 * is called before end_migration, we can catch all events on this new
3372	 * page. In the case new page is migrated but not remapped, new page's
3373	 * mapcount will be finally 0 and we call uncharge in end_migration().
3374	 */
3375	pc = lookup_page_cgroup(newpage);
3376	if (PageAnon(page))
3377		ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
3378	else if (page_is_file_cache(page))
3379		ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
3380	else
3381		ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
3382	__mem_cgroup_commit_charge(memcg, page, 1, pc, ctype);
3383	return ret;
3384}
3385
3386/* remove redundant charge if migration failed*/
3387void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3388	struct page *oldpage, struct page *newpage, bool migration_ok)
3389{
3390	struct page *used, *unused;
3391	struct page_cgroup *pc;
3392
3393	if (!memcg)
3394		return;
3395	/* blocks rmdir() */
3396	cgroup_exclude_rmdir(&memcg->css);
3397	if (!migration_ok) {
3398		used = oldpage;
3399		unused = newpage;
3400	} else {
3401		used = newpage;
3402		unused = oldpage;
3403	}
3404	/*
3405	 * We disallowed uncharge of pages under migration because mapcount
3406	 * of the page goes down to zero, temporarily.
3407	 * Clear the flag and check whether the page should still be charged.
3408	 */
3409	pc = lookup_page_cgroup(oldpage);
3410	lock_page_cgroup(pc);
3411	ClearPageCgroupMigration(pc);
3412	unlock_page_cgroup(pc);
3413
3414	__mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE);
3415
3416	/*
3417	 * If a page is file cache, the radix-tree replacement is atomic
3418	 * and we can skip this check. When it was an Anon page, its mapcount
3419	 * goes down to 0. But because we added the MIGRATION flag, it's not
3420	 * uncharged yet. There are several cases, but the page->mapcount check
3421	 * and the USED bit check in mem_cgroup_uncharge_page() are enough.
3422	 * (see prepare_charge() also)
3423	 */
3424	if (PageAnon(used))
3425		mem_cgroup_uncharge_page(used);
3426	/*
3427	 * At migration, we may charge against a cgroup which has no
3428	 * tasks.
3429	 * So, rmdir()->pre_destroy() can be called while we do this charge.
3430	 * In that case, we need to call pre_destroy() again. check it here.
3431	 */
3432	cgroup_release_and_wakeup_rmdir(&memcg->css);
3433}
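/*
 * Migration thus brackets the page with a prepare/end pair (a sketch of
 * what the migration core is expected to do around the actual copy):
 *
 *	ret = mem_cgroup_prepare_migration(page, newpage, &memcg, gfp_mask);
 *	...				/- migrate the page contents -/
 *	mem_cgroup_end_migration(memcg, page, newpage, migration_ok);
 *
 * prepare() pre-charges the new page and marks the old one with the
 * MIGRATION flag; end() uncharges whichever of the two ended up unused.
 */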
3434
3435/*
3436 * At replace page cache, newpage is not under any memcg but it's on
3437 * LRU. So, this function doesn't touch res_counter but handles LRU
3438 * in correct way. Both pages are locked so we cannot race with uncharge.
3439 */
3440void mem_cgroup_replace_page_cache(struct page *oldpage,
3441				  struct page *newpage)
3442{
3443	struct mem_cgroup *memcg;
3444	struct page_cgroup *pc;
3445	struct zone *zone;
3446	enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
3447	unsigned long flags;
3448
3449	if (mem_cgroup_disabled())
3450		return;
3451
3452	pc = lookup_page_cgroup(oldpage);
3453	/* fix accounting on old pages */
3454	lock_page_cgroup(pc);
3455	memcg = pc->mem_cgroup;
3456	mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -1);
3457	ClearPageCgroupUsed(pc);
3458	unlock_page_cgroup(pc);
3459
3460	if (PageSwapBacked(oldpage))
3461		type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
3462
3463	zone = page_zone(newpage);
3464	pc = lookup_page_cgroup(newpage);
3465	/*
3466	 * Even if newpage->mapping was NULL before starting replacement,
3467	 * the newpage may be on LRU(or pagevec for LRU) already. We lock
3468	 * LRU while we overwrite pc->mem_cgroup.
3469	 */
3470	spin_lock_irqsave(&zone->lru_lock, flags);
3471	if (PageLRU(newpage))
3472		del_page_from_lru_list(zone, newpage, page_lru(newpage));
3473	__mem_cgroup_commit_charge(memcg, newpage, 1, pc, type);
3474	if (PageLRU(newpage))
3475		add_page_to_lru_list(zone, newpage, page_lru(newpage));
3476	spin_unlock_irqrestore(&zone->lru_lock, flags);
3477}
3478
3479#ifdef CONFIG_DEBUG_VM
3480static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
3481{
3482	struct page_cgroup *pc;
3483
3484	pc = lookup_page_cgroup(page);
3485	if (likely(pc) && PageCgroupUsed(pc))
3486		return pc;
3487	return NULL;
3488}
3489
3490bool mem_cgroup_bad_page_check(struct page *page)
3491{
3492	if (mem_cgroup_disabled())
3493		return false;
3494
3495	return lookup_page_cgroup_used(page) != NULL;
3496}
3497
3498void mem_cgroup_print_bad_page(struct page *page)
3499{
3500	struct page_cgroup *pc;
3501
3502	pc = lookup_page_cgroup_used(page);
3503	if (pc) {
3504		int ret = -1;
3505		char *path;
3506
3507		printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p",
3508		       pc, pc->flags, pc->mem_cgroup);
3509
3510		path = kmalloc(PATH_MAX, GFP_KERNEL);
3511		if (path) {
3512			rcu_read_lock();
3513			ret = cgroup_path(pc->mem_cgroup->css.cgroup,
3514							path, PATH_MAX);
3515			rcu_read_unlock();
3516		}
3517
3518		printk(KERN_CONT "(%s)\n",
3519				(ret < 0) ? "cannot get the path" : path);
3520		kfree(path);
3521	}
3522}
3523#endif
3524
3525static DEFINE_MUTEX(set_limit_mutex);
3526
3527static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3528				unsigned long long val)
3529{
3530	int retry_count;
3531	u64 memswlimit, memlimit;
3532	int ret = 0;
3533	int children = mem_cgroup_count_children(memcg);
3534	u64 curusage, oldusage;
3535	int enlarge;
3536
3537	/*
3538	 * To keep hierarchical_reclaim simple, how long we should retry
3539	 * depends on the caller. We set our retry count to be a function
3540	 * of the number of children we have to visit in this loop.
3541	 */
3542	retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;
3543
3544	oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
3545
3546	enlarge = 0;
3547	while (retry_count) {
3548		if (signal_pending(current)) {
3549			ret = -EINTR;
3550			break;
3551		}
3552		/*
3553		 * Rather than hiding all of this in some function, it is done
3554		 * open coded so you can see what it really does.
3555		 * We have to guarantee memcg->res.limit < memcg->memsw.limit.
3556		 */
3557		mutex_lock(&set_limit_mutex);
3558		memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3559		if (memswlimit < val) {
3560			ret = -EINVAL;
3561			mutex_unlock(&set_limit_mutex);
3562			break;
3563		}
3564
3565		memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
3566		if (memlimit < val)
3567			enlarge = 1;
3568
3569		ret = res_counter_set_limit(&memcg->res, val);
3570		if (!ret) {
3571			if (memswlimit == val)
3572				memcg->memsw_is_minimum = true;
3573			else
3574				memcg->memsw_is_minimum = false;
3575		}
3576		mutex_unlock(&set_limit_mutex);
3577
3578		if (!ret)
3579			break;
3580
3581		mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
3582						MEM_CGROUP_RECLAIM_SHRINK,
3583						NULL);
3584		curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
3585		/* Usage is reduced ? */
3586  		if (curusage >= oldusage)
3587			retry_count--;
3588		else
3589			oldusage = curusage;
3590	}
3591	if (!ret && enlarge)
3592		memcg_oom_recover(memcg);
3593
3594	return ret;
3595}
3596
3597static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
3598					unsigned long long val)
3599{
3600	int retry_count;
3601	u64 memlimit, memswlimit, oldusage, curusage;
3602	int children = mem_cgroup_count_children(memcg);
3603	int ret = -EBUSY;
3604	int enlarge = 0;
3605
3606	/* see mem_cgroup_resize_res_limit */
3607 	retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
3608	oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
3609	while (retry_count) {
3610		if (signal_pending(current)) {
3611			ret = -EINTR;
3612			break;
3613		}
3614		/*
3615		 * Rather than hiding all of this in some function, it is done
3616		 * open coded so you can see what it really does.
3617		 * We have to guarantee memcg->res.limit < memcg->memsw.limit.
3618		 */
3619		mutex_lock(&set_limit_mutex);
3620		memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
3621		if (memlimit > val) {
3622			ret = -EINVAL;
3623			mutex_unlock(&set_limit_mutex);
3624			break;
3625		}
3626		memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3627		if (memswlimit < val)
3628			enlarge = 1;
3629		ret = res_counter_set_limit(&memcg->memsw, val);
3630		if (!ret) {
3631			if (memlimit == val)
3632				memcg->memsw_is_minimum = true;
3633			else
3634				memcg->memsw_is_minimum = false;
3635		}
3636		mutex_unlock(&set_limit_mutex);
3637
3638		if (!ret)
3639			break;
3640
3641		mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
3642						MEM_CGROUP_RECLAIM_NOSWAP |
3643						MEM_CGROUP_RECLAIM_SHRINK,
3644						NULL);
3645		curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
3646		/* Usage is reduced ? */
3647		if (curusage >= oldusage)
3648			retry_count--;
3649		else
3650			oldusage = curusage;
3651	}
3652	if (!ret && enlarge)
3653		memcg_oom_recover(memcg);
3654	return ret;
3655}
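/*
 * Note on the ordering implied by the -EINVAL checks in the two resize
 * functions above: because the memory limit must stay at or below the
 * memsw limit, raising both limits has to raise memsw first and then
 * memory, while lowering both has to lower memory first and then memsw.
 * (A sketch of the userspace view, assuming the standard cgroup control
 * files memory.limit_in_bytes and memory.memsw.limit_in_bytes.)
 */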
3656
3657unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3658					    gfp_t gfp_mask,
3659					    unsigned long *total_scanned)
3660{
3661	unsigned long nr_reclaimed = 0;
3662	struct mem_cgroup_per_zone *mz, *next_mz = NULL;
3663	unsigned long reclaimed;
3664	int loop = 0;
3665	struct mem_cgroup_tree_per_zone *mctz;
3666	unsigned long long excess;
3667	unsigned long nr_scanned;
3668
3669	if (order > 0)
3670		return 0;
3671
3672	mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
3673	/*
3674	 * This loop can run for a while, especially if mem_cgroups continuously
3675	 * keep exceeding their soft limit and putting the system under
3676	 * pressure
3677	 */
3678	do {
3679		if (next_mz)
3680			mz = next_mz;
3681		else
3682			mz = mem_cgroup_largest_soft_limit_node(mctz);
3683		if (!mz)
3684			break;
3685
3686		nr_scanned = 0;
3687		reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone,
3688						gfp_mask,
3689						MEM_CGROUP_RECLAIM_SOFT,
3690						&nr_scanned);
3691		nr_reclaimed += reclaimed;
3692		*total_scanned += nr_scanned;
3693		spin_lock(&mctz->lock);
3694
3695		/*
3696		 * If we failed to reclaim anything from this memory cgroup
3697		 * it is time to move on to the next cgroup
3698		 */
3699		next_mz = NULL;
3700		if (!reclaimed) {
3701			do {
3702				/*
3703				 * Loop until we find yet another one.
3704				 *
3705				 * By the time we get the soft_limit lock
3706				 * again, someone might have added the
3707				 * group back on the RB tree. Iterate to
3708				 * make sure we get a different mem.
3709				 * mem_cgroup_largest_soft_limit_node returns
3710				 * NULL if no other cgroup is present on
3711				 * the tree
3712				 */
3713				next_mz =
3714				__mem_cgroup_largest_soft_limit_node(mctz);
3715				if (next_mz == mz)
3716					css_put(&next_mz->mem->css);
3717				else /* next_mz == NULL or other memcg */
3718					break;
3719			} while (1);
3720		}
3721		__mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
3722		excess = res_counter_soft_limit_excess(&mz->mem->res);
3723		/*
3724		 * One school of thought says that we should not add
3725		 * back the node to the tree if reclaim returns 0.
3726		 * But our reclaim could return 0 simply because, due
3727		 * to priority, we are exposing a smaller subset of
3728		 * memory to reclaim from. Consider this as a longer
3729		 * term TODO.
3730		 */
3731		/* If excess == 0, no tree ops */
3732		__mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess);
3733		spin_unlock(&mctz->lock);
3734		css_put(&mz->mem->css);
3735		loop++;
3736		/*
3737		 * Could not reclaim anything and there are no more
3738		 * mem cgroups to try or we seem to be looping without
3739		 * reclaiming anything.
3740		 */
3741		if (!nr_reclaimed &&
3742			(next_mz == NULL ||
3743			loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
3744			break;
3745	} while (!nr_reclaimed);
3746	if (next_mz)
3747		css_put(&next_mz->mem->css);
3748	return nr_reclaimed;
3749}
3750
3751/*
3752 * This routine traverses the page_cgroups on the given list and drops them all.
3753 * Note that it doesn't reclaim the pages themselves; it only removes the page_cgroups.
3754 */
3755static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3756				int node, int zid, enum lru_list lru)
3757{
3758	struct zone *zone;
3759	struct mem_cgroup_per_zone *mz;
3760	struct page_cgroup *pc, *busy;
3761	unsigned long flags, loop;
3762	struct list_head *list;
3763	int ret = 0;
3764
3765	zone = &NODE_DATA(node)->node_zones[zid];
3766	mz = mem_cgroup_zoneinfo(memcg, node, zid);
3767	list = &mz->lists[lru];
3768
3769	loop = MEM_CGROUP_ZSTAT(mz, lru);
3770	/* give some margin against -EBUSY etc... */
3771	loop += 256;
3772	busy = NULL;
3773	while (loop--) {
3774		struct page *page;
3775
3776		ret = 0;
3777		spin_lock_irqsave(&zone->lru_lock, flags);
3778		if (list_empty(list)) {
3779			spin_unlock_irqrestore(&zone->lru_lock, flags);
3780			break;
3781		}
3782		pc = list_entry(list->prev, struct page_cgroup, lru);
3783		if (busy == pc) {
3784			list_move(&pc->lru, list);
3785			busy = NULL;
3786			spin_unlock_irqrestore(&zone->lru_lock, flags);
3787			continue;
3788		}
3789		spin_unlock_irqrestore(&zone->lru_lock, flags);
3790
3791		page = lookup_cgroup_page(pc);
3792
3793		ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL);
3794		if (ret == -ENOMEM)
3795			break;
3796
3797		if (ret == -EBUSY || ret == -EINVAL) {
3798			/* found lock contention or "pc" is obsolete. */
3799			busy = pc;
3800			cond_resched();
3801		} else
3802			busy = NULL;
3803	}
3804
3805	if (!ret && !list_empty(list))
3806		return -EBUSY;
3807	return ret;
3808}
3809
3810/*
3811 * Make the mem_cgroup's charge zero if there are no tasks in it.
3812 * This makes it possible to delete the mem_cgroup.
3813 */
3814static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all)
3815{
3816	int ret;
3817	int node, zid, shrink;
3818	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
3819	struct cgroup *cgrp = memcg->css.cgroup;
3820
3821	css_get(&memcg->css);
3822
3823	shrink = 0;
3824	/* should free all ? */
3825	if (free_all)
3826		goto try_to_free;
3827move_account:
3828	do {
3829		ret = -EBUSY;
3830		if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
3831			goto out;
3832		ret = -EINTR;
3833		if (signal_pending(current))
3834			goto out;
3835		/* This is for making sure all *used* pages are on an LRU. */
3836		lru_add_drain_all();
3837		drain_all_stock_sync(memcg);
3838		ret = 0;
3839		mem_cgroup_start_move(memcg);
3840		for_each_node_state(node, N_HIGH_MEMORY) {
3841			for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
3842				enum lru_list l;
3843				for_each_lru(l) {
3844					ret = mem_cgroup_force_empty_list(memcg,
3845							node, zid, l);
3846					if (ret)
3847						break;
3848				}
3849			}
3850			if (ret)
3851				break;
3852		}
3853		mem_cgroup_end_move(memcg);
3854		memcg_oom_recover(memcg);
3855		/* the parent cgroup doesn't seem to have enough memory */
3856		if (ret == -ENOMEM)
3857			goto try_to_free;
3858		cond_resched();
3859	/* "ret" should also be checked to ensure all lists are empty. */
3860	} while (memcg->res.usage > 0 || ret);
3861out:
3862	css_put(&memcg->css);
3863	return ret;
3864
3865try_to_free:
3866	/* return -EBUSY if there is a task or if we come here twice. */
3867	if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
3868		ret = -EBUSY;
3869		goto out;
3870	}
3871	/* we call try_to_free_mem_cgroup_pages() to make this cgroup empty */
3872	lru_add_drain_all();
3873	/* try to free all pages in this cgroup */
3874	shrink = 1;
3875	while (nr_retries && memcg->res.usage > 0) {
3876		int progress;
3877
3878		if (signal_pending(current)) {
3879			ret = -EINTR;
3880			goto out;
3881		}
3882		progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
3883						false);
3884		if (!progress) {
3885			nr_retries--;
3886			/* maybe some writeback is necessary */
3887			congestion_wait(BLK_RW_ASYNC, HZ/10);
3888		}
3889
3890	}
3891	lru_add_drain();
3892	/* try move_account...there may be some *locked* pages. */
3893	goto move_account;
3894}
3895
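/*
 * Handler for the memory.force_empty control file. Any write to the
 * file (e.g. "echo 0 > memory.force_empty" from userspace) asks the
 * group to drop all of its charges: pages are reclaimed where possible
 * and the remaining accounting is moved to the parent.
 */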
3896int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
3897{
3898	return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
3899}
3900
3901
3902static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)
3903{
3904	return mem_cgroup_from_cont(cont)->use_hierarchy;
3905}
3906
3907static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
3908					u64 val)
3909{
3910	int retval = 0;
3911	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3912	struct cgroup *parent = cont->parent;
3913	struct mem_cgroup *parent_memcg = NULL;
3914
3915	if (parent)
3916		parent_memcg = mem_cgroup_from_cont(parent);
3917
3918	cgroup_lock();
3919	/*
3920	 * If parent's use_hierarchy is set, we can't make any modifications
3921	 * in the child subtrees. If it is unset, then the change can
3922	 * occur, provided the current cgroup has no children.
3923	 *
3924	 * For the root cgroup, parent_memcg is NULL; we allow the value to
3925	 * be set if there are no children.
3926	 */
3927	if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
3928				(val == 1 || val == 0)) {
3929		if (list_empty(&cont->children))
3930			memcg->use_hierarchy = val;
3931		else
3932			retval = -EBUSY;
3933	} else
3934		retval = -EINVAL;
3935	cgroup_unlock();
3936
3937	return retval;
3938}
3939
3940
3941static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
3942					       enum mem_cgroup_stat_index idx)
3943{
3944	struct mem_cgroup *iter;
3945	long val = 0;
3946
3947	/* Per-cpu values can be negative, use a signed accumulator */
3948	for_each_mem_cgroup_tree(iter, memcg)
3949		val += mem_cgroup_read_stat(iter, idx);
3950
3951	if (val < 0) /* race ? */
3952		val = 0;
3953	return val;
3954}
3955
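/*
 * Current usage in bytes. For non-root groups this is just the
 * res_counter value; for the root group the value is reconstructed
 * from the hierarchical cache + rss (+ swapout) statistics instead.
 */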
3956static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
3957{
3958	u64 val;
3959
3960	if (!mem_cgroup_is_root(memcg)) {
3961		if (!swap)
3962			return res_counter_read_u64(&memcg->res, RES_USAGE);
3963		else
3964			return res_counter_read_u64(&memcg->memsw, RES_USAGE);
3965	}
3966
3967	val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
3968	val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
3969
3970	if (swap)
3971		val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAPOUT);
3972
3973	return val << PAGE_SHIFT;
3974}
3975
3976static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
3977{
3978	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3979	u64 val;
3980	int type, name;
3981
3982	type = MEMFILE_TYPE(cft->private);
3983	name = MEMFILE_ATTR(cft->private);
3984	switch (type) {
3985	case _MEM:
3986		if (name == RES_USAGE)
3987			val = mem_cgroup_usage(memcg, false);
3988		else
3989			val = res_counter_read_u64(&memcg->res, name);
3990		break;
3991	case _MEMSWAP:
3992		if (name == RES_USAGE)
3993			val = mem_cgroup_usage(memcg, true);
3994		else
3995			val = res_counter_read_u64(&memcg->memsw, name);
3996		break;
3997	default:
3998		BUG();
3999		break;
4000	}
4001	return val;
4002}
4003/*
4004 * The user of this function is...
4005 * RES_LIMIT.
4006 */
4007static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
4008			    const char *buffer)
4009{
4010	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4011	int type, name;
4012	unsigned long long val;
4013	int ret;
4014
4015	type = MEMFILE_TYPE(cft->private);
4016	name = MEMFILE_ATTR(cft->private);
4017	switch (name) {
4018	case RES_LIMIT:
4019		if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
4020			ret = -EINVAL;
4021			break;
4022		}
4023		/* This function does all the necessary parsing... reuse it */
4024		ret = res_counter_memparse_write_strategy(buffer, &val);
4025		if (ret)
4026			break;
4027		if (type == _MEM)
4028			ret = mem_cgroup_resize_limit(memcg, val);
4029		else
4030			ret = mem_cgroup_resize_memsw_limit(memcg, val);
4031		break;
4032	case RES_SOFT_LIMIT:
4033		ret = res_counter_memparse_write_strategy(buffer, &val);
4034		if (ret)
4035			break;
4036		/*
4037		 * For memsw, soft limits are hard to implement in terms
4038		 * of semantics; for now, we support soft limits only for
4039		 * memory control without swap.
4040		 */
4041		if (type == _MEM)
4042			ret = res_counter_set_soft_limit(&memcg->res, val);
4043		else
4044			ret = -EINVAL;
4045		break;
4046	default:
4047		ret = -EINVAL; /* should be BUG() ? */
4048		break;
4049	}
4050	return ret;
4051}
4052
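/*
 * Compute the effective limits for this group: walk up the chain of
 * ancestors while use_hierarchy is set and take the minimum of their
 * memory and mem+swap limits.
 */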
4053static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
4054		unsigned long long *mem_limit, unsigned long long *memsw_limit)
4055{
4056	struct cgroup *cgroup;
4057	unsigned long long min_limit, min_memsw_limit, tmp;
4058
4059	min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
4060	min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
4061	cgroup = memcg->css.cgroup;
4062	if (!memcg->use_hierarchy)
4063		goto out;
4064
4065	while (cgroup->parent) {
4066		cgroup = cgroup->parent;
4067		memcg = mem_cgroup_from_cont(cgroup);
4068		if (!memcg->use_hierarchy)
4069			break;
4070		tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
4071		min_limit = min(min_limit, tmp);
4072		tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
4073		min_memsw_limit = min(min_memsw_limit, tmp);
4074	}
4075out:
4076	*mem_limit = min_limit;
4077	*memsw_limit = min_memsw_limit;
4078	return;
4079}
4080
4081static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
4082{
4083	struct mem_cgroup *memcg;
4084	int type, name;
4085
4086	memcg = mem_cgroup_from_cont(cont);
4087	type = MEMFILE_TYPE(event);
4088	name = MEMFILE_ATTR(event);
4089	switch (name) {
4090	case RES_MAX_USAGE:
4091		if (type == _MEM)
4092			res_counter_reset_max(&memcg->res);
4093		else
4094			res_counter_reset_max(&memcg->memsw);
4095		break;
4096	case RES_FAILCNT:
4097		if (type == _MEM)
4098			res_counter_reset_failcnt(&memcg->res);
4099		else
4100			res_counter_reset_failcnt(&memcg->memsw);
4101		break;
4102	}
4103
4104	return 0;
4105}
4106
4107static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp,
4108					struct cftype *cft)
4109{
4110	return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate;
4111}
4112
4113#ifdef CONFIG_MMU
4114static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
4115					struct cftype *cft, u64 val)
4116{
4117	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4118
4119	if (val >= (1 << NR_MOVE_TYPE))
4120		return -EINVAL;
4121	/*
4122	 * We check this value several times in both can_attach() and
4123	 * attach(), so we need the cgroup lock to prevent this value from being
4124	 * inconsistent.
4125	 */
4126	cgroup_lock();
4127	memcg->move_charge_at_immigrate = val;
4128	cgroup_unlock();
4129
4130	return 0;
4131}
4132#else
4133static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
4134					struct cftype *cft, u64 val)
4135{
4136	return -ENOSYS;
4137}
4138#endif
4139
4140
4141/* For read statistics */
4142enum {
4143	MCS_CACHE,
4144	MCS_RSS,
4145	MCS_FILE_MAPPED,
4146	MCS_PGPGIN,
4147	MCS_PGPGOUT,
4148	MCS_SWAP,
4149	MCS_PGFAULT,
4150	MCS_PGMAJFAULT,
4151	MCS_INACTIVE_ANON,
4152	MCS_ACTIVE_ANON,
4153	MCS_INACTIVE_FILE,
4154	MCS_ACTIVE_FILE,
4155	MCS_UNEVICTABLE,
4156	NR_MCS_STAT,
4157};
4158
4159struct mcs_total_stat {
4160	s64 stat[NR_MCS_STAT];
4161};
4162
4163struct {
4164	char *local_name;
4165	char *total_name;
4166} memcg_stat_strings[NR_MCS_STAT] = {
4167	{"cache", "total_cache"},
4168	{"rss", "total_rss"},
4169	{"mapped_file", "total_mapped_file"},
4170	{"pgpgin", "total_pgpgin"},
4171	{"pgpgout", "total_pgpgout"},
4172	{"swap", "total_swap"},
4173	{"pgfault", "total_pgfault"},
4174	{"pgmajfault", "total_pgmajfault"},
4175	{"inactive_anon", "total_inactive_anon"},
4176	{"active_anon", "total_active_anon"},
4177	{"inactive_file", "total_inactive_file"},
4178	{"active_file", "total_active_file"},
4179	{"unevictable", "total_unevictable"}
4180};
4181
4182
4183static void
4184mem_cgroup_get_local_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s)
4185{
4186	s64 val;
4187
4188	/* per cpu stat */
4189	val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_CACHE);
4190	s->stat[MCS_CACHE] += val * PAGE_SIZE;
4191	val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_RSS);
4192	s->stat[MCS_RSS] += val * PAGE_SIZE;
4193	val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
4194	s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
4195	val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGIN);
4196	s->stat[MCS_PGPGIN] += val;
4197	val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGOUT);
4198	s->stat[MCS_PGPGOUT] += val;
4199	if (do_swap_account) {
4200		val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_SWAPOUT);
4201		s->stat[MCS_SWAP] += val * PAGE_SIZE;
4202	}
4203	val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGFAULT);
4204	s->stat[MCS_PGFAULT] += val;
4205	val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGMAJFAULT);
4206	s->stat[MCS_PGMAJFAULT] += val;
4207
4208	/* per zone stat */
4209	val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON));
4210	s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE;
4211	val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON));
4212	s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE;
4213	val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE));
4214	s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE;
4215	val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE));
4216	s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
4217	val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
4218	s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
4219}
4220
4221static void
4222mem_cgroup_get_total_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s)
4223{
4224	struct mem_cgroup *iter;
4225
4226	for_each_mem_cgroup_tree(iter, memcg)
4227		mem_cgroup_get_local_stat(iter, s);
4228}
4229
4230#ifdef CONFIG_NUMA
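/*
 * memory.numa_stat: per-node LRU breakdown. The output consists of
 * four lines of the form
 *
 *	total=<pages> N0=<pages> N1=<pages> ...
 *	file=<pages> N0=<pages> ...
 *	anon=<pages> N0=<pages> ...
 *	unevictable=<pages> N0=<pages> ...
 *
 * with one N<nid>= field per node in N_HIGH_MEMORY; all values are
 * page counts.
 */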
4231static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
4232{
4233	int nid;
4234	unsigned long total_nr, file_nr, anon_nr, unevictable_nr;
4235	unsigned long node_nr;
4236	struct cgroup *cont = m->private;
4237	struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
4238
4239	total_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL);
4240	seq_printf(m, "total=%lu", total_nr);
4241	for_each_node_state(nid, N_HIGH_MEMORY) {
4242		node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, LRU_ALL);
4243		seq_printf(m, " N%d=%lu", nid, node_nr);
4244	}
4245	seq_putc(m, '\n');
4246
4247	file_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_FILE);
4248	seq_printf(m, "file=%lu", file_nr);
4249	for_each_node_state(nid, N_HIGH_MEMORY) {
4250		node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
4251				LRU_ALL_FILE);
4252		seq_printf(m, " N%d=%lu", nid, node_nr);
4253	}
4254	seq_putc(m, '\n');
4255
4256	anon_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_ANON);
4257	seq_printf(m, "anon=%lu", anon_nr);
4258	for_each_node_state(nid, N_HIGH_MEMORY) {
4259		node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
4260				LRU_ALL_ANON);
4261		seq_printf(m, " N%d=%lu", nid, node_nr);
4262	}
4263	seq_putc(m, '\n');
4264
4265	unevictable_nr = mem_cgroup_nr_lru_pages(mem_cont, BIT(LRU_UNEVICTABLE));
4266	seq_printf(m, "unevictable=%lu", unevictable_nr);
4267	for_each_node_state(nid, N_HIGH_MEMORY) {
4268		node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
4269				BIT(LRU_UNEVICTABLE));
4270		seq_printf(m, " N%d=%lu", nid, node_nr);
4271	}
4272	seq_putc(m, '\n');
4273	return 0;
4274}
4275#endif /* CONFIG_NUMA */
4276
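/*
 * memory.stat: print the local counters first ("cache", "rss", ...),
 * then the hierarchical limits and the "total_*" counters summed over
 * the whole subtree, and finally some reclaim statistics when
 * CONFIG_DEBUG_VM is set. Byte-sized counters are reported in bytes,
 * event counters (pgpgin, pgfault, ...) as raw counts.
 */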
4277static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
4278				 struct cgroup_map_cb *cb)
4279{
4280	struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
4281	struct mcs_total_stat mystat;
4282	int i;
4283
4284	memset(&mystat, 0, sizeof(mystat));
4285	mem_cgroup_get_local_stat(mem_cont, &mystat);
4286
4287
4288	for (i = 0; i < NR_MCS_STAT; i++) {
4289		if (i == MCS_SWAP && !do_swap_account)
4290			continue;
4291		cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]);
4292	}
4293
4294	/* Hierarchical information */
4295	{
4296		unsigned long long limit, memsw_limit;
4297		memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit);
4298		cb->fill(cb, "hierarchical_memory_limit", limit);
4299		if (do_swap_account)
4300			cb->fill(cb, "hierarchical_memsw_limit", memsw_limit);
4301	}
4302
4303	memset(&mystat, 0, sizeof(mystat));
4304	mem_cgroup_get_total_stat(mem_cont, &mystat);
4305	for (i = 0; i < NR_MCS_STAT; i++) {
4306		if (i == MCS_SWAP && !do_swap_account)
4307			continue;
4308		cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]);
4309	}
4310
4311#ifdef CONFIG_DEBUG_VM
4312	{
4313		int nid, zid;
4314		struct mem_cgroup_per_zone *mz;
4315		unsigned long recent_rotated[2] = {0, 0};
4316		unsigned long recent_scanned[2] = {0, 0};
4317
4318		for_each_online_node(nid)
4319			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
4320				mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
4321
4322				recent_rotated[0] +=
4323					mz->reclaim_stat.recent_rotated[0];
4324				recent_rotated[1] +=
4325					mz->reclaim_stat.recent_rotated[1];
4326				recent_scanned[0] +=
4327					mz->reclaim_stat.recent_scanned[0];
4328				recent_scanned[1] +=
4329					mz->reclaim_stat.recent_scanned[1];
4330			}
4331		cb->fill(cb, "recent_rotated_anon", recent_rotated[0]);
4332		cb->fill(cb, "recent_rotated_file", recent_rotated[1]);
4333		cb->fill(cb, "recent_scanned_anon", recent_scanned[0]);
4334		cb->fill(cb, "recent_scanned_file", recent_scanned[1]);
4335	}
4336#endif
4337
4338	return 0;
4339}
4340
4341static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
4342{
4343	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4344
4345	return mem_cgroup_swappiness(memcg);
4346}
4347
4348static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
4349				       u64 val)
4350{
4351	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4352	struct mem_cgroup *parent;
4353
4354	if (val > 100)
4355		return -EINVAL;
4356
4357	if (cgrp->parent == NULL)
4358		return -EINVAL;
4359
4360	parent = mem_cgroup_from_cont(cgrp->parent);
4361
4362	cgroup_lock();
4363
4364	/* If under hierarchy, only empty-root can set this value */
4365	if ((parent->use_hierarchy) ||
4366	    (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
4367		cgroup_unlock();
4368		return -EINVAL;
4369	}
4370
4371	memcg->swappiness = val;
4372
4373	cgroup_unlock();
4374
4375	return 0;
4376}
4377
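/*
 * Walk the sorted threshold array around current_threshold and signal
 * every eventfd whose threshold was crossed since the previous call.
 * A sketch with arbitrarily chosen sizes: given thresholds {4M, 8M, 16M}
 * and a previous usage of 10M, current_threshold points at 8M. If usage
 * grows to 20M, the forward loop signals the 16M eventfd and
 * current_threshold moves to 16M; if usage instead drops to 3M, the
 * backward loop signals the 8M and 4M eventfds and current_threshold
 * becomes -1.
 */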
4378static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
4379{
4380	struct mem_cgroup_threshold_ary *t;
4381	u64 usage;
4382	int i;
4383
4384	rcu_read_lock();
4385	if (!swap)
4386		t = rcu_dereference(memcg->thresholds.primary);
4387	else
4388		t = rcu_dereference(memcg->memsw_thresholds.primary);
4389
4390	if (!t)
4391		goto unlock;
4392
4393	usage = mem_cgroup_usage(memcg, swap);
4394
4395	/*
4396	 * current_threshold points to the threshold just below usage.
4397	 * If that is not true, a threshold was crossed after the last
4398	 * call of __mem_cgroup_threshold().
4399	 */
4400	i = t->current_threshold;
4401
4402	/*
4403	 * Iterate backward over array of thresholds starting from
4404	 * current_threshold and check if a threshold is crossed.
4405	 * If none of the thresholds below usage is crossed, we read
4406	 * only one element of the array here.
4407	 */
4408	for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
4409		eventfd_signal(t->entries[i].eventfd, 1);
4410
4411	/* i = current_threshold + 1 */
4412	i++;
4413
4414	/*
4415	 * Iterate forward over array of thresholds starting from
4416	 * current_threshold+1 and check if a threshold is crossed.
4417	 * If none of the thresholds above usage is crossed, we read
4418	 * only one element of the array here.
4419	 */
4420	for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
4421		eventfd_signal(t->entries[i].eventfd, 1);
4422
4423	/* Update current_threshold */
4424	t->current_threshold = i - 1;
4425unlock:
4426	rcu_read_unlock();
4427}
4428
4429static void mem_cgroup_threshold(struct mem_cgroup *memcg)
4430{
4431	while (memcg) {
4432		__mem_cgroup_threshold(memcg, false);
4433		if (do_swap_account)
4434			__mem_cgroup_threshold(memcg, true);
4435
4436		memcg = parent_mem_cgroup(memcg);
4437	}
4438}
4439
4440static int compare_thresholds(const void *a, const void *b)
4441{
4442	const struct mem_cgroup_threshold *_a = a;
4443	const struct mem_cgroup_threshold *_b = b;
4444
4445	return _a->threshold - _b->threshold;
4446}
4447
4448static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
4449{
4450	struct mem_cgroup_eventfd_list *ev;
4451
4452	list_for_each_entry(ev, &memcg->oom_notify, list)
4453		eventfd_signal(ev->eventfd, 1);
4454	return 0;
4455}
4456
4457static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
4458{
4459	struct mem_cgroup *iter;
4460
4461	for_each_mem_cgroup_tree(iter, memcg)
4462		mem_cgroup_oom_notify_cb(iter);
4463}
4464
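/*
 * Threshold registration is driven from userspace via cgroup.event_control.
 * A rough sketch (the mount point and the 64M threshold are only examples):
 *
 *	int efd = eventfd(0, 0);
 *	int ufd = open("/cgroup/memory/grp/memory.usage_in_bytes", O_RDONLY);
 *	int cfd = open("/cgroup/memory/grp/cgroup.event_control", O_WRONLY);
 *	char buf[64];
 *	uint64_t cnt;
 *
 *	snprintf(buf, sizeof(buf), "%d %d %llu", efd, ufd, 64ULL << 20);
 *	write(cfd, buf, strlen(buf));   (parsed by the cgroup core, which
 *	                                 ends up here via ->register_event)
 *	read(efd, &cnt, sizeof(cnt));   (blocks until a threshold is crossed)
 */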
4465static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
4466	struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
4467{
4468	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4469	struct mem_cgroup_thresholds *thresholds;
4470	struct mem_cgroup_threshold_ary *new;
4471	int type = MEMFILE_TYPE(cft->private);
4472	u64 threshold, usage;
4473	int i, size, ret;
4474
4475	ret = res_counter_memparse_write_strategy(args, &threshold);
4476	if (ret)
4477		return ret;
4478
4479	mutex_lock(&memcg->thresholds_lock);
4480
4481	if (type == _MEM)
4482		thresholds = &memcg->thresholds;
4483	else if (type == _MEMSWAP)
4484		thresholds = &memcg->memsw_thresholds;
4485	else
4486		BUG();
4487
4488	usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
4489
4490	/* Check if a threshold crossed before adding a new one */
4491	if (thresholds->primary)
4492		__mem_cgroup_threshold(memcg, type == _MEMSWAP);
4493
4494	size = thresholds->primary ? thresholds->primary->size + 1 : 1;
4495
4496	/* Allocate memory for new array of thresholds */
4497	new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
4498			GFP_KERNEL);
4499	if (!new) {
4500		ret = -ENOMEM;
4501		goto unlock;
4502	}
4503	new->size = size;
4504
4505	/* Copy thresholds (if any) to new array */
4506	if (thresholds->primary) {
4507		memcpy(new->entries, thresholds->primary->entries, (size - 1) *
4508				sizeof(struct mem_cgroup_threshold));
4509	}
4510
4511	/* Add new threshold */
4512	new->entries[size - 1].eventfd = eventfd;
4513	new->entries[size - 1].threshold = threshold;
4514
4515	/* Sort thresholds. Registering a new threshold isn't time-critical */
4516	sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
4517			compare_thresholds, NULL);
4518
4519	/* Find current threshold */
4520	new->current_threshold = -1;
4521	for (i = 0; i < size; i++) {
4522		if (new->entries[i].threshold < usage) {
4523			/*
4524			 * new->current_threshold will not be used until
4525			 * rcu_assign_pointer(), so it's safe to increment
4526			 * it here.
4527			 */
4528			++new->current_threshold;
4529		}
4530	}
4531
4532	/* Free old spare buffer and save old primary buffer as spare */
4533	kfree(thresholds->spare);
4534	thresholds->spare = thresholds->primary;
4535
4536	rcu_assign_pointer(thresholds->primary, new);
4537
4538	/* To be sure that nobody uses thresholds */
4539	synchronize_rcu();
4540
4541unlock:
4542	mutex_unlock(&memcg->thresholds_lock);
4543
4544	return ret;
4545}
4546
4547static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
4548	struct cftype *cft, struct eventfd_ctx *eventfd)
4549{
4550	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4551	struct mem_cgroup_thresholds *thresholds;
4552	struct mem_cgroup_threshold_ary *new;
4553	int type = MEMFILE_TYPE(cft->private);
4554	u64 usage;
4555	int i, j, size;
4556
4557	mutex_lock(&memcg->thresholds_lock);
4558	if (type == _MEM)
4559		thresholds = &memcg->thresholds;
4560	else if (type == _MEMSWAP)
4561		thresholds = &memcg->memsw_thresholds;
4562	else
4563		BUG();
4564
4565	/*
4566	 * Something went wrong if we are trying to unregister a threshold
4567	 * when we don't have any thresholds.
4568	 */
4569	BUG_ON(!thresholds);
4570
4571	usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
4572
4573	/* Check if a threshold crossed before removing */
4574	__mem_cgroup_threshold(memcg, type == _MEMSWAP);
4575
4576	/* Calculate the new number of thresholds */
4577	size = 0;
4578	for (i = 0; i < thresholds->primary->size; i++) {
4579		if (thresholds->primary->entries[i].eventfd != eventfd)
4580			size++;
4581	}
4582
4583	new = thresholds->spare;
4584
4585	/* Set thresholds array to NULL if we don't have thresholds */
4586	if (!size) {
4587		kfree(new);
4588		new = NULL;
4589		goto swap_buffers;
4590	}
4591
4592	new->size = size;
4593
4594	/* Copy thresholds and find current threshold */
4595	new->current_threshold = -1;
4596	for (i = 0, j = 0; i < thresholds->primary->size; i++) {
4597		if (thresholds->primary->entries[i].eventfd == eventfd)
4598			continue;
4599
4600		new->entries[j] = thresholds->primary->entries[i];
4601		if (new->entries[j].threshold < usage) {
4602			/*
4603			 * new->current_threshold will not be used
4604			 * until rcu_assign_pointer(), so it's safe to increment
4605			 * it here.
4606			 */
4607			++new->current_threshold;
4608		}
4609		j++;
4610	}
4611
4612swap_buffers:
4613	/* Swap primary and spare array */
4614	thresholds->spare = thresholds->primary;
4615	rcu_assign_pointer(thresholds->primary, new);
4616
4617	/* To be sure that nobody uses thresholds */
4618	synchronize_rcu();
4619
4620	mutex_unlock(&memcg->thresholds_lock);
4621}
4622
4623static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
4624	struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
4625{
4626	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4627	struct mem_cgroup_eventfd_list *event;
4628	int type = MEMFILE_TYPE(cft->private);
4629
4630	BUG_ON(type != _OOM_TYPE);
4631	event = kmalloc(sizeof(*event),	GFP_KERNEL);
4632	if (!event)
4633		return -ENOMEM;
4634
4635	spin_lock(&memcg_oom_lock);
4636
4637	event->eventfd = eventfd;
4638	list_add(&event->list, &memcg->oom_notify);
4639
4640	/* already in OOM ? */
4641	if (atomic_read(&memcg->under_oom))
4642		eventfd_signal(eventfd, 1);
4643	spin_unlock(&memcg_oom_lock);
4644
4645	return 0;
4646}
4647
4648static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
4649	struct cftype *cft, struct eventfd_ctx *eventfd)
4650{
4651	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4652	struct mem_cgroup_eventfd_list *ev, *tmp;
4653	int type = MEMFILE_TYPE(cft->private);
4654
4655	BUG_ON(type != _OOM_TYPE);
4656
4657	spin_lock(&memcg_oom_lock);
4658
4659	list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
4660		if (ev->eventfd == eventfd) {
4661			list_del(&ev->list);
4662			kfree(ev);
4663		}
4664	}
4665
4666	spin_unlock(&memcg_oom_lock);
4667}
4668
4669static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
4670	struct cftype *cft,  struct cgroup_map_cb *cb)
4671{
4672	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4673
4674	cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable);
4675
4676	if (atomic_read(&memcg->under_oom))
4677		cb->fill(cb, "under_oom", 1);
4678	else
4679		cb->fill(cb, "under_oom", 0);
4680	return 0;
4681}
4682
4683static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
4684	struct cftype *cft, u64 val)
4685{
4686	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4687	struct mem_cgroup *parent;
4688
4689	/* cannot set to root cgroup and only 0 and 1 are allowed */
4690	if (!cgrp->parent || !((val == 0) || (val == 1)))
4691		return -EINVAL;
4692
4693	parent = mem_cgroup_from_cont(cgrp->parent);
4694
4695	cgroup_lock();
4696	/* oom_kill_disable is a flag for the whole subhierarchy. */
4697	if ((parent->use_hierarchy) ||
4698	    (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
4699		cgroup_unlock();
4700		return -EINVAL;
4701	}
4702	memcg->oom_kill_disable = val;
4703	if (!val)
4704		memcg_oom_recover(memcg);
4705	cgroup_unlock();
4706	return 0;
4707}
4708
4709#ifdef CONFIG_NUMA
4710static const struct file_operations mem_control_numa_stat_file_operations = {
4711	.read = seq_read,
4712	.llseek = seq_lseek,
4713	.release = single_release,
4714};
4715
4716static int mem_control_numa_stat_open(struct inode *unused, struct file *file)
4717{
4718	struct cgroup *cont = file->f_dentry->d_parent->d_fsdata;
4719
4720	file->f_op = &mem_control_numa_stat_file_operations;
4721	return single_open(file, mem_control_numa_stat_show, cont);
4722}
4723#endif /* CONFIG_NUMA */
4724
4725#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
4726static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss)
4727{
4728	/*
4729	 * Part of this would be better living in a separate allocation
4730	 * function, leaving us with just the cgroup tree population work.
4731	 * We, however, depend on state such as the network's proto_list that
4732	 * is only initialized after cgroup creation. The least cumbersome
4733	 * way to deal with this is to defer it all to populate time.
4734	 */
4735	return mem_cgroup_sockets_init(cont, ss);
4736};
4737
4738static void kmem_cgroup_destroy(struct cgroup_subsys *ss,
4739				struct cgroup *cont)
4740{
4741	mem_cgroup_sockets_destroy(cont, ss);
4742}
4743#else
4744static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss)
4745{
4746	return 0;
4747}
4748
4749static void kmem_cgroup_destroy(struct cgroup_subsys *ss,
4750				struct cgroup *cont)
4751{
4752}
4753#endif
4754
4755static struct cftype mem_cgroup_files[] = {
4756	{
4757		.name = "usage_in_bytes",
4758		.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
4759		.read_u64 = mem_cgroup_read,
4760		.register_event = mem_cgroup_usage_register_event,
4761		.unregister_event = mem_cgroup_usage_unregister_event,
4762	},
4763	{
4764		.name = "max_usage_in_bytes",
4765		.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
4766		.trigger = mem_cgroup_reset,
4767		.read_u64 = mem_cgroup_read,
4768	},
4769	{
4770		.name = "limit_in_bytes",
4771		.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
4772		.write_string = mem_cgroup_write,
4773		.read_u64 = mem_cgroup_read,
4774	},
4775	{
4776		.name = "soft_limit_in_bytes",
4777		.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
4778		.write_string = mem_cgroup_write,
4779		.read_u64 = mem_cgroup_read,
4780	},
4781	{
4782		.name = "failcnt",
4783		.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
4784		.trigger = mem_cgroup_reset,
4785		.read_u64 = mem_cgroup_read,
4786	},
4787	{
4788		.name = "stat",
4789		.read_map = mem_control_stat_show,
4790	},
4791	{
4792		.name = "force_empty",
4793		.trigger = mem_cgroup_force_empty_write,
4794	},
4795	{
4796		.name = "use_hierarchy",
4797		.write_u64 = mem_cgroup_hierarchy_write,
4798		.read_u64 = mem_cgroup_hierarchy_read,
4799	},
4800	{
4801		.name = "swappiness",
4802		.read_u64 = mem_cgroup_swappiness_read,
4803		.write_u64 = mem_cgroup_swappiness_write,
4804	},
4805	{
4806		.name = "move_charge_at_immigrate",
4807		.read_u64 = mem_cgroup_move_charge_read,
4808		.write_u64 = mem_cgroup_move_charge_write,
4809	},
4810	{
4811		.name = "oom_control",
4812		.read_map = mem_cgroup_oom_control_read,
4813		.write_u64 = mem_cgroup_oom_control_write,
4814		.register_event = mem_cgroup_oom_register_event,
4815		.unregister_event = mem_cgroup_oom_unregister_event,
4816		.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
4817	},
4818#ifdef CONFIG_NUMA
4819	{
4820		.name = "numa_stat",
4821		.open = mem_control_numa_stat_open,
4822		.mode = S_IRUGO,
4823	},
4824#endif
4825};
4826
4827#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
4828static struct cftype memsw_cgroup_files[] = {
4829	{
4830		.name = "memsw.usage_in_bytes",
4831		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
4832		.read_u64 = mem_cgroup_read,
4833		.register_event = mem_cgroup_usage_register_event,
4834		.unregister_event = mem_cgroup_usage_unregister_event,
4835	},
4836	{
4837		.name = "memsw.max_usage_in_bytes",
4838		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
4839		.trigger = mem_cgroup_reset,
4840		.read_u64 = mem_cgroup_read,
4841	},
4842	{
4843		.name = "memsw.limit_in_bytes",
4844		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
4845		.write_string = mem_cgroup_write,
4846		.read_u64 = mem_cgroup_read,
4847	},
4848	{
4849		.name = "memsw.failcnt",
4850		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
4851		.trigger = mem_cgroup_reset,
4852		.read_u64 = mem_cgroup_read,
4853	},
4854};
4855
4856static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
4857{
4858	if (!do_swap_account)
4859		return 0;
4860	return cgroup_add_files(cont, ss, memsw_cgroup_files,
4861				ARRAY_SIZE(memsw_cgroup_files));
4862};
4863#else
4864static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
4865{
4866	return 0;
4867}
4868#endif
4869
4870static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4871{
4872	struct mem_cgroup_per_node *pn;
4873	struct mem_cgroup_per_zone *mz;
4874	enum lru_list l;
4875	int zone, tmp = node;
4876	/*
4877	 * This routine is called for each possible node, but it is a BUG
4878	 * to call kmalloc() against an offline node.
4879	 *
4880	 * TODO: this routine can waste much memory for nodes which will
4881	 *       never be onlined. It would be better to use a memory hotplug
4882	 *       function.
4883	 */
4884	if (!node_state(node, N_NORMAL_MEMORY))
4885		tmp = -1;
4886	pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
4887	if (!pn)
4888		return 1;
4889
4890	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4891		mz = &pn->zoneinfo[zone];
4892		for_each_lru(l)
4893			INIT_LIST_HEAD(&mz->lists[l]);
4894		mz->usage_in_excess = 0;
4895		mz->on_tree = false;
4896		mz->mem = memcg;
4897	}
4898	memcg->info.nodeinfo[node] = pn;
4899	return 0;
4900}
4901
4902static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4903{
4904	kfree(memcg->info.nodeinfo[node]);
4905}
4906
4907static struct mem_cgroup *mem_cgroup_alloc(void)
4908{
4909	struct mem_cgroup *mem;
4910	int size = sizeof(struct mem_cgroup);
4911
4912	/* Can be very big if MAX_NUMNODES is very big */
4913	if (size < PAGE_SIZE)
4914		mem = kzalloc(size, GFP_KERNEL);
4915	else
4916		mem = vzalloc(size);
4917
4918	if (!mem)
4919		return NULL;
4920
4921	mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
4922	if (!mem->stat)
4923		goto out_free;
4924	spin_lock_init(&mem->pcp_counter_lock);
4925	return mem;
4926
4927out_free:
4928	if (size < PAGE_SIZE)
4929		kfree(mem);
4930	else
4931		vfree(mem);
4932	return NULL;
4933}
4934
4935/*
4936 * When destroying a mem_cgroup, references from swap_cgroup can remain.
4937 * (scanning all at force_empty is too costly...)
4938 *
4939 * Instead of clearing all references at force_empty, we remember
4940 * the number of references from swap_cgroup and free the mem_cgroup when
4941 * it goes down to 0.
4942 *
4943 * Removal of cgroup itself succeeds regardless of refs from swap.
4944 */
4945
4946static void __mem_cgroup_free(struct mem_cgroup *memcg)
4947{
4948	int node;
4949
4950	mem_cgroup_remove_from_trees(memcg);
4951	free_css_id(&mem_cgroup_subsys, &memcg->css);
4952
4953	for_each_node_state(node, N_POSSIBLE)
4954		free_mem_cgroup_per_zone_info(memcg, node);
4955
4956	free_percpu(memcg->stat);
4957	if (sizeof(struct mem_cgroup) < PAGE_SIZE)
4958		kfree(memcg);
4959	else
4960		vfree(memcg);
4961}
4962
4963static void mem_cgroup_get(struct mem_cgroup *memcg)
4964{
4965	atomic_inc(&memcg->refcnt);
4966}
4967
4968static void __mem_cgroup_put(struct mem_cgroup *memcg, int count)
4969{
4970	if (atomic_sub_and_test(count, &memcg->refcnt)) {
4971		struct mem_cgroup *parent = parent_mem_cgroup(memcg);
4972		__mem_cgroup_free(memcg);
4973		if (parent)
4974			mem_cgroup_put(parent);
4975	}
4976}
4977
4978static void mem_cgroup_put(struct mem_cgroup *memcg)
4979{
4980	__mem_cgroup_put(memcg, 1);
4981}
4982
4983/*
4984 * Returns the parent mem_cgroup in the memcg hierarchy, when use_hierarchy is enabled.
4985 */
4986struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
4987{
4988	if (!memcg->res.parent)
4989		return NULL;
4990	return mem_cgroup_from_res_counter(memcg->res.parent, res);
4991}
4992EXPORT_SYMBOL(parent_mem_cgroup);
4993
4994#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
4995static void __init enable_swap_cgroup(void)
4996{
4997	if (!mem_cgroup_disabled() && really_do_swap_account)
4998		do_swap_account = 1;
4999}
5000#else
5001static void __init enable_swap_cgroup(void)
5002{
5003}
5004#endif
5005
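/*
 * Allocate the per-node, per-zone RB tree roots used by the soft limit
 * machinery; each zone keeps its own tree of memcgs ordered by how far
 * they exceed their soft limit.
 */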
5006static int mem_cgroup_soft_limit_tree_init(void)
5007{
5008	struct mem_cgroup_tree_per_node *rtpn;
5009	struct mem_cgroup_tree_per_zone *rtpz;
5010	int tmp, node, zone;
5011
5012	for_each_node_state(node, N_POSSIBLE) {
5013		tmp = node;
5014		if (!node_state(node, N_NORMAL_MEMORY))
5015			tmp = -1;
5016		rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
5017		if (!rtpn)
5018			return 1;
5019
5020		soft_limit_tree.rb_tree_per_node[node] = rtpn;
5021
5022		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
5023			rtpz = &rtpn->rb_tree_per_zone[zone];
5024			rtpz->rb_root = RB_ROOT;
5025			spin_lock_init(&rtpz->lock);
5026		}
5027	}
5028	return 0;
5029}
5030
5031static struct cgroup_subsys_state * __ref
5032mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
5033{
5034	struct mem_cgroup *memcg, *parent;
5035	long error = -ENOMEM;
5036	int node;
5037
5038	memcg = mem_cgroup_alloc();
5039	if (!memcg)
5040		return ERR_PTR(error);
5041
5042	for_each_node_state(node, N_POSSIBLE)
5043		if (alloc_mem_cgroup_per_zone_info(memcg, node))
5044			goto free_out;
5045
5046	/* root ? */
5047	if (cont->parent == NULL) {
5048		int cpu;
5049		enable_swap_cgroup();
5050		parent = NULL;
5051		if (mem_cgroup_soft_limit_tree_init())
5052			goto free_out;
5053		root_mem_cgroup = memcg;
5054		for_each_possible_cpu(cpu) {
5055			struct memcg_stock_pcp *stock =
5056						&per_cpu(memcg_stock, cpu);
5057			INIT_WORK(&stock->work, drain_local_stock);
5058		}
5059		hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
5060	} else {
5061		parent = mem_cgroup_from_cont(cont->parent);
5062		memcg->use_hierarchy = parent->use_hierarchy;
5063		memcg->oom_kill_disable = parent->oom_kill_disable;
5064	}
5065
5066	if (parent && parent->use_hierarchy) {
5067		res_counter_init(&memcg->res, &parent->res);
5068		res_counter_init(&memcg->memsw, &parent->memsw);
5069		/*
5070		 * We increment refcnt of the parent to ensure that we can
5071		 * safely access it on res_counter_charge/uncharge.
5072		 * This refcnt will be decremented when freeing this
5073		 * mem_cgroup(see mem_cgroup_put).
5074		 */
5075		mem_cgroup_get(parent);
5076	} else {
5077		res_counter_init(&memcg->res, NULL);
5078		res_counter_init(&memcg->memsw, NULL);
5079	}
5080	memcg->last_scanned_child = 0;
5081	memcg->last_scanned_node = MAX_NUMNODES;
5082	INIT_LIST_HEAD(&memcg->oom_notify);
5083
5084	if (parent)
5085		memcg->swappiness = mem_cgroup_swappiness(parent);
5086	atomic_set(&memcg->refcnt, 1);
5087	memcg->move_charge_at_immigrate = 0;
5088	mutex_init(&memcg->thresholds_lock);
5089	return &memcg->css;
5090free_out:
5091	__mem_cgroup_free(memcg);
5092	return ERR_PTR(error);
5093}
5094
5095static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
5096					struct cgroup *cont)
5097{
5098	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5099
5100	return mem_cgroup_force_empty(memcg, false);
5101}
5102
5103static void mem_cgroup_destroy(struct cgroup_subsys *ss,
5104				struct cgroup *cont)
5105{
5106	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5107
5108	kmem_cgroup_destroy(ss, cont);
5109
5110	mem_cgroup_put(memcg);
5111}
5112
5113static int mem_cgroup_populate(struct cgroup_subsys *ss,
5114				struct cgroup *cont)
5115{
5116	int ret;
5117
5118	ret = cgroup_add_files(cont, ss, mem_cgroup_files,
5119				ARRAY_SIZE(mem_cgroup_files));
5120
5121	if (!ret)
5122		ret = register_memsw_files(cont, ss);
5123
5124	if (!ret)
5125		ret = register_kmem_files(cont, ss);
5126
5127	return ret;
5128}
5129
5130#ifdef CONFIG_MMU
5131/* Handlers for move charge at task migration. */
5132#define PRECHARGE_COUNT_AT_ONCE	256
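/*
 * Precharge "count" pages to mc.to before any charges are moved. First
 * try to charge the whole batch with a single res_counter_charge(); if
 * that fails, fall back to charging page by page, rescheduling every
 * PRECHARGE_COUNT_AT_ONCE pages.
 */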
5133static int mem_cgroup_do_precharge(unsigned long count)
5134{
5135	int ret = 0;
5136	int batch_count = PRECHARGE_COUNT_AT_ONCE;
5137	struct mem_cgroup *memcg = mc.to;
5138
5139	if (mem_cgroup_is_root(memcg)) {
5140		mc.precharge += count;
5141		/* we don't need css_get for root */
5142		return ret;
5143	}
5144	/* try to charge at once */
5145	if (count > 1) {
5146		struct res_counter *dummy;
5147		/*
5148		 * "memcg" cannot be under rmdir() because we've already checked
5149		 * by cgroup_lock_live_cgroup() that it is not removed and we
5150		 * are still under the same cgroup_mutex. So we can postpone
5151		 * css_get().
5152		 */
5153		if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy))
5154			goto one_by_one;
5155		if (do_swap_account && res_counter_charge(&memcg->memsw,
5156						PAGE_SIZE * count, &dummy)) {
5157			res_counter_uncharge(&memcg->res, PAGE_SIZE * count);
5158			goto one_by_one;
5159		}
5160		mc.precharge += count;
5161		return ret;
5162	}
5163one_by_one:
5164	/* fall back to one by one charge */
5165	while (count--) {
5166		if (signal_pending(current)) {
5167			ret = -EINTR;
5168			break;
5169		}
5170		if (!batch_count--) {
5171			batch_count = PRECHARGE_COUNT_AT_ONCE;
5172			cond_resched();
5173		}
5174		ret = __mem_cgroup_try_charge(NULL,
5175					GFP_KERNEL, 1, &memcg, false);
5176		if (ret || !memcg)
5177			/* mem_cgroup_clear_mc() will do uncharge later */
5178			return -ENOMEM;
5179		mc.precharge++;
5180	}
5181	return ret;
5182}
5183
5184/**
5185 * is_target_pte_for_mc - check whether a pte is a valid target for move charge
5186 * @vma: the vma the pte to be checked belongs to
5187 * @addr: the address corresponding to the pte to be checked
5188 * @ptent: the pte to be checked
5189 * @target: pointer where the target page or swap entry will be stored (can be NULL)
5190 *
5191 * Returns
5192 *   0 (MC_TARGET_NONE): the pte is not a target for move charge.
5193 *   1 (MC_TARGET_PAGE): the page corresponding to this pte is a target for
5194 *     move charge. If @target is not NULL, the page is stored in target->page
5195 *     with an extra refcount taken (callers should handle it).
5196 *   2 (MC_TARGET_SWAP): the swap entry corresponding to this pte is a
5197 *     target for charge migration. If @target is not NULL, the entry is stored
5198 *     in target->ent.
5199 *
5200 * Called with pte lock held.
5201 */
5202union mc_target {
5203	struct page	*page;
5204	swp_entry_t	ent;
5205};
5206
5207enum mc_target_type {
5208	MC_TARGET_NONE,	/* not used */
5209	MC_TARGET_PAGE,
5210	MC_TARGET_SWAP,
5211};
5212
5213static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
5214						unsigned long addr, pte_t ptent)
5215{
5216	struct page *page = vm_normal_page(vma, addr, ptent);
5217
5218	if (!page || !page_mapped(page))
5219		return NULL;
5220	if (PageAnon(page)) {
5221		/* we don't move shared anon */
5222		if (!move_anon() || page_mapcount(page) > 2)
5223			return NULL;
5224	} else if (!move_file())
5225		/* we ignore mapcount for file pages */
5226		return NULL;
5227	if (!get_page_unless_zero(page))
5228		return NULL;
5229
5230	return page;
5231}
5232
5233static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5234			unsigned long addr, pte_t ptent, swp_entry_t *entry)
5235{
5236	int usage_count;
5237	struct page *page = NULL;
5238	swp_entry_t ent = pte_to_swp_entry(ptent);
5239
5240	if (!move_anon() || non_swap_entry(ent))
5241		return NULL;
5242	usage_count = mem_cgroup_count_swap_user(ent, &page);
5243	if (usage_count > 1) { /* we don't move shared anon */
5244		if (page)
5245			put_page(page);
5246		return NULL;
5247	}
5248	if (do_swap_account)
5249		entry->val = ent.val;
5250
5251	return page;
5252}
5253
5254static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
5255			unsigned long addr, pte_t ptent, swp_entry_t *entry)
5256{
5257	struct page *page = NULL;
5258	struct inode *inode;
5259	struct address_space *mapping;
5260	pgoff_t pgoff;
5261
5262	if (!vma->vm_file) /* anonymous vma */
5263		return NULL;
5264	if (!move_file())
5265		return NULL;
5266
5267	inode = vma->vm_file->f_path.dentry->d_inode;
5268	mapping = vma->vm_file->f_mapping;
5269	if (pte_none(ptent))
5270		pgoff = linear_page_index(vma, addr);
5271	else /* pte_file(ptent) is true */
5272		pgoff = pte_to_pgoff(ptent);
5273
5274	/* page is moved even if it's not RSS of this task (page-faulted). */
5275	page = find_get_page(mapping, pgoff);
5276
5277#ifdef CONFIG_SWAP
5278	/* shmem/tmpfs may report page out on swap: account for that too. */
5279	if (radix_tree_exceptional_entry(page)) {
5280		swp_entry_t swap = radix_to_swp_entry(page);
5281		if (do_swap_account)
5282			*entry = swap;
5283		page = find_get_page(&swapper_space, swap.val);
5284	}
5285#endif
5286	return page;
5287}
5288
5289static int is_target_pte_for_mc(struct vm_area_struct *vma,
5290		unsigned long addr, pte_t ptent, union mc_target *target)
5291{
5292	struct page *page = NULL;
5293	struct page_cgroup *pc;
5294	int ret = 0;
5295	swp_entry_t ent = { .val = 0 };
5296
5297	if (pte_present(ptent))
5298		page = mc_handle_present_pte(vma, addr, ptent);
5299	else if (is_swap_pte(ptent))
5300		page = mc_handle_swap_pte(vma, addr, ptent, &ent);
5301	else if (pte_none(ptent) || pte_file(ptent))
5302		page = mc_handle_file_pte(vma, addr, ptent, &ent);
5303
5304	if (!page && !ent.val)
5305		return 0;
5306	if (page) {
5307		pc = lookup_page_cgroup(page);
5308		/*
5309		 * Only do a loose check without the page_cgroup lock;
5310		 * mem_cgroup_move_account() checks whether the pc is valid under
5311		 * the lock.
5312		 */
5313		if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
5314			ret = MC_TARGET_PAGE;
5315			if (target)
5316				target->page = page;
5317		}
5318		if (!ret || !target)
5319			put_page(page);
5320	}
5321	/* There is a swap entry and a page doesn't exist or isn't charged */
5322	if (ent.val && !ret &&
5323			css_id(&mc.from->css) == lookup_swap_cgroup(ent)) {
5324		ret = MC_TARGET_SWAP;
5325		if (target)
5326			target->ent = ent;
5327	}
5328	return ret;
5329}
5330
5331static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
5332					unsigned long addr, unsigned long end,
5333					struct mm_walk *walk)
5334{
5335	struct vm_area_struct *vma = walk->private;
5336	pte_t *pte;
5337	spinlock_t *ptl;
5338
5339	split_huge_page_pmd(walk->mm, pmd);
5340
5341	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5342	for (; addr != end; pte++, addr += PAGE_SIZE)
5343		if (is_target_pte_for_mc(vma, addr, *pte, NULL))
5344			mc.precharge++;	/* increment precharge temporarily */
5345	pte_unmap_unlock(pte - 1, ptl);
5346	cond_resched();
5347
5348	return 0;
5349}
5350
5351static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
5352{
5353	unsigned long precharge;
5354	struct vm_area_struct *vma;
5355
5356	down_read(&mm->mmap_sem);
5357	for (vma = mm->mmap; vma; vma = vma->vm_next) {
5358		struct mm_walk mem_cgroup_count_precharge_walk = {
5359			.pmd_entry = mem_cgroup_count_precharge_pte_range,
5360			.mm = mm,
5361			.private = vma,
5362		};
5363		if (is_vm_hugetlb_page(vma))
5364			continue;
5365		walk_page_range(vma->vm_start, vma->vm_end,
5366					&mem_cgroup_count_precharge_walk);
5367	}
5368	up_read(&mm->mmap_sem);
5369
5370	precharge = mc.precharge;
5371	mc.precharge = 0;
5372
5373	return precharge;
5374}
5375
5376static int mem_cgroup_precharge_mc(struct mm_struct *mm)
5377{
5378	unsigned long precharge = mem_cgroup_count_precharge(mm);
5379
5380	VM_BUG_ON(mc.moving_task);
5381	mc.moving_task = current;
5382	return mem_cgroup_do_precharge(precharge);
5383}
5384
5385/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
5386static void __mem_cgroup_clear_mc(void)
5387{
5388	struct mem_cgroup *from = mc.from;
5389	struct mem_cgroup *to = mc.to;
5390
5391	/* we must uncharge all the leftover precharges from mc.to */
5392	if (mc.precharge) {
5393		__mem_cgroup_cancel_charge(mc.to, mc.precharge);
5394		mc.precharge = 0;
5395	}
5396	/*
5397	 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
5398	 * we must uncharge here.
5399	 */
5400	if (mc.moved_charge) {
5401		__mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
5402		mc.moved_charge = 0;
5403	}
5404	/* we must fixup refcnts and charges */
5405	if (mc.moved_swap) {
5406		/* uncharge swap account from the old cgroup */
5407		if (!mem_cgroup_is_root(mc.from))
5408			res_counter_uncharge(&mc.from->memsw,
5409						PAGE_SIZE * mc.moved_swap);
5410		__mem_cgroup_put(mc.from, mc.moved_swap);
5411
5412		if (!mem_cgroup_is_root(mc.to)) {
5413			/*
5414			 * we charged both to->res and to->memsw, so we should
5415			 * uncharge to->res.
5416			 */
5417			res_counter_uncharge(&mc.to->res,
5418						PAGE_SIZE * mc.moved_swap);
5419		}
5420		/* we've already done mem_cgroup_get(mc.to) */
5421		mc.moved_swap = 0;
5422	}
5423	memcg_oom_recover(from);
5424	memcg_oom_recover(to);
5425	wake_up_all(&mc.waitq);
5426}
5427
5428static void mem_cgroup_clear_mc(void)
5429{
5430	struct mem_cgroup *from = mc.from;
5431
5432	/*
5433	 * we must clear moving_task before waking up waiters at the end of
5434	 * task migration.
5435	 */
5436	mc.moving_task = NULL;
5437	__mem_cgroup_clear_mc();
5438	spin_lock(&mc.lock);
5439	mc.from = NULL;
5440	mc.to = NULL;
5441	spin_unlock(&mc.lock);
5442	mem_cgroup_end_move(from);
5443}
5444
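/*
 * Move-charge life cycle during task migration: can_attach() records
 * mc.from/mc.to and precharges the pages found in the task's mm,
 * attach() (mem_cgroup_move_task) walks the page tables again and
 * actually moves the charges, and cancel_attach()/mem_cgroup_clear_mc()
 * undo everything if the migration is aborted.
 */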
5445static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
5446				struct cgroup *cgroup,
5447				struct cgroup_taskset *tset)
5448{
5449	struct task_struct *p = cgroup_taskset_first(tset);
5450	int ret = 0;
5451	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup);
5452
5453	if (memcg->move_charge_at_immigrate) {
5454		struct mm_struct *mm;
5455		struct mem_cgroup *from = mem_cgroup_from_task(p);
5456
5457		VM_BUG_ON(from == memcg);
5458
5459		mm = get_task_mm(p);
5460		if (!mm)
5461			return 0;
5462		/* We move charges only when we move an owner of the mm */
5463		if (mm->owner == p) {
5464			VM_BUG_ON(mc.from);
5465			VM_BUG_ON(mc.to);
5466			VM_BUG_ON(mc.precharge);
5467			VM_BUG_ON(mc.moved_charge);
5468			VM_BUG_ON(mc.moved_swap);
5469			mem_cgroup_start_move(from);
5470			spin_lock(&mc.lock);
5471			mc.from = from;
5472			mc.to = memcg;
5473			spin_unlock(&mc.lock);
5474			/* We set mc.moving_task later */
5475
5476			ret = mem_cgroup_precharge_mc(mm);
5477			if (ret)
5478				mem_cgroup_clear_mc();
5479		}
5480		mmput(mm);
5481	}
5482	return ret;
5483}
5484
5485static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
5486				struct cgroup *cgroup,
5487				struct cgroup_taskset *tset)
5488{
5489	mem_cgroup_clear_mc();
5490}
5491
5492static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
5493				unsigned long addr, unsigned long end,
5494				struct mm_walk *walk)
5495{
5496	int ret = 0;
5497	struct vm_area_struct *vma = walk->private;
5498	pte_t *pte;
5499	spinlock_t *ptl;
5500
5501	split_huge_page_pmd(walk->mm, pmd);
5502retry:
5503	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5504	for (; addr != end; addr += PAGE_SIZE) {
5505		pte_t ptent = *(pte++);
5506		union mc_target target;
5507		int type;
5508		struct page *page;
5509		struct page_cgroup *pc;
5510		swp_entry_t ent;
5511
5512		if (!mc.precharge)
5513			break;
5514
5515		type = is_target_pte_for_mc(vma, addr, ptent, &target);
5516		switch (type) {
5517		case MC_TARGET_PAGE:
5518			page = target.page;
5519			if (isolate_lru_page(page))
5520				goto put;
5521			pc = lookup_page_cgroup(page);
5522			if (!mem_cgroup_move_account(page, 1, pc,
5523						     mc.from, mc.to, false)) {
5524				mc.precharge--;
5525				/* we uncharge from mc.from later. */
5526				mc.moved_charge++;
5527			}
5528			putback_lru_page(page);
5529put:			/* is_target_pte_for_mc() gets the page */
5530			put_page(page);
5531			break;
5532		case MC_TARGET_SWAP:
5533			ent = target.ent;
5534			if (!mem_cgroup_move_swap_account(ent,
5535						mc.from, mc.to, false)) {
5536				mc.precharge--;
5537				/* we fixup refcnts and charges later. */
5538				mc.moved_swap++;
5539			}
5540			break;
5541		default:
5542			break;
5543		}
5544	}
5545	pte_unmap_unlock(pte - 1, ptl);
5546	cond_resched();
5547
5548	if (addr != end) {
5549		/*
5550		 * We have consumed all precharges we got in can_attach().
5551		 * We try to charge one by one, but we don't do any additional
5552		 * charges to mc.to if we have already failed to charge once in
5553		 * the attach() phase.
5554		 */
5555		ret = mem_cgroup_do_precharge(1);
5556		if (!ret)
5557			goto retry;
5558	}
5559
5560	return ret;
5561}
5562
5563static void mem_cgroup_move_charge(struct mm_struct *mm)
5564{
5565	struct vm_area_struct *vma;
5566
5567	lru_add_drain_all();
5568retry:
5569	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
5570		/*
5571		 * Someone who is holding the mmap_sem might be waiting on our
5572		 * waitq. So we cancel all extra charges, wake up all waiters,
5573		 * and retry. Because we cancel precharges, we might not be able
5574		 * to move enough charges, but moving charge is a best-effort
5575		 * feature anyway, so it wouldn't be a big problem.
5576		 */
5577		__mem_cgroup_clear_mc();
5578		cond_resched();
5579		goto retry;
5580	}
5581	for (vma = mm->mmap; vma; vma = vma->vm_next) {
5582		int ret;
5583		struct mm_walk mem_cgroup_move_charge_walk = {
5584			.pmd_entry = mem_cgroup_move_charge_pte_range,
5585			.mm = mm,
5586			.private = vma,
5587		};
5588		if (is_vm_hugetlb_page(vma))
5589			continue;
5590		ret = walk_page_range(vma->vm_start, vma->vm_end,
5591						&mem_cgroup_move_charge_walk);
5592		if (ret)
5593			/*
5594			 * This means we have consumed all precharges and failed
5595			 * to do an additional charge. Just abandon here.
5596			 */
5597			break;
5598	}
5599	up_read(&mm->mmap_sem);
5600}
5601
5602static void mem_cgroup_move_task(struct cgroup_subsys *ss,
5603				struct cgroup *cont,
5604				struct cgroup_taskset *tset)
5605{
5606	struct task_struct *p = cgroup_taskset_first(tset);
5607	struct mm_struct *mm = get_task_mm(p);
5608
5609	if (mm) {
5610		if (mc.to)
5611			mem_cgroup_move_charge(mm);
5612		put_swap_token(mm);
5613		mmput(mm);
5614	}
5615	if (mc.to)
5616		mem_cgroup_clear_mc();
5617}
5618#else	/* !CONFIG_MMU */
5619static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
5620				struct cgroup *cgroup,
5621				struct cgroup_taskset *tset)
5622{
5623	return 0;
5624}
5625static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
5626				struct cgroup *cgroup,
5627				struct cgroup_taskset *tset)
5628{
5629}
5630static void mem_cgroup_move_task(struct cgroup_subsys *ss,
5631				struct cgroup *cont,
5632				struct cgroup_taskset *tset)
5633{
5634}
5635#endif
5636
5637struct cgroup_subsys mem_cgroup_subsys = {
5638	.name = "memory",
5639	.subsys_id = mem_cgroup_subsys_id,
5640	.create = mem_cgroup_create,
5641	.pre_destroy = mem_cgroup_pre_destroy,
5642	.destroy = mem_cgroup_destroy,
5643	.populate = mem_cgroup_populate,
5644	.can_attach = mem_cgroup_can_attach,
5645	.cancel_attach = mem_cgroup_cancel_attach,
5646	.attach = mem_cgroup_move_task,
5647	.early_init = 0,
5648	.use_id = 1,
5649};
5650
5651#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
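/*
 * Boot-time control of swap accounting: "swapaccount=1" forces it on,
 * "swapaccount=0" forces it off; without the parameter the
 * CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED default applies. The setting
 * only takes effect if the memory controller itself is enabled (see
 * enable_swap_cgroup()).
 */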
5652static int __init enable_swap_account(char *s)
5653{
5654	/* consider enabled if no parameter or 1 is given */
5655	if (!strcmp(s, "1"))
5656		really_do_swap_account = 1;
5657	else if (!strcmp(s, "0"))
5658		really_do_swap_account = 0;
5659	return 1;
5660}
5661__setup("swapaccount=", enable_swap_account);
5662
5663#endif
5664