memcontrol.c revision f3e8eb70b1807d1b30aa6972af0cf30077c40112
/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * Memory thresholds
 * Copyright (C) 2009 Nokia Corporation
 * Author: Kirill A. Shutemov
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 */

#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/limits.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmalloc.h>
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include "internal.h"

#include <asm/uaccess.h>

#include <trace/events/vmscan.h>

struct cgroup_subsys mem_cgroup_subsys __read_mostly;
#define MEM_CGROUP_RECLAIM_RETRIES	5
struct mem_cgroup *root_mem_cgroup __read_mostly;

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
int do_swap_account __read_mostly;

/* for remembering boot option */
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED
static int really_do_swap_account __initdata = 1;
#else
static int really_do_swap_account __initdata = 0;
#endif

#else
#define do_swap_account		(0)
#endif

/*
 * The per-memcg event counter is incremented at every pagein/pageout. This
 * counter is used to trigger some periodic events. This is straightforward
 * and better than using jiffies etc. to handle periodic memcg events.
 *
 * These values will be used as !((event) & ((1 << (thresh)) - 1))
 */
#define THRESHOLDS_EVENTS_THRESH	(7)	/* once in 128 */
#define SOFTLIMIT_EVENTS_THRESH		(10)	/* once in 1024 */

/*
 * Statistics for memory cgroup.
 */
enum mem_cgroup_stat_index {
	/*
	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
	 */
	MEM_CGROUP_STAT_CACHE,		/* # of pages charged as cache */
	MEM_CGROUP_STAT_RSS,		/* # of pages charged as anon rss */
	MEM_CGROUP_STAT_FILE_MAPPED,	/* # of pages charged as file rss */
	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */
	MEM_CGROUP_STAT_SWAPOUT,	/* # of pages, swapped out */
	MEM_CGROUP_STAT_DATA,		/* end of data requires synchronization */
	/* incremented at every pagein/pageout */
	MEM_CGROUP_EVENTS = MEM_CGROUP_STAT_DATA,
	MEM_CGROUP_ON_MOVE,	/* someone is moving account between groups */

	MEM_CGROUP_STAT_NSTATS,
};

struct mem_cgroup_stat_cpu {
	s64 count[MEM_CGROUP_STAT_NSTATS];
};

/*
 * per-zone information in memory controller.
 */
struct mem_cgroup_per_zone {
	/*
	 * spin_lock to protect the per cgroup LRU
	 */
	struct list_head	lists[NR_LRU_LISTS];
	unsigned long		count[NR_LRU_LISTS];

	struct zone_reclaim_stat reclaim_stat;
	struct rb_node		tree_node;	/* RB tree node */
	unsigned long long	usage_in_excess;/* Set to the value by which */
						/* the soft limit is exceeded */
	bool			on_tree;
	struct mem_cgroup	*mem;		/* Back pointer, we cannot */
						/* use container_of	   */
};
/* Macro for accessing counter */
#define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])

struct mem_cgroup_per_node {
	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};

struct mem_cgroup_lru_info {
	struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
};

/*
 * Cgroups above their limits are maintained in a RB-Tree, independent of
 * their hierarchy representation
 */

struct mem_cgroup_tree_per_zone {
	struct rb_root rb_root;
	spinlock_t lock;
};

struct mem_cgroup_tree_per_node {
	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
};

struct mem_cgroup_tree {
	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};

static struct mem_cgroup_tree soft_limit_tree __read_mostly;

struct mem_cgroup_threshold {
	struct eventfd_ctx *eventfd;
	u64 threshold;
};

/* For threshold */
struct mem_cgroup_threshold_ary {
	/* An array index points to threshold just below usage. */
	int current_threshold;
	/* Size of entries[] */
	unsigned int size;
	/* Array of thresholds */
	struct mem_cgroup_threshold entries[0];
};

struct mem_cgroup_thresholds {
	/* Primary thresholds array */
	struct mem_cgroup_threshold_ary *primary;
	/*
	 * Spare threshold array.
	 * This is needed to make mem_cgroup_unregister_event() "never fail".
	 * It must be able to store at least primary->size - 1 entries.
	 */
	struct mem_cgroup_threshold_ary *spare;
};

/* for OOM */
struct mem_cgroup_eventfd_list {
	struct list_head list;
	struct eventfd_ctx *eventfd;
};

static void mem_cgroup_threshold(struct mem_cgroup *mem);
static void mem_cgroup_oom_notify(struct mem_cgroup *mem);

/*
 * The memory controller data structure. The memory controller controls both
 * page cache and RSS per cgroup. We would eventually like to provide
 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
 * to help the administrator determine what knobs to tune.
 *
 * TODO: Add a water mark for the memory controller. Reclaim will begin when
 * we hit the water mark.
 * Maybe even add a low water mark, such that no reclaim occurs from a cgroup
 * at its low water mark; this is a feature that will be implemented much
 * later in the future.
 */
struct mem_cgroup {
	struct cgroup_subsys_state css;
	/*
	 * the counter to account for memory usage
	 */
	struct res_counter res;
	/*
	 * the counter to account for mem+swap usage.
	 */
	struct res_counter memsw;
	/*
	 * Per cgroup active and inactive list, similar to the
	 * per zone LRU lists.
	 */
	struct mem_cgroup_lru_info info;

	/*
	 * protect against reclaim related members.
	 */
	spinlock_t reclaim_param_lock;

	/*
	 * While reclaiming in a hierarchy, we cache the last child we
	 * reclaimed from.
	 */
	int last_scanned_child;
	/*
	 * Should the accounting and control be hierarchical, per subtree?
	 */
	bool use_hierarchy;
	atomic_t	oom_lock;
	atomic_t	refcnt;

	unsigned int	swappiness;
	/* OOM-Killer disable */
	int		oom_kill_disable;

	/* set when res.limit == memsw.limit */
	bool		memsw_is_minimum;

	/* protect arrays of thresholds */
	struct mutex thresholds_lock;

	/* thresholds for memory usage. RCU-protected */
	struct mem_cgroup_thresholds thresholds;

	/* thresholds for mem+swap usage. RCU-protected */
	struct mem_cgroup_thresholds memsw_thresholds;

	/* For oom notifier event fd */
	struct list_head oom_notify;

	/*
	 * Should we move charges of a task when a task is moved into this
	 * mem_cgroup ? And what type of charges should we move ?
	 */
	unsigned long	move_charge_at_immigrate;
	/*
	 * percpu counter.
	 */
	struct mem_cgroup_stat_cpu *stat;
	/*
	 * used when a cpu is offlined or other synchronizations
	 * See mem_cgroup_read_stat().
	 */
	struct mem_cgroup_stat_cpu nocpu_base;
	spinlock_t pcp_counter_lock;
};

/* Stuff for move charges at task migration. */
/*
 * Types of charges to be moved. "move_charge_at_immigrate" is treated as a
 * left-shifted bitmap of these types.
 */
enum move_type {
	MOVE_CHARGE_TYPE_ANON,	/* private anonymous page and swap of it */
	MOVE_CHARGE_TYPE_FILE,	/* file page (including tmpfs) and swap of it */
	NR_MOVE_TYPE,
};

/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
	spinlock_t	  lock; /* for from, to */
	struct mem_cgroup *from;
	struct mem_cgroup *to;
	unsigned long precharge;
	unsigned long moved_charge;
	unsigned long moved_swap;
	struct task_struct *moving_task;	/* a task moving charges */
	struct mm_struct *mm;
	wait_queue_head_t waitq;		/* a waitq for other context */
} mc = {
	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};

static bool move_anon(void)
{
	return test_bit(MOVE_CHARGE_TYPE_ANON,
					&mc.to->move_charge_at_immigrate);
}

static bool move_file(void)
{
	return test_bit(MOVE_CHARGE_TYPE_FILE,
					&mc.to->move_charge_at_immigrate);
}

/*
 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
 * limit reclaim to prevent infinite loops, if they ever occur.
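 *
 * MEM_CGROUP_MAX_RECLAIM_LOOPS caps the walk at 100 iterations;
 * MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS caps soft limit reclaim at 2.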
 */
#define	MEM_CGROUP_MAX_RECLAIM_LOOPS		(100)
#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	(2)

enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
	MEM_CGROUP_CHARGE_TYPE_MAPPED,
	MEM_CGROUP_CHARGE_TYPE_SHMEM,	/* used by page migration of shmem */
	MEM_CGROUP_CHARGE_TYPE_FORCE,	/* used by force_empty */
	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
	MEM_CGROUP_CHARGE_TYPE_DROP,	/* a page was unused swap cache */
	NR_CHARGE_TYPE,
};

/* only for here (for easy reading.) */
#define PCGF_CACHE	(1UL << PCG_CACHE)
#define PCGF_USED	(1UL << PCG_USED)
#define PCGF_LOCK	(1UL << PCG_LOCK)
/* Not used, but added here for completeness */
#define PCGF_ACCT	(1UL << PCG_ACCT)

/* for encoding cft->private value on file */
#define _MEM			(0)
#define _MEMSWAP		(1)
#define _OOM_TYPE		(2)
#define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
#define MEMFILE_TYPE(val)	(((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)
/* Used for OOM notifier */
#define OOM_CONTROL		(0)

/*
 * Reclaim flags for mem_cgroup_hierarchical_reclaim
 */
#define MEM_CGROUP_RECLAIM_NOSWAP_BIT	0x0
#define MEM_CGROUP_RECLAIM_NOSWAP	(1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
#define MEM_CGROUP_RECLAIM_SHRINK_BIT	0x1
#define MEM_CGROUP_RECLAIM_SHRINK	(1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
#define MEM_CGROUP_RECLAIM_SOFT_BIT	0x2
#define MEM_CGROUP_RECLAIM_SOFT		(1 << MEM_CGROUP_RECLAIM_SOFT_BIT)

static void mem_cgroup_get(struct mem_cgroup *mem);
static void mem_cgroup_put(struct mem_cgroup *mem);
static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
static void drain_all_stock_async(void);

static struct mem_cgroup_per_zone *
mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
{
	return &mem->info.nodeinfo[nid]->zoneinfo[zid];
}

struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem)
{
	return &mem->css;
}

static struct mem_cgroup_per_zone *
page_cgroup_zoneinfo(struct page_cgroup *pc)
{
	struct mem_cgroup *mem = pc->mem_cgroup;
	int nid = page_cgroup_nid(pc);
	int zid = page_cgroup_zid(pc);

	if (!mem)
		return NULL;

	return mem_cgroup_zoneinfo(mem, nid, zid);
}

static struct mem_cgroup_tree_per_zone *
soft_limit_tree_node_zone(int nid, int zid)
{
	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
}

static struct mem_cgroup_tree_per_zone *
soft_limit_tree_from_page(struct page *page)
{
	int nid = page_to_nid(page);
	int zid = page_zonenum(page);

	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
}

static void
__mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
				struct mem_cgroup_per_zone *mz,
				struct mem_cgroup_tree_per_zone *mctz,
				unsigned long long new_usage_in_excess)
{
	struct rb_node **p = &mctz->rb_root.rb_node;
	struct rb_node *parent = NULL;
	struct mem_cgroup_per_zone *mz_node;

	if (mz->on_tree)
		return;

	mz->usage_in_excess = new_usage_in_excess;
	if (!mz->usage_in_excess)
		return;
	while (*p) {
		parent = *p;
		mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
					tree_node);
		if (mz->usage_in_excess < mz_node->usage_in_excess)
			p = &(*p)->rb_left;
		/*
		 * We can't avoid mem cgroups that are over their soft
		 * limit by the same amount
		 */
		else if (mz->usage_in_excess >=
					mz_node->usage_in_excess)
			p = &(*p)->rb_right;
	}
	rb_link_node(&mz->tree_node, parent, p);
	rb_insert_color(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = true;
}

static void
__mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
				struct mem_cgroup_per_zone *mz,
				struct mem_cgroup_tree_per_zone *mctz)
{
	if (!mz->on_tree)
		return;
	rb_erase(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = false;
}

static void
mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
				struct mem_cgroup_per_zone *mz,
				struct mem_cgroup_tree_per_zone *mctz)
{
	spin_lock(&mctz->lock);
	__mem_cgroup_remove_exceeded(mem, mz, mctz);
	spin_unlock(&mctz->lock);
}


static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
{
	unsigned long long excess;
	struct mem_cgroup_per_zone *mz;
	struct mem_cgroup_tree_per_zone *mctz;
	int nid = page_to_nid(page);
	int zid = page_zonenum(page);
	mctz = soft_limit_tree_from_page(page);

	/*
	 * Necessary to update all ancestors when hierarchy is used,
	 * because their event counter is not touched.
	 */
	for (; mem; mem = parent_mem_cgroup(mem)) {
		mz = mem_cgroup_zoneinfo(mem, nid, zid);
		excess = res_counter_soft_limit_excess(&mem->res);
		/*
		 * We have to update the tree if mz is on RB-tree or
		 * mem is over its softlimit.
		 */
		if (excess || mz->on_tree) {
			spin_lock(&mctz->lock);
			/* if on-tree, remove it */
			if (mz->on_tree)
				__mem_cgroup_remove_exceeded(mem, mz, mctz);
			/*
			 * Insert again. mz->usage_in_excess will be updated.
			 * If excess is 0, no tree ops.
			 */
			__mem_cgroup_insert_exceeded(mem, mz, mctz, excess);
			spin_unlock(&mctz->lock);
		}
	}
}

static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
{
	int node, zone;
	struct mem_cgroup_per_zone *mz;
	struct mem_cgroup_tree_per_zone *mctz;

	for_each_node_state(node, N_POSSIBLE) {
		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
			mz = mem_cgroup_zoneinfo(mem, node, zone);
			mctz = soft_limit_tree_node_zone(node, zone);
			mem_cgroup_remove_exceeded(mem, mz, mctz);
		}
	}
}

static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem)
{
	return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT;
}

static struct mem_cgroup_per_zone *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
{
	struct rb_node *rightmost = NULL;
	struct mem_cgroup_per_zone *mz;

retry:
	mz = NULL;
	rightmost = rb_last(&mctz->rb_root);
	if (!rightmost)
		goto done;		/* Nothing to reclaim from */

	mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
	/*
	 * Remove the node now but someone else can add it back;
	 * we will add it back at the end of reclaim to its correct
	 * position in the tree.
	 */
	__mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
	if (!res_counter_soft_limit_excess(&mz->mem->res) ||
		!css_tryget(&mz->mem->css))
		goto retry;
done:
	return mz;
}

static struct mem_cgroup_per_zone *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
{
	struct mem_cgroup_per_zone *mz;

	spin_lock(&mctz->lock);
	mz = __mem_cgroup_largest_soft_limit_node(mctz);
	spin_unlock(&mctz->lock);
	return mz;
}

/*
 * Implementation Note: reading percpu statistics for memcg.
 *
 * Both vmstat[] and percpu_counter have thresholds and do periodic
 * synchronization to implement a "quick" read. There is a trade-off between
 * reading cost and precision of the value. Then, we may have a chance to
 * implement a periodic synchronization of the counter in memcg's counter.
 *
 * But this _read() function is used for the user interface now. The user
 * accounts memory usage by memory cgroup and _always_ requires an exact value
 * because of that accounting. Even if we provided a quick-and-fuzzy read, we
 * would still have to visit all online cpus and make the sum. So, for now,
 * unnecessary synchronization is not implemented. (It is just implemented
 * for cpu hotplug.)
 *
 * If there are kernel internal actions which can make use of some not-exact
 * value, and reading all cpu values can be a performance bottleneck in some
 * common workload, a threshold and synchronization as in vmstat[] should be
 * implemented.
 */
static s64 mem_cgroup_read_stat(struct mem_cgroup *mem,
		enum mem_cgroup_stat_index idx)
{
	int cpu;
	s64 val = 0;

	get_online_cpus();
	for_each_online_cpu(cpu)
		val += per_cpu(mem->stat->count[idx], cpu);
#ifdef CONFIG_HOTPLUG_CPU
	spin_lock(&mem->pcp_counter_lock);
	val += mem->nocpu_base.count[idx];
	spin_unlock(&mem->pcp_counter_lock);
#endif
	put_online_cpus();
	return val;
}

static s64 mem_cgroup_local_usage(struct mem_cgroup *mem)
{
	s64 ret;

	ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
	ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
	return ret;
}

static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
					 bool charge)
{
	int val = (charge) ? 1 : -1;
	this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
}

static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
					 struct page_cgroup *pc,
					 bool charge)
{
	int val = (charge) ? 1 : -1;

	preempt_disable();

	if (PageCgroupCache(pc))
		__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], val);
	else
		__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], val);

	if (charge)
		__this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]);
	else
		__this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]);
	__this_cpu_inc(mem->stat->count[MEM_CGROUP_EVENTS]);

	preempt_enable();
}

static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
					enum lru_list idx)
{
	int nid, zid;
	struct mem_cgroup_per_zone *mz;
	u64 total = 0;

	for_each_online_node(nid)
		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
			mz = mem_cgroup_zoneinfo(mem, nid, zid);
			total += MEM_CGROUP_ZSTAT(mz, idx);
		}
	return total;
}

static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift)
{
	s64 val;

	val = this_cpu_read(mem->stat->count[MEM_CGROUP_EVENTS]);

	return !(val & ((1 << event_mask_shift) - 1));
}

/*
 * Check events in order.
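 * Thresholds are checked once every 128 events and the soft limit tree is
 * updated once every 1024 events (see THRESHOLDS_EVENTS_THRESH and
 * SOFTLIMIT_EVENTS_THRESH above).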
 *
 */
static void memcg_check_events(struct mem_cgroup *mem, struct page *page)
{
	/* threshold event is triggered in finer grain than soft limit */
	if (unlikely(__memcg_event_check(mem, THRESHOLDS_EVENTS_THRESH))) {
		mem_cgroup_threshold(mem);
		if (unlikely(__memcg_event_check(mem, SOFTLIMIT_EVENTS_THRESH)))
			mem_cgroup_update_tree(mem, page);
	}
}

static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
{
	return container_of(cgroup_subsys_state(cont,
				mem_cgroup_subsys_id), struct mem_cgroup,
				css);
}

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	/*
	 * mm_update_next_owner() may clear mm->owner to NULL
	 * if it races with swapoff, page migration, etc.
	 * So this can be called with p == NULL.
	 */
	if (unlikely(!p))
		return NULL;

	return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
				struct mem_cgroup, css);
}

static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
{
	struct mem_cgroup *mem = NULL;

	if (!mm)
		return NULL;
	/*
	 * Because we have no locks, mm->owner may be moved to another
	 * cgroup. We use css_tryget() here even if this looks
	 * pessimistic (rather than adding locks here).
	 */
	rcu_read_lock();
	do {
		mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
		if (unlikely(!mem))
			break;
	} while (!css_tryget(&mem->css));
	rcu_read_unlock();
	return mem;
}

/* The caller has to guarantee "mem" exists before calling this */
static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem)
{
	struct cgroup_subsys_state *css;
	int found;

	if (!mem) /* ROOT cgroup has the smallest ID */
		return root_mem_cgroup; /*css_put/get against root is ignored*/
	if (!mem->use_hierarchy) {
		if (css_tryget(&mem->css))
			return mem;
		return NULL;
	}
	rcu_read_lock();
	/*
	 * searching a memory cgroup which has the smallest ID under given
	 * ROOT cgroup. (ID >= 1)
	 */
	css = css_get_next(&mem_cgroup_subsys, 1, &mem->css, &found);
	if (css && css_tryget(css))
		mem = container_of(css, struct mem_cgroup, css);
	else
		mem = NULL;
	rcu_read_unlock();
	return mem;
}

static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter,
					struct mem_cgroup *root,
					bool cond)
{
	int nextid = css_id(&iter->css) + 1;
	int found;
	int hierarchy_used;
	struct cgroup_subsys_state *css;

	hierarchy_used = iter->use_hierarchy;

	css_put(&iter->css);
	/* If no ROOT, walk all, ignore hierarchy */
	if (!cond || (root && !hierarchy_used))
		return NULL;

	if (!root)
		root = root_mem_cgroup;

	do {
		iter = NULL;
		rcu_read_lock();

		css = css_get_next(&mem_cgroup_subsys, nextid,
				&root->css, &found);
		if (css && css_tryget(css))
			iter = container_of(css, struct mem_cgroup, css);
		rcu_read_unlock();
		/* If css is NULL, no more cgroups will be found */
		nextid = found + 1;
	} while (css && !iter);

	return iter;
}
/*
 * for_each_mem_cgroup_tree() is for visiting all cgroups under a tree. Please
 * be careful that breaking out of the loop is not allowed, because we hold a
 * reference count. Instead, set "cond" to false and "continue" to exit the
 * loop.
 */
#define for_each_mem_cgroup_tree_cond(iter, root, cond)	\
	for (iter = mem_cgroup_start_loop(root);\
	     iter != NULL;\
	     iter = mem_cgroup_get_next(iter, root, cond))

#define for_each_mem_cgroup_tree(iter, root) \
	for_each_mem_cgroup_tree_cond(iter, root, true)

#define for_each_mem_cgroup_all(iter) \
	for_each_mem_cgroup_tree_cond(iter, NULL, true)


static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
{
	return (mem == root_mem_cgroup);
}

/*
 * Following LRU functions are allowed to be used without PCG_LOCK.
 * Operations are called by routine of global LRU independently from memcg.
 * What we have to take care of here is validness of pc->mem_cgroup.
 *
 * Changes to pc->mem_cgroup happens when
 * 1. charge
 * 2. moving account
 * In typical case, "charge" is done before add-to-lru. Exception is SwapCache.
 * It is added to LRU before charge.
 * If PCG_USED bit is not set, page_cgroup is not added to this private LRU.
 * When moving account, the page is not on LRU. It's isolated.
 */

void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
{
	struct page_cgroup *pc;
	struct mem_cgroup_per_zone *mz;

	if (mem_cgroup_disabled())
		return;
	pc = lookup_page_cgroup(page);
	/* can happen while we handle swapcache. */
	if (!TestClearPageCgroupAcctLRU(pc))
		return;
	VM_BUG_ON(!pc->mem_cgroup);
	/*
	 * We don't check PCG_USED bit. It's cleared when the "page" is finally
	 * removed from global LRU.
	 */
	mz = page_cgroup_zoneinfo(pc);
	MEM_CGROUP_ZSTAT(mz, lru) -= 1;
	if (mem_cgroup_is_root(pc->mem_cgroup))
		return;
	VM_BUG_ON(list_empty(&pc->lru));
	list_del_init(&pc->lru);
	return;
}

void mem_cgroup_del_lru(struct page *page)
{
	mem_cgroup_del_lru_list(page, page_lru(page));
}

void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
{
	struct mem_cgroup_per_zone *mz;
	struct page_cgroup *pc;

	if (mem_cgroup_disabled())
		return;

	pc = lookup_page_cgroup(page);
	/*
	 * Used bit is set without atomic ops but after smp_wmb().
	 * For making pc->mem_cgroup visible, insert smp_rmb() here.
	 */
	smp_rmb();
	/* unused or root page is not rotated. */
	if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup))
		return;
	mz = page_cgroup_zoneinfo(pc);
	list_move(&pc->lru, &mz->lists[lru]);
}

void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
{
	struct page_cgroup *pc;
	struct mem_cgroup_per_zone *mz;

	if (mem_cgroup_disabled())
		return;
	pc = lookup_page_cgroup(page);
	VM_BUG_ON(PageCgroupAcctLRU(pc));
	/*
	 * Used bit is set without atomic ops but after smp_wmb().
	 * For making pc->mem_cgroup visible, insert smp_rmb() here.
	 */
	smp_rmb();
	if (!PageCgroupUsed(pc))
		return;

	mz = page_cgroup_zoneinfo(pc);
	MEM_CGROUP_ZSTAT(mz, lru) += 1;
	SetPageCgroupAcctLRU(pc);
	if (mem_cgroup_is_root(pc->mem_cgroup))
		return;
	list_add(&pc->lru, &mz->lists[lru]);
}

/*
 * At handling SwapCache, pc->mem_cgroup may be changed while it's linked to
 * the LRU because the page may be reused after it's fully uncharged (because
 * of SwapCache behavior). To handle that, unlink page_cgroup from LRU when
 * charging it again. This function is only used to charge SwapCache.
 * It's done under lock_page and it is expected that zone->lru_lock is
 * never held.
 */
static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page)
{
	unsigned long flags;
	struct zone *zone = page_zone(page);
	struct page_cgroup *pc = lookup_page_cgroup(page);

	spin_lock_irqsave(&zone->lru_lock, flags);
	/*
	 * Forget old LRU when this page_cgroup is *not* used. This Used bit
	 * is guarded by lock_page() because the page is SwapCache.
	 */
	if (!PageCgroupUsed(pc))
		mem_cgroup_del_lru_list(page, page_lru(page));
	spin_unlock_irqrestore(&zone->lru_lock, flags);
}

static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page)
{
	unsigned long flags;
	struct zone *zone = page_zone(page);
	struct page_cgroup *pc = lookup_page_cgroup(page);

	spin_lock_irqsave(&zone->lru_lock, flags);
	/* link when the page is linked to LRU but page_cgroup isn't */
	if (PageLRU(page) && !PageCgroupAcctLRU(pc))
		mem_cgroup_add_lru_list(page, page_lru(page));
	spin_unlock_irqrestore(&zone->lru_lock, flags);
}


void mem_cgroup_move_lists(struct page *page,
			   enum lru_list from, enum lru_list to)
{
	if (mem_cgroup_disabled())
		return;
	mem_cgroup_del_lru_list(page, from);
	mem_cgroup_add_lru_list(page, to);
}

int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
{
	int ret;
	struct mem_cgroup *curr = NULL;
	struct task_struct *p;

	p = find_lock_task_mm(task);
	if (!p)
		return 0;
	curr = try_get_mem_cgroup_from_mm(p->mm);
	task_unlock(p);
	if (!curr)
		return 0;
	/*
	 * We should check use_hierarchy of "mem" not "curr". Because checking
	 * use_hierarchy of "curr" here would make this function return true
	 * if hierarchy is enabled in "curr" and "curr" is a child of "mem" in
	 * the *cgroup* hierarchy (even if use_hierarchy is disabled in "mem").
	 */
	if (mem->use_hierarchy)
		ret = css_is_ancestor(&curr->css, &mem->css);
	else
		ret = (curr == mem);
	css_put(&curr->css);
	return ret;
}

static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
{
	unsigned long active;
	unsigned long inactive;
	unsigned long gb;
	unsigned long inactive_ratio;

	inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON);
	active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON);

	gb = (inactive + active) >> (30 - PAGE_SHIFT);
	if (gb)
		inactive_ratio = int_sqrt(10 * gb);
	else
		inactive_ratio = 1;

	if (present_pages) {
		present_pages[0] = inactive;
		present_pages[1] = active;
	}

	return inactive_ratio;
}

int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
{
	unsigned long active;
	unsigned long inactive;
	unsigned long present_pages[2];
	unsigned long inactive_ratio;

	inactive_ratio = calc_inactive_ratio(memcg, present_pages);

	inactive = present_pages[0];
	active = present_pages[1];

	if (inactive * inactive_ratio < active)
		return 1;

	return 0;
}

int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
{
	unsigned long active;
	unsigned long inactive;

	inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE);
	active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE);

	return (active > inactive);
}

unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
				       struct zone *zone,
				       enum lru_list lru)
{
	int nid = zone_to_nid(zone);
	int zid = zone_idx(zone);
	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);

	return MEM_CGROUP_ZSTAT(mz, lru);
}

struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
						      struct zone *zone)
{
	int nid = zone_to_nid(zone);
	int zid = zone_idx(zone);
	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);

	return &mz->reclaim_stat;
}

struct zone_reclaim_stat *
mem_cgroup_get_reclaim_stat_from_page(struct page *page)
{
	struct page_cgroup *pc;
	struct mem_cgroup_per_zone *mz;

	if (mem_cgroup_disabled())
		return NULL;

	pc = lookup_page_cgroup(page);
	/*
	 * Used bit is set without atomic ops but after smp_wmb().
	 * For making pc->mem_cgroup visible, insert smp_rmb() here.
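	 * (This read barrier pairs with the write barrier issued before the
	 * Used bit is set when a charge is committed.)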
	 */
	smp_rmb();
	if (!PageCgroupUsed(pc))
		return NULL;

	mz = page_cgroup_zoneinfo(pc);
	if (!mz)
		return NULL;

	return &mz->reclaim_stat;
}

unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
					struct list_head *dst,
					unsigned long *scanned, int order,
					int mode, struct zone *z,
					struct mem_cgroup *mem_cont,
					int active, int file)
{
	unsigned long nr_taken = 0;
	struct page *page;
	unsigned long scan;
	LIST_HEAD(pc_list);
	struct list_head *src;
	struct page_cgroup *pc, *tmp;
	int nid = zone_to_nid(z);
	int zid = zone_idx(z);
	struct mem_cgroup_per_zone *mz;
	int lru = LRU_FILE * file + active;
	int ret;

	BUG_ON(!mem_cont);
	mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
	src = &mz->lists[lru];

	scan = 0;
	list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
		if (scan >= nr_to_scan)
			break;

		page = pc->page;
		if (unlikely(!PageCgroupUsed(pc)))
			continue;
		if (unlikely(!PageLRU(page)))
			continue;

		scan++;
		ret = __isolate_lru_page(page, mode, file);
		switch (ret) {
		case 0:
			list_move(&page->lru, dst);
			mem_cgroup_del_lru(page);
			nr_taken += hpage_nr_pages(page);
			break;
		case -EBUSY:
			/* we don't affect global LRU but rotate in our LRU */
			mem_cgroup_rotate_lru_list(page, page_lru(page));
			break;
		default:
			break;
		}
	}

	*scanned = scan;

	trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
				      0, 0, 0, mode);

	return nr_taken;
}

#define mem_cgroup_from_res_counter(counter, member)	\
	container_of(counter, struct mem_cgroup, member)

static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
{
	if (do_swap_account) {
		if (res_counter_check_under_limit(&mem->res) &&
			res_counter_check_under_limit(&mem->memsw))
			return true;
	} else
		if (res_counter_check_under_limit(&mem->res))
			return true;
	return false;
}

static unsigned int get_swappiness(struct mem_cgroup *memcg)
{
	struct cgroup *cgrp = memcg->css.cgroup;
	unsigned int swappiness;

	/* root ? */
	if (cgrp->parent == NULL)
		return vm_swappiness;

	spin_lock(&memcg->reclaim_param_lock);
	swappiness = memcg->swappiness;
	spin_unlock(&memcg->reclaim_param_lock);

	return swappiness;
}

static void mem_cgroup_start_move(struct mem_cgroup *mem)
{
	int cpu;

	get_online_cpus();
	spin_lock(&mem->pcp_counter_lock);
	for_each_online_cpu(cpu)
		per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
	mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1;
	spin_unlock(&mem->pcp_counter_lock);
	put_online_cpus();

	synchronize_rcu();
}

static void mem_cgroup_end_move(struct mem_cgroup *mem)
{
	int cpu;

	if (!mem)
		return;
	get_online_cpus();
	spin_lock(&mem->pcp_counter_lock);
	for_each_online_cpu(cpu)
		per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
	mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1;
	spin_unlock(&mem->pcp_counter_lock);
	put_online_cpus();
}
/*
 * 2 routines for checking "mem" is under move_account() or not.
 *
 * mem_cgroup_stealed() - checking a cgroup is mc.from or not. This is used
 *			  for avoiding race in accounting.
 *			  If true, pc->mem_cgroup may be overwritten.
 *
 * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or
 *			  under hierarchy of moving cgroups. This is for
 *			  waiting at high memory pressure caused by "move".
 */

static bool mem_cgroup_stealed(struct mem_cgroup *mem)
{
	VM_BUG_ON(!rcu_read_lock_held());
	return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0;
}

static bool mem_cgroup_under_move(struct mem_cgroup *mem)
{
	struct mem_cgroup *from;
	struct mem_cgroup *to;
	bool ret = false;
	/*
	 * Unlike task_move routines, we access mc.to, mc.from not under
	 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
	 */
	spin_lock(&mc.lock);
	from = mc.from;
	to = mc.to;
	if (!from)
		goto unlock;
	if (from == mem || to == mem
	    || (mem->use_hierarchy && css_is_ancestor(&from->css, &mem->css))
	    || (mem->use_hierarchy && css_is_ancestor(&to->css, &mem->css)))
		ret = true;
unlock:
	spin_unlock(&mc.lock);
	return ret;
}

static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem)
{
	if (mc.moving_task && current != mc.moving_task) {
		if (mem_cgroup_under_move(mem)) {
			DEFINE_WAIT(wait);
			prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
			/* moving charge context might have finished. */
			if (mc.moving_task)
				schedule();
			finish_wait(&mc.waitq, &wait);
			return true;
		}
	}
	return false;
}

/**
 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
 * @memcg: The memory cgroup that went over limit
 * @p: Task that is going to be killed
 *
 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
 * enabled
 */
void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
{
	struct cgroup *task_cgrp;
	struct cgroup *mem_cgrp;
	/*
	 * Need a buffer in BSS, can't rely on allocations. The code relies
	 * on the assumption that OOM is serialized for memory controller.
	 * If this assumption is broken, revisit this code.
	 */
	static char memcg_name[PATH_MAX];
	int ret;

	if (!memcg || !p)
		return;


	rcu_read_lock();

	mem_cgrp = memcg->css.cgroup;
	task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);

	ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
	if (ret < 0) {
		/*
		 * Unfortunately, we are unable to convert to a useful name,
		 * but we'll still print out the usage information
		 */
		rcu_read_unlock();
		goto done;
	}
	rcu_read_unlock();

	printk(KERN_INFO "Task in %s killed", memcg_name);

	rcu_read_lock();
	ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
	if (ret < 0) {
		rcu_read_unlock();
		goto done;
	}
	rcu_read_unlock();

	/*
	 * Continues from above, so we don't need a KERN_ level
	 */
	printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
done:

	printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
		res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
		res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
		res_counter_read_u64(&memcg->res, RES_FAILCNT));
	printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
		"failcnt %llu\n",
		res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
		res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
		res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
}

/*
 * This function returns the number of memcg under hierarchy tree. Returns
 * 1(self count) if no children.
 */
static int mem_cgroup_count_children(struct mem_cgroup *mem)
{
	int num = 0;
	struct mem_cgroup *iter;

	for_each_mem_cgroup_tree(iter, mem)
		num++;
	return num;
}

/*
 * Return the memory (and swap, if configured) limit for a memcg.
 */
u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
{
	u64 limit;
	u64 memsw;

	limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
	limit += total_swap_pages << PAGE_SHIFT;

	memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
	/*
	 * If memsw is finite and limits the amount of swap space available
	 * to this memcg, return that limit.
	 */
	return min(limit, memsw);
}

/*
 * Visit the first child (need not be the first child as per the ordering
 * of the cgroup list, since we track last_scanned_child) of @mem and use
 * that to reclaim free pages from.
 */
static struct mem_cgroup *
mem_cgroup_select_victim(struct mem_cgroup *root_mem)
{
	struct mem_cgroup *ret = NULL;
	struct cgroup_subsys_state *css;
	int nextid, found;

	if (!root_mem->use_hierarchy) {
		css_get(&root_mem->css);
		ret = root_mem;
	}

	while (!ret) {
		rcu_read_lock();
		nextid = root_mem->last_scanned_child + 1;
		css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css,
				   &found);
		if (css && css_tryget(css))
			ret = container_of(css, struct mem_cgroup, css);

		rcu_read_unlock();
		/* Updates scanning parameter */
		spin_lock(&root_mem->reclaim_param_lock);
		if (!css) {
			/* this means start scan from ID:1 */
			root_mem->last_scanned_child = 0;
		} else
			root_mem->last_scanned_child = found;
		spin_unlock(&root_mem->reclaim_param_lock);
	}

	return ret;
}

/*
 * Scan the hierarchy if needed to reclaim memory.
 * We remember the last child we reclaimed from, so that we don't end up
 * penalizing one child extensively based on its position in the children list.
 *
 * root_mem is the original ancestor that we've been reclaiming from.
 *
 * We give up and return to the caller when we visit root_mem twice.
 * (other groups can be removed while we're walking....)
 *
 * If shrink==true, this returns immediately to avoid freeing too much.
 */
static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
						struct zone *zone,
						gfp_t gfp_mask,
						unsigned long reclaim_options)
{
	struct mem_cgroup *victim;
	int ret, total = 0;
	int loop = 0;
	bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
	bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
	bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
	unsigned long excess = mem_cgroup_get_excess(root_mem);

	/* If memsw_is_minimum==1, swap-out is of no use. */
	if (root_mem->memsw_is_minimum)
		noswap = true;

	while (1) {
		victim = mem_cgroup_select_victim(root_mem);
		if (victim == root_mem) {
			loop++;
			if (loop >= 1)
				drain_all_stock_async();
			if (loop >= 2) {
				/*
				 * If we have not been able to reclaim
				 * anything, it might be because there are
				 * no reclaimable pages under this hierarchy
				 */
				if (!check_soft || !total) {
					css_put(&victim->css);
					break;
				}
				/*
				 * We want to do more targeted reclaim.
				 * excess >> 2 is not too excessive so as to
				 * reclaim too much, nor too little that we
				 * keep coming back to reclaim from this cgroup
				 */
				if (total >= (excess >> 2) ||
					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
					css_put(&victim->css);
					break;
				}
			}
		}
		if (!mem_cgroup_local_usage(victim)) {
			/* this cgroup's local usage == 0 */
			css_put(&victim->css);
			continue;
		}
		/* we use swappiness of local cgroup */
		if (check_soft)
			ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
				noswap, get_swappiness(victim), zone);
		else
			ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
						noswap, get_swappiness(victim));
		css_put(&victim->css);
		/*
		 * At shrinking usage, we can't check whether we should stop
		 * here or reclaim more. It depends on callers.
		 * last_scanned_child will work well enough for keeping
		 * fairness under the tree.
		 */
		if (shrink)
			return ret;
		total += ret;
		if (check_soft) {
			if (res_counter_check_under_soft_limit(&root_mem->res))
				return total;
		} else if (mem_cgroup_check_under_limit(root_mem))
			return 1 + total;
	}
	return total;
}

/*
 * Check OOM-Killer is already running under our hierarchy.
 * If someone is running, return false.
 */
static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
{
	int x, lock_count = 0;
	struct mem_cgroup *iter;

	for_each_mem_cgroup_tree(iter, mem) {
		x = atomic_inc_return(&iter->oom_lock);
		lock_count = max(x, lock_count);
	}

	if (lock_count == 1)
		return true;
	return false;
}

static int mem_cgroup_oom_unlock(struct mem_cgroup *mem)
{
	struct mem_cgroup *iter;

	/*
	 * When a new child is created while the hierarchy is under oom,
	 * mem_cgroup_oom_lock() may not be called. We have to use
	 * atomic_add_unless() here.
	 */
	for_each_mem_cgroup_tree(iter, mem)
		atomic_add_unless(&iter->oom_lock, -1, 0);
	return 0;
}


static DEFINE_MUTEX(memcg_oom_mutex);
static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);

struct oom_wait_info {
	struct mem_cgroup *mem;
	wait_queue_t	wait;
};

static int memcg_oom_wake_function(wait_queue_t *wait,
	unsigned mode, int sync, void *arg)
{
	struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg;
	struct oom_wait_info *oom_wait_info;

	oom_wait_info = container_of(wait, struct oom_wait_info, wait);

	if (oom_wait_info->mem == wake_mem)
		goto wakeup;
	/* if no hierarchy, no match */
	if (!oom_wait_info->mem->use_hierarchy || !wake_mem->use_hierarchy)
		return 0;
	/*
	 * Both of oom_wait_info->mem and wake_mem are stable under us.
	 * Then we can use css_is_ancestor without taking care of RCU.
	 */
	if (!css_is_ancestor(&oom_wait_info->mem->css, &wake_mem->css) &&
	    !css_is_ancestor(&wake_mem->css, &oom_wait_info->mem->css))
		return 0;

wakeup:
	return autoremove_wake_function(wait, mode, sync, arg);
}

static void memcg_wakeup_oom(struct mem_cgroup *mem)
{
	/* for filtering, pass "mem" as argument. */
	__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem);
}

static void memcg_oom_recover(struct mem_cgroup *mem)
{
	if (mem && atomic_read(&mem->oom_lock))
		memcg_wakeup_oom(mem);
}

/*
 * try to call OOM killer. returns false if we should exit memory-reclaim loop.
 */
bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
{
	struct oom_wait_info owait;
	bool locked, need_to_kill;

	owait.mem = mem;
	owait.wait.flags = 0;
	owait.wait.func = memcg_oom_wake_function;
	owait.wait.private = current;
	INIT_LIST_HEAD(&owait.wait.task_list);
	need_to_kill = true;
	/* At first, try to OOM lock hierarchy under mem.*/
	mutex_lock(&memcg_oom_mutex);
	locked = mem_cgroup_oom_lock(mem);
	/*
	 * Even if signal_pending(), we can't quit charge() loop without
	 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
	 * under OOM is always welcomed, use TASK_KILLABLE here.
	 */
	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
	if (!locked || mem->oom_kill_disable)
		need_to_kill = false;
	if (locked)
		mem_cgroup_oom_notify(mem);
	mutex_unlock(&memcg_oom_mutex);

	if (need_to_kill) {
		finish_wait(&memcg_oom_waitq, &owait.wait);
		mem_cgroup_out_of_memory(mem, mask);
	} else {
		schedule();
		finish_wait(&memcg_oom_waitq, &owait.wait);
	}
	mutex_lock(&memcg_oom_mutex);
	mem_cgroup_oom_unlock(mem);
	memcg_wakeup_oom(mem);
	mutex_unlock(&memcg_oom_mutex);

	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
		return false;
	/* Give chance to dying process */
	schedule_timeout(1);
	return true;
}

/*
 * Currently used to update mapped file statistics, but the routine can be
 * generalized to update other statistics as well.
 *
 * Notes: Race condition
 *
 * We usually use page_cgroup_lock() for accessing page_cgroup member but
 * it tends to be costly. But considering some conditions, we don't need
 * to do so _always_.
 *
 * Considering "charge", lock_page_cgroup() is not required because all
 * file-stat operations happen after a page is attached to radix-tree. There
 * are no races with "charge".
 *
 * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup
 * at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even
 * if there are races with "uncharge". Statistics itself is properly handled
 * by flags.
 *
 * Considering "move", this is the only case where we see a race. To make the
 * race small, we check the MEM_CGROUP_ON_MOVE percpu value and detect whether
 * there is a possibility of a race condition. If there is, we take a lock.
 */

void mem_cgroup_update_page_stat(struct page *page,
				 enum mem_cgroup_page_stat_item idx, int val)
{
	struct mem_cgroup *mem;
	struct page_cgroup *pc = lookup_page_cgroup(page);
	bool need_unlock = false;
	unsigned long uninitialized_var(flags);

	if (unlikely(!pc))
		return;

	rcu_read_lock();
	mem = pc->mem_cgroup;
	if (unlikely(!mem || !PageCgroupUsed(pc)))
		goto out;
	/* pc->mem_cgroup is unstable ? */
	if (unlikely(mem_cgroup_stealed(mem))) {
		/* take a lock against to access pc->mem_cgroup */
		move_lock_page_cgroup(pc, &flags);
		need_unlock = true;
		mem = pc->mem_cgroup;
		if (!mem || !PageCgroupUsed(pc))
			goto out;
	}

	switch (idx) {
	case MEMCG_NR_FILE_MAPPED:
		if (val > 0)
			SetPageCgroupFileMapped(pc);
		else if (!page_mapped(page))
			ClearPageCgroupFileMapped(pc);
		idx = MEM_CGROUP_STAT_FILE_MAPPED;
		break;
	default:
		BUG();
	}

	this_cpu_add(mem->stat->count[idx], val);

out:
	if (unlikely(need_unlock))
		move_unlock_page_cgroup(pc, &flags);
	rcu_read_unlock();
	return;
}
EXPORT_SYMBOL(mem_cgroup_update_page_stat);

/*
 * size of first charge trial. "32" comes from vmscan.c's magic value.
 * TODO: maybe necessary to use big numbers in big irons.
 */
#define CHARGE_SIZE	(32 * PAGE_SIZE)
struct memcg_stock_pcp {
	struct mem_cgroup *cached; /* this is never the root cgroup */
	int charge;
	struct work_struct work;
};
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
static atomic_t memcg_drain_count;

/*
 * Try to consume stocked charge on this cpu. If successful, PAGE_SIZE is
 * consumed from the local stock and true is returned. If the stock is 0 or
 * the charges are from a cgroup which is not the current target, false is
 * returned. This stock will be refilled.
 */
static bool consume_stock(struct mem_cgroup *mem)
{
	struct memcg_stock_pcp *stock;
	bool ret = true;

	stock = &get_cpu_var(memcg_stock);
	if (mem == stock->cached && stock->charge)
		stock->charge -= PAGE_SIZE;
	else /* need to call res_counter_charge */
		ret = false;
	put_cpu_var(memcg_stock);
	return ret;
}

/*
 * Returns stocks cached in percpu to res_counter and resets cached information.
 */
static void drain_stock(struct memcg_stock_pcp *stock)
{
	struct mem_cgroup *old = stock->cached;

	if (stock->charge) {
		res_counter_uncharge(&old->res, stock->charge);
		if (do_swap_account)
			res_counter_uncharge(&old->memsw, stock->charge);
	}
	stock->cached = NULL;
	stock->charge = 0;
}

/*
 * This must be called under preempt disabled or must be called by
 * a thread which is pinned to local cpu.
 */
static void drain_local_stock(struct work_struct *dummy)
{
	struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
	drain_stock(stock);
}

/*
 * Cache charges(val) which is from res_counter, to local per_cpu area.
 * This will be consumed by consume_stock() function, later.
 */
static void refill_stock(struct mem_cgroup *mem, int val)
{
	struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);

	if (stock->cached != mem) { /* reset if necessary */
		drain_stock(stock);
		stock->cached = mem;
	}
	stock->charge += val;
	put_cpu_var(memcg_stock);
}

/*
 * Tries to drain stocked charges in other cpus. This function is asynchronous
 * and just puts a work item per cpu for draining locally on each cpu. Callers
 * can expect some charges to come back to res_counter later but cannot wait
 * for it.
 */
static void drain_all_stock_async(void)
{
	int cpu;
	/* This function is for scheduling "drain" in asynchronous way.
	 * The result of "drain" is not directly handled by callers. Then,
	 * if someone is calling drain, we don't have to call drain more.
	 * Anyway, WORK_STRUCT_PENDING check in queue_work_on() will catch if
	 * there is a race. We just do loose check here.
	 */
	if (atomic_read(&memcg_drain_count))
		return;
	/* Notify other cpus that system-wide "drain" is running */
	atomic_inc(&memcg_drain_count);
	get_online_cpus();
	for_each_online_cpu(cpu) {
		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
		schedule_work_on(cpu, &stock->work);
	}
	put_online_cpus();
	atomic_dec(&memcg_drain_count);
	/* We don't wait for flush_work */
}

/* This is a synchronous drain interface. */
static void drain_all_stock_sync(void)
{
	/* called when force_empty is called */
	atomic_inc(&memcg_drain_count);
	schedule_on_each_cpu(drain_local_stock);
	atomic_dec(&memcg_drain_count);
}

/*
 * This function drains percpu counter value from DEAD cpu and
 * moves it to local cpu. Note that this function can be preempted.
 */
static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu)
{
	int i;

	spin_lock(&mem->pcp_counter_lock);
	for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) {
		s64 x = per_cpu(mem->stat->count[i], cpu);

		per_cpu(mem->stat->count[i], cpu) = 0;
		mem->nocpu_base.count[i] += x;
	}
	/* need to clear ON_MOVE value, works as a kind of lock.
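	 * A non-zero ON_MOVE count on a cpu tells readers that an account
	 * move is in flight and that pc->mem_cgroup may change under them.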
	 */
	per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0;
	spin_unlock(&mem->pcp_counter_lock);
}

static void synchronize_mem_cgroup_on_move(struct mem_cgroup *mem, int cpu)
{
	int idx = MEM_CGROUP_ON_MOVE;

	spin_lock(&mem->pcp_counter_lock);
	per_cpu(mem->stat->count[idx], cpu) = mem->nocpu_base.count[idx];
	spin_unlock(&mem->pcp_counter_lock);
}

static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
					unsigned long action,
					void *hcpu)
{
	int cpu = (unsigned long)hcpu;
	struct memcg_stock_pcp *stock;
	struct mem_cgroup *iter;

	if ((action == CPU_ONLINE)) {
		for_each_mem_cgroup_all(iter)
			synchronize_mem_cgroup_on_move(iter, cpu);
		return NOTIFY_OK;
	}

	if ((action != CPU_DEAD) && (action != CPU_DEAD_FROZEN))
		return NOTIFY_OK;

	for_each_mem_cgroup_all(iter)
		mem_cgroup_drain_pcp_counter(iter, cpu);

	stock = &per_cpu(memcg_stock, cpu);
	drain_stock(stock);
	return NOTIFY_OK;
}


/* See __mem_cgroup_try_charge() for details */
enum {
	CHARGE_OK,		/* success */
	CHARGE_RETRY,		/* need to retry but retry is not bad */
	CHARGE_NOMEM,		/* we can't do more. return -ENOMEM */
	CHARGE_WOULDBLOCK,	/* __GFP_WAIT wasn't set and not enough res. */
	CHARGE_OOM_DIE,		/* the current is killed because of OOM */
};

static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
				int csize, bool oom_check)
{
	struct mem_cgroup *mem_over_limit;
	struct res_counter *fail_res;
	unsigned long flags = 0;
	int ret;

	ret = res_counter_charge(&mem->res, csize, &fail_res);

	if (likely(!ret)) {
		if (!do_swap_account)
			return CHARGE_OK;
		ret = res_counter_charge(&mem->memsw, csize, &fail_res);
		if (likely(!ret))
			return CHARGE_OK;

		mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
		flags |= MEM_CGROUP_RECLAIM_NOSWAP;
	} else
		mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);

	if (csize > PAGE_SIZE) /* change csize and retry */
		return CHARGE_RETRY;

	if (!(gfp_mask & __GFP_WAIT))
		return CHARGE_WOULDBLOCK;

	ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
					gfp_mask, flags);
	/*
	 * try_to_free_mem_cgroup_pages() might not give us a full
	 * picture of reclaim. Some pages are reclaimed and might be
	 * moved to swap cache or just unmapped from the cgroup.
	 * Check the limit again to see if the reclaim reduced the
	 * current usage of the cgroup before giving up
	 */
	if (ret || mem_cgroup_check_under_limit(mem_over_limit))
		return CHARGE_RETRY;

	/*
	 * At task move, charge accounts can be doubly counted. So, it's
	 * better to wait until the end of task_move if something is going on.
	 */
	if (mem_cgroup_wait_acct_move(mem_over_limit))
		return CHARGE_RETRY;

	/* If we don't need to call the oom-killer at all, return immediately */
	if (!oom_check)
		return CHARGE_NOMEM;
	/* check OOM */
	if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask))
		return CHARGE_OOM_DIE;

	return CHARGE_RETRY;
}

/*
 * Unlike the exported interface, an "oom" parameter is added. If oom==true,
 * the oom-killer can be invoked.
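 *
 * Returns 0 if the charge succeeded or was bypassed (*memcg is then the
 * charged memcg, or NULL when bypassed), and -ENOMEM if the charge failed.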
1888 */ 1889static int __mem_cgroup_try_charge(struct mm_struct *mm, 1890 gfp_t gfp_mask, 1891 struct mem_cgroup **memcg, bool oom, 1892 int page_size) 1893{ 1894 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 1895 struct mem_cgroup *mem = NULL; 1896 int ret; 1897 int csize = max(CHARGE_SIZE, (unsigned long) page_size); 1898 1899 /* 1900 * Unlike the global VM's OOM-kill, we're not in a system-level memory 1901 * shortage here. So, allow a dying process to proceed, in addition to a 1902 * MEMDIE process. 1903 */ 1904 if (unlikely(test_thread_flag(TIF_MEMDIE) 1905 || fatal_signal_pending(current))) 1906 goto bypass; 1907 1908 /* 1909 * We always charge the cgroup the mm_struct belongs to. 1910 * The mm_struct's mem_cgroup changes on task migration if the 1911 * thread group leader migrates. It's possible that mm is not 1912 * set; if so, charge the init_mm (happens for pagecache usage). 1913 */ 1914 if (!*memcg && !mm) 1915 goto bypass; 1916again: 1917 if (*memcg) { /* css should be a valid one */ 1918 mem = *memcg; 1919 VM_BUG_ON(css_is_removed(&mem->css)); 1920 if (mem_cgroup_is_root(mem)) 1921 goto done; 1922 if (page_size == PAGE_SIZE && consume_stock(mem)) 1923 goto done; 1924 css_get(&mem->css); 1925 } else { 1926 struct task_struct *p; 1927 1928 rcu_read_lock(); 1929 p = rcu_dereference(mm->owner); 1930 /* 1931 * Because we don't have task_lock(), "p" can exit. 1932 * In that case, "mem" can point to root or p can be NULL due to a 1933 * race with swapoff. Then, we have a small risk of mis-accounting. 1934 * But that kind of mis-accounting by races always happens because 1935 * we don't hold cgroup_mutex(); taking it would be overkill, so we 1936 * allow that small race here. 1937 * (*) swapoff etc. will charge against the mm_struct, not against 1938 * the task_struct. So, mm->owner can be NULL. 1939 */ 1940 mem = mem_cgroup_from_task(p); 1941 if (!mem || mem_cgroup_is_root(mem)) { 1942 rcu_read_unlock(); 1943 goto done; 1944 } 1945 if (page_size == PAGE_SIZE && consume_stock(mem)) { 1946 /* 1947 * It seems dangerous to access memcg without css_get(). 1948 * But considering how consume_stock works, it's not 1949 * necessary. If consume_stock succeeds, some charges 1950 * from this memcg are cached on this cpu. So, we 1951 * don't need to call css_get()/css_tryget() before 1952 * calling consume_stock(). 1953 */ 1954 rcu_read_unlock(); 1955 goto done; 1956 } 1957 /* after this point, we may block;
we need to take a refcount. */ 1958 if (!css_tryget(&mem->css)) { 1959 rcu_read_unlock(); 1960 goto again; 1961 } 1962 rcu_read_unlock(); 1963 } 1964 1965 do { 1966 bool oom_check; 1967 1968 /* If killed, bypass charge */ 1969 if (fatal_signal_pending(current)) { 1970 css_put(&mem->css); 1971 goto bypass; 1972 } 1973 1974 oom_check = false; 1975 if (oom && !nr_oom_retries) { 1976 oom_check = true; 1977 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 1978 } 1979 1980 ret = __mem_cgroup_do_charge(mem, gfp_mask, csize, oom_check); 1981 1982 switch (ret) { 1983 case CHARGE_OK: 1984 break; 1985 case CHARGE_RETRY: /* not in OOM situation but retry */ 1986 csize = page_size; 1987 css_put(&mem->css); 1988 mem = NULL; 1989 goto again; 1990 case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ 1991 css_put(&mem->css); 1992 goto nomem; 1993 case CHARGE_NOMEM: /* OOM routine works */ 1994 if (!oom) { 1995 css_put(&mem->css); 1996 goto nomem; 1997 } 1998 /* If oom, we never return -ENOMEM */ 1999 nr_oom_retries--; 2000 break; 2001 case CHARGE_OOM_DIE: /* Killed by OOM Killer */ 2002 css_put(&mem->css); 2003 goto bypass; 2004 } 2005 } while (ret != CHARGE_OK); 2006 2007 if (csize > page_size) 2008 refill_stock(mem, csize - page_size); 2009 css_put(&mem->css); 2010done: 2011 *memcg = mem; 2012 return 0; 2013nomem: 2014 *memcg = NULL; 2015 return -ENOMEM; 2016bypass: 2017 *memcg = NULL; 2018 return 0; 2019} 2020 2021/* 2022 * Sometimes we have to undo a charge we got by try_charge(). 2023 * This function is for that: it uncharges and puts the css refcount 2024 * obtained by try_charge(). 2025 */ 2026static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem, 2027 unsigned long count) 2028{ 2029 if (!mem_cgroup_is_root(mem)) { 2030 res_counter_uncharge(&mem->res, PAGE_SIZE * count); 2031 if (do_swap_account) 2032 res_counter_uncharge(&mem->memsw, PAGE_SIZE * count); 2033 } 2034} 2035 2036static void mem_cgroup_cancel_charge(struct mem_cgroup *mem, 2037 int page_size) 2038{ 2039 __mem_cgroup_cancel_charge(mem, page_size >> PAGE_SHIFT); 2040} 2041 2042/* 2043 * A helper function to get a mem_cgroup from an ID. Must be called under 2044 * rcu_read_lock(). The caller must check css_is_removed() or similar if 2045 * that is a concern. (Dropping a refcnt from swap can be called against a 2046 * removed memcg.) 2047 */ 2048static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) 2049{ 2050 struct cgroup_subsys_state *css; 2051 2052 /* ID 0 is unused ID */ 2053 if (!id) 2054 return NULL; 2055 css = css_lookup(&mem_cgroup_subsys, id); 2056 if (!css) 2057 return NULL; 2058 return container_of(css, struct mem_cgroup, css); 2059} 2060 2061struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) 2062{ 2063 struct mem_cgroup *mem = NULL; 2064 struct page_cgroup *pc; 2065 unsigned short id; 2066 swp_entry_t ent; 2067 2068 VM_BUG_ON(!PageLocked(page)); 2069 2070 pc = lookup_page_cgroup(page); 2071 lock_page_cgroup(pc); 2072 if (PageCgroupUsed(pc)) { 2073 mem = pc->mem_cgroup; 2074 if (mem && !css_tryget(&mem->css)) 2075 mem = NULL; 2076 } else if (PageSwapCache(page)) { 2077 ent.val = page_private(page); 2078 id = lookup_swap_cgroup(ent); 2079 rcu_read_lock(); 2080 mem = mem_cgroup_lookup(id); 2081 if (mem && !css_tryget(&mem->css)) 2082 mem = NULL; 2083 rcu_read_unlock(); 2084 } 2085 unlock_page_cgroup(pc); 2086 return mem; 2087} 2088 2089/* 2090 * commit a charge obtained by __mem_cgroup_try_charge() and make the 2091 * page_cgroup USED. If already USED, uncharge and return.
2092 */ 2093static void ____mem_cgroup_commit_charge(struct mem_cgroup *mem, 2094 struct page_cgroup *pc, 2095 enum charge_type ctype) 2096{ 2097 pc->mem_cgroup = mem; 2098 /* 2099 * We access a page_cgroup asynchronously without lock_page_cgroup(). 2100 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup 2101 * is accessed after testing the USED bit. To make pc->mem_cgroup visible 2102 * before the USED bit, we need a memory barrier here. 2103 * See mem_cgroup_add_lru_list(), etc. 2104 */ 2105 smp_wmb(); 2106 switch (ctype) { 2107 case MEM_CGROUP_CHARGE_TYPE_CACHE: 2108 case MEM_CGROUP_CHARGE_TYPE_SHMEM: 2109 SetPageCgroupCache(pc); 2110 SetPageCgroupUsed(pc); 2111 break; 2112 case MEM_CGROUP_CHARGE_TYPE_MAPPED: 2113 ClearPageCgroupCache(pc); 2114 SetPageCgroupUsed(pc); 2115 break; 2116 default: 2117 break; 2118 } 2119 2120 mem_cgroup_charge_statistics(mem, pc, true); 2121} 2122 2123static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, 2124 struct page_cgroup *pc, 2125 enum charge_type ctype, 2126 int page_size) 2127{ 2128 int i; 2129 int count = page_size >> PAGE_SHIFT; 2130 2131 /* try_charge() can have returned NULL in *memcg; handle that here. */ 2132 if (!mem) 2133 return; 2134 2135 lock_page_cgroup(pc); 2136 if (unlikely(PageCgroupUsed(pc))) { 2137 unlock_page_cgroup(pc); 2138 mem_cgroup_cancel_charge(mem, page_size); 2139 return; 2140 } 2141 2142 /* 2143 * we don't need page_cgroup_lock for tail pages, because they are not 2144 * accessed by any other context at this point. 2145 */ 2146 for (i = 0; i < count; i++) 2147 ____mem_cgroup_commit_charge(mem, pc + i, ctype); 2148 2149 unlock_page_cgroup(pc); 2150 /* 2151 * "charge_statistics" updated the event counter, so check it now. 2152 * Insert the ancestor (and the ancestor's ancestors) into the softlimit 2153 * RB-tree if they exceed their soft limit. 2154 */ 2155 memcg_check_events(mem, pc->page); 2156} 2157 2158/** 2159 * __mem_cgroup_move_account - move account of the page 2160 * @pc: page_cgroup of the page. 2161 * @from: mem_cgroup which the page is moved from. 2162 * @to: mem_cgroup which the page is moved to. @from != @to. 2163 * @uncharge: whether we should call uncharge and css_put against @from. 2164 * 2165 * The caller must confirm the following. 2166 * - page is not on LRU (isolate_page() is useful.) 2167 * - the pc is locked, used, and ->mem_cgroup points to @from. 2168 * 2169 * This function doesn't do "charge" nor css_get to the new cgroup. That should 2170 * be done by the caller (__mem_cgroup_try_charge would be useful). If @uncharge 2171 * is true, this function does "uncharge" from the old cgroup; if @uncharge is 2172 * false, it doesn't, so the caller should do the "uncharge". 2173 */ 2174 2175static void __mem_cgroup_move_account(struct page_cgroup *pc, 2176 struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) 2177{ 2178 VM_BUG_ON(from == to); 2179 VM_BUG_ON(PageLRU(pc->page)); 2180 VM_BUG_ON(!page_is_cgroup_locked(pc)); 2181 VM_BUG_ON(!PageCgroupUsed(pc)); 2182 VM_BUG_ON(pc->mem_cgroup != from); 2183 2184 if (PageCgroupFileMapped(pc)) { 2185 /* Update mapped_file data for mem_cgroup */ 2186 preempt_disable(); 2187 __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 2188 __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 2189 preempt_enable(); 2190 } 2191 mem_cgroup_charge_statistics(from, pc, false); 2192 if (uncharge) 2193 /* This is not a "cancel", but cancel_charge does all we need.
*/ 2194 mem_cgroup_cancel_charge(from, PAGE_SIZE); 2195 2196 /* caller should have done css_get */ 2197 pc->mem_cgroup = to; 2198 mem_cgroup_charge_statistics(to, pc, true); 2199 /* 2200 * We charge against "to", which may not have any tasks. Then, "to" 2201 * can be under rmdir(). But in the current implementation, the callers 2202 * of this function are just force_empty() and move charge, so it's 2203 * guaranteed that "to" is never removed. So, we don't check rmdir 2204 * status here. 2205 */ 2206} 2207 2208/* 2209 * check whether @pc is valid for moving the account and call 2210 * __mem_cgroup_move_account() 2211 */ 2212static int mem_cgroup_move_account(struct page_cgroup *pc, 2213 struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) 2214{ 2215 int ret = -EINVAL; 2216 unsigned long flags; 2217 2218 lock_page_cgroup(pc); 2219 if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { 2220 move_lock_page_cgroup(pc, &flags); 2221 __mem_cgroup_move_account(pc, from, to, uncharge); 2222 move_unlock_page_cgroup(pc, &flags); 2223 ret = 0; 2224 } 2225 unlock_page_cgroup(pc); 2226 /* 2227 * check events 2228 */ 2229 memcg_check_events(to, pc->page); 2230 memcg_check_events(from, pc->page); 2231 return ret; 2232} 2233 2234/* 2235 * move charges to its parent. 2236 */ 2237 2238static int mem_cgroup_move_parent(struct page_cgroup *pc, 2239 struct mem_cgroup *child, 2240 gfp_t gfp_mask) 2241{ 2242 struct page *page = pc->page; 2243 struct cgroup *cg = child->css.cgroup; 2244 struct cgroup *pcg = cg->parent; 2245 struct mem_cgroup *parent; 2246 int ret; 2247 2248 /* Is ROOT ? */ 2249 if (!pcg) 2250 return -EINVAL; 2251 2252 ret = -EBUSY; 2253 if (!get_page_unless_zero(page)) 2254 goto out; 2255 if (isolate_lru_page(page)) 2256 goto put; 2257 2258 parent = mem_cgroup_from_cont(pcg); 2259 ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, 2260 PAGE_SIZE); 2261 if (ret || !parent) 2262 goto put_back; 2263 2264 ret = mem_cgroup_move_account(pc, child, parent, true); 2265 if (ret) 2266 mem_cgroup_cancel_charge(parent, PAGE_SIZE); 2267put_back: 2268 putback_lru_page(page); 2269put: 2270 put_page(page); 2271out: 2272 return ret; 2273} 2274 2275/* 2276 * Charge the memory controller for page usage. 2277 * Return 2278 * 0 if the charge was successful 2279 * < 0 if the cgroup is over its limit 2280 */ 2281static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, 2282 gfp_t gfp_mask, enum charge_type ctype) 2283{ 2284 struct mem_cgroup *mem = NULL; 2285 struct page_cgroup *pc; 2286 int ret; 2287 int page_size = PAGE_SIZE; 2288 2289 if (PageTransHuge(page)) { 2290 page_size <<= compound_order(page); 2291 VM_BUG_ON(!PageTransHuge(page)); 2292 } 2293 2294 pc = lookup_page_cgroup(page); 2295 /* can happen at boot */ 2296 if (unlikely(!pc)) 2297 return 0; 2298 prefetchw(pc); 2299 2300 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page_size); 2301 if (ret || !mem) 2302 return ret; 2303 2304 __mem_cgroup_commit_charge(mem, pc, ctype, page_size); 2305 return 0; 2306} 2307 2308int mem_cgroup_newpage_charge(struct page *page, 2309 struct mm_struct *mm, gfp_t gfp_mask) 2310{ 2311 if (mem_cgroup_disabled()) 2312 return 0; 2313 /* 2314 * If already mapped, we don't have to account. 2315 * If page cache, page->mapping has an address_space. 2316 * But page->mapping may hold a stale anon_vma pointer; detect that 2317 * with the PageAnon() check: a newly-mapped anon page's page->mapping 2318 * is NULL.
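 *
 * (Editor's note, not in the original: for example, a freshly allocated
 *  anonymous page coming from a fault handler typically arrives here with
 *  page->mapping == NULL and page_mapped() false, so it is charged below;
 *  a page-cache page already has page->mapping pointing to its
 *  address_space and is skipped here, being charged through
 *  mem_cgroup_cache_charge() instead.)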
2319 */ 2320 if (page_mapped(page) || (page->mapping && !PageAnon(page))) 2321 return 0; 2322 if (unlikely(!mm)) 2323 mm = &init_mm; 2324 return mem_cgroup_charge_common(page, mm, gfp_mask, 2325 MEM_CGROUP_CHARGE_TYPE_MAPPED); 2326} 2327 2328static void 2329__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, 2330 enum charge_type ctype); 2331 2332int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 2333 gfp_t gfp_mask) 2334{ 2335 int ret; 2336 2337 if (mem_cgroup_disabled()) 2338 return 0; 2339 if (PageCompound(page)) 2340 return 0; 2341 /* 2342 * Corner case handling. This is called from add_to_page_cache() 2343 * in usual. But some FS (shmem) precharges this page before calling it 2344 * and call add_to_page_cache() with GFP_NOWAIT. 2345 * 2346 * For GFP_NOWAIT case, the page may be pre-charged before calling 2347 * add_to_page_cache(). (See shmem.c) check it here and avoid to call 2348 * charge twice. (It works but has to pay a bit larger cost.) 2349 * And when the page is SwapCache, it should take swap information 2350 * into account. This is under lock_page() now. 2351 */ 2352 if (!(gfp_mask & __GFP_WAIT)) { 2353 struct page_cgroup *pc; 2354 2355 pc = lookup_page_cgroup(page); 2356 if (!pc) 2357 return 0; 2358 lock_page_cgroup(pc); 2359 if (PageCgroupUsed(pc)) { 2360 unlock_page_cgroup(pc); 2361 return 0; 2362 } 2363 unlock_page_cgroup(pc); 2364 } 2365 2366 if (unlikely(!mm)) 2367 mm = &init_mm; 2368 2369 if (page_is_file_cache(page)) 2370 return mem_cgroup_charge_common(page, mm, gfp_mask, 2371 MEM_CGROUP_CHARGE_TYPE_CACHE); 2372 2373 /* shmem */ 2374 if (PageSwapCache(page)) { 2375 struct mem_cgroup *mem = NULL; 2376 2377 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); 2378 if (!ret) 2379 __mem_cgroup_commit_charge_swapin(page, mem, 2380 MEM_CGROUP_CHARGE_TYPE_SHMEM); 2381 } else 2382 ret = mem_cgroup_charge_common(page, mm, gfp_mask, 2383 MEM_CGROUP_CHARGE_TYPE_SHMEM); 2384 2385 return ret; 2386} 2387 2388/* 2389 * While swap-in, try_charge -> commit or cancel, the page is locked. 2390 * And when try_charge() successfully returns, one refcnt to memcg without 2391 * struct page_cgroup is acquired. This refcnt will be consumed by 2392 * "commit()" or removed by "cancel()" 2393 */ 2394int mem_cgroup_try_charge_swapin(struct mm_struct *mm, 2395 struct page *page, 2396 gfp_t mask, struct mem_cgroup **ptr) 2397{ 2398 struct mem_cgroup *mem; 2399 int ret; 2400 2401 if (mem_cgroup_disabled()) 2402 return 0; 2403 2404 if (!do_swap_account) 2405 goto charge_cur_mm; 2406 /* 2407 * A racing thread's fault, or swapoff, may have already updated 2408 * the pte, and even removed page from swap cache: in those cases 2409 * do_swap_page()'s pte_same() test will fail; but there's also a 2410 * KSM case which does need to charge the page. 
2411 */ 2412 if (!PageSwapCache(page)) 2413 goto charge_cur_mm; 2414 mem = try_get_mem_cgroup_from_page(page); 2415 if (!mem) 2416 goto charge_cur_mm; 2417 *ptr = mem; 2418 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, PAGE_SIZE); 2419 css_put(&mem->css); 2420 return ret; 2421charge_cur_mm: 2422 if (unlikely(!mm)) 2423 mm = &init_mm; 2424 return __mem_cgroup_try_charge(mm, mask, ptr, true, PAGE_SIZE); 2425} 2426 2427static void 2428__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, 2429 enum charge_type ctype) 2430{ 2431 struct page_cgroup *pc; 2432 2433 if (mem_cgroup_disabled()) 2434 return; 2435 if (!ptr) 2436 return; 2437 cgroup_exclude_rmdir(&ptr->css); 2438 pc = lookup_page_cgroup(page); 2439 mem_cgroup_lru_del_before_commit_swapcache(page); 2440 __mem_cgroup_commit_charge(ptr, pc, ctype, PAGE_SIZE); 2441 mem_cgroup_lru_add_after_commit_swapcache(page); 2442 /* 2443 * Now swap is on-memory. This means this page may be 2444 * counted both as mem and swap....double count. 2445 * Fix it by uncharging from memsw. Basically, this SwapCache is stable 2446 * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page() 2447 * may call delete_from_swap_cache() before reach here. 2448 */ 2449 if (do_swap_account && PageSwapCache(page)) { 2450 swp_entry_t ent = {.val = page_private(page)}; 2451 unsigned short id; 2452 struct mem_cgroup *memcg; 2453 2454 id = swap_cgroup_record(ent, 0); 2455 rcu_read_lock(); 2456 memcg = mem_cgroup_lookup(id); 2457 if (memcg) { 2458 /* 2459 * This recorded memcg can be obsolete one. So, avoid 2460 * calling css_tryget 2461 */ 2462 if (!mem_cgroup_is_root(memcg)) 2463 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 2464 mem_cgroup_swap_statistics(memcg, false); 2465 mem_cgroup_put(memcg); 2466 } 2467 rcu_read_unlock(); 2468 } 2469 /* 2470 * At swapin, we may charge account against cgroup which has no tasks. 2471 * So, rmdir()->pre_destroy() can be called while we do this charge. 2472 * In that case, we need to call pre_destroy() again. check it here. 2473 */ 2474 cgroup_release_and_wakeup_rmdir(&ptr->css); 2475} 2476 2477void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) 2478{ 2479 __mem_cgroup_commit_charge_swapin(page, ptr, 2480 MEM_CGROUP_CHARGE_TYPE_MAPPED); 2481} 2482 2483void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) 2484{ 2485 if (mem_cgroup_disabled()) 2486 return; 2487 if (!mem) 2488 return; 2489 mem_cgroup_cancel_charge(mem, PAGE_SIZE); 2490} 2491 2492static void 2493__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype, 2494 int page_size) 2495{ 2496 struct memcg_batch_info *batch = NULL; 2497 bool uncharge_memsw = true; 2498 /* If swapout, usage of swap doesn't decrease */ 2499 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 2500 uncharge_memsw = false; 2501 2502 batch = ¤t->memcg_batch; 2503 /* 2504 * In usual, we do css_get() when we remember memcg pointer. 2505 * But in this case, we keep res->usage until end of a series of 2506 * uncharges. Then, it's ok to ignore memcg's refcnt. 2507 */ 2508 if (!batch->memcg) 2509 batch->memcg = mem; 2510 /* 2511 * do_batch > 0 when unmapping pages or inode invalidate/truncate. 2512 * In those cases, all pages freed continously can be expected to be in 2513 * the same cgroup and we have chance to coalesce uncharges. 2514 * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE) 2515 * because we want to do uncharge as soon as possible. 
2516 */ 2517 2518 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) 2519 goto direct_uncharge; 2520 2521 if (page_size != PAGE_SIZE) 2522 goto direct_uncharge; 2523 2524 /* 2525 * In typical case, batch->memcg == mem. This means we can 2526 * merge a series of uncharges to an uncharge of res_counter. 2527 * If not, we uncharge res_counter ony by one. 2528 */ 2529 if (batch->memcg != mem) 2530 goto direct_uncharge; 2531 /* remember freed charge and uncharge it later */ 2532 batch->bytes += PAGE_SIZE; 2533 if (uncharge_memsw) 2534 batch->memsw_bytes += PAGE_SIZE; 2535 return; 2536direct_uncharge: 2537 res_counter_uncharge(&mem->res, page_size); 2538 if (uncharge_memsw) 2539 res_counter_uncharge(&mem->memsw, page_size); 2540 if (unlikely(batch->memcg != mem)) 2541 memcg_oom_recover(mem); 2542 return; 2543} 2544 2545/* 2546 * uncharge if !page_mapped(page) 2547 */ 2548static struct mem_cgroup * 2549__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) 2550{ 2551 int i; 2552 int count; 2553 struct page_cgroup *pc; 2554 struct mem_cgroup *mem = NULL; 2555 int page_size = PAGE_SIZE; 2556 2557 if (mem_cgroup_disabled()) 2558 return NULL; 2559 2560 if (PageSwapCache(page)) 2561 return NULL; 2562 2563 if (PageTransHuge(page)) { 2564 page_size <<= compound_order(page); 2565 VM_BUG_ON(!PageTransHuge(page)); 2566 } 2567 2568 count = page_size >> PAGE_SHIFT; 2569 /* 2570 * Check if our page_cgroup is valid 2571 */ 2572 pc = lookup_page_cgroup(page); 2573 if (unlikely(!pc || !PageCgroupUsed(pc))) 2574 return NULL; 2575 2576 lock_page_cgroup(pc); 2577 2578 mem = pc->mem_cgroup; 2579 2580 if (!PageCgroupUsed(pc)) 2581 goto unlock_out; 2582 2583 switch (ctype) { 2584 case MEM_CGROUP_CHARGE_TYPE_MAPPED: 2585 case MEM_CGROUP_CHARGE_TYPE_DROP: 2586 /* See mem_cgroup_prepare_migration() */ 2587 if (page_mapped(page) || PageCgroupMigration(pc)) 2588 goto unlock_out; 2589 break; 2590 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: 2591 if (!PageAnon(page)) { /* Shared memory */ 2592 if (page->mapping && !page_is_file_cache(page)) 2593 goto unlock_out; 2594 } else if (page_mapped(page)) /* Anon */ 2595 goto unlock_out; 2596 break; 2597 default: 2598 break; 2599 } 2600 2601 for (i = 0; i < count; i++) 2602 mem_cgroup_charge_statistics(mem, pc + i, false); 2603 2604 ClearPageCgroupUsed(pc); 2605 /* 2606 * pc->mem_cgroup is not cleared here. It will be accessed when it's 2607 * freed from LRU. This is safe because uncharged page is expected not 2608 * to be reused (freed soon). Exception is SwapCache, it's handled by 2609 * special functions. 2610 */ 2611 2612 unlock_page_cgroup(pc); 2613 /* 2614 * even after unlock, we have mem->res.usage here and this memcg 2615 * will never be freed. 2616 */ 2617 memcg_check_events(mem, page); 2618 if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) { 2619 mem_cgroup_swap_statistics(mem, true); 2620 mem_cgroup_get(mem); 2621 } 2622 if (!mem_cgroup_is_root(mem)) 2623 __do_uncharge(mem, ctype, page_size); 2624 2625 return mem; 2626 2627unlock_out: 2628 unlock_page_cgroup(pc); 2629 return NULL; 2630} 2631 2632void mem_cgroup_uncharge_page(struct page *page) 2633{ 2634 /* early check. 
*/ 2635 if (page_mapped(page)) 2636 return; 2637 if (page->mapping && !PageAnon(page)) 2638 return; 2639 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); 2640} 2641 2642void mem_cgroup_uncharge_cache_page(struct page *page) 2643{ 2644 VM_BUG_ON(page_mapped(page)); 2645 VM_BUG_ON(page->mapping); 2646 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); 2647} 2648 2649/* 2650 * Batch_start/batch_end is called from unmap_page_range/invalidate/truncate. 2651 * In those cases, pages are freed continuously and we can expect them to be 2652 * in the same memcg. Each of these callers itself limits the number of 2653 * pages freed at once, and calls uncharge_start/end() properly. 2654 * This may be called several (nested) times in one context. 2655 */ 2656 2657void mem_cgroup_uncharge_start(void) 2658{ 2659 current->memcg_batch.do_batch++; 2660 /* We can nest. */ 2661 if (current->memcg_batch.do_batch == 1) { 2662 current->memcg_batch.memcg = NULL; 2663 current->memcg_batch.bytes = 0; 2664 current->memcg_batch.memsw_bytes = 0; 2665 } 2666} 2667 2668void mem_cgroup_uncharge_end(void) 2669{ 2670 struct memcg_batch_info *batch = &current->memcg_batch; 2671 2672 if (!batch->do_batch) 2673 return; 2674 2675 batch->do_batch--; 2676 if (batch->do_batch) /* If stacked, do nothing. */ 2677 return; 2678 2679 if (!batch->memcg) 2680 return; 2681 /* 2682 * This "batch->memcg" is valid without any css_get/put etc... 2683 * because we hide charges behind us. 2684 */ 2685 if (batch->bytes) 2686 res_counter_uncharge(&batch->memcg->res, batch->bytes); 2687 if (batch->memsw_bytes) 2688 res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes); 2689 memcg_oom_recover(batch->memcg); 2690 /* forget this pointer (for sanity check) */ 2691 batch->memcg = NULL; 2692} 2693 2694#ifdef CONFIG_SWAP 2695/* 2696 * called after __delete_from_swap_cache() to drop the "page" account. 2697 * memcg information is recorded in the swap_cgroup of "ent" 2698 */ 2699void 2700mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) 2701{ 2702 struct mem_cgroup *memcg; 2703 int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT; 2704 2705 if (!swapout) /* this was a swap cache but the swap is unused! */ 2706 ctype = MEM_CGROUP_CHARGE_TYPE_DROP; 2707 2708 memcg = __mem_cgroup_uncharge_common(page, ctype); 2709 2710 /* 2711 * record memcg information; if swapout && memcg != NULL, 2712 * mem_cgroup_get() was called in uncharge(). 2713 */ 2714 if (do_swap_account && swapout && memcg) 2715 swap_cgroup_record(ent, css_id(&memcg->css)); 2716} 2717#endif 2718 2719#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 2720/* 2721 * called from swap_entry_free(). removes the record in swap_cgroup and 2722 * uncharges the "memsw" account. 2723 */ 2724void mem_cgroup_uncharge_swap(swp_entry_t ent) 2725{ 2726 struct mem_cgroup *memcg; 2727 unsigned short id; 2728 2729 if (!do_swap_account) 2730 return; 2731 2732 id = swap_cgroup_record(ent, 0); 2733 rcu_read_lock(); 2734 memcg = mem_cgroup_lookup(id); 2735 if (memcg) { 2736 /* 2737 * We uncharge this because the swap is freed. 2738 * This memcg can be an obsolete one. We avoid calling css_tryget 2739 */ 2740 if (!mem_cgroup_is_root(memcg)) 2741 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 2742 mem_cgroup_swap_statistics(memcg, false); 2743 mem_cgroup_put(memcg); 2744 } 2745 rcu_read_unlock(); 2746} 2747 2748/** 2749 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
2750 * @entry: swap entry to be moved 2751 * @from: mem_cgroup which the entry is moved from 2752 * @to: mem_cgroup which the entry is moved to 2753 * @need_fixup: whether we should fixup res_counters and refcounts. 2754 * 2755 * It succeeds only when the swap_cgroup's record for this entry is the same 2756 * as the mem_cgroup's id of @from. 2757 * 2758 * Returns 0 on success, -EINVAL on failure. 2759 * 2760 * The caller must have charged to @to, IOW, called res_counter_charge() about 2761 * both res and memsw, and called css_get(). 2762 */ 2763static int mem_cgroup_move_swap_account(swp_entry_t entry, 2764 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) 2765{ 2766 unsigned short old_id, new_id; 2767 2768 old_id = css_id(&from->css); 2769 new_id = css_id(&to->css); 2770 2771 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { 2772 mem_cgroup_swap_statistics(from, false); 2773 mem_cgroup_swap_statistics(to, true); 2774 /* 2775 * This function is only called from task migration context now. 2776 * It postpones res_counter and refcount handling till the end 2777 * of task migration(mem_cgroup_clear_mc()) for performance 2778 * improvement. But we cannot postpone mem_cgroup_get(to) 2779 * because if the process that has been moved to @to does 2780 * swap-in, the refcount of @to might be decreased to 0. 2781 */ 2782 mem_cgroup_get(to); 2783 if (need_fixup) { 2784 if (!mem_cgroup_is_root(from)) 2785 res_counter_uncharge(&from->memsw, PAGE_SIZE); 2786 mem_cgroup_put(from); 2787 /* 2788 * we charged both to->res and to->memsw, so we should 2789 * uncharge to->res. 2790 */ 2791 if (!mem_cgroup_is_root(to)) 2792 res_counter_uncharge(&to->res, PAGE_SIZE); 2793 } 2794 return 0; 2795 } 2796 return -EINVAL; 2797} 2798#else 2799static inline int mem_cgroup_move_swap_account(swp_entry_t entry, 2800 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) 2801{ 2802 return -EINVAL; 2803} 2804#endif 2805 2806/* 2807 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old 2808 * page belongs to. 2809 */ 2810int mem_cgroup_prepare_migration(struct page *page, 2811 struct page *newpage, struct mem_cgroup **ptr) 2812{ 2813 struct page_cgroup *pc; 2814 struct mem_cgroup *mem = NULL; 2815 enum charge_type ctype; 2816 int ret = 0; 2817 2818 VM_BUG_ON(PageTransHuge(page)); 2819 if (mem_cgroup_disabled()) 2820 return 0; 2821 2822 pc = lookup_page_cgroup(page); 2823 lock_page_cgroup(pc); 2824 if (PageCgroupUsed(pc)) { 2825 mem = pc->mem_cgroup; 2826 css_get(&mem->css); 2827 /* 2828 * At migrating an anonymous page, its mapcount goes down 2829 * to 0 and uncharge() will be called. But, even if it's fully 2830 * unmapped, migration may fail and this page has to be 2831 * charged again. We set MIGRATION flag here and delay uncharge 2832 * until end_migration() is called 2833 * 2834 * Corner Case Thinking 2835 * A) 2836 * When the old page was mapped as Anon and it's unmap-and-freed 2837 * while migration was ongoing. 2838 * If unmap finds the old page, uncharge() of it will be delayed 2839 * until end_migration(). If unmap finds a new page, it's 2840 * uncharged when it make mapcount to be 1->0. If unmap code 2841 * finds swap_migration_entry, the new page will not be mapped 2842 * and end_migration() will find it(mapcount==0). 2843 * 2844 * B) 2845 * When the old page was mapped but migraion fails, the kernel 2846 * remaps it. A charge for it is kept by MIGRATION flag even 2847 * if mapcount goes down to 0. We can do remap successfully 2848 * without charging it again. 
2849 * 2850 * C) 2851 * The "old" page is under lock_page() until the end of 2852 * migration, so, the old page itself will not be swapped-out. 2853 * If the new page is swapped out before end_migration, our 2854 * hook into the usual swap-out path will catch the event. 2855 */ 2856 if (PageAnon(page)) 2857 SetPageCgroupMigration(pc); 2858 } 2859 unlock_page_cgroup(pc); 2860 /* 2861 * If the page is not charged at this point, 2862 * we return here. 2863 */ 2864 if (!mem) 2865 return 0; 2866 2867 *ptr = mem; 2868 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false, PAGE_SIZE); 2869 css_put(&mem->css);/* drop extra refcnt */ 2870 if (ret || *ptr == NULL) { 2871 if (PageAnon(page)) { 2872 lock_page_cgroup(pc); 2873 ClearPageCgroupMigration(pc); 2874 unlock_page_cgroup(pc); 2875 /* 2876 * The old page may be fully unmapped while we kept it. 2877 */ 2878 mem_cgroup_uncharge_page(page); 2879 } 2880 return -ENOMEM; 2881 } 2882 /* 2883 * We charge the new page before it's used/mapped. So, even if unlock_page() 2884 * is called before end_migration, we can catch all events on this new 2885 * page. In case the new page is migrated but not remapped, its 2886 * mapcount will finally be 0 and we call uncharge in end_migration(). 2887 */ 2888 pc = lookup_page_cgroup(newpage); 2889 if (PageAnon(page)) 2890 ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; 2891 else if (page_is_file_cache(page)) 2892 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; 2893 else 2894 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; 2895 __mem_cgroup_commit_charge(mem, pc, ctype, PAGE_SIZE); 2896 return ret; 2897} 2898 2899/* remove redundant charge if migration failed */ 2900void mem_cgroup_end_migration(struct mem_cgroup *mem, 2901 struct page *oldpage, struct page *newpage) 2902{ 2903 struct page *used, *unused; 2904 struct page_cgroup *pc; 2905 2906 if (!mem) 2907 return; 2908 /* blocks rmdir() */ 2909 cgroup_exclude_rmdir(&mem->css); 2910 /* at migration success, oldpage->mapping is NULL. */ 2911 if (oldpage->mapping) { 2912 used = oldpage; 2913 unused = newpage; 2914 } else { 2915 used = newpage; 2916 unused = oldpage; 2917 } 2918 /* 2919 * We disallowed uncharging pages under migration because the mapcount 2920 * of the page goes down to zero temporarily. 2921 * Clear the flag and check whether the page should still be charged. 2922 */ 2923 pc = lookup_page_cgroup(oldpage); 2924 lock_page_cgroup(pc); 2925 ClearPageCgroupMigration(pc); 2926 unlock_page_cgroup(pc); 2927 2928 __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE); 2929 2930 /* 2931 * If a page is a file cache, the radix-tree replacement is atomic 2932 * and we can skip this check. When it was an Anon page, its mapcount 2933 * goes down to 0. But because we added the MIGRATION flag, it's not 2934 * uncharged yet. There are several cases, but the page->mapcount check 2935 * and the USED bit check in mem_cgroup_uncharge_page() will do enough 2936 * checking. (see prepare_charge() also) 2937 */ 2938 if (PageAnon(used)) 2939 mem_cgroup_uncharge_page(used); 2940 /* 2941 * At migration, we may charge against a cgroup which has no 2942 * tasks. 2943 * So, rmdir()->pre_destroy() can be called while we do this charge. 2944 * In that case, we need to call pre_destroy() again. Check it here. 2945 */ 2946 cgroup_release_and_wakeup_rmdir(&mem->css); 2947} 2948 2949/* 2950 * A call to try to shrink memory usage on charge failure at shmem's swapin. 2951 * Calling hierarchical_reclaim is not enough because we should update 2952 * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM.
2953 * Moreover considering hierarchy, we should reclaim from the mem_over_limit, 2954 * not from the memcg which this page would be charged to. 2955 * try_charge_swapin does all of these works properly. 2956 */ 2957int mem_cgroup_shmem_charge_fallback(struct page *page, 2958 struct mm_struct *mm, 2959 gfp_t gfp_mask) 2960{ 2961 struct mem_cgroup *mem = NULL; 2962 int ret; 2963 2964 if (mem_cgroup_disabled()) 2965 return 0; 2966 2967 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); 2968 if (!ret) 2969 mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */ 2970 2971 return ret; 2972} 2973 2974static DEFINE_MUTEX(set_limit_mutex); 2975 2976static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, 2977 unsigned long long val) 2978{ 2979 int retry_count; 2980 u64 memswlimit, memlimit; 2981 int ret = 0; 2982 int children = mem_cgroup_count_children(memcg); 2983 u64 curusage, oldusage; 2984 int enlarge; 2985 2986 /* 2987 * For keeping hierarchical_reclaim simple, how long we should retry 2988 * is depends on callers. We set our retry-count to be function 2989 * of # of children which we should visit in this loop. 2990 */ 2991 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children; 2992 2993 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); 2994 2995 enlarge = 0; 2996 while (retry_count) { 2997 if (signal_pending(current)) { 2998 ret = -EINTR; 2999 break; 3000 } 3001 /* 3002 * Rather than hide all in some function, I do this in 3003 * open coded manner. You see what this really does. 3004 * We have to guarantee mem->res.limit < mem->memsw.limit. 3005 */ 3006 mutex_lock(&set_limit_mutex); 3007 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3008 if (memswlimit < val) { 3009 ret = -EINVAL; 3010 mutex_unlock(&set_limit_mutex); 3011 break; 3012 } 3013 3014 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 3015 if (memlimit < val) 3016 enlarge = 1; 3017 3018 ret = res_counter_set_limit(&memcg->res, val); 3019 if (!ret) { 3020 if (memswlimit == val) 3021 memcg->memsw_is_minimum = true; 3022 else 3023 memcg->memsw_is_minimum = false; 3024 } 3025 mutex_unlock(&set_limit_mutex); 3026 3027 if (!ret) 3028 break; 3029 3030 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, 3031 MEM_CGROUP_RECLAIM_SHRINK); 3032 curusage = res_counter_read_u64(&memcg->res, RES_USAGE); 3033 /* Usage is reduced ? */ 3034 if (curusage >= oldusage) 3035 retry_count--; 3036 else 3037 oldusage = curusage; 3038 } 3039 if (!ret && enlarge) 3040 memcg_oom_recover(memcg); 3041 3042 return ret; 3043} 3044 3045static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, 3046 unsigned long long val) 3047{ 3048 int retry_count; 3049 u64 memlimit, memswlimit, oldusage, curusage; 3050 int children = mem_cgroup_count_children(memcg); 3051 int ret = -EBUSY; 3052 int enlarge = 0; 3053 3054 /* see mem_cgroup_resize_res_limit */ 3055 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; 3056 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 3057 while (retry_count) { 3058 if (signal_pending(current)) { 3059 ret = -EINTR; 3060 break; 3061 } 3062 /* 3063 * Rather than hide all in some function, I do this in 3064 * open coded manner. You see what this really does. 3065 * We have to guarantee mem->res.limit < mem->memsw.limit. 
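 *
 * (Editor's note, illustrative only and not in the original: this invariant
 *  is why a memsw limit below the current memory limit is rejected with
 *  -EINVAL just below, and why mem_cgroup_resize_limit() above likewise
 *  rejects a memory limit above the current memsw limit; with the usual
 *  cgroupfs files this means memory.memsw.limit_in_bytes has to be raised
 *  before memory.limit_in_bytes can be raised past it.)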
3066 */ 3067 mutex_lock(&set_limit_mutex); 3068 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 3069 if (memlimit > val) { 3070 ret = -EINVAL; 3071 mutex_unlock(&set_limit_mutex); 3072 break; 3073 } 3074 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3075 if (memswlimit < val) 3076 enlarge = 1; 3077 ret = res_counter_set_limit(&memcg->memsw, val); 3078 if (!ret) { 3079 if (memlimit == val) 3080 memcg->memsw_is_minimum = true; 3081 else 3082 memcg->memsw_is_minimum = false; 3083 } 3084 mutex_unlock(&set_limit_mutex); 3085 3086 if (!ret) 3087 break; 3088 3089 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, 3090 MEM_CGROUP_RECLAIM_NOSWAP | 3091 MEM_CGROUP_RECLAIM_SHRINK); 3092 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 3093 /* Usage is reduced ? */ 3094 if (curusage >= oldusage) 3095 retry_count--; 3096 else 3097 oldusage = curusage; 3098 } 3099 if (!ret && enlarge) 3100 memcg_oom_recover(memcg); 3101 return ret; 3102} 3103 3104unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 3105 gfp_t gfp_mask) 3106{ 3107 unsigned long nr_reclaimed = 0; 3108 struct mem_cgroup_per_zone *mz, *next_mz = NULL; 3109 unsigned long reclaimed; 3110 int loop = 0; 3111 struct mem_cgroup_tree_per_zone *mctz; 3112 unsigned long long excess; 3113 3114 if (order > 0) 3115 return 0; 3116 3117 mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); 3118 /* 3119 * This loop can run a while, specially if mem_cgroup's continuously 3120 * keep exceeding their soft limit and putting the system under 3121 * pressure 3122 */ 3123 do { 3124 if (next_mz) 3125 mz = next_mz; 3126 else 3127 mz = mem_cgroup_largest_soft_limit_node(mctz); 3128 if (!mz) 3129 break; 3130 3131 reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone, 3132 gfp_mask, 3133 MEM_CGROUP_RECLAIM_SOFT); 3134 nr_reclaimed += reclaimed; 3135 spin_lock(&mctz->lock); 3136 3137 /* 3138 * If we failed to reclaim anything from this memory cgroup 3139 * it is time to move on to the next cgroup 3140 */ 3141 next_mz = NULL; 3142 if (!reclaimed) { 3143 do { 3144 /* 3145 * Loop until we find yet another one. 3146 * 3147 * By the time we get the soft_limit lock 3148 * again, someone might have aded the 3149 * group back on the RB tree. Iterate to 3150 * make sure we get a different mem. 3151 * mem_cgroup_largest_soft_limit_node returns 3152 * NULL if no other cgroup is present on 3153 * the tree 3154 */ 3155 next_mz = 3156 __mem_cgroup_largest_soft_limit_node(mctz); 3157 if (next_mz == mz) { 3158 css_put(&next_mz->mem->css); 3159 next_mz = NULL; 3160 } else /* next_mz == NULL or other memcg */ 3161 break; 3162 } while (1); 3163 } 3164 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); 3165 excess = res_counter_soft_limit_excess(&mz->mem->res); 3166 /* 3167 * One school of thought says that we should not add 3168 * back the node to the tree if reclaim returns 0. 3169 * But our reclaim could return 0, simply because due 3170 * to priority we are exposing a smaller subset of 3171 * memory to reclaim from. Consider this as a longer 3172 * term TODO. 3173 */ 3174 /* If excess == 0, no tree ops */ 3175 __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess); 3176 spin_unlock(&mctz->lock); 3177 css_put(&mz->mem->css); 3178 loop++; 3179 /* 3180 * Could not reclaim anything and there are no more 3181 * mem cgroups to try or we seem to be looping without 3182 * reclaiming anything. 
3183 */ 3184 if (!nr_reclaimed && 3185 (next_mz == NULL || 3186 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) 3187 break; 3188 } while (!nr_reclaimed); 3189 if (next_mz) 3190 css_put(&next_mz->mem->css); 3191 return nr_reclaimed; 3192} 3193 3194/* 3195 * This routine traverse page_cgroup in given list and drop them all. 3196 * *And* this routine doesn't reclaim page itself, just removes page_cgroup. 3197 */ 3198static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, 3199 int node, int zid, enum lru_list lru) 3200{ 3201 struct zone *zone; 3202 struct mem_cgroup_per_zone *mz; 3203 struct page_cgroup *pc, *busy; 3204 unsigned long flags, loop; 3205 struct list_head *list; 3206 int ret = 0; 3207 3208 zone = &NODE_DATA(node)->node_zones[zid]; 3209 mz = mem_cgroup_zoneinfo(mem, node, zid); 3210 list = &mz->lists[lru]; 3211 3212 loop = MEM_CGROUP_ZSTAT(mz, lru); 3213 /* give some margin against EBUSY etc...*/ 3214 loop += 256; 3215 busy = NULL; 3216 while (loop--) { 3217 ret = 0; 3218 spin_lock_irqsave(&zone->lru_lock, flags); 3219 if (list_empty(list)) { 3220 spin_unlock_irqrestore(&zone->lru_lock, flags); 3221 break; 3222 } 3223 pc = list_entry(list->prev, struct page_cgroup, lru); 3224 if (busy == pc) { 3225 list_move(&pc->lru, list); 3226 busy = NULL; 3227 spin_unlock_irqrestore(&zone->lru_lock, flags); 3228 continue; 3229 } 3230 spin_unlock_irqrestore(&zone->lru_lock, flags); 3231 3232 ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL); 3233 if (ret == -ENOMEM) 3234 break; 3235 3236 if (ret == -EBUSY || ret == -EINVAL) { 3237 /* found lock contention or "pc" is obsolete. */ 3238 busy = pc; 3239 cond_resched(); 3240 } else 3241 busy = NULL; 3242 } 3243 3244 if (!ret && !list_empty(list)) 3245 return -EBUSY; 3246 return ret; 3247} 3248 3249/* 3250 * make mem_cgroup's charge to be 0 if there is no task. 3251 * This enables deleting this mem_cgroup. 3252 */ 3253static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all) 3254{ 3255 int ret; 3256 int node, zid, shrink; 3257 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 3258 struct cgroup *cgrp = mem->css.cgroup; 3259 3260 css_get(&mem->css); 3261 3262 shrink = 0; 3263 /* should free all ? */ 3264 if (free_all) 3265 goto try_to_free; 3266move_account: 3267 do { 3268 ret = -EBUSY; 3269 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) 3270 goto out; 3271 ret = -EINTR; 3272 if (signal_pending(current)) 3273 goto out; 3274 /* This is for making all *used* pages to be on LRU. */ 3275 lru_add_drain_all(); 3276 drain_all_stock_sync(); 3277 ret = 0; 3278 mem_cgroup_start_move(mem); 3279 for_each_node_state(node, N_HIGH_MEMORY) { 3280 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { 3281 enum lru_list l; 3282 for_each_lru(l) { 3283 ret = mem_cgroup_force_empty_list(mem, 3284 node, zid, l); 3285 if (ret) 3286 break; 3287 } 3288 } 3289 if (ret) 3290 break; 3291 } 3292 mem_cgroup_end_move(mem); 3293 memcg_oom_recover(mem); 3294 /* it seems parent cgroup doesn't have enough mem */ 3295 if (ret == -ENOMEM) 3296 goto try_to_free; 3297 cond_resched(); 3298 /* "ret" should also be checked to ensure all lists are empty. */ 3299 } while (mem->res.usage > 0 || ret); 3300out: 3301 css_put(&mem->css); 3302 return ret; 3303 3304try_to_free: 3305 /* returns EBUSY if there is a task or if we come here twice. 
*/ 3306 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) { 3307 ret = -EBUSY; 3308 goto out; 3309 } 3310 /* we call try-to-free pages for make this cgroup empty */ 3311 lru_add_drain_all(); 3312 /* try to free all pages in this cgroup */ 3313 shrink = 1; 3314 while (nr_retries && mem->res.usage > 0) { 3315 int progress; 3316 3317 if (signal_pending(current)) { 3318 ret = -EINTR; 3319 goto out; 3320 } 3321 progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL, 3322 false, get_swappiness(mem)); 3323 if (!progress) { 3324 nr_retries--; 3325 /* maybe some writeback is necessary */ 3326 congestion_wait(BLK_RW_ASYNC, HZ/10); 3327 } 3328 3329 } 3330 lru_add_drain(); 3331 /* try move_account...there may be some *locked* pages. */ 3332 goto move_account; 3333} 3334 3335int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) 3336{ 3337 return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true); 3338} 3339 3340 3341static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft) 3342{ 3343 return mem_cgroup_from_cont(cont)->use_hierarchy; 3344} 3345 3346static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, 3347 u64 val) 3348{ 3349 int retval = 0; 3350 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 3351 struct cgroup *parent = cont->parent; 3352 struct mem_cgroup *parent_mem = NULL; 3353 3354 if (parent) 3355 parent_mem = mem_cgroup_from_cont(parent); 3356 3357 cgroup_lock(); 3358 /* 3359 * If parent's use_hierarchy is set, we can't make any modifications 3360 * in the child subtrees. If it is unset, then the change can 3361 * occur, provided the current cgroup has no children. 3362 * 3363 * For the root cgroup, parent_mem is NULL, we allow value to be 3364 * set if there are no children. 3365 */ 3366 if ((!parent_mem || !parent_mem->use_hierarchy) && 3367 (val == 1 || val == 0)) { 3368 if (list_empty(&cont->children)) 3369 mem->use_hierarchy = val; 3370 else 3371 retval = -EBUSY; 3372 } else 3373 retval = -EINVAL; 3374 cgroup_unlock(); 3375 3376 return retval; 3377} 3378 3379 3380static u64 mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem, 3381 enum mem_cgroup_stat_index idx) 3382{ 3383 struct mem_cgroup *iter; 3384 s64 val = 0; 3385 3386 /* each per cpu's value can be minus.Then, use s64 */ 3387 for_each_mem_cgroup_tree(iter, mem) 3388 val += mem_cgroup_read_stat(iter, idx); 3389 3390 if (val < 0) /* race ? 
*/ 3391 val = 0; 3392 return val; 3393} 3394 3395static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap) 3396{ 3397 u64 val; 3398 3399 if (!mem_cgroup_is_root(mem)) { 3400 if (!swap) 3401 return res_counter_read_u64(&mem->res, RES_USAGE); 3402 else 3403 return res_counter_read_u64(&mem->memsw, RES_USAGE); 3404 } 3405 3406 val = mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE); 3407 val += mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS); 3408 3409 if (swap) 3410 val += mem_cgroup_get_recursive_idx_stat(mem, 3411 MEM_CGROUP_STAT_SWAPOUT); 3412 3413 return val << PAGE_SHIFT; 3414} 3415 3416static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) 3417{ 3418 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 3419 u64 val; 3420 int type, name; 3421 3422 type = MEMFILE_TYPE(cft->private); 3423 name = MEMFILE_ATTR(cft->private); 3424 switch (type) { 3425 case _MEM: 3426 if (name == RES_USAGE) 3427 val = mem_cgroup_usage(mem, false); 3428 else 3429 val = res_counter_read_u64(&mem->res, name); 3430 break; 3431 case _MEMSWAP: 3432 if (name == RES_USAGE) 3433 val = mem_cgroup_usage(mem, true); 3434 else 3435 val = res_counter_read_u64(&mem->memsw, name); 3436 break; 3437 default: 3438 BUG(); 3439 break; 3440 } 3441 return val; 3442} 3443/* 3444 * The user of this function is... 3445 * RES_LIMIT. 3446 */ 3447static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, 3448 const char *buffer) 3449{ 3450 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 3451 int type, name; 3452 unsigned long long val; 3453 int ret; 3454 3455 type = MEMFILE_TYPE(cft->private); 3456 name = MEMFILE_ATTR(cft->private); 3457 switch (name) { 3458 case RES_LIMIT: 3459 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 3460 ret = -EINVAL; 3461 break; 3462 } 3463 /* This function does all necessary parse...reuse it */ 3464 ret = res_counter_memparse_write_strategy(buffer, &val); 3465 if (ret) 3466 break; 3467 if (type == _MEM) 3468 ret = mem_cgroup_resize_limit(memcg, val); 3469 else 3470 ret = mem_cgroup_resize_memsw_limit(memcg, val); 3471 break; 3472 case RES_SOFT_LIMIT: 3473 ret = res_counter_memparse_write_strategy(buffer, &val); 3474 if (ret) 3475 break; 3476 /* 3477 * For memsw, soft limits are hard to implement in terms 3478 * of semantics, for now, we support soft limits for 3479 * control without swap 3480 */ 3481 if (type == _MEM) 3482 ret = res_counter_set_soft_limit(&memcg->res, val); 3483 else 3484 ret = -EINVAL; 3485 break; 3486 default: 3487 ret = -EINVAL; /* should be BUG() ? 
*/ 3488 break; 3489 } 3490 return ret; 3491} 3492 3493static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, 3494 unsigned long long *mem_limit, unsigned long long *memsw_limit) 3495{ 3496 struct cgroup *cgroup; 3497 unsigned long long min_limit, min_memsw_limit, tmp; 3498 3499 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT); 3500 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3501 cgroup = memcg->css.cgroup; 3502 if (!memcg->use_hierarchy) 3503 goto out; 3504 3505 while (cgroup->parent) { 3506 cgroup = cgroup->parent; 3507 memcg = mem_cgroup_from_cont(cgroup); 3508 if (!memcg->use_hierarchy) 3509 break; 3510 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT); 3511 min_limit = min(min_limit, tmp); 3512 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3513 min_memsw_limit = min(min_memsw_limit, tmp); 3514 } 3515out: 3516 *mem_limit = min_limit; 3517 *memsw_limit = min_memsw_limit; 3518 return; 3519} 3520 3521static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 3522{ 3523 struct mem_cgroup *mem; 3524 int type, name; 3525 3526 mem = mem_cgroup_from_cont(cont); 3527 type = MEMFILE_TYPE(event); 3528 name = MEMFILE_ATTR(event); 3529 switch (name) { 3530 case RES_MAX_USAGE: 3531 if (type == _MEM) 3532 res_counter_reset_max(&mem->res); 3533 else 3534 res_counter_reset_max(&mem->memsw); 3535 break; 3536 case RES_FAILCNT: 3537 if (type == _MEM) 3538 res_counter_reset_failcnt(&mem->res); 3539 else 3540 res_counter_reset_failcnt(&mem->memsw); 3541 break; 3542 } 3543 3544 return 0; 3545} 3546 3547static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp, 3548 struct cftype *cft) 3549{ 3550 return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate; 3551} 3552 3553#ifdef CONFIG_MMU 3554static int mem_cgroup_move_charge_write(struct cgroup *cgrp, 3555 struct cftype *cft, u64 val) 3556{ 3557 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 3558 3559 if (val >= (1 << NR_MOVE_TYPE)) 3560 return -EINVAL; 3561 /* 3562 * We check this value several times in both in can_attach() and 3563 * attach(), so we need cgroup lock to prevent this value from being 3564 * inconsistent. 
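 *
 * (Editor's note, not in the original: val is a bitmask of move types, so
 *  with the usual two types -- anonymous pages and file pages -- only the
 *  values 0 through 3 make sense and anything >= 1 << NR_MOVE_TYPE is
 *  rejected with -EINVAL above; e.g. writing 3 to
 *  memory.move_charge_at_immigrate asks for both kinds to be moved.)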
3565 */ 3566 cgroup_lock(); 3567 mem->move_charge_at_immigrate = val; 3568 cgroup_unlock(); 3569 3570 return 0; 3571} 3572#else 3573static int mem_cgroup_move_charge_write(struct cgroup *cgrp, 3574 struct cftype *cft, u64 val) 3575{ 3576 return -ENOSYS; 3577} 3578#endif 3579 3580 3581/* For read statistics */ 3582enum { 3583 MCS_CACHE, 3584 MCS_RSS, 3585 MCS_FILE_MAPPED, 3586 MCS_PGPGIN, 3587 MCS_PGPGOUT, 3588 MCS_SWAP, 3589 MCS_INACTIVE_ANON, 3590 MCS_ACTIVE_ANON, 3591 MCS_INACTIVE_FILE, 3592 MCS_ACTIVE_FILE, 3593 MCS_UNEVICTABLE, 3594 NR_MCS_STAT, 3595}; 3596 3597struct mcs_total_stat { 3598 s64 stat[NR_MCS_STAT]; 3599}; 3600 3601struct { 3602 char *local_name; 3603 char *total_name; 3604} memcg_stat_strings[NR_MCS_STAT] = { 3605 {"cache", "total_cache"}, 3606 {"rss", "total_rss"}, 3607 {"mapped_file", "total_mapped_file"}, 3608 {"pgpgin", "total_pgpgin"}, 3609 {"pgpgout", "total_pgpgout"}, 3610 {"swap", "total_swap"}, 3611 {"inactive_anon", "total_inactive_anon"}, 3612 {"active_anon", "total_active_anon"}, 3613 {"inactive_file", "total_inactive_file"}, 3614 {"active_file", "total_active_file"}, 3615 {"unevictable", "total_unevictable"} 3616}; 3617 3618 3619static void 3620mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) 3621{ 3622 s64 val; 3623 3624 /* per cpu stat */ 3625 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); 3626 s->stat[MCS_CACHE] += val * PAGE_SIZE; 3627 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); 3628 s->stat[MCS_RSS] += val * PAGE_SIZE; 3629 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED); 3630 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; 3631 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT); 3632 s->stat[MCS_PGPGIN] += val; 3633 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT); 3634 s->stat[MCS_PGPGOUT] += val; 3635 if (do_swap_account) { 3636 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); 3637 s->stat[MCS_SWAP] += val * PAGE_SIZE; 3638 } 3639 3640 /* per zone stat */ 3641 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); 3642 s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE; 3643 val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON); 3644 s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE; 3645 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE); 3646 s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE; 3647 val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE); 3648 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; 3649 val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE); 3650 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; 3651} 3652 3653static void 3654mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) 3655{ 3656 struct mem_cgroup *iter; 3657 3658 for_each_mem_cgroup_tree(iter, mem) 3659 mem_cgroup_get_local_stat(iter, s); 3660} 3661 3662static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, 3663 struct cgroup_map_cb *cb) 3664{ 3665 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); 3666 struct mcs_total_stat mystat; 3667 int i; 3668 3669 memset(&mystat, 0, sizeof(mystat)); 3670 mem_cgroup_get_local_stat(mem_cont, &mystat); 3671 3672 for (i = 0; i < NR_MCS_STAT; i++) { 3673 if (i == MCS_SWAP && !do_swap_account) 3674 continue; 3675 cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]); 3676 } 3677 3678 /* Hierarchical information */ 3679 { 3680 unsigned long long limit, memsw_limit; 3681 memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit); 3682 cb->fill(cb, "hierarchical_memory_limit", limit); 
3683 if (do_swap_account) 3684 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit); 3685 } 3686 3687 memset(&mystat, 0, sizeof(mystat)); 3688 mem_cgroup_get_total_stat(mem_cont, &mystat); 3689 for (i = 0; i < NR_MCS_STAT; i++) { 3690 if (i == MCS_SWAP && !do_swap_account) 3691 continue; 3692 cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]); 3693 } 3694 3695#ifdef CONFIG_DEBUG_VM 3696 cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); 3697 3698 { 3699 int nid, zid; 3700 struct mem_cgroup_per_zone *mz; 3701 unsigned long recent_rotated[2] = {0, 0}; 3702 unsigned long recent_scanned[2] = {0, 0}; 3703 3704 for_each_online_node(nid) 3705 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 3706 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); 3707 3708 recent_rotated[0] += 3709 mz->reclaim_stat.recent_rotated[0]; 3710 recent_rotated[1] += 3711 mz->reclaim_stat.recent_rotated[1]; 3712 recent_scanned[0] += 3713 mz->reclaim_stat.recent_scanned[0]; 3714 recent_scanned[1] += 3715 mz->reclaim_stat.recent_scanned[1]; 3716 } 3717 cb->fill(cb, "recent_rotated_anon", recent_rotated[0]); 3718 cb->fill(cb, "recent_rotated_file", recent_rotated[1]); 3719 cb->fill(cb, "recent_scanned_anon", recent_scanned[0]); 3720 cb->fill(cb, "recent_scanned_file", recent_scanned[1]); 3721 } 3722#endif 3723 3724 return 0; 3725} 3726 3727static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft) 3728{ 3729 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 3730 3731 return get_swappiness(memcg); 3732} 3733 3734static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, 3735 u64 val) 3736{ 3737 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 3738 struct mem_cgroup *parent; 3739 3740 if (val > 100) 3741 return -EINVAL; 3742 3743 if (cgrp->parent == NULL) 3744 return -EINVAL; 3745 3746 parent = mem_cgroup_from_cont(cgrp->parent); 3747 3748 cgroup_lock(); 3749 3750 /* If under hierarchy, only empty-root can set this value */ 3751 if ((parent->use_hierarchy) || 3752 (memcg->use_hierarchy && !list_empty(&cgrp->children))) { 3753 cgroup_unlock(); 3754 return -EINVAL; 3755 } 3756 3757 spin_lock(&memcg->reclaim_param_lock); 3758 memcg->swappiness = val; 3759 spin_unlock(&memcg->reclaim_param_lock); 3760 3761 cgroup_unlock(); 3762 3763 return 0; 3764} 3765 3766static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) 3767{ 3768 struct mem_cgroup_threshold_ary *t; 3769 u64 usage; 3770 int i; 3771 3772 rcu_read_lock(); 3773 if (!swap) 3774 t = rcu_dereference(memcg->thresholds.primary); 3775 else 3776 t = rcu_dereference(memcg->memsw_thresholds.primary); 3777 3778 if (!t) 3779 goto unlock; 3780 3781 usage = mem_cgroup_usage(memcg, swap); 3782 3783 /* 3784 * current_threshold points to threshold just below usage. 3785 * If it's not true, a threshold was crossed after last 3786 * call of __mem_cgroup_threshold(). 3787 */ 3788 i = t->current_threshold; 3789 3790 /* 3791 * Iterate backward over array of thresholds starting from 3792 * current_threshold and check if a threshold is crossed. 3793 * If none of thresholds below usage is crossed, we read 3794 * only one element of the array here. 3795 */ 3796 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) 3797 eventfd_signal(t->entries[i].eventfd, 1); 3798 3799 /* i = current_threshold + 1 */ 3800 i++; 3801 3802 /* 3803 * Iterate forward over array of thresholds starting from 3804 * current_threshold+1 and check if a threshold is crossed. 
3805 * If none of thresholds above usage is crossed, we read 3806 * only one element of the array here. 3807 */ 3808 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) 3809 eventfd_signal(t->entries[i].eventfd, 1); 3810 3811 /* Update current_threshold */ 3812 t->current_threshold = i - 1; 3813unlock: 3814 rcu_read_unlock(); 3815} 3816 3817static void mem_cgroup_threshold(struct mem_cgroup *memcg) 3818{ 3819 while (memcg) { 3820 __mem_cgroup_threshold(memcg, false); 3821 if (do_swap_account) 3822 __mem_cgroup_threshold(memcg, true); 3823 3824 memcg = parent_mem_cgroup(memcg); 3825 } 3826} 3827 3828static int compare_thresholds(const void *a, const void *b) 3829{ 3830 const struct mem_cgroup_threshold *_a = a; 3831 const struct mem_cgroup_threshold *_b = b; 3832 3833 return _a->threshold - _b->threshold; 3834} 3835 3836static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem) 3837{ 3838 struct mem_cgroup_eventfd_list *ev; 3839 3840 list_for_each_entry(ev, &mem->oom_notify, list) 3841 eventfd_signal(ev->eventfd, 1); 3842 return 0; 3843} 3844 3845static void mem_cgroup_oom_notify(struct mem_cgroup *mem) 3846{ 3847 struct mem_cgroup *iter; 3848 3849 for_each_mem_cgroup_tree(iter, mem) 3850 mem_cgroup_oom_notify_cb(iter); 3851} 3852 3853static int mem_cgroup_usage_register_event(struct cgroup *cgrp, 3854 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 3855{ 3856 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 3857 struct mem_cgroup_thresholds *thresholds; 3858 struct mem_cgroup_threshold_ary *new; 3859 int type = MEMFILE_TYPE(cft->private); 3860 u64 threshold, usage; 3861 int i, size, ret; 3862 3863 ret = res_counter_memparse_write_strategy(args, &threshold); 3864 if (ret) 3865 return ret; 3866 3867 mutex_lock(&memcg->thresholds_lock); 3868 3869 if (type == _MEM) 3870 thresholds = &memcg->thresholds; 3871 else if (type == _MEMSWAP) 3872 thresholds = &memcg->memsw_thresholds; 3873 else 3874 BUG(); 3875 3876 usage = mem_cgroup_usage(memcg, type == _MEMSWAP); 3877 3878 /* Check if a threshold crossed before adding a new one */ 3879 if (thresholds->primary) 3880 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 3881 3882 size = thresholds->primary ? thresholds->primary->size + 1 : 1; 3883 3884 /* Allocate memory for new array of thresholds */ 3885 new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold), 3886 GFP_KERNEL); 3887 if (!new) { 3888 ret = -ENOMEM; 3889 goto unlock; 3890 } 3891 new->size = size; 3892 3893 /* Copy thresholds (if any) to new array */ 3894 if (thresholds->primary) { 3895 memcpy(new->entries, thresholds->primary->entries, (size - 1) * 3896 sizeof(struct mem_cgroup_threshold)); 3897 } 3898 3899 /* Add new threshold */ 3900 new->entries[size - 1].eventfd = eventfd; 3901 new->entries[size - 1].threshold = threshold; 3902 3903 /* Sort thresholds. Registering of new threshold isn't time-critical */ 3904 sort(new->entries, size, sizeof(struct mem_cgroup_threshold), 3905 compare_thresholds, NULL); 3906 3907 /* Find current threshold */ 3908 new->current_threshold = -1; 3909 for (i = 0; i < size; i++) { 3910 if (new->entries[i].threshold < usage) { 3911 /* 3912 * new->current_threshold will not be used until 3913 * rcu_assign_pointer(), so it's safe to increment 3914 * it here. 
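 *
 * More generally, the new array is fully built (entries copied, sorted
 * and current_threshold computed) before rcu_assign_pointer() below
 * publishes it, so a reader under rcu_read_lock() in
 * __mem_cgroup_threshold() sees either the old array or the complete
 * new one, never a half-initialized one.  The old primary array is
 * parked in ->spare rather than freed, and synchronize_rcu() makes
 * sure no reader still uses it; the unregister path below reuses it,
 * which is why unregistering never needs to allocate.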
3915 */ 3916 ++new->current_threshold; 3917 } 3918 } 3919 3920 /* Free old spare buffer and save old primary buffer as spare */ 3921 kfree(thresholds->spare); 3922 thresholds->spare = thresholds->primary; 3923 3924 rcu_assign_pointer(thresholds->primary, new); 3925 3926 /* To be sure that nobody uses thresholds */ 3927 synchronize_rcu(); 3928 3929unlock: 3930 mutex_unlock(&memcg->thresholds_lock); 3931 3932 return ret; 3933} 3934 3935static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, 3936 struct cftype *cft, struct eventfd_ctx *eventfd) 3937{ 3938 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 3939 struct mem_cgroup_thresholds *thresholds; 3940 struct mem_cgroup_threshold_ary *new; 3941 int type = MEMFILE_TYPE(cft->private); 3942 u64 usage; 3943 int i, j, size; 3944 3945 mutex_lock(&memcg->thresholds_lock); 3946 if (type == _MEM) 3947 thresholds = &memcg->thresholds; 3948 else if (type == _MEMSWAP) 3949 thresholds = &memcg->memsw_thresholds; 3950 else 3951 BUG(); 3952 3953 /* 3954 * Something went wrong if we're trying to unregister a threshold 3955 * when there are no thresholds registered 3956 */ 3957 BUG_ON(!thresholds); 3958 3959 usage = mem_cgroup_usage(memcg, type == _MEMSWAP); 3960 3961 /* Check if a threshold was crossed before removing */ 3962 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 3963 3964 /* Calculate the new number of thresholds */ 3965 size = 0; 3966 for (i = 0; i < thresholds->primary->size; i++) { 3967 if (thresholds->primary->entries[i].eventfd != eventfd) 3968 size++; 3969 } 3970 3971 new = thresholds->spare; 3972 3973 /* Set thresholds array to NULL if we don't have thresholds */ 3974 if (!size) { 3975 kfree(new); 3976 new = NULL; 3977 goto swap_buffers; 3978 } 3979 3980 new->size = size; 3981 3982 /* Copy thresholds and find current threshold */ 3983 new->current_threshold = -1; 3984 for (i = 0, j = 0; i < thresholds->primary->size; i++) { 3985 if (thresholds->primary->entries[i].eventfd == eventfd) 3986 continue; 3987 3988 new->entries[j] = thresholds->primary->entries[i]; 3989 if (new->entries[j].threshold < usage) { 3990 /* 3991 * new->current_threshold will not be used 3992 * until rcu_assign_pointer(), so it's safe to increment 3993 * it here. 3994 */ 3995 ++new->current_threshold; 3996 } 3997 j++; 3998 } 3999 4000swap_buffers: 4001 /* Swap primary and spare array */ 4002 thresholds->spare = thresholds->primary; 4003 rcu_assign_pointer(thresholds->primary, new); 4004 4005 /* To be sure that nobody uses thresholds */ 4006 synchronize_rcu(); 4007 4008 mutex_unlock(&memcg->thresholds_lock); 4009} 4010 4011static int mem_cgroup_oom_register_event(struct cgroup *cgrp, 4012 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 4013{ 4014 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4015 struct mem_cgroup_eventfd_list *event; 4016 int type = MEMFILE_TYPE(cft->private); 4017 4018 BUG_ON(type != _OOM_TYPE); 4019 event = kmalloc(sizeof(*event), GFP_KERNEL); 4020 if (!event) 4021 return -ENOMEM; 4022 4023 mutex_lock(&memcg_oom_mutex); 4024 4025 event->eventfd = eventfd; 4026 list_add(&event->list, &memcg->oom_notify); 4027 4028 /* already in OOM ?
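	   If this memcg is already under OOM when the listener registers,
	   signal the eventfd right away so the caller does not miss the
	   OOM condition that is currently in progress.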
*/ 4029 if (atomic_read(&memcg->oom_lock)) 4030 eventfd_signal(eventfd, 1); 4031 mutex_unlock(&memcg_oom_mutex); 4032 4033 return 0; 4034} 4035 4036static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, 4037 struct cftype *cft, struct eventfd_ctx *eventfd) 4038{ 4039 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 4040 struct mem_cgroup_eventfd_list *ev, *tmp; 4041 int type = MEMFILE_TYPE(cft->private); 4042 4043 BUG_ON(type != _OOM_TYPE); 4044 4045 mutex_lock(&memcg_oom_mutex); 4046 4047 list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) { 4048 if (ev->eventfd == eventfd) { 4049 list_del(&ev->list); 4050 kfree(ev); 4051 } 4052 } 4053 4054 mutex_unlock(&memcg_oom_mutex); 4055} 4056 4057static int mem_cgroup_oom_control_read(struct cgroup *cgrp, 4058 struct cftype *cft, struct cgroup_map_cb *cb) 4059{ 4060 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 4061 4062 cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable); 4063 4064 if (atomic_read(&mem->oom_lock)) 4065 cb->fill(cb, "under_oom", 1); 4066 else 4067 cb->fill(cb, "under_oom", 0); 4068 return 0; 4069} 4070 4071static int mem_cgroup_oom_control_write(struct cgroup *cgrp, 4072 struct cftype *cft, u64 val) 4073{ 4074 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 4075 struct mem_cgroup *parent; 4076 4077 /* cannot set to root cgroup and only 0 and 1 are allowed */ 4078 if (!cgrp->parent || !((val == 0) || (val == 1))) 4079 return -EINVAL; 4080 4081 parent = mem_cgroup_from_cont(cgrp->parent); 4082 4083 cgroup_lock(); 4084 /* oom-kill-disable is a flag for subhierarchy. */ 4085 if ((parent->use_hierarchy) || 4086 (mem->use_hierarchy && !list_empty(&cgrp->children))) { 4087 cgroup_unlock(); 4088 return -EINVAL; 4089 } 4090 mem->oom_kill_disable = val; 4091 if (!val) 4092 memcg_oom_recover(mem); 4093 cgroup_unlock(); 4094 return 0; 4095} 4096 4097static struct cftype mem_cgroup_files[] = { 4098 { 4099 .name = "usage_in_bytes", 4100 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 4101 .read_u64 = mem_cgroup_read, 4102 .register_event = mem_cgroup_usage_register_event, 4103 .unregister_event = mem_cgroup_usage_unregister_event, 4104 }, 4105 { 4106 .name = "max_usage_in_bytes", 4107 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 4108 .trigger = mem_cgroup_reset, 4109 .read_u64 = mem_cgroup_read, 4110 }, 4111 { 4112 .name = "limit_in_bytes", 4113 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 4114 .write_string = mem_cgroup_write, 4115 .read_u64 = mem_cgroup_read, 4116 }, 4117 { 4118 .name = "soft_limit_in_bytes", 4119 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 4120 .write_string = mem_cgroup_write, 4121 .read_u64 = mem_cgroup_read, 4122 }, 4123 { 4124 .name = "failcnt", 4125 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 4126 .trigger = mem_cgroup_reset, 4127 .read_u64 = mem_cgroup_read, 4128 }, 4129 { 4130 .name = "stat", 4131 .read_map = mem_control_stat_show, 4132 }, 4133 { 4134 .name = "force_empty", 4135 .trigger = mem_cgroup_force_empty_write, 4136 }, 4137 { 4138 .name = "use_hierarchy", 4139 .write_u64 = mem_cgroup_hierarchy_write, 4140 .read_u64 = mem_cgroup_hierarchy_read, 4141 }, 4142 { 4143 .name = "swappiness", 4144 .read_u64 = mem_cgroup_swappiness_read, 4145 .write_u64 = mem_cgroup_swappiness_write, 4146 }, 4147 { 4148 .name = "move_charge_at_immigrate", 4149 .read_u64 = mem_cgroup_move_charge_read, 4150 .write_u64 = mem_cgroup_move_charge_write, 4151 }, 4152 { 4153 .name = "oom_control", 4154 .read_map = mem_cgroup_oom_control_read, 4155 .write_u64 = mem_cgroup_oom_control_write, 4156 
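	/*
	 * Reading memory.oom_control reports "oom_kill_disable" and
	 * "under_oom" (see mem_cgroup_oom_control_read() above); writing
	 * 1 or 0 toggles the OOM killer for the group, e.g. under a
	 * cgroup v1 mount (the path below is an assumption for the
	 * example, not something dictated by this file):
	 *
	 *	echo 1 > <mount>/<group>/memory.oom_control
	 *
	 * The register/unregister hooks below attach an eventfd through
	 * cgroup.event_control so that userspace is notified of every
	 * OOM event via mem_cgroup_oom_notify().
	 */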
.register_event = mem_cgroup_oom_register_event, 4157 .unregister_event = mem_cgroup_oom_unregister_event, 4158 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 4159 }, 4160}; 4161 4162#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4163static struct cftype memsw_cgroup_files[] = { 4164 { 4165 .name = "memsw.usage_in_bytes", 4166 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 4167 .read_u64 = mem_cgroup_read, 4168 .register_event = mem_cgroup_usage_register_event, 4169 .unregister_event = mem_cgroup_usage_unregister_event, 4170 }, 4171 { 4172 .name = "memsw.max_usage_in_bytes", 4173 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 4174 .trigger = mem_cgroup_reset, 4175 .read_u64 = mem_cgroup_read, 4176 }, 4177 { 4178 .name = "memsw.limit_in_bytes", 4179 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 4180 .write_string = mem_cgroup_write, 4181 .read_u64 = mem_cgroup_read, 4182 }, 4183 { 4184 .name = "memsw.failcnt", 4185 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 4186 .trigger = mem_cgroup_reset, 4187 .read_u64 = mem_cgroup_read, 4188 }, 4189}; 4190 4191static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) 4192{ 4193 if (!do_swap_account) 4194 return 0; 4195 return cgroup_add_files(cont, ss, memsw_cgroup_files, 4196 ARRAY_SIZE(memsw_cgroup_files)); 4197}; 4198#else 4199static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) 4200{ 4201 return 0; 4202} 4203#endif 4204 4205static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) 4206{ 4207 struct mem_cgroup_per_node *pn; 4208 struct mem_cgroup_per_zone *mz; 4209 enum lru_list l; 4210 int zone, tmp = node; 4211 /* 4212 * This routine is called against possible nodes. 4213 * But it's BUG to call kmalloc() against offline node. 4214 * 4215 * TODO: this routine can waste much memory for nodes which will 4216 * never be onlined. It's better to use memory hotplug callback 4217 * function. 4218 */ 4219 if (!node_state(node, N_NORMAL_MEMORY)) 4220 tmp = -1; 4221 pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp); 4222 if (!pn) 4223 return 1; 4224 4225 mem->info.nodeinfo[node] = pn; 4226 memset(pn, 0, sizeof(*pn)); 4227 4228 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4229 mz = &pn->zoneinfo[zone]; 4230 for_each_lru(l) 4231 INIT_LIST_HEAD(&mz->lists[l]); 4232 mz->usage_in_excess = 0; 4233 mz->on_tree = false; 4234 mz->mem = mem; 4235 } 4236 return 0; 4237} 4238 4239static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) 4240{ 4241 kfree(mem->info.nodeinfo[node]); 4242} 4243 4244static struct mem_cgroup *mem_cgroup_alloc(void) 4245{ 4246 struct mem_cgroup *mem; 4247 int size = sizeof(struct mem_cgroup); 4248 4249 /* Can be very big if MAX_NUMNODES is very big */ 4250 if (size < PAGE_SIZE) 4251 mem = kmalloc(size, GFP_KERNEL); 4252 else 4253 mem = vmalloc(size); 4254 4255 if (!mem) 4256 return NULL; 4257 4258 memset(mem, 0, size); 4259 mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); 4260 if (!mem->stat) 4261 goto out_free; 4262 spin_lock_init(&mem->pcp_counter_lock); 4263 return mem; 4264 4265out_free: 4266 if (size < PAGE_SIZE) 4267 kfree(mem); 4268 else 4269 vfree(mem); 4270 return NULL; 4271} 4272 4273/* 4274 * At destroying mem_cgroup, references from swap_cgroup can remain. 4275 * (scanning all at force_empty is too costly...) 4276 * 4277 * Instead of clearing all references at force_empty, we remember 4278 * the number of reference from swap_cgroup and free mem_cgroup when 4279 * it goes down to 0. 
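 *
 * mem_cgroup_get()/__mem_cgroup_put() below implement that reference
 * count: swap entries still charged to this memcg at destroy time keep
 * the structure alive, and the final put also drops the reference that
 * mem_cgroup_create() takes on the parent when use_hierarchy is set.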
4280 * 4281 * Removal of cgroup itself succeeds regardless of refs from swap. 4282 */ 4283 4284static void __mem_cgroup_free(struct mem_cgroup *mem) 4285{ 4286 int node; 4287 4288 mem_cgroup_remove_from_trees(mem); 4289 free_css_id(&mem_cgroup_subsys, &mem->css); 4290 4291 for_each_node_state(node, N_POSSIBLE) 4292 free_mem_cgroup_per_zone_info(mem, node); 4293 4294 free_percpu(mem->stat); 4295 if (sizeof(struct mem_cgroup) < PAGE_SIZE) 4296 kfree(mem); 4297 else 4298 vfree(mem); 4299} 4300 4301static void mem_cgroup_get(struct mem_cgroup *mem) 4302{ 4303 atomic_inc(&mem->refcnt); 4304} 4305 4306static void __mem_cgroup_put(struct mem_cgroup *mem, int count) 4307{ 4308 if (atomic_sub_and_test(count, &mem->refcnt)) { 4309 struct mem_cgroup *parent = parent_mem_cgroup(mem); 4310 __mem_cgroup_free(mem); 4311 if (parent) 4312 mem_cgroup_put(parent); 4313 } 4314} 4315 4316static void mem_cgroup_put(struct mem_cgroup *mem) 4317{ 4318 __mem_cgroup_put(mem, 1); 4319} 4320 4321/* 4322 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. 4323 */ 4324static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem) 4325{ 4326 if (!mem->res.parent) 4327 return NULL; 4328 return mem_cgroup_from_res_counter(mem->res.parent, res); 4329} 4330 4331#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4332static void __init enable_swap_cgroup(void) 4333{ 4334 if (!mem_cgroup_disabled() && really_do_swap_account) 4335 do_swap_account = 1; 4336} 4337#else 4338static void __init enable_swap_cgroup(void) 4339{ 4340} 4341#endif 4342 4343static int mem_cgroup_soft_limit_tree_init(void) 4344{ 4345 struct mem_cgroup_tree_per_node *rtpn; 4346 struct mem_cgroup_tree_per_zone *rtpz; 4347 int tmp, node, zone; 4348 4349 for_each_node_state(node, N_POSSIBLE) { 4350 tmp = node; 4351 if (!node_state(node, N_NORMAL_MEMORY)) 4352 tmp = -1; 4353 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); 4354 if (!rtpn) 4355 return 1; 4356 4357 soft_limit_tree.rb_tree_per_node[node] = rtpn; 4358 4359 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4360 rtpz = &rtpn->rb_tree_per_zone[zone]; 4361 rtpz->rb_root = RB_ROOT; 4362 spin_lock_init(&rtpz->lock); 4363 } 4364 } 4365 return 0; 4366} 4367 4368static struct cgroup_subsys_state * __ref 4369mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) 4370{ 4371 struct mem_cgroup *mem, *parent; 4372 long error = -ENOMEM; 4373 int node; 4374 4375 mem = mem_cgroup_alloc(); 4376 if (!mem) 4377 return ERR_PTR(error); 4378 4379 for_each_node_state(node, N_POSSIBLE) 4380 if (alloc_mem_cgroup_per_zone_info(mem, node)) 4381 goto free_out; 4382 4383 /* root ? */ 4384 if (cont->parent == NULL) { 4385 int cpu; 4386 enable_swap_cgroup(); 4387 parent = NULL; 4388 root_mem_cgroup = mem; 4389 if (mem_cgroup_soft_limit_tree_init()) 4390 goto free_out; 4391 for_each_possible_cpu(cpu) { 4392 struct memcg_stock_pcp *stock = 4393 &per_cpu(memcg_stock, cpu); 4394 INIT_WORK(&stock->work, drain_local_stock); 4395 } 4396 hotcpu_notifier(memcg_cpu_hotplug_callback, 0); 4397 } else { 4398 parent = mem_cgroup_from_cont(cont->parent); 4399 mem->use_hierarchy = parent->use_hierarchy; 4400 mem->oom_kill_disable = parent->oom_kill_disable; 4401 } 4402 4403 if (parent && parent->use_hierarchy) { 4404 res_counter_init(&mem->res, &parent->res); 4405 res_counter_init(&mem->memsw, &parent->memsw); 4406 /* 4407 * We increment refcnt of the parent to ensure that we can 4408 * safely access it on res_counter_charge/uncharge. 
4409 * This refcnt will be decremented when freeing this 4410 * mem_cgroup(see mem_cgroup_put). 4411 */ 4412 mem_cgroup_get(parent); 4413 } else { 4414 res_counter_init(&mem->res, NULL); 4415 res_counter_init(&mem->memsw, NULL); 4416 } 4417 mem->last_scanned_child = 0; 4418 spin_lock_init(&mem->reclaim_param_lock); 4419 INIT_LIST_HEAD(&mem->oom_notify); 4420 4421 if (parent) 4422 mem->swappiness = get_swappiness(parent); 4423 atomic_set(&mem->refcnt, 1); 4424 mem->move_charge_at_immigrate = 0; 4425 mutex_init(&mem->thresholds_lock); 4426 return &mem->css; 4427free_out: 4428 __mem_cgroup_free(mem); 4429 root_mem_cgroup = NULL; 4430 return ERR_PTR(error); 4431} 4432 4433static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss, 4434 struct cgroup *cont) 4435{ 4436 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 4437 4438 return mem_cgroup_force_empty(mem, false); 4439} 4440 4441static void mem_cgroup_destroy(struct cgroup_subsys *ss, 4442 struct cgroup *cont) 4443{ 4444 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 4445 4446 mem_cgroup_put(mem); 4447} 4448 4449static int mem_cgroup_populate(struct cgroup_subsys *ss, 4450 struct cgroup *cont) 4451{ 4452 int ret; 4453 4454 ret = cgroup_add_files(cont, ss, mem_cgroup_files, 4455 ARRAY_SIZE(mem_cgroup_files)); 4456 4457 if (!ret) 4458 ret = register_memsw_files(cont, ss); 4459 return ret; 4460} 4461 4462#ifdef CONFIG_MMU 4463/* Handlers for move charge at task migration. */ 4464#define PRECHARGE_COUNT_AT_ONCE 256 4465static int mem_cgroup_do_precharge(unsigned long count) 4466{ 4467 int ret = 0; 4468 int batch_count = PRECHARGE_COUNT_AT_ONCE; 4469 struct mem_cgroup *mem = mc.to; 4470 4471 if (mem_cgroup_is_root(mem)) { 4472 mc.precharge += count; 4473 /* we don't need css_get for root */ 4474 return ret; 4475 } 4476 /* try to charge at once */ 4477 if (count > 1) { 4478 struct res_counter *dummy; 4479 /* 4480 * "mem" cannot be under rmdir() because we've already checked 4481 * by cgroup_lock_live_cgroup() that it is not removed and we 4482 * are still under the same cgroup_mutex. So we can postpone 4483 * css_get(). 4484 */ 4485 if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy)) 4486 goto one_by_one; 4487 if (do_swap_account && res_counter_charge(&mem->memsw, 4488 PAGE_SIZE * count, &dummy)) { 4489 res_counter_uncharge(&mem->res, PAGE_SIZE * count); 4490 goto one_by_one; 4491 } 4492 mc.precharge += count; 4493 return ret; 4494 } 4495one_by_one: 4496 /* fall back to one by one charge */ 4497 while (count--) { 4498 if (signal_pending(current)) { 4499 ret = -EINTR; 4500 break; 4501 } 4502 if (!batch_count--) { 4503 batch_count = PRECHARGE_COUNT_AT_ONCE; 4504 cond_resched(); 4505 } 4506 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, 4507 PAGE_SIZE); 4508 if (ret || !mem) 4509 /* mem_cgroup_clear_mc() will do uncharge later */ 4510 return -ENOMEM; 4511 mc.precharge++; 4512 } 4513 return ret; 4514} 4515 4516/** 4517 * is_target_pte_for_mc - check a pte whether it is valid for move charge 4518 * @vma: the vma the pte to be checked belongs 4519 * @addr: the address corresponding to the pte to be checked 4520 * @ptent: the pte to be checked 4521 * @target: the pointer the target page or swap ent will be stored(can be NULL) 4522 * 4523 * Returns 4524 * 0(MC_TARGET_NONE): if the pte is not a target for move charge. 4525 * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for 4526 * move charge. 
if @target is not NULL, the page is stored in target->page 4527 * with extra refcnt got(Callers should handle it). 4528 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a 4529 * target for charge migration. if @target is not NULL, the entry is stored 4530 * in target->ent. 4531 * 4532 * Called with pte lock held. 4533 */ 4534union mc_target { 4535 struct page *page; 4536 swp_entry_t ent; 4537}; 4538 4539enum mc_target_type { 4540 MC_TARGET_NONE, /* not used */ 4541 MC_TARGET_PAGE, 4542 MC_TARGET_SWAP, 4543}; 4544 4545static struct page *mc_handle_present_pte(struct vm_area_struct *vma, 4546 unsigned long addr, pte_t ptent) 4547{ 4548 struct page *page = vm_normal_page(vma, addr, ptent); 4549 4550 if (!page || !page_mapped(page)) 4551 return NULL; 4552 if (PageAnon(page)) { 4553 /* we don't move shared anon */ 4554 if (!move_anon() || page_mapcount(page) > 2) 4555 return NULL; 4556 } else if (!move_file()) 4557 /* we ignore mapcount for file pages */ 4558 return NULL; 4559 if (!get_page_unless_zero(page)) 4560 return NULL; 4561 4562 return page; 4563} 4564 4565static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 4566 unsigned long addr, pte_t ptent, swp_entry_t *entry) 4567{ 4568 int usage_count; 4569 struct page *page = NULL; 4570 swp_entry_t ent = pte_to_swp_entry(ptent); 4571 4572 if (!move_anon() || non_swap_entry(ent)) 4573 return NULL; 4574 usage_count = mem_cgroup_count_swap_user(ent, &page); 4575 if (usage_count > 1) { /* we don't move shared anon */ 4576 if (page) 4577 put_page(page); 4578 return NULL; 4579 } 4580 if (do_swap_account) 4581 entry->val = ent.val; 4582 4583 return page; 4584} 4585 4586static struct page *mc_handle_file_pte(struct vm_area_struct *vma, 4587 unsigned long addr, pte_t ptent, swp_entry_t *entry) 4588{ 4589 struct page *page = NULL; 4590 struct inode *inode; 4591 struct address_space *mapping; 4592 pgoff_t pgoff; 4593 4594 if (!vma->vm_file) /* anonymous vma */ 4595 return NULL; 4596 if (!move_file()) 4597 return NULL; 4598 4599 inode = vma->vm_file->f_path.dentry->d_inode; 4600 mapping = vma->vm_file->f_mapping; 4601 if (pte_none(ptent)) 4602 pgoff = linear_page_index(vma, addr); 4603 else /* pte_file(ptent) is true */ 4604 pgoff = pte_to_pgoff(ptent); 4605 4606 /* page is moved even if it's not RSS of this task(page-faulted). */ 4607 if (!mapping_cap_swap_backed(mapping)) { /* normal file */ 4608 page = find_get_page(mapping, pgoff); 4609 } else { /* shmem/tmpfs file. we should take account of swap too. */ 4610 swp_entry_t ent; 4611 mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent); 4612 if (do_swap_account) 4613 entry->val = ent.val; 4614 } 4615 4616 return page; 4617} 4618 4619static int is_target_pte_for_mc(struct vm_area_struct *vma, 4620 unsigned long addr, pte_t ptent, union mc_target *target) 4621{ 4622 struct page *page = NULL; 4623 struct page_cgroup *pc; 4624 int ret = 0; 4625 swp_entry_t ent = { .val = 0 }; 4626 4627 if (pte_present(ptent)) 4628 page = mc_handle_present_pte(vma, addr, ptent); 4629 else if (is_swap_pte(ptent)) 4630 page = mc_handle_swap_pte(vma, addr, ptent, &ent); 4631 else if (pte_none(ptent) || pte_file(ptent)) 4632 page = mc_handle_file_pte(vma, addr, ptent, &ent); 4633 4634 if (!page && !ent.val) 4635 return 0; 4636 if (page) { 4637 pc = lookup_page_cgroup(page); 4638 /* 4639 * Do only loose check w/o page_cgroup lock. 4640 * mem_cgroup_move_account() checks the pc is valid or not under 4641 * the lock. 
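 *
 * If the page is uncharged, or charged to some other memcg, by the
 * time mem_cgroup_move_account() takes that lock, the move simply
 * fails and the pte is skipped; the loose check here only avoids
 * taking the lock for pages that clearly do not belong to mc.from.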
4642 */ 4643 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { 4644 ret = MC_TARGET_PAGE; 4645 if (target) 4646 target->page = page; 4647 } 4648 if (!ret || !target) 4649 put_page(page); 4650 } 4651 /* There is a swap entry and a page doesn't exist or isn't charged */ 4652 if (ent.val && !ret && 4653 css_id(&mc.from->css) == lookup_swap_cgroup(ent)) { 4654 ret = MC_TARGET_SWAP; 4655 if (target) 4656 target->ent = ent; 4657 } 4658 return ret; 4659} 4660 4661static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, 4662 unsigned long addr, unsigned long end, 4663 struct mm_walk *walk) 4664{ 4665 struct vm_area_struct *vma = walk->private; 4666 pte_t *pte; 4667 spinlock_t *ptl; 4668 4669 VM_BUG_ON(pmd_trans_huge(*pmd)); 4670 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 4671 for (; addr != end; pte++, addr += PAGE_SIZE) 4672 if (is_target_pte_for_mc(vma, addr, *pte, NULL)) 4673 mc.precharge++; /* increment precharge temporarily */ 4674 pte_unmap_unlock(pte - 1, ptl); 4675 cond_resched(); 4676 4677 return 0; 4678} 4679 4680static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) 4681{ 4682 unsigned long precharge; 4683 struct vm_area_struct *vma; 4684 4685 /* We've already held the mmap_sem */ 4686 for (vma = mm->mmap; vma; vma = vma->vm_next) { 4687 struct mm_walk mem_cgroup_count_precharge_walk = { 4688 .pmd_entry = mem_cgroup_count_precharge_pte_range, 4689 .mm = mm, 4690 .private = vma, 4691 }; 4692 if (is_vm_hugetlb_page(vma)) 4693 continue; 4694 walk_page_range(vma->vm_start, vma->vm_end, 4695 &mem_cgroup_count_precharge_walk); 4696 } 4697 4698 precharge = mc.precharge; 4699 mc.precharge = 0; 4700 4701 return precharge; 4702} 4703 4704static int mem_cgroup_precharge_mc(struct mm_struct *mm) 4705{ 4706 return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm)); 4707} 4708 4709static void mem_cgroup_clear_mc(void) 4710{ 4711 struct mem_cgroup *from = mc.from; 4712 struct mem_cgroup *to = mc.to; 4713 4714 /* we must uncharge all the leftover precharges from mc.to */ 4715 if (mc.precharge) { 4716 __mem_cgroup_cancel_charge(mc.to, mc.precharge); 4717 mc.precharge = 0; 4718 } 4719 /* 4720 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so 4721 * we must uncharge here. 4722 */ 4723 if (mc.moved_charge) { 4724 __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); 4725 mc.moved_charge = 0; 4726 } 4727 /* we must fixup refcnts and charges */ 4728 if (mc.moved_swap) { 4729 /* uncharge swap account from the old cgroup */ 4730 if (!mem_cgroup_is_root(mc.from)) 4731 res_counter_uncharge(&mc.from->memsw, 4732 PAGE_SIZE * mc.moved_swap); 4733 __mem_cgroup_put(mc.from, mc.moved_swap); 4734 4735 if (!mem_cgroup_is_root(mc.to)) { 4736 /* 4737 * we charged both to->res and to->memsw, so we should 4738 * uncharge to->res. 
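 *
 * The precharge done in mem_cgroup_do_precharge() charged one page to
 * both counters for every entry, but what mc.to actually gained here
 * is swapped-out usage only, so the memsw half of the precharge is
 * the one that stays.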
4739 */ 4740 res_counter_uncharge(&mc.to->res, 4741 PAGE_SIZE * mc.moved_swap); 4742 } 4743 /* we've already done mem_cgroup_get(mc.to) */ 4744 4745 mc.moved_swap = 0; 4746 } 4747 if (mc.mm) { 4748 up_read(&mc.mm->mmap_sem); 4749 mmput(mc.mm); 4750 } 4751 spin_lock(&mc.lock); 4752 mc.from = NULL; 4753 mc.to = NULL; 4754 spin_unlock(&mc.lock); 4755 mc.moving_task = NULL; 4756 mc.mm = NULL; 4757 mem_cgroup_end_move(from); 4758 memcg_oom_recover(from); 4759 memcg_oom_recover(to); 4760 wake_up_all(&mc.waitq); 4761} 4762 4763static int mem_cgroup_can_attach(struct cgroup_subsys *ss, 4764 struct cgroup *cgroup, 4765 struct task_struct *p, 4766 bool threadgroup) 4767{ 4768 int ret = 0; 4769 struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup); 4770 4771 if (mem->move_charge_at_immigrate) { 4772 struct mm_struct *mm; 4773 struct mem_cgroup *from = mem_cgroup_from_task(p); 4774 4775 VM_BUG_ON(from == mem); 4776 4777 mm = get_task_mm(p); 4778 if (!mm) 4779 return 0; 4780 /* We move charges only when we move a owner of the mm */ 4781 if (mm->owner == p) { 4782 /* 4783 * We do all the move charge works under one mmap_sem to 4784 * avoid deadlock with down_write(&mmap_sem) 4785 * -> try_charge() -> if (mc.moving_task) -> sleep. 4786 */ 4787 down_read(&mm->mmap_sem); 4788 4789 VM_BUG_ON(mc.from); 4790 VM_BUG_ON(mc.to); 4791 VM_BUG_ON(mc.precharge); 4792 VM_BUG_ON(mc.moved_charge); 4793 VM_BUG_ON(mc.moved_swap); 4794 VM_BUG_ON(mc.moving_task); 4795 VM_BUG_ON(mc.mm); 4796 4797 mem_cgroup_start_move(from); 4798 spin_lock(&mc.lock); 4799 mc.from = from; 4800 mc.to = mem; 4801 mc.precharge = 0; 4802 mc.moved_charge = 0; 4803 mc.moved_swap = 0; 4804 spin_unlock(&mc.lock); 4805 mc.moving_task = current; 4806 mc.mm = mm; 4807 4808 ret = mem_cgroup_precharge_mc(mm); 4809 if (ret) 4810 mem_cgroup_clear_mc(); 4811 /* We call up_read() and mmput() in clear_mc(). */ 4812 } else 4813 mmput(mm); 4814 } 4815 return ret; 4816} 4817 4818static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, 4819 struct cgroup *cgroup, 4820 struct task_struct *p, 4821 bool threadgroup) 4822{ 4823 mem_cgroup_clear_mc(); 4824} 4825 4826static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, 4827 unsigned long addr, unsigned long end, 4828 struct mm_walk *walk) 4829{ 4830 int ret = 0; 4831 struct vm_area_struct *vma = walk->private; 4832 pte_t *pte; 4833 spinlock_t *ptl; 4834 4835retry: 4836 VM_BUG_ON(pmd_trans_huge(*pmd)); 4837 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 4838 for (; addr != end; addr += PAGE_SIZE) { 4839 pte_t ptent = *(pte++); 4840 union mc_target target; 4841 int type; 4842 struct page *page; 4843 struct page_cgroup *pc; 4844 swp_entry_t ent; 4845 4846 if (!mc.precharge) 4847 break; 4848 4849 type = is_target_pte_for_mc(vma, addr, ptent, &target); 4850 switch (type) { 4851 case MC_TARGET_PAGE: 4852 page = target.page; 4853 if (isolate_lru_page(page)) 4854 goto put; 4855 pc = lookup_page_cgroup(page); 4856 if (!mem_cgroup_move_account(pc, 4857 mc.from, mc.to, false)) { 4858 mc.precharge--; 4859 /* we uncharge from mc.from later. */ 4860 mc.moved_charge++; 4861 } 4862 putback_lru_page(page); 4863put: /* is_target_pte_for_mc() gets the page */ 4864 put_page(page); 4865 break; 4866 case MC_TARGET_SWAP: 4867 ent = target.ent; 4868 if (!mem_cgroup_move_swap_account(ent, 4869 mc.from, mc.to, false)) { 4870 mc.precharge--; 4871 /* we fixup refcnts and charges later. 
*/ 4872 mc.moved_swap++; 4873 } 4874 break; 4875 default: 4876 break; 4877 } 4878 } 4879 pte_unmap_unlock(pte - 1, ptl); 4880 cond_resched(); 4881 4882 if (addr != end) { 4883 /* 4884 * We have consumed all precharges we got in can_attach(). 4885 * We try charge one by one, but don't do any additional 4886 * charges to mc.to if we have failed in charge once in attach() 4887 * phase. 4888 */ 4889 ret = mem_cgroup_do_precharge(1); 4890 if (!ret) 4891 goto retry; 4892 } 4893 4894 return ret; 4895} 4896 4897static void mem_cgroup_move_charge(struct mm_struct *mm) 4898{ 4899 struct vm_area_struct *vma; 4900 4901 lru_add_drain_all(); 4902 /* We've already held the mmap_sem */ 4903 for (vma = mm->mmap; vma; vma = vma->vm_next) { 4904 int ret; 4905 struct mm_walk mem_cgroup_move_charge_walk = { 4906 .pmd_entry = mem_cgroup_move_charge_pte_range, 4907 .mm = mm, 4908 .private = vma, 4909 }; 4910 if (is_vm_hugetlb_page(vma)) 4911 continue; 4912 ret = walk_page_range(vma->vm_start, vma->vm_end, 4913 &mem_cgroup_move_charge_walk); 4914 if (ret) 4915 /* 4916 * means we have consumed all precharges and failed in 4917 * doing additional charge. Just abandon here. 4918 */ 4919 break; 4920 } 4921} 4922 4923static void mem_cgroup_move_task(struct cgroup_subsys *ss, 4924 struct cgroup *cont, 4925 struct cgroup *old_cont, 4926 struct task_struct *p, 4927 bool threadgroup) 4928{ 4929 if (!mc.mm) 4930 /* no need to move charge */ 4931 return; 4932 4933 mem_cgroup_move_charge(mc.mm); 4934 mem_cgroup_clear_mc(); 4935} 4936#else /* !CONFIG_MMU */ 4937static int mem_cgroup_can_attach(struct cgroup_subsys *ss, 4938 struct cgroup *cgroup, 4939 struct task_struct *p, 4940 bool threadgroup) 4941{ 4942 return 0; 4943} 4944static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, 4945 struct cgroup *cgroup, 4946 struct task_struct *p, 4947 bool threadgroup) 4948{ 4949} 4950static void mem_cgroup_move_task(struct cgroup_subsys *ss, 4951 struct cgroup *cont, 4952 struct cgroup *old_cont, 4953 struct task_struct *p, 4954 bool threadgroup) 4955{ 4956} 4957#endif 4958 4959struct cgroup_subsys mem_cgroup_subsys = { 4960 .name = "memory", 4961 .subsys_id = mem_cgroup_subsys_id, 4962 .create = mem_cgroup_create, 4963 .pre_destroy = mem_cgroup_pre_destroy, 4964 .destroy = mem_cgroup_destroy, 4965 .populate = mem_cgroup_populate, 4966 .can_attach = mem_cgroup_can_attach, 4967 .cancel_attach = mem_cgroup_cancel_attach, 4968 .attach = mem_cgroup_move_task, 4969 .early_init = 0, 4970 .use_id = 1, 4971}; 4972 4973#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4974static int __init enable_swap_account(char *s) 4975{ 4976 /* consider enabled if no parameter or 1 is given */ 4977 if (!s || !strcmp(s, "1")) 4978 really_do_swap_account = 1; 4979 else if (!strcmp(s, "0")) 4980 really_do_swap_account = 0; 4981 return 1; 4982} 4983__setup("swapaccount", enable_swap_account); 4984 4985static int __init disable_swap_account(char *s) 4986{ 4987 enable_swap_account("0"); 4988 return 1; 4989} 4990__setup("noswapaccount", disable_swap_account); 4991#endif 4992
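/*
 * Illustrative userspace sketch (not part of memcontrol.c): one way the
 * threshold notification interface implemented above is typically
 * consumed under a cgroup v1 "memory" hierarchy.  The mount point and
 * group name below are assumptions made for the example, and error
 * handling is omitted.
 *
 *	#include <stdint.h>
 *	#include <stdio.h>
 *	#include <string.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *	#include <sys/eventfd.h>
 *
 *	int main(void)
 *	{
 *		char cmd[64];
 *		uint64_t hits;
 *		int efd = eventfd(0, 0);
 *		int ufd = open("/cgroup/memory/grp/memory.usage_in_bytes",
 *			       O_RDONLY);
 *		int cfd = open("/cgroup/memory/grp/cgroup.event_control",
 *			       O_WRONLY);
 *
 *		// "<event_fd> <fd of memory.usage_in_bytes> <threshold>"
 *		snprintf(cmd, sizeof(cmd), "%d %d 50M", efd, ufd);
 *		write(cfd, cmd, strlen(cmd));
 *
 *		// blocks until usage crosses 50M in either direction
 *		read(efd, &hits, sizeof(hits));
 *		printf("threshold crossed %llu time(s)\n",
 *		       (unsigned long long)hits);
 *		return 0;
 *	}
 *
 * mem_cgroup_usage_register_event() parses the "50M" argument with
 * res_counter_memparse_write_strategy(), and __mem_cgroup_threshold()
 * signals the eventfd whenever usage crosses a registered threshold.
 */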