memcontrol.c revision 315c1998e10527ff364a9883048455e609bc7232
1/* memcontrol.c - Memory Controller 2 * 3 * Copyright IBM Corporation, 2007 4 * Author Balbir Singh <balbir@linux.vnet.ibm.com> 5 * 6 * Copyright 2007 OpenVZ SWsoft Inc 7 * Author: Pavel Emelianov <xemul@openvz.org> 8 * 9 * Memory thresholds 10 * Copyright (C) 2009 Nokia Corporation 11 * Author: Kirill A. Shutemov 12 * 13 * This program is free software; you can redistribute it and/or modify 14 * it under the terms of the GNU General Public License as published by 15 * the Free Software Foundation; either version 2 of the License, or 16 * (at your option) any later version. 17 * 18 * This program is distributed in the hope that it will be useful, 19 * but WITHOUT ANY WARRANTY; without even the implied warranty of 20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 21 * GNU General Public License for more details. 22 */ 23 24#include <linux/res_counter.h> 25#include <linux/memcontrol.h> 26#include <linux/cgroup.h> 27#include <linux/mm.h> 28#include <linux/hugetlb.h> 29#include <linux/pagemap.h> 30#include <linux/smp.h> 31#include <linux/page-flags.h> 32#include <linux/backing-dev.h> 33#include <linux/bit_spinlock.h> 34#include <linux/rcupdate.h> 35#include <linux/limits.h> 36#include <linux/mutex.h> 37#include <linux/rbtree.h> 38#include <linux/slab.h> 39#include <linux/swap.h> 40#include <linux/swapops.h> 41#include <linux/spinlock.h> 42#include <linux/eventfd.h> 43#include <linux/sort.h> 44#include <linux/fs.h> 45#include <linux/seq_file.h> 46#include <linux/vmalloc.h> 47#include <linux/mm_inline.h> 48#include <linux/page_cgroup.h> 49#include <linux/cpu.h> 50#include "internal.h" 51 52#include <asm/uaccess.h> 53 54struct cgroup_subsys mem_cgroup_subsys __read_mostly; 55#define MEM_CGROUP_RECLAIM_RETRIES 5 56struct mem_cgroup *root_mem_cgroup __read_mostly; 57 58#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 59/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ 60int do_swap_account __read_mostly; 61static int really_do_swap_account __initdata = 1; /* for remember boot option*/ 62#else 63#define do_swap_account (0) 64#endif 65 66/* 67 * Per memcg event counter is incremented at every pagein/pageout. This counter 68 * is used for trigger some periodic events. This is straightforward and better 69 * than using jiffies etc. to handle periodic memcg event. 70 * 71 * These values will be used as !((event) & ((1 <<(thresh)) - 1)) 72 */ 73#define THRESHOLDS_EVENTS_THRESH (7) /* once in 128 */ 74#define SOFTLIMIT_EVENTS_THRESH (10) /* once in 1024 */ 75 76/* 77 * Statistics for memory cgroup. 78 */ 79enum mem_cgroup_stat_index { 80 /* 81 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. 82 */ 83 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ 84 MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ 85 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ 86 MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ 87 MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ 88 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ 89 MEM_CGROUP_EVENTS, /* incremented at every pagein/pageout */ 90 91 MEM_CGROUP_STAT_NSTATS, 92}; 93 94struct mem_cgroup_stat_cpu { 95 s64 count[MEM_CGROUP_STAT_NSTATS]; 96}; 97 98/* 99 * per-zone information in memory controller. 
100 */ 101struct mem_cgroup_per_zone { 102 /* 103 * spin_lock to protect the per cgroup LRU 104 */ 105 struct list_head lists[NR_LRU_LISTS]; 106 unsigned long count[NR_LRU_LISTS]; 107 108 struct zone_reclaim_stat reclaim_stat; 109 struct rb_node tree_node; /* RB tree node */ 110 unsigned long long usage_in_excess;/* Set to the value by which */ 111 /* the soft limit is exceeded*/ 112 bool on_tree; 113 struct mem_cgroup *mem; /* Back pointer, we cannot */ 114 /* use container_of */ 115}; 116/* Macro for accessing counter */ 117#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) 118 119struct mem_cgroup_per_node { 120 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; 121}; 122 123struct mem_cgroup_lru_info { 124 struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES]; 125}; 126 127/* 128 * Cgroups above their limits are maintained in a RB-Tree, independent of 129 * their hierarchy representation 130 */ 131 132struct mem_cgroup_tree_per_zone { 133 struct rb_root rb_root; 134 spinlock_t lock; 135}; 136 137struct mem_cgroup_tree_per_node { 138 struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; 139}; 140 141struct mem_cgroup_tree { 142 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; 143}; 144 145static struct mem_cgroup_tree soft_limit_tree __read_mostly; 146 147struct mem_cgroup_threshold { 148 struct eventfd_ctx *eventfd; 149 u64 threshold; 150}; 151 152/* For threshold */ 153struct mem_cgroup_threshold_ary { 154 /* An array index points to threshold just below usage. */ 155 int current_threshold; 156 /* Size of entries[] */ 157 unsigned int size; 158 /* Array of thresholds */ 159 struct mem_cgroup_threshold entries[0]; 160}; 161/* for OOM */ 162struct mem_cgroup_eventfd_list { 163 struct list_head list; 164 struct eventfd_ctx *eventfd; 165}; 166 167static void mem_cgroup_threshold(struct mem_cgroup *mem); 168static void mem_cgroup_oom_notify(struct mem_cgroup *mem); 169 170/* 171 * The memory controller data structure. The memory controller controls both 172 * page cache and RSS per cgroup. We would eventually like to provide 173 * statistics based on the statistics developed by Rik Van Riel for clock-pro, 174 * to help the administrator determine what knobs to tune. 175 * 176 * TODO: Add a water mark for the memory controller. Reclaim will begin when 177 * we hit the water mark. May be even add a low water mark, such that 178 * no reclaim occurs from a cgroup at it's low water mark, this is 179 * a feature that will be implemented much later in the future. 180 */ 181struct mem_cgroup { 182 struct cgroup_subsys_state css; 183 /* 184 * the counter to account for memory usage 185 */ 186 struct res_counter res; 187 /* 188 * the counter to account for mem+swap usage. 189 */ 190 struct res_counter memsw; 191 /* 192 * Per cgroup active and inactive list, similar to the 193 * per zone LRU lists. 194 */ 195 struct mem_cgroup_lru_info info; 196 197 /* 198 protect against reclaim related member. 199 */ 200 spinlock_t reclaim_param_lock; 201 202 int prev_priority; /* for recording reclaim priority */ 203 204 /* 205 * While reclaiming in a hierarchy, we cache the last child we 206 * reclaimed from. 207 */ 208 int last_scanned_child; 209 /* 210 * Should the accounting and control be hierarchical, per subtree? 
 */
	bool use_hierarchy;
	atomic_t	oom_lock;
	atomic_t	refcnt;

	unsigned int	swappiness;
	/* OOM-Killer disable */
	int		oom_kill_disable;

	/* set when res.limit == memsw.limit */
	bool		memsw_is_minimum;

	/* protect arrays of thresholds */
	struct mutex thresholds_lock;

	/* thresholds for memory usage. RCU-protected */
	struct mem_cgroup_threshold_ary *thresholds;

	/* thresholds for mem+swap usage. RCU-protected */
	struct mem_cgroup_threshold_ary *memsw_thresholds;

	/* For oom notifier event fd */
	struct list_head oom_notify;

	/*
	 * Should we move charges of a task when the task is moved into this
	 * mem_cgroup? And what type of charges should we move?
	 */
	unsigned long	move_charge_at_immigrate;
	/*
	 * percpu counter.
	 */
	struct mem_cgroup_stat_cpu *stat;
};

/* Stuff for moving charges at task migration. */
/*
 * Types of charges to be moved. "move_charge_at_immigrate" is treated as a
 * left-shifted bitmap of these types.
 */
enum move_type {
	MOVE_CHARGE_TYPE_ANON,	/* private anonymous page and swap of it */
	MOVE_CHARGE_TYPE_FILE,	/* file page (including tmpfs) and swap of it */
	NR_MOVE_TYPE,
};

/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
	struct mem_cgroup *from;
	struct mem_cgroup *to;
	unsigned long precharge;
	unsigned long moved_charge;
	unsigned long moved_swap;
	struct task_struct *moving_task;	/* a task moving charges */
	wait_queue_head_t waitq;		/* a waitq for other context */
} mc = {
	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};

static bool move_anon(void)
{
	return test_bit(MOVE_CHARGE_TYPE_ANON,
					&mc.to->move_charge_at_immigrate);
}

static bool move_file(void)
{
	return test_bit(MOVE_CHARGE_TYPE_FILE,
					&mc.to->move_charge_at_immigrate);
}

/*
 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
 * limit reclaim to prevent infinite loops, if they ever occur.
 */
#define	MEM_CGROUP_MAX_RECLAIM_LOOPS		(100)
#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	(2)

enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
	MEM_CGROUP_CHARGE_TYPE_MAPPED,
	MEM_CGROUP_CHARGE_TYPE_SHMEM,	/* used by page migration of shmem */
	MEM_CGROUP_CHARGE_TYPE_FORCE,	/* used by force_empty */
	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
	MEM_CGROUP_CHARGE_TYPE_DROP,	/* a page was unused swap cache */
	NR_CHARGE_TYPE,
};

/* only used here (for easy reading)
*/ 300#define PCGF_CACHE (1UL << PCG_CACHE) 301#define PCGF_USED (1UL << PCG_USED) 302#define PCGF_LOCK (1UL << PCG_LOCK) 303/* Not used, but added here for completeness */ 304#define PCGF_ACCT (1UL << PCG_ACCT) 305 306/* for encoding cft->private value on file */ 307#define _MEM (0) 308#define _MEMSWAP (1) 309#define _OOM_TYPE (2) 310#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) 311#define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) 312#define MEMFILE_ATTR(val) ((val) & 0xffff) 313/* Used for OOM nofiier */ 314#define OOM_CONTROL (0) 315 316/* 317 * Reclaim flags for mem_cgroup_hierarchical_reclaim 318 */ 319#define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0 320#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT) 321#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 322#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) 323#define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2 324#define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT) 325 326static void mem_cgroup_get(struct mem_cgroup *mem); 327static void mem_cgroup_put(struct mem_cgroup *mem); 328static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); 329static void drain_all_stock_async(void); 330 331static struct mem_cgroup_per_zone * 332mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) 333{ 334 return &mem->info.nodeinfo[nid]->zoneinfo[zid]; 335} 336 337struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem) 338{ 339 return &mem->css; 340} 341 342static struct mem_cgroup_per_zone * 343page_cgroup_zoneinfo(struct page_cgroup *pc) 344{ 345 struct mem_cgroup *mem = pc->mem_cgroup; 346 int nid = page_cgroup_nid(pc); 347 int zid = page_cgroup_zid(pc); 348 349 if (!mem) 350 return NULL; 351 352 return mem_cgroup_zoneinfo(mem, nid, zid); 353} 354 355static struct mem_cgroup_tree_per_zone * 356soft_limit_tree_node_zone(int nid, int zid) 357{ 358 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; 359} 360 361static struct mem_cgroup_tree_per_zone * 362soft_limit_tree_from_page(struct page *page) 363{ 364 int nid = page_to_nid(page); 365 int zid = page_zonenum(page); 366 367 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; 368} 369 370static void 371__mem_cgroup_insert_exceeded(struct mem_cgroup *mem, 372 struct mem_cgroup_per_zone *mz, 373 struct mem_cgroup_tree_per_zone *mctz, 374 unsigned long long new_usage_in_excess) 375{ 376 struct rb_node **p = &mctz->rb_root.rb_node; 377 struct rb_node *parent = NULL; 378 struct mem_cgroup_per_zone *mz_node; 379 380 if (mz->on_tree) 381 return; 382 383 mz->usage_in_excess = new_usage_in_excess; 384 if (!mz->usage_in_excess) 385 return; 386 while (*p) { 387 parent = *p; 388 mz_node = rb_entry(parent, struct mem_cgroup_per_zone, 389 tree_node); 390 if (mz->usage_in_excess < mz_node->usage_in_excess) 391 p = &(*p)->rb_left; 392 /* 393 * We can't avoid mem cgroups that are over their soft 394 * limit by the same amount 395 */ 396 else if (mz->usage_in_excess >= mz_node->usage_in_excess) 397 p = &(*p)->rb_right; 398 } 399 rb_link_node(&mz->tree_node, parent, p); 400 rb_insert_color(&mz->tree_node, &mctz->rb_root); 401 mz->on_tree = true; 402} 403 404static void 405__mem_cgroup_remove_exceeded(struct mem_cgroup *mem, 406 struct mem_cgroup_per_zone *mz, 407 struct mem_cgroup_tree_per_zone *mctz) 408{ 409 if (!mz->on_tree) 410 return; 411 rb_erase(&mz->tree_node, &mctz->rb_root); 412 mz->on_tree = false; 413} 414 415static void 416mem_cgroup_remove_exceeded(struct mem_cgroup *mem, 417 struct mem_cgroup_per_zone 
*mz, 418 struct mem_cgroup_tree_per_zone *mctz) 419{ 420 spin_lock(&mctz->lock); 421 __mem_cgroup_remove_exceeded(mem, mz, mctz); 422 spin_unlock(&mctz->lock); 423} 424 425 426static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) 427{ 428 unsigned long long excess; 429 struct mem_cgroup_per_zone *mz; 430 struct mem_cgroup_tree_per_zone *mctz; 431 int nid = page_to_nid(page); 432 int zid = page_zonenum(page); 433 mctz = soft_limit_tree_from_page(page); 434 435 /* 436 * Necessary to update all ancestors when hierarchy is used. 437 * because their event counter is not touched. 438 */ 439 for (; mem; mem = parent_mem_cgroup(mem)) { 440 mz = mem_cgroup_zoneinfo(mem, nid, zid); 441 excess = res_counter_soft_limit_excess(&mem->res); 442 /* 443 * We have to update the tree if mz is on RB-tree or 444 * mem is over its softlimit. 445 */ 446 if (excess || mz->on_tree) { 447 spin_lock(&mctz->lock); 448 /* if on-tree, remove it */ 449 if (mz->on_tree) 450 __mem_cgroup_remove_exceeded(mem, mz, mctz); 451 /* 452 * Insert again. mz->usage_in_excess will be updated. 453 * If excess is 0, no tree ops. 454 */ 455 __mem_cgroup_insert_exceeded(mem, mz, mctz, excess); 456 spin_unlock(&mctz->lock); 457 } 458 } 459} 460 461static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem) 462{ 463 int node, zone; 464 struct mem_cgroup_per_zone *mz; 465 struct mem_cgroup_tree_per_zone *mctz; 466 467 for_each_node_state(node, N_POSSIBLE) { 468 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 469 mz = mem_cgroup_zoneinfo(mem, node, zone); 470 mctz = soft_limit_tree_node_zone(node, zone); 471 mem_cgroup_remove_exceeded(mem, mz, mctz); 472 } 473 } 474} 475 476static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem) 477{ 478 return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT; 479} 480 481static struct mem_cgroup_per_zone * 482__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) 483{ 484 struct rb_node *rightmost = NULL; 485 struct mem_cgroup_per_zone *mz; 486 487retry: 488 mz = NULL; 489 rightmost = rb_last(&mctz->rb_root); 490 if (!rightmost) 491 goto done; /* Nothing to reclaim from */ 492 493 mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); 494 /* 495 * Remove the node now but someone else can add it back, 496 * we will to add it back at the end of reclaim to its correct 497 * position in the tree. 498 */ 499 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); 500 if (!res_counter_soft_limit_excess(&mz->mem->res) || 501 !css_tryget(&mz->mem->css)) 502 goto retry; 503done: 504 return mz; 505} 506 507static struct mem_cgroup_per_zone * 508mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) 509{ 510 struct mem_cgroup_per_zone *mz; 511 512 spin_lock(&mctz->lock); 513 mz = __mem_cgroup_largest_soft_limit_node(mctz); 514 spin_unlock(&mctz->lock); 515 return mz; 516} 517 518static s64 mem_cgroup_read_stat(struct mem_cgroup *mem, 519 enum mem_cgroup_stat_index idx) 520{ 521 int cpu; 522 s64 val = 0; 523 524 for_each_possible_cpu(cpu) 525 val += per_cpu(mem->stat->count[idx], cpu); 526 return val; 527} 528 529static s64 mem_cgroup_local_usage(struct mem_cgroup *mem) 530{ 531 s64 ret; 532 533 ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); 534 ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); 535 return ret; 536} 537 538static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, 539 bool charge) 540{ 541 int val = (charge) ? 
1 : -1; 542 this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); 543} 544 545static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, 546 struct page_cgroup *pc, 547 bool charge) 548{ 549 int val = (charge) ? 1 : -1; 550 551 preempt_disable(); 552 553 if (PageCgroupCache(pc)) 554 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], val); 555 else 556 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], val); 557 558 if (charge) 559 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]); 560 else 561 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]); 562 __this_cpu_inc(mem->stat->count[MEM_CGROUP_EVENTS]); 563 564 preempt_enable(); 565} 566 567static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, 568 enum lru_list idx) 569{ 570 int nid, zid; 571 struct mem_cgroup_per_zone *mz; 572 u64 total = 0; 573 574 for_each_online_node(nid) 575 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 576 mz = mem_cgroup_zoneinfo(mem, nid, zid); 577 total += MEM_CGROUP_ZSTAT(mz, idx); 578 } 579 return total; 580} 581 582static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift) 583{ 584 s64 val; 585 586 val = this_cpu_read(mem->stat->count[MEM_CGROUP_EVENTS]); 587 588 return !(val & ((1 << event_mask_shift) - 1)); 589} 590 591/* 592 * Check events in order. 593 * 594 */ 595static void memcg_check_events(struct mem_cgroup *mem, struct page *page) 596{ 597 /* threshold event is triggered in finer grain than soft limit */ 598 if (unlikely(__memcg_event_check(mem, THRESHOLDS_EVENTS_THRESH))) { 599 mem_cgroup_threshold(mem); 600 if (unlikely(__memcg_event_check(mem, SOFTLIMIT_EVENTS_THRESH))) 601 mem_cgroup_update_tree(mem, page); 602 } 603} 604 605static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) 606{ 607 return container_of(cgroup_subsys_state(cont, 608 mem_cgroup_subsys_id), struct mem_cgroup, 609 css); 610} 611 612struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) 613{ 614 /* 615 * mm_update_next_owner() may clear mm->owner to NULL 616 * if it races with swapoff, page migration, etc. 617 * So this can be called with p == NULL. 618 */ 619 if (unlikely(!p)) 620 return NULL; 621 622 return container_of(task_subsys_state(p, mem_cgroup_subsys_id), 623 struct mem_cgroup, css); 624} 625 626static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) 627{ 628 struct mem_cgroup *mem = NULL; 629 630 if (!mm) 631 return NULL; 632 /* 633 * Because we have no locks, mm->owner's may be being moved to other 634 * cgroup. We use css_tryget() here even if this looks 635 * pessimistic (rather than adding locks here). 636 */ 637 rcu_read_lock(); 638 do { 639 mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); 640 if (unlikely(!mem)) 641 break; 642 } while (!css_tryget(&mem->css)); 643 rcu_read_unlock(); 644 return mem; 645} 646 647/* 648 * Call callback function against all cgroup under hierarchy tree. 
649 */ 650static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data, 651 int (*func)(struct mem_cgroup *, void *)) 652{ 653 int found, ret, nextid; 654 struct cgroup_subsys_state *css; 655 struct mem_cgroup *mem; 656 657 if (!root->use_hierarchy) 658 return (*func)(root, data); 659 660 nextid = 1; 661 do { 662 ret = 0; 663 mem = NULL; 664 665 rcu_read_lock(); 666 css = css_get_next(&mem_cgroup_subsys, nextid, &root->css, 667 &found); 668 if (css && css_tryget(css)) 669 mem = container_of(css, struct mem_cgroup, css); 670 rcu_read_unlock(); 671 672 if (mem) { 673 ret = (*func)(mem, data); 674 css_put(&mem->css); 675 } 676 nextid = found + 1; 677 } while (!ret && css); 678 679 return ret; 680} 681 682static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) 683{ 684 return (mem == root_mem_cgroup); 685} 686 687/* 688 * Following LRU functions are allowed to be used without PCG_LOCK. 689 * Operations are called by routine of global LRU independently from memcg. 690 * What we have to take care of here is validness of pc->mem_cgroup. 691 * 692 * Changes to pc->mem_cgroup happens when 693 * 1. charge 694 * 2. moving account 695 * In typical case, "charge" is done before add-to-lru. Exception is SwapCache. 696 * It is added to LRU before charge. 697 * If PCG_USED bit is not set, page_cgroup is not added to this private LRU. 698 * When moving account, the page is not on LRU. It's isolated. 699 */ 700 701void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) 702{ 703 struct page_cgroup *pc; 704 struct mem_cgroup_per_zone *mz; 705 706 if (mem_cgroup_disabled()) 707 return; 708 pc = lookup_page_cgroup(page); 709 /* can happen while we handle swapcache. */ 710 if (!TestClearPageCgroupAcctLRU(pc)) 711 return; 712 VM_BUG_ON(!pc->mem_cgroup); 713 /* 714 * We don't check PCG_USED bit. It's cleared when the "page" is finally 715 * removed from global LRU. 716 */ 717 mz = page_cgroup_zoneinfo(pc); 718 MEM_CGROUP_ZSTAT(mz, lru) -= 1; 719 if (mem_cgroup_is_root(pc->mem_cgroup)) 720 return; 721 VM_BUG_ON(list_empty(&pc->lru)); 722 list_del_init(&pc->lru); 723 return; 724} 725 726void mem_cgroup_del_lru(struct page *page) 727{ 728 mem_cgroup_del_lru_list(page, page_lru(page)); 729} 730 731void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) 732{ 733 struct mem_cgroup_per_zone *mz; 734 struct page_cgroup *pc; 735 736 if (mem_cgroup_disabled()) 737 return; 738 739 pc = lookup_page_cgroup(page); 740 /* 741 * Used bit is set without atomic ops but after smp_wmb(). 742 * For making pc->mem_cgroup visible, insert smp_rmb() here. 743 */ 744 smp_rmb(); 745 /* unused or root page is not rotated. */ 746 if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup)) 747 return; 748 mz = page_cgroup_zoneinfo(pc); 749 list_move(&pc->lru, &mz->lists[lru]); 750} 751 752void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) 753{ 754 struct page_cgroup *pc; 755 struct mem_cgroup_per_zone *mz; 756 757 if (mem_cgroup_disabled()) 758 return; 759 pc = lookup_page_cgroup(page); 760 VM_BUG_ON(PageCgroupAcctLRU(pc)); 761 /* 762 * Used bit is set without atomic ops but after smp_wmb(). 763 * For making pc->mem_cgroup visible, insert smp_rmb() here. 
764 */ 765 smp_rmb(); 766 if (!PageCgroupUsed(pc)) 767 return; 768 769 mz = page_cgroup_zoneinfo(pc); 770 MEM_CGROUP_ZSTAT(mz, lru) += 1; 771 SetPageCgroupAcctLRU(pc); 772 if (mem_cgroup_is_root(pc->mem_cgroup)) 773 return; 774 list_add(&pc->lru, &mz->lists[lru]); 775} 776 777/* 778 * At handling SwapCache, pc->mem_cgroup may be changed while it's linked to 779 * lru because the page may.be reused after it's fully uncharged (because of 780 * SwapCache behavior).To handle that, unlink page_cgroup from LRU when charge 781 * it again. This function is only used to charge SwapCache. It's done under 782 * lock_page and expected that zone->lru_lock is never held. 783 */ 784static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page) 785{ 786 unsigned long flags; 787 struct zone *zone = page_zone(page); 788 struct page_cgroup *pc = lookup_page_cgroup(page); 789 790 spin_lock_irqsave(&zone->lru_lock, flags); 791 /* 792 * Forget old LRU when this page_cgroup is *not* used. This Used bit 793 * is guarded by lock_page() because the page is SwapCache. 794 */ 795 if (!PageCgroupUsed(pc)) 796 mem_cgroup_del_lru_list(page, page_lru(page)); 797 spin_unlock_irqrestore(&zone->lru_lock, flags); 798} 799 800static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page) 801{ 802 unsigned long flags; 803 struct zone *zone = page_zone(page); 804 struct page_cgroup *pc = lookup_page_cgroup(page); 805 806 spin_lock_irqsave(&zone->lru_lock, flags); 807 /* link when the page is linked to LRU but page_cgroup isn't */ 808 if (PageLRU(page) && !PageCgroupAcctLRU(pc)) 809 mem_cgroup_add_lru_list(page, page_lru(page)); 810 spin_unlock_irqrestore(&zone->lru_lock, flags); 811} 812 813 814void mem_cgroup_move_lists(struct page *page, 815 enum lru_list from, enum lru_list to) 816{ 817 if (mem_cgroup_disabled()) 818 return; 819 mem_cgroup_del_lru_list(page, from); 820 mem_cgroup_add_lru_list(page, to); 821} 822 823int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) 824{ 825 int ret; 826 struct mem_cgroup *curr = NULL; 827 828 task_lock(task); 829 rcu_read_lock(); 830 curr = try_get_mem_cgroup_from_mm(task->mm); 831 rcu_read_unlock(); 832 task_unlock(task); 833 if (!curr) 834 return 0; 835 /* 836 * We should check use_hierarchy of "mem" not "curr". Because checking 837 * use_hierarchy of "curr" here make this function true if hierarchy is 838 * enabled in "curr" and "curr" is a child of "mem" in *cgroup* 839 * hierarchy(even if use_hierarchy is disabled in "mem"). 840 */ 841 if (mem->use_hierarchy) 842 ret = css_is_ancestor(&curr->css, &mem->css); 843 else 844 ret = (curr == mem); 845 css_put(&curr->css); 846 return ret; 847} 848 849/* 850 * prev_priority control...this will be used in memory reclaim path. 
851 */ 852int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) 853{ 854 int prev_priority; 855 856 spin_lock(&mem->reclaim_param_lock); 857 prev_priority = mem->prev_priority; 858 spin_unlock(&mem->reclaim_param_lock); 859 860 return prev_priority; 861} 862 863void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority) 864{ 865 spin_lock(&mem->reclaim_param_lock); 866 if (priority < mem->prev_priority) 867 mem->prev_priority = priority; 868 spin_unlock(&mem->reclaim_param_lock); 869} 870 871void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority) 872{ 873 spin_lock(&mem->reclaim_param_lock); 874 mem->prev_priority = priority; 875 spin_unlock(&mem->reclaim_param_lock); 876} 877 878static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages) 879{ 880 unsigned long active; 881 unsigned long inactive; 882 unsigned long gb; 883 unsigned long inactive_ratio; 884 885 inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON); 886 active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON); 887 888 gb = (inactive + active) >> (30 - PAGE_SHIFT); 889 if (gb) 890 inactive_ratio = int_sqrt(10 * gb); 891 else 892 inactive_ratio = 1; 893 894 if (present_pages) { 895 present_pages[0] = inactive; 896 present_pages[1] = active; 897 } 898 899 return inactive_ratio; 900} 901 902int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg) 903{ 904 unsigned long active; 905 unsigned long inactive; 906 unsigned long present_pages[2]; 907 unsigned long inactive_ratio; 908 909 inactive_ratio = calc_inactive_ratio(memcg, present_pages); 910 911 inactive = present_pages[0]; 912 active = present_pages[1]; 913 914 if (inactive * inactive_ratio < active) 915 return 1; 916 917 return 0; 918} 919 920int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg) 921{ 922 unsigned long active; 923 unsigned long inactive; 924 925 inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE); 926 active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE); 927 928 return (active > inactive); 929} 930 931unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, 932 struct zone *zone, 933 enum lru_list lru) 934{ 935 int nid = zone->zone_pgdat->node_id; 936 int zid = zone_idx(zone); 937 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); 938 939 return MEM_CGROUP_ZSTAT(mz, lru); 940} 941 942struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, 943 struct zone *zone) 944{ 945 int nid = zone->zone_pgdat->node_id; 946 int zid = zone_idx(zone); 947 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); 948 949 return &mz->reclaim_stat; 950} 951 952struct zone_reclaim_stat * 953mem_cgroup_get_reclaim_stat_from_page(struct page *page) 954{ 955 struct page_cgroup *pc; 956 struct mem_cgroup_per_zone *mz; 957 958 if (mem_cgroup_disabled()) 959 return NULL; 960 961 pc = lookup_page_cgroup(page); 962 /* 963 * Used bit is set without atomic ops but after smp_wmb(). 964 * For making pc->mem_cgroup visible, insert smp_rmb() here. 
965 */ 966 smp_rmb(); 967 if (!PageCgroupUsed(pc)) 968 return NULL; 969 970 mz = page_cgroup_zoneinfo(pc); 971 if (!mz) 972 return NULL; 973 974 return &mz->reclaim_stat; 975} 976 977unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, 978 struct list_head *dst, 979 unsigned long *scanned, int order, 980 int mode, struct zone *z, 981 struct mem_cgroup *mem_cont, 982 int active, int file) 983{ 984 unsigned long nr_taken = 0; 985 struct page *page; 986 unsigned long scan; 987 LIST_HEAD(pc_list); 988 struct list_head *src; 989 struct page_cgroup *pc, *tmp; 990 int nid = z->zone_pgdat->node_id; 991 int zid = zone_idx(z); 992 struct mem_cgroup_per_zone *mz; 993 int lru = LRU_FILE * file + active; 994 int ret; 995 996 BUG_ON(!mem_cont); 997 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); 998 src = &mz->lists[lru]; 999 1000 scan = 0; 1001 list_for_each_entry_safe_reverse(pc, tmp, src, lru) { 1002 if (scan >= nr_to_scan) 1003 break; 1004 1005 page = pc->page; 1006 if (unlikely(!PageCgroupUsed(pc))) 1007 continue; 1008 if (unlikely(!PageLRU(page))) 1009 continue; 1010 1011 scan++; 1012 ret = __isolate_lru_page(page, mode, file); 1013 switch (ret) { 1014 case 0: 1015 list_move(&page->lru, dst); 1016 mem_cgroup_del_lru(page); 1017 nr_taken++; 1018 break; 1019 case -EBUSY: 1020 /* we don't affect global LRU but rotate in our LRU */ 1021 mem_cgroup_rotate_lru_list(page, page_lru(page)); 1022 break; 1023 default: 1024 break; 1025 } 1026 } 1027 1028 *scanned = scan; 1029 return nr_taken; 1030} 1031 1032#define mem_cgroup_from_res_counter(counter, member) \ 1033 container_of(counter, struct mem_cgroup, member) 1034 1035static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem) 1036{ 1037 if (do_swap_account) { 1038 if (res_counter_check_under_limit(&mem->res) && 1039 res_counter_check_under_limit(&mem->memsw)) 1040 return true; 1041 } else 1042 if (res_counter_check_under_limit(&mem->res)) 1043 return true; 1044 return false; 1045} 1046 1047static unsigned int get_swappiness(struct mem_cgroup *memcg) 1048{ 1049 struct cgroup *cgrp = memcg->css.cgroup; 1050 unsigned int swappiness; 1051 1052 /* root ? */ 1053 if (cgrp->parent == NULL) 1054 return vm_swappiness; 1055 1056 spin_lock(&memcg->reclaim_param_lock); 1057 swappiness = memcg->swappiness; 1058 spin_unlock(&memcg->reclaim_param_lock); 1059 1060 return swappiness; 1061} 1062 1063static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data) 1064{ 1065 int *val = data; 1066 (*val)++; 1067 return 0; 1068} 1069 1070/** 1071 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. 1072 * @memcg: The memory cgroup that went over limit 1073 * @p: Task that is going to be killed 1074 * 1075 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is 1076 * enabled 1077 */ 1078void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) 1079{ 1080 struct cgroup *task_cgrp; 1081 struct cgroup *mem_cgrp; 1082 /* 1083 * Need a buffer in BSS, can't rely on allocations. The code relies 1084 * on the assumption that OOM is serialized for memory controller. 1085 * If this assumption is broken, revisit this code. 
1086 */ 1087 static char memcg_name[PATH_MAX]; 1088 int ret; 1089 1090 if (!memcg || !p) 1091 return; 1092 1093 1094 rcu_read_lock(); 1095 1096 mem_cgrp = memcg->css.cgroup; 1097 task_cgrp = task_cgroup(p, mem_cgroup_subsys_id); 1098 1099 ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX); 1100 if (ret < 0) { 1101 /* 1102 * Unfortunately, we are unable to convert to a useful name 1103 * But we'll still print out the usage information 1104 */ 1105 rcu_read_unlock(); 1106 goto done; 1107 } 1108 rcu_read_unlock(); 1109 1110 printk(KERN_INFO "Task in %s killed", memcg_name); 1111 1112 rcu_read_lock(); 1113 ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX); 1114 if (ret < 0) { 1115 rcu_read_unlock(); 1116 goto done; 1117 } 1118 rcu_read_unlock(); 1119 1120 /* 1121 * Continues from above, so we don't need an KERN_ level 1122 */ 1123 printk(KERN_CONT " as a result of limit of %s\n", memcg_name); 1124done: 1125 1126 printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n", 1127 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, 1128 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, 1129 res_counter_read_u64(&memcg->res, RES_FAILCNT)); 1130 printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, " 1131 "failcnt %llu\n", 1132 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, 1133 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, 1134 res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); 1135} 1136 1137/* 1138 * This function returns the number of memcg under hierarchy tree. Returns 1139 * 1(self count) if no children. 1140 */ 1141static int mem_cgroup_count_children(struct mem_cgroup *mem) 1142{ 1143 int num = 0; 1144 mem_cgroup_walk_tree(mem, &num, mem_cgroup_count_children_cb); 1145 return num; 1146} 1147 1148/* 1149 * Visit the first child (need not be the first child as per the ordering 1150 * of the cgroup list, since we track last_scanned_child) of @mem and use 1151 * that to reclaim free pages from. 1152 */ 1153static struct mem_cgroup * 1154mem_cgroup_select_victim(struct mem_cgroup *root_mem) 1155{ 1156 struct mem_cgroup *ret = NULL; 1157 struct cgroup_subsys_state *css; 1158 int nextid, found; 1159 1160 if (!root_mem->use_hierarchy) { 1161 css_get(&root_mem->css); 1162 ret = root_mem; 1163 } 1164 1165 while (!ret) { 1166 rcu_read_lock(); 1167 nextid = root_mem->last_scanned_child + 1; 1168 css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css, 1169 &found); 1170 if (css && css_tryget(css)) 1171 ret = container_of(css, struct mem_cgroup, css); 1172 1173 rcu_read_unlock(); 1174 /* Updates scanning parameter */ 1175 spin_lock(&root_mem->reclaim_param_lock); 1176 if (!css) { 1177 /* this means start scan from ID:1 */ 1178 root_mem->last_scanned_child = 0; 1179 } else 1180 root_mem->last_scanned_child = found; 1181 spin_unlock(&root_mem->reclaim_param_lock); 1182 } 1183 1184 return ret; 1185} 1186 1187/* 1188 * Scan the hierarchy if needed to reclaim memory. We remember the last child 1189 * we reclaimed from, so that we don't end up penalizing one child extensively 1190 * based on its position in the children list. 1191 * 1192 * root_mem is the original ancestor that we've been reclaim from. 1193 * 1194 * We give up and return to the caller when we visit root_mem twice. 1195 * (other groups can be removed while we're walking....) 1196 * 1197 * If shrink==true, for avoiding to free too much, this returns immedieately. 
 */
static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
						struct zone *zone,
						gfp_t gfp_mask,
						unsigned long reclaim_options)
{
	struct mem_cgroup *victim;
	int ret, total = 0;
	int loop = 0;
	bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
	bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
	bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
	unsigned long excess = mem_cgroup_get_excess(root_mem);

	/* If memsw_is_minimum==1, swap-out is of no use. */
	if (root_mem->memsw_is_minimum)
		noswap = true;

	while (1) {
		victim = mem_cgroup_select_victim(root_mem);
		if (victim == root_mem) {
			loop++;
			if (loop >= 1)
				drain_all_stock_async();
			if (loop >= 2) {
				/*
				 * If we have not been able to reclaim
				 * anything, it might be because there are
				 * no reclaimable pages under this hierarchy.
				 */
				if (!check_soft || !total) {
					css_put(&victim->css);
					break;
				}
				/*
				 * We want to do more targeted reclaim.
				 * excess >> 2 is not too excessive, so we do
				 * not reclaim too much, nor so little that we
				 * keep coming back to reclaim from this cgroup.
				 */
				if (total >= (excess >> 2) ||
					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
					css_put(&victim->css);
					break;
				}
			}
		}
		if (!mem_cgroup_local_usage(victim)) {
			/* this cgroup's local usage == 0 */
			css_put(&victim->css);
			continue;
		}
		/* we use swappiness of local cgroup */
		if (check_soft)
			ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
				noswap, get_swappiness(victim), zone,
				zone->zone_pgdat->node_id);
		else
			ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
						noswap, get_swappiness(victim));
		css_put(&victim->css);
		/*
		 * When shrinking usage, we can't check whether we should stop
		 * here or reclaim more; that depends on the caller.
		 * last_scanned_child is enough to keep fairness across the
		 * tree.
		 */
		if (shrink)
			return ret;
		total += ret;
		if (check_soft) {
			if (res_counter_check_under_soft_limit(&root_mem->res))
				return total;
		} else if (mem_cgroup_check_under_limit(root_mem))
			return 1 + total;
	}
	return total;
}

static int mem_cgroup_oom_lock_cb(struct mem_cgroup *mem, void *data)
{
	int *val = (int *)data;
	int x;
	/*
	 * Logically, we could stop scanning immediately when we find
	 * a memcg that is already locked. But considering unlock ops and
	 * creation/removal of memcgs, scanning them all is the simpler
	 * operation.
	 */
	x = atomic_inc_return(&mem->oom_lock);
	*val = max(x, *val);
	return 0;
}
/*
 * Check whether the OOM killer is already running under our hierarchy.
 * If someone is running, return false.
 */
static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
{
	int lock_count = 0;

	mem_cgroup_walk_tree(mem, &lock_count, mem_cgroup_oom_lock_cb);

	if (lock_count == 1)
		return true;
	return false;
}

static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data)
{
	/*
	 * When a new child is created while the hierarchy is under oom,
	 * mem_cgroup_oom_lock() may not be called. We have to use
	 * atomic_add_unless() here.
1310 */ 1311 atomic_add_unless(&mem->oom_lock, -1, 0); 1312 return 0; 1313} 1314 1315static void mem_cgroup_oom_unlock(struct mem_cgroup *mem) 1316{ 1317 mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_unlock_cb); 1318} 1319 1320static DEFINE_MUTEX(memcg_oom_mutex); 1321static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 1322 1323struct oom_wait_info { 1324 struct mem_cgroup *mem; 1325 wait_queue_t wait; 1326}; 1327 1328static int memcg_oom_wake_function(wait_queue_t *wait, 1329 unsigned mode, int sync, void *arg) 1330{ 1331 struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg; 1332 struct oom_wait_info *oom_wait_info; 1333 1334 oom_wait_info = container_of(wait, struct oom_wait_info, wait); 1335 1336 if (oom_wait_info->mem == wake_mem) 1337 goto wakeup; 1338 /* if no hierarchy, no match */ 1339 if (!oom_wait_info->mem->use_hierarchy || !wake_mem->use_hierarchy) 1340 return 0; 1341 /* 1342 * Both of oom_wait_info->mem and wake_mem are stable under us. 1343 * Then we can use css_is_ancestor without taking care of RCU. 1344 */ 1345 if (!css_is_ancestor(&oom_wait_info->mem->css, &wake_mem->css) && 1346 !css_is_ancestor(&wake_mem->css, &oom_wait_info->mem->css)) 1347 return 0; 1348 1349wakeup: 1350 return autoremove_wake_function(wait, mode, sync, arg); 1351} 1352 1353static void memcg_wakeup_oom(struct mem_cgroup *mem) 1354{ 1355 /* for filtering, pass "mem" as argument. */ 1356 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem); 1357} 1358 1359static void memcg_oom_recover(struct mem_cgroup *mem) 1360{ 1361 if (mem->oom_kill_disable && atomic_read(&mem->oom_lock)) 1362 memcg_wakeup_oom(mem); 1363} 1364 1365/* 1366 * try to call OOM killer. returns false if we should exit memory-reclaim loop. 1367 */ 1368bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) 1369{ 1370 struct oom_wait_info owait; 1371 bool locked, need_to_kill; 1372 1373 owait.mem = mem; 1374 owait.wait.flags = 0; 1375 owait.wait.func = memcg_oom_wake_function; 1376 owait.wait.private = current; 1377 INIT_LIST_HEAD(&owait.wait.task_list); 1378 need_to_kill = true; 1379 /* At first, try to OOM lock hierarchy under mem.*/ 1380 mutex_lock(&memcg_oom_mutex); 1381 locked = mem_cgroup_oom_lock(mem); 1382 /* 1383 * Even if signal_pending(), we can't quit charge() loop without 1384 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL 1385 * under OOM is always welcomed, use TASK_KILLABLE here. 1386 */ 1387 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); 1388 if (!locked || mem->oom_kill_disable) 1389 need_to_kill = false; 1390 if (locked) 1391 mem_cgroup_oom_notify(mem); 1392 mutex_unlock(&memcg_oom_mutex); 1393 1394 if (need_to_kill) { 1395 finish_wait(&memcg_oom_waitq, &owait.wait); 1396 mem_cgroup_out_of_memory(mem, mask); 1397 } else { 1398 schedule(); 1399 finish_wait(&memcg_oom_waitq, &owait.wait); 1400 } 1401 mutex_lock(&memcg_oom_mutex); 1402 mem_cgroup_oom_unlock(mem); 1403 memcg_wakeup_oom(mem); 1404 mutex_unlock(&memcg_oom_mutex); 1405 1406 if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) 1407 return false; 1408 /* Give chance to dying process */ 1409 schedule_timeout(1); 1410 return true; 1411} 1412 1413/* 1414 * Currently used to update mapped file statistics, but the routine can be 1415 * generalized to update other statistics as well. 
1416 */ 1417void mem_cgroup_update_file_mapped(struct page *page, int val) 1418{ 1419 struct mem_cgroup *mem; 1420 struct page_cgroup *pc; 1421 1422 pc = lookup_page_cgroup(page); 1423 if (unlikely(!pc)) 1424 return; 1425 1426 lock_page_cgroup(pc); 1427 mem = pc->mem_cgroup; 1428 if (!mem || !PageCgroupUsed(pc)) 1429 goto done; 1430 1431 /* 1432 * Preemption is already disabled. We can use __this_cpu_xxx 1433 */ 1434 if (val > 0) { 1435 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 1436 SetPageCgroupFileMapped(pc); 1437 } else { 1438 __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 1439 ClearPageCgroupFileMapped(pc); 1440 } 1441 1442done: 1443 unlock_page_cgroup(pc); 1444} 1445 1446/* 1447 * size of first charge trial. "32" comes from vmscan.c's magic value. 1448 * TODO: maybe necessary to use big numbers in big irons. 1449 */ 1450#define CHARGE_SIZE (32 * PAGE_SIZE) 1451struct memcg_stock_pcp { 1452 struct mem_cgroup *cached; /* this never be root cgroup */ 1453 int charge; 1454 struct work_struct work; 1455}; 1456static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 1457static atomic_t memcg_drain_count; 1458 1459/* 1460 * Try to consume stocked charge on this cpu. If success, PAGE_SIZE is consumed 1461 * from local stock and true is returned. If the stock is 0 or charges from a 1462 * cgroup which is not current target, returns false. This stock will be 1463 * refilled. 1464 */ 1465static bool consume_stock(struct mem_cgroup *mem) 1466{ 1467 struct memcg_stock_pcp *stock; 1468 bool ret = true; 1469 1470 stock = &get_cpu_var(memcg_stock); 1471 if (mem == stock->cached && stock->charge) 1472 stock->charge -= PAGE_SIZE; 1473 else /* need to call res_counter_charge */ 1474 ret = false; 1475 put_cpu_var(memcg_stock); 1476 return ret; 1477} 1478 1479/* 1480 * Returns stocks cached in percpu to res_counter and reset cached information. 1481 */ 1482static void drain_stock(struct memcg_stock_pcp *stock) 1483{ 1484 struct mem_cgroup *old = stock->cached; 1485 1486 if (stock->charge) { 1487 res_counter_uncharge(&old->res, stock->charge); 1488 if (do_swap_account) 1489 res_counter_uncharge(&old->memsw, stock->charge); 1490 } 1491 stock->cached = NULL; 1492 stock->charge = 0; 1493} 1494 1495/* 1496 * This must be called under preempt disabled or must be called by 1497 * a thread which is pinned to local cpu. 1498 */ 1499static void drain_local_stock(struct work_struct *dummy) 1500{ 1501 struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); 1502 drain_stock(stock); 1503} 1504 1505/* 1506 * Cache charges(val) which is from res_counter, to local per_cpu area. 1507 * This will be consumed by consume_stock() function, later. 1508 */ 1509static void refill_stock(struct mem_cgroup *mem, int val) 1510{ 1511 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); 1512 1513 if (stock->cached != mem) { /* reset if necessary */ 1514 drain_stock(stock); 1515 stock->cached = mem; 1516 } 1517 stock->charge += val; 1518 put_cpu_var(memcg_stock); 1519} 1520 1521/* 1522 * Tries to drain stocked charges in other cpus. This function is asynchronous 1523 * and just put a work per cpu for draining localy on each cpu. Caller can 1524 * expects some charges will be back to res_counter later but cannot wait for 1525 * it. 1526 */ 1527static void drain_all_stock_async(void) 1528{ 1529 int cpu; 1530 /* This function is for scheduling "drain" in asynchronous way. 1531 * The result of "drain" is not directly handled by callers. 
Then,
 * if someone is already calling drain, we don't have to call drain again.
 * Anyway, the WORK_STRUCT_PENDING check in queue_work_on() will catch it
 * if there is a race. We just do a loose check here.
 */
	if (atomic_read(&memcg_drain_count))
		return;
	/* Notify other cpus that system-wide "drain" is running */
	atomic_inc(&memcg_drain_count);
	get_online_cpus();
	for_each_online_cpu(cpu) {
		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
		schedule_work_on(cpu, &stock->work);
	}
	put_online_cpus();
	atomic_dec(&memcg_drain_count);
	/* We don't wait for flush_work */
}

/* This is a synchronous drain interface. */
static void drain_all_stock_sync(void)
{
	/* called when force_empty is called */
	atomic_inc(&memcg_drain_count);
	schedule_on_each_cpu(drain_local_stock);
	atomic_dec(&memcg_drain_count);
}

static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
					unsigned long action,
					void *hcpu)
{
	int cpu = (unsigned long)hcpu;
	struct memcg_stock_pcp *stock;

	if (action != CPU_DEAD)
		return NOTIFY_OK;
	stock = &per_cpu(memcg_stock, cpu);
	drain_stock(stock);
	return NOTIFY_OK;
}

/*
 * Unlike the exported interface, an "oom" parameter is added. If oom==true,
 * the OOM killer can be invoked.
 */
static int __mem_cgroup_try_charge(struct mm_struct *mm,
			gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
{
	struct mem_cgroup *mem, *mem_over_limit;
	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
	struct res_counter *fail_res;
	int csize = CHARGE_SIZE;

	/*
	 * Unlike the global VM's OOM kill, we are not under a system-level
	 * memory shortage here. So, in addition to a MEMDIE process, allow
	 * a dying process to go ahead.
	 */
	if (unlikely(test_thread_flag(TIF_MEMDIE)
		     || fatal_signal_pending(current)))
		goto bypass;

	/*
	 * We always charge the cgroup the mm_struct belongs to.
	 * The mm_struct's mem_cgroup changes on task migration if the
	 * thread group leader migrates. It's possible that mm is not
	 * set, if so charge the init_mm (happens for pagecache usage).
1599 */ 1600 mem = *memcg; 1601 if (likely(!mem)) { 1602 mem = try_get_mem_cgroup_from_mm(mm); 1603 *memcg = mem; 1604 } else { 1605 css_get(&mem->css); 1606 } 1607 if (unlikely(!mem)) 1608 return 0; 1609 1610 VM_BUG_ON(css_is_removed(&mem->css)); 1611 if (mem_cgroup_is_root(mem)) 1612 goto done; 1613 1614 while (1) { 1615 int ret = 0; 1616 unsigned long flags = 0; 1617 1618 if (consume_stock(mem)) 1619 goto done; 1620 1621 ret = res_counter_charge(&mem->res, csize, &fail_res); 1622 if (likely(!ret)) { 1623 if (!do_swap_account) 1624 break; 1625 ret = res_counter_charge(&mem->memsw, csize, &fail_res); 1626 if (likely(!ret)) 1627 break; 1628 /* mem+swap counter fails */ 1629 res_counter_uncharge(&mem->res, csize); 1630 flags |= MEM_CGROUP_RECLAIM_NOSWAP; 1631 mem_over_limit = mem_cgroup_from_res_counter(fail_res, 1632 memsw); 1633 } else 1634 /* mem counter fails */ 1635 mem_over_limit = mem_cgroup_from_res_counter(fail_res, 1636 res); 1637 1638 /* reduce request size and retry */ 1639 if (csize > PAGE_SIZE) { 1640 csize = PAGE_SIZE; 1641 continue; 1642 } 1643 if (!(gfp_mask & __GFP_WAIT)) 1644 goto nomem; 1645 1646 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, 1647 gfp_mask, flags); 1648 if (ret) 1649 continue; 1650 1651 /* 1652 * try_to_free_mem_cgroup_pages() might not give us a full 1653 * picture of reclaim. Some pages are reclaimed and might be 1654 * moved to swap cache or just unmapped from the cgroup. 1655 * Check the limit again to see if the reclaim reduced the 1656 * current usage of the cgroup before giving up 1657 * 1658 */ 1659 if (mem_cgroup_check_under_limit(mem_over_limit)) 1660 continue; 1661 1662 /* try to avoid oom while someone is moving charge */ 1663 if (mc.moving_task && current != mc.moving_task) { 1664 struct mem_cgroup *from, *to; 1665 bool do_continue = false; 1666 /* 1667 * There is a small race that "from" or "to" can be 1668 * freed by rmdir, so we use css_tryget(). 1669 */ 1670 from = mc.from; 1671 to = mc.to; 1672 if (from && css_tryget(&from->css)) { 1673 if (mem_over_limit->use_hierarchy) 1674 do_continue = css_is_ancestor( 1675 &from->css, 1676 &mem_over_limit->css); 1677 else 1678 do_continue = (from == mem_over_limit); 1679 css_put(&from->css); 1680 } 1681 if (!do_continue && to && css_tryget(&to->css)) { 1682 if (mem_over_limit->use_hierarchy) 1683 do_continue = css_is_ancestor( 1684 &to->css, 1685 &mem_over_limit->css); 1686 else 1687 do_continue = (to == mem_over_limit); 1688 css_put(&to->css); 1689 } 1690 if (do_continue) { 1691 DEFINE_WAIT(wait); 1692 prepare_to_wait(&mc.waitq, &wait, 1693 TASK_INTERRUPTIBLE); 1694 /* moving charge context might have finished. */ 1695 if (mc.moving_task) 1696 schedule(); 1697 finish_wait(&mc.waitq, &wait); 1698 continue; 1699 } 1700 } 1701 1702 if (!nr_retries--) { 1703 if (!oom) 1704 goto nomem; 1705 if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) { 1706 nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 1707 continue; 1708 } 1709 /* When we reach here, current task is dying .*/ 1710 css_put(&mem->css); 1711 goto bypass; 1712 } 1713 } 1714 if (csize > PAGE_SIZE) 1715 refill_stock(mem, csize - PAGE_SIZE); 1716done: 1717 return 0; 1718nomem: 1719 css_put(&mem->css); 1720 return -ENOMEM; 1721bypass: 1722 *memcg = NULL; 1723 return 0; 1724} 1725 1726/* 1727 * Somemtimes we have to undo a charge we got by try_charge(). 1728 * This function is for that and do uncharge, put css's refcnt. 1729 * gotten by try_charge(). 
 */
static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
							unsigned long count)
{
	if (!mem_cgroup_is_root(mem)) {
		res_counter_uncharge(&mem->res, PAGE_SIZE * count);
		if (do_swap_account)
			res_counter_uncharge(&mem->memsw, PAGE_SIZE * count);
		VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
		WARN_ON_ONCE(count > INT_MAX);
		__css_put(&mem->css, (int)count);
	}
	/* we don't need css_put for root */
}

static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
{
	__mem_cgroup_cancel_charge(mem, 1);
}

/*
 * A helper function to get a mem_cgroup from an ID. Must be called under
 * rcu_read_lock(). The caller must check css_is_removed() or similar if it
 * cares (dropping a refcnt from swap can be called against a removed memcg).
 */
static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
{
	struct cgroup_subsys_state *css;

	/* ID 0 is unused ID */
	if (!id)
		return NULL;
	css = css_lookup(&mem_cgroup_subsys, id);
	if (!css)
		return NULL;
	return container_of(css, struct mem_cgroup, css);
}

struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
{
	struct mem_cgroup *mem = NULL;
	struct page_cgroup *pc;
	unsigned short id;
	swp_entry_t ent;

	VM_BUG_ON(!PageLocked(page));

	pc = lookup_page_cgroup(page);
	lock_page_cgroup(pc);
	if (PageCgroupUsed(pc)) {
		mem = pc->mem_cgroup;
		if (mem && !css_tryget(&mem->css))
			mem = NULL;
	} else if (PageSwapCache(page)) {
		ent.val = page_private(page);
		id = lookup_swap_cgroup(ent);
		rcu_read_lock();
		mem = mem_cgroup_lookup(id);
		if (mem && !css_tryget(&mem->css))
			mem = NULL;
		rcu_read_unlock();
	}
	unlock_page_cgroup(pc);
	return mem;
}

/*
 * Commit a charge obtained by __mem_cgroup_try_charge() and make the
 * page_cgroup USED. If it is already USED, uncharge and return.
 */

static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
				     struct page_cgroup *pc,
				     enum charge_type ctype)
{
	/* try_charge() can return NULL to *memcg, taking care of it. */
	if (!mem)
		return;

	lock_page_cgroup(pc);
	if (unlikely(PageCgroupUsed(pc))) {
		unlock_page_cgroup(pc);
		mem_cgroup_cancel_charge(mem);
		return;
	}

	pc->mem_cgroup = mem;
	/*
	 * We access a page_cgroup asynchronously without lock_page_cgroup().
	 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
	 * is accessed after testing the USED bit. To make pc->mem_cgroup
	 * visible before the USED bit, we need a memory barrier here.
	 * See mem_cgroup_add_lru_list(), etc.
	 */
	smp_wmb();
	switch (ctype) {
	case MEM_CGROUP_CHARGE_TYPE_CACHE:
	case MEM_CGROUP_CHARGE_TYPE_SHMEM:
		SetPageCgroupCache(pc);
		SetPageCgroupUsed(pc);
		break;
	case MEM_CGROUP_CHARGE_TYPE_MAPPED:
		ClearPageCgroupCache(pc);
		SetPageCgroupUsed(pc);
		break;
	default:
		break;
	}

	mem_cgroup_charge_statistics(mem, pc, true);

	unlock_page_cgroup(pc);
	/*
	 * "charge_statistics" updated the event counter. Then, check it.
	 * Insert the ancestors (and the ancestors' ancestors) into the
	 * soft limit RB-tree if they exceed their soft limit.
 */
	memcg_check_events(mem, pc->page);
}

/**
 * __mem_cgroup_move_account - move account of the page
 * @pc:	page_cgroup of the page.
 * @from: mem_cgroup which the page is moved from.
 * @to:	mem_cgroup which the page is moved to. @from != @to.
 * @uncharge: whether we should call uncharge and css_put against @from.
 *
 * The caller must confirm the following:
 * - page is not on LRU (isolate_page() is useful.)
 * - the pc is locked, used, and ->mem_cgroup points to @from.
 *
 * This function doesn't do "charge" nor css_get to the new cgroup. That
 * should be done by the caller (__mem_cgroup_try_charge would be useful).
 * If @uncharge is true, this function does "uncharge" from the old cgroup,
 * but it doesn't if @uncharge is false, so the caller should do "uncharge".
 */

static void __mem_cgroup_move_account(struct page_cgroup *pc,
	struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
{
	VM_BUG_ON(from == to);
	VM_BUG_ON(PageLRU(pc->page));
	VM_BUG_ON(!PageCgroupLocked(pc));
	VM_BUG_ON(!PageCgroupUsed(pc));
	VM_BUG_ON(pc->mem_cgroup != from);

	if (PageCgroupFileMapped(pc)) {
		/* Update mapped_file data for mem_cgroup */
		preempt_disable();
		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
		preempt_enable();
	}
	mem_cgroup_charge_statistics(from, pc, false);
	if (uncharge)
		/* This is not "cancel", but cancel_charge does all we need. */
		mem_cgroup_cancel_charge(from);

	/* caller should have done css_get */
	pc->mem_cgroup = to;
	mem_cgroup_charge_statistics(to, pc, true);
	/*
	 * We charge against "to", which may not have any tasks. Then, "to"
	 * can be under rmdir(). But in the current implementation, the only
	 * callers of this function are force_empty() and move charge, so it
	 * is guaranteed that "to" is never removed. So, we don't check rmdir
	 * status here.
	 */
}

/*
 * Check whether the @pc is valid for moving the account and call
 * __mem_cgroup_move_account().
 */
static int mem_cgroup_move_account(struct page_cgroup *pc,
		struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
{
	int ret = -EINVAL;
	lock_page_cgroup(pc);
	if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
		__mem_cgroup_move_account(pc, from, to, uncharge);
		ret = 0;
	}
	unlock_page_cgroup(pc);
	/*
	 * check events
	 */
	memcg_check_events(to, pc->page);
	memcg_check_events(from, pc->page);
	return ret;
}

/*
 * Move charges to its parent.
 */

static int mem_cgroup_move_parent(struct page_cgroup *pc,
				  struct mem_cgroup *child,
				  gfp_t gfp_mask)
{
	struct page *page = pc->page;
	struct cgroup *cg = child->css.cgroup;
	struct cgroup *pcg = cg->parent;
	struct mem_cgroup *parent;
	int ret;

	/* Is ROOT ?
*/ 1938 if (!pcg) 1939 return -EINVAL; 1940 1941 ret = -EBUSY; 1942 if (!get_page_unless_zero(page)) 1943 goto out; 1944 if (isolate_lru_page(page)) 1945 goto put; 1946 1947 parent = mem_cgroup_from_cont(pcg); 1948 ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); 1949 if (ret || !parent) 1950 goto put_back; 1951 1952 ret = mem_cgroup_move_account(pc, child, parent, true); 1953 if (ret) 1954 mem_cgroup_cancel_charge(parent); 1955put_back: 1956 putback_lru_page(page); 1957put: 1958 put_page(page); 1959out: 1960 return ret; 1961} 1962 1963/* 1964 * Charge the memory controller for page usage. 1965 * Return 1966 * 0 if the charge was successful 1967 * < 0 if the cgroup is over its limit 1968 */ 1969static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, 1970 gfp_t gfp_mask, enum charge_type ctype, 1971 struct mem_cgroup *memcg) 1972{ 1973 struct mem_cgroup *mem; 1974 struct page_cgroup *pc; 1975 int ret; 1976 1977 pc = lookup_page_cgroup(page); 1978 /* can happen at boot */ 1979 if (unlikely(!pc)) 1980 return 0; 1981 prefetchw(pc); 1982 1983 mem = memcg; 1984 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); 1985 if (ret || !mem) 1986 return ret; 1987 1988 __mem_cgroup_commit_charge(mem, pc, ctype); 1989 return 0; 1990} 1991 1992int mem_cgroup_newpage_charge(struct page *page, 1993 struct mm_struct *mm, gfp_t gfp_mask) 1994{ 1995 if (mem_cgroup_disabled()) 1996 return 0; 1997 if (PageCompound(page)) 1998 return 0; 1999 /* 2000 * If already mapped, we don't have to account. 2001 * If page cache, page->mapping has address_space. 2002 * But page->mapping may have out-of-use anon_vma pointer, 2003 * detecit it by PageAnon() check. newly-mapped-anon's page->mapping 2004 * is NULL. 2005 */ 2006 if (page_mapped(page) || (page->mapping && !PageAnon(page))) 2007 return 0; 2008 if (unlikely(!mm)) 2009 mm = &init_mm; 2010 return mem_cgroup_charge_common(page, mm, gfp_mask, 2011 MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL); 2012} 2013 2014static void 2015__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, 2016 enum charge_type ctype); 2017 2018int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 2019 gfp_t gfp_mask) 2020{ 2021 struct mem_cgroup *mem = NULL; 2022 int ret; 2023 2024 if (mem_cgroup_disabled()) 2025 return 0; 2026 if (PageCompound(page)) 2027 return 0; 2028 /* 2029 * Corner case handling. This is called from add_to_page_cache() 2030 * in usual. But some FS (shmem) precharges this page before calling it 2031 * and call add_to_page_cache() with GFP_NOWAIT. 2032 * 2033 * For GFP_NOWAIT case, the page may be pre-charged before calling 2034 * add_to_page_cache(). (See shmem.c) check it here and avoid to call 2035 * charge twice. (It works but has to pay a bit larger cost.) 2036 * And when the page is SwapCache, it should take swap information 2037 * into account. This is under lock_page() now. 
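Both charge entry points above (mem_cgroup_newpage_charge() for freshly faulted anonymous pages, mem_cgroup_cache_charge() for page cache) end up in __mem_cgroup_commit_charge(), so the accounting is directly observable through the cgroup v1 files. A minimal userspace sketch, assuming the memory controller is mounted at /sys/fs/cgroup/memory, a group named "demo" already exists, and the caller may write its tasks file; the paths are placeholders:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>

#define CGDIR "/sys/fs/cgroup/memory/demo"      /* assumed mount point/group */

static void write_file(const char *path, const char *buf)
{
    int fd = open(path, O_WRONLY);

    if (fd < 0 || write(fd, buf, strlen(buf)) < 0) {
        perror(path);
        exit(1);
    }
    close(fd);
}

static long long read_ll(const char *path)
{
    char buf[64] = "";
    int fd = open(path, O_RDONLY);

    if (fd < 0 || read(fd, buf, sizeof(buf) - 1) < 0) {
        perror(path);
        exit(1);
    }
    close(fd);
    return atoll(buf);
}

int main(void)
{
    char pid[32];
    volatile char *p;
    size_t i, len = 32 << 20;           /* 32MB of anonymous memory */

    /* Join the group: page faults from now on are charged to it. */
    snprintf(pid, sizeof(pid), "%d", getpid());
    write_file(CGDIR "/tasks", pid);

    printf("usage before: %lld bytes\n",
           read_ll(CGDIR "/memory.usage_in_bytes"));

    p = malloc(len);
    if (!p)
        return 1;
    /* Touch each page so every fault goes through the charge path. */
    for (i = 0; i < len; i += 4096)
        p[i] = 1;

    printf("usage after:  %lld bytes\n",
           read_ll(CGDIR "/memory.usage_in_bytes"));
    return 0;
}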
2038 */ 2039 if (!(gfp_mask & __GFP_WAIT)) { 2040 struct page_cgroup *pc; 2041 2042 2043 pc = lookup_page_cgroup(page); 2044 if (!pc) 2045 return 0; 2046 lock_page_cgroup(pc); 2047 if (PageCgroupUsed(pc)) { 2048 unlock_page_cgroup(pc); 2049 return 0; 2050 } 2051 unlock_page_cgroup(pc); 2052 } 2053 2054 if (unlikely(!mm && !mem)) 2055 mm = &init_mm; 2056 2057 if (page_is_file_cache(page)) 2058 return mem_cgroup_charge_common(page, mm, gfp_mask, 2059 MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); 2060 2061 /* shmem */ 2062 if (PageSwapCache(page)) { 2063 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); 2064 if (!ret) 2065 __mem_cgroup_commit_charge_swapin(page, mem, 2066 MEM_CGROUP_CHARGE_TYPE_SHMEM); 2067 } else 2068 ret = mem_cgroup_charge_common(page, mm, gfp_mask, 2069 MEM_CGROUP_CHARGE_TYPE_SHMEM, mem); 2070 2071 return ret; 2072} 2073 2074/* 2075 * While swap-in, try_charge -> commit or cancel, the page is locked. 2076 * And when try_charge() successfully returns, one refcnt to memcg without 2077 * struct page_cgroup is acquired. This refcnt will be consumed by 2078 * "commit()" or removed by "cancel()" 2079 */ 2080int mem_cgroup_try_charge_swapin(struct mm_struct *mm, 2081 struct page *page, 2082 gfp_t mask, struct mem_cgroup **ptr) 2083{ 2084 struct mem_cgroup *mem; 2085 int ret; 2086 2087 if (mem_cgroup_disabled()) 2088 return 0; 2089 2090 if (!do_swap_account) 2091 goto charge_cur_mm; 2092 /* 2093 * A racing thread's fault, or swapoff, may have already updated 2094 * the pte, and even removed page from swap cache: in those cases 2095 * do_swap_page()'s pte_same() test will fail; but there's also a 2096 * KSM case which does need to charge the page. 2097 */ 2098 if (!PageSwapCache(page)) 2099 goto charge_cur_mm; 2100 mem = try_get_mem_cgroup_from_page(page); 2101 if (!mem) 2102 goto charge_cur_mm; 2103 *ptr = mem; 2104 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); 2105 /* drop extra refcnt from tryget */ 2106 css_put(&mem->css); 2107 return ret; 2108charge_cur_mm: 2109 if (unlikely(!mm)) 2110 mm = &init_mm; 2111 return __mem_cgroup_try_charge(mm, mask, ptr, true); 2112} 2113 2114static void 2115__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, 2116 enum charge_type ctype) 2117{ 2118 struct page_cgroup *pc; 2119 2120 if (mem_cgroup_disabled()) 2121 return; 2122 if (!ptr) 2123 return; 2124 cgroup_exclude_rmdir(&ptr->css); 2125 pc = lookup_page_cgroup(page); 2126 mem_cgroup_lru_del_before_commit_swapcache(page); 2127 __mem_cgroup_commit_charge(ptr, pc, ctype); 2128 mem_cgroup_lru_add_after_commit_swapcache(page); 2129 /* 2130 * Now swap is on-memory. This means this page may be 2131 * counted both as mem and swap....double count. 2132 * Fix it by uncharging from memsw. Basically, this SwapCache is stable 2133 * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page() 2134 * may call delete_from_swap_cache() before reach here. 2135 */ 2136 if (do_swap_account && PageSwapCache(page)) { 2137 swp_entry_t ent = {.val = page_private(page)}; 2138 unsigned short id; 2139 struct mem_cgroup *memcg; 2140 2141 id = swap_cgroup_record(ent, 0); 2142 rcu_read_lock(); 2143 memcg = mem_cgroup_lookup(id); 2144 if (memcg) { 2145 /* 2146 * This recorded memcg can be obsolete one. 
So, avoid 2147 * calling css_tryget 2148 */ 2149 if (!mem_cgroup_is_root(memcg)) 2150 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 2151 mem_cgroup_swap_statistics(memcg, false); 2152 mem_cgroup_put(memcg); 2153 } 2154 rcu_read_unlock(); 2155 } 2156 /* 2157 * At swapin, we may charge account against cgroup which has no tasks. 2158 * So, rmdir()->pre_destroy() can be called while we do this charge. 2159 * In that case, we need to call pre_destroy() again. check it here. 2160 */ 2161 cgroup_release_and_wakeup_rmdir(&ptr->css); 2162} 2163 2164void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) 2165{ 2166 __mem_cgroup_commit_charge_swapin(page, ptr, 2167 MEM_CGROUP_CHARGE_TYPE_MAPPED); 2168} 2169 2170void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) 2171{ 2172 if (mem_cgroup_disabled()) 2173 return; 2174 if (!mem) 2175 return; 2176 mem_cgroup_cancel_charge(mem); 2177} 2178 2179static void 2180__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) 2181{ 2182 struct memcg_batch_info *batch = NULL; 2183 bool uncharge_memsw = true; 2184 /* If swapout, usage of swap doesn't decrease */ 2185 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 2186 uncharge_memsw = false; 2187 2188 batch = ¤t->memcg_batch; 2189 /* 2190 * In usual, we do css_get() when we remember memcg pointer. 2191 * But in this case, we keep res->usage until end of a series of 2192 * uncharges. Then, it's ok to ignore memcg's refcnt. 2193 */ 2194 if (!batch->memcg) 2195 batch->memcg = mem; 2196 /* 2197 * do_batch > 0 when unmapping pages or inode invalidate/truncate. 2198 * In those cases, all pages freed continously can be expected to be in 2199 * the same cgroup and we have chance to coalesce uncharges. 2200 * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE) 2201 * because we want to do uncharge as soon as possible. 2202 */ 2203 2204 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) 2205 goto direct_uncharge; 2206 2207 /* 2208 * In typical case, batch->memcg == mem. This means we can 2209 * merge a series of uncharges to an uncharge of res_counter. 2210 * If not, we uncharge res_counter ony by one. 
2211 */ 2212 if (batch->memcg != mem) 2213 goto direct_uncharge; 2214 /* remember freed charge and uncharge it later */ 2215 batch->bytes += PAGE_SIZE; 2216 if (uncharge_memsw) 2217 batch->memsw_bytes += PAGE_SIZE; 2218 return; 2219direct_uncharge: 2220 res_counter_uncharge(&mem->res, PAGE_SIZE); 2221 if (uncharge_memsw) 2222 res_counter_uncharge(&mem->memsw, PAGE_SIZE); 2223 if (unlikely(batch->memcg != mem)) 2224 memcg_oom_recover(mem); 2225 return; 2226} 2227 2228/* 2229 * uncharge if !page_mapped(page) 2230 */ 2231static struct mem_cgroup * 2232__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) 2233{ 2234 struct page_cgroup *pc; 2235 struct mem_cgroup *mem = NULL; 2236 struct mem_cgroup_per_zone *mz; 2237 2238 if (mem_cgroup_disabled()) 2239 return NULL; 2240 2241 if (PageSwapCache(page)) 2242 return NULL; 2243 2244 /* 2245 * Check if our page_cgroup is valid 2246 */ 2247 pc = lookup_page_cgroup(page); 2248 if (unlikely(!pc || !PageCgroupUsed(pc))) 2249 return NULL; 2250 2251 lock_page_cgroup(pc); 2252 2253 mem = pc->mem_cgroup; 2254 2255 if (!PageCgroupUsed(pc)) 2256 goto unlock_out; 2257 2258 switch (ctype) { 2259 case MEM_CGROUP_CHARGE_TYPE_MAPPED: 2260 case MEM_CGROUP_CHARGE_TYPE_DROP: 2261 if (page_mapped(page)) 2262 goto unlock_out; 2263 break; 2264 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: 2265 if (!PageAnon(page)) { /* Shared memory */ 2266 if (page->mapping && !page_is_file_cache(page)) 2267 goto unlock_out; 2268 } else if (page_mapped(page)) /* Anon */ 2269 goto unlock_out; 2270 break; 2271 default: 2272 break; 2273 } 2274 2275 if (!mem_cgroup_is_root(mem)) 2276 __do_uncharge(mem, ctype); 2277 if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 2278 mem_cgroup_swap_statistics(mem, true); 2279 mem_cgroup_charge_statistics(mem, pc, false); 2280 2281 ClearPageCgroupUsed(pc); 2282 /* 2283 * pc->mem_cgroup is not cleared here. It will be accessed when it's 2284 * freed from LRU. This is safe because uncharged page is expected not 2285 * to be reused (freed soon). Exception is SwapCache, it's handled by 2286 * special functions. 2287 */ 2288 2289 mz = page_cgroup_zoneinfo(pc); 2290 unlock_page_cgroup(pc); 2291 2292 memcg_check_events(mem, page); 2293 /* at swapout, this memcg will be accessed to record to swap */ 2294 if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 2295 css_put(&mem->css); 2296 2297 return mem; 2298 2299unlock_out: 2300 unlock_page_cgroup(pc); 2301 return NULL; 2302} 2303 2304void mem_cgroup_uncharge_page(struct page *page) 2305{ 2306 /* early check. */ 2307 if (page_mapped(page)) 2308 return; 2309 if (page->mapping && !PageAnon(page)) 2310 return; 2311 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); 2312} 2313 2314void mem_cgroup_uncharge_cache_page(struct page *page) 2315{ 2316 VM_BUG_ON(page_mapped(page)); 2317 VM_BUG_ON(page->mapping); 2318 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); 2319} 2320 2321/* 2322 * Batch_start/batch_end is called in unmap_page_range/invlidate/trucate. 2323 * In that cases, pages are freed continuously and we can expect pages 2324 * are in the same memcg. All these calls itself limits the number of 2325 * pages freed at once, then uncharge_start/end() is called properly. 2326 * This may be called prural(2) times in a context, 2327 */ 2328 2329void mem_cgroup_uncharge_start(void) 2330{ 2331 current->memcg_batch.do_batch++; 2332 /* We can do nest. 
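The bracketing that begins here (do_batch counts nesting; the batch state is reset only at the outermost mem_cgroup_uncharge_start()) combines with the coalescing in __do_uncharge() above: bytes are accumulated while every freed page belongs to the same memcg, and anything else falls back to an immediate uncharge. The control flow can be modelled in a few lines of plain C; this is only a sketch of the idea, not kernel code:

#include <stdio.h>

#define PAGE_SZ 4096

struct owner { const char *name; long charged; };

static struct {
    int depth;              /* uncharge_start()/uncharge_end() nesting */
    struct owner *owner;    /* first owner seen in this batch */
    long bytes;             /* coalesced amount, flushed at the end */
} batch;

static void uncharge_begin(void)
{
    if (++batch.depth == 1) {           /* outermost bracket resets the batch */
        batch.owner = NULL;
        batch.bytes = 0;
    }
}

static void uncharge_one(struct owner *o)
{
    if (!batch.owner)
        batch.owner = o;
    if (!batch.depth || batch.owner != o) {
        o->charged -= PAGE_SZ;          /* direct, unbatched uncharge */
        return;
    }
    batch.bytes += PAGE_SZ;             /* remember now, uncharge later */
}

static void uncharge_end(void)
{
    if (--batch.depth)
        return;                         /* still nested: keep batching */
    if (batch.owner)
        batch.owner->charged -= batch.bytes;
}

int main(void)
{
    struct owner a = { "A", 10 * PAGE_SZ }, b = { "B", 10 * PAGE_SZ };
    int i;

    uncharge_begin();
    for (i = 0; i < 5; i++)
        uncharge_one(&a);               /* same owner: one deferred uncharge */
    uncharge_one(&b);                   /* different owner: direct uncharge */
    uncharge_end();

    printf("A=%ld B=%ld\n", a.charged, b.charged);
    return 0;
}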
*/ 2333 if (current->memcg_batch.do_batch == 1) { 2334 current->memcg_batch.memcg = NULL; 2335 current->memcg_batch.bytes = 0; 2336 current->memcg_batch.memsw_bytes = 0; 2337 } 2338} 2339 2340void mem_cgroup_uncharge_end(void) 2341{ 2342 struct memcg_batch_info *batch = ¤t->memcg_batch; 2343 2344 if (!batch->do_batch) 2345 return; 2346 2347 batch->do_batch--; 2348 if (batch->do_batch) /* If stacked, do nothing. */ 2349 return; 2350 2351 if (!batch->memcg) 2352 return; 2353 /* 2354 * This "batch->memcg" is valid without any css_get/put etc... 2355 * bacause we hide charges behind us. 2356 */ 2357 if (batch->bytes) 2358 res_counter_uncharge(&batch->memcg->res, batch->bytes); 2359 if (batch->memsw_bytes) 2360 res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes); 2361 memcg_oom_recover(batch->memcg); 2362 /* forget this pointer (for sanity check) */ 2363 batch->memcg = NULL; 2364} 2365 2366#ifdef CONFIG_SWAP 2367/* 2368 * called after __delete_from_swap_cache() and drop "page" account. 2369 * memcg information is recorded to swap_cgroup of "ent" 2370 */ 2371void 2372mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) 2373{ 2374 struct mem_cgroup *memcg; 2375 int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT; 2376 2377 if (!swapout) /* this was a swap cache but the swap is unused ! */ 2378 ctype = MEM_CGROUP_CHARGE_TYPE_DROP; 2379 2380 memcg = __mem_cgroup_uncharge_common(page, ctype); 2381 2382 /* record memcg information */ 2383 if (do_swap_account && swapout && memcg) { 2384 swap_cgroup_record(ent, css_id(&memcg->css)); 2385 mem_cgroup_get(memcg); 2386 } 2387 if (swapout && memcg) 2388 css_put(&memcg->css); 2389} 2390#endif 2391 2392#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 2393/* 2394 * called from swap_entry_free(). remove record in swap_cgroup and 2395 * uncharge "memsw" account. 2396 */ 2397void mem_cgroup_uncharge_swap(swp_entry_t ent) 2398{ 2399 struct mem_cgroup *memcg; 2400 unsigned short id; 2401 2402 if (!do_swap_account) 2403 return; 2404 2405 id = swap_cgroup_record(ent, 0); 2406 rcu_read_lock(); 2407 memcg = mem_cgroup_lookup(id); 2408 if (memcg) { 2409 /* 2410 * We uncharge this because swap is freed. 2411 * This memcg can be obsolete one. We avoid calling css_tryget 2412 */ 2413 if (!mem_cgroup_is_root(memcg)) 2414 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 2415 mem_cgroup_swap_statistics(memcg, false); 2416 mem_cgroup_put(memcg); 2417 } 2418 rcu_read_unlock(); 2419} 2420 2421/** 2422 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. 2423 * @entry: swap entry to be moved 2424 * @from: mem_cgroup which the entry is moved from 2425 * @to: mem_cgroup which the entry is moved to 2426 * @need_fixup: whether we should fixup res_counters and refcounts. 2427 * 2428 * It succeeds only when the swap_cgroup's record for this entry is the same 2429 * as the mem_cgroup's id of @from. 2430 * 2431 * Returns 0 on success, -EINVAL on failure. 2432 * 2433 * The caller must have charged to @to, IOW, called res_counter_charge() about 2434 * both res and memsw, and called css_get(). 
2435 */ 2436static int mem_cgroup_move_swap_account(swp_entry_t entry, 2437 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) 2438{ 2439 unsigned short old_id, new_id; 2440 2441 old_id = css_id(&from->css); 2442 new_id = css_id(&to->css); 2443 2444 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { 2445 mem_cgroup_swap_statistics(from, false); 2446 mem_cgroup_swap_statistics(to, true); 2447 /* 2448 * This function is only called from task migration context now. 2449 * It postpones res_counter and refcount handling till the end 2450 * of task migration(mem_cgroup_clear_mc()) for performance 2451 * improvement. But we cannot postpone mem_cgroup_get(to) 2452 * because if the process that has been moved to @to does 2453 * swap-in, the refcount of @to might be decreased to 0. 2454 */ 2455 mem_cgroup_get(to); 2456 if (need_fixup) { 2457 if (!mem_cgroup_is_root(from)) 2458 res_counter_uncharge(&from->memsw, PAGE_SIZE); 2459 mem_cgroup_put(from); 2460 /* 2461 * we charged both to->res and to->memsw, so we should 2462 * uncharge to->res. 2463 */ 2464 if (!mem_cgroup_is_root(to)) 2465 res_counter_uncharge(&to->res, PAGE_SIZE); 2466 css_put(&to->css); 2467 } 2468 return 0; 2469 } 2470 return -EINVAL; 2471} 2472#else 2473static inline int mem_cgroup_move_swap_account(swp_entry_t entry, 2474 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) 2475{ 2476 return -EINVAL; 2477} 2478#endif 2479 2480/* 2481 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old 2482 * page belongs to. 2483 */ 2484int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) 2485{ 2486 struct page_cgroup *pc; 2487 struct mem_cgroup *mem = NULL; 2488 int ret = 0; 2489 2490 if (mem_cgroup_disabled()) 2491 return 0; 2492 2493 pc = lookup_page_cgroup(page); 2494 lock_page_cgroup(pc); 2495 if (PageCgroupUsed(pc)) { 2496 mem = pc->mem_cgroup; 2497 css_get(&mem->css); 2498 } 2499 unlock_page_cgroup(pc); 2500 2501 *ptr = mem; 2502 if (mem) { 2503 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false); 2504 css_put(&mem->css); 2505 } 2506 return ret; 2507} 2508 2509/* remove redundant charge if migration failed*/ 2510void mem_cgroup_end_migration(struct mem_cgroup *mem, 2511 struct page *oldpage, struct page *newpage) 2512{ 2513 struct page *target, *unused; 2514 struct page_cgroup *pc; 2515 enum charge_type ctype; 2516 2517 if (!mem) 2518 return; 2519 cgroup_exclude_rmdir(&mem->css); 2520 /* at migration success, oldpage->mapping is NULL. */ 2521 if (oldpage->mapping) { 2522 target = oldpage; 2523 unused = NULL; 2524 } else { 2525 target = newpage; 2526 unused = oldpage; 2527 } 2528 2529 if (PageAnon(target)) 2530 ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; 2531 else if (page_is_file_cache(target)) 2532 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; 2533 else 2534 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; 2535 2536 /* unused page is not on radix-tree now. */ 2537 if (unused) 2538 __mem_cgroup_uncharge_common(unused, ctype); 2539 2540 pc = lookup_page_cgroup(target); 2541 /* 2542 * __mem_cgroup_commit_charge() check PCG_USED bit of page_cgroup. 2543 * So, double-counting is effectively avoided. 2544 */ 2545 __mem_cgroup_commit_charge(mem, pc, ctype); 2546 2547 /* 2548 * Both of oldpage and newpage are still under lock_page(). 2549 * Then, we don't have to care about race in radix-tree. 2550 * But we have to be careful that this page is unmapped or not. 2551 * 2552 * There is a case for !page_mapped(). At the start of 2553 * migration, oldpage was mapped. But now, it's zapped. 
2554 * But we know *target* page is not freed/reused under us. 2555 * mem_cgroup_uncharge_page() does all necessary checks. 2556 */ 2557 if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) 2558 mem_cgroup_uncharge_page(target); 2559 /* 2560 * At migration, we may charge account against cgroup which has no tasks 2561 * So, rmdir()->pre_destroy() can be called while we do this charge. 2562 * In that case, we need to call pre_destroy() again. check it here. 2563 */ 2564 cgroup_release_and_wakeup_rmdir(&mem->css); 2565} 2566 2567/* 2568 * A call to try to shrink memory usage on charge failure at shmem's swapin. 2569 * Calling hierarchical_reclaim is not enough because we should update 2570 * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM. 2571 * Moreover considering hierarchy, we should reclaim from the mem_over_limit, 2572 * not from the memcg which this page would be charged to. 2573 * try_charge_swapin does all of these works properly. 2574 */ 2575int mem_cgroup_shmem_charge_fallback(struct page *page, 2576 struct mm_struct *mm, 2577 gfp_t gfp_mask) 2578{ 2579 struct mem_cgroup *mem = NULL; 2580 int ret; 2581 2582 if (mem_cgroup_disabled()) 2583 return 0; 2584 2585 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); 2586 if (!ret) 2587 mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */ 2588 2589 return ret; 2590} 2591 2592static DEFINE_MUTEX(set_limit_mutex); 2593 2594static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, 2595 unsigned long long val) 2596{ 2597 int retry_count; 2598 u64 memswlimit, memlimit; 2599 int ret = 0; 2600 int children = mem_cgroup_count_children(memcg); 2601 u64 curusage, oldusage; 2602 int enlarge; 2603 2604 /* 2605 * For keeping hierarchical_reclaim simple, how long we should retry 2606 * is depends on callers. We set our retry-count to be function 2607 * of # of children which we should visit in this loop. 2608 */ 2609 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children; 2610 2611 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); 2612 2613 enlarge = 0; 2614 while (retry_count) { 2615 if (signal_pending(current)) { 2616 ret = -EINTR; 2617 break; 2618 } 2619 /* 2620 * Rather than hide all in some function, I do this in 2621 * open coded manner. You see what this really does. 2622 * We have to guarantee mem->res.limit < mem->memsw.limit. 2623 */ 2624 mutex_lock(&set_limit_mutex); 2625 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 2626 if (memswlimit < val) { 2627 ret = -EINVAL; 2628 mutex_unlock(&set_limit_mutex); 2629 break; 2630 } 2631 2632 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 2633 if (memlimit < val) 2634 enlarge = 1; 2635 2636 ret = res_counter_set_limit(&memcg->res, val); 2637 if (!ret) { 2638 if (memswlimit == val) 2639 memcg->memsw_is_minimum = true; 2640 else 2641 memcg->memsw_is_minimum = false; 2642 } 2643 mutex_unlock(&set_limit_mutex); 2644 2645 if (!ret) 2646 break; 2647 2648 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, 2649 MEM_CGROUP_RECLAIM_SHRINK); 2650 curusage = res_counter_read_u64(&memcg->res, RES_USAGE); 2651 /* Usage is reduced ? 
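Both resize helpers preserve the invariant that the plain memory limit never exceeds the mem+swap limit: raising limit_in_bytes above the current memsw limit, or lowering memsw below the plain limit, fails with EINVAL, and either write fails with EBUSY once the retries here cannot reclaim usage under the new value. A userspace sketch of setting the two limits in the order that keeps the invariant; the group path is a placeholder and the memsw file only exists when swap accounting is compiled in and enabled:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>

#define CGDIR "/sys/fs/cgroup/memory/demo"      /* assumed group */

static int set_limit(const char *file, long long bytes)
{
    char path[256], val[32];
    int fd, ret = 0;

    snprintf(path, sizeof(path), "%s/%s", CGDIR, file);
    snprintf(val, sizeof(val), "%lld", bytes);
    fd = open(path, O_WRONLY);
    if (fd < 0)
        return -1;
    if (write(fd, val, strlen(val)) < 0)
        ret = -1;                       /* e.g. EBUSY: reclaim could not make room */
    close(fd);
    return ret;
}

int main(void)
{
    long long lim = 256LL << 20;        /* 256MB */

    /* Plain limit first; memsw must then be set to something >= it,
     * otherwise the memsw write is rejected with EINVAL. */
    if (set_limit("memory.limit_in_bytes", lim) ||
        set_limit("memory.memsw.limit_in_bytes", 2 * lim)) {
        fprintf(stderr, "limit update failed: %s\n", strerror(errno));
        return 1;
    }
    return 0;
}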
*/ 2652 if (curusage >= oldusage) 2653 retry_count--; 2654 else 2655 oldusage = curusage; 2656 } 2657 if (!ret && enlarge) 2658 memcg_oom_recover(memcg); 2659 2660 return ret; 2661} 2662 2663static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, 2664 unsigned long long val) 2665{ 2666 int retry_count; 2667 u64 memlimit, memswlimit, oldusage, curusage; 2668 int children = mem_cgroup_count_children(memcg); 2669 int ret = -EBUSY; 2670 int enlarge = 0; 2671 2672 /* see mem_cgroup_resize_res_limit */ 2673 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; 2674 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 2675 while (retry_count) { 2676 if (signal_pending(current)) { 2677 ret = -EINTR; 2678 break; 2679 } 2680 /* 2681 * Rather than hide all in some function, I do this in 2682 * open coded manner. You see what this really does. 2683 * We have to guarantee mem->res.limit < mem->memsw.limit. 2684 */ 2685 mutex_lock(&set_limit_mutex); 2686 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 2687 if (memlimit > val) { 2688 ret = -EINVAL; 2689 mutex_unlock(&set_limit_mutex); 2690 break; 2691 } 2692 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 2693 if (memswlimit < val) 2694 enlarge = 1; 2695 ret = res_counter_set_limit(&memcg->memsw, val); 2696 if (!ret) { 2697 if (memlimit == val) 2698 memcg->memsw_is_minimum = true; 2699 else 2700 memcg->memsw_is_minimum = false; 2701 } 2702 mutex_unlock(&set_limit_mutex); 2703 2704 if (!ret) 2705 break; 2706 2707 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, 2708 MEM_CGROUP_RECLAIM_NOSWAP | 2709 MEM_CGROUP_RECLAIM_SHRINK); 2710 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 2711 /* Usage is reduced ? */ 2712 if (curusage >= oldusage) 2713 retry_count--; 2714 else 2715 oldusage = curusage; 2716 } 2717 if (!ret && enlarge) 2718 memcg_oom_recover(memcg); 2719 return ret; 2720} 2721 2722unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 2723 gfp_t gfp_mask, int nid, 2724 int zid) 2725{ 2726 unsigned long nr_reclaimed = 0; 2727 struct mem_cgroup_per_zone *mz, *next_mz = NULL; 2728 unsigned long reclaimed; 2729 int loop = 0; 2730 struct mem_cgroup_tree_per_zone *mctz; 2731 unsigned long long excess; 2732 2733 if (order > 0) 2734 return 0; 2735 2736 mctz = soft_limit_tree_node_zone(nid, zid); 2737 /* 2738 * This loop can run a while, specially if mem_cgroup's continuously 2739 * keep exceeding their soft limit and putting the system under 2740 * pressure 2741 */ 2742 do { 2743 if (next_mz) 2744 mz = next_mz; 2745 else 2746 mz = mem_cgroup_largest_soft_limit_node(mctz); 2747 if (!mz) 2748 break; 2749 2750 reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone, 2751 gfp_mask, 2752 MEM_CGROUP_RECLAIM_SOFT); 2753 nr_reclaimed += reclaimed; 2754 spin_lock(&mctz->lock); 2755 2756 /* 2757 * If we failed to reclaim anything from this memory cgroup 2758 * it is time to move on to the next cgroup 2759 */ 2760 next_mz = NULL; 2761 if (!reclaimed) { 2762 do { 2763 /* 2764 * Loop until we find yet another one. 2765 * 2766 * By the time we get the soft_limit lock 2767 * again, someone might have aded the 2768 * group back on the RB tree. Iterate to 2769 * make sure we get a different mem. 
2770 * mem_cgroup_largest_soft_limit_node returns 2771 * NULL if no other cgroup is present on 2772 * the tree 2773 */ 2774 next_mz = 2775 __mem_cgroup_largest_soft_limit_node(mctz); 2776 if (next_mz == mz) { 2777 css_put(&next_mz->mem->css); 2778 next_mz = NULL; 2779 } else /* next_mz == NULL or other memcg */ 2780 break; 2781 } while (1); 2782 } 2783 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); 2784 excess = res_counter_soft_limit_excess(&mz->mem->res); 2785 /* 2786 * One school of thought says that we should not add 2787 * back the node to the tree if reclaim returns 0. 2788 * But our reclaim could return 0, simply because due 2789 * to priority we are exposing a smaller subset of 2790 * memory to reclaim from. Consider this as a longer 2791 * term TODO. 2792 */ 2793 /* If excess == 0, no tree ops */ 2794 __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess); 2795 spin_unlock(&mctz->lock); 2796 css_put(&mz->mem->css); 2797 loop++; 2798 /* 2799 * Could not reclaim anything and there are no more 2800 * mem cgroups to try or we seem to be looping without 2801 * reclaiming anything. 2802 */ 2803 if (!nr_reclaimed && 2804 (next_mz == NULL || 2805 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) 2806 break; 2807 } while (!nr_reclaimed); 2808 if (next_mz) 2809 css_put(&next_mz->mem->css); 2810 return nr_reclaimed; 2811} 2812 2813/* 2814 * This routine traverse page_cgroup in given list and drop them all. 2815 * *And* this routine doesn't reclaim page itself, just removes page_cgroup. 2816 */ 2817static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, 2818 int node, int zid, enum lru_list lru) 2819{ 2820 struct zone *zone; 2821 struct mem_cgroup_per_zone *mz; 2822 struct page_cgroup *pc, *busy; 2823 unsigned long flags, loop; 2824 struct list_head *list; 2825 int ret = 0; 2826 2827 zone = &NODE_DATA(node)->node_zones[zid]; 2828 mz = mem_cgroup_zoneinfo(mem, node, zid); 2829 list = &mz->lists[lru]; 2830 2831 loop = MEM_CGROUP_ZSTAT(mz, lru); 2832 /* give some margin against EBUSY etc...*/ 2833 loop += 256; 2834 busy = NULL; 2835 while (loop--) { 2836 ret = 0; 2837 spin_lock_irqsave(&zone->lru_lock, flags); 2838 if (list_empty(list)) { 2839 spin_unlock_irqrestore(&zone->lru_lock, flags); 2840 break; 2841 } 2842 pc = list_entry(list->prev, struct page_cgroup, lru); 2843 if (busy == pc) { 2844 list_move(&pc->lru, list); 2845 busy = NULL; 2846 spin_unlock_irqrestore(&zone->lru_lock, flags); 2847 continue; 2848 } 2849 spin_unlock_irqrestore(&zone->lru_lock, flags); 2850 2851 ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL); 2852 if (ret == -ENOMEM) 2853 break; 2854 2855 if (ret == -EBUSY || ret == -EINVAL) { 2856 /* found lock contention or "pc" is obsolete. */ 2857 busy = pc; 2858 cond_resched(); 2859 } else 2860 busy = NULL; 2861 } 2862 2863 if (!ret && !list_empty(list)) 2864 return -EBUSY; 2865 return ret; 2866} 2867 2868/* 2869 * make mem_cgroup's charge to be 0 if there is no task. 2870 * This enables deleting this mem_cgroup. 2871 */ 2872static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all) 2873{ 2874 int ret; 2875 int node, zid, shrink; 2876 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 2877 struct cgroup *cgrp = mem->css.cgroup; 2878 2879 css_get(&mem->css); 2880 2881 shrink = 0; 2882 /* should free all ? 
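mem_cgroup_force_empty() here backs both the memory.force_empty control file and pre_destroy(): it tries to move the remaining charges to the parent or reclaim them so that an unused group can be removed. A sketch of the usual userspace sequence before deleting a group (paths are placeholders and the writes need appropriate privilege): move the tasks out, write to memory.force_empty, then rmdir the directory.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>

#define ROOT  "/sys/fs/cgroup/memory"   /* assumed mount point */
#define GROUP ROOT "/demo"              /* group to be removed */

static int write_str(const char *path, const char *s)
{
    int fd = open(path, O_WRONLY);
    int ret = (fd >= 0 && write(fd, s, strlen(s)) >= 0) ? 0 : -1;

    if (fd >= 0)
        close(fd);
    return ret;
}

int main(void)
{
    char pid[32];

    /* 1. A group that still has tasks cannot be emptied or removed,
     *    so move ourselves back to the root group first. */
    snprintf(pid, sizeof(pid), "%d", getpid());
    write_str(ROOT "/tasks", pid);

    /* 2. Drop or reparent the remaining page charges. */
    if (write_str(GROUP "/memory.force_empty", "0"))
        perror("force_empty");

    /* 3. rmdir() can now succeed; it still fails if charges remain. */
    if (rmdir(GROUP))
        perror("rmdir");
    return 0;
}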
*/ 2883 if (free_all) 2884 goto try_to_free; 2885move_account: 2886 do { 2887 ret = -EBUSY; 2888 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) 2889 goto out; 2890 ret = -EINTR; 2891 if (signal_pending(current)) 2892 goto out; 2893 /* This is for making all *used* pages to be on LRU. */ 2894 lru_add_drain_all(); 2895 drain_all_stock_sync(); 2896 ret = 0; 2897 for_each_node_state(node, N_HIGH_MEMORY) { 2898 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { 2899 enum lru_list l; 2900 for_each_lru(l) { 2901 ret = mem_cgroup_force_empty_list(mem, 2902 node, zid, l); 2903 if (ret) 2904 break; 2905 } 2906 } 2907 if (ret) 2908 break; 2909 } 2910 memcg_oom_recover(mem); 2911 /* it seems parent cgroup doesn't have enough mem */ 2912 if (ret == -ENOMEM) 2913 goto try_to_free; 2914 cond_resched(); 2915 /* "ret" should also be checked to ensure all lists are empty. */ 2916 } while (mem->res.usage > 0 || ret); 2917out: 2918 css_put(&mem->css); 2919 return ret; 2920 2921try_to_free: 2922 /* returns EBUSY if there is a task or if we come here twice. */ 2923 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) { 2924 ret = -EBUSY; 2925 goto out; 2926 } 2927 /* we call try-to-free pages for make this cgroup empty */ 2928 lru_add_drain_all(); 2929 /* try to free all pages in this cgroup */ 2930 shrink = 1; 2931 while (nr_retries && mem->res.usage > 0) { 2932 int progress; 2933 2934 if (signal_pending(current)) { 2935 ret = -EINTR; 2936 goto out; 2937 } 2938 progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL, 2939 false, get_swappiness(mem)); 2940 if (!progress) { 2941 nr_retries--; 2942 /* maybe some writeback is necessary */ 2943 congestion_wait(BLK_RW_ASYNC, HZ/10); 2944 } 2945 2946 } 2947 lru_add_drain(); 2948 /* try move_account...there may be some *locked* pages. */ 2949 goto move_account; 2950} 2951 2952int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) 2953{ 2954 return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true); 2955} 2956 2957 2958static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft) 2959{ 2960 return mem_cgroup_from_cont(cont)->use_hierarchy; 2961} 2962 2963static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, 2964 u64 val) 2965{ 2966 int retval = 0; 2967 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 2968 struct cgroup *parent = cont->parent; 2969 struct mem_cgroup *parent_mem = NULL; 2970 2971 if (parent) 2972 parent_mem = mem_cgroup_from_cont(parent); 2973 2974 cgroup_lock(); 2975 /* 2976 * If parent's use_hierarchy is set, we can't make any modifications 2977 * in the child subtrees. If it is unset, then the change can 2978 * occur, provided the current cgroup has no children. 2979 * 2980 * For the root cgroup, parent_mem is NULL, we allow value to be 2981 * set if there are no children. 
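As this comment says, mem_cgroup_hierarchy_write() only accepts 0 or 1, refuses the write when the parent already enforces hierarchy, and returns EBUSY once the group has children, so use_hierarchy has to be switched on before any child group is created. A short sketch of that ordering (the group paths are placeholders):

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/stat.h>

#define PARENT "/sys/fs/cgroup/memory/parent"   /* assumed group */

static int write_str(const char *path, const char *s)
{
    int fd = open(path, O_WRONLY);
    int ret = (fd >= 0 && write(fd, s, strlen(s)) >= 0) ? 0 : -1;

    if (fd >= 0)
        close(fd);
    return ret;
}

int main(void)
{
    mkdir(PARENT, 0755);

    /* Must happen while "parent" has no children, otherwise: EBUSY. */
    if (write_str(PARENT "/memory.use_hierarchy", "1"))
        perror("use_hierarchy");

    /* Children created from now on charge against the parent's
     * res_counter as well, so the parent's limit covers the subtree. */
    mkdir(PARENT "/child", 0755);
    return 0;
}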
2982 */ 2983 if ((!parent_mem || !parent_mem->use_hierarchy) && 2984 (val == 1 || val == 0)) { 2985 if (list_empty(&cont->children)) 2986 mem->use_hierarchy = val; 2987 else 2988 retval = -EBUSY; 2989 } else 2990 retval = -EINVAL; 2991 cgroup_unlock(); 2992 2993 return retval; 2994} 2995 2996struct mem_cgroup_idx_data { 2997 s64 val; 2998 enum mem_cgroup_stat_index idx; 2999}; 3000 3001static int 3002mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data) 3003{ 3004 struct mem_cgroup_idx_data *d = data; 3005 d->val += mem_cgroup_read_stat(mem, d->idx); 3006 return 0; 3007} 3008 3009static void 3010mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem, 3011 enum mem_cgroup_stat_index idx, s64 *val) 3012{ 3013 struct mem_cgroup_idx_data d; 3014 d.idx = idx; 3015 d.val = 0; 3016 mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat); 3017 *val = d.val; 3018} 3019 3020static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap) 3021{ 3022 u64 idx_val, val; 3023 3024 if (!mem_cgroup_is_root(mem)) { 3025 if (!swap) 3026 return res_counter_read_u64(&mem->res, RES_USAGE); 3027 else 3028 return res_counter_read_u64(&mem->memsw, RES_USAGE); 3029 } 3030 3031 mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE, &idx_val); 3032 val = idx_val; 3033 mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS, &idx_val); 3034 val += idx_val; 3035 3036 if (swap) { 3037 mem_cgroup_get_recursive_idx_stat(mem, 3038 MEM_CGROUP_STAT_SWAPOUT, &idx_val); 3039 val += idx_val; 3040 } 3041 3042 return val << PAGE_SHIFT; 3043} 3044 3045static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) 3046{ 3047 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 3048 u64 val; 3049 int type, name; 3050 3051 type = MEMFILE_TYPE(cft->private); 3052 name = MEMFILE_ATTR(cft->private); 3053 switch (type) { 3054 case _MEM: 3055 if (name == RES_USAGE) 3056 val = mem_cgroup_usage(mem, false); 3057 else 3058 val = res_counter_read_u64(&mem->res, name); 3059 break; 3060 case _MEMSWAP: 3061 if (name == RES_USAGE) 3062 val = mem_cgroup_usage(mem, true); 3063 else 3064 val = res_counter_read_u64(&mem->memsw, name); 3065 break; 3066 default: 3067 BUG(); 3068 break; 3069 } 3070 return val; 3071} 3072/* 3073 * The user of this function is... 3074 * RES_LIMIT. 3075 */ 3076static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, 3077 const char *buffer) 3078{ 3079 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 3080 int type, name; 3081 unsigned long long val; 3082 int ret; 3083 3084 type = MEMFILE_TYPE(cft->private); 3085 name = MEMFILE_ATTR(cft->private); 3086 switch (name) { 3087 case RES_LIMIT: 3088 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 3089 ret = -EINVAL; 3090 break; 3091 } 3092 /* This function does all necessary parse...reuse it */ 3093 ret = res_counter_memparse_write_strategy(buffer, &val); 3094 if (ret) 3095 break; 3096 if (type == _MEM) 3097 ret = mem_cgroup_resize_limit(memcg, val); 3098 else 3099 ret = mem_cgroup_resize_memsw_limit(memcg, val); 3100 break; 3101 case RES_SOFT_LIMIT: 3102 ret = res_counter_memparse_write_strategy(buffer, &val); 3103 if (ret) 3104 break; 3105 /* 3106 * For memsw, soft limits are hard to implement in terms 3107 * of semantics, for now, we support soft limits for 3108 * control without swap 3109 */ 3110 if (type == _MEM) 3111 ret = res_counter_set_soft_limit(&memcg->res, val); 3112 else 3113 ret = -EINVAL; 3114 break; 3115 default: 3116 ret = -EINVAL; /* should be BUG() ? 
*/ 3117 break; 3118 } 3119 return ret; 3120} 3121 3122static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, 3123 unsigned long long *mem_limit, unsigned long long *memsw_limit) 3124{ 3125 struct cgroup *cgroup; 3126 unsigned long long min_limit, min_memsw_limit, tmp; 3127 3128 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT); 3129 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3130 cgroup = memcg->css.cgroup; 3131 if (!memcg->use_hierarchy) 3132 goto out; 3133 3134 while (cgroup->parent) { 3135 cgroup = cgroup->parent; 3136 memcg = mem_cgroup_from_cont(cgroup); 3137 if (!memcg->use_hierarchy) 3138 break; 3139 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT); 3140 min_limit = min(min_limit, tmp); 3141 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3142 min_memsw_limit = min(min_memsw_limit, tmp); 3143 } 3144out: 3145 *mem_limit = min_limit; 3146 *memsw_limit = min_memsw_limit; 3147 return; 3148} 3149 3150static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 3151{ 3152 struct mem_cgroup *mem; 3153 int type, name; 3154 3155 mem = mem_cgroup_from_cont(cont); 3156 type = MEMFILE_TYPE(event); 3157 name = MEMFILE_ATTR(event); 3158 switch (name) { 3159 case RES_MAX_USAGE: 3160 if (type == _MEM) 3161 res_counter_reset_max(&mem->res); 3162 else 3163 res_counter_reset_max(&mem->memsw); 3164 break; 3165 case RES_FAILCNT: 3166 if (type == _MEM) 3167 res_counter_reset_failcnt(&mem->res); 3168 else 3169 res_counter_reset_failcnt(&mem->memsw); 3170 break; 3171 } 3172 3173 return 0; 3174} 3175 3176static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp, 3177 struct cftype *cft) 3178{ 3179 return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate; 3180} 3181 3182#ifdef CONFIG_MMU 3183static int mem_cgroup_move_charge_write(struct cgroup *cgrp, 3184 struct cftype *cft, u64 val) 3185{ 3186 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 3187 3188 if (val >= (1 << NR_MOVE_TYPE)) 3189 return -EINVAL; 3190 /* 3191 * We check this value several times in both in can_attach() and 3192 * attach(), so we need cgroup lock to prevent this value from being 3193 * inconsistent. 
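move_charge_at_immigrate is the bitmask read and written by the two handlers here; when a bit is set on the destination group, the matching class of charges follows a task that is migrated into that group, via the precharge and move-account machinery later in this file (CONFIG_MMU only). In the sketch below bit 0 is assumed to select the task's anonymous pages; the exact bit meanings should be checked against Documentation/cgroups/memory.txt, and the group path is a placeholder:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>

#define DEST "/sys/fs/cgroup/memory/dest"       /* assumed destination group */

static int write_str(const char *path, const char *s)
{
    int fd = open(path, O_WRONLY);
    int ret = (fd >= 0 && write(fd, s, strlen(s)) >= 0) ? 0 : -1;

    if (fd >= 0)
        close(fd);
    return ret;
}

int main(int argc, char **argv)
{
    if (argc < 2) {
        fprintf(stderr, "usage: %s <pid>\n", argv[0]);
        return 1;
    }

    /* Enable charge moving on the destination.  Values with bits beyond
     * NR_MOVE_TYPE are rejected with EINVAL by the write handler. */
    if (write_str(DEST "/memory.move_charge_at_immigrate", "1"))
        perror("move_charge_at_immigrate");

    /* Migrating the task now also migrates its (anonymous) charges. */
    if (write_str(DEST "/tasks", argv[1]))
        perror("attach");
    return 0;
}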
3194 */ 3195 cgroup_lock(); 3196 mem->move_charge_at_immigrate = val; 3197 cgroup_unlock(); 3198 3199 return 0; 3200} 3201#else 3202static int mem_cgroup_move_charge_write(struct cgroup *cgrp, 3203 struct cftype *cft, u64 val) 3204{ 3205 return -ENOSYS; 3206} 3207#endif 3208 3209 3210/* For read statistics */ 3211enum { 3212 MCS_CACHE, 3213 MCS_RSS, 3214 MCS_FILE_MAPPED, 3215 MCS_PGPGIN, 3216 MCS_PGPGOUT, 3217 MCS_SWAP, 3218 MCS_INACTIVE_ANON, 3219 MCS_ACTIVE_ANON, 3220 MCS_INACTIVE_FILE, 3221 MCS_ACTIVE_FILE, 3222 MCS_UNEVICTABLE, 3223 NR_MCS_STAT, 3224}; 3225 3226struct mcs_total_stat { 3227 s64 stat[NR_MCS_STAT]; 3228}; 3229 3230struct { 3231 char *local_name; 3232 char *total_name; 3233} memcg_stat_strings[NR_MCS_STAT] = { 3234 {"cache", "total_cache"}, 3235 {"rss", "total_rss"}, 3236 {"mapped_file", "total_mapped_file"}, 3237 {"pgpgin", "total_pgpgin"}, 3238 {"pgpgout", "total_pgpgout"}, 3239 {"swap", "total_swap"}, 3240 {"inactive_anon", "total_inactive_anon"}, 3241 {"active_anon", "total_active_anon"}, 3242 {"inactive_file", "total_inactive_file"}, 3243 {"active_file", "total_active_file"}, 3244 {"unevictable", "total_unevictable"} 3245}; 3246 3247 3248static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data) 3249{ 3250 struct mcs_total_stat *s = data; 3251 s64 val; 3252 3253 /* per cpu stat */ 3254 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); 3255 s->stat[MCS_CACHE] += val * PAGE_SIZE; 3256 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); 3257 s->stat[MCS_RSS] += val * PAGE_SIZE; 3258 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED); 3259 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; 3260 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT); 3261 s->stat[MCS_PGPGIN] += val; 3262 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT); 3263 s->stat[MCS_PGPGOUT] += val; 3264 if (do_swap_account) { 3265 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); 3266 s->stat[MCS_SWAP] += val * PAGE_SIZE; 3267 } 3268 3269 /* per zone stat */ 3270 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); 3271 s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE; 3272 val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON); 3273 s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE; 3274 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE); 3275 s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE; 3276 val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE); 3277 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; 3278 val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE); 3279 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; 3280 return 0; 3281} 3282 3283static void 3284mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) 3285{ 3286 mem_cgroup_walk_tree(mem, s, mem_cgroup_get_local_stat); 3287} 3288 3289static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, 3290 struct cgroup_map_cb *cb) 3291{ 3292 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); 3293 struct mcs_total_stat mystat; 3294 int i; 3295 3296 memset(&mystat, 0, sizeof(mystat)); 3297 mem_cgroup_get_local_stat(mem_cont, &mystat); 3298 3299 for (i = 0; i < NR_MCS_STAT; i++) { 3300 if (i == MCS_SWAP && !do_swap_account) 3301 continue; 3302 cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]); 3303 } 3304 3305 /* Hierarchical information */ 3306 { 3307 unsigned long long limit, memsw_limit; 3308 memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit); 3309 cb->fill(cb, "hierarchical_memory_limit", limit); 3310 if 
(do_swap_account) 3311 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit); 3312 } 3313 3314 memset(&mystat, 0, sizeof(mystat)); 3315 mem_cgroup_get_total_stat(mem_cont, &mystat); 3316 for (i = 0; i < NR_MCS_STAT; i++) { 3317 if (i == MCS_SWAP && !do_swap_account) 3318 continue; 3319 cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]); 3320 } 3321 3322#ifdef CONFIG_DEBUG_VM 3323 cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); 3324 3325 { 3326 int nid, zid; 3327 struct mem_cgroup_per_zone *mz; 3328 unsigned long recent_rotated[2] = {0, 0}; 3329 unsigned long recent_scanned[2] = {0, 0}; 3330 3331 for_each_online_node(nid) 3332 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 3333 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); 3334 3335 recent_rotated[0] += 3336 mz->reclaim_stat.recent_rotated[0]; 3337 recent_rotated[1] += 3338 mz->reclaim_stat.recent_rotated[1]; 3339 recent_scanned[0] += 3340 mz->reclaim_stat.recent_scanned[0]; 3341 recent_scanned[1] += 3342 mz->reclaim_stat.recent_scanned[1]; 3343 } 3344 cb->fill(cb, "recent_rotated_anon", recent_rotated[0]); 3345 cb->fill(cb, "recent_rotated_file", recent_rotated[1]); 3346 cb->fill(cb, "recent_scanned_anon", recent_scanned[0]); 3347 cb->fill(cb, "recent_scanned_file", recent_scanned[1]); 3348 } 3349#endif 3350 3351 return 0; 3352} 3353 3354static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft) 3355{ 3356 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 3357 3358 return get_swappiness(memcg); 3359} 3360 3361static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, 3362 u64 val) 3363{ 3364 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 3365 struct mem_cgroup *parent; 3366 3367 if (val > 100) 3368 return -EINVAL; 3369 3370 if (cgrp->parent == NULL) 3371 return -EINVAL; 3372 3373 parent = mem_cgroup_from_cont(cgrp->parent); 3374 3375 cgroup_lock(); 3376 3377 /* If under hierarchy, only empty-root can set this value */ 3378 if ((parent->use_hierarchy) || 3379 (memcg->use_hierarchy && !list_empty(&cgrp->children))) { 3380 cgroup_unlock(); 3381 return -EINVAL; 3382 } 3383 3384 spin_lock(&memcg->reclaim_param_lock); 3385 memcg->swappiness = val; 3386 spin_unlock(&memcg->reclaim_param_lock); 3387 3388 cgroup_unlock(); 3389 3390 return 0; 3391} 3392 3393static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) 3394{ 3395 struct mem_cgroup_threshold_ary *t; 3396 u64 usage; 3397 int i; 3398 3399 rcu_read_lock(); 3400 if (!swap) 3401 t = rcu_dereference(memcg->thresholds); 3402 else 3403 t = rcu_dereference(memcg->memsw_thresholds); 3404 3405 if (!t) 3406 goto unlock; 3407 3408 usage = mem_cgroup_usage(memcg, swap); 3409 3410 /* 3411 * current_threshold points to threshold just below usage. 3412 * If it's not true, a threshold was crossed after last 3413 * call of __mem_cgroup_threshold(). 3414 */ 3415 i = t->current_threshold; 3416 3417 /* 3418 * Iterate backward over array of thresholds starting from 3419 * current_threshold and check if a threshold is crossed. 3420 * If none of thresholds below usage is crossed, we read 3421 * only one element of the array here. 3422 */ 3423 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) 3424 eventfd_signal(t->entries[i].eventfd, 1); 3425 3426 /* i = current_threshold + 1 */ 3427 i++; 3428 3429 /* 3430 * Iterate forward over array of thresholds starting from 3431 * current_threshold+1 and check if a threshold is crossed. 
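Viewed on its own, the bookkeeping being described fits in a few lines: the registration path keeps the array sorted, current_threshold always names the highest entry at or below the last observed usage, and a crossing in either direction signals every entry that was passed over. A standalone model of exactly that scan (the values are arbitrary, not kernel data):

#include <stdio.h>

/* Sorted ascending, as the registration path sorts them. */
static unsigned long long thr[] = { 100, 200, 300, 400 };
static int nthr = 4;
static int cur = -1;        /* index of the highest threshold <= last usage */

static void check(unsigned long long usage)
{
    int i;

    /* Walk down: thresholds we dropped below since the last check. */
    for (i = cur; i >= 0 && thr[i] > usage; i--)
        printf("signal: fell below %llu\n", thr[i]);
    /* Walk up: thresholds we climbed over since the last check. */
    for (i++; i < nthr && thr[i] <= usage; i++)
        printf("signal: rose past %llu\n", thr[i]);
    cur = i - 1;
}

int main(void)
{
    check(150);     /* crosses 100 upward */
    check(350);     /* crosses 200 and 300 upward */
    check(50);      /* falls back below 300, 200 and 100 */
    return 0;
}

If usage has not crossed anything since the previous call, each loop reads a single array element and stops, which is the common case the surrounding comment is pointing at.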
3432 * If none of thresholds above usage is crossed, we read 3433 * only one element of the array here. 3434 */ 3435 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) 3436 eventfd_signal(t->entries[i].eventfd, 1); 3437 3438 /* Update current_threshold */ 3439 t->current_threshold = i - 1; 3440unlock: 3441 rcu_read_unlock(); 3442} 3443 3444static void mem_cgroup_threshold(struct mem_cgroup *memcg) 3445{ 3446 __mem_cgroup_threshold(memcg, false); 3447 if (do_swap_account) 3448 __mem_cgroup_threshold(memcg, true); 3449} 3450 3451static int compare_thresholds(const void *a, const void *b) 3452{ 3453 const struct mem_cgroup_threshold *_a = a; 3454 const struct mem_cgroup_threshold *_b = b; 3455 3456 return _a->threshold - _b->threshold; 3457} 3458 3459static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data) 3460{ 3461 struct mem_cgroup_eventfd_list *ev; 3462 3463 list_for_each_entry(ev, &mem->oom_notify, list) 3464 eventfd_signal(ev->eventfd, 1); 3465 return 0; 3466} 3467 3468static void mem_cgroup_oom_notify(struct mem_cgroup *mem) 3469{ 3470 mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_notify_cb); 3471} 3472 3473static int mem_cgroup_usage_register_event(struct cgroup *cgrp, 3474 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 3475{ 3476 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 3477 struct mem_cgroup_threshold_ary *thresholds, *thresholds_new; 3478 int type = MEMFILE_TYPE(cft->private); 3479 u64 threshold, usage; 3480 int size; 3481 int i, ret; 3482 3483 ret = res_counter_memparse_write_strategy(args, &threshold); 3484 if (ret) 3485 return ret; 3486 3487 mutex_lock(&memcg->thresholds_lock); 3488 if (type == _MEM) 3489 thresholds = memcg->thresholds; 3490 else if (type == _MEMSWAP) 3491 thresholds = memcg->memsw_thresholds; 3492 else 3493 BUG(); 3494 3495 usage = mem_cgroup_usage(memcg, type == _MEMSWAP); 3496 3497 /* Check if a threshold crossed before adding a new one */ 3498 if (thresholds) 3499 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 3500 3501 if (thresholds) 3502 size = thresholds->size + 1; 3503 else 3504 size = 1; 3505 3506 /* Allocate memory for new array of thresholds */ 3507 thresholds_new = kmalloc(sizeof(*thresholds_new) + 3508 size * sizeof(struct mem_cgroup_threshold), 3509 GFP_KERNEL); 3510 if (!thresholds_new) { 3511 ret = -ENOMEM; 3512 goto unlock; 3513 } 3514 thresholds_new->size = size; 3515 3516 /* Copy thresholds (if any) to new array */ 3517 if (thresholds) 3518 memcpy(thresholds_new->entries, thresholds->entries, 3519 thresholds->size * 3520 sizeof(struct mem_cgroup_threshold)); 3521 /* Add new threshold */ 3522 thresholds_new->entries[size - 1].eventfd = eventfd; 3523 thresholds_new->entries[size - 1].threshold = threshold; 3524 3525 /* Sort thresholds. Registering of new threshold isn't time-critical */ 3526 sort(thresholds_new->entries, size, 3527 sizeof(struct mem_cgroup_threshold), 3528 compare_thresholds, NULL); 3529 3530 /* Find current threshold */ 3531 thresholds_new->current_threshold = -1; 3532 for (i = 0; i < size; i++) { 3533 if (thresholds_new->entries[i].threshold < usage) { 3534 /* 3535 * thresholds_new->current_threshold will not be used 3536 * until rcu_assign_pointer(), so it's safe to increment 3537 * it here. 
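The registration path here is reached from userspace through the cgroup v1 event interface: create an eventfd, open the usage file to be watched, and write "<event fd> <usage file fd> <threshold>" to the group's cgroup.event_control; the eventfd then becomes readable whenever usage crosses the threshold in either direction. A minimal sketch with a placeholder group path and a 64MB threshold:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <stdint.h>
#include <sys/eventfd.h>

#define CGDIR "/sys/fs/cgroup/memory/demo"      /* assumed group */

int main(void)
{
    char cmd[64];
    uint64_t ticks;
    int efd, ufd, cfd;

    efd = eventfd(0, 0);
    ufd = open(CGDIR "/memory.usage_in_bytes", O_RDONLY);
    cfd = open(CGDIR "/cgroup.event_control", O_WRONLY);
    if (efd < 0 || ufd < 0 || cfd < 0) {
        perror("setup");
        return 1;
    }

    /* "<eventfd> <fd of the watched usage file> <threshold in bytes>" */
    snprintf(cmd, sizeof(cmd), "%d %d %llu", efd, ufd, 64ULL << 20);
    if (write(cfd, cmd, strlen(cmd)) < 0) {
        perror("event_control");
        return 1;
    }

    /* Blocks until usage crosses the 64MB mark. */
    if (read(efd, &ticks, sizeof(ticks)) == sizeof(ticks))
        printf("threshold crossed (%llu notifications)\n",
               (unsigned long long)ticks);
    return 0;
}

Registering against memory.memsw.usage_in_bytes instead arms the mem+swap threshold set, i.e. the _MEMSWAP branch of this function.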
3538 */ 3539 ++thresholds_new->current_threshold; 3540 } 3541 } 3542 3543 if (type == _MEM) 3544 rcu_assign_pointer(memcg->thresholds, thresholds_new); 3545 else 3546 rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new); 3547 3548 /* To be sure that nobody uses thresholds before freeing it */ 3549 synchronize_rcu(); 3550 3551 kfree(thresholds); 3552unlock: 3553 mutex_unlock(&memcg->thresholds_lock); 3554 3555 return ret; 3556} 3557 3558static int mem_cgroup_usage_unregister_event(struct cgroup *cgrp, 3559 struct cftype *cft, struct eventfd_ctx *eventfd) 3560{ 3561 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 3562 struct mem_cgroup_threshold_ary *thresholds, *thresholds_new; 3563 int type = MEMFILE_TYPE(cft->private); 3564 u64 usage; 3565 int size = 0; 3566 int i, j, ret = 0; 3567 3568 mutex_lock(&memcg->thresholds_lock); 3569 if (type == _MEM) 3570 thresholds = memcg->thresholds; 3571 else if (type == _MEMSWAP) 3572 thresholds = memcg->memsw_thresholds; 3573 else 3574 BUG(); 3575 3576 /* 3577 * Something went wrong if we trying to unregister a threshold 3578 * if we don't have thresholds 3579 */ 3580 BUG_ON(!thresholds); 3581 3582 usage = mem_cgroup_usage(memcg, type == _MEMSWAP); 3583 3584 /* Check if a threshold crossed before removing */ 3585 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 3586 3587 /* Calculate new number of threshold */ 3588 for (i = 0; i < thresholds->size; i++) { 3589 if (thresholds->entries[i].eventfd != eventfd) 3590 size++; 3591 } 3592 3593 /* Set thresholds array to NULL if we don't have thresholds */ 3594 if (!size) { 3595 thresholds_new = NULL; 3596 goto assign; 3597 } 3598 3599 /* Allocate memory for new array of thresholds */ 3600 thresholds_new = kmalloc(sizeof(*thresholds_new) + 3601 size * sizeof(struct mem_cgroup_threshold), 3602 GFP_KERNEL); 3603 if (!thresholds_new) { 3604 ret = -ENOMEM; 3605 goto unlock; 3606 } 3607 thresholds_new->size = size; 3608 3609 /* Copy thresholds and find current threshold */ 3610 thresholds_new->current_threshold = -1; 3611 for (i = 0, j = 0; i < thresholds->size; i++) { 3612 if (thresholds->entries[i].eventfd == eventfd) 3613 continue; 3614 3615 thresholds_new->entries[j] = thresholds->entries[i]; 3616 if (thresholds_new->entries[j].threshold < usage) { 3617 /* 3618 * thresholds_new->current_threshold will not be used 3619 * until rcu_assign_pointer(), so it's safe to increment 3620 * it here. 3621 */ 3622 ++thresholds_new->current_threshold; 3623 } 3624 j++; 3625 } 3626 3627assign: 3628 if (type == _MEM) 3629 rcu_assign_pointer(memcg->thresholds, thresholds_new); 3630 else 3631 rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new); 3632 3633 /* To be sure that nobody uses thresholds before freeing it */ 3634 synchronize_rcu(); 3635 3636 kfree(thresholds); 3637unlock: 3638 mutex_unlock(&memcg->thresholds_lock); 3639 3640 return ret; 3641} 3642 3643static int mem_cgroup_oom_register_event(struct cgroup *cgrp, 3644 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 3645{ 3646 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 3647 struct mem_cgroup_eventfd_list *event; 3648 int type = MEMFILE_TYPE(cft->private); 3649 3650 BUG_ON(type != _OOM_TYPE); 3651 event = kmalloc(sizeof(*event), GFP_KERNEL); 3652 if (!event) 3653 return -ENOMEM; 3654 3655 mutex_lock(&memcg_oom_mutex); 3656 3657 event->eventfd = eventfd; 3658 list_add(&event->list, &memcg->oom_notify); 3659 3660 /* already in OOM ? 
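OOM notification reuses the same cgroup.event_control plumbing, but without a threshold argument, and memory.oom_control doubles as the knob that disables the OOM killer for the group (only 0 and 1 are accepted, never on the root group, and not underneath a hierarchical parent, as the write handler below checks). A sketch with a placeholder group path:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <stdint.h>
#include <sys/eventfd.h>

#define CGDIR "/sys/fs/cgroup/memory/demo"      /* assumed group */

int main(void)
{
    char cmd[32];
    uint64_t ticks;
    int efd, ofd, cfd, wfd;

    /* Optional: freeze tasks at OOM instead of killing them. */
    wfd = open(CGDIR "/memory.oom_control", O_WRONLY);
    if (wfd >= 0) {
        if (write(wfd, "1", 1) < 0)
            perror("oom_kill_disable");
        close(wfd);
    }

    efd = eventfd(0, 0);
    ofd = open(CGDIR "/memory.oom_control", O_RDONLY);
    cfd = open(CGDIR "/cgroup.event_control", O_WRONLY);
    if (efd < 0 || ofd < 0 || cfd < 0) {
        perror("setup");
        return 1;
    }

    /* Register for notification: "<eventfd> <fd of memory.oom_control>". */
    snprintf(cmd, sizeof(cmd), "%d %d", efd, ofd);
    if (write(cfd, cmd, strlen(cmd)) < 0) {
        perror("event_control");
        return 1;
    }

    /* Readable as soon as the group enters OOM; if it already is,
     * the registration path signals the eventfd immediately. */
    if (read(efd, &ticks, sizeof(ticks)) == sizeof(ticks))
        printf("group is under OOM\n");
    return 0;
}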
*/ 3661 if (atomic_read(&memcg->oom_lock)) 3662 eventfd_signal(eventfd, 1); 3663 mutex_unlock(&memcg_oom_mutex); 3664 3665 return 0; 3666} 3667 3668static int mem_cgroup_oom_unregister_event(struct cgroup *cgrp, 3669 struct cftype *cft, struct eventfd_ctx *eventfd) 3670{ 3671 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 3672 struct mem_cgroup_eventfd_list *ev, *tmp; 3673 int type = MEMFILE_TYPE(cft->private); 3674 3675 BUG_ON(type != _OOM_TYPE); 3676 3677 mutex_lock(&memcg_oom_mutex); 3678 3679 list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) { 3680 if (ev->eventfd == eventfd) { 3681 list_del(&ev->list); 3682 kfree(ev); 3683 } 3684 } 3685 3686 mutex_unlock(&memcg_oom_mutex); 3687 3688 return 0; 3689} 3690 3691static int mem_cgroup_oom_control_read(struct cgroup *cgrp, 3692 struct cftype *cft, struct cgroup_map_cb *cb) 3693{ 3694 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 3695 3696 cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable); 3697 3698 if (atomic_read(&mem->oom_lock)) 3699 cb->fill(cb, "under_oom", 1); 3700 else 3701 cb->fill(cb, "under_oom", 0); 3702 return 0; 3703} 3704 3705/* 3706 */ 3707static int mem_cgroup_oom_control_write(struct cgroup *cgrp, 3708 struct cftype *cft, u64 val) 3709{ 3710 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 3711 struct mem_cgroup *parent; 3712 3713 /* cannot set to root cgroup and only 0 and 1 are allowed */ 3714 if (!cgrp->parent || !((val == 0) || (val == 1))) 3715 return -EINVAL; 3716 3717 parent = mem_cgroup_from_cont(cgrp->parent); 3718 3719 cgroup_lock(); 3720 /* oom-kill-disable is a flag for subhierarchy. */ 3721 if ((parent->use_hierarchy) || 3722 (mem->use_hierarchy && !list_empty(&cgrp->children))) { 3723 cgroup_unlock(); 3724 return -EINVAL; 3725 } 3726 mem->oom_kill_disable = val; 3727 cgroup_unlock(); 3728 return 0; 3729} 3730 3731static struct cftype mem_cgroup_files[] = { 3732 { 3733 .name = "usage_in_bytes", 3734 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 3735 .read_u64 = mem_cgroup_read, 3736 .register_event = mem_cgroup_usage_register_event, 3737 .unregister_event = mem_cgroup_usage_unregister_event, 3738 }, 3739 { 3740 .name = "max_usage_in_bytes", 3741 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 3742 .trigger = mem_cgroup_reset, 3743 .read_u64 = mem_cgroup_read, 3744 }, 3745 { 3746 .name = "limit_in_bytes", 3747 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 3748 .write_string = mem_cgroup_write, 3749 .read_u64 = mem_cgroup_read, 3750 }, 3751 { 3752 .name = "soft_limit_in_bytes", 3753 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 3754 .write_string = mem_cgroup_write, 3755 .read_u64 = mem_cgroup_read, 3756 }, 3757 { 3758 .name = "failcnt", 3759 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 3760 .trigger = mem_cgroup_reset, 3761 .read_u64 = mem_cgroup_read, 3762 }, 3763 { 3764 .name = "stat", 3765 .read_map = mem_control_stat_show, 3766 }, 3767 { 3768 .name = "force_empty", 3769 .trigger = mem_cgroup_force_empty_write, 3770 }, 3771 { 3772 .name = "use_hierarchy", 3773 .write_u64 = mem_cgroup_hierarchy_write, 3774 .read_u64 = mem_cgroup_hierarchy_read, 3775 }, 3776 { 3777 .name = "swappiness", 3778 .read_u64 = mem_cgroup_swappiness_read, 3779 .write_u64 = mem_cgroup_swappiness_write, 3780 }, 3781 { 3782 .name = "move_charge_at_immigrate", 3783 .read_u64 = mem_cgroup_move_charge_read, 3784 .write_u64 = mem_cgroup_move_charge_write, 3785 }, 3786 { 3787 .name = "oom_control", 3788 .read_map = mem_cgroup_oom_control_read, 3789 .write_u64 = mem_cgroup_oom_control_write, 3790 
.register_event = mem_cgroup_oom_register_event, 3791 .unregister_event = mem_cgroup_oom_unregister_event, 3792 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 3793 }, 3794}; 3795 3796#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 3797static struct cftype memsw_cgroup_files[] = { 3798 { 3799 .name = "memsw.usage_in_bytes", 3800 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 3801 .read_u64 = mem_cgroup_read, 3802 .register_event = mem_cgroup_usage_register_event, 3803 .unregister_event = mem_cgroup_usage_unregister_event, 3804 }, 3805 { 3806 .name = "memsw.max_usage_in_bytes", 3807 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 3808 .trigger = mem_cgroup_reset, 3809 .read_u64 = mem_cgroup_read, 3810 }, 3811 { 3812 .name = "memsw.limit_in_bytes", 3813 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 3814 .write_string = mem_cgroup_write, 3815 .read_u64 = mem_cgroup_read, 3816 }, 3817 { 3818 .name = "memsw.failcnt", 3819 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 3820 .trigger = mem_cgroup_reset, 3821 .read_u64 = mem_cgroup_read, 3822 }, 3823}; 3824 3825static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) 3826{ 3827 if (!do_swap_account) 3828 return 0; 3829 return cgroup_add_files(cont, ss, memsw_cgroup_files, 3830 ARRAY_SIZE(memsw_cgroup_files)); 3831}; 3832#else 3833static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) 3834{ 3835 return 0; 3836} 3837#endif 3838 3839static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) 3840{ 3841 struct mem_cgroup_per_node *pn; 3842 struct mem_cgroup_per_zone *mz; 3843 enum lru_list l; 3844 int zone, tmp = node; 3845 /* 3846 * This routine is called against possible nodes. 3847 * But it's BUG to call kmalloc() against offline node. 3848 * 3849 * TODO: this routine can waste much memory for nodes which will 3850 * never be onlined. It's better to use memory hotplug callback 3851 * function. 3852 */ 3853 if (!node_state(node, N_NORMAL_MEMORY)) 3854 tmp = -1; 3855 pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp); 3856 if (!pn) 3857 return 1; 3858 3859 mem->info.nodeinfo[node] = pn; 3860 memset(pn, 0, sizeof(*pn)); 3861 3862 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 3863 mz = &pn->zoneinfo[zone]; 3864 for_each_lru(l) 3865 INIT_LIST_HEAD(&mz->lists[l]); 3866 mz->usage_in_excess = 0; 3867 mz->on_tree = false; 3868 mz->mem = mem; 3869 } 3870 return 0; 3871} 3872 3873static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) 3874{ 3875 kfree(mem->info.nodeinfo[node]); 3876} 3877 3878static struct mem_cgroup *mem_cgroup_alloc(void) 3879{ 3880 struct mem_cgroup *mem; 3881 int size = sizeof(struct mem_cgroup); 3882 3883 /* Can be very big if MAX_NUMNODES is very big */ 3884 if (size < PAGE_SIZE) 3885 mem = kmalloc(size, GFP_KERNEL); 3886 else 3887 mem = vmalloc(size); 3888 3889 if (!mem) 3890 return NULL; 3891 3892 memset(mem, 0, size); 3893 mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); 3894 if (!mem->stat) { 3895 if (size < PAGE_SIZE) 3896 kfree(mem); 3897 else 3898 vfree(mem); 3899 mem = NULL; 3900 } 3901 return mem; 3902} 3903 3904/* 3905 * At destroying mem_cgroup, references from swap_cgroup can remain. 3906 * (scanning all at force_empty is too costly...) 3907 * 3908 * Instead of clearing all references at force_empty, we remember 3909 * the number of reference from swap_cgroup and free mem_cgroup when 3910 * it goes down to 0. 3911 * 3912 * Removal of cgroup itself succeeds regardless of refs from swap. 
 */

static void __mem_cgroup_free(struct mem_cgroup *mem)
{
        int node;

        mem_cgroup_remove_from_trees(mem);
        free_css_id(&mem_cgroup_subsys, &mem->css);

        for_each_node_state(node, N_POSSIBLE)
                free_mem_cgroup_per_zone_info(mem, node);

        free_percpu(mem->stat);
        if (sizeof(struct mem_cgroup) < PAGE_SIZE)
                kfree(mem);
        else
                vfree(mem);
}

static void mem_cgroup_get(struct mem_cgroup *mem)
{
        atomic_inc(&mem->refcnt);
}

static void __mem_cgroup_put(struct mem_cgroup *mem, int count)
{
        if (atomic_sub_and_test(count, &mem->refcnt)) {
                struct mem_cgroup *parent = parent_mem_cgroup(mem);
                __mem_cgroup_free(mem);
                if (parent)
                        mem_cgroup_put(parent);
        }
}

static void mem_cgroup_put(struct mem_cgroup *mem)
{
        __mem_cgroup_put(mem, 1);
}

/*
 * Returns the parent mem_cgroup in the memcg hierarchy when hierarchy is
 * enabled; NULL otherwise.
 */
static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem)
{
        if (!mem->res.parent)
                return NULL;
        return mem_cgroup_from_res_counter(mem->res.parent, res);
}

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
static void __init enable_swap_cgroup(void)
{
        if (!mem_cgroup_disabled() && really_do_swap_account)
                do_swap_account = 1;
}
#else
static void __init enable_swap_cgroup(void)
{
}
#endif

/*
 * Allocate the per-node, per-zone RB-tree roots used to track cgroups whose
 * usage exceeds their soft limit.
 */
static int mem_cgroup_soft_limit_tree_init(void)
{
        struct mem_cgroup_tree_per_node *rtpn;
        struct mem_cgroup_tree_per_zone *rtpz;
        int tmp, node, zone;

        for_each_node_state(node, N_POSSIBLE) {
                tmp = node;
                if (!node_state(node, N_NORMAL_MEMORY))
                        tmp = -1;
                rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
                if (!rtpn)
                        return 1;

                soft_limit_tree.rb_tree_per_node[node] = rtpn;

                for (zone = 0; zone < MAX_NR_ZONES; zone++) {
                        rtpz = &rtpn->rb_tree_per_zone[zone];
                        rtpz->rb_root = RB_ROOT;
                        spin_lock_init(&rtpz->lock);
                }
        }
        return 0;
}

static struct cgroup_subsys_state * __ref
mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
{
        struct mem_cgroup *mem, *parent;
        long error = -ENOMEM;
        int node;

        mem = mem_cgroup_alloc();
        if (!mem)
                return ERR_PTR(error);

        for_each_node_state(node, N_POSSIBLE)
                if (alloc_mem_cgroup_per_zone_info(mem, node))
                        goto free_out;

        /* root ? */
        if (cont->parent == NULL) {
                int cpu;
                enable_swap_cgroup();
                parent = NULL;
                root_mem_cgroup = mem;
                if (mem_cgroup_soft_limit_tree_init())
                        goto free_out;
                for_each_possible_cpu(cpu) {
                        struct memcg_stock_pcp *stock =
                                        &per_cpu(memcg_stock, cpu);
                        INIT_WORK(&stock->work, drain_local_stock);
                }
                hotcpu_notifier(memcg_stock_cpu_callback, 0);
        } else {
                parent = mem_cgroup_from_cont(cont->parent);
                mem->use_hierarchy = parent->use_hierarchy;
                mem->oom_kill_disable = parent->oom_kill_disable;
        }

        if (parent && parent->use_hierarchy) {
                res_counter_init(&mem->res, &parent->res);
                res_counter_init(&mem->memsw, &parent->memsw);
                /*
                 * We increment the refcnt of the parent to ensure that we
                 * can safely access it on res_counter_charge/uncharge.
                 * This refcnt will be decremented when freeing this
                 * mem_cgroup (see mem_cgroup_put).
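                 * The res_counters above were initialised with the parent's
                 * counters as their parents, so charges propagate up the
                 * hierarchy; the parent must therefore outlive this child.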
                 */
                mem_cgroup_get(parent);
        } else {
                res_counter_init(&mem->res, NULL);
                res_counter_init(&mem->memsw, NULL);
        }
        mem->last_scanned_child = 0;
        spin_lock_init(&mem->reclaim_param_lock);
        INIT_LIST_HEAD(&mem->oom_notify);

        if (parent)
                mem->swappiness = get_swappiness(parent);
        atomic_set(&mem->refcnt, 1);
        mem->move_charge_at_immigrate = 0;
        mutex_init(&mem->thresholds_lock);
        return &mem->css;
free_out:
        __mem_cgroup_free(mem);
        root_mem_cgroup = NULL;
        return ERR_PTR(error);
}

static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
                                        struct cgroup *cont)
{
        struct mem_cgroup *mem = mem_cgroup_from_cont(cont);

        return mem_cgroup_force_empty(mem, false);
}

static void mem_cgroup_destroy(struct cgroup_subsys *ss,
                                struct cgroup *cont)
{
        struct mem_cgroup *mem = mem_cgroup_from_cont(cont);

        mem_cgroup_put(mem);
}

static int mem_cgroup_populate(struct cgroup_subsys *ss,
                                struct cgroup *cont)
{
        int ret;

        ret = cgroup_add_files(cont, ss, mem_cgroup_files,
                                ARRAY_SIZE(mem_cgroup_files));

        if (!ret)
                ret = register_memsw_files(cont, ss);
        return ret;
}

#ifdef CONFIG_MMU
/* Handlers for move charge at task migration. */
#define PRECHARGE_COUNT_AT_ONCE 256
static int mem_cgroup_do_precharge(unsigned long count)
{
        int ret = 0;
        int batch_count = PRECHARGE_COUNT_AT_ONCE;
        struct mem_cgroup *mem = mc.to;

        if (mem_cgroup_is_root(mem)) {
                mc.precharge += count;
                /* we don't need css_get for root */
                return ret;
        }
        /* try to charge at once */
        if (count > 1) {
                struct res_counter *dummy;
                /*
                 * "mem" cannot be under rmdir() because we've already checked
                 * by cgroup_lock_live_cgroup() that it is not removed and we
                 * are still under the same cgroup_mutex. So we can postpone
                 * css_get().
                 */
                if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy))
                        goto one_by_one;
                if (do_swap_account && res_counter_charge(&mem->memsw,
                                                PAGE_SIZE * count, &dummy)) {
                        res_counter_uncharge(&mem->res, PAGE_SIZE * count);
                        goto one_by_one;
                }
                mc.precharge += count;
                VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
                WARN_ON_ONCE(count > INT_MAX);
                __css_get(&mem->css, (int)count);
                return ret;
        }
one_by_one:
        /* fall back to one by one charge */
        while (count--) {
                if (signal_pending(current)) {
                        ret = -EINTR;
                        break;
                }
                if (!batch_count--) {
                        batch_count = PRECHARGE_COUNT_AT_ONCE;
                        cond_resched();
                }
                ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);
                if (ret || !mem)
                        /* mem_cgroup_clear_mc() will do uncharge later */
                        return -ENOMEM;
                mc.precharge++;
        }
        return ret;
}

/**
 * is_target_pte_for_mc - check whether a pte is a valid target for move charge
 * @vma: the vma to which the pte to be checked belongs
 * @addr: the address corresponding to the pte to be checked
 * @ptent: the pte to be checked
 * @target: the pointer where the target page or swap entry is stored
 *          (can be NULL)
 *
 * Returns
 *   0(MC_TARGET_NONE): if the pte is not a target for move charge.
 *   1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
 *     move charge.
 *     If @target is not NULL, the page is stored in target->page
 *     with an extra refcount taken (callers should handle it).
 *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
 *     target for charge migration. If @target is not NULL, the entry is
 *     stored in target->ent.
 *
 * Called with pte lock held.
 */
union mc_target {
        struct page     *page;
        swp_entry_t     ent;
};

enum mc_target_type {
        MC_TARGET_NONE, /* not used */
        MC_TARGET_PAGE,
        MC_TARGET_SWAP,
};

static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
                                        unsigned long addr, pte_t ptent)
{
        struct page *page = vm_normal_page(vma, addr, ptent);

        if (!page || !page_mapped(page))
                return NULL;
        if (PageAnon(page)) {
                /* we don't move shared anon */
                if (!move_anon() || page_mapcount(page) > 2)
                        return NULL;
        } else if (!move_file())
                /* we ignore mapcount for file pages */
                return NULL;
        if (!get_page_unless_zero(page))
                return NULL;

        return page;
}

static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
                        unsigned long addr, pte_t ptent, swp_entry_t *entry)
{
        int usage_count;
        struct page *page = NULL;
        swp_entry_t ent = pte_to_swp_entry(ptent);

        if (!move_anon() || non_swap_entry(ent))
                return NULL;
        usage_count = mem_cgroup_count_swap_user(ent, &page);
        if (usage_count > 1) {  /* we don't move shared anon */
                if (page)
                        put_page(page);
                return NULL;
        }
        if (do_swap_account)
                entry->val = ent.val;

        return page;
}

static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
                        unsigned long addr, pte_t ptent, swp_entry_t *entry)
{
        struct page *page = NULL;
        struct inode *inode;
        struct address_space *mapping;
        pgoff_t pgoff;

        if (!vma->vm_file) /* anonymous vma */
                return NULL;
        if (!move_file())
                return NULL;

        inode = vma->vm_file->f_path.dentry->d_inode;
        mapping = vma->vm_file->f_mapping;
        if (pte_none(ptent))
                pgoff = linear_page_index(vma, addr);
        else /* pte_file(ptent) is true */
                pgoff = pte_to_pgoff(ptent);

        /* page is moved even if it's not RSS of this task (page-faulted). */
        if (!mapping_cap_swap_backed(mapping)) { /* normal file */
                page = find_get_page(mapping, pgoff);
        } else { /* shmem/tmpfs file. we should take account of swap too. */
                swp_entry_t ent;
                mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent);
                if (do_swap_account)
                        entry->val = ent.val;
        }

        return page;
}

static int is_target_pte_for_mc(struct vm_area_struct *vma,
                unsigned long addr, pte_t ptent, union mc_target *target)
{
        struct page *page = NULL;
        struct page_cgroup *pc;
        int ret = 0;
        swp_entry_t ent = { .val = 0 };

        if (pte_present(ptent))
                page = mc_handle_present_pte(vma, addr, ptent);
        else if (is_swap_pte(ptent))
                page = mc_handle_swap_pte(vma, addr, ptent, &ent);
        else if (pte_none(ptent) || pte_file(ptent))
                page = mc_handle_file_pte(vma, addr, ptent, &ent);

        if (!page && !ent.val)
                return 0;
        if (page) {
                pc = lookup_page_cgroup(page);
                /*
                 * Do only a loose check without the page_cgroup lock;
                 * mem_cgroup_move_account() checks whether the pc is valid
                 * under the lock.
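                 * A racy result here is harmless: if the recheck under the
                 * lock fails, the move of this page is simply skipped.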
                 */
                if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
                        ret = MC_TARGET_PAGE;
                        if (target)
                                target->page = page;
                }
                if (!ret || !target)
                        put_page(page);
        }
        /* There is a swap entry and a page doesn't exist or isn't charged */
        if (ent.val && !ret &&
                        css_id(&mc.from->css) == lookup_swap_cgroup(ent)) {
                ret = MC_TARGET_SWAP;
                if (target)
                        target->ent = ent;
        }
        return ret;
}

static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
                                        unsigned long addr, unsigned long end,
                                        struct mm_walk *walk)
{
        struct vm_area_struct *vma = walk->private;
        pte_t *pte;
        spinlock_t *ptl;

        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
        for (; addr != end; pte++, addr += PAGE_SIZE)
                if (is_target_pte_for_mc(vma, addr, *pte, NULL))
                        mc.precharge++; /* increment precharge temporarily */
        pte_unmap_unlock(pte - 1, ptl);
        cond_resched();

        return 0;
}

static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
{
        unsigned long precharge;
        struct vm_area_struct *vma;

        down_read(&mm->mmap_sem);
        for (vma = mm->mmap; vma; vma = vma->vm_next) {
                struct mm_walk mem_cgroup_count_precharge_walk = {
                        .pmd_entry = mem_cgroup_count_precharge_pte_range,
                        .mm = mm,
                        .private = vma,
                };
                if (is_vm_hugetlb_page(vma))
                        continue;
                walk_page_range(vma->vm_start, vma->vm_end,
                                        &mem_cgroup_count_precharge_walk);
        }
        up_read(&mm->mmap_sem);

        precharge = mc.precharge;
        mc.precharge = 0;

        return precharge;
}

static int mem_cgroup_precharge_mc(struct mm_struct *mm)
{
        return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm));
}

static void mem_cgroup_clear_mc(void)
{
        /* we must uncharge all the leftover precharges from mc.to */
        if (mc.precharge) {
                __mem_cgroup_cancel_charge(mc.to, mc.precharge);
                mc.precharge = 0;
                memcg_oom_recover(mc.to);
        }
        /*
         * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
         * we must uncharge here.
         */
        if (mc.moved_charge) {
                __mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
                mc.moved_charge = 0;
                memcg_oom_recover(mc.from);
        }
        /* we must fixup refcnts and charges */
        if (mc.moved_swap) {
                WARN_ON_ONCE(mc.moved_swap > INT_MAX);
                /* uncharge swap account from the old cgroup */
                if (!mem_cgroup_is_root(mc.from))
                        res_counter_uncharge(&mc.from->memsw,
                                                PAGE_SIZE * mc.moved_swap);
                __mem_cgroup_put(mc.from, mc.moved_swap);

                if (!mem_cgroup_is_root(mc.to)) {
                        /*
                         * we charged both to->res and to->memsw, so we should
                         * uncharge to->res.
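                         * (The precharge took both res and memsw, but a
                         * swapped-out page consumes only memsw.)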
                         */
                        res_counter_uncharge(&mc.to->res,
                                                PAGE_SIZE * mc.moved_swap);
                        VM_BUG_ON(test_bit(CSS_ROOT, &mc.to->css.flags));
                        __css_put(&mc.to->css, mc.moved_swap);
                }
                /* we've already done mem_cgroup_get(mc.to) */

                mc.moved_swap = 0;
        }
        mc.from = NULL;
        mc.to = NULL;
        mc.moving_task = NULL;
        wake_up_all(&mc.waitq);
}

static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
                                struct cgroup *cgroup,
                                struct task_struct *p,
                                bool threadgroup)
{
        int ret = 0;
        struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup);

        if (mem->move_charge_at_immigrate) {
                struct mm_struct *mm;
                struct mem_cgroup *from = mem_cgroup_from_task(p);

                VM_BUG_ON(from == mem);

                mm = get_task_mm(p);
                if (!mm)
                        return 0;
                /* We move charges only when we move an owner of the mm */
                if (mm->owner == p) {
                        VM_BUG_ON(mc.from);
                        VM_BUG_ON(mc.to);
                        VM_BUG_ON(mc.precharge);
                        VM_BUG_ON(mc.moved_charge);
                        VM_BUG_ON(mc.moved_swap);
                        VM_BUG_ON(mc.moving_task);
                        mc.from = from;
                        mc.to = mem;
                        mc.precharge = 0;
                        mc.moved_charge = 0;
                        mc.moved_swap = 0;
                        mc.moving_task = current;

                        ret = mem_cgroup_precharge_mc(mm);
                        if (ret)
                                mem_cgroup_clear_mc();
                }
                mmput(mm);
        }
        return ret;
}

static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
                                struct cgroup *cgroup,
                                struct task_struct *p,
                                bool threadgroup)
{
        mem_cgroup_clear_mc();
}

static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
                                unsigned long addr, unsigned long end,
                                struct mm_walk *walk)
{
        int ret = 0;
        struct vm_area_struct *vma = walk->private;
        pte_t *pte;
        spinlock_t *ptl;

retry:
        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
        for (; addr != end; addr += PAGE_SIZE) {
                pte_t ptent = *(pte++);
                union mc_target target;
                int type;
                struct page *page;
                struct page_cgroup *pc;
                swp_entry_t ent;

                if (!mc.precharge)
                        break;

                type = is_target_pte_for_mc(vma, addr, ptent, &target);
                switch (type) {
                case MC_TARGET_PAGE:
                        page = target.page;
                        if (isolate_lru_page(page))
                                goto put;
                        pc = lookup_page_cgroup(page);
                        if (!mem_cgroup_move_account(pc,
                                                mc.from, mc.to, false)) {
                                mc.precharge--;
                                /* we uncharge from mc.from later. */
                                mc.moved_charge++;
                        }
                        putback_lru_page(page);
put:                    /* is_target_pte_for_mc() gets the page */
                        put_page(page);
                        break;
                case MC_TARGET_SWAP:
                        ent = target.ent;
                        if (!mem_cgroup_move_swap_account(ent,
                                                mc.from, mc.to, false)) {
                                mc.precharge--;
                                /* we fixup refcnts and charges later. */
                                mc.moved_swap++;
                        }
                        break;
                default:
                        break;
                }
        }
        pte_unmap_unlock(pte - 1, ptl);
        cond_resched();

        if (addr != end) {
                /*
                 * We have consumed all precharges we got in can_attach().
                 * We try charge one by one, but don't do any additional
                 * charges to mc.to if we have failed in charge once in
                 * attach() phase.
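                 * On success we jump back to the retry label and continue
                 * the walk from the current address with the new precharge.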
                 */
                ret = mem_cgroup_do_precharge(1);
                if (!ret)
                        goto retry;
        }

        return ret;
}

static void mem_cgroup_move_charge(struct mm_struct *mm)
{
        struct vm_area_struct *vma;

        lru_add_drain_all();
        down_read(&mm->mmap_sem);
        for (vma = mm->mmap; vma; vma = vma->vm_next) {
                int ret;
                struct mm_walk mem_cgroup_move_charge_walk = {
                        .pmd_entry = mem_cgroup_move_charge_pte_range,
                        .mm = mm,
                        .private = vma,
                };
                if (is_vm_hugetlb_page(vma))
                        continue;
                ret = walk_page_range(vma->vm_start, vma->vm_end,
                                                &mem_cgroup_move_charge_walk);
                if (ret)
                        /*
                         * This means we have consumed all precharges and
                         * failed to do additional charges. Just abandon here.
                         */
                        break;
        }
        up_read(&mm->mmap_sem);
}

static void mem_cgroup_move_task(struct cgroup_subsys *ss,
                                struct cgroup *cont,
                                struct cgroup *old_cont,
                                struct task_struct *p,
                                bool threadgroup)
{
        struct mm_struct *mm;

        if (!mc.to)
                /* no need to move charge */
                return;

        mm = get_task_mm(p);
        if (mm) {
                mem_cgroup_move_charge(mm);
                mmput(mm);
        }
        mem_cgroup_clear_mc();
}
#else   /* !CONFIG_MMU */
static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
                                struct cgroup *cgroup,
                                struct task_struct *p,
                                bool threadgroup)
{
        return 0;
}
static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
                                struct cgroup *cgroup,
                                struct task_struct *p,
                                bool threadgroup)
{
}
static void mem_cgroup_move_task(struct cgroup_subsys *ss,
                                struct cgroup *cont,
                                struct cgroup *old_cont,
                                struct task_struct *p,
                                bool threadgroup)
{
}
#endif

struct cgroup_subsys mem_cgroup_subsys = {
        .name = "memory",
        .subsys_id = mem_cgroup_subsys_id,
        .create = mem_cgroup_create,
        .pre_destroy = mem_cgroup_pre_destroy,
        .destroy = mem_cgroup_destroy,
        .populate = mem_cgroup_populate,
        .can_attach = mem_cgroup_can_attach,
        .cancel_attach = mem_cgroup_cancel_attach,
        .attach = mem_cgroup_move_task,
        .early_init = 0,
        .use_id = 1,
};

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP

static int __init disable_swap_account(char *s)
{
        really_do_swap_account = 0;
        return 1;
}
__setup("noswapaccount", disable_swap_account);
#endif