memcontrol.c revision c62b1a3b31b5e27a6c5c2e91cc5ce05fdb6344d0
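A minimal userspace sketch of how the usage-threshold machinery in this file (struct mem_cgroup_threshold, mem_cgroup_threshold()) is typically consumed: an eventfd is armed against memory.usage_in_bytes by writing a registration string to cgroup.event_control. The /cgroup/memory mount point, the "demo" group name and the 64MB threshold are illustrative assumptions, not values taken from this file.

#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/eventfd.h>

int main(void)
{
	char buf[64];
	uint64_t hits;
	int efd = eventfd(0, 0);
	int ufd = open("/cgroup/memory/demo/memory.usage_in_bytes", O_RDONLY);
	int cfd = open("/cgroup/memory/demo/cgroup.event_control", O_WRONLY);

	if (efd < 0 || ufd < 0 || cfd < 0) {
		perror("setup");
		return 1;
	}
	/* registration format: "<eventfd> <fd of memory.usage_in_bytes> <threshold in bytes>" */
	snprintf(buf, sizeof(buf), "%d %d %llu", efd, ufd, 64ULL << 20);
	if (write(cfd, buf, strlen(buf)) < 0) {
		perror("cgroup.event_control");
		return 1;
	}
	/* the eventfd is signalled each time usage crosses 64MB, in either direction */
	if (read(efd, &hits, sizeof(hits)) == sizeof(hits))
		printf("threshold crossed %llu time(s)\n",
		       (unsigned long long)hits);
	return 0;
}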
1/* memcontrol.c - Memory Controller 2 * 3 * Copyright IBM Corporation, 2007 4 * Author Balbir Singh <balbir@linux.vnet.ibm.com> 5 * 6 * Copyright 2007 OpenVZ SWsoft Inc 7 * Author: Pavel Emelianov <xemul@openvz.org> 8 * 9 * Memory thresholds 10 * Copyright (C) 2009 Nokia Corporation 11 * Author: Kirill A. Shutemov 12 * 13 * This program is free software; you can redistribute it and/or modify 14 * it under the terms of the GNU General Public License as published by 15 * the Free Software Foundation; either version 2 of the License, or 16 * (at your option) any later version. 17 * 18 * This program is distributed in the hope that it will be useful, 19 * but WITHOUT ANY WARRANTY; without even the implied warranty of 20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 21 * GNU General Public License for more details. 22 */ 23 24#include <linux/res_counter.h> 25#include <linux/memcontrol.h> 26#include <linux/cgroup.h> 27#include <linux/mm.h> 28#include <linux/hugetlb.h> 29#include <linux/pagemap.h> 30#include <linux/smp.h> 31#include <linux/page-flags.h> 32#include <linux/backing-dev.h> 33#include <linux/bit_spinlock.h> 34#include <linux/rcupdate.h> 35#include <linux/limits.h> 36#include <linux/mutex.h> 37#include <linux/rbtree.h> 38#include <linux/slab.h> 39#include <linux/swap.h> 40#include <linux/swapops.h> 41#include <linux/spinlock.h> 42#include <linux/eventfd.h> 43#include <linux/sort.h> 44#include <linux/fs.h> 45#include <linux/seq_file.h> 46#include <linux/vmalloc.h> 47#include <linux/mm_inline.h> 48#include <linux/page_cgroup.h> 49#include <linux/cpu.h> 50#include "internal.h" 51 52#include <asm/uaccess.h> 53 54struct cgroup_subsys mem_cgroup_subsys __read_mostly; 55#define MEM_CGROUP_RECLAIM_RETRIES 5 56struct mem_cgroup *root_mem_cgroup __read_mostly; 57 58#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 59/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ 60int do_swap_account __read_mostly; 61static int really_do_swap_account __initdata = 1; /* for remember boot option*/ 62#else 63#define do_swap_account (0) 64#endif 65 66#define SOFTLIMIT_EVENTS_THRESH (1000) 67#define THRESHOLDS_EVENTS_THRESH (100) 68 69/* 70 * Statistics for memory cgroup. 71 */ 72enum mem_cgroup_stat_index { 73 /* 74 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. 75 */ 76 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ 77 MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ 78 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ 79 MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ 80 MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ 81 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ 82 MEM_CGROUP_STAT_SOFTLIMIT, /* decrements on each page in/out. 83 used by soft limit implementation */ 84 MEM_CGROUP_STAT_THRESHOLDS, /* decrements on each page in/out. 85 used by threshold implementation */ 86 87 MEM_CGROUP_STAT_NSTATS, 88}; 89 90struct mem_cgroup_stat_cpu { 91 s64 count[MEM_CGROUP_STAT_NSTATS]; 92}; 93 94/* 95 * per-zone information in memory controller. 
96 */ 97struct mem_cgroup_per_zone { 98 /* 99 * spin_lock to protect the per cgroup LRU 100 */ 101 struct list_head lists[NR_LRU_LISTS]; 102 unsigned long count[NR_LRU_LISTS]; 103 104 struct zone_reclaim_stat reclaim_stat; 105 struct rb_node tree_node; /* RB tree node */ 106 unsigned long long usage_in_excess;/* Set to the value by which */ 107 /* the soft limit is exceeded*/ 108 bool on_tree; 109 struct mem_cgroup *mem; /* Back pointer, we cannot */ 110 /* use container_of */ 111}; 112/* Macro for accessing counter */ 113#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) 114 115struct mem_cgroup_per_node { 116 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; 117}; 118 119struct mem_cgroup_lru_info { 120 struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES]; 121}; 122 123/* 124 * Cgroups above their limits are maintained in a RB-Tree, independent of 125 * their hierarchy representation 126 */ 127 128struct mem_cgroup_tree_per_zone { 129 struct rb_root rb_root; 130 spinlock_t lock; 131}; 132 133struct mem_cgroup_tree_per_node { 134 struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; 135}; 136 137struct mem_cgroup_tree { 138 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; 139}; 140 141static struct mem_cgroup_tree soft_limit_tree __read_mostly; 142 143struct mem_cgroup_threshold { 144 struct eventfd_ctx *eventfd; 145 u64 threshold; 146}; 147 148struct mem_cgroup_threshold_ary { 149 /* An array index points to threshold just below usage. */ 150 atomic_t current_threshold; 151 /* Size of entries[] */ 152 unsigned int size; 153 /* Array of thresholds */ 154 struct mem_cgroup_threshold entries[0]; 155}; 156 157static bool mem_cgroup_threshold_check(struct mem_cgroup *mem); 158static void mem_cgroup_threshold(struct mem_cgroup *mem); 159 160/* 161 * The memory controller data structure. The memory controller controls both 162 * page cache and RSS per cgroup. We would eventually like to provide 163 * statistics based on the statistics developed by Rik Van Riel for clock-pro, 164 * to help the administrator determine what knobs to tune. 165 * 166 * TODO: Add a water mark for the memory controller. Reclaim will begin when 167 * we hit the water mark. May be even add a low water mark, such that 168 * no reclaim occurs from a cgroup at it's low water mark, this is 169 * a feature that will be implemented much later in the future. 170 */ 171struct mem_cgroup { 172 struct cgroup_subsys_state css; 173 /* 174 * the counter to account for memory usage 175 */ 176 struct res_counter res; 177 /* 178 * the counter to account for mem+swap usage. 179 */ 180 struct res_counter memsw; 181 /* 182 * Per cgroup active and inactive list, similar to the 183 * per zone LRU lists. 184 */ 185 struct mem_cgroup_lru_info info; 186 187 /* 188 protect against reclaim related member. 189 */ 190 spinlock_t reclaim_param_lock; 191 192 int prev_priority; /* for recording reclaim priority */ 193 194 /* 195 * While reclaiming in a hierarchy, we cache the last child we 196 * reclaimed from. 197 */ 198 int last_scanned_child; 199 /* 200 * Should the accounting and control be hierarchical, per subtree? 201 */ 202 bool use_hierarchy; 203 unsigned long last_oom_jiffies; 204 atomic_t refcnt; 205 206 unsigned int swappiness; 207 208 /* set when res.limit == memsw.limit */ 209 bool memsw_is_minimum; 210 211 /* protect arrays of thresholds */ 212 struct mutex thresholds_lock; 213 214 /* thresholds for memory usage. 
RCU-protected */ 215 struct mem_cgroup_threshold_ary *thresholds; 216 217 /* thresholds for mem+swap usage. RCU-protected */ 218 struct mem_cgroup_threshold_ary *memsw_thresholds; 219 220 /* 221 * Should we move charges of a task when a task is moved into this 222 * mem_cgroup ? And what type of charges should we move ? 223 */ 224 unsigned long move_charge_at_immigrate; 225 226 /* 227 * percpu counter. 228 */ 229 struct mem_cgroup_stat_cpu *stat; 230}; 231 232/* Stuffs for move charges at task migration. */ 233/* 234 * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a 235 * left-shifted bitmap of these types. 236 */ 237enum move_type { 238 MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ 239 NR_MOVE_TYPE, 240}; 241 242/* "mc" and its members are protected by cgroup_mutex */ 243static struct move_charge_struct { 244 struct mem_cgroup *from; 245 struct mem_cgroup *to; 246 unsigned long precharge; 247 unsigned long moved_charge; 248 unsigned long moved_swap; 249 struct task_struct *moving_task; /* a task moving charges */ 250 wait_queue_head_t waitq; /* a waitq for other context */ 251} mc = { 252 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), 253}; 254 255/* 256 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft 257 * limit reclaim to prevent infinite loops, if they ever occur. 258 */ 259#define MEM_CGROUP_MAX_RECLAIM_LOOPS (100) 260#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2) 261 262enum charge_type { 263 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 264 MEM_CGROUP_CHARGE_TYPE_MAPPED, 265 MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */ 266 MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ 267 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ 268 MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ 269 NR_CHARGE_TYPE, 270}; 271 272/* only for here (for easy reading.) 
*/ 273#define PCGF_CACHE (1UL << PCG_CACHE) 274#define PCGF_USED (1UL << PCG_USED) 275#define PCGF_LOCK (1UL << PCG_LOCK) 276/* Not used, but added here for completeness */ 277#define PCGF_ACCT (1UL << PCG_ACCT) 278 279/* for encoding cft->private value on file */ 280#define _MEM (0) 281#define _MEMSWAP (1) 282#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) 283#define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) 284#define MEMFILE_ATTR(val) ((val) & 0xffff) 285 286/* 287 * Reclaim flags for mem_cgroup_hierarchical_reclaim 288 */ 289#define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0 290#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT) 291#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 292#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) 293#define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2 294#define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT) 295 296static void mem_cgroup_get(struct mem_cgroup *mem); 297static void mem_cgroup_put(struct mem_cgroup *mem); 298static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); 299static void drain_all_stock_async(void); 300 301static struct mem_cgroup_per_zone * 302mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) 303{ 304 return &mem->info.nodeinfo[nid]->zoneinfo[zid]; 305} 306 307struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem) 308{ 309 return &mem->css; 310} 311 312static struct mem_cgroup_per_zone * 313page_cgroup_zoneinfo(struct page_cgroup *pc) 314{ 315 struct mem_cgroup *mem = pc->mem_cgroup; 316 int nid = page_cgroup_nid(pc); 317 int zid = page_cgroup_zid(pc); 318 319 if (!mem) 320 return NULL; 321 322 return mem_cgroup_zoneinfo(mem, nid, zid); 323} 324 325static struct mem_cgroup_tree_per_zone * 326soft_limit_tree_node_zone(int nid, int zid) 327{ 328 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; 329} 330 331static struct mem_cgroup_tree_per_zone * 332soft_limit_tree_from_page(struct page *page) 333{ 334 int nid = page_to_nid(page); 335 int zid = page_zonenum(page); 336 337 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; 338} 339 340static void 341__mem_cgroup_insert_exceeded(struct mem_cgroup *mem, 342 struct mem_cgroup_per_zone *mz, 343 struct mem_cgroup_tree_per_zone *mctz, 344 unsigned long long new_usage_in_excess) 345{ 346 struct rb_node **p = &mctz->rb_root.rb_node; 347 struct rb_node *parent = NULL; 348 struct mem_cgroup_per_zone *mz_node; 349 350 if (mz->on_tree) 351 return; 352 353 mz->usage_in_excess = new_usage_in_excess; 354 if (!mz->usage_in_excess) 355 return; 356 while (*p) { 357 parent = *p; 358 mz_node = rb_entry(parent, struct mem_cgroup_per_zone, 359 tree_node); 360 if (mz->usage_in_excess < mz_node->usage_in_excess) 361 p = &(*p)->rb_left; 362 /* 363 * We can't avoid mem cgroups that are over their soft 364 * limit by the same amount 365 */ 366 else if (mz->usage_in_excess >= mz_node->usage_in_excess) 367 p = &(*p)->rb_right; 368 } 369 rb_link_node(&mz->tree_node, parent, p); 370 rb_insert_color(&mz->tree_node, &mctz->rb_root); 371 mz->on_tree = true; 372} 373 374static void 375__mem_cgroup_remove_exceeded(struct mem_cgroup *mem, 376 struct mem_cgroup_per_zone *mz, 377 struct mem_cgroup_tree_per_zone *mctz) 378{ 379 if (!mz->on_tree) 380 return; 381 rb_erase(&mz->tree_node, &mctz->rb_root); 382 mz->on_tree = false; 383} 384 385static void 386mem_cgroup_remove_exceeded(struct mem_cgroup *mem, 387 struct mem_cgroup_per_zone *mz, 388 struct mem_cgroup_tree_per_zone *mctz) 389{ 390 spin_lock(&mctz->lock); 
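	/* mctz->lock protects this zone's soft-limit RB-tree; the unlocked __ variant above does the actual rb_erase */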
391 __mem_cgroup_remove_exceeded(mem, mz, mctz); 392 spin_unlock(&mctz->lock); 393} 394 395static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem) 396{ 397 bool ret = false; 398 s64 val; 399 400 val = this_cpu_read(mem->stat->count[MEM_CGROUP_STAT_SOFTLIMIT]); 401 if (unlikely(val < 0)) { 402 this_cpu_write(mem->stat->count[MEM_CGROUP_STAT_SOFTLIMIT], 403 SOFTLIMIT_EVENTS_THRESH); 404 ret = true; 405 } 406 return ret; 407} 408 409static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) 410{ 411 unsigned long long excess; 412 struct mem_cgroup_per_zone *mz; 413 struct mem_cgroup_tree_per_zone *mctz; 414 int nid = page_to_nid(page); 415 int zid = page_zonenum(page); 416 mctz = soft_limit_tree_from_page(page); 417 418 /* 419 * Necessary to update all ancestors when hierarchy is used. 420 * because their event counter is not touched. 421 */ 422 for (; mem; mem = parent_mem_cgroup(mem)) { 423 mz = mem_cgroup_zoneinfo(mem, nid, zid); 424 excess = res_counter_soft_limit_excess(&mem->res); 425 /* 426 * We have to update the tree if mz is on RB-tree or 427 * mem is over its softlimit. 428 */ 429 if (excess || mz->on_tree) { 430 spin_lock(&mctz->lock); 431 /* if on-tree, remove it */ 432 if (mz->on_tree) 433 __mem_cgroup_remove_exceeded(mem, mz, mctz); 434 /* 435 * Insert again. mz->usage_in_excess will be updated. 436 * If excess is 0, no tree ops. 437 */ 438 __mem_cgroup_insert_exceeded(mem, mz, mctz, excess); 439 spin_unlock(&mctz->lock); 440 } 441 } 442} 443 444static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem) 445{ 446 int node, zone; 447 struct mem_cgroup_per_zone *mz; 448 struct mem_cgroup_tree_per_zone *mctz; 449 450 for_each_node_state(node, N_POSSIBLE) { 451 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 452 mz = mem_cgroup_zoneinfo(mem, node, zone); 453 mctz = soft_limit_tree_node_zone(node, zone); 454 mem_cgroup_remove_exceeded(mem, mz, mctz); 455 } 456 } 457} 458 459static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem) 460{ 461 return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT; 462} 463 464static struct mem_cgroup_per_zone * 465__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) 466{ 467 struct rb_node *rightmost = NULL; 468 struct mem_cgroup_per_zone *mz; 469 470retry: 471 mz = NULL; 472 rightmost = rb_last(&mctz->rb_root); 473 if (!rightmost) 474 goto done; /* Nothing to reclaim from */ 475 476 mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); 477 /* 478 * Remove the node now but someone else can add it back, 479 * we will to add it back at the end of reclaim to its correct 480 * position in the tree. 
481 */ 482 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); 483 if (!res_counter_soft_limit_excess(&mz->mem->res) || 484 !css_tryget(&mz->mem->css)) 485 goto retry; 486done: 487 return mz; 488} 489 490static struct mem_cgroup_per_zone * 491mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) 492{ 493 struct mem_cgroup_per_zone *mz; 494 495 spin_lock(&mctz->lock); 496 mz = __mem_cgroup_largest_soft_limit_node(mctz); 497 spin_unlock(&mctz->lock); 498 return mz; 499} 500 501static s64 mem_cgroup_read_stat(struct mem_cgroup *mem, 502 enum mem_cgroup_stat_index idx) 503{ 504 int cpu; 505 s64 val = 0; 506 507 for_each_possible_cpu(cpu) 508 val += per_cpu(mem->stat->count[idx], cpu); 509 return val; 510} 511 512static s64 mem_cgroup_local_usage(struct mem_cgroup *mem) 513{ 514 s64 ret; 515 516 ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); 517 ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); 518 return ret; 519} 520 521static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, 522 bool charge) 523{ 524 int val = (charge) ? 1 : -1; 525 this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); 526} 527 528static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, 529 struct page_cgroup *pc, 530 bool charge) 531{ 532 int val = (charge) ? 1 : -1; 533 534 preempt_disable(); 535 536 if (PageCgroupCache(pc)) 537 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], val); 538 else 539 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], val); 540 541 if (charge) 542 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]); 543 else 544 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]); 545 __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_SOFTLIMIT]); 546 __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_THRESHOLDS]); 547 548 preempt_enable(); 549} 550 551static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, 552 enum lru_list idx) 553{ 554 int nid, zid; 555 struct mem_cgroup_per_zone *mz; 556 u64 total = 0; 557 558 for_each_online_node(nid) 559 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 560 mz = mem_cgroup_zoneinfo(mem, nid, zid); 561 total += MEM_CGROUP_ZSTAT(mz, idx); 562 } 563 return total; 564} 565 566static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) 567{ 568 return container_of(cgroup_subsys_state(cont, 569 mem_cgroup_subsys_id), struct mem_cgroup, 570 css); 571} 572 573struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) 574{ 575 /* 576 * mm_update_next_owner() may clear mm->owner to NULL 577 * if it races with swapoff, page migration, etc. 578 * So this can be called with p == NULL. 579 */ 580 if (unlikely(!p)) 581 return NULL; 582 583 return container_of(task_subsys_state(p, mem_cgroup_subsys_id), 584 struct mem_cgroup, css); 585} 586 587static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) 588{ 589 struct mem_cgroup *mem = NULL; 590 591 if (!mm) 592 return NULL; 593 /* 594 * Because we have no locks, mm->owner's may be being moved to other 595 * cgroup. We use css_tryget() here even if this looks 596 * pessimistic (rather than adding locks here). 597 */ 598 rcu_read_lock(); 599 do { 600 mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); 601 if (unlikely(!mem)) 602 break; 603 } while (!css_tryget(&mem->css)); 604 rcu_read_unlock(); 605 return mem; 606} 607 608/* 609 * Call callback function against all cgroup under hierarchy tree. 
610 */ 611static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data, 612 int (*func)(struct mem_cgroup *, void *)) 613{ 614 int found, ret, nextid; 615 struct cgroup_subsys_state *css; 616 struct mem_cgroup *mem; 617 618 if (!root->use_hierarchy) 619 return (*func)(root, data); 620 621 nextid = 1; 622 do { 623 ret = 0; 624 mem = NULL; 625 626 rcu_read_lock(); 627 css = css_get_next(&mem_cgroup_subsys, nextid, &root->css, 628 &found); 629 if (css && css_tryget(css)) 630 mem = container_of(css, struct mem_cgroup, css); 631 rcu_read_unlock(); 632 633 if (mem) { 634 ret = (*func)(mem, data); 635 css_put(&mem->css); 636 } 637 nextid = found + 1; 638 } while (!ret && css); 639 640 return ret; 641} 642 643static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) 644{ 645 return (mem == root_mem_cgroup); 646} 647 648/* 649 * Following LRU functions are allowed to be used without PCG_LOCK. 650 * Operations are called by routine of global LRU independently from memcg. 651 * What we have to take care of here is validness of pc->mem_cgroup. 652 * 653 * Changes to pc->mem_cgroup happens when 654 * 1. charge 655 * 2. moving account 656 * In typical case, "charge" is done before add-to-lru. Exception is SwapCache. 657 * It is added to LRU before charge. 658 * If PCG_USED bit is not set, page_cgroup is not added to this private LRU. 659 * When moving account, the page is not on LRU. It's isolated. 660 */ 661 662void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) 663{ 664 struct page_cgroup *pc; 665 struct mem_cgroup_per_zone *mz; 666 667 if (mem_cgroup_disabled()) 668 return; 669 pc = lookup_page_cgroup(page); 670 /* can happen while we handle swapcache. */ 671 if (!TestClearPageCgroupAcctLRU(pc)) 672 return; 673 VM_BUG_ON(!pc->mem_cgroup); 674 /* 675 * We don't check PCG_USED bit. It's cleared when the "page" is finally 676 * removed from global LRU. 677 */ 678 mz = page_cgroup_zoneinfo(pc); 679 MEM_CGROUP_ZSTAT(mz, lru) -= 1; 680 if (mem_cgroup_is_root(pc->mem_cgroup)) 681 return; 682 VM_BUG_ON(list_empty(&pc->lru)); 683 list_del_init(&pc->lru); 684 return; 685} 686 687void mem_cgroup_del_lru(struct page *page) 688{ 689 mem_cgroup_del_lru_list(page, page_lru(page)); 690} 691 692void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) 693{ 694 struct mem_cgroup_per_zone *mz; 695 struct page_cgroup *pc; 696 697 if (mem_cgroup_disabled()) 698 return; 699 700 pc = lookup_page_cgroup(page); 701 /* 702 * Used bit is set without atomic ops but after smp_wmb(). 703 * For making pc->mem_cgroup visible, insert smp_rmb() here. 704 */ 705 smp_rmb(); 706 /* unused or root page is not rotated. */ 707 if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup)) 708 return; 709 mz = page_cgroup_zoneinfo(pc); 710 list_move(&pc->lru, &mz->lists[lru]); 711} 712 713void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) 714{ 715 struct page_cgroup *pc; 716 struct mem_cgroup_per_zone *mz; 717 718 if (mem_cgroup_disabled()) 719 return; 720 pc = lookup_page_cgroup(page); 721 VM_BUG_ON(PageCgroupAcctLRU(pc)); 722 /* 723 * Used bit is set without atomic ops but after smp_wmb(). 724 * For making pc->mem_cgroup visible, insert smp_rmb() here. 
725 */ 726 smp_rmb(); 727 if (!PageCgroupUsed(pc)) 728 return; 729 730 mz = page_cgroup_zoneinfo(pc); 731 MEM_CGROUP_ZSTAT(mz, lru) += 1; 732 SetPageCgroupAcctLRU(pc); 733 if (mem_cgroup_is_root(pc->mem_cgroup)) 734 return; 735 list_add(&pc->lru, &mz->lists[lru]); 736} 737 738/* 739 * At handling SwapCache, pc->mem_cgroup may be changed while it's linked to 740 * lru because the page may.be reused after it's fully uncharged (because of 741 * SwapCache behavior).To handle that, unlink page_cgroup from LRU when charge 742 * it again. This function is only used to charge SwapCache. It's done under 743 * lock_page and expected that zone->lru_lock is never held. 744 */ 745static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page) 746{ 747 unsigned long flags; 748 struct zone *zone = page_zone(page); 749 struct page_cgroup *pc = lookup_page_cgroup(page); 750 751 spin_lock_irqsave(&zone->lru_lock, flags); 752 /* 753 * Forget old LRU when this page_cgroup is *not* used. This Used bit 754 * is guarded by lock_page() because the page is SwapCache. 755 */ 756 if (!PageCgroupUsed(pc)) 757 mem_cgroup_del_lru_list(page, page_lru(page)); 758 spin_unlock_irqrestore(&zone->lru_lock, flags); 759} 760 761static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page) 762{ 763 unsigned long flags; 764 struct zone *zone = page_zone(page); 765 struct page_cgroup *pc = lookup_page_cgroup(page); 766 767 spin_lock_irqsave(&zone->lru_lock, flags); 768 /* link when the page is linked to LRU but page_cgroup isn't */ 769 if (PageLRU(page) && !PageCgroupAcctLRU(pc)) 770 mem_cgroup_add_lru_list(page, page_lru(page)); 771 spin_unlock_irqrestore(&zone->lru_lock, flags); 772} 773 774 775void mem_cgroup_move_lists(struct page *page, 776 enum lru_list from, enum lru_list to) 777{ 778 if (mem_cgroup_disabled()) 779 return; 780 mem_cgroup_del_lru_list(page, from); 781 mem_cgroup_add_lru_list(page, to); 782} 783 784int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) 785{ 786 int ret; 787 struct mem_cgroup *curr = NULL; 788 789 task_lock(task); 790 rcu_read_lock(); 791 curr = try_get_mem_cgroup_from_mm(task->mm); 792 rcu_read_unlock(); 793 task_unlock(task); 794 if (!curr) 795 return 0; 796 /* 797 * We should check use_hierarchy of "mem" not "curr". Because checking 798 * use_hierarchy of "curr" here make this function true if hierarchy is 799 * enabled in "curr" and "curr" is a child of "mem" in *cgroup* 800 * hierarchy(even if use_hierarchy is disabled in "mem"). 801 */ 802 if (mem->use_hierarchy) 803 ret = css_is_ancestor(&curr->css, &mem->css); 804 else 805 ret = (curr == mem); 806 css_put(&curr->css); 807 return ret; 808} 809 810/* 811 * prev_priority control...this will be used in memory reclaim path. 
812 */ 813int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) 814{ 815 int prev_priority; 816 817 spin_lock(&mem->reclaim_param_lock); 818 prev_priority = mem->prev_priority; 819 spin_unlock(&mem->reclaim_param_lock); 820 821 return prev_priority; 822} 823 824void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority) 825{ 826 spin_lock(&mem->reclaim_param_lock); 827 if (priority < mem->prev_priority) 828 mem->prev_priority = priority; 829 spin_unlock(&mem->reclaim_param_lock); 830} 831 832void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority) 833{ 834 spin_lock(&mem->reclaim_param_lock); 835 mem->prev_priority = priority; 836 spin_unlock(&mem->reclaim_param_lock); 837} 838 839static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages) 840{ 841 unsigned long active; 842 unsigned long inactive; 843 unsigned long gb; 844 unsigned long inactive_ratio; 845 846 inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON); 847 active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON); 848 849 gb = (inactive + active) >> (30 - PAGE_SHIFT); 850 if (gb) 851 inactive_ratio = int_sqrt(10 * gb); 852 else 853 inactive_ratio = 1; 854 855 if (present_pages) { 856 present_pages[0] = inactive; 857 present_pages[1] = active; 858 } 859 860 return inactive_ratio; 861} 862 863int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg) 864{ 865 unsigned long active; 866 unsigned long inactive; 867 unsigned long present_pages[2]; 868 unsigned long inactive_ratio; 869 870 inactive_ratio = calc_inactive_ratio(memcg, present_pages); 871 872 inactive = present_pages[0]; 873 active = present_pages[1]; 874 875 if (inactive * inactive_ratio < active) 876 return 1; 877 878 return 0; 879} 880 881int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg) 882{ 883 unsigned long active; 884 unsigned long inactive; 885 886 inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE); 887 active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE); 888 889 return (active > inactive); 890} 891 892unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, 893 struct zone *zone, 894 enum lru_list lru) 895{ 896 int nid = zone->zone_pgdat->node_id; 897 int zid = zone_idx(zone); 898 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); 899 900 return MEM_CGROUP_ZSTAT(mz, lru); 901} 902 903struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, 904 struct zone *zone) 905{ 906 int nid = zone->zone_pgdat->node_id; 907 int zid = zone_idx(zone); 908 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); 909 910 return &mz->reclaim_stat; 911} 912 913struct zone_reclaim_stat * 914mem_cgroup_get_reclaim_stat_from_page(struct page *page) 915{ 916 struct page_cgroup *pc; 917 struct mem_cgroup_per_zone *mz; 918 919 if (mem_cgroup_disabled()) 920 return NULL; 921 922 pc = lookup_page_cgroup(page); 923 /* 924 * Used bit is set without atomic ops but after smp_wmb(). 925 * For making pc->mem_cgroup visible, insert smp_rmb() here. 
926 */ 927 smp_rmb(); 928 if (!PageCgroupUsed(pc)) 929 return NULL; 930 931 mz = page_cgroup_zoneinfo(pc); 932 if (!mz) 933 return NULL; 934 935 return &mz->reclaim_stat; 936} 937 938unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, 939 struct list_head *dst, 940 unsigned long *scanned, int order, 941 int mode, struct zone *z, 942 struct mem_cgroup *mem_cont, 943 int active, int file) 944{ 945 unsigned long nr_taken = 0; 946 struct page *page; 947 unsigned long scan; 948 LIST_HEAD(pc_list); 949 struct list_head *src; 950 struct page_cgroup *pc, *tmp; 951 int nid = z->zone_pgdat->node_id; 952 int zid = zone_idx(z); 953 struct mem_cgroup_per_zone *mz; 954 int lru = LRU_FILE * file + active; 955 int ret; 956 957 BUG_ON(!mem_cont); 958 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); 959 src = &mz->lists[lru]; 960 961 scan = 0; 962 list_for_each_entry_safe_reverse(pc, tmp, src, lru) { 963 if (scan >= nr_to_scan) 964 break; 965 966 page = pc->page; 967 if (unlikely(!PageCgroupUsed(pc))) 968 continue; 969 if (unlikely(!PageLRU(page))) 970 continue; 971 972 scan++; 973 ret = __isolate_lru_page(page, mode, file); 974 switch (ret) { 975 case 0: 976 list_move(&page->lru, dst); 977 mem_cgroup_del_lru(page); 978 nr_taken++; 979 break; 980 case -EBUSY: 981 /* we don't affect global LRU but rotate in our LRU */ 982 mem_cgroup_rotate_lru_list(page, page_lru(page)); 983 break; 984 default: 985 break; 986 } 987 } 988 989 *scanned = scan; 990 return nr_taken; 991} 992 993#define mem_cgroup_from_res_counter(counter, member) \ 994 container_of(counter, struct mem_cgroup, member) 995 996static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem) 997{ 998 if (do_swap_account) { 999 if (res_counter_check_under_limit(&mem->res) && 1000 res_counter_check_under_limit(&mem->memsw)) 1001 return true; 1002 } else 1003 if (res_counter_check_under_limit(&mem->res)) 1004 return true; 1005 return false; 1006} 1007 1008static unsigned int get_swappiness(struct mem_cgroup *memcg) 1009{ 1010 struct cgroup *cgrp = memcg->css.cgroup; 1011 unsigned int swappiness; 1012 1013 /* root ? */ 1014 if (cgrp->parent == NULL) 1015 return vm_swappiness; 1016 1017 spin_lock(&memcg->reclaim_param_lock); 1018 swappiness = memcg->swappiness; 1019 spin_unlock(&memcg->reclaim_param_lock); 1020 1021 return swappiness; 1022} 1023 1024static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data) 1025{ 1026 int *val = data; 1027 (*val)++; 1028 return 0; 1029} 1030 1031/** 1032 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. 1033 * @memcg: The memory cgroup that went over limit 1034 * @p: Task that is going to be killed 1035 * 1036 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is 1037 * enabled 1038 */ 1039void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) 1040{ 1041 struct cgroup *task_cgrp; 1042 struct cgroup *mem_cgrp; 1043 /* 1044 * Need a buffer in BSS, can't rely on allocations. The code relies 1045 * on the assumption that OOM is serialized for memory controller. 1046 * If this assumption is broken, revisit this code. 
1047 */ 1048 static char memcg_name[PATH_MAX]; 1049 int ret; 1050 1051 if (!memcg || !p) 1052 return; 1053 1054 1055 rcu_read_lock(); 1056 1057 mem_cgrp = memcg->css.cgroup; 1058 task_cgrp = task_cgroup(p, mem_cgroup_subsys_id); 1059 1060 ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX); 1061 if (ret < 0) { 1062 /* 1063 * Unfortunately, we are unable to convert to a useful name 1064 * But we'll still print out the usage information 1065 */ 1066 rcu_read_unlock(); 1067 goto done; 1068 } 1069 rcu_read_unlock(); 1070 1071 printk(KERN_INFO "Task in %s killed", memcg_name); 1072 1073 rcu_read_lock(); 1074 ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX); 1075 if (ret < 0) { 1076 rcu_read_unlock(); 1077 goto done; 1078 } 1079 rcu_read_unlock(); 1080 1081 /* 1082 * Continues from above, so we don't need an KERN_ level 1083 */ 1084 printk(KERN_CONT " as a result of limit of %s\n", memcg_name); 1085done: 1086 1087 printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n", 1088 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, 1089 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, 1090 res_counter_read_u64(&memcg->res, RES_FAILCNT)); 1091 printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, " 1092 "failcnt %llu\n", 1093 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, 1094 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, 1095 res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); 1096} 1097 1098/* 1099 * This function returns the number of memcg under hierarchy tree. Returns 1100 * 1(self count) if no children. 1101 */ 1102static int mem_cgroup_count_children(struct mem_cgroup *mem) 1103{ 1104 int num = 0; 1105 mem_cgroup_walk_tree(mem, &num, mem_cgroup_count_children_cb); 1106 return num; 1107} 1108 1109/* 1110 * Visit the first child (need not be the first child as per the ordering 1111 * of the cgroup list, since we track last_scanned_child) of @mem and use 1112 * that to reclaim free pages from. 1113 */ 1114static struct mem_cgroup * 1115mem_cgroup_select_victim(struct mem_cgroup *root_mem) 1116{ 1117 struct mem_cgroup *ret = NULL; 1118 struct cgroup_subsys_state *css; 1119 int nextid, found; 1120 1121 if (!root_mem->use_hierarchy) { 1122 css_get(&root_mem->css); 1123 ret = root_mem; 1124 } 1125 1126 while (!ret) { 1127 rcu_read_lock(); 1128 nextid = root_mem->last_scanned_child + 1; 1129 css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css, 1130 &found); 1131 if (css && css_tryget(css)) 1132 ret = container_of(css, struct mem_cgroup, css); 1133 1134 rcu_read_unlock(); 1135 /* Updates scanning parameter */ 1136 spin_lock(&root_mem->reclaim_param_lock); 1137 if (!css) { 1138 /* this means start scan from ID:1 */ 1139 root_mem->last_scanned_child = 0; 1140 } else 1141 root_mem->last_scanned_child = found; 1142 spin_unlock(&root_mem->reclaim_param_lock); 1143 } 1144 1145 return ret; 1146} 1147 1148/* 1149 * Scan the hierarchy if needed to reclaim memory. We remember the last child 1150 * we reclaimed from, so that we don't end up penalizing one child extensively 1151 * based on its position in the children list. 1152 * 1153 * root_mem is the original ancestor that we've been reclaim from. 1154 * 1155 * We give up and return to the caller when we visit root_mem twice. 1156 * (other groups can be removed while we're walking....) 1157 * 1158 * If shrink==true, for avoiding to free too much, this returns immedieately. 
1159 */ 1160static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, 1161 struct zone *zone, 1162 gfp_t gfp_mask, 1163 unsigned long reclaim_options) 1164{ 1165 struct mem_cgroup *victim; 1166 int ret, total = 0; 1167 int loop = 0; 1168 bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP; 1169 bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; 1170 bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; 1171 unsigned long excess = mem_cgroup_get_excess(root_mem); 1172 1173 /* If memsw_is_minimum==1, swap-out is of-no-use. */ 1174 if (root_mem->memsw_is_minimum) 1175 noswap = true; 1176 1177 while (1) { 1178 victim = mem_cgroup_select_victim(root_mem); 1179 if (victim == root_mem) { 1180 loop++; 1181 if (loop >= 1) 1182 drain_all_stock_async(); 1183 if (loop >= 2) { 1184 /* 1185 * If we have not been able to reclaim 1186 * anything, it might because there are 1187 * no reclaimable pages under this hierarchy 1188 */ 1189 if (!check_soft || !total) { 1190 css_put(&victim->css); 1191 break; 1192 } 1193 /* 1194 * We want to do more targetted reclaim. 1195 * excess >> 2 is not to excessive so as to 1196 * reclaim too much, nor too less that we keep 1197 * coming back to reclaim from this cgroup 1198 */ 1199 if (total >= (excess >> 2) || 1200 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) { 1201 css_put(&victim->css); 1202 break; 1203 } 1204 } 1205 } 1206 if (!mem_cgroup_local_usage(victim)) { 1207 /* this cgroup's local usage == 0 */ 1208 css_put(&victim->css); 1209 continue; 1210 } 1211 /* we use swappiness of local cgroup */ 1212 if (check_soft) 1213 ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, 1214 noswap, get_swappiness(victim), zone, 1215 zone->zone_pgdat->node_id); 1216 else 1217 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, 1218 noswap, get_swappiness(victim)); 1219 css_put(&victim->css); 1220 /* 1221 * At shrinking usage, we can't check we should stop here or 1222 * reclaim more. It's depends on callers. last_scanned_child 1223 * will work enough for keeping fairness under tree. 1224 */ 1225 if (shrink) 1226 return ret; 1227 total += ret; 1228 if (check_soft) { 1229 if (res_counter_check_under_soft_limit(&root_mem->res)) 1230 return total; 1231 } else if (mem_cgroup_check_under_limit(root_mem)) 1232 return 1 + total; 1233 } 1234 return total; 1235} 1236 1237bool mem_cgroup_oom_called(struct task_struct *task) 1238{ 1239 bool ret = false; 1240 struct mem_cgroup *mem; 1241 struct mm_struct *mm; 1242 1243 rcu_read_lock(); 1244 mm = task->mm; 1245 if (!mm) 1246 mm = &init_mm; 1247 mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); 1248 if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10)) 1249 ret = true; 1250 rcu_read_unlock(); 1251 return ret; 1252} 1253 1254static int record_last_oom_cb(struct mem_cgroup *mem, void *data) 1255{ 1256 mem->last_oom_jiffies = jiffies; 1257 return 0; 1258} 1259 1260static void record_last_oom(struct mem_cgroup *mem) 1261{ 1262 mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb); 1263} 1264 1265/* 1266 * Currently used to update mapped file statistics, but the routine can be 1267 * generalized to update other statistics as well. 
1268 */ 1269void mem_cgroup_update_file_mapped(struct page *page, int val) 1270{ 1271 struct mem_cgroup *mem; 1272 struct page_cgroup *pc; 1273 1274 pc = lookup_page_cgroup(page); 1275 if (unlikely(!pc)) 1276 return; 1277 1278 lock_page_cgroup(pc); 1279 mem = pc->mem_cgroup; 1280 if (!mem) 1281 goto done; 1282 1283 if (!PageCgroupUsed(pc)) 1284 goto done; 1285 1286 /* 1287 * Preemption is already disabled. We can use __this_cpu_xxx 1288 */ 1289 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], val); 1290 1291done: 1292 unlock_page_cgroup(pc); 1293} 1294 1295/* 1296 * size of first charge trial. "32" comes from vmscan.c's magic value. 1297 * TODO: maybe necessary to use big numbers in big irons. 1298 */ 1299#define CHARGE_SIZE (32 * PAGE_SIZE) 1300struct memcg_stock_pcp { 1301 struct mem_cgroup *cached; /* this never be root cgroup */ 1302 int charge; 1303 struct work_struct work; 1304}; 1305static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 1306static atomic_t memcg_drain_count; 1307 1308/* 1309 * Try to consume stocked charge on this cpu. If success, PAGE_SIZE is consumed 1310 * from local stock and true is returned. If the stock is 0 or charges from a 1311 * cgroup which is not current target, returns false. This stock will be 1312 * refilled. 1313 */ 1314static bool consume_stock(struct mem_cgroup *mem) 1315{ 1316 struct memcg_stock_pcp *stock; 1317 bool ret = true; 1318 1319 stock = &get_cpu_var(memcg_stock); 1320 if (mem == stock->cached && stock->charge) 1321 stock->charge -= PAGE_SIZE; 1322 else /* need to call res_counter_charge */ 1323 ret = false; 1324 put_cpu_var(memcg_stock); 1325 return ret; 1326} 1327 1328/* 1329 * Returns stocks cached in percpu to res_counter and reset cached information. 1330 */ 1331static void drain_stock(struct memcg_stock_pcp *stock) 1332{ 1333 struct mem_cgroup *old = stock->cached; 1334 1335 if (stock->charge) { 1336 res_counter_uncharge(&old->res, stock->charge); 1337 if (do_swap_account) 1338 res_counter_uncharge(&old->memsw, stock->charge); 1339 } 1340 stock->cached = NULL; 1341 stock->charge = 0; 1342} 1343 1344/* 1345 * This must be called under preempt disabled or must be called by 1346 * a thread which is pinned to local cpu. 1347 */ 1348static void drain_local_stock(struct work_struct *dummy) 1349{ 1350 struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); 1351 drain_stock(stock); 1352} 1353 1354/* 1355 * Cache charges(val) which is from res_counter, to local per_cpu area. 1356 * This will be consumed by consumt_stock() function, later. 1357 */ 1358static void refill_stock(struct mem_cgroup *mem, int val) 1359{ 1360 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); 1361 1362 if (stock->cached != mem) { /* reset if necessary */ 1363 drain_stock(stock); 1364 stock->cached = mem; 1365 } 1366 stock->charge += val; 1367 put_cpu_var(memcg_stock); 1368} 1369 1370/* 1371 * Tries to drain stocked charges in other cpus. This function is asynchronous 1372 * and just put a work per cpu for draining localy on each cpu. Caller can 1373 * expects some charges will be back to res_counter later but cannot wait for 1374 * it. 1375 */ 1376static void drain_all_stock_async(void) 1377{ 1378 int cpu; 1379 /* This function is for scheduling "drain" in asynchronous way. 1380 * The result of "drain" is not directly handled by callers. Then, 1381 * if someone is calling drain, we don't have to call drain more. 1382 * Anyway, WORK_STRUCT_PENDING check in queue_work_on() will catch if 1383 * there is a race. 
We just do loose check here. 1384 */ 1385 if (atomic_read(&memcg_drain_count)) 1386 return; 1387 /* Notify other cpus that system-wide "drain" is running */ 1388 atomic_inc(&memcg_drain_count); 1389 get_online_cpus(); 1390 for_each_online_cpu(cpu) { 1391 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 1392 schedule_work_on(cpu, &stock->work); 1393 } 1394 put_online_cpus(); 1395 atomic_dec(&memcg_drain_count); 1396 /* We don't wait for flush_work */ 1397} 1398 1399/* This is a synchronous drain interface. */ 1400static void drain_all_stock_sync(void) 1401{ 1402 /* called when force_empty is called */ 1403 atomic_inc(&memcg_drain_count); 1404 schedule_on_each_cpu(drain_local_stock); 1405 atomic_dec(&memcg_drain_count); 1406} 1407 1408static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb, 1409 unsigned long action, 1410 void *hcpu) 1411{ 1412 int cpu = (unsigned long)hcpu; 1413 struct memcg_stock_pcp *stock; 1414 1415 if (action != CPU_DEAD) 1416 return NOTIFY_OK; 1417 stock = &per_cpu(memcg_stock, cpu); 1418 drain_stock(stock); 1419 return NOTIFY_OK; 1420} 1421 1422/* 1423 * Unlike exported interface, "oom" parameter is added. if oom==true, 1424 * oom-killer can be invoked. 1425 */ 1426static int __mem_cgroup_try_charge(struct mm_struct *mm, 1427 gfp_t gfp_mask, struct mem_cgroup **memcg, 1428 bool oom, struct page *page) 1429{ 1430 struct mem_cgroup *mem, *mem_over_limit; 1431 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 1432 struct res_counter *fail_res; 1433 int csize = CHARGE_SIZE; 1434 1435 if (unlikely(test_thread_flag(TIF_MEMDIE))) { 1436 /* Don't account this! */ 1437 *memcg = NULL; 1438 return 0; 1439 } 1440 1441 /* 1442 * We always charge the cgroup the mm_struct belongs to. 1443 * The mm_struct's mem_cgroup changes on task migration if the 1444 * thread group leader migrates. It's possible that mm is not 1445 * set, if so charge the init_mm (happens for pagecache usage). 1446 */ 1447 mem = *memcg; 1448 if (likely(!mem)) { 1449 mem = try_get_mem_cgroup_from_mm(mm); 1450 *memcg = mem; 1451 } else { 1452 css_get(&mem->css); 1453 } 1454 if (unlikely(!mem)) 1455 return 0; 1456 1457 VM_BUG_ON(css_is_removed(&mem->css)); 1458 if (mem_cgroup_is_root(mem)) 1459 goto done; 1460 1461 while (1) { 1462 int ret = 0; 1463 unsigned long flags = 0; 1464 1465 if (consume_stock(mem)) 1466 goto charged; 1467 1468 ret = res_counter_charge(&mem->res, csize, &fail_res); 1469 if (likely(!ret)) { 1470 if (!do_swap_account) 1471 break; 1472 ret = res_counter_charge(&mem->memsw, csize, &fail_res); 1473 if (likely(!ret)) 1474 break; 1475 /* mem+swap counter fails */ 1476 res_counter_uncharge(&mem->res, csize); 1477 flags |= MEM_CGROUP_RECLAIM_NOSWAP; 1478 mem_over_limit = mem_cgroup_from_res_counter(fail_res, 1479 memsw); 1480 } else 1481 /* mem counter fails */ 1482 mem_over_limit = mem_cgroup_from_res_counter(fail_res, 1483 res); 1484 1485 /* reduce request size and retry */ 1486 if (csize > PAGE_SIZE) { 1487 csize = PAGE_SIZE; 1488 continue; 1489 } 1490 if (!(gfp_mask & __GFP_WAIT)) 1491 goto nomem; 1492 1493 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, 1494 gfp_mask, flags); 1495 if (ret) 1496 continue; 1497 1498 /* 1499 * try_to_free_mem_cgroup_pages() might not give us a full 1500 * picture of reclaim. Some pages are reclaimed and might be 1501 * moved to swap cache or just unmapped from the cgroup. 
1502 * Check the limit again to see if the reclaim reduced the 1503 * current usage of the cgroup before giving up 1504 * 1505 */ 1506 if (mem_cgroup_check_under_limit(mem_over_limit)) 1507 continue; 1508 1509 /* try to avoid oom while someone is moving charge */ 1510 if (mc.moving_task && current != mc.moving_task) { 1511 struct mem_cgroup *from, *to; 1512 bool do_continue = false; 1513 /* 1514 * There is a small race that "from" or "to" can be 1515 * freed by rmdir, so we use css_tryget(). 1516 */ 1517 rcu_read_lock(); 1518 from = mc.from; 1519 to = mc.to; 1520 if (from && css_tryget(&from->css)) { 1521 if (mem_over_limit->use_hierarchy) 1522 do_continue = css_is_ancestor( 1523 &from->css, 1524 &mem_over_limit->css); 1525 else 1526 do_continue = (from == mem_over_limit); 1527 css_put(&from->css); 1528 } 1529 if (!do_continue && to && css_tryget(&to->css)) { 1530 if (mem_over_limit->use_hierarchy) 1531 do_continue = css_is_ancestor( 1532 &to->css, 1533 &mem_over_limit->css); 1534 else 1535 do_continue = (to == mem_over_limit); 1536 css_put(&to->css); 1537 } 1538 rcu_read_unlock(); 1539 if (do_continue) { 1540 DEFINE_WAIT(wait); 1541 prepare_to_wait(&mc.waitq, &wait, 1542 TASK_INTERRUPTIBLE); 1543 /* moving charge context might have finished. */ 1544 if (mc.moving_task) 1545 schedule(); 1546 finish_wait(&mc.waitq, &wait); 1547 continue; 1548 } 1549 } 1550 1551 if (!nr_retries--) { 1552 if (oom) { 1553 mem_cgroup_out_of_memory(mem_over_limit, gfp_mask); 1554 record_last_oom(mem_over_limit); 1555 } 1556 goto nomem; 1557 } 1558 } 1559 if (csize > PAGE_SIZE) 1560 refill_stock(mem, csize - PAGE_SIZE); 1561charged: 1562 /* 1563 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. 1564 * if they exceeds softlimit. 1565 */ 1566 if (page && mem_cgroup_soft_limit_check(mem)) 1567 mem_cgroup_update_tree(mem, page); 1568done: 1569 if (mem_cgroup_threshold_check(mem)) 1570 mem_cgroup_threshold(mem); 1571 return 0; 1572nomem: 1573 css_put(&mem->css); 1574 return -ENOMEM; 1575} 1576 1577/* 1578 * Somemtimes we have to undo a charge we got by try_charge(). 1579 * This function is for that and do uncharge, put css's refcnt. 1580 * gotten by try_charge(). 1581 */ 1582static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem, 1583 unsigned long count) 1584{ 1585 if (!mem_cgroup_is_root(mem)) { 1586 res_counter_uncharge(&mem->res, PAGE_SIZE * count); 1587 if (do_swap_account) 1588 res_counter_uncharge(&mem->memsw, PAGE_SIZE * count); 1589 VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags)); 1590 WARN_ON_ONCE(count > INT_MAX); 1591 __css_put(&mem->css, (int)count); 1592 } 1593 /* we don't need css_put for root */ 1594} 1595 1596static void mem_cgroup_cancel_charge(struct mem_cgroup *mem) 1597{ 1598 __mem_cgroup_cancel_charge(mem, 1); 1599} 1600 1601/* 1602 * A helper function to get mem_cgroup from ID. must be called under 1603 * rcu_read_lock(). The caller must check css_is_removed() or some if 1604 * it's concern. (dropping refcnt from swap can be called against removed 1605 * memcg.) 
1606 */ 1607static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) 1608{ 1609 struct cgroup_subsys_state *css; 1610 1611 /* ID 0 is unused ID */ 1612 if (!id) 1613 return NULL; 1614 css = css_lookup(&mem_cgroup_subsys, id); 1615 if (!css) 1616 return NULL; 1617 return container_of(css, struct mem_cgroup, css); 1618} 1619 1620struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) 1621{ 1622 struct mem_cgroup *mem = NULL; 1623 struct page_cgroup *pc; 1624 unsigned short id; 1625 swp_entry_t ent; 1626 1627 VM_BUG_ON(!PageLocked(page)); 1628 1629 pc = lookup_page_cgroup(page); 1630 lock_page_cgroup(pc); 1631 if (PageCgroupUsed(pc)) { 1632 mem = pc->mem_cgroup; 1633 if (mem && !css_tryget(&mem->css)) 1634 mem = NULL; 1635 } else if (PageSwapCache(page)) { 1636 ent.val = page_private(page); 1637 id = lookup_swap_cgroup(ent); 1638 rcu_read_lock(); 1639 mem = mem_cgroup_lookup(id); 1640 if (mem && !css_tryget(&mem->css)) 1641 mem = NULL; 1642 rcu_read_unlock(); 1643 } 1644 unlock_page_cgroup(pc); 1645 return mem; 1646} 1647 1648/* 1649 * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be 1650 * USED state. If already USED, uncharge and return. 1651 */ 1652 1653static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, 1654 struct page_cgroup *pc, 1655 enum charge_type ctype) 1656{ 1657 /* try_charge() can return NULL to *memcg, taking care of it. */ 1658 if (!mem) 1659 return; 1660 1661 lock_page_cgroup(pc); 1662 if (unlikely(PageCgroupUsed(pc))) { 1663 unlock_page_cgroup(pc); 1664 mem_cgroup_cancel_charge(mem); 1665 return; 1666 } 1667 1668 pc->mem_cgroup = mem; 1669 /* 1670 * We access a page_cgroup asynchronously without lock_page_cgroup(). 1671 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup 1672 * is accessed after testing USED bit. To make pc->mem_cgroup visible 1673 * before USED bit, we need memory barrier here. 1674 * See mem_cgroup_add_lru_list(), etc. 1675 */ 1676 smp_wmb(); 1677 switch (ctype) { 1678 case MEM_CGROUP_CHARGE_TYPE_CACHE: 1679 case MEM_CGROUP_CHARGE_TYPE_SHMEM: 1680 SetPageCgroupCache(pc); 1681 SetPageCgroupUsed(pc); 1682 break; 1683 case MEM_CGROUP_CHARGE_TYPE_MAPPED: 1684 ClearPageCgroupCache(pc); 1685 SetPageCgroupUsed(pc); 1686 break; 1687 default: 1688 break; 1689 } 1690 1691 mem_cgroup_charge_statistics(mem, pc, true); 1692 1693 unlock_page_cgroup(pc); 1694} 1695 1696/** 1697 * __mem_cgroup_move_account - move account of the page 1698 * @pc: page_cgroup of the page. 1699 * @from: mem_cgroup which the page is moved from. 1700 * @to: mem_cgroup which the page is moved to. @from != @to. 1701 * @uncharge: whether we should call uncharge and css_put against @from. 1702 * 1703 * The caller must confirm following. 1704 * - page is not on LRU (isolate_page() is useful.) 1705 * - the pc is locked, used, and ->mem_cgroup points to @from. 1706 * 1707 * This function doesn't do "charge" nor css_get to new cgroup. It should be 1708 * done by a caller(__mem_cgroup_try_charge would be usefull). If @uncharge is 1709 * true, this function does "uncharge" from old cgroup, but it doesn't if 1710 * @uncharge is false, so a caller should do "uncharge". 
1711 */ 1712 1713static void __mem_cgroup_move_account(struct page_cgroup *pc, 1714 struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) 1715{ 1716 struct page *page; 1717 1718 VM_BUG_ON(from == to); 1719 VM_BUG_ON(PageLRU(pc->page)); 1720 VM_BUG_ON(!PageCgroupLocked(pc)); 1721 VM_BUG_ON(!PageCgroupUsed(pc)); 1722 VM_BUG_ON(pc->mem_cgroup != from); 1723 1724 page = pc->page; 1725 if (page_mapped(page) && !PageAnon(page)) { 1726 /* Update mapped_file data for mem_cgroup */ 1727 preempt_disable(); 1728 __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 1729 __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 1730 preempt_enable(); 1731 } 1732 mem_cgroup_charge_statistics(from, pc, false); 1733 if (uncharge) 1734 /* This is not "cancel", but cancel_charge does all we need. */ 1735 mem_cgroup_cancel_charge(from); 1736 1737 /* caller should have done css_get */ 1738 pc->mem_cgroup = to; 1739 mem_cgroup_charge_statistics(to, pc, true); 1740 /* 1741 * We charges against "to" which may not have any tasks. Then, "to" 1742 * can be under rmdir(). But in current implementation, caller of 1743 * this function is just force_empty() and move charge, so it's 1744 * garanteed that "to" is never removed. So, we don't check rmdir 1745 * status here. 1746 */ 1747} 1748 1749/* 1750 * check whether the @pc is valid for moving account and call 1751 * __mem_cgroup_move_account() 1752 */ 1753static int mem_cgroup_move_account(struct page_cgroup *pc, 1754 struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) 1755{ 1756 int ret = -EINVAL; 1757 lock_page_cgroup(pc); 1758 if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { 1759 __mem_cgroup_move_account(pc, from, to, uncharge); 1760 ret = 0; 1761 } 1762 unlock_page_cgroup(pc); 1763 return ret; 1764} 1765 1766/* 1767 * move charges to its parent. 1768 */ 1769 1770static int mem_cgroup_move_parent(struct page_cgroup *pc, 1771 struct mem_cgroup *child, 1772 gfp_t gfp_mask) 1773{ 1774 struct page *page = pc->page; 1775 struct cgroup *cg = child->css.cgroup; 1776 struct cgroup *pcg = cg->parent; 1777 struct mem_cgroup *parent; 1778 int ret; 1779 1780 /* Is ROOT ? */ 1781 if (!pcg) 1782 return -EINVAL; 1783 1784 ret = -EBUSY; 1785 if (!get_page_unless_zero(page)) 1786 goto out; 1787 if (isolate_lru_page(page)) 1788 goto put; 1789 1790 parent = mem_cgroup_from_cont(pcg); 1791 ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page); 1792 if (ret || !parent) 1793 goto put_back; 1794 1795 ret = mem_cgroup_move_account(pc, child, parent, true); 1796 if (ret) 1797 mem_cgroup_cancel_charge(parent); 1798put_back: 1799 putback_lru_page(page); 1800put: 1801 put_page(page); 1802out: 1803 return ret; 1804} 1805 1806/* 1807 * Charge the memory controller for page usage. 
1808 * Return 1809 * 0 if the charge was successful 1810 * < 0 if the cgroup is over its limit 1811 */ 1812static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, 1813 gfp_t gfp_mask, enum charge_type ctype, 1814 struct mem_cgroup *memcg) 1815{ 1816 struct mem_cgroup *mem; 1817 struct page_cgroup *pc; 1818 int ret; 1819 1820 pc = lookup_page_cgroup(page); 1821 /* can happen at boot */ 1822 if (unlikely(!pc)) 1823 return 0; 1824 prefetchw(pc); 1825 1826 mem = memcg; 1827 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page); 1828 if (ret || !mem) 1829 return ret; 1830 1831 __mem_cgroup_commit_charge(mem, pc, ctype); 1832 return 0; 1833} 1834 1835int mem_cgroup_newpage_charge(struct page *page, 1836 struct mm_struct *mm, gfp_t gfp_mask) 1837{ 1838 if (mem_cgroup_disabled()) 1839 return 0; 1840 if (PageCompound(page)) 1841 return 0; 1842 /* 1843 * If already mapped, we don't have to account. 1844 * If page cache, page->mapping has address_space. 1845 * But page->mapping may have out-of-use anon_vma pointer, 1846 * detecit it by PageAnon() check. newly-mapped-anon's page->mapping 1847 * is NULL. 1848 */ 1849 if (page_mapped(page) || (page->mapping && !PageAnon(page))) 1850 return 0; 1851 if (unlikely(!mm)) 1852 mm = &init_mm; 1853 return mem_cgroup_charge_common(page, mm, gfp_mask, 1854 MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL); 1855} 1856 1857static void 1858__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, 1859 enum charge_type ctype); 1860 1861int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 1862 gfp_t gfp_mask) 1863{ 1864 struct mem_cgroup *mem = NULL; 1865 int ret; 1866 1867 if (mem_cgroup_disabled()) 1868 return 0; 1869 if (PageCompound(page)) 1870 return 0; 1871 /* 1872 * Corner case handling. This is called from add_to_page_cache() 1873 * in usual. But some FS (shmem) precharges this page before calling it 1874 * and call add_to_page_cache() with GFP_NOWAIT. 1875 * 1876 * For GFP_NOWAIT case, the page may be pre-charged before calling 1877 * add_to_page_cache(). (See shmem.c) check it here and avoid to call 1878 * charge twice. (It works but has to pay a bit larger cost.) 1879 * And when the page is SwapCache, it should take swap information 1880 * into account. This is under lock_page() now. 1881 */ 1882 if (!(gfp_mask & __GFP_WAIT)) { 1883 struct page_cgroup *pc; 1884 1885 1886 pc = lookup_page_cgroup(page); 1887 if (!pc) 1888 return 0; 1889 lock_page_cgroup(pc); 1890 if (PageCgroupUsed(pc)) { 1891 unlock_page_cgroup(pc); 1892 return 0; 1893 } 1894 unlock_page_cgroup(pc); 1895 } 1896 1897 if (unlikely(!mm && !mem)) 1898 mm = &init_mm; 1899 1900 if (page_is_file_cache(page)) 1901 return mem_cgroup_charge_common(page, mm, gfp_mask, 1902 MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); 1903 1904 /* shmem */ 1905 if (PageSwapCache(page)) { 1906 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); 1907 if (!ret) 1908 __mem_cgroup_commit_charge_swapin(page, mem, 1909 MEM_CGROUP_CHARGE_TYPE_SHMEM); 1910 } else 1911 ret = mem_cgroup_charge_common(page, mm, gfp_mask, 1912 MEM_CGROUP_CHARGE_TYPE_SHMEM, mem); 1913 1914 return ret; 1915} 1916 1917/* 1918 * While swap-in, try_charge -> commit or cancel, the page is locked. 1919 * And when try_charge() successfully returns, one refcnt to memcg without 1920 * struct page_cgroup is acquired. 
This refcnt will be consumed by 1921 * "commit()" or removed by "cancel()" 1922 */ 1923int mem_cgroup_try_charge_swapin(struct mm_struct *mm, 1924 struct page *page, 1925 gfp_t mask, struct mem_cgroup **ptr) 1926{ 1927 struct mem_cgroup *mem; 1928 int ret; 1929 1930 if (mem_cgroup_disabled()) 1931 return 0; 1932 1933 if (!do_swap_account) 1934 goto charge_cur_mm; 1935 /* 1936 * A racing thread's fault, or swapoff, may have already updated 1937 * the pte, and even removed page from swap cache: in those cases 1938 * do_swap_page()'s pte_same() test will fail; but there's also a 1939 * KSM case which does need to charge the page. 1940 */ 1941 if (!PageSwapCache(page)) 1942 goto charge_cur_mm; 1943 mem = try_get_mem_cgroup_from_page(page); 1944 if (!mem) 1945 goto charge_cur_mm; 1946 *ptr = mem; 1947 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, page); 1948 /* drop extra refcnt from tryget */ 1949 css_put(&mem->css); 1950 return ret; 1951charge_cur_mm: 1952 if (unlikely(!mm)) 1953 mm = &init_mm; 1954 return __mem_cgroup_try_charge(mm, mask, ptr, true, page); 1955} 1956 1957static void 1958__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, 1959 enum charge_type ctype) 1960{ 1961 struct page_cgroup *pc; 1962 1963 if (mem_cgroup_disabled()) 1964 return; 1965 if (!ptr) 1966 return; 1967 cgroup_exclude_rmdir(&ptr->css); 1968 pc = lookup_page_cgroup(page); 1969 mem_cgroup_lru_del_before_commit_swapcache(page); 1970 __mem_cgroup_commit_charge(ptr, pc, ctype); 1971 mem_cgroup_lru_add_after_commit_swapcache(page); 1972 /* 1973 * Now swap is on-memory. This means this page may be 1974 * counted both as mem and swap....double count. 1975 * Fix it by uncharging from memsw. Basically, this SwapCache is stable 1976 * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page() 1977 * may call delete_from_swap_cache() before reach here. 1978 */ 1979 if (do_swap_account && PageSwapCache(page)) { 1980 swp_entry_t ent = {.val = page_private(page)}; 1981 unsigned short id; 1982 struct mem_cgroup *memcg; 1983 1984 id = swap_cgroup_record(ent, 0); 1985 rcu_read_lock(); 1986 memcg = mem_cgroup_lookup(id); 1987 if (memcg) { 1988 /* 1989 * This recorded memcg can be obsolete one. So, avoid 1990 * calling css_tryget 1991 */ 1992 if (!mem_cgroup_is_root(memcg)) 1993 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 1994 mem_cgroup_swap_statistics(memcg, false); 1995 mem_cgroup_put(memcg); 1996 } 1997 rcu_read_unlock(); 1998 } 1999 /* 2000 * At swapin, we may charge account against cgroup which has no tasks. 2001 * So, rmdir()->pre_destroy() can be called while we do this charge. 2002 * In that case, we need to call pre_destroy() again. check it here. 
2003 */ 2004 cgroup_release_and_wakeup_rmdir(&ptr->css); 2005} 2006 2007void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) 2008{ 2009 __mem_cgroup_commit_charge_swapin(page, ptr, 2010 MEM_CGROUP_CHARGE_TYPE_MAPPED); 2011} 2012 2013void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) 2014{ 2015 if (mem_cgroup_disabled()) 2016 return; 2017 if (!mem) 2018 return; 2019 mem_cgroup_cancel_charge(mem); 2020} 2021
2022static void 2023__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) 2024{ 2025 struct memcg_batch_info *batch = NULL; 2026 bool uncharge_memsw = true; 2027 /* If swapout, usage of swap doesn't decrease */ 2028 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 2029 uncharge_memsw = false; 2030 /* 2031 * do_batch > 0 when unmapping pages or inode invalidate/truncate. 2032 * In those cases, all pages freed continuously can be expected to be in 2033 * the same cgroup and we have a chance to coalesce uncharges. 2034 * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE) 2035 * because we want to do uncharge as soon as possible. 2036 */ 2037 if (!current->memcg_batch.do_batch || test_thread_flag(TIF_MEMDIE)) 2038 goto direct_uncharge; 2039 2040 batch = &current->memcg_batch; 2041 /* 2042 * Usually, we do css_get() when we remember the memcg pointer. 2043 * But in this case, we keep res->usage until the end of a series of 2044 * uncharges. Then, it's ok to ignore memcg's refcnt. 2045 */ 2046 if (!batch->memcg) 2047 batch->memcg = mem; 2048 /* 2049 * In the typical case, batch->memcg == mem. This means we can 2050 * merge a series of uncharges into one uncharge of the res_counter. 2051 * If not, we uncharge the res_counter one by one. 2052 */ 2053 if (batch->memcg != mem) 2054 goto direct_uncharge; 2055 /* remember freed charge and uncharge it later */ 2056 batch->bytes += PAGE_SIZE; 2057 if (uncharge_memsw) 2058 batch->memsw_bytes += PAGE_SIZE; 2059 return; 2060direct_uncharge: 2061 res_counter_uncharge(&mem->res, PAGE_SIZE); 2062 if (uncharge_memsw) 2063 res_counter_uncharge(&mem->memsw, PAGE_SIZE); 2064 return; 2065} 2066
2067/* 2068 * uncharge if !page_mapped(page) 2069 */ 2070static struct mem_cgroup * 2071__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) 2072{ 2073 struct page_cgroup *pc; 2074 struct mem_cgroup *mem = NULL; 2075 struct mem_cgroup_per_zone *mz; 2076 2077 if (mem_cgroup_disabled()) 2078 return NULL; 2079 2080 if (PageSwapCache(page)) 2081 return NULL; 2082 2083 /* 2084 * Check if our page_cgroup is valid 2085 */ 2086 pc = lookup_page_cgroup(page); 2087 if (unlikely(!pc || !PageCgroupUsed(pc))) 2088 return NULL; 2089 2090 lock_page_cgroup(pc); 2091 2092 mem = pc->mem_cgroup; 2093 2094 if (!PageCgroupUsed(pc)) 2095 goto unlock_out; 2096 2097 switch (ctype) { 2098 case MEM_CGROUP_CHARGE_TYPE_MAPPED: 2099 case MEM_CGROUP_CHARGE_TYPE_DROP: 2100 if (page_mapped(page)) 2101 goto unlock_out; 2102 break; 2103 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: 2104 if (!PageAnon(page)) { /* Shared memory */ 2105 if (page->mapping && !page_is_file_cache(page)) 2106 goto unlock_out; 2107 } else if (page_mapped(page)) /* Anon */ 2108 goto unlock_out; 2109 break; 2110 default: 2111 break; 2112 } 2113 2114 if (!mem_cgroup_is_root(mem)) 2115 __do_uncharge(mem, ctype); 2116 if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 2117 mem_cgroup_swap_statistics(mem, true); 2118 mem_cgroup_charge_statistics(mem, pc, false); 2119 2120 ClearPageCgroupUsed(pc); 2121 /* 2122 * pc->mem_cgroup is not cleared here. It will be accessed when it's 2123 * freed from the LRU. This is safe because an uncharged page is expected not 2124 * to be reused (it is freed soon). The exception is SwapCache, which is handled by 2125 * special functions. 2126 */ 2127 2128 mz = page_cgroup_zoneinfo(pc); 2129 unlock_page_cgroup(pc); 2130 2131 if (mem_cgroup_soft_limit_check(mem)) 2132 mem_cgroup_update_tree(mem, page); 2133 if (mem_cgroup_threshold_check(mem)) 2134 mem_cgroup_threshold(mem); 2135 /* at swapout, this memcg will be accessed to record to swap */ 2136 if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 2137 css_put(&mem->css); 2138 2139 return mem; 2140 2141unlock_out: 2142 unlock_page_cgroup(pc); 2143 return NULL; 2144} 2145
2146void mem_cgroup_uncharge_page(struct page *page) 2147{ 2148 /* early check. */ 2149 if (page_mapped(page)) 2150 return; 2151 if (page->mapping && !PageAnon(page)) 2152 return; 2153 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); 2154} 2155 2156void mem_cgroup_uncharge_cache_page(struct page *page) 2157{ 2158 VM_BUG_ON(page_mapped(page)); 2159 VM_BUG_ON(page->mapping); 2160 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); 2161} 2162
2163/* 2164 * batch_start/batch_end are called from unmap_page_range/invalidate/truncate. 2165 * In those cases, pages are freed continuously and can be expected to be 2166 * in the same memcg. Each of those callers itself limits the number of 2167 * pages freed at once, so uncharge_start/end() is called properly. 2168 * These may also be called (nested) more than once in one context, 2169 */ 2170 2171void mem_cgroup_uncharge_start(void) 2172{ 2173 current->memcg_batch.do_batch++; 2174 /* Nesting is allowed. */ 2175 if (current->memcg_batch.do_batch == 1) { 2176 current->memcg_batch.memcg = NULL; 2177 current->memcg_batch.bytes = 0; 2178 current->memcg_batch.memsw_bytes = 0; 2179 } 2180} 2181 2182void mem_cgroup_uncharge_end(void) 2183{ 2184 struct memcg_batch_info *batch = &current->memcg_batch; 2185 2186 if (!batch->do_batch) 2187 return; 2188 2189 batch->do_batch--; 2190 if (batch->do_batch) /* If stacked, do nothing. */ 2191 return; 2192 2193 if (!batch->memcg) 2194 return; 2195 /* 2196 * This "batch->memcg" is valid without any css_get/put etc. 2197 * because we hide charges behind us. 2198 */ 2199 if (batch->bytes) 2200 res_counter_uncharge(&batch->memcg->res, batch->bytes); 2201 if (batch->memsw_bytes) 2202 res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes); 2203 /* forget this pointer (for sanity check) */ 2204 batch->memcg = NULL; 2205} 2206
2207#ifdef CONFIG_SWAP 2208/* 2209 * called after __delete_from_swap_cache(); drops the "page" account. 2210 * memcg information is recorded in the swap_cgroup of "ent" 2211 */ 2212void 2213mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) 2214{ 2215 struct mem_cgroup *memcg; 2216 int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT; 2217 2218 if (!swapout) /* this was a swap cache but the swap is unused! */ 2219 ctype = MEM_CGROUP_CHARGE_TYPE_DROP; 2220 2221 memcg = __mem_cgroup_uncharge_common(page, ctype); 2222 2223 /* record memcg information */ 2224 if (do_swap_account && swapout && memcg) { 2225 swap_cgroup_record(ent, css_id(&memcg->css)); 2226 mem_cgroup_get(memcg); 2227 } 2228 if (swapout && memcg) 2229 css_put(&memcg->css); 2230} 2231#endif 2232
2233#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 2234/* 2235 * called from swap_entry_free(). Removes the record in swap_cgroup and 2236 * uncharges the "memsw" account.
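
/*
 * Illustrative sketch (hypothetical caller, not from this file): how a
 * truncate/unmap-style loop brackets its frees with
 * mem_cgroup_uncharge_start()/end() so that __do_uncharge() can coalesce
 * the res_counter updates into a single uncharge per counter.
 * release_pagecache_page() is a made-up stand-in for the real page freeing.
 */
static void example_uncharge_run(struct page **pages, int nr)
{
	int i;

	mem_cgroup_uncharge_start();	/* do_batch++, reset the batch info */
	for (i = 0; i < nr; i++) {
		/* only accumulates bytes in current->memcg_batch as long as
		 * the pages keep coming from the same memcg */
		mem_cgroup_uncharge_cache_page(pages[i]);
		release_pagecache_page(pages[i]);	/* hypothetical */
	}
	mem_cgroup_uncharge_end();	/* one res_counter_uncharge() per counter */
}
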
2237 */ 2238void mem_cgroup_uncharge_swap(swp_entry_t ent) 2239{ 2240 struct mem_cgroup *memcg; 2241 unsigned short id; 2242 2243 if (!do_swap_account) 2244 return; 2245 2246 id = swap_cgroup_record(ent, 0); 2247 rcu_read_lock(); 2248 memcg = mem_cgroup_lookup(id); 2249 if (memcg) { 2250 /* 2251 * We uncharge this because swap is freed. 2252 * This memcg can be obsolete one. We avoid calling css_tryget 2253 */ 2254 if (!mem_cgroup_is_root(memcg)) 2255 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 2256 mem_cgroup_swap_statistics(memcg, false); 2257 mem_cgroup_put(memcg); 2258 } 2259 rcu_read_unlock(); 2260} 2261 2262/** 2263 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. 2264 * @entry: swap entry to be moved 2265 * @from: mem_cgroup which the entry is moved from 2266 * @to: mem_cgroup which the entry is moved to 2267 * @need_fixup: whether we should fixup res_counters and refcounts. 2268 * 2269 * It succeeds only when the swap_cgroup's record for this entry is the same 2270 * as the mem_cgroup's id of @from. 2271 * 2272 * Returns 0 on success, -EINVAL on failure. 2273 * 2274 * The caller must have charged to @to, IOW, called res_counter_charge() about 2275 * both res and memsw, and called css_get(). 2276 */ 2277static int mem_cgroup_move_swap_account(swp_entry_t entry, 2278 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) 2279{ 2280 unsigned short old_id, new_id; 2281 2282 old_id = css_id(&from->css); 2283 new_id = css_id(&to->css); 2284 2285 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { 2286 mem_cgroup_swap_statistics(from, false); 2287 mem_cgroup_swap_statistics(to, true); 2288 /* 2289 * This function is only called from task migration context now. 2290 * It postpones res_counter and refcount handling till the end 2291 * of task migration(mem_cgroup_clear_mc()) for performance 2292 * improvement. But we cannot postpone mem_cgroup_get(to) 2293 * because if the process that has been moved to @to does 2294 * swap-in, the refcount of @to might be decreased to 0. 2295 */ 2296 mem_cgroup_get(to); 2297 if (need_fixup) { 2298 if (!mem_cgroup_is_root(from)) 2299 res_counter_uncharge(&from->memsw, PAGE_SIZE); 2300 mem_cgroup_put(from); 2301 /* 2302 * we charged both to->res and to->memsw, so we should 2303 * uncharge to->res. 2304 */ 2305 if (!mem_cgroup_is_root(to)) 2306 res_counter_uncharge(&to->res, PAGE_SIZE); 2307 css_put(&to->css); 2308 } 2309 return 0; 2310 } 2311 return -EINVAL; 2312} 2313#else 2314static inline int mem_cgroup_move_swap_account(swp_entry_t entry, 2315 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) 2316{ 2317 return -EINVAL; 2318} 2319#endif 2320 2321/* 2322 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old 2323 * page belongs to. 
2324 */ 2325int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) 2326{ 2327 struct page_cgroup *pc; 2328 struct mem_cgroup *mem = NULL; 2329 int ret = 0; 2330 2331 if (mem_cgroup_disabled()) 2332 return 0; 2333 2334 pc = lookup_page_cgroup(page); 2335 lock_page_cgroup(pc); 2336 if (PageCgroupUsed(pc)) { 2337 mem = pc->mem_cgroup; 2338 css_get(&mem->css); 2339 } 2340 unlock_page_cgroup(pc); 2341 2342 if (mem) { 2343 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, 2344 page); 2345 css_put(&mem->css); 2346 } 2347 *ptr = mem; 2348 return ret; 2349} 2350 2351/* remove redundant charge if migration failed*/ 2352void mem_cgroup_end_migration(struct mem_cgroup *mem, 2353 struct page *oldpage, struct page *newpage) 2354{ 2355 struct page *target, *unused; 2356 struct page_cgroup *pc; 2357 enum charge_type ctype; 2358 2359 if (!mem) 2360 return; 2361 cgroup_exclude_rmdir(&mem->css); 2362 /* at migration success, oldpage->mapping is NULL. */ 2363 if (oldpage->mapping) { 2364 target = oldpage; 2365 unused = NULL; 2366 } else { 2367 target = newpage; 2368 unused = oldpage; 2369 } 2370 2371 if (PageAnon(target)) 2372 ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; 2373 else if (page_is_file_cache(target)) 2374 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; 2375 else 2376 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; 2377 2378 /* unused page is not on radix-tree now. */ 2379 if (unused) 2380 __mem_cgroup_uncharge_common(unused, ctype); 2381 2382 pc = lookup_page_cgroup(target); 2383 /* 2384 * __mem_cgroup_commit_charge() check PCG_USED bit of page_cgroup. 2385 * So, double-counting is effectively avoided. 2386 */ 2387 __mem_cgroup_commit_charge(mem, pc, ctype); 2388 2389 /* 2390 * Both of oldpage and newpage are still under lock_page(). 2391 * Then, we don't have to care about race in radix-tree. 2392 * But we have to be careful that this page is unmapped or not. 2393 * 2394 * There is a case for !page_mapped(). At the start of 2395 * migration, oldpage was mapped. But now, it's zapped. 2396 * But we know *target* page is not freed/reused under us. 2397 * mem_cgroup_uncharge_page() does all necessary checks. 2398 */ 2399 if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) 2400 mem_cgroup_uncharge_page(target); 2401 /* 2402 * At migration, we may charge account against cgroup which has no tasks 2403 * So, rmdir()->pre_destroy() can be called while we do this charge. 2404 * In that case, we need to call pre_destroy() again. check it here. 2405 */ 2406 cgroup_release_and_wakeup_rmdir(&mem->css); 2407} 2408 2409/* 2410 * A call to try to shrink memory usage on charge failure at shmem's swapin. 2411 * Calling hierarchical_reclaim is not enough because we should update 2412 * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM. 2413 * Moreover considering hierarchy, we should reclaim from the mem_over_limit, 2414 * not from the memcg which this page would be charged to. 2415 * try_charge_swapin does all of these works properly. 
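
/*
 * Illustrative sketch (hypothetical caller, simplified from what
 * mm/migrate.c does): prepare_migration() charges PAGE_SIZE against the
 * old page's memcg before the copy, and end_migration() commits the charge
 * to whichever page kept the mapping while uncharging the unused one.
 * copy_and_remap() is a made-up stand-in for the actual migration work.
 */
static int example_migrate_accounted(struct page *oldpage, struct page *newpage)
{
	struct mem_cgroup *mem = NULL;
	int err;

	err = mem_cgroup_prepare_migration(oldpage, &mem);
	if (err)
		return err;	/* e.g. -ENOMEM from the try_charge above */

	err = copy_and_remap(oldpage, newpage);	/* hypothetical */

	/* on failure oldpage->mapping is still set, so the charge stays with
	 * oldpage and newpage is uncharged; on success it is the reverse */
	mem_cgroup_end_migration(mem, oldpage, newpage);
	return err;
}
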
2416 */ 2417int mem_cgroup_shmem_charge_fallback(struct page *page, 2418 struct mm_struct *mm, 2419 gfp_t gfp_mask) 2420{ 2421 struct mem_cgroup *mem = NULL; 2422 int ret; 2423 2424 if (mem_cgroup_disabled()) 2425 return 0; 2426 2427 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); 2428 if (!ret) 2429 mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */ 2430 2431 return ret; 2432} 2433 2434static DEFINE_MUTEX(set_limit_mutex); 2435 2436static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, 2437 unsigned long long val) 2438{ 2439 int retry_count; 2440 u64 memswlimit; 2441 int ret = 0; 2442 int children = mem_cgroup_count_children(memcg); 2443 u64 curusage, oldusage; 2444 2445 /* 2446 * For keeping hierarchical_reclaim simple, how long we should retry 2447 * is depends on callers. We set our retry-count to be function 2448 * of # of children which we should visit in this loop. 2449 */ 2450 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children; 2451 2452 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); 2453 2454 while (retry_count) { 2455 if (signal_pending(current)) { 2456 ret = -EINTR; 2457 break; 2458 } 2459 /* 2460 * Rather than hide all in some function, I do this in 2461 * open coded manner. You see what this really does. 2462 * We have to guarantee mem->res.limit < mem->memsw.limit. 2463 */ 2464 mutex_lock(&set_limit_mutex); 2465 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 2466 if (memswlimit < val) { 2467 ret = -EINVAL; 2468 mutex_unlock(&set_limit_mutex); 2469 break; 2470 } 2471 ret = res_counter_set_limit(&memcg->res, val); 2472 if (!ret) { 2473 if (memswlimit == val) 2474 memcg->memsw_is_minimum = true; 2475 else 2476 memcg->memsw_is_minimum = false; 2477 } 2478 mutex_unlock(&set_limit_mutex); 2479 2480 if (!ret) 2481 break; 2482 2483 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, 2484 MEM_CGROUP_RECLAIM_SHRINK); 2485 curusage = res_counter_read_u64(&memcg->res, RES_USAGE); 2486 /* Usage is reduced ? */ 2487 if (curusage >= oldusage) 2488 retry_count--; 2489 else 2490 oldusage = curusage; 2491 } 2492 2493 return ret; 2494} 2495 2496static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, 2497 unsigned long long val) 2498{ 2499 int retry_count; 2500 u64 memlimit, oldusage, curusage; 2501 int children = mem_cgroup_count_children(memcg); 2502 int ret = -EBUSY; 2503 2504 /* see mem_cgroup_resize_res_limit */ 2505 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; 2506 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 2507 while (retry_count) { 2508 if (signal_pending(current)) { 2509 ret = -EINTR; 2510 break; 2511 } 2512 /* 2513 * Rather than hide all in some function, I do this in 2514 * open coded manner. You see what this really does. 2515 * We have to guarantee mem->res.limit < mem->memsw.limit. 2516 */ 2517 mutex_lock(&set_limit_mutex); 2518 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 2519 if (memlimit > val) { 2520 ret = -EINVAL; 2521 mutex_unlock(&set_limit_mutex); 2522 break; 2523 } 2524 ret = res_counter_set_limit(&memcg->memsw, val); 2525 if (!ret) { 2526 if (memlimit == val) 2527 memcg->memsw_is_minimum = true; 2528 else 2529 memcg->memsw_is_minimum = false; 2530 } 2531 mutex_unlock(&set_limit_mutex); 2532 2533 if (!ret) 2534 break; 2535 2536 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, 2537 MEM_CGROUP_RECLAIM_NOSWAP | 2538 MEM_CGROUP_RECLAIM_SHRINK); 2539 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 2540 /* Usage is reduced ? 
*/ 2541 if (curusage >= oldusage) 2542 retry_count--; 2543 else 2544 oldusage = curusage; 2545 } 2546 return ret; 2547} 2548 2549unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 2550 gfp_t gfp_mask, int nid, 2551 int zid) 2552{ 2553 unsigned long nr_reclaimed = 0; 2554 struct mem_cgroup_per_zone *mz, *next_mz = NULL; 2555 unsigned long reclaimed; 2556 int loop = 0; 2557 struct mem_cgroup_tree_per_zone *mctz; 2558 unsigned long long excess; 2559 2560 if (order > 0) 2561 return 0; 2562 2563 mctz = soft_limit_tree_node_zone(nid, zid); 2564 /* 2565 * This loop can run a while, specially if mem_cgroup's continuously 2566 * keep exceeding their soft limit and putting the system under 2567 * pressure 2568 */ 2569 do { 2570 if (next_mz) 2571 mz = next_mz; 2572 else 2573 mz = mem_cgroup_largest_soft_limit_node(mctz); 2574 if (!mz) 2575 break; 2576 2577 reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone, 2578 gfp_mask, 2579 MEM_CGROUP_RECLAIM_SOFT); 2580 nr_reclaimed += reclaimed; 2581 spin_lock(&mctz->lock); 2582 2583 /* 2584 * If we failed to reclaim anything from this memory cgroup 2585 * it is time to move on to the next cgroup 2586 */ 2587 next_mz = NULL; 2588 if (!reclaimed) { 2589 do { 2590 /* 2591 * Loop until we find yet another one. 2592 * 2593 * By the time we get the soft_limit lock 2594 * again, someone might have aded the 2595 * group back on the RB tree. Iterate to 2596 * make sure we get a different mem. 2597 * mem_cgroup_largest_soft_limit_node returns 2598 * NULL if no other cgroup is present on 2599 * the tree 2600 */ 2601 next_mz = 2602 __mem_cgroup_largest_soft_limit_node(mctz); 2603 if (next_mz == mz) { 2604 css_put(&next_mz->mem->css); 2605 next_mz = NULL; 2606 } else /* next_mz == NULL or other memcg */ 2607 break; 2608 } while (1); 2609 } 2610 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); 2611 excess = res_counter_soft_limit_excess(&mz->mem->res); 2612 /* 2613 * One school of thought says that we should not add 2614 * back the node to the tree if reclaim returns 0. 2615 * But our reclaim could return 0, simply because due 2616 * to priority we are exposing a smaller subset of 2617 * memory to reclaim from. Consider this as a longer 2618 * term TODO. 2619 */ 2620 /* If excess == 0, no tree ops */ 2621 __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess); 2622 spin_unlock(&mctz->lock); 2623 css_put(&mz->mem->css); 2624 loop++; 2625 /* 2626 * Could not reclaim anything and there are no more 2627 * mem cgroups to try or we seem to be looping without 2628 * reclaiming anything. 2629 */ 2630 if (!nr_reclaimed && 2631 (next_mz == NULL || 2632 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) 2633 break; 2634 } while (!nr_reclaimed); 2635 if (next_mz) 2636 css_put(&next_mz->mem->css); 2637 return nr_reclaimed; 2638} 2639 2640/* 2641 * This routine traverse page_cgroup in given list and drop them all. 2642 * *And* this routine doesn't reclaim page itself, just removes page_cgroup. 
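
/*
 * Illustrative user-space sketch of the invariant the two resize handlers
 * above enforce under set_limit_mutex: memory.limit_in_bytes may never
 * exceed memory.memsw.limit_in_bytes. Raising both therefore means writing
 * memsw first (and shrinking means writing memory first); the other order
 * fails with -EINVAL. The mount point and group name are assumptions, and
 * the memsw files only exist when swap accounting is enabled.
 */
#include <stdio.h>

static int write_limit(const char *path, unsigned long long bytes)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fprintf(f, "%llu\n", bytes);
	return fclose(f);
}

int raise_both_limits(unsigned long long bytes)
{
	/* memsw first, otherwise mem_cgroup_resize_limit() rejects the value */
	if (write_limit("/cgroups/memory/grp/memory.memsw.limit_in_bytes", bytes))
		return -1;
	return write_limit("/cgroups/memory/grp/memory.limit_in_bytes", bytes);
}
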
2643 */ 2644static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, 2645 int node, int zid, enum lru_list lru) 2646{ 2647 struct zone *zone; 2648 struct mem_cgroup_per_zone *mz; 2649 struct page_cgroup *pc, *busy; 2650 unsigned long flags, loop; 2651 struct list_head *list; 2652 int ret = 0; 2653 2654 zone = &NODE_DATA(node)->node_zones[zid]; 2655 mz = mem_cgroup_zoneinfo(mem, node, zid); 2656 list = &mz->lists[lru]; 2657 2658 loop = MEM_CGROUP_ZSTAT(mz, lru); 2659 /* give some margin against EBUSY etc...*/ 2660 loop += 256; 2661 busy = NULL; 2662 while (loop--) { 2663 ret = 0; 2664 spin_lock_irqsave(&zone->lru_lock, flags); 2665 if (list_empty(list)) { 2666 spin_unlock_irqrestore(&zone->lru_lock, flags); 2667 break; 2668 } 2669 pc = list_entry(list->prev, struct page_cgroup, lru); 2670 if (busy == pc) { 2671 list_move(&pc->lru, list); 2672 busy = NULL; 2673 spin_unlock_irqrestore(&zone->lru_lock, flags); 2674 continue; 2675 } 2676 spin_unlock_irqrestore(&zone->lru_lock, flags); 2677 2678 ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL); 2679 if (ret == -ENOMEM) 2680 break; 2681 2682 if (ret == -EBUSY || ret == -EINVAL) { 2683 /* found lock contention or "pc" is obsolete. */ 2684 busy = pc; 2685 cond_resched(); 2686 } else 2687 busy = NULL; 2688 } 2689 2690 if (!ret && !list_empty(list)) 2691 return -EBUSY; 2692 return ret; 2693} 2694 2695/* 2696 * make mem_cgroup's charge to be 0 if there is no task. 2697 * This enables deleting this mem_cgroup. 2698 */ 2699static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all) 2700{ 2701 int ret; 2702 int node, zid, shrink; 2703 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 2704 struct cgroup *cgrp = mem->css.cgroup; 2705 2706 css_get(&mem->css); 2707 2708 shrink = 0; 2709 /* should free all ? */ 2710 if (free_all) 2711 goto try_to_free; 2712move_account: 2713 do { 2714 ret = -EBUSY; 2715 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) 2716 goto out; 2717 ret = -EINTR; 2718 if (signal_pending(current)) 2719 goto out; 2720 /* This is for making all *used* pages to be on LRU. */ 2721 lru_add_drain_all(); 2722 drain_all_stock_sync(); 2723 ret = 0; 2724 for_each_node_state(node, N_HIGH_MEMORY) { 2725 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { 2726 enum lru_list l; 2727 for_each_lru(l) { 2728 ret = mem_cgroup_force_empty_list(mem, 2729 node, zid, l); 2730 if (ret) 2731 break; 2732 } 2733 } 2734 if (ret) 2735 break; 2736 } 2737 /* it seems parent cgroup doesn't have enough mem */ 2738 if (ret == -ENOMEM) 2739 goto try_to_free; 2740 cond_resched(); 2741 /* "ret" should also be checked to ensure all lists are empty. */ 2742 } while (mem->res.usage > 0 || ret); 2743out: 2744 css_put(&mem->css); 2745 return ret; 2746 2747try_to_free: 2748 /* returns EBUSY if there is a task or if we come here twice. 
*/ 2749 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) { 2750 ret = -EBUSY; 2751 goto out; 2752 } 2753 /* we call try-to-free pages for make this cgroup empty */ 2754 lru_add_drain_all(); 2755 /* try to free all pages in this cgroup */ 2756 shrink = 1; 2757 while (nr_retries && mem->res.usage > 0) { 2758 int progress; 2759 2760 if (signal_pending(current)) { 2761 ret = -EINTR; 2762 goto out; 2763 } 2764 progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL, 2765 false, get_swappiness(mem)); 2766 if (!progress) { 2767 nr_retries--; 2768 /* maybe some writeback is necessary */ 2769 congestion_wait(BLK_RW_ASYNC, HZ/10); 2770 } 2771 2772 } 2773 lru_add_drain(); 2774 /* try move_account...there may be some *locked* pages. */ 2775 goto move_account; 2776} 2777 2778int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) 2779{ 2780 return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true); 2781} 2782 2783 2784static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft) 2785{ 2786 return mem_cgroup_from_cont(cont)->use_hierarchy; 2787} 2788 2789static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, 2790 u64 val) 2791{ 2792 int retval = 0; 2793 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 2794 struct cgroup *parent = cont->parent; 2795 struct mem_cgroup *parent_mem = NULL; 2796 2797 if (parent) 2798 parent_mem = mem_cgroup_from_cont(parent); 2799 2800 cgroup_lock(); 2801 /* 2802 * If parent's use_hierarchy is set, we can't make any modifications 2803 * in the child subtrees. If it is unset, then the change can 2804 * occur, provided the current cgroup has no children. 2805 * 2806 * For the root cgroup, parent_mem is NULL, we allow value to be 2807 * set if there are no children. 
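
/*
 * Illustrative user-space sketch of the rule enforced by
 * mem_cgroup_hierarchy_write(): use_hierarchy can only be changed while the
 * group has no children (otherwise -EBUSY), and cannot be changed at all
 * once a hierarchical parent has it set (-EINVAL). Paths are assumptions
 * about the mount point; write_limit() is the hypothetical helper from the
 * resize sketch above.
 */
#include <sys/stat.h>
#include <sys/types.h>

int make_hierarchical_parent(void)
{
	if (mkdir("/cgroups/memory/parent", 0755))
		return -1;
	/* must happen before any child is created */
	if (write_limit("/cgroups/memory/parent/memory.use_hierarchy", 1))
		return -1;
	/* children created from now on inherit use_hierarchy at create() */
	return mkdir("/cgroups/memory/parent/child", 0755);
}
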
2808 */ 2809 if ((!parent_mem || !parent_mem->use_hierarchy) && 2810 (val == 1 || val == 0)) { 2811 if (list_empty(&cont->children)) 2812 mem->use_hierarchy = val; 2813 else 2814 retval = -EBUSY; 2815 } else 2816 retval = -EINVAL; 2817 cgroup_unlock(); 2818 2819 return retval; 2820} 2821 2822struct mem_cgroup_idx_data { 2823 s64 val; 2824 enum mem_cgroup_stat_index idx; 2825}; 2826 2827static int 2828mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data) 2829{ 2830 struct mem_cgroup_idx_data *d = data; 2831 d->val += mem_cgroup_read_stat(mem, d->idx); 2832 return 0; 2833} 2834 2835static void 2836mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem, 2837 enum mem_cgroup_stat_index idx, s64 *val) 2838{ 2839 struct mem_cgroup_idx_data d; 2840 d.idx = idx; 2841 d.val = 0; 2842 mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat); 2843 *val = d.val; 2844} 2845 2846static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap) 2847{ 2848 u64 idx_val, val; 2849 2850 if (!mem_cgroup_is_root(mem)) { 2851 if (!swap) 2852 return res_counter_read_u64(&mem->res, RES_USAGE); 2853 else 2854 return res_counter_read_u64(&mem->memsw, RES_USAGE); 2855 } 2856 2857 mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE, &idx_val); 2858 val = idx_val; 2859 mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS, &idx_val); 2860 val += idx_val; 2861 2862 if (swap) { 2863 mem_cgroup_get_recursive_idx_stat(mem, 2864 MEM_CGROUP_STAT_SWAPOUT, &idx_val); 2865 val += idx_val; 2866 } 2867 2868 return val << PAGE_SHIFT; 2869} 2870 2871static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) 2872{ 2873 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 2874 u64 val; 2875 int type, name; 2876 2877 type = MEMFILE_TYPE(cft->private); 2878 name = MEMFILE_ATTR(cft->private); 2879 switch (type) { 2880 case _MEM: 2881 if (name == RES_USAGE) 2882 val = mem_cgroup_usage(mem, false); 2883 else 2884 val = res_counter_read_u64(&mem->res, name); 2885 break; 2886 case _MEMSWAP: 2887 if (name == RES_USAGE) 2888 val = mem_cgroup_usage(mem, true); 2889 else 2890 val = res_counter_read_u64(&mem->memsw, name); 2891 break; 2892 default: 2893 BUG(); 2894 break; 2895 } 2896 return val; 2897} 2898/* 2899 * The user of this function is... 2900 * RES_LIMIT. 2901 */ 2902static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, 2903 const char *buffer) 2904{ 2905 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 2906 int type, name; 2907 unsigned long long val; 2908 int ret; 2909 2910 type = MEMFILE_TYPE(cft->private); 2911 name = MEMFILE_ATTR(cft->private); 2912 switch (name) { 2913 case RES_LIMIT: 2914 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 2915 ret = -EINVAL; 2916 break; 2917 } 2918 /* This function does all necessary parse...reuse it */ 2919 ret = res_counter_memparse_write_strategy(buffer, &val); 2920 if (ret) 2921 break; 2922 if (type == _MEM) 2923 ret = mem_cgroup_resize_limit(memcg, val); 2924 else 2925 ret = mem_cgroup_resize_memsw_limit(memcg, val); 2926 break; 2927 case RES_SOFT_LIMIT: 2928 ret = res_counter_memparse_write_strategy(buffer, &val); 2929 if (ret) 2930 break; 2931 /* 2932 * For memsw, soft limits are hard to implement in terms 2933 * of semantics, for now, we support soft limits for 2934 * control without swap 2935 */ 2936 if (type == _MEM) 2937 ret = res_counter_set_soft_limit(&memcg->res, val); 2938 else 2939 ret = -EINVAL; 2940 break; 2941 default: 2942 ret = -EINVAL; /* should be BUG() ? 
*/ 2943 break; 2944 } 2945 return ret; 2946} 2947 2948static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, 2949 unsigned long long *mem_limit, unsigned long long *memsw_limit) 2950{ 2951 struct cgroup *cgroup; 2952 unsigned long long min_limit, min_memsw_limit, tmp; 2953 2954 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT); 2955 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 2956 cgroup = memcg->css.cgroup; 2957 if (!memcg->use_hierarchy) 2958 goto out; 2959 2960 while (cgroup->parent) { 2961 cgroup = cgroup->parent; 2962 memcg = mem_cgroup_from_cont(cgroup); 2963 if (!memcg->use_hierarchy) 2964 break; 2965 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT); 2966 min_limit = min(min_limit, tmp); 2967 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 2968 min_memsw_limit = min(min_memsw_limit, tmp); 2969 } 2970out: 2971 *mem_limit = min_limit; 2972 *memsw_limit = min_memsw_limit; 2973 return; 2974} 2975 2976static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 2977{ 2978 struct mem_cgroup *mem; 2979 int type, name; 2980 2981 mem = mem_cgroup_from_cont(cont); 2982 type = MEMFILE_TYPE(event); 2983 name = MEMFILE_ATTR(event); 2984 switch (name) { 2985 case RES_MAX_USAGE: 2986 if (type == _MEM) 2987 res_counter_reset_max(&mem->res); 2988 else 2989 res_counter_reset_max(&mem->memsw); 2990 break; 2991 case RES_FAILCNT: 2992 if (type == _MEM) 2993 res_counter_reset_failcnt(&mem->res); 2994 else 2995 res_counter_reset_failcnt(&mem->memsw); 2996 break; 2997 } 2998 2999 return 0; 3000} 3001 3002static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp, 3003 struct cftype *cft) 3004{ 3005 return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate; 3006} 3007 3008#ifdef CONFIG_MMU 3009static int mem_cgroup_move_charge_write(struct cgroup *cgrp, 3010 struct cftype *cft, u64 val) 3011{ 3012 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 3013 3014 if (val >= (1 << NR_MOVE_TYPE)) 3015 return -EINVAL; 3016 /* 3017 * We check this value several times in both in can_attach() and 3018 * attach(), so we need cgroup lock to prevent this value from being 3019 * inconsistent. 
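
/*
 * Illustrative user-space sketch for the trigger files wired to
 * mem_cgroup_reset() above: writing to max_usage_in_bytes or failcnt resets
 * the corresponding res_counter watermark or failure counter. The path is
 * an assumption; write_limit() is the hypothetical helper from the resize
 * sketch earlier.
 */
void clear_watermarks(void)
{
	/* RES_MAX_USAGE: forget the recorded high-water mark */
	write_limit("/cgroups/memory/grp/memory.max_usage_in_bytes", 0);
	/* RES_FAILCNT: forget how often the limit was hit */
	write_limit("/cgroups/memory/grp/memory.failcnt", 0);
}
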
3020 */ 3021 cgroup_lock(); 3022 mem->move_charge_at_immigrate = val; 3023 cgroup_unlock(); 3024 3025 return 0; 3026} 3027#else 3028static int mem_cgroup_move_charge_write(struct cgroup *cgrp, 3029 struct cftype *cft, u64 val) 3030{ 3031 return -ENOSYS; 3032} 3033#endif 3034 3035 3036/* For read statistics */ 3037enum { 3038 MCS_CACHE, 3039 MCS_RSS, 3040 MCS_FILE_MAPPED, 3041 MCS_PGPGIN, 3042 MCS_PGPGOUT, 3043 MCS_SWAP, 3044 MCS_INACTIVE_ANON, 3045 MCS_ACTIVE_ANON, 3046 MCS_INACTIVE_FILE, 3047 MCS_ACTIVE_FILE, 3048 MCS_UNEVICTABLE, 3049 NR_MCS_STAT, 3050}; 3051 3052struct mcs_total_stat { 3053 s64 stat[NR_MCS_STAT]; 3054}; 3055 3056struct { 3057 char *local_name; 3058 char *total_name; 3059} memcg_stat_strings[NR_MCS_STAT] = { 3060 {"cache", "total_cache"}, 3061 {"rss", "total_rss"}, 3062 {"mapped_file", "total_mapped_file"}, 3063 {"pgpgin", "total_pgpgin"}, 3064 {"pgpgout", "total_pgpgout"}, 3065 {"swap", "total_swap"}, 3066 {"inactive_anon", "total_inactive_anon"}, 3067 {"active_anon", "total_active_anon"}, 3068 {"inactive_file", "total_inactive_file"}, 3069 {"active_file", "total_active_file"}, 3070 {"unevictable", "total_unevictable"} 3071}; 3072 3073 3074static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data) 3075{ 3076 struct mcs_total_stat *s = data; 3077 s64 val; 3078 3079 /* per cpu stat */ 3080 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); 3081 s->stat[MCS_CACHE] += val * PAGE_SIZE; 3082 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); 3083 s->stat[MCS_RSS] += val * PAGE_SIZE; 3084 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED); 3085 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; 3086 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT); 3087 s->stat[MCS_PGPGIN] += val; 3088 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT); 3089 s->stat[MCS_PGPGOUT] += val; 3090 if (do_swap_account) { 3091 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); 3092 s->stat[MCS_SWAP] += val * PAGE_SIZE; 3093 } 3094 3095 /* per zone stat */ 3096 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); 3097 s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE; 3098 val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON); 3099 s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE; 3100 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE); 3101 s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE; 3102 val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE); 3103 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; 3104 val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE); 3105 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; 3106 return 0; 3107} 3108 3109static void 3110mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) 3111{ 3112 mem_cgroup_walk_tree(mem, s, mem_cgroup_get_local_stat); 3113} 3114 3115static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, 3116 struct cgroup_map_cb *cb) 3117{ 3118 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); 3119 struct mcs_total_stat mystat; 3120 int i; 3121 3122 memset(&mystat, 0, sizeof(mystat)); 3123 mem_cgroup_get_local_stat(mem_cont, &mystat); 3124 3125 for (i = 0; i < NR_MCS_STAT; i++) { 3126 if (i == MCS_SWAP && !do_swap_account) 3127 continue; 3128 cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]); 3129 } 3130 3131 /* Hierarchical information */ 3132 { 3133 unsigned long long limit, memsw_limit; 3134 memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit); 3135 cb->fill(cb, "hierarchical_memory_limit", limit); 3136 if 
(do_swap_account) 3137 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit); 3138 } 3139 3140 memset(&mystat, 0, sizeof(mystat)); 3141 mem_cgroup_get_total_stat(mem_cont, &mystat); 3142 for (i = 0; i < NR_MCS_STAT; i++) { 3143 if (i == MCS_SWAP && !do_swap_account) 3144 continue; 3145 cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]); 3146 } 3147 3148#ifdef CONFIG_DEBUG_VM 3149 cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); 3150 3151 { 3152 int nid, zid; 3153 struct mem_cgroup_per_zone *mz; 3154 unsigned long recent_rotated[2] = {0, 0}; 3155 unsigned long recent_scanned[2] = {0, 0}; 3156 3157 for_each_online_node(nid) 3158 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 3159 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); 3160 3161 recent_rotated[0] += 3162 mz->reclaim_stat.recent_rotated[0]; 3163 recent_rotated[1] += 3164 mz->reclaim_stat.recent_rotated[1]; 3165 recent_scanned[0] += 3166 mz->reclaim_stat.recent_scanned[0]; 3167 recent_scanned[1] += 3168 mz->reclaim_stat.recent_scanned[1]; 3169 } 3170 cb->fill(cb, "recent_rotated_anon", recent_rotated[0]); 3171 cb->fill(cb, "recent_rotated_file", recent_rotated[1]); 3172 cb->fill(cb, "recent_scanned_anon", recent_scanned[0]); 3173 cb->fill(cb, "recent_scanned_file", recent_scanned[1]); 3174 } 3175#endif 3176 3177 return 0; 3178} 3179 3180static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft) 3181{ 3182 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 3183 3184 return get_swappiness(memcg); 3185} 3186 3187static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, 3188 u64 val) 3189{ 3190 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 3191 struct mem_cgroup *parent; 3192 3193 if (val > 100) 3194 return -EINVAL; 3195 3196 if (cgrp->parent == NULL) 3197 return -EINVAL; 3198 3199 parent = mem_cgroup_from_cont(cgrp->parent); 3200 3201 cgroup_lock(); 3202 3203 /* If under hierarchy, only empty-root can set this value */ 3204 if ((parent->use_hierarchy) || 3205 (memcg->use_hierarchy && !list_empty(&cgrp->children))) { 3206 cgroup_unlock(); 3207 return -EINVAL; 3208 } 3209 3210 spin_lock(&memcg->reclaim_param_lock); 3211 memcg->swappiness = val; 3212 spin_unlock(&memcg->reclaim_param_lock); 3213 3214 cgroup_unlock(); 3215 3216 return 0; 3217} 3218 3219static bool mem_cgroup_threshold_check(struct mem_cgroup *mem) 3220{ 3221 bool ret = false; 3222 s64 val; 3223 3224 val = this_cpu_read(mem->stat->count[MEM_CGROUP_STAT_THRESHOLDS]); 3225 if (unlikely(val < 0)) { 3226 this_cpu_write(mem->stat->count[MEM_CGROUP_STAT_THRESHOLDS], 3227 THRESHOLDS_EVENTS_THRESH); 3228 ret = true; 3229 } 3230 return ret; 3231} 3232 3233static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) 3234{ 3235 struct mem_cgroup_threshold_ary *t; 3236 u64 usage; 3237 int i; 3238 3239 rcu_read_lock(); 3240 if (!swap) 3241 t = rcu_dereference(memcg->thresholds); 3242 else 3243 t = rcu_dereference(memcg->memsw_thresholds); 3244 3245 if (!t) 3246 goto unlock; 3247 3248 usage = mem_cgroup_usage(memcg, swap); 3249 3250 /* 3251 * current_threshold points to threshold just below usage. 3252 * If it's not true, a threshold was crossed after last 3253 * call of __mem_cgroup_threshold(). 3254 */ 3255 i = atomic_read(&t->current_threshold); 3256 3257 /* 3258 * Iterate backward over array of thresholds starting from 3259 * current_threshold and check if a threshold is crossed. 3260 * If none of thresholds below usage is crossed, we read 3261 * only one element of the array here. 
3262 */ 3263 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) 3264 eventfd_signal(t->entries[i].eventfd, 1); 3265 3266 /* i = current_threshold + 1 */ 3267 i++; 3268 3269 /* 3270 * Iterate forward over array of thresholds starting from 3271 * current_threshold+1 and check if a threshold is crossed. 3272 * If none of thresholds above usage is crossed, we read 3273 * only one element of the array here. 3274 */ 3275 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) 3276 eventfd_signal(t->entries[i].eventfd, 1); 3277 3278 /* Update current_threshold */ 3279 atomic_set(&t->current_threshold, i - 1); 3280unlock: 3281 rcu_read_unlock(); 3282} 3283 3284static void mem_cgroup_threshold(struct mem_cgroup *memcg) 3285{ 3286 __mem_cgroup_threshold(memcg, false); 3287 if (do_swap_account) 3288 __mem_cgroup_threshold(memcg, true); 3289} 3290 3291static int compare_thresholds(const void *a, const void *b) 3292{ 3293 const struct mem_cgroup_threshold *_a = a; 3294 const struct mem_cgroup_threshold *_b = b; 3295 3296 return _a->threshold - _b->threshold; 3297} 3298 3299static int mem_cgroup_register_event(struct cgroup *cgrp, struct cftype *cft, 3300 struct eventfd_ctx *eventfd, const char *args) 3301{ 3302 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 3303 struct mem_cgroup_threshold_ary *thresholds, *thresholds_new; 3304 int type = MEMFILE_TYPE(cft->private); 3305 u64 threshold, usage; 3306 int size; 3307 int i, ret; 3308 3309 ret = res_counter_memparse_write_strategy(args, &threshold); 3310 if (ret) 3311 return ret; 3312 3313 mutex_lock(&memcg->thresholds_lock); 3314 if (type == _MEM) 3315 thresholds = memcg->thresholds; 3316 else if (type == _MEMSWAP) 3317 thresholds = memcg->memsw_thresholds; 3318 else 3319 BUG(); 3320 3321 usage = mem_cgroup_usage(memcg, type == _MEMSWAP); 3322 3323 /* Check if a threshold crossed before adding a new one */ 3324 if (thresholds) 3325 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 3326 3327 if (thresholds) 3328 size = thresholds->size + 1; 3329 else 3330 size = 1; 3331 3332 /* Allocate memory for new array of thresholds */ 3333 thresholds_new = kmalloc(sizeof(*thresholds_new) + 3334 size * sizeof(struct mem_cgroup_threshold), 3335 GFP_KERNEL); 3336 if (!thresholds_new) { 3337 ret = -ENOMEM; 3338 goto unlock; 3339 } 3340 thresholds_new->size = size; 3341 3342 /* Copy thresholds (if any) to new array */ 3343 if (thresholds) 3344 memcpy(thresholds_new->entries, thresholds->entries, 3345 thresholds->size * 3346 sizeof(struct mem_cgroup_threshold)); 3347 /* Add new threshold */ 3348 thresholds_new->entries[size - 1].eventfd = eventfd; 3349 thresholds_new->entries[size - 1].threshold = threshold; 3350 3351 /* Sort thresholds. Registering of new threshold isn't time-critical */ 3352 sort(thresholds_new->entries, size, 3353 sizeof(struct mem_cgroup_threshold), 3354 compare_thresholds, NULL); 3355 3356 /* Find current threshold */ 3357 atomic_set(&thresholds_new->current_threshold, -1); 3358 for (i = 0; i < size; i++) { 3359 if (thresholds_new->entries[i].threshold < usage) { 3360 /* 3361 * thresholds_new->current_threshold will not be used 3362 * until rcu_assign_pointer(), so it's safe to increment 3363 * it here. 
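
/*
 * Illustrative user-space sketch of how the register_event/unregister_event
 * callbacks get used: the cgroup core's cgroup.event_control file takes
 * "<eventfd> <fd of memory.usage_in_bytes> <threshold>", and
 * __mem_cgroup_threshold() then signals the eventfd whenever usage crosses
 * the registered value in either direction. Paths are assumptions and
 * error handling is trimmed.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/eventfd.h>

int wait_for_threshold(unsigned long long threshold)
{
	char cmd[64];
	uint64_t ticks;
	int efd = eventfd(0, 0);
	int ufd = open("/cgroups/memory/grp/memory.usage_in_bytes", O_RDONLY);
	int cfd = open("/cgroups/memory/grp/cgroup.event_control", O_WRONLY);

	snprintf(cmd, sizeof(cmd), "%d %d %llu", efd, ufd, threshold);
	write(cfd, cmd, strlen(cmd));	/* registers the threshold */

	read(efd, &ticks, sizeof(ticks));	/* blocks until a crossing */
	return 0;
}
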
3364 */ 3365 atomic_inc(&thresholds_new->current_threshold); 3366 } 3367 } 3368 3369 /* 3370 * We need to increment refcnt to be sure that all thresholds 3371 * will be unregistered before calling __mem_cgroup_free() 3372 */ 3373 mem_cgroup_get(memcg); 3374 3375 if (type == _MEM) 3376 rcu_assign_pointer(memcg->thresholds, thresholds_new); 3377 else 3378 rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new); 3379 3380 /* To be sure that nobody uses thresholds before freeing it */ 3381 synchronize_rcu(); 3382 3383 kfree(thresholds); 3384unlock: 3385 mutex_unlock(&memcg->thresholds_lock); 3386 3387 return ret; 3388} 3389 3390static int mem_cgroup_unregister_event(struct cgroup *cgrp, struct cftype *cft, 3391 struct eventfd_ctx *eventfd) 3392{ 3393 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 3394 struct mem_cgroup_threshold_ary *thresholds, *thresholds_new; 3395 int type = MEMFILE_TYPE(cft->private); 3396 u64 usage; 3397 int size = 0; 3398 int i, j, ret; 3399 3400 mutex_lock(&memcg->thresholds_lock); 3401 if (type == _MEM) 3402 thresholds = memcg->thresholds; 3403 else if (type == _MEMSWAP) 3404 thresholds = memcg->memsw_thresholds; 3405 else 3406 BUG(); 3407 3408 /* 3409 * Something went wrong if we trying to unregister a threshold 3410 * if we don't have thresholds 3411 */ 3412 BUG_ON(!thresholds); 3413 3414 usage = mem_cgroup_usage(memcg, type == _MEMSWAP); 3415 3416 /* Check if a threshold crossed before removing */ 3417 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 3418 3419 /* Calculate new number of threshold */ 3420 for (i = 0; i < thresholds->size; i++) { 3421 if (thresholds->entries[i].eventfd != eventfd) 3422 size++; 3423 } 3424 3425 /* Set thresholds array to NULL if we don't have thresholds */ 3426 if (!size) { 3427 thresholds_new = NULL; 3428 goto assign; 3429 } 3430 3431 /* Allocate memory for new array of thresholds */ 3432 thresholds_new = kmalloc(sizeof(*thresholds_new) + 3433 size * sizeof(struct mem_cgroup_threshold), 3434 GFP_KERNEL); 3435 if (!thresholds_new) { 3436 ret = -ENOMEM; 3437 goto unlock; 3438 } 3439 thresholds_new->size = size; 3440 3441 /* Copy thresholds and find current threshold */ 3442 atomic_set(&thresholds_new->current_threshold, -1); 3443 for (i = 0, j = 0; i < thresholds->size; i++) { 3444 if (thresholds->entries[i].eventfd == eventfd) 3445 continue; 3446 3447 thresholds_new->entries[j] = thresholds->entries[i]; 3448 if (thresholds_new->entries[j].threshold < usage) { 3449 /* 3450 * thresholds_new->current_threshold will not be used 3451 * until rcu_assign_pointer(), so it's safe to increment 3452 * it here. 
3453 */ 3454 atomic_inc(&thresholds_new->current_threshold); 3455 } 3456 j++; 3457 } 3458 3459assign: 3460 if (type == _MEM) 3461 rcu_assign_pointer(memcg->thresholds, thresholds_new); 3462 else 3463 rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new); 3464 3465 /* To be sure that nobody uses thresholds before freeing it */ 3466 synchronize_rcu(); 3467 3468 for (i = 0; i < thresholds->size - size; i++) 3469 mem_cgroup_put(memcg); 3470 3471 kfree(thresholds); 3472unlock: 3473 mutex_unlock(&memcg->thresholds_lock); 3474 3475 return ret; 3476} 3477 3478static struct cftype mem_cgroup_files[] = { 3479 { 3480 .name = "usage_in_bytes", 3481 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 3482 .read_u64 = mem_cgroup_read, 3483 .register_event = mem_cgroup_register_event, 3484 .unregister_event = mem_cgroup_unregister_event, 3485 }, 3486 { 3487 .name = "max_usage_in_bytes", 3488 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 3489 .trigger = mem_cgroup_reset, 3490 .read_u64 = mem_cgroup_read, 3491 }, 3492 { 3493 .name = "limit_in_bytes", 3494 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 3495 .write_string = mem_cgroup_write, 3496 .read_u64 = mem_cgroup_read, 3497 }, 3498 { 3499 .name = "soft_limit_in_bytes", 3500 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 3501 .write_string = mem_cgroup_write, 3502 .read_u64 = mem_cgroup_read, 3503 }, 3504 { 3505 .name = "failcnt", 3506 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 3507 .trigger = mem_cgroup_reset, 3508 .read_u64 = mem_cgroup_read, 3509 }, 3510 { 3511 .name = "stat", 3512 .read_map = mem_control_stat_show, 3513 }, 3514 { 3515 .name = "force_empty", 3516 .trigger = mem_cgroup_force_empty_write, 3517 }, 3518 { 3519 .name = "use_hierarchy", 3520 .write_u64 = mem_cgroup_hierarchy_write, 3521 .read_u64 = mem_cgroup_hierarchy_read, 3522 }, 3523 { 3524 .name = "swappiness", 3525 .read_u64 = mem_cgroup_swappiness_read, 3526 .write_u64 = mem_cgroup_swappiness_write, 3527 }, 3528 { 3529 .name = "move_charge_at_immigrate", 3530 .read_u64 = mem_cgroup_move_charge_read, 3531 .write_u64 = mem_cgroup_move_charge_write, 3532 }, 3533}; 3534 3535#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 3536static struct cftype memsw_cgroup_files[] = { 3537 { 3538 .name = "memsw.usage_in_bytes", 3539 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 3540 .read_u64 = mem_cgroup_read, 3541 .register_event = mem_cgroup_register_event, 3542 .unregister_event = mem_cgroup_unregister_event, 3543 }, 3544 { 3545 .name = "memsw.max_usage_in_bytes", 3546 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 3547 .trigger = mem_cgroup_reset, 3548 .read_u64 = mem_cgroup_read, 3549 }, 3550 { 3551 .name = "memsw.limit_in_bytes", 3552 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 3553 .write_string = mem_cgroup_write, 3554 .read_u64 = mem_cgroup_read, 3555 }, 3556 { 3557 .name = "memsw.failcnt", 3558 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 3559 .trigger = mem_cgroup_reset, 3560 .read_u64 = mem_cgroup_read, 3561 }, 3562}; 3563 3564static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) 3565{ 3566 if (!do_swap_account) 3567 return 0; 3568 return cgroup_add_files(cont, ss, memsw_cgroup_files, 3569 ARRAY_SIZE(memsw_cgroup_files)); 3570}; 3571#else 3572static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) 3573{ 3574 return 0; 3575} 3576#endif 3577 3578static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) 3579{ 3580 struct mem_cgroup_per_node *pn; 3581 struct mem_cgroup_per_zone *mz; 3582 enum lru_list l; 3583 
int zone, tmp = node; 3584 /* 3585 * This routine is called against possible nodes, 3586 * but it's a BUG to call kmalloc() against an offline node. 3587 * 3588 * TODO: this routine can waste a lot of memory for nodes which will 3589 * never be onlined. It would be better to use a memory hotplug 3590 * callback. 3591 */ 3592 if (!node_state(node, N_NORMAL_MEMORY)) 3593 tmp = -1; 3594 pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp); 3595 if (!pn) 3596 return 1; 3597 3598 mem->info.nodeinfo[node] = pn; 3599 memset(pn, 0, sizeof(*pn)); 3600 3601 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 3602 mz = &pn->zoneinfo[zone]; 3603 for_each_lru(l) 3604 INIT_LIST_HEAD(&mz->lists[l]); 3605 mz->usage_in_excess = 0; 3606 mz->on_tree = false; 3607 mz->mem = mem; 3608 } 3609 return 0; 3610} 3611
3612static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) 3613{ 3614 kfree(mem->info.nodeinfo[node]); 3615} 3616
3617static struct mem_cgroup *mem_cgroup_alloc(void) 3618{ 3619 struct mem_cgroup *mem; 3620 int size = sizeof(struct mem_cgroup); 3621 3622 /* Can be very big if MAX_NUMNODES is very big */ 3623 if (size < PAGE_SIZE) 3624 mem = kmalloc(size, GFP_KERNEL); 3625 else 3626 mem = vmalloc(size); 3627 3628 if (!mem) return NULL; 3629 memset(mem, 0, size); 3630 mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); 3631 if (!mem->stat) { 3632 if (size < PAGE_SIZE) 3633 kfree(mem); 3634 else 3635 vfree(mem); 3636 mem = NULL; 3637 } 3638 return mem; 3639} 3640
3641/* 3642 * When a mem_cgroup is destroyed, references from swap_cgroup can remain 3643 * (scanning them all at force_empty is too costly...) 3644 * 3645 * Instead of clearing all references at force_empty, we remember 3646 * the number of references from swap_cgroup and free the mem_cgroup when 3647 * it goes down to 0. 3648 * 3649 * Removal of the cgroup itself succeeds regardless of refs from swap. 3650 */ 3651 3652static void __mem_cgroup_free(struct mem_cgroup *mem) 3653{ 3654 int node; 3655 3656 mem_cgroup_remove_from_trees(mem); 3657 free_css_id(&mem_cgroup_subsys, &mem->css); 3658 3659 for_each_node_state(node, N_POSSIBLE) 3660 free_mem_cgroup_per_zone_info(mem, node); 3661 3662 free_percpu(mem->stat); 3663 if (sizeof(struct mem_cgroup) < PAGE_SIZE) 3664 kfree(mem); 3665 else 3666 vfree(mem); 3667} 3668
3669static void mem_cgroup_get(struct mem_cgroup *mem) 3670{ 3671 atomic_inc(&mem->refcnt); 3672} 3673 3674static void __mem_cgroup_put(struct mem_cgroup *mem, int count) 3675{ 3676 if (atomic_sub_and_test(count, &mem->refcnt)) { 3677 struct mem_cgroup *parent = parent_mem_cgroup(mem); 3678 __mem_cgroup_free(mem); 3679 if (parent) 3680 mem_cgroup_put(parent); 3681 } 3682} 3683 3684static void mem_cgroup_put(struct mem_cgroup *mem) 3685{ 3686 __mem_cgroup_put(mem, 1); 3687} 3688
3689/* 3690 * Returns the parent mem_cgroup in the memcg hierarchy, with hierarchy enabled.
3691 */ 3692static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem) 3693{ 3694 if (!mem->res.parent) 3695 return NULL; 3696 return mem_cgroup_from_res_counter(mem->res.parent, res); 3697} 3698 3699#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 3700static void __init enable_swap_cgroup(void) 3701{ 3702 if (!mem_cgroup_disabled() && really_do_swap_account) 3703 do_swap_account = 1; 3704} 3705#else 3706static void __init enable_swap_cgroup(void) 3707{ 3708} 3709#endif 3710 3711static int mem_cgroup_soft_limit_tree_init(void) 3712{ 3713 struct mem_cgroup_tree_per_node *rtpn; 3714 struct mem_cgroup_tree_per_zone *rtpz; 3715 int tmp, node, zone; 3716 3717 for_each_node_state(node, N_POSSIBLE) { 3718 tmp = node; 3719 if (!node_state(node, N_NORMAL_MEMORY)) 3720 tmp = -1; 3721 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); 3722 if (!rtpn) 3723 return 1; 3724 3725 soft_limit_tree.rb_tree_per_node[node] = rtpn; 3726 3727 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 3728 rtpz = &rtpn->rb_tree_per_zone[zone]; 3729 rtpz->rb_root = RB_ROOT; 3730 spin_lock_init(&rtpz->lock); 3731 } 3732 } 3733 return 0; 3734} 3735 3736static struct cgroup_subsys_state * __ref 3737mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) 3738{ 3739 struct mem_cgroup *mem, *parent; 3740 long error = -ENOMEM; 3741 int node; 3742 3743 mem = mem_cgroup_alloc(); 3744 if (!mem) 3745 return ERR_PTR(error); 3746 3747 for_each_node_state(node, N_POSSIBLE) 3748 if (alloc_mem_cgroup_per_zone_info(mem, node)) 3749 goto free_out; 3750 3751 /* root ? */ 3752 if (cont->parent == NULL) { 3753 int cpu; 3754 enable_swap_cgroup(); 3755 parent = NULL; 3756 root_mem_cgroup = mem; 3757 if (mem_cgroup_soft_limit_tree_init()) 3758 goto free_out; 3759 for_each_possible_cpu(cpu) { 3760 struct memcg_stock_pcp *stock = 3761 &per_cpu(memcg_stock, cpu); 3762 INIT_WORK(&stock->work, drain_local_stock); 3763 } 3764 hotcpu_notifier(memcg_stock_cpu_callback, 0); 3765 } else { 3766 parent = mem_cgroup_from_cont(cont->parent); 3767 mem->use_hierarchy = parent->use_hierarchy; 3768 } 3769 3770 if (parent && parent->use_hierarchy) { 3771 res_counter_init(&mem->res, &parent->res); 3772 res_counter_init(&mem->memsw, &parent->memsw); 3773 /* 3774 * We increment refcnt of the parent to ensure that we can 3775 * safely access it on res_counter_charge/uncharge. 3776 * This refcnt will be decremented when freeing this 3777 * mem_cgroup(see mem_cgroup_put). 
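
/*
 * Conceptual sketch (a simplified model, not kernel/res_counter.c): linking
 * counters with res_counter_init(&mem->res, &parent->res) in
 * mem_cgroup_create() above is what makes limits hierarchical, because a
 * charge walks the parent chain and can fail against any ancestor's limit.
 */
struct toy_counter {
	unsigned long long usage, limit;
	struct toy_counter *parent;
};

static int toy_charge(struct toy_counter *c, unsigned long long sz)
{
	struct toy_counter *p;

	for (p = c; p; p = p->parent) {
		if (p->usage + sz > p->limit)
			return -ENOMEM;	/* the real code also unwinds the
					 * ancestors it already charged */
		p->usage += sz;
	}
	return 0;
}
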
3778 */ 3779 mem_cgroup_get(parent); 3780 } else { 3781 res_counter_init(&mem->res, NULL); 3782 res_counter_init(&mem->memsw, NULL); 3783 } 3784 mem->last_scanned_child = 0; 3785 spin_lock_init(&mem->reclaim_param_lock); 3786 3787 if (parent) 3788 mem->swappiness = get_swappiness(parent); 3789 atomic_set(&mem->refcnt, 1); 3790 mem->move_charge_at_immigrate = 0; 3791 mutex_init(&mem->thresholds_lock); 3792 return &mem->css; 3793free_out: 3794 __mem_cgroup_free(mem); 3795 root_mem_cgroup = NULL; 3796 return ERR_PTR(error); 3797} 3798 3799static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss, 3800 struct cgroup *cont) 3801{ 3802 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 3803 3804 return mem_cgroup_force_empty(mem, false); 3805} 3806 3807static void mem_cgroup_destroy(struct cgroup_subsys *ss, 3808 struct cgroup *cont) 3809{ 3810 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 3811 3812 mem_cgroup_put(mem); 3813} 3814 3815static int mem_cgroup_populate(struct cgroup_subsys *ss, 3816 struct cgroup *cont) 3817{ 3818 int ret; 3819 3820 ret = cgroup_add_files(cont, ss, mem_cgroup_files, 3821 ARRAY_SIZE(mem_cgroup_files)); 3822 3823 if (!ret) 3824 ret = register_memsw_files(cont, ss); 3825 return ret; 3826} 3827 3828#ifdef CONFIG_MMU 3829/* Handlers for move charge at task migration. */ 3830#define PRECHARGE_COUNT_AT_ONCE 256 3831static int mem_cgroup_do_precharge(unsigned long count) 3832{ 3833 int ret = 0; 3834 int batch_count = PRECHARGE_COUNT_AT_ONCE; 3835 struct mem_cgroup *mem = mc.to; 3836 3837 if (mem_cgroup_is_root(mem)) { 3838 mc.precharge += count; 3839 /* we don't need css_get for root */ 3840 return ret; 3841 } 3842 /* try to charge at once */ 3843 if (count > 1) { 3844 struct res_counter *dummy; 3845 /* 3846 * "mem" cannot be under rmdir() because we've already checked 3847 * by cgroup_lock_live_cgroup() that it is not removed and we 3848 * are still under the same cgroup_mutex. So we can postpone 3849 * css_get(). 
3850 */ 3851 if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy)) 3852 goto one_by_one; 3853 if (do_swap_account && res_counter_charge(&mem->memsw, 3854 PAGE_SIZE * count, &dummy)) { 3855 res_counter_uncharge(&mem->res, PAGE_SIZE * count); 3856 goto one_by_one; 3857 } 3858 mc.precharge += count; 3859 VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags)); 3860 WARN_ON_ONCE(count > INT_MAX); 3861 __css_get(&mem->css, (int)count); 3862 return ret; 3863 } 3864one_by_one: 3865 /* fall back to one by one charge */ 3866 while (count--) { 3867 if (signal_pending(current)) { 3868 ret = -EINTR; 3869 break; 3870 } 3871 if (!batch_count--) { 3872 batch_count = PRECHARGE_COUNT_AT_ONCE; 3873 cond_resched(); 3874 } 3875 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, 3876 false, NULL); 3877 if (ret || !mem) 3878 /* mem_cgroup_clear_mc() will do uncharge later */ 3879 return -ENOMEM; 3880 mc.precharge++; 3881 } 3882 return ret; 3883} 3884#else /* !CONFIG_MMU */ 3885static int mem_cgroup_can_attach(struct cgroup_subsys *ss, 3886 struct cgroup *cgroup, 3887 struct task_struct *p, 3888 bool threadgroup) 3889{ 3890 return 0; 3891} 3892static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, 3893 struct cgroup *cgroup, 3894 struct task_struct *p, 3895 bool threadgroup) 3896{ 3897} 3898static void mem_cgroup_move_task(struct cgroup_subsys *ss, 3899 struct cgroup *cont, 3900 struct cgroup *old_cont, 3901 struct task_struct *p, 3902 bool threadgroup) 3903{ 3904} 3905#endif 3906 3907/** 3908 * is_target_pte_for_mc - check a pte whether it is valid for move charge 3909 * @vma: the vma the pte to be checked belongs 3910 * @addr: the address corresponding to the pte to be checked 3911 * @ptent: the pte to be checked 3912 * @target: the pointer the target page or swap ent will be stored(can be NULL) 3913 * 3914 * Returns 3915 * 0(MC_TARGET_NONE): if the pte is not a target for move charge. 3916 * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for 3917 * move charge. if @target is not NULL, the page is stored in target->page 3918 * with extra refcnt got(Callers should handle it). 3919 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a 3920 * target for charge migration. if @target is not NULL, the entry is stored 3921 * in target->ent. 3922 * 3923 * Called with pte lock held. 3924 */ 3925union mc_target { 3926 struct page *page; 3927 swp_entry_t ent; 3928}; 3929 3930enum mc_target_type { 3931 MC_TARGET_NONE, /* not used */ 3932 MC_TARGET_PAGE, 3933 MC_TARGET_SWAP, 3934}; 3935 3936static int is_target_pte_for_mc(struct vm_area_struct *vma, 3937 unsigned long addr, pte_t ptent, union mc_target *target) 3938{ 3939 struct page *page = NULL; 3940 struct page_cgroup *pc; 3941 int ret = 0; 3942 swp_entry_t ent = { .val = 0 }; 3943 int usage_count = 0; 3944 bool move_anon = test_bit(MOVE_CHARGE_TYPE_ANON, 3945 &mc.to->move_charge_at_immigrate); 3946 3947 if (!pte_present(ptent)) { 3948 /* TODO: handle swap of shmes/tmpfs */ 3949 if (pte_none(ptent) || pte_file(ptent)) 3950 return 0; 3951 else if (is_swap_pte(ptent)) { 3952 ent = pte_to_swp_entry(ptent); 3953 if (!move_anon || non_swap_entry(ent)) 3954 return 0; 3955 usage_count = mem_cgroup_count_swap_user(ent, &page); 3956 } 3957 } else { 3958 page = vm_normal_page(vma, addr, ptent); 3959 if (!page || !page_mapped(page)) 3960 return 0; 3961 /* 3962 * TODO: We don't move charges of file(including shmem/tmpfs) 3963 * pages for now. 
3964 */ 3965 if (!move_anon || !PageAnon(page)) 3966 return 0; 3967 if (!get_page_unless_zero(page)) 3968 return 0; 3969 usage_count = page_mapcount(page); 3970 } 3971 if (usage_count > 1) { 3972 /* 3973 * TODO: We don't move charges of shared(used by multiple 3974 * processes) pages for now. 3975 */ 3976 if (page) 3977 put_page(page); 3978 return 0; 3979 } 3980 if (page) { 3981 pc = lookup_page_cgroup(page); 3982 /* 3983 * Do only loose check w/o page_cgroup lock. 3984 * mem_cgroup_move_account() checks the pc is valid or not under 3985 * the lock. 3986 */ 3987 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { 3988 ret = MC_TARGET_PAGE; 3989 if (target) 3990 target->page = page; 3991 } 3992 if (!ret || !target) 3993 put_page(page); 3994 } 3995 /* throught */ 3996 if (ent.val && do_swap_account && !ret && 3997 css_id(&mc.from->css) == lookup_swap_cgroup(ent)) { 3998 ret = MC_TARGET_SWAP; 3999 if (target) 4000 target->ent = ent; 4001 } 4002 return ret; 4003} 4004 4005static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, 4006 unsigned long addr, unsigned long end, 4007 struct mm_walk *walk) 4008{ 4009 struct vm_area_struct *vma = walk->private; 4010 pte_t *pte; 4011 spinlock_t *ptl; 4012 4013 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 4014 for (; addr != end; pte++, addr += PAGE_SIZE) 4015 if (is_target_pte_for_mc(vma, addr, *pte, NULL)) 4016 mc.precharge++; /* increment precharge temporarily */ 4017 pte_unmap_unlock(pte - 1, ptl); 4018 cond_resched(); 4019 4020 return 0; 4021} 4022 4023static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) 4024{ 4025 unsigned long precharge; 4026 struct vm_area_struct *vma; 4027 4028 down_read(&mm->mmap_sem); 4029 for (vma = mm->mmap; vma; vma = vma->vm_next) { 4030 struct mm_walk mem_cgroup_count_precharge_walk = { 4031 .pmd_entry = mem_cgroup_count_precharge_pte_range, 4032 .mm = mm, 4033 .private = vma, 4034 }; 4035 if (is_vm_hugetlb_page(vma)) 4036 continue; 4037 /* TODO: We don't move charges of shmem/tmpfs pages for now. */ 4038 if (vma->vm_flags & VM_SHARED) 4039 continue; 4040 walk_page_range(vma->vm_start, vma->vm_end, 4041 &mem_cgroup_count_precharge_walk); 4042 } 4043 up_read(&mm->mmap_sem); 4044 4045 precharge = mc.precharge; 4046 mc.precharge = 0; 4047 4048 return precharge; 4049} 4050 4051static int mem_cgroup_precharge_mc(struct mm_struct *mm) 4052{ 4053 return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm)); 4054} 4055 4056static void mem_cgroup_clear_mc(void) 4057{ 4058 /* we must uncharge all the leftover precharges from mc.to */ 4059 if (mc.precharge) { 4060 __mem_cgroup_cancel_charge(mc.to, mc.precharge); 4061 mc.precharge = 0; 4062 } 4063 /* 4064 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so 4065 * we must uncharge here. 4066 */ 4067 if (mc.moved_charge) { 4068 __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); 4069 mc.moved_charge = 0; 4070 } 4071 /* we must fixup refcnts and charges */ 4072 if (mc.moved_swap) { 4073 WARN_ON_ONCE(mc.moved_swap > INT_MAX); 4074 /* uncharge swap account from the old cgroup */ 4075 if (!mem_cgroup_is_root(mc.from)) 4076 res_counter_uncharge(&mc.from->memsw, 4077 PAGE_SIZE * mc.moved_swap); 4078 __mem_cgroup_put(mc.from, mc.moved_swap); 4079 4080 if (!mem_cgroup_is_root(mc.to)) { 4081 /* 4082 * we charged both to->res and to->memsw, so we should 4083 * uncharge to->res. 
4056static void mem_cgroup_clear_mc(void) 4057{ 4058 /* we must uncharge all the leftover precharges from mc.to */ 4059 if (mc.precharge) { 4060 __mem_cgroup_cancel_charge(mc.to, mc.precharge); 4061 mc.precharge = 0; 4062 } 4063 /* 4064 * we didn't uncharge from mc.from in mem_cgroup_move_account(), so 4065 * we must uncharge here. 4066 */ 4067 if (mc.moved_charge) { 4068 __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); 4069 mc.moved_charge = 0; 4070 } 4071 /* we must fix up refcnts and charges */ 4072 if (mc.moved_swap) { 4073 WARN_ON_ONCE(mc.moved_swap > INT_MAX); 4074 /* uncharge swap account from the old cgroup */ 4075 if (!mem_cgroup_is_root(mc.from)) 4076 res_counter_uncharge(&mc.from->memsw, 4077 PAGE_SIZE * mc.moved_swap); 4078 __mem_cgroup_put(mc.from, mc.moved_swap); 4079 4080 if (!mem_cgroup_is_root(mc.to)) { 4081 /* 4082 * we charged both to->res and to->memsw, so we should 4083 * uncharge to->res. 4084 */ 4085 res_counter_uncharge(&mc.to->res, 4086 PAGE_SIZE * mc.moved_swap); 4087 VM_BUG_ON(test_bit(CSS_ROOT, &mc.to->css.flags)); 4088 __css_put(&mc.to->css, mc.moved_swap); 4089 } 4090 /* we've already done mem_cgroup_get(mc.to) */ 4091 4092 mc.moved_swap = 0; 4093 } 4094 mc.from = NULL; 4095 mc.to = NULL; 4096 mc.moving_task = NULL; 4097 wake_up_all(&mc.waitq); 4098} 4099 4100static int mem_cgroup_can_attach(struct cgroup_subsys *ss, 4101 struct cgroup *cgroup, 4102 struct task_struct *p, 4103 bool threadgroup) 4104{ 4105 int ret = 0; 4106 struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup); 4107 4108 if (mem->move_charge_at_immigrate) { 4109 struct mm_struct *mm; 4110 struct mem_cgroup *from = mem_cgroup_from_task(p); 4111 4112 VM_BUG_ON(from == mem); 4113 4114 mm = get_task_mm(p); 4115 if (!mm) 4116 return 0; 4117 /* We move charges only when we move the owner of the mm */ 4118 if (mm->owner == p) { 4119 VM_BUG_ON(mc.from); 4120 VM_BUG_ON(mc.to); 4121 VM_BUG_ON(mc.precharge); 4122 VM_BUG_ON(mc.moved_charge); 4123 VM_BUG_ON(mc.moved_swap); 4124 VM_BUG_ON(mc.moving_task); 4125 mc.from = from; 4126 mc.to = mem; 4127 mc.precharge = 0; 4128 mc.moved_charge = 0; 4129 mc.moved_swap = 0; 4130 mc.moving_task = current; 4131 4132 ret = mem_cgroup_precharge_mc(mm); 4133 if (ret) 4134 mem_cgroup_clear_mc(); 4135 } 4136 mmput(mm); 4137 } 4138 return ret; 4139} 4140 4141static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, 4142 struct cgroup *cgroup, 4143 struct task_struct *p, 4144 bool threadgroup) 4145{ 4146 mem_cgroup_clear_mc(); 4147} 4148
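/*
 * mem_cgroup_can_attach() above precharges everything into the global "mc"
 * state, mem_cgroup_cancel_attach() rolls that state back, and the attach
 * step (mem_cgroup_move_task() further down) consumes it -- every path ends
 * in mem_cgroup_clear_mc().  The userspace sketch below is a minimal model of
 * that prepare/commit/rollback protocol; struct move_state and the function
 * names are illustrative only, not kernel API.
 */
#include <stdio.h>

struct move_state {
	int precharge;			/* reserved but not yet consumed */
	int moved;			/* consumed while moving */
};

static struct move_state mc;		/* one move in flight at a time, as in the kernel */

static void clear_state(void)		/* every path ends here, like mem_cgroup_clear_mc() */
{
	if (mc.precharge)
		printf("returning %d unused precharges\n", mc.precharge);
	mc.precharge = 0;
	mc.moved = 0;
}

static int can_attach(int nr_pages)	/* prepare: reserve everything up front */
{
	mc.precharge = nr_pages;
	return 0;
}

static void cancel_attach(void)		/* abort: undo the reservation */
{
	clear_state();
}

static void attach(int nr_to_move)	/* commit: consume one reserve per page moved */
{
	while (nr_to_move-- && mc.precharge) {
		mc.precharge--;
		mc.moved++;
	}
	clear_state();			/* leftovers are returned here */
}

int main(void)
{
	if (!can_attach(8))
		attach(5);		/* moves 5 pages, returns 3 unused precharges */
	else
		cancel_attach();
	return 0;
}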
4149static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, 4150 unsigned long addr, unsigned long end, 4151 struct mm_walk *walk) 4152{ 4153 int ret = 0; 4154 struct vm_area_struct *vma = walk->private; 4155 pte_t *pte; 4156 spinlock_t *ptl; 4157 4158retry: 4159 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 4160 for (; addr != end; addr += PAGE_SIZE) { 4161 pte_t ptent = *(pte++); 4162 union mc_target target; 4163 int type; 4164 struct page *page; 4165 struct page_cgroup *pc; 4166 swp_entry_t ent; 4167 4168 if (!mc.precharge) 4169 break; 4170 4171 type = is_target_pte_for_mc(vma, addr, ptent, &target); 4172 switch (type) { 4173 case MC_TARGET_PAGE: 4174 page = target.page; 4175 if (isolate_lru_page(page)) 4176 goto put; 4177 pc = lookup_page_cgroup(page); 4178 if (!mem_cgroup_move_account(pc, 4179 mc.from, mc.to, false)) { 4180 mc.precharge--; 4181 /* we uncharge from mc.from later. */ 4182 mc.moved_charge++; 4183 } 4184 putback_lru_page(page); 4185put: /* is_target_pte_for_mc() gets the page */ 4186 put_page(page); 4187 break; 4188 case MC_TARGET_SWAP: 4189 ent = target.ent; 4190 if (!mem_cgroup_move_swap_account(ent, 4191 mc.from, mc.to, false)) { 4192 mc.precharge--; 4193 /* we fix up refcnts and charges later. */ 4194 mc.moved_swap++; 4195 } 4196 break; 4197 default: 4198 break; 4199 } 4200 } 4201 pte_unmap_unlock(pte - 1, ptl); 4202 cond_resched(); 4203 4204 if (addr != end) { 4205 /* 4206 * We have consumed all precharges we got in can_attach(). 4207 * We try charging one by one, but don't do any additional 4208 * charges to mc.to if we have already failed to charge once in 4209 * the attach() phase. 4210 */ 4211 ret = mem_cgroup_do_precharge(1); 4212 if (!ret) 4213 goto retry; 4214 } 4215 4216 return ret; 4217} 4218 4219static void mem_cgroup_move_charge(struct mm_struct *mm) 4220{ 4221 struct vm_area_struct *vma; 4222 4223 lru_add_drain_all(); 4224 down_read(&mm->mmap_sem); 4225 for (vma = mm->mmap; vma; vma = vma->vm_next) { 4226 int ret; 4227 struct mm_walk mem_cgroup_move_charge_walk = { 4228 .pmd_entry = mem_cgroup_move_charge_pte_range, 4229 .mm = mm, 4230 .private = vma, 4231 }; 4232 if (is_vm_hugetlb_page(vma)) 4233 continue; 4234 /* TODO: We don't move charges of shmem/tmpfs pages for now. */ 4235 if (vma->vm_flags & VM_SHARED) 4236 continue; 4237 ret = walk_page_range(vma->vm_start, vma->vm_end, 4238 &mem_cgroup_move_charge_walk); 4239 if (ret) 4240 /* 4241 * this means we have consumed all precharges and failed to 4242 * do an additional charge. Just abandon here. 4243 */ 4244 break; 4245 } 4246 up_read(&mm->mmap_sem); 4247} 4248 4249static void mem_cgroup_move_task(struct cgroup_subsys *ss, 4250 struct cgroup *cont, 4251 struct cgroup *old_cont, 4252 struct task_struct *p, 4253 bool threadgroup) 4254{ 4255 struct mm_struct *mm; 4256 4257 if (!mc.to) 4258 /* no need to move charge */ 4259 return; 4260 4261 mm = get_task_mm(p); 4262 if (mm) { 4263 mem_cgroup_move_charge(mm); 4264 mmput(mm); 4265 } 4266 mem_cgroup_clear_mc(); 4267} 4268 4269struct cgroup_subsys mem_cgroup_subsys = { 4270 .name = "memory", 4271 .subsys_id = mem_cgroup_subsys_id, 4272 .create = mem_cgroup_create, 4273 .pre_destroy = mem_cgroup_pre_destroy, 4274 .destroy = mem_cgroup_destroy, 4275 .populate = mem_cgroup_populate, 4276 .can_attach = mem_cgroup_can_attach, 4277 .cancel_attach = mem_cgroup_cancel_attach, 4278 .attach = mem_cgroup_move_task, 4279 .early_init = 0, 4280 .use_id = 1, 4281}; 4282 4283#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4284 4285static int __init disable_swap_account(char *s) 4286{ 4287 really_do_swap_account = 0; 4288 return 1; 4289} 4290__setup("noswapaccount", disable_swap_account); 4291#endif 4292
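/*
 * The move-charge machinery above is driven entirely from cgroupfs: writing 1
 * (bit 0, MOVE_CHARGE_TYPE_ANON) to memory.move_charge_at_immigrate on the
 * destination cgroup enables it, and writing a pid into that cgroup's "tasks"
 * file is what triggers can_attach()/attach() -- charges only follow the task
 * that owns the mm.  The sketch below assumes the memory controller is
 * mounted at /cgroup/memory and that a cgroup named "dest" already exists;
 * adjust both paths to the actual setup.
 */
#include <stdio.h>
#include <unistd.h>

#define MEMCG "/cgroup/memory/dest"	/* assumed mount point and cgroup name */

static int write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return -1;
	}
	fprintf(f, "%s", val);
	return fclose(f);
}

int main(void)
{
	char pid[16];

	/* 1 == move charges of anonymous pages along with the task */
	if (write_str(MEMCG "/memory.move_charge_at_immigrate", "1"))
		return 1;

	/* moving the mm owner into the cgroup is what starts the move */
	snprintf(pid, sizeof(pid), "%d", (int)getpid());
	return write_str(MEMCG "/tasks", pid) ? 1 : 0;
}

/*
 * If the kernel was booted with noswapaccount (handled by
 * disable_swap_account() above), do_swap_account is 0 and swap entries are
 * never selected as move targets, so only resident anonymous pages follow
 * the task.
 */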