memcontrol.c revision ad4ba375373937817404fd92239ef4cadbded23b
/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * Memory thresholds
 * Copyright (C) 2009 Nokia Corporation
 * Author: Kirill A. Shutemov
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */

#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/limits.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmalloc.h>
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
#include <linux/cpu.h>
#include "internal.h"

#include <asm/uaccess.h>

struct cgroup_subsys mem_cgroup_subsys __read_mostly;
#define MEM_CGROUP_RECLAIM_RETRIES	5
struct mem_cgroup *root_mem_cgroup __read_mostly;

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
int do_swap_account __read_mostly;
static int really_do_swap_account __initdata = 1; /* to remember the boot option */
#else
#define do_swap_account		(0)
#endif

/*
 * The per-memcg event counter is incremented at every pagein/pageout. This
 * counter is used to trigger some periodic events. This is straightforward
 * and better than using jiffies etc. to handle periodic memcg events.
 *
 * These values are used as !((event) & ((1 << (thresh)) - 1))
 */
#define THRESHOLDS_EVENTS_THRESH (7) /* once in 128 */
#define SOFTLIMIT_EVENTS_THRESH (10) /* once in 1024 */
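/*
 * Worked example of the check above: with THRESHOLDS_EVENTS_THRESH == 7,
 * !(event & ((1 << 7) - 1)) == !(event & 127) is true only when the low
 * seven bits of the event counter are zero, i.e. once every 128 events.
 * SOFTLIMIT_EVENTS_THRESH == 10 likewise fires once every 1024 events.
 */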
/*
 * Statistics for memory cgroup.
 */
enum mem_cgroup_stat_index {
	/*
	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
	 */
	MEM_CGROUP_STAT_CACHE,		/* # of pages charged as cache */
	MEM_CGROUP_STAT_RSS,		/* # of pages charged as anon rss */
	MEM_CGROUP_STAT_FILE_MAPPED,	/* # of pages charged as file rss */
	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */
	MEM_CGROUP_STAT_SWAPOUT,	/* # of pages swapped out */
	MEM_CGROUP_EVENTS,		/* incremented at every pagein/pageout */

	MEM_CGROUP_STAT_NSTATS,
};

struct mem_cgroup_stat_cpu {
	s64 count[MEM_CGROUP_STAT_NSTATS];
};

/*
 * per-zone information in the memory controller.
 */
struct mem_cgroup_per_zone {
	/*
	 * spinlock to protect the per-cgroup LRU
	 */
	struct list_head	lists[NR_LRU_LISTS];
	unsigned long		count[NR_LRU_LISTS];

	struct zone_reclaim_stat reclaim_stat;
	struct rb_node		tree_node;	/* RB tree node */
	unsigned long long	usage_in_excess;/* Set to the value by which */
						/* the soft limit is exceeded */
	bool			on_tree;
	struct mem_cgroup	*mem;		/* Back pointer, we cannot */
						/* use container_of */
};
/* Macro for accessing counter */
#define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])

struct mem_cgroup_per_node {
	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};

struct mem_cgroup_lru_info {
	struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
};

/*
 * Cgroups above their limits are maintained in an RB-tree, independent of
 * their hierarchy representation
 */

struct mem_cgroup_tree_per_zone {
	struct rb_root rb_root;
	spinlock_t lock;
};

struct mem_cgroup_tree_per_node {
	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
};

struct mem_cgroup_tree {
	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};

static struct mem_cgroup_tree soft_limit_tree __read_mostly;

struct mem_cgroup_threshold {
	struct eventfd_ctx *eventfd;
	u64 threshold;
};

struct mem_cgroup_threshold_ary {
	/* An array index points to the threshold just below usage. */
	atomic_t current_threshold;
	/* Size of entries[] */
	unsigned int size;
	/* Array of thresholds */
	struct mem_cgroup_threshold entries[0];
};

static void mem_cgroup_threshold(struct mem_cgroup *mem);
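/*
 * For illustration (hypothetical values): if entries[] holds thresholds
 * {4M, 8M, 16M} in ascending order and usage is 10M, current_threshold
 * is 1, the index of the 8M entry, i.e. the threshold just below usage.
 */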
/*
 * The memory controller data structure. The memory controller controls both
 * page cache and RSS per cgroup. We would eventually like to provide
 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
 * to help the administrator determine what knobs to tune.
 *
 * TODO: Add a water mark for the memory controller. Reclaim will begin when
 * we hit the water mark. Maybe even add a low water mark, such that
 * no reclaim occurs from a cgroup at its low water mark; this is
 * a feature that will be implemented much later in the future.
 */
struct mem_cgroup {
	struct cgroup_subsys_state css;
	/*
	 * the counter to account for memory usage
	 */
	struct res_counter res;
	/*
	 * the counter to account for mem+swap usage.
	 */
	struct res_counter memsw;
	/*
	 * Per-cgroup active and inactive lists, similar to the
	 * per-zone LRU lists.
	 */
	struct mem_cgroup_lru_info info;

	/*
	 * protects reclaim-related members.
	 */
	spinlock_t reclaim_param_lock;

	int	prev_priority;	/* for recording reclaim priority */

	/*
	 * While reclaiming in a hierarchy, we cache the last child we
	 * reclaimed from.
	 */
	int last_scanned_child;
	/*
	 * Should the accounting and control be hierarchical, per subtree?
	 */
	bool use_hierarchy;
	atomic_t	oom_lock;
	atomic_t	refcnt;

	unsigned int	swappiness;

	/* set when res.limit == memsw.limit */
	bool		memsw_is_minimum;

	/* protect arrays of thresholds */
	struct mutex thresholds_lock;

	/* thresholds for memory usage. RCU-protected */
	struct mem_cgroup_threshold_ary *thresholds;

	/* thresholds for mem+swap usage. RCU-protected */
	struct mem_cgroup_threshold_ary *memsw_thresholds;

	/*
	 * Should we move charges of a task when a task is moved into this
	 * mem_cgroup? And what type of charges should we move?
	 */
	unsigned long	move_charge_at_immigrate;

	/*
	 * percpu counter.
	 */
	struct mem_cgroup_stat_cpu *stat;
};

/* Data for moving charges at task migration. */
/*
 * Types of charges to be moved. "move_charge_at_immigrate" is treated as a
 * left-shifted bitmap of these types.
 */
enum move_type {
	MOVE_CHARGE_TYPE_ANON,	/* private anonymous page and swap of it */
	NR_MOVE_TYPE,
};

/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
	struct mem_cgroup *from;
	struct mem_cgroup *to;
	unsigned long precharge;
	unsigned long moved_charge;
	unsigned long moved_swap;
	struct task_struct *moving_task;	/* a task moving charges */
	wait_queue_head_t waitq;		/* a waitq for other context */
} mc = {
	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};
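/*
 * Illustrative sketch (not a helper defined in this file): because
 * move_charge_at_immigrate is a left-shifted bitmap of move_type, a
 * type bit would be tested as
 *
 *	if (mem->move_charge_at_immigrate & (1 << MOVE_CHARGE_TYPE_ANON))
 *		... move private anon pages and their swap ...
 */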
/*
 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
 * limit reclaim to prevent infinite loops, if they ever occur.
 */
#define	MEM_CGROUP_MAX_RECLAIM_LOOPS		(100)
#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	(2)

enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
	MEM_CGROUP_CHARGE_TYPE_MAPPED,
	MEM_CGROUP_CHARGE_TYPE_SHMEM,	/* used by page migration of shmem */
	MEM_CGROUP_CHARGE_TYPE_FORCE,	/* used by force_empty */
	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
	MEM_CGROUP_CHARGE_TYPE_DROP,	/* a page was unused swap cache */
	NR_CHARGE_TYPE,
};

/* used only in this file (for readability) */
#define PCGF_CACHE	(1UL << PCG_CACHE)
#define PCGF_USED	(1UL << PCG_USED)
#define PCGF_LOCK	(1UL << PCG_LOCK)
/* Not used, but added here for completeness */
#define PCGF_ACCT	(1UL << PCG_ACCT)

/* for encoding cft->private value on file */
#define _MEM			(0)
#define _MEMSWAP		(1)
#define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
#define MEMFILE_TYPE(val)	(((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)
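/*
 * For illustration: MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT) packs the
 * counter type into the high 16 bits and the resource attribute into
 * the low 16 bits of cft->private; MEMFILE_TYPE() then recovers
 * _MEMSWAP and MEMFILE_ATTR() recovers RES_LIMIT.
 */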
/*
 * Reclaim flags for mem_cgroup_hierarchical_reclaim
 */
#define MEM_CGROUP_RECLAIM_NOSWAP_BIT	0x0
#define MEM_CGROUP_RECLAIM_NOSWAP	(1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
#define MEM_CGROUP_RECLAIM_SHRINK_BIT	0x1
#define MEM_CGROUP_RECLAIM_SHRINK	(1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
#define MEM_CGROUP_RECLAIM_SOFT_BIT	0x2
#define MEM_CGROUP_RECLAIM_SOFT		(1 << MEM_CGROUP_RECLAIM_SOFT_BIT)

static void mem_cgroup_get(struct mem_cgroup *mem);
static void mem_cgroup_put(struct mem_cgroup *mem);
static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
static void drain_all_stock_async(void);

static struct mem_cgroup_per_zone *
mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
{
	return &mem->info.nodeinfo[nid]->zoneinfo[zid];
}

struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem)
{
	return &mem->css;
}

static struct mem_cgroup_per_zone *
page_cgroup_zoneinfo(struct page_cgroup *pc)
{
	struct mem_cgroup *mem = pc->mem_cgroup;
	int nid = page_cgroup_nid(pc);
	int zid = page_cgroup_zid(pc);

	if (!mem)
		return NULL;

	return mem_cgroup_zoneinfo(mem, nid, zid);
}

static struct mem_cgroup_tree_per_zone *
soft_limit_tree_node_zone(int nid, int zid)
{
	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
}

static struct mem_cgroup_tree_per_zone *
soft_limit_tree_from_page(struct page *page)
{
	int nid = page_to_nid(page);
	int zid = page_zonenum(page);

	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
}

static void
__mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
				struct mem_cgroup_per_zone *mz,
				struct mem_cgroup_tree_per_zone *mctz,
				unsigned long long new_usage_in_excess)
{
	struct rb_node **p = &mctz->rb_root.rb_node;
	struct rb_node *parent = NULL;
	struct mem_cgroup_per_zone *mz_node;

	if (mz->on_tree)
		return;

	mz->usage_in_excess = new_usage_in_excess;
	if (!mz->usage_in_excess)
		return;
	while (*p) {
		parent = *p;
		mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
					tree_node);
		if (mz->usage_in_excess < mz_node->usage_in_excess)
			p = &(*p)->rb_left;
		/*
		 * We can't avoid mem cgroups that are over their soft
		 * limit by the same amount
		 */
		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
			p = &(*p)->rb_right;
	}
	rb_link_node(&mz->tree_node, parent, p);
	rb_insert_color(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = true;
}

static void
__mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
				struct mem_cgroup_per_zone *mz,
				struct mem_cgroup_tree_per_zone *mctz)
{
	if (!mz->on_tree)
		return;
	rb_erase(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = false;
}

static void
mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
				struct mem_cgroup_per_zone *mz,
				struct mem_cgroup_tree_per_zone *mctz)
{
	spin_lock(&mctz->lock);
	__mem_cgroup_remove_exceeded(mem, mz, mctz);
	spin_unlock(&mctz->lock);
}


static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
{
	unsigned long long excess;
	struct mem_cgroup_per_zone *mz;
	struct mem_cgroup_tree_per_zone *mctz;
	int nid = page_to_nid(page);
	int zid = page_zonenum(page);
	mctz = soft_limit_tree_from_page(page);

	/*
	 * It is necessary to update all ancestors when the hierarchy is used,
	 * because their event counters are not touched.
	 */
	for (; mem; mem = parent_mem_cgroup(mem)) {
		mz = mem_cgroup_zoneinfo(mem, nid, zid);
		excess = res_counter_soft_limit_excess(&mem->res);
		/*
		 * We have to update the tree if mz is on the RB-tree or
		 * mem is over its soft limit.
		 */
		if (excess || mz->on_tree) {
			spin_lock(&mctz->lock);
			/* if on-tree, remove it */
			if (mz->on_tree)
				__mem_cgroup_remove_exceeded(mem, mz, mctz);
			/*
			 * Insert again. mz->usage_in_excess will be updated.
			 * If excess is 0, no tree ops.
			 */
			__mem_cgroup_insert_exceeded(mem, mz, mctz, excess);
			spin_unlock(&mctz->lock);
		}
	}
}

static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
{
	int node, zone;
	struct mem_cgroup_per_zone *mz;
	struct mem_cgroup_tree_per_zone *mctz;

	for_each_node_state(node, N_POSSIBLE) {
		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
			mz = mem_cgroup_zoneinfo(mem, node, zone);
			mctz = soft_limit_tree_node_zone(node, zone);
			mem_cgroup_remove_exceeded(mem, mz, mctz);
		}
	}
}

static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem)
{
	return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT;
}

static struct mem_cgroup_per_zone *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
{
	struct rb_node *rightmost = NULL;
	struct mem_cgroup_per_zone *mz;

retry:
	mz = NULL;
	rightmost = rb_last(&mctz->rb_root);
	if (!rightmost)
		goto done;		/* Nothing to reclaim from */

	mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
	/*
	 * Remove the node now but someone else can add it back; we will
	 * add it back at the end of reclaim to its correct position in
	 * the tree.
	 */
	__mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
	if (!res_counter_soft_limit_excess(&mz->mem->res) ||
		!css_tryget(&mz->mem->css))
		goto retry;
done:
	return mz;
}

static struct mem_cgroup_per_zone *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
{
	struct mem_cgroup_per_zone *mz;

	spin_lock(&mctz->lock);
	mz = __mem_cgroup_largest_soft_limit_node(mctz);
	spin_unlock(&mctz->lock);
	return mz;
}

static s64 mem_cgroup_read_stat(struct mem_cgroup *mem,
		enum mem_cgroup_stat_index idx)
{
	int cpu;
	s64 val = 0;

	for_each_possible_cpu(cpu)
		val += per_cpu(mem->stat->count[idx], cpu);
	return val;
}

static s64 mem_cgroup_local_usage(struct mem_cgroup *mem)
{
	s64 ret;

	ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
	ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
	return ret;
}
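/*
 * For illustration: the counters above hold page counts summed over all
 * possible cpus, so a byte figure would be obtained as (hypothetical use)
 *
 *	s64 rss = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
 *	u64 bytes = (u64)rss << PAGE_SHIFT;
 */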
static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
					 bool charge)
{
	int val = (charge) ? 1 : -1;
	this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
}

static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
					 struct page_cgroup *pc,
					 bool charge)
{
	int val = (charge) ? 1 : -1;

	preempt_disable();

	if (PageCgroupCache(pc))
		__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], val);
	else
		__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], val);

	if (charge)
		__this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]);
	else
		__this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]);
	__this_cpu_inc(mem->stat->count[MEM_CGROUP_EVENTS]);

	preempt_enable();
}

static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
					enum lru_list idx)
{
	int nid, zid;
	struct mem_cgroup_per_zone *mz;
	u64 total = 0;

	for_each_online_node(nid)
		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
			mz = mem_cgroup_zoneinfo(mem, nid, zid);
			total += MEM_CGROUP_ZSTAT(mz, idx);
		}
	return total;
}

static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift)
{
	s64 val;

	val = this_cpu_read(mem->stat->count[MEM_CGROUP_EVENTS]);

	return !(val & ((1 << event_mask_shift) - 1));
}

/*
 * Check events in order.
 */
static void memcg_check_events(struct mem_cgroup *mem, struct page *page)
{
	/* threshold events are triggered at a finer grain than soft limit events */
	if (unlikely(__memcg_event_check(mem, THRESHOLDS_EVENTS_THRESH))) {
		mem_cgroup_threshold(mem);
		if (unlikely(__memcg_event_check(mem, SOFTLIMIT_EVENTS_THRESH)))
			mem_cgroup_update_tree(mem, page);
	}
}

static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
{
	return container_of(cgroup_subsys_state(cont,
				mem_cgroup_subsys_id), struct mem_cgroup,
				css);
}

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	/*
	 * mm_update_next_owner() may clear mm->owner to NULL
	 * if it races with swapoff, page migration, etc.
	 * So this can be called with p == NULL.
	 */
	if (unlikely(!p))
		return NULL;

	return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
				struct mem_cgroup, css);
}

static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
{
	struct mem_cgroup *mem = NULL;

	if (!mm)
		return NULL;
	/*
	 * Because we have no locks, mm->owner may be moved to another
	 * cgroup. We use css_tryget() here even if this looks
	 * pessimistic (rather than adding locks here).
	 */
	rcu_read_lock();
	do {
		mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
		if (unlikely(!mem))
			break;
	} while (!css_tryget(&mem->css));
	rcu_read_unlock();
	return mem;
}
/*
 * Call the callback against all cgroups under the hierarchy tree.
 */
static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
			  int (*func)(struct mem_cgroup *, void *))
{
	int found, ret, nextid;
	struct cgroup_subsys_state *css;
	struct mem_cgroup *mem;

	if (!root->use_hierarchy)
		return (*func)(root, data);

	nextid = 1;
	do {
		ret = 0;
		mem = NULL;

		rcu_read_lock();
		css = css_get_next(&mem_cgroup_subsys, nextid, &root->css,
				   &found);
		if (css && css_tryget(css))
			mem = container_of(css, struct mem_cgroup, css);
		rcu_read_unlock();

		if (mem) {
			ret = (*func)(mem, data);
			css_put(&mem->css);
		}
		nextid = found + 1;
	} while (!ret && css);

	return ret;
}

static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
{
	return (mem == root_mem_cgroup);
}

/*
 * The following LRU functions are allowed to be used without PCG_LOCK.
 * Operations are called by routines of the global LRU independently from
 * memcg. What we have to take care of here is the validity of
 * pc->mem_cgroup.
 *
 * Changes to pc->mem_cgroup happen when
 * 1. charge
 * 2. moving account
 * In the typical case, "charge" is done before add-to-lru. The exception is
 * SwapCache, which is added to the LRU before charge.
 * If the PCG_USED bit is not set, page_cgroup is not added to this private
 * LRU. When moving account, the page is not on the LRU. It's isolated.
 */

void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
{
	struct page_cgroup *pc;
	struct mem_cgroup_per_zone *mz;

	if (mem_cgroup_disabled())
		return;
	pc = lookup_page_cgroup(page);
	/* can happen while we handle swapcache. */
	if (!TestClearPageCgroupAcctLRU(pc))
		return;
	VM_BUG_ON(!pc->mem_cgroup);
	/*
	 * We don't check the PCG_USED bit. It's cleared when the "page" is
	 * finally removed from the global LRU.
	 */
	mz = page_cgroup_zoneinfo(pc);
	MEM_CGROUP_ZSTAT(mz, lru) -= 1;
	if (mem_cgroup_is_root(pc->mem_cgroup))
		return;
	VM_BUG_ON(list_empty(&pc->lru));
	list_del_init(&pc->lru);
	return;
}

void mem_cgroup_del_lru(struct page *page)
{
	mem_cgroup_del_lru_list(page, page_lru(page));
}

void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
{
	struct mem_cgroup_per_zone *mz;
	struct page_cgroup *pc;

	if (mem_cgroup_disabled())
		return;

	pc = lookup_page_cgroup(page);
	/*
	 * The Used bit is set without atomic ops but after smp_wmb().
	 * To make pc->mem_cgroup visible, insert smp_rmb() here.
	 */
	smp_rmb();
	/* unused or root page is not rotated. */
	if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup))
		return;
	mz = page_cgroup_zoneinfo(pc);
	list_move(&pc->lru, &mz->lists[lru]);
}
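/*
 * The smp_rmb() calls in these LRU functions pair with the smp_wmb() in
 * __mem_cgroup_commit_charge() below. The writer side does
 *
 *	pc->mem_cgroup = mem;
 *	smp_wmb();
 *	SetPageCgroupUsed(pc);
 *
 * so a reader that observes PageCgroupUsed(pc) and then issues smp_rmb()
 * is guaranteed to observe the new pc->mem_cgroup as well.
 */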
void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
{
	struct page_cgroup *pc;
	struct mem_cgroup_per_zone *mz;

	if (mem_cgroup_disabled())
		return;
	pc = lookup_page_cgroup(page);
	VM_BUG_ON(PageCgroupAcctLRU(pc));
	/*
	 * The Used bit is set without atomic ops but after smp_wmb().
	 * To make pc->mem_cgroup visible, insert smp_rmb() here.
	 */
	smp_rmb();
	if (!PageCgroupUsed(pc))
		return;

	mz = page_cgroup_zoneinfo(pc);
	MEM_CGROUP_ZSTAT(mz, lru) += 1;
	SetPageCgroupAcctLRU(pc);
	if (mem_cgroup_is_root(pc->mem_cgroup))
		return;
	list_add(&pc->lru, &mz->lists[lru]);
}

/*
 * While handling SwapCache, pc->mem_cgroup may be changed while it's linked
 * to the LRU, because the page may be reused after it's fully uncharged
 * (because of SwapCache behavior). To handle that, unlink page_cgroup from
 * the LRU when we charge it again. This function is only used to charge
 * SwapCache. It's done under lock_page and it is expected that
 * zone->lru_lock is never held.
 */
static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page)
{
	unsigned long flags;
	struct zone *zone = page_zone(page);
	struct page_cgroup *pc = lookup_page_cgroup(page);

	spin_lock_irqsave(&zone->lru_lock, flags);
	/*
	 * Forget the old LRU when this page_cgroup is *not* used. The Used
	 * bit is guarded by lock_page() because the page is SwapCache.
	 */
	if (!PageCgroupUsed(pc))
		mem_cgroup_del_lru_list(page, page_lru(page));
	spin_unlock_irqrestore(&zone->lru_lock, flags);
}

static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page)
{
	unsigned long flags;
	struct zone *zone = page_zone(page);
	struct page_cgroup *pc = lookup_page_cgroup(page);

	spin_lock_irqsave(&zone->lru_lock, flags);
	/* link when the page is linked to the LRU but page_cgroup isn't */
	if (PageLRU(page) && !PageCgroupAcctLRU(pc))
		mem_cgroup_add_lru_list(page, page_lru(page));
	spin_unlock_irqrestore(&zone->lru_lock, flags);
}


void mem_cgroup_move_lists(struct page *page,
			   enum lru_list from, enum lru_list to)
{
	if (mem_cgroup_disabled())
		return;
	mem_cgroup_del_lru_list(page, from);
	mem_cgroup_add_lru_list(page, to);
}

int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
{
	int ret;
	struct mem_cgroup *curr = NULL;

	task_lock(task);
	rcu_read_lock();
	curr = try_get_mem_cgroup_from_mm(task->mm);
	rcu_read_unlock();
	task_unlock(task);
	if (!curr)
		return 0;
	/*
	 * We should check use_hierarchy of "mem", not "curr". Checking
	 * use_hierarchy of "curr" here would make this function return true
	 * if hierarchy is enabled in "curr" and "curr" is a child of "mem"
	 * in the *cgroup* hierarchy (even if use_hierarchy is disabled
	 * in "mem").
	 */
	rcu_read_lock();
	if (mem->use_hierarchy)
		ret = css_is_ancestor(&curr->css, &mem->css);
	else
		ret = (curr == mem);
	rcu_read_unlock();
	css_put(&curr->css);
	return ret;
}
/*
 * prev_priority control... this will be used in the memory reclaim path.
 */
int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
{
	int prev_priority;

	spin_lock(&mem->reclaim_param_lock);
	prev_priority = mem->prev_priority;
	spin_unlock(&mem->reclaim_param_lock);

	return prev_priority;
}

void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
{
	spin_lock(&mem->reclaim_param_lock);
	if (priority < mem->prev_priority)
		mem->prev_priority = priority;
	spin_unlock(&mem->reclaim_param_lock);
}

void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
{
	spin_lock(&mem->reclaim_param_lock);
	mem->prev_priority = priority;
	spin_unlock(&mem->reclaim_param_lock);
}

static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
{
	unsigned long active;
	unsigned long inactive;
	unsigned long gb;
	unsigned long inactive_ratio;

	inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON);
	active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON);

	gb = (inactive + active) >> (30 - PAGE_SHIFT);
	if (gb)
		inactive_ratio = int_sqrt(10 * gb);
	else
		inactive_ratio = 1;

	if (present_pages) {
		present_pages[0] = inactive;
		present_pages[1] = active;
	}

	return inactive_ratio;
}

int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
{
	unsigned long active;
	unsigned long inactive;
	unsigned long present_pages[2];
	unsigned long inactive_ratio;

	inactive_ratio = calc_inactive_ratio(memcg, present_pages);

	inactive = present_pages[0];
	active = present_pages[1];

	if (inactive * inactive_ratio < active)
		return 1;

	return 0;
}
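/*
 * Worked example for the ratio above: with 4GB of anon pages (gb == 4),
 * inactive_ratio == int_sqrt(40) == 6, so inactive anon is considered
 * low as long as inactive * 6 < active.
 */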
int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
{
	unsigned long active;
	unsigned long inactive;

	inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE);
	active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE);

	return (active > inactive);
}

unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
				       struct zone *zone,
				       enum lru_list lru)
{
	int nid = zone->zone_pgdat->node_id;
	int zid = zone_idx(zone);
	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);

	return MEM_CGROUP_ZSTAT(mz, lru);
}

struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
						      struct zone *zone)
{
	int nid = zone->zone_pgdat->node_id;
	int zid = zone_idx(zone);
	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);

	return &mz->reclaim_stat;
}

struct zone_reclaim_stat *
mem_cgroup_get_reclaim_stat_from_page(struct page *page)
{
	struct page_cgroup *pc;
	struct mem_cgroup_per_zone *mz;

	if (mem_cgroup_disabled())
		return NULL;

	pc = lookup_page_cgroup(page);
	/*
	 * The Used bit is set without atomic ops but after smp_wmb().
	 * To make pc->mem_cgroup visible, insert smp_rmb() here.
	 */
	smp_rmb();
	if (!PageCgroupUsed(pc))
		return NULL;

	mz = page_cgroup_zoneinfo(pc);
	if (!mz)
		return NULL;

	return &mz->reclaim_stat;
}

unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
					struct list_head *dst,
					unsigned long *scanned, int order,
					int mode, struct zone *z,
					struct mem_cgroup *mem_cont,
					int active, int file)
{
	unsigned long nr_taken = 0;
	struct page *page;
	unsigned long scan;
	LIST_HEAD(pc_list);
	struct list_head *src;
	struct page_cgroup *pc, *tmp;
	int nid = z->zone_pgdat->node_id;
	int zid = zone_idx(z);
	struct mem_cgroup_per_zone *mz;
	int lru = LRU_FILE * file + active;
	int ret;

	BUG_ON(!mem_cont);
	mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
	src = &mz->lists[lru];

	scan = 0;
	list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
		if (scan >= nr_to_scan)
			break;

		page = pc->page;
		if (unlikely(!PageCgroupUsed(pc)))
			continue;
		if (unlikely(!PageLRU(page)))
			continue;

		scan++;
		ret = __isolate_lru_page(page, mode, file);
		switch (ret) {
		case 0:
			list_move(&page->lru, dst);
			mem_cgroup_del_lru(page);
			nr_taken++;
			break;
		case -EBUSY:
			/* we don't affect the global LRU but rotate in our LRU */
			mem_cgroup_rotate_lru_list(page, page_lru(page));
			break;
		default:
			break;
		}
	}

	*scanned = scan;
	return nr_taken;
}
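/*
 * Worked example for the lru index above: with LRU_FILE == 2 and
 * LRU_ACTIVE == 1 (see enum lru_list in mmzone.h), lru = LRU_FILE * file
 * + active maps (file, active) to (0,0) LRU_INACTIVE_ANON,
 * (0,1) LRU_ACTIVE_ANON, (1,0) LRU_INACTIVE_FILE, (1,1) LRU_ACTIVE_FILE.
 */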
#define mem_cgroup_from_res_counter(counter, member)	\
	container_of(counter, struct mem_cgroup, member)

static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
{
	if (do_swap_account) {
		if (res_counter_check_under_limit(&mem->res) &&
			res_counter_check_under_limit(&mem->memsw))
			return true;
	} else
		if (res_counter_check_under_limit(&mem->res))
			return true;
	return false;
}

static unsigned int get_swappiness(struct mem_cgroup *memcg)
{
	struct cgroup *cgrp = memcg->css.cgroup;
	unsigned int swappiness;

	/* root ? */
	if (cgrp->parent == NULL)
		return vm_swappiness;

	spin_lock(&memcg->reclaim_param_lock);
	swappiness = memcg->swappiness;
	spin_unlock(&memcg->reclaim_param_lock);

	return swappiness;
}

static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
{
	int *val = data;
	(*val)++;
	return 0;
}

/**
 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
 * @memcg: The memory cgroup that went over limit
 * @p: Task that is going to be killed
 *
 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
 * enabled
 */
void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
{
	struct cgroup *task_cgrp;
	struct cgroup *mem_cgrp;
	/*
	 * Need a buffer in BSS; we can't rely on allocations. The code relies
	 * on the assumption that OOM is serialized for the memory controller.
	 * If this assumption is broken, revisit this code.
	 */
	static char memcg_name[PATH_MAX];
	int ret;

	if (!memcg || !p)
		return;


	rcu_read_lock();

	mem_cgrp = memcg->css.cgroup;
	task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);

	ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
	if (ret < 0) {
		/*
		 * Unfortunately, we are unable to convert to a useful name,
		 * but we'll still print out the usage information
		 */
		rcu_read_unlock();
		goto done;
	}
	rcu_read_unlock();

	printk(KERN_INFO "Task in %s killed", memcg_name);

	rcu_read_lock();
	ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
	if (ret < 0) {
		rcu_read_unlock();
		goto done;
	}
	rcu_read_unlock();

	/*
	 * Continues from above, so we don't need a KERN_ level
	 */
	printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
done:

	printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
		res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
		res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
		res_counter_read_u64(&memcg->res, RES_FAILCNT));
	printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
		"failcnt %llu\n",
		res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
		res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
		res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
}

/*
 * This function returns the number of memcgs under the hierarchy tree.
 * Returns 1 (self count) if there are no children.
 */
static int mem_cgroup_count_children(struct mem_cgroup *mem)
{
	int num = 0;
	mem_cgroup_walk_tree(mem, &num, mem_cgroup_count_children_cb);
	return num;
}

/*
 * Visit the first child (need not be the first child as per the ordering
 * of the cgroup list, since we track last_scanned_child) of @mem and use
 * that to reclaim free pages from.
 */
static struct mem_cgroup *
mem_cgroup_select_victim(struct mem_cgroup *root_mem)
{
	struct mem_cgroup *ret = NULL;
	struct cgroup_subsys_state *css;
	int nextid, found;

	if (!root_mem->use_hierarchy) {
		css_get(&root_mem->css);
		ret = root_mem;
	}

	while (!ret) {
		rcu_read_lock();
		nextid = root_mem->last_scanned_child + 1;
		css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css,
				   &found);
		if (css && css_tryget(css))
			ret = container_of(css, struct mem_cgroup, css);

		rcu_read_unlock();
		/* Updates scanning parameter */
		spin_lock(&root_mem->reclaim_param_lock);
		if (!css) {
			/* this means start scan from ID:1 */
			root_mem->last_scanned_child = 0;
		} else
			root_mem->last_scanned_child = found;
		spin_unlock(&root_mem->reclaim_param_lock);
	}

	return ret;
}
/*
 * Scan the hierarchy if needed to reclaim memory. We remember the last child
 * we reclaimed from, so that we don't end up penalizing one child extensively
 * based on its position in the children list.
 *
 * root_mem is the original ancestor that we've been reclaiming from.
 *
 * We give up and return to the caller when we visit root_mem twice.
 * (other groups can be removed while we're walking....)
 *
 * If shrink==true, this returns immediately, to avoid freeing too much.
 */
static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
						struct zone *zone,
						gfp_t gfp_mask,
						unsigned long reclaim_options)
{
	struct mem_cgroup *victim;
	int ret, total = 0;
	int loop = 0;
	bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
	bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
	bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
	unsigned long excess = mem_cgroup_get_excess(root_mem);

	/* If memsw_is_minimum==1, swap-out is of no use. */
	if (root_mem->memsw_is_minimum)
		noswap = true;

	while (1) {
		victim = mem_cgroup_select_victim(root_mem);
		if (victim == root_mem) {
			loop++;
			if (loop >= 1)
				drain_all_stock_async();
			if (loop >= 2) {
				/*
				 * If we have not been able to reclaim
				 * anything, it might be because there are
				 * no reclaimable pages under this hierarchy
				 */
				if (!check_soft || !total) {
					css_put(&victim->css);
					break;
				}
				/*
				 * We want to do more targeted reclaim.
				 * excess >> 2 is not too excessive, so we
				 * don't reclaim too much, nor too little,
				 * so we don't keep coming back to reclaim
				 * from this cgroup
				 */
				if (total >= (excess >> 2) ||
					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
					css_put(&victim->css);
					break;
				}
			}
		}
		if (!mem_cgroup_local_usage(victim)) {
			/* this cgroup's local usage == 0 */
			css_put(&victim->css);
			continue;
		}
		/* we use the swappiness of the local cgroup */
		if (check_soft)
			ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
				noswap, get_swappiness(victim), zone,
				zone->zone_pgdat->node_id);
		else
			ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
						noswap, get_swappiness(victim));
		css_put(&victim->css);
		/*
		 * While shrinking usage, we can't check whether we should
		 * stop here or reclaim more. That depends on the caller.
		 * last_scanned_child will work well enough for keeping
		 * fairness under the tree.
		 */
		if (shrink)
			return ret;
		total += ret;
		if (check_soft) {
			if (res_counter_check_under_soft_limit(&root_mem->res))
				return total;
		} else if (mem_cgroup_check_under_limit(root_mem))
			return 1 + total;
	}
	return total;
}
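/*
 * Worked example for the soft limit case above: if root_mem is 400 pages
 * over its soft limit, excess >> 2 == 100, so the loop stops once roughly
 * 100 pages have been reclaimed (or after MEM_CGROUP_MAX_RECLAIM_LOOPS
 * visits of root_mem).
 */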
static int mem_cgroup_oom_lock_cb(struct mem_cgroup *mem, void *data)
{
	int *val = (int *)data;
	int x;
	/*
	 * Logically, we can stop scanning immediately when we find
	 * a memcg that is already locked. But considering unlock ops and
	 * creation/removal of memcgs, scan-all is a simple operation.
	 */
	x = atomic_inc_return(&mem->oom_lock);
	*val = max(x, *val);
	return 0;
}
/*
 * Check whether the OOM killer is already running under our hierarchy.
 * If someone is running, return false.
 */
static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
{
	int lock_count = 0;

	mem_cgroup_walk_tree(mem, &lock_count, mem_cgroup_oom_lock_cb);

	if (lock_count == 1)
		return true;
	return false;
}

static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data)
{
	/*
	 * When a new child is created while the hierarchy is under oom,
	 * mem_cgroup_oom_lock() may not be called. We have to use
	 * atomic_add_unless() here.
	 */
	atomic_add_unless(&mem->oom_lock, -1, 0);
	return 0;
}

static void mem_cgroup_oom_unlock(struct mem_cgroup *mem)
{
	mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_unlock_cb);
}

static DEFINE_MUTEX(memcg_oom_mutex);
static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);

/*
 * Try to call the OOM killer. Returns false if we should exit the
 * memory-reclaim loop.
 */
bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
{
	DEFINE_WAIT(wait);
	bool locked;

	/* At first, try to OOM lock the hierarchy under mem. */
	mutex_lock(&memcg_oom_mutex);
	locked = mem_cgroup_oom_lock(mem);
	/*
	 * Even if signal_pending(), we can't quit the charge() loop without
	 * accounting. So, UNINTERRUPTIBLE is appropriate. But a SIGKILL
	 * under OOM is always welcome, so use TASK_KILLABLE here.
	 */
	if (!locked)
		prepare_to_wait(&memcg_oom_waitq, &wait, TASK_KILLABLE);
	mutex_unlock(&memcg_oom_mutex);

	if (locked)
		mem_cgroup_out_of_memory(mem, mask);
	else {
		schedule();
		finish_wait(&memcg_oom_waitq, &wait);
	}
	mutex_lock(&memcg_oom_mutex);
	mem_cgroup_oom_unlock(mem);
	/*
	 * Here, we use the global waitq... would a more fine-grained waitq
	 * be better? Assume the following hierarchy.
	 * A/
	 * 01
	 * 02
	 * Assume OOM happens both in A and 01 at the same time. They are
	 * mutually exclusive by the lock. (A kill in 01 helps A.)
	 * If we used a per-memcg waitq, we would have to wake up waiters on
	 * A and 02 in addition to waiters on 01. We use the global waitq to
	 * avoid that mess. It will not be a big problem.
	 * (And a task may be moved to other groups while it's waiting for OOM.)
	 */
	wake_up_all(&memcg_oom_waitq);
	mutex_unlock(&memcg_oom_mutex);

	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
		return false;
	/* Give a chance to the dying process */
	schedule_timeout(1);
	return true;
}

/*
 * Currently used to update mapped file statistics, but the routine can be
 * generalized to update other statistics as well.
 */
void mem_cgroup_update_file_mapped(struct page *page, int val)
{
	struct mem_cgroup *mem;
	struct page_cgroup *pc;

	pc = lookup_page_cgroup(page);
	if (unlikely(!pc))
		return;

	lock_page_cgroup(pc);
	mem = pc->mem_cgroup;
	if (!mem || !PageCgroupUsed(pc))
		goto done;

	/*
	 * Preemption is already disabled. We can use __this_cpu_xxx
	 */
	if (val > 0) {
		__this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
		SetPageCgroupFileMapped(pc);
	} else {
		__this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
		ClearPageCgroupFileMapped(pc);
	}

done:
	unlock_page_cgroup(pc);
}

/*
 * size of the first charge trial. "32" comes from vmscan.c's magic value.
 * TODO: it may be necessary to use bigger numbers on big irons.
 */
#define CHARGE_SIZE	(32 * PAGE_SIZE)
struct memcg_stock_pcp {
	struct mem_cgroup *cached; /* this is never the root cgroup */
	int charge;
	struct work_struct work;
};
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
static atomic_t memcg_drain_count;
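/*
 * Worked example of the batching below: with CHARGE_SIZE == 32 pages,
 * the first charge takes 32 pages from the res_counter at once and
 * refill_stock() keeps the 31 unused pages in the per-cpu stock. The
 * next 31 single-page charges on this cpu are then served from the
 * stock by consume_stock() without touching the res_counter.
 */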
/*
 * Try to consume stocked charge on this cpu. On success, PAGE_SIZE is
 * consumed from the local stock and true is returned. If the stock is 0
 * or holds charges from a cgroup which is not the current target, false
 * is returned. The stock will then be refilled.
 */
static bool consume_stock(struct mem_cgroup *mem)
{
	struct memcg_stock_pcp *stock;
	bool ret = true;

	stock = &get_cpu_var(memcg_stock);
	if (mem == stock->cached && stock->charge)
		stock->charge -= PAGE_SIZE;
	else /* need to call res_counter_charge */
		ret = false;
	put_cpu_var(memcg_stock);
	return ret;
}

/*
 * Return stock cached in the percpu area to the res_counter and reset the
 * cached information.
 */
static void drain_stock(struct memcg_stock_pcp *stock)
{
	struct mem_cgroup *old = stock->cached;

	if (stock->charge) {
		res_counter_uncharge(&old->res, stock->charge);
		if (do_swap_account)
			res_counter_uncharge(&old->memsw, stock->charge);
	}
	stock->cached = NULL;
	stock->charge = 0;
}

/*
 * This must be called with preemption disabled, or by a thread which is
 * pinned to the local cpu.
 */
static void drain_local_stock(struct work_struct *dummy)
{
	struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
	drain_stock(stock);
}

/*
 * Cache charges(val) obtained from the res_counter in the local per-cpu
 * area. They will be consumed by the consume_stock() function later.
 */
static void refill_stock(struct mem_cgroup *mem, int val)
{
	struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);

	if (stock->cached != mem) { /* reset if necessary */
		drain_stock(stock);
		stock->cached = mem;
	}
	stock->charge += val;
	put_cpu_var(memcg_stock);
}

/*
 * Tries to drain stocked charges on other cpus. This function is asynchronous
 * and just puts a work item per cpu for draining locally on each cpu. The
 * caller can expect that some charges will go back to the res_counter later,
 * but cannot wait for it.
 */
static void drain_all_stock_async(void)
{
	int cpu;
	/* This function is for scheduling "drain" in an asynchronous way.
	 * The result of "drain" is not directly handled by callers. So,
	 * if someone is already calling drain, we don't have to call drain
	 * again. Anyway, the WORK_STRUCT_PENDING check in queue_work_on()
	 * will catch it if there is a race. We just do a loose check here.
	 */
	if (atomic_read(&memcg_drain_count))
		return;
	/* Notify other cpus that a system-wide "drain" is running */
	atomic_inc(&memcg_drain_count);
	get_online_cpus();
	for_each_online_cpu(cpu) {
		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
		schedule_work_on(cpu, &stock->work);
	}
	put_online_cpus();
	atomic_dec(&memcg_drain_count);
	/* We don't wait for flush_work */
}

/* This is a synchronous drain interface. */
static void drain_all_stock_sync(void)
{
	/* called when force_empty is called */
	atomic_inc(&memcg_drain_count);
	schedule_on_each_cpu(drain_local_stock);
	atomic_dec(&memcg_drain_count);
}

static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
					unsigned long action,
					void *hcpu)
{
	int cpu = (unsigned long)hcpu;
	struct memcg_stock_pcp *stock;

	if (action != CPU_DEAD)
		return NOTIFY_OK;
	stock = &per_cpu(memcg_stock, cpu);
	drain_stock(stock);
	return NOTIFY_OK;
}
/*
 * Unlike the exported interface, an "oom" parameter is added. If oom==true,
 * the oom-killer can be invoked.
 */
static int __mem_cgroup_try_charge(struct mm_struct *mm,
			gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
{
	struct mem_cgroup *mem, *mem_over_limit;
	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
	struct res_counter *fail_res;
	int csize = CHARGE_SIZE;

	/*
	 * Unlike the global VM's OOM kill, we're not under memory shortage
	 * at the system level. So, allow dying processes to go ahead, in
	 * addition to MEMDIE processes.
	 */
	if (unlikely(test_thread_flag(TIF_MEMDIE)
		     || fatal_signal_pending(current)))
		goto bypass;

	/*
	 * We always charge the cgroup the mm_struct belongs to.
	 * The mm_struct's mem_cgroup changes on task migration if the
	 * thread group leader migrates. It's possible that mm is not
	 * set; if so, charge the init_mm (happens for pagecache usage).
	 */
	mem = *memcg;
	if (likely(!mem)) {
		mem = try_get_mem_cgroup_from_mm(mm);
		*memcg = mem;
	} else {
		css_get(&mem->css);
	}
	if (unlikely(!mem))
		return 0;

	VM_BUG_ON(css_is_removed(&mem->css));
	if (mem_cgroup_is_root(mem))
		goto done;

	while (1) {
		int ret = 0;
		unsigned long flags = 0;

		if (consume_stock(mem))
			goto done;

		ret = res_counter_charge(&mem->res, csize, &fail_res);
		if (likely(!ret)) {
			if (!do_swap_account)
				break;
			ret = res_counter_charge(&mem->memsw, csize, &fail_res);
			if (likely(!ret))
				break;
			/* mem+swap counter fails */
			res_counter_uncharge(&mem->res, csize);
			flags |= MEM_CGROUP_RECLAIM_NOSWAP;
			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
									memsw);
		} else
			/* mem counter fails */
			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
									res);

		/* reduce the request size and retry */
		if (csize > PAGE_SIZE) {
			csize = PAGE_SIZE;
			continue;
		}
		if (!(gfp_mask & __GFP_WAIT))
			goto nomem;

		ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
						      gfp_mask, flags);
		if (ret)
			continue;

		/*
		 * try_to_free_mem_cgroup_pages() might not give us a full
		 * picture of reclaim. Some pages are reclaimed and might be
		 * moved to swap cache or just unmapped from the cgroup.
		 * Check the limit again to see if the reclaim reduced the
		 * current usage of the cgroup before giving up
		 *
		 */
		if (mem_cgroup_check_under_limit(mem_over_limit))
			continue;

		/* try to avoid oom while someone is moving charge */
		if (mc.moving_task && current != mc.moving_task) {
			struct mem_cgroup *from, *to;
			bool do_continue = false;
			/*
			 * There is a small race where "from" or "to" can be
			 * freed by rmdir, so we use css_tryget().
			 */
			rcu_read_lock();
			from = mc.from;
			to = mc.to;
			if (from && css_tryget(&from->css)) {
				if (mem_over_limit->use_hierarchy)
					do_continue = css_is_ancestor(
							&from->css,
							&mem_over_limit->css);
				else
					do_continue = (from == mem_over_limit);
				css_put(&from->css);
			}
			if (!do_continue && to && css_tryget(&to->css)) {
				if (mem_over_limit->use_hierarchy)
					do_continue = css_is_ancestor(
							&to->css,
							&mem_over_limit->css);
				else
					do_continue = (to == mem_over_limit);
				css_put(&to->css);
			}
			rcu_read_unlock();
			if (do_continue) {
				DEFINE_WAIT(wait);
				prepare_to_wait(&mc.waitq, &wait,
							TASK_INTERRUPTIBLE);
				/* moving charge context might have finished. */
				if (mc.moving_task)
					schedule();
				finish_wait(&mc.waitq, &wait);
				continue;
			}
		}

		if (!nr_retries--) {
			if (!oom)
				goto nomem;
			if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) {
				nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
				continue;
			}
			/* When we reach here, the current task is dying. */
			css_put(&mem->css);
			goto bypass;
		}
	}
	if (csize > PAGE_SIZE)
		refill_stock(mem, csize - PAGE_SIZE);
done:
	return 0;
nomem:
	css_put(&mem->css);
	return -ENOMEM;
bypass:
	*memcg = NULL;
	return 0;
}
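/*
 * Typical charging sequence (a sketch; cf. mem_cgroup_charge_common()
 * below):
 *
 *	struct mem_cgroup *mem = NULL;
 *	int ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
 *	if (ret || !mem)
 *		return ret;	(failed, or bypassed for a dying task)
 *	__mem_cgroup_commit_charge(mem, pc, ctype);
 */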
/*
 * Sometimes we have to undo a charge we got by try_charge().
 * This function is for that: it uncharges and puts the css refcount
 * taken by try_charge().
 */
static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
							unsigned long count)
{
	if (!mem_cgroup_is_root(mem)) {
		res_counter_uncharge(&mem->res, PAGE_SIZE * count);
		if (do_swap_account)
			res_counter_uncharge(&mem->memsw, PAGE_SIZE * count);
		VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
		WARN_ON_ONCE(count > INT_MAX);
		__css_put(&mem->css, (int)count);
	}
	/* we don't need css_put for root */
}

static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
{
	__mem_cgroup_cancel_charge(mem, 1);
}
/*
 * A helper function to get a mem_cgroup from an ID. This must be called under
 * rcu_read_lock(). The caller must check css_is_removed() or similar if it's
 * a concern. (dropping the refcnt from swap can be called against a removed
 * memcg.)
 */
static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
{
	struct cgroup_subsys_state *css;

	/* ID 0 is unused ID */
	if (!id)
		return NULL;
	css = css_lookup(&mem_cgroup_subsys, id);
	if (!css)
		return NULL;
	return container_of(css, struct mem_cgroup, css);
}

struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
{
	struct mem_cgroup *mem = NULL;
	struct page_cgroup *pc;
	unsigned short id;
	swp_entry_t ent;

	VM_BUG_ON(!PageLocked(page));

	pc = lookup_page_cgroup(page);
	lock_page_cgroup(pc);
	if (PageCgroupUsed(pc)) {
		mem = pc->mem_cgroup;
		if (mem && !css_tryget(&mem->css))
			mem = NULL;
	} else if (PageSwapCache(page)) {
		ent.val = page_private(page);
		id = lookup_swap_cgroup(ent);
		rcu_read_lock();
		mem = mem_cgroup_lookup(id);
		if (mem && !css_tryget(&mem->css))
			mem = NULL;
		rcu_read_unlock();
	}
	unlock_page_cgroup(pc);
	return mem;
}

/*
 * Commit a charge obtained by __mem_cgroup_try_charge() and make the
 * page_cgroup USED. If it is already USED, uncharge and return.
 */

static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
				     struct page_cgroup *pc,
				     enum charge_type ctype)
{
	/* try_charge() can return NULL to *memcg, taking care of it. */
	if (!mem)
		return;

	lock_page_cgroup(pc);
	if (unlikely(PageCgroupUsed(pc))) {
		unlock_page_cgroup(pc);
		mem_cgroup_cancel_charge(mem);
		return;
	}

	pc->mem_cgroup = mem;
	/*
	 * We access a page_cgroup asynchronously without lock_page_cgroup().
	 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
	 * is accessed after testing the USED bit. To make pc->mem_cgroup
	 * visible before the USED bit, we need a memory barrier here.
	 * See mem_cgroup_add_lru_list(), etc.
	 */
	smp_wmb();
	switch (ctype) {
	case MEM_CGROUP_CHARGE_TYPE_CACHE:
	case MEM_CGROUP_CHARGE_TYPE_SHMEM:
		SetPageCgroupCache(pc);
		SetPageCgroupUsed(pc);
		break;
	case MEM_CGROUP_CHARGE_TYPE_MAPPED:
		ClearPageCgroupCache(pc);
		SetPageCgroupUsed(pc);
		break;
	default:
		break;
	}

	mem_cgroup_charge_statistics(mem, pc, true);

	unlock_page_cgroup(pc);
	/*
	 * "charge_statistics" updated the event counter. Then, check it.
	 * Insert the ancestor (and the ancestor's ancestors) into the
	 * softlimit RB-tree if they exceed their soft limit.
	 */
	memcg_check_events(mem, pc->page);
}
1804 */ 1805 1806static void __mem_cgroup_move_account(struct page_cgroup *pc, 1807 struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) 1808{ 1809 VM_BUG_ON(from == to); 1810 VM_BUG_ON(PageLRU(pc->page)); 1811 VM_BUG_ON(!PageCgroupLocked(pc)); 1812 VM_BUG_ON(!PageCgroupUsed(pc)); 1813 VM_BUG_ON(pc->mem_cgroup != from); 1814 1815 if (PageCgroupFileMapped(pc)) { 1816 /* Update mapped_file data for mem_cgroup */ 1817 preempt_disable(); 1818 __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 1819 __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 1820 preempt_enable(); 1821 } 1822 mem_cgroup_charge_statistics(from, pc, false); 1823 if (uncharge) 1824 /* This is not "cancel", but cancel_charge does all we need. */ 1825 mem_cgroup_cancel_charge(from); 1826 1827 /* caller should have done css_get */ 1828 pc->mem_cgroup = to; 1829 mem_cgroup_charge_statistics(to, pc, true); 1830 /* 1831 * We charges against "to" which may not have any tasks. Then, "to" 1832 * can be under rmdir(). But in current implementation, caller of 1833 * this function is just force_empty() and move charge, so it's 1834 * garanteed that "to" is never removed. So, we don't check rmdir 1835 * status here. 1836 */ 1837} 1838 1839/* 1840 * check whether the @pc is valid for moving account and call 1841 * __mem_cgroup_move_account() 1842 */ 1843static int mem_cgroup_move_account(struct page_cgroup *pc, 1844 struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) 1845{ 1846 int ret = -EINVAL; 1847 lock_page_cgroup(pc); 1848 if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { 1849 __mem_cgroup_move_account(pc, from, to, uncharge); 1850 ret = 0; 1851 } 1852 unlock_page_cgroup(pc); 1853 /* 1854 * check events 1855 */ 1856 memcg_check_events(to, pc->page); 1857 memcg_check_events(from, pc->page); 1858 return ret; 1859} 1860 1861/* 1862 * move charges to its parent. 1863 */ 1864 1865static int mem_cgroup_move_parent(struct page_cgroup *pc, 1866 struct mem_cgroup *child, 1867 gfp_t gfp_mask) 1868{ 1869 struct page *page = pc->page; 1870 struct cgroup *cg = child->css.cgroup; 1871 struct cgroup *pcg = cg->parent; 1872 struct mem_cgroup *parent; 1873 int ret; 1874 1875 /* Is ROOT ? */ 1876 if (!pcg) 1877 return -EINVAL; 1878 1879 ret = -EBUSY; 1880 if (!get_page_unless_zero(page)) 1881 goto out; 1882 if (isolate_lru_page(page)) 1883 goto put; 1884 1885 parent = mem_cgroup_from_cont(pcg); 1886 ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); 1887 if (ret || !parent) 1888 goto put_back; 1889 1890 ret = mem_cgroup_move_account(pc, child, parent, true); 1891 if (ret) 1892 mem_cgroup_cancel_charge(parent); 1893put_back: 1894 putback_lru_page(page); 1895put: 1896 put_page(page); 1897out: 1898 return ret; 1899} 1900 1901/* 1902 * Charge the memory controller for page usage. 
/*
 * Charge the memory controller for page usage.
 * Return
 * 0 if the charge was successful
 * < 0 if the cgroup is over its limit
 */
static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
				gfp_t gfp_mask, enum charge_type ctype,
				struct mem_cgroup *memcg)
{
	struct mem_cgroup *mem;
	struct page_cgroup *pc;
	int ret;

	pc = lookup_page_cgroup(page);
	/* can happen at boot */
	if (unlikely(!pc))
		return 0;
	prefetchw(pc);

	mem = memcg;
	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
	if (ret || !mem)
		return ret;

	__mem_cgroup_commit_charge(mem, pc, ctype);
	return 0;
}

int mem_cgroup_newpage_charge(struct page *page,
			      struct mm_struct *mm, gfp_t gfp_mask)
{
	if (mem_cgroup_disabled())
		return 0;
	if (PageCompound(page))
		return 0;
	/*
	 * If already mapped, we don't have to account.
	 * If page cache, page->mapping has an address_space.
	 * But page->mapping may have an out-of-use anon_vma pointer;
	 * detect that by the PageAnon() check. A newly-mapped-anon page's
	 * page->mapping is NULL.
	 */
	if (page_mapped(page) || (page->mapping && !PageAnon(page)))
		return 0;
	if (unlikely(!mm))
		mm = &init_mm;
	return mem_cgroup_charge_common(page, mm, gfp_mask,
				MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
}

static void
__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
					enum charge_type ctype);

int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
				gfp_t gfp_mask)
{
	struct mem_cgroup *mem = NULL;
	int ret;

	if (mem_cgroup_disabled())
		return 0;
	if (PageCompound(page))
		return 0;
	/*
	 * Corner case handling. This is usually called from
	 * add_to_page_cache(). But some filesystems (shmem) precharge this
	 * page before calling it and call add_to_page_cache() with
	 * GFP_NOWAIT.
	 *
	 * For the GFP_NOWAIT case, the page may be pre-charged before calling
	 * add_to_page_cache(). (See shmem.c) Check it here and avoid charging
	 * twice. (It works but has to pay a bit larger cost.)
	 * And when the page is SwapCache, it should take swap information
	 * into account. This is under lock_page() now.
	 */
	if (!(gfp_mask & __GFP_WAIT)) {
		struct page_cgroup *pc;


		pc = lookup_page_cgroup(page);
		if (!pc)
			return 0;
		lock_page_cgroup(pc);
		if (PageCgroupUsed(pc)) {
			unlock_page_cgroup(pc);
			return 0;
		}
		unlock_page_cgroup(pc);
	}

	if (unlikely(!mm && !mem))
		mm = &init_mm;

	if (page_is_file_cache(page))
		return mem_cgroup_charge_common(page, mm, gfp_mask,
				MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);

	/* shmem */
	if (PageSwapCache(page)) {
		ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
		if (!ret)
			__mem_cgroup_commit_charge_swapin(page, mem,
					MEM_CGROUP_CHARGE_TYPE_SHMEM);
	} else
		ret = mem_cgroup_charge_common(page, mm, gfp_mask,
					MEM_CGROUP_CHARGE_TYPE_SHMEM, mem);

	return ret;
}
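/*
 * Typical swap-in usage of the functions below (a sketch; cf.
 * do_swap_page() in memory.c):
 *
 *	mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr);
 *	...
 *	if (the pte was installed successfully)
 *		mem_cgroup_commit_charge_swapin(page, ptr);
 *	else
 *		mem_cgroup_cancel_charge_swapin(ptr);
 */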
2012/* 2013 * While swap-in, try_charge -> commit or cancel: the page is locked. 2014 * And when try_charge() successfully returns, one refcnt to the memcg, without 2015 * a struct page_cgroup, is acquired. This refcnt will be consumed by 2016 * "commit()" or released by "cancel()". 2017 */ 2018int mem_cgroup_try_charge_swapin(struct mm_struct *mm, 2019 struct page *page, 2020 gfp_t mask, struct mem_cgroup **ptr) 2021{ 2022 struct mem_cgroup *mem; 2023 int ret; 2024 2025 if (mem_cgroup_disabled()) 2026 return 0; 2027 2028 if (!do_swap_account) 2029 goto charge_cur_mm; 2030 /* 2031 * A racing thread's fault, or swapoff, may have already updated 2032 * the pte, and even removed page from swap cache: in those cases 2033 * do_swap_page()'s pte_same() test will fail; but there's also a 2034 * KSM case which does need to charge the page. 2035 */ 2036 if (!PageSwapCache(page)) 2037 goto charge_cur_mm; 2038 mem = try_get_mem_cgroup_from_page(page); 2039 if (!mem) 2040 goto charge_cur_mm; 2041 *ptr = mem; 2042 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); 2043 /* drop extra refcnt from tryget */ 2044 css_put(&mem->css); 2045 return ret; 2046charge_cur_mm: 2047 if (unlikely(!mm)) 2048 mm = &init_mm; 2049 return __mem_cgroup_try_charge(mm, mask, ptr, true); 2050} 2051 2052static void 2053__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, 2054 enum charge_type ctype) 2055{ 2056 struct page_cgroup *pc; 2057 2058 if (mem_cgroup_disabled()) 2059 return; 2060 if (!ptr) 2061 return; 2062 cgroup_exclude_rmdir(&ptr->css); 2063 pc = lookup_page_cgroup(page); 2064 mem_cgroup_lru_del_before_commit_swapcache(page); 2065 __mem_cgroup_commit_charge(ptr, pc, ctype); 2066 mem_cgroup_lru_add_after_commit_swapcache(page); 2067 /* 2068 * Now the swap is in memory. This means the page may be counted both 2069 * as mem and as swap: a double count. 2070 * Fix it by uncharging from memsw. Basically, this SwapCache is stable 2071 * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page() 2072 * may call delete_from_swap_cache() before we reach here. 2073 */ 2074 if (do_swap_account && PageSwapCache(page)) { 2075 swp_entry_t ent = {.val = page_private(page)}; 2076 unsigned short id; 2077 struct mem_cgroup *memcg; 2078 2079 id = swap_cgroup_record(ent, 0); 2080 rcu_read_lock(); 2081 memcg = mem_cgroup_lookup(id); 2082 if (memcg) { 2083 /* 2084 * The recorded memcg can be an obsolete one. So, avoid 2085 * calling css_tryget 2086 */ 2087 if (!mem_cgroup_is_root(memcg)) 2088 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 2089 mem_cgroup_swap_statistics(memcg, false); 2090 mem_cgroup_put(memcg); 2091 } 2092 rcu_read_unlock(); 2093 } 2094 /* 2095 * At swap-in, we may charge against a cgroup which has no tasks, so 2096 * rmdir()->pre_destroy() can be called while we do this charge. 2097 * In that case, we need to call pre_destroy() again. Check it here. 2098 */ 2099 cgroup_release_and_wakeup_rmdir(&ptr->css); 2100} 2101 2102void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) 2103{ 2104 __mem_cgroup_commit_charge_swapin(page, ptr, 2105 MEM_CGROUP_CHARGE_TYPE_MAPPED); 2106} 2107 2108void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) 2109{ 2110 if (mem_cgroup_disabled()) 2111 return; 2112 if (!mem) 2113 return; 2114 mem_cgroup_cancel_charge(mem); 2115} 2116 2117static void 2118__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) 2119{ 2120 struct memcg_batch_info *batch = NULL; 2121 bool uncharge_memsw = true; 2122 /* If swapout, usage of swap doesn't decrease */ 2123 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 2124 uncharge_memsw = false;
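	/*
	 * (At swapout the page leaves memory but the swap entry keeps
	 * holding the mem+swap charge, so only "res" may be given back
	 * here; "memsw" is uncharged later, from
	 * mem_cgroup_uncharge_swap(), when the swap entry itself is
	 * freed.)
	 */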
2125 /* 2126 * do_batch > 0 when unmapping pages or doing inode invalidate/truncate. 2127 * In those cases, all pages freed continuously can be expected to be in 2128 * the same cgroup, and we have a chance to coalesce uncharges. 2129 * But we do uncharge one by one if this task was killed by OOM 2130 * (TIF_MEMDIE), because we want to do the uncharge as soon as possible. 2131 */ 2132 if (!current->memcg_batch.do_batch || test_thread_flag(TIF_MEMDIE)) 2133 goto direct_uncharge; 2134 2135 batch = &current->memcg_batch; 2136 /* 2137 * Usually, we do css_get() when we remember a memcg pointer. 2138 * But in this case, we keep res->usage until the end of a series of 2139 * uncharges. Then, it's ok to ignore memcg's refcnt. 2140 */ 2141 if (!batch->memcg) 2142 batch->memcg = mem; 2143 /* 2144 * In the typical case, batch->memcg == mem. This means we can 2145 * merge a series of uncharges into one uncharge of the res_counter. 2146 * If not, we uncharge the res_counter one by one. 2147 */ 2148 if (batch->memcg != mem) 2149 goto direct_uncharge; 2150 /* remember freed charge and uncharge it later */ 2151 batch->bytes += PAGE_SIZE; 2152 if (uncharge_memsw) 2153 batch->memsw_bytes += PAGE_SIZE; 2154 return; 2155direct_uncharge: 2156 res_counter_uncharge(&mem->res, PAGE_SIZE); 2157 if (uncharge_memsw) 2158 res_counter_uncharge(&mem->memsw, PAGE_SIZE); 2159 return; 2160} 2161 2162/* 2163 * uncharge if !page_mapped(page) 2164 */ 2165static struct mem_cgroup * 2166__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) 2167{ 2168 struct page_cgroup *pc; 2169 struct mem_cgroup *mem = NULL; 2170 struct mem_cgroup_per_zone *mz; 2171 2172 if (mem_cgroup_disabled()) 2173 return NULL; 2174 2175 if (PageSwapCache(page)) 2176 return NULL; 2177 2178 /* 2179 * Check if our page_cgroup is valid 2180 */ 2181 pc = lookup_page_cgroup(page); 2182 if (unlikely(!pc || !PageCgroupUsed(pc))) 2183 return NULL; 2184 2185 lock_page_cgroup(pc); 2186 2187 mem = pc->mem_cgroup; 2188 2189 if (!PageCgroupUsed(pc)) 2190 goto unlock_out; 2191 2192 switch (ctype) { 2193 case MEM_CGROUP_CHARGE_TYPE_MAPPED: 2194 case MEM_CGROUP_CHARGE_TYPE_DROP: 2195 if (page_mapped(page)) 2196 goto unlock_out; 2197 break; 2198 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: 2199 if (!PageAnon(page)) { /* Shared memory */ 2200 if (page->mapping && !page_is_file_cache(page)) 2201 goto unlock_out; 2202 } else if (page_mapped(page)) /* Anon */ 2203 goto unlock_out; 2204 break; 2205 default: 2206 break; 2207 } 2208 2209 if (!mem_cgroup_is_root(mem)) 2210 __do_uncharge(mem, ctype); 2211 if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 2212 mem_cgroup_swap_statistics(mem, true); 2213 mem_cgroup_charge_statistics(mem, pc, false); 2214 2215 ClearPageCgroupUsed(pc); 2216 /* 2217 * pc->mem_cgroup is not cleared here. It will be accessed when the page 2218 * is freed from the LRU. This is safe because an uncharged page is 2219 * expected not to be reused (it is freed soon). The exception is 2220 * SwapCache, which is handled by special functions. 2221 */ 2222 2223 mz = page_cgroup_zoneinfo(pc); 2224 unlock_page_cgroup(pc); 2225 2226 memcg_check_events(mem, page); 2227 /* at swapout, this memcg will be accessed to record to swap */ 2228 if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 2229 css_put(&mem->css); 2230 2231 return mem; 2232 2233unlock_out: 2234 unlock_page_cgroup(pc); 2235 return NULL; 2236} 2237
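/*
 * How the batching above is driven (a sketch; the real callers are the
 * unmap and truncate paths):
 *
 *	mem_cgroup_uncharge_start();
 *	for each page freed in this batch
 *		mem_cgroup_uncharge_page(page);
 *	mem_cgroup_uncharge_end();
 *
 * so that a long series of uncharges is folded into a single
 * res_counter_uncharge() per counter whenever possible.
 */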
2238void mem_cgroup_uncharge_page(struct page *page) 2239{ 2240 /* early check. */ 2241 if (page_mapped(page)) 2242 return; 2243 if (page->mapping && !PageAnon(page)) 2244 return; 2245 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); 2246} 2247 2248void mem_cgroup_uncharge_cache_page(struct page *page) 2249{ 2250 VM_BUG_ON(page_mapped(page)); 2251 VM_BUG_ON(page->mapping); 2252 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); 2253} 2254 2255/* 2256 * uncharge_start/uncharge_end are called from unmap_page_range() and the 2257 * invalidate/truncate paths. In those cases, pages are freed continuously 2258 * and we can expect them to be in the same memcg. The callers themselves 2259 * limit the number of pages freed at once, so uncharge_start/end() is 2260 * called properly. This may be called multiple (typically two) times in 2261 * one context. 2262 */ 2263void mem_cgroup_uncharge_start(void) 2264{ 2265 current->memcg_batch.do_batch++; 2266 /* These calls can nest. */ 2267 if (current->memcg_batch.do_batch == 1) { 2268 current->memcg_batch.memcg = NULL; 2269 current->memcg_batch.bytes = 0; 2270 current->memcg_batch.memsw_bytes = 0; 2271 } 2272} 2273 2274void mem_cgroup_uncharge_end(void) 2275{ 2276 struct memcg_batch_info *batch = &current->memcg_batch; 2277 2278 if (!batch->do_batch) 2279 return; 2280 2281 batch->do_batch--; 2282 if (batch->do_batch) /* If stacked, do nothing. */ 2283 return; 2284 2285 if (!batch->memcg) 2286 return; 2287 /* 2288 * This "batch->memcg" is valid without any css_get/put etc., 2289 * because we hide charges behind us. 2290 */ 2291 if (batch->bytes) 2292 res_counter_uncharge(&batch->memcg->res, batch->bytes); 2293 if (batch->memsw_bytes) 2294 res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes); 2295 /* forget this pointer (for sanity check) */ 2296 batch->memcg = NULL; 2297} 2298 2299#ifdef CONFIG_SWAP 2300/* 2301 * Called after __delete_from_swap_cache(); drops the "page" account. 2302 * The memcg information is recorded in the swap_cgroup of "ent". 2303 */ 2304void 2305mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) 2306{ 2307 struct mem_cgroup *memcg; 2308 int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT; 2309 2310 if (!swapout) /* this was a swap cache but the swap is unused! */ 2311 ctype = MEM_CGROUP_CHARGE_TYPE_DROP; 2312 2313 memcg = __mem_cgroup_uncharge_common(page, ctype); 2314 2315 /* record memcg information */ 2316 if (do_swap_account && swapout && memcg) { 2317 rcu_read_lock(); 2318 swap_cgroup_record(ent, css_id(&memcg->css)); 2319 rcu_read_unlock(); 2320 mem_cgroup_get(memcg); 2321 } 2322 if (swapout && memcg) 2323 css_put(&memcg->css); 2324} 2325#endif 2326 2327#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 2328/* 2329 * Called from swap_entry_free(). Removes the record in swap_cgroup and 2330 * uncharges the "memsw" account. 2331 */ 2332void mem_cgroup_uncharge_swap(swp_entry_t ent) 2333{ 2334 struct mem_cgroup *memcg; 2335 unsigned short id; 2336 2337 if (!do_swap_account) 2338 return; 2339 2340 id = swap_cgroup_record(ent, 0); 2341 rcu_read_lock(); 2342 memcg = mem_cgroup_lookup(id); 2343 if (memcg) { 2344 /* 2345 * We uncharge this because the swap is freed. This memcg can 2346 * be an obsolete one. We avoid calling css_tryget 2347 */ 2348 if (!mem_cgroup_is_root(memcg)) 2349 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 2350 mem_cgroup_swap_statistics(memcg, false); 2351 mem_cgroup_put(memcg); 2352 } 2353 rcu_read_unlock(); 2354} 2355 2356/** 2357 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
2358 * @entry: swap entry to be moved 2359 * @from: mem_cgroup which the entry is moved from 2360 * @to: mem_cgroup which the entry is moved to 2361 * @need_fixup: whether we should fixup res_counters and refcounts. 2362 * 2363 * It succeeds only when the swap_cgroup's record for this entry is the same 2364 * as the mem_cgroup's id of @from. 2365 * 2366 * Returns 0 on success, -EINVAL on failure. 2367 * 2368 * The caller must have charged to @to, IOW, called res_counter_charge() about 2369 * both res and memsw, and called css_get(). 2370 */ 2371static int mem_cgroup_move_swap_account(swp_entry_t entry, 2372 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) 2373{ 2374 unsigned short old_id, new_id; 2375 2376 rcu_read_lock(); 2377 old_id = css_id(&from->css); 2378 new_id = css_id(&to->css); 2379 rcu_read_unlock(); 2380 2381 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { 2382 mem_cgroup_swap_statistics(from, false); 2383 mem_cgroup_swap_statistics(to, true); 2384 /* 2385 * This function is only called from task migration context now. 2386 * It postpones res_counter and refcount handling till the end 2387 * of task migration(mem_cgroup_clear_mc()) for performance 2388 * improvement. But we cannot postpone mem_cgroup_get(to) 2389 * because if the process that has been moved to @to does 2390 * swap-in, the refcount of @to might be decreased to 0. 2391 */ 2392 mem_cgroup_get(to); 2393 if (need_fixup) { 2394 if (!mem_cgroup_is_root(from)) 2395 res_counter_uncharge(&from->memsw, PAGE_SIZE); 2396 mem_cgroup_put(from); 2397 /* 2398 * we charged both to->res and to->memsw, so we should 2399 * uncharge to->res. 2400 */ 2401 if (!mem_cgroup_is_root(to)) 2402 res_counter_uncharge(&to->res, PAGE_SIZE); 2403 css_put(&to->css); 2404 } 2405 return 0; 2406 } 2407 return -EINVAL; 2408} 2409#else 2410static inline int mem_cgroup_move_swap_account(swp_entry_t entry, 2411 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) 2412{ 2413 return -EINVAL; 2414} 2415#endif 2416 2417/* 2418 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old 2419 * page belongs to. 2420 */ 2421int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) 2422{ 2423 struct page_cgroup *pc; 2424 struct mem_cgroup *mem = NULL; 2425 int ret = 0; 2426 2427 if (mem_cgroup_disabled()) 2428 return 0; 2429 2430 pc = lookup_page_cgroup(page); 2431 lock_page_cgroup(pc); 2432 if (PageCgroupUsed(pc)) { 2433 mem = pc->mem_cgroup; 2434 css_get(&mem->css); 2435 } 2436 unlock_page_cgroup(pc); 2437 2438 if (mem) { 2439 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); 2440 css_put(&mem->css); 2441 } 2442 *ptr = mem; 2443 return ret; 2444} 2445 2446/* remove redundant charge if migration failed*/ 2447void mem_cgroup_end_migration(struct mem_cgroup *mem, 2448 struct page *oldpage, struct page *newpage) 2449{ 2450 struct page *target, *unused; 2451 struct page_cgroup *pc; 2452 enum charge_type ctype; 2453 2454 if (!mem) 2455 return; 2456 cgroup_exclude_rmdir(&mem->css); 2457 /* at migration success, oldpage->mapping is NULL. */ 2458 if (oldpage->mapping) { 2459 target = oldpage; 2460 unused = NULL; 2461 } else { 2462 target = newpage; 2463 unused = oldpage; 2464 } 2465 2466 if (PageAnon(target)) 2467 ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; 2468 else if (page_is_file_cache(target)) 2469 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; 2470 else 2471 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; 2472 2473 /* unused page is not on radix-tree now. 
*/ 2474 if (unused) 2475 __mem_cgroup_uncharge_common(unused, ctype); 2476 2477 pc = lookup_page_cgroup(target); 2478 /* 2479 * __mem_cgroup_commit_charge() checks the PCG_USED bit of the page_cgroup. 2480 * So, double-counting is effectively avoided. 2481 */ 2482 __mem_cgroup_commit_charge(mem, pc, ctype); 2483 2484 /* 2485 * Both oldpage and newpage are still under lock_page(), 2486 * so we don't have to care about races in the radix-tree. 2487 * But we do have to be careful about whether the page is mapped. 2488 * 2489 * There is a case for !page_mapped(): at the start of 2490 * migration, oldpage was mapped, but now it's zapped. 2491 * But we know the *target* page is not freed/reused under us. 2492 * mem_cgroup_uncharge_page() does all the necessary checks. 2493 */ 2494 if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) 2495 mem_cgroup_uncharge_page(target); 2496 /* 2497 * At migration, we may charge against a cgroup which has no tasks, 2498 * so rmdir()->pre_destroy() can be called while we do this charge. 2499 * In that case, we need to call pre_destroy() again. Check it here. 2500 */ 2501 cgroup_release_and_wakeup_rmdir(&mem->css); 2502} 2503 2504/* 2505 * A call to try to shrink memory usage on charge failure at shmem's swapin. 2506 * Calling hierarchical_reclaim is not enough because we should update 2507 * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM. 2508 * Moreover, considering the hierarchy, we should reclaim from the mem_over_limit, 2509 * not from the memcg which this page would be charged to. 2510 * try_charge_swapin() does all of this work properly. 2511 */ 2512int mem_cgroup_shmem_charge_fallback(struct page *page, 2513 struct mm_struct *mm, 2514 gfp_t gfp_mask) 2515{ 2516 struct mem_cgroup *mem = NULL; 2517 int ret; 2518 2519 if (mem_cgroup_disabled()) 2520 return 0; 2521 2522 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); 2523 if (!ret) 2524 mem_cgroup_cancel_charge_swapin(mem); /* it does the !mem check */ 2525 2526 return ret; 2527} 2528 2529static DEFINE_MUTEX(set_limit_mutex); 2530 2531static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, 2532 unsigned long long val) 2533{ 2534 int retry_count; 2535 u64 memswlimit; 2536 int ret = 0; 2537 int children = mem_cgroup_count_children(memcg); 2538 u64 curusage, oldusage; 2539 2540 /* 2541 * To keep hierarchical_reclaim simple, how long we should retry 2542 * depends on the caller. We set our retry-count to be a function 2543 * of the number of children we should visit in this loop. 2544 */ 2545 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children; 2546 2547 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); 2548 2549 while (retry_count) { 2550 if (signal_pending(current)) { 2551 ret = -EINTR; 2552 break; 2553 } 2554 /* 2555 * Rather than hiding all this in some function, it is done in an 2556 * open-coded manner, so you can see what it really does. 2557 * We have to guarantee mem->res.limit <= mem->memsw.limit.
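*
* Worked example (illustrative values): with memsw.limit_in_bytes at
* 512M, writing 1G to limit_in_bytes fails with -EINVAL below, because
* mem alone may never exceed mem+swap; writing 256M succeeds, with the
* reclaim below first bringing usage under the new limit if necessary.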
2558 */ 2559 mutex_lock(&set_limit_mutex); 2560 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 2561 if (memswlimit < val) { 2562 ret = -EINVAL; 2563 mutex_unlock(&set_limit_mutex); 2564 break; 2565 } 2566 ret = res_counter_set_limit(&memcg->res, val); 2567 if (!ret) { 2568 if (memswlimit == val) 2569 memcg->memsw_is_minimum = true; 2570 else 2571 memcg->memsw_is_minimum = false; 2572 } 2573 mutex_unlock(&set_limit_mutex); 2574 2575 if (!ret) 2576 break; 2577 2578 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, 2579 MEM_CGROUP_RECLAIM_SHRINK); 2580 curusage = res_counter_read_u64(&memcg->res, RES_USAGE); 2581 /* Was the usage reduced? */ 2582 if (curusage >= oldusage) 2583 retry_count--; 2584 else 2585 oldusage = curusage; 2586 } 2587 2588 return ret; 2589} 2590 2591static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, 2592 unsigned long long val) 2593{ 2594 int retry_count; 2595 u64 memlimit, oldusage, curusage; 2596 int children = mem_cgroup_count_children(memcg); 2597 int ret = -EBUSY; 2598 2599 /* see mem_cgroup_resize_limit */ 2600 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; 2601 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 2602 while (retry_count) { 2603 if (signal_pending(current)) { 2604 ret = -EINTR; 2605 break; 2606 } 2607 /* 2608 * Rather than hiding all this in some function, it is done in an 2609 * open-coded manner, so you can see what it really does. 2610 * We have to guarantee mem->res.limit <= mem->memsw.limit. 2611 */ 2612 mutex_lock(&set_limit_mutex); 2613 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 2614 if (memlimit > val) { 2615 ret = -EINVAL; 2616 mutex_unlock(&set_limit_mutex); 2617 break; 2618 } 2619 ret = res_counter_set_limit(&memcg->memsw, val); 2620 if (!ret) { 2621 if (memlimit == val) 2622 memcg->memsw_is_minimum = true; 2623 else 2624 memcg->memsw_is_minimum = false; 2625 } 2626 mutex_unlock(&set_limit_mutex); 2627 2628 if (!ret) 2629 break; 2630 2631 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, 2632 MEM_CGROUP_RECLAIM_NOSWAP | 2633 MEM_CGROUP_RECLAIM_SHRINK); 2634 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 2635 /* Was the usage reduced? */ 2636 if (curusage >= oldusage) 2637 retry_count--; 2638 else 2639 oldusage = curusage; 2640 } 2641 return ret; 2642} 2643 2644unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 2645 gfp_t gfp_mask, int nid, 2646 int zid) 2647{ 2648 unsigned long nr_reclaimed = 0; 2649 struct mem_cgroup_per_zone *mz, *next_mz = NULL; 2650 unsigned long reclaimed; 2651 int loop = 0; 2652 struct mem_cgroup_tree_per_zone *mctz; 2653 unsigned long long excess; 2654 2655 if (order > 0) 2656 return 0; 2657 2658 mctz = soft_limit_tree_node_zone(nid, zid); 2659 /* 2660 * This loop can run for a while, especially if mem_cgroups continuously 2661 * keep exceeding their soft limit and putting the system under 2662 * pressure 2663 */ 2664 do { 2665 if (next_mz) 2666 mz = next_mz; 2667 else 2668 mz = mem_cgroup_largest_soft_limit_node(mctz); 2669 if (!mz) 2670 break; 2671 2672 reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone, 2673 gfp_mask, 2674 MEM_CGROUP_RECLAIM_SOFT); 2675 nr_reclaimed += reclaimed; 2676 spin_lock(&mctz->lock); 2677 2678 /* 2679 * If we failed to reclaim anything from this memory cgroup 2680 * it is time to move on to the next cgroup 2681 */ 2682 next_mz = NULL; 2683 if (!reclaimed) { 2684 do { 2685 /* 2686 * Loop until we find yet another one.
2687 * 2688 * By the time we get the soft_limit lock 2689 * again, someone might have aded the 2690 * group back on the RB tree. Iterate to 2691 * make sure we get a different mem. 2692 * mem_cgroup_largest_soft_limit_node returns 2693 * NULL if no other cgroup is present on 2694 * the tree 2695 */ 2696 next_mz = 2697 __mem_cgroup_largest_soft_limit_node(mctz); 2698 if (next_mz == mz) { 2699 css_put(&next_mz->mem->css); 2700 next_mz = NULL; 2701 } else /* next_mz == NULL or other memcg */ 2702 break; 2703 } while (1); 2704 } 2705 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); 2706 excess = res_counter_soft_limit_excess(&mz->mem->res); 2707 /* 2708 * One school of thought says that we should not add 2709 * back the node to the tree if reclaim returns 0. 2710 * But our reclaim could return 0, simply because due 2711 * to priority we are exposing a smaller subset of 2712 * memory to reclaim from. Consider this as a longer 2713 * term TODO. 2714 */ 2715 /* If excess == 0, no tree ops */ 2716 __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess); 2717 spin_unlock(&mctz->lock); 2718 css_put(&mz->mem->css); 2719 loop++; 2720 /* 2721 * Could not reclaim anything and there are no more 2722 * mem cgroups to try or we seem to be looping without 2723 * reclaiming anything. 2724 */ 2725 if (!nr_reclaimed && 2726 (next_mz == NULL || 2727 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) 2728 break; 2729 } while (!nr_reclaimed); 2730 if (next_mz) 2731 css_put(&next_mz->mem->css); 2732 return nr_reclaimed; 2733} 2734 2735/* 2736 * This routine traverse page_cgroup in given list and drop them all. 2737 * *And* this routine doesn't reclaim page itself, just removes page_cgroup. 2738 */ 2739static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, 2740 int node, int zid, enum lru_list lru) 2741{ 2742 struct zone *zone; 2743 struct mem_cgroup_per_zone *mz; 2744 struct page_cgroup *pc, *busy; 2745 unsigned long flags, loop; 2746 struct list_head *list; 2747 int ret = 0; 2748 2749 zone = &NODE_DATA(node)->node_zones[zid]; 2750 mz = mem_cgroup_zoneinfo(mem, node, zid); 2751 list = &mz->lists[lru]; 2752 2753 loop = MEM_CGROUP_ZSTAT(mz, lru); 2754 /* give some margin against EBUSY etc...*/ 2755 loop += 256; 2756 busy = NULL; 2757 while (loop--) { 2758 ret = 0; 2759 spin_lock_irqsave(&zone->lru_lock, flags); 2760 if (list_empty(list)) { 2761 spin_unlock_irqrestore(&zone->lru_lock, flags); 2762 break; 2763 } 2764 pc = list_entry(list->prev, struct page_cgroup, lru); 2765 if (busy == pc) { 2766 list_move(&pc->lru, list); 2767 busy = NULL; 2768 spin_unlock_irqrestore(&zone->lru_lock, flags); 2769 continue; 2770 } 2771 spin_unlock_irqrestore(&zone->lru_lock, flags); 2772 2773 ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL); 2774 if (ret == -ENOMEM) 2775 break; 2776 2777 if (ret == -EBUSY || ret == -EINVAL) { 2778 /* found lock contention or "pc" is obsolete. */ 2779 busy = pc; 2780 cond_resched(); 2781 } else 2782 busy = NULL; 2783 } 2784 2785 if (!ret && !list_empty(list)) 2786 return -EBUSY; 2787 return ret; 2788} 2789 2790/* 2791 * make mem_cgroup's charge to be 0 if there is no task. 2792 * This enables deleting this mem_cgroup. 2793 */ 2794static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all) 2795{ 2796 int ret; 2797 int node, zid, shrink; 2798 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 2799 struct cgroup *cgrp = mem->css.cgroup; 2800 2801 css_get(&mem->css); 2802 2803 shrink = 0; 2804 /* should free all ? 
*/ 2805 if (free_all) 2806 goto try_to_free; 2807move_account: 2808 do { 2809 ret = -EBUSY; 2810 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) 2811 goto out; 2812 ret = -EINTR; 2813 if (signal_pending(current)) 2814 goto out; 2815 /* This is for making sure all *used* pages are on an LRU. */ 2816 lru_add_drain_all(); 2817 drain_all_stock_sync(); 2818 ret = 0; 2819 for_each_node_state(node, N_HIGH_MEMORY) { 2820 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { 2821 enum lru_list l; 2822 for_each_lru(l) { 2823 ret = mem_cgroup_force_empty_list(mem, 2824 node, zid, l); 2825 if (ret) 2826 break; 2827 } 2828 } 2829 if (ret) 2830 break; 2831 } 2832 /* it seems the parent cgroup doesn't have enough memory */ 2833 if (ret == -ENOMEM) 2834 goto try_to_free; 2835 cond_resched(); 2836 /* "ret" should also be checked to ensure all lists are empty. */ 2837 } while (mem->res.usage > 0 || ret); 2838out: 2839 css_put(&mem->css); 2840 return ret; 2841 2842try_to_free: 2843 /* returns -EBUSY if there is a task or if we come here twice. */ 2844 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) { 2845 ret = -EBUSY; 2846 goto out; 2847 } 2848 /* we call try-to-free pages to make this cgroup empty */ 2849 lru_add_drain_all(); 2850 /* try to free all pages in this cgroup */ 2851 shrink = 1; 2852 while (nr_retries && mem->res.usage > 0) { 2853 int progress; 2854 2855 if (signal_pending(current)) { 2856 ret = -EINTR; 2857 goto out; 2858 } 2859 progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL, 2860 false, get_swappiness(mem)); 2861 if (!progress) { 2862 nr_retries--; 2863 /* maybe some writeback is necessary */ 2864 congestion_wait(BLK_RW_ASYNC, HZ/10); 2865 } 2866 2867 } 2868 lru_add_drain(); 2869 /* try move_account...there may be some *locked* pages. */ 2870 goto move_account; 2871} 2872 2873int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) 2874{ 2875 return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true); 2876} 2877 2878 2879static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft) 2880{ 2881 return mem_cgroup_from_cont(cont)->use_hierarchy; 2882} 2883 2884static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, 2885 u64 val) 2886{ 2887 int retval = 0; 2888 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 2889 struct cgroup *parent = cont->parent; 2890 struct mem_cgroup *parent_mem = NULL; 2891 2892 if (parent) 2893 parent_mem = mem_cgroup_from_cont(parent); 2894 2895 cgroup_lock(); 2896 /* 2897 * If the parent's use_hierarchy is set, we can't make any modifications 2898 * in the child subtrees. If it is unset, then the change can 2899 * occur, provided the current cgroup has no children. 2900 * 2901 * For the root cgroup, parent_mem is NULL, and we allow the value to be 2902 * set if there are no children.
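*
* Example (illustrative): "echo 1 > memory.use_hierarchy" succeeds only
* while this cgroup has no children and no ancestor has already made
* the subtree hierarchical; otherwise it fails with -EBUSY or -EINVAL
* below.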
2903 */ 2904 if ((!parent_mem || !parent_mem->use_hierarchy) && 2905 (val == 1 || val == 0)) { 2906 if (list_empty(&cont->children)) 2907 mem->use_hierarchy = val; 2908 else 2909 retval = -EBUSY; 2910 } else 2911 retval = -EINVAL; 2912 cgroup_unlock(); 2913 2914 return retval; 2915} 2916 2917struct mem_cgroup_idx_data { 2918 s64 val; 2919 enum mem_cgroup_stat_index idx; 2920}; 2921 2922static int 2923mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data) 2924{ 2925 struct mem_cgroup_idx_data *d = data; 2926 d->val += mem_cgroup_read_stat(mem, d->idx); 2927 return 0; 2928} 2929 2930static void 2931mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem, 2932 enum mem_cgroup_stat_index idx, s64 *val) 2933{ 2934 struct mem_cgroup_idx_data d; 2935 d.idx = idx; 2936 d.val = 0; 2937 mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat); 2938 *val = d.val; 2939} 2940 2941static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap) 2942{ 2943 u64 idx_val, val; 2944 2945 if (!mem_cgroup_is_root(mem)) { 2946 if (!swap) 2947 return res_counter_read_u64(&mem->res, RES_USAGE); 2948 else 2949 return res_counter_read_u64(&mem->memsw, RES_USAGE); 2950 } 2951 2952 mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE, &idx_val); 2953 val = idx_val; 2954 mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS, &idx_val); 2955 val += idx_val; 2956 2957 if (swap) { 2958 mem_cgroup_get_recursive_idx_stat(mem, 2959 MEM_CGROUP_STAT_SWAPOUT, &idx_val); 2960 val += idx_val; 2961 } 2962 2963 return val << PAGE_SHIFT; 2964} 2965 2966static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) 2967{ 2968 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 2969 u64 val; 2970 int type, name; 2971 2972 type = MEMFILE_TYPE(cft->private); 2973 name = MEMFILE_ATTR(cft->private); 2974 switch (type) { 2975 case _MEM: 2976 if (name == RES_USAGE) 2977 val = mem_cgroup_usage(mem, false); 2978 else 2979 val = res_counter_read_u64(&mem->res, name); 2980 break; 2981 case _MEMSWAP: 2982 if (name == RES_USAGE) 2983 val = mem_cgroup_usage(mem, true); 2984 else 2985 val = res_counter_read_u64(&mem->memsw, name); 2986 break; 2987 default: 2988 BUG(); 2989 break; 2990 } 2991 return val; 2992} 2993/* 2994 * The user of this function is... 2995 * RES_LIMIT. 2996 */ 2997static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, 2998 const char *buffer) 2999{ 3000 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 3001 int type, name; 3002 unsigned long long val; 3003 int ret; 3004 3005 type = MEMFILE_TYPE(cft->private); 3006 name = MEMFILE_ATTR(cft->private); 3007 switch (name) { 3008 case RES_LIMIT: 3009 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 3010 ret = -EINVAL; 3011 break; 3012 } 3013 /* This function does all necessary parse...reuse it */ 3014 ret = res_counter_memparse_write_strategy(buffer, &val); 3015 if (ret) 3016 break; 3017 if (type == _MEM) 3018 ret = mem_cgroup_resize_limit(memcg, val); 3019 else 3020 ret = mem_cgroup_resize_memsw_limit(memcg, val); 3021 break; 3022 case RES_SOFT_LIMIT: 3023 ret = res_counter_memparse_write_strategy(buffer, &val); 3024 if (ret) 3025 break; 3026 /* 3027 * For memsw, soft limits are hard to implement in terms 3028 * of semantics, for now, we support soft limits for 3029 * control without swap 3030 */ 3031 if (type == _MEM) 3032 ret = res_counter_set_soft_limit(&memcg->res, val); 3033 else 3034 ret = -EINVAL; 3035 break; 3036 default: 3037 ret = -EINVAL; /* should be BUG() ? 
*/ 3038 break; 3039 } 3040 return ret; 3041} 3042 3043static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, 3044 unsigned long long *mem_limit, unsigned long long *memsw_limit) 3045{ 3046 struct cgroup *cgroup; 3047 unsigned long long min_limit, min_memsw_limit, tmp; 3048 3049 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT); 3050 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3051 cgroup = memcg->css.cgroup; 3052 if (!memcg->use_hierarchy) 3053 goto out; 3054 3055 while (cgroup->parent) { 3056 cgroup = cgroup->parent; 3057 memcg = mem_cgroup_from_cont(cgroup); 3058 if (!memcg->use_hierarchy) 3059 break; 3060 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT); 3061 min_limit = min(min_limit, tmp); 3062 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3063 min_memsw_limit = min(min_memsw_limit, tmp); 3064 } 3065out: 3066 *mem_limit = min_limit; 3067 *memsw_limit = min_memsw_limit; 3068 return; 3069} 3070 3071static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 3072{ 3073 struct mem_cgroup *mem; 3074 int type, name; 3075 3076 mem = mem_cgroup_from_cont(cont); 3077 type = MEMFILE_TYPE(event); 3078 name = MEMFILE_ATTR(event); 3079 switch (name) { 3080 case RES_MAX_USAGE: 3081 if (type == _MEM) 3082 res_counter_reset_max(&mem->res); 3083 else 3084 res_counter_reset_max(&mem->memsw); 3085 break; 3086 case RES_FAILCNT: 3087 if (type == _MEM) 3088 res_counter_reset_failcnt(&mem->res); 3089 else 3090 res_counter_reset_failcnt(&mem->memsw); 3091 break; 3092 } 3093 3094 return 0; 3095} 3096 3097static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp, 3098 struct cftype *cft) 3099{ 3100 return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate; 3101} 3102 3103#ifdef CONFIG_MMU 3104static int mem_cgroup_move_charge_write(struct cgroup *cgrp, 3105 struct cftype *cft, u64 val) 3106{ 3107 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 3108 3109 if (val >= (1 << NR_MOVE_TYPE)) 3110 return -EINVAL; 3111 /* 3112 * We check this value several times in both in can_attach() and 3113 * attach(), so we need cgroup lock to prevent this value from being 3114 * inconsistent. 
3115 */ 3116 cgroup_lock(); 3117 mem->move_charge_at_immigrate = val; 3118 cgroup_unlock(); 3119 3120 return 0; 3121} 3122#else 3123static int mem_cgroup_move_charge_write(struct cgroup *cgrp, 3124 struct cftype *cft, u64 val) 3125{ 3126 return -ENOSYS; 3127} 3128#endif 3129 3130 3131/* For read statistics */ 3132enum { 3133 MCS_CACHE, 3134 MCS_RSS, 3135 MCS_FILE_MAPPED, 3136 MCS_PGPGIN, 3137 MCS_PGPGOUT, 3138 MCS_SWAP, 3139 MCS_INACTIVE_ANON, 3140 MCS_ACTIVE_ANON, 3141 MCS_INACTIVE_FILE, 3142 MCS_ACTIVE_FILE, 3143 MCS_UNEVICTABLE, 3144 NR_MCS_STAT, 3145}; 3146 3147struct mcs_total_stat { 3148 s64 stat[NR_MCS_STAT]; 3149}; 3150 3151struct { 3152 char *local_name; 3153 char *total_name; 3154} memcg_stat_strings[NR_MCS_STAT] = { 3155 {"cache", "total_cache"}, 3156 {"rss", "total_rss"}, 3157 {"mapped_file", "total_mapped_file"}, 3158 {"pgpgin", "total_pgpgin"}, 3159 {"pgpgout", "total_pgpgout"}, 3160 {"swap", "total_swap"}, 3161 {"inactive_anon", "total_inactive_anon"}, 3162 {"active_anon", "total_active_anon"}, 3163 {"inactive_file", "total_inactive_file"}, 3164 {"active_file", "total_active_file"}, 3165 {"unevictable", "total_unevictable"} 3166}; 3167 3168 3169static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data) 3170{ 3171 struct mcs_total_stat *s = data; 3172 s64 val; 3173 3174 /* per cpu stat */ 3175 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); 3176 s->stat[MCS_CACHE] += val * PAGE_SIZE; 3177 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); 3178 s->stat[MCS_RSS] += val * PAGE_SIZE; 3179 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED); 3180 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; 3181 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT); 3182 s->stat[MCS_PGPGIN] += val; 3183 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT); 3184 s->stat[MCS_PGPGOUT] += val; 3185 if (do_swap_account) { 3186 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); 3187 s->stat[MCS_SWAP] += val * PAGE_SIZE; 3188 } 3189 3190 /* per zone stat */ 3191 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); 3192 s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE; 3193 val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON); 3194 s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE; 3195 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE); 3196 s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE; 3197 val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE); 3198 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; 3199 val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE); 3200 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; 3201 return 0; 3202} 3203 3204static void 3205mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) 3206{ 3207 mem_cgroup_walk_tree(mem, s, mem_cgroup_get_local_stat); 3208} 3209 3210static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, 3211 struct cgroup_map_cb *cb) 3212{ 3213 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); 3214 struct mcs_total_stat mystat; 3215 int i; 3216 3217 memset(&mystat, 0, sizeof(mystat)); 3218 mem_cgroup_get_local_stat(mem_cont, &mystat); 3219 3220 for (i = 0; i < NR_MCS_STAT; i++) { 3221 if (i == MCS_SWAP && !do_swap_account) 3222 continue; 3223 cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]); 3224 } 3225 3226 /* Hierarchical information */ 3227 { 3228 unsigned long long limit, memsw_limit; 3229 memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit); 3230 cb->fill(cb, "hierarchical_memory_limit", limit); 3231 if 
(do_swap_account) 3232 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit); 3233 } 3234 3235 memset(&mystat, 0, sizeof(mystat)); 3236 mem_cgroup_get_total_stat(mem_cont, &mystat); 3237 for (i = 0; i < NR_MCS_STAT; i++) { 3238 if (i == MCS_SWAP && !do_swap_account) 3239 continue; 3240 cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]); 3241 } 3242 3243#ifdef CONFIG_DEBUG_VM 3244 cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); 3245 3246 { 3247 int nid, zid; 3248 struct mem_cgroup_per_zone *mz; 3249 unsigned long recent_rotated[2] = {0, 0}; 3250 unsigned long recent_scanned[2] = {0, 0}; 3251 3252 for_each_online_node(nid) 3253 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 3254 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); 3255 3256 recent_rotated[0] += 3257 mz->reclaim_stat.recent_rotated[0]; 3258 recent_rotated[1] += 3259 mz->reclaim_stat.recent_rotated[1]; 3260 recent_scanned[0] += 3261 mz->reclaim_stat.recent_scanned[0]; 3262 recent_scanned[1] += 3263 mz->reclaim_stat.recent_scanned[1]; 3264 } 3265 cb->fill(cb, "recent_rotated_anon", recent_rotated[0]); 3266 cb->fill(cb, "recent_rotated_file", recent_rotated[1]); 3267 cb->fill(cb, "recent_scanned_anon", recent_scanned[0]); 3268 cb->fill(cb, "recent_scanned_file", recent_scanned[1]); 3269 } 3270#endif 3271 3272 return 0; 3273} 3274 3275static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft) 3276{ 3277 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 3278 3279 return get_swappiness(memcg); 3280} 3281 3282static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, 3283 u64 val) 3284{ 3285 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 3286 struct mem_cgroup *parent; 3287 3288 if (val > 100) 3289 return -EINVAL; 3290 3291 if (cgrp->parent == NULL) 3292 return -EINVAL; 3293 3294 parent = mem_cgroup_from_cont(cgrp->parent); 3295 3296 cgroup_lock(); 3297 3298 /* If under hierarchy, only empty-root can set this value */ 3299 if ((parent->use_hierarchy) || 3300 (memcg->use_hierarchy && !list_empty(&cgrp->children))) { 3301 cgroup_unlock(); 3302 return -EINVAL; 3303 } 3304 3305 spin_lock(&memcg->reclaim_param_lock); 3306 memcg->swappiness = val; 3307 spin_unlock(&memcg->reclaim_param_lock); 3308 3309 cgroup_unlock(); 3310 3311 return 0; 3312} 3313 3314static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) 3315{ 3316 struct mem_cgroup_threshold_ary *t; 3317 u64 usage; 3318 int i; 3319 3320 rcu_read_lock(); 3321 if (!swap) 3322 t = rcu_dereference(memcg->thresholds); 3323 else 3324 t = rcu_dereference(memcg->memsw_thresholds); 3325 3326 if (!t) 3327 goto unlock; 3328 3329 usage = mem_cgroup_usage(memcg, swap); 3330 3331 /* 3332 * current_threshold points to threshold just below usage. 3333 * If it's not true, a threshold was crossed after last 3334 * call of __mem_cgroup_threshold(). 3335 */ 3336 i = atomic_read(&t->current_threshold); 3337 3338 /* 3339 * Iterate backward over array of thresholds starting from 3340 * current_threshold and check if a threshold is crossed. 3341 * If none of thresholds below usage is crossed, we read 3342 * only one element of the array here. 3343 */ 3344 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) 3345 eventfd_signal(t->entries[i].eventfd, 1); 3346 3347 /* i = current_threshold + 1 */ 3348 i++; 3349 3350 /* 3351 * Iterate forward over array of thresholds starting from 3352 * current_threshold+1 and check if a threshold is crossed. 
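*
* Worked example (illustrative): with thresholds {4M, 8M, 16M} and
* usage grown from 7M to 9M, the backward scan above signals nothing,
* this forward scan signals the 8M eventfd, and current_threshold
* ends up pointing at the 8M entry.
*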
3353 * If none of the thresholds above usage is crossed, we read 3354 * only one element of the array here. 3355 */ 3356 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) 3357 eventfd_signal(t->entries[i].eventfd, 1); 3358 3359 /* Update current_threshold */ 3360 atomic_set(&t->current_threshold, i - 1); 3361unlock: 3362 rcu_read_unlock(); 3363} 3364 3365static void mem_cgroup_threshold(struct mem_cgroup *memcg) 3366{ 3367 __mem_cgroup_threshold(memcg, false); 3368 if (do_swap_account) 3369 __mem_cgroup_threshold(memcg, true); 3370} 3371 3372static int compare_thresholds(const void *a, const void *b) 3373{ 3374 const struct mem_cgroup_threshold *_a = a; 3375 const struct mem_cgroup_threshold *_b = b; 3376 3377 /* A u64 difference may not fit in an int, so compare explicitly. */ if (_a->threshold > _b->threshold) return 1; if (_a->threshold < _b->threshold) return -1; return 0; 3378} 3379 3380static int mem_cgroup_register_event(struct cgroup *cgrp, struct cftype *cft, 3381 struct eventfd_ctx *eventfd, const char *args) 3382{ 3383 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 3384 struct mem_cgroup_threshold_ary *thresholds, *thresholds_new; 3385 int type = MEMFILE_TYPE(cft->private); 3386 u64 threshold, usage; 3387 int size; 3388 int i, ret; 3389 3390 ret = res_counter_memparse_write_strategy(args, &threshold); 3391 if (ret) 3392 return ret; 3393 3394 mutex_lock(&memcg->thresholds_lock); 3395 if (type == _MEM) 3396 thresholds = memcg->thresholds; 3397 else if (type == _MEMSWAP) 3398 thresholds = memcg->memsw_thresholds; 3399 else 3400 BUG(); 3401 3402 usage = mem_cgroup_usage(memcg, type == _MEMSWAP); 3403 3404 /* Check if a threshold was crossed before adding a new one */ 3405 if (thresholds) 3406 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 3407 3408 if (thresholds) 3409 size = thresholds->size + 1; 3410 else 3411 size = 1; 3412 3413 /* Allocate memory for the new array of thresholds */ 3414 thresholds_new = kmalloc(sizeof(*thresholds_new) + 3415 size * sizeof(struct mem_cgroup_threshold), 3416 GFP_KERNEL); 3417 if (!thresholds_new) { 3418 ret = -ENOMEM; 3419 goto unlock; 3420 } 3421 thresholds_new->size = size; 3422 3423 /* Copy thresholds (if any) to the new array */ 3424 if (thresholds) 3425 memcpy(thresholds_new->entries, thresholds->entries, 3426 thresholds->size * 3427 sizeof(struct mem_cgroup_threshold)); 3428 /* Add the new threshold */ 3429 thresholds_new->entries[size - 1].eventfd = eventfd; 3430 thresholds_new->entries[size - 1].threshold = threshold; 3431 3432 /* Sort thresholds. Registering a new threshold isn't time-critical */ 3433 sort(thresholds_new->entries, size, 3434 sizeof(struct mem_cgroup_threshold), 3435 compare_thresholds, NULL); 3436 3437 /* Find the current threshold */ 3438 atomic_set(&thresholds_new->current_threshold, -1); 3439 for (i = 0; i < size; i++) { 3440 if (thresholds_new->entries[i].threshold < usage) { 3441 /* 3442 * thresholds_new->current_threshold will not be used 3443 * until rcu_assign_pointer(), so it's safe to increment 3444 * it here.
3445 */ 3446 atomic_inc(&thresholds_new->current_threshold); 3447 } 3448 } 3449 3450 if (type == _MEM) 3451 rcu_assign_pointer(memcg->thresholds, thresholds_new); 3452 else 3453 rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new); 3454 3455 /* To be sure that nobody uses the old thresholds before we free them */ 3456 synchronize_rcu(); 3457 3458 kfree(thresholds); 3459unlock: 3460 mutex_unlock(&memcg->thresholds_lock); 3461 3462 return ret; 3463} 3464 3465static int mem_cgroup_unregister_event(struct cgroup *cgrp, struct cftype *cft, 3466 struct eventfd_ctx *eventfd) 3467{ 3468 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 3469 struct mem_cgroup_threshold_ary *thresholds, *thresholds_new; 3470 int type = MEMFILE_TYPE(cft->private); 3471 u64 usage; 3472 int size = 0; 3473 int i, j, ret = 0; 3474 3475 mutex_lock(&memcg->thresholds_lock); 3476 if (type == _MEM) 3477 thresholds = memcg->thresholds; 3478 else if (type == _MEMSWAP) 3479 thresholds = memcg->memsw_thresholds; 3480 else 3481 BUG(); 3482 3483 /* 3484 * Something went wrong if we're trying to unregister a threshold 3485 * when we don't have any thresholds 3486 */ 3487 BUG_ON(!thresholds); 3488 3489 usage = mem_cgroup_usage(memcg, type == _MEMSWAP); 3490 3491 /* Check if a threshold was crossed before removing */ 3492 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 3493 3494 /* Calculate the new number of thresholds */ 3495 for (i = 0; i < thresholds->size; i++) { 3496 if (thresholds->entries[i].eventfd != eventfd) 3497 size++; 3498 } 3499 3500 /* Set the thresholds array to NULL if we don't have thresholds */ 3501 if (!size) { 3502 thresholds_new = NULL; 3503 goto assign; 3504 } 3505 3506 /* Allocate memory for the new array of thresholds */ 3507 thresholds_new = kmalloc(sizeof(*thresholds_new) + 3508 size * sizeof(struct mem_cgroup_threshold), 3509 GFP_KERNEL); 3510 if (!thresholds_new) { 3511 ret = -ENOMEM; 3512 goto unlock; 3513 } 3514 thresholds_new->size = size; 3515 3516 /* Copy thresholds and find the current threshold */ 3517 atomic_set(&thresholds_new->current_threshold, -1); 3518 for (i = 0, j = 0; i < thresholds->size; i++) { 3519 if (thresholds->entries[i].eventfd == eventfd) 3520 continue; 3521 3522 thresholds_new->entries[j] = thresholds->entries[i]; 3523 if (thresholds_new->entries[j].threshold < usage) { 3524 /* 3525 * thresholds_new->current_threshold will not be used 3526 * until rcu_assign_pointer(), so it's safe to increment 3527 * it here.
3528 */ 3529 atomic_inc(&thresholds_new->current_threshold); 3530 } 3531 j++; 3532 } 3533 3534assign: 3535 if (type == _MEM) 3536 rcu_assign_pointer(memcg->thresholds, thresholds_new); 3537 else 3538 rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new); 3539 3540 /* To be sure that nobody uses thresholds before freeing it */ 3541 synchronize_rcu(); 3542 3543 kfree(thresholds); 3544unlock: 3545 mutex_unlock(&memcg->thresholds_lock); 3546 3547 return ret; 3548} 3549 3550static struct cftype mem_cgroup_files[] = { 3551 { 3552 .name = "usage_in_bytes", 3553 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 3554 .read_u64 = mem_cgroup_read, 3555 .register_event = mem_cgroup_register_event, 3556 .unregister_event = mem_cgroup_unregister_event, 3557 }, 3558 { 3559 .name = "max_usage_in_bytes", 3560 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 3561 .trigger = mem_cgroup_reset, 3562 .read_u64 = mem_cgroup_read, 3563 }, 3564 { 3565 .name = "limit_in_bytes", 3566 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 3567 .write_string = mem_cgroup_write, 3568 .read_u64 = mem_cgroup_read, 3569 }, 3570 { 3571 .name = "soft_limit_in_bytes", 3572 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 3573 .write_string = mem_cgroup_write, 3574 .read_u64 = mem_cgroup_read, 3575 }, 3576 { 3577 .name = "failcnt", 3578 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 3579 .trigger = mem_cgroup_reset, 3580 .read_u64 = mem_cgroup_read, 3581 }, 3582 { 3583 .name = "stat", 3584 .read_map = mem_control_stat_show, 3585 }, 3586 { 3587 .name = "force_empty", 3588 .trigger = mem_cgroup_force_empty_write, 3589 }, 3590 { 3591 .name = "use_hierarchy", 3592 .write_u64 = mem_cgroup_hierarchy_write, 3593 .read_u64 = mem_cgroup_hierarchy_read, 3594 }, 3595 { 3596 .name = "swappiness", 3597 .read_u64 = mem_cgroup_swappiness_read, 3598 .write_u64 = mem_cgroup_swappiness_write, 3599 }, 3600 { 3601 .name = "move_charge_at_immigrate", 3602 .read_u64 = mem_cgroup_move_charge_read, 3603 .write_u64 = mem_cgroup_move_charge_write, 3604 }, 3605}; 3606 3607#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 3608static struct cftype memsw_cgroup_files[] = { 3609 { 3610 .name = "memsw.usage_in_bytes", 3611 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 3612 .read_u64 = mem_cgroup_read, 3613 .register_event = mem_cgroup_register_event, 3614 .unregister_event = mem_cgroup_unregister_event, 3615 }, 3616 { 3617 .name = "memsw.max_usage_in_bytes", 3618 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 3619 .trigger = mem_cgroup_reset, 3620 .read_u64 = mem_cgroup_read, 3621 }, 3622 { 3623 .name = "memsw.limit_in_bytes", 3624 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 3625 .write_string = mem_cgroup_write, 3626 .read_u64 = mem_cgroup_read, 3627 }, 3628 { 3629 .name = "memsw.failcnt", 3630 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 3631 .trigger = mem_cgroup_reset, 3632 .read_u64 = mem_cgroup_read, 3633 }, 3634}; 3635 3636static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) 3637{ 3638 if (!do_swap_account) 3639 return 0; 3640 return cgroup_add_files(cont, ss, memsw_cgroup_files, 3641 ARRAY_SIZE(memsw_cgroup_files)); 3642}; 3643#else 3644static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) 3645{ 3646 return 0; 3647} 3648#endif 3649 3650static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) 3651{ 3652 struct mem_cgroup_per_node *pn; 3653 struct mem_cgroup_per_zone *mz; 3654 enum lru_list l; 3655 int zone, tmp = node; 3656 /* 3657 * This routine is called against possible nodes. 
3658 * But it's a BUG to call kmalloc() against an offline node. 3659 * 3660 * TODO: this routine can waste much memory for nodes which will 3661 * never be onlined. It's better to use a memory hotplug callback 3662 * function. 3663 */ 3664 if (!node_state(node, N_NORMAL_MEMORY)) 3665 tmp = -1; 3666 pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp); 3667 if (!pn) 3668 return 1; 3669 3670 mem->info.nodeinfo[node] = pn; 3671 memset(pn, 0, sizeof(*pn)); 3672 3673 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 3674 mz = &pn->zoneinfo[zone]; 3675 for_each_lru(l) 3676 INIT_LIST_HEAD(&mz->lists[l]); 3677 mz->usage_in_excess = 0; 3678 mz->on_tree = false; 3679 mz->mem = mem; 3680 } 3681 return 0; 3682} 3683 3684static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) 3685{ 3686 kfree(mem->info.nodeinfo[node]); 3687} 3688 3689static struct mem_cgroup *mem_cgroup_alloc(void) 3690{ 3691 struct mem_cgroup *mem; 3692 int size = sizeof(struct mem_cgroup); 3693 3694 /* Can be very big if MAX_NUMNODES is very big */ 3695 if (size < PAGE_SIZE) 3696 mem = kmalloc(size, GFP_KERNEL); 3697 else 3698 mem = vmalloc(size); 3699 3700 if (!mem) 3701 return NULL; 3702 3703 memset(mem, 0, size); 3704 mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); 3705 if (!mem->stat) { 3706 if (size < PAGE_SIZE) 3707 kfree(mem); 3708 else 3709 vfree(mem); 3710 mem = NULL; 3711 } 3712 return mem; 3713} 3714 3715/* 3716 * When destroying a mem_cgroup, references from swap_cgroup can remain. 3717 * (Scanning them all at force_empty would be too costly...) 3718 * 3719 * Instead of clearing all references at force_empty, we remember 3720 * the number of references from swap_cgroup and free the mem_cgroup when 3721 * it goes down to 0. 3722 * 3723 * Removal of the cgroup itself succeeds regardless of refs from swap. 3724 */ 3725 3726static void __mem_cgroup_free(struct mem_cgroup *mem) 3727{ 3728 int node; 3729 3730 mem_cgroup_remove_from_trees(mem); 3731 free_css_id(&mem_cgroup_subsys, &mem->css); 3732 3733 for_each_node_state(node, N_POSSIBLE) 3734 free_mem_cgroup_per_zone_info(mem, node); 3735 3736 free_percpu(mem->stat); 3737 if (sizeof(struct mem_cgroup) < PAGE_SIZE) 3738 kfree(mem); 3739 else 3740 vfree(mem); 3741} 3742 3743static void mem_cgroup_get(struct mem_cgroup *mem) 3744{ 3745 atomic_inc(&mem->refcnt); 3746} 3747 3748static void __mem_cgroup_put(struct mem_cgroup *mem, int count) 3749{ 3750 if (atomic_sub_and_test(count, &mem->refcnt)) { 3751 struct mem_cgroup *parent = parent_mem_cgroup(mem); 3752 __mem_cgroup_free(mem); 3753 if (parent) 3754 mem_cgroup_put(parent); 3755 } 3756} 3757 3758static void mem_cgroup_put(struct mem_cgroup *mem) 3759{ 3760 __mem_cgroup_put(mem, 1); 3761} 3762 3763/* 3764 * Returns the parent mem_cgroup in the memcg hierarchy, with hierarchy enabled.
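* (The res_counter parent is set up only when the cgroup was created
* with use_hierarchy enabled; see mem_cgroup_create() below. For
* hierarchy roots this therefore returns NULL.)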
3765 */ 3766static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem) 3767{ 3768 if (!mem->res.parent) 3769 return NULL; 3770 return mem_cgroup_from_res_counter(mem->res.parent, res); 3771} 3772 3773#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 3774static void __init enable_swap_cgroup(void) 3775{ 3776 if (!mem_cgroup_disabled() && really_do_swap_account) 3777 do_swap_account = 1; 3778} 3779#else 3780static void __init enable_swap_cgroup(void) 3781{ 3782} 3783#endif 3784 3785static int mem_cgroup_soft_limit_tree_init(void) 3786{ 3787 struct mem_cgroup_tree_per_node *rtpn; 3788 struct mem_cgroup_tree_per_zone *rtpz; 3789 int tmp, node, zone; 3790 3791 for_each_node_state(node, N_POSSIBLE) { 3792 tmp = node; 3793 if (!node_state(node, N_NORMAL_MEMORY)) 3794 tmp = -1; 3795 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); 3796 if (!rtpn) 3797 return 1; 3798 3799 soft_limit_tree.rb_tree_per_node[node] = rtpn; 3800 3801 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 3802 rtpz = &rtpn->rb_tree_per_zone[zone]; 3803 rtpz->rb_root = RB_ROOT; 3804 spin_lock_init(&rtpz->lock); 3805 } 3806 } 3807 return 0; 3808} 3809 3810static struct cgroup_subsys_state * __ref 3811mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) 3812{ 3813 struct mem_cgroup *mem, *parent; 3814 long error = -ENOMEM; 3815 int node; 3816 3817 mem = mem_cgroup_alloc(); 3818 if (!mem) 3819 return ERR_PTR(error); 3820 3821 for_each_node_state(node, N_POSSIBLE) 3822 if (alloc_mem_cgroup_per_zone_info(mem, node)) 3823 goto free_out; 3824 3825 /* root ? */ 3826 if (cont->parent == NULL) { 3827 int cpu; 3828 enable_swap_cgroup(); 3829 parent = NULL; 3830 root_mem_cgroup = mem; 3831 if (mem_cgroup_soft_limit_tree_init()) 3832 goto free_out; 3833 for_each_possible_cpu(cpu) { 3834 struct memcg_stock_pcp *stock = 3835 &per_cpu(memcg_stock, cpu); 3836 INIT_WORK(&stock->work, drain_local_stock); 3837 } 3838 hotcpu_notifier(memcg_stock_cpu_callback, 0); 3839 } else { 3840 parent = mem_cgroup_from_cont(cont->parent); 3841 mem->use_hierarchy = parent->use_hierarchy; 3842 } 3843 3844 if (parent && parent->use_hierarchy) { 3845 res_counter_init(&mem->res, &parent->res); 3846 res_counter_init(&mem->memsw, &parent->memsw); 3847 /* 3848 * We increment refcnt of the parent to ensure that we can 3849 * safely access it on res_counter_charge/uncharge. 3850 * This refcnt will be decremented when freeing this 3851 * mem_cgroup(see mem_cgroup_put). 
3852 */ 3853 mem_cgroup_get(parent); 3854 } else { 3855 res_counter_init(&mem->res, NULL); 3856 res_counter_init(&mem->memsw, NULL); 3857 } 3858 mem->last_scanned_child = 0; 3859 spin_lock_init(&mem->reclaim_param_lock); 3860 3861 if (parent) 3862 mem->swappiness = get_swappiness(parent); 3863 atomic_set(&mem->refcnt, 1); 3864 mem->move_charge_at_immigrate = 0; 3865 mutex_init(&mem->thresholds_lock); 3866 return &mem->css; 3867free_out: 3868 __mem_cgroup_free(mem); 3869 root_mem_cgroup = NULL; 3870 return ERR_PTR(error); 3871} 3872 3873static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss, 3874 struct cgroup *cont) 3875{ 3876 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 3877 3878 return mem_cgroup_force_empty(mem, false); 3879} 3880 3881static void mem_cgroup_destroy(struct cgroup_subsys *ss, 3882 struct cgroup *cont) 3883{ 3884 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 3885 3886 mem_cgroup_put(mem); 3887} 3888 3889static int mem_cgroup_populate(struct cgroup_subsys *ss, 3890 struct cgroup *cont) 3891{ 3892 int ret; 3893 3894 ret = cgroup_add_files(cont, ss, mem_cgroup_files, 3895 ARRAY_SIZE(mem_cgroup_files)); 3896 3897 if (!ret) 3898 ret = register_memsw_files(cont, ss); 3899 return ret; 3900} 3901 3902#ifdef CONFIG_MMU 3903/* Handlers for move charge at task migration. */ 3904#define PRECHARGE_COUNT_AT_ONCE 256 3905static int mem_cgroup_do_precharge(unsigned long count) 3906{ 3907 int ret = 0; 3908 int batch_count = PRECHARGE_COUNT_AT_ONCE; 3909 struct mem_cgroup *mem = mc.to; 3910 3911 if (mem_cgroup_is_root(mem)) { 3912 mc.precharge += count; 3913 /* we don't need css_get for root */ 3914 return ret; 3915 } 3916 /* try to charge all at once */ 3917 if (count > 1) { 3918 struct res_counter *dummy; 3919 /* 3920 * "mem" cannot be under rmdir() because we've already checked 3921 * by cgroup_lock_live_cgroup() that it is not removed and we 3922 * are still under the same cgroup_mutex. So we can postpone 3923 * css_get(). 3924 */ 3925 if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy)) 3926 goto one_by_one; 3927 if (do_swap_account && res_counter_charge(&mem->memsw, 3928 PAGE_SIZE * count, &dummy)) { 3929 res_counter_uncharge(&mem->res, PAGE_SIZE * count); 3930 goto one_by_one; 3931 } 3932 mc.precharge += count; 3933 VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags)); 3934 WARN_ON_ONCE(count > INT_MAX); 3935 __css_get(&mem->css, (int)count); 3936 return ret; 3937 } 3938one_by_one: 3939 /* fall back to charging one by one */ 3940 while (count--) { 3941 if (signal_pending(current)) { 3942 ret = -EINTR; 3943 break; 3944 } 3945 if (!batch_count--) { 3946 batch_count = PRECHARGE_COUNT_AT_ONCE; 3947 cond_resched(); 3948 } 3949 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); 3950 if (ret || !mem) 3951 /* mem_cgroup_clear_mc() will do the uncharge later */ 3952 return -ENOMEM; 3953 mc.precharge++; 3954 } 3955 return ret; 3956} 3957 3958/** 3959 * is_target_pte_for_mc - check whether a pte is valid for move charge 3960 * @vma: the vma the pte to be checked belongs to 3961 * @addr: the address corresponding to the pte to be checked 3962 * @ptent: the pte to be checked 3963 * @target: where the target page or swap entry will be stored (can be NULL) 3964 * 3965 * Returns 3966 * 0(MC_TARGET_NONE): if the pte is not a target for move charge. 3967 * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for 3968 * move charge. if @target is not NULL, the page is stored in target->page 3969 * with an extra refcount taken (callers should handle it).
/**
 * is_target_pte_for_mc - check whether a pte is a valid target for move charge
 * @vma: the vma the pte to be checked belongs to
 * @addr: the address corresponding to the pte to be checked
 * @ptent: the pte to be checked
 * @target: the pointer where the target page or swap entry will be stored
 *          (can be NULL)
 *
 * Returns
 *   0 (MC_TARGET_NONE): if the pte is not a target for move charge.
 *   1 (MC_TARGET_PAGE): if the page corresponding to this pte is a target for
 *     move charge. If @target is not NULL, the page is stored in target->page
 *     with an extra refcount taken (callers should handle it).
 *   2 (MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
 *     target for charge migration. If @target is not NULL, the entry is
 *     stored in target->ent.
 *
 * Called with pte lock held.
 */
union mc_target {
	struct page	*page;
	swp_entry_t	ent;
};

enum mc_target_type {
	MC_TARGET_NONE,	/* not used */
	MC_TARGET_PAGE,
	MC_TARGET_SWAP,
};

static int is_target_pte_for_mc(struct vm_area_struct *vma,
		unsigned long addr, pte_t ptent, union mc_target *target)
{
	struct page *page = NULL;
	struct page_cgroup *pc;
	int ret = 0;
	swp_entry_t ent = { .val = 0 };
	int usage_count = 0;
	bool move_anon = test_bit(MOVE_CHARGE_TYPE_ANON,
					&mc.to->move_charge_at_immigrate);

	if (!pte_present(ptent)) {
		/* TODO: handle swap of shmem/tmpfs */
		if (pte_none(ptent) || pte_file(ptent))
			return 0;
		else if (is_swap_pte(ptent)) {
			ent = pte_to_swp_entry(ptent);
			if (!move_anon || non_swap_entry(ent))
				return 0;
			usage_count = mem_cgroup_count_swap_user(ent, &page);
		}
	} else {
		page = vm_normal_page(vma, addr, ptent);
		if (!page || !page_mapped(page))
			return 0;
		/*
		 * TODO: We don't move charges of file (including shmem/tmpfs)
		 * pages for now.
		 */
		if (!move_anon || !PageAnon(page))
			return 0;
		if (!get_page_unless_zero(page))
			return 0;
		usage_count = page_mapcount(page);
	}
	if (usage_count > 1) {
		/*
		 * TODO: We don't move charges of shared (used by multiple
		 * processes) pages for now.
		 */
		if (page)
			put_page(page);
		return 0;
	}
	if (page) {
		pc = lookup_page_cgroup(page);
		/*
		 * Do only a loose check here, without the page_cgroup lock.
		 * mem_cgroup_move_account() checks whether the pc is valid
		 * under the lock.
		 */
		if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
			ret = MC_TARGET_PAGE;
			if (target)
				target->page = page;
		}
		if (!ret || !target)
			put_page(page);
	}
	/* fall through to checking the swap entry, if any */
	if (ent.val && do_swap_account && !ret) {
		unsigned short id;
		rcu_read_lock();
		id = css_id(&mc.from->css);
		rcu_read_unlock();
		if (id == lookup_swap_cgroup(ent)) {
			ret = MC_TARGET_SWAP;
			if (target)
				target->ent = ent;
		}
	}
	return ret;
}
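/*
 * A minimal sketch of how a caller consumes the classification above; it
 * mirrors the switch statement in mem_cgroup_move_charge_pte_range()
 * further down and is illustrative only:
 *
 *	union mc_target target;
 *	int type = is_target_pte_for_mc(vma, addr, ptent, &target);
 *
 *	if (type == MC_TARGET_PAGE) {
 *		handle_page(target.page);
 *		put_page(target.page);
 *	} else if (type == MC_TARGET_SWAP) {
 *		handle_swap(target.ent);
 *	}
 *
 * handle_page() and handle_swap() are stand-ins for caller-specific
 * work; the one hard rule is that an MC_TARGET_PAGE result carries an
 * extra page reference that the caller must drop with put_page().
 */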
static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
					unsigned long addr, unsigned long end,
					struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->private;
	pte_t *pte;
	spinlock_t *ptl;

	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	for (; addr != end; pte++, addr += PAGE_SIZE)
		if (is_target_pte_for_mc(vma, addr, *pte, NULL))
			mc.precharge++;	/* increment precharge temporarily */
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();

	return 0;
}

static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
{
	unsigned long precharge;
	struct vm_area_struct *vma;

	down_read(&mm->mmap_sem);
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		struct mm_walk mem_cgroup_count_precharge_walk = {
			.pmd_entry = mem_cgroup_count_precharge_pte_range,
			.mm = mm,
			.private = vma,
		};
		if (is_vm_hugetlb_page(vma))
			continue;
		/* TODO: We don't move charges of shmem/tmpfs pages for now. */
		if (vma->vm_flags & VM_SHARED)
			continue;
		walk_page_range(vma->vm_start, vma->vm_end,
					&mem_cgroup_count_precharge_walk);
	}
	up_read(&mm->mmap_sem);

	precharge = mc.precharge;
	mc.precharge = 0;

	return precharge;
}

static int mem_cgroup_precharge_mc(struct mm_struct *mm)
{
	return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm));
}

static void mem_cgroup_clear_mc(void)
{
	/* we must uncharge all the leftover precharges from mc.to */
	if (mc.precharge) {
		__mem_cgroup_cancel_charge(mc.to, mc.precharge);
		mc.precharge = 0;
	}
	/*
	 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
	 * we must uncharge here.
	 */
	if (mc.moved_charge) {
		__mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
		mc.moved_charge = 0;
	}
	/* we must fix up refcnts and charges */
	if (mc.moved_swap) {
		WARN_ON_ONCE(mc.moved_swap > INT_MAX);
		/* uncharge swap account from the old cgroup */
		if (!mem_cgroup_is_root(mc.from))
			res_counter_uncharge(&mc.from->memsw,
						PAGE_SIZE * mc.moved_swap);
		__mem_cgroup_put(mc.from, mc.moved_swap);

		if (!mem_cgroup_is_root(mc.to)) {
			/*
			 * we charged both to->res and to->memsw, so we should
			 * uncharge to->res.
			 */
			res_counter_uncharge(&mc.to->res,
						PAGE_SIZE * mc.moved_swap);
			VM_BUG_ON(test_bit(CSS_ROOT, &mc.to->css.flags));
			__css_put(&mc.to->css, mc.moved_swap);
		}
		/* we've already done mem_cgroup_get(mc.to) */

		mc.moved_swap = 0;
	}
	mc.from = NULL;
	mc.to = NULL;
	mc.moving_task = NULL;
	wake_up_all(&mc.waitq);
}

static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
				struct cgroup *cgroup,
				struct task_struct *p,
				bool threadgroup)
{
	int ret = 0;
	struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup);

	if (mem->move_charge_at_immigrate) {
		struct mm_struct *mm;
		struct mem_cgroup *from = mem_cgroup_from_task(p);

		VM_BUG_ON(from == mem);

		mm = get_task_mm(p);
		if (!mm)
			return 0;
		/* We move charges only when we move the owner of the mm */
		if (mm->owner == p) {
			VM_BUG_ON(mc.from);
			VM_BUG_ON(mc.to);
			VM_BUG_ON(mc.precharge);
			VM_BUG_ON(mc.moved_charge);
			VM_BUG_ON(mc.moved_swap);
			VM_BUG_ON(mc.moving_task);
			mc.from = from;
			mc.to = mem;
			mc.precharge = 0;
			mc.moved_charge = 0;
			mc.moved_swap = 0;
			mc.moving_task = current;

			ret = mem_cgroup_precharge_mc(mm);
			if (ret)
				mem_cgroup_clear_mc();
		}
		mmput(mm);
	}
	return ret;
}

static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
				struct cgroup *cgroup,
				struct task_struct *p,
				bool threadgroup)
{
	mem_cgroup_clear_mc();
}
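/*
 * To summarise the attach-side protocol implemented above: the cgroup
 * core invokes these callbacks in a fixed order, and the mc state
 * machine relies on it.
 *
 *	mem_cgroup_can_attach()   - set up mc.*, precharge the whole mm
 *	mem_cgroup_cancel_attach()- on failure, clear mc and return charges
 *	mem_cgroup_move_task()    - walk the page tables, move charges,
 *	                            then mem_cgroup_clear_mc()
 *
 * Every charge moved during the walk consumes one precharge taken in
 * can_attach(), so mem_cgroup_clear_mc() only has to return the unused
 * remainder to mc.to.
 */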
static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct mm_walk *walk)
{
	int ret = 0;
	struct vm_area_struct *vma = walk->private;
	pte_t *pte;
	spinlock_t *ptl;

retry:
	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	for (; addr != end; addr += PAGE_SIZE) {
		pte_t ptent = *(pte++);
		union mc_target target;
		int type;
		struct page *page;
		struct page_cgroup *pc;
		swp_entry_t ent;

		if (!mc.precharge)
			break;

		type = is_target_pte_for_mc(vma, addr, ptent, &target);
		switch (type) {
		case MC_TARGET_PAGE:
			page = target.page;
			if (isolate_lru_page(page))
				goto put;
			pc = lookup_page_cgroup(page);
			if (!mem_cgroup_move_account(pc,
						mc.from, mc.to, false)) {
				mc.precharge--;
				/* we uncharge from mc.from later. */
				mc.moved_charge++;
			}
			putback_lru_page(page);
put:			/* is_target_pte_for_mc() took a reference on the page */
			put_page(page);
			break;
		case MC_TARGET_SWAP:
			ent = target.ent;
			if (!mem_cgroup_move_swap_account(ent,
						mc.from, mc.to, false)) {
				mc.precharge--;
				/* we fix up refcnts and charges later. */
				mc.moved_swap++;
			}
			break;
		default:
			break;
		}
	}
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();

	if (addr != end) {
		/*
		 * We have consumed all the precharges we got in can_attach().
		 * We keep trying to charge one page at a time, but abandon
		 * the move if even a single additional charge to mc.to fails
		 * in the attach() phase.
		 */
		ret = mem_cgroup_do_precharge(1);
		if (!ret)
			goto retry;
	}

	return ret;
}

static void mem_cgroup_move_charge(struct mm_struct *mm)
{
	struct vm_area_struct *vma;

	lru_add_drain_all();
	down_read(&mm->mmap_sem);
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		int ret;
		struct mm_walk mem_cgroup_move_charge_walk = {
			.pmd_entry = mem_cgroup_move_charge_pte_range,
			.mm = mm,
			.private = vma,
		};
		if (is_vm_hugetlb_page(vma))
			continue;
		/* TODO: We don't move charges of shmem/tmpfs pages for now. */
		if (vma->vm_flags & VM_SHARED)
			continue;
		ret = walk_page_range(vma->vm_start, vma->vm_end,
						&mem_cgroup_move_charge_walk);
		if (ret)
			/*
			 * This means we have consumed all precharges and
			 * failed to charge any more. Just abandon here.
			 */
			break;
	}
	up_read(&mm->mmap_sem);
}
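/*
 * Note the retry pattern in mem_cgroup_move_charge_pte_range() above:
 * when the walk outruns its precharges it drops the pte lock, charges a
 * single extra page with mem_cgroup_do_precharge(1), and re-takes the
 * lock at the same address. In condensed form:
 *
 *	pte_unmap_unlock(pte - 1, ptl);
 *	if (addr != end) {
 *		ret = mem_cgroup_do_precharge(1);
 *		if (!ret)
 *			goto retry;
 *	}
 *
 * Taking the extra charge outside the pte lock matters because the
 * charge may trigger reclaim and sleep, which is forbidden while
 * holding a spinlock.
 */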
4299 */ 4300 break; 4301 } 4302 up_read(&mm->mmap_sem); 4303} 4304 4305static void mem_cgroup_move_task(struct cgroup_subsys *ss, 4306 struct cgroup *cont, 4307 struct cgroup *old_cont, 4308 struct task_struct *p, 4309 bool threadgroup) 4310{ 4311 struct mm_struct *mm; 4312 4313 if (!mc.to) 4314 /* no need to move charge */ 4315 return; 4316 4317 mm = get_task_mm(p); 4318 if (mm) { 4319 mem_cgroup_move_charge(mm); 4320 mmput(mm); 4321 } 4322 mem_cgroup_clear_mc(); 4323} 4324#else /* !CONFIG_MMU */ 4325static int mem_cgroup_can_attach(struct cgroup_subsys *ss, 4326 struct cgroup *cgroup, 4327 struct task_struct *p, 4328 bool threadgroup) 4329{ 4330 return 0; 4331} 4332static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, 4333 struct cgroup *cgroup, 4334 struct task_struct *p, 4335 bool threadgroup) 4336{ 4337} 4338static void mem_cgroup_move_task(struct cgroup_subsys *ss, 4339 struct cgroup *cont, 4340 struct cgroup *old_cont, 4341 struct task_struct *p, 4342 bool threadgroup) 4343{ 4344} 4345#endif 4346 4347struct cgroup_subsys mem_cgroup_subsys = { 4348 .name = "memory", 4349 .subsys_id = mem_cgroup_subsys_id, 4350 .create = mem_cgroup_create, 4351 .pre_destroy = mem_cgroup_pre_destroy, 4352 .destroy = mem_cgroup_destroy, 4353 .populate = mem_cgroup_populate, 4354 .can_attach = mem_cgroup_can_attach, 4355 .cancel_attach = mem_cgroup_cancel_attach, 4356 .attach = mem_cgroup_move_task, 4357 .early_init = 0, 4358 .use_id = 1, 4359}; 4360 4361#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4362 4363static int __init disable_swap_account(char *s) 4364{ 4365 really_do_swap_account = 0; 4366 return 1; 4367} 4368__setup("noswapaccount", disable_swap_account); 4369#endif 4370