memcontrol.c revision ef8745c1e7fc5413d760b3b958f3fd3a0beaad72
1/* memcontrol.c - Memory Controller 2 * 3 * Copyright IBM Corporation, 2007 4 * Author Balbir Singh <balbir@linux.vnet.ibm.com> 5 * 6 * Copyright 2007 OpenVZ SWsoft Inc 7 * Author: Pavel Emelianov <xemul@openvz.org> 8 * 9 * This program is free software; you can redistribute it and/or modify 10 * it under the terms of the GNU General Public License as published by 11 * the Free Software Foundation; either version 2 of the License, or 12 * (at your option) any later version. 13 * 14 * This program is distributed in the hope that it will be useful, 15 * but WITHOUT ANY WARRANTY; without even the implied warranty of 16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 * GNU General Public License for more details. 18 */ 19 20#include <linux/res_counter.h> 21#include <linux/memcontrol.h> 22#include <linux/cgroup.h> 23#include <linux/mm.h> 24#include <linux/pagemap.h> 25#include <linux/smp.h> 26#include <linux/page-flags.h> 27#include <linux/backing-dev.h> 28#include <linux/bit_spinlock.h> 29#include <linux/rcupdate.h> 30#include <linux/limits.h> 31#include <linux/mutex.h> 32#include <linux/rbtree.h> 33#include <linux/slab.h> 34#include <linux/swap.h> 35#include <linux/spinlock.h> 36#include <linux/fs.h> 37#include <linux/seq_file.h> 38#include <linux/vmalloc.h> 39#include <linux/mm_inline.h> 40#include <linux/page_cgroup.h> 41#include "internal.h" 42 43#include <asm/uaccess.h> 44 45struct cgroup_subsys mem_cgroup_subsys __read_mostly; 46#define MEM_CGROUP_RECLAIM_RETRIES 5 47struct mem_cgroup *root_mem_cgroup __read_mostly; 48 49#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 50/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ 51int do_swap_account __read_mostly; 52static int really_do_swap_account __initdata = 1; /* for remember boot option*/ 53#else 54#define do_swap_account (0) 55#endif 56 57static DEFINE_MUTEX(memcg_tasklist); /* can be hold under cgroup_mutex */ 58#define SOFTLIMIT_EVENTS_THRESH (1000) 59 60/* 61 * Statistics for memory cgroup. 62 */ 63enum mem_cgroup_stat_index { 64 /* 65 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. 66 */ 67 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ 68 MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ 69 MEM_CGROUP_STAT_MAPPED_FILE, /* # of pages charged as file rss */ 70 MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ 71 MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ 72 MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */ 73 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ 74 75 MEM_CGROUP_STAT_NSTATS, 76}; 77 78struct mem_cgroup_stat_cpu { 79 s64 count[MEM_CGROUP_STAT_NSTATS]; 80} ____cacheline_aligned_in_smp; 81 82struct mem_cgroup_stat { 83 struct mem_cgroup_stat_cpu cpustat[0]; 84}; 85 86static inline void 87__mem_cgroup_stat_reset_safe(struct mem_cgroup_stat_cpu *stat, 88 enum mem_cgroup_stat_index idx) 89{ 90 stat->count[idx] = 0; 91} 92 93static inline s64 94__mem_cgroup_stat_read_local(struct mem_cgroup_stat_cpu *stat, 95 enum mem_cgroup_stat_index idx) 96{ 97 return stat->count[idx]; 98} 99 100/* 101 * For accounting under irq disable, no need for increment preempt count. 
102 */ 103static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat, 104 enum mem_cgroup_stat_index idx, int val) 105{ 106 stat->count[idx] += val; 107} 108 109static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat, 110 enum mem_cgroup_stat_index idx) 111{ 112 int cpu; 113 s64 ret = 0; 114 for_each_possible_cpu(cpu) 115 ret += stat->cpustat[cpu].count[idx]; 116 return ret; 117} 118 119static s64 mem_cgroup_local_usage(struct mem_cgroup_stat *stat) 120{ 121 s64 ret; 122 123 ret = mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_CACHE); 124 ret += mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_RSS); 125 return ret; 126} 127 128/* 129 * per-zone information in memory controller. 130 */ 131struct mem_cgroup_per_zone { 132 /* 133 * spin_lock to protect the per cgroup LRU 134 */ 135 struct list_head lists[NR_LRU_LISTS]; 136 unsigned long count[NR_LRU_LISTS]; 137 138 struct zone_reclaim_stat reclaim_stat; 139 struct rb_node tree_node; /* RB tree node */ 140 unsigned long long usage_in_excess;/* Set to the value by which */ 141 /* the soft limit is exceeded*/ 142 bool on_tree; 143 struct mem_cgroup *mem; /* Back pointer, we cannot */ 144 /* use container_of */ 145}; 146/* Macro for accessing counter */ 147#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) 148 149struct mem_cgroup_per_node { 150 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; 151}; 152 153struct mem_cgroup_lru_info { 154 struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES]; 155}; 156 157/* 158 * Cgroups above their limits are maintained in a RB-Tree, independent of 159 * their hierarchy representation 160 */ 161 162struct mem_cgroup_tree_per_zone { 163 struct rb_root rb_root; 164 spinlock_t lock; 165}; 166 167struct mem_cgroup_tree_per_node { 168 struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; 169}; 170 171struct mem_cgroup_tree { 172 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; 173}; 174 175static struct mem_cgroup_tree soft_limit_tree __read_mostly; 176 177/* 178 * The memory controller data structure. The memory controller controls both 179 * page cache and RSS per cgroup. We would eventually like to provide 180 * statistics based on the statistics developed by Rik Van Riel for clock-pro, 181 * to help the administrator determine what knobs to tune. 182 * 183 * TODO: Add a water mark for the memory controller. Reclaim will begin when 184 * we hit the water mark. May be even add a low water mark, such that 185 * no reclaim occurs from a cgroup at it's low water mark, this is 186 * a feature that will be implemented much later in the future. 187 */ 188struct mem_cgroup { 189 struct cgroup_subsys_state css; 190 /* 191 * the counter to account for memory usage 192 */ 193 struct res_counter res; 194 /* 195 * the counter to account for mem+swap usage. 196 */ 197 struct res_counter memsw; 198 /* 199 * Per cgroup active and inactive list, similar to the 200 * per zone LRU lists. 201 */ 202 struct mem_cgroup_lru_info info; 203 204 /* 205 protect against reclaim related member. 206 */ 207 spinlock_t reclaim_param_lock; 208 209 int prev_priority; /* for recording reclaim priority */ 210 211 /* 212 * While reclaiming in a hiearchy, we cache the last child we 213 * reclaimed from. 214 */ 215 int last_scanned_child; 216 /* 217 * Should the accounting and control be hierarchical, per subtree? 
218 */ 219 bool use_hierarchy; 220 unsigned long last_oom_jiffies; 221 atomic_t refcnt; 222 223 unsigned int swappiness; 224 225 /* set when res.limit == memsw.limit */ 226 bool memsw_is_minimum; 227 228 /* 229 * statistics. This must be placed at the end of memcg. 230 */ 231 struct mem_cgroup_stat stat; 232}; 233 234/* 235 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft 236 * limit reclaim to prevent infinite loops, if they ever occur. 237 */ 238#define MEM_CGROUP_MAX_RECLAIM_LOOPS (100) 239#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2) 240 241enum charge_type { 242 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 243 MEM_CGROUP_CHARGE_TYPE_MAPPED, 244 MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */ 245 MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ 246 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ 247 MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ 248 NR_CHARGE_TYPE, 249}; 250 251/* only for here (for easy reading.) */ 252#define PCGF_CACHE (1UL << PCG_CACHE) 253#define PCGF_USED (1UL << PCG_USED) 254#define PCGF_LOCK (1UL << PCG_LOCK) 255/* Not used, but added here for completeness */ 256#define PCGF_ACCT (1UL << PCG_ACCT) 257 258/* for encoding cft->private value on file */ 259#define _MEM (0) 260#define _MEMSWAP (1) 261#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) 262#define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) 263#define MEMFILE_ATTR(val) ((val) & 0xffff) 264 265/* 266 * Reclaim flags for mem_cgroup_hierarchical_reclaim 267 */ 268#define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0 269#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT) 270#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 271#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) 272#define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2 273#define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT) 274 275static void mem_cgroup_get(struct mem_cgroup *mem); 276static void mem_cgroup_put(struct mem_cgroup *mem); 277static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); 278 279static struct mem_cgroup_per_zone * 280mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) 281{ 282 return &mem->info.nodeinfo[nid]->zoneinfo[zid]; 283} 284 285static struct mem_cgroup_per_zone * 286page_cgroup_zoneinfo(struct page_cgroup *pc) 287{ 288 struct mem_cgroup *mem = pc->mem_cgroup; 289 int nid = page_cgroup_nid(pc); 290 int zid = page_cgroup_zid(pc); 291 292 if (!mem) 293 return NULL; 294 295 return mem_cgroup_zoneinfo(mem, nid, zid); 296} 297 298static struct mem_cgroup_tree_per_zone * 299soft_limit_tree_node_zone(int nid, int zid) 300{ 301 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; 302} 303 304static struct mem_cgroup_tree_per_zone * 305soft_limit_tree_from_page(struct page *page) 306{ 307 int nid = page_to_nid(page); 308 int zid = page_zonenum(page); 309 310 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; 311} 312 313static void 314__mem_cgroup_insert_exceeded(struct mem_cgroup *mem, 315 struct mem_cgroup_per_zone *mz, 316 struct mem_cgroup_tree_per_zone *mctz, 317 unsigned long long new_usage_in_excess) 318{ 319 struct rb_node **p = &mctz->rb_root.rb_node; 320 struct rb_node *parent = NULL; 321 struct mem_cgroup_per_zone *mz_node; 322 323 if (mz->on_tree) 324 return; 325 326 mz->usage_in_excess = new_usage_in_excess; 327 if (!mz->usage_in_excess) 328 return; 329 while (*p) { 330 parent = *p; 331 mz_node = rb_entry(parent, struct mem_cgroup_per_zone, 332 tree_node); 333 
if (mz->usage_in_excess < mz_node->usage_in_excess) 334 p = &(*p)->rb_left; 335 /* 336 * We can't avoid mem cgroups that are over their soft 337 * limit by the same amount 338 */ 339 else if (mz->usage_in_excess >= mz_node->usage_in_excess) 340 p = &(*p)->rb_right; 341 } 342 rb_link_node(&mz->tree_node, parent, p); 343 rb_insert_color(&mz->tree_node, &mctz->rb_root); 344 mz->on_tree = true; 345} 346 347static void 348__mem_cgroup_remove_exceeded(struct mem_cgroup *mem, 349 struct mem_cgroup_per_zone *mz, 350 struct mem_cgroup_tree_per_zone *mctz) 351{ 352 if (!mz->on_tree) 353 return; 354 rb_erase(&mz->tree_node, &mctz->rb_root); 355 mz->on_tree = false; 356} 357 358static void 359mem_cgroup_remove_exceeded(struct mem_cgroup *mem, 360 struct mem_cgroup_per_zone *mz, 361 struct mem_cgroup_tree_per_zone *mctz) 362{ 363 spin_lock(&mctz->lock); 364 __mem_cgroup_remove_exceeded(mem, mz, mctz); 365 spin_unlock(&mctz->lock); 366} 367 368static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem) 369{ 370 bool ret = false; 371 int cpu; 372 s64 val; 373 struct mem_cgroup_stat_cpu *cpustat; 374 375 cpu = get_cpu(); 376 cpustat = &mem->stat.cpustat[cpu]; 377 val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_EVENTS); 378 if (unlikely(val > SOFTLIMIT_EVENTS_THRESH)) { 379 __mem_cgroup_stat_reset_safe(cpustat, MEM_CGROUP_STAT_EVENTS); 380 ret = true; 381 } 382 put_cpu(); 383 return ret; 384} 385 386static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) 387{ 388 unsigned long long excess; 389 struct mem_cgroup_per_zone *mz; 390 struct mem_cgroup_tree_per_zone *mctz; 391 int nid = page_to_nid(page); 392 int zid = page_zonenum(page); 393 mctz = soft_limit_tree_from_page(page); 394 395 /* 396 * Necessary to update all ancestors when hierarchy is used. 397 * because their event counter is not touched. 398 */ 399 for (; mem; mem = parent_mem_cgroup(mem)) { 400 mz = mem_cgroup_zoneinfo(mem, nid, zid); 401 excess = res_counter_soft_limit_excess(&mem->res); 402 /* 403 * We have to update the tree if mz is on RB-tree or 404 * mem is over its softlimit. 405 */ 406 if (excess || mz->on_tree) { 407 spin_lock(&mctz->lock); 408 /* if on-tree, remove it */ 409 if (mz->on_tree) 410 __mem_cgroup_remove_exceeded(mem, mz, mctz); 411 /* 412 * Insert again. mz->usage_in_excess will be updated. 413 * If excess is 0, no tree ops. 
414 */ 415 __mem_cgroup_insert_exceeded(mem, mz, mctz, excess); 416 spin_unlock(&mctz->lock); 417 } 418 } 419} 420 421static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem) 422{ 423 int node, zone; 424 struct mem_cgroup_per_zone *mz; 425 struct mem_cgroup_tree_per_zone *mctz; 426 427 for_each_node_state(node, N_POSSIBLE) { 428 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 429 mz = mem_cgroup_zoneinfo(mem, node, zone); 430 mctz = soft_limit_tree_node_zone(node, zone); 431 mem_cgroup_remove_exceeded(mem, mz, mctz); 432 } 433 } 434} 435 436static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem) 437{ 438 return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT; 439} 440 441static struct mem_cgroup_per_zone * 442__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) 443{ 444 struct rb_node *rightmost = NULL; 445 struct mem_cgroup_per_zone *mz; 446 447retry: 448 mz = NULL; 449 rightmost = rb_last(&mctz->rb_root); 450 if (!rightmost) 451 goto done; /* Nothing to reclaim from */ 452 453 mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); 454 /* 455 * Remove the node now but someone else can add it back, 456 * we will to add it back at the end of reclaim to its correct 457 * position in the tree. 458 */ 459 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); 460 if (!res_counter_soft_limit_excess(&mz->mem->res) || 461 !css_tryget(&mz->mem->css)) 462 goto retry; 463done: 464 return mz; 465} 466 467static struct mem_cgroup_per_zone * 468mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) 469{ 470 struct mem_cgroup_per_zone *mz; 471 472 spin_lock(&mctz->lock); 473 mz = __mem_cgroup_largest_soft_limit_node(mctz); 474 spin_unlock(&mctz->lock); 475 return mz; 476} 477 478static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, 479 bool charge) 480{ 481 int val = (charge) ? 1 : -1; 482 struct mem_cgroup_stat *stat = &mem->stat; 483 struct mem_cgroup_stat_cpu *cpustat; 484 int cpu = get_cpu(); 485 486 cpustat = &stat->cpustat[cpu]; 487 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SWAPOUT, val); 488 put_cpu(); 489} 490 491static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, 492 struct page_cgroup *pc, 493 bool charge) 494{ 495 int val = (charge) ? 
1 : -1; 496 struct mem_cgroup_stat *stat = &mem->stat; 497 struct mem_cgroup_stat_cpu *cpustat; 498 int cpu = get_cpu(); 499 500 cpustat = &stat->cpustat[cpu]; 501 if (PageCgroupCache(pc)) 502 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val); 503 else 504 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val); 505 506 if (charge) 507 __mem_cgroup_stat_add_safe(cpustat, 508 MEM_CGROUP_STAT_PGPGIN_COUNT, 1); 509 else 510 __mem_cgroup_stat_add_safe(cpustat, 511 MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); 512 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_EVENTS, 1); 513 put_cpu(); 514} 515 516static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, 517 enum lru_list idx) 518{ 519 int nid, zid; 520 struct mem_cgroup_per_zone *mz; 521 u64 total = 0; 522 523 for_each_online_node(nid) 524 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 525 mz = mem_cgroup_zoneinfo(mem, nid, zid); 526 total += MEM_CGROUP_ZSTAT(mz, idx); 527 } 528 return total; 529} 530 531static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) 532{ 533 return container_of(cgroup_subsys_state(cont, 534 mem_cgroup_subsys_id), struct mem_cgroup, 535 css); 536} 537 538struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) 539{ 540 /* 541 * mm_update_next_owner() may clear mm->owner to NULL 542 * if it races with swapoff, page migration, etc. 543 * So this can be called with p == NULL. 544 */ 545 if (unlikely(!p)) 546 return NULL; 547 548 return container_of(task_subsys_state(p, mem_cgroup_subsys_id), 549 struct mem_cgroup, css); 550} 551 552static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) 553{ 554 struct mem_cgroup *mem = NULL; 555 556 if (!mm) 557 return NULL; 558 /* 559 * Because we have no locks, mm->owner's may be being moved to other 560 * cgroup. We use css_tryget() here even if this looks 561 * pessimistic (rather than adding locks here). 562 */ 563 rcu_read_lock(); 564 do { 565 mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); 566 if (unlikely(!mem)) 567 break; 568 } while (!css_tryget(&mem->css)); 569 rcu_read_unlock(); 570 return mem; 571} 572 573/* 574 * Call callback function against all cgroup under hierarchy tree. 575 */ 576static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data, 577 int (*func)(struct mem_cgroup *, void *)) 578{ 579 int found, ret, nextid; 580 struct cgroup_subsys_state *css; 581 struct mem_cgroup *mem; 582 583 if (!root->use_hierarchy) 584 return (*func)(root, data); 585 586 nextid = 1; 587 do { 588 ret = 0; 589 mem = NULL; 590 591 rcu_read_lock(); 592 css = css_get_next(&mem_cgroup_subsys, nextid, &root->css, 593 &found); 594 if (css && css_tryget(css)) 595 mem = container_of(css, struct mem_cgroup, css); 596 rcu_read_unlock(); 597 598 if (mem) { 599 ret = (*func)(mem, data); 600 css_put(&mem->css); 601 } 602 nextid = found + 1; 603 } while (!ret && css); 604 605 return ret; 606} 607 608static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) 609{ 610 return (mem == root_mem_cgroup); 611} 612 613/* 614 * Following LRU functions are allowed to be used without PCG_LOCK. 615 * Operations are called by routine of global LRU independently from memcg. 616 * What we have to take care of here is validness of pc->mem_cgroup. 617 * 618 * Changes to pc->mem_cgroup happens when 619 * 1. charge 620 * 2. moving account 621 * In typical case, "charge" is done before add-to-lru. Exception is SwapCache. 622 * It is added to LRU before charge. 
623 * If the PCG_USED bit is not set, the page_cgroup is not added to this private LRU.
624 * When moving an account, the page is not on the LRU. It's isolated.
625 */
626
627 void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
628 {
629 	struct page_cgroup *pc;
630 	struct mem_cgroup_per_zone *mz;
631
632 	if (mem_cgroup_disabled())
633 		return;
634 	pc = lookup_page_cgroup(page);
635 	/* can happen while we handle swapcache. */
636 	if (!TestClearPageCgroupAcctLRU(pc))
637 		return;
638 	VM_BUG_ON(!pc->mem_cgroup);
639 	/*
640 	 * We don't check the PCG_USED bit. It's cleared when the "page" is finally
641 	 * removed from the global LRU.
642 	 */
643 	mz = page_cgroup_zoneinfo(pc);
644 	MEM_CGROUP_ZSTAT(mz, lru) -= 1;
645 	if (mem_cgroup_is_root(pc->mem_cgroup))
646 		return;
647 	VM_BUG_ON(list_empty(&pc->lru));
648 	list_del_init(&pc->lru);
649 	return;
650 }
651
652 void mem_cgroup_del_lru(struct page *page)
653 {
654 	mem_cgroup_del_lru_list(page, page_lru(page));
655 }
656
657 void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
658 {
659 	struct mem_cgroup_per_zone *mz;
660 	struct page_cgroup *pc;
661
662 	if (mem_cgroup_disabled())
663 		return;
664
665 	pc = lookup_page_cgroup(page);
666 	/*
667 	 * The Used bit is set without atomic ops but after smp_wmb().
668 	 * To make pc->mem_cgroup visible, insert smp_rmb() here.
669 	 */
670 	smp_rmb();
671 	/* an unused or root page is not rotated. */
672 	if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup))
673 		return;
674 	mz = page_cgroup_zoneinfo(pc);
675 	list_move(&pc->lru, &mz->lists[lru]);
676 }
677
678 void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
679 {
680 	struct page_cgroup *pc;
681 	struct mem_cgroup_per_zone *mz;
682
683 	if (mem_cgroup_disabled())
684 		return;
685 	pc = lookup_page_cgroup(page);
686 	VM_BUG_ON(PageCgroupAcctLRU(pc));
687 	/*
688 	 * The Used bit is set without atomic ops but after smp_wmb().
689 	 * To make pc->mem_cgroup visible, insert smp_rmb() here.
690 	 */
691 	smp_rmb();
692 	if (!PageCgroupUsed(pc))
693 		return;
694
695 	mz = page_cgroup_zoneinfo(pc);
696 	MEM_CGROUP_ZSTAT(mz, lru) += 1;
697 	SetPageCgroupAcctLRU(pc);
698 	if (mem_cgroup_is_root(pc->mem_cgroup))
699 		return;
700 	list_add(&pc->lru, &mz->lists[lru]);
701 }
702
703 /*
704  * When handling SwapCache, pc->mem_cgroup may be changed while it's linked to
705  * the LRU because the page may be reused after it's fully uncharged (because of
706  * SwapCache behavior). To handle that, unlink the page_cgroup from the LRU when
707  * charging it again. This function is only used to charge SwapCache. It's done
708  * under lock_page() and expects that zone->lru_lock is never held.
709  */
710 static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page)
711 {
712 	unsigned long flags;
713 	struct zone *zone = page_zone(page);
714 	struct page_cgroup *pc = lookup_page_cgroup(page);
715
716 	spin_lock_irqsave(&zone->lru_lock, flags);
717 	/*
718 	 * Forget the old LRU when this page_cgroup is *not* used. This Used bit
719 	 * is guarded by lock_page() because the page is SwapCache.
720 */ 721 if (!PageCgroupUsed(pc)) 722 mem_cgroup_del_lru_list(page, page_lru(page)); 723 spin_unlock_irqrestore(&zone->lru_lock, flags); 724} 725 726static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page) 727{ 728 unsigned long flags; 729 struct zone *zone = page_zone(page); 730 struct page_cgroup *pc = lookup_page_cgroup(page); 731 732 spin_lock_irqsave(&zone->lru_lock, flags); 733 /* link when the page is linked to LRU but page_cgroup isn't */ 734 if (PageLRU(page) && !PageCgroupAcctLRU(pc)) 735 mem_cgroup_add_lru_list(page, page_lru(page)); 736 spin_unlock_irqrestore(&zone->lru_lock, flags); 737} 738 739 740void mem_cgroup_move_lists(struct page *page, 741 enum lru_list from, enum lru_list to) 742{ 743 if (mem_cgroup_disabled()) 744 return; 745 mem_cgroup_del_lru_list(page, from); 746 mem_cgroup_add_lru_list(page, to); 747} 748 749int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) 750{ 751 int ret; 752 struct mem_cgroup *curr = NULL; 753 754 task_lock(task); 755 rcu_read_lock(); 756 curr = try_get_mem_cgroup_from_mm(task->mm); 757 rcu_read_unlock(); 758 task_unlock(task); 759 if (!curr) 760 return 0; 761 if (curr->use_hierarchy) 762 ret = css_is_ancestor(&curr->css, &mem->css); 763 else 764 ret = (curr == mem); 765 css_put(&curr->css); 766 return ret; 767} 768 769/* 770 * prev_priority control...this will be used in memory reclaim path. 771 */ 772int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) 773{ 774 int prev_priority; 775 776 spin_lock(&mem->reclaim_param_lock); 777 prev_priority = mem->prev_priority; 778 spin_unlock(&mem->reclaim_param_lock); 779 780 return prev_priority; 781} 782 783void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority) 784{ 785 spin_lock(&mem->reclaim_param_lock); 786 if (priority < mem->prev_priority) 787 mem->prev_priority = priority; 788 spin_unlock(&mem->reclaim_param_lock); 789} 790 791void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority) 792{ 793 spin_lock(&mem->reclaim_param_lock); 794 mem->prev_priority = priority; 795 spin_unlock(&mem->reclaim_param_lock); 796} 797 798static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages) 799{ 800 unsigned long active; 801 unsigned long inactive; 802 unsigned long gb; 803 unsigned long inactive_ratio; 804 805 inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON); 806 active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON); 807 808 gb = (inactive + active) >> (30 - PAGE_SHIFT); 809 if (gb) 810 inactive_ratio = int_sqrt(10 * gb); 811 else 812 inactive_ratio = 1; 813 814 if (present_pages) { 815 present_pages[0] = inactive; 816 present_pages[1] = active; 817 } 818 819 return inactive_ratio; 820} 821 822int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg) 823{ 824 unsigned long active; 825 unsigned long inactive; 826 unsigned long present_pages[2]; 827 unsigned long inactive_ratio; 828 829 inactive_ratio = calc_inactive_ratio(memcg, present_pages); 830 831 inactive = present_pages[0]; 832 active = present_pages[1]; 833 834 if (inactive * inactive_ratio < active) 835 return 1; 836 837 return 0; 838} 839 840int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg) 841{ 842 unsigned long active; 843 unsigned long inactive; 844 845 inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE); 846 active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE); 847 848 return (active > inactive); 849} 850 851unsigned long mem_cgroup_zone_nr_pages(struct 
mem_cgroup *memcg, 852 struct zone *zone, 853 enum lru_list lru) 854{ 855 int nid = zone->zone_pgdat->node_id; 856 int zid = zone_idx(zone); 857 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); 858 859 return MEM_CGROUP_ZSTAT(mz, lru); 860} 861 862struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, 863 struct zone *zone) 864{ 865 int nid = zone->zone_pgdat->node_id; 866 int zid = zone_idx(zone); 867 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); 868 869 return &mz->reclaim_stat; 870} 871 872struct zone_reclaim_stat * 873mem_cgroup_get_reclaim_stat_from_page(struct page *page) 874{ 875 struct page_cgroup *pc; 876 struct mem_cgroup_per_zone *mz; 877 878 if (mem_cgroup_disabled()) 879 return NULL; 880 881 pc = lookup_page_cgroup(page); 882 /* 883 * Used bit is set without atomic ops but after smp_wmb(). 884 * For making pc->mem_cgroup visible, insert smp_rmb() here. 885 */ 886 smp_rmb(); 887 if (!PageCgroupUsed(pc)) 888 return NULL; 889 890 mz = page_cgroup_zoneinfo(pc); 891 if (!mz) 892 return NULL; 893 894 return &mz->reclaim_stat; 895} 896 897unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, 898 struct list_head *dst, 899 unsigned long *scanned, int order, 900 int mode, struct zone *z, 901 struct mem_cgroup *mem_cont, 902 int active, int file) 903{ 904 unsigned long nr_taken = 0; 905 struct page *page; 906 unsigned long scan; 907 LIST_HEAD(pc_list); 908 struct list_head *src; 909 struct page_cgroup *pc, *tmp; 910 int nid = z->zone_pgdat->node_id; 911 int zid = zone_idx(z); 912 struct mem_cgroup_per_zone *mz; 913 int lru = LRU_FILE * file + active; 914 int ret; 915 916 BUG_ON(!mem_cont); 917 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); 918 src = &mz->lists[lru]; 919 920 scan = 0; 921 list_for_each_entry_safe_reverse(pc, tmp, src, lru) { 922 if (scan >= nr_to_scan) 923 break; 924 925 page = pc->page; 926 if (unlikely(!PageCgroupUsed(pc))) 927 continue; 928 if (unlikely(!PageLRU(page))) 929 continue; 930 931 scan++; 932 ret = __isolate_lru_page(page, mode, file); 933 switch (ret) { 934 case 0: 935 list_move(&page->lru, dst); 936 mem_cgroup_del_lru(page); 937 nr_taken++; 938 break; 939 case -EBUSY: 940 /* we don't affect global LRU but rotate in our LRU */ 941 mem_cgroup_rotate_lru_list(page, page_lru(page)); 942 break; 943 default: 944 break; 945 } 946 } 947 948 *scanned = scan; 949 return nr_taken; 950} 951 952#define mem_cgroup_from_res_counter(counter, member) \ 953 container_of(counter, struct mem_cgroup, member) 954 955static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem) 956{ 957 if (do_swap_account) { 958 if (res_counter_check_under_limit(&mem->res) && 959 res_counter_check_under_limit(&mem->memsw)) 960 return true; 961 } else 962 if (res_counter_check_under_limit(&mem->res)) 963 return true; 964 return false; 965} 966 967static unsigned int get_swappiness(struct mem_cgroup *memcg) 968{ 969 struct cgroup *cgrp = memcg->css.cgroup; 970 unsigned int swappiness; 971 972 /* root ? */ 973 if (cgrp->parent == NULL) 974 return vm_swappiness; 975 976 spin_lock(&memcg->reclaim_param_lock); 977 swappiness = memcg->swappiness; 978 spin_unlock(&memcg->reclaim_param_lock); 979 980 return swappiness; 981} 982 983static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data) 984{ 985 int *val = data; 986 (*val)++; 987 return 0; 988} 989 990/** 991 * mem_cgroup_print_mem_info: Called from OOM with tasklist_lock held in read mode. 
992 * @memcg: The memory cgroup that went over limit 993 * @p: Task that is going to be killed 994 * 995 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is 996 * enabled 997 */ 998void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) 999{ 1000 struct cgroup *task_cgrp; 1001 struct cgroup *mem_cgrp; 1002 /* 1003 * Need a buffer in BSS, can't rely on allocations. The code relies 1004 * on the assumption that OOM is serialized for memory controller. 1005 * If this assumption is broken, revisit this code. 1006 */ 1007 static char memcg_name[PATH_MAX]; 1008 int ret; 1009 1010 if (!memcg) 1011 return; 1012 1013 1014 rcu_read_lock(); 1015 1016 mem_cgrp = memcg->css.cgroup; 1017 task_cgrp = task_cgroup(p, mem_cgroup_subsys_id); 1018 1019 ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX); 1020 if (ret < 0) { 1021 /* 1022 * Unfortunately, we are unable to convert to a useful name 1023 * But we'll still print out the usage information 1024 */ 1025 rcu_read_unlock(); 1026 goto done; 1027 } 1028 rcu_read_unlock(); 1029 1030 printk(KERN_INFO "Task in %s killed", memcg_name); 1031 1032 rcu_read_lock(); 1033 ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX); 1034 if (ret < 0) { 1035 rcu_read_unlock(); 1036 goto done; 1037 } 1038 rcu_read_unlock(); 1039 1040 /* 1041 * Continues from above, so we don't need an KERN_ level 1042 */ 1043 printk(KERN_CONT " as a result of limit of %s\n", memcg_name); 1044done: 1045 1046 printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n", 1047 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, 1048 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, 1049 res_counter_read_u64(&memcg->res, RES_FAILCNT)); 1050 printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, " 1051 "failcnt %llu\n", 1052 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, 1053 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, 1054 res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); 1055} 1056 1057/* 1058 * This function returns the number of memcg under hierarchy tree. Returns 1059 * 1(self count) if no children. 1060 */ 1061static int mem_cgroup_count_children(struct mem_cgroup *mem) 1062{ 1063 int num = 0; 1064 mem_cgroup_walk_tree(mem, &num, mem_cgroup_count_children_cb); 1065 return num; 1066} 1067 1068/* 1069 * Visit the first child (need not be the first child as per the ordering 1070 * of the cgroup list, since we track last_scanned_child) of @mem and use 1071 * that to reclaim free pages from. 1072 */ 1073static struct mem_cgroup * 1074mem_cgroup_select_victim(struct mem_cgroup *root_mem) 1075{ 1076 struct mem_cgroup *ret = NULL; 1077 struct cgroup_subsys_state *css; 1078 int nextid, found; 1079 1080 if (!root_mem->use_hierarchy) { 1081 css_get(&root_mem->css); 1082 ret = root_mem; 1083 } 1084 1085 while (!ret) { 1086 rcu_read_lock(); 1087 nextid = root_mem->last_scanned_child + 1; 1088 css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css, 1089 &found); 1090 if (css && css_tryget(css)) 1091 ret = container_of(css, struct mem_cgroup, css); 1092 1093 rcu_read_unlock(); 1094 /* Updates scanning parameter */ 1095 spin_lock(&root_mem->reclaim_param_lock); 1096 if (!css) { 1097 /* this means start scan from ID:1 */ 1098 root_mem->last_scanned_child = 0; 1099 } else 1100 root_mem->last_scanned_child = found; 1101 spin_unlock(&root_mem->reclaim_param_lock); 1102 } 1103 1104 return ret; 1105} 1106 1107/* 1108 * Scan the hierarchy if needed to reclaim memory. 
We remember the last child
1109  * we reclaimed from, so that we don't end up penalizing one child extensively
1110  * based on its position in the children list.
1111  *
1112  * root_mem is the original ancestor that we've been reclaiming from.
1113  *
1114  * We give up and return to the caller when we visit root_mem twice.
1115  * (other groups can be removed while we're walking....)
1116  *
1117  * If shrink==true, this returns immediately to avoid freeing too much.
1118  */
1119 static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1120 						struct zone *zone,
1121 						gfp_t gfp_mask,
1122 						unsigned long reclaim_options)
1123 {
1124 	struct mem_cgroup *victim;
1125 	int ret, total = 0;
1126 	int loop = 0;
1127 	bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
1128 	bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
1129 	bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
1130 	unsigned long excess = mem_cgroup_get_excess(root_mem);
1131
1132 	/* If memsw_is_minimum==1, swap-out is of no use. */
1133 	if (root_mem->memsw_is_minimum)
1134 		noswap = true;
1135
1136 	while (1) {
1137 		victim = mem_cgroup_select_victim(root_mem);
1138 		if (victim == root_mem) {
1139 			loop++;
1140 			if (loop >= 2) {
1141 				/*
1142 				 * If we have not been able to reclaim
1143 				 * anything, it might be because there are
1144 				 * no reclaimable pages under this hierarchy.
1145 				 */
1146 				if (!check_soft || !total) {
1147 					css_put(&victim->css);
1148 					break;
1149 				}
1150 				/*
1151 				 * We want to do more targeted reclaim.
1152 				 * excess >> 2 is not too excessive, so we don't
1153 				 * reclaim too much, nor too small, so we don't
1154 				 * keep coming back to reclaim from this cgroup.
1155 				 */
1156 				if (total >= (excess >> 2) ||
1157 					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
1158 					css_put(&victim->css);
1159 					break;
1160 				}
1161 			}
1162 		}
1163 		if (!mem_cgroup_local_usage(&victim->stat)) {
1164 			/* this cgroup's local usage == 0 */
1165 			css_put(&victim->css);
1166 			continue;
1167 		}
1168 		/* we use the swappiness of the local cgroup */
1169 		if (check_soft)
1170 			ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
1171 				noswap, get_swappiness(victim), zone,
1172 				zone->zone_pgdat->node_id);
1173 		else
1174 			ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
1175 						noswap, get_swappiness(victim));
1176 		css_put(&victim->css);
1177 		/*
1178 		 * When shrinking usage, we can't check whether we should stop
1179 		 * here or reclaim more. That depends on the caller. last_scanned_child
1180 		 * will work well enough for keeping fairness under the tree.
1181 */ 1182 if (shrink) 1183 return ret; 1184 total += ret; 1185 if (check_soft) { 1186 if (res_counter_check_under_soft_limit(&root_mem->res)) 1187 return total; 1188 } else if (mem_cgroup_check_under_limit(root_mem)) 1189 return 1 + total; 1190 } 1191 return total; 1192} 1193 1194bool mem_cgroup_oom_called(struct task_struct *task) 1195{ 1196 bool ret = false; 1197 struct mem_cgroup *mem; 1198 struct mm_struct *mm; 1199 1200 rcu_read_lock(); 1201 mm = task->mm; 1202 if (!mm) 1203 mm = &init_mm; 1204 mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); 1205 if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10)) 1206 ret = true; 1207 rcu_read_unlock(); 1208 return ret; 1209} 1210 1211static int record_last_oom_cb(struct mem_cgroup *mem, void *data) 1212{ 1213 mem->last_oom_jiffies = jiffies; 1214 return 0; 1215} 1216 1217static void record_last_oom(struct mem_cgroup *mem) 1218{ 1219 mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb); 1220} 1221 1222/* 1223 * Currently used to update mapped file statistics, but the routine can be 1224 * generalized to update other statistics as well. 1225 */ 1226void mem_cgroup_update_mapped_file_stat(struct page *page, int val) 1227{ 1228 struct mem_cgroup *mem; 1229 struct mem_cgroup_stat *stat; 1230 struct mem_cgroup_stat_cpu *cpustat; 1231 int cpu; 1232 struct page_cgroup *pc; 1233 1234 if (!page_is_file_cache(page)) 1235 return; 1236 1237 pc = lookup_page_cgroup(page); 1238 if (unlikely(!pc)) 1239 return; 1240 1241 lock_page_cgroup(pc); 1242 mem = pc->mem_cgroup; 1243 if (!mem) 1244 goto done; 1245 1246 if (!PageCgroupUsed(pc)) 1247 goto done; 1248 1249 /* 1250 * Preemption is already disabled, we don't need get_cpu() 1251 */ 1252 cpu = smp_processor_id(); 1253 stat = &mem->stat; 1254 cpustat = &stat->cpustat[cpu]; 1255 1256 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, val); 1257done: 1258 unlock_page_cgroup(pc); 1259} 1260 1261/* 1262 * Unlike exported interface, "oom" parameter is added. if oom==true, 1263 * oom-killer can be invoked. 1264 */ 1265static int __mem_cgroup_try_charge(struct mm_struct *mm, 1266 gfp_t gfp_mask, struct mem_cgroup **memcg, 1267 bool oom, struct page *page) 1268{ 1269 struct mem_cgroup *mem, *mem_over_limit; 1270 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 1271 struct res_counter *fail_res; 1272 1273 if (unlikely(test_thread_flag(TIF_MEMDIE))) { 1274 /* Don't account this! */ 1275 *memcg = NULL; 1276 return 0; 1277 } 1278 1279 /* 1280 * We always charge the cgroup the mm_struct belongs to. 1281 * The mm_struct's mem_cgroup changes on task migration if the 1282 * thread group leader migrates. It's possible that mm is not 1283 * set, if so charge the init_mm (happens for pagecache usage). 
1284 */ 1285 mem = *memcg; 1286 if (likely(!mem)) { 1287 mem = try_get_mem_cgroup_from_mm(mm); 1288 *memcg = mem; 1289 } else { 1290 css_get(&mem->css); 1291 } 1292 if (unlikely(!mem)) 1293 return 0; 1294 1295 VM_BUG_ON(css_is_removed(&mem->css)); 1296 1297 while (1) { 1298 int ret = 0; 1299 unsigned long flags = 0; 1300 1301 if (mem_cgroup_is_root(mem)) 1302 goto done; 1303 ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res); 1304 if (likely(!ret)) { 1305 if (!do_swap_account) 1306 break; 1307 ret = res_counter_charge(&mem->memsw, PAGE_SIZE, 1308 &fail_res); 1309 if (likely(!ret)) 1310 break; 1311 /* mem+swap counter fails */ 1312 res_counter_uncharge(&mem->res, PAGE_SIZE); 1313 flags |= MEM_CGROUP_RECLAIM_NOSWAP; 1314 mem_over_limit = mem_cgroup_from_res_counter(fail_res, 1315 memsw); 1316 } else 1317 /* mem counter fails */ 1318 mem_over_limit = mem_cgroup_from_res_counter(fail_res, 1319 res); 1320 1321 if (!(gfp_mask & __GFP_WAIT)) 1322 goto nomem; 1323 1324 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, 1325 gfp_mask, flags); 1326 if (ret) 1327 continue; 1328 1329 /* 1330 * try_to_free_mem_cgroup_pages() might not give us a full 1331 * picture of reclaim. Some pages are reclaimed and might be 1332 * moved to swap cache or just unmapped from the cgroup. 1333 * Check the limit again to see if the reclaim reduced the 1334 * current usage of the cgroup before giving up 1335 * 1336 */ 1337 if (mem_cgroup_check_under_limit(mem_over_limit)) 1338 continue; 1339 1340 if (!nr_retries--) { 1341 if (oom) { 1342 mutex_lock(&memcg_tasklist); 1343 mem_cgroup_out_of_memory(mem_over_limit, gfp_mask); 1344 mutex_unlock(&memcg_tasklist); 1345 record_last_oom(mem_over_limit); 1346 } 1347 goto nomem; 1348 } 1349 } 1350 /* 1351 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. 1352 * if they exceeds softlimit. 1353 */ 1354 if (mem_cgroup_soft_limit_check(mem)) 1355 mem_cgroup_update_tree(mem, page); 1356done: 1357 return 0; 1358nomem: 1359 css_put(&mem->css); 1360 return -ENOMEM; 1361} 1362 1363/* 1364 * A helper function to get mem_cgroup from ID. must be called under 1365 * rcu_read_lock(). The caller must check css_is_removed() or some if 1366 * it's concern. (dropping refcnt from swap can be called against removed 1367 * memcg.) 1368 */ 1369static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) 1370{ 1371 struct cgroup_subsys_state *css; 1372 1373 /* ID 0 is unused ID */ 1374 if (!id) 1375 return NULL; 1376 css = css_lookup(&mem_cgroup_subsys, id); 1377 if (!css) 1378 return NULL; 1379 return container_of(css, struct mem_cgroup, css); 1380} 1381 1382static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page) 1383{ 1384 struct mem_cgroup *mem; 1385 struct page_cgroup *pc; 1386 unsigned short id; 1387 swp_entry_t ent; 1388 1389 VM_BUG_ON(!PageLocked(page)); 1390 1391 if (!PageSwapCache(page)) 1392 return NULL; 1393 1394 pc = lookup_page_cgroup(page); 1395 lock_page_cgroup(pc); 1396 if (PageCgroupUsed(pc)) { 1397 mem = pc->mem_cgroup; 1398 if (mem && !css_tryget(&mem->css)) 1399 mem = NULL; 1400 } else { 1401 ent.val = page_private(page); 1402 id = lookup_swap_cgroup(ent); 1403 rcu_read_lock(); 1404 mem = mem_cgroup_lookup(id); 1405 if (mem && !css_tryget(&mem->css)) 1406 mem = NULL; 1407 rcu_read_unlock(); 1408 } 1409 unlock_page_cgroup(pc); 1410 return mem; 1411} 1412 1413/* 1414 * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be 1415 * USED state. If already USED, uncharge and return. 
1416 */ 1417 1418static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, 1419 struct page_cgroup *pc, 1420 enum charge_type ctype) 1421{ 1422 /* try_charge() can return NULL to *memcg, taking care of it. */ 1423 if (!mem) 1424 return; 1425 1426 lock_page_cgroup(pc); 1427 if (unlikely(PageCgroupUsed(pc))) { 1428 unlock_page_cgroup(pc); 1429 if (!mem_cgroup_is_root(mem)) { 1430 res_counter_uncharge(&mem->res, PAGE_SIZE); 1431 if (do_swap_account) 1432 res_counter_uncharge(&mem->memsw, PAGE_SIZE); 1433 } 1434 css_put(&mem->css); 1435 return; 1436 } 1437 1438 pc->mem_cgroup = mem; 1439 /* 1440 * We access a page_cgroup asynchronously without lock_page_cgroup(). 1441 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup 1442 * is accessed after testing USED bit. To make pc->mem_cgroup visible 1443 * before USED bit, we need memory barrier here. 1444 * See mem_cgroup_add_lru_list(), etc. 1445 */ 1446 smp_wmb(); 1447 switch (ctype) { 1448 case MEM_CGROUP_CHARGE_TYPE_CACHE: 1449 case MEM_CGROUP_CHARGE_TYPE_SHMEM: 1450 SetPageCgroupCache(pc); 1451 SetPageCgroupUsed(pc); 1452 break; 1453 case MEM_CGROUP_CHARGE_TYPE_MAPPED: 1454 ClearPageCgroupCache(pc); 1455 SetPageCgroupUsed(pc); 1456 break; 1457 default: 1458 break; 1459 } 1460 1461 mem_cgroup_charge_statistics(mem, pc, true); 1462 1463 unlock_page_cgroup(pc); 1464} 1465 1466/** 1467 * mem_cgroup_move_account - move account of the page 1468 * @pc: page_cgroup of the page. 1469 * @from: mem_cgroup which the page is moved from. 1470 * @to: mem_cgroup which the page is moved to. @from != @to. 1471 * 1472 * The caller must confirm following. 1473 * - page is not on LRU (isolate_page() is useful.) 1474 * 1475 * returns 0 at success, 1476 * returns -EBUSY when lock is busy or "pc" is unstable. 1477 * 1478 * This function does "uncharge" from old cgroup but doesn't do "charge" to 1479 * new cgroup. It should be done by a caller. 
1480  */
1481
1482 static int mem_cgroup_move_account(struct page_cgroup *pc,
1483 	struct mem_cgroup *from, struct mem_cgroup *to)
1484 {
1485 	struct mem_cgroup_per_zone *from_mz, *to_mz;
1486 	int nid, zid;
1487 	int ret = -EBUSY;
1488 	struct page *page;
1489 	int cpu;
1490 	struct mem_cgroup_stat *stat;
1491 	struct mem_cgroup_stat_cpu *cpustat;
1492
1493 	VM_BUG_ON(from == to);
1494 	VM_BUG_ON(PageLRU(pc->page));
1495
1496 	nid = page_cgroup_nid(pc);
1497 	zid = page_cgroup_zid(pc);
1498 	from_mz = mem_cgroup_zoneinfo(from, nid, zid);
1499 	to_mz = mem_cgroup_zoneinfo(to, nid, zid);
1500
1501 	if (!trylock_page_cgroup(pc))
1502 		return ret;
1503
1504 	if (!PageCgroupUsed(pc))
1505 		goto out;
1506
1507 	if (pc->mem_cgroup != from)
1508 		goto out;
1509
1510 	if (!mem_cgroup_is_root(from))
1511 		res_counter_uncharge(&from->res, PAGE_SIZE);
1512 	mem_cgroup_charge_statistics(from, pc, false);
1513
1514 	page = pc->page;
1515 	if (page_is_file_cache(page) && page_mapped(page)) {
1516 		cpu = smp_processor_id();
1517 		/* Update mapped_file data for mem_cgroup "from" */
1518 		stat = &from->stat;
1519 		cpustat = &stat->cpustat[cpu];
1520 		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE,
1521 						-1);
1522
1523 		/* Update mapped_file data for mem_cgroup "to" */
1524 		stat = &to->stat;
1525 		cpustat = &stat->cpustat[cpu];
1526 		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE,
1527 						1);
1528 	}
1529
1530 	if (do_swap_account && !mem_cgroup_is_root(from))
1531 		res_counter_uncharge(&from->memsw, PAGE_SIZE);
1532 	css_put(&from->css);
1533
1534 	css_get(&to->css);
1535 	pc->mem_cgroup = to;
1536 	mem_cgroup_charge_statistics(to, pc, true);
1537 	ret = 0;
1538 out:
1539 	unlock_page_cgroup(pc);
1540 	/*
1541 	 * We charge against "to", which may not have any tasks. Then, "to"
1542 	 * can be under rmdir(). But in the current implementation, the only
1543 	 * caller of this function is force_empty(), and it's guaranteed that
1544 	 * "to" is never removed. So, we don't check rmdir status here.
1545 	 */
1546 	return ret;
1547 }
1548
1549 /*
1550  * Move charges to its parent.
1551  */
1552
1553 static int mem_cgroup_move_parent(struct page_cgroup *pc,
1554 				  struct mem_cgroup *child,
1555 				  gfp_t gfp_mask)
1556 {
1557 	struct page *page = pc->page;
1558 	struct cgroup *cg = child->css.cgroup;
1559 	struct cgroup *pcg = cg->parent;
1560 	struct mem_cgroup *parent;
1561 	int ret;
1562
1563 	/* Is ROOT ? */
1564 	if (!pcg)
1565 		return -EINVAL;
1566
1567
1568 	parent = mem_cgroup_from_cont(pcg);
1569
1570
1571 	ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page);
1572 	if (ret || !parent)
1573 		return ret;
1574
1575 	if (!get_page_unless_zero(page)) {
1576 		ret = -EBUSY;
1577 		goto uncharge;
1578 	}
1579
1580 	ret = isolate_lru_page(page);
1581
1582 	if (ret)
1583 		goto cancel;
1584
1585 	ret = mem_cgroup_move_account(pc, child, parent);
1586
1587 	putback_lru_page(page);
1588 	if (!ret) {
1589 		put_page(page);
1590 		/* drop the extra refcnt taken by try_charge() */
1591 		css_put(&parent->css);
1592 		return 0;
1593 	}
1594
1595 cancel:
1596 	put_page(page);
1597 uncharge:
1598 	/* drop the extra refcnt taken by try_charge() */
1599 	css_put(&parent->css);
1600 	/* uncharge if the move fails */
1601 	if (!mem_cgroup_is_root(parent)) {
1602 		res_counter_uncharge(&parent->res, PAGE_SIZE);
1603 		if (do_swap_account)
1604 			res_counter_uncharge(&parent->memsw, PAGE_SIZE);
1605 	}
1606 	return ret;
1607 }
1608
1609 /*
1610  * Charge the memory controller for page usage.
1611  * Return
1612  * 0 if the charge was successful
1613  * < 0 if the cgroup is over its limit
1614  */
1615 static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
1616 				gfp_t gfp_mask, enum charge_type ctype,
1617 				struct mem_cgroup *memcg)
1618 {
1619 	struct mem_cgroup *mem;
1620 	struct page_cgroup *pc;
1621 	int ret;
1622
1623 	pc = lookup_page_cgroup(page);
1624 	/* can happen at boot */
1625 	if (unlikely(!pc))
1626 		return 0;
1627 	prefetchw(pc);
1628
1629 	mem = memcg;
1630 	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page);
1631 	if (ret || !mem)
1632 		return ret;
1633
1634 	__mem_cgroup_commit_charge(mem, pc, ctype);
1635 	return 0;
1636 }
1637
1638 int mem_cgroup_newpage_charge(struct page *page,
1639 			      struct mm_struct *mm, gfp_t gfp_mask)
1640 {
1641 	if (mem_cgroup_disabled())
1642 		return 0;
1643 	if (PageCompound(page))
1644 		return 0;
1645 	/*
1646 	 * If already mapped, we don't have to account.
1647 	 * If page cache, page->mapping has an address_space.
1648 	 * But page->mapping may hold an out-of-use anon_vma pointer;
1649 	 * detect it with the PageAnon() check. A newly-mapped anon page's
1650 	 * page->mapping is NULL.
1651 	 */
1652 	if (page_mapped(page) || (page->mapping && !PageAnon(page)))
1653 		return 0;
1654 	if (unlikely(!mm))
1655 		mm = &init_mm;
1656 	return mem_cgroup_charge_common(page, mm, gfp_mask,
1657 				MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
1658 }
1659
1660 static void
1661 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
1662 					enum charge_type ctype);
1663
1664 int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
1665 				gfp_t gfp_mask)
1666 {
1667 	struct mem_cgroup *mem = NULL;
1668 	int ret;
1669
1670 	if (mem_cgroup_disabled())
1671 		return 0;
1672 	if (PageCompound(page))
1673 		return 0;
1674 	/*
1675 	 * Corner case handling. This is usually called from
1676 	 * add_to_page_cache(). But some filesystems (shmem) precharge the page
1677 	 * before calling it and then call add_to_page_cache() with GFP_NOWAIT.
1678 	 *
1679 	 * In the GFP_NOWAIT case, the page may already be pre-charged before
1680 	 * add_to_page_cache() is called (see shmem.c); check for that here to
1681 	 * avoid charging twice. (It works, but pays a slightly larger cost.)
1682 	 * And when the page is SwapCache, swap information should be taken
1683 	 * into account. This is under lock_page() now.
1684 	 */
1685 	if (!(gfp_mask & __GFP_WAIT)) {
1686 		struct page_cgroup *pc;
1687
1688
1689 		pc = lookup_page_cgroup(page);
1690 		if (!pc)
1691 			return 0;
1692 		lock_page_cgroup(pc);
1693 		if (PageCgroupUsed(pc)) {
1694 			unlock_page_cgroup(pc);
1695 			return 0;
1696 		}
1697 		unlock_page_cgroup(pc);
1698 	}
1699
1700 	if (unlikely(!mm && !mem))
1701 		mm = &init_mm;
1702
1703 	if (page_is_file_cache(page))
1704 		return mem_cgroup_charge_common(page, mm, gfp_mask,
1705 				MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
1706
1707 	/* shmem */
1708 	if (PageSwapCache(page)) {
1709 		ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
1710 		if (!ret)
1711 			__mem_cgroup_commit_charge_swapin(page, mem,
1712 					MEM_CGROUP_CHARGE_TYPE_SHMEM);
1713 	} else
1714 		ret = mem_cgroup_charge_common(page, mm, gfp_mask,
1715 					MEM_CGROUP_CHARGE_TYPE_SHMEM, mem);
1716
1717 	return ret;
1718 }
1719
1720 /*
1721  * During swap-in (try_charge -> commit or cancel), the page is locked.
1722  * And when try_charge() successfully returns, one refcnt to the memcg is
1723  * acquired without a struct page_cgroup.
This refcnt will be consumed by
1724  * "commit()" or removed by "cancel()".
1725  */
1726 int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
1727 				 struct page *page,
1728 				 gfp_t mask, struct mem_cgroup **ptr)
1729 {
1730 	struct mem_cgroup *mem;
1731 	int ret;
1732
1733 	if (mem_cgroup_disabled())
1734 		return 0;
1735
1736 	if (!do_swap_account)
1737 		goto charge_cur_mm;
1738 	/*
1739 	 * A racing thread's fault, or swapoff, may have already updated
1740 	 * the pte, and even removed the page from the swap cache: return success
1741 	 * to go on to do_swap_page()'s pte_same() test, which should fail.
1742 	 */
1743 	if (!PageSwapCache(page))
1744 		return 0;
1745 	mem = try_get_mem_cgroup_from_swapcache(page);
1746 	if (!mem)
1747 		goto charge_cur_mm;
1748 	*ptr = mem;
1749 	ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, page);
1750 	/* drop the extra refcnt from tryget */
1751 	css_put(&mem->css);
1752 	return ret;
1753 charge_cur_mm:
1754 	if (unlikely(!mm))
1755 		mm = &init_mm;
1756 	return __mem_cgroup_try_charge(mm, mask, ptr, true, page);
1757 }
1758
1759 static void
1760 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
1761 					enum charge_type ctype)
1762 {
1763 	struct page_cgroup *pc;
1764
1765 	if (mem_cgroup_disabled())
1766 		return;
1767 	if (!ptr)
1768 		return;
1769 	cgroup_exclude_rmdir(&ptr->css);
1770 	pc = lookup_page_cgroup(page);
1771 	mem_cgroup_lru_del_before_commit_swapcache(page);
1772 	__mem_cgroup_commit_charge(ptr, pc, ctype);
1773 	mem_cgroup_lru_add_after_commit_swapcache(page);
1774 	/*
1775 	 * Now the swap is in memory. This means this page may be
1776 	 * counted both as mem and swap, i.e. double counted.
1777 	 * Fix it by uncharging from memsw. Basically, this SwapCache is stable
1778 	 * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page()
1779 	 * may call delete_from_swap_cache() before we reach here.
1780 	 */
1781 	if (do_swap_account && PageSwapCache(page)) {
1782 		swp_entry_t ent = {.val = page_private(page)};
1783 		unsigned short id;
1784 		struct mem_cgroup *memcg;
1785
1786 		id = swap_cgroup_record(ent, 0);
1787 		rcu_read_lock();
1788 		memcg = mem_cgroup_lookup(id);
1789 		if (memcg) {
1790 			/*
1791 			 * This recorded memcg may be an obsolete one. So, avoid
1792 			 * calling css_tryget().
1793 			 */
1794 			if (!mem_cgroup_is_root(memcg))
1795 				res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
1796 			mem_cgroup_swap_statistics(memcg, false);
1797 			mem_cgroup_put(memcg);
1798 		}
1799 		rcu_read_unlock();
1800 	}
1801 	/*
1802 	 * At swapin, we may charge against a cgroup which has no tasks.
1803 	 * So, rmdir()->pre_destroy() can be called while we do this charge.
1804 	 * In that case, we need to call pre_destroy() again. Check it here.
1805 */ 1806 cgroup_release_and_wakeup_rmdir(&ptr->css); 1807} 1808 1809void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) 1810{ 1811 __mem_cgroup_commit_charge_swapin(page, ptr, 1812 MEM_CGROUP_CHARGE_TYPE_MAPPED); 1813} 1814 1815void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) 1816{ 1817 if (mem_cgroup_disabled()) 1818 return; 1819 if (!mem) 1820 return; 1821 if (!mem_cgroup_is_root(mem)) { 1822 res_counter_uncharge(&mem->res, PAGE_SIZE); 1823 if (do_swap_account) 1824 res_counter_uncharge(&mem->memsw, PAGE_SIZE); 1825 } 1826 css_put(&mem->css); 1827} 1828 1829 1830/* 1831 * uncharge if !page_mapped(page) 1832 */ 1833static struct mem_cgroup * 1834__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) 1835{ 1836 struct page_cgroup *pc; 1837 struct mem_cgroup *mem = NULL; 1838 struct mem_cgroup_per_zone *mz; 1839 1840 if (mem_cgroup_disabled()) 1841 return NULL; 1842 1843 if (PageSwapCache(page)) 1844 return NULL; 1845 1846 /* 1847 * Check if our page_cgroup is valid 1848 */ 1849 pc = lookup_page_cgroup(page); 1850 if (unlikely(!pc || !PageCgroupUsed(pc))) 1851 return NULL; 1852 1853 lock_page_cgroup(pc); 1854 1855 mem = pc->mem_cgroup; 1856 1857 if (!PageCgroupUsed(pc)) 1858 goto unlock_out; 1859 1860 switch (ctype) { 1861 case MEM_CGROUP_CHARGE_TYPE_MAPPED: 1862 case MEM_CGROUP_CHARGE_TYPE_DROP: 1863 if (page_mapped(page)) 1864 goto unlock_out; 1865 break; 1866 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: 1867 if (!PageAnon(page)) { /* Shared memory */ 1868 if (page->mapping && !page_is_file_cache(page)) 1869 goto unlock_out; 1870 } else if (page_mapped(page)) /* Anon */ 1871 goto unlock_out; 1872 break; 1873 default: 1874 break; 1875 } 1876 1877 if (!mem_cgroup_is_root(mem)) { 1878 res_counter_uncharge(&mem->res, PAGE_SIZE); 1879 if (do_swap_account && 1880 (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) 1881 res_counter_uncharge(&mem->memsw, PAGE_SIZE); 1882 } 1883 if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 1884 mem_cgroup_swap_statistics(mem, true); 1885 mem_cgroup_charge_statistics(mem, pc, false); 1886 1887 ClearPageCgroupUsed(pc); 1888 /* 1889 * pc->mem_cgroup is not cleared here. It will be accessed when it's 1890 * freed from LRU. This is safe because uncharged page is expected not 1891 * to be reused (freed soon). Exception is SwapCache, it's handled by 1892 * special functions. 1893 */ 1894 1895 mz = page_cgroup_zoneinfo(pc); 1896 unlock_page_cgroup(pc); 1897 1898 if (mem_cgroup_soft_limit_check(mem)) 1899 mem_cgroup_update_tree(mem, page); 1900 /* at swapout, this memcg will be accessed to record to swap */ 1901 if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 1902 css_put(&mem->css); 1903 1904 return mem; 1905 1906unlock_out: 1907 unlock_page_cgroup(pc); 1908 return NULL; 1909} 1910 1911void mem_cgroup_uncharge_page(struct page *page) 1912{ 1913 /* early check. */ 1914 if (page_mapped(page)) 1915 return; 1916 if (page->mapping && !PageAnon(page)) 1917 return; 1918 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); 1919} 1920 1921void mem_cgroup_uncharge_cache_page(struct page *page) 1922{ 1923 VM_BUG_ON(page_mapped(page)); 1924 VM_BUG_ON(page->mapping); 1925 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); 1926} 1927 1928#ifdef CONFIG_SWAP 1929/* 1930 * called after __delete_from_swap_cache() and drop "page" account. 
1931 * memcg information is recorded to swap_cgroup of "ent" 1932 */ 1933void 1934mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) 1935{ 1936 struct mem_cgroup *memcg; 1937 int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT; 1938 1939 if (!swapout) /* this was a swap cache but the swap is unused ! */ 1940 ctype = MEM_CGROUP_CHARGE_TYPE_DROP; 1941 1942 memcg = __mem_cgroup_uncharge_common(page, ctype); 1943 1944 /* record memcg information */ 1945 if (do_swap_account && swapout && memcg) { 1946 swap_cgroup_record(ent, css_id(&memcg->css)); 1947 mem_cgroup_get(memcg); 1948 } 1949 if (swapout && memcg) 1950 css_put(&memcg->css); 1951} 1952#endif 1953 1954#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 1955/* 1956 * called from swap_entry_free(). remove record in swap_cgroup and 1957 * uncharge "memsw" account. 1958 */ 1959void mem_cgroup_uncharge_swap(swp_entry_t ent) 1960{ 1961 struct mem_cgroup *memcg; 1962 unsigned short id; 1963 1964 if (!do_swap_account) 1965 return; 1966 1967 id = swap_cgroup_record(ent, 0); 1968 rcu_read_lock(); 1969 memcg = mem_cgroup_lookup(id); 1970 if (memcg) { 1971 /* 1972 * We uncharge this because swap is freed. 1973 * This memcg can be obsolete one. We avoid calling css_tryget 1974 */ 1975 if (!mem_cgroup_is_root(memcg)) 1976 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 1977 mem_cgroup_swap_statistics(memcg, false); 1978 mem_cgroup_put(memcg); 1979 } 1980 rcu_read_unlock(); 1981} 1982#endif 1983 1984/* 1985 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old 1986 * page belongs to. 1987 */ 1988int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) 1989{ 1990 struct page_cgroup *pc; 1991 struct mem_cgroup *mem = NULL; 1992 int ret = 0; 1993 1994 if (mem_cgroup_disabled()) 1995 return 0; 1996 1997 pc = lookup_page_cgroup(page); 1998 lock_page_cgroup(pc); 1999 if (PageCgroupUsed(pc)) { 2000 mem = pc->mem_cgroup; 2001 css_get(&mem->css); 2002 } 2003 unlock_page_cgroup(pc); 2004 2005 if (mem) { 2006 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, 2007 page); 2008 css_put(&mem->css); 2009 } 2010 *ptr = mem; 2011 return ret; 2012} 2013 2014/* remove redundant charge if migration failed*/ 2015void mem_cgroup_end_migration(struct mem_cgroup *mem, 2016 struct page *oldpage, struct page *newpage) 2017{ 2018 struct page *target, *unused; 2019 struct page_cgroup *pc; 2020 enum charge_type ctype; 2021 2022 if (!mem) 2023 return; 2024 cgroup_exclude_rmdir(&mem->css); 2025 /* at migration success, oldpage->mapping is NULL. */ 2026 if (oldpage->mapping) { 2027 target = oldpage; 2028 unused = NULL; 2029 } else { 2030 target = newpage; 2031 unused = oldpage; 2032 } 2033 2034 if (PageAnon(target)) 2035 ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; 2036 else if (page_is_file_cache(target)) 2037 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; 2038 else 2039 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; 2040 2041 /* unused page is not on radix-tree now. */ 2042 if (unused) 2043 __mem_cgroup_uncharge_common(unused, ctype); 2044 2045 pc = lookup_page_cgroup(target); 2046 /* 2047 * __mem_cgroup_commit_charge() check PCG_USED bit of page_cgroup. 2048 * So, double-counting is effectively avoided. 2049 */ 2050 __mem_cgroup_commit_charge(mem, pc, ctype); 2051 2052 /* 2053 * Both of oldpage and newpage are still under lock_page(). 2054 * Then, we don't have to care about race in radix-tree. 2055 * But we have to be careful that this page is unmapped or not. 2056 * 2057 * There is a case for !page_mapped(). 
At the start of 2058 * migration, oldpage was mapped. But now, it's zapped. 2059 * But we know *target* page is not freed/reused under us. 2060 * mem_cgroup_uncharge_page() does all necessary checks. 2061 */ 2062 if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) 2063 mem_cgroup_uncharge_page(target); 2064 /* 2065 * At migration, we may charge against a cgroup which has no tasks. 2066 * So, rmdir()->pre_destroy() can be called while we do this charge. 2067 * In that case, we need to call pre_destroy() again. Check it here. 2068 */ 2069 cgroup_release_and_wakeup_rmdir(&mem->css); 2070} 2071 2072/* 2073 * A call to try to shrink memory usage on charge failure at shmem's swapin. 2074 * Calling hierarchical_reclaim is not enough because we should update 2075 * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM. 2076 * Moreover, considering hierarchy, we should reclaim from the mem_over_limit, 2077 * not from the memcg which this page would be charged to. 2078 * try_charge_swapin does all of this work properly. 2079 */ 2080int mem_cgroup_shmem_charge_fallback(struct page *page, 2081 struct mm_struct *mm, 2082 gfp_t gfp_mask) 2083{ 2084 struct mem_cgroup *mem = NULL; 2085 int ret; 2086 2087 if (mem_cgroup_disabled()) 2088 return 0; 2089 2090 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); 2091 if (!ret) 2092 mem_cgroup_cancel_charge_swapin(mem); /* it does the !mem check */ 2093 2094 return ret; 2095} 2096 2097static DEFINE_MUTEX(set_limit_mutex); 2098 2099static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, 2100 unsigned long long val) 2101{ 2102 int retry_count; 2103 int progress; 2104 u64 memswlimit; 2105 int ret = 0; 2106 int children = mem_cgroup_count_children(memcg); 2107 u64 curusage, oldusage; 2108 2109 /* 2110 * To keep hierarchical_reclaim simple, how long we should retry 2111 * depends on the caller. We set our retry count to be a function 2112 * of the number of children which we should visit in this loop. 2113 */ 2114 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children; 2115 2116 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); 2117 2118 while (retry_count) { 2119 if (signal_pending(current)) { 2120 ret = -EINTR; 2121 break; 2122 } 2123 /* 2124 * Rather than hide all this in some function, we do it in an 2125 * open-coded manner so you can see what it really does. 2126 * We have to guarantee mem->res.limit < mem->memsw.limit. 2127 */ 2128 mutex_lock(&set_limit_mutex); 2129 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 2130 if (memswlimit < val) { 2131 ret = -EINVAL; 2132 mutex_unlock(&set_limit_mutex); 2133 break; 2134 } 2135 ret = res_counter_set_limit(&memcg->res, val); 2136 if (!ret) { 2137 if (memswlimit == val) 2138 memcg->memsw_is_minimum = true; 2139 else 2140 memcg->memsw_is_minimum = false; 2141 } 2142 mutex_unlock(&set_limit_mutex); 2143 2144 if (!ret) 2145 break; 2146 2147 progress = mem_cgroup_hierarchical_reclaim(memcg, NULL, 2148 GFP_KERNEL, 2149 MEM_CGROUP_RECLAIM_SHRINK); 2150 curusage = res_counter_read_u64(&memcg->res, RES_USAGE); 2151 /* Usage is reduced? */ 2152 if (curusage >= oldusage) 2153 retry_count--; 2154 else 2155 oldusage = curusage; 2156 } 2157 2158 return ret; 2159} 2160 2161static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, 2162 unsigned long long val) 2163{ 2164 int retry_count; 2165 u64 memlimit, oldusage, curusage; 2166 int children = mem_cgroup_count_children(memcg); 2167 int ret = -EBUSY; 2168 2169 /* see mem_cgroup_resize_limit */ 2170 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; 2171 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 2172 while (retry_count) { 2173 if (signal_pending(current)) { 2174 ret = -EINTR; 2175 break; 2176 } 2177 /* 2178 * Rather than hide all this in some function, we do it in an 2179 * open-coded manner so you can see what it really does. 2180 * We have to guarantee mem->res.limit < mem->memsw.limit. 2181 */ 2182 mutex_lock(&set_limit_mutex); 2183 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 2184 if (memlimit > val) { 2185 ret = -EINVAL; 2186 mutex_unlock(&set_limit_mutex); 2187 break; 2188 } 2189 ret = res_counter_set_limit(&memcg->memsw, val); 2190 if (!ret) { 2191 if (memlimit == val) 2192 memcg->memsw_is_minimum = true; 2193 else 2194 memcg->memsw_is_minimum = false; 2195 } 2196 mutex_unlock(&set_limit_mutex); 2197 2198 if (!ret) 2199 break; 2200 2201 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, 2202 MEM_CGROUP_RECLAIM_NOSWAP | 2203 MEM_CGROUP_RECLAIM_SHRINK); 2204 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 2205 /* Usage is reduced? */ 2206 if (curusage >= oldusage) 2207 retry_count--; 2208 else 2209 oldusage = curusage; 2210 } 2211 return ret; 2212} 2213 2214unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 2215 gfp_t gfp_mask, int nid, 2216 int zid) 2217{ 2218 unsigned long nr_reclaimed = 0; 2219 struct mem_cgroup_per_zone *mz, *next_mz = NULL; 2220 unsigned long reclaimed; 2221 int loop = 0; 2222 struct mem_cgroup_tree_per_zone *mctz; 2223 unsigned long long excess; 2224 2225 if (order > 0) 2226 return 0; 2227 2228 mctz = soft_limit_tree_node_zone(nid, zid); 2229 /* 2230 * This loop can run for a while, especially if mem_cgroups continuously 2231 * keep exceeding their soft limit and putting the system under 2232 * pressure. 2233 */ 2234 do { 2235 if (next_mz) 2236 mz = next_mz; 2237 else 2238 mz = mem_cgroup_largest_soft_limit_node(mctz); 2239 if (!mz) 2240 break; 2241 2242 reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone, 2243 gfp_mask, 2244 MEM_CGROUP_RECLAIM_SOFT); 2245 nr_reclaimed += reclaimed; 2246 spin_lock(&mctz->lock); 2247 2248 /* 2249 * If we failed to reclaim anything from this memory cgroup, 2250 * it is time to move on to the next cgroup. 2251 */ 2252 next_mz = NULL; 2253 if (!reclaimed) { 2254 do { 2255 /* 2256 * Loop until we find yet another one. 2257 * 2258 * By the time we get the soft_limit lock 2259 * again, someone might have added the 2260 * group back on the RB tree. Iterate to 2261 * make sure we get a different mem.
2262 * mem_cgroup_largest_soft_limit_node returns 2263 * NULL if no other cgroup is present on 2264 * the tree 2265 */ 2266 next_mz = 2267 __mem_cgroup_largest_soft_limit_node(mctz); 2268 if (next_mz == mz) { 2269 css_put(&next_mz->mem->css); 2270 next_mz = NULL; 2271 } else /* next_mz == NULL or other memcg */ 2272 break; 2273 } while (1); 2274 } 2275 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); 2276 excess = res_counter_soft_limit_excess(&mz->mem->res); 2277 /* 2278 * One school of thought says that we should not add 2279 * back the node to the tree if reclaim returns 0. 2280 * But our reclaim could return 0, simply because due 2281 * to priority we are exposing a smaller subset of 2282 * memory to reclaim from. Consider this as a longer 2283 * term TODO. 2284 */ 2285 /* If excess == 0, no tree ops */ 2286 __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess); 2287 spin_unlock(&mctz->lock); 2288 css_put(&mz->mem->css); 2289 loop++; 2290 /* 2291 * Could not reclaim anything and there are no more 2292 * mem cgroups to try or we seem to be looping without 2293 * reclaiming anything. 2294 */ 2295 if (!nr_reclaimed && 2296 (next_mz == NULL || 2297 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) 2298 break; 2299 } while (!nr_reclaimed); 2300 if (next_mz) 2301 css_put(&next_mz->mem->css); 2302 return nr_reclaimed; 2303} 2304 2305/* 2306 * This routine traverses the page_cgroups in the given list and drops them all. 2307 * *And* this routine doesn't reclaim the page itself, it just removes the page_cgroup. 2308 */ 2309static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, 2310 int node, int zid, enum lru_list lru) 2311{ 2312 struct zone *zone; 2313 struct mem_cgroup_per_zone *mz; 2314 struct page_cgroup *pc, *busy; 2315 unsigned long flags, loop; 2316 struct list_head *list; 2317 int ret = 0; 2318 2319 zone = &NODE_DATA(node)->node_zones[zid]; 2320 mz = mem_cgroup_zoneinfo(mem, node, zid); 2321 list = &mz->lists[lru]; 2322 2323 loop = MEM_CGROUP_ZSTAT(mz, lru); 2324 /* give some margin against EBUSY etc...*/ 2325 loop += 256; 2326 busy = NULL; 2327 while (loop--) { 2328 ret = 0; 2329 spin_lock_irqsave(&zone->lru_lock, flags); 2330 if (list_empty(list)) { 2331 spin_unlock_irqrestore(&zone->lru_lock, flags); 2332 break; 2333 } 2334 pc = list_entry(list->prev, struct page_cgroup, lru); 2335 if (busy == pc) { 2336 list_move(&pc->lru, list); 2337 busy = NULL; 2338 spin_unlock_irqrestore(&zone->lru_lock, flags); 2339 continue; 2340 } 2341 spin_unlock_irqrestore(&zone->lru_lock, flags); 2342 2343 ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL); 2344 if (ret == -ENOMEM) 2345 break; 2346 2347 if (ret == -EBUSY || ret == -EINVAL) { 2348 /* found lock contention or "pc" is obsolete. */ 2349 busy = pc; 2350 cond_resched(); 2351 } else 2352 busy = NULL; 2353 } 2354 2355 if (!ret && !list_empty(list)) 2356 return -EBUSY; 2357 return ret; 2358} 2359 2360/* 2361 * Make the mem_cgroup's charge 0 if there is no task. 2362 * This enables deleting this mem_cgroup. 2363 */ 2364static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all) 2365{ 2366 int ret; 2367 int node, zid, shrink; 2368 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 2369 struct cgroup *cgrp = mem->css.cgroup; 2370 2371 css_get(&mem->css); 2372 2373 shrink = 0; 2374 /* should we free all? */ 2375 if (free_all) 2376 goto try_to_free; 2377move_account: 2378 while (mem->res.usage > 0) { 2379 ret = -EBUSY; 2380 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) 2381 goto out; 2382 ret = -EINTR; 2383 if (signal_pending(current)) 2384 goto out; 2385 /* This is for making all *used* pages be on the LRU. */ 2386 lru_add_drain_all(); 2387 ret = 0; 2388 for_each_node_state(node, N_HIGH_MEMORY) { 2389 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { 2390 enum lru_list l; 2391 for_each_lru(l) { 2392 ret = mem_cgroup_force_empty_list(mem, 2393 node, zid, l); 2394 if (ret) 2395 break; 2396 } 2397 } 2398 if (ret) 2399 break; 2400 } 2401 /* it seems the parent cgroup doesn't have enough memory */ 2402 if (ret == -ENOMEM) 2403 goto try_to_free; 2404 cond_resched(); 2405 } 2406 ret = 0; 2407out: 2408 css_put(&mem->css); 2409 return ret; 2410 2411try_to_free: 2412 /* returns EBUSY if there is a task or if we come here twice. */ 2413 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) { 2414 ret = -EBUSY; 2415 goto out; 2416 } 2417 /* we call try-to-free pages to make this cgroup empty */ 2418 lru_add_drain_all(); 2419 /* try to free all pages in this cgroup */ 2420 shrink = 1; 2421 while (nr_retries && mem->res.usage > 0) { 2422 int progress; 2423 2424 if (signal_pending(current)) { 2425 ret = -EINTR; 2426 goto out; 2427 } 2428 progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL, 2429 false, get_swappiness(mem)); 2430 if (!progress) { 2431 nr_retries--; 2432 /* maybe some writeback is necessary */ 2433 congestion_wait(BLK_RW_ASYNC, HZ/10); 2434 } 2435 2436 } 2437 lru_add_drain(); 2438 /* try move_account...there may be some *locked* pages. */ 2439 if (mem->res.usage) 2440 goto move_account; 2441 ret = 0; 2442 goto out; 2443} 2444 2445int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) 2446{ 2447 return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true); 2448} 2449 2450 2451static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft) 2452{ 2453 return mem_cgroup_from_cont(cont)->use_hierarchy; 2454} 2455 2456static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, 2457 u64 val) 2458{ 2459 int retval = 0; 2460 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 2461 struct cgroup *parent = cont->parent; 2462 struct mem_cgroup *parent_mem = NULL; 2463 2464 if (parent) 2465 parent_mem = mem_cgroup_from_cont(parent); 2466 2467 cgroup_lock(); 2468 /* 2469 * If the parent's use_hierarchy is set, we can't make any modifications 2470 * in the child subtrees. If it is unset, then the change can 2471 * occur, provided the current cgroup has no children. 2472 * 2473 * For the root cgroup, parent_mem is NULL; we allow the value to be 2474 * set if there are no children.
2475 */ 2476 if ((!parent_mem || !parent_mem->use_hierarchy) && 2477 (val == 1 || val == 0)) { 2478 if (list_empty(&cont->children)) 2479 mem->use_hierarchy = val; 2480 else 2481 retval = -EBUSY; 2482 } else 2483 retval = -EINVAL; 2484 cgroup_unlock(); 2485 2486 return retval; 2487} 2488 2489struct mem_cgroup_idx_data { 2490 s64 val; 2491 enum mem_cgroup_stat_index idx; 2492}; 2493 2494static int 2495mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data) 2496{ 2497 struct mem_cgroup_idx_data *d = data; 2498 d->val += mem_cgroup_read_stat(&mem->stat, d->idx); 2499 return 0; 2500} 2501 2502static void 2503mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem, 2504 enum mem_cgroup_stat_index idx, s64 *val) 2505{ 2506 struct mem_cgroup_idx_data d; 2507 d.idx = idx; 2508 d.val = 0; 2509 mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat); 2510 *val = d.val; 2511} 2512 2513static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) 2514{ 2515 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 2516 u64 idx_val, val; 2517 int type, name; 2518 2519 type = MEMFILE_TYPE(cft->private); 2520 name = MEMFILE_ATTR(cft->private); 2521 switch (type) { 2522 case _MEM: 2523 if (name == RES_USAGE && mem_cgroup_is_root(mem)) { 2524 mem_cgroup_get_recursive_idx_stat(mem, 2525 MEM_CGROUP_STAT_CACHE, &idx_val); 2526 val = idx_val; 2527 mem_cgroup_get_recursive_idx_stat(mem, 2528 MEM_CGROUP_STAT_RSS, &idx_val); 2529 val += idx_val; 2530 val <<= PAGE_SHIFT; 2531 } else 2532 val = res_counter_read_u64(&mem->res, name); 2533 break; 2534 case _MEMSWAP: 2535 if (name == RES_USAGE && mem_cgroup_is_root(mem)) { 2536 mem_cgroup_get_recursive_idx_stat(mem, 2537 MEM_CGROUP_STAT_CACHE, &idx_val); 2538 val = idx_val; 2539 mem_cgroup_get_recursive_idx_stat(mem, 2540 MEM_CGROUP_STAT_RSS, &idx_val); 2541 val += idx_val; 2542 mem_cgroup_get_recursive_idx_stat(mem, 2543 MEM_CGROUP_STAT_SWAPOUT, &idx_val); 2544 val <<= PAGE_SHIFT; 2545 } else 2546 val = res_counter_read_u64(&mem->memsw, name); 2547 break; 2548 default: 2549 BUG(); 2550 break; 2551 } 2552 return val; 2553} 2554/* 2555 * The user of this function is... 2556 * RES_LIMIT. 2557 */ 2558static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, 2559 const char *buffer) 2560{ 2561 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 2562 int type, name; 2563 unsigned long long val; 2564 int ret; 2565 2566 type = MEMFILE_TYPE(cft->private); 2567 name = MEMFILE_ATTR(cft->private); 2568 switch (name) { 2569 case RES_LIMIT: 2570 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 2571 ret = -EINVAL; 2572 break; 2573 } 2574 /* This function does all necessary parse...reuse it */ 2575 ret = res_counter_memparse_write_strategy(buffer, &val); 2576 if (ret) 2577 break; 2578 if (type == _MEM) 2579 ret = mem_cgroup_resize_limit(memcg, val); 2580 else 2581 ret = mem_cgroup_resize_memsw_limit(memcg, val); 2582 break; 2583 case RES_SOFT_LIMIT: 2584 ret = res_counter_memparse_write_strategy(buffer, &val); 2585 if (ret) 2586 break; 2587 /* 2588 * For memsw, soft limits are hard to implement in terms 2589 * of semantics, for now, we support soft limits for 2590 * control without swap 2591 */ 2592 if (type == _MEM) 2593 ret = res_counter_set_soft_limit(&memcg->res, val); 2594 else 2595 ret = -EINVAL; 2596 break; 2597 default: 2598 ret = -EINVAL; /* should be BUG() ? 
*/ 2599 break; 2600 } 2601 return ret; 2602} 2603 2604static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, 2605 unsigned long long *mem_limit, unsigned long long *memsw_limit) 2606{ 2607 struct cgroup *cgroup; 2608 unsigned long long min_limit, min_memsw_limit, tmp; 2609 2610 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT); 2611 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 2612 cgroup = memcg->css.cgroup; 2613 if (!memcg->use_hierarchy) 2614 goto out; 2615 2616 while (cgroup->parent) { 2617 cgroup = cgroup->parent; 2618 memcg = mem_cgroup_from_cont(cgroup); 2619 if (!memcg->use_hierarchy) 2620 break; 2621 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT); 2622 min_limit = min(min_limit, tmp); 2623 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 2624 min_memsw_limit = min(min_memsw_limit, tmp); 2625 } 2626out: 2627 *mem_limit = min_limit; 2628 *memsw_limit = min_memsw_limit; 2629 return; 2630} 2631 2632static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 2633{ 2634 struct mem_cgroup *mem; 2635 int type, name; 2636 2637 mem = mem_cgroup_from_cont(cont); 2638 type = MEMFILE_TYPE(event); 2639 name = MEMFILE_ATTR(event); 2640 switch (name) { 2641 case RES_MAX_USAGE: 2642 if (type == _MEM) 2643 res_counter_reset_max(&mem->res); 2644 else 2645 res_counter_reset_max(&mem->memsw); 2646 break; 2647 case RES_FAILCNT: 2648 if (type == _MEM) 2649 res_counter_reset_failcnt(&mem->res); 2650 else 2651 res_counter_reset_failcnt(&mem->memsw); 2652 break; 2653 } 2654 2655 return 0; 2656} 2657 2658 2659/* For read statistics */ 2660enum { 2661 MCS_CACHE, 2662 MCS_RSS, 2663 MCS_MAPPED_FILE, 2664 MCS_PGPGIN, 2665 MCS_PGPGOUT, 2666 MCS_SWAP, 2667 MCS_INACTIVE_ANON, 2668 MCS_ACTIVE_ANON, 2669 MCS_INACTIVE_FILE, 2670 MCS_ACTIVE_FILE, 2671 MCS_UNEVICTABLE, 2672 NR_MCS_STAT, 2673}; 2674 2675struct mcs_total_stat { 2676 s64 stat[NR_MCS_STAT]; 2677}; 2678 2679struct { 2680 char *local_name; 2681 char *total_name; 2682} memcg_stat_strings[NR_MCS_STAT] = { 2683 {"cache", "total_cache"}, 2684 {"rss", "total_rss"}, 2685 {"mapped_file", "total_mapped_file"}, 2686 {"pgpgin", "total_pgpgin"}, 2687 {"pgpgout", "total_pgpgout"}, 2688 {"swap", "total_swap"}, 2689 {"inactive_anon", "total_inactive_anon"}, 2690 {"active_anon", "total_active_anon"}, 2691 {"inactive_file", "total_inactive_file"}, 2692 {"active_file", "total_active_file"}, 2693 {"unevictable", "total_unevictable"} 2694}; 2695 2696 2697static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data) 2698{ 2699 struct mcs_total_stat *s = data; 2700 s64 val; 2701 2702 /* per cpu stat */ 2703 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_CACHE); 2704 s->stat[MCS_CACHE] += val * PAGE_SIZE; 2705 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); 2706 s->stat[MCS_RSS] += val * PAGE_SIZE; 2707 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_MAPPED_FILE); 2708 s->stat[MCS_MAPPED_FILE] += val * PAGE_SIZE; 2709 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT); 2710 s->stat[MCS_PGPGIN] += val; 2711 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); 2712 s->stat[MCS_PGPGOUT] += val; 2713 if (do_swap_account) { 2714 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_SWAPOUT); 2715 s->stat[MCS_SWAP] += val * PAGE_SIZE; 2716 } 2717 2718 /* per zone stat */ 2719 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); 2720 s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE; 2721 val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON); 2722 
s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE; 2723 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE); 2724 s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE; 2725 val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE); 2726 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; 2727 val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE); 2728 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; 2729 return 0; 2730} 2731 2732static void 2733mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) 2734{ 2735 mem_cgroup_walk_tree(mem, s, mem_cgroup_get_local_stat); 2736} 2737 2738static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, 2739 struct cgroup_map_cb *cb) 2740{ 2741 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); 2742 struct mcs_total_stat mystat; 2743 int i; 2744 2745 memset(&mystat, 0, sizeof(mystat)); 2746 mem_cgroup_get_local_stat(mem_cont, &mystat); 2747 2748 for (i = 0; i < NR_MCS_STAT; i++) { 2749 if (i == MCS_SWAP && !do_swap_account) 2750 continue; 2751 cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]); 2752 } 2753 2754 /* Hierarchical information */ 2755 { 2756 unsigned long long limit, memsw_limit; 2757 memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit); 2758 cb->fill(cb, "hierarchical_memory_limit", limit); 2759 if (do_swap_account) 2760 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit); 2761 } 2762 2763 memset(&mystat, 0, sizeof(mystat)); 2764 mem_cgroup_get_total_stat(mem_cont, &mystat); 2765 for (i = 0; i < NR_MCS_STAT; i++) { 2766 if (i == MCS_SWAP && !do_swap_account) 2767 continue; 2768 cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]); 2769 } 2770 2771#ifdef CONFIG_DEBUG_VM 2772 cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); 2773 2774 { 2775 int nid, zid; 2776 struct mem_cgroup_per_zone *mz; 2777 unsigned long recent_rotated[2] = {0, 0}; 2778 unsigned long recent_scanned[2] = {0, 0}; 2779 2780 for_each_online_node(nid) 2781 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 2782 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); 2783 2784 recent_rotated[0] += 2785 mz->reclaim_stat.recent_rotated[0]; 2786 recent_rotated[1] += 2787 mz->reclaim_stat.recent_rotated[1]; 2788 recent_scanned[0] += 2789 mz->reclaim_stat.recent_scanned[0]; 2790 recent_scanned[1] += 2791 mz->reclaim_stat.recent_scanned[1]; 2792 } 2793 cb->fill(cb, "recent_rotated_anon", recent_rotated[0]); 2794 cb->fill(cb, "recent_rotated_file", recent_rotated[1]); 2795 cb->fill(cb, "recent_scanned_anon", recent_scanned[0]); 2796 cb->fill(cb, "recent_scanned_file", recent_scanned[1]); 2797 } 2798#endif 2799 2800 return 0; 2801} 2802 2803static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft) 2804{ 2805 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 2806 2807 return get_swappiness(memcg); 2808} 2809 2810static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, 2811 u64 val) 2812{ 2813 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 2814 struct mem_cgroup *parent; 2815 2816 if (val > 100) 2817 return -EINVAL; 2818 2819 if (cgrp->parent == NULL) 2820 return -EINVAL; 2821 2822 parent = mem_cgroup_from_cont(cgrp->parent); 2823 2824 cgroup_lock(); 2825 2826 /* If under hierarchy, only empty-root can set this value */ 2827 if ((parent->use_hierarchy) || 2828 (memcg->use_hierarchy && !list_empty(&cgrp->children))) { 2829 cgroup_unlock(); 2830 return -EINVAL; 2831 } 2832 2833 spin_lock(&memcg->reclaim_param_lock); 2834 memcg->swappiness = val; 2835 
spin_unlock(&memcg->reclaim_param_lock); 2836 2837 cgroup_unlock(); 2838 2839 return 0; 2840} 2841 2842 2843static struct cftype mem_cgroup_files[] = { 2844 { 2845 .name = "usage_in_bytes", 2846 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 2847 .read_u64 = mem_cgroup_read, 2848 }, 2849 { 2850 .name = "max_usage_in_bytes", 2851 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 2852 .trigger = mem_cgroup_reset, 2853 .read_u64 = mem_cgroup_read, 2854 }, 2855 { 2856 .name = "limit_in_bytes", 2857 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 2858 .write_string = mem_cgroup_write, 2859 .read_u64 = mem_cgroup_read, 2860 }, 2861 { 2862 .name = "soft_limit_in_bytes", 2863 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 2864 .write_string = mem_cgroup_write, 2865 .read_u64 = mem_cgroup_read, 2866 }, 2867 { 2868 .name = "failcnt", 2869 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 2870 .trigger = mem_cgroup_reset, 2871 .read_u64 = mem_cgroup_read, 2872 }, 2873 { 2874 .name = "stat", 2875 .read_map = mem_control_stat_show, 2876 }, 2877 { 2878 .name = "force_empty", 2879 .trigger = mem_cgroup_force_empty_write, 2880 }, 2881 { 2882 .name = "use_hierarchy", 2883 .write_u64 = mem_cgroup_hierarchy_write, 2884 .read_u64 = mem_cgroup_hierarchy_read, 2885 }, 2886 { 2887 .name = "swappiness", 2888 .read_u64 = mem_cgroup_swappiness_read, 2889 .write_u64 = mem_cgroup_swappiness_write, 2890 }, 2891}; 2892 2893#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 2894static struct cftype memsw_cgroup_files[] = { 2895 { 2896 .name = "memsw.usage_in_bytes", 2897 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 2898 .read_u64 = mem_cgroup_read, 2899 }, 2900 { 2901 .name = "memsw.max_usage_in_bytes", 2902 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 2903 .trigger = mem_cgroup_reset, 2904 .read_u64 = mem_cgroup_read, 2905 }, 2906 { 2907 .name = "memsw.limit_in_bytes", 2908 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 2909 .write_string = mem_cgroup_write, 2910 .read_u64 = mem_cgroup_read, 2911 }, 2912 { 2913 .name = "memsw.failcnt", 2914 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 2915 .trigger = mem_cgroup_reset, 2916 .read_u64 = mem_cgroup_read, 2917 }, 2918}; 2919 2920static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) 2921{ 2922 if (!do_swap_account) 2923 return 0; 2924 return cgroup_add_files(cont, ss, memsw_cgroup_files, 2925 ARRAY_SIZE(memsw_cgroup_files)); 2926}; 2927#else 2928static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) 2929{ 2930 return 0; 2931} 2932#endif 2933 2934static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) 2935{ 2936 struct mem_cgroup_per_node *pn; 2937 struct mem_cgroup_per_zone *mz; 2938 enum lru_list l; 2939 int zone, tmp = node; 2940 /* 2941 * This routine is called against possible nodes. 2942 * But it's BUG to call kmalloc() against offline node. 2943 * 2944 * TODO: this routine can waste much memory for nodes which will 2945 * never be onlined. It's better to use memory hotplug callback 2946 * function. 
2947 */ 2948 if (!node_state(node, N_NORMAL_MEMORY)) 2949 tmp = -1; 2950 pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp); 2951 if (!pn) 2952 return 1; 2953 2954 mem->info.nodeinfo[node] = pn; 2955 memset(pn, 0, sizeof(*pn)); 2956 2957 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 2958 mz = &pn->zoneinfo[zone]; 2959 for_each_lru(l) 2960 INIT_LIST_HEAD(&mz->lists[l]); 2961 mz->usage_in_excess = 0; 2962 mz->on_tree = false; 2963 mz->mem = mem; 2964 } 2965 return 0; 2966} 2967 2968static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) 2969{ 2970 kfree(mem->info.nodeinfo[node]); 2971} 2972 2973static int mem_cgroup_size(void) 2974{ 2975 int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu); 2976 return sizeof(struct mem_cgroup) + cpustat_size; 2977} 2978 2979static struct mem_cgroup *mem_cgroup_alloc(void) 2980{ 2981 struct mem_cgroup *mem; 2982 int size = mem_cgroup_size(); 2983 2984 if (size < PAGE_SIZE) 2985 mem = kmalloc(size, GFP_KERNEL); 2986 else 2987 mem = vmalloc(size); 2988 2989 if (mem) 2990 memset(mem, 0, size); 2991 return mem; 2992} 2993 2994/* 2995 * At destroying mem_cgroup, references from swap_cgroup can remain. 2996 * (scanning all at force_empty is too costly...) 2997 * 2998 * Instead of clearing all references at force_empty, we remember 2999 * the number of reference from swap_cgroup and free mem_cgroup when 3000 * it goes down to 0. 3001 * 3002 * Removal of cgroup itself succeeds regardless of refs from swap. 3003 */ 3004 3005static void __mem_cgroup_free(struct mem_cgroup *mem) 3006{ 3007 int node; 3008 3009 mem_cgroup_remove_from_trees(mem); 3010 free_css_id(&mem_cgroup_subsys, &mem->css); 3011 3012 for_each_node_state(node, N_POSSIBLE) 3013 free_mem_cgroup_per_zone_info(mem, node); 3014 3015 if (mem_cgroup_size() < PAGE_SIZE) 3016 kfree(mem); 3017 else 3018 vfree(mem); 3019} 3020 3021static void mem_cgroup_get(struct mem_cgroup *mem) 3022{ 3023 atomic_inc(&mem->refcnt); 3024} 3025 3026static void mem_cgroup_put(struct mem_cgroup *mem) 3027{ 3028 if (atomic_dec_and_test(&mem->refcnt)) { 3029 struct mem_cgroup *parent = parent_mem_cgroup(mem); 3030 __mem_cgroup_free(mem); 3031 if (parent) 3032 mem_cgroup_put(parent); 3033 } 3034} 3035 3036/* 3037 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. 
3038 */ 3039static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem) 3040{ 3041 if (!mem->res.parent) 3042 return NULL; 3043 return mem_cgroup_from_res_counter(mem->res.parent, res); 3044} 3045 3046#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 3047static void __init enable_swap_cgroup(void) 3048{ 3049 if (!mem_cgroup_disabled() && really_do_swap_account) 3050 do_swap_account = 1; 3051} 3052#else 3053static void __init enable_swap_cgroup(void) 3054{ 3055} 3056#endif 3057 3058static int mem_cgroup_soft_limit_tree_init(void) 3059{ 3060 struct mem_cgroup_tree_per_node *rtpn; 3061 struct mem_cgroup_tree_per_zone *rtpz; 3062 int tmp, node, zone; 3063 3064 for_each_node_state(node, N_POSSIBLE) { 3065 tmp = node; 3066 if (!node_state(node, N_NORMAL_MEMORY)) 3067 tmp = -1; 3068 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); 3069 if (!rtpn) 3070 return 1; 3071 3072 soft_limit_tree.rb_tree_per_node[node] = rtpn; 3073 3074 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 3075 rtpz = &rtpn->rb_tree_per_zone[zone]; 3076 rtpz->rb_root = RB_ROOT; 3077 spin_lock_init(&rtpz->lock); 3078 } 3079 } 3080 return 0; 3081} 3082 3083static struct cgroup_subsys_state * __ref 3084mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) 3085{ 3086 struct mem_cgroup *mem, *parent; 3087 long error = -ENOMEM; 3088 int node; 3089 3090 mem = mem_cgroup_alloc(); 3091 if (!mem) 3092 return ERR_PTR(error); 3093 3094 for_each_node_state(node, N_POSSIBLE) 3095 if (alloc_mem_cgroup_per_zone_info(mem, node)) 3096 goto free_out; 3097 3098 /* root ? */ 3099 if (cont->parent == NULL) { 3100 enable_swap_cgroup(); 3101 parent = NULL; 3102 root_mem_cgroup = mem; 3103 if (mem_cgroup_soft_limit_tree_init()) 3104 goto free_out; 3105 3106 } else { 3107 parent = mem_cgroup_from_cont(cont->parent); 3108 mem->use_hierarchy = parent->use_hierarchy; 3109 } 3110 3111 if (parent && parent->use_hierarchy) { 3112 res_counter_init(&mem->res, &parent->res); 3113 res_counter_init(&mem->memsw, &parent->memsw); 3114 /* 3115 * We increment refcnt of the parent to ensure that we can 3116 * safely access it on res_counter_charge/uncharge. 3117 * This refcnt will be decremented when freeing this 3118 * mem_cgroup(see mem_cgroup_put). 
3119 */ 3120 mem_cgroup_get(parent); 3121 } else { 3122 res_counter_init(&mem->res, NULL); 3123 res_counter_init(&mem->memsw, NULL); 3124 } 3125 mem->last_scanned_child = 0; 3126 spin_lock_init(&mem->reclaim_param_lock); 3127 3128 if (parent) 3129 mem->swappiness = get_swappiness(parent); 3130 atomic_set(&mem->refcnt, 1); 3131 return &mem->css; 3132free_out: 3133 __mem_cgroup_free(mem); 3134 root_mem_cgroup = NULL; 3135 return ERR_PTR(error); 3136} 3137 3138static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss, 3139 struct cgroup *cont) 3140{ 3141 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 3142 3143 return mem_cgroup_force_empty(mem, false); 3144} 3145 3146static void mem_cgroup_destroy(struct cgroup_subsys *ss, 3147 struct cgroup *cont) 3148{ 3149 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 3150 3151 mem_cgroup_put(mem); 3152} 3153 3154static int mem_cgroup_populate(struct cgroup_subsys *ss, 3155 struct cgroup *cont) 3156{ 3157 int ret; 3158 3159 ret = cgroup_add_files(cont, ss, mem_cgroup_files, 3160 ARRAY_SIZE(mem_cgroup_files)); 3161 3162 if (!ret) 3163 ret = register_memsw_files(cont, ss); 3164 return ret; 3165} 3166 3167static void mem_cgroup_move_task(struct cgroup_subsys *ss, 3168 struct cgroup *cont, 3169 struct cgroup *old_cont, 3170 struct task_struct *p, 3171 bool threadgroup) 3172{ 3173 mutex_lock(&memcg_tasklist); 3174 /* 3175 * FIXME: It's better to move charges of this process from old 3176 * memcg to new memcg. But it's just on TODO-List now. 3177 */ 3178 mutex_unlock(&memcg_tasklist); 3179} 3180 3181struct cgroup_subsys mem_cgroup_subsys = { 3182 .name = "memory", 3183 .subsys_id = mem_cgroup_subsys_id, 3184 .create = mem_cgroup_create, 3185 .pre_destroy = mem_cgroup_pre_destroy, 3186 .destroy = mem_cgroup_destroy, 3187 .populate = mem_cgroup_populate, 3188 .attach = mem_cgroup_move_task, 3189 .early_init = 0, 3190 .use_id = 1, 3191}; 3192 3193#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 3194 3195static int __init disable_swap_account(char *s) 3196{ 3197 really_do_swap_account = 0; 3198 return 1; 3199} 3200__setup("noswapaccount", disable_swap_account); 3201#endif 3202
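The cftype tables registered above (mem_cgroup_files[] and memsw_cgroup_files[]) are what user space actually sees: the cgroup core prefixes each entry with the subsystem name, so they appear as memory.usage_in_bytes, memory.limit_in_bytes, memory.swappiness, and so on under the cgroup mount. The following is a minimal, illustrative user-space sketch (not part of memcontrol.c) of how those files might be driven; the mount point /cgroup/memory and the group name "demo" are assumptions made only for this example.

/*
 * Editor's illustrative example (user space, not kernel code).
 * Assumes the memory cgroup is mounted at /cgroup/memory and a
 * group "demo" already exists there; both are assumptions.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

/* Write a string to a cgroup control file; returns 0 on success. */
static int write_file(const char *path, const char *buf)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, buf, strlen(buf)) < 0) {
		close(fd);
		return -1;
	}
	return close(fd);
}

/* Read a decimal u64 value back from a control file; -1 on error. */
static long long read_u64_file(const char *path)
{
	char buf[64];
	ssize_t len;
	int fd = open(path, O_RDONLY);

	if (fd < 0)
		return -1;
	len = read(fd, buf, sizeof(buf) - 1);
	close(fd);
	if (len <= 0)
		return -1;
	buf[len] = '\0';
	return strtoll(buf, NULL, 10);
}

int main(void)
{
	const char *grp = "/cgroup/memory/demo";	/* assumed layout */
	char path[256];

	/* "64M" is parsed by res_counter_memparse_write_strategy() */
	snprintf(path, sizeof(path), "%s/memory.limit_in_bytes", grp);
	if (write_file(path, "64M") < 0)
		perror("set memory.limit_in_bytes");

	/* served by mem_cgroup_read() via the read_u64 callback */
	snprintf(path, sizeof(path), "%s/memory.usage_in_bytes", grp);
	printf("usage: %lld bytes\n", read_u64_file(path));

	return 0;
}

A write to memory.limit_in_bytes lands in mem_cgroup_write() and then mem_cgroup_resize_limit(), which retries hierarchical reclaim as shown earlier; a read of memory.usage_in_bytes goes through mem_cgroup_read().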