memcontrol.c revision aa20d489ceb024f91aae084ee00c47fc6a12255c
/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */

#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/limits.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmalloc.h>
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
#include <linux/cpu.h>
#include "internal.h"

#include <asm/uaccess.h>

struct cgroup_subsys mem_cgroup_subsys __read_mostly;
#define MEM_CGROUP_RECLAIM_RETRIES	5
struct mem_cgroup *root_mem_cgroup __read_mostly;

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
int do_swap_account __read_mostly;
static int really_do_swap_account __initdata = 1; /* for remembering boot option */
#else
#define do_swap_account		(0)
#endif

#define SOFTLIMIT_EVENTS_THRESH (1000)

/*
 * Statistics for memory cgroup.
 */
enum mem_cgroup_stat_index {
	/*
	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
	 */
	MEM_CGROUP_STAT_CACHE,		/* # of pages charged as cache */
	MEM_CGROUP_STAT_RSS,		/* # of pages charged as anon rss */
	MEM_CGROUP_STAT_FILE_MAPPED,	/* # of pages charged as file rss */
	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */
	MEM_CGROUP_STAT_EVENTS,	/* sum of pagein + pageout for internal use */
	MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */

	MEM_CGROUP_STAT_NSTATS,
};

struct mem_cgroup_stat_cpu {
	s64 count[MEM_CGROUP_STAT_NSTATS];
} ____cacheline_aligned_in_smp;

struct mem_cgroup_stat {
	struct mem_cgroup_stat_cpu cpustat[0];
};

static inline void
__mem_cgroup_stat_reset_safe(struct mem_cgroup_stat_cpu *stat,
				enum mem_cgroup_stat_index idx)
{
	stat->count[idx] = 0;
}

static inline s64
__mem_cgroup_stat_read_local(struct mem_cgroup_stat_cpu *stat,
				enum mem_cgroup_stat_index idx)
{
	return stat->count[idx];
}

/*
 * For accounting under irq disable, no need to increment the preempt count.
102 */ 103static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat, 104 enum mem_cgroup_stat_index idx, int val) 105{ 106 stat->count[idx] += val; 107} 108 109static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat, 110 enum mem_cgroup_stat_index idx) 111{ 112 int cpu; 113 s64 ret = 0; 114 for_each_possible_cpu(cpu) 115 ret += stat->cpustat[cpu].count[idx]; 116 return ret; 117} 118 119static s64 mem_cgroup_local_usage(struct mem_cgroup_stat *stat) 120{ 121 s64 ret; 122 123 ret = mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_CACHE); 124 ret += mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_RSS); 125 return ret; 126} 127 128/* 129 * per-zone information in memory controller. 130 */ 131struct mem_cgroup_per_zone { 132 /* 133 * spin_lock to protect the per cgroup LRU 134 */ 135 struct list_head lists[NR_LRU_LISTS]; 136 unsigned long count[NR_LRU_LISTS]; 137 138 struct zone_reclaim_stat reclaim_stat; 139 struct rb_node tree_node; /* RB tree node */ 140 unsigned long long usage_in_excess;/* Set to the value by which */ 141 /* the soft limit is exceeded*/ 142 bool on_tree; 143 struct mem_cgroup *mem; /* Back pointer, we cannot */ 144 /* use container_of */ 145}; 146/* Macro for accessing counter */ 147#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) 148 149struct mem_cgroup_per_node { 150 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; 151}; 152 153struct mem_cgroup_lru_info { 154 struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES]; 155}; 156 157/* 158 * Cgroups above their limits are maintained in a RB-Tree, independent of 159 * their hierarchy representation 160 */ 161 162struct mem_cgroup_tree_per_zone { 163 struct rb_root rb_root; 164 spinlock_t lock; 165}; 166 167struct mem_cgroup_tree_per_node { 168 struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; 169}; 170 171struct mem_cgroup_tree { 172 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; 173}; 174 175static struct mem_cgroup_tree soft_limit_tree __read_mostly; 176 177/* 178 * The memory controller data structure. The memory controller controls both 179 * page cache and RSS per cgroup. We would eventually like to provide 180 * statistics based on the statistics developed by Rik Van Riel for clock-pro, 181 * to help the administrator determine what knobs to tune. 182 * 183 * TODO: Add a water mark for the memory controller. Reclaim will begin when 184 * we hit the water mark. May be even add a low water mark, such that 185 * no reclaim occurs from a cgroup at it's low water mark, this is 186 * a feature that will be implemented much later in the future. 187 */ 188struct mem_cgroup { 189 struct cgroup_subsys_state css; 190 /* 191 * the counter to account for memory usage 192 */ 193 struct res_counter res; 194 /* 195 * the counter to account for mem+swap usage. 196 */ 197 struct res_counter memsw; 198 /* 199 * Per cgroup active and inactive list, similar to the 200 * per zone LRU lists. 201 */ 202 struct mem_cgroup_lru_info info; 203 204 /* 205 protect against reclaim related member. 206 */ 207 spinlock_t reclaim_param_lock; 208 209 int prev_priority; /* for recording reclaim priority */ 210 211 /* 212 * While reclaiming in a hierarchy, we cache the last child we 213 * reclaimed from. 214 */ 215 int last_scanned_child; 216 /* 217 * Should the accounting and control be hierarchical, per subtree? 
218 */ 219 bool use_hierarchy; 220 unsigned long last_oom_jiffies; 221 atomic_t refcnt; 222 223 unsigned int swappiness; 224 225 /* set when res.limit == memsw.limit */ 226 bool memsw_is_minimum; 227 228 /* 229 * statistics. This must be placed at the end of memcg. 230 */ 231 struct mem_cgroup_stat stat; 232}; 233 234/* 235 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft 236 * limit reclaim to prevent infinite loops, if they ever occur. 237 */ 238#define MEM_CGROUP_MAX_RECLAIM_LOOPS (100) 239#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2) 240 241enum charge_type { 242 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 243 MEM_CGROUP_CHARGE_TYPE_MAPPED, 244 MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */ 245 MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ 246 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ 247 MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ 248 NR_CHARGE_TYPE, 249}; 250 251/* only for here (for easy reading.) */ 252#define PCGF_CACHE (1UL << PCG_CACHE) 253#define PCGF_USED (1UL << PCG_USED) 254#define PCGF_LOCK (1UL << PCG_LOCK) 255/* Not used, but added here for completeness */ 256#define PCGF_ACCT (1UL << PCG_ACCT) 257 258/* for encoding cft->private value on file */ 259#define _MEM (0) 260#define _MEMSWAP (1) 261#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) 262#define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) 263#define MEMFILE_ATTR(val) ((val) & 0xffff) 264 265/* 266 * Reclaim flags for mem_cgroup_hierarchical_reclaim 267 */ 268#define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0 269#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT) 270#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 271#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) 272#define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2 273#define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT) 274 275static void mem_cgroup_get(struct mem_cgroup *mem); 276static void mem_cgroup_put(struct mem_cgroup *mem); 277static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); 278static void drain_all_stock_async(void); 279 280static struct mem_cgroup_per_zone * 281mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) 282{ 283 return &mem->info.nodeinfo[nid]->zoneinfo[zid]; 284} 285 286static struct mem_cgroup_per_zone * 287page_cgroup_zoneinfo(struct page_cgroup *pc) 288{ 289 struct mem_cgroup *mem = pc->mem_cgroup; 290 int nid = page_cgroup_nid(pc); 291 int zid = page_cgroup_zid(pc); 292 293 if (!mem) 294 return NULL; 295 296 return mem_cgroup_zoneinfo(mem, nid, zid); 297} 298 299static struct mem_cgroup_tree_per_zone * 300soft_limit_tree_node_zone(int nid, int zid) 301{ 302 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; 303} 304 305static struct mem_cgroup_tree_per_zone * 306soft_limit_tree_from_page(struct page *page) 307{ 308 int nid = page_to_nid(page); 309 int zid = page_zonenum(page); 310 311 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; 312} 313 314static void 315__mem_cgroup_insert_exceeded(struct mem_cgroup *mem, 316 struct mem_cgroup_per_zone *mz, 317 struct mem_cgroup_tree_per_zone *mctz, 318 unsigned long long new_usage_in_excess) 319{ 320 struct rb_node **p = &mctz->rb_root.rb_node; 321 struct rb_node *parent = NULL; 322 struct mem_cgroup_per_zone *mz_node; 323 324 if (mz->on_tree) 325 return; 326 327 mz->usage_in_excess = new_usage_in_excess; 328 if (!mz->usage_in_excess) 329 return; 330 while (*p) { 331 parent = *p; 332 mz_node = rb_entry(parent, 
struct mem_cgroup_per_zone, 333 tree_node); 334 if (mz->usage_in_excess < mz_node->usage_in_excess) 335 p = &(*p)->rb_left; 336 /* 337 * We can't avoid mem cgroups that are over their soft 338 * limit by the same amount 339 */ 340 else if (mz->usage_in_excess >= mz_node->usage_in_excess) 341 p = &(*p)->rb_right; 342 } 343 rb_link_node(&mz->tree_node, parent, p); 344 rb_insert_color(&mz->tree_node, &mctz->rb_root); 345 mz->on_tree = true; 346} 347 348static void 349__mem_cgroup_remove_exceeded(struct mem_cgroup *mem, 350 struct mem_cgroup_per_zone *mz, 351 struct mem_cgroup_tree_per_zone *mctz) 352{ 353 if (!mz->on_tree) 354 return; 355 rb_erase(&mz->tree_node, &mctz->rb_root); 356 mz->on_tree = false; 357} 358 359static void 360mem_cgroup_remove_exceeded(struct mem_cgroup *mem, 361 struct mem_cgroup_per_zone *mz, 362 struct mem_cgroup_tree_per_zone *mctz) 363{ 364 spin_lock(&mctz->lock); 365 __mem_cgroup_remove_exceeded(mem, mz, mctz); 366 spin_unlock(&mctz->lock); 367} 368 369static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem) 370{ 371 bool ret = false; 372 int cpu; 373 s64 val; 374 struct mem_cgroup_stat_cpu *cpustat; 375 376 cpu = get_cpu(); 377 cpustat = &mem->stat.cpustat[cpu]; 378 val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_EVENTS); 379 if (unlikely(val > SOFTLIMIT_EVENTS_THRESH)) { 380 __mem_cgroup_stat_reset_safe(cpustat, MEM_CGROUP_STAT_EVENTS); 381 ret = true; 382 } 383 put_cpu(); 384 return ret; 385} 386 387static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) 388{ 389 unsigned long long excess; 390 struct mem_cgroup_per_zone *mz; 391 struct mem_cgroup_tree_per_zone *mctz; 392 int nid = page_to_nid(page); 393 int zid = page_zonenum(page); 394 mctz = soft_limit_tree_from_page(page); 395 396 /* 397 * Necessary to update all ancestors when hierarchy is used. 398 * because their event counter is not touched. 399 */ 400 for (; mem; mem = parent_mem_cgroup(mem)) { 401 mz = mem_cgroup_zoneinfo(mem, nid, zid); 402 excess = res_counter_soft_limit_excess(&mem->res); 403 /* 404 * We have to update the tree if mz is on RB-tree or 405 * mem is over its softlimit. 406 */ 407 if (excess || mz->on_tree) { 408 spin_lock(&mctz->lock); 409 /* if on-tree, remove it */ 410 if (mz->on_tree) 411 __mem_cgroup_remove_exceeded(mem, mz, mctz); 412 /* 413 * Insert again. mz->usage_in_excess will be updated. 414 * If excess is 0, no tree ops. 
415 */ 416 __mem_cgroup_insert_exceeded(mem, mz, mctz, excess); 417 spin_unlock(&mctz->lock); 418 } 419 } 420} 421 422static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem) 423{ 424 int node, zone; 425 struct mem_cgroup_per_zone *mz; 426 struct mem_cgroup_tree_per_zone *mctz; 427 428 for_each_node_state(node, N_POSSIBLE) { 429 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 430 mz = mem_cgroup_zoneinfo(mem, node, zone); 431 mctz = soft_limit_tree_node_zone(node, zone); 432 mem_cgroup_remove_exceeded(mem, mz, mctz); 433 } 434 } 435} 436 437static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem) 438{ 439 return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT; 440} 441 442static struct mem_cgroup_per_zone * 443__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) 444{ 445 struct rb_node *rightmost = NULL; 446 struct mem_cgroup_per_zone *mz; 447 448retry: 449 mz = NULL; 450 rightmost = rb_last(&mctz->rb_root); 451 if (!rightmost) 452 goto done; /* Nothing to reclaim from */ 453 454 mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); 455 /* 456 * Remove the node now but someone else can add it back, 457 * we will to add it back at the end of reclaim to its correct 458 * position in the tree. 459 */ 460 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); 461 if (!res_counter_soft_limit_excess(&mz->mem->res) || 462 !css_tryget(&mz->mem->css)) 463 goto retry; 464done: 465 return mz; 466} 467 468static struct mem_cgroup_per_zone * 469mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) 470{ 471 struct mem_cgroup_per_zone *mz; 472 473 spin_lock(&mctz->lock); 474 mz = __mem_cgroup_largest_soft_limit_node(mctz); 475 spin_unlock(&mctz->lock); 476 return mz; 477} 478 479static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, 480 bool charge) 481{ 482 int val = (charge) ? 1 : -1; 483 struct mem_cgroup_stat *stat = &mem->stat; 484 struct mem_cgroup_stat_cpu *cpustat; 485 int cpu = get_cpu(); 486 487 cpustat = &stat->cpustat[cpu]; 488 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SWAPOUT, val); 489 put_cpu(); 490} 491 492static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, 493 struct page_cgroup *pc, 494 bool charge) 495{ 496 int val = (charge) ? 
1 : -1; 497 struct mem_cgroup_stat *stat = &mem->stat; 498 struct mem_cgroup_stat_cpu *cpustat; 499 int cpu = get_cpu(); 500 501 cpustat = &stat->cpustat[cpu]; 502 if (PageCgroupCache(pc)) 503 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val); 504 else 505 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val); 506 507 if (charge) 508 __mem_cgroup_stat_add_safe(cpustat, 509 MEM_CGROUP_STAT_PGPGIN_COUNT, 1); 510 else 511 __mem_cgroup_stat_add_safe(cpustat, 512 MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); 513 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_EVENTS, 1); 514 put_cpu(); 515} 516 517static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, 518 enum lru_list idx) 519{ 520 int nid, zid; 521 struct mem_cgroup_per_zone *mz; 522 u64 total = 0; 523 524 for_each_online_node(nid) 525 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 526 mz = mem_cgroup_zoneinfo(mem, nid, zid); 527 total += MEM_CGROUP_ZSTAT(mz, idx); 528 } 529 return total; 530} 531 532static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) 533{ 534 return container_of(cgroup_subsys_state(cont, 535 mem_cgroup_subsys_id), struct mem_cgroup, 536 css); 537} 538 539struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) 540{ 541 /* 542 * mm_update_next_owner() may clear mm->owner to NULL 543 * if it races with swapoff, page migration, etc. 544 * So this can be called with p == NULL. 545 */ 546 if (unlikely(!p)) 547 return NULL; 548 549 return container_of(task_subsys_state(p, mem_cgroup_subsys_id), 550 struct mem_cgroup, css); 551} 552 553static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) 554{ 555 struct mem_cgroup *mem = NULL; 556 557 if (!mm) 558 return NULL; 559 /* 560 * Because we have no locks, mm->owner's may be being moved to other 561 * cgroup. We use css_tryget() here even if this looks 562 * pessimistic (rather than adding locks here). 563 */ 564 rcu_read_lock(); 565 do { 566 mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); 567 if (unlikely(!mem)) 568 break; 569 } while (!css_tryget(&mem->css)); 570 rcu_read_unlock(); 571 return mem; 572} 573 574/* 575 * Call callback function against all cgroup under hierarchy tree. 576 */ 577static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data, 578 int (*func)(struct mem_cgroup *, void *)) 579{ 580 int found, ret, nextid; 581 struct cgroup_subsys_state *css; 582 struct mem_cgroup *mem; 583 584 if (!root->use_hierarchy) 585 return (*func)(root, data); 586 587 nextid = 1; 588 do { 589 ret = 0; 590 mem = NULL; 591 592 rcu_read_lock(); 593 css = css_get_next(&mem_cgroup_subsys, nextid, &root->css, 594 &found); 595 if (css && css_tryget(css)) 596 mem = container_of(css, struct mem_cgroup, css); 597 rcu_read_unlock(); 598 599 if (mem) { 600 ret = (*func)(mem, data); 601 css_put(&mem->css); 602 } 603 nextid = found + 1; 604 } while (!ret && css); 605 606 return ret; 607} 608 609static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) 610{ 611 return (mem == root_mem_cgroup); 612} 613 614/* 615 * Following LRU functions are allowed to be used without PCG_LOCK. 616 * Operations are called by routine of global LRU independently from memcg. 617 * What we have to take care of here is validness of pc->mem_cgroup. 618 * 619 * Changes to pc->mem_cgroup happens when 620 * 1. charge 621 * 2. moving account 622 * In typical case, "charge" is done before add-to-lru. Exception is SwapCache. 623 * It is added to LRU before charge. 
624 * If PCG_USED bit is not set, page_cgroup is not added to this private LRU. 625 * When moving account, the page is not on LRU. It's isolated. 626 */ 627 628void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) 629{ 630 struct page_cgroup *pc; 631 struct mem_cgroup_per_zone *mz; 632 633 if (mem_cgroup_disabled()) 634 return; 635 pc = lookup_page_cgroup(page); 636 /* can happen while we handle swapcache. */ 637 if (!TestClearPageCgroupAcctLRU(pc)) 638 return; 639 VM_BUG_ON(!pc->mem_cgroup); 640 /* 641 * We don't check PCG_USED bit. It's cleared when the "page" is finally 642 * removed from global LRU. 643 */ 644 mz = page_cgroup_zoneinfo(pc); 645 MEM_CGROUP_ZSTAT(mz, lru) -= 1; 646 if (mem_cgroup_is_root(pc->mem_cgroup)) 647 return; 648 VM_BUG_ON(list_empty(&pc->lru)); 649 list_del_init(&pc->lru); 650 return; 651} 652 653void mem_cgroup_del_lru(struct page *page) 654{ 655 mem_cgroup_del_lru_list(page, page_lru(page)); 656} 657 658void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) 659{ 660 struct mem_cgroup_per_zone *mz; 661 struct page_cgroup *pc; 662 663 if (mem_cgroup_disabled()) 664 return; 665 666 pc = lookup_page_cgroup(page); 667 /* 668 * Used bit is set without atomic ops but after smp_wmb(). 669 * For making pc->mem_cgroup visible, insert smp_rmb() here. 670 */ 671 smp_rmb(); 672 /* unused or root page is not rotated. */ 673 if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup)) 674 return; 675 mz = page_cgroup_zoneinfo(pc); 676 list_move(&pc->lru, &mz->lists[lru]); 677} 678 679void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) 680{ 681 struct page_cgroup *pc; 682 struct mem_cgroup_per_zone *mz; 683 684 if (mem_cgroup_disabled()) 685 return; 686 pc = lookup_page_cgroup(page); 687 VM_BUG_ON(PageCgroupAcctLRU(pc)); 688 /* 689 * Used bit is set without atomic ops but after smp_wmb(). 690 * For making pc->mem_cgroup visible, insert smp_rmb() here. 691 */ 692 smp_rmb(); 693 if (!PageCgroupUsed(pc)) 694 return; 695 696 mz = page_cgroup_zoneinfo(pc); 697 MEM_CGROUP_ZSTAT(mz, lru) += 1; 698 SetPageCgroupAcctLRU(pc); 699 if (mem_cgroup_is_root(pc->mem_cgroup)) 700 return; 701 list_add(&pc->lru, &mz->lists[lru]); 702} 703 704/* 705 * At handling SwapCache, pc->mem_cgroup may be changed while it's linked to 706 * lru because the page may.be reused after it's fully uncharged (because of 707 * SwapCache behavior).To handle that, unlink page_cgroup from LRU when charge 708 * it again. This function is only used to charge SwapCache. It's done under 709 * lock_page and expected that zone->lru_lock is never held. 710 */ 711static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page) 712{ 713 unsigned long flags; 714 struct zone *zone = page_zone(page); 715 struct page_cgroup *pc = lookup_page_cgroup(page); 716 717 spin_lock_irqsave(&zone->lru_lock, flags); 718 /* 719 * Forget old LRU when this page_cgroup is *not* used. This Used bit 720 * is guarded by lock_page() because the page is SwapCache. 
721 */ 722 if (!PageCgroupUsed(pc)) 723 mem_cgroup_del_lru_list(page, page_lru(page)); 724 spin_unlock_irqrestore(&zone->lru_lock, flags); 725} 726 727static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page) 728{ 729 unsigned long flags; 730 struct zone *zone = page_zone(page); 731 struct page_cgroup *pc = lookup_page_cgroup(page); 732 733 spin_lock_irqsave(&zone->lru_lock, flags); 734 /* link when the page is linked to LRU but page_cgroup isn't */ 735 if (PageLRU(page) && !PageCgroupAcctLRU(pc)) 736 mem_cgroup_add_lru_list(page, page_lru(page)); 737 spin_unlock_irqrestore(&zone->lru_lock, flags); 738} 739 740 741void mem_cgroup_move_lists(struct page *page, 742 enum lru_list from, enum lru_list to) 743{ 744 if (mem_cgroup_disabled()) 745 return; 746 mem_cgroup_del_lru_list(page, from); 747 mem_cgroup_add_lru_list(page, to); 748} 749 750int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) 751{ 752 int ret; 753 struct mem_cgroup *curr = NULL; 754 755 task_lock(task); 756 rcu_read_lock(); 757 curr = try_get_mem_cgroup_from_mm(task->mm); 758 rcu_read_unlock(); 759 task_unlock(task); 760 if (!curr) 761 return 0; 762 /* 763 * We should check use_hierarchy of "mem" not "curr". Because checking 764 * use_hierarchy of "curr" here make this function true if hierarchy is 765 * enabled in "curr" and "curr" is a child of "mem" in *cgroup* 766 * hierarchy(even if use_hierarchy is disabled in "mem"). 767 */ 768 if (mem->use_hierarchy) 769 ret = css_is_ancestor(&curr->css, &mem->css); 770 else 771 ret = (curr == mem); 772 css_put(&curr->css); 773 return ret; 774} 775 776/* 777 * prev_priority control...this will be used in memory reclaim path. 778 */ 779int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) 780{ 781 int prev_priority; 782 783 spin_lock(&mem->reclaim_param_lock); 784 prev_priority = mem->prev_priority; 785 spin_unlock(&mem->reclaim_param_lock); 786 787 return prev_priority; 788} 789 790void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority) 791{ 792 spin_lock(&mem->reclaim_param_lock); 793 if (priority < mem->prev_priority) 794 mem->prev_priority = priority; 795 spin_unlock(&mem->reclaim_param_lock); 796} 797 798void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority) 799{ 800 spin_lock(&mem->reclaim_param_lock); 801 mem->prev_priority = priority; 802 spin_unlock(&mem->reclaim_param_lock); 803} 804 805static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages) 806{ 807 unsigned long active; 808 unsigned long inactive; 809 unsigned long gb; 810 unsigned long inactive_ratio; 811 812 inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON); 813 active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON); 814 815 gb = (inactive + active) >> (30 - PAGE_SHIFT); 816 if (gb) 817 inactive_ratio = int_sqrt(10 * gb); 818 else 819 inactive_ratio = 1; 820 821 if (present_pages) { 822 present_pages[0] = inactive; 823 present_pages[1] = active; 824 } 825 826 return inactive_ratio; 827} 828 829int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg) 830{ 831 unsigned long active; 832 unsigned long inactive; 833 unsigned long present_pages[2]; 834 unsigned long inactive_ratio; 835 836 inactive_ratio = calc_inactive_ratio(memcg, present_pages); 837 838 inactive = present_pages[0]; 839 active = present_pages[1]; 840 841 if (inactive * inactive_ratio < active) 842 return 1; 843 844 return 0; 845} 846 847int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg) 848{ 849 
unsigned long active; 850 unsigned long inactive; 851 852 inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE); 853 active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE); 854 855 return (active > inactive); 856} 857 858unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, 859 struct zone *zone, 860 enum lru_list lru) 861{ 862 int nid = zone->zone_pgdat->node_id; 863 int zid = zone_idx(zone); 864 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); 865 866 return MEM_CGROUP_ZSTAT(mz, lru); 867} 868 869struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, 870 struct zone *zone) 871{ 872 int nid = zone->zone_pgdat->node_id; 873 int zid = zone_idx(zone); 874 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); 875 876 return &mz->reclaim_stat; 877} 878 879struct zone_reclaim_stat * 880mem_cgroup_get_reclaim_stat_from_page(struct page *page) 881{ 882 struct page_cgroup *pc; 883 struct mem_cgroup_per_zone *mz; 884 885 if (mem_cgroup_disabled()) 886 return NULL; 887 888 pc = lookup_page_cgroup(page); 889 /* 890 * Used bit is set without atomic ops but after smp_wmb(). 891 * For making pc->mem_cgroup visible, insert smp_rmb() here. 892 */ 893 smp_rmb(); 894 if (!PageCgroupUsed(pc)) 895 return NULL; 896 897 mz = page_cgroup_zoneinfo(pc); 898 if (!mz) 899 return NULL; 900 901 return &mz->reclaim_stat; 902} 903 904unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, 905 struct list_head *dst, 906 unsigned long *scanned, int order, 907 int mode, struct zone *z, 908 struct mem_cgroup *mem_cont, 909 int active, int file) 910{ 911 unsigned long nr_taken = 0; 912 struct page *page; 913 unsigned long scan; 914 LIST_HEAD(pc_list); 915 struct list_head *src; 916 struct page_cgroup *pc, *tmp; 917 int nid = z->zone_pgdat->node_id; 918 int zid = zone_idx(z); 919 struct mem_cgroup_per_zone *mz; 920 int lru = LRU_FILE * file + active; 921 int ret; 922 923 BUG_ON(!mem_cont); 924 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); 925 src = &mz->lists[lru]; 926 927 scan = 0; 928 list_for_each_entry_safe_reverse(pc, tmp, src, lru) { 929 if (scan >= nr_to_scan) 930 break; 931 932 page = pc->page; 933 if (unlikely(!PageCgroupUsed(pc))) 934 continue; 935 if (unlikely(!PageLRU(page))) 936 continue; 937 938 scan++; 939 ret = __isolate_lru_page(page, mode, file); 940 switch (ret) { 941 case 0: 942 list_move(&page->lru, dst); 943 mem_cgroup_del_lru(page); 944 nr_taken++; 945 break; 946 case -EBUSY: 947 /* we don't affect global LRU but rotate in our LRU */ 948 mem_cgroup_rotate_lru_list(page, page_lru(page)); 949 break; 950 default: 951 break; 952 } 953 } 954 955 *scanned = scan; 956 return nr_taken; 957} 958 959#define mem_cgroup_from_res_counter(counter, member) \ 960 container_of(counter, struct mem_cgroup, member) 961 962static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem) 963{ 964 if (do_swap_account) { 965 if (res_counter_check_under_limit(&mem->res) && 966 res_counter_check_under_limit(&mem->memsw)) 967 return true; 968 } else 969 if (res_counter_check_under_limit(&mem->res)) 970 return true; 971 return false; 972} 973 974static unsigned int get_swappiness(struct mem_cgroup *memcg) 975{ 976 struct cgroup *cgrp = memcg->css.cgroup; 977 unsigned int swappiness; 978 979 /* root ? 
*/ 980 if (cgrp->parent == NULL) 981 return vm_swappiness; 982 983 spin_lock(&memcg->reclaim_param_lock); 984 swappiness = memcg->swappiness; 985 spin_unlock(&memcg->reclaim_param_lock); 986 987 return swappiness; 988} 989 990static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data) 991{ 992 int *val = data; 993 (*val)++; 994 return 0; 995} 996 997/** 998 * mem_cgroup_print_mem_info: Called from OOM with tasklist_lock held in read mode. 999 * @memcg: The memory cgroup that went over limit 1000 * @p: Task that is going to be killed 1001 * 1002 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is 1003 * enabled 1004 */ 1005void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) 1006{ 1007 struct cgroup *task_cgrp; 1008 struct cgroup *mem_cgrp; 1009 /* 1010 * Need a buffer in BSS, can't rely on allocations. The code relies 1011 * on the assumption that OOM is serialized for memory controller. 1012 * If this assumption is broken, revisit this code. 1013 */ 1014 static char memcg_name[PATH_MAX]; 1015 int ret; 1016 1017 if (!memcg || !p) 1018 return; 1019 1020 1021 rcu_read_lock(); 1022 1023 mem_cgrp = memcg->css.cgroup; 1024 task_cgrp = task_cgroup(p, mem_cgroup_subsys_id); 1025 1026 ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX); 1027 if (ret < 0) { 1028 /* 1029 * Unfortunately, we are unable to convert to a useful name 1030 * But we'll still print out the usage information 1031 */ 1032 rcu_read_unlock(); 1033 goto done; 1034 } 1035 rcu_read_unlock(); 1036 1037 printk(KERN_INFO "Task in %s killed", memcg_name); 1038 1039 rcu_read_lock(); 1040 ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX); 1041 if (ret < 0) { 1042 rcu_read_unlock(); 1043 goto done; 1044 } 1045 rcu_read_unlock(); 1046 1047 /* 1048 * Continues from above, so we don't need an KERN_ level 1049 */ 1050 printk(KERN_CONT " as a result of limit of %s\n", memcg_name); 1051done: 1052 1053 printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n", 1054 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, 1055 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, 1056 res_counter_read_u64(&memcg->res, RES_FAILCNT)); 1057 printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, " 1058 "failcnt %llu\n", 1059 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, 1060 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, 1061 res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); 1062} 1063 1064/* 1065 * This function returns the number of memcg under hierarchy tree. Returns 1066 * 1(self count) if no children. 1067 */ 1068static int mem_cgroup_count_children(struct mem_cgroup *mem) 1069{ 1070 int num = 0; 1071 mem_cgroup_walk_tree(mem, &num, mem_cgroup_count_children_cb); 1072 return num; 1073} 1074 1075/* 1076 * Visit the first child (need not be the first child as per the ordering 1077 * of the cgroup list, since we track last_scanned_child) of @mem and use 1078 * that to reclaim free pages from. 
 */
static struct mem_cgroup *
mem_cgroup_select_victim(struct mem_cgroup *root_mem)
{
	struct mem_cgroup *ret = NULL;
	struct cgroup_subsys_state *css;
	int nextid, found;

	if (!root_mem->use_hierarchy) {
		css_get(&root_mem->css);
		ret = root_mem;
	}

	while (!ret) {
		rcu_read_lock();
		nextid = root_mem->last_scanned_child + 1;
		css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css,
				   &found);
		if (css && css_tryget(css))
			ret = container_of(css, struct mem_cgroup, css);

		rcu_read_unlock();
		/* Updates scanning parameter */
		spin_lock(&root_mem->reclaim_param_lock);
		if (!css) {
			/* this means start scan from ID:1 */
			root_mem->last_scanned_child = 0;
		} else
			root_mem->last_scanned_child = found;
		spin_unlock(&root_mem->reclaim_param_lock);
	}

	return ret;
}

/*
 * Scan the hierarchy if needed to reclaim memory. We remember the last child
 * we reclaimed from, so that we don't end up penalizing one child extensively
 * based on its position in the children list.
 *
 * root_mem is the original ancestor that we've been reclaiming from.
 *
 * We give up and return to the caller when we visit root_mem twice.
 * (other groups can be removed while we're walking....)
 *
 * If shrink==true, this returns immediately, to avoid freeing too much.
 */
static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
						struct zone *zone,
						gfp_t gfp_mask,
						unsigned long reclaim_options)
{
	struct mem_cgroup *victim;
	int ret, total = 0;
	int loop = 0;
	bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
	bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
	bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
	unsigned long excess = mem_cgroup_get_excess(root_mem);

	/* If memsw_is_minimum==1, swap-out is of no use. */
	if (root_mem->memsw_is_minimum)
		noswap = true;

	while (1) {
		victim = mem_cgroup_select_victim(root_mem);
		if (victim == root_mem) {
			loop++;
			if (loop >= 1)
				drain_all_stock_async();
			if (loop >= 2) {
				/*
				 * If we have not been able to reclaim
				 * anything, it might be because there are
				 * no reclaimable pages under this hierarchy.
				 */
				if (!check_soft || !total) {
					css_put(&victim->css);
					break;
				}
				/*
				 * We want to do more targeted reclaim.
				 * excess >> 2 is not too large, so we don't
				 * reclaim too much, nor too small, so we don't
				 * keep coming back to reclaim from this cgroup.
				 */
				if (total >= (excess >> 2) ||
					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
					css_put(&victim->css);
					break;
				}
			}
		}
		if (!mem_cgroup_local_usage(&victim->stat)) {
			/* this cgroup's local usage == 0 */
			css_put(&victim->css);
			continue;
		}
		/* we use swappiness of local cgroup */
		if (check_soft)
			ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
				noswap, get_swappiness(victim), zone,
				zone->zone_pgdat->node_id);
		else
			ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
						noswap, get_swappiness(victim));
		css_put(&victim->css);
		/*
		 * When shrinking usage, we can't tell here whether we should
		 * stop or reclaim more; that depends on the caller.
		 * last_scanned_child is enough to keep fairness across the tree.
		 */
		if (shrink)
			return ret;
		total += ret;
		if (check_soft) {
			if (res_counter_check_under_soft_limit(&root_mem->res))
				return total;
		} else if (mem_cgroup_check_under_limit(root_mem))
			return 1 + total;
	}
	return total;
}

bool mem_cgroup_oom_called(struct task_struct *task)
{
	bool ret = false;
	struct mem_cgroup *mem;
	struct mm_struct *mm;

	rcu_read_lock();
	mm = task->mm;
	if (!mm)
		mm = &init_mm;
	mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
	if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10))
		ret = true;
	rcu_read_unlock();
	return ret;
}

static int record_last_oom_cb(struct mem_cgroup *mem, void *data)
{
	mem->last_oom_jiffies = jiffies;
	return 0;
}

static void record_last_oom(struct mem_cgroup *mem)
{
	mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb);
}

/*
 * Currently used to update mapped file statistics, but the routine can be
 * generalized to update other statistics as well.
 */
void mem_cgroup_update_file_mapped(struct page *page, int val)
{
	struct mem_cgroup *mem;
	struct mem_cgroup_stat *stat;
	struct mem_cgroup_stat_cpu *cpustat;
	int cpu;
	struct page_cgroup *pc;

	pc = lookup_page_cgroup(page);
	if (unlikely(!pc))
		return;

	lock_page_cgroup(pc);
	mem = pc->mem_cgroup;
	if (!mem)
		goto done;

	if (!PageCgroupUsed(pc))
		goto done;

	/*
	 * Preemption is already disabled, we don't need get_cpu()
	 */
	cpu = smp_processor_id();
	stat = &mem->stat;
	cpustat = &stat->cpustat[cpu];

	__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, val);
done:
	unlock_page_cgroup(pc);
}

/*
 * size of first charge trial. "32" comes from vmscan.c's magic value.
 * TODO: maybe necessary to use bigger numbers on big iron.
 */
#define CHARGE_SIZE	(32 * PAGE_SIZE)
struct memcg_stock_pcp {
	struct mem_cgroup *cached; /* never points to the root cgroup */
	int charge;
	struct work_struct work;
};
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
static atomic_t memcg_drain_count;

/*
 * Try to consume stocked charge on this cpu. On success, PAGE_SIZE is consumed
 * from the local stock and true is returned. If the stock is 0 or holds charges
 * from a cgroup other than the current target, false is returned; the stock
 * will be refilled later.
 */
static bool consume_stock(struct mem_cgroup *mem)
{
	struct memcg_stock_pcp *stock;
	bool ret = true;

	stock = &get_cpu_var(memcg_stock);
	if (mem == stock->cached && stock->charge)
		stock->charge -= PAGE_SIZE;
	else /* need to call res_counter_charge */
		ret = false;
	put_cpu_var(memcg_stock);
	return ret;
}

/*
 * Return the stock cached in percpu to the res_counter and reset the cached
 * information.
 */
static void drain_stock(struct memcg_stock_pcp *stock)
{
	struct mem_cgroup *old = stock->cached;

	if (stock->charge) {
		res_counter_uncharge(&old->res, stock->charge);
		if (do_swap_account)
			res_counter_uncharge(&old->memsw, stock->charge);
	}
	stock->cached = NULL;
	stock->charge = 0;
}

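/*
 * Overview of the per-cpu charge stock built from the pieces above and below
 * (names as used in this file):
 *
 *   consume_stock()     - fast path; take PAGE_SIZE from this cpu's stock
 *   refill_stock()      - cache surplus charge already taken from res_counter
 *   drain_stock()       - give a stock's cached charge back to the res_counter
 *   drain_all_stock_*() - drain the stock of every online cpu (async or sync)
 *
 * Roughly, and purely as an illustration, the charge path below
 * (__mem_cgroup_try_charge()) does:
 *
 *	if (!consume_stock(mem)) {
 *		res_counter_charge(&mem->res, CHARGE_SIZE, &fail_res);
 *		...
 *		refill_stock(mem, CHARGE_SIZE - PAGE_SIZE);
 *	}
 *
 * so most single-page charges avoid touching the shared res_counter.
 */
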
/*
 * This must be called with preemption disabled, or by a thread
 * pinned to the local cpu.
 */
static void drain_local_stock(struct work_struct *dummy)
{
	struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
	drain_stock(stock);
}

/*
 * Cache charges (val), already taken from the res_counter, in the local
 * per-cpu area. They will be consumed by consume_stock() later.
 */
static void refill_stock(struct mem_cgroup *mem, int val)
{
	struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);

	if (stock->cached != mem) { /* reset if necessary */
		drain_stock(stock);
		stock->cached = mem;
	}
	stock->charge += val;
	put_cpu_var(memcg_stock);
}

/*
 * Try to drain stocked charges on other cpus. This function is asynchronous:
 * it just queues a work item per cpu to drain locally on each cpu. The caller
 * can expect some charges to come back to the res_counter later, but cannot
 * wait for that to happen.
 */
static void drain_all_stock_async(void)
{
	int cpu;
	/* This function schedules "drain" asynchronously; the result is not
	 * handled directly by callers. So if someone is already draining, we
	 * don't have to schedule more drains. In any case, the
	 * WORK_STRUCT_PENDING check in queue_work_on() will catch a race;
	 * a loose check is enough here.
	 */
	if (atomic_read(&memcg_drain_count))
		return;
	/* Notify other cpus that system-wide "drain" is running */
	atomic_inc(&memcg_drain_count);
	get_online_cpus();
	for_each_online_cpu(cpu) {
		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
		schedule_work_on(cpu, &stock->work);
	}
	put_online_cpus();
	atomic_dec(&memcg_drain_count);
	/* We don't wait for flush_work */
}

/* This is a synchronous drain interface. */
static void drain_all_stock_sync(void)
{
	/* called when force_empty is called */
	atomic_inc(&memcg_drain_count);
	schedule_on_each_cpu(drain_local_stock);
	atomic_dec(&memcg_drain_count);
}

static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
					unsigned long action,
					void *hcpu)
{
	int cpu = (unsigned long)hcpu;
	struct memcg_stock_pcp *stock;

	if (action != CPU_DEAD)
		return NOTIFY_OK;
	stock = &per_cpu(memcg_stock, cpu);
	drain_stock(stock);
	return NOTIFY_OK;
}

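/*
 * Summary of the charging flow implemented by __mem_cgroup_try_charge() below:
 *  1. Try the per-cpu stock first (consume_stock()).
 *  2. Otherwise charge CHARGE_SIZE against res (and memsw if do_swap_account);
 *     the surplus beyond PAGE_SIZE is given back to the stock (refill_stock()).
 *  3. If the res_counter charge fails, retry with just PAGE_SIZE, then run
 *     mem_cgroup_hierarchical_reclaim() against the cgroup that hit its limit,
 *     and finally invoke the OOM killer if "oom" is true and all
 *     MEM_CGROUP_RECLAIM_RETRIES attempts are exhausted.
 */
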
/*
 * Unlike the exported interface, an "oom" parameter is added. If oom==true,
 * the oom-killer can be invoked.
 */
static int __mem_cgroup_try_charge(struct mm_struct *mm,
			gfp_t gfp_mask, struct mem_cgroup **memcg,
			bool oom, struct page *page)
{
	struct mem_cgroup *mem, *mem_over_limit;
	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
	struct res_counter *fail_res;
	int csize = CHARGE_SIZE;

	if (unlikely(test_thread_flag(TIF_MEMDIE))) {
		/* Don't account this! */
		*memcg = NULL;
		return 0;
	}

	/*
	 * We always charge the cgroup the mm_struct belongs to.
	 * The mm_struct's mem_cgroup changes on task migration if the
	 * thread group leader migrates. It's possible that mm is not
	 * set; if so, charge init_mm (happens for pagecache usage).
	 */
	mem = *memcg;
	if (likely(!mem)) {
		mem = try_get_mem_cgroup_from_mm(mm);
		*memcg = mem;
	} else {
		css_get(&mem->css);
	}
	if (unlikely(!mem))
		return 0;

	VM_BUG_ON(css_is_removed(&mem->css));
	if (mem_cgroup_is_root(mem))
		goto done;

	while (1) {
		int ret = 0;
		unsigned long flags = 0;

		if (consume_stock(mem))
			goto charged;

		ret = res_counter_charge(&mem->res, csize, &fail_res);
		if (likely(!ret)) {
			if (!do_swap_account)
				break;
			ret = res_counter_charge(&mem->memsw, csize, &fail_res);
			if (likely(!ret))
				break;
			/* mem+swap counter fails */
			res_counter_uncharge(&mem->res, csize);
			flags |= MEM_CGROUP_RECLAIM_NOSWAP;
			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
									memsw);
		} else
			/* mem counter fails */
			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
									res);

		/* reduce request size and retry */
		if (csize > PAGE_SIZE) {
			csize = PAGE_SIZE;
			continue;
		}
		if (!(gfp_mask & __GFP_WAIT))
			goto nomem;

		ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
							gfp_mask, flags);
		if (ret)
			continue;

		/*
		 * try_to_free_mem_cgroup_pages() might not give us a full
		 * picture of reclaim. Some pages are reclaimed and might be
		 * moved to swap cache or just unmapped from the cgroup.
		 * Check the limit again to see if the reclaim reduced the
		 * current usage of the cgroup before giving up.
		 */
		if (mem_cgroup_check_under_limit(mem_over_limit))
			continue;

		if (!nr_retries--) {
			if (oom) {
				mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
				record_last_oom(mem_over_limit);
			}
			goto nomem;
		}
	}
	if (csize > PAGE_SIZE)
		refill_stock(mem, csize - PAGE_SIZE);
charged:
	/*
	 * Insert the ancestor (and the ancestor's ancestors) into the softlimit
	 * RB-tree if they exceed their softlimit.
	 */
	if (mem_cgroup_soft_limit_check(mem))
		mem_cgroup_update_tree(mem, page);
done:
	return 0;
nomem:
	css_put(&mem->css);
	return -ENOMEM;
}

/*
 * Sometimes we have to undo a charge we got from try_charge().
 * This function is for that: it uncharges and drops the css refcount
 * taken by try_charge().
 */
static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
{
	if (!mem_cgroup_is_root(mem)) {
		res_counter_uncharge(&mem->res, PAGE_SIZE);
		if (do_swap_account)
			res_counter_uncharge(&mem->memsw, PAGE_SIZE);
	}
	css_put(&mem->css);
}

/*
 * A helper function to get a mem_cgroup from an ID. Must be called under
 * rcu_read_lock(). The caller must check css_is_removed() or similar if
 * that is a concern (dropping the refcnt from swap can happen against a
 * removed memcg.)
1525 */ 1526static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) 1527{ 1528 struct cgroup_subsys_state *css; 1529 1530 /* ID 0 is unused ID */ 1531 if (!id) 1532 return NULL; 1533 css = css_lookup(&mem_cgroup_subsys, id); 1534 if (!css) 1535 return NULL; 1536 return container_of(css, struct mem_cgroup, css); 1537} 1538 1539static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page) 1540{ 1541 struct mem_cgroup *mem; 1542 struct page_cgroup *pc; 1543 unsigned short id; 1544 swp_entry_t ent; 1545 1546 VM_BUG_ON(!PageLocked(page)); 1547 1548 if (!PageSwapCache(page)) 1549 return NULL; 1550 1551 pc = lookup_page_cgroup(page); 1552 lock_page_cgroup(pc); 1553 if (PageCgroupUsed(pc)) { 1554 mem = pc->mem_cgroup; 1555 if (mem && !css_tryget(&mem->css)) 1556 mem = NULL; 1557 } else { 1558 ent.val = page_private(page); 1559 id = lookup_swap_cgroup(ent); 1560 rcu_read_lock(); 1561 mem = mem_cgroup_lookup(id); 1562 if (mem && !css_tryget(&mem->css)) 1563 mem = NULL; 1564 rcu_read_unlock(); 1565 } 1566 unlock_page_cgroup(pc); 1567 return mem; 1568} 1569 1570/* 1571 * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be 1572 * USED state. If already USED, uncharge and return. 1573 */ 1574 1575static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, 1576 struct page_cgroup *pc, 1577 enum charge_type ctype) 1578{ 1579 /* try_charge() can return NULL to *memcg, taking care of it. */ 1580 if (!mem) 1581 return; 1582 1583 lock_page_cgroup(pc); 1584 if (unlikely(PageCgroupUsed(pc))) { 1585 unlock_page_cgroup(pc); 1586 mem_cgroup_cancel_charge(mem); 1587 return; 1588 } 1589 1590 pc->mem_cgroup = mem; 1591 /* 1592 * We access a page_cgroup asynchronously without lock_page_cgroup(). 1593 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup 1594 * is accessed after testing USED bit. To make pc->mem_cgroup visible 1595 * before USED bit, we need memory barrier here. 1596 * See mem_cgroup_add_lru_list(), etc. 1597 */ 1598 smp_wmb(); 1599 switch (ctype) { 1600 case MEM_CGROUP_CHARGE_TYPE_CACHE: 1601 case MEM_CGROUP_CHARGE_TYPE_SHMEM: 1602 SetPageCgroupCache(pc); 1603 SetPageCgroupUsed(pc); 1604 break; 1605 case MEM_CGROUP_CHARGE_TYPE_MAPPED: 1606 ClearPageCgroupCache(pc); 1607 SetPageCgroupUsed(pc); 1608 break; 1609 default: 1610 break; 1611 } 1612 1613 mem_cgroup_charge_statistics(mem, pc, true); 1614 1615 unlock_page_cgroup(pc); 1616} 1617 1618/** 1619 * __mem_cgroup_move_account - move account of the page 1620 * @pc: page_cgroup of the page. 1621 * @from: mem_cgroup which the page is moved from. 1622 * @to: mem_cgroup which the page is moved to. @from != @to. 1623 * 1624 * The caller must confirm following. 1625 * - page is not on LRU (isolate_page() is useful.) 1626 * - the pc is locked, used, and ->mem_cgroup points to @from. 1627 * 1628 * This function does "uncharge" from old cgroup but doesn't do "charge" to 1629 * new cgroup. It should be done by a caller. 
1630 */ 1631 1632static void __mem_cgroup_move_account(struct page_cgroup *pc, 1633 struct mem_cgroup *from, struct mem_cgroup *to) 1634{ 1635 struct page *page; 1636 int cpu; 1637 struct mem_cgroup_stat *stat; 1638 struct mem_cgroup_stat_cpu *cpustat; 1639 1640 VM_BUG_ON(from == to); 1641 VM_BUG_ON(PageLRU(pc->page)); 1642 VM_BUG_ON(!PageCgroupLocked(pc)); 1643 VM_BUG_ON(!PageCgroupUsed(pc)); 1644 VM_BUG_ON(pc->mem_cgroup != from); 1645 1646 if (!mem_cgroup_is_root(from)) 1647 res_counter_uncharge(&from->res, PAGE_SIZE); 1648 mem_cgroup_charge_statistics(from, pc, false); 1649 1650 page = pc->page; 1651 if (page_mapped(page) && !PageAnon(page)) { 1652 cpu = smp_processor_id(); 1653 /* Update mapped_file data for mem_cgroup "from" */ 1654 stat = &from->stat; 1655 cpustat = &stat->cpustat[cpu]; 1656 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, 1657 -1); 1658 1659 /* Update mapped_file data for mem_cgroup "to" */ 1660 stat = &to->stat; 1661 cpustat = &stat->cpustat[cpu]; 1662 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, 1663 1); 1664 } 1665 1666 if (do_swap_account && !mem_cgroup_is_root(from)) 1667 res_counter_uncharge(&from->memsw, PAGE_SIZE); 1668 css_put(&from->css); 1669 1670 css_get(&to->css); 1671 pc->mem_cgroup = to; 1672 mem_cgroup_charge_statistics(to, pc, true); 1673 /* 1674 * We charges against "to" which may not have any tasks. Then, "to" 1675 * can be under rmdir(). But in current implementation, caller of 1676 * this function is just force_empty() and it's garanteed that 1677 * "to" is never removed. So, we don't check rmdir status here. 1678 */ 1679} 1680 1681/* 1682 * check whether the @pc is valid for moving account and call 1683 * __mem_cgroup_move_account() 1684 */ 1685static int mem_cgroup_move_account(struct page_cgroup *pc, 1686 struct mem_cgroup *from, struct mem_cgroup *to) 1687{ 1688 int ret = -EINVAL; 1689 lock_page_cgroup(pc); 1690 if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { 1691 __mem_cgroup_move_account(pc, from, to); 1692 ret = 0; 1693 } 1694 unlock_page_cgroup(pc); 1695 return ret; 1696} 1697 1698/* 1699 * move charges to its parent. 1700 */ 1701 1702static int mem_cgroup_move_parent(struct page_cgroup *pc, 1703 struct mem_cgroup *child, 1704 gfp_t gfp_mask) 1705{ 1706 struct page *page = pc->page; 1707 struct cgroup *cg = child->css.cgroup; 1708 struct cgroup *pcg = cg->parent; 1709 struct mem_cgroup *parent; 1710 int ret; 1711 1712 /* Is ROOT ? */ 1713 if (!pcg) 1714 return -EINVAL; 1715 1716 ret = -EBUSY; 1717 if (!get_page_unless_zero(page)) 1718 goto out; 1719 if (isolate_lru_page(page)) 1720 goto put; 1721 1722 parent = mem_cgroup_from_cont(pcg); 1723 ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page); 1724 if (ret || !parent) 1725 goto put_back; 1726 1727 ret = mem_cgroup_move_account(pc, child, parent); 1728 if (!ret) 1729 css_put(&parent->css); /* drop extra refcnt by try_charge() */ 1730 else 1731 mem_cgroup_cancel_charge(parent); /* does css_put */ 1732put_back: 1733 putback_lru_page(page); 1734put: 1735 put_page(page); 1736out: 1737 return ret; 1738} 1739 1740/* 1741 * Charge the memory controller for page usage. 
1742 * Return 1743 * 0 if the charge was successful 1744 * < 0 if the cgroup is over its limit 1745 */ 1746static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, 1747 gfp_t gfp_mask, enum charge_type ctype, 1748 struct mem_cgroup *memcg) 1749{ 1750 struct mem_cgroup *mem; 1751 struct page_cgroup *pc; 1752 int ret; 1753 1754 pc = lookup_page_cgroup(page); 1755 /* can happen at boot */ 1756 if (unlikely(!pc)) 1757 return 0; 1758 prefetchw(pc); 1759 1760 mem = memcg; 1761 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page); 1762 if (ret || !mem) 1763 return ret; 1764 1765 __mem_cgroup_commit_charge(mem, pc, ctype); 1766 return 0; 1767} 1768 1769int mem_cgroup_newpage_charge(struct page *page, 1770 struct mm_struct *mm, gfp_t gfp_mask) 1771{ 1772 if (mem_cgroup_disabled()) 1773 return 0; 1774 if (PageCompound(page)) 1775 return 0; 1776 /* 1777 * If already mapped, we don't have to account. 1778 * If page cache, page->mapping has address_space. 1779 * But page->mapping may have out-of-use anon_vma pointer, 1780 * detecit it by PageAnon() check. newly-mapped-anon's page->mapping 1781 * is NULL. 1782 */ 1783 if (page_mapped(page) || (page->mapping && !PageAnon(page))) 1784 return 0; 1785 if (unlikely(!mm)) 1786 mm = &init_mm; 1787 return mem_cgroup_charge_common(page, mm, gfp_mask, 1788 MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL); 1789} 1790 1791static void 1792__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, 1793 enum charge_type ctype); 1794 1795int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 1796 gfp_t gfp_mask) 1797{ 1798 struct mem_cgroup *mem = NULL; 1799 int ret; 1800 1801 if (mem_cgroup_disabled()) 1802 return 0; 1803 if (PageCompound(page)) 1804 return 0; 1805 /* 1806 * Corner case handling. This is called from add_to_page_cache() 1807 * in usual. But some FS (shmem) precharges this page before calling it 1808 * and call add_to_page_cache() with GFP_NOWAIT. 1809 * 1810 * For GFP_NOWAIT case, the page may be pre-charged before calling 1811 * add_to_page_cache(). (See shmem.c) check it here and avoid to call 1812 * charge twice. (It works but has to pay a bit larger cost.) 1813 * And when the page is SwapCache, it should take swap information 1814 * into account. This is under lock_page() now. 1815 */ 1816 if (!(gfp_mask & __GFP_WAIT)) { 1817 struct page_cgroup *pc; 1818 1819 1820 pc = lookup_page_cgroup(page); 1821 if (!pc) 1822 return 0; 1823 lock_page_cgroup(pc); 1824 if (PageCgroupUsed(pc)) { 1825 unlock_page_cgroup(pc); 1826 return 0; 1827 } 1828 unlock_page_cgroup(pc); 1829 } 1830 1831 if (unlikely(!mm && !mem)) 1832 mm = &init_mm; 1833 1834 if (page_is_file_cache(page)) 1835 return mem_cgroup_charge_common(page, mm, gfp_mask, 1836 MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); 1837 1838 /* shmem */ 1839 if (PageSwapCache(page)) { 1840 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); 1841 if (!ret) 1842 __mem_cgroup_commit_charge_swapin(page, mem, 1843 MEM_CGROUP_CHARGE_TYPE_SHMEM); 1844 } else 1845 ret = mem_cgroup_charge_common(page, mm, gfp_mask, 1846 MEM_CGROUP_CHARGE_TYPE_SHMEM, mem); 1847 1848 return ret; 1849} 1850 1851/* 1852 * While swap-in, try_charge -> commit or cancel, the page is locked. 1853 * And when try_charge() successfully returns, one refcnt to memcg without 1854 * struct page_cgroup is acquired. 
This refcnt will be consumed by 1855 * "commit()" or removed by "cancel()" 1856 */ 1857int mem_cgroup_try_charge_swapin(struct mm_struct *mm, 1858 struct page *page, 1859 gfp_t mask, struct mem_cgroup **ptr) 1860{ 1861 struct mem_cgroup *mem; 1862 int ret; 1863 1864 if (mem_cgroup_disabled()) 1865 return 0; 1866 1867 if (!do_swap_account) 1868 goto charge_cur_mm; 1869 /* 1870 * A racing thread's fault, or swapoff, may have already updated 1871 * the pte, and even removed page from swap cache: in those cases 1872 * do_swap_page()'s pte_same() test will fail; but there's also a 1873 * KSM case which does need to charge the page. 1874 */ 1875 if (!PageSwapCache(page)) 1876 goto charge_cur_mm; 1877 mem = try_get_mem_cgroup_from_swapcache(page); 1878 if (!mem) 1879 goto charge_cur_mm; 1880 *ptr = mem; 1881 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, page); 1882 /* drop extra refcnt from tryget */ 1883 css_put(&mem->css); 1884 return ret; 1885charge_cur_mm: 1886 if (unlikely(!mm)) 1887 mm = &init_mm; 1888 return __mem_cgroup_try_charge(mm, mask, ptr, true, page); 1889} 1890 1891static void 1892__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, 1893 enum charge_type ctype) 1894{ 1895 struct page_cgroup *pc; 1896 1897 if (mem_cgroup_disabled()) 1898 return; 1899 if (!ptr) 1900 return; 1901 cgroup_exclude_rmdir(&ptr->css); 1902 pc = lookup_page_cgroup(page); 1903 mem_cgroup_lru_del_before_commit_swapcache(page); 1904 __mem_cgroup_commit_charge(ptr, pc, ctype); 1905 mem_cgroup_lru_add_after_commit_swapcache(page); 1906 /* 1907 * Now swap is on-memory. This means this page may be 1908 * counted both as mem and swap....double count. 1909 * Fix it by uncharging from memsw. Basically, this SwapCache is stable 1910 * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page() 1911 * may call delete_from_swap_cache() before reach here. 1912 */ 1913 if (do_swap_account && PageSwapCache(page)) { 1914 swp_entry_t ent = {.val = page_private(page)}; 1915 unsigned short id; 1916 struct mem_cgroup *memcg; 1917 1918 id = swap_cgroup_record(ent, 0); 1919 rcu_read_lock(); 1920 memcg = mem_cgroup_lookup(id); 1921 if (memcg) { 1922 /* 1923 * This recorded memcg can be obsolete one. So, avoid 1924 * calling css_tryget 1925 */ 1926 if (!mem_cgroup_is_root(memcg)) 1927 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 1928 mem_cgroup_swap_statistics(memcg, false); 1929 mem_cgroup_put(memcg); 1930 } 1931 rcu_read_unlock(); 1932 } 1933 /* 1934 * At swapin, we may charge account against cgroup which has no tasks. 1935 * So, rmdir()->pre_destroy() can be called while we do this charge. 1936 * In that case, we need to call pre_destroy() again. check it here. 
 */
	cgroup_release_and_wakeup_rmdir(&ptr->css);
}

void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
{
	__mem_cgroup_commit_charge_swapin(page, ptr,
					MEM_CGROUP_CHARGE_TYPE_MAPPED);
}

void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
{
	if (mem_cgroup_disabled())
		return;
	if (!mem)
		return;
	mem_cgroup_cancel_charge(mem);
}

static void
__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
{
	struct memcg_batch_info *batch = NULL;
	bool uncharge_memsw = true;
	/* If swapout, usage of swap doesn't decrease */
	if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
		uncharge_memsw = false;
	/*
	 * do_batch > 0 when unmapping pages or during inode invalidate/truncate.
	 * In those cases, all pages freed continuously can be expected to be in
	 * the same cgroup, and we have a chance to coalesce uncharges.
	 * But we uncharge one by one if this task is being killed by OOM
	 * (TIF_MEMDIE), because we want to uncharge as soon as possible.
	 */
	if (!current->memcg_batch.do_batch || test_thread_flag(TIF_MEMDIE))
		goto direct_uncharge;

	batch = &current->memcg_batch;
	/*
	 * Usually, we do css_get() when we remember a memcg pointer.
	 * But in this case, we keep res->usage until the end of a series of
	 * uncharges, so it's ok to ignore memcg's refcnt.
	 */
	if (!batch->memcg)
		batch->memcg = mem;
	/*
	 * In the typical case, batch->memcg == mem. This means we can
	 * merge a series of uncharges into one uncharge of the res_counter.
	 * If not, we uncharge the res_counter one by one.
	 */
	if (batch->memcg != mem)
		goto direct_uncharge;
	/* remember freed charge and uncharge it later */
	batch->bytes += PAGE_SIZE;
	if (uncharge_memsw)
		batch->memsw_bytes += PAGE_SIZE;
	return;
direct_uncharge:
	res_counter_uncharge(&mem->res, PAGE_SIZE);
	if (uncharge_memsw)
		res_counter_uncharge(&mem->memsw, PAGE_SIZE);
	return;
}

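/*
 * Batched uncharge: while current->memcg_batch.do_batch is raised by
 * mem_cgroup_uncharge_start(), __do_uncharge() above only accumulates
 * bytes/memsw_bytes in current->memcg_batch instead of touching the
 * res_counter for every page; mem_cgroup_uncharge_end() then uncharges the
 * accumulated amount in one go. Pages freed in one such section are expected
 * to belong to the same memcg; if a different memcg shows up, or the task is
 * being OOM-killed, the uncharge falls back to the direct, per-page path.
 */
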
	 * It will be accessed when the page is
2057	 * freed from the LRU. This is safe because an uncharged page is expected
2058	 * not to be reused (it is freed soon). The exception is SwapCache, which
2059	 * is handled by special functions.
2060	 */
2061
2062	mz = page_cgroup_zoneinfo(pc);
2063	unlock_page_cgroup(pc);
2064
2065	if (mem_cgroup_soft_limit_check(mem))
2066		mem_cgroup_update_tree(mem, page);
2067	/* at swapout, this memcg will be accessed to record to swap */
2068	if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
2069		css_put(&mem->css);
2070
2071	return mem;
2072
2073unlock_out:
2074	unlock_page_cgroup(pc);
2075	return NULL;
2076}
2077
2078void mem_cgroup_uncharge_page(struct page *page)
2079{
2080	/* early check. */
2081	if (page_mapped(page))
2082		return;
2083	if (page->mapping && !PageAnon(page))
2084		return;
2085	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
2086}
2087
2088void mem_cgroup_uncharge_cache_page(struct page *page)
2089{
2090	VM_BUG_ON(page_mapped(page));
2091	VM_BUG_ON(page->mapping);
2092	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
2093}
2094
2095/*
2096 * Batch_start/batch_end is called in unmap_page_range/invalidate/truncate.
2097 * In those cases, pages are freed continuously and we can expect pages
2098 * to be in the same memcg. Each of those callers itself limits the number
2099 * of pages freed at once, so uncharge_start/end() is called properly.
2100 * This may be called multiple times in one context.
2101 */
2102
2103void mem_cgroup_uncharge_start(void)
2104{
2105	current->memcg_batch.do_batch++;
2106	/* We can do nest. */
2107	if (current->memcg_batch.do_batch == 1) {
2108		current->memcg_batch.memcg = NULL;
2109		current->memcg_batch.bytes = 0;
2110		current->memcg_batch.memsw_bytes = 0;
2111	}
2112}
2113
2114void mem_cgroup_uncharge_end(void)
2115{
2116	struct memcg_batch_info *batch = &current->memcg_batch;
2117
2118	if (!batch->do_batch)
2119		return;
2120
2121	batch->do_batch--;
2122	if (batch->do_batch) /* If stacked, do nothing. */
2123		return;
2124
2125	if (!batch->memcg)
2126		return;
2127	/*
2128	 * This "batch->memcg" is valid without any css_get/put etc...
2129	 * because we hide charges behind us.
2130	 */
2131	if (batch->bytes)
2132		res_counter_uncharge(&batch->memcg->res, batch->bytes);
2133	if (batch->memsw_bytes)
2134		res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes);
2135	/* forget this pointer (for sanity check) */
2136	batch->memcg = NULL;
2137}
2138
2139#ifdef CONFIG_SWAP
2140/*
2141 * Called after __delete_from_swap_cache() to drop the "page" account.
2142 * memcg information is recorded in the swap_cgroup of "ent".
2143 */
2144void
2145mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
2146{
2147	struct mem_cgroup *memcg;
2148	int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
2149
2150	if (!swapout) /* this was a swap cache but the swap is unused */
2151		ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
2152
2153	memcg = __mem_cgroup_uncharge_common(page, ctype);
2154
2155	/* record memcg information */
2156	if (do_swap_account && swapout && memcg) {
2157		swap_cgroup_record(ent, css_id(&memcg->css));
2158		mem_cgroup_get(memcg);
2159	}
2160	if (swapout && memcg)
2161		css_put(&memcg->css);
2162}
2163#endif
2164
2165#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
2166/*
2167 * Called from swap_entry_free(): remove the record in swap_cgroup and
2168 * uncharge the "memsw" account.
2169 */ 2170void mem_cgroup_uncharge_swap(swp_entry_t ent) 2171{ 2172 struct mem_cgroup *memcg; 2173 unsigned short id; 2174 2175 if (!do_swap_account) 2176 return; 2177 2178 id = swap_cgroup_record(ent, 0); 2179 rcu_read_lock(); 2180 memcg = mem_cgroup_lookup(id); 2181 if (memcg) { 2182 /* 2183 * We uncharge this because swap is freed. 2184 * This memcg can be obsolete one. We avoid calling css_tryget 2185 */ 2186 if (!mem_cgroup_is_root(memcg)) 2187 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 2188 mem_cgroup_swap_statistics(memcg, false); 2189 mem_cgroup_put(memcg); 2190 } 2191 rcu_read_unlock(); 2192} 2193#endif 2194 2195/* 2196 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old 2197 * page belongs to. 2198 */ 2199int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) 2200{ 2201 struct page_cgroup *pc; 2202 struct mem_cgroup *mem = NULL; 2203 int ret = 0; 2204 2205 if (mem_cgroup_disabled()) 2206 return 0; 2207 2208 pc = lookup_page_cgroup(page); 2209 lock_page_cgroup(pc); 2210 if (PageCgroupUsed(pc)) { 2211 mem = pc->mem_cgroup; 2212 css_get(&mem->css); 2213 } 2214 unlock_page_cgroup(pc); 2215 2216 if (mem) { 2217 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, 2218 page); 2219 css_put(&mem->css); 2220 } 2221 *ptr = mem; 2222 return ret; 2223} 2224 2225/* remove redundant charge if migration failed*/ 2226void mem_cgroup_end_migration(struct mem_cgroup *mem, 2227 struct page *oldpage, struct page *newpage) 2228{ 2229 struct page *target, *unused; 2230 struct page_cgroup *pc; 2231 enum charge_type ctype; 2232 2233 if (!mem) 2234 return; 2235 cgroup_exclude_rmdir(&mem->css); 2236 /* at migration success, oldpage->mapping is NULL. */ 2237 if (oldpage->mapping) { 2238 target = oldpage; 2239 unused = NULL; 2240 } else { 2241 target = newpage; 2242 unused = oldpage; 2243 } 2244 2245 if (PageAnon(target)) 2246 ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; 2247 else if (page_is_file_cache(target)) 2248 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; 2249 else 2250 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; 2251 2252 /* unused page is not on radix-tree now. */ 2253 if (unused) 2254 __mem_cgroup_uncharge_common(unused, ctype); 2255 2256 pc = lookup_page_cgroup(target); 2257 /* 2258 * __mem_cgroup_commit_charge() check PCG_USED bit of page_cgroup. 2259 * So, double-counting is effectively avoided. 2260 */ 2261 __mem_cgroup_commit_charge(mem, pc, ctype); 2262 2263 /* 2264 * Both of oldpage and newpage are still under lock_page(). 2265 * Then, we don't have to care about race in radix-tree. 2266 * But we have to be careful that this page is unmapped or not. 2267 * 2268 * There is a case for !page_mapped(). At the start of 2269 * migration, oldpage was mapped. But now, it's zapped. 2270 * But we know *target* page is not freed/reused under us. 2271 * mem_cgroup_uncharge_page() does all necessary checks. 2272 */ 2273 if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) 2274 mem_cgroup_uncharge_page(target); 2275 /* 2276 * At migration, we may charge account against cgroup which has no tasks 2277 * So, rmdir()->pre_destroy() can be called while we do this charge. 2278 * In that case, we need to call pre_destroy() again. check it here. 2279 */ 2280 cgroup_release_and_wakeup_rmdir(&mem->css); 2281} 2282 2283/* 2284 * A call to try to shrink memory usage on charge failure at shmem's swapin. 2285 * Calling hierarchical_reclaim is not enough because we should update 2286 * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM. 
2287 * Moreover considering hierarchy, we should reclaim from the mem_over_limit, 2288 * not from the memcg which this page would be charged to. 2289 * try_charge_swapin does all of these works properly. 2290 */ 2291int mem_cgroup_shmem_charge_fallback(struct page *page, 2292 struct mm_struct *mm, 2293 gfp_t gfp_mask) 2294{ 2295 struct mem_cgroup *mem = NULL; 2296 int ret; 2297 2298 if (mem_cgroup_disabled()) 2299 return 0; 2300 2301 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); 2302 if (!ret) 2303 mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */ 2304 2305 return ret; 2306} 2307 2308static DEFINE_MUTEX(set_limit_mutex); 2309 2310static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, 2311 unsigned long long val) 2312{ 2313 int retry_count; 2314 u64 memswlimit; 2315 int ret = 0; 2316 int children = mem_cgroup_count_children(memcg); 2317 u64 curusage, oldusage; 2318 2319 /* 2320 * For keeping hierarchical_reclaim simple, how long we should retry 2321 * is depends on callers. We set our retry-count to be function 2322 * of # of children which we should visit in this loop. 2323 */ 2324 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children; 2325 2326 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); 2327 2328 while (retry_count) { 2329 if (signal_pending(current)) { 2330 ret = -EINTR; 2331 break; 2332 } 2333 /* 2334 * Rather than hide all in some function, I do this in 2335 * open coded manner. You see what this really does. 2336 * We have to guarantee mem->res.limit < mem->memsw.limit. 2337 */ 2338 mutex_lock(&set_limit_mutex); 2339 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 2340 if (memswlimit < val) { 2341 ret = -EINVAL; 2342 mutex_unlock(&set_limit_mutex); 2343 break; 2344 } 2345 ret = res_counter_set_limit(&memcg->res, val); 2346 if (!ret) { 2347 if (memswlimit == val) 2348 memcg->memsw_is_minimum = true; 2349 else 2350 memcg->memsw_is_minimum = false; 2351 } 2352 mutex_unlock(&set_limit_mutex); 2353 2354 if (!ret) 2355 break; 2356 2357 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, 2358 MEM_CGROUP_RECLAIM_SHRINK); 2359 curusage = res_counter_read_u64(&memcg->res, RES_USAGE); 2360 /* Usage is reduced ? */ 2361 if (curusage >= oldusage) 2362 retry_count--; 2363 else 2364 oldusage = curusage; 2365 } 2366 2367 return ret; 2368} 2369 2370static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, 2371 unsigned long long val) 2372{ 2373 int retry_count; 2374 u64 memlimit, oldusage, curusage; 2375 int children = mem_cgroup_count_children(memcg); 2376 int ret = -EBUSY; 2377 2378 /* see mem_cgroup_resize_res_limit */ 2379 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; 2380 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 2381 while (retry_count) { 2382 if (signal_pending(current)) { 2383 ret = -EINTR; 2384 break; 2385 } 2386 /* 2387 * Rather than hide all in some function, I do this in 2388 * open coded manner. You see what this really does. 2389 * We have to guarantee mem->res.limit < mem->memsw.limit. 
2390 */ 2391 mutex_lock(&set_limit_mutex); 2392 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 2393 if (memlimit > val) { 2394 ret = -EINVAL; 2395 mutex_unlock(&set_limit_mutex); 2396 break; 2397 } 2398 ret = res_counter_set_limit(&memcg->memsw, val); 2399 if (!ret) { 2400 if (memlimit == val) 2401 memcg->memsw_is_minimum = true; 2402 else 2403 memcg->memsw_is_minimum = false; 2404 } 2405 mutex_unlock(&set_limit_mutex); 2406 2407 if (!ret) 2408 break; 2409 2410 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, 2411 MEM_CGROUP_RECLAIM_NOSWAP | 2412 MEM_CGROUP_RECLAIM_SHRINK); 2413 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 2414 /* Usage is reduced ? */ 2415 if (curusage >= oldusage) 2416 retry_count--; 2417 else 2418 oldusage = curusage; 2419 } 2420 return ret; 2421} 2422 2423unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 2424 gfp_t gfp_mask, int nid, 2425 int zid) 2426{ 2427 unsigned long nr_reclaimed = 0; 2428 struct mem_cgroup_per_zone *mz, *next_mz = NULL; 2429 unsigned long reclaimed; 2430 int loop = 0; 2431 struct mem_cgroup_tree_per_zone *mctz; 2432 unsigned long long excess; 2433 2434 if (order > 0) 2435 return 0; 2436 2437 mctz = soft_limit_tree_node_zone(nid, zid); 2438 /* 2439 * This loop can run a while, specially if mem_cgroup's continuously 2440 * keep exceeding their soft limit and putting the system under 2441 * pressure 2442 */ 2443 do { 2444 if (next_mz) 2445 mz = next_mz; 2446 else 2447 mz = mem_cgroup_largest_soft_limit_node(mctz); 2448 if (!mz) 2449 break; 2450 2451 reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone, 2452 gfp_mask, 2453 MEM_CGROUP_RECLAIM_SOFT); 2454 nr_reclaimed += reclaimed; 2455 spin_lock(&mctz->lock); 2456 2457 /* 2458 * If we failed to reclaim anything from this memory cgroup 2459 * it is time to move on to the next cgroup 2460 */ 2461 next_mz = NULL; 2462 if (!reclaimed) { 2463 do { 2464 /* 2465 * Loop until we find yet another one. 2466 * 2467 * By the time we get the soft_limit lock 2468 * again, someone might have aded the 2469 * group back on the RB tree. Iterate to 2470 * make sure we get a different mem. 2471 * mem_cgroup_largest_soft_limit_node returns 2472 * NULL if no other cgroup is present on 2473 * the tree 2474 */ 2475 next_mz = 2476 __mem_cgroup_largest_soft_limit_node(mctz); 2477 if (next_mz == mz) { 2478 css_put(&next_mz->mem->css); 2479 next_mz = NULL; 2480 } else /* next_mz == NULL or other memcg */ 2481 break; 2482 } while (1); 2483 } 2484 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); 2485 excess = res_counter_soft_limit_excess(&mz->mem->res); 2486 /* 2487 * One school of thought says that we should not add 2488 * back the node to the tree if reclaim returns 0. 2489 * But our reclaim could return 0, simply because due 2490 * to priority we are exposing a smaller subset of 2491 * memory to reclaim from. Consider this as a longer 2492 * term TODO. 2493 */ 2494 /* If excess == 0, no tree ops */ 2495 __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess); 2496 spin_unlock(&mctz->lock); 2497 css_put(&mz->mem->css); 2498 loop++; 2499 /* 2500 * Could not reclaim anything and there are no more 2501 * mem cgroups to try or we seem to be looping without 2502 * reclaiming anything. 
2503 */ 2504 if (!nr_reclaimed && 2505 (next_mz == NULL || 2506 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) 2507 break; 2508 } while (!nr_reclaimed); 2509 if (next_mz) 2510 css_put(&next_mz->mem->css); 2511 return nr_reclaimed; 2512} 2513 2514/* 2515 * This routine traverse page_cgroup in given list and drop them all. 2516 * *And* this routine doesn't reclaim page itself, just removes page_cgroup. 2517 */ 2518static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, 2519 int node, int zid, enum lru_list lru) 2520{ 2521 struct zone *zone; 2522 struct mem_cgroup_per_zone *mz; 2523 struct page_cgroup *pc, *busy; 2524 unsigned long flags, loop; 2525 struct list_head *list; 2526 int ret = 0; 2527 2528 zone = &NODE_DATA(node)->node_zones[zid]; 2529 mz = mem_cgroup_zoneinfo(mem, node, zid); 2530 list = &mz->lists[lru]; 2531 2532 loop = MEM_CGROUP_ZSTAT(mz, lru); 2533 /* give some margin against EBUSY etc...*/ 2534 loop += 256; 2535 busy = NULL; 2536 while (loop--) { 2537 ret = 0; 2538 spin_lock_irqsave(&zone->lru_lock, flags); 2539 if (list_empty(list)) { 2540 spin_unlock_irqrestore(&zone->lru_lock, flags); 2541 break; 2542 } 2543 pc = list_entry(list->prev, struct page_cgroup, lru); 2544 if (busy == pc) { 2545 list_move(&pc->lru, list); 2546 busy = 0; 2547 spin_unlock_irqrestore(&zone->lru_lock, flags); 2548 continue; 2549 } 2550 spin_unlock_irqrestore(&zone->lru_lock, flags); 2551 2552 ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL); 2553 if (ret == -ENOMEM) 2554 break; 2555 2556 if (ret == -EBUSY || ret == -EINVAL) { 2557 /* found lock contention or "pc" is obsolete. */ 2558 busy = pc; 2559 cond_resched(); 2560 } else 2561 busy = NULL; 2562 } 2563 2564 if (!ret && !list_empty(list)) 2565 return -EBUSY; 2566 return ret; 2567} 2568 2569/* 2570 * make mem_cgroup's charge to be 0 if there is no task. 2571 * This enables deleting this mem_cgroup. 2572 */ 2573static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all) 2574{ 2575 int ret; 2576 int node, zid, shrink; 2577 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 2578 struct cgroup *cgrp = mem->css.cgroup; 2579 2580 css_get(&mem->css); 2581 2582 shrink = 0; 2583 /* should free all ? */ 2584 if (free_all) 2585 goto try_to_free; 2586move_account: 2587 while (mem->res.usage > 0) { 2588 ret = -EBUSY; 2589 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) 2590 goto out; 2591 ret = -EINTR; 2592 if (signal_pending(current)) 2593 goto out; 2594 /* This is for making all *used* pages to be on LRU. */ 2595 lru_add_drain_all(); 2596 drain_all_stock_sync(); 2597 ret = 0; 2598 for_each_node_state(node, N_HIGH_MEMORY) { 2599 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { 2600 enum lru_list l; 2601 for_each_lru(l) { 2602 ret = mem_cgroup_force_empty_list(mem, 2603 node, zid, l); 2604 if (ret) 2605 break; 2606 } 2607 } 2608 if (ret) 2609 break; 2610 } 2611 /* it seems parent cgroup doesn't have enough mem */ 2612 if (ret == -ENOMEM) 2613 goto try_to_free; 2614 cond_resched(); 2615 } 2616 ret = 0; 2617out: 2618 css_put(&mem->css); 2619 return ret; 2620 2621try_to_free: 2622 /* returns EBUSY if there is a task or if we come here twice. 
*/ 2623 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) { 2624 ret = -EBUSY; 2625 goto out; 2626 } 2627 /* we call try-to-free pages for make this cgroup empty */ 2628 lru_add_drain_all(); 2629 /* try to free all pages in this cgroup */ 2630 shrink = 1; 2631 while (nr_retries && mem->res.usage > 0) { 2632 int progress; 2633 2634 if (signal_pending(current)) { 2635 ret = -EINTR; 2636 goto out; 2637 } 2638 progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL, 2639 false, get_swappiness(mem)); 2640 if (!progress) { 2641 nr_retries--; 2642 /* maybe some writeback is necessary */ 2643 congestion_wait(BLK_RW_ASYNC, HZ/10); 2644 } 2645 2646 } 2647 lru_add_drain(); 2648 /* try move_account...there may be some *locked* pages. */ 2649 if (mem->res.usage) 2650 goto move_account; 2651 ret = 0; 2652 goto out; 2653} 2654 2655int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) 2656{ 2657 return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true); 2658} 2659 2660 2661static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft) 2662{ 2663 return mem_cgroup_from_cont(cont)->use_hierarchy; 2664} 2665 2666static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, 2667 u64 val) 2668{ 2669 int retval = 0; 2670 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 2671 struct cgroup *parent = cont->parent; 2672 struct mem_cgroup *parent_mem = NULL; 2673 2674 if (parent) 2675 parent_mem = mem_cgroup_from_cont(parent); 2676 2677 cgroup_lock(); 2678 /* 2679 * If parent's use_hierarchy is set, we can't make any modifications 2680 * in the child subtrees. If it is unset, then the change can 2681 * occur, provided the current cgroup has no children. 2682 * 2683 * For the root cgroup, parent_mem is NULL, we allow value to be 2684 * set if there are no children. 
2685 */ 2686 if ((!parent_mem || !parent_mem->use_hierarchy) && 2687 (val == 1 || val == 0)) { 2688 if (list_empty(&cont->children)) 2689 mem->use_hierarchy = val; 2690 else 2691 retval = -EBUSY; 2692 } else 2693 retval = -EINVAL; 2694 cgroup_unlock(); 2695 2696 return retval; 2697} 2698 2699struct mem_cgroup_idx_data { 2700 s64 val; 2701 enum mem_cgroup_stat_index idx; 2702}; 2703 2704static int 2705mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data) 2706{ 2707 struct mem_cgroup_idx_data *d = data; 2708 d->val += mem_cgroup_read_stat(&mem->stat, d->idx); 2709 return 0; 2710} 2711 2712static void 2713mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem, 2714 enum mem_cgroup_stat_index idx, s64 *val) 2715{ 2716 struct mem_cgroup_idx_data d; 2717 d.idx = idx; 2718 d.val = 0; 2719 mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat); 2720 *val = d.val; 2721} 2722 2723static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) 2724{ 2725 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 2726 u64 idx_val, val; 2727 int type, name; 2728 2729 type = MEMFILE_TYPE(cft->private); 2730 name = MEMFILE_ATTR(cft->private); 2731 switch (type) { 2732 case _MEM: 2733 if (name == RES_USAGE && mem_cgroup_is_root(mem)) { 2734 mem_cgroup_get_recursive_idx_stat(mem, 2735 MEM_CGROUP_STAT_CACHE, &idx_val); 2736 val = idx_val; 2737 mem_cgroup_get_recursive_idx_stat(mem, 2738 MEM_CGROUP_STAT_RSS, &idx_val); 2739 val += idx_val; 2740 val <<= PAGE_SHIFT; 2741 } else 2742 val = res_counter_read_u64(&mem->res, name); 2743 break; 2744 case _MEMSWAP: 2745 if (name == RES_USAGE && mem_cgroup_is_root(mem)) { 2746 mem_cgroup_get_recursive_idx_stat(mem, 2747 MEM_CGROUP_STAT_CACHE, &idx_val); 2748 val = idx_val; 2749 mem_cgroup_get_recursive_idx_stat(mem, 2750 MEM_CGROUP_STAT_RSS, &idx_val); 2751 val += idx_val; 2752 mem_cgroup_get_recursive_idx_stat(mem, 2753 MEM_CGROUP_STAT_SWAPOUT, &idx_val); 2754 val += idx_val; 2755 val <<= PAGE_SHIFT; 2756 } else 2757 val = res_counter_read_u64(&mem->memsw, name); 2758 break; 2759 default: 2760 BUG(); 2761 break; 2762 } 2763 return val; 2764} 2765/* 2766 * The user of this function is... 2767 * RES_LIMIT. 2768 */ 2769static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, 2770 const char *buffer) 2771{ 2772 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 2773 int type, name; 2774 unsigned long long val; 2775 int ret; 2776 2777 type = MEMFILE_TYPE(cft->private); 2778 name = MEMFILE_ATTR(cft->private); 2779 switch (name) { 2780 case RES_LIMIT: 2781 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 2782 ret = -EINVAL; 2783 break; 2784 } 2785 /* This function does all necessary parse...reuse it */ 2786 ret = res_counter_memparse_write_strategy(buffer, &val); 2787 if (ret) 2788 break; 2789 if (type == _MEM) 2790 ret = mem_cgroup_resize_limit(memcg, val); 2791 else 2792 ret = mem_cgroup_resize_memsw_limit(memcg, val); 2793 break; 2794 case RES_SOFT_LIMIT: 2795 ret = res_counter_memparse_write_strategy(buffer, &val); 2796 if (ret) 2797 break; 2798 /* 2799 * For memsw, soft limits are hard to implement in terms 2800 * of semantics, for now, we support soft limits for 2801 * control without swap 2802 */ 2803 if (type == _MEM) 2804 ret = res_counter_set_soft_limit(&memcg->res, val); 2805 else 2806 ret = -EINVAL; 2807 break; 2808 default: 2809 ret = -EINVAL; /* should be BUG() ? 
*/ 2810 break; 2811 } 2812 return ret; 2813} 2814 2815static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, 2816 unsigned long long *mem_limit, unsigned long long *memsw_limit) 2817{ 2818 struct cgroup *cgroup; 2819 unsigned long long min_limit, min_memsw_limit, tmp; 2820 2821 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT); 2822 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 2823 cgroup = memcg->css.cgroup; 2824 if (!memcg->use_hierarchy) 2825 goto out; 2826 2827 while (cgroup->parent) { 2828 cgroup = cgroup->parent; 2829 memcg = mem_cgroup_from_cont(cgroup); 2830 if (!memcg->use_hierarchy) 2831 break; 2832 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT); 2833 min_limit = min(min_limit, tmp); 2834 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 2835 min_memsw_limit = min(min_memsw_limit, tmp); 2836 } 2837out: 2838 *mem_limit = min_limit; 2839 *memsw_limit = min_memsw_limit; 2840 return; 2841} 2842 2843static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 2844{ 2845 struct mem_cgroup *mem; 2846 int type, name; 2847 2848 mem = mem_cgroup_from_cont(cont); 2849 type = MEMFILE_TYPE(event); 2850 name = MEMFILE_ATTR(event); 2851 switch (name) { 2852 case RES_MAX_USAGE: 2853 if (type == _MEM) 2854 res_counter_reset_max(&mem->res); 2855 else 2856 res_counter_reset_max(&mem->memsw); 2857 break; 2858 case RES_FAILCNT: 2859 if (type == _MEM) 2860 res_counter_reset_failcnt(&mem->res); 2861 else 2862 res_counter_reset_failcnt(&mem->memsw); 2863 break; 2864 } 2865 2866 return 0; 2867} 2868 2869 2870/* For read statistics */ 2871enum { 2872 MCS_CACHE, 2873 MCS_RSS, 2874 MCS_FILE_MAPPED, 2875 MCS_PGPGIN, 2876 MCS_PGPGOUT, 2877 MCS_SWAP, 2878 MCS_INACTIVE_ANON, 2879 MCS_ACTIVE_ANON, 2880 MCS_INACTIVE_FILE, 2881 MCS_ACTIVE_FILE, 2882 MCS_UNEVICTABLE, 2883 NR_MCS_STAT, 2884}; 2885 2886struct mcs_total_stat { 2887 s64 stat[NR_MCS_STAT]; 2888}; 2889 2890struct { 2891 char *local_name; 2892 char *total_name; 2893} memcg_stat_strings[NR_MCS_STAT] = { 2894 {"cache", "total_cache"}, 2895 {"rss", "total_rss"}, 2896 {"mapped_file", "total_mapped_file"}, 2897 {"pgpgin", "total_pgpgin"}, 2898 {"pgpgout", "total_pgpgout"}, 2899 {"swap", "total_swap"}, 2900 {"inactive_anon", "total_inactive_anon"}, 2901 {"active_anon", "total_active_anon"}, 2902 {"inactive_file", "total_inactive_file"}, 2903 {"active_file", "total_active_file"}, 2904 {"unevictable", "total_unevictable"} 2905}; 2906 2907 2908static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data) 2909{ 2910 struct mcs_total_stat *s = data; 2911 s64 val; 2912 2913 /* per cpu stat */ 2914 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_CACHE); 2915 s->stat[MCS_CACHE] += val * PAGE_SIZE; 2916 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); 2917 s->stat[MCS_RSS] += val * PAGE_SIZE; 2918 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_FILE_MAPPED); 2919 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; 2920 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT); 2921 s->stat[MCS_PGPGIN] += val; 2922 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); 2923 s->stat[MCS_PGPGOUT] += val; 2924 if (do_swap_account) { 2925 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_SWAPOUT); 2926 s->stat[MCS_SWAP] += val * PAGE_SIZE; 2927 } 2928 2929 /* per zone stat */ 2930 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); 2931 s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE; 2932 val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON); 2933 
s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE; 2934 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE); 2935 s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE; 2936 val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE); 2937 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; 2938 val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE); 2939 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; 2940 return 0; 2941} 2942 2943static void 2944mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) 2945{ 2946 mem_cgroup_walk_tree(mem, s, mem_cgroup_get_local_stat); 2947} 2948 2949static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, 2950 struct cgroup_map_cb *cb) 2951{ 2952 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); 2953 struct mcs_total_stat mystat; 2954 int i; 2955 2956 memset(&mystat, 0, sizeof(mystat)); 2957 mem_cgroup_get_local_stat(mem_cont, &mystat); 2958 2959 for (i = 0; i < NR_MCS_STAT; i++) { 2960 if (i == MCS_SWAP && !do_swap_account) 2961 continue; 2962 cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]); 2963 } 2964 2965 /* Hierarchical information */ 2966 { 2967 unsigned long long limit, memsw_limit; 2968 memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit); 2969 cb->fill(cb, "hierarchical_memory_limit", limit); 2970 if (do_swap_account) 2971 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit); 2972 } 2973 2974 memset(&mystat, 0, sizeof(mystat)); 2975 mem_cgroup_get_total_stat(mem_cont, &mystat); 2976 for (i = 0; i < NR_MCS_STAT; i++) { 2977 if (i == MCS_SWAP && !do_swap_account) 2978 continue; 2979 cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]); 2980 } 2981 2982#ifdef CONFIG_DEBUG_VM 2983 cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); 2984 2985 { 2986 int nid, zid; 2987 struct mem_cgroup_per_zone *mz; 2988 unsigned long recent_rotated[2] = {0, 0}; 2989 unsigned long recent_scanned[2] = {0, 0}; 2990 2991 for_each_online_node(nid) 2992 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 2993 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); 2994 2995 recent_rotated[0] += 2996 mz->reclaim_stat.recent_rotated[0]; 2997 recent_rotated[1] += 2998 mz->reclaim_stat.recent_rotated[1]; 2999 recent_scanned[0] += 3000 mz->reclaim_stat.recent_scanned[0]; 3001 recent_scanned[1] += 3002 mz->reclaim_stat.recent_scanned[1]; 3003 } 3004 cb->fill(cb, "recent_rotated_anon", recent_rotated[0]); 3005 cb->fill(cb, "recent_rotated_file", recent_rotated[1]); 3006 cb->fill(cb, "recent_scanned_anon", recent_scanned[0]); 3007 cb->fill(cb, "recent_scanned_file", recent_scanned[1]); 3008 } 3009#endif 3010 3011 return 0; 3012} 3013 3014static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft) 3015{ 3016 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 3017 3018 return get_swappiness(memcg); 3019} 3020 3021static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, 3022 u64 val) 3023{ 3024 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 3025 struct mem_cgroup *parent; 3026 3027 if (val > 100) 3028 return -EINVAL; 3029 3030 if (cgrp->parent == NULL) 3031 return -EINVAL; 3032 3033 parent = mem_cgroup_from_cont(cgrp->parent); 3034 3035 cgroup_lock(); 3036 3037 /* If under hierarchy, only empty-root can set this value */ 3038 if ((parent->use_hierarchy) || 3039 (memcg->use_hierarchy && !list_empty(&cgrp->children))) { 3040 cgroup_unlock(); 3041 return -EINVAL; 3042 } 3043 3044 spin_lock(&memcg->reclaim_param_lock); 3045 memcg->swappiness = val; 3046 
spin_unlock(&memcg->reclaim_param_lock); 3047 3048 cgroup_unlock(); 3049 3050 return 0; 3051} 3052 3053 3054static struct cftype mem_cgroup_files[] = { 3055 { 3056 .name = "usage_in_bytes", 3057 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 3058 .read_u64 = mem_cgroup_read, 3059 }, 3060 { 3061 .name = "max_usage_in_bytes", 3062 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 3063 .trigger = mem_cgroup_reset, 3064 .read_u64 = mem_cgroup_read, 3065 }, 3066 { 3067 .name = "limit_in_bytes", 3068 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 3069 .write_string = mem_cgroup_write, 3070 .read_u64 = mem_cgroup_read, 3071 }, 3072 { 3073 .name = "soft_limit_in_bytes", 3074 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 3075 .write_string = mem_cgroup_write, 3076 .read_u64 = mem_cgroup_read, 3077 }, 3078 { 3079 .name = "failcnt", 3080 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 3081 .trigger = mem_cgroup_reset, 3082 .read_u64 = mem_cgroup_read, 3083 }, 3084 { 3085 .name = "stat", 3086 .read_map = mem_control_stat_show, 3087 }, 3088 { 3089 .name = "force_empty", 3090 .trigger = mem_cgroup_force_empty_write, 3091 }, 3092 { 3093 .name = "use_hierarchy", 3094 .write_u64 = mem_cgroup_hierarchy_write, 3095 .read_u64 = mem_cgroup_hierarchy_read, 3096 }, 3097 { 3098 .name = "swappiness", 3099 .read_u64 = mem_cgroup_swappiness_read, 3100 .write_u64 = mem_cgroup_swappiness_write, 3101 }, 3102}; 3103 3104#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 3105static struct cftype memsw_cgroup_files[] = { 3106 { 3107 .name = "memsw.usage_in_bytes", 3108 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 3109 .read_u64 = mem_cgroup_read, 3110 }, 3111 { 3112 .name = "memsw.max_usage_in_bytes", 3113 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 3114 .trigger = mem_cgroup_reset, 3115 .read_u64 = mem_cgroup_read, 3116 }, 3117 { 3118 .name = "memsw.limit_in_bytes", 3119 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 3120 .write_string = mem_cgroup_write, 3121 .read_u64 = mem_cgroup_read, 3122 }, 3123 { 3124 .name = "memsw.failcnt", 3125 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 3126 .trigger = mem_cgroup_reset, 3127 .read_u64 = mem_cgroup_read, 3128 }, 3129}; 3130 3131static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) 3132{ 3133 if (!do_swap_account) 3134 return 0; 3135 return cgroup_add_files(cont, ss, memsw_cgroup_files, 3136 ARRAY_SIZE(memsw_cgroup_files)); 3137}; 3138#else 3139static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) 3140{ 3141 return 0; 3142} 3143#endif 3144 3145static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) 3146{ 3147 struct mem_cgroup_per_node *pn; 3148 struct mem_cgroup_per_zone *mz; 3149 enum lru_list l; 3150 int zone, tmp = node; 3151 /* 3152 * This routine is called against possible nodes. 3153 * But it's BUG to call kmalloc() against offline node. 3154 * 3155 * TODO: this routine can waste much memory for nodes which will 3156 * never be onlined. It's better to use memory hotplug callback 3157 * function. 
3158 */ 3159 if (!node_state(node, N_NORMAL_MEMORY)) 3160 tmp = -1; 3161 pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp); 3162 if (!pn) 3163 return 1; 3164 3165 mem->info.nodeinfo[node] = pn; 3166 memset(pn, 0, sizeof(*pn)); 3167 3168 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 3169 mz = &pn->zoneinfo[zone]; 3170 for_each_lru(l) 3171 INIT_LIST_HEAD(&mz->lists[l]); 3172 mz->usage_in_excess = 0; 3173 mz->on_tree = false; 3174 mz->mem = mem; 3175 } 3176 return 0; 3177} 3178 3179static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) 3180{ 3181 kfree(mem->info.nodeinfo[node]); 3182} 3183 3184static int mem_cgroup_size(void) 3185{ 3186 int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu); 3187 return sizeof(struct mem_cgroup) + cpustat_size; 3188} 3189 3190static struct mem_cgroup *mem_cgroup_alloc(void) 3191{ 3192 struct mem_cgroup *mem; 3193 int size = mem_cgroup_size(); 3194 3195 if (size < PAGE_SIZE) 3196 mem = kmalloc(size, GFP_KERNEL); 3197 else 3198 mem = vmalloc(size); 3199 3200 if (mem) 3201 memset(mem, 0, size); 3202 return mem; 3203} 3204 3205/* 3206 * At destroying mem_cgroup, references from swap_cgroup can remain. 3207 * (scanning all at force_empty is too costly...) 3208 * 3209 * Instead of clearing all references at force_empty, we remember 3210 * the number of reference from swap_cgroup and free mem_cgroup when 3211 * it goes down to 0. 3212 * 3213 * Removal of cgroup itself succeeds regardless of refs from swap. 3214 */ 3215 3216static void __mem_cgroup_free(struct mem_cgroup *mem) 3217{ 3218 int node; 3219 3220 mem_cgroup_remove_from_trees(mem); 3221 free_css_id(&mem_cgroup_subsys, &mem->css); 3222 3223 for_each_node_state(node, N_POSSIBLE) 3224 free_mem_cgroup_per_zone_info(mem, node); 3225 3226 if (mem_cgroup_size() < PAGE_SIZE) 3227 kfree(mem); 3228 else 3229 vfree(mem); 3230} 3231 3232static void mem_cgroup_get(struct mem_cgroup *mem) 3233{ 3234 atomic_inc(&mem->refcnt); 3235} 3236 3237static void mem_cgroup_put(struct mem_cgroup *mem) 3238{ 3239 if (atomic_dec_and_test(&mem->refcnt)) { 3240 struct mem_cgroup *parent = parent_mem_cgroup(mem); 3241 __mem_cgroup_free(mem); 3242 if (parent) 3243 mem_cgroup_put(parent); 3244 } 3245} 3246 3247/* 3248 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. 
3249 */ 3250static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem) 3251{ 3252 if (!mem->res.parent) 3253 return NULL; 3254 return mem_cgroup_from_res_counter(mem->res.parent, res); 3255} 3256 3257#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 3258static void __init enable_swap_cgroup(void) 3259{ 3260 if (!mem_cgroup_disabled() && really_do_swap_account) 3261 do_swap_account = 1; 3262} 3263#else 3264static void __init enable_swap_cgroup(void) 3265{ 3266} 3267#endif 3268 3269static int mem_cgroup_soft_limit_tree_init(void) 3270{ 3271 struct mem_cgroup_tree_per_node *rtpn; 3272 struct mem_cgroup_tree_per_zone *rtpz; 3273 int tmp, node, zone; 3274 3275 for_each_node_state(node, N_POSSIBLE) { 3276 tmp = node; 3277 if (!node_state(node, N_NORMAL_MEMORY)) 3278 tmp = -1; 3279 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); 3280 if (!rtpn) 3281 return 1; 3282 3283 soft_limit_tree.rb_tree_per_node[node] = rtpn; 3284 3285 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 3286 rtpz = &rtpn->rb_tree_per_zone[zone]; 3287 rtpz->rb_root = RB_ROOT; 3288 spin_lock_init(&rtpz->lock); 3289 } 3290 } 3291 return 0; 3292} 3293 3294static struct cgroup_subsys_state * __ref 3295mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) 3296{ 3297 struct mem_cgroup *mem, *parent; 3298 long error = -ENOMEM; 3299 int node; 3300 3301 mem = mem_cgroup_alloc(); 3302 if (!mem) 3303 return ERR_PTR(error); 3304 3305 for_each_node_state(node, N_POSSIBLE) 3306 if (alloc_mem_cgroup_per_zone_info(mem, node)) 3307 goto free_out; 3308 3309 /* root ? */ 3310 if (cont->parent == NULL) { 3311 int cpu; 3312 enable_swap_cgroup(); 3313 parent = NULL; 3314 root_mem_cgroup = mem; 3315 if (mem_cgroup_soft_limit_tree_init()) 3316 goto free_out; 3317 for_each_possible_cpu(cpu) { 3318 struct memcg_stock_pcp *stock = 3319 &per_cpu(memcg_stock, cpu); 3320 INIT_WORK(&stock->work, drain_local_stock); 3321 } 3322 hotcpu_notifier(memcg_stock_cpu_callback, 0); 3323 3324 } else { 3325 parent = mem_cgroup_from_cont(cont->parent); 3326 mem->use_hierarchy = parent->use_hierarchy; 3327 } 3328 3329 if (parent && parent->use_hierarchy) { 3330 res_counter_init(&mem->res, &parent->res); 3331 res_counter_init(&mem->memsw, &parent->memsw); 3332 /* 3333 * We increment refcnt of the parent to ensure that we can 3334 * safely access it on res_counter_charge/uncharge. 3335 * This refcnt will be decremented when freeing this 3336 * mem_cgroup(see mem_cgroup_put). 
3337 */ 3338 mem_cgroup_get(parent); 3339 } else { 3340 res_counter_init(&mem->res, NULL); 3341 res_counter_init(&mem->memsw, NULL); 3342 } 3343 mem->last_scanned_child = 0; 3344 spin_lock_init(&mem->reclaim_param_lock); 3345 3346 if (parent) 3347 mem->swappiness = get_swappiness(parent); 3348 atomic_set(&mem->refcnt, 1); 3349 return &mem->css; 3350free_out: 3351 __mem_cgroup_free(mem); 3352 root_mem_cgroup = NULL; 3353 return ERR_PTR(error); 3354} 3355 3356static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss, 3357 struct cgroup *cont) 3358{ 3359 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 3360 3361 return mem_cgroup_force_empty(mem, false); 3362} 3363 3364static void mem_cgroup_destroy(struct cgroup_subsys *ss, 3365 struct cgroup *cont) 3366{ 3367 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 3368 3369 mem_cgroup_put(mem); 3370} 3371 3372static int mem_cgroup_populate(struct cgroup_subsys *ss, 3373 struct cgroup *cont) 3374{ 3375 int ret; 3376 3377 ret = cgroup_add_files(cont, ss, mem_cgroup_files, 3378 ARRAY_SIZE(mem_cgroup_files)); 3379 3380 if (!ret) 3381 ret = register_memsw_files(cont, ss); 3382 return ret; 3383} 3384 3385static void mem_cgroup_move_task(struct cgroup_subsys *ss, 3386 struct cgroup *cont, 3387 struct cgroup *old_cont, 3388 struct task_struct *p, 3389 bool threadgroup) 3390{ 3391 /* 3392 * FIXME: It's better to move charges of this process from old 3393 * memcg to new memcg. But it's just on TODO-List now. 3394 */ 3395} 3396 3397struct cgroup_subsys mem_cgroup_subsys = { 3398 .name = "memory", 3399 .subsys_id = mem_cgroup_subsys_id, 3400 .create = mem_cgroup_create, 3401 .pre_destroy = mem_cgroup_pre_destroy, 3402 .destroy = mem_cgroup_destroy, 3403 .populate = mem_cgroup_populate, 3404 .attach = mem_cgroup_move_task, 3405 .early_init = 0, 3406 .use_id = 1, 3407}; 3408 3409#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 3410 3411static int __init disable_swap_account(char *s) 3412{ 3413 really_do_swap_account = 0; 3414 return 1; 3415} 3416__setup("noswapaccount", disable_swap_account); 3417#endif 3418
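A minimal usage sketch of the batched uncharge interface above, not part of memcontrol.c itself: example_uncharge_batch() is a made-up helper, and the pages passed to it are assumed to be file-cache pages that have already been unmapped and removed from the page cache, as the VM_BUG_ON()s in mem_cgroup_uncharge_cache_page() require. The real callers are the truncate/invalidate and unmap paths in mm/.

#include <linux/memcontrol.h>
#include <linux/mm.h>

/*
 * Hypothetical caller: frees a run of pages expected to belong to the same
 * memcg. Bracketing the loop with mem_cgroup_uncharge_start()/end() lets
 * __do_uncharge() accumulate the freed bytes in current->memcg_batch, so the
 * res_counter is touched once per batch instead of once per page.
 */
static void example_uncharge_batch(struct page **pages, int nr)
{
	int i;

	mem_cgroup_uncharge_start();
	for (i = 0; i < nr; i++)
		mem_cgroup_uncharge_cache_page(pages[i]);
	mem_cgroup_uncharge_end();
}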