memcontrol.c revision c772be939e078afd2505ede7d596a30f8f61de95
/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */

#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmalloc.h>
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
#include "internal.h"

#include <asm/uaccess.h>

struct cgroup_subsys mem_cgroup_subsys __read_mostly;
#define MEM_CGROUP_RECLAIM_RETRIES	5

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
int do_swap_account __read_mostly;
static int really_do_swap_account __initdata = 1; /* to remember the boot option */
#else
#define do_swap_account		(0)
#endif


/*
 * Statistics for memory cgroup.
 */
enum mem_cgroup_stat_index {
	/*
	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
	 */
	MEM_CGROUP_STAT_CACHE,	   /* # of pages charged as cache */
	MEM_CGROUP_STAT_RSS,	   /* # of pages charged as rss */
	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */

	MEM_CGROUP_STAT_NSTATS,
};

struct mem_cgroup_stat_cpu {
	s64 count[MEM_CGROUP_STAT_NSTATS];
} ____cacheline_aligned_in_smp;

struct mem_cgroup_stat {
	struct mem_cgroup_stat_cpu cpustat[0];
};

/*
 * For accounting under irq disable, no need to increment the preempt count.
 */
static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat,
		enum mem_cgroup_stat_index idx, int val)
{
	stat->count[idx] += val;
}

static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
		enum mem_cgroup_stat_index idx)
{
	int cpu;
	s64 ret = 0;
	for_each_possible_cpu(cpu)
		ret += stat->cpustat[cpu].count[idx];
	return ret;
}

/*
 * per-zone information in memory controller.
 */
struct mem_cgroup_per_zone {
	/*
	 * spin_lock to protect the per cgroup LRU
	 */
	struct list_head	lists[NR_LRU_LISTS];
	unsigned long		count[NR_LRU_LISTS];

	struct zone_reclaim_stat reclaim_stat;
};
/* Macro for accessing counter */
#define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])

struct mem_cgroup_per_node {
	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};

struct mem_cgroup_lru_info {
	struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
};
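/*
 * Summary of the layout above: each memory cgroup keeps its own LRU lists
 * and per-list page counts (accessed via MEM_CGROUP_ZSTAT) for every
 * (node, zone) pair, plus per-zone reclaim statistics.
 * mem_cgroup_zoneinfo() below maps a (memcg, nid, zid) triple to the
 * corresponding mem_cgroup_per_zone.
 */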
/*
 * The memory controller data structure. The memory controller controls both
 * page cache and RSS per cgroup. We would eventually like to provide
 * statistics based on the statistics developed by Rik van Riel for clock-pro,
 * to help the administrator determine what knobs to tune.
 *
 * TODO: Add a water mark for the memory controller. Reclaim will begin when
 * we hit the water mark. Maybe even add a low water mark, such that
 * no reclaim occurs from a cgroup at its low water mark; this is
 * a feature that will be implemented much later in the future.
 */
struct mem_cgroup {
	struct cgroup_subsys_state css;
	/*
	 * the counter to account for memory usage
	 */
	struct res_counter res;
	/*
	 * the counter to account for mem+swap usage.
	 */
	struct res_counter memsw;
	/*
	 * Per cgroup active and inactive list, similar to the
	 * per zone LRU lists.
	 */
	struct mem_cgroup_lru_info info;

	/*
	 * protects reclaim-related members.
	 */
	spinlock_t reclaim_param_lock;

	int	prev_priority;	/* for recording reclaim priority */

	/*
	 * While reclaiming in a hierarchy, we cache the last child we
	 * reclaimed from. Protected by cgroup_lock()
	 */
	struct mem_cgroup *last_scanned_child;
	/*
	 * Should the accounting and control be hierarchical, per subtree?
	 */
	bool use_hierarchy;
	unsigned long	last_oom_jiffies;
	int		obsolete;
	atomic_t	refcnt;

	unsigned int	swappiness;

	/*
	 * statistics. This must be placed at the end of memcg.
	 */
	struct mem_cgroup_stat stat;
};

enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
	MEM_CGROUP_CHARGE_TYPE_MAPPED,
	MEM_CGROUP_CHARGE_TYPE_SHMEM,	/* used by page migration of shmem */
	MEM_CGROUP_CHARGE_TYPE_FORCE,	/* used by force_empty */
	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
	NR_CHARGE_TYPE,
};

/* only for here (for easy reading.) */
#define PCGF_CACHE	(1UL << PCG_CACHE)
#define PCGF_USED	(1UL << PCG_USED)
#define PCGF_LOCK	(1UL << PCG_LOCK)
static const unsigned long
pcg_default_flags[NR_CHARGE_TYPE] = {
	PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */
	PCGF_USED | PCGF_LOCK, /* Anon */
	PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
	0, /* FORCE */
};

/* for encoding cft->private value on file */
#define _MEM			(0)
#define _MEMSWAP		(1)
#define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
#define MEMFILE_TYPE(val)	(((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)
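/*
 * Example: the "memsw.limit_in_bytes" file below uses
 * MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), i.e. (1 << 16) | RES_LIMIT;
 * MEMFILE_TYPE() recovers _MEMSWAP from the upper 16 bits and
 * MEMFILE_ATTR() recovers RES_LIMIT from the lower 16 bits.
 */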
static void mem_cgroup_get(struct mem_cgroup *mem);
static void mem_cgroup_put(struct mem_cgroup *mem);

static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
					 struct page_cgroup *pc,
					 bool charge)
{
	int val = (charge) ? 1 : -1;
	struct mem_cgroup_stat *stat = &mem->stat;
	struct mem_cgroup_stat_cpu *cpustat;
	int cpu = get_cpu();

	cpustat = &stat->cpustat[cpu];
	if (PageCgroupCache(pc))
		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
	else
		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val);

	if (charge)
		__mem_cgroup_stat_add_safe(cpustat,
				MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
	else
		__mem_cgroup_stat_add_safe(cpustat,
				MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
	put_cpu();
}

static struct mem_cgroup_per_zone *
mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
{
	return &mem->info.nodeinfo[nid]->zoneinfo[zid];
}

static struct mem_cgroup_per_zone *
page_cgroup_zoneinfo(struct page_cgroup *pc)
{
	struct mem_cgroup *mem = pc->mem_cgroup;
	int nid = page_cgroup_nid(pc);
	int zid = page_cgroup_zid(pc);

	if (!mem)
		return NULL;

	return mem_cgroup_zoneinfo(mem, nid, zid);
}

static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem,
					enum lru_list idx)
{
	int nid, zid;
	struct mem_cgroup_per_zone *mz;
	u64 total = 0;

	for_each_online_node(nid)
		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
			mz = mem_cgroup_zoneinfo(mem, nid, zid);
			total += MEM_CGROUP_ZSTAT(mz, idx);
		}
	return total;
}

static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
{
	return container_of(cgroup_subsys_state(cont,
				mem_cgroup_subsys_id), struct mem_cgroup,
				css);
}

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	/*
	 * mm_update_next_owner() may clear mm->owner to NULL
	 * if it races with swapoff, page migration, etc.
	 * So this can be called with p == NULL.
	 */
	if (unlikely(!p))
		return NULL;

	return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
				struct mem_cgroup, css);
}

/*
 * Following LRU functions are allowed to be used without PCG_LOCK.
 * Operations are called by routine of global LRU independently from memcg.
 * What we have to take care of here is validity of pc->mem_cgroup.
 *
 * Changes to pc->mem_cgroup happens when
 * 1. charge
 * 2. moving account
 * In typical case, "charge" is done before add-to-lru. Exception is SwapCache.
 * It is added to LRU before charge.
 * If PCG_USED bit is not set, page_cgroup is not added to this private LRU.
 * When moving account, the page is not on LRU. It's isolated.
 */

void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
{
	struct page_cgroup *pc;
	struct mem_cgroup *mem;
	struct mem_cgroup_per_zone *mz;

	if (mem_cgroup_disabled())
		return;
	pc = lookup_page_cgroup(page);
	/* can happen while we handle swapcache. */
	if (list_empty(&pc->lru))
		return;
	mz = page_cgroup_zoneinfo(pc);
	mem = pc->mem_cgroup;
	MEM_CGROUP_ZSTAT(mz, lru) -= 1;
	list_del_init(&pc->lru);
	return;
}

void mem_cgroup_del_lru(struct page *page)
{
	mem_cgroup_del_lru_list(page, page_lru(page));
}

void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
{
	struct mem_cgroup_per_zone *mz;
	struct page_cgroup *pc;

	if (mem_cgroup_disabled())
		return;

	pc = lookup_page_cgroup(page);
	smp_rmb();
	/* unused page is not rotated. */
	if (!PageCgroupUsed(pc))
		return;
	mz = page_cgroup_zoneinfo(pc);
	list_move(&pc->lru, &mz->lists[lru]);
}

void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
{
	struct page_cgroup *pc;
	struct mem_cgroup_per_zone *mz;

	if (mem_cgroup_disabled())
		return;
	pc = lookup_page_cgroup(page);
	/* barrier to sync with "charge" */
	smp_rmb();
	if (!PageCgroupUsed(pc))
		return;

	mz = page_cgroup_zoneinfo(pc);
	MEM_CGROUP_ZSTAT(mz, lru) += 1;
	list_add(&pc->lru, &mz->lists[lru]);
}
/*
 * To add swapcache into LRU. Be careful when calling this function.
 * zone->lru_lock shouldn't be held and irq must not be disabled.
 */
static void mem_cgroup_lru_fixup(struct page *page)
{
	if (!isolate_lru_page(page))
		putback_lru_page(page);
}

void mem_cgroup_move_lists(struct page *page,
			   enum lru_list from, enum lru_list to)
{
	if (mem_cgroup_disabled())
		return;
	mem_cgroup_del_lru_list(page, from);
	mem_cgroup_add_lru_list(page, to);
}

int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
{
	int ret;

	task_lock(task);
	ret = task->mm && mm_match_cgroup(task->mm, mem);
	task_unlock(task);
	return ret;
}

/*
 * Calculate mapped_ratio under memory controller. This will be used in
 * vmscan.c for determining whether we have to reclaim mapped pages.
 */
int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
{
	long total, rss;

	/*
	 * usage is recorded in bytes. But, here, we assume the number of
	 * physical pages can be represented by "long" on any arch.
	 */
	total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L;
	rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
	return (int)((rss * 100L) / total);
}

/*
 * prev_priority control...this will be used in memory reclaim path.
 */
int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
{
	int prev_priority;

	spin_lock(&mem->reclaim_param_lock);
	prev_priority = mem->prev_priority;
	spin_unlock(&mem->reclaim_param_lock);

	return prev_priority;
}

void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
{
	spin_lock(&mem->reclaim_param_lock);
	if (priority < mem->prev_priority)
		mem->prev_priority = priority;
	spin_unlock(&mem->reclaim_param_lock);
}

void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
{
	spin_lock(&mem->reclaim_param_lock);
	mem->prev_priority = priority;
	spin_unlock(&mem->reclaim_param_lock);
}

static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
{
	unsigned long active;
	unsigned long inactive;
	unsigned long gb;
	unsigned long inactive_ratio;

	inactive = mem_cgroup_get_all_zonestat(memcg, LRU_INACTIVE_ANON);
	active = mem_cgroup_get_all_zonestat(memcg, LRU_ACTIVE_ANON);

	gb = (inactive + active) >> (30 - PAGE_SHIFT);
	if (gb)
		inactive_ratio = int_sqrt(10 * gb);
	else
		inactive_ratio = 1;

	if (present_pages) {
		present_pages[0] = inactive;
		present_pages[1] = active;
	}

	return inactive_ratio;
}

int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
{
	unsigned long active;
	unsigned long inactive;
	unsigned long present_pages[2];
	unsigned long inactive_ratio;

	inactive_ratio = calc_inactive_ratio(memcg, present_pages);

	inactive = present_pages[0];
	active = present_pages[1];

	if (inactive * inactive_ratio < active)
		return 1;

	return 0;
}

unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
				       struct zone *zone,
				       enum lru_list lru)
{
	int nid = zone->zone_pgdat->node_id;
	int zid = zone_idx(zone);
	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);

	return MEM_CGROUP_ZSTAT(mz, lru);
}

struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
						      struct zone *zone)
{
	int nid = zone->zone_pgdat->node_id;
	int zid = zone_idx(zone);
	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);

	return &mz->reclaim_stat;
}

struct zone_reclaim_stat *
mem_cgroup_get_reclaim_stat_from_page(struct page *page)
{
	struct page_cgroup *pc;
	struct mem_cgroup_per_zone *mz;

	if (mem_cgroup_disabled())
		return NULL;

	pc = lookup_page_cgroup(page);
	mz = page_cgroup_zoneinfo(pc);
	if (!mz)
		return NULL;

	return &mz->reclaim_stat;
}
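/*
 * Scan up to nr_to_scan page_cgroups on this memcg's per-zone LRU list
 * (selected by @active and @file), move the pages that __isolate_lru_page()
 * accepts onto @dst and return the number taken; *scanned reports how many
 * entries were looked at. This is the per-memcg counterpart of the global
 * LRU isolation done in vmscan.
 */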
unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
					struct list_head *dst,
					unsigned long *scanned, int order,
					int mode, struct zone *z,
					struct mem_cgroup *mem_cont,
					int active, int file)
{
	unsigned long nr_taken = 0;
	struct page *page;
	unsigned long scan;
	LIST_HEAD(pc_list);
	struct list_head *src;
	struct page_cgroup *pc, *tmp;
	int nid = z->zone_pgdat->node_id;
	int zid = zone_idx(z);
	struct mem_cgroup_per_zone *mz;
	int lru = LRU_FILE * !!file + !!active;

	BUG_ON(!mem_cont);
	mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
	src = &mz->lists[lru];

	scan = 0;
	list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
		if (scan >= nr_to_scan)
			break;

		page = pc->page;
		if (unlikely(!PageCgroupUsed(pc)))
			continue;
		if (unlikely(!PageLRU(page)))
			continue;

		scan++;
		if (__isolate_lru_page(page, mode, file) == 0) {
			list_move(&page->lru, dst);
			nr_taken++;
		}
	}

	*scanned = scan;
	return nr_taken;
}

#define mem_cgroup_from_res_counter(counter, member)	\
	container_of(counter, struct mem_cgroup, member)

/*
 * This routine finds the DFS walk successor. This routine should be
 * called with cgroup_mutex held
 */
static struct mem_cgroup *
mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem)
{
	struct cgroup *cgroup, *curr_cgroup, *root_cgroup;

	curr_cgroup = curr->css.cgroup;
	root_cgroup = root_mem->css.cgroup;

	if (!list_empty(&curr_cgroup->children)) {
		/*
		 * Walk down to children
		 */
		mem_cgroup_put(curr);
		cgroup = list_entry(curr_cgroup->children.next,
						struct cgroup, sibling);
		curr = mem_cgroup_from_cont(cgroup);
		mem_cgroup_get(curr);
		goto done;
	}

visit_parent:
	if (curr_cgroup == root_cgroup) {
		mem_cgroup_put(curr);
		curr = root_mem;
		mem_cgroup_get(curr);
		goto done;
	}

	/*
	 * Goto next sibling
	 */
	if (curr_cgroup->sibling.next != &curr_cgroup->parent->children) {
		mem_cgroup_put(curr);
		cgroup = list_entry(curr_cgroup->sibling.next, struct cgroup,
						sibling);
		curr = mem_cgroup_from_cont(cgroup);
		mem_cgroup_get(curr);
		goto done;
	}

	/*
	 * Go up to next parent and next parent's sibling if need be
	 */
	curr_cgroup = curr_cgroup->parent;
	goto visit_parent;

done:
	root_mem->last_scanned_child = curr;
	return curr;
}

/*
 * Visit the first child (need not be the first child as per the ordering
 * of the cgroup list, since we track last_scanned_child) of @mem and use
 * that to reclaim free pages from.
 */
static struct mem_cgroup *
mem_cgroup_get_first_node(struct mem_cgroup *root_mem)
{
	struct cgroup *cgroup;
	struct mem_cgroup *ret;
	bool obsolete = (root_mem->last_scanned_child &&
				root_mem->last_scanned_child->obsolete);

	/*
	 * Scan all children under the mem_cgroup mem
	 */
	cgroup_lock();
	if (list_empty(&root_mem->css.cgroup->children)) {
		ret = root_mem;
		goto done;
	}

	if (!root_mem->last_scanned_child || obsolete) {

		if (obsolete)
			mem_cgroup_put(root_mem->last_scanned_child);

		cgroup = list_first_entry(&root_mem->css.cgroup->children,
				struct cgroup, sibling);
		ret = mem_cgroup_from_cont(cgroup);
		mem_cgroup_get(ret);
	} else
		ret = mem_cgroup_get_next_node(root_mem->last_scanned_child,
						root_mem);

done:
	root_mem->last_scanned_child = ret;
	cgroup_unlock();
	return ret;
}

static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
{
	if (do_swap_account) {
		if (res_counter_check_under_limit(&mem->res) &&
			res_counter_check_under_limit(&mem->memsw))
			return true;
	} else
		if (res_counter_check_under_limit(&mem->res))
			return true;
	return false;
}

static unsigned int get_swappiness(struct mem_cgroup *memcg)
{
	struct cgroup *cgrp = memcg->css.cgroup;
	unsigned int swappiness;

	/* root ? */
	if (cgrp->parent == NULL)
		return vm_swappiness;

	spin_lock(&memcg->reclaim_param_lock);
	swappiness = memcg->swappiness;
	spin_unlock(&memcg->reclaim_param_lock);

	return swappiness;
}

/*
 * Dance down the hierarchy if needed to reclaim memory. We remember the
 * last child we reclaimed from, so that we don't end up penalizing
 * one child extensively based on its position in the children list.
 *
 * root_mem is the original ancestor that we've been reclaiming from.
 */
static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
						gfp_t gfp_mask, bool noswap)
{
	struct mem_cgroup *next_mem;
	int ret = 0;

	/*
	 * Reclaim unconditionally and don't check for return value.
	 * We need to reclaim in the current group and down the tree.
	 * One might think about checking for children before reclaiming,
	 * but there might be left over accounting, even after children
	 * have left.
	 */
	ret = try_to_free_mem_cgroup_pages(root_mem, gfp_mask, noswap,
					   get_swappiness(root_mem));
	if (mem_cgroup_check_under_limit(root_mem))
		return 0;
	if (!root_mem->use_hierarchy)
		return ret;

	next_mem = mem_cgroup_get_first_node(root_mem);

	while (next_mem != root_mem) {
		if (next_mem->obsolete) {
			mem_cgroup_put(next_mem);
			cgroup_lock();
			next_mem = mem_cgroup_get_first_node(root_mem);
			cgroup_unlock();
			continue;
		}
		ret = try_to_free_mem_cgroup_pages(next_mem, gfp_mask, noswap,
						   get_swappiness(next_mem));
		if (mem_cgroup_check_under_limit(root_mem))
			return 0;
		cgroup_lock();
		next_mem = mem_cgroup_get_next_node(next_mem, root_mem);
		cgroup_unlock();
	}
	return ret;
}

bool mem_cgroup_oom_called(struct task_struct *task)
{
	bool ret = false;
	struct mem_cgroup *mem;
	struct mm_struct *mm;

	rcu_read_lock();
	mm = task->mm;
	if (!mm)
		mm = &init_mm;
	mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
	if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10))
		ret = true;
	rcu_read_unlock();
	return ret;
}
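/*
 * Overview of the charge path: __mem_cgroup_try_charge() below reserves
 * PAGE_SIZE from mem->res (and mem->memsw when swap accounting is on),
 * running hierarchical reclaim and possibly the OOM killer when a limit
 * is hit; __mem_cgroup_commit_charge() then binds the page_cgroup to the
 * memcg and updates the statistics. The swapin path drops a reservation
 * that will not be committed via mem_cgroup_cancel_charge_swapin().
 */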
/*
 * Unlike the exported interface, an "oom" parameter is added. If oom==true,
 * the oom-killer can be invoked.
 */
static int __mem_cgroup_try_charge(struct mm_struct *mm,
			gfp_t gfp_mask, struct mem_cgroup **memcg,
			bool oom)
{
	struct mem_cgroup *mem, *mem_over_limit;
	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
	struct res_counter *fail_res;

	if (unlikely(test_thread_flag(TIF_MEMDIE))) {
		/* Don't account this! */
		*memcg = NULL;
		return 0;
	}

	/*
	 * We always charge the cgroup the mm_struct belongs to.
	 * The mm_struct's mem_cgroup changes on task migration if the
	 * thread group leader migrates. It's possible that mm is not
	 * set, if so charge the init_mm (happens for pagecache usage).
	 */
	if (likely(!*memcg)) {
		rcu_read_lock();
		mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
		if (unlikely(!mem)) {
			rcu_read_unlock();
			return 0;
		}
		/*
		 * For every charge from the cgroup, increment reference count
		 */
		css_get(&mem->css);
		*memcg = mem;
		rcu_read_unlock();
	} else {
		mem = *memcg;
		css_get(&mem->css);
	}

	while (1) {
		int ret;
		bool noswap = false;

		ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res);
		if (likely(!ret)) {
			if (!do_swap_account)
				break;
			ret = res_counter_charge(&mem->memsw, PAGE_SIZE,
							&fail_res);
			if (likely(!ret))
				break;
			/* mem+swap counter fails */
			res_counter_uncharge(&mem->res, PAGE_SIZE);
			noswap = true;
			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
									memsw);
		} else
			/* mem counter fails */
			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
									res);

		if (!(gfp_mask & __GFP_WAIT))
			goto nomem;

		ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask,
							noswap);

		/*
		 * try_to_free_mem_cgroup_pages() might not give us a full
		 * picture of reclaim. Some pages are reclaimed and might be
		 * moved to swap cache or just unmapped from the cgroup.
		 * Check the limit again to see if the reclaim reduced the
		 * current usage of the cgroup before giving up
		 *
		 */
		if (mem_cgroup_check_under_limit(mem_over_limit))
			continue;

		if (!nr_retries--) {
			if (oom) {
				mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
				mem_over_limit->last_oom_jiffies = jiffies;
			}
			goto nomem;
		}
	}
	return 0;
nomem:
	css_put(&mem->css);
	return -ENOMEM;
}

/**
 * mem_cgroup_try_charge - get charge of PAGE_SIZE.
 * @mm: an mm_struct which is charged against. (when *memcg is NULL)
 * @gfp_mask: gfp_mask for reclaim.
 * @memcg: a pointer to memory cgroup which is charged against.
 *
 * charge against the memory cgroup pointed to by *memcg. if *memcg == NULL,
 * the memory cgroup estimated from @mm is looked up and stored in *memcg.
 *
 * Returns 0 on success and -ENOMEM on failure.
 * This call can invoke the OOM-Killer.
 */

int mem_cgroup_try_charge(struct mm_struct *mm,
			  gfp_t mask, struct mem_cgroup **memcg)
{
	return __mem_cgroup_try_charge(mm, mask, memcg, true);
}

/*
 * commit a charge obtained by mem_cgroup_try_charge() and make the
 * page_cgroup enter the USED state. If it is already USED, uncharge and
 * return.
 */

static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
				       struct page_cgroup *pc,
				       enum charge_type ctype)
{
	/* try_charge() can return NULL to *memcg, taking care of it. */
	if (!mem)
		return;

	lock_page_cgroup(pc);
	if (unlikely(PageCgroupUsed(pc))) {
		unlock_page_cgroup(pc);
		res_counter_uncharge(&mem->res, PAGE_SIZE);
		if (do_swap_account)
			res_counter_uncharge(&mem->memsw, PAGE_SIZE);
		css_put(&mem->css);
		return;
	}
	pc->mem_cgroup = mem;
	smp_wmb();
	pc->flags = pcg_default_flags[ctype];

	mem_cgroup_charge_statistics(mem, pc, true);

	unlock_page_cgroup(pc);
}

/**
 * mem_cgroup_move_account - move account of the page
 * @pc:	page_cgroup of the page.
 * @from: mem_cgroup which the page is moved from.
 * @to:	mem_cgroup which the page is moved to. @from != @to.
 *
 * The caller must confirm following.
 * - page is not on LRU (isolate_page() is useful.)
 *
 * returns 0 on success,
 * returns -EBUSY when lock is busy or "pc" is unstable.
 *
 * This function does "uncharge" from old cgroup but doesn't do "charge" to
 * new cgroup. It should be done by a caller.
 */

static int mem_cgroup_move_account(struct page_cgroup *pc,
	struct mem_cgroup *from, struct mem_cgroup *to)
{
	struct mem_cgroup_per_zone *from_mz, *to_mz;
	int nid, zid;
	int ret = -EBUSY;

	VM_BUG_ON(from == to);
	VM_BUG_ON(PageLRU(pc->page));

	nid = page_cgroup_nid(pc);
	zid = page_cgroup_zid(pc);
	from_mz = mem_cgroup_zoneinfo(from, nid, zid);
	to_mz = mem_cgroup_zoneinfo(to, nid, zid);

	if (!trylock_page_cgroup(pc))
		return ret;

	if (!PageCgroupUsed(pc))
		goto out;

	if (pc->mem_cgroup != from)
		goto out;

	css_put(&from->css);
	res_counter_uncharge(&from->res, PAGE_SIZE);
	mem_cgroup_charge_statistics(from, pc, false);
	if (do_swap_account)
		res_counter_uncharge(&from->memsw, PAGE_SIZE);
	pc->mem_cgroup = to;
	mem_cgroup_charge_statistics(to, pc, true);
	css_get(&to->css);
	ret = 0;
out:
	unlock_page_cgroup(pc);
	return ret;
}

/*
 * move charges to its parent.
 */

static int mem_cgroup_move_parent(struct page_cgroup *pc,
				  struct mem_cgroup *child,
				  gfp_t gfp_mask)
{
	struct page *page = pc->page;
	struct cgroup *cg = child->css.cgroup;
	struct cgroup *pcg = cg->parent;
	struct mem_cgroup *parent;
	int ret;

	/* Is ROOT ? */
	if (!pcg)
		return -EINVAL;


	parent = mem_cgroup_from_cont(pcg);


	ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
	if (ret || !parent)
		return ret;

	if (!get_page_unless_zero(page))
		return -EBUSY;

	ret = isolate_lru_page(page);

	if (ret)
		goto cancel;

	ret = mem_cgroup_move_account(pc, child, parent);

	/* drop extra refcnt by try_charge() (move_account incremented one) */
	css_put(&parent->css);
	putback_lru_page(page);
	if (!ret) {
		put_page(page);
		return 0;
	}
	/* uncharge if move fails */
cancel:
	res_counter_uncharge(&parent->res, PAGE_SIZE);
	if (do_swap_account)
		res_counter_uncharge(&parent->memsw, PAGE_SIZE);
	put_page(page);
	return ret;
}

/*
 * Charge the memory controller for page usage.
 * Return
 * 0 if the charge was successful
 * < 0 if the cgroup is over its limit
 */
static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
				gfp_t gfp_mask, enum charge_type ctype,
				struct mem_cgroup *memcg)
{
	struct mem_cgroup *mem;
	struct page_cgroup *pc;
	int ret;

	pc = lookup_page_cgroup(page);
	/* can happen at boot */
	if (unlikely(!pc))
		return 0;
	prefetchw(pc);

	mem = memcg;
	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
	if (ret || !mem)
		return ret;

	__mem_cgroup_commit_charge(mem, pc, ctype);
	return 0;
}

int mem_cgroup_newpage_charge(struct page *page,
			      struct mm_struct *mm, gfp_t gfp_mask)
{
	if (mem_cgroup_disabled())
		return 0;
	if (PageCompound(page))
		return 0;
	/*
	 * If already mapped, we don't have to account.
	 * If page cache, page->mapping has address_space.
	 * But page->mapping may have out-of-use anon_vma pointer,
	 * detect it by PageAnon() check. newly-mapped-anon's page->mapping
	 * is NULL.
	 */
	if (page_mapped(page) || (page->mapping && !PageAnon(page)))
		return 0;
	if (unlikely(!mm))
		mm = &init_mm;
	return mem_cgroup_charge_common(page, mm, gfp_mask,
				MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
}

int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
				gfp_t gfp_mask)
{
	if (mem_cgroup_disabled())
		return 0;
	if (PageCompound(page))
		return 0;
	/*
	 * Corner case handling. This is usually called from
	 * add_to_page_cache(). But some FS (shmem) precharges this page
	 * before calling it and calls add_to_page_cache() with GFP_NOWAIT.
	 *
	 * For the GFP_NOWAIT case, the page may be pre-charged before calling
	 * add_to_page_cache(). (See shmem.c) Check it here and avoid charging
	 * twice. (It works but has to pay a bit larger cost.)
	 */
	if (!(gfp_mask & __GFP_WAIT)) {
		struct page_cgroup *pc;


		pc = lookup_page_cgroup(page);
		if (!pc)
			return 0;
		lock_page_cgroup(pc);
		if (PageCgroupUsed(pc)) {
			unlock_page_cgroup(pc);
			return 0;
		}
		unlock_page_cgroup(pc);
	}

	if (unlikely(!mm))
		mm = &init_mm;

	if (page_is_file_cache(page))
		return mem_cgroup_charge_common(page, mm, gfp_mask,
				MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
	else
		return mem_cgroup_charge_common(page, mm, gfp_mask,
				MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL);
}

int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
				 struct page *page,
				 gfp_t mask, struct mem_cgroup **ptr)
{
	struct mem_cgroup *mem;
	swp_entry_t ent;

	if (mem_cgroup_disabled())
		return 0;

	if (!do_swap_account)
		goto charge_cur_mm;

	/*
	 * A racing thread's fault, or swapoff, may have already updated
	 * the pte, and even removed page from swap cache: return success
	 * to go on to do_swap_page()'s pte_same() test, which should fail.
	 */
	if (!PageSwapCache(page))
		return 0;

	ent.val = page_private(page);

	mem = lookup_swap_cgroup(ent);
	if (!mem || mem->obsolete)
		goto charge_cur_mm;
	*ptr = mem;
	return __mem_cgroup_try_charge(NULL, mask, ptr, true);
charge_cur_mm:
	if (unlikely(!mm))
		mm = &init_mm;
	return __mem_cgroup_try_charge(mm, mask, ptr, true);
}

#ifdef CONFIG_SWAP

int mem_cgroup_cache_charge_swapin(struct page *page,
			struct mm_struct *mm, gfp_t mask, bool locked)
{
	int ret = 0;

	if (mem_cgroup_disabled())
		return 0;
	if (unlikely(!mm))
		mm = &init_mm;
	if (!locked)
		lock_page(page);
	/*
	 * If not locked, the page can be dropped from SwapCache until
	 * we reach here.
	 */
	if (PageSwapCache(page)) {
		struct mem_cgroup *mem = NULL;
		swp_entry_t ent;

		ent.val = page_private(page);
		if (do_swap_account) {
			mem = lookup_swap_cgroup(ent);
			if (mem && mem->obsolete)
				mem = NULL;
			if (mem)
				mm = NULL;
		}
		ret = mem_cgroup_charge_common(page, mm, mask,
				MEM_CGROUP_CHARGE_TYPE_SHMEM, mem);

		if (!ret && do_swap_account) {
			/* avoid double counting */
			mem = swap_cgroup_record(ent, NULL);
			if (mem) {
				res_counter_uncharge(&mem->memsw, PAGE_SIZE);
				mem_cgroup_put(mem);
			}
		}
	}
	if (!locked)
		unlock_page(page);
	/* add this page(page_cgroup) to the LRU we want. */
	mem_cgroup_lru_fixup(page);

	return ret;
}
#endif

void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
{
	struct page_cgroup *pc;

	if (mem_cgroup_disabled())
		return;
	if (!ptr)
		return;
	pc = lookup_page_cgroup(page);
	__mem_cgroup_commit_charge(ptr, pc, MEM_CGROUP_CHARGE_TYPE_MAPPED);
	/*
	 * Now swap is on-memory. This means this page may be
	 * counted both as mem and swap....double count.
	 * Fix it by uncharging from memsw. This SwapCache is stable
	 * because we're still under lock_page().
	 */
	if (do_swap_account) {
		swp_entry_t ent = {.val = page_private(page)};
		struct mem_cgroup *memcg;
		memcg = swap_cgroup_record(ent, NULL);
		if (memcg) {
			/* If memcg is obsolete, memcg can be != ptr */
			res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
			mem_cgroup_put(memcg);
		}

	}
	/* add this page(page_cgroup) to the LRU we want. */
	mem_cgroup_lru_fixup(page);
}

void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
{
	if (mem_cgroup_disabled())
		return;
	if (!mem)
		return;
	res_counter_uncharge(&mem->res, PAGE_SIZE);
	if (do_swap_account)
		res_counter_uncharge(&mem->memsw, PAGE_SIZE);
	css_put(&mem->css);
}


/*
 * uncharge if !page_mapped(page)
 */
static struct mem_cgroup *
__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
{
	struct page_cgroup *pc;
	struct mem_cgroup *mem = NULL;
	struct mem_cgroup_per_zone *mz;

	if (mem_cgroup_disabled())
		return NULL;

	if (PageSwapCache(page))
		return NULL;

	/*
	 * Check if our page_cgroup is valid
	 */
	pc = lookup_page_cgroup(page);
	if (unlikely(!pc || !PageCgroupUsed(pc)))
		return NULL;

	lock_page_cgroup(pc);

	mem = pc->mem_cgroup;

	if (!PageCgroupUsed(pc))
		goto unlock_out;

	switch (ctype) {
	case MEM_CGROUP_CHARGE_TYPE_MAPPED:
		if (page_mapped(page))
			goto unlock_out;
		break;
	case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
		if (!PageAnon(page)) {	/* Shared memory */
			if (page->mapping && !page_is_file_cache(page))
				goto unlock_out;
		} else if (page_mapped(page)) /* Anon */
			goto unlock_out;
		break;
	default:
		break;
	}

	res_counter_uncharge(&mem->res, PAGE_SIZE);
	if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
		res_counter_uncharge(&mem->memsw, PAGE_SIZE);

	mem_cgroup_charge_statistics(mem, pc, false);
	ClearPageCgroupUsed(pc);

	mz = page_cgroup_zoneinfo(pc);
	unlock_page_cgroup(pc);

	/* at swapout, this memcg will be accessed to record to swap */
	if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
		css_put(&mem->css);

	return mem;

unlock_out:
	unlock_page_cgroup(pc);
	return NULL;
}

void mem_cgroup_uncharge_page(struct page *page)
{
	/* early check. */
	if (page_mapped(page))
		return;
	if (page->mapping && !PageAnon(page))
		return;
	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
}

void mem_cgroup_uncharge_cache_page(struct page *page)
{
	VM_BUG_ON(page_mapped(page));
	VM_BUG_ON(page->mapping);
	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
}

/*
 * called from __delete_from_swap_cache() and drops the "page" account.
 * memcg information is recorded to swap_cgroup of "ent"
 */
void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent)
{
	struct mem_cgroup *memcg;

	memcg = __mem_cgroup_uncharge_common(page,
					MEM_CGROUP_CHARGE_TYPE_SWAPOUT);
	/* record memcg information */
	if (do_swap_account && memcg) {
		swap_cgroup_record(ent, memcg);
		mem_cgroup_get(memcg);
	}
	if (memcg)
		css_put(&memcg->css);
}

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
/*
 * called from swap_entry_free(). remove record in swap_cgroup and
 * uncharge "memsw" account.
 */
void mem_cgroup_uncharge_swap(swp_entry_t ent)
{
	struct mem_cgroup *memcg;

	if (!do_swap_account)
		return;

	memcg = swap_cgroup_record(ent, NULL);
	if (memcg) {
		res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
		mem_cgroup_put(memcg);
	}
}
#endif

/*
 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
 * page belongs to.
 */
int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
{
	struct page_cgroup *pc;
	struct mem_cgroup *mem = NULL;
	int ret = 0;

	if (mem_cgroup_disabled())
		return 0;

	pc = lookup_page_cgroup(page);
	lock_page_cgroup(pc);
	if (PageCgroupUsed(pc)) {
		mem = pc->mem_cgroup;
		css_get(&mem->css);
	}
	unlock_page_cgroup(pc);

	if (mem) {
		ret = mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem);
		css_put(&mem->css);
	}
	*ptr = mem;
	return ret;
}

/* remove redundant charge if migration failed */
void mem_cgroup_end_migration(struct mem_cgroup *mem,
		struct page *oldpage, struct page *newpage)
{
	struct page *target, *unused;
	struct page_cgroup *pc;
	enum charge_type ctype;

	if (!mem)
		return;

	/* at migration success, oldpage->mapping is NULL. */
	if (oldpage->mapping) {
		target = oldpage;
		unused = NULL;
	} else {
		target = newpage;
		unused = oldpage;
	}

	if (PageAnon(target))
		ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
	else if (page_is_file_cache(target))
		ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
	else
		ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;

	/* unused page is not on radix-tree now. */
	if (unused)
		__mem_cgroup_uncharge_common(unused, ctype);

	pc = lookup_page_cgroup(target);
	/*
	 * __mem_cgroup_commit_charge() checks the PCG_USED bit of page_cgroup.
	 * So, double-counting is effectively avoided.
	 */
	__mem_cgroup_commit_charge(mem, pc, ctype);

	/*
	 * Both of oldpage and newpage are still under lock_page().
	 * Then, we don't have to care about race in radix-tree.
	 * But we have to be careful that this page is unmapped or not.
	 *
	 * There is a case for !page_mapped(). At the start of
	 * migration, oldpage was mapped. But now, it's zapped.
	 * But we know *target* page is not freed/reused under us.
	 * mem_cgroup_uncharge_page() does all necessary checks.
	 */
	if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
		mem_cgroup_uncharge_page(target);
}

/*
 * A call to try to shrink memory usage under the specified resource
 * controller. This is typically used for page reclaiming for shmem,
 * to reduce the side effects of page allocation from shmem, which is
 * used by some mem_cgroup.
 */
int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
{
	struct mem_cgroup *mem;
	int progress = 0;
	int retry = MEM_CGROUP_RECLAIM_RETRIES;

	if (mem_cgroup_disabled())
		return 0;
	if (!mm)
		return 0;

	rcu_read_lock();
	mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
	if (unlikely(!mem)) {
		rcu_read_unlock();
		return 0;
	}
	css_get(&mem->css);
	rcu_read_unlock();

	do {
		progress = try_to_free_mem_cgroup_pages(mem, gfp_mask, true,
							get_swappiness(mem));
		progress += mem_cgroup_check_under_limit(mem);
	} while (!progress && --retry);

	css_put(&mem->css);
	if (!retry)
		return -ENOMEM;
	return 0;
}

static DEFINE_MUTEX(set_limit_mutex);

static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
				   unsigned long long val)
{

	int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
	int progress;
	u64 memswlimit;
	int ret = 0;

	while (retry_count) {
		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}
		/*
		 * Rather than hide all this in some function, I do it in
		 * an open coded manner, so you can see what it really does.
		 * We have to guarantee mem->res.limit < mem->memsw.limit.
		 */
		mutex_lock(&set_limit_mutex);
		memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
		if (memswlimit < val) {
			ret = -EINVAL;
			mutex_unlock(&set_limit_mutex);
			break;
		}
		ret = res_counter_set_limit(&memcg->res, val);
		mutex_unlock(&set_limit_mutex);

		if (!ret)
			break;

		progress = try_to_free_mem_cgroup_pages(memcg,
							GFP_KERNEL,
							false,
							get_swappiness(memcg));
		if (!progress)
			retry_count--;
	}

	return ret;
}

int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
				  unsigned long long val)
{
	int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
	u64 memlimit, oldusage, curusage;
	int ret;

	if (!do_swap_account)
		return -EINVAL;

	while (retry_count) {
		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}
		/*
		 * Rather than hide all this in some function, I do it in
		 * an open coded manner, so you can see what it really does.
		 * We have to guarantee mem->res.limit < mem->memsw.limit.
		 */
		mutex_lock(&set_limit_mutex);
		memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
		if (memlimit > val) {
			ret = -EINVAL;
			mutex_unlock(&set_limit_mutex);
			break;
		}
		ret = res_counter_set_limit(&memcg->memsw, val);
		mutex_unlock(&set_limit_mutex);

		if (!ret)
			break;

		oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
		try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL, true,
					     get_swappiness(memcg));
		curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
		if (curusage >= oldusage)
			retry_count--;
	}
	return ret;
}

/*
 * This routine traverses page_cgroups in the given list and drops them all.
 * *And* this routine doesn't reclaim the page itself, just removes the
 * page_cgroup.
 */
static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
				int node, int zid, enum lru_list lru)
{
	struct zone *zone;
	struct mem_cgroup_per_zone *mz;
	struct page_cgroup *pc, *busy;
	unsigned long flags, loop;
	struct list_head *list;
	int ret = 0;

	zone = &NODE_DATA(node)->node_zones[zid];
	mz = mem_cgroup_zoneinfo(mem, node, zid);
	list = &mz->lists[lru];

	loop = MEM_CGROUP_ZSTAT(mz, lru);
	/* give some margin against EBUSY etc...*/
	loop += 256;
	busy = NULL;
	while (loop--) {
		ret = 0;
		spin_lock_irqsave(&zone->lru_lock, flags);
		if (list_empty(list)) {
			spin_unlock_irqrestore(&zone->lru_lock, flags);
			break;
		}
		pc = list_entry(list->prev, struct page_cgroup, lru);
		if (busy == pc) {
			list_move(&pc->lru, list);
			busy = 0;
			spin_unlock_irqrestore(&zone->lru_lock, flags);
			continue;
		}
		spin_unlock_irqrestore(&zone->lru_lock, flags);

		ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL);
		if (ret == -ENOMEM)
			break;

		if (ret == -EBUSY || ret == -EINVAL) {
			/* found lock contention or "pc" is obsolete. */
			busy = pc;
			cond_resched();
		} else
			busy = NULL;
	}

	if (!ret && !list_empty(list))
		return -EBUSY;
	return ret;
}

/*
 * make the mem_cgroup's charge 0 if there is no task.
 * This enables deleting this mem_cgroup.
 */
static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
{
	int ret;
	int node, zid, shrink;
	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
	struct cgroup *cgrp = mem->css.cgroup;

	css_get(&mem->css);

	shrink = 0;
	/* should free all ? */
	if (free_all)
		goto try_to_free;
move_account:
	while (mem->res.usage > 0) {
		ret = -EBUSY;
		if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
			goto out;
		ret = -EINTR;
		if (signal_pending(current))
			goto out;
		/* This is for making all *used* pages to be on LRU. */
		lru_add_drain_all();
		ret = 0;
		for_each_node_state(node, N_POSSIBLE) {
			for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
				enum lru_list l;
				for_each_lru(l) {
					ret = mem_cgroup_force_empty_list(mem,
							node, zid, l);
					if (ret)
						break;
				}
			}
			if (ret)
				break;
		}
		/* it seems parent cgroup doesn't have enough mem */
		if (ret == -ENOMEM)
			goto try_to_free;
		cond_resched();
	}
	ret = 0;
out:
	css_put(&mem->css);
	return ret;

try_to_free:
	/* returns EBUSY if there is a task or if we come here twice. */
	if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
		ret = -EBUSY;
		goto out;
	}
	/* we call try-to-free pages to make this cgroup empty */
	lru_add_drain_all();
	/* try to free all pages in this cgroup */
	shrink = 1;
	while (nr_retries && mem->res.usage > 0) {
		int progress;

		if (signal_pending(current)) {
			ret = -EINTR;
			goto out;
		}
		progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
						false, get_swappiness(mem));
		if (!progress) {
			nr_retries--;
			/* maybe some writeback is necessary */
			congestion_wait(WRITE, HZ/10);
		}

	}
	lru_add_drain();
	/* try move_account...there may be some *locked* pages. */
	if (mem->res.usage)
		goto move_account;
	ret = 0;
	goto out;
}

int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
{
	return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
}


static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)
{
	return mem_cgroup_from_cont(cont)->use_hierarchy;
}

static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
					u64 val)
{
	int retval = 0;
	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
	struct cgroup *parent = cont->parent;
	struct mem_cgroup *parent_mem = NULL;

	if (parent)
		parent_mem = mem_cgroup_from_cont(parent);

	cgroup_lock();
	/*
	 * If parent's use_hierarchy is set, we can't make any modifications
	 * in the child subtrees. If it is unset, then the change can
	 * occur, provided the current cgroup has no children.
	 *
	 * For the root cgroup, parent_mem is NULL, we allow value to be
	 * set if there are no children.
	 */
	if ((!parent_mem || !parent_mem->use_hierarchy) &&
				(val == 1 || val == 0)) {
		if (list_empty(&cont->children))
			mem->use_hierarchy = val;
		else
			retval = -EBUSY;
	} else
		retval = -EINVAL;
	cgroup_unlock();

	return retval;
}

static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
{
	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
	u64 val = 0;
	int type, name;

	type = MEMFILE_TYPE(cft->private);
	name = MEMFILE_ATTR(cft->private);
	switch (type) {
	case _MEM:
		val = res_counter_read_u64(&mem->res, name);
		break;
	case _MEMSWAP:
		if (do_swap_account)
			val = res_counter_read_u64(&mem->memsw, name);
		break;
	default:
		BUG();
		break;
	}
	return val;
}
/*
 * The user of this function is...
 * RES_LIMIT.
 */
static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
			    const char *buffer)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
	int type, name;
	unsigned long long val;
	int ret;

	type = MEMFILE_TYPE(cft->private);
	name = MEMFILE_ATTR(cft->private);
	switch (name) {
	case RES_LIMIT:
		/* This function does all the necessary parsing... reuse it */
		ret = res_counter_memparse_write_strategy(buffer, &val);
		if (ret)
			break;
		if (type == _MEM)
			ret = mem_cgroup_resize_limit(memcg, val);
		else
			ret = mem_cgroup_resize_memsw_limit(memcg, val);
		break;
	default:
		ret = -EINVAL; /* should be BUG() ? */
		break;
	}
	return ret;
}

static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
{
	struct mem_cgroup *mem;
	int type, name;

	mem = mem_cgroup_from_cont(cont);
	type = MEMFILE_TYPE(event);
	name = MEMFILE_ATTR(event);
	switch (name) {
	case RES_MAX_USAGE:
		if (type == _MEM)
			res_counter_reset_max(&mem->res);
		else
			res_counter_reset_max(&mem->memsw);
		break;
	case RES_FAILCNT:
		if (type == _MEM)
			res_counter_reset_failcnt(&mem->res);
		else
			res_counter_reset_failcnt(&mem->memsw);
		break;
	}
	return 0;
}

static const struct mem_cgroup_stat_desc {
	const char *msg;
	u64 unit;
} mem_cgroup_stat_desc[] = {
	[MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, },
	[MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, },
	[MEM_CGROUP_STAT_PGPGIN_COUNT] = {"pgpgin", 1, },
	[MEM_CGROUP_STAT_PGPGOUT_COUNT] = {"pgpgout", 1, },
};

static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
				 struct cgroup_map_cb *cb)
{
	struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
	struct mem_cgroup_stat *stat = &mem_cont->stat;
	int i;

	for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) {
		s64 val;

		val = mem_cgroup_read_stat(stat, i);
		val *= mem_cgroup_stat_desc[i].unit;
		cb->fill(cb, mem_cgroup_stat_desc[i].msg, val);
	}
	/* showing # of active pages */
	{
		unsigned long active_anon, inactive_anon;
		unsigned long active_file, inactive_file;
		unsigned long unevictable;

		inactive_anon = mem_cgroup_get_all_zonestat(mem_cont,
						LRU_INACTIVE_ANON);
		active_anon = mem_cgroup_get_all_zonestat(mem_cont,
						LRU_ACTIVE_ANON);
		inactive_file = mem_cgroup_get_all_zonestat(mem_cont,
						LRU_INACTIVE_FILE);
		active_file = mem_cgroup_get_all_zonestat(mem_cont,
						LRU_ACTIVE_FILE);
		unevictable = mem_cgroup_get_all_zonestat(mem_cont,
							LRU_UNEVICTABLE);

		cb->fill(cb, "active_anon", (active_anon) * PAGE_SIZE);
		cb->fill(cb, "inactive_anon", (inactive_anon) * PAGE_SIZE);
		cb->fill(cb, "active_file", (active_file) * PAGE_SIZE);
		cb->fill(cb, "inactive_file", (inactive_file) * PAGE_SIZE);
		cb->fill(cb, "unevictable", unevictable * PAGE_SIZE);

	}

#ifdef CONFIG_DEBUG_VM
	cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));

	{
		int nid, zid;
		struct mem_cgroup_per_zone *mz;
		unsigned long recent_rotated[2] = {0, 0};
		unsigned long recent_scanned[2] = {0, 0};

		for_each_online_node(nid)
			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
				mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);

				recent_rotated[0] +=
					mz->reclaim_stat.recent_rotated[0];
				recent_rotated[1] +=
					mz->reclaim_stat.recent_rotated[1];
				recent_scanned[0] +=
					mz->reclaim_stat.recent_scanned[0];
				recent_scanned[1] +=
					mz->reclaim_stat.recent_scanned[1];
			}
		cb->fill(cb, "recent_rotated_anon", recent_rotated[0]);
		cb->fill(cb, "recent_rotated_file", recent_rotated[1]);
		cb->fill(cb, "recent_scanned_anon", recent_scanned[0]);
		cb->fill(cb, "recent_scanned_file", recent_scanned[1]);
	}
#endif

	return 0;
}

static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);

	return get_swappiness(memcg);
}
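/*
 * Per-memcg swappiness takes the same 0-100 range as vm_swappiness (the
 * root group simply reports vm_swappiness, see get_swappiness()). The
 * write handler below rejects values above 100, writes to the root group,
 * writes while the parent uses hierarchy, and writes while this group
 * uses hierarchy and already has children.
 */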

static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
				       u64 val)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
	struct mem_cgroup *parent;
	if (val > 100)
		return -EINVAL;

	if (cgrp->parent == NULL)
		return -EINVAL;

	parent = mem_cgroup_from_cont(cgrp->parent);
	/* If under hierarchy, only empty-root can set this value */
	if ((parent->use_hierarchy) ||
	    (memcg->use_hierarchy && !list_empty(&cgrp->children)))
		return -EINVAL;

	spin_lock(&memcg->reclaim_param_lock);
	memcg->swappiness = val;
	spin_unlock(&memcg->reclaim_param_lock);

	return 0;
}


static struct cftype mem_cgroup_files[] = {
	{
		.name = "usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "max_usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
		.trigger = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "limit_in_bytes",
		.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
		.write_string = mem_cgroup_write,
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "failcnt",
		.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
		.trigger = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "stat",
		.read_map = mem_control_stat_show,
	},
	{
		.name = "force_empty",
		.trigger = mem_cgroup_force_empty_write,
	},
	{
		.name = "use_hierarchy",
		.write_u64 = mem_cgroup_hierarchy_write,
		.read_u64 = mem_cgroup_hierarchy_read,
	},
	{
		.name = "swappiness",
		.read_u64 = mem_cgroup_swappiness_read,
		.write_u64 = mem_cgroup_swappiness_write,
	},
};

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
static struct cftype memsw_cgroup_files[] = {
	{
		.name = "memsw.usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "memsw.max_usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
		.trigger = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "memsw.limit_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
		.write_string = mem_cgroup_write,
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "memsw.failcnt",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
		.trigger = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read,
	},
};

static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
{
	if (!do_swap_account)
		return 0;
	return cgroup_add_files(cont, ss, memsw_cgroup_files,
				ARRAY_SIZE(memsw_cgroup_files));
};
#else
static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
{
	return 0;
}
#endif

static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
{
	struct mem_cgroup_per_node *pn;
	struct mem_cgroup_per_zone *mz;
	enum lru_list l;
	int zone, tmp = node;
	/*
	 * This routine is called against possible nodes.
	 * But it's BUG to call kmalloc() against offline node.
	 *
	 * TODO: this routine can waste much memory for nodes which will
	 *       never be onlined. It's better to use memory hotplug callback
	 *       function.
	 */
	if (!node_state(node, N_NORMAL_MEMORY))
		tmp = -1;
	pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
	if (!pn)
		return 1;

	mem->info.nodeinfo[node] = pn;
	memset(pn, 0, sizeof(*pn));

	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
		mz = &pn->zoneinfo[zone];
		for_each_lru(l)
			INIT_LIST_HEAD(&mz->lists[l]);
	}
	return 0;
}

static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
{
	kfree(mem->info.nodeinfo[node]);
}

static int mem_cgroup_size(void)
{
	int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu);
	return sizeof(struct mem_cgroup) + cpustat_size;
}

static struct mem_cgroup *mem_cgroup_alloc(void)
{
	struct mem_cgroup *mem;
	int size = mem_cgroup_size();

	if (size < PAGE_SIZE)
		mem = kmalloc(size, GFP_KERNEL);
	else
		mem = vmalloc(size);

	if (mem)
		memset(mem, 0, size);
	return mem;
}

/*
 * At destroying mem_cgroup, references from swap_cgroup can remain.
 * (scanning all at force_empty is too costly...)
 *
 * Instead of clearing all references at force_empty, we remember
 * the number of references from swap_cgroup and free mem_cgroup when
 * it goes down to 0.
 *
 * When mem_cgroup is destroyed, mem->obsolete will be set and any
 * entry which points to this memcg will be ignored at swapin.
 *
 * Removal of the cgroup itself succeeds regardless of refs from swap.
 */

static void mem_cgroup_free(struct mem_cgroup *mem)
{
	int node;

	if (atomic_read(&mem->refcnt) > 0)
		return;


	for_each_node_state(node, N_POSSIBLE)
		free_mem_cgroup_per_zone_info(mem, node);

	if (mem_cgroup_size() < PAGE_SIZE)
		kfree(mem);
	else
		vfree(mem);
}

static void mem_cgroup_get(struct mem_cgroup *mem)
{
	atomic_inc(&mem->refcnt);
}

static void mem_cgroup_put(struct mem_cgroup *mem)
{
	if (atomic_dec_and_test(&mem->refcnt)) {
		if (!mem->obsolete)
			return;
		mem_cgroup_free(mem);
	}
}


#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
static void __init enable_swap_cgroup(void)
{
	if (!mem_cgroup_disabled() && really_do_swap_account)
		do_swap_account = 1;
}
#else
static void __init enable_swap_cgroup(void)
{
}
#endif

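/*
 * Create-time behaviour, for reference: the root group (no parent) enables
 * swap accounting via enable_swap_cgroup(); children inherit use_hierarchy
 * and swappiness from their parent, and their res/memsw counters are
 * parented only when the parent has use_hierarchy set.
 */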
static struct cgroup_subsys_state *
mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
{
	struct mem_cgroup *mem, *parent;
	int node;

	mem = mem_cgroup_alloc();
	if (!mem)
		return ERR_PTR(-ENOMEM);

	for_each_node_state(node, N_POSSIBLE)
		if (alloc_mem_cgroup_per_zone_info(mem, node))
			goto free_out;
	/* root ? */
	if (cont->parent == NULL) {
		enable_swap_cgroup();
		parent = NULL;
	} else {
		parent = mem_cgroup_from_cont(cont->parent);
		mem->use_hierarchy = parent->use_hierarchy;
	}

	if (parent && parent->use_hierarchy) {
		res_counter_init(&mem->res, &parent->res);
		res_counter_init(&mem->memsw, &parent->memsw);
	} else {
		res_counter_init(&mem->res, NULL);
		res_counter_init(&mem->memsw, NULL);
	}
	mem->last_scanned_child = NULL;
	spin_lock_init(&mem->reclaim_param_lock);

	if (parent)
		mem->swappiness = get_swappiness(parent);

	return &mem->css;
free_out:
	for_each_node_state(node, N_POSSIBLE)
		free_mem_cgroup_per_zone_info(mem, node);
	mem_cgroup_free(mem);
	return ERR_PTR(-ENOMEM);
}

static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
					struct cgroup *cont)
{
	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
	mem->obsolete = 1;
	mem_cgroup_force_empty(mem, false);
}

static void mem_cgroup_destroy(struct cgroup_subsys *ss,
				struct cgroup *cont)
{
	mem_cgroup_free(mem_cgroup_from_cont(cont));
}

static int mem_cgroup_populate(struct cgroup_subsys *ss,
				struct cgroup *cont)
{
	int ret;

	ret = cgroup_add_files(cont, ss, mem_cgroup_files,
				ARRAY_SIZE(mem_cgroup_files));

	if (!ret)
		ret = register_memsw_files(cont, ss);
	return ret;
}

static void mem_cgroup_move_task(struct cgroup_subsys *ss,
				struct cgroup *cont,
				struct cgroup *old_cont,
				struct task_struct *p)
{
	/*
	 * FIXME: It's better to move charges of this process from old
	 * memcg to new memcg. But it's just on TODO-List now.
	 */
}

struct cgroup_subsys mem_cgroup_subsys = {
	.name = "memory",
	.subsys_id = mem_cgroup_subsys_id,
	.create = mem_cgroup_create,
	.pre_destroy = mem_cgroup_pre_destroy,
	.destroy = mem_cgroup_destroy,
	.populate = mem_cgroup_populate,
	.attach = mem_cgroup_move_task,
	.early_init = 0,
};

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP

static int __init disable_swap_account(char *s)
{
	really_do_swap_account = 0;
	return 1;
}
__setup("noswapaccount", disable_swap_account);
#endif