memcontrol.c revision bd112db872c2f69993c86f458467acb4a14da010
/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */

#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmalloc.h>
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
#include "internal.h"

#include <asm/uaccess.h>

struct cgroup_subsys mem_cgroup_subsys __read_mostly;
#define MEM_CGROUP_RECLAIM_RETRIES	5

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
int do_swap_account __read_mostly;
static int really_do_swap_account __initdata = 1; /* to remember the boot option */
#else
#define do_swap_account		(0)
#endif

static DEFINE_MUTEX(memcg_tasklist);	/* can be held under cgroup_mutex */

/*
 * Statistics for memory cgroup.
 */
enum mem_cgroup_stat_index {
	/*
	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
	 */
	MEM_CGROUP_STAT_CACHE,		/* # of pages charged as cache */
	MEM_CGROUP_STAT_RSS,		/* # of pages charged as rss */
	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */

	MEM_CGROUP_STAT_NSTATS,
};

struct mem_cgroup_stat_cpu {
	s64 count[MEM_CGROUP_STAT_NSTATS];
} ____cacheline_aligned_in_smp;

struct mem_cgroup_stat {
	struct mem_cgroup_stat_cpu cpustat[0];
};

/*
 * For accounting under irq disable, no need to increment the preempt count.
 */
static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat,
		enum mem_cgroup_stat_index idx, int val)
{
	stat->count[idx] += val;
}

static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
		enum mem_cgroup_stat_index idx)
{
	int cpu;
	s64 ret = 0;
	for_each_possible_cpu(cpu)
		ret += stat->cpustat[cpu].count[idx];
	return ret;
}

/*
 * per-zone information in memory controller.
 */
struct mem_cgroup_per_zone {
	/*
	 * spin_lock to protect the per cgroup LRU
	 */
	struct list_head	lists[NR_LRU_LISTS];
	unsigned long		count[NR_LRU_LISTS];

	struct zone_reclaim_stat reclaim_stat;
};
/* Macro for accessing counter */
#define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])

struct mem_cgroup_per_node {
	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};

struct mem_cgroup_lru_info {
	struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
};

/*
 * The memory controller data structure. The memory controller controls both
 * page cache and RSS per cgroup. We would eventually like to provide
 * statistics based on the statistics developed by Rik van Riel for clock-pro,
 * to help the administrator determine what knobs to tune.
 *
 * TODO: Add a water mark for the memory controller. Reclaim will begin when
 * we hit the water mark. Maybe even add a low water mark, such that
 * no reclaim occurs from a cgroup at its low water mark; this is
 * a feature that will be implemented much later in the future.
 */
struct mem_cgroup {
	struct cgroup_subsys_state css;
	/*
	 * the counter to account for memory usage
	 */
	struct res_counter res;
	/*
	 * the counter to account for mem+swap usage.
	 */
	struct res_counter memsw;
	/*
	 * Per cgroup active and inactive list, similar to the
	 * per zone LRU lists.
	 */
	struct mem_cgroup_lru_info info;

	/*
	 * protects reclaim-related members.
	 */
	spinlock_t reclaim_param_lock;

	int	prev_priority;	/* for recording reclaim priority */

	/*
	 * While reclaiming in a hierarchy, we cache the last child we
	 * reclaimed from. Protected by hierarchy_mutex
	 */
	struct mem_cgroup *last_scanned_child;
	/*
	 * Should the accounting and control be hierarchical, per subtree?
	 */
	bool use_hierarchy;
	unsigned long	last_oom_jiffies;
	atomic_t	refcnt;

	unsigned int	swappiness;

	/*
	 * statistics. This must be placed at the end of memcg.
	 */
	struct mem_cgroup_stat stat;
};

enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
	MEM_CGROUP_CHARGE_TYPE_MAPPED,
	MEM_CGROUP_CHARGE_TYPE_SHMEM,	/* used by page migration of shmem */
	MEM_CGROUP_CHARGE_TYPE_FORCE,	/* used by force_empty */
	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
	NR_CHARGE_TYPE,
};

/* only for here (for easy reading.) */
#define PCGF_CACHE	(1UL << PCG_CACHE)
#define PCGF_USED	(1UL << PCG_USED)
#define PCGF_LOCK	(1UL << PCG_LOCK)
static const unsigned long
pcg_default_flags[NR_CHARGE_TYPE] = {
	PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */
	PCGF_USED | PCGF_LOCK, /* Anon */
	PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
	0, /* FORCE */
};

/* for encoding cft->private value on file */
#define _MEM			(0)
#define _MEMSWAP		(1)
#define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
#define MEMFILE_TYPE(val)	(((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)

static void mem_cgroup_get(struct mem_cgroup *mem);
static void mem_cgroup_put(struct mem_cgroup *mem);

static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
					 struct page_cgroup *pc,
					 bool charge)
{
	int val = (charge) ? 1 : -1;
	struct mem_cgroup_stat *stat = &mem->stat;
	struct mem_cgroup_stat_cpu *cpustat;
	int cpu = get_cpu();

	cpustat = &stat->cpustat[cpu];
	if (PageCgroupCache(pc))
		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
	else
		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val);

	if (charge)
		__mem_cgroup_stat_add_safe(cpustat,
				MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
	else
		__mem_cgroup_stat_add_safe(cpustat,
				MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
	put_cpu();
}

static struct mem_cgroup_per_zone *
mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
{
	return &mem->info.nodeinfo[nid]->zoneinfo[zid];
}

static struct mem_cgroup_per_zone *
page_cgroup_zoneinfo(struct page_cgroup *pc)
{
	struct mem_cgroup *mem = pc->mem_cgroup;
	int nid = page_cgroup_nid(pc);
	int zid = page_cgroup_zid(pc);

	if (!mem)
		return NULL;

	return mem_cgroup_zoneinfo(mem, nid, zid);
}

static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem,
					enum lru_list idx)
{
	int nid, zid;
	struct mem_cgroup_per_zone *mz;
	u64 total = 0;

	for_each_online_node(nid)
		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
			mz = mem_cgroup_zoneinfo(mem, nid, zid);
			total += MEM_CGROUP_ZSTAT(mz, idx);
		}
	return total;
}

static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
{
	return container_of(cgroup_subsys_state(cont,
				mem_cgroup_subsys_id), struct mem_cgroup,
				css);
}

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	/*
	 * mm_update_next_owner() may clear mm->owner to NULL
	 * if it races with swapoff, page migration, etc.
	 * So this can be called with p == NULL.
	 */
	if (unlikely(!p))
		return NULL;

	return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
				struct mem_cgroup, css);
}

static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
{
	struct mem_cgroup *mem = NULL;
	/*
	 * Because we have no locks, mm->owner may be moved to another
	 * cgroup. We use css_tryget() here, even if it looks
	 * pessimistic, rather than adding locks.
	 */
	rcu_read_lock();
	do {
		mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
		if (unlikely(!mem))
			break;
	} while (!css_tryget(&mem->css));
	rcu_read_unlock();
	return mem;
}

static bool mem_cgroup_is_obsolete(struct mem_cgroup *mem)
{
	if (!mem)
		return true;
	return css_is_removed(&mem->css);
}

/*
 * The following LRU functions are allowed to be used without PCG_LOCK.
 * Operations are called by routines of the global LRU independently from
 * memcg. What we have to take care of here is the validity of pc->mem_cgroup.
 *
 * Changes to pc->mem_cgroup happen when
 * 1. charge
 * 2. moving account
 * In the typical case, "charge" is done before add-to-lru. The exception is
 * SwapCache, which is added to the LRU before being charged.
 * If the PCG_USED bit is not set, the page_cgroup is not added to this
 * private LRU. When moving account, the page is not on the LRU. It's isolated.
 */

void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
{
	struct page_cgroup *pc;
	struct mem_cgroup *mem;
	struct mem_cgroup_per_zone *mz;

	if (mem_cgroup_disabled())
		return;
	pc = lookup_page_cgroup(page);
	/* can happen while we handle swapcache. */
	if (list_empty(&pc->lru) || !pc->mem_cgroup)
		return;
	/*
	 * We don't check the PCG_USED bit. It's cleared when the "page" is
	 * finally removed from the global LRU.
	 */
	mz = page_cgroup_zoneinfo(pc);
	mem = pc->mem_cgroup;
	MEM_CGROUP_ZSTAT(mz, lru) -= 1;
	list_del_init(&pc->lru);
	return;
}

void mem_cgroup_del_lru(struct page *page)
{
	mem_cgroup_del_lru_list(page, page_lru(page));
}

void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
{
	struct mem_cgroup_per_zone *mz;
	struct page_cgroup *pc;

	if (mem_cgroup_disabled())
		return;

	pc = lookup_page_cgroup(page);
	/*
	 * Used bit is set without atomic ops but after smp_wmb().
	 * For making pc->mem_cgroup visible, insert smp_rmb() here.
	 */
	smp_rmb();
	/* unused page is not rotated. */
	if (!PageCgroupUsed(pc))
		return;
	mz = page_cgroup_zoneinfo(pc);
	list_move(&pc->lru, &mz->lists[lru]);
}

void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
{
	struct page_cgroup *pc;
	struct mem_cgroup_per_zone *mz;

	if (mem_cgroup_disabled())
		return;
	pc = lookup_page_cgroup(page);
	/*
	 * Used bit is set without atomic ops but after smp_wmb().
	 * For making pc->mem_cgroup visible, insert smp_rmb() here.
	 */
	smp_rmb();
	if (!PageCgroupUsed(pc))
		return;

	mz = page_cgroup_zoneinfo(pc);
	MEM_CGROUP_ZSTAT(mz, lru) += 1;
	list_add(&pc->lru, &mz->lists[lru]);
}

/*
 * When handling SwapCache, pc->mem_cgroup may be changed while it's linked to
 * the LRU because the page may be reused after it's fully uncharged (because
 * of SwapCache behavior). To handle that, unlink the page_cgroup from the LRU
 * when we charge it again. This function is only used to charge SwapCache.
 * It's done under lock_page() and expects that zone->lru_lock is never held.
 */
static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page)
{
	unsigned long flags;
	struct zone *zone = page_zone(page);
	struct page_cgroup *pc = lookup_page_cgroup(page);

	spin_lock_irqsave(&zone->lru_lock, flags);
	/*
	 * Forget old LRU when this page_cgroup is *not* used. This Used bit
	 * is guarded by lock_page() because the page is SwapCache.
	 */
	if (!PageCgroupUsed(pc))
		mem_cgroup_del_lru_list(page, page_lru(page));
	spin_unlock_irqrestore(&zone->lru_lock, flags);
}

static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page)
{
	unsigned long flags;
	struct zone *zone = page_zone(page);
	struct page_cgroup *pc = lookup_page_cgroup(page);

	spin_lock_irqsave(&zone->lru_lock, flags);
	/* link when the page is linked to LRU but page_cgroup isn't */
	if (PageLRU(page) && list_empty(&pc->lru))
		mem_cgroup_add_lru_list(page, page_lru(page));
	spin_unlock_irqrestore(&zone->lru_lock, flags);
}


void mem_cgroup_move_lists(struct page *page,
			   enum lru_list from, enum lru_list to)
{
	if (mem_cgroup_disabled())
		return;
	mem_cgroup_del_lru_list(page, from);
	mem_cgroup_add_lru_list(page, to);
}

int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
{
	int ret;

	task_lock(task);
	ret = task->mm && mm_match_cgroup(task->mm, mem);
	task_unlock(task);
	return ret;
}

/*
 * Calculate mapped_ratio under the memory controller. This will be used in
 * vmscan.c for determining whether we have to reclaim mapped pages.
 */
int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
{
	long total, rss;

	/*
	 * usage is recorded in bytes. But, here, we assume the number of
	 * physical pages can be represented by "long" on any arch.
	 */
	total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L;
	rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
	return (int)((rss * 100L) / total);
}

/*
 * prev_priority control... this will be used in the memory reclaim path.
 */
int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
{
	int prev_priority;

	spin_lock(&mem->reclaim_param_lock);
	prev_priority = mem->prev_priority;
	spin_unlock(&mem->reclaim_param_lock);

	return prev_priority;
}

void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
{
	spin_lock(&mem->reclaim_param_lock);
	if (priority < mem->prev_priority)
		mem->prev_priority = priority;
	spin_unlock(&mem->reclaim_param_lock);
}

void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
{
	spin_lock(&mem->reclaim_param_lock);
	mem->prev_priority = priority;
	spin_unlock(&mem->reclaim_param_lock);
}

static int calc_inactive_ratio(struct mem_cgroup *memcg,
			       unsigned long *present_pages)
{
	unsigned long active;
	unsigned long inactive;
	unsigned long gb;
	unsigned long inactive_ratio;

	inactive = mem_cgroup_get_all_zonestat(memcg, LRU_INACTIVE_ANON);
	active = mem_cgroup_get_all_zonestat(memcg, LRU_ACTIVE_ANON);

	gb = (inactive + active) >> (30 - PAGE_SHIFT);
	if (gb)
		inactive_ratio = int_sqrt(10 * gb);
	else
		inactive_ratio = 1;

	if (present_pages) {
		present_pages[0] = inactive;
		present_pages[1] = active;
	}

	return inactive_ratio;
}

int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
{
	unsigned long active;
	unsigned long inactive;
	unsigned long present_pages[2];
	unsigned long inactive_ratio;

	inactive_ratio = calc_inactive_ratio(memcg, present_pages);

	inactive = present_pages[0];
	active = present_pages[1];

	if (inactive * inactive_ratio < active)
		return 1;

	return 0;
}

unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
				       struct zone *zone,
				       enum lru_list lru)
{
	int nid = zone->zone_pgdat->node_id;
	int zid = zone_idx(zone);
	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);

	return MEM_CGROUP_ZSTAT(mz, lru);
}

struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
						      struct zone *zone)
{
	int nid = zone->zone_pgdat->node_id;
	int zid = zone_idx(zone);
	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);

	return &mz->reclaim_stat;
}

struct zone_reclaim_stat *
mem_cgroup_get_reclaim_stat_from_page(struct page *page)
{
	struct page_cgroup *pc;
	struct mem_cgroup_per_zone *mz;

	if (mem_cgroup_disabled())
		return NULL;

	pc = lookup_page_cgroup(page);
	/*
	 * Used bit is set without atomic ops but after smp_wmb().
	 * For making pc->mem_cgroup visible, insert smp_rmb() here.
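	 *
	 * The write side is __mem_cgroup_commit_charge(): it stores
	 * pc->mem_cgroup, issues smp_wmb(), and only then sets pc->flags,
	 * which carries the PCG_USED bit. A reader that observes PCG_USED
	 * after smp_rmb() therefore also observes a valid pc->mem_cgroup.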
	 */
	smp_rmb();
	if (!PageCgroupUsed(pc))
		return NULL;

	mz = page_cgroup_zoneinfo(pc);
	if (!mz)
		return NULL;

	return &mz->reclaim_stat;
}

unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
					struct list_head *dst,
					unsigned long *scanned, int order,
					int mode, struct zone *z,
					struct mem_cgroup *mem_cont,
					int active, int file)
{
	unsigned long nr_taken = 0;
	struct page *page;
	unsigned long scan;
	LIST_HEAD(pc_list);
	struct list_head *src;
	struct page_cgroup *pc, *tmp;
	int nid = z->zone_pgdat->node_id;
	int zid = zone_idx(z);
	struct mem_cgroup_per_zone *mz;
	int lru = LRU_FILE * !!file + !!active;

	BUG_ON(!mem_cont);
	mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
	src = &mz->lists[lru];

	scan = 0;
	list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
		if (scan >= nr_to_scan)
			break;

		page = pc->page;
		if (unlikely(!PageCgroupUsed(pc)))
			continue;
		if (unlikely(!PageLRU(page)))
			continue;

		scan++;
		if (__isolate_lru_page(page, mode, file) == 0) {
			list_move(&page->lru, dst);
			nr_taken++;
		}
	}

	*scanned = scan;
	return nr_taken;
}

#define mem_cgroup_from_res_counter(counter, member)	\
	container_of(counter, struct mem_cgroup, member)

/*
 * This routine finds the DFS walk successor. It must be called with
 * hierarchy_mutex held.
 */
static struct mem_cgroup *
mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem)
{
	struct cgroup *cgroup, *curr_cgroup, *root_cgroup;

	curr_cgroup = curr->css.cgroup;
	root_cgroup = root_mem->css.cgroup;

	if (!list_empty(&curr_cgroup->children)) {
		/*
		 * Walk down to children
		 */
		mem_cgroup_put(curr);
		cgroup = list_entry(curr_cgroup->children.next,
				struct cgroup, sibling);
		curr = mem_cgroup_from_cont(cgroup);
		mem_cgroup_get(curr);
		goto done;
	}

visit_parent:
	if (curr_cgroup == root_cgroup) {
		mem_cgroup_put(curr);
		curr = root_mem;
		mem_cgroup_get(curr);
		goto done;
	}

	/*
	 * Goto next sibling
	 */
	if (curr_cgroup->sibling.next != &curr_cgroup->parent->children) {
		mem_cgroup_put(curr);
		cgroup = list_entry(curr_cgroup->sibling.next, struct cgroup,
				sibling);
		curr = mem_cgroup_from_cont(cgroup);
		mem_cgroup_get(curr);
		goto done;
	}

	/*
	 * Go up to next parent and next parent's sibling if need be
	 */
	curr_cgroup = curr_cgroup->parent;
	goto visit_parent;

done:
	root_mem->last_scanned_child = curr;
	return curr;
}

/*
 * Visit the first child (need not be the first child as per the ordering
 * of the cgroup list, since we track last_scanned_child) of @root_mem and
 * use that to reclaim free pages from.
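 * If @root_mem has no children, @root_mem itself is returned. Either way,
 * the chosen mem_cgroup is cached in root_mem->last_scanned_child for the
 * next scan.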
 */
static struct mem_cgroup *
mem_cgroup_get_first_node(struct mem_cgroup *root_mem)
{
	struct cgroup *cgroup;
	struct mem_cgroup *ret;
	bool obsolete;

	obsolete = mem_cgroup_is_obsolete(root_mem->last_scanned_child);

	/*
	 * Scan all children under the mem_cgroup mem
	 */
	mutex_lock(&mem_cgroup_subsys.hierarchy_mutex);
	if (list_empty(&root_mem->css.cgroup->children)) {
		ret = root_mem;
		goto done;
	}

	if (!root_mem->last_scanned_child || obsolete) {

		if (obsolete && root_mem->last_scanned_child)
			mem_cgroup_put(root_mem->last_scanned_child);

		cgroup = list_first_entry(&root_mem->css.cgroup->children,
				struct cgroup, sibling);
		ret = mem_cgroup_from_cont(cgroup);
		mem_cgroup_get(ret);
	} else
		ret = mem_cgroup_get_next_node(root_mem->last_scanned_child,
						root_mem);

done:
	root_mem->last_scanned_child = ret;
	mutex_unlock(&mem_cgroup_subsys.hierarchy_mutex);
	return ret;
}

static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
{
	if (do_swap_account) {
		if (res_counter_check_under_limit(&mem->res) &&
		    res_counter_check_under_limit(&mem->memsw))
			return true;
	} else
		if (res_counter_check_under_limit(&mem->res))
			return true;
	return false;
}

static unsigned int get_swappiness(struct mem_cgroup *memcg)
{
	struct cgroup *cgrp = memcg->css.cgroup;
	unsigned int swappiness;

	/* root ? */
	if (cgrp->parent == NULL)
		return vm_swappiness;

	spin_lock(&memcg->reclaim_param_lock);
	swappiness = memcg->swappiness;
	spin_unlock(&memcg->reclaim_param_lock);

	return swappiness;
}

/*
 * Dance down the hierarchy if needed to reclaim memory. We remember the
 * last child we reclaimed from, so that we don't end up penalizing
 * one child extensively based on its position in the children list.
 *
 * root_mem is the original ancestor that we've been reclaiming from.
 */
static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
						gfp_t gfp_mask, bool noswap)
{
	struct mem_cgroup *next_mem;
	int ret = 0;

	/*
	 * Reclaim unconditionally and don't check the return value.
	 * We need to reclaim in the current group and down the tree.
	 * One might think about checking for children before reclaiming,
	 * but there might be leftover accounting, even after children
	 * have left.
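	 * (With use_hierarchy, a child's charges are also accounted in
	 * every ancestor's res_counter, and charged pages can outlive the
	 * child's tasks, so the subtree root can be over its limit even
	 * when it has no pages of its own.)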
	 */
	ret = try_to_free_mem_cgroup_pages(root_mem, gfp_mask, noswap,
					   get_swappiness(root_mem));
	if (mem_cgroup_check_under_limit(root_mem))
		return 0;
	if (!root_mem->use_hierarchy)
		return ret;

	next_mem = mem_cgroup_get_first_node(root_mem);

	while (next_mem != root_mem) {
		if (mem_cgroup_is_obsolete(next_mem)) {
			mem_cgroup_put(next_mem);
			next_mem = mem_cgroup_get_first_node(root_mem);
			continue;
		}
		ret = try_to_free_mem_cgroup_pages(next_mem, gfp_mask, noswap,
						   get_swappiness(next_mem));
		if (mem_cgroup_check_under_limit(root_mem))
			return 0;
		mutex_lock(&mem_cgroup_subsys.hierarchy_mutex);
		next_mem = mem_cgroup_get_next_node(next_mem, root_mem);
		mutex_unlock(&mem_cgroup_subsys.hierarchy_mutex);
	}
	return ret;
}

bool mem_cgroup_oom_called(struct task_struct *task)
{
	bool ret = false;
	struct mem_cgroup *mem;
	struct mm_struct *mm;

	rcu_read_lock();
	mm = task->mm;
	if (!mm)
		mm = &init_mm;
	mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
	if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10))
		ret = true;
	rcu_read_unlock();
	return ret;
}
/*
 * Unlike the exported interface, an "oom" parameter is added. If oom == true,
 * the oom-killer can be invoked.
 */
static int __mem_cgroup_try_charge(struct mm_struct *mm,
			gfp_t gfp_mask, struct mem_cgroup **memcg,
			bool oom)
{
	struct mem_cgroup *mem, *mem_over_limit;
	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
	struct res_counter *fail_res;

	if (unlikely(test_thread_flag(TIF_MEMDIE))) {
		/* Don't account this! */
		*memcg = NULL;
		return 0;
	}

	/*
	 * We always charge the cgroup the mm_struct belongs to.
	 * The mm_struct's mem_cgroup changes on task migration if the
	 * thread group leader migrates. It's possible that mm is not
	 * set; if so, charge the init_mm (happens for pagecache usage).
	 */
	mem = *memcg;
	if (likely(!mem)) {
		mem = try_get_mem_cgroup_from_mm(mm);
		*memcg = mem;
	} else {
		css_get(&mem->css);
	}
	if (unlikely(!mem))
		return 0;

	VM_BUG_ON(mem_cgroup_is_obsolete(mem));

	while (1) {
		int ret;
		bool noswap = false;

		ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res);
		if (likely(!ret)) {
			if (!do_swap_account)
				break;
			ret = res_counter_charge(&mem->memsw, PAGE_SIZE,
							&fail_res);
			if (likely(!ret))
				break;
			/* mem+swap counter fails */
			res_counter_uncharge(&mem->res, PAGE_SIZE);
			noswap = true;
			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
									memsw);
		} else
			/* mem counter fails */
			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
									res);

		if (!(gfp_mask & __GFP_WAIT))
			goto nomem;

		ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask,
							noswap);

		/*
		 * try_to_free_mem_cgroup_pages() might not give us a full
		 * picture of reclaim. Some pages are reclaimed and might be
		 * moved to swap cache or just unmapped from the cgroup.
		 * Check the limit again to see if the reclaim reduced the
		 * current usage of the cgroup before giving up.
		 */
		if (mem_cgroup_check_under_limit(mem_over_limit))
			continue;

		if (!nr_retries--) {
			if (oom) {
				mutex_lock(&memcg_tasklist);
				mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
				mutex_unlock(&memcg_tasklist);
				mem_over_limit->last_oom_jiffies = jiffies;
			}
			goto nomem;
		}
	}
	return 0;
nomem:
	css_put(&mem->css);
	return -ENOMEM;
}

static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page)
{
	struct mem_cgroup *mem;
	swp_entry_t ent;

	if (!PageSwapCache(page))
		return NULL;

	ent.val = page_private(page);
	mem = lookup_swap_cgroup(ent);
	if (!mem)
		return NULL;
	if (!css_tryget(&mem->css))
		return NULL;
	return mem;
}

/*
 * Commit a charge obtained by __mem_cgroup_try_charge() and make the
 * page_cgroup USED. If it is already USED, uncharge and return.
 */

static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
				     struct page_cgroup *pc,
				     enum charge_type ctype)
{
	/* try_charge() can return with *memcg == NULL; handle that here. */
	if (!mem)
		return;

	lock_page_cgroup(pc);
	if (unlikely(PageCgroupUsed(pc))) {
		unlock_page_cgroup(pc);
		res_counter_uncharge(&mem->res, PAGE_SIZE);
		if (do_swap_account)
			res_counter_uncharge(&mem->memsw, PAGE_SIZE);
		css_put(&mem->css);
		return;
	}
	pc->mem_cgroup = mem;
	smp_wmb();
	pc->flags = pcg_default_flags[ctype];

	mem_cgroup_charge_statistics(mem, pc, true);

	unlock_page_cgroup(pc);
}

/**
 * mem_cgroup_move_account - move account of the page
 * @pc:	page_cgroup of the page.
 * @from: mem_cgroup which the page is moved from.
 * @to:	mem_cgroup which the page is moved to. @from != @to.
 *
 * The caller must confirm the following:
 * - page is not on an LRU (isolate_lru_page() is useful.)
 *
 * Returns 0 on success, or -EBUSY when the lock is busy or "pc" is unstable.
 *
 * This function does "uncharge" from the old cgroup but doesn't do "charge"
 * to the new cgroup. That should be done by the caller.
 */

static int mem_cgroup_move_account(struct page_cgroup *pc,
	struct mem_cgroup *from, struct mem_cgroup *to)
{
	struct mem_cgroup_per_zone *from_mz, *to_mz;
	int nid, zid;
	int ret = -EBUSY;

	VM_BUG_ON(from == to);
	VM_BUG_ON(PageLRU(pc->page));

	nid = page_cgroup_nid(pc);
	zid = page_cgroup_zid(pc);
	from_mz = mem_cgroup_zoneinfo(from, nid, zid);
	to_mz = mem_cgroup_zoneinfo(to, nid, zid);

	if (!trylock_page_cgroup(pc))
		return ret;

	if (!PageCgroupUsed(pc))
		goto out;

	if (pc->mem_cgroup != from)
		goto out;

	css_put(&from->css);
	res_counter_uncharge(&from->res, PAGE_SIZE);
	mem_cgroup_charge_statistics(from, pc, false);
	if (do_swap_account)
		res_counter_uncharge(&from->memsw, PAGE_SIZE);
	pc->mem_cgroup = to;
	mem_cgroup_charge_statistics(to, pc, true);
	css_get(&to->css);
	ret = 0;
out:
	unlock_page_cgroup(pc);
	return ret;
}

/*
 * move charges to its parent.
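 * This is the building block of force_empty: the parent is charged first
 * via __mem_cgroup_try_charge(), then mem_cgroup_move_account() switches
 * the page_cgroup over, so usage is re-parented rather than lost.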
 */

static int mem_cgroup_move_parent(struct page_cgroup *pc,
				  struct mem_cgroup *child,
				  gfp_t gfp_mask)
{
	struct page *page = pc->page;
	struct cgroup *cg = child->css.cgroup;
	struct cgroup *pcg = cg->parent;
	struct mem_cgroup *parent;
	int ret;

	/* Is ROOT ? */
	if (!pcg)
		return -EINVAL;

	parent = mem_cgroup_from_cont(pcg);

	ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
	if (ret || !parent)
		return ret;

	if (!get_page_unless_zero(page))
		return -EBUSY;

	ret = isolate_lru_page(page);

	if (ret)
		goto cancel;

	ret = mem_cgroup_move_account(pc, child, parent);

	/* drop extra refcnt taken by try_charge() (move_account() takes one itself) */
	css_put(&parent->css);
	putback_lru_page(page);
	if (!ret) {
		put_page(page);
		return 0;
	}
	/* uncharge if move fails */
cancel:
	res_counter_uncharge(&parent->res, PAGE_SIZE);
	if (do_swap_account)
		res_counter_uncharge(&parent->memsw, PAGE_SIZE);
	put_page(page);
	return ret;
}

/*
 * Charge the memory controller for page usage.
 * Return
 * 0 if the charge was successful
 * < 0 if the cgroup is over its limit
 */
static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
				gfp_t gfp_mask, enum charge_type ctype,
				struct mem_cgroup *memcg)
{
	struct mem_cgroup *mem;
	struct page_cgroup *pc;
	int ret;

	pc = lookup_page_cgroup(page);
	/* can happen at boot */
	if (unlikely(!pc))
		return 0;
	prefetchw(pc);

	mem = memcg;
	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
	if (ret || !mem)
		return ret;

	__mem_cgroup_commit_charge(mem, pc, ctype);
	return 0;
}

int mem_cgroup_newpage_charge(struct page *page,
			      struct mm_struct *mm, gfp_t gfp_mask)
{
	if (mem_cgroup_disabled())
		return 0;
	if (PageCompound(page))
		return 0;
	/*
	 * If already mapped, we don't have to account.
	 * If page cache, page->mapping has an address_space.
	 * But page->mapping may contain a stale anon_vma pointer;
	 * detect that with the PageAnon() check. A newly-mapped
	 * anon page's page->mapping is NULL.
	 */
	if (page_mapped(page) || (page->mapping && !PageAnon(page)))
		return 0;
	if (unlikely(!mm))
		mm = &init_mm;
	return mem_cgroup_charge_common(page, mm, gfp_mask,
				MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
}

int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
				gfp_t gfp_mask)
{
	struct mem_cgroup *mem = NULL;
	int ret;

	if (mem_cgroup_disabled())
		return 0;
	if (PageCompound(page))
		return 0;
	/*
	 * Corner case handling. This is usually called from
	 * add_to_page_cache(). But some filesystems (shmem) precharge the
	 * page before calling it and then call add_to_page_cache() with
	 * GFP_NOWAIT.
	 *
	 * In the GFP_NOWAIT case, the page may be pre-charged before
	 * add_to_page_cache() is called (see shmem.c). Check for that here
	 * and avoid charging twice. (It works but costs a bit more.)
	 * And when the page is SwapCache, we should take the swap
	 * information into account. This is under lock_page() now.
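	 * (Hence the order below: bail out early when the page_cgroup is
	 * already used in the GFP_NOWAIT case, then, for SwapCache, charge
	 * the mem_cgroup recorded in swap_cgroup instead of the mm's.)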
	 */
	if (!(gfp_mask & __GFP_WAIT)) {
		struct page_cgroup *pc;

		pc = lookup_page_cgroup(page);
		if (!pc)
			return 0;
		lock_page_cgroup(pc);
		if (PageCgroupUsed(pc)) {
			unlock_page_cgroup(pc);
			return 0;
		}
		unlock_page_cgroup(pc);
	}

	if (do_swap_account && PageSwapCache(page)) {
		mem = try_get_mem_cgroup_from_swapcache(page);
		if (mem)
			mm = NULL;
		else
			mem = NULL;
		/* SwapCache may be still linked to LRU now. */
		mem_cgroup_lru_del_before_commit_swapcache(page);
	}

	if (unlikely(!mm && !mem))
		mm = &init_mm;

	if (page_is_file_cache(page))
		return mem_cgroup_charge_common(page, mm, gfp_mask,
				MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);

	ret = mem_cgroup_charge_common(page, mm, gfp_mask,
				MEM_CGROUP_CHARGE_TYPE_SHMEM, mem);
	if (mem)
		css_put(&mem->css);
	if (PageSwapCache(page))
		mem_cgroup_lru_add_after_commit_swapcache(page);

	if (do_swap_account && !ret && PageSwapCache(page)) {
		swp_entry_t ent = {.val = page_private(page)};
		/* avoid double counting */
		mem = swap_cgroup_record(ent, NULL);
		if (mem) {
			res_counter_uncharge(&mem->memsw, PAGE_SIZE);
			mem_cgroup_put(mem);
		}
	}
	return ret;
}

/*
 * During swap-in (try_charge -> commit or cancel), the page is locked.
 * When try_charge() returns successfully, one refcnt to the memcg, not tied
 * to any struct page_cgroup, is acquired. This refcnt is consumed by
 * "commit()" or dropped by "cancel()".
 */
int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
				 struct page *page,
				 gfp_t mask, struct mem_cgroup **ptr)
{
	struct mem_cgroup *mem;
	int ret;

	if (mem_cgroup_disabled())
		return 0;

	if (!do_swap_account)
		goto charge_cur_mm;
	/*
	 * A racing thread's fault, or swapoff, may have already updated
	 * the pte, and even removed the page from the swap cache: return
	 * success to go on to do_swap_page()'s pte_same() test, which
	 * should fail.
	 */
	if (!PageSwapCache(page))
		return 0;
	mem = try_get_mem_cgroup_from_swapcache(page);
	if (!mem)
		goto charge_cur_mm;
	*ptr = mem;
	ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
	/* drop extra refcnt from tryget */
	css_put(&mem->css);
	return ret;
charge_cur_mm:
	if (unlikely(!mm))
		mm = &init_mm;
	return __mem_cgroup_try_charge(mm, mask, ptr, true);
}

void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
{
	struct page_cgroup *pc;

	if (mem_cgroup_disabled())
		return;
	if (!ptr)
		return;
	pc = lookup_page_cgroup(page);
	mem_cgroup_lru_del_before_commit_swapcache(page);
	__mem_cgroup_commit_charge(ptr, pc, MEM_CGROUP_CHARGE_TYPE_MAPPED);
	mem_cgroup_lru_add_after_commit_swapcache(page);
	/*
	 * Now the swap is in memory. This means this page may be counted
	 * both as mem and swap: a double count. Fix it by uncharging from
	 * memsw. Basically, this SwapCache is stable under lock_page().
	 * But in do_swap_page() (memory.c), reuse_swap_page() may call
	 * delete_from_swap_cache() before we reach here.
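	 * (Hence the PageSwapCache(page) check below: if the page has
	 * already left the swap cache, there is no double count to fix
	 * here.)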
	 */
	if (do_swap_account && PageSwapCache(page)) {
		swp_entry_t ent = {.val = page_private(page)};
		struct mem_cgroup *memcg;
		memcg = swap_cgroup_record(ent, NULL);
		if (memcg) {
			res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
			mem_cgroup_put(memcg);
		}

	}
	/* add this page (page_cgroup) to the LRU we want. */

}

void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
{
	if (mem_cgroup_disabled())
		return;
	if (!mem)
		return;
	res_counter_uncharge(&mem->res, PAGE_SIZE);
	if (do_swap_account)
		res_counter_uncharge(&mem->memsw, PAGE_SIZE);
	css_put(&mem->css);
}


/*
 * uncharge if !page_mapped(page)
 */
static struct mem_cgroup *
__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
{
	struct page_cgroup *pc;
	struct mem_cgroup *mem = NULL;
	struct mem_cgroup_per_zone *mz;

	if (mem_cgroup_disabled())
		return NULL;

	if (PageSwapCache(page))
		return NULL;

	/*
	 * Check if our page_cgroup is valid
	 */
	pc = lookup_page_cgroup(page);
	if (unlikely(!pc || !PageCgroupUsed(pc)))
		return NULL;

	lock_page_cgroup(pc);

	mem = pc->mem_cgroup;

	if (!PageCgroupUsed(pc))
		goto unlock_out;

	switch (ctype) {
	case MEM_CGROUP_CHARGE_TYPE_MAPPED:
		if (page_mapped(page))
			goto unlock_out;
		break;
	case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
		if (!PageAnon(page)) {	/* Shared memory */
			if (page->mapping && !page_is_file_cache(page))
				goto unlock_out;
		} else if (page_mapped(page)) /* Anon */
			goto unlock_out;
		break;
	default:
		break;
	}

	res_counter_uncharge(&mem->res, PAGE_SIZE);
	if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
		res_counter_uncharge(&mem->memsw, PAGE_SIZE);

	mem_cgroup_charge_statistics(mem, pc, false);
	ClearPageCgroupUsed(pc);
	/*
	 * pc->mem_cgroup is not cleared here. It will be accessed when the
	 * page is freed from the LRU. This is safe because an uncharged
	 * page is expected not to be reused (it is freed soon). The
	 * exception is SwapCache, which is handled by special functions.
	 */

	mz = page_cgroup_zoneinfo(pc);
	unlock_page_cgroup(pc);

	/* at swapout, this memcg will be accessed to record to swap */
	if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
		css_put(&mem->css);

	return mem;

unlock_out:
	unlock_page_cgroup(pc);
	return NULL;
}

void mem_cgroup_uncharge_page(struct page *page)
{
	/* early check. */
	if (page_mapped(page))
		return;
	if (page->mapping && !PageAnon(page))
		return;
	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
}

void mem_cgroup_uncharge_cache_page(struct page *page)
{
	VM_BUG_ON(page_mapped(page));
	VM_BUG_ON(page->mapping);
	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
}

/*
 * Called from __delete_from_swap_cache(); drops the "page" account.
 * The memcg information is recorded in the swap_cgroup of "ent".
 */
void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent)
{
	struct mem_cgroup *memcg;

	memcg = __mem_cgroup_uncharge_common(page,
					MEM_CGROUP_CHARGE_TYPE_SWAPOUT);
	/* record memcg information */
	if (do_swap_account && memcg) {
		swap_cgroup_record(ent, memcg);
		mem_cgroup_get(memcg);
	}
	if (memcg)
		css_put(&memcg->css);
}

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
/*
 * Called from swap_entry_free(). Removes the record in swap_cgroup and
 * uncharges the "memsw" account.
 */
void mem_cgroup_uncharge_swap(swp_entry_t ent)
{
	struct mem_cgroup *memcg;

	if (!do_swap_account)
		return;

	memcg = swap_cgroup_record(ent, NULL);
	if (memcg) {
		res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
		mem_cgroup_put(memcg);
	}
}
#endif

/*
 * Before starting migration, account PAGE_SIZE to the mem_cgroup that the
 * old page belongs to.
 */
int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
{
	struct page_cgroup *pc;
	struct mem_cgroup *mem = NULL;
	int ret = 0;

	if (mem_cgroup_disabled())
		return 0;

	pc = lookup_page_cgroup(page);
	lock_page_cgroup(pc);
	if (PageCgroupUsed(pc)) {
		mem = pc->mem_cgroup;
		css_get(&mem->css);
	}
	unlock_page_cgroup(pc);

	if (mem) {
		ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);
		css_put(&mem->css);
	}
	*ptr = mem;
	return ret;
}

/* remove the redundant charge if migration failed */
void mem_cgroup_end_migration(struct mem_cgroup *mem,
		struct page *oldpage, struct page *newpage)
{
	struct page *target, *unused;
	struct page_cgroup *pc;
	enum charge_type ctype;

	if (!mem)
		return;

	/* at migration success, oldpage->mapping is NULL. */
	if (oldpage->mapping) {
		target = oldpage;
		unused = NULL;
	} else {
		target = newpage;
		unused = oldpage;
	}

	if (PageAnon(target))
		ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
	else if (page_is_file_cache(target))
		ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
	else
		ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;

	/* unused page is not on radix-tree now. */
	if (unused)
		__mem_cgroup_uncharge_common(unused, ctype);

	pc = lookup_page_cgroup(target);
	/*
	 * __mem_cgroup_commit_charge() checks the PCG_USED bit of the
	 * page_cgroup. So, double-counting is effectively avoided.
	 */
	__mem_cgroup_commit_charge(mem, pc, ctype);

	/*
	 * Both oldpage and newpage are still under lock_page(), so we don't
	 * have to worry about races in the radix-tree. But we have to be
	 * careful about whether this page is mapped or not.
	 *
	 * There is a !page_mapped() case: at the start of migration, oldpage
	 * was mapped, but now it's zapped. We know the *target* page is not
	 * freed/reused under us. mem_cgroup_uncharge_page() does all the
	 * necessary checks.
	 */
	if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
		mem_cgroup_uncharge_page(target);
}

/*
 * A call to try to shrink memory usage under the specified resource
 * controller. This is typically used to reclaim pages for shmem, reducing
 * the side effects of page allocation from shmem, which is used by some
 * mem_cgroup.
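 * The reclaim target is taken from @page's swap_cgroup record when
 * possible, falling back to @mm's owner; reclaim is retried up to
 * MEM_CGROUP_RECLAIM_RETRIES times before -ENOMEM is returned.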
 */
int mem_cgroup_shrink_usage(struct page *page,
			    struct mm_struct *mm,
			    gfp_t gfp_mask)
{
	struct mem_cgroup *mem = NULL;
	int progress = 0;
	int retry = MEM_CGROUP_RECLAIM_RETRIES;

	if (mem_cgroup_disabled())
		return 0;
	if (page)
		mem = try_get_mem_cgroup_from_swapcache(page);
	if (!mem && mm)
		mem = try_get_mem_cgroup_from_mm(mm);
	if (unlikely(!mem))
		return 0;

	do {
		progress = mem_cgroup_hierarchical_reclaim(mem, gfp_mask, true);
		progress += mem_cgroup_check_under_limit(mem);
	} while (!progress && --retry);

	css_put(&mem->css);
	if (!retry)
		return -ENOMEM;
	return 0;
}

static DEFINE_MUTEX(set_limit_mutex);

static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
				   unsigned long long val)
{
	int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
	int progress;
	u64 memswlimit;
	int ret = 0;

	while (retry_count) {
		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}
		/*
		 * Rather than hiding all of this in some function, do it in
		 * an open-coded manner so you can see what really happens.
		 * We have to guarantee mem->res.limit <= mem->memsw.limit.
		 */
		mutex_lock(&set_limit_mutex);
		memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
		if (memswlimit < val) {
			ret = -EINVAL;
			mutex_unlock(&set_limit_mutex);
			break;
		}
		ret = res_counter_set_limit(&memcg->res, val);
		mutex_unlock(&set_limit_mutex);

		if (!ret)
			break;

		progress = mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL,
							   false);
		if (!progress)
			retry_count--;
	}

	return ret;
}

int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
				unsigned long long val)
{
	int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
	u64 memlimit, oldusage, curusage;
	int ret;

	if (!do_swap_account)
		return -EINVAL;

	while (retry_count) {
		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}
		/*
		 * Rather than hiding all of this in some function, do it in
		 * an open-coded manner so you can see what really happens.
		 * We have to guarantee mem->res.limit <= mem->memsw.limit.
		 */
		mutex_lock(&set_limit_mutex);
		memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
		if (memlimit > val) {
			ret = -EINVAL;
			mutex_unlock(&set_limit_mutex);
			break;
		}
		ret = res_counter_set_limit(&memcg->memsw, val);
		mutex_unlock(&set_limit_mutex);

		if (!ret)
			break;

		oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
		mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, true);
		curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
		if (curusage >= oldusage)
			retry_count--;
	}
	return ret;
}

/*
 * This routine traverses the page_cgroups in the given list and drops them
 * all. *And* it doesn't reclaim the pages themselves; it just removes the
 * page_cgroups.
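 * Each page_cgroup is re-charged to the parent via mem_cgroup_move_parent(),
 * so "drop" means drop from *this* cgroup's accounting, not freeing the page.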
 */
static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
				int node, int zid, enum lru_list lru)
{
	struct zone *zone;
	struct mem_cgroup_per_zone *mz;
	struct page_cgroup *pc, *busy;
	unsigned long flags, loop;
	struct list_head *list;
	int ret = 0;

	zone = &NODE_DATA(node)->node_zones[zid];
	mz = mem_cgroup_zoneinfo(mem, node, zid);
	list = &mz->lists[lru];

	loop = MEM_CGROUP_ZSTAT(mz, lru);
	/* give some margin against -EBUSY etc. */
	loop += 256;
	busy = NULL;
	while (loop--) {
		ret = 0;
		spin_lock_irqsave(&zone->lru_lock, flags);
		if (list_empty(list)) {
			spin_unlock_irqrestore(&zone->lru_lock, flags);
			break;
		}
		pc = list_entry(list->prev, struct page_cgroup, lru);
		if (busy == pc) {
			list_move(&pc->lru, list);
			busy = NULL;
			spin_unlock_irqrestore(&zone->lru_lock, flags);
			continue;
		}
		spin_unlock_irqrestore(&zone->lru_lock, flags);

		ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL);
		if (ret == -ENOMEM)
			break;

		if (ret == -EBUSY || ret == -EINVAL) {
			/* found lock contention or "pc" is obsolete. */
			busy = pc;
			cond_resched();
		} else
			busy = NULL;
	}

	if (!ret && !list_empty(list))
		return -EBUSY;
	return ret;
}

/*
 * Make the mem_cgroup's charge 0 if there is no task.
 * This enables deleting this mem_cgroup.
 */
static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
{
	int ret;
	int node, zid, shrink;
	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
	struct cgroup *cgrp = mem->css.cgroup;

	css_get(&mem->css);

	shrink = 0;
	/* should free all ? */
	if (free_all)
		goto try_to_free;
move_account:
	while (mem->res.usage > 0) {
		ret = -EBUSY;
		if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
			goto out;
		ret = -EINTR;
		if (signal_pending(current))
			goto out;
		/* This is for making all *used* pages be on an LRU. */
		lru_add_drain_all();
		ret = 0;
		for_each_node_state(node, N_POSSIBLE) {
			for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
				enum lru_list l;
				for_each_lru(l) {
					ret = mem_cgroup_force_empty_list(mem,
							node, zid, l);
					if (ret)
						break;
				}
			}
			if (ret)
				break;
		}
		/* it seems the parent cgroup doesn't have enough memory */
		if (ret == -ENOMEM)
			goto try_to_free;
		cond_resched();
	}
	ret = 0;
out:
	css_put(&mem->css);
	return ret;

try_to_free:
	/* returns -EBUSY if there is a task or if we come here twice. */
	if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
		ret = -EBUSY;
		goto out;
	}
	/* we call try-to-free pages to make this cgroup empty */
	lru_add_drain_all();
	/* try to free all pages in this cgroup */
	shrink = 1;
	while (nr_retries && mem->res.usage > 0) {
		int progress;

		if (signal_pending(current)) {
			ret = -EINTR;
			goto out;
		}
		progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
						false, get_swappiness(mem));
		if (!progress) {
			nr_retries--;
			/* maybe some writeback is necessary */
			congestion_wait(WRITE, HZ/10);
		}

	}
	lru_add_drain();
	/*
	 * try move_account... there may be some *locked* pages.
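	 * Pages that are locked can't be freed by the reclaim above, but
	 * the move_account pass can still re-charge them to the parent, so
	 * loop back until usage actually reaches zero.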
	 */
	if (mem->res.usage)
		goto move_account;
	ret = 0;
	goto out;
}

int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
{
	return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
}


static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)
{
	return mem_cgroup_from_cont(cont)->use_hierarchy;
}

static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
					u64 val)
{
	int retval = 0;
	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
	struct cgroup *parent = cont->parent;
	struct mem_cgroup *parent_mem = NULL;

	if (parent)
		parent_mem = mem_cgroup_from_cont(parent);

	cgroup_lock();
	/*
	 * If the parent's use_hierarchy is set, we can't make any
	 * modifications in the child subtrees. If it is unset, then the
	 * change can occur, provided the current cgroup has no children.
	 *
	 * For the root cgroup, parent_mem is NULL; we allow the value to be
	 * set if there are no children.
	 */
	if ((!parent_mem || !parent_mem->use_hierarchy) &&
				(val == 1 || val == 0)) {
		if (list_empty(&cont->children))
			mem->use_hierarchy = val;
		else
			retval = -EBUSY;
	} else
		retval = -EINVAL;
	cgroup_unlock();

	return retval;
}

static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
{
	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
	u64 val = 0;
	int type, name;

	type = MEMFILE_TYPE(cft->private);
	name = MEMFILE_ATTR(cft->private);
	switch (type) {
	case _MEM:
		val = res_counter_read_u64(&mem->res, name);
		break;
	case _MEMSWAP:
		if (do_swap_account)
			val = res_counter_read_u64(&mem->memsw, name);
		break;
	default:
		BUG();
		break;
	}
	return val;
}
/*
 * The user of this function is...
 * RES_LIMIT.
 */
static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
			    const char *buffer)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
	int type, name;
	unsigned long long val;
	int ret;

	type = MEMFILE_TYPE(cft->private);
	name = MEMFILE_ATTR(cft->private);
	switch (name) {
	case RES_LIMIT:
		/* This function does all the necessary parsing... reuse it */
		ret = res_counter_memparse_write_strategy(buffer, &val);
		if (ret)
			break;
		if (type == _MEM)
			ret = mem_cgroup_resize_limit(memcg, val);
		else
			ret = mem_cgroup_resize_memsw_limit(memcg, val);
		break;
	default:
		ret = -EINVAL; /* should be BUG() ? */
		break;
	}
	return ret;
}

static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
		unsigned long long *mem_limit, unsigned long long *memsw_limit)
{
	struct cgroup *cgroup;
	unsigned long long min_limit, min_memsw_limit, tmp;

	min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
	min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
	cgroup = memcg->css.cgroup;
	if (!memcg->use_hierarchy)
		goto out;

	while (cgroup->parent) {
		cgroup = cgroup->parent;
		memcg = mem_cgroup_from_cont(cgroup);
		if (!memcg->use_hierarchy)
			break;
		tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
		min_limit = min(min_limit, tmp);
		tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
		min_memsw_limit = min(min_memsw_limit, tmp);
	}
out:
	*mem_limit = min_limit;
	*memsw_limit = min_memsw_limit;
	return;
}

static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
{
	struct mem_cgroup *mem;
	int type, name;

	mem = mem_cgroup_from_cont(cont);
	type = MEMFILE_TYPE(event);
	name = MEMFILE_ATTR(event);
	switch (name) {
	case RES_MAX_USAGE:
		if (type == _MEM)
			res_counter_reset_max(&mem->res);
		else
			res_counter_reset_max(&mem->memsw);
		break;
	case RES_FAILCNT:
		if (type == _MEM)
			res_counter_reset_failcnt(&mem->res);
		else
			res_counter_reset_failcnt(&mem->memsw);
		break;
	}
	return 0;
}

static const struct mem_cgroup_stat_desc {
	const char *msg;
	u64 unit;
} mem_cgroup_stat_desc[] = {
	[MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, },
	[MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, },
	[MEM_CGROUP_STAT_PGPGIN_COUNT] = {"pgpgin", 1, },
	[MEM_CGROUP_STAT_PGPGOUT_COUNT] = {"pgpgout", 1, },
};

static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
				 struct cgroup_map_cb *cb)
{
	struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
	struct mem_cgroup_stat *stat = &mem_cont->stat;
	int i;

	for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) {
		s64 val;

		val = mem_cgroup_read_stat(stat, i);
		val *= mem_cgroup_stat_desc[i].unit;
		cb->fill(cb, mem_cgroup_stat_desc[i].msg, val);
	}
	/* showing # of active pages */
	{
		unsigned long active_anon, inactive_anon;
		unsigned long active_file, inactive_file;
		unsigned long unevictable;

		inactive_anon = mem_cgroup_get_all_zonestat(mem_cont,
						LRU_INACTIVE_ANON);
		active_anon = mem_cgroup_get_all_zonestat(mem_cont,
						LRU_ACTIVE_ANON);
		inactive_file = mem_cgroup_get_all_zonestat(mem_cont,
						LRU_INACTIVE_FILE);
		active_file = mem_cgroup_get_all_zonestat(mem_cont,
						LRU_ACTIVE_FILE);
		unevictable = mem_cgroup_get_all_zonestat(mem_cont,
							LRU_UNEVICTABLE);

		cb->fill(cb, "active_anon", (active_anon) * PAGE_SIZE);
		cb->fill(cb, "inactive_anon", (inactive_anon) * PAGE_SIZE);
		cb->fill(cb, "active_file", (active_file) * PAGE_SIZE);
		cb->fill(cb, "inactive_file", (inactive_file) * PAGE_SIZE);
		cb->fill(cb, "unevictable", unevictable * PAGE_SIZE);

	}
	{
		unsigned long long limit, memsw_limit;
		memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit);
		cb->fill(cb, "hierarchical_memory_limit", limit);
		if (do_swap_account)
			cb->fill(cb, "hierarchical_memsw_limit", memsw_limit);
	}

#ifdef CONFIG_DEBUG_VM
	cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));

	{
		int nid, zid;
		struct mem_cgroup_per_zone *mz;
		unsigned long recent_rotated[2] = {0, 0};
		unsigned long recent_scanned[2] = {0, 0};

		for_each_online_node(nid)
			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
				mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);

				recent_rotated[0] +=
					mz->reclaim_stat.recent_rotated[0];
				recent_rotated[1] +=
					mz->reclaim_stat.recent_rotated[1];
				recent_scanned[0] +=
					mz->reclaim_stat.recent_scanned[0];
				recent_scanned[1] +=
					mz->reclaim_stat.recent_scanned[1];
			}
		cb->fill(cb, "recent_rotated_anon", recent_rotated[0]);
		cb->fill(cb, "recent_rotated_file", recent_rotated[1]);
		cb->fill(cb, "recent_scanned_anon", recent_scanned[0]);
		cb->fill(cb, "recent_scanned_file", recent_scanned[1]);
	}
#endif

	return 0;
}

static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);

	return get_swappiness(memcg);
}

static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
				       u64 val)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
	struct mem_cgroup *parent;
	if (val > 100)
		return -EINVAL;

	if (cgrp->parent == NULL)
		return -EINVAL;

	parent = mem_cgroup_from_cont(cgrp->parent);
	/* If under hierarchy, only the empty root can set this value */
	if ((parent->use_hierarchy) ||
	    (memcg->use_hierarchy && !list_empty(&cgrp->children)))
		return -EINVAL;

	spin_lock(&memcg->reclaim_param_lock);
	memcg->swappiness = val;
	spin_unlock(&memcg->reclaim_param_lock);

	return 0;
}


static struct cftype mem_cgroup_files[] = {
	{
		.name = "usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "max_usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
		.trigger = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "limit_in_bytes",
		.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
		.write_string = mem_cgroup_write,
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "failcnt",
		.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
		.trigger = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "stat",
		.read_map = mem_control_stat_show,
	},
	{
		.name = "force_empty",
		.trigger = mem_cgroup_force_empty_write,
	},
	{
		.name = "use_hierarchy",
		.write_u64 = mem_cgroup_hierarchy_write,
		.read_u64 = mem_cgroup_hierarchy_read,
	},
	{
		.name = "swappiness",
		.read_u64 = mem_cgroup_swappiness_read,
		.write_u64 = mem_cgroup_swappiness_write,
	},
};

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
static struct cftype memsw_cgroup_files[] = {
	{
		.name = "memsw.usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "memsw.max_usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
		.trigger = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "memsw.limit_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
		.write_string = mem_cgroup_write,
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
static struct cftype memsw_cgroup_files[] = {
	{
		.name = "memsw.usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "memsw.max_usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
		.trigger = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "memsw.limit_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
		.write_string = mem_cgroup_write,
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "memsw.failcnt",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
		.trigger = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read,
	},
};

static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
{
	if (!do_swap_account)
		return 0;
	return cgroup_add_files(cont, ss, memsw_cgroup_files,
				ARRAY_SIZE(memsw_cgroup_files));
}
#else
static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
{
	return 0;
}
#endif

static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
{
	struct mem_cgroup_per_node *pn;
	struct mem_cgroup_per_zone *mz;
	enum lru_list l;
	int zone, tmp = node;
	/*
	 * This routine is called for each possible node, but it is a bug
	 * to call kmalloc() against an offline node.
	 *
	 * TODO: this routine can waste much memory for nodes which will
	 * never be onlined.  It would be better to use a memory-hotplug
	 * callback function instead.
	 */
	if (!node_state(node, N_NORMAL_MEMORY))
		tmp = -1;
	pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
	if (!pn)
		return 1;

	mem->info.nodeinfo[node] = pn;
	memset(pn, 0, sizeof(*pn));

	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
		mz = &pn->zoneinfo[zone];
		for_each_lru(l)
			INIT_LIST_HEAD(&mz->lists[l]);
	}
	return 0;
}

static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
{
	kfree(mem->info.nodeinfo[node]);
}

static int mem_cgroup_size(void)
{
	int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu);

	return sizeof(struct mem_cgroup) + cpustat_size;
}

static struct mem_cgroup *mem_cgroup_alloc(void)
{
	struct mem_cgroup *mem;
	int size = mem_cgroup_size();

	if (size < PAGE_SIZE)
		mem = kmalloc(size, GFP_KERNEL);
	else
		mem = vmalloc(size);

	if (mem)
		memset(mem, 0, size);
	return mem;
}

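/*
 * A back-of-the-envelope example of why mem_cgroup_alloc() picks between
 * kmalloc() and vmalloc() (the numbers are illustrative and depend on the
 * configuration): struct mem_cgroup_stat_cpu is cacheline-aligned, so each
 * possible CPU contributes at least one cacheline to mem_cgroup_size().
 * With 128-byte cachelines and nr_cpu_ids == 64, the per-cpu stats alone
 * take 64 * 128 = 8192 bytes, already more than a 4KB PAGE_SIZE, so a
 * large SMP box lands in vmalloc() while a small system stays on the
 * cheaper kmalloc().
 */
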
2160 */ 2161 2162static void __mem_cgroup_free(struct mem_cgroup *mem) 2163{ 2164 int node; 2165 2166 for_each_node_state(node, N_POSSIBLE) 2167 free_mem_cgroup_per_zone_info(mem, node); 2168 2169 if (mem_cgroup_size() < PAGE_SIZE) 2170 kfree(mem); 2171 else 2172 vfree(mem); 2173} 2174 2175static void mem_cgroup_get(struct mem_cgroup *mem) 2176{ 2177 atomic_inc(&mem->refcnt); 2178} 2179 2180static void mem_cgroup_put(struct mem_cgroup *mem) 2181{ 2182 if (atomic_dec_and_test(&mem->refcnt)) 2183 __mem_cgroup_free(mem); 2184} 2185 2186 2187#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 2188static void __init enable_swap_cgroup(void) 2189{ 2190 if (!mem_cgroup_disabled() && really_do_swap_account) 2191 do_swap_account = 1; 2192} 2193#else 2194static void __init enable_swap_cgroup(void) 2195{ 2196} 2197#endif 2198 2199static struct cgroup_subsys_state * 2200mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) 2201{ 2202 struct mem_cgroup *mem, *parent; 2203 int node; 2204 2205 mem = mem_cgroup_alloc(); 2206 if (!mem) 2207 return ERR_PTR(-ENOMEM); 2208 2209 for_each_node_state(node, N_POSSIBLE) 2210 if (alloc_mem_cgroup_per_zone_info(mem, node)) 2211 goto free_out; 2212 /* root ? */ 2213 if (cont->parent == NULL) { 2214 enable_swap_cgroup(); 2215 parent = NULL; 2216 } else { 2217 parent = mem_cgroup_from_cont(cont->parent); 2218 mem->use_hierarchy = parent->use_hierarchy; 2219 } 2220 2221 if (parent && parent->use_hierarchy) { 2222 res_counter_init(&mem->res, &parent->res); 2223 res_counter_init(&mem->memsw, &parent->memsw); 2224 } else { 2225 res_counter_init(&mem->res, NULL); 2226 res_counter_init(&mem->memsw, NULL); 2227 } 2228 mem->last_scanned_child = NULL; 2229 spin_lock_init(&mem->reclaim_param_lock); 2230 2231 if (parent) 2232 mem->swappiness = get_swappiness(parent); 2233 atomic_set(&mem->refcnt, 1); 2234 return &mem->css; 2235free_out: 2236 __mem_cgroup_free(mem); 2237 return ERR_PTR(-ENOMEM); 2238} 2239 2240static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss, 2241 struct cgroup *cont) 2242{ 2243 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 2244 mem_cgroup_force_empty(mem, false); 2245} 2246 2247static void mem_cgroup_destroy(struct cgroup_subsys *ss, 2248 struct cgroup *cont) 2249{ 2250 mem_cgroup_put(mem_cgroup_from_cont(cont)); 2251} 2252 2253static int mem_cgroup_populate(struct cgroup_subsys *ss, 2254 struct cgroup *cont) 2255{ 2256 int ret; 2257 2258 ret = cgroup_add_files(cont, ss, mem_cgroup_files, 2259 ARRAY_SIZE(mem_cgroup_files)); 2260 2261 if (!ret) 2262 ret = register_memsw_files(cont, ss); 2263 return ret; 2264} 2265 2266static void mem_cgroup_move_task(struct cgroup_subsys *ss, 2267 struct cgroup *cont, 2268 struct cgroup *old_cont, 2269 struct task_struct *p) 2270{ 2271 mutex_lock(&memcg_tasklist); 2272 /* 2273 * FIXME: It's better to move charges of this process from old 2274 * memcg to new memcg. But it's just on TODO-List now. 2275 */ 2276 mutex_unlock(&memcg_tasklist); 2277} 2278 2279struct cgroup_subsys mem_cgroup_subsys = { 2280 .name = "memory", 2281 .subsys_id = mem_cgroup_subsys_id, 2282 .create = mem_cgroup_create, 2283 .pre_destroy = mem_cgroup_pre_destroy, 2284 .destroy = mem_cgroup_destroy, 2285 .populate = mem_cgroup_populate, 2286 .attach = mem_cgroup_move_task, 2287 .early_init = 0, 2288}; 2289 2290#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 2291 2292static int __init disable_swap_account(char *s) 2293{ 2294 really_do_swap_account = 0; 2295 return 1; 2296} 2297__setup("noswapaccount", disable_swap_account); 2298#endif 2299