memcontrol.c revision 7f4d454dee2e0bdd21bafd413d1c53e443a26540
1/* memcontrol.c - Memory Controller 2 * 3 * Copyright IBM Corporation, 2007 4 * Author Balbir Singh <balbir@linux.vnet.ibm.com> 5 * 6 * Copyright 2007 OpenVZ SWsoft Inc 7 * Author: Pavel Emelianov <xemul@openvz.org> 8 * 9 * This program is free software; you can redistribute it and/or modify 10 * it under the terms of the GNU General Public License as published by 11 * the Free Software Foundation; either version 2 of the License, or 12 * (at your option) any later version. 13 * 14 * This program is distributed in the hope that it will be useful, 15 * but WITHOUT ANY WARRANTY; without even the implied warranty of 16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 * GNU General Public License for more details. 18 */ 19 20#include <linux/res_counter.h> 21#include <linux/memcontrol.h> 22#include <linux/cgroup.h> 23#include <linux/mm.h> 24#include <linux/pagemap.h> 25#include <linux/smp.h> 26#include <linux/page-flags.h> 27#include <linux/backing-dev.h> 28#include <linux/bit_spinlock.h> 29#include <linux/rcupdate.h> 30#include <linux/mutex.h> 31#include <linux/slab.h> 32#include <linux/swap.h> 33#include <linux/spinlock.h> 34#include <linux/fs.h> 35#include <linux/seq_file.h> 36#include <linux/vmalloc.h> 37#include <linux/mm_inline.h> 38#include <linux/page_cgroup.h> 39#include "internal.h" 40 41#include <asm/uaccess.h> 42 43struct cgroup_subsys mem_cgroup_subsys __read_mostly; 44#define MEM_CGROUP_RECLAIM_RETRIES 5 45 46#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 47/* Turned on only when memory cgroup is enabled && really_do_swap_account = 0 */ 48int do_swap_account __read_mostly; 49static int really_do_swap_account __initdata = 1; /* for remember boot option*/ 50#else 51#define do_swap_account (0) 52#endif 53 54static DEFINE_MUTEX(memcg_tasklist); /* can be hold under cgroup_mutex */ 55 56/* 57 * Statistics for memory cgroup. 58 */ 59enum mem_cgroup_stat_index { 60 /* 61 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. 62 */ 63 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ 64 MEM_CGROUP_STAT_RSS, /* # of pages charged as rss */ 65 MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ 66 MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ 67 68 MEM_CGROUP_STAT_NSTATS, 69}; 70 71struct mem_cgroup_stat_cpu { 72 s64 count[MEM_CGROUP_STAT_NSTATS]; 73} ____cacheline_aligned_in_smp; 74 75struct mem_cgroup_stat { 76 struct mem_cgroup_stat_cpu cpustat[0]; 77}; 78 79/* 80 * For accounting under irq disable, no need for increment preempt count. 81 */ 82static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat, 83 enum mem_cgroup_stat_index idx, int val) 84{ 85 stat->count[idx] += val; 86} 87 88static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat, 89 enum mem_cgroup_stat_index idx) 90{ 91 int cpu; 92 s64 ret = 0; 93 for_each_possible_cpu(cpu) 94 ret += stat->cpustat[cpu].count[idx]; 95 return ret; 96} 97 98/* 99 * per-zone information in memory controller. 100 */ 101struct mem_cgroup_per_zone { 102 /* 103 * spin_lock to protect the per cgroup LRU 104 */ 105 struct list_head lists[NR_LRU_LISTS]; 106 unsigned long count[NR_LRU_LISTS]; 107 108 struct zone_reclaim_stat reclaim_stat; 109}; 110/* Macro for accessing counter */ 111#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) 112 113struct mem_cgroup_per_node { 114 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; 115}; 116 117struct mem_cgroup_lru_info { 118 struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES]; 119}; 120 121/* 122 * The memory controller data structure. The memory controller controls both 123 * page cache and RSS per cgroup. We would eventually like to provide 124 * statistics based on the statistics developed by Rik Van Riel for clock-pro, 125 * to help the administrator determine what knobs to tune. 126 * 127 * TODO: Add a water mark for the memory controller. Reclaim will begin when 128 * we hit the water mark. May be even add a low water mark, such that 129 * no reclaim occurs from a cgroup at it's low water mark, this is 130 * a feature that will be implemented much later in the future. 131 */ 132struct mem_cgroup { 133 struct cgroup_subsys_state css; 134 /* 135 * the counter to account for memory usage 136 */ 137 struct res_counter res; 138 /* 139 * the counter to account for mem+swap usage. 140 */ 141 struct res_counter memsw; 142 /* 143 * Per cgroup active and inactive list, similar to the 144 * per zone LRU lists. 145 */ 146 struct mem_cgroup_lru_info info; 147 148 /* 149 protect against reclaim related member. 150 */ 151 spinlock_t reclaim_param_lock; 152 153 int prev_priority; /* for recording reclaim priority */ 154 155 /* 156 * While reclaiming in a hiearchy, we cache the last child we 157 * reclaimed from. Protected by cgroup_lock() 158 */ 159 struct mem_cgroup *last_scanned_child; 160 /* 161 * Should the accounting and control be hierarchical, per subtree? 162 */ 163 bool use_hierarchy; 164 unsigned long last_oom_jiffies; 165 int obsolete; 166 atomic_t refcnt; 167 168 unsigned int swappiness; 169 170 /* 171 * statistics. This must be placed at the end of memcg. 172 */ 173 struct mem_cgroup_stat stat; 174}; 175 176enum charge_type { 177 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 178 MEM_CGROUP_CHARGE_TYPE_MAPPED, 179 MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */ 180 MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ 181 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ 182 NR_CHARGE_TYPE, 183}; 184 185/* only for here (for easy reading.) */ 186#define PCGF_CACHE (1UL << PCG_CACHE) 187#define PCGF_USED (1UL << PCG_USED) 188#define PCGF_LOCK (1UL << PCG_LOCK) 189static const unsigned long 190pcg_default_flags[NR_CHARGE_TYPE] = { 191 PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */ 192 PCGF_USED | PCGF_LOCK, /* Anon */ 193 PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */ 194 0, /* FORCE */ 195}; 196 197/* for encoding cft->private value on file */ 198#define _MEM (0) 199#define _MEMSWAP (1) 200#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) 201#define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) 202#define MEMFILE_ATTR(val) ((val) & 0xffff) 203 204static void mem_cgroup_get(struct mem_cgroup *mem); 205static void mem_cgroup_put(struct mem_cgroup *mem); 206 207static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, 208 struct page_cgroup *pc, 209 bool charge) 210{ 211 int val = (charge)? 1 : -1; 212 struct mem_cgroup_stat *stat = &mem->stat; 213 struct mem_cgroup_stat_cpu *cpustat; 214 int cpu = get_cpu(); 215 216 cpustat = &stat->cpustat[cpu]; 217 if (PageCgroupCache(pc)) 218 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val); 219 else 220 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val); 221 222 if (charge) 223 __mem_cgroup_stat_add_safe(cpustat, 224 MEM_CGROUP_STAT_PGPGIN_COUNT, 1); 225 else 226 __mem_cgroup_stat_add_safe(cpustat, 227 MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); 228 put_cpu(); 229} 230 231static struct mem_cgroup_per_zone * 232mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) 233{ 234 return &mem->info.nodeinfo[nid]->zoneinfo[zid]; 235} 236 237static struct mem_cgroup_per_zone * 238page_cgroup_zoneinfo(struct page_cgroup *pc) 239{ 240 struct mem_cgroup *mem = pc->mem_cgroup; 241 int nid = page_cgroup_nid(pc); 242 int zid = page_cgroup_zid(pc); 243 244 if (!mem) 245 return NULL; 246 247 return mem_cgroup_zoneinfo(mem, nid, zid); 248} 249 250static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem, 251 enum lru_list idx) 252{ 253 int nid, zid; 254 struct mem_cgroup_per_zone *mz; 255 u64 total = 0; 256 257 for_each_online_node(nid) 258 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 259 mz = mem_cgroup_zoneinfo(mem, nid, zid); 260 total += MEM_CGROUP_ZSTAT(mz, idx); 261 } 262 return total; 263} 264 265static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) 266{ 267 return container_of(cgroup_subsys_state(cont, 268 mem_cgroup_subsys_id), struct mem_cgroup, 269 css); 270} 271 272struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) 273{ 274 /* 275 * mm_update_next_owner() may clear mm->owner to NULL 276 * if it races with swapoff, page migration, etc. 277 * So this can be called with p == NULL. 278 */ 279 if (unlikely(!p)) 280 return NULL; 281 282 return container_of(task_subsys_state(p, mem_cgroup_subsys_id), 283 struct mem_cgroup, css); 284} 285 286/* 287 * Following LRU functions are allowed to be used without PCG_LOCK. 288 * Operations are called by routine of global LRU independently from memcg. 289 * What we have to take care of here is validness of pc->mem_cgroup. 290 * 291 * Changes to pc->mem_cgroup happens when 292 * 1. charge 293 * 2. moving account 294 * In typical case, "charge" is done before add-to-lru. Exception is SwapCache. 295 * It is added to LRU before charge. 296 * If PCG_USED bit is not set, page_cgroup is not added to this private LRU. 297 * When moving account, the page is not on LRU. It's isolated. 298 */ 299 300void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) 301{ 302 struct page_cgroup *pc; 303 struct mem_cgroup *mem; 304 struct mem_cgroup_per_zone *mz; 305 306 if (mem_cgroup_disabled()) 307 return; 308 pc = lookup_page_cgroup(page); 309 /* can happen while we handle swapcache. */ 310 if (list_empty(&pc->lru)) 311 return; 312 mz = page_cgroup_zoneinfo(pc); 313 mem = pc->mem_cgroup; 314 MEM_CGROUP_ZSTAT(mz, lru) -= 1; 315 list_del_init(&pc->lru); 316 return; 317} 318 319void mem_cgroup_del_lru(struct page *page) 320{ 321 mem_cgroup_del_lru_list(page, page_lru(page)); 322} 323 324void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) 325{ 326 struct mem_cgroup_per_zone *mz; 327 struct page_cgroup *pc; 328 329 if (mem_cgroup_disabled()) 330 return; 331 332 pc = lookup_page_cgroup(page); 333 smp_rmb(); 334 /* unused page is not rotated. */ 335 if (!PageCgroupUsed(pc)) 336 return; 337 mz = page_cgroup_zoneinfo(pc); 338 list_move(&pc->lru, &mz->lists[lru]); 339} 340 341void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) 342{ 343 struct page_cgroup *pc; 344 struct mem_cgroup_per_zone *mz; 345 346 if (mem_cgroup_disabled()) 347 return; 348 pc = lookup_page_cgroup(page); 349 /* barrier to sync with "charge" */ 350 smp_rmb(); 351 if (!PageCgroupUsed(pc)) 352 return; 353 354 mz = page_cgroup_zoneinfo(pc); 355 MEM_CGROUP_ZSTAT(mz, lru) += 1; 356 list_add(&pc->lru, &mz->lists[lru]); 357} 358/* 359 * To add swapcache into LRU. Be careful to all this function. 360 * zone->lru_lock shouldn't be held and irq must not be disabled. 361 */ 362static void mem_cgroup_lru_fixup(struct page *page) 363{ 364 if (!isolate_lru_page(page)) 365 putback_lru_page(page); 366} 367 368void mem_cgroup_move_lists(struct page *page, 369 enum lru_list from, enum lru_list to) 370{ 371 if (mem_cgroup_disabled()) 372 return; 373 mem_cgroup_del_lru_list(page, from); 374 mem_cgroup_add_lru_list(page, to); 375} 376 377int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) 378{ 379 int ret; 380 381 task_lock(task); 382 ret = task->mm && mm_match_cgroup(task->mm, mem); 383 task_unlock(task); 384 return ret; 385} 386 387/* 388 * Calculate mapped_ratio under memory controller. This will be used in 389 * vmscan.c for deteremining we have to reclaim mapped pages. 390 */ 391int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem) 392{ 393 long total, rss; 394 395 /* 396 * usage is recorded in bytes. But, here, we assume the number of 397 * physical pages can be represented by "long" on any arch. 398 */ 399 total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L; 400 rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); 401 return (int)((rss * 100L) / total); 402} 403 404/* 405 * prev_priority control...this will be used in memory reclaim path. 406 */ 407int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) 408{ 409 int prev_priority; 410 411 spin_lock(&mem->reclaim_param_lock); 412 prev_priority = mem->prev_priority; 413 spin_unlock(&mem->reclaim_param_lock); 414 415 return prev_priority; 416} 417 418void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority) 419{ 420 spin_lock(&mem->reclaim_param_lock); 421 if (priority < mem->prev_priority) 422 mem->prev_priority = priority; 423 spin_unlock(&mem->reclaim_param_lock); 424} 425 426void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority) 427{ 428 spin_lock(&mem->reclaim_param_lock); 429 mem->prev_priority = priority; 430 spin_unlock(&mem->reclaim_param_lock); 431} 432 433static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages) 434{ 435 unsigned long active; 436 unsigned long inactive; 437 unsigned long gb; 438 unsigned long inactive_ratio; 439 440 inactive = mem_cgroup_get_all_zonestat(memcg, LRU_INACTIVE_ANON); 441 active = mem_cgroup_get_all_zonestat(memcg, LRU_ACTIVE_ANON); 442 443 gb = (inactive + active) >> (30 - PAGE_SHIFT); 444 if (gb) 445 inactive_ratio = int_sqrt(10 * gb); 446 else 447 inactive_ratio = 1; 448 449 if (present_pages) { 450 present_pages[0] = inactive; 451 present_pages[1] = active; 452 } 453 454 return inactive_ratio; 455} 456 457int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg) 458{ 459 unsigned long active; 460 unsigned long inactive; 461 unsigned long present_pages[2]; 462 unsigned long inactive_ratio; 463 464 inactive_ratio = calc_inactive_ratio(memcg, present_pages); 465 466 inactive = present_pages[0]; 467 active = present_pages[1]; 468 469 if (inactive * inactive_ratio < active) 470 return 1; 471 472 return 0; 473} 474 475unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, 476 struct zone *zone, 477 enum lru_list lru) 478{ 479 int nid = zone->zone_pgdat->node_id; 480 int zid = zone_idx(zone); 481 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); 482 483 return MEM_CGROUP_ZSTAT(mz, lru); 484} 485 486struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, 487 struct zone *zone) 488{ 489 int nid = zone->zone_pgdat->node_id; 490 int zid = zone_idx(zone); 491 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); 492 493 return &mz->reclaim_stat; 494} 495 496struct zone_reclaim_stat * 497mem_cgroup_get_reclaim_stat_from_page(struct page *page) 498{ 499 struct page_cgroup *pc; 500 struct mem_cgroup_per_zone *mz; 501 502 if (mem_cgroup_disabled()) 503 return NULL; 504 505 pc = lookup_page_cgroup(page); 506 mz = page_cgroup_zoneinfo(pc); 507 if (!mz) 508 return NULL; 509 510 return &mz->reclaim_stat; 511} 512 513unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, 514 struct list_head *dst, 515 unsigned long *scanned, int order, 516 int mode, struct zone *z, 517 struct mem_cgroup *mem_cont, 518 int active, int file) 519{ 520 unsigned long nr_taken = 0; 521 struct page *page; 522 unsigned long scan; 523 LIST_HEAD(pc_list); 524 struct list_head *src; 525 struct page_cgroup *pc, *tmp; 526 int nid = z->zone_pgdat->node_id; 527 int zid = zone_idx(z); 528 struct mem_cgroup_per_zone *mz; 529 int lru = LRU_FILE * !!file + !!active; 530 531 BUG_ON(!mem_cont); 532 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); 533 src = &mz->lists[lru]; 534 535 scan = 0; 536 list_for_each_entry_safe_reverse(pc, tmp, src, lru) { 537 if (scan >= nr_to_scan) 538 break; 539 540 page = pc->page; 541 if (unlikely(!PageCgroupUsed(pc))) 542 continue; 543 if (unlikely(!PageLRU(page))) 544 continue; 545 546 scan++; 547 if (__isolate_lru_page(page, mode, file) == 0) { 548 list_move(&page->lru, dst); 549 nr_taken++; 550 } 551 } 552 553 *scanned = scan; 554 return nr_taken; 555} 556 557#define mem_cgroup_from_res_counter(counter, member) \ 558 container_of(counter, struct mem_cgroup, member) 559 560/* 561 * This routine finds the DFS walk successor. This routine should be 562 * called with cgroup_mutex held 563 */ 564static struct mem_cgroup * 565mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem) 566{ 567 struct cgroup *cgroup, *curr_cgroup, *root_cgroup; 568 569 curr_cgroup = curr->css.cgroup; 570 root_cgroup = root_mem->css.cgroup; 571 572 if (!list_empty(&curr_cgroup->children)) { 573 /* 574 * Walk down to children 575 */ 576 mem_cgroup_put(curr); 577 cgroup = list_entry(curr_cgroup->children.next, 578 struct cgroup, sibling); 579 curr = mem_cgroup_from_cont(cgroup); 580 mem_cgroup_get(curr); 581 goto done; 582 } 583 584visit_parent: 585 if (curr_cgroup == root_cgroup) { 586 mem_cgroup_put(curr); 587 curr = root_mem; 588 mem_cgroup_get(curr); 589 goto done; 590 } 591 592 /* 593 * Goto next sibling 594 */ 595 if (curr_cgroup->sibling.next != &curr_cgroup->parent->children) { 596 mem_cgroup_put(curr); 597 cgroup = list_entry(curr_cgroup->sibling.next, struct cgroup, 598 sibling); 599 curr = mem_cgroup_from_cont(cgroup); 600 mem_cgroup_get(curr); 601 goto done; 602 } 603 604 /* 605 * Go up to next parent and next parent's sibling if need be 606 */ 607 curr_cgroup = curr_cgroup->parent; 608 goto visit_parent; 609 610done: 611 root_mem->last_scanned_child = curr; 612 return curr; 613} 614 615/* 616 * Visit the first child (need not be the first child as per the ordering 617 * of the cgroup list, since we track last_scanned_child) of @mem and use 618 * that to reclaim free pages from. 619 */ 620static struct mem_cgroup * 621mem_cgroup_get_first_node(struct mem_cgroup *root_mem) 622{ 623 struct cgroup *cgroup; 624 struct mem_cgroup *ret; 625 bool obsolete = (root_mem->last_scanned_child && 626 root_mem->last_scanned_child->obsolete); 627 628 /* 629 * Scan all children under the mem_cgroup mem 630 */ 631 cgroup_lock(); 632 if (list_empty(&root_mem->css.cgroup->children)) { 633 ret = root_mem; 634 goto done; 635 } 636 637 if (!root_mem->last_scanned_child || obsolete) { 638 639 if (obsolete) 640 mem_cgroup_put(root_mem->last_scanned_child); 641 642 cgroup = list_first_entry(&root_mem->css.cgroup->children, 643 struct cgroup, sibling); 644 ret = mem_cgroup_from_cont(cgroup); 645 mem_cgroup_get(ret); 646 } else 647 ret = mem_cgroup_get_next_node(root_mem->last_scanned_child, 648 root_mem); 649 650done: 651 root_mem->last_scanned_child = ret; 652 cgroup_unlock(); 653 return ret; 654} 655 656static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem) 657{ 658 if (do_swap_account) { 659 if (res_counter_check_under_limit(&mem->res) && 660 res_counter_check_under_limit(&mem->memsw)) 661 return true; 662 } else 663 if (res_counter_check_under_limit(&mem->res)) 664 return true; 665 return false; 666} 667 668static unsigned int get_swappiness(struct mem_cgroup *memcg) 669{ 670 struct cgroup *cgrp = memcg->css.cgroup; 671 unsigned int swappiness; 672 673 /* root ? */ 674 if (cgrp->parent == NULL) 675 return vm_swappiness; 676 677 spin_lock(&memcg->reclaim_param_lock); 678 swappiness = memcg->swappiness; 679 spin_unlock(&memcg->reclaim_param_lock); 680 681 return swappiness; 682} 683 684/* 685 * Dance down the hierarchy if needed to reclaim memory. We remember the 686 * last child we reclaimed from, so that we don't end up penalizing 687 * one child extensively based on its position in the children list. 688 * 689 * root_mem is the original ancestor that we've been reclaim from. 690 */ 691static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, 692 gfp_t gfp_mask, bool noswap) 693{ 694 struct mem_cgroup *next_mem; 695 int ret = 0; 696 697 /* 698 * Reclaim unconditionally and don't check for return value. 699 * We need to reclaim in the current group and down the tree. 700 * One might think about checking for children before reclaiming, 701 * but there might be left over accounting, even after children 702 * have left. 703 */ 704 ret = try_to_free_mem_cgroup_pages(root_mem, gfp_mask, noswap, 705 get_swappiness(root_mem)); 706 if (mem_cgroup_check_under_limit(root_mem)) 707 return 0; 708 if (!root_mem->use_hierarchy) 709 return ret; 710 711 next_mem = mem_cgroup_get_first_node(root_mem); 712 713 while (next_mem != root_mem) { 714 if (next_mem->obsolete) { 715 mem_cgroup_put(next_mem); 716 cgroup_lock(); 717 next_mem = mem_cgroup_get_first_node(root_mem); 718 cgroup_unlock(); 719 continue; 720 } 721 ret = try_to_free_mem_cgroup_pages(next_mem, gfp_mask, noswap, 722 get_swappiness(next_mem)); 723 if (mem_cgroup_check_under_limit(root_mem)) 724 return 0; 725 cgroup_lock(); 726 next_mem = mem_cgroup_get_next_node(next_mem, root_mem); 727 cgroup_unlock(); 728 } 729 return ret; 730} 731 732bool mem_cgroup_oom_called(struct task_struct *task) 733{ 734 bool ret = false; 735 struct mem_cgroup *mem; 736 struct mm_struct *mm; 737 738 rcu_read_lock(); 739 mm = task->mm; 740 if (!mm) 741 mm = &init_mm; 742 mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); 743 if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10)) 744 ret = true; 745 rcu_read_unlock(); 746 return ret; 747} 748/* 749 * Unlike exported interface, "oom" parameter is added. if oom==true, 750 * oom-killer can be invoked. 751 */ 752static int __mem_cgroup_try_charge(struct mm_struct *mm, 753 gfp_t gfp_mask, struct mem_cgroup **memcg, 754 bool oom) 755{ 756 struct mem_cgroup *mem, *mem_over_limit; 757 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 758 struct res_counter *fail_res; 759 760 if (unlikely(test_thread_flag(TIF_MEMDIE))) { 761 /* Don't account this! */ 762 *memcg = NULL; 763 return 0; 764 } 765 766 /* 767 * We always charge the cgroup the mm_struct belongs to. 768 * The mm_struct's mem_cgroup changes on task migration if the 769 * thread group leader migrates. It's possible that mm is not 770 * set, if so charge the init_mm (happens for pagecache usage). 771 */ 772 if (likely(!*memcg)) { 773 rcu_read_lock(); 774 mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); 775 if (unlikely(!mem)) { 776 rcu_read_unlock(); 777 return 0; 778 } 779 /* 780 * For every charge from the cgroup, increment reference count 781 */ 782 css_get(&mem->css); 783 *memcg = mem; 784 rcu_read_unlock(); 785 } else { 786 mem = *memcg; 787 css_get(&mem->css); 788 } 789 790 while (1) { 791 int ret; 792 bool noswap = false; 793 794 ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res); 795 if (likely(!ret)) { 796 if (!do_swap_account) 797 break; 798 ret = res_counter_charge(&mem->memsw, PAGE_SIZE, 799 &fail_res); 800 if (likely(!ret)) 801 break; 802 /* mem+swap counter fails */ 803 res_counter_uncharge(&mem->res, PAGE_SIZE); 804 noswap = true; 805 mem_over_limit = mem_cgroup_from_res_counter(fail_res, 806 memsw); 807 } else 808 /* mem counter fails */ 809 mem_over_limit = mem_cgroup_from_res_counter(fail_res, 810 res); 811 812 if (!(gfp_mask & __GFP_WAIT)) 813 goto nomem; 814 815 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask, 816 noswap); 817 818 /* 819 * try_to_free_mem_cgroup_pages() might not give us a full 820 * picture of reclaim. Some pages are reclaimed and might be 821 * moved to swap cache or just unmapped from the cgroup. 822 * Check the limit again to see if the reclaim reduced the 823 * current usage of the cgroup before giving up 824 * 825 */ 826 if (mem_cgroup_check_under_limit(mem_over_limit)) 827 continue; 828 829 if (!nr_retries--) { 830 if (oom) { 831 mutex_lock(&memcg_tasklist); 832 mem_cgroup_out_of_memory(mem_over_limit, gfp_mask); 833 mutex_unlock(&memcg_tasklist); 834 mem_over_limit->last_oom_jiffies = jiffies; 835 } 836 goto nomem; 837 } 838 } 839 return 0; 840nomem: 841 css_put(&mem->css); 842 return -ENOMEM; 843} 844 845/* 846 * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be 847 * USED state. If already USED, uncharge and return. 848 */ 849 850static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, 851 struct page_cgroup *pc, 852 enum charge_type ctype) 853{ 854 /* try_charge() can return NULL to *memcg, taking care of it. */ 855 if (!mem) 856 return; 857 858 lock_page_cgroup(pc); 859 if (unlikely(PageCgroupUsed(pc))) { 860 unlock_page_cgroup(pc); 861 res_counter_uncharge(&mem->res, PAGE_SIZE); 862 if (do_swap_account) 863 res_counter_uncharge(&mem->memsw, PAGE_SIZE); 864 css_put(&mem->css); 865 return; 866 } 867 pc->mem_cgroup = mem; 868 smp_wmb(); 869 pc->flags = pcg_default_flags[ctype]; 870 871 mem_cgroup_charge_statistics(mem, pc, true); 872 873 unlock_page_cgroup(pc); 874} 875 876/** 877 * mem_cgroup_move_account - move account of the page 878 * @pc: page_cgroup of the page. 879 * @from: mem_cgroup which the page is moved from. 880 * @to: mem_cgroup which the page is moved to. @from != @to. 881 * 882 * The caller must confirm following. 883 * - page is not on LRU (isolate_page() is useful.) 884 * 885 * returns 0 at success, 886 * returns -EBUSY when lock is busy or "pc" is unstable. 887 * 888 * This function does "uncharge" from old cgroup but doesn't do "charge" to 889 * new cgroup. It should be done by a caller. 890 */ 891 892static int mem_cgroup_move_account(struct page_cgroup *pc, 893 struct mem_cgroup *from, struct mem_cgroup *to) 894{ 895 struct mem_cgroup_per_zone *from_mz, *to_mz; 896 int nid, zid; 897 int ret = -EBUSY; 898 899 VM_BUG_ON(from == to); 900 VM_BUG_ON(PageLRU(pc->page)); 901 902 nid = page_cgroup_nid(pc); 903 zid = page_cgroup_zid(pc); 904 from_mz = mem_cgroup_zoneinfo(from, nid, zid); 905 to_mz = mem_cgroup_zoneinfo(to, nid, zid); 906 907 if (!trylock_page_cgroup(pc)) 908 return ret; 909 910 if (!PageCgroupUsed(pc)) 911 goto out; 912 913 if (pc->mem_cgroup != from) 914 goto out; 915 916 css_put(&from->css); 917 res_counter_uncharge(&from->res, PAGE_SIZE); 918 mem_cgroup_charge_statistics(from, pc, false); 919 if (do_swap_account) 920 res_counter_uncharge(&from->memsw, PAGE_SIZE); 921 pc->mem_cgroup = to; 922 mem_cgroup_charge_statistics(to, pc, true); 923 css_get(&to->css); 924 ret = 0; 925out: 926 unlock_page_cgroup(pc); 927 return ret; 928} 929 930/* 931 * move charges to its parent. 932 */ 933 934static int mem_cgroup_move_parent(struct page_cgroup *pc, 935 struct mem_cgroup *child, 936 gfp_t gfp_mask) 937{ 938 struct page *page = pc->page; 939 struct cgroup *cg = child->css.cgroup; 940 struct cgroup *pcg = cg->parent; 941 struct mem_cgroup *parent; 942 int ret; 943 944 /* Is ROOT ? */ 945 if (!pcg) 946 return -EINVAL; 947 948 949 parent = mem_cgroup_from_cont(pcg); 950 951 952 ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); 953 if (ret || !parent) 954 return ret; 955 956 if (!get_page_unless_zero(page)) 957 return -EBUSY; 958 959 ret = isolate_lru_page(page); 960 961 if (ret) 962 goto cancel; 963 964 ret = mem_cgroup_move_account(pc, child, parent); 965 966 /* drop extra refcnt by try_charge() (move_account increment one) */ 967 css_put(&parent->css); 968 putback_lru_page(page); 969 if (!ret) { 970 put_page(page); 971 return 0; 972 } 973 /* uncharge if move fails */ 974cancel: 975 res_counter_uncharge(&parent->res, PAGE_SIZE); 976 if (do_swap_account) 977 res_counter_uncharge(&parent->memsw, PAGE_SIZE); 978 put_page(page); 979 return ret; 980} 981 982/* 983 * Charge the memory controller for page usage. 984 * Return 985 * 0 if the charge was successful 986 * < 0 if the cgroup is over its limit 987 */ 988static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, 989 gfp_t gfp_mask, enum charge_type ctype, 990 struct mem_cgroup *memcg) 991{ 992 struct mem_cgroup *mem; 993 struct page_cgroup *pc; 994 int ret; 995 996 pc = lookup_page_cgroup(page); 997 /* can happen at boot */ 998 if (unlikely(!pc)) 999 return 0; 1000 prefetchw(pc); 1001 1002 mem = memcg; 1003 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); 1004 if (ret || !mem) 1005 return ret; 1006 1007 __mem_cgroup_commit_charge(mem, pc, ctype); 1008 return 0; 1009} 1010 1011int mem_cgroup_newpage_charge(struct page *page, 1012 struct mm_struct *mm, gfp_t gfp_mask) 1013{ 1014 if (mem_cgroup_disabled()) 1015 return 0; 1016 if (PageCompound(page)) 1017 return 0; 1018 /* 1019 * If already mapped, we don't have to account. 1020 * If page cache, page->mapping has address_space. 1021 * But page->mapping may have out-of-use anon_vma pointer, 1022 * detecit it by PageAnon() check. newly-mapped-anon's page->mapping 1023 * is NULL. 1024 */ 1025 if (page_mapped(page) || (page->mapping && !PageAnon(page))) 1026 return 0; 1027 if (unlikely(!mm)) 1028 mm = &init_mm; 1029 return mem_cgroup_charge_common(page, mm, gfp_mask, 1030 MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL); 1031} 1032 1033int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 1034 gfp_t gfp_mask) 1035{ 1036 if (mem_cgroup_disabled()) 1037 return 0; 1038 if (PageCompound(page)) 1039 return 0; 1040 /* 1041 * Corner case handling. This is called from add_to_page_cache() 1042 * in usual. But some FS (shmem) precharges this page before calling it 1043 * and call add_to_page_cache() with GFP_NOWAIT. 1044 * 1045 * For GFP_NOWAIT case, the page may be pre-charged before calling 1046 * add_to_page_cache(). (See shmem.c) check it here and avoid to call 1047 * charge twice. (It works but has to pay a bit larger cost.) 1048 */ 1049 if (!(gfp_mask & __GFP_WAIT)) { 1050 struct page_cgroup *pc; 1051 1052 1053 pc = lookup_page_cgroup(page); 1054 if (!pc) 1055 return 0; 1056 lock_page_cgroup(pc); 1057 if (PageCgroupUsed(pc)) { 1058 unlock_page_cgroup(pc); 1059 return 0; 1060 } 1061 unlock_page_cgroup(pc); 1062 } 1063 1064 if (unlikely(!mm)) 1065 mm = &init_mm; 1066 1067 if (page_is_file_cache(page)) 1068 return mem_cgroup_charge_common(page, mm, gfp_mask, 1069 MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); 1070 else 1071 return mem_cgroup_charge_common(page, mm, gfp_mask, 1072 MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL); 1073} 1074 1075int mem_cgroup_try_charge_swapin(struct mm_struct *mm, 1076 struct page *page, 1077 gfp_t mask, struct mem_cgroup **ptr) 1078{ 1079 struct mem_cgroup *mem; 1080 swp_entry_t ent; 1081 1082 if (mem_cgroup_disabled()) 1083 return 0; 1084 1085 if (!do_swap_account) 1086 goto charge_cur_mm; 1087 1088 /* 1089 * A racing thread's fault, or swapoff, may have already updated 1090 * the pte, and even removed page from swap cache: return success 1091 * to go on to do_swap_page()'s pte_same() test, which should fail. 1092 */ 1093 if (!PageSwapCache(page)) 1094 return 0; 1095 1096 ent.val = page_private(page); 1097 1098 mem = lookup_swap_cgroup(ent); 1099 if (!mem || mem->obsolete) 1100 goto charge_cur_mm; 1101 *ptr = mem; 1102 return __mem_cgroup_try_charge(NULL, mask, ptr, true); 1103charge_cur_mm: 1104 if (unlikely(!mm)) 1105 mm = &init_mm; 1106 return __mem_cgroup_try_charge(mm, mask, ptr, true); 1107} 1108 1109#ifdef CONFIG_SWAP 1110 1111int mem_cgroup_cache_charge_swapin(struct page *page, 1112 struct mm_struct *mm, gfp_t mask, bool locked) 1113{ 1114 int ret = 0; 1115 1116 if (mem_cgroup_disabled()) 1117 return 0; 1118 if (unlikely(!mm)) 1119 mm = &init_mm; 1120 if (!locked) 1121 lock_page(page); 1122 /* 1123 * If not locked, the page can be dropped from SwapCache until 1124 * we reach here. 1125 */ 1126 if (PageSwapCache(page)) { 1127 struct mem_cgroup *mem = NULL; 1128 swp_entry_t ent; 1129 1130 ent.val = page_private(page); 1131 if (do_swap_account) { 1132 mem = lookup_swap_cgroup(ent); 1133 if (mem && mem->obsolete) 1134 mem = NULL; 1135 if (mem) 1136 mm = NULL; 1137 } 1138 ret = mem_cgroup_charge_common(page, mm, mask, 1139 MEM_CGROUP_CHARGE_TYPE_SHMEM, mem); 1140 1141 if (!ret && do_swap_account) { 1142 /* avoid double counting */ 1143 mem = swap_cgroup_record(ent, NULL); 1144 if (mem) { 1145 res_counter_uncharge(&mem->memsw, PAGE_SIZE); 1146 mem_cgroup_put(mem); 1147 } 1148 } 1149 } 1150 if (!locked) 1151 unlock_page(page); 1152 /* add this page(page_cgroup) to the LRU we want. */ 1153 mem_cgroup_lru_fixup(page); 1154 1155 return ret; 1156} 1157#endif 1158 1159void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) 1160{ 1161 struct page_cgroup *pc; 1162 1163 if (mem_cgroup_disabled()) 1164 return; 1165 if (!ptr) 1166 return; 1167 pc = lookup_page_cgroup(page); 1168 __mem_cgroup_commit_charge(ptr, pc, MEM_CGROUP_CHARGE_TYPE_MAPPED); 1169 /* 1170 * Now swap is on-memory. This means this page may be 1171 * counted both as mem and swap....double count. 1172 * Fix it by uncharging from memsw. This SwapCache is stable 1173 * because we're still under lock_page(). 1174 */ 1175 if (do_swap_account) { 1176 swp_entry_t ent = {.val = page_private(page)}; 1177 struct mem_cgroup *memcg; 1178 memcg = swap_cgroup_record(ent, NULL); 1179 if (memcg) { 1180 /* If memcg is obsolete, memcg can be != ptr */ 1181 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 1182 mem_cgroup_put(memcg); 1183 } 1184 1185 } 1186 /* add this page(page_cgroup) to the LRU we want. */ 1187 mem_cgroup_lru_fixup(page); 1188} 1189 1190void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) 1191{ 1192 if (mem_cgroup_disabled()) 1193 return; 1194 if (!mem) 1195 return; 1196 res_counter_uncharge(&mem->res, PAGE_SIZE); 1197 if (do_swap_account) 1198 res_counter_uncharge(&mem->memsw, PAGE_SIZE); 1199 css_put(&mem->css); 1200} 1201 1202 1203/* 1204 * uncharge if !page_mapped(page) 1205 */ 1206static struct mem_cgroup * 1207__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) 1208{ 1209 struct page_cgroup *pc; 1210 struct mem_cgroup *mem = NULL; 1211 struct mem_cgroup_per_zone *mz; 1212 1213 if (mem_cgroup_disabled()) 1214 return NULL; 1215 1216 if (PageSwapCache(page)) 1217 return NULL; 1218 1219 /* 1220 * Check if our page_cgroup is valid 1221 */ 1222 pc = lookup_page_cgroup(page); 1223 if (unlikely(!pc || !PageCgroupUsed(pc))) 1224 return NULL; 1225 1226 lock_page_cgroup(pc); 1227 1228 mem = pc->mem_cgroup; 1229 1230 if (!PageCgroupUsed(pc)) 1231 goto unlock_out; 1232 1233 switch (ctype) { 1234 case MEM_CGROUP_CHARGE_TYPE_MAPPED: 1235 if (page_mapped(page)) 1236 goto unlock_out; 1237 break; 1238 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: 1239 if (!PageAnon(page)) { /* Shared memory */ 1240 if (page->mapping && !page_is_file_cache(page)) 1241 goto unlock_out; 1242 } else if (page_mapped(page)) /* Anon */ 1243 goto unlock_out; 1244 break; 1245 default: 1246 break; 1247 } 1248 1249 res_counter_uncharge(&mem->res, PAGE_SIZE); 1250 if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) 1251 res_counter_uncharge(&mem->memsw, PAGE_SIZE); 1252 1253 mem_cgroup_charge_statistics(mem, pc, false); 1254 ClearPageCgroupUsed(pc); 1255 1256 mz = page_cgroup_zoneinfo(pc); 1257 unlock_page_cgroup(pc); 1258 1259 /* at swapout, this memcg will be accessed to record to swap */ 1260 if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 1261 css_put(&mem->css); 1262 1263 return mem; 1264 1265unlock_out: 1266 unlock_page_cgroup(pc); 1267 return NULL; 1268} 1269 1270void mem_cgroup_uncharge_page(struct page *page) 1271{ 1272 /* early check. */ 1273 if (page_mapped(page)) 1274 return; 1275 if (page->mapping && !PageAnon(page)) 1276 return; 1277 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); 1278} 1279 1280void mem_cgroup_uncharge_cache_page(struct page *page) 1281{ 1282 VM_BUG_ON(page_mapped(page)); 1283 VM_BUG_ON(page->mapping); 1284 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); 1285} 1286 1287/* 1288 * called from __delete_from_swap_cache() and drop "page" account. 1289 * memcg information is recorded to swap_cgroup of "ent" 1290 */ 1291void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) 1292{ 1293 struct mem_cgroup *memcg; 1294 1295 memcg = __mem_cgroup_uncharge_common(page, 1296 MEM_CGROUP_CHARGE_TYPE_SWAPOUT); 1297 /* record memcg information */ 1298 if (do_swap_account && memcg) { 1299 swap_cgroup_record(ent, memcg); 1300 mem_cgroup_get(memcg); 1301 } 1302 if (memcg) 1303 css_put(&memcg->css); 1304} 1305 1306#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 1307/* 1308 * called from swap_entry_free(). remove record in swap_cgroup and 1309 * uncharge "memsw" account. 1310 */ 1311void mem_cgroup_uncharge_swap(swp_entry_t ent) 1312{ 1313 struct mem_cgroup *memcg; 1314 1315 if (!do_swap_account) 1316 return; 1317 1318 memcg = swap_cgroup_record(ent, NULL); 1319 if (memcg) { 1320 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 1321 mem_cgroup_put(memcg); 1322 } 1323} 1324#endif 1325 1326/* 1327 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old 1328 * page belongs to. 1329 */ 1330int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) 1331{ 1332 struct page_cgroup *pc; 1333 struct mem_cgroup *mem = NULL; 1334 int ret = 0; 1335 1336 if (mem_cgroup_disabled()) 1337 return 0; 1338 1339 pc = lookup_page_cgroup(page); 1340 lock_page_cgroup(pc); 1341 if (PageCgroupUsed(pc)) { 1342 mem = pc->mem_cgroup; 1343 css_get(&mem->css); 1344 } 1345 unlock_page_cgroup(pc); 1346 1347 if (mem) { 1348 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); 1349 css_put(&mem->css); 1350 } 1351 *ptr = mem; 1352 return ret; 1353} 1354 1355/* remove redundant charge if migration failed*/ 1356void mem_cgroup_end_migration(struct mem_cgroup *mem, 1357 struct page *oldpage, struct page *newpage) 1358{ 1359 struct page *target, *unused; 1360 struct page_cgroup *pc; 1361 enum charge_type ctype; 1362 1363 if (!mem) 1364 return; 1365 1366 /* at migration success, oldpage->mapping is NULL. */ 1367 if (oldpage->mapping) { 1368 target = oldpage; 1369 unused = NULL; 1370 } else { 1371 target = newpage; 1372 unused = oldpage; 1373 } 1374 1375 if (PageAnon(target)) 1376 ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; 1377 else if (page_is_file_cache(target)) 1378 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; 1379 else 1380 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; 1381 1382 /* unused page is not on radix-tree now. */ 1383 if (unused) 1384 __mem_cgroup_uncharge_common(unused, ctype); 1385 1386 pc = lookup_page_cgroup(target); 1387 /* 1388 * __mem_cgroup_commit_charge() check PCG_USED bit of page_cgroup. 1389 * So, double-counting is effectively avoided. 1390 */ 1391 __mem_cgroup_commit_charge(mem, pc, ctype); 1392 1393 /* 1394 * Both of oldpage and newpage are still under lock_page(). 1395 * Then, we don't have to care about race in radix-tree. 1396 * But we have to be careful that this page is unmapped or not. 1397 * 1398 * There is a case for !page_mapped(). At the start of 1399 * migration, oldpage was mapped. But now, it's zapped. 1400 * But we know *target* page is not freed/reused under us. 1401 * mem_cgroup_uncharge_page() does all necessary checks. 1402 */ 1403 if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) 1404 mem_cgroup_uncharge_page(target); 1405} 1406 1407/* 1408 * A call to try to shrink memory usage under specified resource controller. 1409 * This is typically used for page reclaiming for shmem for reducing side 1410 * effect of page allocation from shmem, which is used by some mem_cgroup. 1411 */ 1412int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask) 1413{ 1414 struct mem_cgroup *mem; 1415 int progress = 0; 1416 int retry = MEM_CGROUP_RECLAIM_RETRIES; 1417 1418 if (mem_cgroup_disabled()) 1419 return 0; 1420 if (!mm) 1421 return 0; 1422 1423 rcu_read_lock(); 1424 mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); 1425 if (unlikely(!mem)) { 1426 rcu_read_unlock(); 1427 return 0; 1428 } 1429 css_get(&mem->css); 1430 rcu_read_unlock(); 1431 1432 do { 1433 progress = try_to_free_mem_cgroup_pages(mem, gfp_mask, true, 1434 get_swappiness(mem)); 1435 progress += mem_cgroup_check_under_limit(mem); 1436 } while (!progress && --retry); 1437 1438 css_put(&mem->css); 1439 if (!retry) 1440 return -ENOMEM; 1441 return 0; 1442} 1443 1444static DEFINE_MUTEX(set_limit_mutex); 1445 1446static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, 1447 unsigned long long val) 1448{ 1449 1450 int retry_count = MEM_CGROUP_RECLAIM_RETRIES; 1451 int progress; 1452 u64 memswlimit; 1453 int ret = 0; 1454 1455 while (retry_count) { 1456 if (signal_pending(current)) { 1457 ret = -EINTR; 1458 break; 1459 } 1460 /* 1461 * Rather than hide all in some function, I do this in 1462 * open coded manner. You see what this really does. 1463 * We have to guarantee mem->res.limit < mem->memsw.limit. 1464 */ 1465 mutex_lock(&set_limit_mutex); 1466 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 1467 if (memswlimit < val) { 1468 ret = -EINVAL; 1469 mutex_unlock(&set_limit_mutex); 1470 break; 1471 } 1472 ret = res_counter_set_limit(&memcg->res, val); 1473 mutex_unlock(&set_limit_mutex); 1474 1475 if (!ret) 1476 break; 1477 1478 progress = try_to_free_mem_cgroup_pages(memcg, 1479 GFP_KERNEL, 1480 false, 1481 get_swappiness(memcg)); 1482 if (!progress) retry_count--; 1483 } 1484 1485 return ret; 1486} 1487 1488int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, 1489 unsigned long long val) 1490{ 1491 int retry_count = MEM_CGROUP_RECLAIM_RETRIES; 1492 u64 memlimit, oldusage, curusage; 1493 int ret; 1494 1495 if (!do_swap_account) 1496 return -EINVAL; 1497 1498 while (retry_count) { 1499 if (signal_pending(current)) { 1500 ret = -EINTR; 1501 break; 1502 } 1503 /* 1504 * Rather than hide all in some function, I do this in 1505 * open coded manner. You see what this really does. 1506 * We have to guarantee mem->res.limit < mem->memsw.limit. 1507 */ 1508 mutex_lock(&set_limit_mutex); 1509 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 1510 if (memlimit > val) { 1511 ret = -EINVAL; 1512 mutex_unlock(&set_limit_mutex); 1513 break; 1514 } 1515 ret = res_counter_set_limit(&memcg->memsw, val); 1516 mutex_unlock(&set_limit_mutex); 1517 1518 if (!ret) 1519 break; 1520 1521 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 1522 try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL, true, 1523 get_swappiness(memcg)); 1524 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 1525 if (curusage >= oldusage) 1526 retry_count--; 1527 } 1528 return ret; 1529} 1530 1531/* 1532 * This routine traverse page_cgroup in given list and drop them all. 1533 * *And* this routine doesn't reclaim page itself, just removes page_cgroup. 1534 */ 1535static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, 1536 int node, int zid, enum lru_list lru) 1537{ 1538 struct zone *zone; 1539 struct mem_cgroup_per_zone *mz; 1540 struct page_cgroup *pc, *busy; 1541 unsigned long flags, loop; 1542 struct list_head *list; 1543 int ret = 0; 1544 1545 zone = &NODE_DATA(node)->node_zones[zid]; 1546 mz = mem_cgroup_zoneinfo(mem, node, zid); 1547 list = &mz->lists[lru]; 1548 1549 loop = MEM_CGROUP_ZSTAT(mz, lru); 1550 /* give some margin against EBUSY etc...*/ 1551 loop += 256; 1552 busy = NULL; 1553 while (loop--) { 1554 ret = 0; 1555 spin_lock_irqsave(&zone->lru_lock, flags); 1556 if (list_empty(list)) { 1557 spin_unlock_irqrestore(&zone->lru_lock, flags); 1558 break; 1559 } 1560 pc = list_entry(list->prev, struct page_cgroup, lru); 1561 if (busy == pc) { 1562 list_move(&pc->lru, list); 1563 busy = 0; 1564 spin_unlock_irqrestore(&zone->lru_lock, flags); 1565 continue; 1566 } 1567 spin_unlock_irqrestore(&zone->lru_lock, flags); 1568 1569 ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL); 1570 if (ret == -ENOMEM) 1571 break; 1572 1573 if (ret == -EBUSY || ret == -EINVAL) { 1574 /* found lock contention or "pc" is obsolete. */ 1575 busy = pc; 1576 cond_resched(); 1577 } else 1578 busy = NULL; 1579 } 1580 1581 if (!ret && !list_empty(list)) 1582 return -EBUSY; 1583 return ret; 1584} 1585 1586/* 1587 * make mem_cgroup's charge to be 0 if there is no task. 1588 * This enables deleting this mem_cgroup. 1589 */ 1590static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all) 1591{ 1592 int ret; 1593 int node, zid, shrink; 1594 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 1595 struct cgroup *cgrp = mem->css.cgroup; 1596 1597 css_get(&mem->css); 1598 1599 shrink = 0; 1600 /* should free all ? */ 1601 if (free_all) 1602 goto try_to_free; 1603move_account: 1604 while (mem->res.usage > 0) { 1605 ret = -EBUSY; 1606 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) 1607 goto out; 1608 ret = -EINTR; 1609 if (signal_pending(current)) 1610 goto out; 1611 /* This is for making all *used* pages to be on LRU. */ 1612 lru_add_drain_all(); 1613 ret = 0; 1614 for_each_node_state(node, N_POSSIBLE) { 1615 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { 1616 enum lru_list l; 1617 for_each_lru(l) { 1618 ret = mem_cgroup_force_empty_list(mem, 1619 node, zid, l); 1620 if (ret) 1621 break; 1622 } 1623 } 1624 if (ret) 1625 break; 1626 } 1627 /* it seems parent cgroup doesn't have enough mem */ 1628 if (ret == -ENOMEM) 1629 goto try_to_free; 1630 cond_resched(); 1631 } 1632 ret = 0; 1633out: 1634 css_put(&mem->css); 1635 return ret; 1636 1637try_to_free: 1638 /* returns EBUSY if there is a task or if we come here twice. */ 1639 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) { 1640 ret = -EBUSY; 1641 goto out; 1642 } 1643 /* we call try-to-free pages for make this cgroup empty */ 1644 lru_add_drain_all(); 1645 /* try to free all pages in this cgroup */ 1646 shrink = 1; 1647 while (nr_retries && mem->res.usage > 0) { 1648 int progress; 1649 1650 if (signal_pending(current)) { 1651 ret = -EINTR; 1652 goto out; 1653 } 1654 progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL, 1655 false, get_swappiness(mem)); 1656 if (!progress) { 1657 nr_retries--; 1658 /* maybe some writeback is necessary */ 1659 congestion_wait(WRITE, HZ/10); 1660 } 1661 1662 } 1663 lru_add_drain(); 1664 /* try move_account...there may be some *locked* pages. */ 1665 if (mem->res.usage) 1666 goto move_account; 1667 ret = 0; 1668 goto out; 1669} 1670 1671int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) 1672{ 1673 return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true); 1674} 1675 1676 1677static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft) 1678{ 1679 return mem_cgroup_from_cont(cont)->use_hierarchy; 1680} 1681 1682static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, 1683 u64 val) 1684{ 1685 int retval = 0; 1686 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 1687 struct cgroup *parent = cont->parent; 1688 struct mem_cgroup *parent_mem = NULL; 1689 1690 if (parent) 1691 parent_mem = mem_cgroup_from_cont(parent); 1692 1693 cgroup_lock(); 1694 /* 1695 * If parent's use_hiearchy is set, we can't make any modifications 1696 * in the child subtrees. If it is unset, then the change can 1697 * occur, provided the current cgroup has no children. 1698 * 1699 * For the root cgroup, parent_mem is NULL, we allow value to be 1700 * set if there are no children. 1701 */ 1702 if ((!parent_mem || !parent_mem->use_hierarchy) && 1703 (val == 1 || val == 0)) { 1704 if (list_empty(&cont->children)) 1705 mem->use_hierarchy = val; 1706 else 1707 retval = -EBUSY; 1708 } else 1709 retval = -EINVAL; 1710 cgroup_unlock(); 1711 1712 return retval; 1713} 1714 1715static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) 1716{ 1717 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 1718 u64 val = 0; 1719 int type, name; 1720 1721 type = MEMFILE_TYPE(cft->private); 1722 name = MEMFILE_ATTR(cft->private); 1723 switch (type) { 1724 case _MEM: 1725 val = res_counter_read_u64(&mem->res, name); 1726 break; 1727 case _MEMSWAP: 1728 if (do_swap_account) 1729 val = res_counter_read_u64(&mem->memsw, name); 1730 break; 1731 default: 1732 BUG(); 1733 break; 1734 } 1735 return val; 1736} 1737/* 1738 * The user of this function is... 1739 * RES_LIMIT. 1740 */ 1741static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, 1742 const char *buffer) 1743{ 1744 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 1745 int type, name; 1746 unsigned long long val; 1747 int ret; 1748 1749 type = MEMFILE_TYPE(cft->private); 1750 name = MEMFILE_ATTR(cft->private); 1751 switch (name) { 1752 case RES_LIMIT: 1753 /* This function does all necessary parse...reuse it */ 1754 ret = res_counter_memparse_write_strategy(buffer, &val); 1755 if (ret) 1756 break; 1757 if (type == _MEM) 1758 ret = mem_cgroup_resize_limit(memcg, val); 1759 else 1760 ret = mem_cgroup_resize_memsw_limit(memcg, val); 1761 break; 1762 default: 1763 ret = -EINVAL; /* should be BUG() ? */ 1764 break; 1765 } 1766 return ret; 1767} 1768 1769static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, 1770 unsigned long long *mem_limit, unsigned long long *memsw_limit) 1771{ 1772 struct cgroup *cgroup; 1773 unsigned long long min_limit, min_memsw_limit, tmp; 1774 1775 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT); 1776 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 1777 cgroup = memcg->css.cgroup; 1778 if (!memcg->use_hierarchy) 1779 goto out; 1780 1781 while (cgroup->parent) { 1782 cgroup = cgroup->parent; 1783 memcg = mem_cgroup_from_cont(cgroup); 1784 if (!memcg->use_hierarchy) 1785 break; 1786 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT); 1787 min_limit = min(min_limit, tmp); 1788 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 1789 min_memsw_limit = min(min_memsw_limit, tmp); 1790 } 1791out: 1792 *mem_limit = min_limit; 1793 *memsw_limit = min_memsw_limit; 1794 return; 1795} 1796 1797static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 1798{ 1799 struct mem_cgroup *mem; 1800 int type, name; 1801 1802 mem = mem_cgroup_from_cont(cont); 1803 type = MEMFILE_TYPE(event); 1804 name = MEMFILE_ATTR(event); 1805 switch (name) { 1806 case RES_MAX_USAGE: 1807 if (type == _MEM) 1808 res_counter_reset_max(&mem->res); 1809 else 1810 res_counter_reset_max(&mem->memsw); 1811 break; 1812 case RES_FAILCNT: 1813 if (type == _MEM) 1814 res_counter_reset_failcnt(&mem->res); 1815 else 1816 res_counter_reset_failcnt(&mem->memsw); 1817 break; 1818 } 1819 return 0; 1820} 1821 1822static const struct mem_cgroup_stat_desc { 1823 const char *msg; 1824 u64 unit; 1825} mem_cgroup_stat_desc[] = { 1826 [MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, }, 1827 [MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, }, 1828 [MEM_CGROUP_STAT_PGPGIN_COUNT] = {"pgpgin", 1, }, 1829 [MEM_CGROUP_STAT_PGPGOUT_COUNT] = {"pgpgout", 1, }, 1830}; 1831 1832static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, 1833 struct cgroup_map_cb *cb) 1834{ 1835 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); 1836 struct mem_cgroup_stat *stat = &mem_cont->stat; 1837 int i; 1838 1839 for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) { 1840 s64 val; 1841 1842 val = mem_cgroup_read_stat(stat, i); 1843 val *= mem_cgroup_stat_desc[i].unit; 1844 cb->fill(cb, mem_cgroup_stat_desc[i].msg, val); 1845 } 1846 /* showing # of active pages */ 1847 { 1848 unsigned long active_anon, inactive_anon; 1849 unsigned long active_file, inactive_file; 1850 unsigned long unevictable; 1851 1852 inactive_anon = mem_cgroup_get_all_zonestat(mem_cont, 1853 LRU_INACTIVE_ANON); 1854 active_anon = mem_cgroup_get_all_zonestat(mem_cont, 1855 LRU_ACTIVE_ANON); 1856 inactive_file = mem_cgroup_get_all_zonestat(mem_cont, 1857 LRU_INACTIVE_FILE); 1858 active_file = mem_cgroup_get_all_zonestat(mem_cont, 1859 LRU_ACTIVE_FILE); 1860 unevictable = mem_cgroup_get_all_zonestat(mem_cont, 1861 LRU_UNEVICTABLE); 1862 1863 cb->fill(cb, "active_anon", (active_anon) * PAGE_SIZE); 1864 cb->fill(cb, "inactive_anon", (inactive_anon) * PAGE_SIZE); 1865 cb->fill(cb, "active_file", (active_file) * PAGE_SIZE); 1866 cb->fill(cb, "inactive_file", (inactive_file) * PAGE_SIZE); 1867 cb->fill(cb, "unevictable", unevictable * PAGE_SIZE); 1868 1869 } 1870 { 1871 unsigned long long limit, memsw_limit; 1872 memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit); 1873 cb->fill(cb, "hierarchical_memory_limit", limit); 1874 if (do_swap_account) 1875 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit); 1876 } 1877 1878#ifdef CONFIG_DEBUG_VM 1879 cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); 1880 1881 { 1882 int nid, zid; 1883 struct mem_cgroup_per_zone *mz; 1884 unsigned long recent_rotated[2] = {0, 0}; 1885 unsigned long recent_scanned[2] = {0, 0}; 1886 1887 for_each_online_node(nid) 1888 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 1889 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); 1890 1891 recent_rotated[0] += 1892 mz->reclaim_stat.recent_rotated[0]; 1893 recent_rotated[1] += 1894 mz->reclaim_stat.recent_rotated[1]; 1895 recent_scanned[0] += 1896 mz->reclaim_stat.recent_scanned[0]; 1897 recent_scanned[1] += 1898 mz->reclaim_stat.recent_scanned[1]; 1899 } 1900 cb->fill(cb, "recent_rotated_anon", recent_rotated[0]); 1901 cb->fill(cb, "recent_rotated_file", recent_rotated[1]); 1902 cb->fill(cb, "recent_scanned_anon", recent_scanned[0]); 1903 cb->fill(cb, "recent_scanned_file", recent_scanned[1]); 1904 } 1905#endif 1906 1907 return 0; 1908} 1909 1910static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft) 1911{ 1912 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 1913 1914 return get_swappiness(memcg); 1915} 1916 1917static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, 1918 u64 val) 1919{ 1920 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 1921 struct mem_cgroup *parent; 1922 if (val > 100) 1923 return -EINVAL; 1924 1925 if (cgrp->parent == NULL) 1926 return -EINVAL; 1927 1928 parent = mem_cgroup_from_cont(cgrp->parent); 1929 /* If under hierarchy, only empty-root can set this value */ 1930 if ((parent->use_hierarchy) || 1931 (memcg->use_hierarchy && !list_empty(&cgrp->children))) 1932 return -EINVAL; 1933 1934 spin_lock(&memcg->reclaim_param_lock); 1935 memcg->swappiness = val; 1936 spin_unlock(&memcg->reclaim_param_lock); 1937 1938 return 0; 1939} 1940 1941 1942static struct cftype mem_cgroup_files[] = { 1943 { 1944 .name = "usage_in_bytes", 1945 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 1946 .read_u64 = mem_cgroup_read, 1947 }, 1948 { 1949 .name = "max_usage_in_bytes", 1950 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 1951 .trigger = mem_cgroup_reset, 1952 .read_u64 = mem_cgroup_read, 1953 }, 1954 { 1955 .name = "limit_in_bytes", 1956 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 1957 .write_string = mem_cgroup_write, 1958 .read_u64 = mem_cgroup_read, 1959 }, 1960 { 1961 .name = "failcnt", 1962 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 1963 .trigger = mem_cgroup_reset, 1964 .read_u64 = mem_cgroup_read, 1965 }, 1966 { 1967 .name = "stat", 1968 .read_map = mem_control_stat_show, 1969 }, 1970 { 1971 .name = "force_empty", 1972 .trigger = mem_cgroup_force_empty_write, 1973 }, 1974 { 1975 .name = "use_hierarchy", 1976 .write_u64 = mem_cgroup_hierarchy_write, 1977 .read_u64 = mem_cgroup_hierarchy_read, 1978 }, 1979 { 1980 .name = "swappiness", 1981 .read_u64 = mem_cgroup_swappiness_read, 1982 .write_u64 = mem_cgroup_swappiness_write, 1983 }, 1984}; 1985 1986#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 1987static struct cftype memsw_cgroup_files[] = { 1988 { 1989 .name = "memsw.usage_in_bytes", 1990 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 1991 .read_u64 = mem_cgroup_read, 1992 }, 1993 { 1994 .name = "memsw.max_usage_in_bytes", 1995 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 1996 .trigger = mem_cgroup_reset, 1997 .read_u64 = mem_cgroup_read, 1998 }, 1999 { 2000 .name = "memsw.limit_in_bytes", 2001 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 2002 .write_string = mem_cgroup_write, 2003 .read_u64 = mem_cgroup_read, 2004 }, 2005 { 2006 .name = "memsw.failcnt", 2007 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 2008 .trigger = mem_cgroup_reset, 2009 .read_u64 = mem_cgroup_read, 2010 }, 2011}; 2012 2013static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) 2014{ 2015 if (!do_swap_account) 2016 return 0; 2017 return cgroup_add_files(cont, ss, memsw_cgroup_files, 2018 ARRAY_SIZE(memsw_cgroup_files)); 2019}; 2020#else 2021static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) 2022{ 2023 return 0; 2024} 2025#endif 2026 2027static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) 2028{ 2029 struct mem_cgroup_per_node *pn; 2030 struct mem_cgroup_per_zone *mz; 2031 enum lru_list l; 2032 int zone, tmp = node; 2033 /* 2034 * This routine is called against possible nodes. 2035 * But it's BUG to call kmalloc() against offline node. 2036 * 2037 * TODO: this routine can waste much memory for nodes which will 2038 * never be onlined. It's better to use memory hotplug callback 2039 * function. 2040 */ 2041 if (!node_state(node, N_NORMAL_MEMORY)) 2042 tmp = -1; 2043 pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp); 2044 if (!pn) 2045 return 1; 2046 2047 mem->info.nodeinfo[node] = pn; 2048 memset(pn, 0, sizeof(*pn)); 2049 2050 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 2051 mz = &pn->zoneinfo[zone]; 2052 for_each_lru(l) 2053 INIT_LIST_HEAD(&mz->lists[l]); 2054 } 2055 return 0; 2056} 2057 2058static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) 2059{ 2060 kfree(mem->info.nodeinfo[node]); 2061} 2062 2063static int mem_cgroup_size(void) 2064{ 2065 int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu); 2066 return sizeof(struct mem_cgroup) + cpustat_size; 2067} 2068 2069static struct mem_cgroup *mem_cgroup_alloc(void) 2070{ 2071 struct mem_cgroup *mem; 2072 int size = mem_cgroup_size(); 2073 2074 if (size < PAGE_SIZE) 2075 mem = kmalloc(size, GFP_KERNEL); 2076 else 2077 mem = vmalloc(size); 2078 2079 if (mem) 2080 memset(mem, 0, size); 2081 return mem; 2082} 2083 2084/* 2085 * At destroying mem_cgroup, references from swap_cgroup can remain. 2086 * (scanning all at force_empty is too costly...) 2087 * 2088 * Instead of clearing all references at force_empty, we remember 2089 * the number of reference from swap_cgroup and free mem_cgroup when 2090 * it goes down to 0. 2091 * 2092 * When mem_cgroup is destroyed, mem->obsolete will be set to 0 and 2093 * entry which points to this memcg will be ignore at swapin. 2094 * 2095 * Removal of cgroup itself succeeds regardless of refs from swap. 2096 */ 2097 2098static void mem_cgroup_free(struct mem_cgroup *mem) 2099{ 2100 int node; 2101 2102 if (atomic_read(&mem->refcnt) > 0) 2103 return; 2104 2105 2106 for_each_node_state(node, N_POSSIBLE) 2107 free_mem_cgroup_per_zone_info(mem, node); 2108 2109 if (mem_cgroup_size() < PAGE_SIZE) 2110 kfree(mem); 2111 else 2112 vfree(mem); 2113} 2114 2115static void mem_cgroup_get(struct mem_cgroup *mem) 2116{ 2117 atomic_inc(&mem->refcnt); 2118} 2119 2120static void mem_cgroup_put(struct mem_cgroup *mem) 2121{ 2122 if (atomic_dec_and_test(&mem->refcnt)) { 2123 if (!mem->obsolete) 2124 return; 2125 mem_cgroup_free(mem); 2126 } 2127} 2128 2129 2130#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 2131static void __init enable_swap_cgroup(void) 2132{ 2133 if (!mem_cgroup_disabled() && really_do_swap_account) 2134 do_swap_account = 1; 2135} 2136#else 2137static void __init enable_swap_cgroup(void) 2138{ 2139} 2140#endif 2141 2142static struct cgroup_subsys_state * 2143mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) 2144{ 2145 struct mem_cgroup *mem, *parent; 2146 int node; 2147 2148 mem = mem_cgroup_alloc(); 2149 if (!mem) 2150 return ERR_PTR(-ENOMEM); 2151 2152 for_each_node_state(node, N_POSSIBLE) 2153 if (alloc_mem_cgroup_per_zone_info(mem, node)) 2154 goto free_out; 2155 /* root ? */ 2156 if (cont->parent == NULL) { 2157 enable_swap_cgroup(); 2158 parent = NULL; 2159 } else { 2160 parent = mem_cgroup_from_cont(cont->parent); 2161 mem->use_hierarchy = parent->use_hierarchy; 2162 } 2163 2164 if (parent && parent->use_hierarchy) { 2165 res_counter_init(&mem->res, &parent->res); 2166 res_counter_init(&mem->memsw, &parent->memsw); 2167 } else { 2168 res_counter_init(&mem->res, NULL); 2169 res_counter_init(&mem->memsw, NULL); 2170 } 2171 mem->last_scanned_child = NULL; 2172 spin_lock_init(&mem->reclaim_param_lock); 2173 2174 if (parent) 2175 mem->swappiness = get_swappiness(parent); 2176 2177 return &mem->css; 2178free_out: 2179 for_each_node_state(node, N_POSSIBLE) 2180 free_mem_cgroup_per_zone_info(mem, node); 2181 mem_cgroup_free(mem); 2182 return ERR_PTR(-ENOMEM); 2183} 2184 2185static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss, 2186 struct cgroup *cont) 2187{ 2188 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 2189 mem->obsolete = 1; 2190 mem_cgroup_force_empty(mem, false); 2191} 2192 2193static void mem_cgroup_destroy(struct cgroup_subsys *ss, 2194 struct cgroup *cont) 2195{ 2196 mem_cgroup_free(mem_cgroup_from_cont(cont)); 2197} 2198 2199static int mem_cgroup_populate(struct cgroup_subsys *ss, 2200 struct cgroup *cont) 2201{ 2202 int ret; 2203 2204 ret = cgroup_add_files(cont, ss, mem_cgroup_files, 2205 ARRAY_SIZE(mem_cgroup_files)); 2206 2207 if (!ret) 2208 ret = register_memsw_files(cont, ss); 2209 return ret; 2210} 2211 2212static void mem_cgroup_move_task(struct cgroup_subsys *ss, 2213 struct cgroup *cont, 2214 struct cgroup *old_cont, 2215 struct task_struct *p) 2216{ 2217 mutex_lock(&memcg_tasklist); 2218 /* 2219 * FIXME: It's better to move charges of this process from old 2220 * memcg to new memcg. But it's just on TODO-List now. 2221 */ 2222 mutex_unlock(&memcg_tasklist); 2223} 2224 2225struct cgroup_subsys mem_cgroup_subsys = { 2226 .name = "memory", 2227 .subsys_id = mem_cgroup_subsys_id, 2228 .create = mem_cgroup_create, 2229 .pre_destroy = mem_cgroup_pre_destroy, 2230 .destroy = mem_cgroup_destroy, 2231 .populate = mem_cgroup_populate, 2232 .attach = mem_cgroup_move_task, 2233 .early_init = 0, 2234}; 2235 2236#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 2237 2238static int __init disable_swap_account(char *s) 2239{ 2240 really_do_swap_account = 0; 2241 return 1; 2242} 2243__setup("noswapaccount", disable_swap_account); 2244#endif 2245