memcontrol.c revision 58ae83db2a40dea15d4277d499a11dadc823c388
/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */

#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/swap.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/seq_file.h>

#include <asm/uaccess.h>

struct cgroup_subsys mem_cgroup_subsys;
static const int MEM_CGROUP_RECLAIM_RETRIES = 5;

/*
 * Statistics for the memory cgroup.
 */
enum mem_cgroup_stat_index {
	/*
	 * For MEM_CGROUP_TYPE_ALL, usage = pagecache + rss.
	 */
	MEM_CGROUP_STAT_CACHE,	/* # of pages charged as cache */
	MEM_CGROUP_STAT_RSS,	/* # of pages charged as rss */

	MEM_CGROUP_STAT_NSTATS,
};

struct mem_cgroup_stat_cpu {
	s64 count[MEM_CGROUP_STAT_NSTATS];
} ____cacheline_aligned_in_smp;

struct mem_cgroup_stat {
	struct mem_cgroup_stat_cpu cpustat[NR_CPUS];
};
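
/*
 * The statistics are kept per CPU: a writer updates only the local
 * CPU's counter (hence the irq-disabled requirement on the helper
 * below), while a reader sums over all possible CPUs, so counts
 * accumulated on a CPU that has since been hot-unplugged are not lost.
 * Readers take no lock, so a read racing with updates can be
 * transiently inexact.
 */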

/*
 * For accounting under irq disable; no need to increment the preempt
 * count.
 */
static void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat *stat,
		enum mem_cgroup_stat_index idx, int val)
{
	int cpu = smp_processor_id();
	stat->cpustat[cpu].count[idx] += val;
}

static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
		enum mem_cgroup_stat_index idx)
{
	int cpu;
	s64 ret = 0;
	for_each_possible_cpu(cpu)
		ret += stat->cpustat[cpu].count[idx];
	return ret;
}

/*
 * per-zone information in memory controller.
 */

enum mem_cgroup_zstat_index {
	MEM_CGROUP_ZSTAT_ACTIVE,
	MEM_CGROUP_ZSTAT_INACTIVE,

	NR_MEM_CGROUP_ZSTAT,
};

struct mem_cgroup_per_zone {
	unsigned long count[NR_MEM_CGROUP_ZSTAT];
};
/* Macro for accessing counter */
#define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])

struct mem_cgroup_per_node {
	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};

struct mem_cgroup_lru_info {
	struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
};

/*
 * The memory controller data structure. The memory controller controls
 * both page cache and RSS per cgroup. We would eventually like to
 * provide statistics based on the statistics developed by Rik van Riel
 * for clock-pro, to help the administrator determine what knobs to tune.
 *
 * TODO: Add a water mark for the memory controller. Reclaim will begin
 * when we hit the water mark. Maybe even add a low water mark, such
 * that no reclaim occurs from a cgroup at its low water mark; this is
 * a feature that will be implemented much later in the future.
 */
struct mem_cgroup {
	struct cgroup_subsys_state css;
	/*
	 * the counter to account for memory usage
	 */
	struct res_counter res;
	/*
	 * Per cgroup active and inactive list, similar to the
	 * per zone LRU lists.
	 * TODO: Consider making these lists per zone
	 */
	struct list_head active_list;
	struct list_head inactive_list;
	struct mem_cgroup_lru_info info;
	/*
	 * spin_lock to protect the per cgroup LRU
	 */
	spinlock_t lru_lock;
	unsigned long control_type;	/* control RSS or RSS+Pagecache */
	/*
	 * statistics.
	 */
	struct mem_cgroup_stat stat;
};

/*
 * We use the lower bit of the page->page_cgroup pointer as a bit spin
 * lock. We need to ensure that page->page_cgroup is at least two-byte
 * aligned (based on comments from Nick Piggin).
 */
#define PAGE_CGROUP_LOCK_BIT	0x0
#define PAGE_CGROUP_LOCK	(1 << PAGE_CGROUP_LOCK_BIT)

/*
 * A page_cgroup is associated with every page descriptor. It helps us
 * identify which cgroup the page is charged to.
 */
struct page_cgroup {
	struct list_head lru;		/* per cgroup LRU list */
	struct page *page;
	struct mem_cgroup *mem_cgroup;
	atomic_t ref_cnt;		/* helpful when pages move between
					   mapped and cached states */
	int flags;
};
#define PAGE_CGROUP_FLAG_CACHE	(0x1)	/* charged as cache */
#define PAGE_CGROUP_FLAG_ACTIVE	(0x2)	/* page is active in this cgroup */

static inline int page_cgroup_nid(struct page_cgroup *pc)
{
	return page_to_nid(pc->page);
}

static inline enum zone_type page_cgroup_zid(struct page_cgroup *pc)
{
	return page_zonenum(pc->page);
}

enum {
	MEM_CGROUP_TYPE_UNSPEC = 0,
	MEM_CGROUP_TYPE_MAPPED,
	MEM_CGROUP_TYPE_CACHED,
	MEM_CGROUP_TYPE_ALL,
	MEM_CGROUP_TYPE_MAX,
};

enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
	MEM_CGROUP_CHARGE_TYPE_MAPPED,
};
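
/*
 * Life cycle of a page_cgroup, as implemented by the routines below
 * (a rough sketch):
 *
 *	mem_cgroup_charge() / mem_cgroup_cache_charge()
 *	    -> mem_cgroup_charge_common(): allocate a page_cgroup, charge
 *	       the res_counter, tie it to the page and put it on the
 *	       cgroup's LRU list.
 *	mem_cgroup_uncharge(): when the last reference is dropped, untie
 *	    it from the page, uncharge the res_counter, unlink it from
 *	    the LRU list and free it.
 */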

/*
 * Always modified under the lru lock, so there is no need to
 * preempt_disable().
 */
static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, int flags,
					bool charge)
{
	int val = (charge) ? 1 : -1;
	struct mem_cgroup_stat *stat = &mem->stat;
	VM_BUG_ON(!irqs_disabled());

	if (flags & PAGE_CGROUP_FLAG_CACHE)
		__mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_CACHE, val);
	else
		__mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_RSS, val);
}

static inline struct mem_cgroup_per_zone *
mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
{
	BUG_ON(!mem->info.nodeinfo[nid]);
	return &mem->info.nodeinfo[nid]->zoneinfo[zid];
}

static inline struct mem_cgroup_per_zone *
page_cgroup_zoneinfo(struct page_cgroup *pc)
{
	struct mem_cgroup *mem = pc->mem_cgroup;
	int nid = page_cgroup_nid(pc);
	int zid = page_cgroup_zid(pc);

	return mem_cgroup_zoneinfo(mem, nid, zid);
}

static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem,
					enum mem_cgroup_zstat_index idx)
{
	int nid, zid;
	struct mem_cgroup_per_zone *mz;
	u64 total = 0;

	for_each_online_node(nid)
		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
			mz = mem_cgroup_zoneinfo(mem, nid, zid);
			total += MEM_CGROUP_ZSTAT(mz, idx);
		}
	return total;
}

static struct mem_cgroup init_mem_cgroup;

static inline
struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
{
	return container_of(cgroup_subsys_state(cont,
				mem_cgroup_subsys_id), struct mem_cgroup,
				css);
}

static inline
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
				struct mem_cgroup, css);
}

void mm_init_cgroup(struct mm_struct *mm, struct task_struct *p)
{
	struct mem_cgroup *mem;

	mem = mem_cgroup_from_task(p);
	css_get(&mem->css);
	mm->mem_cgroup = mem;
}

void mm_free_cgroup(struct mm_struct *mm)
{
	css_put(&mm->mem_cgroup->css);
}

static inline int page_cgroup_locked(struct page *page)
{
	return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT,
					&page->page_cgroup);
}

void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
{
	int locked;

	/*
	 * While resetting the page_cgroup we might not hold the
	 * page_cgroup lock. free_hot_cold_page() is an example
	 * of such a scenario.
	 */
	if (pc)
		VM_BUG_ON(!page_cgroup_locked(page));
	locked = (page->page_cgroup & PAGE_CGROUP_LOCK);
	page->page_cgroup = ((unsigned long)pc | locked);
}

struct page_cgroup *page_get_page_cgroup(struct page *page)
{
	return (struct page_cgroup *)
		(page->page_cgroup & ~PAGE_CGROUP_LOCK);
}
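
/*
 * Layout of the page->page_cgroup word, for illustration:
 *
 *	bit 0:		PAGE_CGROUP_LOCK_BIT, taken with bit_spin_lock()
 *	higher bits:	the struct page_cgroup pointer itself
 *
 * page_get_page_cgroup() masks the lock bit off before returning the
 * pointer and page_assign_page_cgroup() preserves it, which is why the
 * structure must be at least two-byte aligned.
 */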

static void __always_inline lock_page_cgroup(struct page *page)
{
	bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
	VM_BUG_ON(!page_cgroup_locked(page));
}

static void __always_inline unlock_page_cgroup(struct page *page)
{
	bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
}

/*
 * Tie a new page_cgroup to the struct page under lock_page_cgroup().
 * This can fail if the page is already tied to another page_cgroup.
 * Returns 0 on success.
 */
static int page_cgroup_assign_new_page_cgroup(struct page *page,
						struct page_cgroup *pc)
{
	int ret = 0;

	lock_page_cgroup(page);
	if (!page_get_page_cgroup(page))
		page_assign_page_cgroup(page, pc);
	else	/* the page is tied to another pc */
		ret = 1;
	unlock_page_cgroup(page);
	return ret;
}

/*
 * Clear the page->page_cgroup member under lock_page_cgroup().
 * If the given "pc" differs from the current page->page_cgroup, the
 * member is not cleared.
 * Returns the value of page->page_cgroup at the time the lock was
 * taken, so a caller can detect a failed clear by checking
 * clear_page_cgroup(page, pc) != pc.
 */
static struct page_cgroup *clear_page_cgroup(struct page *page,
						struct page_cgroup *pc)
{
	struct page_cgroup *ret;
	/* lock and clear */
	lock_page_cgroup(page);
	ret = page_get_page_cgroup(page);
	if (likely(ret == pc))
		page_assign_page_cgroup(page, NULL);
	unlock_page_cgroup(page);
	return ret;
}

static void __mem_cgroup_remove_list(struct page_cgroup *pc)
{
	int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
	struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);

	if (from)
		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1;
	else
		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1;

	mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, false);
	list_del_init(&pc->lru);
}

static void __mem_cgroup_add_list(struct page_cgroup *pc)
{
	int to = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
	struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);

	if (!to) {
		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1;
		list_add(&pc->lru, &pc->mem_cgroup->inactive_list);
	} else {
		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1;
		list_add(&pc->lru, &pc->mem_cgroup->active_list);
	}
	mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, true);
}

static void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
{
	int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
	struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);

	if (from)
		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1;
	else
		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1;

	if (active) {
		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1;
		pc->flags |= PAGE_CGROUP_FLAG_ACTIVE;
		list_move(&pc->lru, &pc->mem_cgroup->active_list);
	} else {
		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1;
		pc->flags &= ~PAGE_CGROUP_FLAG_ACTIVE;
		list_move(&pc->lru, &pc->mem_cgroup->inactive_list);
	}
}

int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
{
	int ret;

	task_lock(task);
	ret = task->mm && mm_cgroup(task->mm) == mem;
	task_unlock(task);
	return ret;
}

/*
 * This routine assumes that the appropriate zone's lru lock is already
 * held.
 */
void mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
{
	struct mem_cgroup *mem;
	if (!pc)
		return;

	mem = pc->mem_cgroup;

	spin_lock(&mem->lru_lock);
	__mem_cgroup_move_lists(pc, active);
	spin_unlock(&mem->lru_lock);
}

/*
 * Calculate the mapped_ratio under the memory controller. This is used
 * in vmscan.c to determine whether we have to reclaim mapped pages.
 */
int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
{
	long total, rss;

	/*
	 * usage is recorded in bytes, but here we assume the number of
	 * physical pages can be represented by "long" on any arch.
	 */
	total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L;
	rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
	return (int)((rss * 100L) / total);
}
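
/*
 * Worked example: with a usage of 99 pages, total becomes 99 + 1 = 100
 * (the +1 guards against division by zero), so an RSS of 50 pages
 * yields a mapped ratio of (50 * 100) / 100 = 50%.
 */

/*
 * mem_cgroup_isolate_pages() is this controller's counterpart of
 * isolate_lru_pages(): scan up to nr_to_scan page_cgroups from the
 * cgroup's active or inactive list, isolate the pages that belong to
 * zone "z" onto "dst", and return the number taken; "scanned" reports
 * how many entries were examined.
 */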

unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
					struct list_head *dst,
					unsigned long *scanned, int order,
					int mode, struct zone *z,
					struct mem_cgroup *mem_cont,
					int active)
{
	unsigned long nr_taken = 0;
	struct page *page;
	unsigned long scan;
	LIST_HEAD(pc_list);
	struct list_head *src;
	struct page_cgroup *pc, *tmp;

	if (active)
		src = &mem_cont->active_list;
	else
		src = &mem_cont->inactive_list;

	spin_lock(&mem_cont->lru_lock);
	scan = 0;
	list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
		if (scan >= nr_to_scan)
			break;
		VM_BUG_ON(!pc);
		page = pc->page;

		if (unlikely(!PageLRU(page)))
			continue;

		if (PageActive(page) && !active) {
			__mem_cgroup_move_lists(pc, true);
			continue;
		}
		if (!PageActive(page) && active) {
			__mem_cgroup_move_lists(pc, false);
			continue;
		}

		/*
		 * Reclaim, per zone
		 * TODO: make the active/inactive lists per zone
		 */
		if (page_zone(page) != z)
			continue;

		scan++;
		list_move(&pc->lru, &pc_list);

		if (__isolate_lru_page(page, mode) == 0) {
			list_move(&page->lru, dst);
			nr_taken++;
		}
	}

	list_splice(&pc_list, src);
	spin_unlock(&mem_cont->lru_lock);

	*scanned = scan;
	return nr_taken;
}
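
/*
 * Charging entry points. (Assumed call sites, outside this file: the
 * anonymous/rmap fault paths use mem_cgroup_charge(), while page cache
 * insertion uses mem_cgroup_cache_charge().)
 */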

/*
 * Charge the memory controller for page usage.
 * Return
 * 0 if the charge was successful
 * < 0 if the cgroup is over its limit
 */
static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
				gfp_t gfp_mask, enum charge_type ctype)
{
	struct mem_cgroup *mem;
	struct page_cgroup *pc;
	unsigned long flags;
	unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;

	/*
	 * Should page_cgroups go to their own slab?
	 * One could optimize the performance of the charging routine
	 * by saving a bit in the page_flags and using it as a lock
	 * to see if the cgroup page already has a page_cgroup associated
	 * with it.
	 */
retry:
	if (page) {
		lock_page_cgroup(page);
		pc = page_get_page_cgroup(page);
		/*
		 * The page_cgroup exists and
		 * the page has already been accounted.
		 */
		if (pc) {
			if (unlikely(!atomic_inc_not_zero(&pc->ref_cnt))) {
				/* is this page being uncharged? */
				unlock_page_cgroup(page);
				cpu_relax();
				goto retry;
			} else {
				unlock_page_cgroup(page);
				goto done;
			}
		}
		unlock_page_cgroup(page);
	}

	pc = kzalloc(sizeof(struct page_cgroup), gfp_mask);
	if (pc == NULL)
		goto err;

	/*
	 * We always charge the cgroup the mm_struct belongs to.
	 * The mm_struct's mem_cgroup changes on task migration if the
	 * thread group leader migrates. It's possible that mm is not
	 * set; if so, charge the init_mm (happens for pagecache usage).
	 */
	if (!mm)
		mm = &init_mm;

	rcu_read_lock();
	mem = rcu_dereference(mm->mem_cgroup);
	/*
	 * For every charge from the cgroup, increment the reference
	 * count.
	 */
	css_get(&mem->css);
	rcu_read_unlock();

	/*
	 * If we created the page_cgroup, we should free it on exceeding
	 * the cgroup limit.
	 */
	while (res_counter_charge(&mem->res, PAGE_SIZE)) {
		if (!(gfp_mask & __GFP_WAIT))
			goto out;

		if (try_to_free_mem_cgroup_pages(mem, gfp_mask))
			continue;

		/*
		 * try_to_free_mem_cgroup_pages() might not give us a
		 * full picture of reclaim. Some pages are reclaimed and
		 * might be moved to swap cache or just unmapped from
		 * the cgroup. Check the limit again to see if the
		 * reclaim reduced the current usage of the cgroup
		 * before giving up.
		 */
		if (res_counter_check_under_limit(&mem->res))
			continue;

		if (!nr_retries--) {
			mem_cgroup_out_of_memory(mem, gfp_mask);
			goto out;
		}
		congestion_wait(WRITE, HZ/10);
	}

	atomic_set(&pc->ref_cnt, 1);
	pc->mem_cgroup = mem;
	pc->page = page;
	pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
	if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE)
		pc->flags |= PAGE_CGROUP_FLAG_CACHE;

	if (!page || page_cgroup_assign_new_page_cgroup(page, pc)) {
		/*
		 * Another charge has been added to this page already.
		 * We take lock_page_cgroup(page) again and read
		 * page->page_cgroup, increment the refcnt... simply
		 * retrying is OK.
		 */
		res_counter_uncharge(&mem->res, PAGE_SIZE);
		css_put(&mem->css);
		kfree(pc);
		if (!page)
			goto done;
		goto retry;
	}

	spin_lock_irqsave(&mem->lru_lock, flags);
	/* Update the statistics vector */
	__mem_cgroup_add_list(pc);
	spin_unlock_irqrestore(&mem->lru_lock, flags);

done:
	return 0;
out:
	css_put(&mem->css);
	kfree(pc);
err:
	return -ENOMEM;
}

int mem_cgroup_charge(struct page *page, struct mm_struct *mm,
			gfp_t gfp_mask)
{
	return mem_cgroup_charge_common(page, mm, gfp_mask,
			MEM_CGROUP_CHARGE_TYPE_MAPPED);
}

/*
 * See if cached pages should be charged at all.
 */
int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
				gfp_t gfp_mask)
{
	int ret = 0;
	struct mem_cgroup *mem;
	if (!mm)
		mm = &init_mm;

	rcu_read_lock();
	mem = rcu_dereference(mm->mem_cgroup);
	css_get(&mem->css);
	rcu_read_unlock();
	if (mem->control_type == MEM_CGROUP_TYPE_ALL)
		ret = mem_cgroup_charge_common(page, mm, gfp_mask,
				MEM_CGROUP_CHARGE_TYPE_CACHE);
	css_put(&mem->css);
	return ret;
}

/*
 * Uncharging is always a welcome operation; we never complain, we
 * simply uncharge.
 */
void mem_cgroup_uncharge(struct page_cgroup *pc)
{
	struct mem_cgroup *mem;
	struct page *page;
	unsigned long flags;

	/*
	 * This can handle cases when a page is not charged at all and
	 * we are switching between handling the control_type.
	 */
	if (!pc)
		return;

	if (atomic_dec_and_test(&pc->ref_cnt)) {
		page = pc->page;
		/*
		 * Get page->page_cgroup and clear it under the lock.
		 * force_empty can drop page->page_cgroup without
		 * checking the refcnt.
		 */
		if (clear_page_cgroup(page, pc) == pc) {
			mem = pc->mem_cgroup;
			css_put(&mem->css);
			res_counter_uncharge(&mem->res, PAGE_SIZE);
			spin_lock_irqsave(&mem->lru_lock, flags);
			__mem_cgroup_remove_list(pc);
			spin_unlock_irqrestore(&mem->lru_lock, flags);
			kfree(pc);
		}
	}
}
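
/*
 * Page migration support. A rough sketch of the sequence expected from
 * the caller (which lives in mm/migrate.c, outside this file):
 *
 *	mem_cgroup_prepare_migration(page);        take an extra refcnt
 *	... migrate the page contents ...
 *	mem_cgroup_page_migration(page, newpage);  move the page_cgroup
 *	mem_cgroup_end_migration(newpage);         drop the extra refcnt
 *
 * On failure, mem_cgroup_end_migration() is expected on whichever page
 * still owns the page_cgroup.
 */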

/*
 * Returns non-zero if the page (under migration) has a valid
 * page_cgroup member; the page_cgroup's refcnt is incremented.
 */
int mem_cgroup_prepare_migration(struct page *page)
{
	struct page_cgroup *pc;
	int ret = 0;
	lock_page_cgroup(page);
	pc = page_get_page_cgroup(page);
	if (pc && atomic_inc_not_zero(&pc->ref_cnt))
		ret = 1;
	unlock_page_cgroup(page);
	return ret;
}

void mem_cgroup_end_migration(struct page *page)
{
	struct page_cgroup *pc = page_get_page_cgroup(page);
	mem_cgroup_uncharge(pc);
}

/*
 * We know both *page* and *newpage* are now not-on-LRU and PG_locked.
 * There is no race with the uncharge() routines because the page_cgroup
 * for *page* holds an extra reference taken by
 * mem_cgroup_prepare_migration().
 */
void mem_cgroup_page_migration(struct page *page, struct page *newpage)
{
	struct page_cgroup *pc;
	struct mem_cgroup *mem;
	unsigned long flags;
retry:
	pc = page_get_page_cgroup(page);
	if (!pc)
		return;
	mem = pc->mem_cgroup;
	if (clear_page_cgroup(page, pc) != pc)
		goto retry;

	spin_lock_irqsave(&mem->lru_lock, flags);

	__mem_cgroup_remove_list(pc);
	pc->page = newpage;
	lock_page_cgroup(newpage);
	page_assign_page_cgroup(newpage, pc);
	unlock_page_cgroup(newpage);
	__mem_cgroup_add_list(pc);

	spin_unlock_irqrestore(&mem->lru_lock, flags);
}

/*
 * This routine traverses the page_cgroups on the given list and drops
 * them all, ignoring page_cgroup->ref_cnt. It does not reclaim the
 * pages themselves; it only removes the page_cgroups. Work is done in
 * FORCE_UNCHARGE_BATCH chunks to bound how long mem->lru_lock is held
 * with irqs disabled.
 */
#define FORCE_UNCHARGE_BATCH	(128)
static void
mem_cgroup_force_empty_list(struct mem_cgroup *mem, struct list_head *list)
{
	struct page_cgroup *pc;
	struct page *page;
	int count;
	unsigned long flags;

retry:
	count = FORCE_UNCHARGE_BATCH;
	spin_lock_irqsave(&mem->lru_lock, flags);

	while (--count && !list_empty(list)) {
		pc = list_entry(list->prev, struct page_cgroup, lru);
		page = pc->page;
		/* Avoid race with charge */
		atomic_set(&pc->ref_cnt, 0);
		if (clear_page_cgroup(page, pc) == pc) {
			css_put(&mem->css);
			res_counter_uncharge(&mem->res, PAGE_SIZE);
			__mem_cgroup_remove_list(pc);
			kfree(pc);
		} else	/* being uncharged? ...relax and retry */
			break;
	}
	spin_unlock_irqrestore(&mem->lru_lock, flags);
	if (!list_empty(list)) {
		cond_resched();
		goto retry;
	}
}

/*
 * Make the mem_cgroup's charge 0 if there are no tasks left.
 * This makes it possible to delete the mem_cgroup.
 */
int mem_cgroup_force_empty(struct mem_cgroup *mem)
{
	int ret = -EBUSY;
	css_get(&mem->css);
	/*
	 * The page reclaim code (kswapd etc.) moves pages between the
	 * active and inactive lists while we do not hold a lock, so we
	 * have to loop here until both lists are empty.
	 */
	while (!(list_empty(&mem->active_list) &&
		 list_empty(&mem->inactive_list))) {
		if (atomic_read(&mem->css.cgroup->count) > 0)
			goto out;
		/* drop all page_cgroups in active_list */
		mem_cgroup_force_empty_list(mem, &mem->active_list);
		/* drop all page_cgroups in inactive_list */
		mem_cgroup_force_empty_list(mem, &mem->inactive_list);
	}
	ret = 0;
out:
	css_put(&mem->css);
	return ret;
}

int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp)
{
	*tmp = memparse(buf, &buf);
	if (*buf != '\0')
		return -EINVAL;

	/*
	 * Round the value up to the nearest page boundary.
	 */
	*tmp = ((*tmp + PAGE_SIZE - 1) >> PAGE_SHIFT) << PAGE_SHIFT;
	return 0;
}

static ssize_t mem_cgroup_read(struct cgroup *cont,
			struct cftype *cft, struct file *file,
			char __user *userbuf, size_t nbytes, loff_t *ppos)
{
	return res_counter_read(&mem_cgroup_from_cont(cont)->res,
				cft->private, userbuf, nbytes, ppos,
				NULL);
}

static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
				struct file *file, const char __user *userbuf,
				size_t nbytes, loff_t *ppos)
{
	return res_counter_write(&mem_cgroup_from_cont(cont)->res,
				cft->private, userbuf, nbytes, ppos,
				mem_cgroup_write_strategy);
}

static ssize_t mem_control_type_write(struct cgroup *cont,
			struct cftype *cft, struct file *file,
			const char __user *userbuf,
			size_t nbytes, loff_t *pos)
{
	int ret;
	char *buf, *end;
	unsigned long tmp;
	struct mem_cgroup *mem;

	mem = mem_cgroup_from_cont(cont);
	buf = kmalloc(nbytes + 1, GFP_KERNEL);
	ret = -ENOMEM;
	if (buf == NULL)
		goto out;

	buf[nbytes] = 0;
	ret = -EFAULT;
	if (copy_from_user(buf, userbuf, nbytes))
		goto out_free;

	ret = -EINVAL;
	tmp = simple_strtoul(buf, &end, 10);
	if (*end != '\0')
		goto out_free;

	if (tmp <= MEM_CGROUP_TYPE_UNSPEC || tmp >= MEM_CGROUP_TYPE_MAX)
		goto out_free;

	mem->control_type = tmp;
	ret = nbytes;
out_free:
	kfree(buf);
out:
	return ret;
}

static ssize_t mem_control_type_read(struct cgroup *cont,
				struct cftype *cft,
				struct file *file, char __user *userbuf,
				size_t nbytes, loff_t *ppos)
{
	unsigned long val;
	char buf[64], *s;
	struct mem_cgroup *mem;

	mem = mem_cgroup_from_cont(cont);
	s = buf;
	val = mem->control_type;
	s += sprintf(s, "%lu\n", val);
	return simple_read_from_buffer((void __user *)userbuf, nbytes,
			ppos, buf, s - buf);
}

static ssize_t mem_force_empty_write(struct cgroup *cont,
				struct cftype *cft, struct file *file,
				const char __user *userbuf,
				size_t nbytes, loff_t *ppos)
{
	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
	int ret;
	ret = mem_cgroup_force_empty(mem);
	if (!ret)
		ret = nbytes;
	return ret;
}
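
/*
 * Usage from userspace, for illustration (the mount point is just an
 * example):
 *
 *	# echo 1 > /cgroups/<group>/memory.force_empty
 *
 * The written value is ignored; any write triggers the operation.
 * While tasks remain in the group, it fails with -EBUSY.
 */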

/*
 * Note: this should be removed once cgroup supports write-only files.
 */
static ssize_t mem_force_empty_read(struct cgroup *cont,
				struct cftype *cft,
				struct file *file, char __user *userbuf,
				size_t nbytes, loff_t *ppos)
{
	return -EINVAL;
}

static const struct mem_cgroup_stat_desc {
	const char *msg;
	u64 unit;
} mem_cgroup_stat_desc[] = {
	[MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, },
	[MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, },
};

static int mem_control_stat_show(struct seq_file *m, void *arg)
{
	struct cgroup *cont = m->private;
	struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
	struct mem_cgroup_stat *stat = &mem_cont->stat;
	int i;

	for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) {
		s64 val;

		val = mem_cgroup_read_stat(stat, i);
		val *= mem_cgroup_stat_desc[i].unit;
		seq_printf(m, "%s %lld\n", mem_cgroup_stat_desc[i].msg,
				(long long)val);
	}
	/* showing # of active pages */
	{
		unsigned long active, inactive;

		inactive = mem_cgroup_get_all_zonestat(mem_cont,
						MEM_CGROUP_ZSTAT_INACTIVE);
		active = mem_cgroup_get_all_zonestat(mem_cont,
						MEM_CGROUP_ZSTAT_ACTIVE);
		seq_printf(m, "active %lu\n", active * PAGE_SIZE);
		seq_printf(m, "inactive %lu\n", inactive * PAGE_SIZE);
	}
	return 0;
}

static const struct file_operations mem_control_stat_file_operations = {
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

static int mem_control_stat_open(struct inode *unused, struct file *file)
{
	/* XXX __d_cont */
	struct cgroup *cont = file->f_dentry->d_parent->d_fsdata;

	file->f_op = &mem_control_stat_file_operations;
	return single_open(file, mem_control_stat_show, cont);
}

static struct cftype mem_cgroup_files[] = {
	{
		.name = "usage_in_bytes",
		.private = RES_USAGE,
		.read = mem_cgroup_read,
	},
	{
		.name = "limit_in_bytes",
		.private = RES_LIMIT,
		.write = mem_cgroup_write,
		.read = mem_cgroup_read,
	},
	{
		.name = "failcnt",
		.private = RES_FAILCNT,
		.read = mem_cgroup_read,
	},
	{
		.name = "control_type",
		.write = mem_control_type_write,
		.read = mem_control_type_read,
	},
	{
		.name = "force_empty",
		.write = mem_force_empty_write,
		.read = mem_force_empty_read,
	},
	{
		.name = "stat",
		.open = mem_control_stat_open,
	},
};

static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
{
	struct mem_cgroup_per_node *pn;

	pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, node);
	if (!pn)
		return 1;
	mem->info.nodeinfo[node] = pn;
	memset(pn, 0, sizeof(*pn));
	return 0;
}
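
/*
 * The root cgroup is served by the statically allocated init_mem_cgroup
 * (declared above) and is wired up as init_mm's cgroup; child groups
 * are allocated dynamically.
 */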

static struct cgroup_subsys_state *
mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
{
	struct mem_cgroup *mem;
	int node;

	if (unlikely((cont->parent) == NULL)) {
		mem = &init_mem_cgroup;
		init_mm.mem_cgroup = mem;
	} else
		mem = kzalloc(sizeof(struct mem_cgroup), GFP_KERNEL);

	if (mem == NULL)
		return NULL;

	res_counter_init(&mem->res);
	INIT_LIST_HEAD(&mem->active_list);
	INIT_LIST_HEAD(&mem->inactive_list);
	spin_lock_init(&mem->lru_lock);
	mem->control_type = MEM_CGROUP_TYPE_ALL;
	memset(&mem->info, 0, sizeof(mem->info));

	for_each_node_state(node, N_POSSIBLE)
		if (alloc_mem_cgroup_per_zone_info(mem, node))
			goto free_out;

	return &mem->css;
free_out:
	for_each_node_state(node, N_POSSIBLE)
		kfree(mem->info.nodeinfo[node]);
	if (cont->parent != NULL)
		kfree(mem);
	return NULL;
}

static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
					struct cgroup *cont)
{
	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
	mem_cgroup_force_empty(mem);
}

static void mem_cgroup_destroy(struct cgroup_subsys *ss,
				struct cgroup *cont)
{
	int node;
	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);

	for_each_node_state(node, N_POSSIBLE)
		kfree(mem->info.nodeinfo[node]);

	kfree(mem);
}

static int mem_cgroup_populate(struct cgroup_subsys *ss,
				struct cgroup *cont)
{
	return cgroup_add_files(cont, ss, mem_cgroup_files,
					ARRAY_SIZE(mem_cgroup_files));
}

static void mem_cgroup_move_task(struct cgroup_subsys *ss,
				struct cgroup *cont,
				struct cgroup *old_cont,
				struct task_struct *p)
{
	struct mm_struct *mm;
	struct mem_cgroup *mem, *old_mem;

	mm = get_task_mm(p);
	if (mm == NULL)
		return;

	mem = mem_cgroup_from_cont(cont);
	old_mem = mem_cgroup_from_cont(old_cont);

	if (mem == old_mem)
		goto out;

	/*
	 * Only thread group leaders are allowed to migrate; the
	 * mm_struct is in effect owned by the leader.
	 */
	if (p->tgid != p->pid)
		goto out;

	css_get(&mem->css);
	rcu_assign_pointer(mm->mem_cgroup, mem);
	css_put(&old_mem->css);

out:
	mmput(mm);
}

struct cgroup_subsys mem_cgroup_subsys = {
	.name = "memory",
	.subsys_id = mem_cgroup_subsys_id,
	.create = mem_cgroup_create,
	.pre_destroy = mem_cgroup_pre_destroy,
	.destroy = mem_cgroup_destroy,
	.populate = mem_cgroup_populate,
	.attach = mem_cgroup_move_task,
	.early_init = 0,
};
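
/*
 * Illustrative usage from userspace (mount point and group name are
 * examples only):
 *
 *	# mount -t cgroup -o memory none /cgroups
 *	# mkdir /cgroups/0
 *	# echo $$ > /cgroups/0/tasks
 *	# echo 4M > /cgroups/0/memory.limit_in_bytes
 *	# cat /cgroups/0/memory.usage_in_bytes
 *
 * memparse() in mem_cgroup_write_strategy() accepts the K/M/G suffixes,
 * and the value is rounded up to a whole number of pages.
 */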