memcontrol.c (revision aa3b189551ad8e5cc1d9c663735c131650238278)
/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * Memory thresholds
 * Copyright (C) 2009 Nokia Corporation
 * Author: Kirill A. Shutemov
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 */

#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/limits.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmalloc.h>
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include "internal.h"

#include <asm/uaccess.h>

#include <trace/events/vmscan.h>

struct cgroup_subsys mem_cgroup_subsys __read_mostly;
#define MEM_CGROUP_RECLAIM_RETRIES	5
struct mem_cgroup *root_mem_cgroup __read_mostly;

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
int do_swap_account __read_mostly;

/* for remembering the boot option */
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED
static int really_do_swap_account __initdata = 1;
#else
static int really_do_swap_account __initdata = 0;
#endif

#else
#define do_swap_account		(0)
#endif


/*
 * Statistics for memory cgroup.
 */
enum mem_cgroup_stat_index {
	/*
	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
	 */
	MEM_CGROUP_STAT_CACHE,		/* # of pages charged as cache */
	MEM_CGROUP_STAT_RSS,		/* # of pages charged as anon rss */
	MEM_CGROUP_STAT_FILE_MAPPED,	/* # of pages charged as file rss */
	MEM_CGROUP_STAT_SWAPOUT,	/* # of pages, swapped out */
	MEM_CGROUP_STAT_DATA,		/* end of data requires synchronization */
	MEM_CGROUP_ON_MOVE,		/* someone is moving account between groups */
	MEM_CGROUP_STAT_NSTATS,
};

enum mem_cgroup_events_index {
	MEM_CGROUP_EVENTS_PGPGIN,	/* # of pages paged in */
	MEM_CGROUP_EVENTS_PGPGOUT,	/* # of pages paged out */
	MEM_CGROUP_EVENTS_COUNT,	/* # of pages paged in/out */
	MEM_CGROUP_EVENTS_PGFAULT,	/* # of page-faults */
	MEM_CGROUP_EVENTS_PGMAJFAULT,	/* # of major page-faults */
	MEM_CGROUP_EVENTS_NSTATS,
};
/*
 * Per memcg event counter is incremented at every pagein/pageout. With THP,
 * it will be incremented by the number of pages. This counter is used to
 * trigger some periodic events. This is straightforward and better
 * than using jiffies etc. to handle periodic memcg events.
 */
enum mem_cgroup_events_target {
	MEM_CGROUP_TARGET_THRESH,
	MEM_CGROUP_TARGET_SOFTLIMIT,
	MEM_CGROUP_TARGET_NUMAINFO,
	MEM_CGROUP_NTARGETS,
};
#define THRESHOLDS_EVENTS_TARGET	(128)
#define SOFTLIMIT_EVENTS_TARGET		(1024)
#define NUMAINFO_EVENTS_TARGET		(1024)

struct mem_cgroup_stat_cpu {
	long count[MEM_CGROUP_STAT_NSTATS];
	unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
	unsigned long targets[MEM_CGROUP_NTARGETS];
};

/*
 * per-zone information in memory controller.
 */
struct mem_cgroup_per_zone {
	/*
	 * spin_lock to protect the per cgroup LRU
	 */
	struct list_head	lists[NR_LRU_LISTS];
	unsigned long		count[NR_LRU_LISTS];

	struct zone_reclaim_stat reclaim_stat;
	struct rb_node		tree_node;	/* RB tree node */
	unsigned long long	usage_in_excess;/* Set to the value by which */
						/* the soft limit is exceeded */
	bool			on_tree;
	struct mem_cgroup	*mem;		/* Back pointer, we cannot */
						/* use container_of	   */
};
/* Macro for accessing counter */
#define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])

struct mem_cgroup_per_node {
	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};

struct mem_cgroup_lru_info {
	struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
};

/*
 * Cgroups above their limits are maintained in a RB-Tree, independent of
 * their hierarchy representation
 */

struct mem_cgroup_tree_per_zone {
	struct rb_root rb_root;
	spinlock_t lock;
};

struct mem_cgroup_tree_per_node {
	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
};

struct mem_cgroup_tree {
	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};

static struct mem_cgroup_tree soft_limit_tree __read_mostly;

struct mem_cgroup_threshold {
	struct eventfd_ctx *eventfd;
	u64 threshold;
};

/* For threshold */
struct mem_cgroup_threshold_ary {
	/* An array index points to threshold just below usage. */
	int current_threshold;
	/* Size of entries[] */
	unsigned int size;
	/* Array of thresholds */
	struct mem_cgroup_threshold entries[0];
};

struct mem_cgroup_thresholds {
	/* Primary thresholds array */
	struct mem_cgroup_threshold_ary *primary;
	/*
	 * Spare threshold array.
	 * This is needed to make mem_cgroup_unregister_event() "never fail".
	 * It must be able to store at least primary->size - 1 entries.
	 */
	struct mem_cgroup_threshold_ary *spare;
};

/* for OOM */
struct mem_cgroup_eventfd_list {
	struct list_head list;
	struct eventfd_ctx *eventfd;
};

static void mem_cgroup_threshold(struct mem_cgroup *mem);
static void mem_cgroup_oom_notify(struct mem_cgroup *mem);

enum {
	SCAN_BY_LIMIT,
	SCAN_BY_SYSTEM,
	NR_SCAN_CONTEXT,
	SCAN_BY_SHRINK,		/* not recorded now */
};

enum {
	SCAN,
	SCAN_ANON,
	SCAN_FILE,
	ROTATE,
	ROTATE_ANON,
	ROTATE_FILE,
	FREED,
	FREED_ANON,
	FREED_FILE,
	ELAPSED,
	NR_SCANSTATS,
};

struct scanstat {
	spinlock_t	lock;
	unsigned long	stats[NR_SCAN_CONTEXT][NR_SCANSTATS];
	unsigned long	rootstats[NR_SCAN_CONTEXT][NR_SCANSTATS];
};

const char *scanstat_string[NR_SCANSTATS] = {
	"scanned_pages",
	"scanned_anon_pages",
	"scanned_file_pages",
	"rotated_pages",
	"rotated_anon_pages",
	"rotated_file_pages",
	"freed_pages",
	"freed_anon_pages",
	"freed_file_pages",
	"elapsed_ns",
};
#define SCANSTAT_WORD_LIMIT	"_by_limit"
#define SCANSTAT_WORD_SYSTEM	"_by_system"
#define SCANSTAT_WORD_HIERARCHY	"_under_hierarchy"


/*
 * The memory controller data structure. The memory controller controls both
 * page cache and RSS per cgroup. We would eventually like to provide
 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
 * to help the administrator determine what knobs to tune.
 *
 * TODO: Add a water mark for the memory controller. Reclaim will begin when
 * we hit the water mark. Maybe even add a low water mark, such that
 * no reclaim occurs from a cgroup at its low water mark; this is
 * a feature that will be implemented much later in the future.
 */
struct mem_cgroup {
	struct cgroup_subsys_state css;
	/*
	 * the counter to account for memory usage
	 */
	struct res_counter res;
	/*
	 * the counter to account for mem+swap usage.
	 */
	struct res_counter memsw;
	/*
	 * Per cgroup active and inactive list, similar to the
	 * per zone LRU lists.
	 */
	struct mem_cgroup_lru_info info;
	/*
	 * While reclaiming in a hierarchy, we cache the last child we
	 * reclaimed from.
	 */
	int last_scanned_child;
	int last_scanned_node;
#if MAX_NUMNODES > 1
	nodemask_t	scan_nodes;
	atomic_t	numainfo_events;
	atomic_t	numainfo_updating;
#endif
	/*
	 * Should the accounting and control be hierarchical, per subtree?
	 */
	bool use_hierarchy;

	bool		oom_lock;
	atomic_t	under_oom;

	atomic_t	refcnt;

	int	swappiness;
	/* OOM-Killer disable */
	int		oom_kill_disable;

	/* set when res.limit == memsw.limit */
	bool		memsw_is_minimum;

	/* protect arrays of thresholds */
	struct mutex thresholds_lock;

	/* thresholds for memory usage. RCU-protected */
	struct mem_cgroup_thresholds thresholds;

	/* thresholds for mem+swap usage. RCU-protected */
	struct mem_cgroup_thresholds memsw_thresholds;

	/* For oom notifier event fd */
	struct list_head oom_notify;
	/* For recording LRU-scan statistics */
	struct scanstat scanstat;
	/*
	 * Should we move charges of a task when a task is moved into this
	 * mem_cgroup ? And what type of charges should we move ?
	 */
	unsigned long	move_charge_at_immigrate;
	/*
	 * percpu counter.
	 */
	struct mem_cgroup_stat_cpu *stat;
	/*
	 * used when a cpu is offlined or other synchronizations
	 * See mem_cgroup_read_stat().
	 */
	struct mem_cgroup_stat_cpu nocpu_base;
	spinlock_t pcp_counter_lock;
};

/* Stuff for move charges at task migration. */
/*
 * Types of charges to be moved. "move_charge_at_immigrate" is treated as a
 * left-shifted bitmap of these types.
 */
enum move_type {
	MOVE_CHARGE_TYPE_ANON,	/* private anonymous page and swap of it */
	MOVE_CHARGE_TYPE_FILE,	/* file page(including tmpfs) and swap of it */
	NR_MOVE_TYPE,
};

/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
	spinlock_t	  lock; /* for from, to */
	struct mem_cgroup *from;
	struct mem_cgroup *to;
	unsigned long precharge;
	unsigned long moved_charge;
	unsigned long moved_swap;
	struct task_struct *moving_task;	/* a task moving charges */
	wait_queue_head_t waitq;		/* a waitq for other context */
} mc = {
	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};

static bool move_anon(void)
{
	return test_bit(MOVE_CHARGE_TYPE_ANON,
					&mc.to->move_charge_at_immigrate);
}

static bool move_file(void)
{
	return test_bit(MOVE_CHARGE_TYPE_FILE,
					&mc.to->move_charge_at_immigrate);
}

/*
 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
 * limit reclaim to prevent infinite loops, if they ever occur.
 */
#define	MEM_CGROUP_MAX_RECLAIM_LOOPS		(100)
#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	(2)

enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
	MEM_CGROUP_CHARGE_TYPE_MAPPED,
	MEM_CGROUP_CHARGE_TYPE_SHMEM,	/* used by page migration of shmem */
	MEM_CGROUP_CHARGE_TYPE_FORCE,	/* used by force_empty */
	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
	MEM_CGROUP_CHARGE_TYPE_DROP,	/* a page was unused swap cache */
	NR_CHARGE_TYPE,
};

/* for encoding cft->private value on file */
#define _MEM			(0)
#define _MEMSWAP		(1)
#define _OOM_TYPE		(2)
#define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
#define MEMFILE_TYPE(val)	(((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)
/* Used for OOM notifier */
#define OOM_CONTROL		(0)

/*
 * Reclaim flags for mem_cgroup_hierarchical_reclaim
 */
#define MEM_CGROUP_RECLAIM_NOSWAP_BIT	0x0
#define MEM_CGROUP_RECLAIM_NOSWAP	(1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
#define MEM_CGROUP_RECLAIM_SHRINK_BIT	0x1
#define MEM_CGROUP_RECLAIM_SHRINK	(1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
#define MEM_CGROUP_RECLAIM_SOFT_BIT	0x2
#define MEM_CGROUP_RECLAIM_SOFT		(1 << MEM_CGROUP_RECLAIM_SOFT_BIT)

static void mem_cgroup_get(struct mem_cgroup *mem);
static void mem_cgroup_put(struct mem_cgroup *mem);
static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
static void drain_all_stock_async(struct mem_cgroup *mem);

static struct mem_cgroup_per_zone *
mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
{
	return &mem->info.nodeinfo[nid]->zoneinfo[zid];
}

struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem)
{
	return &mem->css;
}

static struct mem_cgroup_per_zone *
page_cgroup_zoneinfo(struct mem_cgroup *mem, struct page *page)
{
	int nid = page_to_nid(page);
	int zid = page_zonenum(page);

	return mem_cgroup_zoneinfo(mem,
				   nid, zid);
}

static struct mem_cgroup_tree_per_zone *
soft_limit_tree_node_zone(int nid, int zid)
{
	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
}

static struct mem_cgroup_tree_per_zone *
soft_limit_tree_from_page(struct page *page)
{
	int nid = page_to_nid(page);
	int zid = page_zonenum(page);

	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
}

static void
__mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
				struct mem_cgroup_per_zone *mz,
				struct mem_cgroup_tree_per_zone *mctz,
				unsigned long long new_usage_in_excess)
{
	struct rb_node **p = &mctz->rb_root.rb_node;
	struct rb_node *parent = NULL;
	struct mem_cgroup_per_zone *mz_node;

	if (mz->on_tree)
		return;

	mz->usage_in_excess = new_usage_in_excess;
	if (!mz->usage_in_excess)
		return;
	while (*p) {
		parent = *p;
		mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
					tree_node);
		if (mz->usage_in_excess < mz_node->usage_in_excess)
			p = &(*p)->rb_left;
		/*
		 * We can't avoid mem cgroups that are over their soft
		 * limit by the same amount
		 */
		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
			p = &(*p)->rb_right;
	}
	rb_link_node(&mz->tree_node, parent, p);
	rb_insert_color(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = true;
}

static void
__mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
				struct mem_cgroup_per_zone *mz,
				struct mem_cgroup_tree_per_zone *mctz)
{
	if (!mz->on_tree)
		return;
	rb_erase(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = false;
}

static void
mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
				struct mem_cgroup_per_zone *mz,
				struct mem_cgroup_tree_per_zone *mctz)
{
	spin_lock(&mctz->lock);
	__mem_cgroup_remove_exceeded(mem, mz, mctz);
	spin_unlock(&mctz->lock);
}


static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
{
	unsigned long long excess;
	struct mem_cgroup_per_zone *mz;
	struct mem_cgroup_tree_per_zone *mctz;
	int nid = page_to_nid(page);
	int zid = page_zonenum(page);
	mctz = soft_limit_tree_from_page(page);

	/*
	 * Necessary to update all ancestors when hierarchy is used,
	 * because their event counter is not touched.
	 */
	for (; mem; mem = parent_mem_cgroup(mem)) {
		mz = mem_cgroup_zoneinfo(mem, nid, zid);
		excess = res_counter_soft_limit_excess(&mem->res);
		/*
		 * We have to update the tree if mz is on RB-tree or
		 * mem is over its softlimit.
		 */
		if (excess || mz->on_tree) {
			spin_lock(&mctz->lock);
			/* if on-tree, remove it */
			if (mz->on_tree)
				__mem_cgroup_remove_exceeded(mem, mz, mctz);
			/*
			 * Insert again. mz->usage_in_excess will be updated.
			 * If excess is 0, no tree ops.
			 */
			__mem_cgroup_insert_exceeded(mem, mz, mctz, excess);
			spin_unlock(&mctz->lock);
		}
	}
}

static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
{
	int node, zone;
	struct mem_cgroup_per_zone *mz;
	struct mem_cgroup_tree_per_zone *mctz;

	for_each_node_state(node, N_POSSIBLE) {
		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
			mz = mem_cgroup_zoneinfo(mem, node, zone);
			mctz = soft_limit_tree_node_zone(node, zone);
			mem_cgroup_remove_exceeded(mem, mz, mctz);
		}
	}
}

static struct mem_cgroup_per_zone *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
{
	struct rb_node *rightmost = NULL;
	struct mem_cgroup_per_zone *mz;

retry:
	mz = NULL;
	rightmost = rb_last(&mctz->rb_root);
	if (!rightmost)
		goto done;		/* Nothing to reclaim from */

	mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
	/*
	 * Remove the node now but someone else can add it back;
	 * we will add it back at the end of reclaim to its correct
	 * position in the tree.
	 */
	__mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
	if (!res_counter_soft_limit_excess(&mz->mem->res) ||
		!css_tryget(&mz->mem->css))
		goto retry;
done:
	return mz;
}

static struct mem_cgroup_per_zone *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
{
	struct mem_cgroup_per_zone *mz;

	spin_lock(&mctz->lock);
	mz = __mem_cgroup_largest_soft_limit_node(mctz);
	spin_unlock(&mctz->lock);
	return mz;
}

/*
 * Implementation Note: reading percpu statistics for memcg.
 *
 * Both vmstat[] and percpu_counter have thresholds and do periodic
 * synchronization to implement a "quick" read. There is a trade-off between
 * reading cost and precision of the value. Then, we may have a chance to
 * implement a periodic synchronization of the counter in memcg's counter.
 *
 * But this _read() function is used for the user interface now. The user
 * accounts memory usage by memory cgroup and _always_ requires an exact value
 * because he accounts memory. Even if we provide a quick-and-fuzzy read, we
 * always have to visit all online cpus and make the sum. So, for now,
 * unnecessary synchronization is not implemented. (just implemented for cpu
 * hotplug)
 *
 * If there are kernel internal actions which can make use of some not-exact
 * value, and reading all cpu values can be a performance bottleneck in some
 * common workload, a threshold and synchronization as in vmstat[] should be
 * implemented.
 */
static long mem_cgroup_read_stat(struct mem_cgroup *mem,
				 enum mem_cgroup_stat_index idx)
{
	long val = 0;
	int cpu;

	get_online_cpus();
	for_each_online_cpu(cpu)
		val += per_cpu(mem->stat->count[idx], cpu);
#ifdef CONFIG_HOTPLUG_CPU
	spin_lock(&mem->pcp_counter_lock);
	val += mem->nocpu_base.count[idx];
	spin_unlock(&mem->pcp_counter_lock);
#endif
	put_online_cpus();
	return val;
}

static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
					 bool charge)
{
	int val = (charge) ?
				1 : -1;
	this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
}

void mem_cgroup_pgfault(struct mem_cgroup *mem, int val)
{
	this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val);
}

void mem_cgroup_pgmajfault(struct mem_cgroup *mem, int val)
{
	this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val);
}

static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem,
					    enum mem_cgroup_events_index idx)
{
	unsigned long val = 0;
	int cpu;

	for_each_online_cpu(cpu)
		val += per_cpu(mem->stat->events[idx], cpu);
#ifdef CONFIG_HOTPLUG_CPU
	spin_lock(&mem->pcp_counter_lock);
	val += mem->nocpu_base.events[idx];
	spin_unlock(&mem->pcp_counter_lock);
#endif
	return val;
}

static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
					 bool file, int nr_pages)
{
	preempt_disable();

	if (file)
		__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages);
	else
		__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], nr_pages);

	/* pagein of a big page is an event. So, ignore page size */
	if (nr_pages > 0)
		__this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
	else {
		__this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
		nr_pages = -nr_pages; /* for event */
	}

	__this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages);

	preempt_enable();
}

unsigned long
mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *mem, int nid, int zid,
			     unsigned int lru_mask)
{
	struct mem_cgroup_per_zone *mz;
	enum lru_list l;
	unsigned long ret = 0;

	mz = mem_cgroup_zoneinfo(mem, nid, zid);

	for_each_lru(l) {
		if (BIT(l) & lru_mask)
			ret += MEM_CGROUP_ZSTAT(mz, l);
	}
	return ret;
}

static unsigned long
mem_cgroup_node_nr_lru_pages(struct mem_cgroup *mem,
			int nid, unsigned int lru_mask)
{
	u64 total = 0;
	int zid;

	for (zid = 0; zid < MAX_NR_ZONES; zid++)
		total += mem_cgroup_zone_nr_lru_pages(mem, nid, zid, lru_mask);

	return total;
}

static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *mem,
			unsigned int lru_mask)
{
	int nid;
	u64 total = 0;

	for_each_node_state(nid, N_HIGH_MEMORY)
		total += mem_cgroup_node_nr_lru_pages(mem, nid, lru_mask);
	return total;
}

static bool __memcg_event_check(struct mem_cgroup *mem, int target)
{
	unsigned long val, next;

	val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]);
	next = this_cpu_read(mem->stat->targets[target]);
	/* from time_after() in jiffies.h */
	return ((long)next - (long)val < 0);
}

static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target)
{
	unsigned long val, next;

	val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]);

	switch (target) {
	case MEM_CGROUP_TARGET_THRESH:
		next = val + THRESHOLDS_EVENTS_TARGET;
		break;
	case MEM_CGROUP_TARGET_SOFTLIMIT:
		next = val + SOFTLIMIT_EVENTS_TARGET;
		break;
	case MEM_CGROUP_TARGET_NUMAINFO:
		next = val + NUMAINFO_EVENTS_TARGET;
		break;
	default:
		return;
	}

	this_cpu_write(mem->stat->targets[target], next);
}

/*
 * Check events in order.
762 * 763 */ 764static void memcg_check_events(struct mem_cgroup *mem, struct page *page) 765{ 766 /* threshold event is triggered in finer grain than soft limit */ 767 if (unlikely(__memcg_event_check(mem, MEM_CGROUP_TARGET_THRESH))) { 768 mem_cgroup_threshold(mem); 769 __mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH); 770 if (unlikely(__memcg_event_check(mem, 771 MEM_CGROUP_TARGET_SOFTLIMIT))) { 772 mem_cgroup_update_tree(mem, page); 773 __mem_cgroup_target_update(mem, 774 MEM_CGROUP_TARGET_SOFTLIMIT); 775 } 776#if MAX_NUMNODES > 1 777 if (unlikely(__memcg_event_check(mem, 778 MEM_CGROUP_TARGET_NUMAINFO))) { 779 atomic_inc(&mem->numainfo_events); 780 __mem_cgroup_target_update(mem, 781 MEM_CGROUP_TARGET_NUMAINFO); 782 } 783#endif 784 } 785} 786 787static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) 788{ 789 return container_of(cgroup_subsys_state(cont, 790 mem_cgroup_subsys_id), struct mem_cgroup, 791 css); 792} 793 794struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) 795{ 796 /* 797 * mm_update_next_owner() may clear mm->owner to NULL 798 * if it races with swapoff, page migration, etc. 799 * So this can be called with p == NULL. 800 */ 801 if (unlikely(!p)) 802 return NULL; 803 804 return container_of(task_subsys_state(p, mem_cgroup_subsys_id), 805 struct mem_cgroup, css); 806} 807 808struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) 809{ 810 struct mem_cgroup *mem = NULL; 811 812 if (!mm) 813 return NULL; 814 /* 815 * Because we have no locks, mm->owner's may be being moved to other 816 * cgroup. We use css_tryget() here even if this looks 817 * pessimistic (rather than adding locks here). 818 */ 819 rcu_read_lock(); 820 do { 821 mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); 822 if (unlikely(!mem)) 823 break; 824 } while (!css_tryget(&mem->css)); 825 rcu_read_unlock(); 826 return mem; 827} 828 829/* The caller has to guarantee "mem" exists before calling this */ 830static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem) 831{ 832 struct cgroup_subsys_state *css; 833 int found; 834 835 if (!mem) /* ROOT cgroup has the smallest ID */ 836 return root_mem_cgroup; /*css_put/get against root is ignored*/ 837 if (!mem->use_hierarchy) { 838 if (css_tryget(&mem->css)) 839 return mem; 840 return NULL; 841 } 842 rcu_read_lock(); 843 /* 844 * searching a memory cgroup which has the smallest ID under given 845 * ROOT cgroup. 
(ID >= 1) 846 */ 847 css = css_get_next(&mem_cgroup_subsys, 1, &mem->css, &found); 848 if (css && css_tryget(css)) 849 mem = container_of(css, struct mem_cgroup, css); 850 else 851 mem = NULL; 852 rcu_read_unlock(); 853 return mem; 854} 855 856static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter, 857 struct mem_cgroup *root, 858 bool cond) 859{ 860 int nextid = css_id(&iter->css) + 1; 861 int found; 862 int hierarchy_used; 863 struct cgroup_subsys_state *css; 864 865 hierarchy_used = iter->use_hierarchy; 866 867 css_put(&iter->css); 868 /* If no ROOT, walk all, ignore hierarchy */ 869 if (!cond || (root && !hierarchy_used)) 870 return NULL; 871 872 if (!root) 873 root = root_mem_cgroup; 874 875 do { 876 iter = NULL; 877 rcu_read_lock(); 878 879 css = css_get_next(&mem_cgroup_subsys, nextid, 880 &root->css, &found); 881 if (css && css_tryget(css)) 882 iter = container_of(css, struct mem_cgroup, css); 883 rcu_read_unlock(); 884 /* If css is NULL, no more cgroups will be found */ 885 nextid = found + 1; 886 } while (css && !iter); 887 888 return iter; 889} 890/* 891 * for_eacn_mem_cgroup_tree() for visiting all cgroup under tree. Please 892 * be careful that "break" loop is not allowed. We have reference count. 893 * Instead of that modify "cond" to be false and "continue" to exit the loop. 894 */ 895#define for_each_mem_cgroup_tree_cond(iter, root, cond) \ 896 for (iter = mem_cgroup_start_loop(root);\ 897 iter != NULL;\ 898 iter = mem_cgroup_get_next(iter, root, cond)) 899 900#define for_each_mem_cgroup_tree(iter, root) \ 901 for_each_mem_cgroup_tree_cond(iter, root, true) 902 903#define for_each_mem_cgroup_all(iter) \ 904 for_each_mem_cgroup_tree_cond(iter, NULL, true) 905 906 907static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) 908{ 909 return (mem == root_mem_cgroup); 910} 911 912void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) 913{ 914 struct mem_cgroup *mem; 915 916 if (!mm) 917 return; 918 919 rcu_read_lock(); 920 mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); 921 if (unlikely(!mem)) 922 goto out; 923 924 switch (idx) { 925 case PGMAJFAULT: 926 mem_cgroup_pgmajfault(mem, 1); 927 break; 928 case PGFAULT: 929 mem_cgroup_pgfault(mem, 1); 930 break; 931 default: 932 BUG(); 933 } 934out: 935 rcu_read_unlock(); 936} 937EXPORT_SYMBOL(mem_cgroup_count_vm_event); 938 939/* 940 * Following LRU functions are allowed to be used without PCG_LOCK. 941 * Operations are called by routine of global LRU independently from memcg. 942 * What we have to take care of here is validness of pc->mem_cgroup. 943 * 944 * Changes to pc->mem_cgroup happens when 945 * 1. charge 946 * 2. moving account 947 * In typical case, "charge" is done before add-to-lru. Exception is SwapCache. 948 * It is added to LRU before charge. 949 * If PCG_USED bit is not set, page_cgroup is not added to this private LRU. 950 * When moving account, the page is not on LRU. It's isolated. 951 */ 952 953void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) 954{ 955 struct page_cgroup *pc; 956 struct mem_cgroup_per_zone *mz; 957 958 if (mem_cgroup_disabled()) 959 return; 960 pc = lookup_page_cgroup(page); 961 /* can happen while we handle swapcache. */ 962 if (!TestClearPageCgroupAcctLRU(pc)) 963 return; 964 VM_BUG_ON(!pc->mem_cgroup); 965 /* 966 * We don't check PCG_USED bit. It's cleared when the "page" is finally 967 * removed from global LRU. 968 */ 969 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); 970 /* huge page split is done under lru_lock. 
so, we have no races. */ 971 MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page); 972 if (mem_cgroup_is_root(pc->mem_cgroup)) 973 return; 974 VM_BUG_ON(list_empty(&pc->lru)); 975 list_del_init(&pc->lru); 976} 977 978void mem_cgroup_del_lru(struct page *page) 979{ 980 mem_cgroup_del_lru_list(page, page_lru(page)); 981} 982 983/* 984 * Writeback is about to end against a page which has been marked for immediate 985 * reclaim. If it still appears to be reclaimable, move it to the tail of the 986 * inactive list. 987 */ 988void mem_cgroup_rotate_reclaimable_page(struct page *page) 989{ 990 struct mem_cgroup_per_zone *mz; 991 struct page_cgroup *pc; 992 enum lru_list lru = page_lru(page); 993 994 if (mem_cgroup_disabled()) 995 return; 996 997 pc = lookup_page_cgroup(page); 998 /* unused or root page is not rotated. */ 999 if (!PageCgroupUsed(pc)) 1000 return; 1001 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ 1002 smp_rmb(); 1003 if (mem_cgroup_is_root(pc->mem_cgroup)) 1004 return; 1005 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); 1006 list_move_tail(&pc->lru, &mz->lists[lru]); 1007} 1008 1009void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) 1010{ 1011 struct mem_cgroup_per_zone *mz; 1012 struct page_cgroup *pc; 1013 1014 if (mem_cgroup_disabled()) 1015 return; 1016 1017 pc = lookup_page_cgroup(page); 1018 /* unused or root page is not rotated. */ 1019 if (!PageCgroupUsed(pc)) 1020 return; 1021 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ 1022 smp_rmb(); 1023 if (mem_cgroup_is_root(pc->mem_cgroup)) 1024 return; 1025 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); 1026 list_move(&pc->lru, &mz->lists[lru]); 1027} 1028 1029void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) 1030{ 1031 struct page_cgroup *pc; 1032 struct mem_cgroup_per_zone *mz; 1033 1034 if (mem_cgroup_disabled()) 1035 return; 1036 pc = lookup_page_cgroup(page); 1037 VM_BUG_ON(PageCgroupAcctLRU(pc)); 1038 if (!PageCgroupUsed(pc)) 1039 return; 1040 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ 1041 smp_rmb(); 1042 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); 1043 /* huge page split is done under lru_lock. so, we have no races. */ 1044 MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); 1045 SetPageCgroupAcctLRU(pc); 1046 if (mem_cgroup_is_root(pc->mem_cgroup)) 1047 return; 1048 list_add(&pc->lru, &mz->lists[lru]); 1049} 1050 1051/* 1052 * At handling SwapCache and other FUSE stuff, pc->mem_cgroup may be changed 1053 * while it's linked to lru because the page may be reused after it's fully 1054 * uncharged. To handle that, unlink page_cgroup from LRU when charge it again. 1055 * It's done under lock_page and expected that zone->lru_lock isnever held. 1056 */ 1057static void mem_cgroup_lru_del_before_commit(struct page *page) 1058{ 1059 unsigned long flags; 1060 struct zone *zone = page_zone(page); 1061 struct page_cgroup *pc = lookup_page_cgroup(page); 1062 1063 /* 1064 * Doing this check without taking ->lru_lock seems wrong but this 1065 * is safe. Because if page_cgroup's USED bit is unset, the page 1066 * will not be added to any memcg's LRU. If page_cgroup's USED bit is 1067 * set, the commit after this will fail, anyway. 1068 * This all charge/uncharge is done under some mutual execustion. 1069 * So, we don't need to taking care of changes in USED bit. 1070 */ 1071 if (likely(!PageLRU(page))) 1072 return; 1073 1074 spin_lock_irqsave(&zone->lru_lock, flags); 1075 /* 1076 * Forget old LRU when this page_cgroup is *not* used. 
This Used bit 1077 * is guarded by lock_page() because the page is SwapCache. 1078 */ 1079 if (!PageCgroupUsed(pc)) 1080 mem_cgroup_del_lru_list(page, page_lru(page)); 1081 spin_unlock_irqrestore(&zone->lru_lock, flags); 1082} 1083 1084static void mem_cgroup_lru_add_after_commit(struct page *page) 1085{ 1086 unsigned long flags; 1087 struct zone *zone = page_zone(page); 1088 struct page_cgroup *pc = lookup_page_cgroup(page); 1089 1090 /* taking care of that the page is added to LRU while we commit it */ 1091 if (likely(!PageLRU(page))) 1092 return; 1093 spin_lock_irqsave(&zone->lru_lock, flags); 1094 /* link when the page is linked to LRU but page_cgroup isn't */ 1095 if (PageLRU(page) && !PageCgroupAcctLRU(pc)) 1096 mem_cgroup_add_lru_list(page, page_lru(page)); 1097 spin_unlock_irqrestore(&zone->lru_lock, flags); 1098} 1099 1100 1101void mem_cgroup_move_lists(struct page *page, 1102 enum lru_list from, enum lru_list to) 1103{ 1104 if (mem_cgroup_disabled()) 1105 return; 1106 mem_cgroup_del_lru_list(page, from); 1107 mem_cgroup_add_lru_list(page, to); 1108} 1109 1110/* 1111 * Checks whether given mem is same or in the root_mem's 1112 * hierarchy subtree 1113 */ 1114static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_mem, 1115 struct mem_cgroup *mem) 1116{ 1117 if (root_mem != mem) { 1118 return (root_mem->use_hierarchy && 1119 css_is_ancestor(&mem->css, &root_mem->css)); 1120 } 1121 1122 return true; 1123} 1124 1125int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) 1126{ 1127 int ret; 1128 struct mem_cgroup *curr = NULL; 1129 struct task_struct *p; 1130 1131 p = find_lock_task_mm(task); 1132 if (!p) 1133 return 0; 1134 curr = try_get_mem_cgroup_from_mm(p->mm); 1135 task_unlock(p); 1136 if (!curr) 1137 return 0; 1138 /* 1139 * We should check use_hierarchy of "mem" not "curr". Because checking 1140 * use_hierarchy of "curr" here make this function true if hierarchy is 1141 * enabled in "curr" and "curr" is a child of "mem" in *cgroup* 1142 * hierarchy(even if use_hierarchy is disabled in "mem"). 
1143 */ 1144 ret = mem_cgroup_same_or_subtree(mem, curr); 1145 css_put(&curr->css); 1146 return ret; 1147} 1148 1149static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages) 1150{ 1151 unsigned long active; 1152 unsigned long inactive; 1153 unsigned long gb; 1154 unsigned long inactive_ratio; 1155 1156 inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON)); 1157 active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON)); 1158 1159 gb = (inactive + active) >> (30 - PAGE_SHIFT); 1160 if (gb) 1161 inactive_ratio = int_sqrt(10 * gb); 1162 else 1163 inactive_ratio = 1; 1164 1165 if (present_pages) { 1166 present_pages[0] = inactive; 1167 present_pages[1] = active; 1168 } 1169 1170 return inactive_ratio; 1171} 1172 1173int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg) 1174{ 1175 unsigned long active; 1176 unsigned long inactive; 1177 unsigned long present_pages[2]; 1178 unsigned long inactive_ratio; 1179 1180 inactive_ratio = calc_inactive_ratio(memcg, present_pages); 1181 1182 inactive = present_pages[0]; 1183 active = present_pages[1]; 1184 1185 if (inactive * inactive_ratio < active) 1186 return 1; 1187 1188 return 0; 1189} 1190 1191int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg) 1192{ 1193 unsigned long active; 1194 unsigned long inactive; 1195 1196 inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE)); 1197 active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE)); 1198 1199 return (active > inactive); 1200} 1201 1202struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, 1203 struct zone *zone) 1204{ 1205 int nid = zone_to_nid(zone); 1206 int zid = zone_idx(zone); 1207 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); 1208 1209 return &mz->reclaim_stat; 1210} 1211 1212struct zone_reclaim_stat * 1213mem_cgroup_get_reclaim_stat_from_page(struct page *page) 1214{ 1215 struct page_cgroup *pc; 1216 struct mem_cgroup_per_zone *mz; 1217 1218 if (mem_cgroup_disabled()) 1219 return NULL; 1220 1221 pc = lookup_page_cgroup(page); 1222 if (!PageCgroupUsed(pc)) 1223 return NULL; 1224 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. 
*/ 1225 smp_rmb(); 1226 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); 1227 return &mz->reclaim_stat; 1228} 1229 1230unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, 1231 struct list_head *dst, 1232 unsigned long *scanned, int order, 1233 int mode, struct zone *z, 1234 struct mem_cgroup *mem_cont, 1235 int active, int file) 1236{ 1237 unsigned long nr_taken = 0; 1238 struct page *page; 1239 unsigned long scan; 1240 LIST_HEAD(pc_list); 1241 struct list_head *src; 1242 struct page_cgroup *pc, *tmp; 1243 int nid = zone_to_nid(z); 1244 int zid = zone_idx(z); 1245 struct mem_cgroup_per_zone *mz; 1246 int lru = LRU_FILE * file + active; 1247 int ret; 1248 1249 BUG_ON(!mem_cont); 1250 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); 1251 src = &mz->lists[lru]; 1252 1253 scan = 0; 1254 list_for_each_entry_safe_reverse(pc, tmp, src, lru) { 1255 if (scan >= nr_to_scan) 1256 break; 1257 1258 if (unlikely(!PageCgroupUsed(pc))) 1259 continue; 1260 1261 page = lookup_cgroup_page(pc); 1262 1263 if (unlikely(!PageLRU(page))) 1264 continue; 1265 1266 scan++; 1267 ret = __isolate_lru_page(page, mode, file); 1268 switch (ret) { 1269 case 0: 1270 list_move(&page->lru, dst); 1271 mem_cgroup_del_lru(page); 1272 nr_taken += hpage_nr_pages(page); 1273 break; 1274 case -EBUSY: 1275 /* we don't affect global LRU but rotate in our LRU */ 1276 mem_cgroup_rotate_lru_list(page, page_lru(page)); 1277 break; 1278 default: 1279 break; 1280 } 1281 } 1282 1283 *scanned = scan; 1284 1285 trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken, 1286 0, 0, 0, mode); 1287 1288 return nr_taken; 1289} 1290 1291#define mem_cgroup_from_res_counter(counter, member) \ 1292 container_of(counter, struct mem_cgroup, member) 1293 1294/** 1295 * mem_cgroup_margin - calculate chargeable space of a memory cgroup 1296 * @mem: the memory cgroup 1297 * 1298 * Returns the maximum amount of memory @mem can be charged with, in 1299 * pages. 1300 */ 1301static unsigned long mem_cgroup_margin(struct mem_cgroup *mem) 1302{ 1303 unsigned long long margin; 1304 1305 margin = res_counter_margin(&mem->res); 1306 if (do_swap_account) 1307 margin = min(margin, res_counter_margin(&mem->memsw)); 1308 return margin >> PAGE_SHIFT; 1309} 1310 1311int mem_cgroup_swappiness(struct mem_cgroup *memcg) 1312{ 1313 struct cgroup *cgrp = memcg->css.cgroup; 1314 1315 /* root ? */ 1316 if (cgrp->parent == NULL) 1317 return vm_swappiness; 1318 1319 return memcg->swappiness; 1320} 1321 1322static void mem_cgroup_start_move(struct mem_cgroup *mem) 1323{ 1324 int cpu; 1325 1326 get_online_cpus(); 1327 spin_lock(&mem->pcp_counter_lock); 1328 for_each_online_cpu(cpu) 1329 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1; 1330 mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1; 1331 spin_unlock(&mem->pcp_counter_lock); 1332 put_online_cpus(); 1333 1334 synchronize_rcu(); 1335} 1336 1337static void mem_cgroup_end_move(struct mem_cgroup *mem) 1338{ 1339 int cpu; 1340 1341 if (!mem) 1342 return; 1343 get_online_cpus(); 1344 spin_lock(&mem->pcp_counter_lock); 1345 for_each_online_cpu(cpu) 1346 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1; 1347 mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1; 1348 spin_unlock(&mem->pcp_counter_lock); 1349 put_online_cpus(); 1350} 1351/* 1352 * 2 routines for checking "mem" is under move_account() or not. 1353 * 1354 * mem_cgroup_stealed() - checking a cgroup is mc.from or not. This is used 1355 * for avoiding race in accounting. If true, 1356 * pc->mem_cgroup may be overwritten. 
1357 * 1358 * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or 1359 * under hierarchy of moving cgroups. This is for 1360 * waiting at hith-memory prressure caused by "move". 1361 */ 1362 1363static bool mem_cgroup_stealed(struct mem_cgroup *mem) 1364{ 1365 VM_BUG_ON(!rcu_read_lock_held()); 1366 return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0; 1367} 1368 1369static bool mem_cgroup_under_move(struct mem_cgroup *mem) 1370{ 1371 struct mem_cgroup *from; 1372 struct mem_cgroup *to; 1373 bool ret = false; 1374 /* 1375 * Unlike task_move routines, we access mc.to, mc.from not under 1376 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead. 1377 */ 1378 spin_lock(&mc.lock); 1379 from = mc.from; 1380 to = mc.to; 1381 if (!from) 1382 goto unlock; 1383 1384 ret = mem_cgroup_same_or_subtree(mem, from) 1385 || mem_cgroup_same_or_subtree(mem, to); 1386unlock: 1387 spin_unlock(&mc.lock); 1388 return ret; 1389} 1390 1391static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem) 1392{ 1393 if (mc.moving_task && current != mc.moving_task) { 1394 if (mem_cgroup_under_move(mem)) { 1395 DEFINE_WAIT(wait); 1396 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE); 1397 /* moving charge context might have finished. */ 1398 if (mc.moving_task) 1399 schedule(); 1400 finish_wait(&mc.waitq, &wait); 1401 return true; 1402 } 1403 } 1404 return false; 1405} 1406 1407/** 1408 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. 1409 * @memcg: The memory cgroup that went over limit 1410 * @p: Task that is going to be killed 1411 * 1412 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is 1413 * enabled 1414 */ 1415void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) 1416{ 1417 struct cgroup *task_cgrp; 1418 struct cgroup *mem_cgrp; 1419 /* 1420 * Need a buffer in BSS, can't rely on allocations. The code relies 1421 * on the assumption that OOM is serialized for memory controller. 1422 * If this assumption is broken, revisit this code. 
1423 */ 1424 static char memcg_name[PATH_MAX]; 1425 int ret; 1426 1427 if (!memcg || !p) 1428 return; 1429 1430 1431 rcu_read_lock(); 1432 1433 mem_cgrp = memcg->css.cgroup; 1434 task_cgrp = task_cgroup(p, mem_cgroup_subsys_id); 1435 1436 ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX); 1437 if (ret < 0) { 1438 /* 1439 * Unfortunately, we are unable to convert to a useful name 1440 * But we'll still print out the usage information 1441 */ 1442 rcu_read_unlock(); 1443 goto done; 1444 } 1445 rcu_read_unlock(); 1446 1447 printk(KERN_INFO "Task in %s killed", memcg_name); 1448 1449 rcu_read_lock(); 1450 ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX); 1451 if (ret < 0) { 1452 rcu_read_unlock(); 1453 goto done; 1454 } 1455 rcu_read_unlock(); 1456 1457 /* 1458 * Continues from above, so we don't need an KERN_ level 1459 */ 1460 printk(KERN_CONT " as a result of limit of %s\n", memcg_name); 1461done: 1462 1463 printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n", 1464 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, 1465 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, 1466 res_counter_read_u64(&memcg->res, RES_FAILCNT)); 1467 printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, " 1468 "failcnt %llu\n", 1469 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, 1470 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, 1471 res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); 1472} 1473 1474/* 1475 * This function returns the number of memcg under hierarchy tree. Returns 1476 * 1(self count) if no children. 1477 */ 1478static int mem_cgroup_count_children(struct mem_cgroup *mem) 1479{ 1480 int num = 0; 1481 struct mem_cgroup *iter; 1482 1483 for_each_mem_cgroup_tree(iter, mem) 1484 num++; 1485 return num; 1486} 1487 1488/* 1489 * Return the memory (and swap, if configured) limit for a memcg. 1490 */ 1491u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) 1492{ 1493 u64 limit; 1494 u64 memsw; 1495 1496 limit = res_counter_read_u64(&memcg->res, RES_LIMIT); 1497 limit += total_swap_pages << PAGE_SHIFT; 1498 1499 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 1500 /* 1501 * If memsw is finite and limits the amount of swap space available 1502 * to this memcg, return that limit. 1503 */ 1504 return min(limit, memsw); 1505} 1506 1507/* 1508 * Visit the first child (need not be the first child as per the ordering 1509 * of the cgroup list, since we track last_scanned_child) of @mem and use 1510 * that to reclaim free pages from. 1511 */ 1512static struct mem_cgroup * 1513mem_cgroup_select_victim(struct mem_cgroup *root_mem) 1514{ 1515 struct mem_cgroup *ret = NULL; 1516 struct cgroup_subsys_state *css; 1517 int nextid, found; 1518 1519 if (!root_mem->use_hierarchy) { 1520 css_get(&root_mem->css); 1521 ret = root_mem; 1522 } 1523 1524 while (!ret) { 1525 rcu_read_lock(); 1526 nextid = root_mem->last_scanned_child + 1; 1527 css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css, 1528 &found); 1529 if (css && css_tryget(css)) 1530 ret = container_of(css, struct mem_cgroup, css); 1531 1532 rcu_read_unlock(); 1533 /* Updates scanning parameter */ 1534 if (!css) { 1535 /* this means start scan from ID:1 */ 1536 root_mem->last_scanned_child = 0; 1537 } else 1538 root_mem->last_scanned_child = found; 1539 } 1540 1541 return ret; 1542} 1543 1544/** 1545 * test_mem_cgroup_node_reclaimable 1546 * @mem: the target memcg 1547 * @nid: the node ID to be checked. 1548 * @noswap : specify true here if the user wants flle only information. 
1549 * 1550 * This function returns whether the specified memcg contains any 1551 * reclaimable pages on a node. Returns true if there are any reclaimable 1552 * pages in the node. 1553 */ 1554static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem, 1555 int nid, bool noswap) 1556{ 1557 if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_FILE)) 1558 return true; 1559 if (noswap || !total_swap_pages) 1560 return false; 1561 if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_ANON)) 1562 return true; 1563 return false; 1564 1565} 1566#if MAX_NUMNODES > 1 1567 1568/* 1569 * Always updating the nodemask is not very good - even if we have an empty 1570 * list or the wrong list here, we can start from some node and traverse all 1571 * nodes based on the zonelist. So update the list loosely once per 10 secs. 1572 * 1573 */ 1574static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem) 1575{ 1576 int nid; 1577 /* 1578 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET 1579 * pagein/pageout changes since the last update. 1580 */ 1581 if (!atomic_read(&mem->numainfo_events)) 1582 return; 1583 if (atomic_inc_return(&mem->numainfo_updating) > 1) 1584 return; 1585 1586 /* make a nodemask where this memcg uses memory from */ 1587 mem->scan_nodes = node_states[N_HIGH_MEMORY]; 1588 1589 for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) { 1590 1591 if (!test_mem_cgroup_node_reclaimable(mem, nid, false)) 1592 node_clear(nid, mem->scan_nodes); 1593 } 1594 1595 atomic_set(&mem->numainfo_events, 0); 1596 atomic_set(&mem->numainfo_updating, 0); 1597} 1598 1599/* 1600 * Selecting a node where we start reclaim from. Because what we need is just 1601 * reducing usage counter, start from anywhere is O,K. Considering 1602 * memory reclaim from current node, there are pros. and cons. 1603 * 1604 * Freeing memory from current node means freeing memory from a node which 1605 * we'll use or we've used. So, it may make LRU bad. And if several threads 1606 * hit limits, it will see a contention on a node. But freeing from remote 1607 * node means more costs for memory reclaim because of memory latency. 1608 * 1609 * Now, we use round-robin. Better algorithm is welcomed. 1610 */ 1611int mem_cgroup_select_victim_node(struct mem_cgroup *mem) 1612{ 1613 int node; 1614 1615 mem_cgroup_may_update_nodemask(mem); 1616 node = mem->last_scanned_node; 1617 1618 node = next_node(node, mem->scan_nodes); 1619 if (node == MAX_NUMNODES) 1620 node = first_node(mem->scan_nodes); 1621 /* 1622 * We call this when we hit limit, not when pages are added to LRU. 1623 * No LRU may hold pages because all pages are UNEVICTABLE or 1624 * memcg is too small and all pages are not on LRU. In that case, 1625 * we use curret node. 1626 */ 1627 if (unlikely(node == MAX_NUMNODES)) 1628 node = numa_node_id(); 1629 1630 mem->last_scanned_node = node; 1631 return node; 1632} 1633 1634/* 1635 * Check all nodes whether it contains reclaimable pages or not. 1636 * For quick scan, we make use of scan_nodes. This will allow us to skip 1637 * unused nodes. But scan_nodes is lazily updated and may not cotain 1638 * enough new information. We need to do double check. 1639 */ 1640bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap) 1641{ 1642 int nid; 1643 1644 /* 1645 * quick check...making use of scan_node. 1646 * We can skip unused nodes. 
1647 */ 1648 if (!nodes_empty(mem->scan_nodes)) { 1649 for (nid = first_node(mem->scan_nodes); 1650 nid < MAX_NUMNODES; 1651 nid = next_node(nid, mem->scan_nodes)) { 1652 1653 if (test_mem_cgroup_node_reclaimable(mem, nid, noswap)) 1654 return true; 1655 } 1656 } 1657 /* 1658 * Check rest of nodes. 1659 */ 1660 for_each_node_state(nid, N_HIGH_MEMORY) { 1661 if (node_isset(nid, mem->scan_nodes)) 1662 continue; 1663 if (test_mem_cgroup_node_reclaimable(mem, nid, noswap)) 1664 return true; 1665 } 1666 return false; 1667} 1668 1669#else 1670int mem_cgroup_select_victim_node(struct mem_cgroup *mem) 1671{ 1672 return 0; 1673} 1674 1675bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap) 1676{ 1677 return test_mem_cgroup_node_reclaimable(mem, 0, noswap); 1678} 1679#endif 1680 1681static void __mem_cgroup_record_scanstat(unsigned long *stats, 1682 struct memcg_scanrecord *rec) 1683{ 1684 1685 stats[SCAN] += rec->nr_scanned[0] + rec->nr_scanned[1]; 1686 stats[SCAN_ANON] += rec->nr_scanned[0]; 1687 stats[SCAN_FILE] += rec->nr_scanned[1]; 1688 1689 stats[ROTATE] += rec->nr_rotated[0] + rec->nr_rotated[1]; 1690 stats[ROTATE_ANON] += rec->nr_rotated[0]; 1691 stats[ROTATE_FILE] += rec->nr_rotated[1]; 1692 1693 stats[FREED] += rec->nr_freed[0] + rec->nr_freed[1]; 1694 stats[FREED_ANON] += rec->nr_freed[0]; 1695 stats[FREED_FILE] += rec->nr_freed[1]; 1696 1697 stats[ELAPSED] += rec->elapsed; 1698} 1699 1700static void mem_cgroup_record_scanstat(struct memcg_scanrecord *rec) 1701{ 1702 struct mem_cgroup *mem; 1703 int context = rec->context; 1704 1705 if (context >= NR_SCAN_CONTEXT) 1706 return; 1707 1708 mem = rec->mem; 1709 spin_lock(&mem->scanstat.lock); 1710 __mem_cgroup_record_scanstat(mem->scanstat.stats[context], rec); 1711 spin_unlock(&mem->scanstat.lock); 1712 1713 mem = rec->root; 1714 spin_lock(&mem->scanstat.lock); 1715 __mem_cgroup_record_scanstat(mem->scanstat.rootstats[context], rec); 1716 spin_unlock(&mem->scanstat.lock); 1717} 1718 1719/* 1720 * Scan the hierarchy if needed to reclaim memory. We remember the last child 1721 * we reclaimed from, so that we don't end up penalizing one child extensively 1722 * based on its position in the children list. 1723 * 1724 * root_mem is the original ancestor that we've been reclaim from. 1725 * 1726 * We give up and return to the caller when we visit root_mem twice. 1727 * (other groups can be removed while we're walking....) 1728 * 1729 * If shrink==true, for avoiding to free too much, this returns immedieately. 1730 */ 1731static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, 1732 struct zone *zone, 1733 gfp_t gfp_mask, 1734 unsigned long reclaim_options, 1735 unsigned long *total_scanned) 1736{ 1737 struct mem_cgroup *victim; 1738 int ret, total = 0; 1739 int loop = 0; 1740 bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP; 1741 bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; 1742 bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; 1743 struct memcg_scanrecord rec; 1744 unsigned long excess; 1745 unsigned long scanned; 1746 1747 excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; 1748 1749 /* If memsw_is_minimum==1, swap-out is of-no-use. 
*/ 1750 if (!check_soft && !shrink && root_mem->memsw_is_minimum) 1751 noswap = true; 1752 1753 if (shrink) 1754 rec.context = SCAN_BY_SHRINK; 1755 else if (check_soft) 1756 rec.context = SCAN_BY_SYSTEM; 1757 else 1758 rec.context = SCAN_BY_LIMIT; 1759 1760 rec.root = root_mem; 1761 1762 while (1) { 1763 victim = mem_cgroup_select_victim(root_mem); 1764 if (victim == root_mem) { 1765 loop++; 1766 /* 1767 * We are not draining per cpu cached charges during 1768 * soft limit reclaim because global reclaim doesn't 1769 * care about charges. It tries to free some memory and 1770 * charges will not give any. 1771 */ 1772 if (!check_soft && loop >= 1) 1773 drain_all_stock_async(root_mem); 1774 if (loop >= 2) { 1775 /* 1776 * If we have not been able to reclaim 1777 * anything, it might because there are 1778 * no reclaimable pages under this hierarchy 1779 */ 1780 if (!check_soft || !total) { 1781 css_put(&victim->css); 1782 break; 1783 } 1784 /* 1785 * We want to do more targeted reclaim. 1786 * excess >> 2 is not to excessive so as to 1787 * reclaim too much, nor too less that we keep 1788 * coming back to reclaim from this cgroup 1789 */ 1790 if (total >= (excess >> 2) || 1791 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) { 1792 css_put(&victim->css); 1793 break; 1794 } 1795 } 1796 } 1797 if (!mem_cgroup_reclaimable(victim, noswap)) { 1798 /* this cgroup's local usage == 0 */ 1799 css_put(&victim->css); 1800 continue; 1801 } 1802 rec.mem = victim; 1803 rec.nr_scanned[0] = 0; 1804 rec.nr_scanned[1] = 0; 1805 rec.nr_rotated[0] = 0; 1806 rec.nr_rotated[1] = 0; 1807 rec.nr_freed[0] = 0; 1808 rec.nr_freed[1] = 0; 1809 rec.elapsed = 0; 1810 /* we use swappiness of local cgroup */ 1811 if (check_soft) { 1812 ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, 1813 noswap, zone, &rec, &scanned); 1814 *total_scanned += scanned; 1815 } else 1816 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, 1817 noswap, &rec); 1818 mem_cgroup_record_scanstat(&rec); 1819 css_put(&victim->css); 1820 /* 1821 * At shrinking usage, we can't check we should stop here or 1822 * reclaim more. It's depends on callers. last_scanned_child 1823 * will work enough for keeping fairness under tree. 1824 */ 1825 if (shrink) 1826 return ret; 1827 total += ret; 1828 if (check_soft) { 1829 if (!res_counter_soft_limit_excess(&root_mem->res)) 1830 return total; 1831 } else if (mem_cgroup_margin(root_mem)) 1832 return total; 1833 } 1834 return total; 1835} 1836 1837/* 1838 * Check OOM-Killer is already running under our hierarchy. 1839 * If someone is running, return false. 1840 * Has to be called with memcg_oom_lock 1841 */ 1842static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) 1843{ 1844 int lock_count = -1; 1845 struct mem_cgroup *iter, *failed = NULL; 1846 bool cond = true; 1847 1848 for_each_mem_cgroup_tree_cond(iter, mem, cond) { 1849 bool locked = iter->oom_lock; 1850 1851 iter->oom_lock = true; 1852 if (lock_count == -1) 1853 lock_count = iter->oom_lock; 1854 else if (lock_count != locked) { 1855 /* 1856 * this subtree of our hierarchy is already locked 1857 * so we cannot give a lock. 
1858 */ 1859 lock_count = 0; 1860 failed = iter; 1861 cond = false; 1862 } 1863 } 1864 1865 if (!failed) 1866 goto done; 1867 1868 /* 1869 * OK, we failed to lock the whole subtree so we have to clean up 1870 * what we set up to the failing subtree 1871 */ 1872 cond = true; 1873 for_each_mem_cgroup_tree_cond(iter, mem, cond) { 1874 if (iter == failed) { 1875 cond = false; 1876 continue; 1877 } 1878 iter->oom_lock = false; 1879 } 1880done: 1881 return lock_count; 1882} 1883 1884/* 1885 * Has to be called with memcg_oom_lock 1886 */ 1887static int mem_cgroup_oom_unlock(struct mem_cgroup *mem) 1888{ 1889 struct mem_cgroup *iter; 1890 1891 for_each_mem_cgroup_tree(iter, mem) 1892 iter->oom_lock = false; 1893 return 0; 1894} 1895 1896static void mem_cgroup_mark_under_oom(struct mem_cgroup *mem) 1897{ 1898 struct mem_cgroup *iter; 1899 1900 for_each_mem_cgroup_tree(iter, mem) 1901 atomic_inc(&iter->under_oom); 1902} 1903 1904static void mem_cgroup_unmark_under_oom(struct mem_cgroup *mem) 1905{ 1906 struct mem_cgroup *iter; 1907 1908 /* 1909 * When a new child is created while the hierarchy is under oom, 1910 * mem_cgroup_oom_lock() may not be called. We have to use 1911 * atomic_add_unless() here. 1912 */ 1913 for_each_mem_cgroup_tree(iter, mem) 1914 atomic_add_unless(&iter->under_oom, -1, 0); 1915} 1916 1917static DEFINE_SPINLOCK(memcg_oom_lock); 1918static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 1919 1920struct oom_wait_info { 1921 struct mem_cgroup *mem; 1922 wait_queue_t wait; 1923}; 1924 1925static int memcg_oom_wake_function(wait_queue_t *wait, 1926 unsigned mode, int sync, void *arg) 1927{ 1928 struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg, 1929 *oom_wait_mem; 1930 struct oom_wait_info *oom_wait_info; 1931 1932 oom_wait_info = container_of(wait, struct oom_wait_info, wait); 1933 oom_wait_mem = oom_wait_info->mem; 1934 1935 /* 1936 * Both of oom_wait_info->mem and wake_mem are stable under us. 1937 * Then we can use css_is_ancestor without taking care of RCU. 1938 */ 1939 if (!mem_cgroup_same_or_subtree(oom_wait_mem, wake_mem) 1940 && !mem_cgroup_same_or_subtree(wake_mem, oom_wait_mem)) 1941 return 0; 1942 return autoremove_wake_function(wait, mode, sync, arg); 1943} 1944 1945static void memcg_wakeup_oom(struct mem_cgroup *mem) 1946{ 1947 /* for filtering, pass "mem" as argument. */ 1948 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem); 1949} 1950 1951static void memcg_oom_recover(struct mem_cgroup *mem) 1952{ 1953 if (mem && atomic_read(&mem->under_oom)) 1954 memcg_wakeup_oom(mem); 1955} 1956 1957/* 1958 * try to call OOM killer. returns false if we should exit memory-reclaim loop. 1959 */ 1960bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) 1961{ 1962 struct oom_wait_info owait; 1963 bool locked, need_to_kill; 1964 1965 owait.mem = mem; 1966 owait.wait.flags = 0; 1967 owait.wait.func = memcg_oom_wake_function; 1968 owait.wait.private = current; 1969 INIT_LIST_HEAD(&owait.wait.task_list); 1970 need_to_kill = true; 1971 mem_cgroup_mark_under_oom(mem); 1972 1973 /* At first, try to OOM lock hierarchy under mem.*/ 1974 spin_lock(&memcg_oom_lock); 1975 locked = mem_cgroup_oom_lock(mem); 1976 /* 1977 * Even if signal_pending(), we can't quit charge() loop without 1978 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL 1979 * under OOM is always welcomed, use TASK_KILLABLE here. 
1980 */ 1981 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); 1982 if (!locked || mem->oom_kill_disable) 1983 need_to_kill = false; 1984 if (locked) 1985 mem_cgroup_oom_notify(mem); 1986 spin_unlock(&memcg_oom_lock); 1987 1988 if (need_to_kill) { 1989 finish_wait(&memcg_oom_waitq, &owait.wait); 1990 mem_cgroup_out_of_memory(mem, mask); 1991 } else { 1992 schedule(); 1993 finish_wait(&memcg_oom_waitq, &owait.wait); 1994 } 1995 spin_lock(&memcg_oom_lock); 1996 if (locked) 1997 mem_cgroup_oom_unlock(mem); 1998 memcg_wakeup_oom(mem); 1999 spin_unlock(&memcg_oom_lock); 2000 2001 mem_cgroup_unmark_under_oom(mem); 2002 2003 if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) 2004 return false; 2005 /* Give chance to dying process */ 2006 schedule_timeout(1); 2007 return true; 2008} 2009 2010/* 2011 * Currently used to update mapped file statistics, but the routine can be 2012 * generalized to update other statistics as well. 2013 * 2014 * Notes: Race condition 2015 * 2016 * We usually use page_cgroup_lock() for accessing page_cgroup member but 2017 * it tends to be costly. But considering some conditions, we doesn't need 2018 * to do so _always_. 2019 * 2020 * Considering "charge", lock_page_cgroup() is not required because all 2021 * file-stat operations happen after a page is attached to radix-tree. There 2022 * are no race with "charge". 2023 * 2024 * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup 2025 * at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even 2026 * if there are race with "uncharge". Statistics itself is properly handled 2027 * by flags. 2028 * 2029 * Considering "move", this is an only case we see a race. To make the race 2030 * small, we check MEM_CGROUP_ON_MOVE percpu value and detect there are 2031 * possibility of race condition. If there is, we take a lock. 2032 */ 2033 2034void mem_cgroup_update_page_stat(struct page *page, 2035 enum mem_cgroup_page_stat_item idx, int val) 2036{ 2037 struct mem_cgroup *mem; 2038 struct page_cgroup *pc = lookup_page_cgroup(page); 2039 bool need_unlock = false; 2040 unsigned long uninitialized_var(flags); 2041 2042 if (unlikely(!pc)) 2043 return; 2044 2045 rcu_read_lock(); 2046 mem = pc->mem_cgroup; 2047 if (unlikely(!mem || !PageCgroupUsed(pc))) 2048 goto out; 2049 /* pc->mem_cgroup is unstable ? */ 2050 if (unlikely(mem_cgroup_stealed(mem)) || PageTransHuge(page)) { 2051 /* take a lock against to access pc->mem_cgroup */ 2052 move_lock_page_cgroup(pc, &flags); 2053 need_unlock = true; 2054 mem = pc->mem_cgroup; 2055 if (!mem || !PageCgroupUsed(pc)) 2056 goto out; 2057 } 2058 2059 switch (idx) { 2060 case MEMCG_NR_FILE_MAPPED: 2061 if (val > 0) 2062 SetPageCgroupFileMapped(pc); 2063 else if (!page_mapped(page)) 2064 ClearPageCgroupFileMapped(pc); 2065 idx = MEM_CGROUP_STAT_FILE_MAPPED; 2066 break; 2067 default: 2068 BUG(); 2069 } 2070 2071 this_cpu_add(mem->stat->count[idx], val); 2072 2073out: 2074 if (unlikely(need_unlock)) 2075 move_unlock_page_cgroup(pc, &flags); 2076 rcu_read_unlock(); 2077 return; 2078} 2079EXPORT_SYMBOL(mem_cgroup_update_page_stat); 2080 2081/* 2082 * size of first charge trial. "32" comes from vmscan.c's magic value. 2083 * TODO: maybe necessary to use big numbers in big irons. 
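 * In concrete terms: a single-page charge in __mem_cgroup_try_charge()
 * below asks the res_counter for CHARGE_BATCH (32) pages in one go, hands
 * one page to the faulting task and parks the remaining 31 in this cpu's
 * memcg_stock via refill_stock(); the next 31 single-page charges from the
 * same memcg on this cpu are then served by consume_stock() without
 * touching the res_counter at all. With 4KiB pages that is one 128KiB
 * res_counter_charge() covering 32 consecutive faults. When the batched
 * charge does not fit, the code retries with exactly nr_pages, so reclaim
 * is never done merely on behalf of the optional batching.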
2084 */ 2085#define CHARGE_BATCH 32U 2086struct memcg_stock_pcp { 2087 struct mem_cgroup *cached; /* this never be root cgroup */ 2088 unsigned int nr_pages; 2089 struct work_struct work; 2090 unsigned long flags; 2091#define FLUSHING_CACHED_CHARGE (0) 2092}; 2093static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 2094 2095/* 2096 * Try to consume stocked charge on this cpu. If success, one page is consumed 2097 * from local stock and true is returned. If the stock is 0 or charges from a 2098 * cgroup which is not current target, returns false. This stock will be 2099 * refilled. 2100 */ 2101static bool consume_stock(struct mem_cgroup *mem) 2102{ 2103 struct memcg_stock_pcp *stock; 2104 bool ret = true; 2105 2106 stock = &get_cpu_var(memcg_stock); 2107 if (mem == stock->cached && stock->nr_pages) 2108 stock->nr_pages--; 2109 else /* need to call res_counter_charge */ 2110 ret = false; 2111 put_cpu_var(memcg_stock); 2112 return ret; 2113} 2114 2115/* 2116 * Returns stocks cached in percpu to res_counter and reset cached information. 2117 */ 2118static void drain_stock(struct memcg_stock_pcp *stock) 2119{ 2120 struct mem_cgroup *old = stock->cached; 2121 2122 if (stock->nr_pages) { 2123 unsigned long bytes = stock->nr_pages * PAGE_SIZE; 2124 2125 res_counter_uncharge(&old->res, bytes); 2126 if (do_swap_account) 2127 res_counter_uncharge(&old->memsw, bytes); 2128 stock->nr_pages = 0; 2129 } 2130 stock->cached = NULL; 2131} 2132 2133/* 2134 * This must be called under preempt disabled or must be called by 2135 * a thread which is pinned to local cpu. 2136 */ 2137static void drain_local_stock(struct work_struct *dummy) 2138{ 2139 struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); 2140 drain_stock(stock); 2141 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); 2142} 2143 2144/* 2145 * Cache charges(val) which is from res_counter, to local per_cpu area. 2146 * This will be consumed by consume_stock() function, later. 2147 */ 2148static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages) 2149{ 2150 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); 2151 2152 if (stock->cached != mem) { /* reset if necessary */ 2153 drain_stock(stock); 2154 stock->cached = mem; 2155 } 2156 stock->nr_pages += nr_pages; 2157 put_cpu_var(memcg_stock); 2158} 2159 2160/* 2161 * Drains all per-CPU charge caches for given root_mem resp. subtree 2162 * of the hierarchy under it. sync flag says whether we should block 2163 * until the work is done. 2164 */ 2165static void drain_all_stock(struct mem_cgroup *root_mem, bool sync) 2166{ 2167 int cpu, curcpu; 2168 2169 /* Notify other cpus that system-wide "drain" is running */ 2170 get_online_cpus(); 2171 /* 2172 * Get a hint for avoiding draining charges on the current cpu, 2173 * which must be exhausted by our charging. It is not required that 2174 * this be a precise check, so we use raw_smp_processor_id() instead of 2175 * getcpu()/putcpu(). 
2176 */ 2177 curcpu = raw_smp_processor_id(); 2178 for_each_online_cpu(cpu) { 2179 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2180 struct mem_cgroup *mem; 2181 2182 mem = stock->cached; 2183 if (!mem || !stock->nr_pages) 2184 continue; 2185 if (!mem_cgroup_same_or_subtree(root_mem, mem)) 2186 continue; 2187 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { 2188 if (cpu == curcpu) 2189 drain_local_stock(&stock->work); 2190 else 2191 schedule_work_on(cpu, &stock->work); 2192 } 2193 } 2194 2195 if (!sync) 2196 goto out; 2197 2198 for_each_online_cpu(cpu) { 2199 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2200 if (mem_cgroup_same_or_subtree(root_mem, stock->cached) && 2201 test_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) 2202 flush_work(&stock->work); 2203 } 2204out: 2205 put_online_cpus(); 2206} 2207 2208/* 2209 * Tries to drain stocked charges in other cpus. This function is asynchronous 2210 * and just put a work per cpu for draining localy on each cpu. Caller can 2211 * expects some charges will be back to res_counter later but cannot wait for 2212 * it. 2213 */ 2214static void drain_all_stock_async(struct mem_cgroup *root_mem) 2215{ 2216 drain_all_stock(root_mem, false); 2217} 2218 2219/* This is a synchronous drain interface. */ 2220static void drain_all_stock_sync(struct mem_cgroup *root_mem) 2221{ 2222 /* called when force_empty is called */ 2223 drain_all_stock(root_mem, true); 2224} 2225 2226/* 2227 * This function drains percpu counter value from DEAD cpu and 2228 * move it to local cpu. Note that this function can be preempted. 2229 */ 2230static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu) 2231{ 2232 int i; 2233 2234 spin_lock(&mem->pcp_counter_lock); 2235 for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) { 2236 long x = per_cpu(mem->stat->count[i], cpu); 2237 2238 per_cpu(mem->stat->count[i], cpu) = 0; 2239 mem->nocpu_base.count[i] += x; 2240 } 2241 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { 2242 unsigned long x = per_cpu(mem->stat->events[i], cpu); 2243 2244 per_cpu(mem->stat->events[i], cpu) = 0; 2245 mem->nocpu_base.events[i] += x; 2246 } 2247 /* need to clear ON_MOVE value, works as a kind of lock. */ 2248 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0; 2249 spin_unlock(&mem->pcp_counter_lock); 2250} 2251 2252static void synchronize_mem_cgroup_on_move(struct mem_cgroup *mem, int cpu) 2253{ 2254 int idx = MEM_CGROUP_ON_MOVE; 2255 2256 spin_lock(&mem->pcp_counter_lock); 2257 per_cpu(mem->stat->count[idx], cpu) = mem->nocpu_base.count[idx]; 2258 spin_unlock(&mem->pcp_counter_lock); 2259} 2260 2261static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb, 2262 unsigned long action, 2263 void *hcpu) 2264{ 2265 int cpu = (unsigned long)hcpu; 2266 struct memcg_stock_pcp *stock; 2267 struct mem_cgroup *iter; 2268 2269 if ((action == CPU_ONLINE)) { 2270 for_each_mem_cgroup_all(iter) 2271 synchronize_mem_cgroup_on_move(iter, cpu); 2272 return NOTIFY_OK; 2273 } 2274 2275 if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN) 2276 return NOTIFY_OK; 2277 2278 for_each_mem_cgroup_all(iter) 2279 mem_cgroup_drain_pcp_counter(iter, cpu); 2280 2281 stock = &per_cpu(memcg_stock, cpu); 2282 drain_stock(stock); 2283 return NOTIFY_OK; 2284} 2285 2286 2287/* See __mem_cgroup_try_charge() for details */ 2288enum { 2289 CHARGE_OK, /* success */ 2290 CHARGE_RETRY, /* need to retry but retry is not bad */ 2291 CHARGE_NOMEM, /* we can't do more. 
return -ENOMEM */ 2292 CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. */ 2293 CHARGE_OOM_DIE, /* the current is killed because of OOM */ 2294}; 2295 2296static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, 2297 unsigned int nr_pages, bool oom_check) 2298{ 2299 unsigned long csize = nr_pages * PAGE_SIZE; 2300 struct mem_cgroup *mem_over_limit; 2301 struct res_counter *fail_res; 2302 unsigned long flags = 0; 2303 int ret; 2304 2305 ret = res_counter_charge(&mem->res, csize, &fail_res); 2306 2307 if (likely(!ret)) { 2308 if (!do_swap_account) 2309 return CHARGE_OK; 2310 ret = res_counter_charge(&mem->memsw, csize, &fail_res); 2311 if (likely(!ret)) 2312 return CHARGE_OK; 2313 2314 res_counter_uncharge(&mem->res, csize); 2315 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); 2316 flags |= MEM_CGROUP_RECLAIM_NOSWAP; 2317 } else 2318 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); 2319 /* 2320 * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch 2321 * of regular pages (CHARGE_BATCH), or a single regular page (1). 2322 * 2323 * Never reclaim on behalf of optional batching, retry with a 2324 * single page instead. 2325 */ 2326 if (nr_pages == CHARGE_BATCH) 2327 return CHARGE_RETRY; 2328 2329 if (!(gfp_mask & __GFP_WAIT)) 2330 return CHARGE_WOULDBLOCK; 2331 2332 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, 2333 gfp_mask, flags, NULL); 2334 if (mem_cgroup_margin(mem_over_limit) >= nr_pages) 2335 return CHARGE_RETRY; 2336 /* 2337 * Even though the limit is exceeded at this point, reclaim 2338 * may have been able to free some pages. Retry the charge 2339 * before killing the task. 2340 * 2341 * Only for regular pages, though: huge pages are rather 2342 * unlikely to succeed so close to the limit, and we fall back 2343 * to regular pages anyway in case of failure. 2344 */ 2345 if (nr_pages == 1 && ret) 2346 return CHARGE_RETRY; 2347 2348 /* 2349 * At task move, charge accounts can be doubly counted. So, it's 2350 * better to wait until the end of task_move if something is going on. 2351 */ 2352 if (mem_cgroup_wait_acct_move(mem_over_limit)) 2353 return CHARGE_RETRY; 2354 2355 /* If we don't need to call oom-killer at el, return immediately */ 2356 if (!oom_check) 2357 return CHARGE_NOMEM; 2358 /* check OOM */ 2359 if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) 2360 return CHARGE_OOM_DIE; 2361 2362 return CHARGE_RETRY; 2363} 2364 2365/* 2366 * Unlike exported interface, "oom" parameter is added. if oom==true, 2367 * oom-killer can be invoked. 2368 */ 2369static int __mem_cgroup_try_charge(struct mm_struct *mm, 2370 gfp_t gfp_mask, 2371 unsigned int nr_pages, 2372 struct mem_cgroup **memcg, 2373 bool oom) 2374{ 2375 unsigned int batch = max(CHARGE_BATCH, nr_pages); 2376 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 2377 struct mem_cgroup *mem = NULL; 2378 int ret; 2379 2380 /* 2381 * Unlike gloval-vm's OOM-kill, we're not in memory shortage 2382 * in system level. So, allow to go ahead dying process in addition to 2383 * MEMDIE process. 2384 */ 2385 if (unlikely(test_thread_flag(TIF_MEMDIE) 2386 || fatal_signal_pending(current))) 2387 goto bypass; 2388 2389 /* 2390 * We always charge the cgroup the mm_struct belongs to. 2391 * The mm_struct's mem_cgroup changes on task migration if the 2392 * thread group leader migrates. It's possible that mm is not 2393 * set, if so charge the init_mm (happens for pagecache usage). 
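 * For reference, callers drive this function the way mem_cgroup_charge_common()
 * further down does (sketch; page_cgroup lookup and error handling trimmed):
 *
 *   struct mem_cgroup *mem = NULL;
 *   int ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &mem, oom);
 *
 *   if (ret || !mem)    // -ENOMEM, or bypassed (ret == 0 and mem == NULL)
 *       return ret;
 *   __mem_cgroup_commit_charge(mem, page, nr_pages, pc, ctype);
 *
 * i.e. a successful try_charge leaves the charge recorded against *memcg,
 * which __mem_cgroup_commit_charge() (or __mem_cgroup_cancel_charge())
 * later consumes.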
2394 */ 2395 if (!*memcg && !mm) 2396 goto bypass; 2397again: 2398 if (*memcg) { /* css should be a valid one */ 2399 mem = *memcg; 2400 VM_BUG_ON(css_is_removed(&mem->css)); 2401 if (mem_cgroup_is_root(mem)) 2402 goto done; 2403 if (nr_pages == 1 && consume_stock(mem)) 2404 goto done; 2405 css_get(&mem->css); 2406 } else { 2407 struct task_struct *p; 2408 2409 rcu_read_lock(); 2410 p = rcu_dereference(mm->owner); 2411 /* 2412 * Because we don't have task_lock(), "p" can exit. 2413 * In that case, "mem" can point to root or p can be NULL with 2414 * race with swapoff. Then, we have small risk of mis-accouning. 2415 * But such kind of mis-account by race always happens because 2416 * we don't have cgroup_mutex(). It's overkill and we allo that 2417 * small race, here. 2418 * (*) swapoff at el will charge against mm-struct not against 2419 * task-struct. So, mm->owner can be NULL. 2420 */ 2421 mem = mem_cgroup_from_task(p); 2422 if (!mem || mem_cgroup_is_root(mem)) { 2423 rcu_read_unlock(); 2424 goto done; 2425 } 2426 if (nr_pages == 1 && consume_stock(mem)) { 2427 /* 2428 * It seems dagerous to access memcg without css_get(). 2429 * But considering how consume_stok works, it's not 2430 * necessary. If consume_stock success, some charges 2431 * from this memcg are cached on this cpu. So, we 2432 * don't need to call css_get()/css_tryget() before 2433 * calling consume_stock(). 2434 */ 2435 rcu_read_unlock(); 2436 goto done; 2437 } 2438 /* after here, we may be blocked. we need to get refcnt */ 2439 if (!css_tryget(&mem->css)) { 2440 rcu_read_unlock(); 2441 goto again; 2442 } 2443 rcu_read_unlock(); 2444 } 2445 2446 do { 2447 bool oom_check; 2448 2449 /* If killed, bypass charge */ 2450 if (fatal_signal_pending(current)) { 2451 css_put(&mem->css); 2452 goto bypass; 2453 } 2454 2455 oom_check = false; 2456 if (oom && !nr_oom_retries) { 2457 oom_check = true; 2458 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 2459 } 2460 2461 ret = mem_cgroup_do_charge(mem, gfp_mask, batch, oom_check); 2462 switch (ret) { 2463 case CHARGE_OK: 2464 break; 2465 case CHARGE_RETRY: /* not in OOM situation but retry */ 2466 batch = nr_pages; 2467 css_put(&mem->css); 2468 mem = NULL; 2469 goto again; 2470 case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ 2471 css_put(&mem->css); 2472 goto nomem; 2473 case CHARGE_NOMEM: /* OOM routine works */ 2474 if (!oom) { 2475 css_put(&mem->css); 2476 goto nomem; 2477 } 2478 /* If oom, we never return -ENOMEM */ 2479 nr_oom_retries--; 2480 break; 2481 case CHARGE_OOM_DIE: /* Killed by OOM Killer */ 2482 css_put(&mem->css); 2483 goto bypass; 2484 } 2485 } while (ret != CHARGE_OK); 2486 2487 if (batch > nr_pages) 2488 refill_stock(mem, batch - nr_pages); 2489 css_put(&mem->css); 2490done: 2491 *memcg = mem; 2492 return 0; 2493nomem: 2494 *memcg = NULL; 2495 return -ENOMEM; 2496bypass: 2497 *memcg = NULL; 2498 return 0; 2499} 2500 2501/* 2502 * Somemtimes we have to undo a charge we got by try_charge(). 2503 * This function is for that and do uncharge, put css's refcnt. 2504 * gotten by try_charge(). 2505 */ 2506static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem, 2507 unsigned int nr_pages) 2508{ 2509 if (!mem_cgroup_is_root(mem)) { 2510 unsigned long bytes = nr_pages * PAGE_SIZE; 2511 2512 res_counter_uncharge(&mem->res, bytes); 2513 if (do_swap_account) 2514 res_counter_uncharge(&mem->memsw, bytes); 2515 } 2516} 2517 2518/* 2519 * A helper function to get mem_cgroup from ID. must be called under 2520 * rcu_read_lock(). 
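 * Typical use, as in mem_cgroup_uncharge_swap() further down: the swap map
 * stores css ids, and the id is turned back into a memcg under RCU
 * (statistics and refcount handling trimmed in this sketch):
 *
 *   unsigned short id = swap_cgroup_record(ent, 0);  // read and clear record
 *   struct mem_cgroup *memcg;
 *
 *   rcu_read_lock();
 *   memcg = mem_cgroup_lookup(id);
 *   if (memcg && !mem_cgroup_is_root(memcg))
 *       res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
 *   rcu_read_unlock();
 *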
The caller must check css_is_removed() or some if 2521 * it's concern. (dropping refcnt from swap can be called against removed 2522 * memcg.) 2523 */ 2524static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) 2525{ 2526 struct cgroup_subsys_state *css; 2527 2528 /* ID 0 is unused ID */ 2529 if (!id) 2530 return NULL; 2531 css = css_lookup(&mem_cgroup_subsys, id); 2532 if (!css) 2533 return NULL; 2534 return container_of(css, struct mem_cgroup, css); 2535} 2536 2537struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) 2538{ 2539 struct mem_cgroup *mem = NULL; 2540 struct page_cgroup *pc; 2541 unsigned short id; 2542 swp_entry_t ent; 2543 2544 VM_BUG_ON(!PageLocked(page)); 2545 2546 pc = lookup_page_cgroup(page); 2547 lock_page_cgroup(pc); 2548 if (PageCgroupUsed(pc)) { 2549 mem = pc->mem_cgroup; 2550 if (mem && !css_tryget(&mem->css)) 2551 mem = NULL; 2552 } else if (PageSwapCache(page)) { 2553 ent.val = page_private(page); 2554 id = lookup_swap_cgroup(ent); 2555 rcu_read_lock(); 2556 mem = mem_cgroup_lookup(id); 2557 if (mem && !css_tryget(&mem->css)) 2558 mem = NULL; 2559 rcu_read_unlock(); 2560 } 2561 unlock_page_cgroup(pc); 2562 return mem; 2563} 2564 2565static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, 2566 struct page *page, 2567 unsigned int nr_pages, 2568 struct page_cgroup *pc, 2569 enum charge_type ctype) 2570{ 2571 lock_page_cgroup(pc); 2572 if (unlikely(PageCgroupUsed(pc))) { 2573 unlock_page_cgroup(pc); 2574 __mem_cgroup_cancel_charge(mem, nr_pages); 2575 return; 2576 } 2577 /* 2578 * we don't need page_cgroup_lock about tail pages, becase they are not 2579 * accessed by any other context at this point. 2580 */ 2581 pc->mem_cgroup = mem; 2582 /* 2583 * We access a page_cgroup asynchronously without lock_page_cgroup(). 2584 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup 2585 * is accessed after testing USED bit. To make pc->mem_cgroup visible 2586 * before USED bit, we need memory barrier here. 2587 * See mem_cgroup_add_lru_list(), etc. 2588 */ 2589 smp_wmb(); 2590 switch (ctype) { 2591 case MEM_CGROUP_CHARGE_TYPE_CACHE: 2592 case MEM_CGROUP_CHARGE_TYPE_SHMEM: 2593 SetPageCgroupCache(pc); 2594 SetPageCgroupUsed(pc); 2595 break; 2596 case MEM_CGROUP_CHARGE_TYPE_MAPPED: 2597 ClearPageCgroupCache(pc); 2598 SetPageCgroupUsed(pc); 2599 break; 2600 default: 2601 break; 2602 } 2603 2604 mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), nr_pages); 2605 unlock_page_cgroup(pc); 2606 /* 2607 * "charge_statistics" updated event counter. Then, check it. 2608 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. 2609 * if they exceeds softlimit. 2610 */ 2611 memcg_check_events(mem, page); 2612} 2613 2614#ifdef CONFIG_TRANSPARENT_HUGEPAGE 2615 2616#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\ 2617 (1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION)) 2618/* 2619 * Because tail pages are not marked as "used", set it. We're under 2620 * zone->lru_lock, 'splitting on pmd' and compund_lock. 2621 */ 2622void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail) 2623{ 2624 struct page_cgroup *head_pc = lookup_page_cgroup(head); 2625 struct page_cgroup *tail_pc = lookup_page_cgroup(tail); 2626 unsigned long flags; 2627 2628 if (mem_cgroup_disabled()) 2629 return; 2630 /* 2631 * We have no races with charge/uncharge but will have races with 2632 * page state accounting. 
2633 */ 2634 move_lock_page_cgroup(head_pc, &flags); 2635 2636 tail_pc->mem_cgroup = head_pc->mem_cgroup; 2637 smp_wmb(); /* see __commit_charge() */ 2638 if (PageCgroupAcctLRU(head_pc)) { 2639 enum lru_list lru; 2640 struct mem_cgroup_per_zone *mz; 2641 2642 /* 2643 * LRU flags cannot be copied because we need to add tail 2644 *.page to LRU by generic call and our hook will be called. 2645 * We hold lru_lock, then, reduce counter directly. 2646 */ 2647 lru = page_lru(head); 2648 mz = page_cgroup_zoneinfo(head_pc->mem_cgroup, head); 2649 MEM_CGROUP_ZSTAT(mz, lru) -= 1; 2650 } 2651 tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; 2652 move_unlock_page_cgroup(head_pc, &flags); 2653} 2654#endif 2655 2656/** 2657 * mem_cgroup_move_account - move account of the page 2658 * @page: the page 2659 * @nr_pages: number of regular pages (>1 for huge pages) 2660 * @pc: page_cgroup of the page. 2661 * @from: mem_cgroup which the page is moved from. 2662 * @to: mem_cgroup which the page is moved to. @from != @to. 2663 * @uncharge: whether we should call uncharge and css_put against @from. 2664 * 2665 * The caller must confirm following. 2666 * - page is not on LRU (isolate_page() is useful.) 2667 * - compound_lock is held when nr_pages > 1 2668 * 2669 * This function doesn't do "charge" nor css_get to new cgroup. It should be 2670 * done by a caller(__mem_cgroup_try_charge would be useful). If @uncharge is 2671 * true, this function does "uncharge" from old cgroup, but it doesn't if 2672 * @uncharge is false, so a caller should do "uncharge". 2673 */ 2674static int mem_cgroup_move_account(struct page *page, 2675 unsigned int nr_pages, 2676 struct page_cgroup *pc, 2677 struct mem_cgroup *from, 2678 struct mem_cgroup *to, 2679 bool uncharge) 2680{ 2681 unsigned long flags; 2682 int ret; 2683 2684 VM_BUG_ON(from == to); 2685 VM_BUG_ON(PageLRU(page)); 2686 /* 2687 * The page is isolated from LRU. So, collapse function 2688 * will not handle this page. But page splitting can happen. 2689 * Do this check under compound_page_lock(). The caller should 2690 * hold it. 2691 */ 2692 ret = -EBUSY; 2693 if (nr_pages > 1 && !PageTransHuge(page)) 2694 goto out; 2695 2696 lock_page_cgroup(pc); 2697 2698 ret = -EINVAL; 2699 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) 2700 goto unlock; 2701 2702 move_lock_page_cgroup(pc, &flags); 2703 2704 if (PageCgroupFileMapped(pc)) { 2705 /* Update mapped_file data for mem_cgroup */ 2706 preempt_disable(); 2707 __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 2708 __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 2709 preempt_enable(); 2710 } 2711 mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages); 2712 if (uncharge) 2713 /* This is not "cancel", but cancel_charge does all we need. */ 2714 __mem_cgroup_cancel_charge(from, nr_pages); 2715 2716 /* caller should have done css_get */ 2717 pc->mem_cgroup = to; 2718 mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages); 2719 /* 2720 * We charges against "to" which may not have any tasks. Then, "to" 2721 * can be under rmdir(). But in current implementation, caller of 2722 * this function is just force_empty() and move charge, so it's 2723 * guaranteed that "to" is never removed. So, we don't check rmdir 2724 * status here. 
2725 */ 2726 move_unlock_page_cgroup(pc, &flags); 2727 ret = 0; 2728unlock: 2729 unlock_page_cgroup(pc); 2730 /* 2731 * check events 2732 */ 2733 memcg_check_events(to, page); 2734 memcg_check_events(from, page); 2735out: 2736 return ret; 2737} 2738 2739/* 2740 * move charges to its parent. 2741 */ 2742 2743static int mem_cgroup_move_parent(struct page *page, 2744 struct page_cgroup *pc, 2745 struct mem_cgroup *child, 2746 gfp_t gfp_mask) 2747{ 2748 struct cgroup *cg = child->css.cgroup; 2749 struct cgroup *pcg = cg->parent; 2750 struct mem_cgroup *parent; 2751 unsigned int nr_pages; 2752 unsigned long uninitialized_var(flags); 2753 int ret; 2754 2755 /* Is ROOT ? */ 2756 if (!pcg) 2757 return -EINVAL; 2758 2759 ret = -EBUSY; 2760 if (!get_page_unless_zero(page)) 2761 goto out; 2762 if (isolate_lru_page(page)) 2763 goto put; 2764 2765 nr_pages = hpage_nr_pages(page); 2766 2767 parent = mem_cgroup_from_cont(pcg); 2768 ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false); 2769 if (ret || !parent) 2770 goto put_back; 2771 2772 if (nr_pages > 1) 2773 flags = compound_lock_irqsave(page); 2774 2775 ret = mem_cgroup_move_account(page, nr_pages, pc, child, parent, true); 2776 if (ret) 2777 __mem_cgroup_cancel_charge(parent, nr_pages); 2778 2779 if (nr_pages > 1) 2780 compound_unlock_irqrestore(page, flags); 2781put_back: 2782 putback_lru_page(page); 2783put: 2784 put_page(page); 2785out: 2786 return ret; 2787} 2788 2789/* 2790 * Charge the memory controller for page usage. 2791 * Return 2792 * 0 if the charge was successful 2793 * < 0 if the cgroup is over its limit 2794 */ 2795static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, 2796 gfp_t gfp_mask, enum charge_type ctype) 2797{ 2798 struct mem_cgroup *mem = NULL; 2799 unsigned int nr_pages = 1; 2800 struct page_cgroup *pc; 2801 bool oom = true; 2802 int ret; 2803 2804 if (PageTransHuge(page)) { 2805 nr_pages <<= compound_order(page); 2806 VM_BUG_ON(!PageTransHuge(page)); 2807 /* 2808 * Never OOM-kill a process for a huge page. The 2809 * fault handler will fall back to regular pages. 2810 */ 2811 oom = false; 2812 } 2813 2814 pc = lookup_page_cgroup(page); 2815 BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */ 2816 2817 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &mem, oom); 2818 if (ret || !mem) 2819 return ret; 2820 2821 __mem_cgroup_commit_charge(mem, page, nr_pages, pc, ctype); 2822 return 0; 2823} 2824 2825int mem_cgroup_newpage_charge(struct page *page, 2826 struct mm_struct *mm, gfp_t gfp_mask) 2827{ 2828 if (mem_cgroup_disabled()) 2829 return 0; 2830 /* 2831 * If already mapped, we don't have to account. 2832 * If page cache, page->mapping has address_space. 2833 * But page->mapping may have out-of-use anon_vma pointer, 2834 * detecit it by PageAnon() check. newly-mapped-anon's page->mapping 2835 * is NULL. 2836 */ 2837 if (page_mapped(page) || (page->mapping && !PageAnon(page))) 2838 return 0; 2839 if (unlikely(!mm)) 2840 mm = &init_mm; 2841 return mem_cgroup_charge_common(page, mm, gfp_mask, 2842 MEM_CGROUP_CHARGE_TYPE_MAPPED); 2843} 2844 2845static void 2846__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, 2847 enum charge_type ctype); 2848 2849static void 2850__mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem, 2851 enum charge_type ctype) 2852{ 2853 struct page_cgroup *pc = lookup_page_cgroup(page); 2854 /* 2855 * In some case, SwapCache, FUSE(splice_buf->radixtree), the page 2856 * is already on LRU. 
It means the page may on some other page_cgroup's 2857 * LRU. Take care of it. 2858 */ 2859 mem_cgroup_lru_del_before_commit(page); 2860 __mem_cgroup_commit_charge(mem, page, 1, pc, ctype); 2861 mem_cgroup_lru_add_after_commit(page); 2862 return; 2863} 2864 2865int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 2866 gfp_t gfp_mask) 2867{ 2868 struct mem_cgroup *mem = NULL; 2869 int ret; 2870 2871 if (mem_cgroup_disabled()) 2872 return 0; 2873 if (PageCompound(page)) 2874 return 0; 2875 2876 if (unlikely(!mm)) 2877 mm = &init_mm; 2878 2879 if (page_is_file_cache(page)) { 2880 ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &mem, true); 2881 if (ret || !mem) 2882 return ret; 2883 2884 /* 2885 * FUSE reuses pages without going through the final 2886 * put that would remove them from the LRU list, make 2887 * sure that they get relinked properly. 2888 */ 2889 __mem_cgroup_commit_charge_lrucare(page, mem, 2890 MEM_CGROUP_CHARGE_TYPE_CACHE); 2891 return ret; 2892 } 2893 /* shmem */ 2894 if (PageSwapCache(page)) { 2895 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); 2896 if (!ret) 2897 __mem_cgroup_commit_charge_swapin(page, mem, 2898 MEM_CGROUP_CHARGE_TYPE_SHMEM); 2899 } else 2900 ret = mem_cgroup_charge_common(page, mm, gfp_mask, 2901 MEM_CGROUP_CHARGE_TYPE_SHMEM); 2902 2903 return ret; 2904} 2905 2906/* 2907 * While swap-in, try_charge -> commit or cancel, the page is locked. 2908 * And when try_charge() successfully returns, one refcnt to memcg without 2909 * struct page_cgroup is acquired. This refcnt will be consumed by 2910 * "commit()" or removed by "cancel()" 2911 */ 2912int mem_cgroup_try_charge_swapin(struct mm_struct *mm, 2913 struct page *page, 2914 gfp_t mask, struct mem_cgroup **ptr) 2915{ 2916 struct mem_cgroup *mem; 2917 int ret; 2918 2919 *ptr = NULL; 2920 2921 if (mem_cgroup_disabled()) 2922 return 0; 2923 2924 if (!do_swap_account) 2925 goto charge_cur_mm; 2926 /* 2927 * A racing thread's fault, or swapoff, may have already updated 2928 * the pte, and even removed page from swap cache: in those cases 2929 * do_swap_page()'s pte_same() test will fail; but there's also a 2930 * KSM case which does need to charge the page. 2931 */ 2932 if (!PageSwapCache(page)) 2933 goto charge_cur_mm; 2934 mem = try_get_mem_cgroup_from_page(page); 2935 if (!mem) 2936 goto charge_cur_mm; 2937 *ptr = mem; 2938 ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true); 2939 css_put(&mem->css); 2940 return ret; 2941charge_cur_mm: 2942 if (unlikely(!mm)) 2943 mm = &init_mm; 2944 return __mem_cgroup_try_charge(mm, mask, 1, ptr, true); 2945} 2946 2947static void 2948__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, 2949 enum charge_type ctype) 2950{ 2951 if (mem_cgroup_disabled()) 2952 return; 2953 if (!ptr) 2954 return; 2955 cgroup_exclude_rmdir(&ptr->css); 2956 2957 __mem_cgroup_commit_charge_lrucare(page, ptr, ctype); 2958 /* 2959 * Now swap is on-memory. This means this page may be 2960 * counted both as mem and swap....double count. 2961 * Fix it by uncharging from memsw. Basically, this SwapCache is stable 2962 * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page() 2963 * may call delete_from_swap_cache() before reach here. 
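 * In other words: the memcg recorded in swap_cgroup at swap-out time still
 * holds one memsw charge for the swap slot, while the commit above has just
 * charged the (possibly different) swapping-in memcg for the page itself,
 * so one PAGE_SIZE of memsw has to be handed back to the recorded owner.
 * For reference, the swap-in fault path drives the whole trio roughly like
 * this (sketch; locking, error paths and the gfp mask are illustrative):
 *
 *   struct mem_cgroup *ptr;
 *
 *   if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr))
 *       goto out_fail;                        // charge failed
 *   ...                                       // install the pte
 *   mem_cgroup_commit_charge_swapin(page, ptr);
 *   // or, if the fault is abandoned:
 *   //   mem_cgroup_cancel_charge_swapin(ptr);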
2964 */ 2965 if (do_swap_account && PageSwapCache(page)) { 2966 swp_entry_t ent = {.val = page_private(page)}; 2967 unsigned short id; 2968 struct mem_cgroup *memcg; 2969 2970 id = swap_cgroup_record(ent, 0); 2971 rcu_read_lock(); 2972 memcg = mem_cgroup_lookup(id); 2973 if (memcg) { 2974 /* 2975 * This recorded memcg can be obsolete one. So, avoid 2976 * calling css_tryget 2977 */ 2978 if (!mem_cgroup_is_root(memcg)) 2979 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 2980 mem_cgroup_swap_statistics(memcg, false); 2981 mem_cgroup_put(memcg); 2982 } 2983 rcu_read_unlock(); 2984 } 2985 /* 2986 * At swapin, we may charge account against cgroup which has no tasks. 2987 * So, rmdir()->pre_destroy() can be called while we do this charge. 2988 * In that case, we need to call pre_destroy() again. check it here. 2989 */ 2990 cgroup_release_and_wakeup_rmdir(&ptr->css); 2991} 2992 2993void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) 2994{ 2995 __mem_cgroup_commit_charge_swapin(page, ptr, 2996 MEM_CGROUP_CHARGE_TYPE_MAPPED); 2997} 2998 2999void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) 3000{ 3001 if (mem_cgroup_disabled()) 3002 return; 3003 if (!mem) 3004 return; 3005 __mem_cgroup_cancel_charge(mem, 1); 3006} 3007 3008static void mem_cgroup_do_uncharge(struct mem_cgroup *mem, 3009 unsigned int nr_pages, 3010 const enum charge_type ctype) 3011{ 3012 struct memcg_batch_info *batch = NULL; 3013 bool uncharge_memsw = true; 3014 3015 /* If swapout, usage of swap doesn't decrease */ 3016 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 3017 uncharge_memsw = false; 3018 3019 batch = ¤t->memcg_batch; 3020 /* 3021 * In usual, we do css_get() when we remember memcg pointer. 3022 * But in this case, we keep res->usage until end of a series of 3023 * uncharges. Then, it's ok to ignore memcg's refcnt. 3024 */ 3025 if (!batch->memcg) 3026 batch->memcg = mem; 3027 /* 3028 * do_batch > 0 when unmapping pages or inode invalidate/truncate. 3029 * In those cases, all pages freed continuously can be expected to be in 3030 * the same cgroup and we have chance to coalesce uncharges. 3031 * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE) 3032 * because we want to do uncharge as soon as possible. 3033 */ 3034 3035 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) 3036 goto direct_uncharge; 3037 3038 if (nr_pages > 1) 3039 goto direct_uncharge; 3040 3041 /* 3042 * In typical case, batch->memcg == mem. This means we can 3043 * merge a series of uncharges to an uncharge of res_counter. 3044 * If not, we uncharge res_counter ony by one. 
3045 */ 3046 if (batch->memcg != mem) 3047 goto direct_uncharge; 3048 /* remember freed charge and uncharge it later */ 3049 batch->nr_pages++; 3050 if (uncharge_memsw) 3051 batch->memsw_nr_pages++; 3052 return; 3053direct_uncharge: 3054 res_counter_uncharge(&mem->res, nr_pages * PAGE_SIZE); 3055 if (uncharge_memsw) 3056 res_counter_uncharge(&mem->memsw, nr_pages * PAGE_SIZE); 3057 if (unlikely(batch->memcg != mem)) 3058 memcg_oom_recover(mem); 3059 return; 3060} 3061 3062/* 3063 * uncharge if !page_mapped(page) 3064 */ 3065static struct mem_cgroup * 3066__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) 3067{ 3068 struct mem_cgroup *mem = NULL; 3069 unsigned int nr_pages = 1; 3070 struct page_cgroup *pc; 3071 3072 if (mem_cgroup_disabled()) 3073 return NULL; 3074 3075 if (PageSwapCache(page)) 3076 return NULL; 3077 3078 if (PageTransHuge(page)) { 3079 nr_pages <<= compound_order(page); 3080 VM_BUG_ON(!PageTransHuge(page)); 3081 } 3082 /* 3083 * Check if our page_cgroup is valid 3084 */ 3085 pc = lookup_page_cgroup(page); 3086 if (unlikely(!pc || !PageCgroupUsed(pc))) 3087 return NULL; 3088 3089 lock_page_cgroup(pc); 3090 3091 mem = pc->mem_cgroup; 3092 3093 if (!PageCgroupUsed(pc)) 3094 goto unlock_out; 3095 3096 switch (ctype) { 3097 case MEM_CGROUP_CHARGE_TYPE_MAPPED: 3098 case MEM_CGROUP_CHARGE_TYPE_DROP: 3099 /* See mem_cgroup_prepare_migration() */ 3100 if (page_mapped(page) || PageCgroupMigration(pc)) 3101 goto unlock_out; 3102 break; 3103 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: 3104 if (!PageAnon(page)) { /* Shared memory */ 3105 if (page->mapping && !page_is_file_cache(page)) 3106 goto unlock_out; 3107 } else if (page_mapped(page)) /* Anon */ 3108 goto unlock_out; 3109 break; 3110 default: 3111 break; 3112 } 3113 3114 mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -nr_pages); 3115 3116 ClearPageCgroupUsed(pc); 3117 /* 3118 * pc->mem_cgroup is not cleared here. It will be accessed when it's 3119 * freed from LRU. This is safe because uncharged page is expected not 3120 * to be reused (freed soon). Exception is SwapCache, it's handled by 3121 * special functions. 3122 */ 3123 3124 unlock_page_cgroup(pc); 3125 /* 3126 * even after unlock, we have mem->res.usage here and this memcg 3127 * will never be freed. 3128 */ 3129 memcg_check_events(mem, page); 3130 if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) { 3131 mem_cgroup_swap_statistics(mem, true); 3132 mem_cgroup_get(mem); 3133 } 3134 if (!mem_cgroup_is_root(mem)) 3135 mem_cgroup_do_uncharge(mem, nr_pages, ctype); 3136 3137 return mem; 3138 3139unlock_out: 3140 unlock_page_cgroup(pc); 3141 return NULL; 3142} 3143 3144void mem_cgroup_uncharge_page(struct page *page) 3145{ 3146 /* early check. */ 3147 if (page_mapped(page)) 3148 return; 3149 if (page->mapping && !PageAnon(page)) 3150 return; 3151 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); 3152} 3153 3154void mem_cgroup_uncharge_cache_page(struct page *page) 3155{ 3156 VM_BUG_ON(page_mapped(page)); 3157 VM_BUG_ON(page->mapping); 3158 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); 3159} 3160 3161/* 3162 * Batch_start/batch_end is called in unmap_page_range/invlidate/trucate. 3163 * In that cases, pages are freed continuously and we can expect pages 3164 * are in the same memcg. All these calls itself limits the number of 3165 * pages freed at once, then uncharge_start/end() is called properly. 
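 * The intended calling pattern is (sketch; the list name is illustrative,
 * the real callers are the unmap/truncate paths mentioned above):
 *
 *   mem_cgroup_uncharge_start();
 *   list_for_each_entry(page, &pages_being_freed, lru)
 *       mem_cgroup_uncharge_page(page);  // typically just records into
 *                                        // current->memcg_batch
 *   mem_cgroup_uncharge_end();           // one res_counter_uncharge() for
 *                                        // the whole run
 *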
3166 * This may be called prural(2) times in a context, 3167 */ 3168 3169void mem_cgroup_uncharge_start(void) 3170{ 3171 current->memcg_batch.do_batch++; 3172 /* We can do nest. */ 3173 if (current->memcg_batch.do_batch == 1) { 3174 current->memcg_batch.memcg = NULL; 3175 current->memcg_batch.nr_pages = 0; 3176 current->memcg_batch.memsw_nr_pages = 0; 3177 } 3178} 3179 3180void mem_cgroup_uncharge_end(void) 3181{ 3182 struct memcg_batch_info *batch = ¤t->memcg_batch; 3183 3184 if (!batch->do_batch) 3185 return; 3186 3187 batch->do_batch--; 3188 if (batch->do_batch) /* If stacked, do nothing. */ 3189 return; 3190 3191 if (!batch->memcg) 3192 return; 3193 /* 3194 * This "batch->memcg" is valid without any css_get/put etc... 3195 * bacause we hide charges behind us. 3196 */ 3197 if (batch->nr_pages) 3198 res_counter_uncharge(&batch->memcg->res, 3199 batch->nr_pages * PAGE_SIZE); 3200 if (batch->memsw_nr_pages) 3201 res_counter_uncharge(&batch->memcg->memsw, 3202 batch->memsw_nr_pages * PAGE_SIZE); 3203 memcg_oom_recover(batch->memcg); 3204 /* forget this pointer (for sanity check) */ 3205 batch->memcg = NULL; 3206} 3207 3208#ifdef CONFIG_SWAP 3209/* 3210 * called after __delete_from_swap_cache() and drop "page" account. 3211 * memcg information is recorded to swap_cgroup of "ent" 3212 */ 3213void 3214mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) 3215{ 3216 struct mem_cgroup *memcg; 3217 int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT; 3218 3219 if (!swapout) /* this was a swap cache but the swap is unused ! */ 3220 ctype = MEM_CGROUP_CHARGE_TYPE_DROP; 3221 3222 memcg = __mem_cgroup_uncharge_common(page, ctype); 3223 3224 /* 3225 * record memcg information, if swapout && memcg != NULL, 3226 * mem_cgroup_get() was called in uncharge(). 3227 */ 3228 if (do_swap_account && swapout && memcg) 3229 swap_cgroup_record(ent, css_id(&memcg->css)); 3230} 3231#endif 3232 3233#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 3234/* 3235 * called from swap_entry_free(). remove record in swap_cgroup and 3236 * uncharge "memsw" account. 3237 */ 3238void mem_cgroup_uncharge_swap(swp_entry_t ent) 3239{ 3240 struct mem_cgroup *memcg; 3241 unsigned short id; 3242 3243 if (!do_swap_account) 3244 return; 3245 3246 id = swap_cgroup_record(ent, 0); 3247 rcu_read_lock(); 3248 memcg = mem_cgroup_lookup(id); 3249 if (memcg) { 3250 /* 3251 * We uncharge this because swap is freed. 3252 * This memcg can be obsolete one. We avoid calling css_tryget 3253 */ 3254 if (!mem_cgroup_is_root(memcg)) 3255 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 3256 mem_cgroup_swap_statistics(memcg, false); 3257 mem_cgroup_put(memcg); 3258 } 3259 rcu_read_unlock(); 3260} 3261 3262/** 3263 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. 3264 * @entry: swap entry to be moved 3265 * @from: mem_cgroup which the entry is moved from 3266 * @to: mem_cgroup which the entry is moved to 3267 * @need_fixup: whether we should fixup res_counters and refcounts. 3268 * 3269 * It succeeds only when the swap_cgroup's record for this entry is the same 3270 * as the mem_cgroup's id of @from. 3271 * 3272 * Returns 0 on success, -EINVAL on failure. 3273 * 3274 * The caller must have charged to @to, IOW, called res_counter_charge() about 3275 * both res and memsw, and called css_get(). 
3276 */ 3277static int mem_cgroup_move_swap_account(swp_entry_t entry, 3278 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) 3279{ 3280 unsigned short old_id, new_id; 3281 3282 old_id = css_id(&from->css); 3283 new_id = css_id(&to->css); 3284 3285 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { 3286 mem_cgroup_swap_statistics(from, false); 3287 mem_cgroup_swap_statistics(to, true); 3288 /* 3289 * This function is only called from task migration context now. 3290 * It postpones res_counter and refcount handling till the end 3291 * of task migration(mem_cgroup_clear_mc()) for performance 3292 * improvement. But we cannot postpone mem_cgroup_get(to) 3293 * because if the process that has been moved to @to does 3294 * swap-in, the refcount of @to might be decreased to 0. 3295 */ 3296 mem_cgroup_get(to); 3297 if (need_fixup) { 3298 if (!mem_cgroup_is_root(from)) 3299 res_counter_uncharge(&from->memsw, PAGE_SIZE); 3300 mem_cgroup_put(from); 3301 /* 3302 * we charged both to->res and to->memsw, so we should 3303 * uncharge to->res. 3304 */ 3305 if (!mem_cgroup_is_root(to)) 3306 res_counter_uncharge(&to->res, PAGE_SIZE); 3307 } 3308 return 0; 3309 } 3310 return -EINVAL; 3311} 3312#else 3313static inline int mem_cgroup_move_swap_account(swp_entry_t entry, 3314 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) 3315{ 3316 return -EINVAL; 3317} 3318#endif 3319 3320/* 3321 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old 3322 * page belongs to. 3323 */ 3324int mem_cgroup_prepare_migration(struct page *page, 3325 struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask) 3326{ 3327 struct mem_cgroup *mem = NULL; 3328 struct page_cgroup *pc; 3329 enum charge_type ctype; 3330 int ret = 0; 3331 3332 *ptr = NULL; 3333 3334 VM_BUG_ON(PageTransHuge(page)); 3335 if (mem_cgroup_disabled()) 3336 return 0; 3337 3338 pc = lookup_page_cgroup(page); 3339 lock_page_cgroup(pc); 3340 if (PageCgroupUsed(pc)) { 3341 mem = pc->mem_cgroup; 3342 css_get(&mem->css); 3343 /* 3344 * At migrating an anonymous page, its mapcount goes down 3345 * to 0 and uncharge() will be called. But, even if it's fully 3346 * unmapped, migration may fail and this page has to be 3347 * charged again. We set MIGRATION flag here and delay uncharge 3348 * until end_migration() is called 3349 * 3350 * Corner Case Thinking 3351 * A) 3352 * When the old page was mapped as Anon and it's unmap-and-freed 3353 * while migration was ongoing. 3354 * If unmap finds the old page, uncharge() of it will be delayed 3355 * until end_migration(). If unmap finds a new page, it's 3356 * uncharged when it make mapcount to be 1->0. If unmap code 3357 * finds swap_migration_entry, the new page will not be mapped 3358 * and end_migration() will find it(mapcount==0). 3359 * 3360 * B) 3361 * When the old page was mapped but migraion fails, the kernel 3362 * remaps it. A charge for it is kept by MIGRATION flag even 3363 * if mapcount goes down to 0. We can do remap successfully 3364 * without charging it again. 3365 * 3366 * C) 3367 * The "old" page is under lock_page() until the end of 3368 * migration, so, the old page itself will not be swapped-out. 3369 * If the new page is swapped out before end_migraton, our 3370 * hook to usual swap-out path will catch the event. 3371 */ 3372 if (PageAnon(page)) 3373 SetPageCgroupMigration(pc); 3374 } 3375 unlock_page_cgroup(pc); 3376 /* 3377 * If the page is not charged at this point, 3378 * we return here. 
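 * For reference, the migration core brackets the actual page copy with this
 * pair roughly as follows (sketch of the mm/migrate.c usage; the gfp mask
 * and error handling are illustrative):
 *
 *   struct mem_cgroup *mem = NULL;
 *
 *   if (mem_cgroup_prepare_migration(page, newpage, &mem, GFP_KERNEL))
 *       goto abort;                  // old page's memcg could not take the
 *                                    // extra charge for newpage
 *   ...                              // unmap, copy, remap
 *   mem_cgroup_end_migration(mem, page, newpage, migration_ok);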
3379 */ 3380 if (!mem) 3381 return 0; 3382 3383 *ptr = mem; 3384 ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false); 3385 css_put(&mem->css);/* drop extra refcnt */ 3386 if (ret || *ptr == NULL) { 3387 if (PageAnon(page)) { 3388 lock_page_cgroup(pc); 3389 ClearPageCgroupMigration(pc); 3390 unlock_page_cgroup(pc); 3391 /* 3392 * The old page may be fully unmapped while we kept it. 3393 */ 3394 mem_cgroup_uncharge_page(page); 3395 } 3396 return -ENOMEM; 3397 } 3398 /* 3399 * We charge new page before it's used/mapped. So, even if unlock_page() 3400 * is called before end_migration, we can catch all events on this new 3401 * page. In the case new page is migrated but not remapped, new page's 3402 * mapcount will be finally 0 and we call uncharge in end_migration(). 3403 */ 3404 pc = lookup_page_cgroup(newpage); 3405 if (PageAnon(page)) 3406 ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; 3407 else if (page_is_file_cache(page)) 3408 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; 3409 else 3410 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; 3411 __mem_cgroup_commit_charge(mem, page, 1, pc, ctype); 3412 return ret; 3413} 3414 3415/* remove redundant charge if migration failed*/ 3416void mem_cgroup_end_migration(struct mem_cgroup *mem, 3417 struct page *oldpage, struct page *newpage, bool migration_ok) 3418{ 3419 struct page *used, *unused; 3420 struct page_cgroup *pc; 3421 3422 if (!mem) 3423 return; 3424 /* blocks rmdir() */ 3425 cgroup_exclude_rmdir(&mem->css); 3426 if (!migration_ok) { 3427 used = oldpage; 3428 unused = newpage; 3429 } else { 3430 used = newpage; 3431 unused = oldpage; 3432 } 3433 /* 3434 * We disallowed uncharge of pages under migration because mapcount 3435 * of the page goes down to zero, temporarly. 3436 * Clear the flag and check the page should be charged. 3437 */ 3438 pc = lookup_page_cgroup(oldpage); 3439 lock_page_cgroup(pc); 3440 ClearPageCgroupMigration(pc); 3441 unlock_page_cgroup(pc); 3442 3443 __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE); 3444 3445 /* 3446 * If a page is a file cache, radix-tree replacement is very atomic 3447 * and we can skip this check. When it was an Anon page, its mapcount 3448 * goes down to 0. But because we added MIGRATION flage, it's not 3449 * uncharged yet. There are several case but page->mapcount check 3450 * and USED bit check in mem_cgroup_uncharge_page() will do enough 3451 * check. (see prepare_charge() also) 3452 */ 3453 if (PageAnon(used)) 3454 mem_cgroup_uncharge_page(used); 3455 /* 3456 * At migration, we may charge account against cgroup which has no 3457 * tasks. 3458 * So, rmdir()->pre_destroy() can be called while we do this charge. 3459 * In that case, we need to call pre_destroy() again. check it here. 
3460 */ 3461 cgroup_release_and_wakeup_rmdir(&mem->css); 3462} 3463 3464#ifdef CONFIG_DEBUG_VM 3465static struct page_cgroup *lookup_page_cgroup_used(struct page *page) 3466{ 3467 struct page_cgroup *pc; 3468 3469 pc = lookup_page_cgroup(page); 3470 if (likely(pc) && PageCgroupUsed(pc)) 3471 return pc; 3472 return NULL; 3473} 3474 3475bool mem_cgroup_bad_page_check(struct page *page) 3476{ 3477 if (mem_cgroup_disabled()) 3478 return false; 3479 3480 return lookup_page_cgroup_used(page) != NULL; 3481} 3482 3483void mem_cgroup_print_bad_page(struct page *page) 3484{ 3485 struct page_cgroup *pc; 3486 3487 pc = lookup_page_cgroup_used(page); 3488 if (pc) { 3489 int ret = -1; 3490 char *path; 3491 3492 printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p", 3493 pc, pc->flags, pc->mem_cgroup); 3494 3495 path = kmalloc(PATH_MAX, GFP_KERNEL); 3496 if (path) { 3497 rcu_read_lock(); 3498 ret = cgroup_path(pc->mem_cgroup->css.cgroup, 3499 path, PATH_MAX); 3500 rcu_read_unlock(); 3501 } 3502 3503 printk(KERN_CONT "(%s)\n", 3504 (ret < 0) ? "cannot get the path" : path); 3505 kfree(path); 3506 } 3507} 3508#endif 3509 3510static DEFINE_MUTEX(set_limit_mutex); 3511 3512static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, 3513 unsigned long long val) 3514{ 3515 int retry_count; 3516 u64 memswlimit, memlimit; 3517 int ret = 0; 3518 int children = mem_cgroup_count_children(memcg); 3519 u64 curusage, oldusage; 3520 int enlarge; 3521 3522 /* 3523 * For keeping hierarchical_reclaim simple, how long we should retry 3524 * is depends on callers. We set our retry-count to be function 3525 * of # of children which we should visit in this loop. 3526 */ 3527 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children; 3528 3529 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); 3530 3531 enlarge = 0; 3532 while (retry_count) { 3533 if (signal_pending(current)) { 3534 ret = -EINTR; 3535 break; 3536 } 3537 /* 3538 * Rather than hide all in some function, I do this in 3539 * open coded manner. You see what this really does. 3540 * We have to guarantee mem->res.limit < mem->memsw.limit. 3541 */ 3542 mutex_lock(&set_limit_mutex); 3543 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3544 if (memswlimit < val) { 3545 ret = -EINVAL; 3546 mutex_unlock(&set_limit_mutex); 3547 break; 3548 } 3549 3550 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 3551 if (memlimit < val) 3552 enlarge = 1; 3553 3554 ret = res_counter_set_limit(&memcg->res, val); 3555 if (!ret) { 3556 if (memswlimit == val) 3557 memcg->memsw_is_minimum = true; 3558 else 3559 memcg->memsw_is_minimum = false; 3560 } 3561 mutex_unlock(&set_limit_mutex); 3562 3563 if (!ret) 3564 break; 3565 3566 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, 3567 MEM_CGROUP_RECLAIM_SHRINK, 3568 NULL); 3569 curusage = res_counter_read_u64(&memcg->res, RES_USAGE); 3570 /* Usage is reduced ? 
*/ 3571 if (curusage >= oldusage) 3572 retry_count--; 3573 else 3574 oldusage = curusage; 3575 } 3576 if (!ret && enlarge) 3577 memcg_oom_recover(memcg); 3578 3579 return ret; 3580} 3581 3582static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, 3583 unsigned long long val) 3584{ 3585 int retry_count; 3586 u64 memlimit, memswlimit, oldusage, curusage; 3587 int children = mem_cgroup_count_children(memcg); 3588 int ret = -EBUSY; 3589 int enlarge = 0; 3590 3591 /* see mem_cgroup_resize_res_limit */ 3592 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; 3593 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 3594 while (retry_count) { 3595 if (signal_pending(current)) { 3596 ret = -EINTR; 3597 break; 3598 } 3599 /* 3600 * Rather than hide all in some function, I do this in 3601 * open coded manner. You see what this really does. 3602 * We have to guarantee mem->res.limit < mem->memsw.limit. 3603 */ 3604 mutex_lock(&set_limit_mutex); 3605 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 3606 if (memlimit > val) { 3607 ret = -EINVAL; 3608 mutex_unlock(&set_limit_mutex); 3609 break; 3610 } 3611 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3612 if (memswlimit < val) 3613 enlarge = 1; 3614 ret = res_counter_set_limit(&memcg->memsw, val); 3615 if (!ret) { 3616 if (memlimit == val) 3617 memcg->memsw_is_minimum = true; 3618 else 3619 memcg->memsw_is_minimum = false; 3620 } 3621 mutex_unlock(&set_limit_mutex); 3622 3623 if (!ret) 3624 break; 3625 3626 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, 3627 MEM_CGROUP_RECLAIM_NOSWAP | 3628 MEM_CGROUP_RECLAIM_SHRINK, 3629 NULL); 3630 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 3631 /* Usage is reduced ? */ 3632 if (curusage >= oldusage) 3633 retry_count--; 3634 else 3635 oldusage = curusage; 3636 } 3637 if (!ret && enlarge) 3638 memcg_oom_recover(memcg); 3639 return ret; 3640} 3641 3642unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 3643 gfp_t gfp_mask, 3644 unsigned long *total_scanned) 3645{ 3646 unsigned long nr_reclaimed = 0; 3647 struct mem_cgroup_per_zone *mz, *next_mz = NULL; 3648 unsigned long reclaimed; 3649 int loop = 0; 3650 struct mem_cgroup_tree_per_zone *mctz; 3651 unsigned long long excess; 3652 unsigned long nr_scanned; 3653 3654 if (order > 0) 3655 return 0; 3656 3657 mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); 3658 /* 3659 * This loop can run a while, specially if mem_cgroup's continuously 3660 * keep exceeding their soft limit and putting the system under 3661 * pressure 3662 */ 3663 do { 3664 if (next_mz) 3665 mz = next_mz; 3666 else 3667 mz = mem_cgroup_largest_soft_limit_node(mctz); 3668 if (!mz) 3669 break; 3670 3671 nr_scanned = 0; 3672 reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone, 3673 gfp_mask, 3674 MEM_CGROUP_RECLAIM_SOFT, 3675 &nr_scanned); 3676 nr_reclaimed += reclaimed; 3677 *total_scanned += nr_scanned; 3678 spin_lock(&mctz->lock); 3679 3680 /* 3681 * If we failed to reclaim anything from this memory cgroup 3682 * it is time to move on to the next cgroup 3683 */ 3684 next_mz = NULL; 3685 if (!reclaimed) { 3686 do { 3687 /* 3688 * Loop until we find yet another one. 3689 * 3690 * By the time we get the soft_limit lock 3691 * again, someone might have aded the 3692 * group back on the RB tree. Iterate to 3693 * make sure we get a different mem. 
3694 * mem_cgroup_largest_soft_limit_node returns 3695 * NULL if no other cgroup is present on 3696 * the tree 3697 */ 3698 next_mz = 3699 __mem_cgroup_largest_soft_limit_node(mctz); 3700 if (next_mz == mz) 3701 css_put(&next_mz->mem->css); 3702 else /* next_mz == NULL or other memcg */ 3703 break; 3704 } while (1); 3705 } 3706 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); 3707 excess = res_counter_soft_limit_excess(&mz->mem->res); 3708 /* 3709 * One school of thought says that we should not add 3710 * back the node to the tree if reclaim returns 0. 3711 * But our reclaim could return 0, simply because due 3712 * to priority we are exposing a smaller subset of 3713 * memory to reclaim from. Consider this as a longer 3714 * term TODO. 3715 */ 3716 /* If excess == 0, no tree ops */ 3717 __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess); 3718 spin_unlock(&mctz->lock); 3719 css_put(&mz->mem->css); 3720 loop++; 3721 /* 3722 * Could not reclaim anything and there are no more 3723 * mem cgroups to try or we seem to be looping without 3724 * reclaiming anything. 3725 */ 3726 if (!nr_reclaimed && 3727 (next_mz == NULL || 3728 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) 3729 break; 3730 } while (!nr_reclaimed); 3731 if (next_mz) 3732 css_put(&next_mz->mem->css); 3733 return nr_reclaimed; 3734} 3735 3736/* 3737 * This routine traverse page_cgroup in given list and drop them all. 3738 * *And* this routine doesn't reclaim page itself, just removes page_cgroup. 3739 */ 3740static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, 3741 int node, int zid, enum lru_list lru) 3742{ 3743 struct zone *zone; 3744 struct mem_cgroup_per_zone *mz; 3745 struct page_cgroup *pc, *busy; 3746 unsigned long flags, loop; 3747 struct list_head *list; 3748 int ret = 0; 3749 3750 zone = &NODE_DATA(node)->node_zones[zid]; 3751 mz = mem_cgroup_zoneinfo(mem, node, zid); 3752 list = &mz->lists[lru]; 3753 3754 loop = MEM_CGROUP_ZSTAT(mz, lru); 3755 /* give some margin against EBUSY etc...*/ 3756 loop += 256; 3757 busy = NULL; 3758 while (loop--) { 3759 struct page *page; 3760 3761 ret = 0; 3762 spin_lock_irqsave(&zone->lru_lock, flags); 3763 if (list_empty(list)) { 3764 spin_unlock_irqrestore(&zone->lru_lock, flags); 3765 break; 3766 } 3767 pc = list_entry(list->prev, struct page_cgroup, lru); 3768 if (busy == pc) { 3769 list_move(&pc->lru, list); 3770 busy = NULL; 3771 spin_unlock_irqrestore(&zone->lru_lock, flags); 3772 continue; 3773 } 3774 spin_unlock_irqrestore(&zone->lru_lock, flags); 3775 3776 page = lookup_cgroup_page(pc); 3777 3778 ret = mem_cgroup_move_parent(page, pc, mem, GFP_KERNEL); 3779 if (ret == -ENOMEM) 3780 break; 3781 3782 if (ret == -EBUSY || ret == -EINVAL) { 3783 /* found lock contention or "pc" is obsolete. */ 3784 busy = pc; 3785 cond_resched(); 3786 } else 3787 busy = NULL; 3788 } 3789 3790 if (!ret && !list_empty(list)) 3791 return -EBUSY; 3792 return ret; 3793} 3794 3795/* 3796 * make mem_cgroup's charge to be 0 if there is no task. 3797 * This enables deleting this mem_cgroup. 3798 */ 3799static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all) 3800{ 3801 int ret; 3802 int node, zid, shrink; 3803 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 3804 struct cgroup *cgrp = mem->css.cgroup; 3805 3806 css_get(&mem->css); 3807 3808 shrink = 0; 3809 /* should free all ? 
*/ 3810 if (free_all) 3811 goto try_to_free; 3812move_account: 3813 do { 3814 ret = -EBUSY; 3815 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) 3816 goto out; 3817 ret = -EINTR; 3818 if (signal_pending(current)) 3819 goto out; 3820 /* This is for making all *used* pages to be on LRU. */ 3821 lru_add_drain_all(); 3822 drain_all_stock_sync(mem); 3823 ret = 0; 3824 mem_cgroup_start_move(mem); 3825 for_each_node_state(node, N_HIGH_MEMORY) { 3826 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { 3827 enum lru_list l; 3828 for_each_lru(l) { 3829 ret = mem_cgroup_force_empty_list(mem, 3830 node, zid, l); 3831 if (ret) 3832 break; 3833 } 3834 } 3835 if (ret) 3836 break; 3837 } 3838 mem_cgroup_end_move(mem); 3839 memcg_oom_recover(mem); 3840 /* it seems parent cgroup doesn't have enough mem */ 3841 if (ret == -ENOMEM) 3842 goto try_to_free; 3843 cond_resched(); 3844 /* "ret" should also be checked to ensure all lists are empty. */ 3845 } while (mem->res.usage > 0 || ret); 3846out: 3847 css_put(&mem->css); 3848 return ret; 3849 3850try_to_free: 3851 /* returns EBUSY if there is a task or if we come here twice. */ 3852 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) { 3853 ret = -EBUSY; 3854 goto out; 3855 } 3856 /* we call try-to-free pages for make this cgroup empty */ 3857 lru_add_drain_all(); 3858 /* try to free all pages in this cgroup */ 3859 shrink = 1; 3860 while (nr_retries && mem->res.usage > 0) { 3861 struct memcg_scanrecord rec; 3862 int progress; 3863 3864 if (signal_pending(current)) { 3865 ret = -EINTR; 3866 goto out; 3867 } 3868 rec.context = SCAN_BY_SHRINK; 3869 rec.mem = mem; 3870 rec.root = mem; 3871 progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL, 3872 false, &rec); 3873 if (!progress) { 3874 nr_retries--; 3875 /* maybe some writeback is necessary */ 3876 congestion_wait(BLK_RW_ASYNC, HZ/10); 3877 } 3878 3879 } 3880 lru_add_drain(); 3881 /* try move_account...there may be some *locked* pages. */ 3882 goto move_account; 3883} 3884 3885int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) 3886{ 3887 return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true); 3888} 3889 3890 3891static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft) 3892{ 3893 return mem_cgroup_from_cont(cont)->use_hierarchy; 3894} 3895 3896static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, 3897 u64 val) 3898{ 3899 int retval = 0; 3900 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 3901 struct cgroup *parent = cont->parent; 3902 struct mem_cgroup *parent_mem = NULL; 3903 3904 if (parent) 3905 parent_mem = mem_cgroup_from_cont(parent); 3906 3907 cgroup_lock(); 3908 /* 3909 * If parent's use_hierarchy is set, we can't make any modifications 3910 * in the child subtrees. If it is unset, then the change can 3911 * occur, provided the current cgroup has no children. 3912 * 3913 * For the root cgroup, parent_mem is NULL, we allow value to be 3914 * set if there are no children. 
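 * From userspace this is the memory.use_hierarchy control file. A minimal
 * sketch of flipping it from C, assuming the v1 memory controller is
 * mounted at /sys/fs/cgroup/memory and a group named "parent" exists (both
 * assumptions, not taken from this file):
 *
 *   #include <stdio.h>
 *
 *   int main(void)
 *   {
 *       const char *path =
 *           "/sys/fs/cgroup/memory/parent/memory.use_hierarchy";
 *       FILE *f = fopen(path, "w");
 *
 *       if (!f) {
 *           perror("fopen");
 *           return 1;
 *       }
 *       // Succeeds only while "parent" has no child groups; with a child
 *       // present the write fails with EBUSY, and inside a parent that
 *       // itself enforces use_hierarchy it fails with EINVAL.
 *       if (fprintf(f, "1\n") < 0 || fclose(f) != 0) {
 *           perror("write use_hierarchy");
 *           return 1;
 *       }
 *       return 0;
 *   }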
3915 */ 3916 if ((!parent_mem || !parent_mem->use_hierarchy) && 3917 (val == 1 || val == 0)) { 3918 if (list_empty(&cont->children)) 3919 mem->use_hierarchy = val; 3920 else 3921 retval = -EBUSY; 3922 } else 3923 retval = -EINVAL; 3924 cgroup_unlock(); 3925 3926 return retval; 3927} 3928 3929 3930static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *mem, 3931 enum mem_cgroup_stat_index idx) 3932{ 3933 struct mem_cgroup *iter; 3934 long val = 0; 3935 3936 /* Per-cpu values can be negative, use a signed accumulator */ 3937 for_each_mem_cgroup_tree(iter, mem) 3938 val += mem_cgroup_read_stat(iter, idx); 3939 3940 if (val < 0) /* race ? */ 3941 val = 0; 3942 return val; 3943} 3944 3945static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap) 3946{ 3947 u64 val; 3948 3949 if (!mem_cgroup_is_root(mem)) { 3950 if (!swap) 3951 return res_counter_read_u64(&mem->res, RES_USAGE); 3952 else 3953 return res_counter_read_u64(&mem->memsw, RES_USAGE); 3954 } 3955 3956 val = mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_CACHE); 3957 val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_RSS); 3958 3959 if (swap) 3960 val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_SWAPOUT); 3961 3962 return val << PAGE_SHIFT; 3963} 3964 3965static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) 3966{ 3967 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 3968 u64 val; 3969 int type, name; 3970 3971 type = MEMFILE_TYPE(cft->private); 3972 name = MEMFILE_ATTR(cft->private); 3973 switch (type) { 3974 case _MEM: 3975 if (name == RES_USAGE) 3976 val = mem_cgroup_usage(mem, false); 3977 else 3978 val = res_counter_read_u64(&mem->res, name); 3979 break; 3980 case _MEMSWAP: 3981 if (name == RES_USAGE) 3982 val = mem_cgroup_usage(mem, true); 3983 else 3984 val = res_counter_read_u64(&mem->memsw, name); 3985 break; 3986 default: 3987 BUG(); 3988 break; 3989 } 3990 return val; 3991} 3992/* 3993 * The user of this function is... 3994 * RES_LIMIT. 3995 */ 3996static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, 3997 const char *buffer) 3998{ 3999 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 4000 int type, name; 4001 unsigned long long val; 4002 int ret; 4003 4004 type = MEMFILE_TYPE(cft->private); 4005 name = MEMFILE_ATTR(cft->private); 4006 switch (name) { 4007 case RES_LIMIT: 4008 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 4009 ret = -EINVAL; 4010 break; 4011 } 4012 /* This function does all necessary parse...reuse it */ 4013 ret = res_counter_memparse_write_strategy(buffer, &val); 4014 if (ret) 4015 break; 4016 if (type == _MEM) 4017 ret = mem_cgroup_resize_limit(memcg, val); 4018 else 4019 ret = mem_cgroup_resize_memsw_limit(memcg, val); 4020 break; 4021 case RES_SOFT_LIMIT: 4022 ret = res_counter_memparse_write_strategy(buffer, &val); 4023 if (ret) 4024 break; 4025 /* 4026 * For memsw, soft limits are hard to implement in terms 4027 * of semantics, for now, we support soft limits for 4028 * control without swap 4029 */ 4030 if (type == _MEM) 4031 ret = res_counter_set_soft_limit(&memcg->res, val); 4032 else 4033 ret = -EINVAL; 4034 break; 4035 default: 4036 ret = -EINVAL; /* should be BUG() ? 
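 * (reaching this default case would mean a cftype entry was wired to
 * mem_cgroup_write() with an attribute other than RES_LIMIT or
 * RES_SOFT_LIMIT, so -EINVAL is the conservative answer)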
*/ 4037 break; 4038 } 4039 return ret; 4040} 4041 4042static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, 4043 unsigned long long *mem_limit, unsigned long long *memsw_limit) 4044{ 4045 struct cgroup *cgroup; 4046 unsigned long long min_limit, min_memsw_limit, tmp; 4047 4048 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT); 4049 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 4050 cgroup = memcg->css.cgroup; 4051 if (!memcg->use_hierarchy) 4052 goto out; 4053 4054 while (cgroup->parent) { 4055 cgroup = cgroup->parent; 4056 memcg = mem_cgroup_from_cont(cgroup); 4057 if (!memcg->use_hierarchy) 4058 break; 4059 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT); 4060 min_limit = min(min_limit, tmp); 4061 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 4062 min_memsw_limit = min(min_memsw_limit, tmp); 4063 } 4064out: 4065 *mem_limit = min_limit; 4066 *memsw_limit = min_memsw_limit; 4067 return; 4068} 4069 4070static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 4071{ 4072 struct mem_cgroup *mem; 4073 int type, name; 4074 4075 mem = mem_cgroup_from_cont(cont); 4076 type = MEMFILE_TYPE(event); 4077 name = MEMFILE_ATTR(event); 4078 switch (name) { 4079 case RES_MAX_USAGE: 4080 if (type == _MEM) 4081 res_counter_reset_max(&mem->res); 4082 else 4083 res_counter_reset_max(&mem->memsw); 4084 break; 4085 case RES_FAILCNT: 4086 if (type == _MEM) 4087 res_counter_reset_failcnt(&mem->res); 4088 else 4089 res_counter_reset_failcnt(&mem->memsw); 4090 break; 4091 } 4092 4093 return 0; 4094} 4095 4096static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp, 4097 struct cftype *cft) 4098{ 4099 return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate; 4100} 4101 4102#ifdef CONFIG_MMU 4103static int mem_cgroup_move_charge_write(struct cgroup *cgrp, 4104 struct cftype *cft, u64 val) 4105{ 4106 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 4107 4108 if (val >= (1 << NR_MOVE_TYPE)) 4109 return -EINVAL; 4110 /* 4111 * We check this value several times in both in can_attach() and 4112 * attach(), so we need cgroup lock to prevent this value from being 4113 * inconsistent. 
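 * "val" is a bitmask: bit 0 enables moving charges of anonymous pages,
 * bit 1 enables moving charges of file pages (see the move_anon() and
 * move_file() users below).  E.g. from userspace:
 *   echo 3 > memory.move_charge_at_immigrate   # move both kinds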
4114 */ 4115 cgroup_lock(); 4116 mem->move_charge_at_immigrate = val; 4117 cgroup_unlock(); 4118 4119 return 0; 4120} 4121#else 4122static int mem_cgroup_move_charge_write(struct cgroup *cgrp, 4123 struct cftype *cft, u64 val) 4124{ 4125 return -ENOSYS; 4126} 4127#endif 4128 4129 4130/* For read statistics */ 4131enum { 4132 MCS_CACHE, 4133 MCS_RSS, 4134 MCS_FILE_MAPPED, 4135 MCS_PGPGIN, 4136 MCS_PGPGOUT, 4137 MCS_SWAP, 4138 MCS_PGFAULT, 4139 MCS_PGMAJFAULT, 4140 MCS_INACTIVE_ANON, 4141 MCS_ACTIVE_ANON, 4142 MCS_INACTIVE_FILE, 4143 MCS_ACTIVE_FILE, 4144 MCS_UNEVICTABLE, 4145 NR_MCS_STAT, 4146}; 4147 4148struct mcs_total_stat { 4149 s64 stat[NR_MCS_STAT]; 4150}; 4151 4152struct { 4153 char *local_name; 4154 char *total_name; 4155} memcg_stat_strings[NR_MCS_STAT] = { 4156 {"cache", "total_cache"}, 4157 {"rss", "total_rss"}, 4158 {"mapped_file", "total_mapped_file"}, 4159 {"pgpgin", "total_pgpgin"}, 4160 {"pgpgout", "total_pgpgout"}, 4161 {"swap", "total_swap"}, 4162 {"pgfault", "total_pgfault"}, 4163 {"pgmajfault", "total_pgmajfault"}, 4164 {"inactive_anon", "total_inactive_anon"}, 4165 {"active_anon", "total_active_anon"}, 4166 {"inactive_file", "total_inactive_file"}, 4167 {"active_file", "total_active_file"}, 4168 {"unevictable", "total_unevictable"} 4169}; 4170 4171 4172static void 4173mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) 4174{ 4175 s64 val; 4176 4177 /* per cpu stat */ 4178 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); 4179 s->stat[MCS_CACHE] += val * PAGE_SIZE; 4180 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); 4181 s->stat[MCS_RSS] += val * PAGE_SIZE; 4182 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED); 4183 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; 4184 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGIN); 4185 s->stat[MCS_PGPGIN] += val; 4186 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGOUT); 4187 s->stat[MCS_PGPGOUT] += val; 4188 if (do_swap_account) { 4189 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); 4190 s->stat[MCS_SWAP] += val * PAGE_SIZE; 4191 } 4192 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGFAULT); 4193 s->stat[MCS_PGFAULT] += val; 4194 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGMAJFAULT); 4195 s->stat[MCS_PGMAJFAULT] += val; 4196 4197 /* per zone stat */ 4198 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_ANON)); 4199 s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE; 4200 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_ANON)); 4201 s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE; 4202 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_FILE)); 4203 s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE; 4204 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_FILE)); 4205 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; 4206 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_UNEVICTABLE)); 4207 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; 4208} 4209 4210static void 4211mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) 4212{ 4213 struct mem_cgroup *iter; 4214 4215 for_each_mem_cgroup_tree(iter, mem) 4216 mem_cgroup_get_local_stat(iter, s); 4217} 4218 4219#ifdef CONFIG_NUMA 4220static int mem_control_numa_stat_show(struct seq_file *m, void *arg) 4221{ 4222 int nid; 4223 unsigned long total_nr, file_nr, anon_nr, unevictable_nr; 4224 unsigned long node_nr; 4225 struct cgroup *cont = m->private; 4226 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); 4227 4228 total_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL); 4229 seq_printf(m, 
"total=%lu", total_nr); 4230 for_each_node_state(nid, N_HIGH_MEMORY) { 4231 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, LRU_ALL); 4232 seq_printf(m, " N%d=%lu", nid, node_nr); 4233 } 4234 seq_putc(m, '\n'); 4235 4236 file_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_FILE); 4237 seq_printf(m, "file=%lu", file_nr); 4238 for_each_node_state(nid, N_HIGH_MEMORY) { 4239 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, 4240 LRU_ALL_FILE); 4241 seq_printf(m, " N%d=%lu", nid, node_nr); 4242 } 4243 seq_putc(m, '\n'); 4244 4245 anon_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_ANON); 4246 seq_printf(m, "anon=%lu", anon_nr); 4247 for_each_node_state(nid, N_HIGH_MEMORY) { 4248 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, 4249 LRU_ALL_ANON); 4250 seq_printf(m, " N%d=%lu", nid, node_nr); 4251 } 4252 seq_putc(m, '\n'); 4253 4254 unevictable_nr = mem_cgroup_nr_lru_pages(mem_cont, BIT(LRU_UNEVICTABLE)); 4255 seq_printf(m, "unevictable=%lu", unevictable_nr); 4256 for_each_node_state(nid, N_HIGH_MEMORY) { 4257 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, 4258 BIT(LRU_UNEVICTABLE)); 4259 seq_printf(m, " N%d=%lu", nid, node_nr); 4260 } 4261 seq_putc(m, '\n'); 4262 return 0; 4263} 4264#endif /* CONFIG_NUMA */ 4265 4266static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, 4267 struct cgroup_map_cb *cb) 4268{ 4269 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); 4270 struct mcs_total_stat mystat; 4271 int i; 4272 4273 memset(&mystat, 0, sizeof(mystat)); 4274 mem_cgroup_get_local_stat(mem_cont, &mystat); 4275 4276 4277 for (i = 0; i < NR_MCS_STAT; i++) { 4278 if (i == MCS_SWAP && !do_swap_account) 4279 continue; 4280 cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]); 4281 } 4282 4283 /* Hierarchical information */ 4284 { 4285 unsigned long long limit, memsw_limit; 4286 memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit); 4287 cb->fill(cb, "hierarchical_memory_limit", limit); 4288 if (do_swap_account) 4289 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit); 4290 } 4291 4292 memset(&mystat, 0, sizeof(mystat)); 4293 mem_cgroup_get_total_stat(mem_cont, &mystat); 4294 for (i = 0; i < NR_MCS_STAT; i++) { 4295 if (i == MCS_SWAP && !do_swap_account) 4296 continue; 4297 cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]); 4298 } 4299 4300#ifdef CONFIG_DEBUG_VM 4301 cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); 4302 4303 { 4304 int nid, zid; 4305 struct mem_cgroup_per_zone *mz; 4306 unsigned long recent_rotated[2] = {0, 0}; 4307 unsigned long recent_scanned[2] = {0, 0}; 4308 4309 for_each_online_node(nid) 4310 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 4311 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); 4312 4313 recent_rotated[0] += 4314 mz->reclaim_stat.recent_rotated[0]; 4315 recent_rotated[1] += 4316 mz->reclaim_stat.recent_rotated[1]; 4317 recent_scanned[0] += 4318 mz->reclaim_stat.recent_scanned[0]; 4319 recent_scanned[1] += 4320 mz->reclaim_stat.recent_scanned[1]; 4321 } 4322 cb->fill(cb, "recent_rotated_anon", recent_rotated[0]); 4323 cb->fill(cb, "recent_rotated_file", recent_rotated[1]); 4324 cb->fill(cb, "recent_scanned_anon", recent_scanned[0]); 4325 cb->fill(cb, "recent_scanned_file", recent_scanned[1]); 4326 } 4327#endif 4328 4329 return 0; 4330} 4331 4332static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft) 4333{ 4334 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4335 4336 return mem_cgroup_swappiness(memcg); 4337} 4338 4339static int 
mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, 4340 u64 val) 4341{ 4342 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4343 struct mem_cgroup *parent; 4344 4345 if (val > 100) 4346 return -EINVAL; 4347 4348 if (cgrp->parent == NULL) 4349 return -EINVAL; 4350 4351 parent = mem_cgroup_from_cont(cgrp->parent); 4352 4353 cgroup_lock(); 4354 4355 /* If under hierarchy, only empty-root can set this value */ 4356 if ((parent->use_hierarchy) || 4357 (memcg->use_hierarchy && !list_empty(&cgrp->children))) { 4358 cgroup_unlock(); 4359 return -EINVAL; 4360 } 4361 4362 memcg->swappiness = val; 4363 4364 cgroup_unlock(); 4365 4366 return 0; 4367} 4368 4369static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) 4370{ 4371 struct mem_cgroup_threshold_ary *t; 4372 u64 usage; 4373 int i; 4374 4375 rcu_read_lock(); 4376 if (!swap) 4377 t = rcu_dereference(memcg->thresholds.primary); 4378 else 4379 t = rcu_dereference(memcg->memsw_thresholds.primary); 4380 4381 if (!t) 4382 goto unlock; 4383 4384 usage = mem_cgroup_usage(memcg, swap); 4385 4386 /* 4387 * current_threshold points to threshold just below usage. 4388 * If it's not true, a threshold was crossed after last 4389 * call of __mem_cgroup_threshold(). 4390 */ 4391 i = t->current_threshold; 4392 4393 /* 4394 * Iterate backward over array of thresholds starting from 4395 * current_threshold and check if a threshold is crossed. 4396 * If none of thresholds below usage is crossed, we read 4397 * only one element of the array here. 4398 */ 4399 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) 4400 eventfd_signal(t->entries[i].eventfd, 1); 4401 4402 /* i = current_threshold + 1 */ 4403 i++; 4404 4405 /* 4406 * Iterate forward over array of thresholds starting from 4407 * current_threshold+1 and check if a threshold is crossed. 4408 * If none of thresholds above usage is crossed, we read 4409 * only one element of the array here. 
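 * Worked example (illustrative numbers): thresholds {4M, 8M, 16M} with
 * current_threshold at the 8M entry.  If usage grew to 20M, the backward
 * scan above fires nothing, the forward scan below signals the 16M
 * eventfd, and current_threshold ends up at the 16M entry.  If usage
 * instead dropped to 3M, the backward scan signals 8M and 4M and
 * current_threshold ends up at -1.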
4410 */ 4411 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) 4412 eventfd_signal(t->entries[i].eventfd, 1); 4413 4414 /* Update current_threshold */ 4415 t->current_threshold = i - 1; 4416unlock: 4417 rcu_read_unlock(); 4418} 4419 4420static void mem_cgroup_threshold(struct mem_cgroup *memcg) 4421{ 4422 while (memcg) { 4423 __mem_cgroup_threshold(memcg, false); 4424 if (do_swap_account) 4425 __mem_cgroup_threshold(memcg, true); 4426 4427 memcg = parent_mem_cgroup(memcg); 4428 } 4429} 4430 4431static int compare_thresholds(const void *a, const void *b) 4432{ 4433 const struct mem_cgroup_threshold *_a = a; 4434 const struct mem_cgroup_threshold *_b = b; 4435 4436 return _a->threshold - _b->threshold; 4437} 4438 4439static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem) 4440{ 4441 struct mem_cgroup_eventfd_list *ev; 4442 4443 list_for_each_entry(ev, &mem->oom_notify, list) 4444 eventfd_signal(ev->eventfd, 1); 4445 return 0; 4446} 4447 4448static void mem_cgroup_oom_notify(struct mem_cgroup *mem) 4449{ 4450 struct mem_cgroup *iter; 4451 4452 for_each_mem_cgroup_tree(iter, mem) 4453 mem_cgroup_oom_notify_cb(iter); 4454} 4455 4456static int mem_cgroup_usage_register_event(struct cgroup *cgrp, 4457 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 4458{ 4459 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4460 struct mem_cgroup_thresholds *thresholds; 4461 struct mem_cgroup_threshold_ary *new; 4462 int type = MEMFILE_TYPE(cft->private); 4463 u64 threshold, usage; 4464 int i, size, ret; 4465 4466 ret = res_counter_memparse_write_strategy(args, &threshold); 4467 if (ret) 4468 return ret; 4469 4470 mutex_lock(&memcg->thresholds_lock); 4471 4472 if (type == _MEM) 4473 thresholds = &memcg->thresholds; 4474 else if (type == _MEMSWAP) 4475 thresholds = &memcg->memsw_thresholds; 4476 else 4477 BUG(); 4478 4479 usage = mem_cgroup_usage(memcg, type == _MEMSWAP); 4480 4481 /* Check if a threshold crossed before adding a new one */ 4482 if (thresholds->primary) 4483 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 4484 4485 size = thresholds->primary ? thresholds->primary->size + 1 : 1; 4486 4487 /* Allocate memory for new array of thresholds */ 4488 new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold), 4489 GFP_KERNEL); 4490 if (!new) { 4491 ret = -ENOMEM; 4492 goto unlock; 4493 } 4494 new->size = size; 4495 4496 /* Copy thresholds (if any) to new array */ 4497 if (thresholds->primary) { 4498 memcpy(new->entries, thresholds->primary->entries, (size - 1) * 4499 sizeof(struct mem_cgroup_threshold)); 4500 } 4501 4502 /* Add new threshold */ 4503 new->entries[size - 1].eventfd = eventfd; 4504 new->entries[size - 1].threshold = threshold; 4505 4506 /* Sort thresholds. Registering of new threshold isn't time-critical */ 4507 sort(new->entries, size, sizeof(struct mem_cgroup_threshold), 4508 compare_thresholds, NULL); 4509 4510 /* Find current threshold */ 4511 new->current_threshold = -1; 4512 for (i = 0; i < size; i++) { 4513 if (new->entries[i].threshold < usage) { 4514 /* 4515 * new->current_threshold will not be used until 4516 * rcu_assign_pointer(), so it's safe to increment 4517 * it here. 
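 * (concurrent readers under rcu_read_lock() still see the old primary
 * array at this point; they only switch to "new" after the
 * rcu_assign_pointer() below)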
4518 */ 4519 ++new->current_threshold; 4520 } 4521 } 4522 4523 /* Free old spare buffer and save old primary buffer as spare */ 4524 kfree(thresholds->spare); 4525 thresholds->spare = thresholds->primary; 4526 4527 rcu_assign_pointer(thresholds->primary, new); 4528 4529 /* To be sure that nobody uses thresholds */ 4530 synchronize_rcu(); 4531 4532unlock: 4533 mutex_unlock(&memcg->thresholds_lock); 4534 4535 return ret; 4536} 4537 4538static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, 4539 struct cftype *cft, struct eventfd_ctx *eventfd) 4540{ 4541 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4542 struct mem_cgroup_thresholds *thresholds; 4543 struct mem_cgroup_threshold_ary *new; 4544 int type = MEMFILE_TYPE(cft->private); 4545 u64 usage; 4546 int i, j, size; 4547 4548 mutex_lock(&memcg->thresholds_lock); 4549 if (type == _MEM) 4550 thresholds = &memcg->thresholds; 4551 else if (type == _MEMSWAP) 4552 thresholds = &memcg->memsw_thresholds; 4553 else 4554 BUG(); 4555 4556 /* 4557 * Something went wrong if we trying to unregister a threshold 4558 * if we don't have thresholds 4559 */ 4560 BUG_ON(!thresholds); 4561 4562 usage = mem_cgroup_usage(memcg, type == _MEMSWAP); 4563 4564 /* Check if a threshold crossed before removing */ 4565 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 4566 4567 /* Calculate new number of threshold */ 4568 size = 0; 4569 for (i = 0; i < thresholds->primary->size; i++) { 4570 if (thresholds->primary->entries[i].eventfd != eventfd) 4571 size++; 4572 } 4573 4574 new = thresholds->spare; 4575 4576 /* Set thresholds array to NULL if we don't have thresholds */ 4577 if (!size) { 4578 kfree(new); 4579 new = NULL; 4580 goto swap_buffers; 4581 } 4582 4583 new->size = size; 4584 4585 /* Copy thresholds and find current threshold */ 4586 new->current_threshold = -1; 4587 for (i = 0, j = 0; i < thresholds->primary->size; i++) { 4588 if (thresholds->primary->entries[i].eventfd == eventfd) 4589 continue; 4590 4591 new->entries[j] = thresholds->primary->entries[i]; 4592 if (new->entries[j].threshold < usage) { 4593 /* 4594 * new->current_threshold will not be used 4595 * until rcu_assign_pointer(), so it's safe to increment 4596 * it here. 4597 */ 4598 ++new->current_threshold; 4599 } 4600 j++; 4601 } 4602 4603swap_buffers: 4604 /* Swap primary and spare array */ 4605 thresholds->spare = thresholds->primary; 4606 rcu_assign_pointer(thresholds->primary, new); 4607 4608 /* To be sure that nobody uses thresholds */ 4609 synchronize_rcu(); 4610 4611 mutex_unlock(&memcg->thresholds_lock); 4612} 4613 4614static int mem_cgroup_oom_register_event(struct cgroup *cgrp, 4615 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 4616{ 4617 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4618 struct mem_cgroup_eventfd_list *event; 4619 int type = MEMFILE_TYPE(cft->private); 4620 4621 BUG_ON(type != _OOM_TYPE); 4622 event = kmalloc(sizeof(*event), GFP_KERNEL); 4623 if (!event) 4624 return -ENOMEM; 4625 4626 spin_lock(&memcg_oom_lock); 4627 4628 event->eventfd = eventfd; 4629 list_add(&event->list, &memcg->oom_notify); 4630 4631 /* already in OOM ? 
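 * If the group is already under OOM when the listener registers, signal
 * the eventfd immediately so the new listener does not miss the OOM
 * that is currently in progress.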
*/ 4632 if (atomic_read(&memcg->under_oom)) 4633 eventfd_signal(eventfd, 1); 4634 spin_unlock(&memcg_oom_lock); 4635 4636 return 0; 4637} 4638 4639static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, 4640 struct cftype *cft, struct eventfd_ctx *eventfd) 4641{ 4642 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 4643 struct mem_cgroup_eventfd_list *ev, *tmp; 4644 int type = MEMFILE_TYPE(cft->private); 4645 4646 BUG_ON(type != _OOM_TYPE); 4647 4648 spin_lock(&memcg_oom_lock); 4649 4650 list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) { 4651 if (ev->eventfd == eventfd) { 4652 list_del(&ev->list); 4653 kfree(ev); 4654 } 4655 } 4656 4657 spin_unlock(&memcg_oom_lock); 4658} 4659 4660static int mem_cgroup_oom_control_read(struct cgroup *cgrp, 4661 struct cftype *cft, struct cgroup_map_cb *cb) 4662{ 4663 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 4664 4665 cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable); 4666 4667 if (atomic_read(&mem->under_oom)) 4668 cb->fill(cb, "under_oom", 1); 4669 else 4670 cb->fill(cb, "under_oom", 0); 4671 return 0; 4672} 4673 4674static int mem_cgroup_oom_control_write(struct cgroup *cgrp, 4675 struct cftype *cft, u64 val) 4676{ 4677 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 4678 struct mem_cgroup *parent; 4679 4680 /* cannot set to root cgroup and only 0 and 1 are allowed */ 4681 if (!cgrp->parent || !((val == 0) || (val == 1))) 4682 return -EINVAL; 4683 4684 parent = mem_cgroup_from_cont(cgrp->parent); 4685 4686 cgroup_lock(); 4687 /* oom-kill-disable is a flag for subhierarchy. */ 4688 if ((parent->use_hierarchy) || 4689 (mem->use_hierarchy && !list_empty(&cgrp->children))) { 4690 cgroup_unlock(); 4691 return -EINVAL; 4692 } 4693 mem->oom_kill_disable = val; 4694 if (!val) 4695 memcg_oom_recover(mem); 4696 cgroup_unlock(); 4697 return 0; 4698} 4699 4700#ifdef CONFIG_NUMA 4701static const struct file_operations mem_control_numa_stat_file_operations = { 4702 .read = seq_read, 4703 .llseek = seq_lseek, 4704 .release = single_release, 4705}; 4706 4707static int mem_control_numa_stat_open(struct inode *unused, struct file *file) 4708{ 4709 struct cgroup *cont = file->f_dentry->d_parent->d_fsdata; 4710 4711 file->f_op = &mem_control_numa_stat_file_operations; 4712 return single_open(file, mem_control_numa_stat_show, cont); 4713} 4714#endif /* CONFIG_NUMA */ 4715 4716static int mem_cgroup_vmscan_stat_read(struct cgroup *cgrp, 4717 struct cftype *cft, 4718 struct cgroup_map_cb *cb) 4719{ 4720 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 4721 char string[64]; 4722 int i; 4723 4724 for (i = 0; i < NR_SCANSTATS; i++) { 4725 strcpy(string, scanstat_string[i]); 4726 strcat(string, SCANSTAT_WORD_LIMIT); 4727 cb->fill(cb, string, mem->scanstat.stats[SCAN_BY_LIMIT][i]); 4728 } 4729 4730 for (i = 0; i < NR_SCANSTATS; i++) { 4731 strcpy(string, scanstat_string[i]); 4732 strcat(string, SCANSTAT_WORD_SYSTEM); 4733 cb->fill(cb, string, mem->scanstat.stats[SCAN_BY_SYSTEM][i]); 4734 } 4735 4736 for (i = 0; i < NR_SCANSTATS; i++) { 4737 strcpy(string, scanstat_string[i]); 4738 strcat(string, SCANSTAT_WORD_LIMIT); 4739 strcat(string, SCANSTAT_WORD_HIERARCHY); 4740 cb->fill(cb, string, mem->scanstat.rootstats[SCAN_BY_LIMIT][i]); 4741 } 4742 for (i = 0; i < NR_SCANSTATS; i++) { 4743 strcpy(string, scanstat_string[i]); 4744 strcat(string, SCANSTAT_WORD_SYSTEM); 4745 strcat(string, SCANSTAT_WORD_HIERARCHY); 4746 cb->fill(cb, string, mem->scanstat.rootstats[SCAN_BY_SYSTEM][i]); 4747 } 4748 return 0; 4749} 4750 4751static int 
mem_cgroup_reset_vmscan_stat(struct cgroup *cgrp, 4752 unsigned int event) 4753{ 4754 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 4755 4756 spin_lock(&mem->scanstat.lock); 4757 memset(&mem->scanstat.stats, 0, sizeof(mem->scanstat.stats)); 4758 memset(&mem->scanstat.rootstats, 0, sizeof(mem->scanstat.rootstats)); 4759 spin_unlock(&mem->scanstat.lock); 4760 return 0; 4761} 4762 4763 4764static struct cftype mem_cgroup_files[] = { 4765 { 4766 .name = "usage_in_bytes", 4767 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 4768 .read_u64 = mem_cgroup_read, 4769 .register_event = mem_cgroup_usage_register_event, 4770 .unregister_event = mem_cgroup_usage_unregister_event, 4771 }, 4772 { 4773 .name = "max_usage_in_bytes", 4774 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 4775 .trigger = mem_cgroup_reset, 4776 .read_u64 = mem_cgroup_read, 4777 }, 4778 { 4779 .name = "limit_in_bytes", 4780 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 4781 .write_string = mem_cgroup_write, 4782 .read_u64 = mem_cgroup_read, 4783 }, 4784 { 4785 .name = "soft_limit_in_bytes", 4786 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 4787 .write_string = mem_cgroup_write, 4788 .read_u64 = mem_cgroup_read, 4789 }, 4790 { 4791 .name = "failcnt", 4792 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 4793 .trigger = mem_cgroup_reset, 4794 .read_u64 = mem_cgroup_read, 4795 }, 4796 { 4797 .name = "stat", 4798 .read_map = mem_control_stat_show, 4799 }, 4800 { 4801 .name = "force_empty", 4802 .trigger = mem_cgroup_force_empty_write, 4803 }, 4804 { 4805 .name = "use_hierarchy", 4806 .write_u64 = mem_cgroup_hierarchy_write, 4807 .read_u64 = mem_cgroup_hierarchy_read, 4808 }, 4809 { 4810 .name = "swappiness", 4811 .read_u64 = mem_cgroup_swappiness_read, 4812 .write_u64 = mem_cgroup_swappiness_write, 4813 }, 4814 { 4815 .name = "move_charge_at_immigrate", 4816 .read_u64 = mem_cgroup_move_charge_read, 4817 .write_u64 = mem_cgroup_move_charge_write, 4818 }, 4819 { 4820 .name = "oom_control", 4821 .read_map = mem_cgroup_oom_control_read, 4822 .write_u64 = mem_cgroup_oom_control_write, 4823 .register_event = mem_cgroup_oom_register_event, 4824 .unregister_event = mem_cgroup_oom_unregister_event, 4825 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 4826 }, 4827#ifdef CONFIG_NUMA 4828 { 4829 .name = "numa_stat", 4830 .open = mem_control_numa_stat_open, 4831 .mode = S_IRUGO, 4832 }, 4833#endif 4834 { 4835 .name = "vmscan_stat", 4836 .read_map = mem_cgroup_vmscan_stat_read, 4837 .trigger = mem_cgroup_reset_vmscan_stat, 4838 }, 4839}; 4840 4841#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4842static struct cftype memsw_cgroup_files[] = { 4843 { 4844 .name = "memsw.usage_in_bytes", 4845 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 4846 .read_u64 = mem_cgroup_read, 4847 .register_event = mem_cgroup_usage_register_event, 4848 .unregister_event = mem_cgroup_usage_unregister_event, 4849 }, 4850 { 4851 .name = "memsw.max_usage_in_bytes", 4852 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 4853 .trigger = mem_cgroup_reset, 4854 .read_u64 = mem_cgroup_read, 4855 }, 4856 { 4857 .name = "memsw.limit_in_bytes", 4858 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 4859 .write_string = mem_cgroup_write, 4860 .read_u64 = mem_cgroup_read, 4861 }, 4862 { 4863 .name = "memsw.failcnt", 4864 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 4865 .trigger = mem_cgroup_reset, 4866 .read_u64 = mem_cgroup_read, 4867 }, 4868}; 4869 4870static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) 4871{ 4872 if (!do_swap_account) 4873 
return 0; 4874 return cgroup_add_files(cont, ss, memsw_cgroup_files, 4875 ARRAY_SIZE(memsw_cgroup_files)); 4876}; 4877#else 4878static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) 4879{ 4880 return 0; 4881} 4882#endif 4883 4884static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) 4885{ 4886 struct mem_cgroup_per_node *pn; 4887 struct mem_cgroup_per_zone *mz; 4888 enum lru_list l; 4889 int zone, tmp = node; 4890 /* 4891 * This routine is called against possible nodes. 4892 * But it's BUG to call kmalloc() against offline node. 4893 * 4894 * TODO: this routine can waste much memory for nodes which will 4895 * never be onlined. It's better to use memory hotplug callback 4896 * function. 4897 */ 4898 if (!node_state(node, N_NORMAL_MEMORY)) 4899 tmp = -1; 4900 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); 4901 if (!pn) 4902 return 1; 4903 4904 mem->info.nodeinfo[node] = pn; 4905 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4906 mz = &pn->zoneinfo[zone]; 4907 for_each_lru(l) 4908 INIT_LIST_HEAD(&mz->lists[l]); 4909 mz->usage_in_excess = 0; 4910 mz->on_tree = false; 4911 mz->mem = mem; 4912 } 4913 return 0; 4914} 4915 4916static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) 4917{ 4918 kfree(mem->info.nodeinfo[node]); 4919} 4920 4921static struct mem_cgroup *mem_cgroup_alloc(void) 4922{ 4923 struct mem_cgroup *mem; 4924 int size = sizeof(struct mem_cgroup); 4925 4926 /* Can be very big if MAX_NUMNODES is very big */ 4927 if (size < PAGE_SIZE) 4928 mem = kzalloc(size, GFP_KERNEL); 4929 else 4930 mem = vzalloc(size); 4931 4932 if (!mem) 4933 return NULL; 4934 4935 mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); 4936 if (!mem->stat) 4937 goto out_free; 4938 spin_lock_init(&mem->pcp_counter_lock); 4939 return mem; 4940 4941out_free: 4942 if (size < PAGE_SIZE) 4943 kfree(mem); 4944 else 4945 vfree(mem); 4946 return NULL; 4947} 4948 4949/* 4950 * At destroying mem_cgroup, references from swap_cgroup can remain. 4951 * (scanning all at force_empty is too costly...) 4952 * 4953 * Instead of clearing all references at force_empty, we remember 4954 * the number of reference from swap_cgroup and free mem_cgroup when 4955 * it goes down to 0. 4956 * 4957 * Removal of cgroup itself succeeds regardless of refs from swap. 4958 */ 4959 4960static void __mem_cgroup_free(struct mem_cgroup *mem) 4961{ 4962 int node; 4963 4964 mem_cgroup_remove_from_trees(mem); 4965 free_css_id(&mem_cgroup_subsys, &mem->css); 4966 4967 for_each_node_state(node, N_POSSIBLE) 4968 free_mem_cgroup_per_zone_info(mem, node); 4969 4970 free_percpu(mem->stat); 4971 if (sizeof(struct mem_cgroup) < PAGE_SIZE) 4972 kfree(mem); 4973 else 4974 vfree(mem); 4975} 4976 4977static void mem_cgroup_get(struct mem_cgroup *mem) 4978{ 4979 atomic_inc(&mem->refcnt); 4980} 4981 4982static void __mem_cgroup_put(struct mem_cgroup *mem, int count) 4983{ 4984 if (atomic_sub_and_test(count, &mem->refcnt)) { 4985 struct mem_cgroup *parent = parent_mem_cgroup(mem); 4986 __mem_cgroup_free(mem); 4987 if (parent) 4988 mem_cgroup_put(parent); 4989 } 4990} 4991 4992static void mem_cgroup_put(struct mem_cgroup *mem) 4993{ 4994 __mem_cgroup_put(mem, 1); 4995} 4996 4997/* 4998 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. 
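 * The res_counter parent pointer is only set up in mem_cgroup_create()
 * when use_hierarchy is inherited, so a NULL res.parent below means
 * either the root cgroup or a cgroup outside any hierarchy.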
4999 */ 5000static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem) 5001{ 5002 if (!mem->res.parent) 5003 return NULL; 5004 return mem_cgroup_from_res_counter(mem->res.parent, res); 5005} 5006 5007#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 5008static void __init enable_swap_cgroup(void) 5009{ 5010 if (!mem_cgroup_disabled() && really_do_swap_account) 5011 do_swap_account = 1; 5012} 5013#else 5014static void __init enable_swap_cgroup(void) 5015{ 5016} 5017#endif 5018 5019static int mem_cgroup_soft_limit_tree_init(void) 5020{ 5021 struct mem_cgroup_tree_per_node *rtpn; 5022 struct mem_cgroup_tree_per_zone *rtpz; 5023 int tmp, node, zone; 5024 5025 for_each_node_state(node, N_POSSIBLE) { 5026 tmp = node; 5027 if (!node_state(node, N_NORMAL_MEMORY)) 5028 tmp = -1; 5029 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); 5030 if (!rtpn) 5031 return 1; 5032 5033 soft_limit_tree.rb_tree_per_node[node] = rtpn; 5034 5035 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 5036 rtpz = &rtpn->rb_tree_per_zone[zone]; 5037 rtpz->rb_root = RB_ROOT; 5038 spin_lock_init(&rtpz->lock); 5039 } 5040 } 5041 return 0; 5042} 5043 5044static struct cgroup_subsys_state * __ref 5045mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) 5046{ 5047 struct mem_cgroup *mem, *parent; 5048 long error = -ENOMEM; 5049 int node; 5050 5051 mem = mem_cgroup_alloc(); 5052 if (!mem) 5053 return ERR_PTR(error); 5054 5055 for_each_node_state(node, N_POSSIBLE) 5056 if (alloc_mem_cgroup_per_zone_info(mem, node)) 5057 goto free_out; 5058 5059 /* root ? */ 5060 if (cont->parent == NULL) { 5061 int cpu; 5062 enable_swap_cgroup(); 5063 parent = NULL; 5064 root_mem_cgroup = mem; 5065 if (mem_cgroup_soft_limit_tree_init()) 5066 goto free_out; 5067 for_each_possible_cpu(cpu) { 5068 struct memcg_stock_pcp *stock = 5069 &per_cpu(memcg_stock, cpu); 5070 INIT_WORK(&stock->work, drain_local_stock); 5071 } 5072 hotcpu_notifier(memcg_cpu_hotplug_callback, 0); 5073 } else { 5074 parent = mem_cgroup_from_cont(cont->parent); 5075 mem->use_hierarchy = parent->use_hierarchy; 5076 mem->oom_kill_disable = parent->oom_kill_disable; 5077 } 5078 5079 if (parent && parent->use_hierarchy) { 5080 res_counter_init(&mem->res, &parent->res); 5081 res_counter_init(&mem->memsw, &parent->memsw); 5082 /* 5083 * We increment refcnt of the parent to ensure that we can 5084 * safely access it on res_counter_charge/uncharge. 5085 * This refcnt will be decremented when freeing this 5086 * mem_cgroup(see mem_cgroup_put). 
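 * (the child's res_counter chains to the parent's, so the parent must
 * stay allocated for as long as the child can charge or uncharge)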
5087 */ 5088 mem_cgroup_get(parent); 5089 } else { 5090 res_counter_init(&mem->res, NULL); 5091 res_counter_init(&mem->memsw, NULL); 5092 } 5093 mem->last_scanned_child = 0; 5094 mem->last_scanned_node = MAX_NUMNODES; 5095 INIT_LIST_HEAD(&mem->oom_notify); 5096 5097 if (parent) 5098 mem->swappiness = mem_cgroup_swappiness(parent); 5099 atomic_set(&mem->refcnt, 1); 5100 mem->move_charge_at_immigrate = 0; 5101 mutex_init(&mem->thresholds_lock); 5102 spin_lock_init(&mem->scanstat.lock); 5103 return &mem->css; 5104free_out: 5105 __mem_cgroup_free(mem); 5106 root_mem_cgroup = NULL; 5107 return ERR_PTR(error); 5108} 5109 5110static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss, 5111 struct cgroup *cont) 5112{ 5113 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 5114 5115 return mem_cgroup_force_empty(mem, false); 5116} 5117 5118static void mem_cgroup_destroy(struct cgroup_subsys *ss, 5119 struct cgroup *cont) 5120{ 5121 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 5122 5123 mem_cgroup_put(mem); 5124} 5125 5126static int mem_cgroup_populate(struct cgroup_subsys *ss, 5127 struct cgroup *cont) 5128{ 5129 int ret; 5130 5131 ret = cgroup_add_files(cont, ss, mem_cgroup_files, 5132 ARRAY_SIZE(mem_cgroup_files)); 5133 5134 if (!ret) 5135 ret = register_memsw_files(cont, ss); 5136 return ret; 5137} 5138 5139#ifdef CONFIG_MMU 5140/* Handlers for move charge at task migration. */ 5141#define PRECHARGE_COUNT_AT_ONCE 256 5142static int mem_cgroup_do_precharge(unsigned long count) 5143{ 5144 int ret = 0; 5145 int batch_count = PRECHARGE_COUNT_AT_ONCE; 5146 struct mem_cgroup *mem = mc.to; 5147 5148 if (mem_cgroup_is_root(mem)) { 5149 mc.precharge += count; 5150 /* we don't need css_get for root */ 5151 return ret; 5152 } 5153 /* try to charge at once */ 5154 if (count > 1) { 5155 struct res_counter *dummy; 5156 /* 5157 * "mem" cannot be under rmdir() because we've already checked 5158 * by cgroup_lock_live_cgroup() that it is not removed and we 5159 * are still under the same cgroup_mutex. So we can postpone 5160 * css_get(). 5161 */ 5162 if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy)) 5163 goto one_by_one; 5164 if (do_swap_account && res_counter_charge(&mem->memsw, 5165 PAGE_SIZE * count, &dummy)) { 5166 res_counter_uncharge(&mem->res, PAGE_SIZE * count); 5167 goto one_by_one; 5168 } 5169 mc.precharge += count; 5170 return ret; 5171 } 5172one_by_one: 5173 /* fall back to one by one charge */ 5174 while (count--) { 5175 if (signal_pending(current)) { 5176 ret = -EINTR; 5177 break; 5178 } 5179 if (!batch_count--) { 5180 batch_count = PRECHARGE_COUNT_AT_ONCE; 5181 cond_resched(); 5182 } 5183 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, 1, &mem, false); 5184 if (ret || !mem) 5185 /* mem_cgroup_clear_mc() will do uncharge later */ 5186 return -ENOMEM; 5187 mc.precharge++; 5188 } 5189 return ret; 5190} 5191 5192/** 5193 * is_target_pte_for_mc - check a pte whether it is valid for move charge 5194 * @vma: the vma the pte to be checked belongs 5195 * @addr: the address corresponding to the pte to be checked 5196 * @ptent: the pte to be checked 5197 * @target: the pointer the target page or swap ent will be stored(can be NULL) 5198 * 5199 * Returns 5200 * 0(MC_TARGET_NONE): if the pte is not a target for move charge. 5201 * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for 5202 * move charge. if @target is not NULL, the page is stored in target->page 5203 * with extra refcnt got(Callers should handle it). 
5204 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a 5205 * target for charge migration. if @target is not NULL, the entry is stored 5206 * in target->ent. 5207 * 5208 * Called with pte lock held. 5209 */ 5210union mc_target { 5211 struct page *page; 5212 swp_entry_t ent; 5213}; 5214 5215enum mc_target_type { 5216 MC_TARGET_NONE, /* not used */ 5217 MC_TARGET_PAGE, 5218 MC_TARGET_SWAP, 5219}; 5220 5221static struct page *mc_handle_present_pte(struct vm_area_struct *vma, 5222 unsigned long addr, pte_t ptent) 5223{ 5224 struct page *page = vm_normal_page(vma, addr, ptent); 5225 5226 if (!page || !page_mapped(page)) 5227 return NULL; 5228 if (PageAnon(page)) { 5229 /* we don't move shared anon */ 5230 if (!move_anon() || page_mapcount(page) > 2) 5231 return NULL; 5232 } else if (!move_file()) 5233 /* we ignore mapcount for file pages */ 5234 return NULL; 5235 if (!get_page_unless_zero(page)) 5236 return NULL; 5237 5238 return page; 5239} 5240 5241static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 5242 unsigned long addr, pte_t ptent, swp_entry_t *entry) 5243{ 5244 int usage_count; 5245 struct page *page = NULL; 5246 swp_entry_t ent = pte_to_swp_entry(ptent); 5247 5248 if (!move_anon() || non_swap_entry(ent)) 5249 return NULL; 5250 usage_count = mem_cgroup_count_swap_user(ent, &page); 5251 if (usage_count > 1) { /* we don't move shared anon */ 5252 if (page) 5253 put_page(page); 5254 return NULL; 5255 } 5256 if (do_swap_account) 5257 entry->val = ent.val; 5258 5259 return page; 5260} 5261 5262static struct page *mc_handle_file_pte(struct vm_area_struct *vma, 5263 unsigned long addr, pte_t ptent, swp_entry_t *entry) 5264{ 5265 struct page *page = NULL; 5266 struct inode *inode; 5267 struct address_space *mapping; 5268 pgoff_t pgoff; 5269 5270 if (!vma->vm_file) /* anonymous vma */ 5271 return NULL; 5272 if (!move_file()) 5273 return NULL; 5274 5275 inode = vma->vm_file->f_path.dentry->d_inode; 5276 mapping = vma->vm_file->f_mapping; 5277 if (pte_none(ptent)) 5278 pgoff = linear_page_index(vma, addr); 5279 else /* pte_file(ptent) is true */ 5280 pgoff = pte_to_pgoff(ptent); 5281 5282 /* page is moved even if it's not RSS of this task(page-faulted). */ 5283 page = find_get_page(mapping, pgoff); 5284 5285#ifdef CONFIG_SWAP 5286 /* shmem/tmpfs may report page out on swap: account for that too. */ 5287 if (radix_tree_exceptional_entry(page)) { 5288 swp_entry_t swap = radix_to_swp_entry(page); 5289 if (do_swap_account) 5290 *entry = swap; 5291 page = find_get_page(&swapper_space, swap.val); 5292 } 5293#endif 5294 return page; 5295} 5296 5297static int is_target_pte_for_mc(struct vm_area_struct *vma, 5298 unsigned long addr, pte_t ptent, union mc_target *target) 5299{ 5300 struct page *page = NULL; 5301 struct page_cgroup *pc; 5302 int ret = 0; 5303 swp_entry_t ent = { .val = 0 }; 5304 5305 if (pte_present(ptent)) 5306 page = mc_handle_present_pte(vma, addr, ptent); 5307 else if (is_swap_pte(ptent)) 5308 page = mc_handle_swap_pte(vma, addr, ptent, &ent); 5309 else if (pte_none(ptent) || pte_file(ptent)) 5310 page = mc_handle_file_pte(vma, addr, ptent, &ent); 5311 5312 if (!page && !ent.val) 5313 return 0; 5314 if (page) { 5315 pc = lookup_page_cgroup(page); 5316 /* 5317 * Do only loose check w/o page_cgroup lock. 5318 * mem_cgroup_move_account() checks the pc is valid or not under 5319 * the lock. 
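 * A false positive here only costs an extra precharge, which is handed
 * back in __mem_cgroup_clear_mc(); the authoritative check happens when
 * the page is actually moved.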
5320 */ 5321 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { 5322 ret = MC_TARGET_PAGE; 5323 if (target) 5324 target->page = page; 5325 } 5326 if (!ret || !target) 5327 put_page(page); 5328 } 5329 /* There is a swap entry and a page doesn't exist or isn't charged */ 5330 if (ent.val && !ret && 5331 css_id(&mc.from->css) == lookup_swap_cgroup(ent)) { 5332 ret = MC_TARGET_SWAP; 5333 if (target) 5334 target->ent = ent; 5335 } 5336 return ret; 5337} 5338 5339static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, 5340 unsigned long addr, unsigned long end, 5341 struct mm_walk *walk) 5342{ 5343 struct vm_area_struct *vma = walk->private; 5344 pte_t *pte; 5345 spinlock_t *ptl; 5346 5347 split_huge_page_pmd(walk->mm, pmd); 5348 5349 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5350 for (; addr != end; pte++, addr += PAGE_SIZE) 5351 if (is_target_pte_for_mc(vma, addr, *pte, NULL)) 5352 mc.precharge++; /* increment precharge temporarily */ 5353 pte_unmap_unlock(pte - 1, ptl); 5354 cond_resched(); 5355 5356 return 0; 5357} 5358 5359static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) 5360{ 5361 unsigned long precharge; 5362 struct vm_area_struct *vma; 5363 5364 down_read(&mm->mmap_sem); 5365 for (vma = mm->mmap; vma; vma = vma->vm_next) { 5366 struct mm_walk mem_cgroup_count_precharge_walk = { 5367 .pmd_entry = mem_cgroup_count_precharge_pte_range, 5368 .mm = mm, 5369 .private = vma, 5370 }; 5371 if (is_vm_hugetlb_page(vma)) 5372 continue; 5373 walk_page_range(vma->vm_start, vma->vm_end, 5374 &mem_cgroup_count_precharge_walk); 5375 } 5376 up_read(&mm->mmap_sem); 5377 5378 precharge = mc.precharge; 5379 mc.precharge = 0; 5380 5381 return precharge; 5382} 5383 5384static int mem_cgroup_precharge_mc(struct mm_struct *mm) 5385{ 5386 unsigned long precharge = mem_cgroup_count_precharge(mm); 5387 5388 VM_BUG_ON(mc.moving_task); 5389 mc.moving_task = current; 5390 return mem_cgroup_do_precharge(precharge); 5391} 5392 5393/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */ 5394static void __mem_cgroup_clear_mc(void) 5395{ 5396 struct mem_cgroup *from = mc.from; 5397 struct mem_cgroup *to = mc.to; 5398 5399 /* we must uncharge all the leftover precharges from mc.to */ 5400 if (mc.precharge) { 5401 __mem_cgroup_cancel_charge(mc.to, mc.precharge); 5402 mc.precharge = 0; 5403 } 5404 /* 5405 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so 5406 * we must uncharge here. 5407 */ 5408 if (mc.moved_charge) { 5409 __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); 5410 mc.moved_charge = 0; 5411 } 5412 /* we must fixup refcnts and charges */ 5413 if (mc.moved_swap) { 5414 /* uncharge swap account from the old cgroup */ 5415 if (!mem_cgroup_is_root(mc.from)) 5416 res_counter_uncharge(&mc.from->memsw, 5417 PAGE_SIZE * mc.moved_swap); 5418 __mem_cgroup_put(mc.from, mc.moved_swap); 5419 5420 if (!mem_cgroup_is_root(mc.to)) { 5421 /* 5422 * we charged both to->res and to->memsw, so we should 5423 * uncharge to->res. 5424 */ 5425 res_counter_uncharge(&mc.to->res, 5426 PAGE_SIZE * mc.moved_swap); 5427 } 5428 /* we've already done mem_cgroup_get(mc.to) */ 5429 mc.moved_swap = 0; 5430 } 5431 memcg_oom_recover(from); 5432 memcg_oom_recover(to); 5433 wake_up_all(&mc.waitq); 5434} 5435 5436static void mem_cgroup_clear_mc(void) 5437{ 5438 struct mem_cgroup *from = mc.from; 5439 5440 /* 5441 * we must clear moving_task before waking up waiters at the end of 5442 * task migration. 
5443 */ 5444 mc.moving_task = NULL; 5445 __mem_cgroup_clear_mc(); 5446 spin_lock(&mc.lock); 5447 mc.from = NULL; 5448 mc.to = NULL; 5449 spin_unlock(&mc.lock); 5450 mem_cgroup_end_move(from); 5451} 5452 5453static int mem_cgroup_can_attach(struct cgroup_subsys *ss, 5454 struct cgroup *cgroup, 5455 struct task_struct *p) 5456{ 5457 int ret = 0; 5458 struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup); 5459 5460 if (mem->move_charge_at_immigrate) { 5461 struct mm_struct *mm; 5462 struct mem_cgroup *from = mem_cgroup_from_task(p); 5463 5464 VM_BUG_ON(from == mem); 5465 5466 mm = get_task_mm(p); 5467 if (!mm) 5468 return 0; 5469 /* We move charges only when we move a owner of the mm */ 5470 if (mm->owner == p) { 5471 VM_BUG_ON(mc.from); 5472 VM_BUG_ON(mc.to); 5473 VM_BUG_ON(mc.precharge); 5474 VM_BUG_ON(mc.moved_charge); 5475 VM_BUG_ON(mc.moved_swap); 5476 mem_cgroup_start_move(from); 5477 spin_lock(&mc.lock); 5478 mc.from = from; 5479 mc.to = mem; 5480 spin_unlock(&mc.lock); 5481 /* We set mc.moving_task later */ 5482 5483 ret = mem_cgroup_precharge_mc(mm); 5484 if (ret) 5485 mem_cgroup_clear_mc(); 5486 } 5487 mmput(mm); 5488 } 5489 return ret; 5490} 5491 5492static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, 5493 struct cgroup *cgroup, 5494 struct task_struct *p) 5495{ 5496 mem_cgroup_clear_mc(); 5497} 5498 5499static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, 5500 unsigned long addr, unsigned long end, 5501 struct mm_walk *walk) 5502{ 5503 int ret = 0; 5504 struct vm_area_struct *vma = walk->private; 5505 pte_t *pte; 5506 spinlock_t *ptl; 5507 5508 split_huge_page_pmd(walk->mm, pmd); 5509retry: 5510 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5511 for (; addr != end; addr += PAGE_SIZE) { 5512 pte_t ptent = *(pte++); 5513 union mc_target target; 5514 int type; 5515 struct page *page; 5516 struct page_cgroup *pc; 5517 swp_entry_t ent; 5518 5519 if (!mc.precharge) 5520 break; 5521 5522 type = is_target_pte_for_mc(vma, addr, ptent, &target); 5523 switch (type) { 5524 case MC_TARGET_PAGE: 5525 page = target.page; 5526 if (isolate_lru_page(page)) 5527 goto put; 5528 pc = lookup_page_cgroup(page); 5529 if (!mem_cgroup_move_account(page, 1, pc, 5530 mc.from, mc.to, false)) { 5531 mc.precharge--; 5532 /* we uncharge from mc.from later. */ 5533 mc.moved_charge++; 5534 } 5535 putback_lru_page(page); 5536put: /* is_target_pte_for_mc() gets the page */ 5537 put_page(page); 5538 break; 5539 case MC_TARGET_SWAP: 5540 ent = target.ent; 5541 if (!mem_cgroup_move_swap_account(ent, 5542 mc.from, mc.to, false)) { 5543 mc.precharge--; 5544 /* we fixup refcnts and charges later. */ 5545 mc.moved_swap++; 5546 } 5547 break; 5548 default: 5549 break; 5550 } 5551 } 5552 pte_unmap_unlock(pte - 1, ptl); 5553 cond_resched(); 5554 5555 if (addr != end) { 5556 /* 5557 * We have consumed all precharges we got in can_attach(). 5558 * We try charge one by one, but don't do any additional 5559 * charges to mc.to if we have failed in charge once in attach() 5560 * phase. 5561 */ 5562 ret = mem_cgroup_do_precharge(1); 5563 if (!ret) 5564 goto retry; 5565 } 5566 5567 return ret; 5568} 5569 5570static void mem_cgroup_move_charge(struct mm_struct *mm) 5571{ 5572 struct vm_area_struct *vma; 5573 5574 lru_add_drain_all(); 5575retry: 5576 if (unlikely(!down_read_trylock(&mm->mmap_sem))) { 5577 /* 5578 * Someone who are holding the mmap_sem might be waiting in 5579 * waitq. So we cancel all extra charges, wake up all waiters, 5580 * and retry. 
Because we cancel precharges, we might not be able 5581 * to move enough charges, but moving charge is a best-effort 5582 * feature anyway, so it wouldn't be a big problem. 5583 */ 5584 __mem_cgroup_clear_mc(); 5585 cond_resched(); 5586 goto retry; 5587 } 5588 for (vma = mm->mmap; vma; vma = vma->vm_next) { 5589 int ret; 5590 struct mm_walk mem_cgroup_move_charge_walk = { 5591 .pmd_entry = mem_cgroup_move_charge_pte_range, 5592 .mm = mm, 5593 .private = vma, 5594 }; 5595 if (is_vm_hugetlb_page(vma)) 5596 continue; 5597 ret = walk_page_range(vma->vm_start, vma->vm_end, 5598 &mem_cgroup_move_charge_walk); 5599 if (ret) 5600 /* 5601 * means we have consumed all precharges and failed in 5602 * doing additional charge. Just abandon here. 5603 */ 5604 break; 5605 } 5606 up_read(&mm->mmap_sem); 5607} 5608 5609static void mem_cgroup_move_task(struct cgroup_subsys *ss, 5610 struct cgroup *cont, 5611 struct cgroup *old_cont, 5612 struct task_struct *p) 5613{ 5614 struct mm_struct *mm = get_task_mm(p); 5615 5616 if (mm) { 5617 if (mc.to) 5618 mem_cgroup_move_charge(mm); 5619 put_swap_token(mm); 5620 mmput(mm); 5621 } 5622 if (mc.to) 5623 mem_cgroup_clear_mc(); 5624} 5625#else /* !CONFIG_MMU */ 5626static int mem_cgroup_can_attach(struct cgroup_subsys *ss, 5627 struct cgroup *cgroup, 5628 struct task_struct *p) 5629{ 5630 return 0; 5631} 5632static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, 5633 struct cgroup *cgroup, 5634 struct task_struct *p) 5635{ 5636} 5637static void mem_cgroup_move_task(struct cgroup_subsys *ss, 5638 struct cgroup *cont, 5639 struct cgroup *old_cont, 5640 struct task_struct *p) 5641{ 5642} 5643#endif 5644 5645struct cgroup_subsys mem_cgroup_subsys = { 5646 .name = "memory", 5647 .subsys_id = mem_cgroup_subsys_id, 5648 .create = mem_cgroup_create, 5649 .pre_destroy = mem_cgroup_pre_destroy, 5650 .destroy = mem_cgroup_destroy, 5651 .populate = mem_cgroup_populate, 5652 .can_attach = mem_cgroup_can_attach, 5653 .cancel_attach = mem_cgroup_cancel_attach, 5654 .attach = mem_cgroup_move_task, 5655 .early_init = 0, 5656 .use_id = 1, 5657}; 5658 5659#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 5660static int __init enable_swap_account(char *s) 5661{ 5662 /* consider enabled if no parameter or 1 is given */ 5663 if (!strcmp(s, "1")) 5664 really_do_swap_account = 1; 5665 else if (!strcmp(s, "0")) 5666 really_do_swap_account = 0; 5667 return 1; 5668} 5669__setup("swapaccount=", enable_swap_account); 5670 5671#endif 5672
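/*
 * Illustrative userspace sketch (not part of this file's kernel code):
 * how the threshold notification implemented by
 * mem_cgroup_usage_register_event()/__mem_cgroup_threshold() above is
 * typically consumed through cgroup v1's cgroup.event_control file.  The
 * mount point and group path are assumptions for the example; the same
 * "<event_fd> <control_file_fd>" registration (without a threshold
 * argument) works for memory.oom_control via
 * mem_cgroup_oom_register_event().
 */
#if 0	/* example only; never compiled as part of the kernel */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/eventfd.h>

int main(void)
{
	const char *grp = "/sys/fs/cgroup/memory/example";	/* assumed path */
	char buf[256];
	uint64_t count;
	int efd, cfd, ctl;

	efd = eventfd(0, 0);	/* fd that the kernel will signal */
	if (efd < 0)
		return 1;

	/* the file whose cftype carries MEMFILE_PRIVATE(_MEM, RES_USAGE) */
	snprintf(buf, sizeof(buf), "%s/memory.usage_in_bytes", grp);
	cfd = open(buf, O_RDONLY);

	snprintf(buf, sizeof(buf), "%s/cgroup.event_control", grp);
	ctl = open(buf, O_WRONLY);
	if (cfd < 0 || ctl < 0)
		return 1;

	/* "<event_fd> <usage_fd> <threshold>": ask for a signal at 64M */
	snprintf(buf, sizeof(buf), "%d %d %llu", efd, cfd, 64ULL << 20);
	if (write(ctl, buf, strlen(buf)) < 0) {
		perror("register threshold");
		return 1;
	}

	/* blocks until __mem_cgroup_threshold() crosses 64M and signals us */
	if (read(efd, &count, sizeof(count)) == sizeof(count))
		printf("usage threshold crossed %llu time(s)\n",
		       (unsigned long long)count);

	close(ctl);
	close(cfd);
	close(efd);
	return 0;
}
#endif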