memcontrol.c revision 66e1707bc34609f626e2e7b4fe7e454c9748bad5
/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 */

#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/swap.h>
#include <linux/spinlock.h>
#include <linux/fs.h>

struct cgroup_subsys mem_cgroup_subsys;
static const int MEM_CGROUP_RECLAIM_RETRIES = 5;

/*
 * The memory controller data structure. The memory controller controls both
 * page cache and RSS per cgroup. We would eventually like to provide
 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
 * to help the administrator determine what knobs to tune.
 *
 * TODO: Add a water mark for the memory controller. Reclaim will begin when
 * we hit the water mark. Maybe even add a low water mark, such that
 * no reclaim occurs from a cgroup at its low water mark; this is
 * a feature that will be implemented much later in the future.
 */
struct mem_cgroup {
        struct cgroup_subsys_state css;
        /*
         * the counter to account for memory usage
         */
        struct res_counter res;
        /*
         * Per cgroup active and inactive list, similar to the
         * per zone LRU lists.
         * TODO: Consider making these lists per zone
         */
        struct list_head active_list;
        struct list_head inactive_list;
        /*
         * spin_lock to protect the per cgroup LRU
         */
        spinlock_t lru_lock;
};

/*
 * We use the lower bit of the page->page_cgroup pointer as a bit spin
 * lock. We need to ensure that page->page_cgroup is at least two-byte
 * aligned (based on comments from Nick Piggin)
 */
#define PAGE_CGROUP_LOCK_BIT    0x0
#define PAGE_CGROUP_LOCK        (1 << PAGE_CGROUP_LOCK_BIT)
/*
 * A page_cgroup page is associated with every page descriptor. The
 * page_cgroup helps us identify information about the cgroup
 */
struct page_cgroup {
        struct list_head lru;           /* per cgroup LRU list */
        struct page *page;
        struct mem_cgroup *mem_cgroup;
        atomic_t ref_cnt;               /* Helpful when pages move b/w  */
                                        /* mapped and cached states     */
};

static inline
struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
{
        return container_of(cgroup_subsys_state(cont,
                                mem_cgroup_subsys_id), struct mem_cgroup,
                                css);
}

static inline
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
        return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
                                struct mem_cgroup, css);
}

void mm_init_cgroup(struct mm_struct *mm, struct task_struct *p)
{
        struct mem_cgroup *mem;

        mem = mem_cgroup_from_task(p);
        css_get(&mem->css);
        mm->mem_cgroup = mem;
}

void mm_free_cgroup(struct mm_struct *mm)
{
        css_put(&mm->mem_cgroup->css);
}

static inline int page_cgroup_locked(struct page *page)
{
        return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT,
                                        &page->page_cgroup);
}

void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
{
        int locked;

        /*
         * While resetting the page_cgroup we might not hold the
         * page_cgroup lock. free_hot_cold_page() is an example
         * of such a scenario
         */
        if (pc)
                VM_BUG_ON(!page_cgroup_locked(page));
        locked = (page->page_cgroup & PAGE_CGROUP_LOCK);
        page->page_cgroup = ((unsigned long)pc | locked);
}

struct page_cgroup *page_get_page_cgroup(struct page *page)
{
        return (struct page_cgroup *)
                (page->page_cgroup & ~PAGE_CGROUP_LOCK);
}

void __always_inline lock_page_cgroup(struct page *page)
{
        bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
        VM_BUG_ON(!page_cgroup_locked(page));
}

void __always_inline unlock_page_cgroup(struct page *page)
{
        bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
}

void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
{
        if (active)
                list_move(&pc->lru, &pc->mem_cgroup->active_list);
        else
                list_move(&pc->lru, &pc->mem_cgroup->inactive_list);
}

/*
 * This routine assumes that the appropriate zone's lru lock is already held
 */
void mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
{
        struct mem_cgroup *mem;
        if (!pc)
                return;

        mem = pc->mem_cgroup;

        spin_lock(&mem->lru_lock);
        __mem_cgroup_move_lists(pc, active);
        spin_unlock(&mem->lru_lock);
}
unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
                                        struct list_head *dst,
                                        unsigned long *scanned, int order,
                                        int mode, struct zone *z,
                                        struct mem_cgroup *mem_cont,
                                        int active)
{
        unsigned long nr_taken = 0;
        struct page *page;
        unsigned long scan;
        LIST_HEAD(pc_list);
        struct list_head *src;
        struct page_cgroup *pc;

        if (active)
                src = &mem_cont->active_list;
        else
                src = &mem_cont->inactive_list;

        spin_lock(&mem_cont->lru_lock);
        for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
                pc = list_entry(src->prev, struct page_cgroup, lru);
                page = pc->page;
                VM_BUG_ON(!pc);

                if (PageActive(page) && !active) {
                        __mem_cgroup_move_lists(pc, true);
                        scan--;
                        continue;
                }
                if (!PageActive(page) && active) {
                        __mem_cgroup_move_lists(pc, false);
                        scan--;
                        continue;
                }

                /*
                 * Reclaim, per zone
                 * TODO: make the active/inactive lists per zone
                 */
                if (page_zone(page) != z)
                        continue;

                /*
                 * Check if the meta page went away from under us
                 */
                if (!list_empty(&pc->lru))
                        list_move(&pc->lru, &pc_list);
                else
                        continue;

                if (__isolate_lru_page(page, mode) == 0) {
                        list_move(&page->lru, dst);
                        nr_taken++;
                }
        }

        list_splice(&pc_list, src);
        spin_unlock(&mem_cont->lru_lock);

        *scanned = scan;
        return nr_taken;
}
/*
 * Charge the memory controller for page usage.
 * Return
 * 0 if the charge was successful
 * < 0 if the cgroup is over its limit
 */
int mem_cgroup_charge(struct page *page, struct mm_struct *mm)
{
        struct mem_cgroup *mem;
        struct page_cgroup *pc, *race_pc;
        unsigned long flags;
        unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;

        /*
         * Should page_cgroups go to their own slab?
         * One could optimize the performance of the charging routine
         * by saving a bit in the page_flags and using it as a lock
         * to see if the cgroup page already has a page_cgroup associated
         * with it
         */
retry:
        lock_page_cgroup(page);
        pc = page_get_page_cgroup(page);
        /*
         * The page_cgroup exists and the page has already been accounted
         */
        if (pc) {
                if (unlikely(!atomic_inc_not_zero(&pc->ref_cnt))) {
                        /* is this page being uncharged? */
                        unlock_page_cgroup(page);
                        cpu_relax();
                        goto retry;
                } else
                        goto done;
        }

        unlock_page_cgroup(page);

        pc = kzalloc(sizeof(struct page_cgroup), GFP_KERNEL);
        if (pc == NULL)
                goto err;

        rcu_read_lock();
        /*
         * We always charge the cgroup the mm_struct belongs to;
         * the mm_struct's mem_cgroup changes on task migration if the
         * thread group leader migrates. It's possible that mm is not
         * set, if so charge the init_mm (happens for pagecache usage).
         */
        if (!mm)
                mm = &init_mm;

        mem = rcu_dereference(mm->mem_cgroup);
        /*
         * For every charge from the cgroup, increment reference
         * count
         */
        css_get(&mem->css);
        rcu_read_unlock();

        /*
         * If we created the page_cgroup, we should free it on exceeding
         * the cgroup limit.
         */
        while (res_counter_charge(&mem->res, 1)) {
                if (try_to_free_mem_cgroup_pages(mem))
                        continue;

                /*
                 * try_to_free_mem_cgroup_pages() might not give us a full
                 * picture of reclaim. Some pages are reclaimed and might be
                 * moved to swap cache or just unmapped from the cgroup.
                 * Check the limit again to see if the reclaim reduced the
                 * current usage of the cgroup before giving up
                 */
                if (res_counter_check_under_limit(&mem->res))
                        continue;
                /*
                 * Since we control both RSS and cache, we end up with a
                 * very interesting scenario where we end up reclaiming
                 * memory (essentially RSS), since the memory is pushed
                 * to swap cache, we eventually end up adding those
                 * pages back to our list. Hence we give ourselves a
                 * few chances before we fail
                 */
                else if (nr_retries--) {
                        congestion_wait(WRITE, HZ/10);
                        continue;
                }

                css_put(&mem->css);
                goto free_pc;
        }

        lock_page_cgroup(page);
        /*
         * Check if somebody else beat us to allocating the page_cgroup
         */
        race_pc = page_get_page_cgroup(page);
        if (race_pc) {
                kfree(pc);
                pc = race_pc;
                atomic_inc(&pc->ref_cnt);
                res_counter_uncharge(&mem->res, 1);
                css_put(&mem->css);
                goto done;
        }

        atomic_set(&pc->ref_cnt, 1);
        pc->mem_cgroup = mem;
        pc->page = page;
        page_assign_page_cgroup(page, pc);

        spin_lock_irqsave(&mem->lru_lock, flags);
        list_add(&pc->lru, &mem->active_list);
        spin_unlock_irqrestore(&mem->lru_lock, flags);

done:
        unlock_page_cgroup(page);
        return 0;
free_pc:
        kfree(pc);
err:
        return -ENOMEM;
}

/*
 * Uncharging is always a welcome operation, we never complain, simply
 * uncharge.
 */
void mem_cgroup_uncharge(struct page_cgroup *pc)
{
        struct mem_cgroup *mem;
        struct page *page;
        unsigned long flags;

        if (!pc)
                return;

        if (atomic_dec_and_test(&pc->ref_cnt)) {
                page = pc->page;
                lock_page_cgroup(page);
                mem = pc->mem_cgroup;
                css_put(&mem->css);
                page_assign_page_cgroup(page, NULL);
                unlock_page_cgroup(page);
                res_counter_uncharge(&mem->res, 1);

                spin_lock_irqsave(&mem->lru_lock, flags);
                list_del_init(&pc->lru);
                spin_unlock_irqrestore(&mem->lru_lock, flags);
                kfree(pc);
        }
}
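/*
 * Call-pattern summary (as implied by the two functions above, added here
 * for clarity): a caller accounts a page by calling
 * mem_cgroup_charge(page, mm) and proceeding only when it returns 0;
 * charging an already accounted page simply bumps the page_cgroup reference
 * count. When the page is released from the cgroup, the caller passes
 * page_get_page_cgroup(page) to mem_cgroup_uncharge(), and the page_cgroup
 * is torn down once the last reference is dropped.
 */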
static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
                        struct file *file, char __user *userbuf, size_t nbytes,
                        loff_t *ppos)
{
        return res_counter_read(&mem_cgroup_from_cont(cont)->res,
                                cft->private, userbuf, nbytes, ppos);
}

static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
                                struct file *file, const char __user *userbuf,
                                size_t nbytes, loff_t *ppos)
{
        return res_counter_write(&mem_cgroup_from_cont(cont)->res,
                                cft->private, userbuf, nbytes, ppos);
}

static struct cftype mem_cgroup_files[] = {
        {
                .name = "usage",
                .private = RES_USAGE,
                .read = mem_cgroup_read,
        },
        {
                .name = "limit",
                .private = RES_LIMIT,
                .write = mem_cgroup_write,
                .read = mem_cgroup_read,
        },
        {
                .name = "failcnt",
                .private = RES_FAILCNT,
                .read = mem_cgroup_read,
        },
};

static struct mem_cgroup init_mem_cgroup;

static struct cgroup_subsys_state *
mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
{
        struct mem_cgroup *mem;

        if (unlikely((cont->parent) == NULL)) {
                mem = &init_mem_cgroup;
                init_mm.mem_cgroup = mem;
        } else
                mem = kzalloc(sizeof(struct mem_cgroup), GFP_KERNEL);

        if (mem == NULL)
                return NULL;

        res_counter_init(&mem->res);
        INIT_LIST_HEAD(&mem->active_list);
        INIT_LIST_HEAD(&mem->inactive_list);
        spin_lock_init(&mem->lru_lock);
        return &mem->css;
}

static void mem_cgroup_destroy(struct cgroup_subsys *ss,
                                struct cgroup *cont)
{
        kfree(mem_cgroup_from_cont(cont));
}

static int mem_cgroup_populate(struct cgroup_subsys *ss,
                                struct cgroup *cont)
{
        return cgroup_add_files(cont, ss, mem_cgroup_files,
                                        ARRAY_SIZE(mem_cgroup_files));
}

static void mem_cgroup_move_task(struct cgroup_subsys *ss,
                                struct cgroup *cont,
                                struct cgroup *old_cont,
                                struct task_struct *p)
{
        struct mm_struct *mm;
        struct mem_cgroup *mem, *old_mem;

        mm = get_task_mm(p);
        if (mm == NULL)
                return;

        mem = mem_cgroup_from_cont(cont);
        old_mem = mem_cgroup_from_cont(old_cont);

        if (mem == old_mem)
                goto out;

        /*
         * Only thread group leaders are allowed to migrate, the mm_struct is
         * in effect owned by the leader
         */
        if (p->tgid != p->pid)
                goto out;

        css_get(&mem->css);
        rcu_assign_pointer(mm->mem_cgroup, mem);
        css_put(&old_mem->css);

out:
        mmput(mm);
        return;
}

struct cgroup_subsys mem_cgroup_subsys = {
        .name = "memory",
        .subsys_id = mem_cgroup_subsys_id,
        .create = mem_cgroup_create,
        .destroy = mem_cgroup_destroy,
        .populate = mem_cgroup_populate,
        .attach = mem_cgroup_move_task,
        .early_init = 1,
};
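
The cftype table above only defines the per-cgroup control files; how they are driven from userspace is outside this file. The short, self-contained sketch below shows one way those files could be exercised, assuming a cgroup hierarchy with the memory subsystem is already mounted and that the cgroup core exposes the entries as memory.usage, memory.limit and memory.failcnt alongside the standard tasks file (those names, and the helper functions here, are illustrative assumptions rather than part of this revision). Since mem_cgroup_charge() charges the res_counter by 1 per page, the values read and written here appear to be counted in pages rather than bytes.

/*
 * Userspace sketch (not part of memcontrol.c). Assumes argv[1] is the
 * directory of an already-mounted memory cgroup; the control file names
 * below are assumptions about how the cgroup core exposes mem_cgroup_files[].
 */
#include <stdio.h>
#include <unistd.h>

/* Write a string to a control file inside the cgroup directory. */
static int write_cgroup_file(const char *dir, const char *name, const char *val)
{
        char path[512];
        FILE *f;

        snprintf(path, sizeof(path), "%s/%s", dir, name);
        f = fopen(path, "w");
        if (!f) {
                perror(path);
                return -1;
        }
        fprintf(f, "%s\n", val);
        fclose(f);
        return 0;
}

/* Read a control file and print its contents. */
static int read_cgroup_file(const char *dir, const char *name)
{
        char path[512], buf[128];
        FILE *f;

        snprintf(path, sizeof(path), "%s/%s", dir, name);
        f = fopen(path, "r");
        if (!f) {
                perror(path);
                return -1;
        }
        while (fgets(buf, sizeof(buf), f))
                printf("%s: %s", name, buf);
        fclose(f);
        return 0;
}

int main(int argc, char **argv)
{
        char pid[32];

        if (argc < 3) {
                fprintf(stderr, "usage: %s <memory-cgroup-dir> <limit-in-pages>\n",
                        argv[0]);
                return 1;
        }

        /* Set the limit, then move this task into the group. */
        if (write_cgroup_file(argv[1], "memory.limit", argv[2]))
                return 1;
        snprintf(pid, sizeof(pid), "%d", getpid());
        if (write_cgroup_file(argv[1], "tasks", pid))
                return 1;

        /* Current usage and the number of times the limit was hit. */
        read_cgroup_file(argv[1], "memory.usage");
        read_cgroup_file(argv[1], "memory.failcnt");
        return 0;
}

Writing the task's PID to the tasks file is what would invoke the attach callback, mem_cgroup_move_task() above, which switches mm->mem_cgroup for thread group leaders.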