/* oom_kill.c revision 1b604d75bbb6e28628c5a95a433432973c33d581 */
/*
 *  linux/mm/oom_kill.c
 *
 *  Copyright (C)  1998,2000  Rik van Riel
 *	Thanks go out to Claus Fischer for some serious inspiration and
 *	for goading me into coding this file...
 *
 *  The routines in this file are used to kill a process when
 *  we're seriously out of memory. This gets called from __alloc_pages()
 *  in mm/page_alloc.c when we really run out of memory.
 *
 *  Since we won't call these routines often (on a well-configured
 *  machine) this file will double as a 'coding guide' and a signpost
 *  for newbie kernel hackers.  It features several pointers to major
 *  kernel subsystems and hints as to where to find out what things do.
 */

#include <linux/oom.h>
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/swap.h>
#include <linux/timex.h>
#include <linux/jiffies.h>
#include <linux/cpuset.h>
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/memcontrol.h>
#include <linux/security.h>

int sysctl_panic_on_oom;
int sysctl_oom_kill_allocating_task;
int sysctl_oom_dump_tasks;
static DEFINE_SPINLOCK(zone_scan_lock);
/* #define DEBUG */

/*
 * Do any of the target process's threads have a cpuset mems_allowed that
 * intersects ours?  Returns 1 on the first intersecting thread found,
 * 0 if no thread's allowed nodes overlap with current's.
 *
 * NOTE(review): walks the thread list via next_thread(); assumes the
 * caller holds tasklist_lock (all callers here do) — confirm if reused.
 */
static int has_intersects_mems_allowed(struct task_struct *tsk)
{
	struct task_struct *t;

	t = tsk;
	do {
		if (cpuset_mems_allowed_intersects(current, t))
			return 1;
		t = next_thread(t);
	} while (t != tsk);

	return 0;
}

/**
 * badness - calculate a numeric value for how bad this task has been
 * @p: task struct of which task we should calculate
 * @uptime: current uptime in seconds
 *
 * The formula used is relatively simple and documented inline in the
 * function. The main rationale is that we want to select a good task
 * to kill when we run out of memory.
 *
 * Good in this context means that:
 * 1) we lose the minimum amount of work done
 * 2) we recover a large amount of memory
 * 3) we don't kill anything innocent of eating tons of memory
 * 4) we want to kill the minimum amount of processes (one)
 * 5) we try to kill the process the user expects us to kill, this
 *    algorithm has been meticulously tuned to meet the principle
 *    of least surprise ... (be careful when you change it)
 *
 * Returns 0 for unkillable tasks (oom_adj == OOM_DISABLE, or no mm),
 * ULONG_MAX for tasks flagged PF_OOM_ORIGIN (e.g. swapoff); otherwise a
 * heuristic score where higher means "kill this first".
 */

unsigned long badness(struct task_struct *p, unsigned long uptime)
{
	unsigned long points, cpu_time, run_time;
	struct mm_struct *mm;
	struct task_struct *child;
	int oom_adj = p->signal->oom_adj;
	struct task_cputime task_time;
	unsigned long utime;
	unsigned long stime;

	if (oom_adj == OOM_DISABLE)
		return 0;

	task_lock(p);
	mm = p->mm;
	if (!mm) {
		task_unlock(p);
		return 0;
	}

	/*
	 * The memory size of the process is the basis for the badness.
	 */
	points = mm->total_vm;

	/*
	 * After this unlock we can no longer dereference local variable `mm'
	 * (it is only compared by pointer value below, never dereferenced).
	 */
	task_unlock(p);

	/*
	 * swapoff can easily use up all memory, so kill those first.
	 */
	if (p->flags & PF_OOM_ORIGIN)
		return ULONG_MAX;

	/*
	 * Processes which fork a lot of child processes are likely
	 * a good choice. We add half the vmsize of the children if they
	 * have an own mm. This prevents forking servers to flood the
	 * machine with an endless amount of children. In case a single
	 * child is eating the vast majority of memory, adding only half
	 * to the parents will make the child our kill candidate of choice.
	 */
	list_for_each_entry(child, &p->children, sibling) {
		task_lock(child);
		if (child->mm != mm && child->mm)
			points += child->mm->total_vm/2 + 1;
		task_unlock(child);
	}

	/*
	 * CPU time is in tens of seconds and run time is in thousands
	 * of seconds. There is no particular reason for this other than
	 * that it turned out to work very well in practice.
	 */
	thread_group_cputime(p, &task_time);
	utime = cputime_to_jiffies(task_time.utime);
	stime = cputime_to_jiffies(task_time.stime);
	cpu_time = (utime + stime) >> (SHIFT_HZ + 3);


	if (uptime >= p->start_time.tv_sec)
		run_time = (uptime - p->start_time.tv_sec) >> 10;
	else
		run_time = 0;

	/* Long-running / CPU-hungry tasks have done more work: lower score. */
	if (cpu_time)
		points /= int_sqrt(cpu_time);
	if (run_time)
		points /= int_sqrt(int_sqrt(run_time));

	/*
	 * Niced processes are most likely less important, so double
	 * their badness points.
	 */
	if (task_nice(p) > 0)
		points *= 2;

	/*
	 * Superuser processes are usually more important, so we make it
	 * less likely that we kill those.
	 */
	if (has_capability_noaudit(p, CAP_SYS_ADMIN) ||
	    has_capability_noaudit(p, CAP_SYS_RESOURCE))
		points /= 4;

	/*
	 * We don't want to kill a process with direct hardware access.
	 * Not only could that mess up the hardware, but usually users
	 * tend to only have this flag set on applications they think
	 * of as important.
	 */
	if (has_capability_noaudit(p, CAP_SYS_RAWIO))
		points /= 4;

	/*
	 * If p's nodes don't overlap ours, it may still help to kill p
	 * because p may have allocated or otherwise mapped memory on
	 * this node before. However it will be less likely.
	 */
	if (!has_intersects_mems_allowed(p))
		points /= 8;

	/*
	 * Adjust the score by oom_adj: positive values scale the score up
	 * (left shift), negative values scale it down (right shift).
	 */
	if (oom_adj) {
		if (oom_adj > 0) {
			if (!points)
				points = 1;
			points <<= oom_adj;
		} else
			points >>= -(oom_adj);
	}

#ifdef DEBUG
	printk(KERN_DEBUG "OOMkill: task %d (%s) got %lu points\n",
	p->pid, p->comm, points);
#endif
	return points;
}

/*
 * Determine the type of allocation constraint.
 */
static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist,
						    gfp_t gfp_mask)
{
#ifdef CONFIG_NUMA
	struct zone *zone;
	struct zoneref *z;
	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
	nodemask_t nodes = node_states[N_HIGH_MEMORY];

	/*
	 * Clear every node the allocation may use from the full set; if a
	 * zone is disallowed by the cpuset, the cpuset is the constraint.
	 */
	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
		if (cpuset_zone_allowed_softwall(zone, gfp_mask))
			node_clear(zone_to_nid(zone), nodes);
		else
			return CONSTRAINT_CPUSET;

	/* Nodes left over were excluded by the mempolicy. */
	if (!nodes_empty(nodes))
		return CONSTRAINT_MEMORY_POLICY;
#endif

	return CONSTRAINT_NONE;
}

/*
 * Simple selection loop. We choose the process with the highest
 * number of 'points'. We expect the caller will lock the tasklist.
 *
 * Returns the chosen task, NULL if nothing is killable, or
 * ERR_PTR(-1UL) to tell the caller to back off (a task already has
 * TIF_MEMDIE, or another task is exiting and will free memory soon).
 *
 * (not docbooked, we don't want this one cluttering up the manual)
 */
static struct task_struct *select_bad_process(unsigned long *ppoints,
					      struct mem_cgroup *mem)
{
	struct task_struct *p;
	struct task_struct *chosen = NULL;
	struct timespec uptime;
	*ppoints = 0;

	do_posix_clock_monotonic_gettime(&uptime);
	for_each_process(p) {
		unsigned long points;

		/*
		 * skip kernel threads and tasks which have already released
		 * their mm.
		 */
		if (!p->mm)
			continue;
		/* skip the init task */
		if (is_global_init(p))
			continue;
		if (mem && !task_in_mem_cgroup(p, mem))
			continue;

		/*
		 * This task already has access to memory reserves and is
		 * being killed. Don't allow any other task access to the
		 * memory reserve.
		 *
		 * Note: this may have a chance of deadlock if it gets
		 * blocked waiting for another task which itself is waiting
		 * for memory. Is there a better alternative?
		 */
		if (test_tsk_thread_flag(p, TIF_MEMDIE))
			return ERR_PTR(-1UL);

		/*
		 * This is in the process of releasing memory so wait for it
		 * to finish before killing some other task by mistake.
		 *
		 * However, if p is the current task, we allow the 'kill' to
		 * go ahead if it is exiting: this will simply set TIF_MEMDIE,
		 * which will allow it to gain access to memory reserves in
		 * the process of exiting and releasing its resources.
		 * Otherwise we could get an easy OOM deadlock.
		 */
		if (p->flags & PF_EXITING) {
			if (p != current)
				return ERR_PTR(-1UL);

			/* ULONG_MAX guarantees current wins the selection. */
			chosen = p;
			*ppoints = ULONG_MAX;
		}

		if (p->signal->oom_adj == OOM_DISABLE)
			continue;

		points = badness(p, uptime.tv_sec);
		if (points > *ppoints || !chosen) {
			chosen = p;
			*ppoints = points;
		}
	}

	return chosen;
}

/**
 * dump_tasks - dump current memory state of all system tasks
 * @mem: target memory controller
 *
 * Dumps the current memory state of all system tasks, excluding kernel threads.
 * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj
 * score, and name.
 *
 * If @mem is non-NULL, only tasks that are a member of the mem_cgroup are
 * shown.
 *
 * Call with tasklist_lock read-locked.
 */
static void dump_tasks(const struct mem_cgroup *mem)
{
	struct task_struct *g, *p;

	printk(KERN_INFO "[ pid ] uid tgid total_vm rss cpu oom_adj "
		"name\n");
	do_each_thread(g, p) {
		struct mm_struct *mm;

		if (mem && !task_in_mem_cgroup(p, mem))
			continue;
		/* report each thread group only once, via its leader */
		if (!thread_group_leader(p))
			continue;

		task_lock(p);
		mm = p->mm;
		if (!mm) {
			/*
			 * total_vm and rss sizes do not exist for tasks with no
			 * mm so there's no need to report them; they can't be
			 * oom killed anyway.
			 */
			task_unlock(p);
			continue;
		}
		printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n",
			p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm,
			get_mm_rss(mm), (int)task_cpu(p), p->signal->oom_adj,
			p->comm);
		task_unlock(p);
	} while_each_thread(g, p);
}

/*
 * Print a one-shot report of who invoked the OOM killer and the current
 * memory state (stack, memcg info, memory stats, and optionally the full
 * task list controlled by sysctl_oom_dump_tasks).
 */
static void dump_header(gfp_t gfp_mask, int order, struct mem_cgroup *mem)
{
	pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
		"oom_adj=%d\n",
		current->comm, gfp_mask, order, current->signal->oom_adj);
	task_lock(current);
	cpuset_print_task_mems_allowed(current);
	task_unlock(current);
	dump_stack();
	mem_cgroup_print_oom_info(mem, current);
	show_mem();
	if (sysctl_oom_dump_tasks)
		dump_tasks(mem);
}

/*
 * Send SIGKILL to the selected process irrespective of  CAP_SYS_RAW_IO
 * flag though it's unlikely that  we  select a process with CAP_SYS_RAW_IO
 * set.
 */
static void __oom_kill_task(struct task_struct *p, int verbose)
{
	if (is_global_init(p)) {
		WARN_ON(1);
		printk(KERN_WARNING "tried to kill init!\n");
		return;
	}

	if (!p->mm) {
		WARN_ON(1);
		printk(KERN_WARNING "tried to kill an mm-less task!\n");
		return;
	}

	if (verbose)
		printk(KERN_ERR "Killed process %d (%s)\n",
			task_pid_nr(p), p->comm);

	/*
	 * We give our sacrificial lamb high priority and access to
	 * all the memory it needs. That way it should be able to
	 * exit() and clear out its resources quickly...
	 */
	p->rt.time_slice = HZ;
	set_tsk_thread_flag(p, TIF_MEMDIE);

	force_sig(SIGKILL, p);
}

/*
 * Kill @p if it is eligible.  Returns 0 if the task was killed,
 * 1 if it is unkillable (no mm, or OOM-disabled via oom_adj).
 */
static int oom_kill_task(struct task_struct *p)
{
	/* WARNING: mm may not be dereferenced since we did not obtain its
	 * value from get_task_mm(p). This is OK since all we need to do is
	 * compare mm to q->mm below.
	 *
	 * Furthermore, even if mm contains a non-NULL value, p->mm may
	 * change to NULL at any time since we do not hold task_lock(p).
	 * However, this is of no concern to us.
	 */
	if (!p->mm || p->signal->oom_adj == OOM_DISABLE)
		return 1;

	__oom_kill_task(p, 1);

	return 0;
}

/*
 * Kill @p or, preferably, one of its children with a distinct mm.
 * Returns 0 on success, nonzero if nothing was killable (caller retries
 * selection).  @points is only used for the log message.
 */
static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
			    unsigned long points, struct mem_cgroup *mem,
			    const char *message)
{
	struct task_struct *c;

	if (printk_ratelimit())
		dump_header(gfp_mask, order, mem);

	/*
	 * If the task is already exiting, don't alarm the sysadmin or kill
	 * its children or threads, just set TIF_MEMDIE so it can die quickly
	 */
	if (p->flags & PF_EXITING) {
		__oom_kill_task(p, 0);
		return 0;
	}

	printk(KERN_ERR "%s: kill process %d (%s) score %li or a child\n",
		message, task_pid_nr(p), p->comm, points);

	/* Try to kill a child first */
	list_for_each_entry(c, &p->children, sibling) {
		if (c->mm == p->mm)
			continue;
		if (!oom_kill_task(c))
			return 0;
	}
	return oom_kill_task(p);
}

#ifdef CONFIG_CGROUP_MEM_RES_CTLR
/*
 * Memory-cgroup OOM entry point: pick and kill the worst task within
 * @mem, falling back to current if nothing is selectable.
 */
void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
{
	unsigned long points = 0;
	struct task_struct *p;

	read_lock(&tasklist_lock);
retry:
	p = select_bad_process(&points, mem);
	if (PTR_ERR(p) == -1UL)
		goto out;

	if (!p)
		p = current;

	if (oom_kill_process(p, gfp_mask, 0, points, mem,
				"Memory cgroup out of memory"))
		goto retry;
out:
	read_unlock(&tasklist_lock);
}
#endif

static BLOCKING_NOTIFIER_HEAD(oom_notify_list);

/* Register a callback to be run (and maybe free memory) before an OOM kill. */
int register_oom_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&oom_notify_list, nb);
}
EXPORT_SYMBOL_GPL(register_oom_notifier);

int unregister_oom_notifier(struct notifier_block *nb)
{
	return
	blocking_notifier_chain_unregister(&oom_notify_list, nb);
}
EXPORT_SYMBOL_GPL(unregister_oom_notifier);

/*
 * Try to acquire the OOM killer lock for the zones in zonelist.  Returns zero
 * if a parallel OOM killing is already taking place that includes a zone in
 * the zonelist.  Otherwise, locks all zones in the zonelist and returns 1.
 */
int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask)
{
	struct zoneref *z;
	struct zone *zone;
	int ret = 1;

	spin_lock(&zone_scan_lock);
	/* First pass: bail out if any zone is already OOM-locked. */
	for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
		if (zone_is_oom_locked(zone)) {
			ret = 0;
			goto out;
		}
	}

	for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
		/*
		 * Lock each zone in the zonelist under zone_scan_lock so a
		 * parallel invocation of try_set_zone_oom() doesn't succeed
		 * when it shouldn't.
		 */
		zone_set_flag(zone, ZONE_OOM_LOCKED);
	}

out:
	spin_unlock(&zone_scan_lock);
	return ret;
}

/*
 * Clears the ZONE_OOM_LOCKED flag for all zones in the zonelist so that failed
 * allocation attempts with zonelists containing them may now recall the OOM
 * killer, if necessary.
 */
void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
{
	struct zoneref *z;
	struct zone *zone;

	spin_lock(&zone_scan_lock);
	for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
		zone_clear_flag(zone, ZONE_OOM_LOCKED);
	}
	spin_unlock(&zone_scan_lock);
}

/*
 * Must be called with tasklist_lock held for read.
 * NOTE: may drop tasklist_lock implicitly by panicking when no killable
 * process exists (read_unlock precedes the panic for the dump).
 */
static void __out_of_memory(gfp_t gfp_mask, int order)
{
	struct task_struct *p;
	unsigned long points;

	/* Optionally kill the allocating task itself instead of scanning. */
	if (sysctl_oom_kill_allocating_task)
		if (!oom_kill_process(current, gfp_mask, order, 0, NULL,
				"Out of memory (oom_kill_allocating_task)"))
			return;
retry:
	/*
	 * Rambo mode: Shoot down a process and hope it solves whatever
	 * issues we may have.
	 */
	p = select_bad_process(&points, NULL);

	if (PTR_ERR(p) == -1UL)
		return;

	/* Found nothing?!?! Either we hang forever, or we panic. */
	if (!p) {
		read_unlock(&tasklist_lock);
		dump_header(gfp_mask, order, NULL);
		panic("Out of memory and no killable processes...\n");
	}

	if (oom_kill_process(p, gfp_mask, order, points, NULL,
			     "Out of memory"))
		goto retry;
}

/*
 * pagefault handler calls into here because it is out of memory but
 * doesn't know exactly how or why.
 */
void pagefault_out_of_memory(void)
{
	unsigned long freed = 0;

	/* Give OOM notifiers a chance to free memory before killing. */
	blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
	if (freed > 0)
		/* Got some memory back in the last second. */
		return;

	/*
	 * If this is from memcg, the oom-killer has already been invoked,
	 * so a system-wide OOM is not worthwhile.
	 */
	if (mem_cgroup_oom_called(current))
		goto rest_and_return;

	if (sysctl_panic_on_oom)
		panic("out of memory from page fault. panic_on_oom is selected.\n");

	read_lock(&tasklist_lock);
	__out_of_memory(0, 0); /* unknown gfp_mask and order */
	read_unlock(&tasklist_lock);

	/*
	 * Give "p" a good chance of killing itself before we
	 * retry to allocate memory.
	 */
rest_and_return:
	if (!test_thread_flag(TIF_MEMDIE))
		schedule_timeout_uninterruptible(1);
}

/**
 * out_of_memory - kill the "best" process when we run out of memory
 * @zonelist: zonelist pointer
 * @gfp_mask: memory allocation flags
 * @order: amount of memory being requested as a power of 2
 *
 * If we run out of memory, we have the choice between either
 * killing a random task (bad), letting the system crash (worse)
 * OR try to be smart about which process to kill. Note that we
 * don't have to be perfect here, we just have to be good.
 */
void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
{
	unsigned long freed = 0;
	enum oom_constraint constraint;

	/* Give OOM notifiers a chance to free memory before killing. */
	blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
	if (freed > 0)
		/* Got some memory back in the last second. */
		return;

	/* panic_on_oom == 2 means always panic, even for constrained OOM. */
	if (sysctl_panic_on_oom == 2) {
		dump_header(gfp_mask, order, NULL);
		panic("out of memory. Compulsory panic_on_oom is selected.\n");
	}

	/*
	 * Check if there were limitations on the allocation (only relevant for
	 * NUMA) that may require different handling.
	 */
	constraint = constrained_alloc(zonelist, gfp_mask);
	read_lock(&tasklist_lock);

	switch (constraint) {
	case CONSTRAINT_MEMORY_POLICY:
		/* mempolicy-constrained: killing current is the only option */
		oom_kill_process(current, gfp_mask, order, 0, NULL,
				"No available memory (MPOL_BIND)");
		break;

	case CONSTRAINT_NONE:
		if (sysctl_panic_on_oom) {
			dump_header(gfp_mask, order, NULL);
			panic("out of memory. panic_on_oom is selected\n");
		}
		/* Fall-through */
	case CONSTRAINT_CPUSET:
		__out_of_memory(gfp_mask, order);
		break;
	}

	read_unlock(&tasklist_lock);

	/*
	 * Give "p" a good chance of killing itself before we
	 * retry to allocate memory unless "p" is current
	 */
	if (!test_thread_flag(TIF_MEMDIE))
		schedule_timeout_uninterruptible(1);
}