oom_kill.c revision 79b9ce311e192e9a31fd9f3cf1ee4a4edf9e2650
1/* 2 * linux/mm/oom_kill.c 3 * 4 * Copyright (C) 1998,2000 Rik van Riel 5 * Thanks go out to Claus Fischer for some serious inspiration and 6 * for goading me into coding this file... 7 * 8 * The routines in this file are used to kill a process when 9 * we're seriously out of memory. This gets called from kswapd() 10 * in linux/mm/vmscan.c when we really run out of memory. 11 * 12 * Since we won't call these routines often (on a well-configured 13 * machine) this file will double as a 'coding guide' and a signpost 14 * for newbie kernel hackers. It features several pointers to major 15 * kernel subsystems and hints as to where to find out what things do. 16 */ 17 18#include <linux/mm.h> 19#include <linux/sched.h> 20#include <linux/swap.h> 21#include <linux/timex.h> 22#include <linux/jiffies.h> 23 24/* #define DEBUG */ 25 26/** 27 * oom_badness - calculate a numeric value for how bad this task has been 28 * @p: task struct of which task we should calculate 29 * @p: current uptime in seconds 30 * 31 * The formula used is relatively simple and documented inline in the 32 * function. The main rationale is that we want to select a good task 33 * to kill when we run out of memory. 34 * 35 * Good in this context means that: 36 * 1) we lose the minimum amount of work done 37 * 2) we recover a large amount of memory 38 * 3) we don't kill anything innocent of eating tons of memory 39 * 4) we want to kill the minimum amount of processes (one) 40 * 5) we try to kill the process the user expects us to kill, this 41 * algorithm has been meticulously tuned to meet the principle 42 * of least surprise ... (be careful when you change it) 43 */ 44 45unsigned long badness(struct task_struct *p, unsigned long uptime) 46{ 47 unsigned long points, cpu_time, run_time, s; 48 struct list_head *tsk; 49 50 if (!p->mm) 51 return 0; 52 53 /* 54 * The memory size of the process is the basis for the badness. 55 */ 56 points = p->mm->total_vm; 57 58 /* 59 * Processes which fork a lot of child processes are likely 60 * a good choice. We add the vmsize of the childs if they 61 * have an own mm. This prevents forking servers to flood the 62 * machine with an endless amount of childs 63 */ 64 list_for_each(tsk, &p->children) { 65 struct task_struct *chld; 66 chld = list_entry(tsk, struct task_struct, sibling); 67 if (chld->mm != p->mm && chld->mm) 68 points += chld->mm->total_vm; 69 } 70 71 /* 72 * CPU time is in tens of seconds and run time is in thousands 73 * of seconds. There is no particular reason for this other than 74 * that it turned out to work very well in practice. 75 */ 76 cpu_time = (cputime_to_jiffies(p->utime) + cputime_to_jiffies(p->stime)) 77 >> (SHIFT_HZ + 3); 78 79 if (uptime >= p->start_time.tv_sec) 80 run_time = (uptime - p->start_time.tv_sec) >> 10; 81 else 82 run_time = 0; 83 84 s = int_sqrt(cpu_time); 85 if (s) 86 points /= s; 87 s = int_sqrt(int_sqrt(run_time)); 88 if (s) 89 points /= s; 90 91 /* 92 * Niced processes are most likely less important, so double 93 * their badness points. 94 */ 95 if (task_nice(p) > 0) 96 points *= 2; 97 98 /* 99 * Superuser processes are usually more important, so we make it 100 * less likely that we kill those. 101 */ 102 if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_ADMIN) || 103 p->uid == 0 || p->euid == 0) 104 points /= 4; 105 106 /* 107 * We don't want to kill a process with direct hardware access. 108 * Not only could that mess up the hardware, but usually users 109 * tend to only have this flag set on applications they think 110 * of as important. 111 */ 112 if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO)) 113 points /= 4; 114 115 /* 116 * Adjust the score by oomkilladj. 117 */ 118 if (p->oomkilladj) { 119 if (p->oomkilladj > 0) 120 points <<= p->oomkilladj; 121 else 122 points >>= -(p->oomkilladj); 123 } 124 125#ifdef DEBUG 126 printk(KERN_DEBUG "OOMkill: task %d (%s) got %d points\n", 127 p->pid, p->comm, points); 128#endif 129 return points; 130} 131 132/* 133 * Simple selection loop. We chose the process with the highest 134 * number of 'points'. We expect the caller will lock the tasklist. 135 * 136 * (not docbooked, we don't want this one cluttering up the manual) 137 */ 138static struct task_struct * select_bad_process(void) 139{ 140 unsigned long maxpoints = 0; 141 struct task_struct *g, *p; 142 struct task_struct *chosen = NULL; 143 struct timespec uptime; 144 145 do_posix_clock_monotonic_gettime(&uptime); 146 do_each_thread(g, p) 147 /* skip the init task with pid == 1 */ 148 if (p->pid > 1 && p->oomkilladj != OOM_DISABLE) { 149 unsigned long points; 150 151 /* 152 * This is in the process of releasing memory so wait it 153 * to finish before killing some other task by mistake. 154 */ 155 if ((unlikely(test_tsk_thread_flag(p, TIF_MEMDIE)) || (p->flags & PF_EXITING)) && 156 !(p->flags & PF_DEAD)) 157 return ERR_PTR(-1UL); 158 if (p->flags & PF_SWAPOFF) 159 return p; 160 161 points = badness(p, uptime.tv_sec); 162 if (points > maxpoints || !chosen) { 163 chosen = p; 164 maxpoints = points; 165 } 166 } 167 while_each_thread(g, p); 168 return chosen; 169} 170 171/** 172 * We must be careful though to never send SIGKILL a process with 173 * CAP_SYS_RAW_IO set, send SIGTERM instead (but it's unlikely that 174 * we select a process with CAP_SYS_RAW_IO set). 175 */ 176static void __oom_kill_task(task_t *p) 177{ 178 if (p->pid == 1) { 179 WARN_ON(1); 180 printk(KERN_WARNING "tried to kill init!\n"); 181 return; 182 } 183 184 task_lock(p); 185 if (!p->mm || p->mm == &init_mm) { 186 WARN_ON(1); 187 printk(KERN_WARNING "tried to kill an mm-less task!\n"); 188 task_unlock(p); 189 return; 190 } 191 task_unlock(p); 192 printk(KERN_ERR "Out of Memory: Killed process %d (%s).\n", p->pid, p->comm); 193 194 /* 195 * We give our sacrificial lamb high priority and access to 196 * all the memory it needs. That way it should be able to 197 * exit() and clear out its resources quickly... 198 */ 199 p->time_slice = HZ; 200 set_tsk_thread_flag(p, TIF_MEMDIE); 201 202 force_sig(SIGKILL, p); 203} 204 205static struct mm_struct *oom_kill_task(task_t *p) 206{ 207 struct mm_struct *mm = get_task_mm(p); 208 task_t * g, * q; 209 210 if (!mm) 211 return NULL; 212 if (mm == &init_mm) { 213 mmput(mm); 214 return NULL; 215 } 216 217 __oom_kill_task(p); 218 /* 219 * kill all processes that share the ->mm (i.e. all threads), 220 * but are in a different thread group 221 */ 222 do_each_thread(g, q) 223 if (q->mm == mm && q->tgid != p->tgid) 224 __oom_kill_task(q); 225 while_each_thread(g, q); 226 227 return mm; 228} 229 230static struct mm_struct *oom_kill_process(struct task_struct *p) 231{ 232 struct mm_struct *mm; 233 struct task_struct *c; 234 struct list_head *tsk; 235 236 /* Try to kill a child first */ 237 list_for_each(tsk, &p->children) { 238 c = list_entry(tsk, struct task_struct, sibling); 239 if (c->mm == p->mm) 240 continue; 241 mm = oom_kill_task(c); 242 if (mm) 243 return mm; 244 } 245 return oom_kill_task(p); 246} 247 248/** 249 * oom_kill - kill the "best" process when we run out of memory 250 * 251 * If we run out of memory, we have the choice between either 252 * killing a random task (bad), letting the system crash (worse) 253 * OR try to be smart about which process to kill. Note that we 254 * don't have to be perfect here, we just have to be good. 255 */ 256void out_of_memory(unsigned int __nocast gfp_mask, int order) 257{ 258 struct mm_struct *mm = NULL; 259 task_t * p; 260 261 printk("oom-killer: gfp_mask=0x%x, order=%d\n", gfp_mask, order); 262 /* print memory stats */ 263 show_mem(); 264 265 read_lock(&tasklist_lock); 266retry: 267 p = select_bad_process(); 268 269 if (PTR_ERR(p) == -1UL) 270 goto out; 271 272 /* Found nothing?!?! Either we hang forever, or we panic. */ 273 if (!p) { 274 read_unlock(&tasklist_lock); 275 panic("Out of memory and no killable processes...\n"); 276 } 277 278 mm = oom_kill_process(p); 279 if (!mm) 280 goto retry; 281 282 out: 283 read_unlock(&tasklist_lock); 284 if (mm) 285 mmput(mm); 286 287 /* 288 * Give "p" a good chance of killing itself before we 289 * retry to allocate memory. 290 */ 291 __set_current_state(TASK_INTERRUPTIBLE); 292 schedule_timeout(1); 293} 294