core.c revision 1a687c2e9a99335c9e77392f050fe607fa18a652
1/* 2 * kernel/sched/core.c 3 * 4 * Kernel scheduler and related syscalls 5 * 6 * Copyright (C) 1991-2002 Linus Torvalds 7 * 8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and 9 * make semaphores SMP safe 10 * 1998-11-19 Implemented schedule_timeout() and related stuff 11 * by Andrea Arcangeli 12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: 13 * hybrid priority-list and round-robin design with 14 * an array-switch method of distributing timeslices 15 * and per-CPU runqueues. Cleanups and useful suggestions 16 * by Davide Libenzi, preemptible kernel bits by Robert Love. 17 * 2003-09-03 Interactivity tuning by Con Kolivas. 18 * 2004-04-02 Scheduler domains code by Nick Piggin 19 * 2007-04-15 Work begun on replacing all interactivity tuning with a 20 * fair scheduling design by Con Kolivas. 21 * 2007-05-05 Load balancing (smp-nice) and other improvements 22 * by Peter Williams 23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith 24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri 25 * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, 26 * Thomas Gleixner, Mike Kravetz 27 */ 28 29#include <linux/mm.h> 30#include <linux/module.h> 31#include <linux/nmi.h> 32#include <linux/init.h> 33#include <linux/uaccess.h> 34#include <linux/highmem.h> 35#include <asm/mmu_context.h> 36#include <linux/interrupt.h> 37#include <linux/capability.h> 38#include <linux/completion.h> 39#include <linux/kernel_stat.h> 40#include <linux/debug_locks.h> 41#include <linux/perf_event.h> 42#include <linux/security.h> 43#include <linux/notifier.h> 44#include <linux/profile.h> 45#include <linux/freezer.h> 46#include <linux/vmalloc.h> 47#include <linux/blkdev.h> 48#include <linux/delay.h> 49#include <linux/pid_namespace.h> 50#include <linux/smp.h> 51#include <linux/threads.h> 52#include <linux/timer.h> 53#include <linux/rcupdate.h> 54#include <linux/cpu.h> 55#include <linux/cpuset.h> 56#include <linux/percpu.h> 57#include <linux/proc_fs.h> 58#include <linux/seq_file.h> 59#include <linux/sysctl.h> 60#include <linux/syscalls.h> 61#include <linux/times.h> 62#include <linux/tsacct_kern.h> 63#include <linux/kprobes.h> 64#include <linux/delayacct.h> 65#include <linux/unistd.h> 66#include <linux/pagemap.h> 67#include <linux/hrtimer.h> 68#include <linux/tick.h> 69#include <linux/debugfs.h> 70#include <linux/ctype.h> 71#include <linux/ftrace.h> 72#include <linux/slab.h> 73#include <linux/init_task.h> 74#include <linux/binfmts.h> 75 76#include <asm/switch_to.h> 77#include <asm/tlb.h> 78#include <asm/irq_regs.h> 79#include <asm/mutex.h> 80#ifdef CONFIG_PARAVIRT 81#include <asm/paravirt.h> 82#endif 83 84#include "sched.h" 85#include "../workqueue_sched.h" 86#include "../smpboot.h" 87 88#define CREATE_TRACE_POINTS 89#include <trace/events/sched.h> 90 91void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) 92{ 93 unsigned long delta; 94 ktime_t soft, hard, now; 95 96 for (;;) { 97 if (hrtimer_active(period_timer)) 98 break; 99 100 now = hrtimer_cb_get_time(period_timer); 101 hrtimer_forward(period_timer, now, period); 102 103 soft = hrtimer_get_softexpires(period_timer); 104 hard = hrtimer_get_expires(period_timer); 105 delta = ktime_to_ns(ktime_sub(hard, soft)); 106 __hrtimer_start_range_ns(period_timer, soft, delta, 107 HRTIMER_MODE_ABS_PINNED, 0); 108 } 109} 110 111DEFINE_MUTEX(sched_domains_mutex); 112DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 113 114static void update_rq_clock_task(struct rq *rq, s64 delta); 115 116void 
update_rq_clock(struct rq *rq) 117{ 118 s64 delta; 119 120 if (rq->skip_clock_update > 0) 121 return; 122 123 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; 124 rq->clock += delta; 125 update_rq_clock_task(rq, delta); 126} 127 128/* 129 * Debugging: various feature bits 130 */ 131 132#define SCHED_FEAT(name, enabled) \ 133 (1UL << __SCHED_FEAT_##name) * enabled | 134 135const_debug unsigned int sysctl_sched_features = 136#include "features.h" 137 0; 138 139#undef SCHED_FEAT 140 141#ifdef CONFIG_SCHED_DEBUG 142#define SCHED_FEAT(name, enabled) \ 143 #name , 144 145static const char * const sched_feat_names[] = { 146#include "features.h" 147}; 148 149#undef SCHED_FEAT 150 151static int sched_feat_show(struct seq_file *m, void *v) 152{ 153 int i; 154 155 for (i = 0; i < __SCHED_FEAT_NR; i++) { 156 if (!(sysctl_sched_features & (1UL << i))) 157 seq_puts(m, "NO_"); 158 seq_printf(m, "%s ", sched_feat_names[i]); 159 } 160 seq_puts(m, "\n"); 161 162 return 0; 163} 164 165#ifdef HAVE_JUMP_LABEL 166 167#define jump_label_key__true STATIC_KEY_INIT_TRUE 168#define jump_label_key__false STATIC_KEY_INIT_FALSE 169 170#define SCHED_FEAT(name, enabled) \ 171 jump_label_key__##enabled , 172 173struct static_key sched_feat_keys[__SCHED_FEAT_NR] = { 174#include "features.h" 175}; 176 177#undef SCHED_FEAT 178 179static void sched_feat_disable(int i) 180{ 181 if (static_key_enabled(&sched_feat_keys[i])) 182 static_key_slow_dec(&sched_feat_keys[i]); 183} 184 185static void sched_feat_enable(int i) 186{ 187 if (!static_key_enabled(&sched_feat_keys[i])) 188 static_key_slow_inc(&sched_feat_keys[i]); 189} 190#else 191static void sched_feat_disable(int i) { }; 192static void sched_feat_enable(int i) { }; 193#endif /* HAVE_JUMP_LABEL */ 194 195static int sched_feat_set(char *cmp) 196{ 197 int i; 198 int neg = 0; 199 200 if (strncmp(cmp, "NO_", 3) == 0) { 201 neg = 1; 202 cmp += 3; 203 } 204 205 for (i = 0; i < __SCHED_FEAT_NR; i++) { 206 if (strcmp(cmp, sched_feat_names[i]) == 0) { 207 if (neg) { 208 sysctl_sched_features &= ~(1UL << i); 209 sched_feat_disable(i); 210 } else { 211 sysctl_sched_features |= (1UL << i); 212 sched_feat_enable(i); 213 } 214 break; 215 } 216 } 217 218 return i; 219} 220 221static ssize_t 222sched_feat_write(struct file *filp, const char __user *ubuf, 223 size_t cnt, loff_t *ppos) 224{ 225 char buf[64]; 226 char *cmp; 227 int i; 228 229 if (cnt > 63) 230 cnt = 63; 231 232 if (copy_from_user(&buf, ubuf, cnt)) 233 return -EFAULT; 234 235 buf[cnt] = 0; 236 cmp = strstrip(buf); 237 238 i = sched_feat_set(cmp); 239 if (i == __SCHED_FEAT_NR) 240 return -EINVAL; 241 242 *ppos += cnt; 243 244 return cnt; 245} 246 247static int sched_feat_open(struct inode *inode, struct file *filp) 248{ 249 return single_open(filp, sched_feat_show, NULL); 250} 251 252static const struct file_operations sched_feat_fops = { 253 .open = sched_feat_open, 254 .write = sched_feat_write, 255 .read = seq_read, 256 .llseek = seq_lseek, 257 .release = single_release, 258}; 259 260static __init int sched_init_debug(void) 261{ 262 debugfs_create_file("sched_features", 0644, NULL, NULL, 263 &sched_feat_fops); 264 265 return 0; 266} 267late_initcall(sched_init_debug); 268#endif /* CONFIG_SCHED_DEBUG */ 269 270/* 271 * Number of tasks to iterate in a single balance run. 272 * Limited because this is done with IRQs disabled. 273 */ 274const_debug unsigned int sysctl_sched_nr_migrate = 32; 275 276/* 277 * period over which we average the RT time consumption, measured 278 * in ms. 
279 * 280 * default: 1s 281 */ 282const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; 283 284/* 285 * period over which we measure -rt task cpu usage in us. 286 * default: 1s 287 */ 288unsigned int sysctl_sched_rt_period = 1000000; 289 290__read_mostly int scheduler_running; 291 292/* 293 * part of the period that we allow rt tasks to run in us. 294 * default: 0.95s 295 */ 296int sysctl_sched_rt_runtime = 950000; 297 298 299 300/* 301 * __task_rq_lock - lock the rq @p resides on. 302 */ 303static inline struct rq *__task_rq_lock(struct task_struct *p) 304 __acquires(rq->lock) 305{ 306 struct rq *rq; 307 308 lockdep_assert_held(&p->pi_lock); 309 310 for (;;) { 311 rq = task_rq(p); 312 raw_spin_lock(&rq->lock); 313 if (likely(rq == task_rq(p))) 314 return rq; 315 raw_spin_unlock(&rq->lock); 316 } 317} 318 319/* 320 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. 321 */ 322static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) 323 __acquires(p->pi_lock) 324 __acquires(rq->lock) 325{ 326 struct rq *rq; 327 328 for (;;) { 329 raw_spin_lock_irqsave(&p->pi_lock, *flags); 330 rq = task_rq(p); 331 raw_spin_lock(&rq->lock); 332 if (likely(rq == task_rq(p))) 333 return rq; 334 raw_spin_unlock(&rq->lock); 335 raw_spin_unlock_irqrestore(&p->pi_lock, *flags); 336 } 337} 338 339static void __task_rq_unlock(struct rq *rq) 340 __releases(rq->lock) 341{ 342 raw_spin_unlock(&rq->lock); 343} 344 345static inline void 346task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) 347 __releases(rq->lock) 348 __releases(p->pi_lock) 349{ 350 raw_spin_unlock(&rq->lock); 351 raw_spin_unlock_irqrestore(&p->pi_lock, *flags); 352} 353 354/* 355 * this_rq_lock - lock this runqueue and disable interrupts. 356 */ 357static struct rq *this_rq_lock(void) 358 __acquires(rq->lock) 359{ 360 struct rq *rq; 361 362 local_irq_disable(); 363 rq = this_rq(); 364 raw_spin_lock(&rq->lock); 365 366 return rq; 367} 368 369#ifdef CONFIG_SCHED_HRTICK 370/* 371 * Use HR-timers to deliver accurate preemption points. 372 * 373 * Its all a bit involved since we cannot program an hrt while holding the 374 * rq->lock. So what we do is store a state in in rq->hrtick_* and ask for a 375 * reschedule event. 376 * 377 * When we get rescheduled we reprogram the hrtick_timer outside of the 378 * rq->lock. 379 */ 380 381static void hrtick_clear(struct rq *rq) 382{ 383 if (hrtimer_active(&rq->hrtick_timer)) 384 hrtimer_cancel(&rq->hrtick_timer); 385} 386 387/* 388 * High-resolution timer tick. 389 * Runs from hardirq context with interrupts disabled. 390 */ 391static enum hrtimer_restart hrtick(struct hrtimer *timer) 392{ 393 struct rq *rq = container_of(timer, struct rq, hrtick_timer); 394 395 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); 396 397 raw_spin_lock(&rq->lock); 398 update_rq_clock(rq); 399 rq->curr->sched_class->task_tick(rq, rq->curr, 1); 400 raw_spin_unlock(&rq->lock); 401 402 return HRTIMER_NORESTART; 403} 404 405#ifdef CONFIG_SMP 406/* 407 * called from hardirq (IPI) context 408 */ 409static void __hrtick_start(void *arg) 410{ 411 struct rq *rq = arg; 412 413 raw_spin_lock(&rq->lock); 414 hrtimer_restart(&rq->hrtick_timer); 415 rq->hrtick_csd_pending = 0; 416 raw_spin_unlock(&rq->lock); 417} 418 419/* 420 * Called to set the hrtick timer state. 
421 * 422 * called with rq->lock held and irqs disabled 423 */ 424void hrtick_start(struct rq *rq, u64 delay) 425{ 426 struct hrtimer *timer = &rq->hrtick_timer; 427 ktime_t time = ktime_add_ns(timer->base->get_time(), delay); 428 429 hrtimer_set_expires(timer, time); 430 431 if (rq == this_rq()) { 432 hrtimer_restart(timer); 433 } else if (!rq->hrtick_csd_pending) { 434 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0); 435 rq->hrtick_csd_pending = 1; 436 } 437} 438 439static int 440hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) 441{ 442 int cpu = (int)(long)hcpu; 443 444 switch (action) { 445 case CPU_UP_CANCELED: 446 case CPU_UP_CANCELED_FROZEN: 447 case CPU_DOWN_PREPARE: 448 case CPU_DOWN_PREPARE_FROZEN: 449 case CPU_DEAD: 450 case CPU_DEAD_FROZEN: 451 hrtick_clear(cpu_rq(cpu)); 452 return NOTIFY_OK; 453 } 454 455 return NOTIFY_DONE; 456} 457 458static __init void init_hrtick(void) 459{ 460 hotcpu_notifier(hotplug_hrtick, 0); 461} 462#else 463/* 464 * Called to set the hrtick timer state. 465 * 466 * called with rq->lock held and irqs disabled 467 */ 468void hrtick_start(struct rq *rq, u64 delay) 469{ 470 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, 471 HRTIMER_MODE_REL_PINNED, 0); 472} 473 474static inline void init_hrtick(void) 475{ 476} 477#endif /* CONFIG_SMP */ 478 479static void init_rq_hrtick(struct rq *rq) 480{ 481#ifdef CONFIG_SMP 482 rq->hrtick_csd_pending = 0; 483 484 rq->hrtick_csd.flags = 0; 485 rq->hrtick_csd.func = __hrtick_start; 486 rq->hrtick_csd.info = rq; 487#endif 488 489 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 490 rq->hrtick_timer.function = hrtick; 491} 492#else /* CONFIG_SCHED_HRTICK */ 493static inline void hrtick_clear(struct rq *rq) 494{ 495} 496 497static inline void init_rq_hrtick(struct rq *rq) 498{ 499} 500 501static inline void init_hrtick(void) 502{ 503} 504#endif /* CONFIG_SCHED_HRTICK */ 505 506/* 507 * resched_task - mark a task 'to be rescheduled now'. 508 * 509 * On UP this means the setting of the need_resched flag, on SMP it 510 * might also involve a cross-CPU call to trigger the scheduler on 511 * the target CPU. 512 */ 513#ifdef CONFIG_SMP 514 515#ifndef tsk_is_polling 516#define tsk_is_polling(t) 0 517#endif 518 519void resched_task(struct task_struct *p) 520{ 521 int cpu; 522 523 assert_raw_spin_locked(&task_rq(p)->lock); 524 525 if (test_tsk_need_resched(p)) 526 return; 527 528 set_tsk_need_resched(p); 529 530 cpu = task_cpu(p); 531 if (cpu == smp_processor_id()) 532 return; 533 534 /* NEED_RESCHED must be visible before we test polling */ 535 smp_mb(); 536 if (!tsk_is_polling(p)) 537 smp_send_reschedule(cpu); 538} 539 540void resched_cpu(int cpu) 541{ 542 struct rq *rq = cpu_rq(cpu); 543 unsigned long flags; 544 545 if (!raw_spin_trylock_irqsave(&rq->lock, flags)) 546 return; 547 resched_task(cpu_curr(cpu)); 548 raw_spin_unlock_irqrestore(&rq->lock, flags); 549} 550 551#ifdef CONFIG_NO_HZ 552/* 553 * In the semi idle case, use the nearest busy cpu for migrating timers 554 * from an idle cpu. This is good for power-savings. 555 * 556 * We don't do similar optimization for completely idle system, as 557 * selecting an idle cpu will add more delays to the timers than intended 558 * (as that cpu's timer base may not be uptodate wrt jiffies etc). 
559 */ 560int get_nohz_timer_target(void) 561{ 562 int cpu = smp_processor_id(); 563 int i; 564 struct sched_domain *sd; 565 566 rcu_read_lock(); 567 for_each_domain(cpu, sd) { 568 for_each_cpu(i, sched_domain_span(sd)) { 569 if (!idle_cpu(i)) { 570 cpu = i; 571 goto unlock; 572 } 573 } 574 } 575unlock: 576 rcu_read_unlock(); 577 return cpu; 578} 579/* 580 * When add_timer_on() enqueues a timer into the timer wheel of an 581 * idle CPU then this timer might expire before the next timer event 582 * which is scheduled to wake up that CPU. In case of a completely 583 * idle system the next event might even be infinite time into the 584 * future. wake_up_idle_cpu() ensures that the CPU is woken up and 585 * leaves the inner idle loop so the newly added timer is taken into 586 * account when the CPU goes back to idle and evaluates the timer 587 * wheel for the next timer event. 588 */ 589void wake_up_idle_cpu(int cpu) 590{ 591 struct rq *rq = cpu_rq(cpu); 592 593 if (cpu == smp_processor_id()) 594 return; 595 596 /* 597 * This is safe, as this function is called with the timer 598 * wheel base lock of (cpu) held. When the CPU is on the way 599 * to idle and has not yet set rq->curr to idle then it will 600 * be serialized on the timer wheel base lock and take the new 601 * timer into account automatically. 602 */ 603 if (rq->curr != rq->idle) 604 return; 605 606 /* 607 * We can set TIF_RESCHED on the idle task of the other CPU 608 * lockless. The worst case is that the other CPU runs the 609 * idle task through an additional NOOP schedule() 610 */ 611 set_tsk_need_resched(rq->idle); 612 613 /* NEED_RESCHED must be visible before we test polling */ 614 smp_mb(); 615 if (!tsk_is_polling(rq->idle)) 616 smp_send_reschedule(cpu); 617} 618 619static inline bool got_nohz_idle_kick(void) 620{ 621 int cpu = smp_processor_id(); 622 return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); 623} 624 625#else /* CONFIG_NO_HZ */ 626 627static inline bool got_nohz_idle_kick(void) 628{ 629 return false; 630} 631 632#endif /* CONFIG_NO_HZ */ 633 634void sched_avg_update(struct rq *rq) 635{ 636 s64 period = sched_avg_period(); 637 638 while ((s64)(rq->clock - rq->age_stamp) > period) { 639 /* 640 * Inline assembly required to prevent the compiler 641 * optimising this loop into a divmod call. 642 * See __iter_div_u64_rem() for another example of this. 643 */ 644 asm("" : "+rm" (rq->age_stamp)); 645 rq->age_stamp += period; 646 rq->rt_avg /= 2; 647 } 648} 649 650#else /* !CONFIG_SMP */ 651void resched_task(struct task_struct *p) 652{ 653 assert_raw_spin_locked(&task_rq(p)->lock); 654 set_tsk_need_resched(p); 655} 656#endif /* CONFIG_SMP */ 657 658#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ 659 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH))) 660/* 661 * Iterate task_group tree rooted at *from, calling @down when first entering a 662 * node and @up when leaving it for the final time. 663 * 664 * Caller must hold rcu_lock or sufficient equivalent. 
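 *
 * As a rough usage sketch (my_down_visitor() here is only a placeholder for
 * the caller's own tg_visitor callback), a full walk of the hierarchy starts
 * at the root group and runs under RCU:
 *
 *	rcu_read_lock();
 *	ret = walk_tg_tree_from(&root_task_group, my_down_visitor, tg_nop, data);
 *	rcu_read_unlock();
 *
 * tg_nop(), defined below, is the usual no-op @up visitor.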
 */
int walk_tg_tree_from(struct task_group *from,
		      tg_visitor down, tg_visitor up, void *data)
{
	struct task_group *parent, *child;
	int ret;

	parent = from;

down:
	ret = (*down)(parent, data);
	if (ret)
		goto out;
	list_for_each_entry_rcu(child, &parent->children, siblings) {
		parent = child;
		goto down;

up:
		continue;
	}
	ret = (*up)(parent, data);
	if (ret || parent == from)
		goto out;

	child = parent;
	parent = parent->parent;
	if (parent)
		goto up;
out:
	return ret;
}

int tg_nop(struct task_group *tg, void *data)
{
	return 0;
}
#endif

static void set_load_weight(struct task_struct *p)
{
	int prio = p->static_prio - MAX_RT_PRIO;
	struct load_weight *load = &p->se.load;

	/*
	 * SCHED_IDLE tasks get minimal weight:
	 */
	if (p->policy == SCHED_IDLE) {
		load->weight = scale_load(WEIGHT_IDLEPRIO);
		load->inv_weight = WMULT_IDLEPRIO;
		return;
	}

	load->weight = scale_load(prio_to_weight[prio]);
	load->inv_weight = prio_to_wmult[prio];
}

static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
	update_rq_clock(rq);
	sched_info_queued(p);
	p->sched_class->enqueue_task(rq, p, flags);
}

static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
	update_rq_clock(rq);
	sched_info_dequeued(p);
	p->sched_class->dequeue_task(rq, p, flags);
}

void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (task_contributes_to_load(p))
		rq->nr_uninterruptible--;

	enqueue_task(rq, p, flags);
}

void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (task_contributes_to_load(p))
		rq->nr_uninterruptible++;

	dequeue_task(rq, p, flags);
}

static void update_rq_clock_task(struct rq *rq, s64 delta)
{
/*
 * In theory, the compiler should just see 0 here, and optimize out the call
 * to sched_rt_avg_update. But I don't trust it...
 */
#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
	s64 steal = 0, irq_delta = 0;
#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
	irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;

	/*
	 * Since irq_time is only updated on {soft,}irq_exit, we might run into
	 * this case when a previous update_rq_clock() happened inside a
	 * {soft,}irq region.
	 *
	 * When this happens, we stop ->clock_task and only update the
	 * prev_irq_time stamp to account for the part that fit, so that a next
	 * update will consume the rest. This ensures ->clock_task is
	 * monotonic.
	 *
	 * It does however cause some slight mis-attribution of {soft,}irq
	 * time; a more accurate solution would be to update the irq_time using
	 * the current rq->clock timestamp, except that would require using
	 * atomic ops.
	 */
	if (irq_delta > delta)
		irq_delta = delta;

	rq->prev_irq_time += irq_delta;
	delta -= irq_delta;
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
	if (static_key_false((&paravirt_steal_rq_enabled))) {
		u64 st;

		steal = paravirt_steal_clock(cpu_of(rq));
		steal -= rq->prev_steal_time_rq;

		if (unlikely(steal > delta))
			steal = delta;

		st = steal_ticks(steal);
		steal = st * TICK_NSEC;

		rq->prev_steal_time_rq += steal;

		delta -= steal;
	}
#endif

	rq->clock_task += delta;

#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
	if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
		sched_rt_avg_update(rq, irq_delta + steal);
#endif
}

void sched_set_stop_task(int cpu, struct task_struct *stop)
{
	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
	struct task_struct *old_stop = cpu_rq(cpu)->stop;

	if (stop) {
		/*
		 * Make it appear like a SCHED_FIFO task, it's something
		 * userspace knows about and won't get confused about.
		 *
		 * Also, it will make PI more or less work without too
		 * much confusion -- but then, stop work should not
		 * rely on PI working anyway.
		 */
		sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);

		stop->sched_class = &stop_sched_class;
	}

	cpu_rq(cpu)->stop = stop;

	if (old_stop) {
		/*
		 * Reset it back to a normal scheduling class so that
		 * it can die in pieces.
		 */
		old_stop->sched_class = &rt_sched_class;
	}
}

/*
 * __normal_prio - return the priority that is based on the static prio
 */
static inline int __normal_prio(struct task_struct *p)
{
	return p->static_prio;
}

/*
 * Calculate the expected normal priority: i.e. priority
 * without taking RT-inheritance into account. Might be
 * boosted by interactivity modifiers. Changes upon fork,
 * setprio syscalls, and whenever the interactivity
 * estimator recalculates.
 */
static inline int normal_prio(struct task_struct *p)
{
	int prio;

	if (task_has_rt_policy(p))
		prio = MAX_RT_PRIO-1 - p->rt_priority;
	else
		prio = __normal_prio(p);
	return prio;
}

/*
 * Calculate the current priority, i.e. the priority
 * taken into account by the scheduler. This value might
 * be boosted by RT tasks, or might be boosted by
 * interactivity modifiers. Will be RT if the task got
 * RT-boosted. If not then it returns p->normal_prio.
 */
static int effective_prio(struct task_struct *p)
{
	p->normal_prio = normal_prio(p);
	/*
	 * If we are RT tasks or we were boosted to RT priority,
	 * keep the priority unchanged. Otherwise, update priority
	 * to the normal priority:
	 */
	if (!rt_prio(p->prio))
		return p->normal_prio;
	return p->prio;
}

/**
 * task_curr - is this task currently executing on a CPU?
 * @p: the task in question.
890 */ 891inline int task_curr(const struct task_struct *p) 892{ 893 return cpu_curr(task_cpu(p)) == p; 894} 895 896static inline void check_class_changed(struct rq *rq, struct task_struct *p, 897 const struct sched_class *prev_class, 898 int oldprio) 899{ 900 if (prev_class != p->sched_class) { 901 if (prev_class->switched_from) 902 prev_class->switched_from(rq, p); 903 p->sched_class->switched_to(rq, p); 904 } else if (oldprio != p->prio) 905 p->sched_class->prio_changed(rq, p, oldprio); 906} 907 908void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) 909{ 910 const struct sched_class *class; 911 912 if (p->sched_class == rq->curr->sched_class) { 913 rq->curr->sched_class->check_preempt_curr(rq, p, flags); 914 } else { 915 for_each_class(class) { 916 if (class == rq->curr->sched_class) 917 break; 918 if (class == p->sched_class) { 919 resched_task(rq->curr); 920 break; 921 } 922 } 923 } 924 925 /* 926 * A queue event has occurred, and we're going to schedule. In 927 * this case, we can save a useless back to back clock update. 928 */ 929 if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) 930 rq->skip_clock_update = 1; 931} 932 933#ifdef CONFIG_SMP 934void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 935{ 936#ifdef CONFIG_SCHED_DEBUG 937 /* 938 * We should never call set_task_cpu() on a blocked task, 939 * ttwu() will sort out the placement. 940 */ 941 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && 942 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); 943 944#ifdef CONFIG_LOCKDEP 945 /* 946 * The caller should hold either p->pi_lock or rq->lock, when changing 947 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. 948 * 949 * sched_move_task() holds both and thus holding either pins the cgroup, 950 * see task_group(). 951 * 952 * Furthermore, all task_rq users should acquire both locks, see 953 * task_rq_lock(). 954 */ 955 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || 956 lockdep_is_held(&task_rq(p)->lock))); 957#endif 958#endif 959 960 trace_sched_migrate_task(p, new_cpu); 961 962 if (task_cpu(p) != new_cpu) { 963 p->se.nr_migrations++; 964 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); 965 } 966 967 __set_task_cpu(p, new_cpu); 968} 969 970struct migration_arg { 971 struct task_struct *task; 972 int dest_cpu; 973}; 974 975static int migration_cpu_stop(void *data); 976 977/* 978 * wait_task_inactive - wait for a thread to unschedule. 979 * 980 * If @match_state is nonzero, it's the @p->state value just checked and 981 * not expected to change. If it changes, i.e. @p might have woken up, 982 * then return zero. When we succeed in waiting for @p to be off its CPU, 983 * we return a positive number (its total switch count). If a second call 984 * a short while later returns the same number, the caller can be sure that 985 * @p has remained unscheduled the whole time. 986 * 987 * The caller must ensure that the task *will* unschedule sometime soon, 988 * else this function might spin for a *long* time. This function can't 989 * be called with interrupts off, or it may introduce deadlock with 990 * smp_call_function() if an IPI is sent by the same process we are 991 * waiting to become inactive. 
992 */ 993unsigned long wait_task_inactive(struct task_struct *p, long match_state) 994{ 995 unsigned long flags; 996 int running, on_rq; 997 unsigned long ncsw; 998 struct rq *rq; 999 1000 for (;;) { 1001 /* 1002 * We do the initial early heuristics without holding 1003 * any task-queue locks at all. We'll only try to get 1004 * the runqueue lock when things look like they will 1005 * work out! 1006 */ 1007 rq = task_rq(p); 1008 1009 /* 1010 * If the task is actively running on another CPU 1011 * still, just relax and busy-wait without holding 1012 * any locks. 1013 * 1014 * NOTE! Since we don't hold any locks, it's not 1015 * even sure that "rq" stays as the right runqueue! 1016 * But we don't care, since "task_running()" will 1017 * return false if the runqueue has changed and p 1018 * is actually now running somewhere else! 1019 */ 1020 while (task_running(rq, p)) { 1021 if (match_state && unlikely(p->state != match_state)) 1022 return 0; 1023 cpu_relax(); 1024 } 1025 1026 /* 1027 * Ok, time to look more closely! We need the rq 1028 * lock now, to be *sure*. If we're wrong, we'll 1029 * just go back and repeat. 1030 */ 1031 rq = task_rq_lock(p, &flags); 1032 trace_sched_wait_task(p); 1033 running = task_running(rq, p); 1034 on_rq = p->on_rq; 1035 ncsw = 0; 1036 if (!match_state || p->state == match_state) 1037 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ 1038 task_rq_unlock(rq, p, &flags); 1039 1040 /* 1041 * If it changed from the expected state, bail out now. 1042 */ 1043 if (unlikely(!ncsw)) 1044 break; 1045 1046 /* 1047 * Was it really running after all now that we 1048 * checked with the proper locks actually held? 1049 * 1050 * Oops. Go back and try again.. 1051 */ 1052 if (unlikely(running)) { 1053 cpu_relax(); 1054 continue; 1055 } 1056 1057 /* 1058 * It's not enough that it's not actively running, 1059 * it must be off the runqueue _entirely_, and not 1060 * preempted! 1061 * 1062 * So if it was still runnable (but just not actively 1063 * running right now), it's preempted, and we should 1064 * yield - it could be a while. 1065 */ 1066 if (unlikely(on_rq)) { 1067 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); 1068 1069 set_current_state(TASK_UNINTERRUPTIBLE); 1070 schedule_hrtimeout(&to, HRTIMER_MODE_REL); 1071 continue; 1072 } 1073 1074 /* 1075 * Ahh, all good. It wasn't running, and it wasn't 1076 * runnable, which means that it will never become 1077 * running in the future either. We're all done! 1078 */ 1079 break; 1080 } 1081 1082 return ncsw; 1083} 1084 1085/*** 1086 * kick_process - kick a running thread to enter/exit the kernel 1087 * @p: the to-be-kicked thread 1088 * 1089 * Cause a process which is running on another CPU to enter 1090 * kernel-mode, without any delay. (to get signals handled.) 1091 * 1092 * NOTE: this function doesn't have to take the runqueue lock, 1093 * because all it wants to ensure is that the remote task enters 1094 * the kernel. If the IPI races and the task has been migrated 1095 * to another CPU then no harm is done and the purpose has been 1096 * achieved as well. 
1097 */ 1098void kick_process(struct task_struct *p) 1099{ 1100 int cpu; 1101 1102 preempt_disable(); 1103 cpu = task_cpu(p); 1104 if ((cpu != smp_processor_id()) && task_curr(p)) 1105 smp_send_reschedule(cpu); 1106 preempt_enable(); 1107} 1108EXPORT_SYMBOL_GPL(kick_process); 1109#endif /* CONFIG_SMP */ 1110 1111#ifdef CONFIG_SMP 1112/* 1113 * ->cpus_allowed is protected by both rq->lock and p->pi_lock 1114 */ 1115static int select_fallback_rq(int cpu, struct task_struct *p) 1116{ 1117 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); 1118 enum { cpuset, possible, fail } state = cpuset; 1119 int dest_cpu; 1120 1121 /* Look for allowed, online CPU in same node. */ 1122 for_each_cpu(dest_cpu, nodemask) { 1123 if (!cpu_online(dest_cpu)) 1124 continue; 1125 if (!cpu_active(dest_cpu)) 1126 continue; 1127 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) 1128 return dest_cpu; 1129 } 1130 1131 for (;;) { 1132 /* Any allowed, online CPU? */ 1133 for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) { 1134 if (!cpu_online(dest_cpu)) 1135 continue; 1136 if (!cpu_active(dest_cpu)) 1137 continue; 1138 goto out; 1139 } 1140 1141 switch (state) { 1142 case cpuset: 1143 /* No more Mr. Nice Guy. */ 1144 cpuset_cpus_allowed_fallback(p); 1145 state = possible; 1146 break; 1147 1148 case possible: 1149 do_set_cpus_allowed(p, cpu_possible_mask); 1150 state = fail; 1151 break; 1152 1153 case fail: 1154 BUG(); 1155 break; 1156 } 1157 } 1158 1159out: 1160 if (state != cpuset) { 1161 /* 1162 * Don't tell them about moving exiting tasks or 1163 * kernel threads (both mm NULL), since they never 1164 * leave kernel. 1165 */ 1166 if (p->mm && printk_ratelimit()) { 1167 printk_sched("process %d (%s) no longer affine to cpu%d\n", 1168 task_pid_nr(p), p->comm, cpu); 1169 } 1170 } 1171 1172 return dest_cpu; 1173} 1174 1175/* 1176 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. 1177 */ 1178static inline 1179int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) 1180{ 1181 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); 1182 1183 /* 1184 * In order not to call set_task_cpu() on a blocking task we need 1185 * to rely on ttwu() to place the task on a valid ->cpus_allowed 1186 * cpu. 1187 * 1188 * Since this is common to all placement strategies, this lives here. 
1189 * 1190 * [ this allows ->select_task() to simply return task_cpu(p) and 1191 * not worry about this generic constraint ] 1192 */ 1193 if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) || 1194 !cpu_online(cpu))) 1195 cpu = select_fallback_rq(task_cpu(p), p); 1196 1197 return cpu; 1198} 1199 1200static void update_avg(u64 *avg, u64 sample) 1201{ 1202 s64 diff = sample - *avg; 1203 *avg += diff >> 3; 1204} 1205#endif 1206 1207static void 1208ttwu_stat(struct task_struct *p, int cpu, int wake_flags) 1209{ 1210#ifdef CONFIG_SCHEDSTATS 1211 struct rq *rq = this_rq(); 1212 1213#ifdef CONFIG_SMP 1214 int this_cpu = smp_processor_id(); 1215 1216 if (cpu == this_cpu) { 1217 schedstat_inc(rq, ttwu_local); 1218 schedstat_inc(p, se.statistics.nr_wakeups_local); 1219 } else { 1220 struct sched_domain *sd; 1221 1222 schedstat_inc(p, se.statistics.nr_wakeups_remote); 1223 rcu_read_lock(); 1224 for_each_domain(this_cpu, sd) { 1225 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { 1226 schedstat_inc(sd, ttwu_wake_remote); 1227 break; 1228 } 1229 } 1230 rcu_read_unlock(); 1231 } 1232 1233 if (wake_flags & WF_MIGRATED) 1234 schedstat_inc(p, se.statistics.nr_wakeups_migrate); 1235 1236#endif /* CONFIG_SMP */ 1237 1238 schedstat_inc(rq, ttwu_count); 1239 schedstat_inc(p, se.statistics.nr_wakeups); 1240 1241 if (wake_flags & WF_SYNC) 1242 schedstat_inc(p, se.statistics.nr_wakeups_sync); 1243 1244#endif /* CONFIG_SCHEDSTATS */ 1245} 1246 1247static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) 1248{ 1249 activate_task(rq, p, en_flags); 1250 p->on_rq = 1; 1251 1252 /* if a worker is waking up, notify workqueue */ 1253 if (p->flags & PF_WQ_WORKER) 1254 wq_worker_waking_up(p, cpu_of(rq)); 1255} 1256 1257/* 1258 * Mark the task runnable and perform wakeup-preemption. 1259 */ 1260static void 1261ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) 1262{ 1263 trace_sched_wakeup(p, true); 1264 check_preempt_curr(rq, p, wake_flags); 1265 1266 p->state = TASK_RUNNING; 1267#ifdef CONFIG_SMP 1268 if (p->sched_class->task_woken) 1269 p->sched_class->task_woken(rq, p); 1270 1271 if (rq->idle_stamp) { 1272 u64 delta = rq->clock - rq->idle_stamp; 1273 u64 max = 2*sysctl_sched_migration_cost; 1274 1275 if (delta > max) 1276 rq->avg_idle = max; 1277 else 1278 update_avg(&rq->avg_idle, delta); 1279 rq->idle_stamp = 0; 1280 } 1281#endif 1282} 1283 1284static void 1285ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) 1286{ 1287#ifdef CONFIG_SMP 1288 if (p->sched_contributes_to_load) 1289 rq->nr_uninterruptible--; 1290#endif 1291 1292 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING); 1293 ttwu_do_wakeup(rq, p, wake_flags); 1294} 1295 1296/* 1297 * Called in case the task @p isn't fully descheduled from its runqueue, 1298 * in this case we must do a remote wakeup. Its a 'light' wakeup though, 1299 * since all we need to do is flip p->state to TASK_RUNNING, since 1300 * the task is still ->on_rq. 
1301 */ 1302static int ttwu_remote(struct task_struct *p, int wake_flags) 1303{ 1304 struct rq *rq; 1305 int ret = 0; 1306 1307 rq = __task_rq_lock(p); 1308 if (p->on_rq) { 1309 ttwu_do_wakeup(rq, p, wake_flags); 1310 ret = 1; 1311 } 1312 __task_rq_unlock(rq); 1313 1314 return ret; 1315} 1316 1317#ifdef CONFIG_SMP 1318static void sched_ttwu_pending(void) 1319{ 1320 struct rq *rq = this_rq(); 1321 struct llist_node *llist = llist_del_all(&rq->wake_list); 1322 struct task_struct *p; 1323 1324 raw_spin_lock(&rq->lock); 1325 1326 while (llist) { 1327 p = llist_entry(llist, struct task_struct, wake_entry); 1328 llist = llist_next(llist); 1329 ttwu_do_activate(rq, p, 0); 1330 } 1331 1332 raw_spin_unlock(&rq->lock); 1333} 1334 1335void scheduler_ipi(void) 1336{ 1337 if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) 1338 return; 1339 1340 /* 1341 * Not all reschedule IPI handlers call irq_enter/irq_exit, since 1342 * traditionally all their work was done from the interrupt return 1343 * path. Now that we actually do some work, we need to make sure 1344 * we do call them. 1345 * 1346 * Some archs already do call them, luckily irq_enter/exit nest 1347 * properly. 1348 * 1349 * Arguably we should visit all archs and update all handlers, 1350 * however a fair share of IPIs are still resched only so this would 1351 * somewhat pessimize the simple resched case. 1352 */ 1353 irq_enter(); 1354 sched_ttwu_pending(); 1355 1356 /* 1357 * Check if someone kicked us for doing the nohz idle load balance. 1358 */ 1359 if (unlikely(got_nohz_idle_kick() && !need_resched())) { 1360 this_rq()->idle_balance = 1; 1361 raise_softirq_irqoff(SCHED_SOFTIRQ); 1362 } 1363 irq_exit(); 1364} 1365 1366static void ttwu_queue_remote(struct task_struct *p, int cpu) 1367{ 1368 if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) 1369 smp_send_reschedule(cpu); 1370} 1371 1372bool cpus_share_cache(int this_cpu, int that_cpu) 1373{ 1374 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); 1375} 1376#endif /* CONFIG_SMP */ 1377 1378static void ttwu_queue(struct task_struct *p, int cpu) 1379{ 1380 struct rq *rq = cpu_rq(cpu); 1381 1382#if defined(CONFIG_SMP) 1383 if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { 1384 sched_clock_cpu(cpu); /* sync clocks x-cpu */ 1385 ttwu_queue_remote(p, cpu); 1386 return; 1387 } 1388#endif 1389 1390 raw_spin_lock(&rq->lock); 1391 ttwu_do_activate(rq, p, 0); 1392 raw_spin_unlock(&rq->lock); 1393} 1394 1395/** 1396 * try_to_wake_up - wake up a thread 1397 * @p: the thread to be awakened 1398 * @state: the mask of task states that can be woken 1399 * @wake_flags: wake modifier flags (WF_*) 1400 * 1401 * Put it on the run-queue if it's not already there. The "current" 1402 * thread is always on the run-queue (except when the actual 1403 * re-schedule is in progress), and as such you're allowed to do 1404 * the simpler "current->state = TASK_RUNNING" to mark yourself 1405 * runnable without the overhead of this. 1406 * 1407 * Returns %true if @p was woken up, %false if it was already running 1408 * or @state didn't match @p's state. 
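 *
 * For context, this is the wake half of the classic sleep/wakeup pattern;
 * the sleeping side (condition and p are illustrative placeholders) looks
 * roughly like:
 *
 *	for (;;) {
 *		set_current_state(TASK_UNINTERRUPTIBLE);
 *		if (condition)
 *			break;
 *		schedule();
 *	}
 *	__set_current_state(TASK_RUNNING);
 *
 * while the waker sets the condition and then calls wake_up_process(p),
 * which ends up here with @state == TASK_ALL.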
1409 */ 1410static int 1411try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) 1412{ 1413 unsigned long flags; 1414 int cpu, success = 0; 1415 1416 smp_wmb(); 1417 raw_spin_lock_irqsave(&p->pi_lock, flags); 1418 if (!(p->state & state)) 1419 goto out; 1420 1421 success = 1; /* we're going to change ->state */ 1422 cpu = task_cpu(p); 1423 1424 if (p->on_rq && ttwu_remote(p, wake_flags)) 1425 goto stat; 1426 1427#ifdef CONFIG_SMP 1428 /* 1429 * If the owning (remote) cpu is still in the middle of schedule() with 1430 * this task as prev, wait until its done referencing the task. 1431 */ 1432 while (p->on_cpu) 1433 cpu_relax(); 1434 /* 1435 * Pairs with the smp_wmb() in finish_lock_switch(). 1436 */ 1437 smp_rmb(); 1438 1439 p->sched_contributes_to_load = !!task_contributes_to_load(p); 1440 p->state = TASK_WAKING; 1441 1442 if (p->sched_class->task_waking) 1443 p->sched_class->task_waking(p); 1444 1445 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); 1446 if (task_cpu(p) != cpu) { 1447 wake_flags |= WF_MIGRATED; 1448 set_task_cpu(p, cpu); 1449 } 1450#endif /* CONFIG_SMP */ 1451 1452 ttwu_queue(p, cpu); 1453stat: 1454 ttwu_stat(p, cpu, wake_flags); 1455out: 1456 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 1457 1458 return success; 1459} 1460 1461/** 1462 * try_to_wake_up_local - try to wake up a local task with rq lock held 1463 * @p: the thread to be awakened 1464 * 1465 * Put @p on the run-queue if it's not already there. The caller must 1466 * ensure that this_rq() is locked, @p is bound to this_rq() and not 1467 * the current task. 1468 */ 1469static void try_to_wake_up_local(struct task_struct *p) 1470{ 1471 struct rq *rq = task_rq(p); 1472 1473 BUG_ON(rq != this_rq()); 1474 BUG_ON(p == current); 1475 lockdep_assert_held(&rq->lock); 1476 1477 if (!raw_spin_trylock(&p->pi_lock)) { 1478 raw_spin_unlock(&rq->lock); 1479 raw_spin_lock(&p->pi_lock); 1480 raw_spin_lock(&rq->lock); 1481 } 1482 1483 if (!(p->state & TASK_NORMAL)) 1484 goto out; 1485 1486 if (!p->on_rq) 1487 ttwu_activate(rq, p, ENQUEUE_WAKEUP); 1488 1489 ttwu_do_wakeup(rq, p, 0); 1490 ttwu_stat(p, smp_processor_id(), 0); 1491out: 1492 raw_spin_unlock(&p->pi_lock); 1493} 1494 1495/** 1496 * wake_up_process - Wake up a specific process 1497 * @p: The process to be woken up. 1498 * 1499 * Attempt to wake up the nominated process and move it to the set of runnable 1500 * processes. Returns 1 if the process was woken up, 0 if it was already 1501 * running. 1502 * 1503 * It may be assumed that this function implies a write memory barrier before 1504 * changing the task state if and only if any tasks are woken up. 1505 */ 1506int wake_up_process(struct task_struct *p) 1507{ 1508 return try_to_wake_up(p, TASK_ALL, 0); 1509} 1510EXPORT_SYMBOL(wake_up_process); 1511 1512int wake_up_state(struct task_struct *p, unsigned int state) 1513{ 1514 return try_to_wake_up(p, state, 0); 1515} 1516 1517/* 1518 * Perform scheduler related setup for a newly forked process p. 1519 * p is forked by current. 
1520 * 1521 * __sched_fork() is basic setup used by init_idle() too: 1522 */ 1523static void __sched_fork(struct task_struct *p) 1524{ 1525 p->on_rq = 0; 1526 1527 p->se.on_rq = 0; 1528 p->se.exec_start = 0; 1529 p->se.sum_exec_runtime = 0; 1530 p->se.prev_sum_exec_runtime = 0; 1531 p->se.nr_migrations = 0; 1532 p->se.vruntime = 0; 1533 INIT_LIST_HEAD(&p->se.group_node); 1534 1535#ifdef CONFIG_SCHEDSTATS 1536 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 1537#endif 1538 1539 INIT_LIST_HEAD(&p->rt.run_list); 1540 1541#ifdef CONFIG_PREEMPT_NOTIFIERS 1542 INIT_HLIST_HEAD(&p->preempt_notifiers); 1543#endif 1544 1545#ifdef CONFIG_NUMA_BALANCING 1546 if (p->mm && atomic_read(&p->mm->mm_users) == 1) { 1547 p->mm->numa_next_scan = jiffies; 1548 p->mm->numa_next_reset = jiffies; 1549 p->mm->numa_scan_seq = 0; 1550 } 1551 1552 p->node_stamp = 0ULL; 1553 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; 1554 p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0; 1555 p->numa_scan_period = sysctl_numa_balancing_scan_delay; 1556 p->numa_work.next = &p->numa_work; 1557#endif /* CONFIG_NUMA_BALANCING */ 1558} 1559 1560#ifdef CONFIG_NUMA_BALANCING 1561void set_numabalancing_state(bool enabled) 1562{ 1563 if (enabled) 1564 sched_feat_set("NUMA"); 1565 else 1566 sched_feat_set("NO_NUMA"); 1567} 1568#endif /* CONFIG_NUMA_BALANCING */ 1569 1570/* 1571 * fork()/clone()-time setup: 1572 */ 1573void sched_fork(struct task_struct *p) 1574{ 1575 unsigned long flags; 1576 int cpu = get_cpu(); 1577 1578 __sched_fork(p); 1579 /* 1580 * We mark the process as running here. This guarantees that 1581 * nobody will actually run it, and a signal or other external 1582 * event cannot wake it up and insert it on the runqueue either. 1583 */ 1584 p->state = TASK_RUNNING; 1585 1586 /* 1587 * Make sure we do not leak PI boosting priority to the child. 1588 */ 1589 p->prio = current->normal_prio; 1590 1591 /* 1592 * Revert to default priority/policy on fork if requested. 1593 */ 1594 if (unlikely(p->sched_reset_on_fork)) { 1595 if (task_has_rt_policy(p)) { 1596 p->policy = SCHED_NORMAL; 1597 p->static_prio = NICE_TO_PRIO(0); 1598 p->rt_priority = 0; 1599 } else if (PRIO_TO_NICE(p->static_prio) < 0) 1600 p->static_prio = NICE_TO_PRIO(0); 1601 1602 p->prio = p->normal_prio = __normal_prio(p); 1603 set_load_weight(p); 1604 1605 /* 1606 * We don't need the reset flag anymore after the fork. It has 1607 * fulfilled its duty: 1608 */ 1609 p->sched_reset_on_fork = 0; 1610 } 1611 1612 if (!rt_prio(p->prio)) 1613 p->sched_class = &fair_sched_class; 1614 1615 if (p->sched_class->task_fork) 1616 p->sched_class->task_fork(p); 1617 1618 /* 1619 * The child is not yet in the pid-hash so no cgroup attach races, 1620 * and the cgroup is pinned to this child due to cgroup_fork() 1621 * is ran before sched_fork(). 1622 * 1623 * Silence PROVE_RCU. 1624 */ 1625 raw_spin_lock_irqsave(&p->pi_lock, flags); 1626 set_task_cpu(p, cpu); 1627 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 1628 1629#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 1630 if (likely(sched_info_on())) 1631 memset(&p->sched_info, 0, sizeof(p->sched_info)); 1632#endif 1633#if defined(CONFIG_SMP) 1634 p->on_cpu = 0; 1635#endif 1636#ifdef CONFIG_PREEMPT_COUNT 1637 /* Want to start with kernel preemption disabled. 
*/
	task_thread_info(p)->preempt_count = 1;
#endif
#ifdef CONFIG_SMP
	plist_node_init(&p->pushable_tasks, MAX_PRIO);
#endif

	put_cpu();
}

/*
 * wake_up_new_task - wake up a newly created task for the first time.
 *
 * This function will do some initial scheduler statistics housekeeping
 * that must be done for every newly created context, then puts the task
 * on the runqueue and wakes it.
 */
void wake_up_new_task(struct task_struct *p)
{
	unsigned long flags;
	struct rq *rq;

	raw_spin_lock_irqsave(&p->pi_lock, flags);
#ifdef CONFIG_SMP
	/*
	 * Fork balancing, do it here and not earlier because:
	 *  - cpus_allowed can change in the fork path
	 *  - any previously selected cpu might disappear through hotplug
	 */
	set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
#endif

	rq = __task_rq_lock(p);
	activate_task(rq, p, 0);
	p->on_rq = 1;
	trace_sched_wakeup_new(p, true);
	check_preempt_curr(rq, p, WF_FORK);
#ifdef CONFIG_SMP
	if (p->sched_class->task_woken)
		p->sched_class->task_woken(rq, p);
#endif
	task_rq_unlock(rq, p, &flags);
}

#ifdef CONFIG_PREEMPT_NOTIFIERS

/**
 * preempt_notifier_register - tell me when current is being preempted & rescheduled
 * @notifier: notifier struct to register
 */
void preempt_notifier_register(struct preempt_notifier *notifier)
{
	hlist_add_head(&notifier->link, &current->preempt_notifiers);
}
EXPORT_SYMBOL_GPL(preempt_notifier_register);

/**
 * preempt_notifier_unregister - no longer interested in preemption notifications
 * @notifier: notifier struct to unregister
 *
 * This is safe to call from within a preemption notifier.
 */
void preempt_notifier_unregister(struct preempt_notifier *notifier)
{
	hlist_del(&notifier->link);
}
EXPORT_SYMBOL_GPL(preempt_notifier_unregister);

static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
	struct preempt_notifier *notifier;
	struct hlist_node *node;

	hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
		notifier->ops->sched_in(notifier, raw_smp_processor_id());
}

static void
fire_sched_out_preempt_notifiers(struct task_struct *curr,
				 struct task_struct *next)
{
	struct preempt_notifier *notifier;
	struct hlist_node *node;

	hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
		notifier->ops->sched_out(notifier, next);
}

#else /* !CONFIG_PREEMPT_NOTIFIERS */

static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
}

static void
fire_sched_out_preempt_notifiers(struct task_struct *curr,
				 struct task_struct *next)
{
}

#endif /* CONFIG_PREEMPT_NOTIFIERS */

/**
 * prepare_task_switch - prepare to switch tasks
 * @rq: the runqueue preparing to switch
 * @prev: the current task that is being switched out
 * @next: the task we are going to switch to.
 *
 * This is called with the rq lock held and interrupts off. It must
 * be paired with a subsequent finish_task_switch after the context
 * switch.
 *
 * prepare_task_switch sets up locking and calls architecture specific
 * hooks.
1751 */ 1752static inline void 1753prepare_task_switch(struct rq *rq, struct task_struct *prev, 1754 struct task_struct *next) 1755{ 1756 trace_sched_switch(prev, next); 1757 sched_info_switch(prev, next); 1758 perf_event_task_sched_out(prev, next); 1759 fire_sched_out_preempt_notifiers(prev, next); 1760 prepare_lock_switch(rq, next); 1761 prepare_arch_switch(next); 1762} 1763 1764/** 1765 * finish_task_switch - clean up after a task-switch 1766 * @rq: runqueue associated with task-switch 1767 * @prev: the thread we just switched away from. 1768 * 1769 * finish_task_switch must be called after the context switch, paired 1770 * with a prepare_task_switch call before the context switch. 1771 * finish_task_switch will reconcile locking set up by prepare_task_switch, 1772 * and do any other architecture-specific cleanup actions. 1773 * 1774 * Note that we may have delayed dropping an mm in context_switch(). If 1775 * so, we finish that here outside of the runqueue lock. (Doing it 1776 * with the lock held can cause deadlocks; see schedule() for 1777 * details.) 1778 */ 1779static void finish_task_switch(struct rq *rq, struct task_struct *prev) 1780 __releases(rq->lock) 1781{ 1782 struct mm_struct *mm = rq->prev_mm; 1783 long prev_state; 1784 1785 rq->prev_mm = NULL; 1786 1787 /* 1788 * A task struct has one reference for the use as "current". 1789 * If a task dies, then it sets TASK_DEAD in tsk->state and calls 1790 * schedule one last time. The schedule call will never return, and 1791 * the scheduled task must drop that reference. 1792 * The test for TASK_DEAD must occur while the runqueue locks are 1793 * still held, otherwise prev could be scheduled on another cpu, die 1794 * there before we look at prev->state, and then the reference would 1795 * be dropped twice. 1796 * Manfred Spraul <manfred@colorfullife.com> 1797 */ 1798 prev_state = prev->state; 1799 vtime_task_switch(prev); 1800 finish_arch_switch(prev); 1801 perf_event_task_sched_in(prev, current); 1802 finish_lock_switch(rq, prev); 1803 finish_arch_post_lock_switch(); 1804 1805 fire_sched_in_preempt_notifiers(current); 1806 if (mm) 1807 mmdrop(mm); 1808 if (unlikely(prev_state == TASK_DEAD)) { 1809 /* 1810 * Remove function-return probe instances associated with this 1811 * task and put them back on the free list. 1812 */ 1813 kprobe_flush_task(prev); 1814 put_task_struct(prev); 1815 } 1816} 1817 1818#ifdef CONFIG_SMP 1819 1820/* assumes rq->lock is held */ 1821static inline void pre_schedule(struct rq *rq, struct task_struct *prev) 1822{ 1823 if (prev->sched_class->pre_schedule) 1824 prev->sched_class->pre_schedule(rq, prev); 1825} 1826 1827/* rq->lock is NOT held, but preemption is disabled */ 1828static inline void post_schedule(struct rq *rq) 1829{ 1830 if (rq->post_schedule) { 1831 unsigned long flags; 1832 1833 raw_spin_lock_irqsave(&rq->lock, flags); 1834 if (rq->curr->sched_class->post_schedule) 1835 rq->curr->sched_class->post_schedule(rq); 1836 raw_spin_unlock_irqrestore(&rq->lock, flags); 1837 1838 rq->post_schedule = 0; 1839 } 1840} 1841 1842#else 1843 1844static inline void pre_schedule(struct rq *rq, struct task_struct *p) 1845{ 1846} 1847 1848static inline void post_schedule(struct rq *rq) 1849{ 1850} 1851 1852#endif 1853 1854/** 1855 * schedule_tail - first thing a freshly forked thread must call. 1856 * @prev: the thread we just switched away from. 
1857 */ 1858asmlinkage void schedule_tail(struct task_struct *prev) 1859 __releases(rq->lock) 1860{ 1861 struct rq *rq = this_rq(); 1862 1863 finish_task_switch(rq, prev); 1864 1865 /* 1866 * FIXME: do we need to worry about rq being invalidated by the 1867 * task_switch? 1868 */ 1869 post_schedule(rq); 1870 1871#ifdef __ARCH_WANT_UNLOCKED_CTXSW 1872 /* In this case, finish_task_switch does not reenable preemption */ 1873 preempt_enable(); 1874#endif 1875 if (current->set_child_tid) 1876 put_user(task_pid_vnr(current), current->set_child_tid); 1877} 1878 1879/* 1880 * context_switch - switch to the new MM and the new 1881 * thread's register state. 1882 */ 1883static inline void 1884context_switch(struct rq *rq, struct task_struct *prev, 1885 struct task_struct *next) 1886{ 1887 struct mm_struct *mm, *oldmm; 1888 1889 prepare_task_switch(rq, prev, next); 1890 1891 mm = next->mm; 1892 oldmm = prev->active_mm; 1893 /* 1894 * For paravirt, this is coupled with an exit in switch_to to 1895 * combine the page table reload and the switch backend into 1896 * one hypercall. 1897 */ 1898 arch_start_context_switch(prev); 1899 1900 if (!mm) { 1901 next->active_mm = oldmm; 1902 atomic_inc(&oldmm->mm_count); 1903 enter_lazy_tlb(oldmm, next); 1904 } else 1905 switch_mm(oldmm, mm, next); 1906 1907 if (!prev->mm) { 1908 prev->active_mm = NULL; 1909 rq->prev_mm = oldmm; 1910 } 1911 /* 1912 * Since the runqueue lock will be released by the next 1913 * task (which is an invalid locking op but in the case 1914 * of the scheduler it's an obvious special-case), so we 1915 * do an early lockdep release here: 1916 */ 1917#ifndef __ARCH_WANT_UNLOCKED_CTXSW 1918 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 1919#endif 1920 1921 /* Here we just switch the register state and the stack. */ 1922 rcu_switch(prev, next); 1923 switch_to(prev, next, prev); 1924 1925 barrier(); 1926 /* 1927 * this_rq must be evaluated again because prev may have moved 1928 * CPUs since it called schedule(), thus the 'rq' on its stack 1929 * frame will be invalid. 1930 */ 1931 finish_task_switch(this_rq(), prev); 1932} 1933 1934/* 1935 * nr_running, nr_uninterruptible and nr_context_switches: 1936 * 1937 * externally visible scheduler statistics: current number of runnable 1938 * threads, current number of uninterruptible-sleeping threads, total 1939 * number of context switches performed since bootup. 1940 */ 1941unsigned long nr_running(void) 1942{ 1943 unsigned long i, sum = 0; 1944 1945 for_each_online_cpu(i) 1946 sum += cpu_rq(i)->nr_running; 1947 1948 return sum; 1949} 1950 1951unsigned long nr_uninterruptible(void) 1952{ 1953 unsigned long i, sum = 0; 1954 1955 for_each_possible_cpu(i) 1956 sum += cpu_rq(i)->nr_uninterruptible; 1957 1958 /* 1959 * Since we read the counters lockless, it might be slightly 1960 * inaccurate. 
Do not allow it to go below zero though:
 */
	if (unlikely((long)sum < 0))
		sum = 0;

	return sum;
}

unsigned long long nr_context_switches(void)
{
	int i;
	unsigned long long sum = 0;

	for_each_possible_cpu(i)
		sum += cpu_rq(i)->nr_switches;

	return sum;
}

unsigned long nr_iowait(void)
{
	unsigned long i, sum = 0;

	for_each_possible_cpu(i)
		sum += atomic_read(&cpu_rq(i)->nr_iowait);

	return sum;
}

unsigned long nr_iowait_cpu(int cpu)
{
	struct rq *this = cpu_rq(cpu);
	return atomic_read(&this->nr_iowait);
}

unsigned long this_cpu_load(void)
{
	struct rq *this = this_rq();
	return this->cpu_load[0];
}


/*
 * Global load-average calculations
 *
 * We take a distributed and async approach to calculating the global load-avg
 * in order to minimize overhead.
 *
 * The global load average is an exponentially decaying average of nr_running +
 * nr_uninterruptible.
 *
 * Once every LOAD_FREQ:
 *
 *   nr_active = 0;
 *   for_each_possible_cpu(cpu)
 *	nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
 *
 *   avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
 *
 * Due to a number of reasons the above turns into the mess below:
 *
 *  - for_each_possible_cpu() is prohibitively expensive on machines with
 *    a serious number of cpus, therefore we need to take a distributed approach
 *    to calculating nr_active.
 *
 *        \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
 *                      = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
 *
 *    So assuming nr_active := 0 when we start out -- true per definition, we
 *    can simply take per-cpu deltas and fold those into a global accumulate
 *    to obtain the same result. See calc_load_fold_active().
 *
 *    Furthermore, in order to avoid synchronizing all per-cpu delta folding
 *    across the machine, we assume 10 ticks is sufficient time for every
 *    cpu to have completed this task.
 *
 *    This places an upper-bound on the IRQ-off latency of the machine. Then
 *    again, being late doesn't lose the delta, just wrecks the sample.
 *
 *  - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
 *    this would add another cross-cpu cacheline miss and atomic operation
 *    to the wakeup path. Instead we increment on whatever cpu the task ran
 *    when it went into uninterruptible state and decrement on whatever cpu
 *    did the wakeup. This means that only the sum of nr_uninterruptible over
 *    all cpus yields the correct result.
 *
 * This covers the NO_HZ=n code, for extra head-aches, see the comment below.
 */

/* Variables and functions for calc_load */
static atomic_long_t calc_load_tasks;
static unsigned long calc_load_update;
unsigned long avenrun[3];
EXPORT_SYMBOL(avenrun); /* should be removed */

/**
 * get_avenrun - get the load average array
 * @loads: pointer to dest load array
 * @offset: offset to add
 * @shift: shift count to shift the result left
 *
 * These values are estimates at best, so no need for locking.
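 *
 * As a rough usage sketch, the /proc/loadavg code reads the array with a
 * FIXED_1/200 rounding offset and no shift, then prints it with the
 * LOAD_INT()/LOAD_FRAC() fixed-point helpers:
 *
 *	unsigned long loads[3];
 *
 *	get_avenrun(loads, FIXED_1/200, 0);
 *	seq_printf(m, "%lu.%02lu", LOAD_INT(loads[0]), LOAD_FRAC(loads[0]));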
2062 */ 2063void get_avenrun(unsigned long *loads, unsigned long offset, int shift) 2064{ 2065 loads[0] = (avenrun[0] + offset) << shift; 2066 loads[1] = (avenrun[1] + offset) << shift; 2067 loads[2] = (avenrun[2] + offset) << shift; 2068} 2069 2070static long calc_load_fold_active(struct rq *this_rq) 2071{ 2072 long nr_active, delta = 0; 2073 2074 nr_active = this_rq->nr_running; 2075 nr_active += (long) this_rq->nr_uninterruptible; 2076 2077 if (nr_active != this_rq->calc_load_active) { 2078 delta = nr_active - this_rq->calc_load_active; 2079 this_rq->calc_load_active = nr_active; 2080 } 2081 2082 return delta; 2083} 2084 2085/* 2086 * a1 = a0 * e + a * (1 - e) 2087 */ 2088static unsigned long 2089calc_load(unsigned long load, unsigned long exp, unsigned long active) 2090{ 2091 load *= exp; 2092 load += active * (FIXED_1 - exp); 2093 load += 1UL << (FSHIFT - 1); 2094 return load >> FSHIFT; 2095} 2096 2097#ifdef CONFIG_NO_HZ 2098/* 2099 * Handle NO_HZ for the global load-average. 2100 * 2101 * Since the above described distributed algorithm to compute the global 2102 * load-average relies on per-cpu sampling from the tick, it is affected by 2103 * NO_HZ. 2104 * 2105 * The basic idea is to fold the nr_active delta into a global idle-delta upon 2106 * entering NO_HZ state such that we can include this as an 'extra' cpu delta 2107 * when we read the global state. 2108 * 2109 * Obviously reality has to ruin such a delightfully simple scheme: 2110 * 2111 * - When we go NO_HZ idle during the window, we can negate our sample 2112 * contribution, causing under-accounting. 2113 * 2114 * We avoid this by keeping two idle-delta counters and flipping them 2115 * when the window starts, thus separating old and new NO_HZ load. 2116 * 2117 * The only trick is the slight shift in index flip for read vs write. 2118 * 2119 * 0s 5s 10s 15s 2120 * +10 +10 +10 +10 2121 * |-|-----------|-|-----------|-|-----------|-| 2122 * r:0 0 1 1 0 0 1 1 0 2123 * w:0 1 1 0 0 1 1 0 0 2124 * 2125 * This ensures we'll fold the old idle contribution in this window while 2126 * accumlating the new one. 2127 * 2128 * - When we wake up from NO_HZ idle during the window, we push up our 2129 * contribution, since we effectively move our sample point to a known 2130 * busy state. 2131 * 2132 * This is solved by pushing the window forward, and thus skipping the 2133 * sample, for this cpu (effectively using the idle-delta for this cpu which 2134 * was in effect at the time the window opened). This also solves the issue 2135 * of having to deal with a cpu having been in NOHZ idle for multiple 2136 * LOAD_FREQ intervals. 2137 * 2138 * When making the ILB scale, we should try to pull this in as well. 2139 */ 2140static atomic_long_t calc_load_idle[2]; 2141static int calc_load_idx; 2142 2143static inline int calc_load_write_idx(void) 2144{ 2145 int idx = calc_load_idx; 2146 2147 /* 2148 * See calc_global_nohz(), if we observe the new index, we also 2149 * need to observe the new update time. 2150 */ 2151 smp_rmb(); 2152 2153 /* 2154 * If the folding window started, make sure we start writing in the 2155 * next idle-delta. 2156 */ 2157 if (!time_before(jiffies, calc_load_update)) 2158 idx++; 2159 2160 return idx & 1; 2161} 2162 2163static inline int calc_load_read_idx(void) 2164{ 2165 return calc_load_idx & 1; 2166} 2167 2168void calc_load_enter_idle(void) 2169{ 2170 struct rq *this_rq = this_rq(); 2171 long delta; 2172 2173 /* 2174 * We're going into NOHZ mode, if there's any pending delta, fold it 2175 * into the pending idle delta. 
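	 * (calc_load_write_idx() picks the idle-delta bucket belonging to the
	 * window this delta was accumulated in, so a CPU going idle right
	 * around a fold boundary does not contaminate the next window's
	 * sample.)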
2176 */ 2177 delta = calc_load_fold_active(this_rq); 2178 if (delta) { 2179 int idx = calc_load_write_idx(); 2180 atomic_long_add(delta, &calc_load_idle[idx]); 2181 } 2182} 2183 2184void calc_load_exit_idle(void) 2185{ 2186 struct rq *this_rq = this_rq(); 2187 2188 /* 2189 * If we're still before the sample window, we're done. 2190 */ 2191 if (time_before(jiffies, this_rq->calc_load_update)) 2192 return; 2193 2194 /* 2195 * We woke inside or after the sample window, this means we're already 2196 * accounted through the nohz accounting, so skip the entire deal and 2197 * sync up for the next window. 2198 */ 2199 this_rq->calc_load_update = calc_load_update; 2200 if (time_before(jiffies, this_rq->calc_load_update + 10)) 2201 this_rq->calc_load_update += LOAD_FREQ; 2202} 2203 2204static long calc_load_fold_idle(void) 2205{ 2206 int idx = calc_load_read_idx(); 2207 long delta = 0; 2208 2209 if (atomic_long_read(&calc_load_idle[idx])) 2210 delta = atomic_long_xchg(&calc_load_idle[idx], 0); 2211 2212 return delta; 2213} 2214 2215/** 2216 * fixed_power_int - compute: x^n, in O(log n) time 2217 * 2218 * @x: base of the power 2219 * @frac_bits: fractional bits of @x 2220 * @n: power to raise @x to. 2221 * 2222 * By exploiting the relation between the definition of the natural power 2223 * function: x^n := x*x*...*x (x multiplied by itself for n times), and 2224 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, 2225 * (where: n_i \elem {0, 1}, the binary vector representing n), 2226 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is 2227 * of course trivially computable in O(log_2 n), the length of our binary 2228 * vector. 2229 */ 2230static unsigned long 2231fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) 2232{ 2233 unsigned long result = 1UL << frac_bits; 2234 2235 if (n) for (;;) { 2236 if (n & 1) { 2237 result *= x; 2238 result += 1UL << (frac_bits - 1); 2239 result >>= frac_bits; 2240 } 2241 n >>= 1; 2242 if (!n) 2243 break; 2244 x *= x; 2245 x += 1UL << (frac_bits - 1); 2246 x >>= frac_bits; 2247 } 2248 2249 return result; 2250} 2251 2252/* 2253 * a1 = a0 * e + a * (1 - e) 2254 * 2255 * a2 = a1 * e + a * (1 - e) 2256 * = (a0 * e + a * (1 - e)) * e + a * (1 - e) 2257 * = a0 * e^2 + a * (1 - e) * (1 + e) 2258 * 2259 * a3 = a2 * e + a * (1 - e) 2260 * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) 2261 * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) 2262 * 2263 * ... 2264 * 2265 * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] 2266 * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) 2267 * = a0 * e^n + a * (1 - e^n) 2268 * 2269 * [1] application of the geometric series: 2270 * 2271 * n 1 - x^(n+1) 2272 * S_n := \Sum x^i = ------------- 2273 * i=0 1 - x 2274 */ 2275static unsigned long 2276calc_load_n(unsigned long load, unsigned long exp, 2277 unsigned long active, unsigned int n) 2278{ 2279 2280 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); 2281} 2282 2283/* 2284 * NO_HZ can leave us missing all per-cpu ticks calling 2285 * calc_load_account_active(), but since an idle CPU folds its delta into 2286 * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold 2287 * in the pending idle delta if our idle period crossed a load cycle boundary. 2288 * 2289 * Once we've updated the global active value, we need to apply the exponential 2290 * weights adjusted to the number of cycles missed. 
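 *
 * For example, if the machine stayed idle across three LOAD_FREQ windows,
 * n == 3 and the decay for all three windows is applied in one step by
 * raising the exponent with fixed_power_int().  The binary-exponentiation
 * trick there means that, say, n == 8 costs three fixed-point squarings plus
 * one multiply instead of eight multiplies.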
2291 */ 2292static void calc_global_nohz(void) 2293{ 2294 long delta, active, n; 2295 2296 if (!time_before(jiffies, calc_load_update + 10)) { 2297 /* 2298 * Catch-up, fold however many we are behind still 2299 */ 2300 delta = jiffies - calc_load_update - 10; 2301 n = 1 + (delta / LOAD_FREQ); 2302 2303 active = atomic_long_read(&calc_load_tasks); 2304 active = active > 0 ? active * FIXED_1 : 0; 2305 2306 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); 2307 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); 2308 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); 2309 2310 calc_load_update += n * LOAD_FREQ; 2311 } 2312 2313 /* 2314 * Flip the idle index... 2315 * 2316 * Make sure we first write the new time then flip the index, so that 2317 * calc_load_write_idx() will see the new time when it reads the new 2318 * index, this avoids a double flip messing things up. 2319 */ 2320 smp_wmb(); 2321 calc_load_idx++; 2322} 2323#else /* !CONFIG_NO_HZ */ 2324 2325static inline long calc_load_fold_idle(void) { return 0; } 2326static inline void calc_global_nohz(void) { } 2327 2328#endif /* CONFIG_NO_HZ */ 2329 2330/* 2331 * calc_load - update the avenrun load estimates 10 ticks after the 2332 * CPUs have updated calc_load_tasks. 2333 */ 2334void calc_global_load(unsigned long ticks) 2335{ 2336 long active, delta; 2337 2338 if (time_before(jiffies, calc_load_update + 10)) 2339 return; 2340 2341 /* 2342 * Fold the 'old' idle-delta to include all NO_HZ cpus. 2343 */ 2344 delta = calc_load_fold_idle(); 2345 if (delta) 2346 atomic_long_add(delta, &calc_load_tasks); 2347 2348 active = atomic_long_read(&calc_load_tasks); 2349 active = active > 0 ? active * FIXED_1 : 0; 2350 2351 avenrun[0] = calc_load(avenrun[0], EXP_1, active); 2352 avenrun[1] = calc_load(avenrun[1], EXP_5, active); 2353 avenrun[2] = calc_load(avenrun[2], EXP_15, active); 2354 2355 calc_load_update += LOAD_FREQ; 2356 2357 /* 2358 * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. 2359 */ 2360 calc_global_nohz(); 2361} 2362 2363/* 2364 * Called from update_cpu_load() to periodically update this CPU's 2365 * active count. 2366 */ 2367static void calc_load_account_active(struct rq *this_rq) 2368{ 2369 long delta; 2370 2371 if (time_before(jiffies, this_rq->calc_load_update)) 2372 return; 2373 2374 delta = calc_load_fold_active(this_rq); 2375 if (delta) 2376 atomic_long_add(delta, &calc_load_tasks); 2377 2378 this_rq->calc_load_update += LOAD_FREQ; 2379} 2380 2381/* 2382 * End of global load-average stuff 2383 */ 2384 2385/* 2386 * The exact cpuload at various idx values, calculated at every tick would be 2387 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load 2388 * 2389 * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called 2390 * on nth tick when cpu may be busy, then we have: 2391 * load = ((2^idx - 1) / 2^idx)^(n-1) * load 2392 * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load 2393 * 2394 * decay_load_missed() below does efficient calculation of 2395 * load = ((2^idx - 1) / 2^idx)^(n-1) * load 2396 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load 2397 * 2398 * The calculation is approximated on a 128 point scale. 2399 * degrade_zero_ticks is the number of ticks after which load at any 2400 * particular idx is approximated to be zero. 2401 * degrade_factor is a precomputed table, a row for each load idx. 2402 * Each column corresponds to degradation factor for a power of two ticks, 2403 * based on 128 point scale. 
2404 * Example: 2405 * row 2, col 3 (=12) says that the degradation at load idx 2 after 2406 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). 2407 * 2408 * With this power of 2 load factors, we can degrade the load n times 2409 * by looking at 1 bits in n and doing as many mult/shift instead of 2410 * n mult/shifts needed by the exact degradation. 2411 */ 2412#define DEGRADE_SHIFT 7 2413static const unsigned char 2414 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; 2415static const unsigned char 2416 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { 2417 {0, 0, 0, 0, 0, 0, 0, 0}, 2418 {64, 32, 8, 0, 0, 0, 0, 0}, 2419 {96, 72, 40, 12, 1, 0, 0}, 2420 {112, 98, 75, 43, 15, 1, 0}, 2421 {120, 112, 98, 76, 45, 16, 2} }; 2422 2423/* 2424 * Update cpu_load for any missed ticks, due to tickless idle. The backlog 2425 * would be when CPU is idle and so we just decay the old load without 2426 * adding any new load. 2427 */ 2428static unsigned long 2429decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) 2430{ 2431 int j = 0; 2432 2433 if (!missed_updates) 2434 return load; 2435 2436 if (missed_updates >= degrade_zero_ticks[idx]) 2437 return 0; 2438 2439 if (idx == 1) 2440 return load >> missed_updates; 2441 2442 while (missed_updates) { 2443 if (missed_updates % 2) 2444 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; 2445 2446 missed_updates >>= 1; 2447 j++; 2448 } 2449 return load; 2450} 2451 2452/* 2453 * Update rq->cpu_load[] statistics. This function is usually called every 2454 * scheduler tick (TICK_NSEC). With tickless idle this will not be called 2455 * every tick. We fix it up based on jiffies. 2456 */ 2457static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, 2458 unsigned long pending_updates) 2459{ 2460 int i, scale; 2461 2462 this_rq->nr_load_updates++; 2463 2464 /* Update our load: */ 2465 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ 2466 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { 2467 unsigned long old_load, new_load; 2468 2469 /* scale is effectively 1 << i now, and >> i divides by scale */ 2470 2471 old_load = this_rq->cpu_load[i]; 2472 old_load = decay_load_missed(old_load, pending_updates - 1, i); 2473 new_load = this_load; 2474 /* 2475 * Round up the averaging division if load is increasing. This 2476 * prevents us from getting stuck on 9 if the load is 10, for 2477 * example. 2478 */ 2479 if (new_load > old_load) 2480 new_load += scale - 1; 2481 2482 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; 2483 } 2484 2485 sched_avg_update(this_rq); 2486} 2487 2488#ifdef CONFIG_NO_HZ 2489/* 2490 * There is no sane way to deal with nohz on smp when using jiffies because the 2491 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading 2492 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. 2493 * 2494 * Therefore we cannot use the delta approach from the regular tick since that 2495 * would seriously skew the load calculation. However we'll make do for those 2496 * updates happening while idle (nohz_idle_balance) or coming out of idle 2497 * (tick_nohz_idle_exit). 2498 * 2499 * This means we might still be one tick off for nohz periods. 2500 */ 2501 2502/* 2503 * Called from nohz_idle_balance() to update the load ratings before doing the 2504 * idle balance. 
2505 */ 2506void update_idle_cpu_load(struct rq *this_rq) 2507{ 2508 unsigned long curr_jiffies = ACCESS_ONCE(jiffies); 2509 unsigned long load = this_rq->load.weight; 2510 unsigned long pending_updates; 2511 2512 /* 2513 * bail if there's load or we're actually up-to-date. 2514 */ 2515 if (load || curr_jiffies == this_rq->last_load_update_tick) 2516 return; 2517 2518 pending_updates = curr_jiffies - this_rq->last_load_update_tick; 2519 this_rq->last_load_update_tick = curr_jiffies; 2520 2521 __update_cpu_load(this_rq, load, pending_updates); 2522} 2523 2524/* 2525 * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. 2526 */ 2527void update_cpu_load_nohz(void) 2528{ 2529 struct rq *this_rq = this_rq(); 2530 unsigned long curr_jiffies = ACCESS_ONCE(jiffies); 2531 unsigned long pending_updates; 2532 2533 if (curr_jiffies == this_rq->last_load_update_tick) 2534 return; 2535 2536 raw_spin_lock(&this_rq->lock); 2537 pending_updates = curr_jiffies - this_rq->last_load_update_tick; 2538 if (pending_updates) { 2539 this_rq->last_load_update_tick = curr_jiffies; 2540 /* 2541 * We were idle, this means load 0, the current load might be 2542 * !0 due to remote wakeups and the sort. 2543 */ 2544 __update_cpu_load(this_rq, 0, pending_updates); 2545 } 2546 raw_spin_unlock(&this_rq->lock); 2547} 2548#endif /* CONFIG_NO_HZ */ 2549 2550/* 2551 * Called from scheduler_tick() 2552 */ 2553static void update_cpu_load_active(struct rq *this_rq) 2554{ 2555 /* 2556 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). 2557 */ 2558 this_rq->last_load_update_tick = jiffies; 2559 __update_cpu_load(this_rq, this_rq->load.weight, 1); 2560 2561 calc_load_account_active(this_rq); 2562} 2563 2564#ifdef CONFIG_SMP 2565 2566/* 2567 * sched_exec - execve() is a valuable balancing opportunity, because at 2568 * this point the task has the smallest effective memory and cache footprint. 2569 */ 2570void sched_exec(void) 2571{ 2572 struct task_struct *p = current; 2573 unsigned long flags; 2574 int dest_cpu; 2575 2576 raw_spin_lock_irqsave(&p->pi_lock, flags); 2577 dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); 2578 if (dest_cpu == smp_processor_id()) 2579 goto unlock; 2580 2581 if (likely(cpu_active(dest_cpu))) { 2582 struct migration_arg arg = { p, dest_cpu }; 2583 2584 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2585 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); 2586 return; 2587 } 2588unlock: 2589 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2590} 2591 2592#endif 2593 2594DEFINE_PER_CPU(struct kernel_stat, kstat); 2595DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); 2596 2597EXPORT_PER_CPU_SYMBOL(kstat); 2598EXPORT_PER_CPU_SYMBOL(kernel_cpustat); 2599 2600/* 2601 * Return any ns on the sched_clock that have not yet been accounted in 2602 * @p in case that task is currently running. 2603 * 2604 * Called with task_rq_lock() held on @rq. 2605 */ 2606static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) 2607{ 2608 u64 ns = 0; 2609 2610 if (task_current(rq, p)) { 2611 update_rq_clock(rq); 2612 ns = rq->clock_task - p->se.exec_start; 2613 if ((s64)ns < 0) 2614 ns = 0; 2615 } 2616 2617 return ns; 2618} 2619 2620unsigned long long task_delta_exec(struct task_struct *p) 2621{ 2622 unsigned long flags; 2623 struct rq *rq; 2624 u64 ns = 0; 2625 2626 rq = task_rq_lock(p, &flags); 2627 ns = do_task_delta_exec(p, rq); 2628 task_rq_unlock(rq, p, &flags); 2629 2630 return ns; 2631} 2632 2633/* 2634 * Return accounted runtime for the task. 
2635 * In case the task is currently running, return the runtime plus current's 2636 * pending runtime that have not been accounted yet. 2637 */ 2638unsigned long long task_sched_runtime(struct task_struct *p) 2639{ 2640 unsigned long flags; 2641 struct rq *rq; 2642 u64 ns = 0; 2643 2644 rq = task_rq_lock(p, &flags); 2645 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); 2646 task_rq_unlock(rq, p, &flags); 2647 2648 return ns; 2649} 2650 2651/* 2652 * This function gets called by the timer code, with HZ frequency. 2653 * We call it with interrupts disabled. 2654 */ 2655void scheduler_tick(void) 2656{ 2657 int cpu = smp_processor_id(); 2658 struct rq *rq = cpu_rq(cpu); 2659 struct task_struct *curr = rq->curr; 2660 2661 sched_clock_tick(); 2662 2663 raw_spin_lock(&rq->lock); 2664 update_rq_clock(rq); 2665 update_cpu_load_active(rq); 2666 curr->sched_class->task_tick(rq, curr, 0); 2667 raw_spin_unlock(&rq->lock); 2668 2669 perf_event_task_tick(); 2670 2671#ifdef CONFIG_SMP 2672 rq->idle_balance = idle_cpu(cpu); 2673 trigger_load_balance(rq, cpu); 2674#endif 2675} 2676 2677notrace unsigned long get_parent_ip(unsigned long addr) 2678{ 2679 if (in_lock_functions(addr)) { 2680 addr = CALLER_ADDR2; 2681 if (in_lock_functions(addr)) 2682 addr = CALLER_ADDR3; 2683 } 2684 return addr; 2685} 2686 2687#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ 2688 defined(CONFIG_PREEMPT_TRACER)) 2689 2690void __kprobes add_preempt_count(int val) 2691{ 2692#ifdef CONFIG_DEBUG_PREEMPT 2693 /* 2694 * Underflow? 2695 */ 2696 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) 2697 return; 2698#endif 2699 preempt_count() += val; 2700#ifdef CONFIG_DEBUG_PREEMPT 2701 /* 2702 * Spinlock count overflowing soon? 2703 */ 2704 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= 2705 PREEMPT_MASK - 10); 2706#endif 2707 if (preempt_count() == val) 2708 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 2709} 2710EXPORT_SYMBOL(add_preempt_count); 2711 2712void __kprobes sub_preempt_count(int val) 2713{ 2714#ifdef CONFIG_DEBUG_PREEMPT 2715 /* 2716 * Underflow? 2717 */ 2718 if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) 2719 return; 2720 /* 2721 * Is the spinlock portion underflowing? 2722 */ 2723 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && 2724 !(preempt_count() & PREEMPT_MASK))) 2725 return; 2726#endif 2727 2728 if (preempt_count() == val) 2729 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 2730 preempt_count() -= val; 2731} 2732EXPORT_SYMBOL(sub_preempt_count); 2733 2734#endif 2735 2736/* 2737 * Print scheduling while atomic bug: 2738 */ 2739static noinline void __schedule_bug(struct task_struct *prev) 2740{ 2741 if (oops_in_progress) 2742 return; 2743 2744 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", 2745 prev->comm, prev->pid, preempt_count()); 2746 2747 debug_show_held_locks(prev); 2748 print_modules(); 2749 if (irqs_disabled()) 2750 print_irqtrace_events(prev); 2751 dump_stack(); 2752 add_taint(TAINT_WARN); 2753} 2754 2755/* 2756 * Various schedule()-time debugging checks and statistics: 2757 */ 2758static inline void schedule_debug(struct task_struct *prev) 2759{ 2760 /* 2761 * Test if we are atomic. Since do_exit() needs to call into 2762 * schedule() atomically, we ignore that path for now. 2763 * Otherwise, whine if we are scheduling when we should not be. 
2764 */ 2765 if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) 2766 __schedule_bug(prev); 2767 rcu_sleep_check(); 2768 2769 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 2770 2771 schedstat_inc(this_rq(), sched_count); 2772} 2773 2774static void put_prev_task(struct rq *rq, struct task_struct *prev) 2775{ 2776 if (prev->on_rq || rq->skip_clock_update < 0) 2777 update_rq_clock(rq); 2778 prev->sched_class->put_prev_task(rq, prev); 2779} 2780 2781/* 2782 * Pick up the highest-prio task: 2783 */ 2784static inline struct task_struct * 2785pick_next_task(struct rq *rq) 2786{ 2787 const struct sched_class *class; 2788 struct task_struct *p; 2789 2790 /* 2791 * Optimization: we know that if all tasks are in 2792 * the fair class we can call that function directly: 2793 */ 2794 if (likely(rq->nr_running == rq->cfs.h_nr_running)) { 2795 p = fair_sched_class.pick_next_task(rq); 2796 if (likely(p)) 2797 return p; 2798 } 2799 2800 for_each_class(class) { 2801 p = class->pick_next_task(rq); 2802 if (p) 2803 return p; 2804 } 2805 2806 BUG(); /* the idle class will always have a runnable task */ 2807} 2808 2809/* 2810 * __schedule() is the main scheduler function. 2811 * 2812 * The main means of driving the scheduler and thus entering this function are: 2813 * 2814 * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. 2815 * 2816 * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return 2817 * paths. For example, see arch/x86/entry_64.S. 2818 * 2819 * To drive preemption between tasks, the scheduler sets the flag in timer 2820 * interrupt handler scheduler_tick(). 2821 * 2822 * 3. Wakeups don't really cause entry into schedule(). They add a 2823 * task to the run-queue and that's it. 2824 * 2825 * Now, if the new task added to the run-queue preempts the current 2826 * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets 2827 * called on the nearest possible occasion: 2828 * 2829 * - If the kernel is preemptible (CONFIG_PREEMPT=y): 2830 * 2831 * - in syscall or exception context, at the next outmost 2832 * preempt_enable(). (this might be as soon as the wake_up()'s 2833 * spin_unlock()!) 2834 * 2835 * - in IRQ context, return from interrupt-handler to 2836 * preemptible context 2837 * 2838 * - If the kernel is not preemptible (CONFIG_PREEMPT is not set) 2839 * then at the next: 2840 * 2841 * - cond_resched() call 2842 * - explicit schedule() call 2843 * - return from syscall or exception to user-space 2844 * - return from interrupt-handler to user-space 2845 */ 2846static void __sched __schedule(void) 2847{ 2848 struct task_struct *prev, *next; 2849 unsigned long *switch_count; 2850 struct rq *rq; 2851 int cpu; 2852 2853need_resched: 2854 preempt_disable(); 2855 cpu = smp_processor_id(); 2856 rq = cpu_rq(cpu); 2857 rcu_note_context_switch(cpu); 2858 prev = rq->curr; 2859 2860 schedule_debug(prev); 2861 2862 if (sched_feat(HRTICK)) 2863 hrtick_clear(rq); 2864 2865 raw_spin_lock_irq(&rq->lock); 2866 2867 switch_count = &prev->nivcsw; 2868 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 2869 if (unlikely(signal_pending_state(prev->state, prev))) { 2870 prev->state = TASK_RUNNING; 2871 } else { 2872 deactivate_task(rq, prev, DEQUEUE_SLEEP); 2873 prev->on_rq = 0; 2874 2875 /* 2876 * If a worker went to sleep, notify and ask workqueue 2877 * whether it wants to wake up a task to maintain 2878 * concurrency. 
2879 */ 2880 if (prev->flags & PF_WQ_WORKER) { 2881 struct task_struct *to_wakeup; 2882 2883 to_wakeup = wq_worker_sleeping(prev, cpu); 2884 if (to_wakeup) 2885 try_to_wake_up_local(to_wakeup); 2886 } 2887 } 2888 switch_count = &prev->nvcsw; 2889 } 2890 2891 pre_schedule(rq, prev); 2892 2893 if (unlikely(!rq->nr_running)) 2894 idle_balance(cpu, rq); 2895 2896 put_prev_task(rq, prev); 2897 next = pick_next_task(rq); 2898 clear_tsk_need_resched(prev); 2899 rq->skip_clock_update = 0; 2900 2901 if (likely(prev != next)) { 2902 rq->nr_switches++; 2903 rq->curr = next; 2904 ++*switch_count; 2905 2906 context_switch(rq, prev, next); /* unlocks the rq */ 2907 /* 2908 * The context switch have flipped the stack from under us 2909 * and restored the local variables which were saved when 2910 * this task called schedule() in the past. prev == current 2911 * is still correct, but it can be moved to another cpu/rq. 2912 */ 2913 cpu = smp_processor_id(); 2914 rq = cpu_rq(cpu); 2915 } else 2916 raw_spin_unlock_irq(&rq->lock); 2917 2918 post_schedule(rq); 2919 2920 sched_preempt_enable_no_resched(); 2921 if (need_resched()) 2922 goto need_resched; 2923} 2924 2925static inline void sched_submit_work(struct task_struct *tsk) 2926{ 2927 if (!tsk->state || tsk_is_pi_blocked(tsk)) 2928 return; 2929 /* 2930 * If we are going to sleep and we have plugged IO queued, 2931 * make sure to submit it to avoid deadlocks. 2932 */ 2933 if (blk_needs_flush_plug(tsk)) 2934 blk_schedule_flush_plug(tsk); 2935} 2936 2937asmlinkage void __sched schedule(void) 2938{ 2939 struct task_struct *tsk = current; 2940 2941 sched_submit_work(tsk); 2942 __schedule(); 2943} 2944EXPORT_SYMBOL(schedule); 2945 2946#ifdef CONFIG_RCU_USER_QS 2947asmlinkage void __sched schedule_user(void) 2948{ 2949 /* 2950 * If we come here after a random call to set_need_resched(), 2951 * or we have been woken up remotely but the IPI has not yet arrived, 2952 * we haven't yet exited the RCU idle mode. Do it here manually until 2953 * we find a better solution. 2954 */ 2955 rcu_user_exit(); 2956 schedule(); 2957 rcu_user_enter(); 2958} 2959#endif 2960 2961/** 2962 * schedule_preempt_disabled - called with preemption disabled 2963 * 2964 * Returns with preemption disabled. Note: preempt_count must be 1 2965 */ 2966void __sched schedule_preempt_disabled(void) 2967{ 2968 sched_preempt_enable_no_resched(); 2969 schedule(); 2970 preempt_disable(); 2971} 2972 2973#ifdef CONFIG_MUTEX_SPIN_ON_OWNER 2974 2975static inline bool owner_running(struct mutex *lock, struct task_struct *owner) 2976{ 2977 if (lock->owner != owner) 2978 return false; 2979 2980 /* 2981 * Ensure we emit the owner->on_cpu, dereference _after_ checking 2982 * lock->owner still matches owner, if that fails, owner might 2983 * point to free()d memory, if it still matches, the rcu_read_lock() 2984 * ensures the memory stays valid. 2985 */ 2986 barrier(); 2987 2988 return owner->on_cpu; 2989} 2990 2991/* 2992 * Look out! "owner" is an entirely speculative pointer 2993 * access and not reliable. 2994 */ 2995int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) 2996{ 2997 if (!sched_feat(OWNER_SPIN)) 2998 return 0; 2999 3000 rcu_read_lock(); 3001 while (owner_running(lock, owner)) { 3002 if (need_resched()) 3003 break; 3004 3005 arch_mutex_cpu_relax(); 3006 } 3007 rcu_read_unlock(); 3008 3009 /* 3010 * We break out the loop above on need_resched() and when the 3011 * owner changed, which is a sign for heavy contention. Return 3012 * success only when lock->owner is NULL. 
3013 */ 3014 return lock->owner == NULL; 3015} 3016#endif 3017 3018#ifdef CONFIG_PREEMPT 3019/* 3020 * this is the entry point to schedule() from in-kernel preemption 3021 * off of preempt_enable. Kernel preemptions off return from interrupt 3022 * occur there and call schedule directly. 3023 */ 3024asmlinkage void __sched notrace preempt_schedule(void) 3025{ 3026 struct thread_info *ti = current_thread_info(); 3027 3028 /* 3029 * If there is a non-zero preempt_count or interrupts are disabled, 3030 * we do not want to preempt the current task. Just return.. 3031 */ 3032 if (likely(ti->preempt_count || irqs_disabled())) 3033 return; 3034 3035 do { 3036 add_preempt_count_notrace(PREEMPT_ACTIVE); 3037 __schedule(); 3038 sub_preempt_count_notrace(PREEMPT_ACTIVE); 3039 3040 /* 3041 * Check again in case we missed a preemption opportunity 3042 * between schedule and now. 3043 */ 3044 barrier(); 3045 } while (need_resched()); 3046} 3047EXPORT_SYMBOL(preempt_schedule); 3048 3049/* 3050 * this is the entry point to schedule() from kernel preemption 3051 * off of irq context. 3052 * Note, that this is called and return with irqs disabled. This will 3053 * protect us against recursive calling from irq. 3054 */ 3055asmlinkage void __sched preempt_schedule_irq(void) 3056{ 3057 struct thread_info *ti = current_thread_info(); 3058 3059 /* Catch callers which need to be fixed */ 3060 BUG_ON(ti->preempt_count || !irqs_disabled()); 3061 3062 rcu_user_exit(); 3063 do { 3064 add_preempt_count(PREEMPT_ACTIVE); 3065 local_irq_enable(); 3066 __schedule(); 3067 local_irq_disable(); 3068 sub_preempt_count(PREEMPT_ACTIVE); 3069 3070 /* 3071 * Check again in case we missed a preemption opportunity 3072 * between schedule and now. 3073 */ 3074 barrier(); 3075 } while (need_resched()); 3076} 3077 3078#endif /* CONFIG_PREEMPT */ 3079 3080int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, 3081 void *key) 3082{ 3083 return try_to_wake_up(curr->private, mode, wake_flags); 3084} 3085EXPORT_SYMBOL(default_wake_function); 3086 3087/* 3088 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just 3089 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve 3090 * number) then we wake all the non-exclusive tasks and one exclusive task. 3091 * 3092 * There are circumstances in which we can try to wake a task which has already 3093 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns 3094 * zero in this (rare) case, and we handle it by continuing to scan the queue. 3095 */ 3096static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, 3097 int nr_exclusive, int wake_flags, void *key) 3098{ 3099 wait_queue_t *curr, *next; 3100 3101 list_for_each_entry_safe(curr, next, &q->task_list, task_list) { 3102 unsigned flags = curr->flags; 3103 3104 if (curr->func(curr, mode, wake_flags, key) && 3105 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) 3106 break; 3107 } 3108} 3109 3110/** 3111 * __wake_up - wake up threads blocked on a waitqueue. 3112 * @q: the waitqueue 3113 * @mode: which threads 3114 * @nr_exclusive: how many wake-one or wake-many threads to wake up 3115 * @key: is directly passed to the wakeup function 3116 * 3117 * It may be assumed that this function implies a write memory barrier before 3118 * changing the task state if and only if any tasks are woken up. 
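 *
 * For illustration only (identifiers are made up), a typical sleeper does:
 *
 *	DEFINE_WAIT(wait);
 *
 *	prepare_to_wait(&wq, &wait, TASK_INTERRUPTIBLE);
 *	if (!condition)
 *		schedule();
 *	finish_wait(&wq, &wait);
 *
 * while the waker sets 'condition' and calls wake_up(&wq), which lands here.
 * Most callers use the wait_event*() macros, which wrap this pattern and
 * re-check the condition in a loop.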
3119 */ 3120void __wake_up(wait_queue_head_t *q, unsigned int mode, 3121 int nr_exclusive, void *key) 3122{ 3123 unsigned long flags; 3124 3125 spin_lock_irqsave(&q->lock, flags); 3126 __wake_up_common(q, mode, nr_exclusive, 0, key); 3127 spin_unlock_irqrestore(&q->lock, flags); 3128} 3129EXPORT_SYMBOL(__wake_up); 3130 3131/* 3132 * Same as __wake_up but called with the spinlock in wait_queue_head_t held. 3133 */ 3134void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr) 3135{ 3136 __wake_up_common(q, mode, nr, 0, NULL); 3137} 3138EXPORT_SYMBOL_GPL(__wake_up_locked); 3139 3140void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) 3141{ 3142 __wake_up_common(q, mode, 1, 0, key); 3143} 3144EXPORT_SYMBOL_GPL(__wake_up_locked_key); 3145 3146/** 3147 * __wake_up_sync_key - wake up threads blocked on a waitqueue. 3148 * @q: the waitqueue 3149 * @mode: which threads 3150 * @nr_exclusive: how many wake-one or wake-many threads to wake up 3151 * @key: opaque value to be passed to wakeup targets 3152 * 3153 * The sync wakeup differs that the waker knows that it will schedule 3154 * away soon, so while the target thread will be woken up, it will not 3155 * be migrated to another CPU - ie. the two threads are 'synchronized' 3156 * with each other. This can prevent needless bouncing between CPUs. 3157 * 3158 * On UP it can prevent extra preemption. 3159 * 3160 * It may be assumed that this function implies a write memory barrier before 3161 * changing the task state if and only if any tasks are woken up. 3162 */ 3163void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, 3164 int nr_exclusive, void *key) 3165{ 3166 unsigned long flags; 3167 int wake_flags = WF_SYNC; 3168 3169 if (unlikely(!q)) 3170 return; 3171 3172 if (unlikely(!nr_exclusive)) 3173 wake_flags = 0; 3174 3175 spin_lock_irqsave(&q->lock, flags); 3176 __wake_up_common(q, mode, nr_exclusive, wake_flags, key); 3177 spin_unlock_irqrestore(&q->lock, flags); 3178} 3179EXPORT_SYMBOL_GPL(__wake_up_sync_key); 3180 3181/* 3182 * __wake_up_sync - see __wake_up_sync_key() 3183 */ 3184void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) 3185{ 3186 __wake_up_sync_key(q, mode, nr_exclusive, NULL); 3187} 3188EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ 3189 3190/** 3191 * complete: - signals a single thread waiting on this completion 3192 * @x: holds the state of this particular completion 3193 * 3194 * This will wake up a single thread waiting on this completion. Threads will be 3195 * awakened in the same order in which they were queued. 3196 * 3197 * See also complete_all(), wait_for_completion() and related routines. 3198 * 3199 * It may be assumed that this function implies a write memory barrier before 3200 * changing the task state if and only if any tasks are woken up. 3201 */ 3202void complete(struct completion *x) 3203{ 3204 unsigned long flags; 3205 3206 spin_lock_irqsave(&x->wait.lock, flags); 3207 x->done++; 3208 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); 3209 spin_unlock_irqrestore(&x->wait.lock, flags); 3210} 3211EXPORT_SYMBOL(complete); 3212 3213/** 3214 * complete_all: - signals all threads waiting on this completion 3215 * @x: holds the state of this particular completion 3216 * 3217 * This will wake up all threads waiting on this particular completion event. 3218 * 3219 * It may be assumed that this function implies a write memory barrier before 3220 * changing the task state if and only if any tasks are woken up. 
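 *
 * For illustration (the helper name is made up), a waiter typically does:
 *
 *	DECLARE_COMPLETION_ONSTACK(done);
 *
 *	start_async_work(&done);
 *	wait_for_completion(&done);
 *
 * and whoever finishes the work calls complete(&done) to release one waiter,
 * or complete_all(&done) to release every waiter, present and future.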
3221 */ 3222void complete_all(struct completion *x) 3223{ 3224 unsigned long flags; 3225 3226 spin_lock_irqsave(&x->wait.lock, flags); 3227 x->done += UINT_MAX/2; 3228 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); 3229 spin_unlock_irqrestore(&x->wait.lock, flags); 3230} 3231EXPORT_SYMBOL(complete_all); 3232 3233static inline long __sched 3234do_wait_for_common(struct completion *x, long timeout, int state) 3235{ 3236 if (!x->done) { 3237 DECLARE_WAITQUEUE(wait, current); 3238 3239 __add_wait_queue_tail_exclusive(&x->wait, &wait); 3240 do { 3241 if (signal_pending_state(state, current)) { 3242 timeout = -ERESTARTSYS; 3243 break; 3244 } 3245 __set_current_state(state); 3246 spin_unlock_irq(&x->wait.lock); 3247 timeout = schedule_timeout(timeout); 3248 spin_lock_irq(&x->wait.lock); 3249 } while (!x->done && timeout); 3250 __remove_wait_queue(&x->wait, &wait); 3251 if (!x->done) 3252 return timeout; 3253 } 3254 x->done--; 3255 return timeout ?: 1; 3256} 3257 3258static long __sched 3259wait_for_common(struct completion *x, long timeout, int state) 3260{ 3261 might_sleep(); 3262 3263 spin_lock_irq(&x->wait.lock); 3264 timeout = do_wait_for_common(x, timeout, state); 3265 spin_unlock_irq(&x->wait.lock); 3266 return timeout; 3267} 3268 3269/** 3270 * wait_for_completion: - waits for completion of a task 3271 * @x: holds the state of this particular completion 3272 * 3273 * This waits to be signaled for completion of a specific task. It is NOT 3274 * interruptible and there is no timeout. 3275 * 3276 * See also similar routines (i.e. wait_for_completion_timeout()) with timeout 3277 * and interrupt capability. Also see complete(). 3278 */ 3279void __sched wait_for_completion(struct completion *x) 3280{ 3281 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); 3282} 3283EXPORT_SYMBOL(wait_for_completion); 3284 3285/** 3286 * wait_for_completion_timeout: - waits for completion of a task (w/timeout) 3287 * @x: holds the state of this particular completion 3288 * @timeout: timeout value in jiffies 3289 * 3290 * This waits for either a completion of a specific task to be signaled or for a 3291 * specified timeout to expire. The timeout is in jiffies. It is not 3292 * interruptible. 3293 * 3294 * The return value is 0 if timed out, and positive (at least 1, or number of 3295 * jiffies left till timeout) if completed. 3296 */ 3297unsigned long __sched 3298wait_for_completion_timeout(struct completion *x, unsigned long timeout) 3299{ 3300 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); 3301} 3302EXPORT_SYMBOL(wait_for_completion_timeout); 3303 3304/** 3305 * wait_for_completion_interruptible: - waits for completion of a task (w/intr) 3306 * @x: holds the state of this particular completion 3307 * 3308 * This waits for completion of a specific task to be signaled. It is 3309 * interruptible. 3310 * 3311 * The return value is -ERESTARTSYS if interrupted, 0 if completed. 3312 */ 3313int __sched wait_for_completion_interruptible(struct completion *x) 3314{ 3315 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); 3316 if (t == -ERESTARTSYS) 3317 return t; 3318 return 0; 3319} 3320EXPORT_SYMBOL(wait_for_completion_interruptible); 3321 3322/** 3323 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) 3324 * @x: holds the state of this particular completion 3325 * @timeout: timeout value in jiffies 3326 * 3327 * This waits for either a completion of a specific task to be signaled or for a 3328 * specified timeout to expire. 
It is interruptible. The timeout is in jiffies. 3329 * 3330 * The return value is -ERESTARTSYS if interrupted, 0 if timed out, 3331 * positive (at least 1, or number of jiffies left till timeout) if completed. 3332 */ 3333long __sched 3334wait_for_completion_interruptible_timeout(struct completion *x, 3335 unsigned long timeout) 3336{ 3337 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); 3338} 3339EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); 3340 3341/** 3342 * wait_for_completion_killable: - waits for completion of a task (killable) 3343 * @x: holds the state of this particular completion 3344 * 3345 * This waits to be signaled for completion of a specific task. It can be 3346 * interrupted by a kill signal. 3347 * 3348 * The return value is -ERESTARTSYS if interrupted, 0 if completed. 3349 */ 3350int __sched wait_for_completion_killable(struct completion *x) 3351{ 3352 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); 3353 if (t == -ERESTARTSYS) 3354 return t; 3355 return 0; 3356} 3357EXPORT_SYMBOL(wait_for_completion_killable); 3358 3359/** 3360 * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable)) 3361 * @x: holds the state of this particular completion 3362 * @timeout: timeout value in jiffies 3363 * 3364 * This waits for either a completion of a specific task to be 3365 * signaled or for a specified timeout to expire. It can be 3366 * interrupted by a kill signal. The timeout is in jiffies. 3367 * 3368 * The return value is -ERESTARTSYS if interrupted, 0 if timed out, 3369 * positive (at least 1, or number of jiffies left till timeout) if completed. 3370 */ 3371long __sched 3372wait_for_completion_killable_timeout(struct completion *x, 3373 unsigned long timeout) 3374{ 3375 return wait_for_common(x, timeout, TASK_KILLABLE); 3376} 3377EXPORT_SYMBOL(wait_for_completion_killable_timeout); 3378 3379/** 3380 * try_wait_for_completion - try to decrement a completion without blocking 3381 * @x: completion structure 3382 * 3383 * Returns: 0 if a decrement cannot be done without blocking 3384 * 1 if a decrement succeeded. 3385 * 3386 * If a completion is being used as a counting completion, 3387 * attempt to decrement the counter without blocking. This 3388 * enables us to avoid waiting if the resource the completion 3389 * is protecting is not available. 3390 */ 3391bool try_wait_for_completion(struct completion *x) 3392{ 3393 unsigned long flags; 3394 int ret = 1; 3395 3396 spin_lock_irqsave(&x->wait.lock, flags); 3397 if (!x->done) 3398 ret = 0; 3399 else 3400 x->done--; 3401 spin_unlock_irqrestore(&x->wait.lock, flags); 3402 return ret; 3403} 3404EXPORT_SYMBOL(try_wait_for_completion); 3405 3406/** 3407 * completion_done - Test to see if a completion has any waiters 3408 * @x: completion structure 3409 * 3410 * Returns: 0 if there are waiters (wait_for_completion() in progress) 3411 * 1 if there are no waiters. 
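 *
 * Unlike try_wait_for_completion(), this is a pure query: it does not
 * consume x->done, it only reports whether it is non-zero.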
3412 * 3413 */ 3414bool completion_done(struct completion *x) 3415{ 3416 unsigned long flags; 3417 int ret = 1; 3418 3419 spin_lock_irqsave(&x->wait.lock, flags); 3420 if (!x->done) 3421 ret = 0; 3422 spin_unlock_irqrestore(&x->wait.lock, flags); 3423 return ret; 3424} 3425EXPORT_SYMBOL(completion_done); 3426 3427static long __sched 3428sleep_on_common(wait_queue_head_t *q, int state, long timeout) 3429{ 3430 unsigned long flags; 3431 wait_queue_t wait; 3432 3433 init_waitqueue_entry(&wait, current); 3434 3435 __set_current_state(state); 3436 3437 spin_lock_irqsave(&q->lock, flags); 3438 __add_wait_queue(q, &wait); 3439 spin_unlock(&q->lock); 3440 timeout = schedule_timeout(timeout); 3441 spin_lock_irq(&q->lock); 3442 __remove_wait_queue(q, &wait); 3443 spin_unlock_irqrestore(&q->lock, flags); 3444 3445 return timeout; 3446} 3447 3448void __sched interruptible_sleep_on(wait_queue_head_t *q) 3449{ 3450 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); 3451} 3452EXPORT_SYMBOL(interruptible_sleep_on); 3453 3454long __sched 3455interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) 3456{ 3457 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout); 3458} 3459EXPORT_SYMBOL(interruptible_sleep_on_timeout); 3460 3461void __sched sleep_on(wait_queue_head_t *q) 3462{ 3463 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); 3464} 3465EXPORT_SYMBOL(sleep_on); 3466 3467long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) 3468{ 3469 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout); 3470} 3471EXPORT_SYMBOL(sleep_on_timeout); 3472 3473#ifdef CONFIG_RT_MUTEXES 3474 3475/* 3476 * rt_mutex_setprio - set the current priority of a task 3477 * @p: task 3478 * @prio: prio value (kernel-internal form) 3479 * 3480 * This function changes the 'effective' priority of a task. It does 3481 * not touch ->normal_prio like __setscheduler(). 3482 * 3483 * Used by the rt_mutex code to implement priority inheritance logic. 3484 */ 3485void rt_mutex_setprio(struct task_struct *p, int prio) 3486{ 3487 int oldprio, on_rq, running; 3488 struct rq *rq; 3489 const struct sched_class *prev_class; 3490 3491 BUG_ON(prio < 0 || prio > MAX_PRIO); 3492 3493 rq = __task_rq_lock(p); 3494 3495 /* 3496 * Idle task boosting is a nono in general. There is one 3497 * exception, when PREEMPT_RT and NOHZ is active: 3498 * 3499 * The idle task calls get_next_timer_interrupt() and holds 3500 * the timer wheel base->lock on the CPU and another CPU wants 3501 * to access the timer (probably to cancel it). We can safely 3502 * ignore the boosting request, as the idle CPU runs this code 3503 * with interrupts disabled and will complete the lock 3504 * protected section without being interrupted. So there is no 3505 * real need to boost. 3506 */ 3507 if (unlikely(p == rq->idle)) { 3508 WARN_ON(p != rq->curr); 3509 WARN_ON(p->pi_blocked_on); 3510 goto out_unlock; 3511 } 3512 3513 trace_sched_pi_setprio(p, prio); 3514 oldprio = p->prio; 3515 prev_class = p->sched_class; 3516 on_rq = p->on_rq; 3517 running = task_current(rq, p); 3518 if (on_rq) 3519 dequeue_task(rq, p, 0); 3520 if (running) 3521 p->sched_class->put_prev_task(rq, p); 3522 3523 if (rt_prio(prio)) 3524 p->sched_class = &rt_sched_class; 3525 else 3526 p->sched_class = &fair_sched_class; 3527 3528 p->prio = prio; 3529 3530 if (running) 3531 p->sched_class->set_curr_task(rq); 3532 if (on_rq) 3533 enqueue_task(rq, p, oldprio < prio ? 
ENQUEUE_HEAD : 0); 3534 3535 check_class_changed(rq, p, prev_class, oldprio); 3536out_unlock: 3537 __task_rq_unlock(rq); 3538} 3539#endif 3540void set_user_nice(struct task_struct *p, long nice) 3541{ 3542 int old_prio, delta, on_rq; 3543 unsigned long flags; 3544 struct rq *rq; 3545 3546 if (TASK_NICE(p) == nice || nice < -20 || nice > 19) 3547 return; 3548 /* 3549 * We have to be careful, if called from sys_setpriority(), 3550 * the task might be in the middle of scheduling on another CPU. 3551 */ 3552 rq = task_rq_lock(p, &flags); 3553 /* 3554 * The RT priorities are set via sched_setscheduler(), but we still 3555 * allow the 'normal' nice value to be set - but as expected 3556 * it wont have any effect on scheduling until the task is 3557 * SCHED_FIFO/SCHED_RR: 3558 */ 3559 if (task_has_rt_policy(p)) { 3560 p->static_prio = NICE_TO_PRIO(nice); 3561 goto out_unlock; 3562 } 3563 on_rq = p->on_rq; 3564 if (on_rq) 3565 dequeue_task(rq, p, 0); 3566 3567 p->static_prio = NICE_TO_PRIO(nice); 3568 set_load_weight(p); 3569 old_prio = p->prio; 3570 p->prio = effective_prio(p); 3571 delta = p->prio - old_prio; 3572 3573 if (on_rq) { 3574 enqueue_task(rq, p, 0); 3575 /* 3576 * If the task increased its priority or is running and 3577 * lowered its priority, then reschedule its CPU: 3578 */ 3579 if (delta < 0 || (delta > 0 && task_running(rq, p))) 3580 resched_task(rq->curr); 3581 } 3582out_unlock: 3583 task_rq_unlock(rq, p, &flags); 3584} 3585EXPORT_SYMBOL(set_user_nice); 3586 3587/* 3588 * can_nice - check if a task can reduce its nice value 3589 * @p: task 3590 * @nice: nice value 3591 */ 3592int can_nice(const struct task_struct *p, const int nice) 3593{ 3594 /* convert nice value [19,-20] to rlimit style value [1,40] */ 3595 int nice_rlim = 20 - nice; 3596 3597 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || 3598 capable(CAP_SYS_NICE)); 3599} 3600 3601#ifdef __ARCH_WANT_SYS_NICE 3602 3603/* 3604 * sys_nice - change the priority of the current process. 3605 * @increment: priority increment 3606 * 3607 * sys_setpriority is a more generic, but much slower function that 3608 * does similar things. 3609 */ 3610SYSCALL_DEFINE1(nice, int, increment) 3611{ 3612 long nice, retval; 3613 3614 /* 3615 * Setpriority might change our priority at the same moment. 3616 * We don't have to worry. Conceptually one call occurs first 3617 * and we have a single winner. 3618 */ 3619 if (increment < -40) 3620 increment = -40; 3621 if (increment > 40) 3622 increment = 40; 3623 3624 nice = TASK_NICE(current) + increment; 3625 if (nice < -20) 3626 nice = -20; 3627 if (nice > 19) 3628 nice = 19; 3629 3630 if (increment < 0 && !can_nice(current, nice)) 3631 return -EPERM; 3632 3633 retval = security_task_setnice(current, nice); 3634 if (retval) 3635 return retval; 3636 3637 set_user_nice(current, nice); 3638 return 0; 3639} 3640 3641#endif 3642 3643/** 3644 * task_prio - return the priority value of a given task. 3645 * @p: the task in question. 3646 * 3647 * This is the priority value as seen by users in /proc. 3648 * RT tasks are offset by -200. Normal tasks are centered 3649 * around 0, value goes from -16 to +15. 3650 */ 3651int task_prio(const struct task_struct *p) 3652{ 3653 return p->prio - MAX_RT_PRIO; 3654} 3655 3656/** 3657 * task_nice - return the nice value of a given task. 3658 * @p: the task in question. 3659 */ 3660int task_nice(const struct task_struct *p) 3661{ 3662 return TASK_NICE(p); 3663} 3664EXPORT_SYMBOL(task_nice); 3665 3666/** 3667 * idle_cpu - is a given cpu idle currently? 
3668 * @cpu: the processor in question. 3669 */ 3670int idle_cpu(int cpu) 3671{ 3672 struct rq *rq = cpu_rq(cpu); 3673 3674 if (rq->curr != rq->idle) 3675 return 0; 3676 3677 if (rq->nr_running) 3678 return 0; 3679 3680#ifdef CONFIG_SMP 3681 if (!llist_empty(&rq->wake_list)) 3682 return 0; 3683#endif 3684 3685 return 1; 3686} 3687 3688/** 3689 * idle_task - return the idle task for a given cpu. 3690 * @cpu: the processor in question. 3691 */ 3692struct task_struct *idle_task(int cpu) 3693{ 3694 return cpu_rq(cpu)->idle; 3695} 3696 3697/** 3698 * find_process_by_pid - find a process with a matching PID value. 3699 * @pid: the pid in question. 3700 */ 3701static struct task_struct *find_process_by_pid(pid_t pid) 3702{ 3703 return pid ? find_task_by_vpid(pid) : current; 3704} 3705 3706/* Actually do priority change: must hold rq lock. */ 3707static void 3708__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) 3709{ 3710 p->policy = policy; 3711 p->rt_priority = prio; 3712 p->normal_prio = normal_prio(p); 3713 /* we are holding p->pi_lock already */ 3714 p->prio = rt_mutex_getprio(p); 3715 if (rt_prio(p->prio)) 3716 p->sched_class = &rt_sched_class; 3717 else 3718 p->sched_class = &fair_sched_class; 3719 set_load_weight(p); 3720} 3721 3722/* 3723 * check the target process has a UID that matches the current process's 3724 */ 3725static bool check_same_owner(struct task_struct *p) 3726{ 3727 const struct cred *cred = current_cred(), *pcred; 3728 bool match; 3729 3730 rcu_read_lock(); 3731 pcred = __task_cred(p); 3732 match = (uid_eq(cred->euid, pcred->euid) || 3733 uid_eq(cred->euid, pcred->uid)); 3734 rcu_read_unlock(); 3735 return match; 3736} 3737 3738static int __sched_setscheduler(struct task_struct *p, int policy, 3739 const struct sched_param *param, bool user) 3740{ 3741 int retval, oldprio, oldpolicy = -1, on_rq, running; 3742 unsigned long flags; 3743 const struct sched_class *prev_class; 3744 struct rq *rq; 3745 int reset_on_fork; 3746 3747 /* may grab non-irq protected spin_locks */ 3748 BUG_ON(in_interrupt()); 3749recheck: 3750 /* double check policy once rq lock held */ 3751 if (policy < 0) { 3752 reset_on_fork = p->sched_reset_on_fork; 3753 policy = oldpolicy = p->policy; 3754 } else { 3755 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); 3756 policy &= ~SCHED_RESET_ON_FORK; 3757 3758 if (policy != SCHED_FIFO && policy != SCHED_RR && 3759 policy != SCHED_NORMAL && policy != SCHED_BATCH && 3760 policy != SCHED_IDLE) 3761 return -EINVAL; 3762 } 3763 3764 /* 3765 * Valid priorities for SCHED_FIFO and SCHED_RR are 3766 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, 3767 * SCHED_BATCH and SCHED_IDLE is 0. 3768 */ 3769 if (param->sched_priority < 0 || 3770 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || 3771 (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) 3772 return -EINVAL; 3773 if (rt_policy(policy) != (param->sched_priority != 0)) 3774 return -EINVAL; 3775 3776 /* 3777 * Allow unprivileged RT tasks to decrease priority: 3778 */ 3779 if (user && !capable(CAP_SYS_NICE)) { 3780 if (rt_policy(policy)) { 3781 unsigned long rlim_rtprio = 3782 task_rlimit(p, RLIMIT_RTPRIO); 3783 3784 /* can't set/change the rt policy */ 3785 if (policy != p->policy && !rlim_rtprio) 3786 return -EPERM; 3787 3788 /* can't increase priority */ 3789 if (param->sched_priority > p->rt_priority && 3790 param->sched_priority > rlim_rtprio) 3791 return -EPERM; 3792 } 3793 3794 /* 3795 * Treat SCHED_IDLE as nice 20. 
Only allow a switch to 3796 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. 3797 */ 3798 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { 3799 if (!can_nice(p, TASK_NICE(p))) 3800 return -EPERM; 3801 } 3802 3803 /* can't change other user's priorities */ 3804 if (!check_same_owner(p)) 3805 return -EPERM; 3806 3807 /* Normal users shall not reset the sched_reset_on_fork flag */ 3808 if (p->sched_reset_on_fork && !reset_on_fork) 3809 return -EPERM; 3810 } 3811 3812 if (user) { 3813 retval = security_task_setscheduler(p); 3814 if (retval) 3815 return retval; 3816 } 3817 3818 /* 3819 * make sure no PI-waiters arrive (or leave) while we are 3820 * changing the priority of the task: 3821 * 3822 * To be able to change p->policy safely, the appropriate 3823 * runqueue lock must be held. 3824 */ 3825 rq = task_rq_lock(p, &flags); 3826 3827 /* 3828 * Changing the policy of the stop threads its a very bad idea 3829 */ 3830 if (p == rq->stop) { 3831 task_rq_unlock(rq, p, &flags); 3832 return -EINVAL; 3833 } 3834 3835 /* 3836 * If not changing anything there's no need to proceed further: 3837 */ 3838 if (unlikely(policy == p->policy && (!rt_policy(policy) || 3839 param->sched_priority == p->rt_priority))) { 3840 task_rq_unlock(rq, p, &flags); 3841 return 0; 3842 } 3843 3844#ifdef CONFIG_RT_GROUP_SCHED 3845 if (user) { 3846 /* 3847 * Do not allow realtime tasks into groups that have no runtime 3848 * assigned. 3849 */ 3850 if (rt_bandwidth_enabled() && rt_policy(policy) && 3851 task_group(p)->rt_bandwidth.rt_runtime == 0 && 3852 !task_group_is_autogroup(task_group(p))) { 3853 task_rq_unlock(rq, p, &flags); 3854 return -EPERM; 3855 } 3856 } 3857#endif 3858 3859 /* recheck policy now with rq lock held */ 3860 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 3861 policy = oldpolicy = -1; 3862 task_rq_unlock(rq, p, &flags); 3863 goto recheck; 3864 } 3865 on_rq = p->on_rq; 3866 running = task_current(rq, p); 3867 if (on_rq) 3868 dequeue_task(rq, p, 0); 3869 if (running) 3870 p->sched_class->put_prev_task(rq, p); 3871 3872 p->sched_reset_on_fork = reset_on_fork; 3873 3874 oldprio = p->prio; 3875 prev_class = p->sched_class; 3876 __setscheduler(rq, p, policy, param->sched_priority); 3877 3878 if (running) 3879 p->sched_class->set_curr_task(rq); 3880 if (on_rq) 3881 enqueue_task(rq, p, 0); 3882 3883 check_class_changed(rq, p, prev_class, oldprio); 3884 task_rq_unlock(rq, p, &flags); 3885 3886 rt_mutex_adjust_pi(p); 3887 3888 return 0; 3889} 3890 3891/** 3892 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. 3893 * @p: the task in question. 3894 * @policy: new policy. 3895 * @param: structure containing the new RT priority. 3896 * 3897 * NOTE that the task may be already dead. 3898 */ 3899int sched_setscheduler(struct task_struct *p, int policy, 3900 const struct sched_param *param) 3901{ 3902 return __sched_setscheduler(p, policy, param, true); 3903} 3904EXPORT_SYMBOL_GPL(sched_setscheduler); 3905 3906/** 3907 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. 3908 * @p: the task in question. 3909 * @policy: new policy. 3910 * @param: structure containing the new RT priority. 3911 * 3912 * Just like sched_setscheduler, only don't bother checking if the 3913 * current context has permission. For example, this is needed in 3914 * stop_machine(): we create temporary high priority worker threads, 3915 * but our caller might not have that capability. 
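 *
 * For illustration, such a kernel-internal caller typically does something
 * like:
 *
 *	struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO - 1 };
 *
 *	sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
 *
 * turning an already-created kthread into a high-priority FIFO task without
 * the permission and RLIMIT_RTPRIO checks applied to requests from userspace.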
3916 */ 3917int sched_setscheduler_nocheck(struct task_struct *p, int policy, 3918 const struct sched_param *param) 3919{ 3920 return __sched_setscheduler(p, policy, param, false); 3921} 3922 3923static int 3924do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 3925{ 3926 struct sched_param lparam; 3927 struct task_struct *p; 3928 int retval; 3929 3930 if (!param || pid < 0) 3931 return -EINVAL; 3932 if (copy_from_user(&lparam, param, sizeof(struct sched_param))) 3933 return -EFAULT; 3934 3935 rcu_read_lock(); 3936 retval = -ESRCH; 3937 p = find_process_by_pid(pid); 3938 if (p != NULL) 3939 retval = sched_setscheduler(p, policy, &lparam); 3940 rcu_read_unlock(); 3941 3942 return retval; 3943} 3944 3945/** 3946 * sys_sched_setscheduler - set/change the scheduler policy and RT priority 3947 * @pid: the pid in question. 3948 * @policy: new policy. 3949 * @param: structure containing the new RT priority. 3950 */ 3951SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, 3952 struct sched_param __user *, param) 3953{ 3954 /* negative values for policy are not valid */ 3955 if (policy < 0) 3956 return -EINVAL; 3957 3958 return do_sched_setscheduler(pid, policy, param); 3959} 3960 3961/** 3962 * sys_sched_setparam - set/change the RT priority of a thread 3963 * @pid: the pid in question. 3964 * @param: structure containing the new RT priority. 3965 */ 3966SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) 3967{ 3968 return do_sched_setscheduler(pid, -1, param); 3969} 3970 3971/** 3972 * sys_sched_getscheduler - get the policy (scheduling class) of a thread 3973 * @pid: the pid in question. 3974 */ 3975SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) 3976{ 3977 struct task_struct *p; 3978 int retval; 3979 3980 if (pid < 0) 3981 return -EINVAL; 3982 3983 retval = -ESRCH; 3984 rcu_read_lock(); 3985 p = find_process_by_pid(pid); 3986 if (p) { 3987 retval = security_task_getscheduler(p); 3988 if (!retval) 3989 retval = p->policy 3990 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); 3991 } 3992 rcu_read_unlock(); 3993 return retval; 3994} 3995 3996/** 3997 * sys_sched_getparam - get the RT priority of a thread 3998 * @pid: the pid in question. 3999 * @param: structure containing the RT priority. 4000 */ 4001SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) 4002{ 4003 struct sched_param lp; 4004 struct task_struct *p; 4005 int retval; 4006 4007 if (!param || pid < 0) 4008 return -EINVAL; 4009 4010 rcu_read_lock(); 4011 p = find_process_by_pid(pid); 4012 retval = -ESRCH; 4013 if (!p) 4014 goto out_unlock; 4015 4016 retval = security_task_getscheduler(p); 4017 if (retval) 4018 goto out_unlock; 4019 4020 lp.sched_priority = p->rt_priority; 4021 rcu_read_unlock(); 4022 4023 /* 4024 * This one might sleep, we cannot do it with a spinlock held ... 4025 */ 4026 retval = copy_to_user(param, &lp, sizeof(*param)) ? 
-EFAULT : 0; 4027 4028 return retval; 4029 4030out_unlock: 4031 rcu_read_unlock(); 4032 return retval; 4033} 4034 4035long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) 4036{ 4037 cpumask_var_t cpus_allowed, new_mask; 4038 struct task_struct *p; 4039 int retval; 4040 4041 get_online_cpus(); 4042 rcu_read_lock(); 4043 4044 p = find_process_by_pid(pid); 4045 if (!p) { 4046 rcu_read_unlock(); 4047 put_online_cpus(); 4048 return -ESRCH; 4049 } 4050 4051 /* Prevent p going away */ 4052 get_task_struct(p); 4053 rcu_read_unlock(); 4054 4055 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { 4056 retval = -ENOMEM; 4057 goto out_put_task; 4058 } 4059 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { 4060 retval = -ENOMEM; 4061 goto out_free_cpus_allowed; 4062 } 4063 retval = -EPERM; 4064 if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE)) 4065 goto out_unlock; 4066 4067 retval = security_task_setscheduler(p); 4068 if (retval) 4069 goto out_unlock; 4070 4071 cpuset_cpus_allowed(p, cpus_allowed); 4072 cpumask_and(new_mask, in_mask, cpus_allowed); 4073again: 4074 retval = set_cpus_allowed_ptr(p, new_mask); 4075 4076 if (!retval) { 4077 cpuset_cpus_allowed(p, cpus_allowed); 4078 if (!cpumask_subset(new_mask, cpus_allowed)) { 4079 /* 4080 * We must have raced with a concurrent cpuset 4081 * update. Just reset the cpus_allowed to the 4082 * cpuset's cpus_allowed 4083 */ 4084 cpumask_copy(new_mask, cpus_allowed); 4085 goto again; 4086 } 4087 } 4088out_unlock: 4089 free_cpumask_var(new_mask); 4090out_free_cpus_allowed: 4091 free_cpumask_var(cpus_allowed); 4092out_put_task: 4093 put_task_struct(p); 4094 put_online_cpus(); 4095 return retval; 4096} 4097 4098static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, 4099 struct cpumask *new_mask) 4100{ 4101 if (len < cpumask_size()) 4102 cpumask_clear(new_mask); 4103 else if (len > cpumask_size()) 4104 len = cpumask_size(); 4105 4106 return copy_from_user(new_mask, user_mask_ptr, len) ? 
-EFAULT : 0; 4107} 4108 4109/** 4110 * sys_sched_setaffinity - set the cpu affinity of a process 4111 * @pid: pid of the process 4112 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 4113 * @user_mask_ptr: user-space pointer to the new cpu mask 4114 */ 4115SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, 4116 unsigned long __user *, user_mask_ptr) 4117{ 4118 cpumask_var_t new_mask; 4119 int retval; 4120 4121 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) 4122 return -ENOMEM; 4123 4124 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); 4125 if (retval == 0) 4126 retval = sched_setaffinity(pid, new_mask); 4127 free_cpumask_var(new_mask); 4128 return retval; 4129} 4130 4131long sched_getaffinity(pid_t pid, struct cpumask *mask) 4132{ 4133 struct task_struct *p; 4134 unsigned long flags; 4135 int retval; 4136 4137 get_online_cpus(); 4138 rcu_read_lock(); 4139 4140 retval = -ESRCH; 4141 p = find_process_by_pid(pid); 4142 if (!p) 4143 goto out_unlock; 4144 4145 retval = security_task_getscheduler(p); 4146 if (retval) 4147 goto out_unlock; 4148 4149 raw_spin_lock_irqsave(&p->pi_lock, flags); 4150 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 4151 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 4152 4153out_unlock: 4154 rcu_read_unlock(); 4155 put_online_cpus(); 4156 4157 return retval; 4158} 4159 4160/** 4161 * sys_sched_getaffinity - get the cpu affinity of a process 4162 * @pid: pid of the process 4163 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 4164 * @user_mask_ptr: user-space pointer to hold the current cpu mask 4165 */ 4166SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, 4167 unsigned long __user *, user_mask_ptr) 4168{ 4169 int ret; 4170 cpumask_var_t mask; 4171 4172 if ((len * BITS_PER_BYTE) < nr_cpu_ids) 4173 return -EINVAL; 4174 if (len & (sizeof(unsigned long)-1)) 4175 return -EINVAL; 4176 4177 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 4178 return -ENOMEM; 4179 4180 ret = sched_getaffinity(pid, mask); 4181 if (ret == 0) { 4182 size_t retlen = min_t(size_t, len, cpumask_size()); 4183 4184 if (copy_to_user(user_mask_ptr, mask, retlen)) 4185 ret = -EFAULT; 4186 else 4187 ret = retlen; 4188 } 4189 free_cpumask_var(mask); 4190 4191 return ret; 4192} 4193 4194/** 4195 * sys_sched_yield - yield the current processor to other threads. 4196 * 4197 * This function yields the current CPU to other tasks. If there are no 4198 * other threads running on this CPU then this function will return. 
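 *
 * Illustrative userspace call (via the libc wrapper; sketch only, not
 * part of this file):
 *
 *	sched_yield();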
4199 */ 4200SYSCALL_DEFINE0(sched_yield) 4201{ 4202 struct rq *rq = this_rq_lock(); 4203 4204 schedstat_inc(rq, yld_count); 4205 current->sched_class->yield_task(rq); 4206 4207 /* 4208 * Since we are going to call schedule() anyway, there's 4209 * no need to preempt or enable interrupts: 4210 */ 4211 __release(rq->lock); 4212 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 4213 do_raw_spin_unlock(&rq->lock); 4214 sched_preempt_enable_no_resched(); 4215 4216 schedule(); 4217 4218 return 0; 4219} 4220 4221static inline int should_resched(void) 4222{ 4223 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE); 4224} 4225 4226static void __cond_resched(void) 4227{ 4228 add_preempt_count(PREEMPT_ACTIVE); 4229 __schedule(); 4230 sub_preempt_count(PREEMPT_ACTIVE); 4231} 4232 4233int __sched _cond_resched(void) 4234{ 4235 if (should_resched()) { 4236 __cond_resched(); 4237 return 1; 4238 } 4239 return 0; 4240} 4241EXPORT_SYMBOL(_cond_resched); 4242 4243/* 4244 * __cond_resched_lock() - if a reschedule is pending, drop the given lock, 4245 * call schedule, and on return reacquire the lock. 4246 * 4247 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level 4248 * operations here to prevent schedule() from being called twice (once via 4249 * spin_unlock(), once by hand). 4250 */ 4251int __cond_resched_lock(spinlock_t *lock) 4252{ 4253 int resched = should_resched(); 4254 int ret = 0; 4255 4256 lockdep_assert_held(lock); 4257 4258 if (spin_needbreak(lock) || resched) { 4259 spin_unlock(lock); 4260 if (resched) 4261 __cond_resched(); 4262 else 4263 cpu_relax(); 4264 ret = 1; 4265 spin_lock(lock); 4266 } 4267 return ret; 4268} 4269EXPORT_SYMBOL(__cond_resched_lock); 4270 4271int __sched __cond_resched_softirq(void) 4272{ 4273 BUG_ON(!in_softirq()); 4274 4275 if (should_resched()) { 4276 local_bh_enable(); 4277 __cond_resched(); 4278 local_bh_disable(); 4279 return 1; 4280 } 4281 return 0; 4282} 4283EXPORT_SYMBOL(__cond_resched_softirq); 4284 4285/** 4286 * yield - yield the current processor to other threads. 4287 * 4288 * Do not ever use this function, there's a 99% chance you're doing it wrong. 4289 * 4290 * The scheduler is at all times free to pick the calling task as the most 4291 * eligible task to run, if removing the yield() call from your code breaks 4292 * it, its already broken. 4293 * 4294 * Typical broken usage is: 4295 * 4296 * while (!event) 4297 * yield(); 4298 * 4299 * where one assumes that yield() will let 'the other' process run that will 4300 * make event true. If the current task is a SCHED_FIFO task that will never 4301 * happen. Never use yield() as a progress guarantee!! 4302 * 4303 * If you want to use yield() to wait for something, use wait_event(). 4304 * If you want to use yield() to be 'nice' for others, use cond_resched(). 4305 * If you still want to use yield(), do not! 4306 */ 4307void __sched yield(void) 4308{ 4309 set_current_state(TASK_RUNNING); 4310 sys_sched_yield(); 4311} 4312EXPORT_SYMBOL(yield); 4313 4314/** 4315 * yield_to - yield the current processor to another thread in 4316 * your thread group, or accelerate that thread toward the 4317 * processor it's on. 4318 * @p: target task 4319 * @preempt: whether task preemption is allowed or not 4320 * 4321 * It's the caller's job to ensure that the target task struct 4322 * can't go away on us before we can do any checks. 4323 * 4324 * Returns true if we indeed boosted the target task. 
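 *
 * Hedged usage sketch ('target' and 'yielded' are illustrative locals,
 * not defined here); the extra reference keeps the target task struct
 * alive across the call, as required above:
 *
 *	get_task_struct(target);
 *	yielded = yield_to(target, false);
 *	put_task_struct(target);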
4325 */ 4326bool __sched yield_to(struct task_struct *p, bool preempt) 4327{ 4328 struct task_struct *curr = current; 4329 struct rq *rq, *p_rq; 4330 unsigned long flags; 4331 bool yielded = 0; 4332 4333 local_irq_save(flags); 4334 rq = this_rq(); 4335 4336again: 4337 p_rq = task_rq(p); 4338 double_rq_lock(rq, p_rq); 4339 while (task_rq(p) != p_rq) { 4340 double_rq_unlock(rq, p_rq); 4341 goto again; 4342 } 4343 4344 if (!curr->sched_class->yield_to_task) 4345 goto out; 4346 4347 if (curr->sched_class != p->sched_class) 4348 goto out; 4349 4350 if (task_running(p_rq, p) || p->state) 4351 goto out; 4352 4353 yielded = curr->sched_class->yield_to_task(rq, p, preempt); 4354 if (yielded) { 4355 schedstat_inc(rq, yld_count); 4356 /* 4357 * Make p's CPU reschedule; pick_next_entity takes care of 4358 * fairness. 4359 */ 4360 if (preempt && rq != p_rq) 4361 resched_task(p_rq->curr); 4362 } 4363 4364out: 4365 double_rq_unlock(rq, p_rq); 4366 local_irq_restore(flags); 4367 4368 if (yielded) 4369 schedule(); 4370 4371 return yielded; 4372} 4373EXPORT_SYMBOL_GPL(yield_to); 4374 4375/* 4376 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 4377 * that process accounting knows that this is a task in IO wait state. 4378 */ 4379void __sched io_schedule(void) 4380{ 4381 struct rq *rq = raw_rq(); 4382 4383 delayacct_blkio_start(); 4384 atomic_inc(&rq->nr_iowait); 4385 blk_flush_plug(current); 4386 current->in_iowait = 1; 4387 schedule(); 4388 current->in_iowait = 0; 4389 atomic_dec(&rq->nr_iowait); 4390 delayacct_blkio_end(); 4391} 4392EXPORT_SYMBOL(io_schedule); 4393 4394long __sched io_schedule_timeout(long timeout) 4395{ 4396 struct rq *rq = raw_rq(); 4397 long ret; 4398 4399 delayacct_blkio_start(); 4400 atomic_inc(&rq->nr_iowait); 4401 blk_flush_plug(current); 4402 current->in_iowait = 1; 4403 ret = schedule_timeout(timeout); 4404 current->in_iowait = 0; 4405 atomic_dec(&rq->nr_iowait); 4406 delayacct_blkio_end(); 4407 return ret; 4408} 4409 4410/** 4411 * sys_sched_get_priority_max - return maximum RT priority. 4412 * @policy: scheduling class. 4413 * 4414 * this syscall returns the maximum rt_priority that can be used 4415 * by a given scheduling class. 4416 */ 4417SYSCALL_DEFINE1(sched_get_priority_max, int, policy) 4418{ 4419 int ret = -EINVAL; 4420 4421 switch (policy) { 4422 case SCHED_FIFO: 4423 case SCHED_RR: 4424 ret = MAX_USER_RT_PRIO-1; 4425 break; 4426 case SCHED_NORMAL: 4427 case SCHED_BATCH: 4428 case SCHED_IDLE: 4429 ret = 0; 4430 break; 4431 } 4432 return ret; 4433} 4434 4435/** 4436 * sys_sched_get_priority_min - return minimum RT priority. 4437 * @policy: scheduling class. 4438 * 4439 * this syscall returns the minimum rt_priority that can be used 4440 * by a given scheduling class. 4441 */ 4442SYSCALL_DEFINE1(sched_get_priority_min, int, policy) 4443{ 4444 int ret = -EINVAL; 4445 4446 switch (policy) { 4447 case SCHED_FIFO: 4448 case SCHED_RR: 4449 ret = 1; 4450 break; 4451 case SCHED_NORMAL: 4452 case SCHED_BATCH: 4453 case SCHED_IDLE: 4454 ret = 0; 4455 } 4456 return ret; 4457} 4458 4459/** 4460 * sys_sched_rr_get_interval - return the default timeslice of a process. 4461 * @pid: pid of the process. 4462 * @interval: userspace pointer to the timeslice value. 4463 * 4464 * this syscall writes the default timeslice value of a given process 4465 * into the user-space timespec buffer. A value of '0' means infinity. 
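 *
 * Illustrative userspace call (glibc wrapper assumed; a pid of 0 means
 * the calling thread):
 *
 *	struct timespec ts;
 *
 *	sched_rr_get_interval(0, &ts);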
4466 */ 4467SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, 4468 struct timespec __user *, interval) 4469{ 4470 struct task_struct *p; 4471 unsigned int time_slice; 4472 unsigned long flags; 4473 struct rq *rq; 4474 int retval; 4475 struct timespec t; 4476 4477 if (pid < 0) 4478 return -EINVAL; 4479 4480 retval = -ESRCH; 4481 rcu_read_lock(); 4482 p = find_process_by_pid(pid); 4483 if (!p) 4484 goto out_unlock; 4485 4486 retval = security_task_getscheduler(p); 4487 if (retval) 4488 goto out_unlock; 4489 4490 rq = task_rq_lock(p, &flags); 4491 time_slice = p->sched_class->get_rr_interval(rq, p); 4492 task_rq_unlock(rq, p, &flags); 4493 4494 rcu_read_unlock(); 4495 jiffies_to_timespec(time_slice, &t); 4496 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 4497 return retval; 4498 4499out_unlock: 4500 rcu_read_unlock(); 4501 return retval; 4502} 4503 4504static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; 4505 4506void sched_show_task(struct task_struct *p) 4507{ 4508 unsigned long free = 0; 4509 unsigned state; 4510 4511 state = p->state ? __ffs(p->state) + 1 : 0; 4512 printk(KERN_INFO "%-15.15s %c", p->comm, 4513 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 4514#if BITS_PER_LONG == 32 4515 if (state == TASK_RUNNING) 4516 printk(KERN_CONT " running "); 4517 else 4518 printk(KERN_CONT " %08lx ", thread_saved_pc(p)); 4519#else 4520 if (state == TASK_RUNNING) 4521 printk(KERN_CONT " running task "); 4522 else 4523 printk(KERN_CONT " %016lx ", thread_saved_pc(p)); 4524#endif 4525#ifdef CONFIG_DEBUG_STACK_USAGE 4526 free = stack_not_used(p); 4527#endif 4528 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, 4529 task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)), 4530 (unsigned long)task_thread_info(p)->flags); 4531 4532 show_stack(p, NULL); 4533} 4534 4535void show_state_filter(unsigned long state_filter) 4536{ 4537 struct task_struct *g, *p; 4538 4539#if BITS_PER_LONG == 32 4540 printk(KERN_INFO 4541 " task PC stack pid father\n"); 4542#else 4543 printk(KERN_INFO 4544 " task PC stack pid father\n"); 4545#endif 4546 rcu_read_lock(); 4547 do_each_thread(g, p) { 4548 /* 4549 * reset the NMI-timeout, listing all files on a slow 4550 * console might take a lot of time: 4551 */ 4552 touch_nmi_watchdog(); 4553 if (!state_filter || (p->state & state_filter)) 4554 sched_show_task(p); 4555 } while_each_thread(g, p); 4556 4557 touch_all_softlockup_watchdogs(); 4558 4559#ifdef CONFIG_SCHED_DEBUG 4560 sysrq_sched_debug_show(); 4561#endif 4562 rcu_read_unlock(); 4563 /* 4564 * Only show locks if all tasks are dumped: 4565 */ 4566 if (!state_filter) 4567 debug_show_all_locks(); 4568} 4569 4570void __cpuinit init_idle_bootup_task(struct task_struct *idle) 4571{ 4572 idle->sched_class = &idle_sched_class; 4573} 4574 4575/** 4576 * init_idle - set up an idle thread for a given CPU 4577 * @idle: task in question 4578 * @cpu: cpu the idle task belongs to 4579 * 4580 * NOTE: this function does not set the idle thread's NEED_RESCHED 4581 * flag, to make booting more robust. 
4582 */ 4583void __cpuinit init_idle(struct task_struct *idle, int cpu) 4584{ 4585 struct rq *rq = cpu_rq(cpu); 4586 unsigned long flags; 4587 4588 raw_spin_lock_irqsave(&rq->lock, flags); 4589 4590 __sched_fork(idle); 4591 idle->state = TASK_RUNNING; 4592 idle->se.exec_start = sched_clock(); 4593 4594 do_set_cpus_allowed(idle, cpumask_of(cpu)); 4595 /* 4596 * We're having a chicken and egg problem, even though we are 4597 * holding rq->lock, the cpu isn't yet set to this cpu so the 4598 * lockdep check in task_group() will fail. 4599 * 4600 * Similar case to sched_fork(). / Alternatively we could 4601 * use task_rq_lock() here and obtain the other rq->lock. 4602 * 4603 * Silence PROVE_RCU 4604 */ 4605 rcu_read_lock(); 4606 __set_task_cpu(idle, cpu); 4607 rcu_read_unlock(); 4608 4609 rq->curr = rq->idle = idle; 4610#if defined(CONFIG_SMP) 4611 idle->on_cpu = 1; 4612#endif 4613 raw_spin_unlock_irqrestore(&rq->lock, flags); 4614 4615 /* Set the preempt count _outside_ the spinlocks! */ 4616 task_thread_info(idle)->preempt_count = 0; 4617 4618 /* 4619 * The idle tasks have their own, simple scheduling class: 4620 */ 4621 idle->sched_class = &idle_sched_class; 4622 ftrace_graph_init_idle_task(idle, cpu); 4623#if defined(CONFIG_SMP) 4624 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); 4625#endif 4626} 4627 4628#ifdef CONFIG_SMP 4629void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) 4630{ 4631 if (p->sched_class && p->sched_class->set_cpus_allowed) 4632 p->sched_class->set_cpus_allowed(p, new_mask); 4633 4634 cpumask_copy(&p->cpus_allowed, new_mask); 4635 p->nr_cpus_allowed = cpumask_weight(new_mask); 4636} 4637 4638/* 4639 * This is how migration works: 4640 * 4641 * 1) we invoke migration_cpu_stop() on the target CPU using 4642 * stop_one_cpu(). 4643 * 2) stopper starts to run (implicitly forcing the migrated thread 4644 * off the CPU) 4645 * 3) it checks whether the migrated task is still in the wrong runqueue. 4646 * 4) if it's in the wrong runqueue then the migration thread removes 4647 * it and puts it into the right queue. 4648 * 5) stopper completes and stop_one_cpu() returns and the migration 4649 * is done. 4650 */ 4651 4652/* 4653 * Change a given task's CPU affinity. Migrate the thread to a 4654 * proper CPU and schedule it away if the CPU it's executing on 4655 * is removed from the allowed bitmask. 4656 * 4657 * NOTE: the caller must have a valid reference to the task, the 4658 * task must not exit() & deallocate itself prematurely. The 4659 * call is not atomic; no spinlocks may be held. 4660 */ 4661int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) 4662{ 4663 unsigned long flags; 4664 struct rq *rq; 4665 unsigned int dest_cpu; 4666 int ret = 0; 4667 4668 rq = task_rq_lock(p, &flags); 4669 4670 if (cpumask_equal(&p->cpus_allowed, new_mask)) 4671 goto out; 4672 4673 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 4674 ret = -EINVAL; 4675 goto out; 4676 } 4677 4678 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) { 4679 ret = -EINVAL; 4680 goto out; 4681 } 4682 4683 do_set_cpus_allowed(p, new_mask); 4684 4685 /* Can the task run on the task's current CPU? If so, we're done */ 4686 if (cpumask_test_cpu(task_cpu(p), new_mask)) 4687 goto out; 4688 4689 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); 4690 if (p->on_rq) { 4691 struct migration_arg arg = { p, dest_cpu }; 4692 /* Need help from migration thread: drop lock and wait. 
*/ 4693 task_rq_unlock(rq, p, &flags); 4694 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 4695 tlb_migrate_finish(p->mm); 4696 return 0; 4697 } 4698out: 4699 task_rq_unlock(rq, p, &flags); 4700 4701 return ret; 4702} 4703EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); 4704 4705/* 4706 * Move (not current) task off this cpu, onto dest cpu. We're doing 4707 * this because either it can't run here any more (set_cpus_allowed() 4708 * away from this CPU, or CPU going down), or because we're 4709 * attempting to rebalance this task on exec (sched_exec). 4710 * 4711 * So we race with normal scheduler movements, but that's OK, as long 4712 * as the task is no longer on this CPU. 4713 * 4714 * Returns non-zero if task was successfully migrated. 4715 */ 4716static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 4717{ 4718 struct rq *rq_dest, *rq_src; 4719 int ret = 0; 4720 4721 if (unlikely(!cpu_active(dest_cpu))) 4722 return ret; 4723 4724 rq_src = cpu_rq(src_cpu); 4725 rq_dest = cpu_rq(dest_cpu); 4726 4727 raw_spin_lock(&p->pi_lock); 4728 double_rq_lock(rq_src, rq_dest); 4729 /* Already moved. */ 4730 if (task_cpu(p) != src_cpu) 4731 goto done; 4732 /* Affinity changed (again). */ 4733 if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) 4734 goto fail; 4735 4736 /* 4737 * If we're not on a rq, the next wake-up will ensure we're 4738 * placed properly. 4739 */ 4740 if (p->on_rq) { 4741 dequeue_task(rq_src, p, 0); 4742 set_task_cpu(p, dest_cpu); 4743 enqueue_task(rq_dest, p, 0); 4744 check_preempt_curr(rq_dest, p, 0); 4745 } 4746done: 4747 ret = 1; 4748fail: 4749 double_rq_unlock(rq_src, rq_dest); 4750 raw_spin_unlock(&p->pi_lock); 4751 return ret; 4752} 4753 4754/* 4755 * migration_cpu_stop - this will be executed by a highprio stopper thread 4756 * and performs thread migration by bumping thread off CPU then 4757 * 'pushing' onto another runqueue. 4758 */ 4759static int migration_cpu_stop(void *data) 4760{ 4761 struct migration_arg *arg = data; 4762 4763 /* 4764 * The original target cpu might have gone down and we might 4765 * be on another cpu but it doesn't matter. 4766 */ 4767 local_irq_disable(); 4768 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu); 4769 local_irq_enable(); 4770 return 0; 4771} 4772 4773#ifdef CONFIG_HOTPLUG_CPU 4774 4775/* 4776 * Ensures that the idle task is using init_mm right before its cpu goes 4777 * offline. 4778 */ 4779void idle_task_exit(void) 4780{ 4781 struct mm_struct *mm = current->active_mm; 4782 4783 BUG_ON(cpu_online(smp_processor_id())); 4784 4785 if (mm != &init_mm) 4786 switch_mm(mm, &init_mm, current); 4787 mmdrop(mm); 4788} 4789 4790/* 4791 * Since this CPU is going 'away' for a while, fold any nr_active delta 4792 * we might have. Assumes we're called after migrate_tasks() so that the 4793 * nr_active count is stable. 4794 * 4795 * Also see the comment "Global load-average calculations". 4796 */ 4797static void calc_load_migrate(struct rq *rq) 4798{ 4799 long delta = calc_load_fold_active(rq); 4800 if (delta) 4801 atomic_long_add(delta, &calc_load_tasks); 4802} 4803 4804/* 4805 * Migrate all tasks from the rq, sleeping tasks will be migrated by 4806 * try_to_wake_up()->select_task_rq(). 4807 * 4808 * Called with rq->lock held even though we'er in stop_machine() and 4809 * there's no concurrency possible, we hold the required locks anyway 4810 * because of lock validation efforts. 
4811 */ 4812static void migrate_tasks(unsigned int dead_cpu) 4813{ 4814 struct rq *rq = cpu_rq(dead_cpu); 4815 struct task_struct *next, *stop = rq->stop; 4816 int dest_cpu; 4817 4818 /* 4819 * Fudge the rq selection such that the below task selection loop 4820 * doesn't get stuck on the currently eligible stop task. 4821 * 4822 * We're currently inside stop_machine() and the rq is either stuck 4823 * in the stop_machine_cpu_stop() loop, or we're executing this code, 4824 * either way we should never end up calling schedule() until we're 4825 * done here. 4826 */ 4827 rq->stop = NULL; 4828 4829 for ( ; ; ) { 4830 /* 4831 * There's this thread running, bail when that's the only 4832 * remaining thread. 4833 */ 4834 if (rq->nr_running == 1) 4835 break; 4836 4837 next = pick_next_task(rq); 4838 BUG_ON(!next); 4839 next->sched_class->put_prev_task(rq, next); 4840 4841 /* Find suitable destination for @next, with force if needed. */ 4842 dest_cpu = select_fallback_rq(dead_cpu, next); 4843 raw_spin_unlock(&rq->lock); 4844 4845 __migrate_task(next, dead_cpu, dest_cpu); 4846 4847 raw_spin_lock(&rq->lock); 4848 } 4849 4850 rq->stop = stop; 4851} 4852 4853#endif /* CONFIG_HOTPLUG_CPU */ 4854 4855#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) 4856 4857static struct ctl_table sd_ctl_dir[] = { 4858 { 4859 .procname = "sched_domain", 4860 .mode = 0555, 4861 }, 4862 {} 4863}; 4864 4865static struct ctl_table sd_ctl_root[] = { 4866 { 4867 .procname = "kernel", 4868 .mode = 0555, 4869 .child = sd_ctl_dir, 4870 }, 4871 {} 4872}; 4873 4874static struct ctl_table *sd_alloc_ctl_entry(int n) 4875{ 4876 struct ctl_table *entry = 4877 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); 4878 4879 return entry; 4880} 4881 4882static void sd_free_ctl_entry(struct ctl_table **tablep) 4883{ 4884 struct ctl_table *entry; 4885 4886 /* 4887 * In the intermediate directories, both the child directory and 4888 * procname are dynamically allocated and could fail but the mode 4889 * will always be set. In the lowest directory the names are 4890 * static strings and all have proc handlers. 
4891 */ 4892 for (entry = *tablep; entry->mode; entry++) { 4893 if (entry->child) 4894 sd_free_ctl_entry(&entry->child); 4895 if (entry->proc_handler == NULL) 4896 kfree(entry->procname); 4897 } 4898 4899 kfree(*tablep); 4900 *tablep = NULL; 4901} 4902 4903static int min_load_idx = 0; 4904static int max_load_idx = CPU_LOAD_IDX_MAX; 4905 4906static void 4907set_table_entry(struct ctl_table *entry, 4908 const char *procname, void *data, int maxlen, 4909 umode_t mode, proc_handler *proc_handler, 4910 bool load_idx) 4911{ 4912 entry->procname = procname; 4913 entry->data = data; 4914 entry->maxlen = maxlen; 4915 entry->mode = mode; 4916 entry->proc_handler = proc_handler; 4917 4918 if (load_idx) { 4919 entry->extra1 = &min_load_idx; 4920 entry->extra2 = &max_load_idx; 4921 } 4922} 4923 4924static struct ctl_table * 4925sd_alloc_ctl_domain_table(struct sched_domain *sd) 4926{ 4927 struct ctl_table *table = sd_alloc_ctl_entry(13); 4928 4929 if (table == NULL) 4930 return NULL; 4931 4932 set_table_entry(&table[0], "min_interval", &sd->min_interval, 4933 sizeof(long), 0644, proc_doulongvec_minmax, false); 4934 set_table_entry(&table[1], "max_interval", &sd->max_interval, 4935 sizeof(long), 0644, proc_doulongvec_minmax, false); 4936 set_table_entry(&table[2], "busy_idx", &sd->busy_idx, 4937 sizeof(int), 0644, proc_dointvec_minmax, true); 4938 set_table_entry(&table[3], "idle_idx", &sd->idle_idx, 4939 sizeof(int), 0644, proc_dointvec_minmax, true); 4940 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, 4941 sizeof(int), 0644, proc_dointvec_minmax, true); 4942 set_table_entry(&table[5], "wake_idx", &sd->wake_idx, 4943 sizeof(int), 0644, proc_dointvec_minmax, true); 4944 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, 4945 sizeof(int), 0644, proc_dointvec_minmax, true); 4946 set_table_entry(&table[7], "busy_factor", &sd->busy_factor, 4947 sizeof(int), 0644, proc_dointvec_minmax, false); 4948 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, 4949 sizeof(int), 0644, proc_dointvec_minmax, false); 4950 set_table_entry(&table[9], "cache_nice_tries", 4951 &sd->cache_nice_tries, 4952 sizeof(int), 0644, proc_dointvec_minmax, false); 4953 set_table_entry(&table[10], "flags", &sd->flags, 4954 sizeof(int), 0644, proc_dointvec_minmax, false); 4955 set_table_entry(&table[11], "name", sd->name, 4956 CORENAME_MAX_SIZE, 0444, proc_dostring, false); 4957 /* &table[12] is terminator */ 4958 4959 return table; 4960} 4961 4962static ctl_table *sd_alloc_ctl_cpu_table(int cpu) 4963{ 4964 struct ctl_table *entry, *table; 4965 struct sched_domain *sd; 4966 int domain_num = 0, i; 4967 char buf[32]; 4968 4969 for_each_domain(cpu, sd) 4970 domain_num++; 4971 entry = table = sd_alloc_ctl_entry(domain_num + 1); 4972 if (table == NULL) 4973 return NULL; 4974 4975 i = 0; 4976 for_each_domain(cpu, sd) { 4977 snprintf(buf, 32, "domain%d", i); 4978 entry->procname = kstrdup(buf, GFP_KERNEL); 4979 entry->mode = 0555; 4980 entry->child = sd_alloc_ctl_domain_table(sd); 4981 entry++; 4982 i++; 4983 } 4984 return table; 4985} 4986 4987static struct ctl_table_header *sd_sysctl_header; 4988static void register_sched_domain_sysctl(void) 4989{ 4990 int i, cpu_num = num_possible_cpus(); 4991 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); 4992 char buf[32]; 4993 4994 WARN_ON(sd_ctl_dir[0].child); 4995 sd_ctl_dir[0].child = entry; 4996 4997 if (entry == NULL) 4998 return; 4999 5000 for_each_possible_cpu(i) { 5001 snprintf(buf, 32, "cpu%d", i); 5002 entry->procname = kstrdup(buf, GFP_KERNEL); 5003 
entry->mode = 0555; 5004 entry->child = sd_alloc_ctl_cpu_table(i); 5005 entry++; 5006 } 5007 5008 WARN_ON(sd_sysctl_header); 5009 sd_sysctl_header = register_sysctl_table(sd_ctl_root); 5010} 5011 5012/* may be called multiple times per register */ 5013static void unregister_sched_domain_sysctl(void) 5014{ 5015 if (sd_sysctl_header) 5016 unregister_sysctl_table(sd_sysctl_header); 5017 sd_sysctl_header = NULL; 5018 if (sd_ctl_dir[0].child) 5019 sd_free_ctl_entry(&sd_ctl_dir[0].child); 5020} 5021#else 5022static void register_sched_domain_sysctl(void) 5023{ 5024} 5025static void unregister_sched_domain_sysctl(void) 5026{ 5027} 5028#endif 5029 5030static void set_rq_online(struct rq *rq) 5031{ 5032 if (!rq->online) { 5033 const struct sched_class *class; 5034 5035 cpumask_set_cpu(rq->cpu, rq->rd->online); 5036 rq->online = 1; 5037 5038 for_each_class(class) { 5039 if (class->rq_online) 5040 class->rq_online(rq); 5041 } 5042 } 5043} 5044 5045static void set_rq_offline(struct rq *rq) 5046{ 5047 if (rq->online) { 5048 const struct sched_class *class; 5049 5050 for_each_class(class) { 5051 if (class->rq_offline) 5052 class->rq_offline(rq); 5053 } 5054 5055 cpumask_clear_cpu(rq->cpu, rq->rd->online); 5056 rq->online = 0; 5057 } 5058} 5059 5060/* 5061 * migration_call - callback that gets triggered when a CPU is added. 5062 * Here we can start up the necessary migration thread for the new CPU. 5063 */ 5064static int __cpuinit 5065migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) 5066{ 5067 int cpu = (long)hcpu; 5068 unsigned long flags; 5069 struct rq *rq = cpu_rq(cpu); 5070 5071 switch (action & ~CPU_TASKS_FROZEN) { 5072 5073 case CPU_UP_PREPARE: 5074 rq->calc_load_update = calc_load_update; 5075 break; 5076 5077 case CPU_ONLINE: 5078 /* Update our root-domain */ 5079 raw_spin_lock_irqsave(&rq->lock, flags); 5080 if (rq->rd) { 5081 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5082 5083 set_rq_online(rq); 5084 } 5085 raw_spin_unlock_irqrestore(&rq->lock, flags); 5086 break; 5087 5088#ifdef CONFIG_HOTPLUG_CPU 5089 case CPU_DYING: 5090 sched_ttwu_pending(); 5091 /* Update our root-domain */ 5092 raw_spin_lock_irqsave(&rq->lock, flags); 5093 if (rq->rd) { 5094 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5095 set_rq_offline(rq); 5096 } 5097 migrate_tasks(cpu); 5098 BUG_ON(rq->nr_running != 1); /* the migration thread */ 5099 raw_spin_unlock_irqrestore(&rq->lock, flags); 5100 break; 5101 5102 case CPU_DEAD: 5103 calc_load_migrate(rq); 5104 break; 5105#endif 5106 } 5107 5108 update_max_interval(); 5109 5110 return NOTIFY_OK; 5111} 5112 5113/* 5114 * Register at high priority so that task migration (migrate_all_tasks) 5115 * happens before everything else. This has to be lower priority than 5116 * the notifier in the perf_event subsystem, though. 
5117 */ 5118static struct notifier_block __cpuinitdata migration_notifier = { 5119 .notifier_call = migration_call, 5120 .priority = CPU_PRI_MIGRATION, 5121}; 5122 5123static int __cpuinit sched_cpu_active(struct notifier_block *nfb, 5124 unsigned long action, void *hcpu) 5125{ 5126 switch (action & ~CPU_TASKS_FROZEN) { 5127 case CPU_STARTING: 5128 case CPU_DOWN_FAILED: 5129 set_cpu_active((long)hcpu, true); 5130 return NOTIFY_OK; 5131 default: 5132 return NOTIFY_DONE; 5133 } 5134} 5135 5136static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb, 5137 unsigned long action, void *hcpu) 5138{ 5139 switch (action & ~CPU_TASKS_FROZEN) { 5140 case CPU_DOWN_PREPARE: 5141 set_cpu_active((long)hcpu, false); 5142 return NOTIFY_OK; 5143 default: 5144 return NOTIFY_DONE; 5145 } 5146} 5147 5148static int __init migration_init(void) 5149{ 5150 void *cpu = (void *)(long)smp_processor_id(); 5151 int err; 5152 5153 /* Initialize migration for the boot CPU */ 5154 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); 5155 BUG_ON(err == NOTIFY_BAD); 5156 migration_call(&migration_notifier, CPU_ONLINE, cpu); 5157 register_cpu_notifier(&migration_notifier); 5158 5159 /* Register cpu active notifiers */ 5160 cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE); 5161 cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE); 5162 5163 return 0; 5164} 5165early_initcall(migration_init); 5166#endif 5167 5168#ifdef CONFIG_SMP 5169 5170static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ 5171 5172#ifdef CONFIG_SCHED_DEBUG 5173 5174static __read_mostly int sched_debug_enabled; 5175 5176static int __init sched_debug_setup(char *str) 5177{ 5178 sched_debug_enabled = 1; 5179 5180 return 0; 5181} 5182early_param("sched_debug", sched_debug_setup); 5183 5184static inline bool sched_debug(void) 5185{ 5186 return sched_debug_enabled; 5187} 5188 5189static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 5190 struct cpumask *groupmask) 5191{ 5192 struct sched_group *group = sd->groups; 5193 char str[256]; 5194 5195 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd)); 5196 cpumask_clear(groupmask); 5197 5198 printk(KERN_DEBUG "%*s domain %d: ", level, "", level); 5199 5200 if (!(sd->flags & SD_LOAD_BALANCE)) { 5201 printk("does not load-balance\n"); 5202 if (sd->parent) 5203 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" 5204 " has parent"); 5205 return -1; 5206 } 5207 5208 printk(KERN_CONT "span %s level %s\n", str, sd->name); 5209 5210 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { 5211 printk(KERN_ERR "ERROR: domain->span does not contain " 5212 "CPU%d\n", cpu); 5213 } 5214 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { 5215 printk(KERN_ERR "ERROR: domain->groups does not contain" 5216 " CPU%d\n", cpu); 5217 } 5218 5219 printk(KERN_DEBUG "%*s groups:", level + 1, ""); 5220 do { 5221 if (!group) { 5222 printk("\n"); 5223 printk(KERN_ERR "ERROR: group is NULL\n"); 5224 break; 5225 } 5226 5227 /* 5228 * Even though we initialize ->power to something semi-sane, 5229 * we leave power_orig unset. This allows us to detect if 5230 * domain iteration is still funny without causing /0 traps. 
5231 */ 5232 if (!group->sgp->power_orig) { 5233 printk(KERN_CONT "\n"); 5234 printk(KERN_ERR "ERROR: domain->cpu_power not " 5235 "set\n"); 5236 break; 5237 } 5238 5239 if (!cpumask_weight(sched_group_cpus(group))) { 5240 printk(KERN_CONT "\n"); 5241 printk(KERN_ERR "ERROR: empty group\n"); 5242 break; 5243 } 5244 5245 if (!(sd->flags & SD_OVERLAP) && 5246 cpumask_intersects(groupmask, sched_group_cpus(group))) { 5247 printk(KERN_CONT "\n"); 5248 printk(KERN_ERR "ERROR: repeated CPUs\n"); 5249 break; 5250 } 5251 5252 cpumask_or(groupmask, groupmask, sched_group_cpus(group)); 5253 5254 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 5255 5256 printk(KERN_CONT " %s", str); 5257 if (group->sgp->power != SCHED_POWER_SCALE) { 5258 printk(KERN_CONT " (cpu_power = %d)", 5259 group->sgp->power); 5260 } 5261 5262 group = group->next; 5263 } while (group != sd->groups); 5264 printk(KERN_CONT "\n"); 5265 5266 if (!cpumask_equal(sched_domain_span(sd), groupmask)) 5267 printk(KERN_ERR "ERROR: groups don't span domain->span\n"); 5268 5269 if (sd->parent && 5270 !cpumask_subset(groupmask, sched_domain_span(sd->parent))) 5271 printk(KERN_ERR "ERROR: parent span is not a superset " 5272 "of domain->span\n"); 5273 return 0; 5274} 5275 5276static void sched_domain_debug(struct sched_domain *sd, int cpu) 5277{ 5278 int level = 0; 5279 5280 if (!sched_debug_enabled) 5281 return; 5282 5283 if (!sd) { 5284 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); 5285 return; 5286 } 5287 5288 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 5289 5290 for (;;) { 5291 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) 5292 break; 5293 level++; 5294 sd = sd->parent; 5295 if (!sd) 5296 break; 5297 } 5298} 5299#else /* !CONFIG_SCHED_DEBUG */ 5300# define sched_domain_debug(sd, cpu) do { } while (0) 5301static inline bool sched_debug(void) 5302{ 5303 return false; 5304} 5305#endif /* CONFIG_SCHED_DEBUG */ 5306 5307static int sd_degenerate(struct sched_domain *sd) 5308{ 5309 if (cpumask_weight(sched_domain_span(sd)) == 1) 5310 return 1; 5311 5312 /* Following flags need at least 2 groups */ 5313 if (sd->flags & (SD_LOAD_BALANCE | 5314 SD_BALANCE_NEWIDLE | 5315 SD_BALANCE_FORK | 5316 SD_BALANCE_EXEC | 5317 SD_SHARE_CPUPOWER | 5318 SD_SHARE_PKG_RESOURCES)) { 5319 if (sd->groups != sd->groups->next) 5320 return 0; 5321 } 5322 5323 /* Following flags don't use groups */ 5324 if (sd->flags & (SD_WAKE_AFFINE)) 5325 return 0; 5326 5327 return 1; 5328} 5329 5330static int 5331sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) 5332{ 5333 unsigned long cflags = sd->flags, pflags = parent->flags; 5334 5335 if (sd_degenerate(parent)) 5336 return 1; 5337 5338 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) 5339 return 0; 5340 5341 /* Flags needing groups don't count if only 1 group in parent */ 5342 if (parent->groups == parent->groups->next) { 5343 pflags &= ~(SD_LOAD_BALANCE | 5344 SD_BALANCE_NEWIDLE | 5345 SD_BALANCE_FORK | 5346 SD_BALANCE_EXEC | 5347 SD_SHARE_CPUPOWER | 5348 SD_SHARE_PKG_RESOURCES); 5349 if (nr_node_ids == 1) 5350 pflags &= ~SD_SERIALIZE; 5351 } 5352 if (~cflags & pflags) 5353 return 0; 5354 5355 return 1; 5356} 5357 5358static void free_rootdomain(struct rcu_head *rcu) 5359{ 5360 struct root_domain *rd = container_of(rcu, struct root_domain, rcu); 5361 5362 cpupri_cleanup(&rd->cpupri); 5363 free_cpumask_var(rd->rto_mask); 5364 free_cpumask_var(rd->online); 5365 free_cpumask_var(rd->span); 5366 kfree(rd); 5367} 5368 
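/*
 * Sketch of the root-domain life cycle implemented around here (see
 * also cpu_attach_domain() further below): alloc_rootdomain() hands
 * out a refcounted root_domain, rq_attach_root() switches a runqueue
 * over to it and drops the old reference, and the old domain is freed
 * via call_rcu_sched() -> free_rootdomain() above once unused.
 * Illustrative use, with 'cpu' being any cpu of interest:
 *
 *	rd = alloc_rootdomain();
 *	if (rd)
 *		rq_attach_root(cpu_rq(cpu), rd);
 */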
5369static void rq_attach_root(struct rq *rq, struct root_domain *rd) 5370{ 5371 struct root_domain *old_rd = NULL; 5372 unsigned long flags; 5373 5374 raw_spin_lock_irqsave(&rq->lock, flags); 5375 5376 if (rq->rd) { 5377 old_rd = rq->rd; 5378 5379 if (cpumask_test_cpu(rq->cpu, old_rd->online)) 5380 set_rq_offline(rq); 5381 5382 cpumask_clear_cpu(rq->cpu, old_rd->span); 5383 5384 /* 5385 * If we dont want to free the old_rt yet then 5386 * set old_rd to NULL to skip the freeing later 5387 * in this function: 5388 */ 5389 if (!atomic_dec_and_test(&old_rd->refcount)) 5390 old_rd = NULL; 5391 } 5392 5393 atomic_inc(&rd->refcount); 5394 rq->rd = rd; 5395 5396 cpumask_set_cpu(rq->cpu, rd->span); 5397 if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) 5398 set_rq_online(rq); 5399 5400 raw_spin_unlock_irqrestore(&rq->lock, flags); 5401 5402 if (old_rd) 5403 call_rcu_sched(&old_rd->rcu, free_rootdomain); 5404} 5405 5406static int init_rootdomain(struct root_domain *rd) 5407{ 5408 memset(rd, 0, sizeof(*rd)); 5409 5410 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) 5411 goto out; 5412 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) 5413 goto free_span; 5414 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) 5415 goto free_online; 5416 5417 if (cpupri_init(&rd->cpupri) != 0) 5418 goto free_rto_mask; 5419 return 0; 5420 5421free_rto_mask: 5422 free_cpumask_var(rd->rto_mask); 5423free_online: 5424 free_cpumask_var(rd->online); 5425free_span: 5426 free_cpumask_var(rd->span); 5427out: 5428 return -ENOMEM; 5429} 5430 5431/* 5432 * By default the system creates a single root-domain with all cpus as 5433 * members (mimicking the global state we have today). 5434 */ 5435struct root_domain def_root_domain; 5436 5437static void init_defrootdomain(void) 5438{ 5439 init_rootdomain(&def_root_domain); 5440 5441 atomic_set(&def_root_domain.refcount, 1); 5442} 5443 5444static struct root_domain *alloc_rootdomain(void) 5445{ 5446 struct root_domain *rd; 5447 5448 rd = kmalloc(sizeof(*rd), GFP_KERNEL); 5449 if (!rd) 5450 return NULL; 5451 5452 if (init_rootdomain(rd) != 0) { 5453 kfree(rd); 5454 return NULL; 5455 } 5456 5457 return rd; 5458} 5459 5460static void free_sched_groups(struct sched_group *sg, int free_sgp) 5461{ 5462 struct sched_group *tmp, *first; 5463 5464 if (!sg) 5465 return; 5466 5467 first = sg; 5468 do { 5469 tmp = sg->next; 5470 5471 if (free_sgp && atomic_dec_and_test(&sg->sgp->ref)) 5472 kfree(sg->sgp); 5473 5474 kfree(sg); 5475 sg = tmp; 5476 } while (sg != first); 5477} 5478 5479static void free_sched_domain(struct rcu_head *rcu) 5480{ 5481 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); 5482 5483 /* 5484 * If its an overlapping domain it has private groups, iterate and 5485 * nuke them all. 5486 */ 5487 if (sd->flags & SD_OVERLAP) { 5488 free_sched_groups(sd->groups, 1); 5489 } else if (atomic_dec_and_test(&sd->groups->ref)) { 5490 kfree(sd->groups->sgp); 5491 kfree(sd->groups); 5492 } 5493 kfree(sd); 5494} 5495 5496static void destroy_sched_domain(struct sched_domain *sd, int cpu) 5497{ 5498 call_rcu(&sd->rcu, free_sched_domain); 5499} 5500 5501static void destroy_sched_domains(struct sched_domain *sd, int cpu) 5502{ 5503 for (; sd; sd = sd->parent) 5504 destroy_sched_domain(sd, cpu); 5505} 5506 5507/* 5508 * Keep a special pointer to the highest sched_domain that has 5509 * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this 5510 * allows us to avoid some pointer chasing select_idle_sibling(). 
5511 * 5512 * Also keep a unique ID per domain (we use the first cpu number in 5513 * the cpumask of the domain), this allows us to quickly tell if 5514 * two cpus are in the same cache domain, see cpus_share_cache(). 5515 */ 5516DEFINE_PER_CPU(struct sched_domain *, sd_llc); 5517DEFINE_PER_CPU(int, sd_llc_id); 5518 5519static void update_top_cache_domain(int cpu) 5520{ 5521 struct sched_domain *sd; 5522 int id = cpu; 5523 5524 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); 5525 if (sd) 5526 id = cpumask_first(sched_domain_span(sd)); 5527 5528 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); 5529 per_cpu(sd_llc_id, cpu) = id; 5530} 5531 5532/* 5533 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 5534 * hold the hotplug lock. 5535 */ 5536static void 5537cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) 5538{ 5539 struct rq *rq = cpu_rq(cpu); 5540 struct sched_domain *tmp; 5541 5542 /* Remove the sched domains which do not contribute to scheduling. */ 5543 for (tmp = sd; tmp; ) { 5544 struct sched_domain *parent = tmp->parent; 5545 if (!parent) 5546 break; 5547 5548 if (sd_parent_degenerate(tmp, parent)) { 5549 tmp->parent = parent->parent; 5550 if (parent->parent) 5551 parent->parent->child = tmp; 5552 destroy_sched_domain(parent, cpu); 5553 } else 5554 tmp = tmp->parent; 5555 } 5556 5557 if (sd && sd_degenerate(sd)) { 5558 tmp = sd; 5559 sd = sd->parent; 5560 destroy_sched_domain(tmp, cpu); 5561 if (sd) 5562 sd->child = NULL; 5563 } 5564 5565 sched_domain_debug(sd, cpu); 5566 5567 rq_attach_root(rq, rd); 5568 tmp = rq->sd; 5569 rcu_assign_pointer(rq->sd, sd); 5570 destroy_sched_domains(tmp, cpu); 5571 5572 update_top_cache_domain(cpu); 5573} 5574 5575/* cpus with isolated domains */ 5576static cpumask_var_t cpu_isolated_map; 5577 5578/* Setup the mask of cpus configured for isolated domains */ 5579static int __init isolated_cpu_setup(char *str) 5580{ 5581 alloc_bootmem_cpumask_var(&cpu_isolated_map); 5582 cpulist_parse(str, cpu_isolated_map); 5583 return 1; 5584} 5585 5586__setup("isolcpus=", isolated_cpu_setup); 5587 5588static const struct cpumask *cpu_cpu_mask(int cpu) 5589{ 5590 return cpumask_of_node(cpu_to_node(cpu)); 5591} 5592 5593struct sd_data { 5594 struct sched_domain **__percpu sd; 5595 struct sched_group **__percpu sg; 5596 struct sched_group_power **__percpu sgp; 5597}; 5598 5599struct s_data { 5600 struct sched_domain ** __percpu sd; 5601 struct root_domain *rd; 5602}; 5603 5604enum s_alloc { 5605 sa_rootdomain, 5606 sa_sd, 5607 sa_sd_storage, 5608 sa_none, 5609}; 5610 5611struct sched_domain_topology_level; 5612 5613typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); 5614typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); 5615 5616#define SDTL_OVERLAP 0x01 5617 5618struct sched_domain_topology_level { 5619 sched_domain_init_f init; 5620 sched_domain_mask_f mask; 5621 int flags; 5622 int numa_level; 5623 struct sd_data data; 5624}; 5625 5626/* 5627 * Build an iteration mask that can exclude certain CPUs from the upwards 5628 * domain traversal. 5629 * 5630 * Asymmetric node setups can result in situations where the domain tree is of 5631 * unequal depth, make sure to skip domains that already cover the entire 5632 * range. 5633 * 5634 * In that case build_sched_domains() will have terminated the iteration early 5635 * and our sibling sd spans will be empty. Domains should always include the 5636 * cpu they're built on, so check that. 
5637 * 5638 */ 5639static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) 5640{ 5641 const struct cpumask *span = sched_domain_span(sd); 5642 struct sd_data *sdd = sd->private; 5643 struct sched_domain *sibling; 5644 int i; 5645 5646 for_each_cpu(i, span) { 5647 sibling = *per_cpu_ptr(sdd->sd, i); 5648 if (!cpumask_test_cpu(i, sched_domain_span(sibling))) 5649 continue; 5650 5651 cpumask_set_cpu(i, sched_group_mask(sg)); 5652 } 5653} 5654 5655/* 5656 * Return the canonical balance cpu for this group, this is the first cpu 5657 * of this group that's also in the iteration mask. 5658 */ 5659int group_balance_cpu(struct sched_group *sg) 5660{ 5661 return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); 5662} 5663 5664static int 5665build_overlap_sched_groups(struct sched_domain *sd, int cpu) 5666{ 5667 struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg; 5668 const struct cpumask *span = sched_domain_span(sd); 5669 struct cpumask *covered = sched_domains_tmpmask; 5670 struct sd_data *sdd = sd->private; 5671 struct sched_domain *child; 5672 int i; 5673 5674 cpumask_clear(covered); 5675 5676 for_each_cpu(i, span) { 5677 struct cpumask *sg_span; 5678 5679 if (cpumask_test_cpu(i, covered)) 5680 continue; 5681 5682 child = *per_cpu_ptr(sdd->sd, i); 5683 5684 /* See the comment near build_group_mask(). */ 5685 if (!cpumask_test_cpu(i, sched_domain_span(child))) 5686 continue; 5687 5688 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 5689 GFP_KERNEL, cpu_to_node(cpu)); 5690 5691 if (!sg) 5692 goto fail; 5693 5694 sg_span = sched_group_cpus(sg); 5695 if (child->child) { 5696 child = child->child; 5697 cpumask_copy(sg_span, sched_domain_span(child)); 5698 } else 5699 cpumask_set_cpu(i, sg_span); 5700 5701 cpumask_or(covered, covered, sg_span); 5702 5703 sg->sgp = *per_cpu_ptr(sdd->sgp, i); 5704 if (atomic_inc_return(&sg->sgp->ref) == 1) 5705 build_group_mask(sd, sg); 5706 5707 /* 5708 * Initialize sgp->power such that even if we mess up the 5709 * domains and no possible iteration will get us here, we won't 5710 * die on a /0 trap. 5711 */ 5712 sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span); 5713 5714 /* 5715 * Make sure the first group of this domain contains the 5716 * canonical balance cpu. Otherwise the sched_domain iteration 5717 * breaks. See update_sg_lb_stats(). 5718 */ 5719 if ((!groups && cpumask_test_cpu(cpu, sg_span)) || 5720 group_balance_cpu(sg) == cpu) 5721 groups = sg; 5722 5723 if (!first) 5724 first = sg; 5725 if (last) 5726 last->next = sg; 5727 last = sg; 5728 last->next = first; 5729 } 5730 sd->groups = groups; 5731 5732 return 0; 5733 5734fail: 5735 free_sched_groups(first, 0); 5736 5737 return -ENOMEM; 5738} 5739 5740static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) 5741{ 5742 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); 5743 struct sched_domain *child = sd->child; 5744 5745 if (child) 5746 cpu = cpumask_first(sched_domain_span(child)); 5747 5748 if (sg) { 5749 *sg = *per_cpu_ptr(sdd->sg, cpu); 5750 (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu); 5751 atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */ 5752 } 5753 5754 return cpu; 5755} 5756 5757/* 5758 * build_sched_groups will build a circular linked list of the groups 5759 * covered by the given span, and will set each group's ->cpumask correctly, 5760 * and ->cpu_power to 0. 
5761 * 5762 * Assumes the sched_domain tree is fully constructed 5763 */ 5764static int 5765build_sched_groups(struct sched_domain *sd, int cpu) 5766{ 5767 struct sched_group *first = NULL, *last = NULL; 5768 struct sd_data *sdd = sd->private; 5769 const struct cpumask *span = sched_domain_span(sd); 5770 struct cpumask *covered; 5771 int i; 5772 5773 get_group(cpu, sdd, &sd->groups); 5774 atomic_inc(&sd->groups->ref); 5775 5776 if (cpu != cpumask_first(sched_domain_span(sd))) 5777 return 0; 5778 5779 lockdep_assert_held(&sched_domains_mutex); 5780 covered = sched_domains_tmpmask; 5781 5782 cpumask_clear(covered); 5783 5784 for_each_cpu(i, span) { 5785 struct sched_group *sg; 5786 int group = get_group(i, sdd, &sg); 5787 int j; 5788 5789 if (cpumask_test_cpu(i, covered)) 5790 continue; 5791 5792 cpumask_clear(sched_group_cpus(sg)); 5793 sg->sgp->power = 0; 5794 cpumask_setall(sched_group_mask(sg)); 5795 5796 for_each_cpu(j, span) { 5797 if (get_group(j, sdd, NULL) != group) 5798 continue; 5799 5800 cpumask_set_cpu(j, covered); 5801 cpumask_set_cpu(j, sched_group_cpus(sg)); 5802 } 5803 5804 if (!first) 5805 first = sg; 5806 if (last) 5807 last->next = sg; 5808 last = sg; 5809 } 5810 last->next = first; 5811 5812 return 0; 5813} 5814 5815/* 5816 * Initialize sched groups cpu_power. 5817 * 5818 * cpu_power indicates the capacity of sched group, which is used while 5819 * distributing the load between different sched groups in a sched domain. 5820 * Typically cpu_power for all the groups in a sched domain will be same unless 5821 * there are asymmetries in the topology. If there are asymmetries, group 5822 * having more cpu_power will pickup more load compared to the group having 5823 * less cpu_power. 5824 */ 5825static void init_sched_groups_power(int cpu, struct sched_domain *sd) 5826{ 5827 struct sched_group *sg = sd->groups; 5828 5829 WARN_ON(!sd || !sg); 5830 5831 do { 5832 sg->group_weight = cpumask_weight(sched_group_cpus(sg)); 5833 sg = sg->next; 5834 } while (sg != sd->groups); 5835 5836 if (cpu != group_balance_cpu(sg)) 5837 return; 5838 5839 update_group_power(sd, cpu); 5840 atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight); 5841} 5842 5843int __weak arch_sd_sibling_asym_packing(void) 5844{ 5845 return 0*SD_ASYM_PACKING; 5846} 5847 5848/* 5849 * Initializers for schedule domains 5850 * Non-inlined to reduce accumulated stack pressure in build_sched_domains() 5851 */ 5852 5853#ifdef CONFIG_SCHED_DEBUG 5854# define SD_INIT_NAME(sd, type) sd->name = #type 5855#else 5856# define SD_INIT_NAME(sd, type) do { } while (0) 5857#endif 5858 5859#define SD_INIT_FUNC(type) \ 5860static noinline struct sched_domain * \ 5861sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ 5862{ \ 5863 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \ 5864 *sd = SD_##type##_INIT; \ 5865 SD_INIT_NAME(sd, type); \ 5866 sd->private = &tl->data; \ 5867 return sd; \ 5868} 5869 5870SD_INIT_FUNC(CPU) 5871#ifdef CONFIG_SCHED_SMT 5872 SD_INIT_FUNC(SIBLING) 5873#endif 5874#ifdef CONFIG_SCHED_MC 5875 SD_INIT_FUNC(MC) 5876#endif 5877#ifdef CONFIG_SCHED_BOOK 5878 SD_INIT_FUNC(BOOK) 5879#endif 5880 5881static int default_relax_domain_level = -1; 5882int sched_domain_level_max; 5883 5884static int __init setup_relax_domain_level(char *str) 5885{ 5886 if (kstrtoint(str, 0, &default_relax_domain_level)) 5887 pr_warn("Unable to set relax_domain_level\n"); 5888 5889 return 1; 5890} 5891__setup("relax_domain_level=", setup_relax_domain_level); 5892 5893static void set_domain_attribute(struct sched_domain 
*sd, 5894 struct sched_domain_attr *attr) 5895{ 5896 int request; 5897 5898 if (!attr || attr->relax_domain_level < 0) { 5899 if (default_relax_domain_level < 0) 5900 return; 5901 else 5902 request = default_relax_domain_level; 5903 } else 5904 request = attr->relax_domain_level; 5905 if (request < sd->level) { 5906 /* turn off idle balance on this domain */ 5907 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); 5908 } else { 5909 /* turn on idle balance on this domain */ 5910 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); 5911 } 5912} 5913 5914static void __sdt_free(const struct cpumask *cpu_map); 5915static int __sdt_alloc(const struct cpumask *cpu_map); 5916 5917static void __free_domain_allocs(struct s_data *d, enum s_alloc what, 5918 const struct cpumask *cpu_map) 5919{ 5920 switch (what) { 5921 case sa_rootdomain: 5922 if (!atomic_read(&d->rd->refcount)) 5923 free_rootdomain(&d->rd->rcu); /* fall through */ 5924 case sa_sd: 5925 free_percpu(d->sd); /* fall through */ 5926 case sa_sd_storage: 5927 __sdt_free(cpu_map); /* fall through */ 5928 case sa_none: 5929 break; 5930 } 5931} 5932 5933static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, 5934 const struct cpumask *cpu_map) 5935{ 5936 memset(d, 0, sizeof(*d)); 5937 5938 if (__sdt_alloc(cpu_map)) 5939 return sa_sd_storage; 5940 d->sd = alloc_percpu(struct sched_domain *); 5941 if (!d->sd) 5942 return sa_sd_storage; 5943 d->rd = alloc_rootdomain(); 5944 if (!d->rd) 5945 return sa_sd; 5946 return sa_rootdomain; 5947} 5948 5949/* 5950 * NULL the sd_data elements we've used to build the sched_domain and 5951 * sched_group structure so that the subsequent __free_domain_allocs() 5952 * will not free the data we're using. 5953 */ 5954static void claim_allocations(int cpu, struct sched_domain *sd) 5955{ 5956 struct sd_data *sdd = sd->private; 5957 5958 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); 5959 *per_cpu_ptr(sdd->sd, cpu) = NULL; 5960 5961 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) 5962 *per_cpu_ptr(sdd->sg, cpu) = NULL; 5963 5964 if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref)) 5965 *per_cpu_ptr(sdd->sgp, cpu) = NULL; 5966} 5967 5968#ifdef CONFIG_SCHED_SMT 5969static const struct cpumask *cpu_smt_mask(int cpu) 5970{ 5971 return topology_thread_cpumask(cpu); 5972} 5973#endif 5974 5975/* 5976 * Topology list, bottom-up. 
5977 */ 5978static struct sched_domain_topology_level default_topology[] = { 5979#ifdef CONFIG_SCHED_SMT 5980 { sd_init_SIBLING, cpu_smt_mask, }, 5981#endif 5982#ifdef CONFIG_SCHED_MC 5983 { sd_init_MC, cpu_coregroup_mask, }, 5984#endif 5985#ifdef CONFIG_SCHED_BOOK 5986 { sd_init_BOOK, cpu_book_mask, }, 5987#endif 5988 { sd_init_CPU, cpu_cpu_mask, }, 5989 { NULL, }, 5990}; 5991 5992static struct sched_domain_topology_level *sched_domain_topology = default_topology; 5993 5994#ifdef CONFIG_NUMA 5995 5996static int sched_domains_numa_levels; 5997static int *sched_domains_numa_distance; 5998static struct cpumask ***sched_domains_numa_masks; 5999static int sched_domains_curr_level; 6000 6001static inline int sd_local_flags(int level) 6002{ 6003 if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE) 6004 return 0; 6005 6006 return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; 6007} 6008 6009static struct sched_domain * 6010sd_numa_init(struct sched_domain_topology_level *tl, int cpu) 6011{ 6012 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); 6013 int level = tl->numa_level; 6014 int sd_weight = cpumask_weight( 6015 sched_domains_numa_masks[level][cpu_to_node(cpu)]); 6016 6017 *sd = (struct sched_domain){ 6018 .min_interval = sd_weight, 6019 .max_interval = 2*sd_weight, 6020 .busy_factor = 32, 6021 .imbalance_pct = 125, 6022 .cache_nice_tries = 2, 6023 .busy_idx = 3, 6024 .idle_idx = 2, 6025 .newidle_idx = 0, 6026 .wake_idx = 0, 6027 .forkexec_idx = 0, 6028 6029 .flags = 1*SD_LOAD_BALANCE 6030 | 1*SD_BALANCE_NEWIDLE 6031 | 0*SD_BALANCE_EXEC 6032 | 0*SD_BALANCE_FORK 6033 | 0*SD_BALANCE_WAKE 6034 | 0*SD_WAKE_AFFINE 6035 | 0*SD_SHARE_CPUPOWER 6036 | 0*SD_SHARE_PKG_RESOURCES 6037 | 1*SD_SERIALIZE 6038 | 0*SD_PREFER_SIBLING 6039 | sd_local_flags(level) 6040 , 6041 .last_balance = jiffies, 6042 .balance_interval = sd_weight, 6043 }; 6044 SD_INIT_NAME(sd, NUMA); 6045 sd->private = &tl->data; 6046 6047 /* 6048 * Ugly hack to pass state to sd_numa_mask()... 6049 */ 6050 sched_domains_curr_level = tl->numa_level; 6051 6052 return sd; 6053} 6054 6055static const struct cpumask *sd_numa_mask(int cpu) 6056{ 6057 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; 6058} 6059 6060static void sched_numa_warn(const char *str) 6061{ 6062 static int done = false; 6063 int i,j; 6064 6065 if (done) 6066 return; 6067 6068 done = true; 6069 6070 printk(KERN_WARNING "ERROR: %s\n\n", str); 6071 6072 for (i = 0; i < nr_node_ids; i++) { 6073 printk(KERN_WARNING " "); 6074 for (j = 0; j < nr_node_ids; j++) 6075 printk(KERN_CONT "%02d ", node_distance(i,j)); 6076 printk(KERN_CONT "\n"); 6077 } 6078 printk(KERN_WARNING "\n"); 6079} 6080 6081static bool find_numa_distance(int distance) 6082{ 6083 int i; 6084 6085 if (distance == node_distance(0, 0)) 6086 return true; 6087 6088 for (i = 0; i < sched_domains_numa_levels; i++) { 6089 if (sched_domains_numa_distance[i] == distance) 6090 return true; 6091 } 6092 6093 return false; 6094} 6095 6096static void sched_init_numa(void) 6097{ 6098 int next_distance, curr_distance = node_distance(0, 0); 6099 struct sched_domain_topology_level *tl; 6100 int level = 0; 6101 int i, j, k; 6102 6103 sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); 6104 if (!sched_domains_numa_distance) 6105 return; 6106 6107 /* 6108 * O(nr_nodes^2) deduplicating selection sort -- in order to find the 6109 * unique distances in the node_distance() table. 
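	 *
	 * Illustrative example (made-up distances): for the table
	 *
	 *	10 20 30
	 *	20 10 20
	 *	30 20 10
	 *
	 * the scan finds the unique non-identity distances 20 and 30,
	 * i.e. two additional NUMA levels beyond the local distance
	 * of 10.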
6110 * 6111 * Assumes node_distance(0,j) includes all distances in 6112 * node_distance(i,j) in order to avoid cubic time. 6113 */ 6114 next_distance = curr_distance; 6115 for (i = 0; i < nr_node_ids; i++) { 6116 for (j = 0; j < nr_node_ids; j++) { 6117 for (k = 0; k < nr_node_ids; k++) { 6118 int distance = node_distance(i, k); 6119 6120 if (distance > curr_distance && 6121 (distance < next_distance || 6122 next_distance == curr_distance)) 6123 next_distance = distance; 6124 6125 /* 6126 * While not a strong assumption it would be nice to know 6127 * about cases where if node A is connected to B, B is not 6128 * equally connected to A. 6129 */ 6130 if (sched_debug() && node_distance(k, i) != distance) 6131 sched_numa_warn("Node-distance not symmetric"); 6132 6133 if (sched_debug() && i && !find_numa_distance(distance)) 6134 sched_numa_warn("Node-0 not representative"); 6135 } 6136 if (next_distance != curr_distance) { 6137 sched_domains_numa_distance[level++] = next_distance; 6138 sched_domains_numa_levels = level; 6139 curr_distance = next_distance; 6140 } else break; 6141 } 6142 6143 /* 6144 * In case of sched_debug() we verify the above assumption. 6145 */ 6146 if (!sched_debug()) 6147 break; 6148 } 6149 /* 6150 * 'level' contains the number of unique distances, excluding the 6151 * identity distance node_distance(i,i). 6152 * 6153 * The sched_domains_nume_distance[] array includes the actual distance 6154 * numbers. 6155 */ 6156 6157 /* 6158 * Here, we should temporarily reset sched_domains_numa_levels to 0. 6159 * If it fails to allocate memory for array sched_domains_numa_masks[][], 6160 * the array will contain less then 'level' members. This could be 6161 * dangerous when we use it to iterate array sched_domains_numa_masks[][] 6162 * in other functions. 6163 * 6164 * We reset it to 'level' at the end of this function. 6165 */ 6166 sched_domains_numa_levels = 0; 6167 6168 sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); 6169 if (!sched_domains_numa_masks) 6170 return; 6171 6172 /* 6173 * Now for each level, construct a mask per node which contains all 6174 * cpus of nodes that are that many hops away from us. 6175 */ 6176 for (i = 0; i < level; i++) { 6177 sched_domains_numa_masks[i] = 6178 kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); 6179 if (!sched_domains_numa_masks[i]) 6180 return; 6181 6182 for (j = 0; j < nr_node_ids; j++) { 6183 struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); 6184 if (!mask) 6185 return; 6186 6187 sched_domains_numa_masks[i][j] = mask; 6188 6189 for (k = 0; k < nr_node_ids; k++) { 6190 if (node_distance(j, k) > sched_domains_numa_distance[i]) 6191 continue; 6192 6193 cpumask_or(mask, mask, cpumask_of_node(k)); 6194 } 6195 } 6196 } 6197 6198 tl = kzalloc((ARRAY_SIZE(default_topology) + level) * 6199 sizeof(struct sched_domain_topology_level), GFP_KERNEL); 6200 if (!tl) 6201 return; 6202 6203 /* 6204 * Copy the default topology bits.. 6205 */ 6206 for (i = 0; default_topology[i].init; i++) 6207 tl[i] = default_topology[i]; 6208 6209 /* 6210 * .. and append 'j' levels of NUMA goodness. 
6211 */ 6212 for (j = 0; j < level; i++, j++) { 6213 tl[i] = (struct sched_domain_topology_level){ 6214 .init = sd_numa_init, 6215 .mask = sd_numa_mask, 6216 .flags = SDTL_OVERLAP, 6217 .numa_level = j, 6218 }; 6219 } 6220 6221 sched_domain_topology = tl; 6222 6223 sched_domains_numa_levels = level; 6224} 6225 6226static void sched_domains_numa_masks_set(int cpu) 6227{ 6228 int i, j; 6229 int node = cpu_to_node(cpu); 6230 6231 for (i = 0; i < sched_domains_numa_levels; i++) { 6232 for (j = 0; j < nr_node_ids; j++) { 6233 if (node_distance(j, node) <= sched_domains_numa_distance[i]) 6234 cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); 6235 } 6236 } 6237} 6238 6239static void sched_domains_numa_masks_clear(int cpu) 6240{ 6241 int i, j; 6242 for (i = 0; i < sched_domains_numa_levels; i++) { 6243 for (j = 0; j < nr_node_ids; j++) 6244 cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); 6245 } 6246} 6247 6248/* 6249 * Update sched_domains_numa_masks[level][node] array when new cpus 6250 * are onlined. 6251 */ 6252static int sched_domains_numa_masks_update(struct notifier_block *nfb, 6253 unsigned long action, 6254 void *hcpu) 6255{ 6256 int cpu = (long)hcpu; 6257 6258 switch (action & ~CPU_TASKS_FROZEN) { 6259 case CPU_ONLINE: 6260 sched_domains_numa_masks_set(cpu); 6261 break; 6262 6263 case CPU_DEAD: 6264 sched_domains_numa_masks_clear(cpu); 6265 break; 6266 6267 default: 6268 return NOTIFY_DONE; 6269 } 6270 6271 return NOTIFY_OK; 6272} 6273#else 6274static inline void sched_init_numa(void) 6275{ 6276} 6277 6278static int sched_domains_numa_masks_update(struct notifier_block *nfb, 6279 unsigned long action, 6280 void *hcpu) 6281{ 6282 return 0; 6283} 6284#endif /* CONFIG_NUMA */ 6285 6286static int __sdt_alloc(const struct cpumask *cpu_map) 6287{ 6288 struct sched_domain_topology_level *tl; 6289 int j; 6290 6291 for (tl = sched_domain_topology; tl->init; tl++) { 6292 struct sd_data *sdd = &tl->data; 6293 6294 sdd->sd = alloc_percpu(struct sched_domain *); 6295 if (!sdd->sd) 6296 return -ENOMEM; 6297 6298 sdd->sg = alloc_percpu(struct sched_group *); 6299 if (!sdd->sg) 6300 return -ENOMEM; 6301 6302 sdd->sgp = alloc_percpu(struct sched_group_power *); 6303 if (!sdd->sgp) 6304 return -ENOMEM; 6305 6306 for_each_cpu(j, cpu_map) { 6307 struct sched_domain *sd; 6308 struct sched_group *sg; 6309 struct sched_group_power *sgp; 6310 6311 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), 6312 GFP_KERNEL, cpu_to_node(j)); 6313 if (!sd) 6314 return -ENOMEM; 6315 6316 *per_cpu_ptr(sdd->sd, j) = sd; 6317 6318 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 6319 GFP_KERNEL, cpu_to_node(j)); 6320 if (!sg) 6321 return -ENOMEM; 6322 6323 sg->next = sg; 6324 6325 *per_cpu_ptr(sdd->sg, j) = sg; 6326 6327 sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(), 6328 GFP_KERNEL, cpu_to_node(j)); 6329 if (!sgp) 6330 return -ENOMEM; 6331 6332 *per_cpu_ptr(sdd->sgp, j) = sgp; 6333 } 6334 } 6335 6336 return 0; 6337} 6338 6339static void __sdt_free(const struct cpumask *cpu_map) 6340{ 6341 struct sched_domain_topology_level *tl; 6342 int j; 6343 6344 for (tl = sched_domain_topology; tl->init; tl++) { 6345 struct sd_data *sdd = &tl->data; 6346 6347 for_each_cpu(j, cpu_map) { 6348 struct sched_domain *sd; 6349 6350 if (sdd->sd) { 6351 sd = *per_cpu_ptr(sdd->sd, j); 6352 if (sd && (sd->flags & SD_OVERLAP)) 6353 free_sched_groups(sd->groups, 0); 6354 kfree(*per_cpu_ptr(sdd->sd, j)); 6355 } 6356 6357 if (sdd->sg) 6358 kfree(*per_cpu_ptr(sdd->sg, j)); 6359 if (sdd->sgp) 
6360 kfree(*per_cpu_ptr(sdd->sgp, j)); 6361 } 6362 free_percpu(sdd->sd); 6363 sdd->sd = NULL; 6364 free_percpu(sdd->sg); 6365 sdd->sg = NULL; 6366 free_percpu(sdd->sgp); 6367 sdd->sgp = NULL; 6368 } 6369} 6370 6371struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, 6372 struct s_data *d, const struct cpumask *cpu_map, 6373 struct sched_domain_attr *attr, struct sched_domain *child, 6374 int cpu) 6375{ 6376 struct sched_domain *sd = tl->init(tl, cpu); 6377 if (!sd) 6378 return child; 6379 6380 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); 6381 if (child) { 6382 sd->level = child->level + 1; 6383 sched_domain_level_max = max(sched_domain_level_max, sd->level); 6384 child->parent = sd; 6385 } 6386 sd->child = child; 6387 set_domain_attribute(sd, attr); 6388 6389 return sd; 6390} 6391 6392/* 6393 * Build sched domains for a given set of cpus and attach the sched domains 6394 * to the individual cpus. 6395 */ 6396static int build_sched_domains(const struct cpumask *cpu_map, 6397 struct sched_domain_attr *attr) 6398{ 6399 enum s_alloc alloc_state = sa_none; 6400 struct sched_domain *sd; 6401 struct s_data d; 6402 int i, ret = -ENOMEM; 6403 6404 alloc_state = __visit_domain_allocation_hell(&d, cpu_map); 6405 if (alloc_state != sa_rootdomain) 6406 goto error; 6407 6408 /* Set up domains for cpus specified by the cpu_map. */ 6409 for_each_cpu(i, cpu_map) { 6410 struct sched_domain_topology_level *tl; 6411 6412 sd = NULL; 6413 for (tl = sched_domain_topology; tl->init; tl++) { 6414 sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); 6415 if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) 6416 sd->flags |= SD_OVERLAP; 6417 if (cpumask_equal(cpu_map, sched_domain_span(sd))) 6418 break; 6419 } 6420 6421 while (sd->child) 6422 sd = sd->child; 6423 6424 *per_cpu_ptr(d.sd, i) = sd; 6425 } 6426 6427 /* Build the groups for the domains */ 6428 for_each_cpu(i, cpu_map) { 6429 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { 6430 sd->span_weight = cpumask_weight(sched_domain_span(sd)); 6431 if (sd->flags & SD_OVERLAP) { 6432 if (build_overlap_sched_groups(sd, i)) 6433 goto error; 6434 } else { 6435 if (build_sched_groups(sd, i)) 6436 goto error; 6437 } 6438 } 6439 } 6440 6441 /* Calculate CPU power for physical packages and nodes */ 6442 for (i = nr_cpumask_bits-1; i >= 0; i--) { 6443 if (!cpumask_test_cpu(i, cpu_map)) 6444 continue; 6445 6446 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { 6447 claim_allocations(i, sd); 6448 init_sched_groups_power(i, sd); 6449 } 6450 } 6451 6452 /* Attach the domains */ 6453 rcu_read_lock(); 6454 for_each_cpu(i, cpu_map) { 6455 sd = *per_cpu_ptr(d.sd, i); 6456 cpu_attach_domain(sd, d.rd, i); 6457 } 6458 rcu_read_unlock(); 6459 6460 ret = 0; 6461error: 6462 __free_domain_allocs(&d, alloc_state, cpu_map); 6463 return ret; 6464} 6465 6466static cpumask_var_t *doms_cur; /* current sched domains */ 6467static int ndoms_cur; /* number of sched domains in 'doms_cur' */ 6468static struct sched_domain_attr *dattr_cur; 6469 /* attributes of custom domains in 'doms_cur' */ 6470 6471/* 6472 * Special case: If a kmalloc of a doms_cur partition (array of 6473 * cpumask) fails, then fall back to a single sched domain, 6474 * as determined by the single cpumask fallback_doms. 6475 */ 6476static cpumask_var_t fallback_doms; 6477 6478/* 6479 * arch_update_cpu_topology lets virtualized architectures update the 6480 * cpu core maps. It is supposed to return 1 if the topology changed 6481 * or 0 if it stayed the same.
6482 */ 6483int __attribute__((weak)) arch_update_cpu_topology(void) 6484{ 6485 return 0; 6486} 6487 6488cpumask_var_t *alloc_sched_domains(unsigned int ndoms) 6489{ 6490 int i; 6491 cpumask_var_t *doms; 6492 6493 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); 6494 if (!doms) 6495 return NULL; 6496 for (i = 0; i < ndoms; i++) { 6497 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { 6498 free_sched_domains(doms, i); 6499 return NULL; 6500 } 6501 } 6502 return doms; 6503} 6504 6505void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) 6506{ 6507 unsigned int i; 6508 for (i = 0; i < ndoms; i++) 6509 free_cpumask_var(doms[i]); 6510 kfree(doms); 6511} 6512 6513/* 6514 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 6515 * For now this just excludes isolated cpus, but could be used to 6516 * exclude other special cases in the future. 6517 */ 6518static int init_sched_domains(const struct cpumask *cpu_map) 6519{ 6520 int err; 6521 6522 arch_update_cpu_topology(); 6523 ndoms_cur = 1; 6524 doms_cur = alloc_sched_domains(ndoms_cur); 6525 if (!doms_cur) 6526 doms_cur = &fallback_doms; 6527 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); 6528 err = build_sched_domains(doms_cur[0], NULL); 6529 register_sched_domain_sysctl(); 6530 6531 return err; 6532} 6533 6534/* 6535 * Detach sched domains from a group of cpus specified in cpu_map 6536 * These cpus will now be attached to the NULL domain 6537 */ 6538static void detach_destroy_domains(const struct cpumask *cpu_map) 6539{ 6540 int i; 6541 6542 rcu_read_lock(); 6543 for_each_cpu(i, cpu_map) 6544 cpu_attach_domain(NULL, &def_root_domain, i); 6545 rcu_read_unlock(); 6546} 6547 6548/* handle null as "default" */ 6549static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, 6550 struct sched_domain_attr *new, int idx_new) 6551{ 6552 struct sched_domain_attr tmp; 6553 6554 /* fast path */ 6555 if (!new && !cur) 6556 return 1; 6557 6558 tmp = SD_ATTR_INIT; 6559 return !memcmp(cur ? (cur + idx_cur) : &tmp, 6560 new ? (new + idx_new) : &tmp, 6561 sizeof(struct sched_domain_attr)); 6562} 6563 6564/* 6565 * Partition sched domains as specified by the 'ndoms_new' 6566 * cpumasks in the array doms_new[] of cpumasks. This compares 6567 * doms_new[] to the current sched domain partitioning, doms_cur[]. 6568 * It destroys each deleted domain and builds each new domain. 6569 * 6570 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. 6571 * The masks don't intersect (don't overlap.) We should setup one 6572 * sched domain for each mask. CPUs not in any of the cpumasks will 6573 * not be load balanced. If the same cpumask appears both in the 6574 * current 'doms_cur' domains and in the new 'doms_new', we can leave 6575 * it as it is. 6576 * 6577 * The passed in 'doms_new' should be allocated using 6578 * alloc_sched_domains. This routine takes ownership of it and will 6579 * free_sched_domains it when done with it. If the caller failed the 6580 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, 6581 * and partition_sched_domains() will fallback to the single partition 6582 * 'fallback_doms', it also forces the domains to be rebuilt. 6583 * 6584 * If doms_new == NULL it will be replaced with cpu_online_mask. 6585 * ndoms_new == 0 is a special case for destroying existing domains, 6586 * and it will not create the default domain. 
6587 * 6588 * Call with hotplug lock held 6589 */ 6590void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], 6591 struct sched_domain_attr *dattr_new) 6592{ 6593 int i, j, n; 6594 int new_topology; 6595 6596 mutex_lock(&sched_domains_mutex); 6597 6598 /* always unregister in case we don't destroy any domains */ 6599 unregister_sched_domain_sysctl(); 6600 6601 /* Let architecture update cpu core mappings. */ 6602 new_topology = arch_update_cpu_topology(); 6603 6604 n = doms_new ? ndoms_new : 0; 6605 6606 /* Destroy deleted domains */ 6607 for (i = 0; i < ndoms_cur; i++) { 6608 for (j = 0; j < n && !new_topology; j++) { 6609 if (cpumask_equal(doms_cur[i], doms_new[j]) 6610 && dattrs_equal(dattr_cur, i, dattr_new, j)) 6611 goto match1; 6612 } 6613 /* no match - a current sched domain not in new doms_new[] */ 6614 detach_destroy_domains(doms_cur[i]); 6615match1: 6616 ; 6617 } 6618 6619 if (doms_new == NULL) { 6620 ndoms_cur = 0; 6621 doms_new = &fallback_doms; 6622 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); 6623 WARN_ON_ONCE(dattr_new); 6624 } 6625 6626 /* Build new domains */ 6627 for (i = 0; i < ndoms_new; i++) { 6628 for (j = 0; j < ndoms_cur && !new_topology; j++) { 6629 if (cpumask_equal(doms_new[i], doms_cur[j]) 6630 && dattrs_equal(dattr_new, i, dattr_cur, j)) 6631 goto match2; 6632 } 6633 /* no match - add a new doms_new */ 6634 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); 6635match2: 6636 ; 6637 } 6638 6639 /* Remember the new sched domains */ 6640 if (doms_cur != &fallback_doms) 6641 free_sched_domains(doms_cur, ndoms_cur); 6642 kfree(dattr_cur); /* kfree(NULL) is safe */ 6643 doms_cur = doms_new; 6644 dattr_cur = dattr_new; 6645 ndoms_cur = ndoms_new; 6646 6647 register_sched_domain_sysctl(); 6648 6649 mutex_unlock(&sched_domains_mutex); 6650} 6651 6652static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */ 6653 6654/* 6655 * Update cpusets according to cpu_active mask. If cpusets are 6656 * disabled, cpuset_update_active_cpus() becomes a simple wrapper 6657 * around partition_sched_domains(). 6658 * 6659 * If we come here as part of a suspend/resume, don't touch cpusets because we 6660 * want to restore it back to its original state upon resume anyway. 6661 */ 6662static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, 6663 void *hcpu) 6664{ 6665 switch (action) { 6666 case CPU_ONLINE_FROZEN: 6667 case CPU_DOWN_FAILED_FROZEN: 6668 6669 /* 6670 * num_cpus_frozen tracks how many CPUs are involved in suspend 6671 * resume sequence. As long as this is not the last online 6672 * operation in the resume sequence, just build a single sched 6673 * domain, ignoring cpusets. 6674 */ 6675 num_cpus_frozen--; 6676 if (likely(num_cpus_frozen)) { 6677 partition_sched_domains(1, NULL, NULL); 6678 break; 6679 } 6680 6681 /* 6682 * This is the last CPU online operation. So fall through and 6683 * restore the original sched domains by considering the 6684 * cpuset configurations. 
6685 */ 6686 6687 case CPU_ONLINE: 6688 case CPU_DOWN_FAILED: 6689 cpuset_update_active_cpus(true); 6690 break; 6691 default: 6692 return NOTIFY_DONE; 6693 } 6694 return NOTIFY_OK; 6695} 6696 6697static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, 6698 void *hcpu) 6699{ 6700 switch (action) { 6701 case CPU_DOWN_PREPARE: 6702 cpuset_update_active_cpus(false); 6703 break; 6704 case CPU_DOWN_PREPARE_FROZEN: 6705 num_cpus_frozen++; 6706 partition_sched_domains(1, NULL, NULL); 6707 break; 6708 default: 6709 return NOTIFY_DONE; 6710 } 6711 return NOTIFY_OK; 6712} 6713 6714void __init sched_init_smp(void) 6715{ 6716 cpumask_var_t non_isolated_cpus; 6717 6718 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 6719 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 6720 6721 sched_init_numa(); 6722 6723 get_online_cpus(); 6724 mutex_lock(&sched_domains_mutex); 6725 init_sched_domains(cpu_active_mask); 6726 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 6727 if (cpumask_empty(non_isolated_cpus)) 6728 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 6729 mutex_unlock(&sched_domains_mutex); 6730 put_online_cpus(); 6731 6732 hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE); 6733 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); 6734 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); 6735 6736 /* RT runtime code needs to handle some hotplug events */ 6737 hotcpu_notifier(update_runtime, 0); 6738 6739 init_hrtick(); 6740 6741 /* Move init over to a non-isolated CPU */ 6742 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) 6743 BUG(); 6744 sched_init_granularity(); 6745 free_cpumask_var(non_isolated_cpus); 6746 6747 init_sched_rt_class(); 6748} 6749#else 6750void __init sched_init_smp(void) 6751{ 6752 sched_init_granularity(); 6753} 6754#endif /* CONFIG_SMP */ 6755 6756const_debug unsigned int sysctl_timer_migration = 1; 6757 6758int in_sched_functions(unsigned long addr) 6759{ 6760 return in_lock_functions(addr) || 6761 (addr >= (unsigned long)__sched_text_start 6762 && addr < (unsigned long)__sched_text_end); 6763} 6764 6765#ifdef CONFIG_CGROUP_SCHED 6766struct task_group root_task_group; 6767LIST_HEAD(task_groups); 6768#endif 6769 6770DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 6771 6772void __init sched_init(void) 6773{ 6774 int i, j; 6775 unsigned long alloc_size = 0, ptr; 6776 6777#ifdef CONFIG_FAIR_GROUP_SCHED 6778 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 6779#endif 6780#ifdef CONFIG_RT_GROUP_SCHED 6781 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 6782#endif 6783#ifdef CONFIG_CPUMASK_OFFSTACK 6784 alloc_size += num_possible_cpus() * cpumask_size(); 6785#endif 6786 if (alloc_size) { 6787 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); 6788 6789#ifdef CONFIG_FAIR_GROUP_SCHED 6790 root_task_group.se = (struct sched_entity **)ptr; 6791 ptr += nr_cpu_ids * sizeof(void **); 6792 6793 root_task_group.cfs_rq = (struct cfs_rq **)ptr; 6794 ptr += nr_cpu_ids * sizeof(void **); 6795 6796#endif /* CONFIG_FAIR_GROUP_SCHED */ 6797#ifdef CONFIG_RT_GROUP_SCHED 6798 root_task_group.rt_se = (struct sched_rt_entity **)ptr; 6799 ptr += nr_cpu_ids * sizeof(void **); 6800 6801 root_task_group.rt_rq = (struct rt_rq **)ptr; 6802 ptr += nr_cpu_ids * sizeof(void **); 6803 6804#endif /* CONFIG_RT_GROUP_SCHED */ 6805#ifdef CONFIG_CPUMASK_OFFSTACK 6806 for_each_possible_cpu(i) { 6807 per_cpu(load_balance_tmpmask, i) = (void *)ptr; 6808 ptr += cpumask_size(); 6809 } 6810#endif /* 
CONFIG_CPUMASK_OFFSTACK */ 6811 } 6812 6813#ifdef CONFIG_SMP 6814 init_defrootdomain(); 6815#endif 6816 6817 init_rt_bandwidth(&def_rt_bandwidth, 6818 global_rt_period(), global_rt_runtime()); 6819 6820#ifdef CONFIG_RT_GROUP_SCHED 6821 init_rt_bandwidth(&root_task_group.rt_bandwidth, 6822 global_rt_period(), global_rt_runtime()); 6823#endif /* CONFIG_RT_GROUP_SCHED */ 6824 6825#ifdef CONFIG_CGROUP_SCHED 6826 list_add(&root_task_group.list, &task_groups); 6827 INIT_LIST_HEAD(&root_task_group.children); 6828 INIT_LIST_HEAD(&root_task_group.siblings); 6829 autogroup_init(&init_task); 6830 6831#endif /* CONFIG_CGROUP_SCHED */ 6832 6833#ifdef CONFIG_CGROUP_CPUACCT 6834 root_cpuacct.cpustat = &kernel_cpustat; 6835 root_cpuacct.cpuusage = alloc_percpu(u64); 6836 /* Too early, not expected to fail */ 6837 BUG_ON(!root_cpuacct.cpuusage); 6838#endif 6839 for_each_possible_cpu(i) { 6840 struct rq *rq; 6841 6842 rq = cpu_rq(i); 6843 raw_spin_lock_init(&rq->lock); 6844 rq->nr_running = 0; 6845 rq->calc_load_active = 0; 6846 rq->calc_load_update = jiffies + LOAD_FREQ; 6847 init_cfs_rq(&rq->cfs); 6848 init_rt_rq(&rq->rt, rq); 6849#ifdef CONFIG_FAIR_GROUP_SCHED 6850 root_task_group.shares = ROOT_TASK_GROUP_LOAD; 6851 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 6852 /* 6853 * How much cpu bandwidth does root_task_group get? 6854 * 6855 * In case of task-groups formed through the cgroup filesystem, it 6856 * gets 100% of the cpu resources in the system. This overall 6857 * system cpu resource is divided among the tasks of 6858 * root_task_group and its child task-groups in a fair manner, 6859 * based on each entity's (task or task-group's) weight 6860 * (se->load.weight). 6861 * 6862 * In other words, if root_task_group has 10 tasks (of weight 6863 * 1024) and two child groups A0 and A1 (of weight 1024 each), 6864 * then A0's share of the cpu resource is: 6865 * 6866 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% 6867 * 6868 * We achieve this by letting root_task_group's tasks sit 6869 * directly in rq->cfs (i.e. root_task_group->se[] = NULL).
6870 */ 6871 init_cfs_bandwidth(&root_task_group.cfs_bandwidth); 6872 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); 6873#endif /* CONFIG_FAIR_GROUP_SCHED */ 6874 6875 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; 6876#ifdef CONFIG_RT_GROUP_SCHED 6877 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 6878 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); 6879#endif 6880 6881 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 6882 rq->cpu_load[j] = 0; 6883 6884 rq->last_load_update_tick = jiffies; 6885 6886#ifdef CONFIG_SMP 6887 rq->sd = NULL; 6888 rq->rd = NULL; 6889 rq->cpu_power = SCHED_POWER_SCALE; 6890 rq->post_schedule = 0; 6891 rq->active_balance = 0; 6892 rq->next_balance = jiffies; 6893 rq->push_cpu = 0; 6894 rq->cpu = i; 6895 rq->online = 0; 6896 rq->idle_stamp = 0; 6897 rq->avg_idle = 2*sysctl_sched_migration_cost; 6898 6899 INIT_LIST_HEAD(&rq->cfs_tasks); 6900 6901 rq_attach_root(rq, &def_root_domain); 6902#ifdef CONFIG_NO_HZ 6903 rq->nohz_flags = 0; 6904#endif 6905#endif 6906 init_rq_hrtick(rq); 6907 atomic_set(&rq->nr_iowait, 0); 6908 } 6909 6910 set_load_weight(&init_task); 6911 6912#ifdef CONFIG_PREEMPT_NOTIFIERS 6913 INIT_HLIST_HEAD(&init_task.preempt_notifiers); 6914#endif 6915 6916#ifdef CONFIG_RT_MUTEXES 6917 plist_head_init(&init_task.pi_waiters); 6918#endif 6919 6920 /* 6921 * The boot idle thread does lazy MMU switching as well: 6922 */ 6923 atomic_inc(&init_mm.mm_count); 6924 enter_lazy_tlb(&init_mm, current); 6925 6926 /* 6927 * Make us the idle thread. Technically, schedule() should not be 6928 * called from this thread, however somewhere below it might be, 6929 * but because we are the idle thread, we just pick up running again 6930 * when this runqueue becomes "idle". 6931 */ 6932 init_idle(current, smp_processor_id()); 6933 6934 calc_load_update = jiffies + LOAD_FREQ; 6935 6936 /* 6937 * During early bootup we pretend to be a normal task: 6938 */ 6939 current->sched_class = &fair_sched_class; 6940 6941#ifdef CONFIG_SMP 6942 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); 6943 /* May be allocated at isolcpus cmdline parse time */ 6944 if (cpu_isolated_map == NULL) 6945 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 6946 idle_thread_set_boot_cpu(); 6947#endif 6948 init_sched_fair_class(); 6949 6950 scheduler_running = 1; 6951} 6952 6953#ifdef CONFIG_DEBUG_ATOMIC_SLEEP 6954static inline int preempt_count_equals(int preempt_offset) 6955{ 6956 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); 6957 6958 return (nested == preempt_offset); 6959} 6960 6961void __might_sleep(const char *file, int line, int preempt_offset) 6962{ 6963 static unsigned long prev_jiffy; /* ratelimiting */ 6964 6965 rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. 
*/ 6966 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || 6967 system_state != SYSTEM_RUNNING || oops_in_progress) 6968 return; 6969 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 6970 return; 6971 prev_jiffy = jiffies; 6972 6973 printk(KERN_ERR 6974 "BUG: sleeping function called from invalid context at %s:%d\n", 6975 file, line); 6976 printk(KERN_ERR 6977 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", 6978 in_atomic(), irqs_disabled(), 6979 current->pid, current->comm); 6980 6981 debug_show_held_locks(current); 6982 if (irqs_disabled()) 6983 print_irqtrace_events(current); 6984 dump_stack(); 6985} 6986EXPORT_SYMBOL(__might_sleep); 6987#endif 6988 6989#ifdef CONFIG_MAGIC_SYSRQ 6990static void normalize_task(struct rq *rq, struct task_struct *p) 6991{ 6992 const struct sched_class *prev_class = p->sched_class; 6993 int old_prio = p->prio; 6994 int on_rq; 6995 6996 on_rq = p->on_rq; 6997 if (on_rq) 6998 dequeue_task(rq, p, 0); 6999 __setscheduler(rq, p, SCHED_NORMAL, 0); 7000 if (on_rq) { 7001 enqueue_task(rq, p, 0); 7002 resched_task(rq->curr); 7003 } 7004 7005 check_class_changed(rq, p, prev_class, old_prio); 7006} 7007 7008void normalize_rt_tasks(void) 7009{ 7010 struct task_struct *g, *p; 7011 unsigned long flags; 7012 struct rq *rq; 7013 7014 read_lock_irqsave(&tasklist_lock, flags); 7015 do_each_thread(g, p) { 7016 /* 7017 * Only normalize user tasks: 7018 */ 7019 if (!p->mm) 7020 continue; 7021 7022 p->se.exec_start = 0; 7023#ifdef CONFIG_SCHEDSTATS 7024 p->se.statistics.wait_start = 0; 7025 p->se.statistics.sleep_start = 0; 7026 p->se.statistics.block_start = 0; 7027#endif 7028 7029 if (!rt_task(p)) { 7030 /* 7031 * Renice negative nice level userspace 7032 * tasks back to 0: 7033 */ 7034 if (TASK_NICE(p) < 0 && p->mm) 7035 set_user_nice(p, 0); 7036 continue; 7037 } 7038 7039 raw_spin_lock(&p->pi_lock); 7040 rq = __task_rq_lock(p); 7041 7042 normalize_task(rq, p); 7043 7044 __task_rq_unlock(rq); 7045 raw_spin_unlock(&p->pi_lock); 7046 } while_each_thread(g, p); 7047 7048 read_unlock_irqrestore(&tasklist_lock, flags); 7049} 7050 7051#endif /* CONFIG_MAGIC_SYSRQ */ 7052 7053#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) 7054/* 7055 * These functions are only useful for the IA64 MCA handling, or kdb. 7056 * 7057 * They can only be called when the whole system has been 7058 * stopped - every CPU needs to be quiescent, and no scheduling 7059 * activity can take place. Using them for anything else would 7060 * be a serious bug, and as a result, they aren't even visible 7061 * under any other configuration. 7062 */ 7063 7064/** 7065 * curr_task - return the current task for a given cpu. 7066 * @cpu: the processor in question. 7067 * 7068 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 7069 */ 7070struct task_struct *curr_task(int cpu) 7071{ 7072 return cpu_curr(cpu); 7073} 7074 7075#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ 7076 7077#ifdef CONFIG_IA64 7078/** 7079 * set_curr_task - set the current task for a given cpu. 7080 * @cpu: the processor in question. 7081 * @p: the task pointer to set. 7082 * 7083 * Description: This function must only be used when non-maskable interrupts 7084 * are serviced on a separate stack. It allows the architecture to switch the 7085 * notion of the current task on a cpu in a non-blocking manner. 
This function 7086 * must be called with all CPUs synchronized and interrupts disabled, and the 7087 * caller must save the original value of the current task (see 7088 * curr_task() above) and restore that value before reenabling interrupts and 7089 * re-starting the system. 7090 * 7091 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 7092 */ 7093void set_curr_task(int cpu, struct task_struct *p) 7094{ 7095 cpu_curr(cpu) = p; 7096} 7097 7098#endif 7099 7100#ifdef CONFIG_CGROUP_SCHED 7101/* task_group_lock serializes the addition/removal of task groups */ 7102static DEFINE_SPINLOCK(task_group_lock); 7103 7104static void free_sched_group(struct task_group *tg) 7105{ 7106 free_fair_sched_group(tg); 7107 free_rt_sched_group(tg); 7108 autogroup_free(tg); 7109 kfree(tg); 7110} 7111 7112/* allocate runqueue etc for a new task group */ 7113struct task_group *sched_create_group(struct task_group *parent) 7114{ 7115 struct task_group *tg; 7116 unsigned long flags; 7117 7118 tg = kzalloc(sizeof(*tg), GFP_KERNEL); 7119 if (!tg) 7120 return ERR_PTR(-ENOMEM); 7121 7122 if (!alloc_fair_sched_group(tg, parent)) 7123 goto err; 7124 7125 if (!alloc_rt_sched_group(tg, parent)) 7126 goto err; 7127 7128 spin_lock_irqsave(&task_group_lock, flags); 7129 list_add_rcu(&tg->list, &task_groups); 7130 7131 WARN_ON(!parent); /* root should already exist */ 7132 7133 tg->parent = parent; 7134 INIT_LIST_HEAD(&tg->children); 7135 list_add_rcu(&tg->siblings, &parent->children); 7136 spin_unlock_irqrestore(&task_group_lock, flags); 7137 7138 return tg; 7139 7140err: 7141 free_sched_group(tg); 7142 return ERR_PTR(-ENOMEM); 7143} 7144 7145/* rcu callback to free various structures associated with a task group */ 7146static void free_sched_group_rcu(struct rcu_head *rhp) 7147{ 7148 /* now it should be safe to free those cfs_rqs */ 7149 free_sched_group(container_of(rhp, struct task_group, rcu)); 7150} 7151 7152/* Destroy runqueue etc associated with a task group */ 7153void sched_destroy_group(struct task_group *tg) 7154{ 7155 unsigned long flags; 7156 int i; 7157 7158 /* end participation in shares distribution */ 7159 for_each_possible_cpu(i) 7160 unregister_fair_sched_group(tg, i); 7161 7162 spin_lock_irqsave(&task_group_lock, flags); 7163 list_del_rcu(&tg->list); 7164 list_del_rcu(&tg->siblings); 7165 spin_unlock_irqrestore(&task_group_lock, flags); 7166 7167 /* wait for possible concurrent references to cfs_rqs to complete */ 7168 call_rcu(&tg->rcu, free_sched_group_rcu); 7169} 7170 7171/* Change a task's runqueue when it moves between groups. 7172 * The caller of this function should have put the task in its new group 7173 * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to 7174 * reflect its new group.
7175 */ 7176void sched_move_task(struct task_struct *tsk) 7177{ 7178 struct task_group *tg; 7179 int on_rq, running; 7180 unsigned long flags; 7181 struct rq *rq; 7182 7183 rq = task_rq_lock(tsk, &flags); 7184 7185 running = task_current(rq, tsk); 7186 on_rq = tsk->on_rq; 7187 7188 if (on_rq) 7189 dequeue_task(rq, tsk, 0); 7190 if (unlikely(running)) 7191 tsk->sched_class->put_prev_task(rq, tsk); 7192 7193 tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id, 7194 lockdep_is_held(&tsk->sighand->siglock)), 7195 struct task_group, css); 7196 tg = autogroup_task_group(tsk, tg); 7197 tsk->sched_task_group = tg; 7198 7199#ifdef CONFIG_FAIR_GROUP_SCHED 7200 if (tsk->sched_class->task_move_group) 7201 tsk->sched_class->task_move_group(tsk, on_rq); 7202 else 7203#endif 7204 set_task_rq(tsk, task_cpu(tsk)); 7205 7206 if (unlikely(running)) 7207 tsk->sched_class->set_curr_task(rq); 7208 if (on_rq) 7209 enqueue_task(rq, tsk, 0); 7210 7211 task_rq_unlock(rq, tsk, &flags); 7212} 7213#endif /* CONFIG_CGROUP_SCHED */ 7214 7215#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) 7216static unsigned long to_ratio(u64 period, u64 runtime) 7217{ 7218 if (runtime == RUNTIME_INF) 7219 return 1ULL << 20; 7220 7221 return div64_u64(runtime << 20, period); 7222} 7223#endif 7224 7225#ifdef CONFIG_RT_GROUP_SCHED 7226/* 7227 * Ensure that the real time constraints are schedulable. 7228 */ 7229static DEFINE_MUTEX(rt_constraints_mutex); 7230 7231/* Must be called with tasklist_lock held */ 7232static inline int tg_has_rt_tasks(struct task_group *tg) 7233{ 7234 struct task_struct *g, *p; 7235 7236 do_each_thread(g, p) { 7237 if (rt_task(p) && task_rq(p)->rt.tg == tg) 7238 return 1; 7239 } while_each_thread(g, p); 7240 7241 return 0; 7242} 7243 7244struct rt_schedulable_data { 7245 struct task_group *tg; 7246 u64 rt_period; 7247 u64 rt_runtime; 7248}; 7249 7250static int tg_rt_schedulable(struct task_group *tg, void *data) 7251{ 7252 struct rt_schedulable_data *d = data; 7253 struct task_group *child; 7254 unsigned long total, sum = 0; 7255 u64 period, runtime; 7256 7257 period = ktime_to_ns(tg->rt_bandwidth.rt_period); 7258 runtime = tg->rt_bandwidth.rt_runtime; 7259 7260 if (tg == d->tg) { 7261 period = d->rt_period; 7262 runtime = d->rt_runtime; 7263 } 7264 7265 /* 7266 * Cannot have more runtime than the period. 7267 */ 7268 if (runtime > period && runtime != RUNTIME_INF) 7269 return -EINVAL; 7270 7271 /* 7272 * Ensure we don't starve existing RT tasks. 7273 */ 7274 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) 7275 return -EBUSY; 7276 7277 total = to_ratio(period, runtime); 7278 7279 /* 7280 * Nobody can have more than the global setting allows. 7281 */ 7282 if (total > to_ratio(global_rt_period(), global_rt_runtime())) 7283 return -EINVAL; 7284 7285 /* 7286 * The sum of our children's runtime should not exceed our own. 
7287 */ 7288 list_for_each_entry_rcu(child, &tg->children, siblings) { 7289 period = ktime_to_ns(child->rt_bandwidth.rt_period); 7290 runtime = child->rt_bandwidth.rt_runtime; 7291 7292 if (child == d->tg) { 7293 period = d->rt_period; 7294 runtime = d->rt_runtime; 7295 } 7296 7297 sum += to_ratio(period, runtime); 7298 } 7299 7300 if (sum > total) 7301 return -EINVAL; 7302 7303 return 0; 7304} 7305 7306static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 7307{ 7308 int ret; 7309 7310 struct rt_schedulable_data data = { 7311 .tg = tg, 7312 .rt_period = period, 7313 .rt_runtime = runtime, 7314 }; 7315 7316 rcu_read_lock(); 7317 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data); 7318 rcu_read_unlock(); 7319 7320 return ret; 7321} 7322 7323static int tg_set_rt_bandwidth(struct task_group *tg, 7324 u64 rt_period, u64 rt_runtime) 7325{ 7326 int i, err = 0; 7327 7328 mutex_lock(&rt_constraints_mutex); 7329 read_lock(&tasklist_lock); 7330 err = __rt_schedulable(tg, rt_period, rt_runtime); 7331 if (err) 7332 goto unlock; 7333 7334 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); 7335 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); 7336 tg->rt_bandwidth.rt_runtime = rt_runtime; 7337 7338 for_each_possible_cpu(i) { 7339 struct rt_rq *rt_rq = tg->rt_rq[i]; 7340 7341 raw_spin_lock(&rt_rq->rt_runtime_lock); 7342 rt_rq->rt_runtime = rt_runtime; 7343 raw_spin_unlock(&rt_rq->rt_runtime_lock); 7344 } 7345 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); 7346unlock: 7347 read_unlock(&tasklist_lock); 7348 mutex_unlock(&rt_constraints_mutex); 7349 7350 return err; 7351} 7352 7353int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) 7354{ 7355 u64 rt_runtime, rt_period; 7356 7357 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); 7358 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; 7359 if (rt_runtime_us < 0) 7360 rt_runtime = RUNTIME_INF; 7361 7362 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 7363} 7364 7365long sched_group_rt_runtime(struct task_group *tg) 7366{ 7367 u64 rt_runtime_us; 7368 7369 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) 7370 return -1; 7371 7372 rt_runtime_us = tg->rt_bandwidth.rt_runtime; 7373 do_div(rt_runtime_us, NSEC_PER_USEC); 7374 return rt_runtime_us; 7375} 7376 7377int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) 7378{ 7379 u64 rt_runtime, rt_period; 7380 7381 rt_period = (u64)rt_period_us * NSEC_PER_USEC; 7382 rt_runtime = tg->rt_bandwidth.rt_runtime; 7383 7384 if (rt_period == 0) 7385 return -EINVAL; 7386 7387 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 7388} 7389 7390long sched_group_rt_period(struct task_group *tg) 7391{ 7392 u64 rt_period_us; 7393 7394 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); 7395 do_div(rt_period_us, NSEC_PER_USEC); 7396 return rt_period_us; 7397} 7398 7399static int sched_rt_global_constraints(void) 7400{ 7401 u64 runtime, period; 7402 int ret = 0; 7403 7404 if (sysctl_sched_rt_period <= 0) 7405 return -EINVAL; 7406 7407 runtime = global_rt_runtime(); 7408 period = global_rt_period(); 7409 7410 /* 7411 * Sanity check on the sysctl variables. 
7412 */ 7413 if (runtime > period && runtime != RUNTIME_INF) 7414 return -EINVAL; 7415 7416 mutex_lock(&rt_constraints_mutex); 7417 read_lock(&tasklist_lock); 7418 ret = __rt_schedulable(NULL, 0, 0); 7419 read_unlock(&tasklist_lock); 7420 mutex_unlock(&rt_constraints_mutex); 7421 7422 return ret; 7423} 7424 7425int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) 7426{ 7427 /* Don't accept realtime tasks when there is no way for them to run */ 7428 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) 7429 return 0; 7430 7431 return 1; 7432} 7433 7434#else /* !CONFIG_RT_GROUP_SCHED */ 7435static int sched_rt_global_constraints(void) 7436{ 7437 unsigned long flags; 7438 int i; 7439 7440 if (sysctl_sched_rt_period <= 0) 7441 return -EINVAL; 7442 7443 /* 7444 * There's always some RT tasks in the root group 7445 * -- migration, kstopmachine etc.. 7446 */ 7447 if (sysctl_sched_rt_runtime == 0) 7448 return -EBUSY; 7449 7450 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 7451 for_each_possible_cpu(i) { 7452 struct rt_rq *rt_rq = &cpu_rq(i)->rt; 7453 7454 raw_spin_lock(&rt_rq->rt_runtime_lock); 7455 rt_rq->rt_runtime = global_rt_runtime(); 7456 raw_spin_unlock(&rt_rq->rt_runtime_lock); 7457 } 7458 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); 7459 7460 return 0; 7461} 7462#endif /* CONFIG_RT_GROUP_SCHED */ 7463 7464int sched_rt_handler(struct ctl_table *table, int write, 7465 void __user *buffer, size_t *lenp, 7466 loff_t *ppos) 7467{ 7468 int ret; 7469 int old_period, old_runtime; 7470 static DEFINE_MUTEX(mutex); 7471 7472 mutex_lock(&mutex); 7473 old_period = sysctl_sched_rt_period; 7474 old_runtime = sysctl_sched_rt_runtime; 7475 7476 ret = proc_dointvec(table, write, buffer, lenp, ppos); 7477 7478 if (!ret && write) { 7479 ret = sched_rt_global_constraints(); 7480 if (ret) { 7481 sysctl_sched_rt_period = old_period; 7482 sysctl_sched_rt_runtime = old_runtime; 7483 } else { 7484 def_rt_bandwidth.rt_runtime = global_rt_runtime(); 7485 def_rt_bandwidth.rt_period = 7486 ns_to_ktime(global_rt_period()); 7487 } 7488 } 7489 mutex_unlock(&mutex); 7490 7491 return ret; 7492} 7493 7494#ifdef CONFIG_CGROUP_SCHED 7495 7496/* return corresponding task_group object of a cgroup */ 7497static inline struct task_group *cgroup_tg(struct cgroup *cgrp) 7498{ 7499 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id), 7500 struct task_group, css); 7501} 7502 7503static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp) 7504{ 7505 struct task_group *tg, *parent; 7506 7507 if (!cgrp->parent) { 7508 /* This is early initialization for the top cgroup */ 7509 return &root_task_group.css; 7510 } 7511 7512 parent = cgroup_tg(cgrp->parent); 7513 tg = sched_create_group(parent); 7514 if (IS_ERR(tg)) 7515 return ERR_PTR(-ENOMEM); 7516 7517 return &tg->css; 7518} 7519 7520static void cpu_cgroup_destroy(struct cgroup *cgrp) 7521{ 7522 struct task_group *tg = cgroup_tg(cgrp); 7523 7524 sched_destroy_group(tg); 7525} 7526 7527static int cpu_cgroup_can_attach(struct cgroup *cgrp, 7528 struct cgroup_taskset *tset) 7529{ 7530 struct task_struct *task; 7531 7532 cgroup_taskset_for_each(task, cgrp, tset) { 7533#ifdef CONFIG_RT_GROUP_SCHED 7534 if (!sched_rt_can_attach(cgroup_tg(cgrp), task)) 7535 return -EINVAL; 7536#else 7537 /* We don't support RT-tasks being in separate groups */ 7538 if (task->sched_class != &fair_sched_class) 7539 return -EINVAL; 7540#endif 7541 } 7542 return 0; 7543} 7544 7545static void cpu_cgroup_attach(struct cgroup 
*cgrp, 7546 struct cgroup_taskset *tset) 7547{ 7548 struct task_struct *task; 7549 7550 cgroup_taskset_for_each(task, cgrp, tset) 7551 sched_move_task(task); 7552} 7553 7554static void 7555cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, 7556 struct task_struct *task) 7557{ 7558 /* 7559 * cgroup_exit() is called in the copy_process() failure path. 7560 * Ignore this case since the task hasn't run yet; this avoids 7561 * trying to poke a half-freed task state from generic code. 7562 */ 7563 if (!(task->flags & PF_EXITING)) 7564 return; 7565 7566 sched_move_task(task); 7567} 7568 7569#ifdef CONFIG_FAIR_GROUP_SCHED 7570static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, 7571 u64 shareval) 7572{ 7573 return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval)); 7574} 7575 7576static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) 7577{ 7578 struct task_group *tg = cgroup_tg(cgrp); 7579 7580 return (u64) scale_load_down(tg->shares); 7581} 7582 7583#ifdef CONFIG_CFS_BANDWIDTH 7584static DEFINE_MUTEX(cfs_constraints_mutex); 7585 7586const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ 7587const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ 7588 7589static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); 7590 7591static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) 7592{ 7593 int i, ret = 0, runtime_enabled, runtime_was_enabled; 7594 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 7595 7596 if (tg == &root_task_group) 7597 return -EINVAL; 7598 7599 /* 7600 * Ensure we have at least some amount of bandwidth every period. This is 7601 * to prevent reaching a state of large arrears when throttled via 7602 * entity_tick() resulting in prolonged exit starvation. 7603 */ 7604 if (quota < min_cfs_quota_period || period < min_cfs_quota_period) 7605 return -EINVAL; 7606 7607 /* 7608 * Likewise, bound things on the other side by preventing insane quota 7609 * periods. This also allows us to normalize in computing quota 7610 * feasibility.
7611 */ 7612 if (period > max_cfs_quota_period) 7613 return -EINVAL; 7614 7615 mutex_lock(&cfs_constraints_mutex); 7616 ret = __cfs_schedulable(tg, period, quota); 7617 if (ret) 7618 goto out_unlock; 7619 7620 runtime_enabled = quota != RUNTIME_INF; 7621 runtime_was_enabled = cfs_b->quota != RUNTIME_INF; 7622 account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled); 7623 raw_spin_lock_irq(&cfs_b->lock); 7624 cfs_b->period = ns_to_ktime(period); 7625 cfs_b->quota = quota; 7626 7627 __refill_cfs_bandwidth_runtime(cfs_b); 7628 /* restart the period timer (if active) to handle new period expiry */ 7629 if (runtime_enabled && cfs_b->timer_active) { 7630 /* force a reprogram */ 7631 cfs_b->timer_active = 0; 7632 __start_cfs_bandwidth(cfs_b); 7633 } 7634 raw_spin_unlock_irq(&cfs_b->lock); 7635 7636 for_each_possible_cpu(i) { 7637 struct cfs_rq *cfs_rq = tg->cfs_rq[i]; 7638 struct rq *rq = cfs_rq->rq; 7639 7640 raw_spin_lock_irq(&rq->lock); 7641 cfs_rq->runtime_enabled = runtime_enabled; 7642 cfs_rq->runtime_remaining = 0; 7643 7644 if (cfs_rq->throttled) 7645 unthrottle_cfs_rq(cfs_rq); 7646 raw_spin_unlock_irq(&rq->lock); 7647 } 7648out_unlock: 7649 mutex_unlock(&cfs_constraints_mutex); 7650 7651 return ret; 7652} 7653 7654int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) 7655{ 7656 u64 quota, period; 7657 7658 period = ktime_to_ns(tg->cfs_bandwidth.period); 7659 if (cfs_quota_us < 0) 7660 quota = RUNTIME_INF; 7661 else 7662 quota = (u64)cfs_quota_us * NSEC_PER_USEC; 7663 7664 return tg_set_cfs_bandwidth(tg, period, quota); 7665} 7666 7667long tg_get_cfs_quota(struct task_group *tg) 7668{ 7669 u64 quota_us; 7670 7671 if (tg->cfs_bandwidth.quota == RUNTIME_INF) 7672 return -1; 7673 7674 quota_us = tg->cfs_bandwidth.quota; 7675 do_div(quota_us, NSEC_PER_USEC); 7676 7677 return quota_us; 7678} 7679 7680int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) 7681{ 7682 u64 quota, period; 7683 7684 period = (u64)cfs_period_us * NSEC_PER_USEC; 7685 quota = tg->cfs_bandwidth.quota; 7686 7687 return tg_set_cfs_bandwidth(tg, period, quota); 7688} 7689 7690long tg_get_cfs_period(struct task_group *tg) 7691{ 7692 u64 cfs_period_us; 7693 7694 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period); 7695 do_div(cfs_period_us, NSEC_PER_USEC); 7696 7697 return cfs_period_us; 7698} 7699 7700static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft) 7701{ 7702 return tg_get_cfs_quota(cgroup_tg(cgrp)); 7703} 7704 7705static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype, 7706 s64 cfs_quota_us) 7707{ 7708 return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us); 7709} 7710 7711static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft) 7712{ 7713 return tg_get_cfs_period(cgroup_tg(cgrp)); 7714} 7715 7716static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype, 7717 u64 cfs_period_us) 7718{ 7719 return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us); 7720} 7721 7722struct cfs_schedulable_data { 7723 struct task_group *tg; 7724 u64 period, quota; 7725}; 7726 7727/* 7728 * normalize group quota/period to be quota/max_period 7729 * note: units are usecs 7730 */ 7731static u64 normalize_cfs_quota(struct task_group *tg, 7732 struct cfs_schedulable_data *d) 7733{ 7734 u64 quota, period; 7735 7736 if (tg == d->tg) { 7737 period = d->period; 7738 quota = d->quota; 7739 } else { 7740 period = tg_get_cfs_period(tg); 7741 quota = tg_get_cfs_quota(tg); 7742 } 7743 7744 /* note: these should typically be equivalent */ 7745 
if (quota == RUNTIME_INF || quota == -1) 7746 return RUNTIME_INF; 7747 7748 return to_ratio(period, quota); 7749} 7750 7751static int tg_cfs_schedulable_down(struct task_group *tg, void *data) 7752{ 7753 struct cfs_schedulable_data *d = data; 7754 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 7755 s64 quota = 0, parent_quota = -1; 7756 7757 if (!tg->parent) { 7758 quota = RUNTIME_INF; 7759 } else { 7760 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth; 7761 7762 quota = normalize_cfs_quota(tg, d); 7763 parent_quota = parent_b->hierarchal_quota; 7764 7765 /* 7766 * ensure max(child_quota) <= parent_quota, inherit when no 7767 * limit is set 7768 */ 7769 if (quota == RUNTIME_INF) 7770 quota = parent_quota; 7771 else if (parent_quota != RUNTIME_INF && quota > parent_quota) 7772 return -EINVAL; 7773 } 7774 cfs_b->hierarchal_quota = quota; 7775 7776 return 0; 7777} 7778 7779static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) 7780{ 7781 int ret; 7782 struct cfs_schedulable_data data = { 7783 .tg = tg, 7784 .period = period, 7785 .quota = quota, 7786 }; 7787 7788 if (quota != RUNTIME_INF) { 7789 do_div(data.period, NSEC_PER_USEC); 7790 do_div(data.quota, NSEC_PER_USEC); 7791 } 7792 7793 rcu_read_lock(); 7794 ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data); 7795 rcu_read_unlock(); 7796 7797 return ret; 7798} 7799 7800static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, 7801 struct cgroup_map_cb *cb) 7802{ 7803 struct task_group *tg = cgroup_tg(cgrp); 7804 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 7805 7806 cb->fill(cb, "nr_periods", cfs_b->nr_periods); 7807 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); 7808 cb->fill(cb, "throttled_time", cfs_b->throttled_time); 7809 7810 return 0; 7811} 7812#endif /* CONFIG_CFS_BANDWIDTH */ 7813#endif /* CONFIG_FAIR_GROUP_SCHED */ 7814 7815#ifdef CONFIG_RT_GROUP_SCHED 7816static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, 7817 s64 val) 7818{ 7819 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val); 7820} 7821 7822static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft) 7823{ 7824 return sched_group_rt_runtime(cgroup_tg(cgrp)); 7825} 7826 7827static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, 7828 u64 rt_period_us) 7829{ 7830 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); 7831} 7832 7833static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) 7834{ 7835 return sched_group_rt_period(cgroup_tg(cgrp)); 7836} 7837#endif /* CONFIG_RT_GROUP_SCHED */ 7838 7839static struct cftype cpu_files[] = { 7840#ifdef CONFIG_FAIR_GROUP_SCHED 7841 { 7842 .name = "shares", 7843 .read_u64 = cpu_shares_read_u64, 7844 .write_u64 = cpu_shares_write_u64, 7845 }, 7846#endif 7847#ifdef CONFIG_CFS_BANDWIDTH 7848 { 7849 .name = "cfs_quota_us", 7850 .read_s64 = cpu_cfs_quota_read_s64, 7851 .write_s64 = cpu_cfs_quota_write_s64, 7852 }, 7853 { 7854 .name = "cfs_period_us", 7855 .read_u64 = cpu_cfs_period_read_u64, 7856 .write_u64 = cpu_cfs_period_write_u64, 7857 }, 7858 { 7859 .name = "stat", 7860 .read_map = cpu_stats_show, 7861 }, 7862#endif 7863#ifdef CONFIG_RT_GROUP_SCHED 7864 { 7865 .name = "rt_runtime_us", 7866 .read_s64 = cpu_rt_runtime_read, 7867 .write_s64 = cpu_rt_runtime_write, 7868 }, 7869 { 7870 .name = "rt_period_us", 7871 .read_u64 = cpu_rt_period_read_uint, 7872 .write_u64 = cpu_rt_period_write_uint, 7873 }, 7874#endif 7875 { } /* terminate */ 7876}; 7877 7878struct cgroup_subsys cpu_cgroup_subsys = { 
7879 .name = "cpu", 7880 .create = cpu_cgroup_create, 7881 .destroy = cpu_cgroup_destroy, 7882 .can_attach = cpu_cgroup_can_attach, 7883 .attach = cpu_cgroup_attach, 7884 .exit = cpu_cgroup_exit, 7885 .subsys_id = cpu_cgroup_subsys_id, 7886 .base_cftypes = cpu_files, 7887 .early_init = 1, 7888}; 7889 7890#endif /* CONFIG_CGROUP_SCHED */ 7891 7892#ifdef CONFIG_CGROUP_CPUACCT 7893 7894/* 7895 * CPU accounting code for task groups. 7896 * 7897 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh 7898 * (balbir@in.ibm.com). 7899 */ 7900 7901struct cpuacct root_cpuacct; 7902 7903/* create a new cpu accounting group */ 7904static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp) 7905{ 7906 struct cpuacct *ca; 7907 7908 if (!cgrp->parent) 7909 return &root_cpuacct.css; 7910 7911 ca = kzalloc(sizeof(*ca), GFP_KERNEL); 7912 if (!ca) 7913 goto out; 7914 7915 ca->cpuusage = alloc_percpu(u64); 7916 if (!ca->cpuusage) 7917 goto out_free_ca; 7918 7919 ca->cpustat = alloc_percpu(struct kernel_cpustat); 7920 if (!ca->cpustat) 7921 goto out_free_cpuusage; 7922 7923 return &ca->css; 7924 7925out_free_cpuusage: 7926 free_percpu(ca->cpuusage); 7927out_free_ca: 7928 kfree(ca); 7929out: 7930 return ERR_PTR(-ENOMEM); 7931} 7932 7933/* destroy an existing cpu accounting group */ 7934static void cpuacct_destroy(struct cgroup *cgrp) 7935{ 7936 struct cpuacct *ca = cgroup_ca(cgrp); 7937 7938 free_percpu(ca->cpustat); 7939 free_percpu(ca->cpuusage); 7940 kfree(ca); 7941} 7942 7943static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) 7944{ 7945 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 7946 u64 data; 7947 7948#ifndef CONFIG_64BIT 7949 /* 7950 * Take rq->lock to make 64-bit read safe on 32-bit platforms. 7951 */ 7952 raw_spin_lock_irq(&cpu_rq(cpu)->lock); 7953 data = *cpuusage; 7954 raw_spin_unlock_irq(&cpu_rq(cpu)->lock); 7955#else 7956 data = *cpuusage; 7957#endif 7958 7959 return data; 7960} 7961 7962static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) 7963{ 7964 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 7965 7966#ifndef CONFIG_64BIT 7967 /* 7968 * Take rq->lock to make 64-bit write safe on 32-bit platforms. 
7969 */ 7970 raw_spin_lock_irq(&cpu_rq(cpu)->lock); 7971 *cpuusage = val; 7972 raw_spin_unlock_irq(&cpu_rq(cpu)->lock); 7973#else 7974 *cpuusage = val; 7975#endif 7976} 7977 7978/* return total cpu usage (in nanoseconds) of a group */ 7979static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) 7980{ 7981 struct cpuacct *ca = cgroup_ca(cgrp); 7982 u64 totalcpuusage = 0; 7983 int i; 7984 7985 for_each_present_cpu(i) 7986 totalcpuusage += cpuacct_cpuusage_read(ca, i); 7987 7988 return totalcpuusage; 7989} 7990 7991static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, 7992 u64 reset) 7993{ 7994 struct cpuacct *ca = cgroup_ca(cgrp); 7995 int err = 0; 7996 int i; 7997 7998 if (reset) { 7999 err = -EINVAL; 8000 goto out; 8001 } 8002 8003 for_each_present_cpu(i) 8004 cpuacct_cpuusage_write(ca, i, 0); 8005 8006out: 8007 return err; 8008} 8009 8010static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, 8011 struct seq_file *m) 8012{ 8013 struct cpuacct *ca = cgroup_ca(cgroup); 8014 u64 percpu; 8015 int i; 8016 8017 for_each_present_cpu(i) { 8018 percpu = cpuacct_cpuusage_read(ca, i); 8019 seq_printf(m, "%llu ", (unsigned long long) percpu); 8020 } 8021 seq_printf(m, "\n"); 8022 return 0; 8023} 8024 8025static const char *cpuacct_stat_desc[] = { 8026 [CPUACCT_STAT_USER] = "user", 8027 [CPUACCT_STAT_SYSTEM] = "system", 8028}; 8029 8030static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, 8031 struct cgroup_map_cb *cb) 8032{ 8033 struct cpuacct *ca = cgroup_ca(cgrp); 8034 int cpu; 8035 s64 val = 0; 8036 8037 for_each_online_cpu(cpu) { 8038 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); 8039 val += kcpustat->cpustat[CPUTIME_USER]; 8040 val += kcpustat->cpustat[CPUTIME_NICE]; 8041 } 8042 val = cputime64_to_clock_t(val); 8043 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); 8044 8045 val = 0; 8046 for_each_online_cpu(cpu) { 8047 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); 8048 val += kcpustat->cpustat[CPUTIME_SYSTEM]; 8049 val += kcpustat->cpustat[CPUTIME_IRQ]; 8050 val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; 8051 } 8052 8053 val = cputime64_to_clock_t(val); 8054 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); 8055 8056 return 0; 8057} 8058 8059static struct cftype files[] = { 8060 { 8061 .name = "usage", 8062 .read_u64 = cpuusage_read, 8063 .write_u64 = cpuusage_write, 8064 }, 8065 { 8066 .name = "usage_percpu", 8067 .read_seq_string = cpuacct_percpu_seq_read, 8068 }, 8069 { 8070 .name = "stat", 8071 .read_map = cpuacct_stats_show, 8072 }, 8073 { } /* terminate */ 8074}; 8075 8076/* 8077 * charge this task's execution time to its accounting group. 8078 * 8079 * called with rq->lock held. 8080 */ 8081void cpuacct_charge(struct task_struct *tsk, u64 cputime) 8082{ 8083 struct cpuacct *ca; 8084 int cpu; 8085 8086 if (unlikely(!cpuacct_subsys.active)) 8087 return; 8088 8089 cpu = task_cpu(tsk); 8090 8091 rcu_read_lock(); 8092 8093 ca = task_ca(tsk); 8094 8095 for (; ca; ca = parent_ca(ca)) { 8096 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 8097 *cpuusage += cputime; 8098 } 8099 8100 rcu_read_unlock(); 8101} 8102 8103struct cgroup_subsys cpuacct_subsys = { 8104 .name = "cpuacct", 8105 .create = cpuacct_create, 8106 .destroy = cpuacct_destroy, 8107 .subsys_id = cpuacct_subsys_id, 8108 .base_cftypes = files, 8109}; 8110#endif /* CONFIG_CGROUP_CPUACCT */ 8111
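/*
 * Example (a sketch only: the mount point /sys/fs/cgroup/cpu and the child
 * group "grp" below are assumptions, not anything mandated by this file).
 * The cftype tables above expose the CFS bandwidth knobs through the cgroup
 * filesystem:
 *
 *   # echo 100000 > /sys/fs/cgroup/cpu/grp/cpu.cfs_period_us    100ms period
 *   # echo  50000 > /sys/fs/cgroup/cpu/grp/cpu.cfs_quota_us     50ms quota, i.e. half a CPU
 *   # echo     -1 > /sys/fs/cgroup/cpu/grp/cpu.cfs_quota_us     RUNTIME_INF, no limit
 *
 * tg_set_cfs_bandwidth() rejects periods outside [min_cfs_quota_period,
 * max_cfs_quota_period] (1ms..1s) and quotas below 1ms; a negative quota is
 * mapped to RUNTIME_INF by tg_set_cfs_quota(). The feasibility checks
 * compare groups via to_ratio(period, runtime) = (runtime << 20) / period,
 * e.g. 950000us of runtime per 1000000us period gives
 * (950000 << 20) / 1000000 = 996147, roughly 0.95 * 2^20, while RUNTIME_INF
 * counts as a full 1 << 20.
 */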