core.c revision 746023159c40c523b08a3bc3d213dac212385895
1/* 2 * kernel/sched/core.c 3 * 4 * Kernel scheduler and related syscalls 5 * 6 * Copyright (C) 1991-2002 Linus Torvalds 7 * 8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and 9 * make semaphores SMP safe 10 * 1998-11-19 Implemented schedule_timeout() and related stuff 11 * by Andrea Arcangeli 12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: 13 * hybrid priority-list and round-robin design with 14 * an array-switch method of distributing timeslices 15 * and per-CPU runqueues. Cleanups and useful suggestions 16 * by Davide Libenzi, preemptible kernel bits by Robert Love. 17 * 2003-09-03 Interactivity tuning by Con Kolivas. 18 * 2004-04-02 Scheduler domains code by Nick Piggin 19 * 2007-04-15 Work begun on replacing all interactivity tuning with a 20 * fair scheduling design by Con Kolivas. 21 * 2007-05-05 Load balancing (smp-nice) and other improvements 22 * by Peter Williams 23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith 24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri 25 * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, 26 * Thomas Gleixner, Mike Kravetz 27 */ 28 29#include <linux/mm.h> 30#include <linux/module.h> 31#include <linux/nmi.h> 32#include <linux/init.h> 33#include <linux/uaccess.h> 34#include <linux/highmem.h> 35#include <asm/mmu_context.h> 36#include <linux/interrupt.h> 37#include <linux/capability.h> 38#include <linux/completion.h> 39#include <linux/kernel_stat.h> 40#include <linux/debug_locks.h> 41#include <linux/perf_event.h> 42#include <linux/security.h> 43#include <linux/notifier.h> 44#include <linux/profile.h> 45#include <linux/freezer.h> 46#include <linux/vmalloc.h> 47#include <linux/blkdev.h> 48#include <linux/delay.h> 49#include <linux/pid_namespace.h> 50#include <linux/smp.h> 51#include <linux/threads.h> 52#include <linux/timer.h> 53#include <linux/rcupdate.h> 54#include <linux/cpu.h> 55#include <linux/cpuset.h> 56#include <linux/percpu.h> 57#include <linux/proc_fs.h> 58#include <linux/seq_file.h> 59#include <linux/sysctl.h> 60#include <linux/syscalls.h> 61#include <linux/times.h> 62#include <linux/tsacct_kern.h> 63#include <linux/kprobes.h> 64#include <linux/delayacct.h> 65#include <linux/unistd.h> 66#include <linux/pagemap.h> 67#include <linux/hrtimer.h> 68#include <linux/tick.h> 69#include <linux/debugfs.h> 70#include <linux/ctype.h> 71#include <linux/ftrace.h> 72#include <linux/slab.h> 73#include <linux/init_task.h> 74#include <linux/binfmts.h> 75#include <linux/context_tracking.h> 76 77#include <asm/switch_to.h> 78#include <asm/tlb.h> 79#include <asm/irq_regs.h> 80#include <asm/mutex.h> 81#ifdef CONFIG_PARAVIRT 82#include <asm/paravirt.h> 83#endif 84 85#include "sched.h" 86#include "../workqueue_internal.h" 87#include "../smpboot.h" 88 89#define CREATE_TRACE_POINTS 90#include <trace/events/sched.h> 91 92void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) 93{ 94 unsigned long delta; 95 ktime_t soft, hard, now; 96 97 for (;;) { 98 if (hrtimer_active(period_timer)) 99 break; 100 101 now = hrtimer_cb_get_time(period_timer); 102 hrtimer_forward(period_timer, now, period); 103 104 soft = hrtimer_get_softexpires(period_timer); 105 hard = hrtimer_get_expires(period_timer); 106 delta = ktime_to_ns(ktime_sub(hard, soft)); 107 __hrtimer_start_range_ns(period_timer, soft, delta, 108 HRTIMER_MODE_ABS_PINNED, 0); 109 } 110} 111 112DEFINE_MUTEX(sched_domains_mutex); 113DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 114 115static void update_rq_clock_task(struct rq *rq, s64 delta); 116 117void update_rq_clock(struct rq *rq) 118{ 119 s64 delta; 120 121 if (rq->skip_clock_update > 0) 122 return; 123 124 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; 125 rq->clock += delta; 126 update_rq_clock_task(rq, delta); 127} 128 129/* 130 * Debugging: various feature bits 131 */ 132 133#define SCHED_FEAT(name, enabled) \ 134 (1UL << __SCHED_FEAT_##name) * enabled | 135 136const_debug unsigned int sysctl_sched_features = 137#include "features.h" 138 0; 139 140#undef SCHED_FEAT 141 142#ifdef CONFIG_SCHED_DEBUG 143#define SCHED_FEAT(name, enabled) \ 144 #name , 145 146static const char * const sched_feat_names[] = { 147#include "features.h" 148}; 149 150#undef SCHED_FEAT 151 152static int sched_feat_show(struct seq_file *m, void *v) 153{ 154 int i; 155 156 for (i = 0; i < __SCHED_FEAT_NR; i++) { 157 if (!(sysctl_sched_features & (1UL << i))) 158 seq_puts(m, "NO_"); 159 seq_printf(m, "%s ", sched_feat_names[i]); 160 } 161 seq_puts(m, "\n"); 162 163 return 0; 164} 165 166#ifdef HAVE_JUMP_LABEL 167 168#define jump_label_key__true STATIC_KEY_INIT_TRUE 169#define jump_label_key__false STATIC_KEY_INIT_FALSE 170 171#define SCHED_FEAT(name, enabled) \ 172 jump_label_key__##enabled , 173 174struct static_key sched_feat_keys[__SCHED_FEAT_NR] = { 175#include "features.h" 176}; 177 178#undef SCHED_FEAT 179 180static void sched_feat_disable(int i) 181{ 182 if (static_key_enabled(&sched_feat_keys[i])) 183 static_key_slow_dec(&sched_feat_keys[i]); 184} 185 186static void sched_feat_enable(int i) 187{ 188 if (!static_key_enabled(&sched_feat_keys[i])) 189 static_key_slow_inc(&sched_feat_keys[i]); 190} 191#else 192static void sched_feat_disable(int i) { }; 193static void sched_feat_enable(int i) { }; 194#endif /* HAVE_JUMP_LABEL */ 195 196static int sched_feat_set(char *cmp) 197{ 198 int i; 199 int neg = 0; 200 201 if (strncmp(cmp, "NO_", 3) == 0) { 202 neg = 1; 203 cmp += 3; 204 } 205 206 for (i = 0; i < __SCHED_FEAT_NR; i++) { 207 if (strcmp(cmp, sched_feat_names[i]) == 0) { 208 if (neg) { 209 sysctl_sched_features &= ~(1UL << i); 210 sched_feat_disable(i); 211 } else { 212 sysctl_sched_features |= (1UL << i); 213 sched_feat_enable(i); 214 } 215 break; 216 } 217 } 218 219 return i; 220} 221 222static ssize_t 223sched_feat_write(struct file *filp, const char __user *ubuf, 224 size_t cnt, loff_t *ppos) 225{ 226 char buf[64]; 227 char *cmp; 228 int i; 229 230 if (cnt > 63) 231 cnt = 63; 232 233 if (copy_from_user(&buf, ubuf, cnt)) 234 return -EFAULT; 235 236 buf[cnt] = 0; 237 cmp = strstrip(buf); 238 239 i = sched_feat_set(cmp); 240 if (i == __SCHED_FEAT_NR) 241 return -EINVAL; 242 243 *ppos += cnt; 244 245 return cnt; 246} 247 248static int sched_feat_open(struct inode *inode, struct file *filp) 249{ 250 return single_open(filp, sched_feat_show, NULL); 251} 252 253static const struct file_operations sched_feat_fops = { 254 .open = sched_feat_open, 255 .write = sched_feat_write, 256 .read = seq_read, 257 .llseek = seq_lseek, 258 .release = single_release, 259}; 260 261static __init int sched_init_debug(void) 262{ 263 debugfs_create_file("sched_features", 0644, NULL, NULL, 264 &sched_feat_fops); 265 266 return 0; 267} 268late_initcall(sched_init_debug); 269#endif /* CONFIG_SCHED_DEBUG */ 270 271/* 272 * Number of tasks to iterate in a single balance run. 273 * Limited because this is done with IRQs disabled. 274 */ 275const_debug unsigned int sysctl_sched_nr_migrate = 32; 276 277/* 278 * period over which we average the RT time consumption, measured 279 * in ms. 280 * 281 * default: 1s 282 */ 283const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; 284 285/* 286 * period over which we measure -rt task cpu usage in us. 287 * default: 1s 288 */ 289unsigned int sysctl_sched_rt_period = 1000000; 290 291__read_mostly int scheduler_running; 292 293/* 294 * part of the period that we allow rt tasks to run in us. 295 * default: 0.95s 296 */ 297int sysctl_sched_rt_runtime = 950000; 298 299 300 301/* 302 * __task_rq_lock - lock the rq @p resides on. 303 */ 304static inline struct rq *__task_rq_lock(struct task_struct *p) 305 __acquires(rq->lock) 306{ 307 struct rq *rq; 308 309 lockdep_assert_held(&p->pi_lock); 310 311 for (;;) { 312 rq = task_rq(p); 313 raw_spin_lock(&rq->lock); 314 if (likely(rq == task_rq(p))) 315 return rq; 316 raw_spin_unlock(&rq->lock); 317 } 318} 319 320/* 321 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. 322 */ 323static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) 324 __acquires(p->pi_lock) 325 __acquires(rq->lock) 326{ 327 struct rq *rq; 328 329 for (;;) { 330 raw_spin_lock_irqsave(&p->pi_lock, *flags); 331 rq = task_rq(p); 332 raw_spin_lock(&rq->lock); 333 if (likely(rq == task_rq(p))) 334 return rq; 335 raw_spin_unlock(&rq->lock); 336 raw_spin_unlock_irqrestore(&p->pi_lock, *flags); 337 } 338} 339 340static void __task_rq_unlock(struct rq *rq) 341 __releases(rq->lock) 342{ 343 raw_spin_unlock(&rq->lock); 344} 345 346static inline void 347task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) 348 __releases(rq->lock) 349 __releases(p->pi_lock) 350{ 351 raw_spin_unlock(&rq->lock); 352 raw_spin_unlock_irqrestore(&p->pi_lock, *flags); 353} 354 355/* 356 * this_rq_lock - lock this runqueue and disable interrupts. 357 */ 358static struct rq *this_rq_lock(void) 359 __acquires(rq->lock) 360{ 361 struct rq *rq; 362 363 local_irq_disable(); 364 rq = this_rq(); 365 raw_spin_lock(&rq->lock); 366 367 return rq; 368} 369 370#ifdef CONFIG_SCHED_HRTICK 371/* 372 * Use HR-timers to deliver accurate preemption points. 373 */ 374 375static void hrtick_clear(struct rq *rq) 376{ 377 if (hrtimer_active(&rq->hrtick_timer)) 378 hrtimer_cancel(&rq->hrtick_timer); 379} 380 381/* 382 * High-resolution timer tick. 383 * Runs from hardirq context with interrupts disabled. 384 */ 385static enum hrtimer_restart hrtick(struct hrtimer *timer) 386{ 387 struct rq *rq = container_of(timer, struct rq, hrtick_timer); 388 389 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); 390 391 raw_spin_lock(&rq->lock); 392 update_rq_clock(rq); 393 rq->curr->sched_class->task_tick(rq, rq->curr, 1); 394 raw_spin_unlock(&rq->lock); 395 396 return HRTIMER_NORESTART; 397} 398 399#ifdef CONFIG_SMP 400 401static int __hrtick_restart(struct rq *rq) 402{ 403 struct hrtimer *timer = &rq->hrtick_timer; 404 ktime_t time = hrtimer_get_softexpires(timer); 405 406 return __hrtimer_start_range_ns(timer, time, 0, HRTIMER_MODE_ABS_PINNED, 0); 407} 408 409/* 410 * called from hardirq (IPI) context 411 */ 412static void __hrtick_start(void *arg) 413{ 414 struct rq *rq = arg; 415 416 raw_spin_lock(&rq->lock); 417 __hrtick_restart(rq); 418 rq->hrtick_csd_pending = 0; 419 raw_spin_unlock(&rq->lock); 420} 421 422/* 423 * Called to set the hrtick timer state. 424 * 425 * called with rq->lock held and irqs disabled 426 */ 427void hrtick_start(struct rq *rq, u64 delay) 428{ 429 struct hrtimer *timer = &rq->hrtick_timer; 430 ktime_t time = ktime_add_ns(timer->base->get_time(), delay); 431 432 hrtimer_set_expires(timer, time); 433 434 if (rq == this_rq()) { 435 __hrtick_restart(rq); 436 } else if (!rq->hrtick_csd_pending) { 437 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0); 438 rq->hrtick_csd_pending = 1; 439 } 440} 441 442static int 443hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) 444{ 445 int cpu = (int)(long)hcpu; 446 447 switch (action) { 448 case CPU_UP_CANCELED: 449 case CPU_UP_CANCELED_FROZEN: 450 case CPU_DOWN_PREPARE: 451 case CPU_DOWN_PREPARE_FROZEN: 452 case CPU_DEAD: 453 case CPU_DEAD_FROZEN: 454 hrtick_clear(cpu_rq(cpu)); 455 return NOTIFY_OK; 456 } 457 458 return NOTIFY_DONE; 459} 460 461static __init void init_hrtick(void) 462{ 463 hotcpu_notifier(hotplug_hrtick, 0); 464} 465#else 466/* 467 * Called to set the hrtick timer state. 468 * 469 * called with rq->lock held and irqs disabled 470 */ 471void hrtick_start(struct rq *rq, u64 delay) 472{ 473 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, 474 HRTIMER_MODE_REL_PINNED, 0); 475} 476 477static inline void init_hrtick(void) 478{ 479} 480#endif /* CONFIG_SMP */ 481 482static void init_rq_hrtick(struct rq *rq) 483{ 484#ifdef CONFIG_SMP 485 rq->hrtick_csd_pending = 0; 486 487 rq->hrtick_csd.flags = 0; 488 rq->hrtick_csd.func = __hrtick_start; 489 rq->hrtick_csd.info = rq; 490#endif 491 492 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 493 rq->hrtick_timer.function = hrtick; 494} 495#else /* CONFIG_SCHED_HRTICK */ 496static inline void hrtick_clear(struct rq *rq) 497{ 498} 499 500static inline void init_rq_hrtick(struct rq *rq) 501{ 502} 503 504static inline void init_hrtick(void) 505{ 506} 507#endif /* CONFIG_SCHED_HRTICK */ 508 509/* 510 * resched_task - mark a task 'to be rescheduled now'. 511 * 512 * On UP this means the setting of the need_resched flag, on SMP it 513 * might also involve a cross-CPU call to trigger the scheduler on 514 * the target CPU. 515 */ 516void resched_task(struct task_struct *p) 517{ 518 int cpu; 519 520 lockdep_assert_held(&task_rq(p)->lock); 521 522 if (test_tsk_need_resched(p)) 523 return; 524 525 set_tsk_need_resched(p); 526 527 cpu = task_cpu(p); 528 if (cpu == smp_processor_id()) { 529 set_preempt_need_resched(); 530 return; 531 } 532 533 /* NEED_RESCHED must be visible before we test polling */ 534 smp_mb(); 535 if (!tsk_is_polling(p)) 536 smp_send_reschedule(cpu); 537} 538 539void resched_cpu(int cpu) 540{ 541 struct rq *rq = cpu_rq(cpu); 542 unsigned long flags; 543 544 if (!raw_spin_trylock_irqsave(&rq->lock, flags)) 545 return; 546 resched_task(cpu_curr(cpu)); 547 raw_spin_unlock_irqrestore(&rq->lock, flags); 548} 549 550#ifdef CONFIG_SMP 551#ifdef CONFIG_NO_HZ_COMMON 552/* 553 * In the semi idle case, use the nearest busy cpu for migrating timers 554 * from an idle cpu. This is good for power-savings. 555 * 556 * We don't do similar optimization for completely idle system, as 557 * selecting an idle cpu will add more delays to the timers than intended 558 * (as that cpu's timer base may not be uptodate wrt jiffies etc). 559 */ 560int get_nohz_timer_target(void) 561{ 562 int cpu = smp_processor_id(); 563 int i; 564 struct sched_domain *sd; 565 566 rcu_read_lock(); 567 for_each_domain(cpu, sd) { 568 for_each_cpu(i, sched_domain_span(sd)) { 569 if (!idle_cpu(i)) { 570 cpu = i; 571 goto unlock; 572 } 573 } 574 } 575unlock: 576 rcu_read_unlock(); 577 return cpu; 578} 579/* 580 * When add_timer_on() enqueues a timer into the timer wheel of an 581 * idle CPU then this timer might expire before the next timer event 582 * which is scheduled to wake up that CPU. In case of a completely 583 * idle system the next event might even be infinite time into the 584 * future. wake_up_idle_cpu() ensures that the CPU is woken up and 585 * leaves the inner idle loop so the newly added timer is taken into 586 * account when the CPU goes back to idle and evaluates the timer 587 * wheel for the next timer event. 588 */ 589static void wake_up_idle_cpu(int cpu) 590{ 591 struct rq *rq = cpu_rq(cpu); 592 593 if (cpu == smp_processor_id()) 594 return; 595 596 /* 597 * This is safe, as this function is called with the timer 598 * wheel base lock of (cpu) held. When the CPU is on the way 599 * to idle and has not yet set rq->curr to idle then it will 600 * be serialized on the timer wheel base lock and take the new 601 * timer into account automatically. 602 */ 603 if (rq->curr != rq->idle) 604 return; 605 606 /* 607 * We can set TIF_RESCHED on the idle task of the other CPU 608 * lockless. The worst case is that the other CPU runs the 609 * idle task through an additional NOOP schedule() 610 */ 611 set_tsk_need_resched(rq->idle); 612 613 /* NEED_RESCHED must be visible before we test polling */ 614 smp_mb(); 615 if (!tsk_is_polling(rq->idle)) 616 smp_send_reschedule(cpu); 617} 618 619static bool wake_up_full_nohz_cpu(int cpu) 620{ 621 if (tick_nohz_full_cpu(cpu)) { 622 if (cpu != smp_processor_id() || 623 tick_nohz_tick_stopped()) 624 smp_send_reschedule(cpu); 625 return true; 626 } 627 628 return false; 629} 630 631void wake_up_nohz_cpu(int cpu) 632{ 633 if (!wake_up_full_nohz_cpu(cpu)) 634 wake_up_idle_cpu(cpu); 635} 636 637static inline bool got_nohz_idle_kick(void) 638{ 639 int cpu = smp_processor_id(); 640 641 if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu))) 642 return false; 643 644 if (idle_cpu(cpu) && !need_resched()) 645 return true; 646 647 /* 648 * We can't run Idle Load Balance on this CPU for this time so we 649 * cancel it and clear NOHZ_BALANCE_KICK 650 */ 651 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); 652 return false; 653} 654 655#else /* CONFIG_NO_HZ_COMMON */ 656 657static inline bool got_nohz_idle_kick(void) 658{ 659 return false; 660} 661 662#endif /* CONFIG_NO_HZ_COMMON */ 663 664#ifdef CONFIG_NO_HZ_FULL 665bool sched_can_stop_tick(void) 666{ 667 struct rq *rq; 668 669 rq = this_rq(); 670 671 /* Make sure rq->nr_running update is visible after the IPI */ 672 smp_rmb(); 673 674 /* More than one running task need preemption */ 675 if (rq->nr_running > 1) 676 return false; 677 678 return true; 679} 680#endif /* CONFIG_NO_HZ_FULL */ 681 682void sched_avg_update(struct rq *rq) 683{ 684 s64 period = sched_avg_period(); 685 686 while ((s64)(rq_clock(rq) - rq->age_stamp) > period) { 687 /* 688 * Inline assembly required to prevent the compiler 689 * optimising this loop into a divmod call. 690 * See __iter_div_u64_rem() for another example of this. 691 */ 692 asm("" : "+rm" (rq->age_stamp)); 693 rq->age_stamp += period; 694 rq->rt_avg /= 2; 695 } 696} 697 698#endif /* CONFIG_SMP */ 699 700#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ 701 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH))) 702/* 703 * Iterate task_group tree rooted at *from, calling @down when first entering a 704 * node and @up when leaving it for the final time. 705 * 706 * Caller must hold rcu_lock or sufficient equivalent. 707 */ 708int walk_tg_tree_from(struct task_group *from, 709 tg_visitor down, tg_visitor up, void *data) 710{ 711 struct task_group *parent, *child; 712 int ret; 713 714 parent = from; 715 716down: 717 ret = (*down)(parent, data); 718 if (ret) 719 goto out; 720 list_for_each_entry_rcu(child, &parent->children, siblings) { 721 parent = child; 722 goto down; 723 724up: 725 continue; 726 } 727 ret = (*up)(parent, data); 728 if (ret || parent == from) 729 goto out; 730 731 child = parent; 732 parent = parent->parent; 733 if (parent) 734 goto up; 735out: 736 return ret; 737} 738 739int tg_nop(struct task_group *tg, void *data) 740{ 741 return 0; 742} 743#endif 744 745static void set_load_weight(struct task_struct *p) 746{ 747 int prio = p->static_prio - MAX_RT_PRIO; 748 struct load_weight *load = &p->se.load; 749 750 /* 751 * SCHED_IDLE tasks get minimal weight: 752 */ 753 if (p->policy == SCHED_IDLE) { 754 load->weight = scale_load(WEIGHT_IDLEPRIO); 755 load->inv_weight = WMULT_IDLEPRIO; 756 return; 757 } 758 759 load->weight = scale_load(prio_to_weight[prio]); 760 load->inv_weight = prio_to_wmult[prio]; 761} 762 763static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 764{ 765 update_rq_clock(rq); 766 sched_info_queued(rq, p); 767 p->sched_class->enqueue_task(rq, p, flags); 768} 769 770static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 771{ 772 update_rq_clock(rq); 773 sched_info_dequeued(rq, p); 774 p->sched_class->dequeue_task(rq, p, flags); 775} 776 777void activate_task(struct rq *rq, struct task_struct *p, int flags) 778{ 779 if (task_contributes_to_load(p)) 780 rq->nr_uninterruptible--; 781 782 enqueue_task(rq, p, flags); 783} 784 785void deactivate_task(struct rq *rq, struct task_struct *p, int flags) 786{ 787 if (task_contributes_to_load(p)) 788 rq->nr_uninterruptible++; 789 790 dequeue_task(rq, p, flags); 791} 792 793static void update_rq_clock_task(struct rq *rq, s64 delta) 794{ 795/* 796 * In theory, the compile should just see 0 here, and optimize out the call 797 * to sched_rt_avg_update. But I don't trust it... 798 */ 799#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) 800 s64 steal = 0, irq_delta = 0; 801#endif 802#ifdef CONFIG_IRQ_TIME_ACCOUNTING 803 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; 804 805 /* 806 * Since irq_time is only updated on {soft,}irq_exit, we might run into 807 * this case when a previous update_rq_clock() happened inside a 808 * {soft,}irq region. 809 * 810 * When this happens, we stop ->clock_task and only update the 811 * prev_irq_time stamp to account for the part that fit, so that a next 812 * update will consume the rest. This ensures ->clock_task is 813 * monotonic. 814 * 815 * It does however cause some slight miss-attribution of {soft,}irq 816 * time, a more accurate solution would be to update the irq_time using 817 * the current rq->clock timestamp, except that would require using 818 * atomic ops. 819 */ 820 if (irq_delta > delta) 821 irq_delta = delta; 822 823 rq->prev_irq_time += irq_delta; 824 delta -= irq_delta; 825#endif 826#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING 827 if (static_key_false((¶virt_steal_rq_enabled))) { 828 u64 st; 829 830 steal = paravirt_steal_clock(cpu_of(rq)); 831 steal -= rq->prev_steal_time_rq; 832 833 if (unlikely(steal > delta)) 834 steal = delta; 835 836 st = steal_ticks(steal); 837 steal = st * TICK_NSEC; 838 839 rq->prev_steal_time_rq += steal; 840 841 delta -= steal; 842 } 843#endif 844 845 rq->clock_task += delta; 846 847#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) 848 if ((irq_delta + steal) && sched_feat(NONTASK_POWER)) 849 sched_rt_avg_update(rq, irq_delta + steal); 850#endif 851} 852 853void sched_set_stop_task(int cpu, struct task_struct *stop) 854{ 855 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; 856 struct task_struct *old_stop = cpu_rq(cpu)->stop; 857 858 if (stop) { 859 /* 860 * Make it appear like a SCHED_FIFO task, its something 861 * userspace knows about and won't get confused about. 862 * 863 * Also, it will make PI more or less work without too 864 * much confusion -- but then, stop work should not 865 * rely on PI working anyway. 866 */ 867 sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m); 868 869 stop->sched_class = &stop_sched_class; 870 } 871 872 cpu_rq(cpu)->stop = stop; 873 874 if (old_stop) { 875 /* 876 * Reset it back to a normal scheduling class so that 877 * it can die in pieces. 878 */ 879 old_stop->sched_class = &rt_sched_class; 880 } 881} 882 883/* 884 * __normal_prio - return the priority that is based on the static prio 885 */ 886static inline int __normal_prio(struct task_struct *p) 887{ 888 return p->static_prio; 889} 890 891/* 892 * Calculate the expected normal priority: i.e. priority 893 * without taking RT-inheritance into account. Might be 894 * boosted by interactivity modifiers. Changes upon fork, 895 * setprio syscalls, and whenever the interactivity 896 * estimator recalculates. 897 */ 898static inline int normal_prio(struct task_struct *p) 899{ 900 int prio; 901 902 if (task_has_rt_policy(p)) 903 prio = MAX_RT_PRIO-1 - p->rt_priority; 904 else 905 prio = __normal_prio(p); 906 return prio; 907} 908 909/* 910 * Calculate the current priority, i.e. the priority 911 * taken into account by the scheduler. This value might 912 * be boosted by RT tasks, or might be boosted by 913 * interactivity modifiers. Will be RT if the task got 914 * RT-boosted. If not then it returns p->normal_prio. 915 */ 916static int effective_prio(struct task_struct *p) 917{ 918 p->normal_prio = normal_prio(p); 919 /* 920 * If we are RT tasks or we were boosted to RT priority, 921 * keep the priority unchanged. Otherwise, update priority 922 * to the normal priority: 923 */ 924 if (!rt_prio(p->prio)) 925 return p->normal_prio; 926 return p->prio; 927} 928 929/** 930 * task_curr - is this task currently executing on a CPU? 931 * @p: the task in question. 932 * 933 * Return: 1 if the task is currently executing. 0 otherwise. 934 */ 935inline int task_curr(const struct task_struct *p) 936{ 937 return cpu_curr(task_cpu(p)) == p; 938} 939 940static inline void check_class_changed(struct rq *rq, struct task_struct *p, 941 const struct sched_class *prev_class, 942 int oldprio) 943{ 944 if (prev_class != p->sched_class) { 945 if (prev_class->switched_from) 946 prev_class->switched_from(rq, p); 947 p->sched_class->switched_to(rq, p); 948 } else if (oldprio != p->prio) 949 p->sched_class->prio_changed(rq, p, oldprio); 950} 951 952void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) 953{ 954 const struct sched_class *class; 955 956 if (p->sched_class == rq->curr->sched_class) { 957 rq->curr->sched_class->check_preempt_curr(rq, p, flags); 958 } else { 959 for_each_class(class) { 960 if (class == rq->curr->sched_class) 961 break; 962 if (class == p->sched_class) { 963 resched_task(rq->curr); 964 break; 965 } 966 } 967 } 968 969 /* 970 * A queue event has occurred, and we're going to schedule. In 971 * this case, we can save a useless back to back clock update. 972 */ 973 if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) 974 rq->skip_clock_update = 1; 975} 976 977#ifdef CONFIG_SMP 978void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 979{ 980#ifdef CONFIG_SCHED_DEBUG 981 /* 982 * We should never call set_task_cpu() on a blocked task, 983 * ttwu() will sort out the placement. 984 */ 985 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && 986 !(task_preempt_count(p) & PREEMPT_ACTIVE)); 987 988#ifdef CONFIG_LOCKDEP 989 /* 990 * The caller should hold either p->pi_lock or rq->lock, when changing 991 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. 992 * 993 * sched_move_task() holds both and thus holding either pins the cgroup, 994 * see task_group(). 995 * 996 * Furthermore, all task_rq users should acquire both locks, see 997 * task_rq_lock(). 998 */ 999 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || 1000 lockdep_is_held(&task_rq(p)->lock))); 1001#endif 1002#endif 1003 1004 trace_sched_migrate_task(p, new_cpu); 1005 1006 if (task_cpu(p) != new_cpu) { 1007 if (p->sched_class->migrate_task_rq) 1008 p->sched_class->migrate_task_rq(p, new_cpu); 1009 p->se.nr_migrations++; 1010 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); 1011 } 1012 1013 __set_task_cpu(p, new_cpu); 1014} 1015 1016static void __migrate_swap_task(struct task_struct *p, int cpu) 1017{ 1018 if (p->on_rq) { 1019 struct rq *src_rq, *dst_rq; 1020 1021 src_rq = task_rq(p); 1022 dst_rq = cpu_rq(cpu); 1023 1024 deactivate_task(src_rq, p, 0); 1025 set_task_cpu(p, cpu); 1026 activate_task(dst_rq, p, 0); 1027 check_preempt_curr(dst_rq, p, 0); 1028 } else { 1029 /* 1030 * Task isn't running anymore; make it appear like we migrated 1031 * it before it went to sleep. This means on wakeup we make the 1032 * previous cpu our targer instead of where it really is. 1033 */ 1034 p->wake_cpu = cpu; 1035 } 1036} 1037 1038struct migration_swap_arg { 1039 struct task_struct *src_task, *dst_task; 1040 int src_cpu, dst_cpu; 1041}; 1042 1043static int migrate_swap_stop(void *data) 1044{ 1045 struct migration_swap_arg *arg = data; 1046 struct rq *src_rq, *dst_rq; 1047 int ret = -EAGAIN; 1048 1049 src_rq = cpu_rq(arg->src_cpu); 1050 dst_rq = cpu_rq(arg->dst_cpu); 1051 1052 double_raw_lock(&arg->src_task->pi_lock, 1053 &arg->dst_task->pi_lock); 1054 double_rq_lock(src_rq, dst_rq); 1055 if (task_cpu(arg->dst_task) != arg->dst_cpu) 1056 goto unlock; 1057 1058 if (task_cpu(arg->src_task) != arg->src_cpu) 1059 goto unlock; 1060 1061 if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task))) 1062 goto unlock; 1063 1064 if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task))) 1065 goto unlock; 1066 1067 __migrate_swap_task(arg->src_task, arg->dst_cpu); 1068 __migrate_swap_task(arg->dst_task, arg->src_cpu); 1069 1070 ret = 0; 1071 1072unlock: 1073 double_rq_unlock(src_rq, dst_rq); 1074 raw_spin_unlock(&arg->dst_task->pi_lock); 1075 raw_spin_unlock(&arg->src_task->pi_lock); 1076 1077 return ret; 1078} 1079 1080/* 1081 * Cross migrate two tasks 1082 */ 1083int migrate_swap(struct task_struct *cur, struct task_struct *p) 1084{ 1085 struct migration_swap_arg arg; 1086 int ret = -EINVAL; 1087 1088 get_online_cpus(); 1089 1090 arg = (struct migration_swap_arg){ 1091 .src_task = cur, 1092 .src_cpu = task_cpu(cur), 1093 .dst_task = p, 1094 .dst_cpu = task_cpu(p), 1095 }; 1096 1097 if (arg.src_cpu == arg.dst_cpu) 1098 goto out; 1099 1100 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu)) 1101 goto out; 1102 1103 if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task))) 1104 goto out; 1105 1106 if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task))) 1107 goto out; 1108 1109 ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg); 1110 1111out: 1112 put_online_cpus(); 1113 return ret; 1114} 1115 1116struct migration_arg { 1117 struct task_struct *task; 1118 int dest_cpu; 1119}; 1120 1121static int migration_cpu_stop(void *data); 1122 1123/* 1124 * wait_task_inactive - wait for a thread to unschedule. 1125 * 1126 * If @match_state is nonzero, it's the @p->state value just checked and 1127 * not expected to change. If it changes, i.e. @p might have woken up, 1128 * then return zero. When we succeed in waiting for @p to be off its CPU, 1129 * we return a positive number (its total switch count). If a second call 1130 * a short while later returns the same number, the caller can be sure that 1131 * @p has remained unscheduled the whole time. 1132 * 1133 * The caller must ensure that the task *will* unschedule sometime soon, 1134 * else this function might spin for a *long* time. This function can't 1135 * be called with interrupts off, or it may introduce deadlock with 1136 * smp_call_function() if an IPI is sent by the same process we are 1137 * waiting to become inactive. 1138 */ 1139unsigned long wait_task_inactive(struct task_struct *p, long match_state) 1140{ 1141 unsigned long flags; 1142 int running, on_rq; 1143 unsigned long ncsw; 1144 struct rq *rq; 1145 1146 for (;;) { 1147 /* 1148 * We do the initial early heuristics without holding 1149 * any task-queue locks at all. We'll only try to get 1150 * the runqueue lock when things look like they will 1151 * work out! 1152 */ 1153 rq = task_rq(p); 1154 1155 /* 1156 * If the task is actively running on another CPU 1157 * still, just relax and busy-wait without holding 1158 * any locks. 1159 * 1160 * NOTE! Since we don't hold any locks, it's not 1161 * even sure that "rq" stays as the right runqueue! 1162 * But we don't care, since "task_running()" will 1163 * return false if the runqueue has changed and p 1164 * is actually now running somewhere else! 1165 */ 1166 while (task_running(rq, p)) { 1167 if (match_state && unlikely(p->state != match_state)) 1168 return 0; 1169 cpu_relax(); 1170 } 1171 1172 /* 1173 * Ok, time to look more closely! We need the rq 1174 * lock now, to be *sure*. If we're wrong, we'll 1175 * just go back and repeat. 1176 */ 1177 rq = task_rq_lock(p, &flags); 1178 trace_sched_wait_task(p); 1179 running = task_running(rq, p); 1180 on_rq = p->on_rq; 1181 ncsw = 0; 1182 if (!match_state || p->state == match_state) 1183 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ 1184 task_rq_unlock(rq, p, &flags); 1185 1186 /* 1187 * If it changed from the expected state, bail out now. 1188 */ 1189 if (unlikely(!ncsw)) 1190 break; 1191 1192 /* 1193 * Was it really running after all now that we 1194 * checked with the proper locks actually held? 1195 * 1196 * Oops. Go back and try again.. 1197 */ 1198 if (unlikely(running)) { 1199 cpu_relax(); 1200 continue; 1201 } 1202 1203 /* 1204 * It's not enough that it's not actively running, 1205 * it must be off the runqueue _entirely_, and not 1206 * preempted! 1207 * 1208 * So if it was still runnable (but just not actively 1209 * running right now), it's preempted, and we should 1210 * yield - it could be a while. 1211 */ 1212 if (unlikely(on_rq)) { 1213 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); 1214 1215 set_current_state(TASK_UNINTERRUPTIBLE); 1216 schedule_hrtimeout(&to, HRTIMER_MODE_REL); 1217 continue; 1218 } 1219 1220 /* 1221 * Ahh, all good. It wasn't running, and it wasn't 1222 * runnable, which means that it will never become 1223 * running in the future either. We're all done! 1224 */ 1225 break; 1226 } 1227 1228 return ncsw; 1229} 1230 1231/*** 1232 * kick_process - kick a running thread to enter/exit the kernel 1233 * @p: the to-be-kicked thread 1234 * 1235 * Cause a process which is running on another CPU to enter 1236 * kernel-mode, without any delay. (to get signals handled.) 1237 * 1238 * NOTE: this function doesn't have to take the runqueue lock, 1239 * because all it wants to ensure is that the remote task enters 1240 * the kernel. If the IPI races and the task has been migrated 1241 * to another CPU then no harm is done and the purpose has been 1242 * achieved as well. 1243 */ 1244void kick_process(struct task_struct *p) 1245{ 1246 int cpu; 1247 1248 preempt_disable(); 1249 cpu = task_cpu(p); 1250 if ((cpu != smp_processor_id()) && task_curr(p)) 1251 smp_send_reschedule(cpu); 1252 preempt_enable(); 1253} 1254EXPORT_SYMBOL_GPL(kick_process); 1255#endif /* CONFIG_SMP */ 1256 1257#ifdef CONFIG_SMP 1258/* 1259 * ->cpus_allowed is protected by both rq->lock and p->pi_lock 1260 */ 1261static int select_fallback_rq(int cpu, struct task_struct *p) 1262{ 1263 int nid = cpu_to_node(cpu); 1264 const struct cpumask *nodemask = NULL; 1265 enum { cpuset, possible, fail } state = cpuset; 1266 int dest_cpu; 1267 1268 /* 1269 * If the node that the cpu is on has been offlined, cpu_to_node() 1270 * will return -1. There is no cpu on the node, and we should 1271 * select the cpu on the other node. 1272 */ 1273 if (nid != -1) { 1274 nodemask = cpumask_of_node(nid); 1275 1276 /* Look for allowed, online CPU in same node. */ 1277 for_each_cpu(dest_cpu, nodemask) { 1278 if (!cpu_online(dest_cpu)) 1279 continue; 1280 if (!cpu_active(dest_cpu)) 1281 continue; 1282 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) 1283 return dest_cpu; 1284 } 1285 } 1286 1287 for (;;) { 1288 /* Any allowed, online CPU? */ 1289 for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) { 1290 if (!cpu_online(dest_cpu)) 1291 continue; 1292 if (!cpu_active(dest_cpu)) 1293 continue; 1294 goto out; 1295 } 1296 1297 switch (state) { 1298 case cpuset: 1299 /* No more Mr. Nice Guy. */ 1300 cpuset_cpus_allowed_fallback(p); 1301 state = possible; 1302 break; 1303 1304 case possible: 1305 do_set_cpus_allowed(p, cpu_possible_mask); 1306 state = fail; 1307 break; 1308 1309 case fail: 1310 BUG(); 1311 break; 1312 } 1313 } 1314 1315out: 1316 if (state != cpuset) { 1317 /* 1318 * Don't tell them about moving exiting tasks or 1319 * kernel threads (both mm NULL), since they never 1320 * leave kernel. 1321 */ 1322 if (p->mm && printk_ratelimit()) { 1323 printk_sched("process %d (%s) no longer affine to cpu%d\n", 1324 task_pid_nr(p), p->comm, cpu); 1325 } 1326 } 1327 1328 return dest_cpu; 1329} 1330 1331/* 1332 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. 1333 */ 1334static inline 1335int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) 1336{ 1337 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); 1338 1339 /* 1340 * In order not to call set_task_cpu() on a blocking task we need 1341 * to rely on ttwu() to place the task on a valid ->cpus_allowed 1342 * cpu. 1343 * 1344 * Since this is common to all placement strategies, this lives here. 1345 * 1346 * [ this allows ->select_task() to simply return task_cpu(p) and 1347 * not worry about this generic constraint ] 1348 */ 1349 if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) || 1350 !cpu_online(cpu))) 1351 cpu = select_fallback_rq(task_cpu(p), p); 1352 1353 return cpu; 1354} 1355 1356static void update_avg(u64 *avg, u64 sample) 1357{ 1358 s64 diff = sample - *avg; 1359 *avg += diff >> 3; 1360} 1361#endif 1362 1363static void 1364ttwu_stat(struct task_struct *p, int cpu, int wake_flags) 1365{ 1366#ifdef CONFIG_SCHEDSTATS 1367 struct rq *rq = this_rq(); 1368 1369#ifdef CONFIG_SMP 1370 int this_cpu = smp_processor_id(); 1371 1372 if (cpu == this_cpu) { 1373 schedstat_inc(rq, ttwu_local); 1374 schedstat_inc(p, se.statistics.nr_wakeups_local); 1375 } else { 1376 struct sched_domain *sd; 1377 1378 schedstat_inc(p, se.statistics.nr_wakeups_remote); 1379 rcu_read_lock(); 1380 for_each_domain(this_cpu, sd) { 1381 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { 1382 schedstat_inc(sd, ttwu_wake_remote); 1383 break; 1384 } 1385 } 1386 rcu_read_unlock(); 1387 } 1388 1389 if (wake_flags & WF_MIGRATED) 1390 schedstat_inc(p, se.statistics.nr_wakeups_migrate); 1391 1392#endif /* CONFIG_SMP */ 1393 1394 schedstat_inc(rq, ttwu_count); 1395 schedstat_inc(p, se.statistics.nr_wakeups); 1396 1397 if (wake_flags & WF_SYNC) 1398 schedstat_inc(p, se.statistics.nr_wakeups_sync); 1399 1400#endif /* CONFIG_SCHEDSTATS */ 1401} 1402 1403static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) 1404{ 1405 activate_task(rq, p, en_flags); 1406 p->on_rq = 1; 1407 1408 /* if a worker is waking up, notify workqueue */ 1409 if (p->flags & PF_WQ_WORKER) 1410 wq_worker_waking_up(p, cpu_of(rq)); 1411} 1412 1413/* 1414 * Mark the task runnable and perform wakeup-preemption. 1415 */ 1416static void 1417ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) 1418{ 1419 check_preempt_curr(rq, p, wake_flags); 1420 trace_sched_wakeup(p, true); 1421 1422 p->state = TASK_RUNNING; 1423#ifdef CONFIG_SMP 1424 if (p->sched_class->task_woken) 1425 p->sched_class->task_woken(rq, p); 1426 1427 if (rq->idle_stamp) { 1428 u64 delta = rq_clock(rq) - rq->idle_stamp; 1429 u64 max = 2*rq->max_idle_balance_cost; 1430 1431 update_avg(&rq->avg_idle, delta); 1432 1433 if (rq->avg_idle > max) 1434 rq->avg_idle = max; 1435 1436 rq->idle_stamp = 0; 1437 } 1438#endif 1439} 1440 1441static void 1442ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) 1443{ 1444#ifdef CONFIG_SMP 1445 if (p->sched_contributes_to_load) 1446 rq->nr_uninterruptible--; 1447#endif 1448 1449 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING); 1450 ttwu_do_wakeup(rq, p, wake_flags); 1451} 1452 1453/* 1454 * Called in case the task @p isn't fully descheduled from its runqueue, 1455 * in this case we must do a remote wakeup. Its a 'light' wakeup though, 1456 * since all we need to do is flip p->state to TASK_RUNNING, since 1457 * the task is still ->on_rq. 1458 */ 1459static int ttwu_remote(struct task_struct *p, int wake_flags) 1460{ 1461 struct rq *rq; 1462 int ret = 0; 1463 1464 rq = __task_rq_lock(p); 1465 if (p->on_rq) { 1466 /* check_preempt_curr() may use rq clock */ 1467 update_rq_clock(rq); 1468 ttwu_do_wakeup(rq, p, wake_flags); 1469 ret = 1; 1470 } 1471 __task_rq_unlock(rq); 1472 1473 return ret; 1474} 1475 1476#ifdef CONFIG_SMP 1477static void sched_ttwu_pending(void) 1478{ 1479 struct rq *rq = this_rq(); 1480 struct llist_node *llist = llist_del_all(&rq->wake_list); 1481 struct task_struct *p; 1482 1483 raw_spin_lock(&rq->lock); 1484 1485 while (llist) { 1486 p = llist_entry(llist, struct task_struct, wake_entry); 1487 llist = llist_next(llist); 1488 ttwu_do_activate(rq, p, 0); 1489 } 1490 1491 raw_spin_unlock(&rq->lock); 1492} 1493 1494void scheduler_ipi(void) 1495{ 1496 /* 1497 * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting 1498 * TIF_NEED_RESCHED remotely (for the first time) will also send 1499 * this IPI. 1500 */ 1501 if (tif_need_resched()) 1502 set_preempt_need_resched(); 1503 1504 if (llist_empty(&this_rq()->wake_list) 1505 && !tick_nohz_full_cpu(smp_processor_id()) 1506 && !got_nohz_idle_kick()) 1507 return; 1508 1509 /* 1510 * Not all reschedule IPI handlers call irq_enter/irq_exit, since 1511 * traditionally all their work was done from the interrupt return 1512 * path. Now that we actually do some work, we need to make sure 1513 * we do call them. 1514 * 1515 * Some archs already do call them, luckily irq_enter/exit nest 1516 * properly. 1517 * 1518 * Arguably we should visit all archs and update all handlers, 1519 * however a fair share of IPIs are still resched only so this would 1520 * somewhat pessimize the simple resched case. 1521 */ 1522 irq_enter(); 1523 tick_nohz_full_check(); 1524 sched_ttwu_pending(); 1525 1526 /* 1527 * Check if someone kicked us for doing the nohz idle load balance. 1528 */ 1529 if (unlikely(got_nohz_idle_kick())) { 1530 this_rq()->idle_balance = 1; 1531 raise_softirq_irqoff(SCHED_SOFTIRQ); 1532 } 1533 irq_exit(); 1534} 1535 1536static void ttwu_queue_remote(struct task_struct *p, int cpu) 1537{ 1538 if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) 1539 smp_send_reschedule(cpu); 1540} 1541 1542bool cpus_share_cache(int this_cpu, int that_cpu) 1543{ 1544 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); 1545} 1546#endif /* CONFIG_SMP */ 1547 1548static void ttwu_queue(struct task_struct *p, int cpu) 1549{ 1550 struct rq *rq = cpu_rq(cpu); 1551 1552#if defined(CONFIG_SMP) 1553 if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { 1554 sched_clock_cpu(cpu); /* sync clocks x-cpu */ 1555 ttwu_queue_remote(p, cpu); 1556 return; 1557 } 1558#endif 1559 1560 raw_spin_lock(&rq->lock); 1561 ttwu_do_activate(rq, p, 0); 1562 raw_spin_unlock(&rq->lock); 1563} 1564 1565/** 1566 * try_to_wake_up - wake up a thread 1567 * @p: the thread to be awakened 1568 * @state: the mask of task states that can be woken 1569 * @wake_flags: wake modifier flags (WF_*) 1570 * 1571 * Put it on the run-queue if it's not already there. The "current" 1572 * thread is always on the run-queue (except when the actual 1573 * re-schedule is in progress), and as such you're allowed to do 1574 * the simpler "current->state = TASK_RUNNING" to mark yourself 1575 * runnable without the overhead of this. 1576 * 1577 * Return: %true if @p was woken up, %false if it was already running. 1578 * or @state didn't match @p's state. 1579 */ 1580static int 1581try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) 1582{ 1583 unsigned long flags; 1584 int cpu, success = 0; 1585 1586 /* 1587 * If we are going to wake up a thread waiting for CONDITION we 1588 * need to ensure that CONDITION=1 done by the caller can not be 1589 * reordered with p->state check below. This pairs with mb() in 1590 * set_current_state() the waiting thread does. 1591 */ 1592 smp_mb__before_spinlock(); 1593 raw_spin_lock_irqsave(&p->pi_lock, flags); 1594 if (!(p->state & state)) 1595 goto out; 1596 1597 success = 1; /* we're going to change ->state */ 1598 cpu = task_cpu(p); 1599 1600 if (p->on_rq && ttwu_remote(p, wake_flags)) 1601 goto stat; 1602 1603#ifdef CONFIG_SMP 1604 /* 1605 * If the owning (remote) cpu is still in the middle of schedule() with 1606 * this task as prev, wait until its done referencing the task. 1607 */ 1608 while (p->on_cpu) 1609 cpu_relax(); 1610 /* 1611 * Pairs with the smp_wmb() in finish_lock_switch(). 1612 */ 1613 smp_rmb(); 1614 1615 p->sched_contributes_to_load = !!task_contributes_to_load(p); 1616 p->state = TASK_WAKING; 1617 1618 if (p->sched_class->task_waking) 1619 p->sched_class->task_waking(p); 1620 1621 cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); 1622 if (task_cpu(p) != cpu) { 1623 wake_flags |= WF_MIGRATED; 1624 set_task_cpu(p, cpu); 1625 } 1626#endif /* CONFIG_SMP */ 1627 1628 ttwu_queue(p, cpu); 1629stat: 1630 ttwu_stat(p, cpu, wake_flags); 1631out: 1632 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 1633 1634 return success; 1635} 1636 1637/** 1638 * try_to_wake_up_local - try to wake up a local task with rq lock held 1639 * @p: the thread to be awakened 1640 * 1641 * Put @p on the run-queue if it's not already there. The caller must 1642 * ensure that this_rq() is locked, @p is bound to this_rq() and not 1643 * the current task. 1644 */ 1645static void try_to_wake_up_local(struct task_struct *p) 1646{ 1647 struct rq *rq = task_rq(p); 1648 1649 if (WARN_ON_ONCE(rq != this_rq()) || 1650 WARN_ON_ONCE(p == current)) 1651 return; 1652 1653 lockdep_assert_held(&rq->lock); 1654 1655 if (!raw_spin_trylock(&p->pi_lock)) { 1656 raw_spin_unlock(&rq->lock); 1657 raw_spin_lock(&p->pi_lock); 1658 raw_spin_lock(&rq->lock); 1659 } 1660 1661 if (!(p->state & TASK_NORMAL)) 1662 goto out; 1663 1664 if (!p->on_rq) 1665 ttwu_activate(rq, p, ENQUEUE_WAKEUP); 1666 1667 ttwu_do_wakeup(rq, p, 0); 1668 ttwu_stat(p, smp_processor_id(), 0); 1669out: 1670 raw_spin_unlock(&p->pi_lock); 1671} 1672 1673/** 1674 * wake_up_process - Wake up a specific process 1675 * @p: The process to be woken up. 1676 * 1677 * Attempt to wake up the nominated process and move it to the set of runnable 1678 * processes. 1679 * 1680 * Return: 1 if the process was woken up, 0 if it was already running. 1681 * 1682 * It may be assumed that this function implies a write memory barrier before 1683 * changing the task state if and only if any tasks are woken up. 1684 */ 1685int wake_up_process(struct task_struct *p) 1686{ 1687 WARN_ON(task_is_stopped_or_traced(p)); 1688 return try_to_wake_up(p, TASK_NORMAL, 0); 1689} 1690EXPORT_SYMBOL(wake_up_process); 1691 1692int wake_up_state(struct task_struct *p, unsigned int state) 1693{ 1694 return try_to_wake_up(p, state, 0); 1695} 1696 1697/* 1698 * Perform scheduler related setup for a newly forked process p. 1699 * p is forked by current. 1700 * 1701 * __sched_fork() is basic setup used by init_idle() too: 1702 */ 1703static void __sched_fork(unsigned long clone_flags, struct task_struct *p) 1704{ 1705 p->on_rq = 0; 1706 1707 p->se.on_rq = 0; 1708 p->se.exec_start = 0; 1709 p->se.sum_exec_runtime = 0; 1710 p->se.prev_sum_exec_runtime = 0; 1711 p->se.nr_migrations = 0; 1712 p->se.vruntime = 0; 1713 INIT_LIST_HEAD(&p->se.group_node); 1714 1715#ifdef CONFIG_SCHEDSTATS 1716 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 1717#endif 1718 1719 INIT_LIST_HEAD(&p->rt.run_list); 1720 1721#ifdef CONFIG_PREEMPT_NOTIFIERS 1722 INIT_HLIST_HEAD(&p->preempt_notifiers); 1723#endif 1724 1725#ifdef CONFIG_NUMA_BALANCING 1726 if (p->mm && atomic_read(&p->mm->mm_users) == 1) { 1727 p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); 1728 p->mm->numa_scan_seq = 0; 1729 } 1730 1731 if (clone_flags & CLONE_VM) 1732 p->numa_preferred_nid = current->numa_preferred_nid; 1733 else 1734 p->numa_preferred_nid = -1; 1735 1736 p->node_stamp = 0ULL; 1737 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; 1738 p->numa_scan_period = sysctl_numa_balancing_scan_delay; 1739 p->numa_work.next = &p->numa_work; 1740 p->numa_faults = NULL; 1741 p->numa_faults_buffer = NULL; 1742 1743 INIT_LIST_HEAD(&p->numa_entry); 1744 p->numa_group = NULL; 1745#endif /* CONFIG_NUMA_BALANCING */ 1746} 1747 1748#ifdef CONFIG_NUMA_BALANCING 1749#ifdef CONFIG_SCHED_DEBUG 1750void set_numabalancing_state(bool enabled) 1751{ 1752 if (enabled) 1753 sched_feat_set("NUMA"); 1754 else 1755 sched_feat_set("NO_NUMA"); 1756} 1757#else 1758__read_mostly bool numabalancing_enabled; 1759 1760void set_numabalancing_state(bool enabled) 1761{ 1762 numabalancing_enabled = enabled; 1763} 1764#endif /* CONFIG_SCHED_DEBUG */ 1765#endif /* CONFIG_NUMA_BALANCING */ 1766 1767/* 1768 * fork()/clone()-time setup: 1769 */ 1770void sched_fork(unsigned long clone_flags, struct task_struct *p) 1771{ 1772 unsigned long flags; 1773 int cpu = get_cpu(); 1774 1775 __sched_fork(clone_flags, p); 1776 /* 1777 * We mark the process as running here. This guarantees that 1778 * nobody will actually run it, and a signal or other external 1779 * event cannot wake it up and insert it on the runqueue either. 1780 */ 1781 p->state = TASK_RUNNING; 1782 1783 /* 1784 * Make sure we do not leak PI boosting priority to the child. 1785 */ 1786 p->prio = current->normal_prio; 1787 1788 /* 1789 * Revert to default priority/policy on fork if requested. 1790 */ 1791 if (unlikely(p->sched_reset_on_fork)) { 1792 if (task_has_rt_policy(p)) { 1793 p->policy = SCHED_NORMAL; 1794 p->static_prio = NICE_TO_PRIO(0); 1795 p->rt_priority = 0; 1796 } else if (PRIO_TO_NICE(p->static_prio) < 0) 1797 p->static_prio = NICE_TO_PRIO(0); 1798 1799 p->prio = p->normal_prio = __normal_prio(p); 1800 set_load_weight(p); 1801 1802 /* 1803 * We don't need the reset flag anymore after the fork. It has 1804 * fulfilled its duty: 1805 */ 1806 p->sched_reset_on_fork = 0; 1807 } 1808 1809 if (!rt_prio(p->prio)) 1810 p->sched_class = &fair_sched_class; 1811 1812 if (p->sched_class->task_fork) 1813 p->sched_class->task_fork(p); 1814 1815 /* 1816 * The child is not yet in the pid-hash so no cgroup attach races, 1817 * and the cgroup is pinned to this child due to cgroup_fork() 1818 * is ran before sched_fork(). 1819 * 1820 * Silence PROVE_RCU. 1821 */ 1822 raw_spin_lock_irqsave(&p->pi_lock, flags); 1823 set_task_cpu(p, cpu); 1824 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 1825 1826#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 1827 if (likely(sched_info_on())) 1828 memset(&p->sched_info, 0, sizeof(p->sched_info)); 1829#endif 1830#if defined(CONFIG_SMP) 1831 p->on_cpu = 0; 1832#endif 1833 init_task_preempt_count(p); 1834#ifdef CONFIG_SMP 1835 plist_node_init(&p->pushable_tasks, MAX_PRIO); 1836#endif 1837 1838 put_cpu(); 1839} 1840 1841/* 1842 * wake_up_new_task - wake up a newly created task for the first time. 1843 * 1844 * This function will do some initial scheduler statistics housekeeping 1845 * that must be done for every newly created context, then puts the task 1846 * on the runqueue and wakes it. 1847 */ 1848void wake_up_new_task(struct task_struct *p) 1849{ 1850 unsigned long flags; 1851 struct rq *rq; 1852 1853 raw_spin_lock_irqsave(&p->pi_lock, flags); 1854#ifdef CONFIG_SMP 1855 /* 1856 * Fork balancing, do it here and not earlier because: 1857 * - cpus_allowed can change in the fork path 1858 * - any previously selected cpu might disappear through hotplug 1859 */ 1860 set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); 1861#endif 1862 1863 /* Initialize new task's runnable average */ 1864 init_task_runnable_average(p); 1865 rq = __task_rq_lock(p); 1866 activate_task(rq, p, 0); 1867 p->on_rq = 1; 1868 trace_sched_wakeup_new(p, true); 1869 check_preempt_curr(rq, p, WF_FORK); 1870#ifdef CONFIG_SMP 1871 if (p->sched_class->task_woken) 1872 p->sched_class->task_woken(rq, p); 1873#endif 1874 task_rq_unlock(rq, p, &flags); 1875} 1876 1877#ifdef CONFIG_PREEMPT_NOTIFIERS 1878 1879/** 1880 * preempt_notifier_register - tell me when current is being preempted & rescheduled 1881 * @notifier: notifier struct to register 1882 */ 1883void preempt_notifier_register(struct preempt_notifier *notifier) 1884{ 1885 hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); 1886} 1887EXPORT_SYMBOL_GPL(preempt_notifier_register); 1888 1889/** 1890 * preempt_notifier_unregister - no longer interested in preemption notifications 1891 * @notifier: notifier struct to unregister 1892 * 1893 * This is safe to call from within a preemption notifier. 1894 */ 1895void preempt_notifier_unregister(struct preempt_notifier *notifier) 1896{ 1897 hlist_del(¬ifier->link); 1898} 1899EXPORT_SYMBOL_GPL(preempt_notifier_unregister); 1900 1901static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 1902{ 1903 struct preempt_notifier *notifier; 1904 1905 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) 1906 notifier->ops->sched_in(notifier, raw_smp_processor_id()); 1907} 1908 1909static void 1910fire_sched_out_preempt_notifiers(struct task_struct *curr, 1911 struct task_struct *next) 1912{ 1913 struct preempt_notifier *notifier; 1914 1915 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) 1916 notifier->ops->sched_out(notifier, next); 1917} 1918 1919#else /* !CONFIG_PREEMPT_NOTIFIERS */ 1920 1921static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 1922{ 1923} 1924 1925static void 1926fire_sched_out_preempt_notifiers(struct task_struct *curr, 1927 struct task_struct *next) 1928{ 1929} 1930 1931#endif /* CONFIG_PREEMPT_NOTIFIERS */ 1932 1933/** 1934 * prepare_task_switch - prepare to switch tasks 1935 * @rq: the runqueue preparing to switch 1936 * @prev: the current task that is being switched out 1937 * @next: the task we are going to switch to. 1938 * 1939 * This is called with the rq lock held and interrupts off. It must 1940 * be paired with a subsequent finish_task_switch after the context 1941 * switch. 1942 * 1943 * prepare_task_switch sets up locking and calls architecture specific 1944 * hooks. 1945 */ 1946static inline void 1947prepare_task_switch(struct rq *rq, struct task_struct *prev, 1948 struct task_struct *next) 1949{ 1950 trace_sched_switch(prev, next); 1951 sched_info_switch(rq, prev, next); 1952 perf_event_task_sched_out(prev, next); 1953 fire_sched_out_preempt_notifiers(prev, next); 1954 prepare_lock_switch(rq, next); 1955 prepare_arch_switch(next); 1956} 1957 1958/** 1959 * finish_task_switch - clean up after a task-switch 1960 * @rq: runqueue associated with task-switch 1961 * @prev: the thread we just switched away from. 1962 * 1963 * finish_task_switch must be called after the context switch, paired 1964 * with a prepare_task_switch call before the context switch. 1965 * finish_task_switch will reconcile locking set up by prepare_task_switch, 1966 * and do any other architecture-specific cleanup actions. 1967 * 1968 * Note that we may have delayed dropping an mm in context_switch(). If 1969 * so, we finish that here outside of the runqueue lock. (Doing it 1970 * with the lock held can cause deadlocks; see schedule() for 1971 * details.) 1972 */ 1973static void finish_task_switch(struct rq *rq, struct task_struct *prev) 1974 __releases(rq->lock) 1975{ 1976 struct mm_struct *mm = rq->prev_mm; 1977 long prev_state; 1978 1979 rq->prev_mm = NULL; 1980 1981 /* 1982 * A task struct has one reference for the use as "current". 1983 * If a task dies, then it sets TASK_DEAD in tsk->state and calls 1984 * schedule one last time. The schedule call will never return, and 1985 * the scheduled task must drop that reference. 1986 * The test for TASK_DEAD must occur while the runqueue locks are 1987 * still held, otherwise prev could be scheduled on another cpu, die 1988 * there before we look at prev->state, and then the reference would 1989 * be dropped twice. 1990 * Manfred Spraul <manfred@colorfullife.com> 1991 */ 1992 prev_state = prev->state; 1993 vtime_task_switch(prev); 1994 finish_arch_switch(prev); 1995 perf_event_task_sched_in(prev, current); 1996 finish_lock_switch(rq, prev); 1997 finish_arch_post_lock_switch(); 1998 1999 fire_sched_in_preempt_notifiers(current); 2000 if (mm) 2001 mmdrop(mm); 2002 if (unlikely(prev_state == TASK_DEAD)) { 2003 task_numa_free(prev); 2004 2005 /* 2006 * Remove function-return probe instances associated with this 2007 * task and put them back on the free list. 2008 */ 2009 kprobe_flush_task(prev); 2010 put_task_struct(prev); 2011 } 2012 2013 tick_nohz_task_switch(current); 2014} 2015 2016#ifdef CONFIG_SMP 2017 2018/* assumes rq->lock is held */ 2019static inline void pre_schedule(struct rq *rq, struct task_struct *prev) 2020{ 2021 if (prev->sched_class->pre_schedule) 2022 prev->sched_class->pre_schedule(rq, prev); 2023} 2024 2025/* rq->lock is NOT held, but preemption is disabled */ 2026static inline void post_schedule(struct rq *rq) 2027{ 2028 if (rq->post_schedule) { 2029 unsigned long flags; 2030 2031 raw_spin_lock_irqsave(&rq->lock, flags); 2032 if (rq->curr->sched_class->post_schedule) 2033 rq->curr->sched_class->post_schedule(rq); 2034 raw_spin_unlock_irqrestore(&rq->lock, flags); 2035 2036 rq->post_schedule = 0; 2037 } 2038} 2039 2040#else 2041 2042static inline void pre_schedule(struct rq *rq, struct task_struct *p) 2043{ 2044} 2045 2046static inline void post_schedule(struct rq *rq) 2047{ 2048} 2049 2050#endif 2051 2052/** 2053 * schedule_tail - first thing a freshly forked thread must call. 2054 * @prev: the thread we just switched away from. 2055 */ 2056asmlinkage void schedule_tail(struct task_struct *prev) 2057 __releases(rq->lock) 2058{ 2059 struct rq *rq = this_rq(); 2060 2061 finish_task_switch(rq, prev); 2062 2063 /* 2064 * FIXME: do we need to worry about rq being invalidated by the 2065 * task_switch? 2066 */ 2067 post_schedule(rq); 2068 2069#ifdef __ARCH_WANT_UNLOCKED_CTXSW 2070 /* In this case, finish_task_switch does not reenable preemption */ 2071 preempt_enable(); 2072#endif 2073 if (current->set_child_tid) 2074 put_user(task_pid_vnr(current), current->set_child_tid); 2075} 2076 2077/* 2078 * context_switch - switch to the new MM and the new 2079 * thread's register state. 2080 */ 2081static inline void 2082context_switch(struct rq *rq, struct task_struct *prev, 2083 struct task_struct *next) 2084{ 2085 struct mm_struct *mm, *oldmm; 2086 2087 prepare_task_switch(rq, prev, next); 2088 2089 mm = next->mm; 2090 oldmm = prev->active_mm; 2091 /* 2092 * For paravirt, this is coupled with an exit in switch_to to 2093 * combine the page table reload and the switch backend into 2094 * one hypercall. 2095 */ 2096 arch_start_context_switch(prev); 2097 2098 if (!mm) { 2099 next->active_mm = oldmm; 2100 atomic_inc(&oldmm->mm_count); 2101 enter_lazy_tlb(oldmm, next); 2102 } else 2103 switch_mm(oldmm, mm, next); 2104 2105 if (!prev->mm) { 2106 prev->active_mm = NULL; 2107 rq->prev_mm = oldmm; 2108 } 2109 /* 2110 * Since the runqueue lock will be released by the next 2111 * task (which is an invalid locking op but in the case 2112 * of the scheduler it's an obvious special-case), so we 2113 * do an early lockdep release here: 2114 */ 2115#ifndef __ARCH_WANT_UNLOCKED_CTXSW 2116 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 2117#endif 2118 2119 context_tracking_task_switch(prev, next); 2120 /* Here we just switch the register state and the stack. */ 2121 switch_to(prev, next, prev); 2122 2123 barrier(); 2124 /* 2125 * this_rq must be evaluated again because prev may have moved 2126 * CPUs since it called schedule(), thus the 'rq' on its stack 2127 * frame will be invalid. 2128 */ 2129 finish_task_switch(this_rq(), prev); 2130} 2131 2132/* 2133 * nr_running and nr_context_switches: 2134 * 2135 * externally visible scheduler statistics: current number of runnable 2136 * threads, total number of context switches performed since bootup. 2137 */ 2138unsigned long nr_running(void) 2139{ 2140 unsigned long i, sum = 0; 2141 2142 for_each_online_cpu(i) 2143 sum += cpu_rq(i)->nr_running; 2144 2145 return sum; 2146} 2147 2148unsigned long long nr_context_switches(void) 2149{ 2150 int i; 2151 unsigned long long sum = 0; 2152 2153 for_each_possible_cpu(i) 2154 sum += cpu_rq(i)->nr_switches; 2155 2156 return sum; 2157} 2158 2159unsigned long nr_iowait(void) 2160{ 2161 unsigned long i, sum = 0; 2162 2163 for_each_possible_cpu(i) 2164 sum += atomic_read(&cpu_rq(i)->nr_iowait); 2165 2166 return sum; 2167} 2168 2169unsigned long nr_iowait_cpu(int cpu) 2170{ 2171 struct rq *this = cpu_rq(cpu); 2172 return atomic_read(&this->nr_iowait); 2173} 2174 2175#ifdef CONFIG_SMP 2176 2177/* 2178 * sched_exec - execve() is a valuable balancing opportunity, because at 2179 * this point the task has the smallest effective memory and cache footprint. 2180 */ 2181void sched_exec(void) 2182{ 2183 struct task_struct *p = current; 2184 unsigned long flags; 2185 int dest_cpu; 2186 2187 raw_spin_lock_irqsave(&p->pi_lock, flags); 2188 dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); 2189 if (dest_cpu == smp_processor_id()) 2190 goto unlock; 2191 2192 if (likely(cpu_active(dest_cpu))) { 2193 struct migration_arg arg = { p, dest_cpu }; 2194 2195 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2196 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); 2197 return; 2198 } 2199unlock: 2200 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2201} 2202 2203#endif 2204 2205DEFINE_PER_CPU(struct kernel_stat, kstat); 2206DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); 2207 2208EXPORT_PER_CPU_SYMBOL(kstat); 2209EXPORT_PER_CPU_SYMBOL(kernel_cpustat); 2210 2211/* 2212 * Return any ns on the sched_clock that have not yet been accounted in 2213 * @p in case that task is currently running. 2214 * 2215 * Called with task_rq_lock() held on @rq. 2216 */ 2217static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) 2218{ 2219 u64 ns = 0; 2220 2221 if (task_current(rq, p)) { 2222 update_rq_clock(rq); 2223 ns = rq_clock_task(rq) - p->se.exec_start; 2224 if ((s64)ns < 0) 2225 ns = 0; 2226 } 2227 2228 return ns; 2229} 2230 2231unsigned long long task_delta_exec(struct task_struct *p) 2232{ 2233 unsigned long flags; 2234 struct rq *rq; 2235 u64 ns = 0; 2236 2237 rq = task_rq_lock(p, &flags); 2238 ns = do_task_delta_exec(p, rq); 2239 task_rq_unlock(rq, p, &flags); 2240 2241 return ns; 2242} 2243 2244/* 2245 * Return accounted runtime for the task. 2246 * In case the task is currently running, return the runtime plus current's 2247 * pending runtime that have not been accounted yet. 2248 */ 2249unsigned long long task_sched_runtime(struct task_struct *p) 2250{ 2251 unsigned long flags; 2252 struct rq *rq; 2253 u64 ns = 0; 2254 2255 rq = task_rq_lock(p, &flags); 2256 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); 2257 task_rq_unlock(rq, p, &flags); 2258 2259 return ns; 2260} 2261 2262/* 2263 * This function gets called by the timer code, with HZ frequency. 2264 * We call it with interrupts disabled. 2265 */ 2266void scheduler_tick(void) 2267{ 2268 int cpu = smp_processor_id(); 2269 struct rq *rq = cpu_rq(cpu); 2270 struct task_struct *curr = rq->curr; 2271 2272 sched_clock_tick(); 2273 2274 raw_spin_lock(&rq->lock); 2275 update_rq_clock(rq); 2276 curr->sched_class->task_tick(rq, curr, 0); 2277 update_cpu_load_active(rq); 2278 raw_spin_unlock(&rq->lock); 2279 2280 perf_event_task_tick(); 2281 2282#ifdef CONFIG_SMP 2283 rq->idle_balance = idle_cpu(cpu); 2284 trigger_load_balance(rq, cpu); 2285#endif 2286 rq_last_tick_reset(rq); 2287} 2288 2289#ifdef CONFIG_NO_HZ_FULL 2290/** 2291 * scheduler_tick_max_deferment 2292 * 2293 * Keep at least one tick per second when a single 2294 * active task is running because the scheduler doesn't 2295 * yet completely support full dynticks environment. 2296 * 2297 * This makes sure that uptime, CFS vruntime, load 2298 * balancing, etc... continue to move forward, even 2299 * with a very low granularity. 2300 * 2301 * Return: Maximum deferment in nanoseconds. 2302 */ 2303u64 scheduler_tick_max_deferment(void) 2304{ 2305 struct rq *rq = this_rq(); 2306 unsigned long next, now = ACCESS_ONCE(jiffies); 2307 2308 next = rq->last_sched_tick + HZ; 2309 2310 if (time_before_eq(next, now)) 2311 return 0; 2312 2313 return jiffies_to_usecs(next - now) * NSEC_PER_USEC; 2314} 2315#endif 2316 2317notrace unsigned long get_parent_ip(unsigned long addr) 2318{ 2319 if (in_lock_functions(addr)) { 2320 addr = CALLER_ADDR2; 2321 if (in_lock_functions(addr)) 2322 addr = CALLER_ADDR3; 2323 } 2324 return addr; 2325} 2326 2327#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ 2328 defined(CONFIG_PREEMPT_TRACER)) 2329 2330void __kprobes preempt_count_add(int val) 2331{ 2332#ifdef CONFIG_DEBUG_PREEMPT 2333 /* 2334 * Underflow? 2335 */ 2336 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) 2337 return; 2338#endif 2339 __preempt_count_add(val); 2340#ifdef CONFIG_DEBUG_PREEMPT 2341 /* 2342 * Spinlock count overflowing soon? 2343 */ 2344 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= 2345 PREEMPT_MASK - 10); 2346#endif 2347 if (preempt_count() == val) 2348 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 2349} 2350EXPORT_SYMBOL(preempt_count_add); 2351 2352void __kprobes preempt_count_sub(int val) 2353{ 2354#ifdef CONFIG_DEBUG_PREEMPT 2355 /* 2356 * Underflow? 2357 */ 2358 if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) 2359 return; 2360 /* 2361 * Is the spinlock portion underflowing? 2362 */ 2363 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && 2364 !(preempt_count() & PREEMPT_MASK))) 2365 return; 2366#endif 2367 2368 if (preempt_count() == val) 2369 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 2370 __preempt_count_sub(val); 2371} 2372EXPORT_SYMBOL(preempt_count_sub); 2373 2374#endif 2375 2376/* 2377 * Print scheduling while atomic bug: 2378 */ 2379static noinline void __schedule_bug(struct task_struct *prev) 2380{ 2381 if (oops_in_progress) 2382 return; 2383 2384 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", 2385 prev->comm, prev->pid, preempt_count()); 2386 2387 debug_show_held_locks(prev); 2388 print_modules(); 2389 if (irqs_disabled()) 2390 print_irqtrace_events(prev); 2391 dump_stack(); 2392 add_taint(TAINT_WARN, LOCKDEP_STILL_OK); 2393} 2394 2395/* 2396 * Various schedule()-time debugging checks and statistics: 2397 */ 2398static inline void schedule_debug(struct task_struct *prev) 2399{ 2400 /* 2401 * Test if we are atomic. Since do_exit() needs to call into 2402 * schedule() atomically, we ignore that path for now. 2403 * Otherwise, whine if we are scheduling when we should not be. 2404 */ 2405 if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) 2406 __schedule_bug(prev); 2407 rcu_sleep_check(); 2408 2409 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 2410 2411 schedstat_inc(this_rq(), sched_count); 2412} 2413 2414static void put_prev_task(struct rq *rq, struct task_struct *prev) 2415{ 2416 if (prev->on_rq || rq->skip_clock_update < 0) 2417 update_rq_clock(rq); 2418 prev->sched_class->put_prev_task(rq, prev); 2419} 2420 2421/* 2422 * Pick up the highest-prio task: 2423 */ 2424static inline struct task_struct * 2425pick_next_task(struct rq *rq) 2426{ 2427 const struct sched_class *class; 2428 struct task_struct *p; 2429 2430 /* 2431 * Optimization: we know that if all tasks are in 2432 * the fair class we can call that function directly: 2433 */ 2434 if (likely(rq->nr_running == rq->cfs.h_nr_running)) { 2435 p = fair_sched_class.pick_next_task(rq); 2436 if (likely(p)) 2437 return p; 2438 } 2439 2440 for_each_class(class) { 2441 p = class->pick_next_task(rq); 2442 if (p) 2443 return p; 2444 } 2445 2446 BUG(); /* the idle class will always have a runnable task */ 2447} 2448 2449/* 2450 * __schedule() is the main scheduler function. 2451 * 2452 * The main means of driving the scheduler and thus entering this function are: 2453 * 2454 * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. 2455 * 2456 * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return 2457 * paths. For example, see arch/x86/entry_64.S. 2458 * 2459 * To drive preemption between tasks, the scheduler sets the flag in timer 2460 * interrupt handler scheduler_tick(). 2461 * 2462 * 3. Wakeups don't really cause entry into schedule(). They add a 2463 * task to the run-queue and that's it. 2464 * 2465 * Now, if the new task added to the run-queue preempts the current 2466 * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets 2467 * called on the nearest possible occasion: 2468 * 2469 * - If the kernel is preemptible (CONFIG_PREEMPT=y): 2470 * 2471 * - in syscall or exception context, at the next outmost 2472 * preempt_enable(). (this might be as soon as the wake_up()'s 2473 * spin_unlock()!) 2474 * 2475 * - in IRQ context, return from interrupt-handler to 2476 * preemptible context 2477 * 2478 * - If the kernel is not preemptible (CONFIG_PREEMPT is not set) 2479 * then at the next: 2480 * 2481 * - cond_resched() call 2482 * - explicit schedule() call 2483 * - return from syscall or exception to user-space 2484 * - return from interrupt-handler to user-space 2485 */ 2486static void __sched __schedule(void) 2487{ 2488 struct task_struct *prev, *next; 2489 unsigned long *switch_count; 2490 struct rq *rq; 2491 int cpu; 2492 2493need_resched: 2494 preempt_disable(); 2495 cpu = smp_processor_id(); 2496 rq = cpu_rq(cpu); 2497 rcu_note_context_switch(cpu); 2498 prev = rq->curr; 2499 2500 schedule_debug(prev); 2501 2502 if (sched_feat(HRTICK)) 2503 hrtick_clear(rq); 2504 2505 /* 2506 * Make sure that signal_pending_state()->signal_pending() below 2507 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) 2508 * done by the caller to avoid the race with signal_wake_up(). 2509 */ 2510 smp_mb__before_spinlock(); 2511 raw_spin_lock_irq(&rq->lock); 2512 2513 switch_count = &prev->nivcsw; 2514 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 2515 if (unlikely(signal_pending_state(prev->state, prev))) { 2516 prev->state = TASK_RUNNING; 2517 } else { 2518 deactivate_task(rq, prev, DEQUEUE_SLEEP); 2519 prev->on_rq = 0; 2520 2521 /* 2522 * If a worker went to sleep, notify and ask workqueue 2523 * whether it wants to wake up a task to maintain 2524 * concurrency. 2525 */ 2526 if (prev->flags & PF_WQ_WORKER) { 2527 struct task_struct *to_wakeup; 2528 2529 to_wakeup = wq_worker_sleeping(prev, cpu); 2530 if (to_wakeup) 2531 try_to_wake_up_local(to_wakeup); 2532 } 2533 } 2534 switch_count = &prev->nvcsw; 2535 } 2536 2537 pre_schedule(rq, prev); 2538 2539 if (unlikely(!rq->nr_running)) 2540 idle_balance(cpu, rq); 2541 2542 put_prev_task(rq, prev); 2543 next = pick_next_task(rq); 2544 clear_tsk_need_resched(prev); 2545 clear_preempt_need_resched(); 2546 rq->skip_clock_update = 0; 2547 2548 if (likely(prev != next)) { 2549 rq->nr_switches++; 2550 rq->curr = next; 2551 ++*switch_count; 2552 2553 context_switch(rq, prev, next); /* unlocks the rq */ 2554 /* 2555 * The context switch have flipped the stack from under us 2556 * and restored the local variables which were saved when 2557 * this task called schedule() in the past. prev == current 2558 * is still correct, but it can be moved to another cpu/rq. 2559 */ 2560 cpu = smp_processor_id(); 2561 rq = cpu_rq(cpu); 2562 } else 2563 raw_spin_unlock_irq(&rq->lock); 2564 2565 post_schedule(rq); 2566 2567 sched_preempt_enable_no_resched(); 2568 if (need_resched()) 2569 goto need_resched; 2570} 2571 2572static inline void sched_submit_work(struct task_struct *tsk) 2573{ 2574 if (!tsk->state || tsk_is_pi_blocked(tsk)) 2575 return; 2576 /* 2577 * If we are going to sleep and we have plugged IO queued, 2578 * make sure to submit it to avoid deadlocks. 2579 */ 2580 if (blk_needs_flush_plug(tsk)) 2581 blk_schedule_flush_plug(tsk); 2582} 2583 2584asmlinkage void __sched schedule(void) 2585{ 2586 struct task_struct *tsk = current; 2587 2588 sched_submit_work(tsk); 2589 __schedule(); 2590} 2591EXPORT_SYMBOL(schedule); 2592 2593#ifdef CONFIG_CONTEXT_TRACKING 2594asmlinkage void __sched schedule_user(void) 2595{ 2596 /* 2597 * If we come here after a random call to set_need_resched(), 2598 * or we have been woken up remotely but the IPI has not yet arrived, 2599 * we haven't yet exited the RCU idle mode. Do it here manually until 2600 * we find a better solution. 2601 */ 2602 user_exit(); 2603 schedule(); 2604 user_enter(); 2605} 2606#endif 2607 2608/** 2609 * schedule_preempt_disabled - called with preemption disabled 2610 * 2611 * Returns with preemption disabled. Note: preempt_count must be 1 2612 */ 2613void __sched schedule_preempt_disabled(void) 2614{ 2615 sched_preempt_enable_no_resched(); 2616 schedule(); 2617 preempt_disable(); 2618} 2619 2620#ifdef CONFIG_PREEMPT 2621/* 2622 * this is the entry point to schedule() from in-kernel preemption 2623 * off of preempt_enable. Kernel preemptions off return from interrupt 2624 * occur there and call schedule directly. 2625 */ 2626asmlinkage void __sched notrace preempt_schedule(void) 2627{ 2628 /* 2629 * If there is a non-zero preempt_count or interrupts are disabled, 2630 * we do not want to preempt the current task. Just return.. 2631 */ 2632 if (likely(!preemptible())) 2633 return; 2634 2635 do { 2636 __preempt_count_add(PREEMPT_ACTIVE); 2637 __schedule(); 2638 __preempt_count_sub(PREEMPT_ACTIVE); 2639 2640 /* 2641 * Check again in case we missed a preemption opportunity 2642 * between schedule and now. 2643 */ 2644 barrier(); 2645 } while (need_resched()); 2646} 2647EXPORT_SYMBOL(preempt_schedule); 2648 2649/* 2650 * this is the entry point to schedule() from kernel preemption 2651 * off of irq context. 2652 * Note, that this is called and return with irqs disabled. This will 2653 * protect us against recursive calling from irq. 2654 */ 2655asmlinkage void __sched preempt_schedule_irq(void) 2656{ 2657 enum ctx_state prev_state; 2658 2659 /* Catch callers which need to be fixed */ 2660 BUG_ON(preempt_count() || !irqs_disabled()); 2661 2662 prev_state = exception_enter(); 2663 2664 do { 2665 __preempt_count_add(PREEMPT_ACTIVE); 2666 local_irq_enable(); 2667 __schedule(); 2668 local_irq_disable(); 2669 __preempt_count_sub(PREEMPT_ACTIVE); 2670 2671 /* 2672 * Check again in case we missed a preemption opportunity 2673 * between schedule and now. 2674 */ 2675 barrier(); 2676 } while (need_resched()); 2677 2678 exception_exit(prev_state); 2679} 2680 2681#endif /* CONFIG_PREEMPT */ 2682 2683int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, 2684 void *key) 2685{ 2686 return try_to_wake_up(curr->private, mode, wake_flags); 2687} 2688EXPORT_SYMBOL(default_wake_function); 2689 2690/* 2691 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just 2692 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve 2693 * number) then we wake all the non-exclusive tasks and one exclusive task. 2694 * 2695 * There are circumstances in which we can try to wake a task which has already 2696 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns 2697 * zero in this (rare) case, and we handle it by continuing to scan the queue. 2698 */ 2699static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, 2700 int nr_exclusive, int wake_flags, void *key) 2701{ 2702 wait_queue_t *curr, *next; 2703 2704 list_for_each_entry_safe(curr, next, &q->task_list, task_list) { 2705 unsigned flags = curr->flags; 2706 2707 if (curr->func(curr, mode, wake_flags, key) && 2708 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) 2709 break; 2710 } 2711} 2712 2713/** 2714 * __wake_up - wake up threads blocked on a waitqueue. 2715 * @q: the waitqueue 2716 * @mode: which threads 2717 * @nr_exclusive: how many wake-one or wake-many threads to wake up 2718 * @key: is directly passed to the wakeup function 2719 * 2720 * It may be assumed that this function implies a write memory barrier before 2721 * changing the task state if and only if any tasks are woken up. 2722 */ 2723void __wake_up(wait_queue_head_t *q, unsigned int mode, 2724 int nr_exclusive, void *key) 2725{ 2726 unsigned long flags; 2727 2728 spin_lock_irqsave(&q->lock, flags); 2729 __wake_up_common(q, mode, nr_exclusive, 0, key); 2730 spin_unlock_irqrestore(&q->lock, flags); 2731} 2732EXPORT_SYMBOL(__wake_up); 2733 2734/* 2735 * Same as __wake_up but called with the spinlock in wait_queue_head_t held. 2736 */ 2737void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr) 2738{ 2739 __wake_up_common(q, mode, nr, 0, NULL); 2740} 2741EXPORT_SYMBOL_GPL(__wake_up_locked); 2742 2743void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) 2744{ 2745 __wake_up_common(q, mode, 1, 0, key); 2746} 2747EXPORT_SYMBOL_GPL(__wake_up_locked_key); 2748 2749/** 2750 * __wake_up_sync_key - wake up threads blocked on a waitqueue. 2751 * @q: the waitqueue 2752 * @mode: which threads 2753 * @nr_exclusive: how many wake-one or wake-many threads to wake up 2754 * @key: opaque value to be passed to wakeup targets 2755 * 2756 * The sync wakeup differs that the waker knows that it will schedule 2757 * away soon, so while the target thread will be woken up, it will not 2758 * be migrated to another CPU - ie. the two threads are 'synchronized' 2759 * with each other. This can prevent needless bouncing between CPUs. 2760 * 2761 * On UP it can prevent extra preemption. 2762 * 2763 * It may be assumed that this function implies a write memory barrier before 2764 * changing the task state if and only if any tasks are woken up. 2765 */ 2766void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, 2767 int nr_exclusive, void *key) 2768{ 2769 unsigned long flags; 2770 int wake_flags = WF_SYNC; 2771 2772 if (unlikely(!q)) 2773 return; 2774 2775 if (unlikely(nr_exclusive != 1)) 2776 wake_flags = 0; 2777 2778 spin_lock_irqsave(&q->lock, flags); 2779 __wake_up_common(q, mode, nr_exclusive, wake_flags, key); 2780 spin_unlock_irqrestore(&q->lock, flags); 2781} 2782EXPORT_SYMBOL_GPL(__wake_up_sync_key); 2783 2784/* 2785 * __wake_up_sync - see __wake_up_sync_key() 2786 */ 2787void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) 2788{ 2789 __wake_up_sync_key(q, mode, nr_exclusive, NULL); 2790} 2791EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ 2792 2793/** 2794 * complete: - signals a single thread waiting on this completion 2795 * @x: holds the state of this particular completion 2796 * 2797 * This will wake up a single thread waiting on this completion. Threads will be 2798 * awakened in the same order in which they were queued. 2799 * 2800 * See also complete_all(), wait_for_completion() and related routines. 2801 * 2802 * It may be assumed that this function implies a write memory barrier before 2803 * changing the task state if and only if any tasks are woken up. 2804 */ 2805void complete(struct completion *x) 2806{ 2807 unsigned long flags; 2808 2809 spin_lock_irqsave(&x->wait.lock, flags); 2810 x->done++; 2811 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); 2812 spin_unlock_irqrestore(&x->wait.lock, flags); 2813} 2814EXPORT_SYMBOL(complete); 2815 2816/** 2817 * complete_all: - signals all threads waiting on this completion 2818 * @x: holds the state of this particular completion 2819 * 2820 * This will wake up all threads waiting on this particular completion event. 2821 * 2822 * It may be assumed that this function implies a write memory barrier before 2823 * changing the task state if and only if any tasks are woken up. 2824 */ 2825void complete_all(struct completion *x) 2826{ 2827 unsigned long flags; 2828 2829 spin_lock_irqsave(&x->wait.lock, flags); 2830 x->done += UINT_MAX/2; 2831 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); 2832 spin_unlock_irqrestore(&x->wait.lock, flags); 2833} 2834EXPORT_SYMBOL(complete_all); 2835 2836static inline long __sched 2837do_wait_for_common(struct completion *x, 2838 long (*action)(long), long timeout, int state) 2839{ 2840 if (!x->done) { 2841 DECLARE_WAITQUEUE(wait, current); 2842 2843 __add_wait_queue_tail_exclusive(&x->wait, &wait); 2844 do { 2845 if (signal_pending_state(state, current)) { 2846 timeout = -ERESTARTSYS; 2847 break; 2848 } 2849 __set_current_state(state); 2850 spin_unlock_irq(&x->wait.lock); 2851 timeout = action(timeout); 2852 spin_lock_irq(&x->wait.lock); 2853 } while (!x->done && timeout); 2854 __remove_wait_queue(&x->wait, &wait); 2855 if (!x->done) 2856 return timeout; 2857 } 2858 x->done--; 2859 return timeout ?: 1; 2860} 2861 2862static inline long __sched 2863__wait_for_common(struct completion *x, 2864 long (*action)(long), long timeout, int state) 2865{ 2866 might_sleep(); 2867 2868 spin_lock_irq(&x->wait.lock); 2869 timeout = do_wait_for_common(x, action, timeout, state); 2870 spin_unlock_irq(&x->wait.lock); 2871 return timeout; 2872} 2873 2874static long __sched 2875wait_for_common(struct completion *x, long timeout, int state) 2876{ 2877 return __wait_for_common(x, schedule_timeout, timeout, state); 2878} 2879 2880static long __sched 2881wait_for_common_io(struct completion *x, long timeout, int state) 2882{ 2883 return __wait_for_common(x, io_schedule_timeout, timeout, state); 2884} 2885 2886/** 2887 * wait_for_completion: - waits for completion of a task 2888 * @x: holds the state of this particular completion 2889 * 2890 * This waits to be signaled for completion of a specific task. It is NOT 2891 * interruptible and there is no timeout. 2892 * 2893 * See also similar routines (i.e. wait_for_completion_timeout()) with timeout 2894 * and interrupt capability. Also see complete(). 2895 */ 2896void __sched wait_for_completion(struct completion *x) 2897{ 2898 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); 2899} 2900EXPORT_SYMBOL(wait_for_completion); 2901 2902/** 2903 * wait_for_completion_timeout: - waits for completion of a task (w/timeout) 2904 * @x: holds the state of this particular completion 2905 * @timeout: timeout value in jiffies 2906 * 2907 * This waits for either a completion of a specific task to be signaled or for a 2908 * specified timeout to expire. The timeout is in jiffies. It is not 2909 * interruptible. 2910 * 2911 * Return: 0 if timed out, and positive (at least 1, or number of jiffies left 2912 * till timeout) if completed. 2913 */ 2914unsigned long __sched 2915wait_for_completion_timeout(struct completion *x, unsigned long timeout) 2916{ 2917 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); 2918} 2919EXPORT_SYMBOL(wait_for_completion_timeout); 2920 2921/** 2922 * wait_for_completion_io: - waits for completion of a task 2923 * @x: holds the state of this particular completion 2924 * 2925 * This waits to be signaled for completion of a specific task. It is NOT 2926 * interruptible and there is no timeout. The caller is accounted as waiting 2927 * for IO. 2928 */ 2929void __sched wait_for_completion_io(struct completion *x) 2930{ 2931 wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); 2932} 2933EXPORT_SYMBOL(wait_for_completion_io); 2934 2935/** 2936 * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout) 2937 * @x: holds the state of this particular completion 2938 * @timeout: timeout value in jiffies 2939 * 2940 * This waits for either a completion of a specific task to be signaled or for a 2941 * specified timeout to expire. The timeout is in jiffies. It is not 2942 * interruptible. The caller is accounted as waiting for IO. 2943 * 2944 * Return: 0 if timed out, and positive (at least 1, or number of jiffies left 2945 * till timeout) if completed. 2946 */ 2947unsigned long __sched 2948wait_for_completion_io_timeout(struct completion *x, unsigned long timeout) 2949{ 2950 return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE); 2951} 2952EXPORT_SYMBOL(wait_for_completion_io_timeout); 2953 2954/** 2955 * wait_for_completion_interruptible: - waits for completion of a task (w/intr) 2956 * @x: holds the state of this particular completion 2957 * 2958 * This waits for completion of a specific task to be signaled. It is 2959 * interruptible. 2960 * 2961 * Return: -ERESTARTSYS if interrupted, 0 if completed. 2962 */ 2963int __sched wait_for_completion_interruptible(struct completion *x) 2964{ 2965 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); 2966 if (t == -ERESTARTSYS) 2967 return t; 2968 return 0; 2969} 2970EXPORT_SYMBOL(wait_for_completion_interruptible); 2971 2972/** 2973 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) 2974 * @x: holds the state of this particular completion 2975 * @timeout: timeout value in jiffies 2976 * 2977 * This waits for either a completion of a specific task to be signaled or for a 2978 * specified timeout to expire. It is interruptible. The timeout is in jiffies. 2979 * 2980 * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1, 2981 * or number of jiffies left till timeout) if completed. 2982 */ 2983long __sched 2984wait_for_completion_interruptible_timeout(struct completion *x, 2985 unsigned long timeout) 2986{ 2987 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); 2988} 2989EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); 2990 2991/** 2992 * wait_for_completion_killable: - waits for completion of a task (killable) 2993 * @x: holds the state of this particular completion 2994 * 2995 * This waits to be signaled for completion of a specific task. It can be 2996 * interrupted by a kill signal. 2997 * 2998 * Return: -ERESTARTSYS if interrupted, 0 if completed. 2999 */ 3000int __sched wait_for_completion_killable(struct completion *x) 3001{ 3002 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); 3003 if (t == -ERESTARTSYS) 3004 return t; 3005 return 0; 3006} 3007EXPORT_SYMBOL(wait_for_completion_killable); 3008 3009/** 3010 * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable)) 3011 * @x: holds the state of this particular completion 3012 * @timeout: timeout value in jiffies 3013 * 3014 * This waits for either a completion of a specific task to be 3015 * signaled or for a specified timeout to expire. It can be 3016 * interrupted by a kill signal. The timeout is in jiffies. 3017 * 3018 * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1, 3019 * or number of jiffies left till timeout) if completed. 3020 */ 3021long __sched 3022wait_for_completion_killable_timeout(struct completion *x, 3023 unsigned long timeout) 3024{ 3025 return wait_for_common(x, timeout, TASK_KILLABLE); 3026} 3027EXPORT_SYMBOL(wait_for_completion_killable_timeout); 3028 3029/** 3030 * try_wait_for_completion - try to decrement a completion without blocking 3031 * @x: completion structure 3032 * 3033 * Return: 0 if a decrement cannot be done without blocking 3034 * 1 if a decrement succeeded. 3035 * 3036 * If a completion is being used as a counting completion, 3037 * attempt to decrement the counter without blocking. This 3038 * enables us to avoid waiting if the resource the completion 3039 * is protecting is not available. 3040 */ 3041bool try_wait_for_completion(struct completion *x) 3042{ 3043 unsigned long flags; 3044 int ret = 1; 3045 3046 spin_lock_irqsave(&x->wait.lock, flags); 3047 if (!x->done) 3048 ret = 0; 3049 else 3050 x->done--; 3051 spin_unlock_irqrestore(&x->wait.lock, flags); 3052 return ret; 3053} 3054EXPORT_SYMBOL(try_wait_for_completion); 3055 3056/** 3057 * completion_done - Test to see if a completion has any waiters 3058 * @x: completion structure 3059 * 3060 * Return: 0 if there are waiters (wait_for_completion() in progress) 3061 * 1 if there are no waiters. 3062 * 3063 */ 3064bool completion_done(struct completion *x) 3065{ 3066 unsigned long flags; 3067 int ret = 1; 3068 3069 spin_lock_irqsave(&x->wait.lock, flags); 3070 if (!x->done) 3071 ret = 0; 3072 spin_unlock_irqrestore(&x->wait.lock, flags); 3073 return ret; 3074} 3075EXPORT_SYMBOL(completion_done); 3076 3077static long __sched 3078sleep_on_common(wait_queue_head_t *q, int state, long timeout) 3079{ 3080 unsigned long flags; 3081 wait_queue_t wait; 3082 3083 init_waitqueue_entry(&wait, current); 3084 3085 __set_current_state(state); 3086 3087 spin_lock_irqsave(&q->lock, flags); 3088 __add_wait_queue(q, &wait); 3089 spin_unlock(&q->lock); 3090 timeout = schedule_timeout(timeout); 3091 spin_lock_irq(&q->lock); 3092 __remove_wait_queue(q, &wait); 3093 spin_unlock_irqrestore(&q->lock, flags); 3094 3095 return timeout; 3096} 3097 3098void __sched interruptible_sleep_on(wait_queue_head_t *q) 3099{ 3100 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); 3101} 3102EXPORT_SYMBOL(interruptible_sleep_on); 3103 3104long __sched 3105interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) 3106{ 3107 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout); 3108} 3109EXPORT_SYMBOL(interruptible_sleep_on_timeout); 3110 3111void __sched sleep_on(wait_queue_head_t *q) 3112{ 3113 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); 3114} 3115EXPORT_SYMBOL(sleep_on); 3116 3117long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) 3118{ 3119 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout); 3120} 3121EXPORT_SYMBOL(sleep_on_timeout); 3122 3123#ifdef CONFIG_RT_MUTEXES 3124 3125/* 3126 * rt_mutex_setprio - set the current priority of a task 3127 * @p: task 3128 * @prio: prio value (kernel-internal form) 3129 * 3130 * This function changes the 'effective' priority of a task. It does 3131 * not touch ->normal_prio like __setscheduler(). 3132 * 3133 * Used by the rt_mutex code to implement priority inheritance logic. 3134 */ 3135void rt_mutex_setprio(struct task_struct *p, int prio) 3136{ 3137 int oldprio, on_rq, running; 3138 struct rq *rq; 3139 const struct sched_class *prev_class; 3140 3141 BUG_ON(prio < 0 || prio > MAX_PRIO); 3142 3143 rq = __task_rq_lock(p); 3144 3145 /* 3146 * Idle task boosting is a nono in general. There is one 3147 * exception, when PREEMPT_RT and NOHZ is active: 3148 * 3149 * The idle task calls get_next_timer_interrupt() and holds 3150 * the timer wheel base->lock on the CPU and another CPU wants 3151 * to access the timer (probably to cancel it). We can safely 3152 * ignore the boosting request, as the idle CPU runs this code 3153 * with interrupts disabled and will complete the lock 3154 * protected section without being interrupted. So there is no 3155 * real need to boost. 3156 */ 3157 if (unlikely(p == rq->idle)) { 3158 WARN_ON(p != rq->curr); 3159 WARN_ON(p->pi_blocked_on); 3160 goto out_unlock; 3161 } 3162 3163 trace_sched_pi_setprio(p, prio); 3164 oldprio = p->prio; 3165 prev_class = p->sched_class; 3166 on_rq = p->on_rq; 3167 running = task_current(rq, p); 3168 if (on_rq) 3169 dequeue_task(rq, p, 0); 3170 if (running) 3171 p->sched_class->put_prev_task(rq, p); 3172 3173 if (rt_prio(prio)) 3174 p->sched_class = &rt_sched_class; 3175 else 3176 p->sched_class = &fair_sched_class; 3177 3178 p->prio = prio; 3179 3180 if (running) 3181 p->sched_class->set_curr_task(rq); 3182 if (on_rq) 3183 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); 3184 3185 check_class_changed(rq, p, prev_class, oldprio); 3186out_unlock: 3187 __task_rq_unlock(rq); 3188} 3189#endif 3190void set_user_nice(struct task_struct *p, long nice) 3191{ 3192 int old_prio, delta, on_rq; 3193 unsigned long flags; 3194 struct rq *rq; 3195 3196 if (TASK_NICE(p) == nice || nice < -20 || nice > 19) 3197 return; 3198 /* 3199 * We have to be careful, if called from sys_setpriority(), 3200 * the task might be in the middle of scheduling on another CPU. 3201 */ 3202 rq = task_rq_lock(p, &flags); 3203 /* 3204 * The RT priorities are set via sched_setscheduler(), but we still 3205 * allow the 'normal' nice value to be set - but as expected 3206 * it wont have any effect on scheduling until the task is 3207 * SCHED_FIFO/SCHED_RR: 3208 */ 3209 if (task_has_rt_policy(p)) { 3210 p->static_prio = NICE_TO_PRIO(nice); 3211 goto out_unlock; 3212 } 3213 on_rq = p->on_rq; 3214 if (on_rq) 3215 dequeue_task(rq, p, 0); 3216 3217 p->static_prio = NICE_TO_PRIO(nice); 3218 set_load_weight(p); 3219 old_prio = p->prio; 3220 p->prio = effective_prio(p); 3221 delta = p->prio - old_prio; 3222 3223 if (on_rq) { 3224 enqueue_task(rq, p, 0); 3225 /* 3226 * If the task increased its priority or is running and 3227 * lowered its priority, then reschedule its CPU: 3228 */ 3229 if (delta < 0 || (delta > 0 && task_running(rq, p))) 3230 resched_task(rq->curr); 3231 } 3232out_unlock: 3233 task_rq_unlock(rq, p, &flags); 3234} 3235EXPORT_SYMBOL(set_user_nice); 3236 3237/* 3238 * can_nice - check if a task can reduce its nice value 3239 * @p: task 3240 * @nice: nice value 3241 */ 3242int can_nice(const struct task_struct *p, const int nice) 3243{ 3244 /* convert nice value [19,-20] to rlimit style value [1,40] */ 3245 int nice_rlim = 20 - nice; 3246 3247 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || 3248 capable(CAP_SYS_NICE)); 3249} 3250 3251#ifdef __ARCH_WANT_SYS_NICE 3252 3253/* 3254 * sys_nice - change the priority of the current process. 3255 * @increment: priority increment 3256 * 3257 * sys_setpriority is a more generic, but much slower function that 3258 * does similar things. 3259 */ 3260SYSCALL_DEFINE1(nice, int, increment) 3261{ 3262 long nice, retval; 3263 3264 /* 3265 * Setpriority might change our priority at the same moment. 3266 * We don't have to worry. Conceptually one call occurs first 3267 * and we have a single winner. 3268 */ 3269 if (increment < -40) 3270 increment = -40; 3271 if (increment > 40) 3272 increment = 40; 3273 3274 nice = TASK_NICE(current) + increment; 3275 if (nice < -20) 3276 nice = -20; 3277 if (nice > 19) 3278 nice = 19; 3279 3280 if (increment < 0 && !can_nice(current, nice)) 3281 return -EPERM; 3282 3283 retval = security_task_setnice(current, nice); 3284 if (retval) 3285 return retval; 3286 3287 set_user_nice(current, nice); 3288 return 0; 3289} 3290 3291#endif 3292 3293/** 3294 * task_prio - return the priority value of a given task. 3295 * @p: the task in question. 3296 * 3297 * Return: The priority value as seen by users in /proc. 3298 * RT tasks are offset by -200. Normal tasks are centered 3299 * around 0, value goes from -16 to +15. 3300 */ 3301int task_prio(const struct task_struct *p) 3302{ 3303 return p->prio - MAX_RT_PRIO; 3304} 3305 3306/** 3307 * task_nice - return the nice value of a given task. 3308 * @p: the task in question. 3309 * 3310 * Return: The nice value [ -20 ... 0 ... 19 ]. 3311 */ 3312int task_nice(const struct task_struct *p) 3313{ 3314 return TASK_NICE(p); 3315} 3316EXPORT_SYMBOL(task_nice); 3317 3318/** 3319 * idle_cpu - is a given cpu idle currently? 3320 * @cpu: the processor in question. 3321 * 3322 * Return: 1 if the CPU is currently idle. 0 otherwise. 3323 */ 3324int idle_cpu(int cpu) 3325{ 3326 struct rq *rq = cpu_rq(cpu); 3327 3328 if (rq->curr != rq->idle) 3329 return 0; 3330 3331 if (rq->nr_running) 3332 return 0; 3333 3334#ifdef CONFIG_SMP 3335 if (!llist_empty(&rq->wake_list)) 3336 return 0; 3337#endif 3338 3339 return 1; 3340} 3341 3342/** 3343 * idle_task - return the idle task for a given cpu. 3344 * @cpu: the processor in question. 3345 * 3346 * Return: The idle task for the cpu @cpu. 3347 */ 3348struct task_struct *idle_task(int cpu) 3349{ 3350 return cpu_rq(cpu)->idle; 3351} 3352 3353/** 3354 * find_process_by_pid - find a process with a matching PID value. 3355 * @pid: the pid in question. 3356 * 3357 * The task of @pid, if found. %NULL otherwise. 3358 */ 3359static struct task_struct *find_process_by_pid(pid_t pid) 3360{ 3361 return pid ? find_task_by_vpid(pid) : current; 3362} 3363 3364/* Actually do priority change: must hold rq lock. */ 3365static void 3366__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) 3367{ 3368 p->policy = policy; 3369 p->rt_priority = prio; 3370 p->normal_prio = normal_prio(p); 3371 /* we are holding p->pi_lock already */ 3372 p->prio = rt_mutex_getprio(p); 3373 if (rt_prio(p->prio)) 3374 p->sched_class = &rt_sched_class; 3375 else 3376 p->sched_class = &fair_sched_class; 3377 set_load_weight(p); 3378} 3379 3380/* 3381 * check the target process has a UID that matches the current process's 3382 */ 3383static bool check_same_owner(struct task_struct *p) 3384{ 3385 const struct cred *cred = current_cred(), *pcred; 3386 bool match; 3387 3388 rcu_read_lock(); 3389 pcred = __task_cred(p); 3390 match = (uid_eq(cred->euid, pcred->euid) || 3391 uid_eq(cred->euid, pcred->uid)); 3392 rcu_read_unlock(); 3393 return match; 3394} 3395 3396static int __sched_setscheduler(struct task_struct *p, int policy, 3397 const struct sched_param *param, bool user) 3398{ 3399 int retval, oldprio, oldpolicy = -1, on_rq, running; 3400 unsigned long flags; 3401 const struct sched_class *prev_class; 3402 struct rq *rq; 3403 int reset_on_fork; 3404 3405 /* may grab non-irq protected spin_locks */ 3406 BUG_ON(in_interrupt()); 3407recheck: 3408 /* double check policy once rq lock held */ 3409 if (policy < 0) { 3410 reset_on_fork = p->sched_reset_on_fork; 3411 policy = oldpolicy = p->policy; 3412 } else { 3413 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); 3414 policy &= ~SCHED_RESET_ON_FORK; 3415 3416 if (policy != SCHED_FIFO && policy != SCHED_RR && 3417 policy != SCHED_NORMAL && policy != SCHED_BATCH && 3418 policy != SCHED_IDLE) 3419 return -EINVAL; 3420 } 3421 3422 /* 3423 * Valid priorities for SCHED_FIFO and SCHED_RR are 3424 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, 3425 * SCHED_BATCH and SCHED_IDLE is 0. 3426 */ 3427 if (param->sched_priority < 0 || 3428 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || 3429 (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) 3430 return -EINVAL; 3431 if (rt_policy(policy) != (param->sched_priority != 0)) 3432 return -EINVAL; 3433 3434 /* 3435 * Allow unprivileged RT tasks to decrease priority: 3436 */ 3437 if (user && !capable(CAP_SYS_NICE)) { 3438 if (rt_policy(policy)) { 3439 unsigned long rlim_rtprio = 3440 task_rlimit(p, RLIMIT_RTPRIO); 3441 3442 /* can't set/change the rt policy */ 3443 if (policy != p->policy && !rlim_rtprio) 3444 return -EPERM; 3445 3446 /* can't increase priority */ 3447 if (param->sched_priority > p->rt_priority && 3448 param->sched_priority > rlim_rtprio) 3449 return -EPERM; 3450 } 3451 3452 /* 3453 * Treat SCHED_IDLE as nice 20. Only allow a switch to 3454 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. 3455 */ 3456 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { 3457 if (!can_nice(p, TASK_NICE(p))) 3458 return -EPERM; 3459 } 3460 3461 /* can't change other user's priorities */ 3462 if (!check_same_owner(p)) 3463 return -EPERM; 3464 3465 /* Normal users shall not reset the sched_reset_on_fork flag */ 3466 if (p->sched_reset_on_fork && !reset_on_fork) 3467 return -EPERM; 3468 } 3469 3470 if (user) { 3471 retval = security_task_setscheduler(p); 3472 if (retval) 3473 return retval; 3474 } 3475 3476 /* 3477 * make sure no PI-waiters arrive (or leave) while we are 3478 * changing the priority of the task: 3479 * 3480 * To be able to change p->policy safely, the appropriate 3481 * runqueue lock must be held. 3482 */ 3483 rq = task_rq_lock(p, &flags); 3484 3485 /* 3486 * Changing the policy of the stop threads its a very bad idea 3487 */ 3488 if (p == rq->stop) { 3489 task_rq_unlock(rq, p, &flags); 3490 return -EINVAL; 3491 } 3492 3493 /* 3494 * If not changing anything there's no need to proceed further: 3495 */ 3496 if (unlikely(policy == p->policy && (!rt_policy(policy) || 3497 param->sched_priority == p->rt_priority))) { 3498 task_rq_unlock(rq, p, &flags); 3499 return 0; 3500 } 3501 3502#ifdef CONFIG_RT_GROUP_SCHED 3503 if (user) { 3504 /* 3505 * Do not allow realtime tasks into groups that have no runtime 3506 * assigned. 3507 */ 3508 if (rt_bandwidth_enabled() && rt_policy(policy) && 3509 task_group(p)->rt_bandwidth.rt_runtime == 0 && 3510 !task_group_is_autogroup(task_group(p))) { 3511 task_rq_unlock(rq, p, &flags); 3512 return -EPERM; 3513 } 3514 } 3515#endif 3516 3517 /* recheck policy now with rq lock held */ 3518 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 3519 policy = oldpolicy = -1; 3520 task_rq_unlock(rq, p, &flags); 3521 goto recheck; 3522 } 3523 on_rq = p->on_rq; 3524 running = task_current(rq, p); 3525 if (on_rq) 3526 dequeue_task(rq, p, 0); 3527 if (running) 3528 p->sched_class->put_prev_task(rq, p); 3529 3530 p->sched_reset_on_fork = reset_on_fork; 3531 3532 oldprio = p->prio; 3533 prev_class = p->sched_class; 3534 __setscheduler(rq, p, policy, param->sched_priority); 3535 3536 if (running) 3537 p->sched_class->set_curr_task(rq); 3538 if (on_rq) 3539 enqueue_task(rq, p, 0); 3540 3541 check_class_changed(rq, p, prev_class, oldprio); 3542 task_rq_unlock(rq, p, &flags); 3543 3544 rt_mutex_adjust_pi(p); 3545 3546 return 0; 3547} 3548 3549/** 3550 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. 3551 * @p: the task in question. 3552 * @policy: new policy. 3553 * @param: structure containing the new RT priority. 3554 * 3555 * Return: 0 on success. An error code otherwise. 3556 * 3557 * NOTE that the task may be already dead. 3558 */ 3559int sched_setscheduler(struct task_struct *p, int policy, 3560 const struct sched_param *param) 3561{ 3562 return __sched_setscheduler(p, policy, param, true); 3563} 3564EXPORT_SYMBOL_GPL(sched_setscheduler); 3565 3566/** 3567 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. 3568 * @p: the task in question. 3569 * @policy: new policy. 3570 * @param: structure containing the new RT priority. 3571 * 3572 * Just like sched_setscheduler, only don't bother checking if the 3573 * current context has permission. For example, this is needed in 3574 * stop_machine(): we create temporary high priority worker threads, 3575 * but our caller might not have that capability. 3576 * 3577 * Return: 0 on success. An error code otherwise. 3578 */ 3579int sched_setscheduler_nocheck(struct task_struct *p, int policy, 3580 const struct sched_param *param) 3581{ 3582 return __sched_setscheduler(p, policy, param, false); 3583} 3584 3585static int 3586do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 3587{ 3588 struct sched_param lparam; 3589 struct task_struct *p; 3590 int retval; 3591 3592 if (!param || pid < 0) 3593 return -EINVAL; 3594 if (copy_from_user(&lparam, param, sizeof(struct sched_param))) 3595 return -EFAULT; 3596 3597 rcu_read_lock(); 3598 retval = -ESRCH; 3599 p = find_process_by_pid(pid); 3600 if (p != NULL) 3601 retval = sched_setscheduler(p, policy, &lparam); 3602 rcu_read_unlock(); 3603 3604 return retval; 3605} 3606 3607/** 3608 * sys_sched_setscheduler - set/change the scheduler policy and RT priority 3609 * @pid: the pid in question. 3610 * @policy: new policy. 3611 * @param: structure containing the new RT priority. 3612 * 3613 * Return: 0 on success. An error code otherwise. 3614 */ 3615SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, 3616 struct sched_param __user *, param) 3617{ 3618 /* negative values for policy are not valid */ 3619 if (policy < 0) 3620 return -EINVAL; 3621 3622 return do_sched_setscheduler(pid, policy, param); 3623} 3624 3625/** 3626 * sys_sched_setparam - set/change the RT priority of a thread 3627 * @pid: the pid in question. 3628 * @param: structure containing the new RT priority. 3629 * 3630 * Return: 0 on success. An error code otherwise. 3631 */ 3632SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) 3633{ 3634 return do_sched_setscheduler(pid, -1, param); 3635} 3636 3637/** 3638 * sys_sched_getscheduler - get the policy (scheduling class) of a thread 3639 * @pid: the pid in question. 3640 * 3641 * Return: On success, the policy of the thread. Otherwise, a negative error 3642 * code. 3643 */ 3644SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) 3645{ 3646 struct task_struct *p; 3647 int retval; 3648 3649 if (pid < 0) 3650 return -EINVAL; 3651 3652 retval = -ESRCH; 3653 rcu_read_lock(); 3654 p = find_process_by_pid(pid); 3655 if (p) { 3656 retval = security_task_getscheduler(p); 3657 if (!retval) 3658 retval = p->policy 3659 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); 3660 } 3661 rcu_read_unlock(); 3662 return retval; 3663} 3664 3665/** 3666 * sys_sched_getparam - get the RT priority of a thread 3667 * @pid: the pid in question. 3668 * @param: structure containing the RT priority. 3669 * 3670 * Return: On success, 0 and the RT priority is in @param. Otherwise, an error 3671 * code. 3672 */ 3673SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) 3674{ 3675 struct sched_param lp; 3676 struct task_struct *p; 3677 int retval; 3678 3679 if (!param || pid < 0) 3680 return -EINVAL; 3681 3682 rcu_read_lock(); 3683 p = find_process_by_pid(pid); 3684 retval = -ESRCH; 3685 if (!p) 3686 goto out_unlock; 3687 3688 retval = security_task_getscheduler(p); 3689 if (retval) 3690 goto out_unlock; 3691 3692 lp.sched_priority = p->rt_priority; 3693 rcu_read_unlock(); 3694 3695 /* 3696 * This one might sleep, we cannot do it with a spinlock held ... 3697 */ 3698 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; 3699 3700 return retval; 3701 3702out_unlock: 3703 rcu_read_unlock(); 3704 return retval; 3705} 3706 3707long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) 3708{ 3709 cpumask_var_t cpus_allowed, new_mask; 3710 struct task_struct *p; 3711 int retval; 3712 3713 get_online_cpus(); 3714 rcu_read_lock(); 3715 3716 p = find_process_by_pid(pid); 3717 if (!p) { 3718 rcu_read_unlock(); 3719 put_online_cpus(); 3720 return -ESRCH; 3721 } 3722 3723 /* Prevent p going away */ 3724 get_task_struct(p); 3725 rcu_read_unlock(); 3726 3727 if (p->flags & PF_NO_SETAFFINITY) { 3728 retval = -EINVAL; 3729 goto out_put_task; 3730 } 3731 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { 3732 retval = -ENOMEM; 3733 goto out_put_task; 3734 } 3735 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { 3736 retval = -ENOMEM; 3737 goto out_free_cpus_allowed; 3738 } 3739 retval = -EPERM; 3740 if (!check_same_owner(p)) { 3741 rcu_read_lock(); 3742 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { 3743 rcu_read_unlock(); 3744 goto out_unlock; 3745 } 3746 rcu_read_unlock(); 3747 } 3748 3749 retval = security_task_setscheduler(p); 3750 if (retval) 3751 goto out_unlock; 3752 3753 cpuset_cpus_allowed(p, cpus_allowed); 3754 cpumask_and(new_mask, in_mask, cpus_allowed); 3755again: 3756 retval = set_cpus_allowed_ptr(p, new_mask); 3757 3758 if (!retval) { 3759 cpuset_cpus_allowed(p, cpus_allowed); 3760 if (!cpumask_subset(new_mask, cpus_allowed)) { 3761 /* 3762 * We must have raced with a concurrent cpuset 3763 * update. Just reset the cpus_allowed to the 3764 * cpuset's cpus_allowed 3765 */ 3766 cpumask_copy(new_mask, cpus_allowed); 3767 goto again; 3768 } 3769 } 3770out_unlock: 3771 free_cpumask_var(new_mask); 3772out_free_cpus_allowed: 3773 free_cpumask_var(cpus_allowed); 3774out_put_task: 3775 put_task_struct(p); 3776 put_online_cpus(); 3777 return retval; 3778} 3779 3780static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, 3781 struct cpumask *new_mask) 3782{ 3783 if (len < cpumask_size()) 3784 cpumask_clear(new_mask); 3785 else if (len > cpumask_size()) 3786 len = cpumask_size(); 3787 3788 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; 3789} 3790 3791/** 3792 * sys_sched_setaffinity - set the cpu affinity of a process 3793 * @pid: pid of the process 3794 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 3795 * @user_mask_ptr: user-space pointer to the new cpu mask 3796 * 3797 * Return: 0 on success. An error code otherwise. 3798 */ 3799SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, 3800 unsigned long __user *, user_mask_ptr) 3801{ 3802 cpumask_var_t new_mask; 3803 int retval; 3804 3805 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) 3806 return -ENOMEM; 3807 3808 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); 3809 if (retval == 0) 3810 retval = sched_setaffinity(pid, new_mask); 3811 free_cpumask_var(new_mask); 3812 return retval; 3813} 3814 3815long sched_getaffinity(pid_t pid, struct cpumask *mask) 3816{ 3817 struct task_struct *p; 3818 unsigned long flags; 3819 int retval; 3820 3821 get_online_cpus(); 3822 rcu_read_lock(); 3823 3824 retval = -ESRCH; 3825 p = find_process_by_pid(pid); 3826 if (!p) 3827 goto out_unlock; 3828 3829 retval = security_task_getscheduler(p); 3830 if (retval) 3831 goto out_unlock; 3832 3833 raw_spin_lock_irqsave(&p->pi_lock, flags); 3834 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 3835 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 3836 3837out_unlock: 3838 rcu_read_unlock(); 3839 put_online_cpus(); 3840 3841 return retval; 3842} 3843 3844/** 3845 * sys_sched_getaffinity - get the cpu affinity of a process 3846 * @pid: pid of the process 3847 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 3848 * @user_mask_ptr: user-space pointer to hold the current cpu mask 3849 * 3850 * Return: 0 on success. An error code otherwise. 3851 */ 3852SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, 3853 unsigned long __user *, user_mask_ptr) 3854{ 3855 int ret; 3856 cpumask_var_t mask; 3857 3858 if ((len * BITS_PER_BYTE) < nr_cpu_ids) 3859 return -EINVAL; 3860 if (len & (sizeof(unsigned long)-1)) 3861 return -EINVAL; 3862 3863 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 3864 return -ENOMEM; 3865 3866 ret = sched_getaffinity(pid, mask); 3867 if (ret == 0) { 3868 size_t retlen = min_t(size_t, len, cpumask_size()); 3869 3870 if (copy_to_user(user_mask_ptr, mask, retlen)) 3871 ret = -EFAULT; 3872 else 3873 ret = retlen; 3874 } 3875 free_cpumask_var(mask); 3876 3877 return ret; 3878} 3879 3880/** 3881 * sys_sched_yield - yield the current processor to other threads. 3882 * 3883 * This function yields the current CPU to other tasks. If there are no 3884 * other threads running on this CPU then this function will return. 3885 * 3886 * Return: 0. 3887 */ 3888SYSCALL_DEFINE0(sched_yield) 3889{ 3890 struct rq *rq = this_rq_lock(); 3891 3892 schedstat_inc(rq, yld_count); 3893 current->sched_class->yield_task(rq); 3894 3895 /* 3896 * Since we are going to call schedule() anyway, there's 3897 * no need to preempt or enable interrupts: 3898 */ 3899 __release(rq->lock); 3900 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 3901 do_raw_spin_unlock(&rq->lock); 3902 sched_preempt_enable_no_resched(); 3903 3904 schedule(); 3905 3906 return 0; 3907} 3908 3909static void __cond_resched(void) 3910{ 3911 __preempt_count_add(PREEMPT_ACTIVE); 3912 __schedule(); 3913 __preempt_count_sub(PREEMPT_ACTIVE); 3914} 3915 3916int __sched _cond_resched(void) 3917{ 3918 if (should_resched()) { 3919 __cond_resched(); 3920 return 1; 3921 } 3922 return 0; 3923} 3924EXPORT_SYMBOL(_cond_resched); 3925 3926/* 3927 * __cond_resched_lock() - if a reschedule is pending, drop the given lock, 3928 * call schedule, and on return reacquire the lock. 3929 * 3930 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level 3931 * operations here to prevent schedule() from being called twice (once via 3932 * spin_unlock(), once by hand). 3933 */ 3934int __cond_resched_lock(spinlock_t *lock) 3935{ 3936 int resched = should_resched(); 3937 int ret = 0; 3938 3939 lockdep_assert_held(lock); 3940 3941 if (spin_needbreak(lock) || resched) { 3942 spin_unlock(lock); 3943 if (resched) 3944 __cond_resched(); 3945 else 3946 cpu_relax(); 3947 ret = 1; 3948 spin_lock(lock); 3949 } 3950 return ret; 3951} 3952EXPORT_SYMBOL(__cond_resched_lock); 3953 3954int __sched __cond_resched_softirq(void) 3955{ 3956 BUG_ON(!in_softirq()); 3957 3958 if (should_resched()) { 3959 local_bh_enable(); 3960 __cond_resched(); 3961 local_bh_disable(); 3962 return 1; 3963 } 3964 return 0; 3965} 3966EXPORT_SYMBOL(__cond_resched_softirq); 3967 3968/** 3969 * yield - yield the current processor to other threads. 3970 * 3971 * Do not ever use this function, there's a 99% chance you're doing it wrong. 3972 * 3973 * The scheduler is at all times free to pick the calling task as the most 3974 * eligible task to run, if removing the yield() call from your code breaks 3975 * it, its already broken. 3976 * 3977 * Typical broken usage is: 3978 * 3979 * while (!event) 3980 * yield(); 3981 * 3982 * where one assumes that yield() will let 'the other' process run that will 3983 * make event true. If the current task is a SCHED_FIFO task that will never 3984 * happen. Never use yield() as a progress guarantee!! 3985 * 3986 * If you want to use yield() to wait for something, use wait_event(). 3987 * If you want to use yield() to be 'nice' for others, use cond_resched(). 3988 * If you still want to use yield(), do not! 3989 */ 3990void __sched yield(void) 3991{ 3992 set_current_state(TASK_RUNNING); 3993 sys_sched_yield(); 3994} 3995EXPORT_SYMBOL(yield); 3996 3997/** 3998 * yield_to - yield the current processor to another thread in 3999 * your thread group, or accelerate that thread toward the 4000 * processor it's on. 4001 * @p: target task 4002 * @preempt: whether task preemption is allowed or not 4003 * 4004 * It's the caller's job to ensure that the target task struct 4005 * can't go away on us before we can do any checks. 4006 * 4007 * Return: 4008 * true (>0) if we indeed boosted the target task. 4009 * false (0) if we failed to boost the target. 4010 * -ESRCH if there's no task to yield to. 4011 */ 4012bool __sched yield_to(struct task_struct *p, bool preempt) 4013{ 4014 struct task_struct *curr = current; 4015 struct rq *rq, *p_rq; 4016 unsigned long flags; 4017 int yielded = 0; 4018 4019 local_irq_save(flags); 4020 rq = this_rq(); 4021 4022again: 4023 p_rq = task_rq(p); 4024 /* 4025 * If we're the only runnable task on the rq and target rq also 4026 * has only one task, there's absolutely no point in yielding. 4027 */ 4028 if (rq->nr_running == 1 && p_rq->nr_running == 1) { 4029 yielded = -ESRCH; 4030 goto out_irq; 4031 } 4032 4033 double_rq_lock(rq, p_rq); 4034 while (task_rq(p) != p_rq) { 4035 double_rq_unlock(rq, p_rq); 4036 goto again; 4037 } 4038 4039 if (!curr->sched_class->yield_to_task) 4040 goto out_unlock; 4041 4042 if (curr->sched_class != p->sched_class) 4043 goto out_unlock; 4044 4045 if (task_running(p_rq, p) || p->state) 4046 goto out_unlock; 4047 4048 yielded = curr->sched_class->yield_to_task(rq, p, preempt); 4049 if (yielded) { 4050 schedstat_inc(rq, yld_count); 4051 /* 4052 * Make p's CPU reschedule; pick_next_entity takes care of 4053 * fairness. 4054 */ 4055 if (preempt && rq != p_rq) 4056 resched_task(p_rq->curr); 4057 } 4058 4059out_unlock: 4060 double_rq_unlock(rq, p_rq); 4061out_irq: 4062 local_irq_restore(flags); 4063 4064 if (yielded > 0) 4065 schedule(); 4066 4067 return yielded; 4068} 4069EXPORT_SYMBOL_GPL(yield_to); 4070 4071/* 4072 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 4073 * that process accounting knows that this is a task in IO wait state. 4074 */ 4075void __sched io_schedule(void) 4076{ 4077 struct rq *rq = raw_rq(); 4078 4079 delayacct_blkio_start(); 4080 atomic_inc(&rq->nr_iowait); 4081 blk_flush_plug(current); 4082 current->in_iowait = 1; 4083 schedule(); 4084 current->in_iowait = 0; 4085 atomic_dec(&rq->nr_iowait); 4086 delayacct_blkio_end(); 4087} 4088EXPORT_SYMBOL(io_schedule); 4089 4090long __sched io_schedule_timeout(long timeout) 4091{ 4092 struct rq *rq = raw_rq(); 4093 long ret; 4094 4095 delayacct_blkio_start(); 4096 atomic_inc(&rq->nr_iowait); 4097 blk_flush_plug(current); 4098 current->in_iowait = 1; 4099 ret = schedule_timeout(timeout); 4100 current->in_iowait = 0; 4101 atomic_dec(&rq->nr_iowait); 4102 delayacct_blkio_end(); 4103 return ret; 4104} 4105 4106/** 4107 * sys_sched_get_priority_max - return maximum RT priority. 4108 * @policy: scheduling class. 4109 * 4110 * Return: On success, this syscall returns the maximum 4111 * rt_priority that can be used by a given scheduling class. 4112 * On failure, a negative error code is returned. 4113 */ 4114SYSCALL_DEFINE1(sched_get_priority_max, int, policy) 4115{ 4116 int ret = -EINVAL; 4117 4118 switch (policy) { 4119 case SCHED_FIFO: 4120 case SCHED_RR: 4121 ret = MAX_USER_RT_PRIO-1; 4122 break; 4123 case SCHED_NORMAL: 4124 case SCHED_BATCH: 4125 case SCHED_IDLE: 4126 ret = 0; 4127 break; 4128 } 4129 return ret; 4130} 4131 4132/** 4133 * sys_sched_get_priority_min - return minimum RT priority. 4134 * @policy: scheduling class. 4135 * 4136 * Return: On success, this syscall returns the minimum 4137 * rt_priority that can be used by a given scheduling class. 4138 * On failure, a negative error code is returned. 4139 */ 4140SYSCALL_DEFINE1(sched_get_priority_min, int, policy) 4141{ 4142 int ret = -EINVAL; 4143 4144 switch (policy) { 4145 case SCHED_FIFO: 4146 case SCHED_RR: 4147 ret = 1; 4148 break; 4149 case SCHED_NORMAL: 4150 case SCHED_BATCH: 4151 case SCHED_IDLE: 4152 ret = 0; 4153 } 4154 return ret; 4155} 4156 4157/** 4158 * sys_sched_rr_get_interval - return the default timeslice of a process. 4159 * @pid: pid of the process. 4160 * @interval: userspace pointer to the timeslice value. 4161 * 4162 * this syscall writes the default timeslice value of a given process 4163 * into the user-space timespec buffer. A value of '0' means infinity. 4164 * 4165 * Return: On success, 0 and the timeslice is in @interval. Otherwise, 4166 * an error code. 4167 */ 4168SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, 4169 struct timespec __user *, interval) 4170{ 4171 struct task_struct *p; 4172 unsigned int time_slice; 4173 unsigned long flags; 4174 struct rq *rq; 4175 int retval; 4176 struct timespec t; 4177 4178 if (pid < 0) 4179 return -EINVAL; 4180 4181 retval = -ESRCH; 4182 rcu_read_lock(); 4183 p = find_process_by_pid(pid); 4184 if (!p) 4185 goto out_unlock; 4186 4187 retval = security_task_getscheduler(p); 4188 if (retval) 4189 goto out_unlock; 4190 4191 rq = task_rq_lock(p, &flags); 4192 time_slice = p->sched_class->get_rr_interval(rq, p); 4193 task_rq_unlock(rq, p, &flags); 4194 4195 rcu_read_unlock(); 4196 jiffies_to_timespec(time_slice, &t); 4197 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 4198 return retval; 4199 4200out_unlock: 4201 rcu_read_unlock(); 4202 return retval; 4203} 4204 4205static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; 4206 4207void sched_show_task(struct task_struct *p) 4208{ 4209 unsigned long free = 0; 4210 int ppid; 4211 unsigned state; 4212 4213 state = p->state ? __ffs(p->state) + 1 : 0; 4214 printk(KERN_INFO "%-15.15s %c", p->comm, 4215 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 4216#if BITS_PER_LONG == 32 4217 if (state == TASK_RUNNING) 4218 printk(KERN_CONT " running "); 4219 else 4220 printk(KERN_CONT " %08lx ", thread_saved_pc(p)); 4221#else 4222 if (state == TASK_RUNNING) 4223 printk(KERN_CONT " running task "); 4224 else 4225 printk(KERN_CONT " %016lx ", thread_saved_pc(p)); 4226#endif 4227#ifdef CONFIG_DEBUG_STACK_USAGE 4228 free = stack_not_used(p); 4229#endif 4230 rcu_read_lock(); 4231 ppid = task_pid_nr(rcu_dereference(p->real_parent)); 4232 rcu_read_unlock(); 4233 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, 4234 task_pid_nr(p), ppid, 4235 (unsigned long)task_thread_info(p)->flags); 4236 4237 print_worker_info(KERN_INFO, p); 4238 show_stack(p, NULL); 4239} 4240 4241void show_state_filter(unsigned long state_filter) 4242{ 4243 struct task_struct *g, *p; 4244 4245#if BITS_PER_LONG == 32 4246 printk(KERN_INFO 4247 " task PC stack pid father\n"); 4248#else 4249 printk(KERN_INFO 4250 " task PC stack pid father\n"); 4251#endif 4252 rcu_read_lock(); 4253 do_each_thread(g, p) { 4254 /* 4255 * reset the NMI-timeout, listing all files on a slow 4256 * console might take a lot of time: 4257 */ 4258 touch_nmi_watchdog(); 4259 if (!state_filter || (p->state & state_filter)) 4260 sched_show_task(p); 4261 } while_each_thread(g, p); 4262 4263 touch_all_softlockup_watchdogs(); 4264 4265#ifdef CONFIG_SCHED_DEBUG 4266 sysrq_sched_debug_show(); 4267#endif 4268 rcu_read_unlock(); 4269 /* 4270 * Only show locks if all tasks are dumped: 4271 */ 4272 if (!state_filter) 4273 debug_show_all_locks(); 4274} 4275 4276void init_idle_bootup_task(struct task_struct *idle) 4277{ 4278 idle->sched_class = &idle_sched_class; 4279} 4280 4281/** 4282 * init_idle - set up an idle thread for a given CPU 4283 * @idle: task in question 4284 * @cpu: cpu the idle task belongs to 4285 * 4286 * NOTE: this function does not set the idle thread's NEED_RESCHED 4287 * flag, to make booting more robust. 4288 */ 4289void init_idle(struct task_struct *idle, int cpu) 4290{ 4291 struct rq *rq = cpu_rq(cpu); 4292 unsigned long flags; 4293 4294 raw_spin_lock_irqsave(&rq->lock, flags); 4295 4296 __sched_fork(0, idle); 4297 idle->state = TASK_RUNNING; 4298 idle->se.exec_start = sched_clock(); 4299 4300 do_set_cpus_allowed(idle, cpumask_of(cpu)); 4301 /* 4302 * We're having a chicken and egg problem, even though we are 4303 * holding rq->lock, the cpu isn't yet set to this cpu so the 4304 * lockdep check in task_group() will fail. 4305 * 4306 * Similar case to sched_fork(). / Alternatively we could 4307 * use task_rq_lock() here and obtain the other rq->lock. 4308 * 4309 * Silence PROVE_RCU 4310 */ 4311 rcu_read_lock(); 4312 __set_task_cpu(idle, cpu); 4313 rcu_read_unlock(); 4314 4315 rq->curr = rq->idle = idle; 4316#if defined(CONFIG_SMP) 4317 idle->on_cpu = 1; 4318#endif 4319 raw_spin_unlock_irqrestore(&rq->lock, flags); 4320 4321 /* Set the preempt count _outside_ the spinlocks! */ 4322 init_idle_preempt_count(idle, cpu); 4323 4324 /* 4325 * The idle tasks have their own, simple scheduling class: 4326 */ 4327 idle->sched_class = &idle_sched_class; 4328 ftrace_graph_init_idle_task(idle, cpu); 4329 vtime_init_idle(idle, cpu); 4330#if defined(CONFIG_SMP) 4331 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); 4332#endif 4333} 4334 4335#ifdef CONFIG_SMP 4336void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) 4337{ 4338 if (p->sched_class && p->sched_class->set_cpus_allowed) 4339 p->sched_class->set_cpus_allowed(p, new_mask); 4340 4341 cpumask_copy(&p->cpus_allowed, new_mask); 4342 p->nr_cpus_allowed = cpumask_weight(new_mask); 4343} 4344 4345/* 4346 * This is how migration works: 4347 * 4348 * 1) we invoke migration_cpu_stop() on the target CPU using 4349 * stop_one_cpu(). 4350 * 2) stopper starts to run (implicitly forcing the migrated thread 4351 * off the CPU) 4352 * 3) it checks whether the migrated task is still in the wrong runqueue. 4353 * 4) if it's in the wrong runqueue then the migration thread removes 4354 * it and puts it into the right queue. 4355 * 5) stopper completes and stop_one_cpu() returns and the migration 4356 * is done. 4357 */ 4358 4359/* 4360 * Change a given task's CPU affinity. Migrate the thread to a 4361 * proper CPU and schedule it away if the CPU it's executing on 4362 * is removed from the allowed bitmask. 4363 * 4364 * NOTE: the caller must have a valid reference to the task, the 4365 * task must not exit() & deallocate itself prematurely. The 4366 * call is not atomic; no spinlocks may be held. 4367 */ 4368int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) 4369{ 4370 unsigned long flags; 4371 struct rq *rq; 4372 unsigned int dest_cpu; 4373 int ret = 0; 4374 4375 rq = task_rq_lock(p, &flags); 4376 4377 if (cpumask_equal(&p->cpus_allowed, new_mask)) 4378 goto out; 4379 4380 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 4381 ret = -EINVAL; 4382 goto out; 4383 } 4384 4385 do_set_cpus_allowed(p, new_mask); 4386 4387 /* Can the task run on the task's current CPU? If so, we're done */ 4388 if (cpumask_test_cpu(task_cpu(p), new_mask)) 4389 goto out; 4390 4391 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); 4392 if (p->on_rq) { 4393 struct migration_arg arg = { p, dest_cpu }; 4394 /* Need help from migration thread: drop lock and wait. */ 4395 task_rq_unlock(rq, p, &flags); 4396 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 4397 tlb_migrate_finish(p->mm); 4398 return 0; 4399 } 4400out: 4401 task_rq_unlock(rq, p, &flags); 4402 4403 return ret; 4404} 4405EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); 4406 4407/* 4408 * Move (not current) task off this cpu, onto dest cpu. We're doing 4409 * this because either it can't run here any more (set_cpus_allowed() 4410 * away from this CPU, or CPU going down), or because we're 4411 * attempting to rebalance this task on exec (sched_exec). 4412 * 4413 * So we race with normal scheduler movements, but that's OK, as long 4414 * as the task is no longer on this CPU. 4415 * 4416 * Returns non-zero if task was successfully migrated. 4417 */ 4418static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 4419{ 4420 struct rq *rq_dest, *rq_src; 4421 int ret = 0; 4422 4423 if (unlikely(!cpu_active(dest_cpu))) 4424 return ret; 4425 4426 rq_src = cpu_rq(src_cpu); 4427 rq_dest = cpu_rq(dest_cpu); 4428 4429 raw_spin_lock(&p->pi_lock); 4430 double_rq_lock(rq_src, rq_dest); 4431 /* Already moved. */ 4432 if (task_cpu(p) != src_cpu) 4433 goto done; 4434 /* Affinity changed (again). */ 4435 if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) 4436 goto fail; 4437 4438 /* 4439 * If we're not on a rq, the next wake-up will ensure we're 4440 * placed properly. 4441 */ 4442 if (p->on_rq) { 4443 dequeue_task(rq_src, p, 0); 4444 set_task_cpu(p, dest_cpu); 4445 enqueue_task(rq_dest, p, 0); 4446 check_preempt_curr(rq_dest, p, 0); 4447 } 4448done: 4449 ret = 1; 4450fail: 4451 double_rq_unlock(rq_src, rq_dest); 4452 raw_spin_unlock(&p->pi_lock); 4453 return ret; 4454} 4455 4456#ifdef CONFIG_NUMA_BALANCING 4457/* Migrate current task p to target_cpu */ 4458int migrate_task_to(struct task_struct *p, int target_cpu) 4459{ 4460 struct migration_arg arg = { p, target_cpu }; 4461 int curr_cpu = task_cpu(p); 4462 4463 if (curr_cpu == target_cpu) 4464 return 0; 4465 4466 if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p))) 4467 return -EINVAL; 4468 4469 /* TODO: This is not properly updating schedstats */ 4470 4471 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); 4472} 4473 4474/* 4475 * Requeue a task on a given node and accurately track the number of NUMA 4476 * tasks on the runqueues 4477 */ 4478void sched_setnuma(struct task_struct *p, int nid) 4479{ 4480 struct rq *rq; 4481 unsigned long flags; 4482 bool on_rq, running; 4483 4484 rq = task_rq_lock(p, &flags); 4485 on_rq = p->on_rq; 4486 running = task_current(rq, p); 4487 4488 if (on_rq) 4489 dequeue_task(rq, p, 0); 4490 if (running) 4491 p->sched_class->put_prev_task(rq, p); 4492 4493 p->numa_preferred_nid = nid; 4494 4495 if (running) 4496 p->sched_class->set_curr_task(rq); 4497 if (on_rq) 4498 enqueue_task(rq, p, 0); 4499 task_rq_unlock(rq, p, &flags); 4500} 4501#endif 4502 4503/* 4504 * migration_cpu_stop - this will be executed by a highprio stopper thread 4505 * and performs thread migration by bumping thread off CPU then 4506 * 'pushing' onto another runqueue. 4507 */ 4508static int migration_cpu_stop(void *data) 4509{ 4510 struct migration_arg *arg = data; 4511 4512 /* 4513 * The original target cpu might have gone down and we might 4514 * be on another cpu but it doesn't matter. 4515 */ 4516 local_irq_disable(); 4517 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu); 4518 local_irq_enable(); 4519 return 0; 4520} 4521 4522#ifdef CONFIG_HOTPLUG_CPU 4523 4524/* 4525 * Ensures that the idle task is using init_mm right before its cpu goes 4526 * offline. 4527 */ 4528void idle_task_exit(void) 4529{ 4530 struct mm_struct *mm = current->active_mm; 4531 4532 BUG_ON(cpu_online(smp_processor_id())); 4533 4534 if (mm != &init_mm) 4535 switch_mm(mm, &init_mm, current); 4536 mmdrop(mm); 4537} 4538 4539/* 4540 * Since this CPU is going 'away' for a while, fold any nr_active delta 4541 * we might have. Assumes we're called after migrate_tasks() so that the 4542 * nr_active count is stable. 4543 * 4544 * Also see the comment "Global load-average calculations". 4545 */ 4546static void calc_load_migrate(struct rq *rq) 4547{ 4548 long delta = calc_load_fold_active(rq); 4549 if (delta) 4550 atomic_long_add(delta, &calc_load_tasks); 4551} 4552 4553/* 4554 * Migrate all tasks from the rq, sleeping tasks will be migrated by 4555 * try_to_wake_up()->select_task_rq(). 4556 * 4557 * Called with rq->lock held even though we'er in stop_machine() and 4558 * there's no concurrency possible, we hold the required locks anyway 4559 * because of lock validation efforts. 4560 */ 4561static void migrate_tasks(unsigned int dead_cpu) 4562{ 4563 struct rq *rq = cpu_rq(dead_cpu); 4564 struct task_struct *next, *stop = rq->stop; 4565 int dest_cpu; 4566 4567 /* 4568 * Fudge the rq selection such that the below task selection loop 4569 * doesn't get stuck on the currently eligible stop task. 4570 * 4571 * We're currently inside stop_machine() and the rq is either stuck 4572 * in the stop_machine_cpu_stop() loop, or we're executing this code, 4573 * either way we should never end up calling schedule() until we're 4574 * done here. 4575 */ 4576 rq->stop = NULL; 4577 4578 /* 4579 * put_prev_task() and pick_next_task() sched 4580 * class method both need to have an up-to-date 4581 * value of rq->clock[_task] 4582 */ 4583 update_rq_clock(rq); 4584 4585 for ( ; ; ) { 4586 /* 4587 * There's this thread running, bail when that's the only 4588 * remaining thread. 4589 */ 4590 if (rq->nr_running == 1) 4591 break; 4592 4593 next = pick_next_task(rq); 4594 BUG_ON(!next); 4595 next->sched_class->put_prev_task(rq, next); 4596 4597 /* Find suitable destination for @next, with force if needed. */ 4598 dest_cpu = select_fallback_rq(dead_cpu, next); 4599 raw_spin_unlock(&rq->lock); 4600 4601 __migrate_task(next, dead_cpu, dest_cpu); 4602 4603 raw_spin_lock(&rq->lock); 4604 } 4605 4606 rq->stop = stop; 4607} 4608 4609#endif /* CONFIG_HOTPLUG_CPU */ 4610 4611#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) 4612 4613static struct ctl_table sd_ctl_dir[] = { 4614 { 4615 .procname = "sched_domain", 4616 .mode = 0555, 4617 }, 4618 {} 4619}; 4620 4621static struct ctl_table sd_ctl_root[] = { 4622 { 4623 .procname = "kernel", 4624 .mode = 0555, 4625 .child = sd_ctl_dir, 4626 }, 4627 {} 4628}; 4629 4630static struct ctl_table *sd_alloc_ctl_entry(int n) 4631{ 4632 struct ctl_table *entry = 4633 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); 4634 4635 return entry; 4636} 4637 4638static void sd_free_ctl_entry(struct ctl_table **tablep) 4639{ 4640 struct ctl_table *entry; 4641 4642 /* 4643 * In the intermediate directories, both the child directory and 4644 * procname are dynamically allocated and could fail but the mode 4645 * will always be set. In the lowest directory the names are 4646 * static strings and all have proc handlers. 4647 */ 4648 for (entry = *tablep; entry->mode; entry++) { 4649 if (entry->child) 4650 sd_free_ctl_entry(&entry->child); 4651 if (entry->proc_handler == NULL) 4652 kfree(entry->procname); 4653 } 4654 4655 kfree(*tablep); 4656 *tablep = NULL; 4657} 4658 4659static int min_load_idx = 0; 4660static int max_load_idx = CPU_LOAD_IDX_MAX-1; 4661 4662static void 4663set_table_entry(struct ctl_table *entry, 4664 const char *procname, void *data, int maxlen, 4665 umode_t mode, proc_handler *proc_handler, 4666 bool load_idx) 4667{ 4668 entry->procname = procname; 4669 entry->data = data; 4670 entry->maxlen = maxlen; 4671 entry->mode = mode; 4672 entry->proc_handler = proc_handler; 4673 4674 if (load_idx) { 4675 entry->extra1 = &min_load_idx; 4676 entry->extra2 = &max_load_idx; 4677 } 4678} 4679 4680static struct ctl_table * 4681sd_alloc_ctl_domain_table(struct sched_domain *sd) 4682{ 4683 struct ctl_table *table = sd_alloc_ctl_entry(13); 4684 4685 if (table == NULL) 4686 return NULL; 4687 4688 set_table_entry(&table[0], "min_interval", &sd->min_interval, 4689 sizeof(long), 0644, proc_doulongvec_minmax, false); 4690 set_table_entry(&table[1], "max_interval", &sd->max_interval, 4691 sizeof(long), 0644, proc_doulongvec_minmax, false); 4692 set_table_entry(&table[2], "busy_idx", &sd->busy_idx, 4693 sizeof(int), 0644, proc_dointvec_minmax, true); 4694 set_table_entry(&table[3], "idle_idx", &sd->idle_idx, 4695 sizeof(int), 0644, proc_dointvec_minmax, true); 4696 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, 4697 sizeof(int), 0644, proc_dointvec_minmax, true); 4698 set_table_entry(&table[5], "wake_idx", &sd->wake_idx, 4699 sizeof(int), 0644, proc_dointvec_minmax, true); 4700 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, 4701 sizeof(int), 0644, proc_dointvec_minmax, true); 4702 set_table_entry(&table[7], "busy_factor", &sd->busy_factor, 4703 sizeof(int), 0644, proc_dointvec_minmax, false); 4704 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, 4705 sizeof(int), 0644, proc_dointvec_minmax, false); 4706 set_table_entry(&table[9], "cache_nice_tries", 4707 &sd->cache_nice_tries, 4708 sizeof(int), 0644, proc_dointvec_minmax, false); 4709 set_table_entry(&table[10], "flags", &sd->flags, 4710 sizeof(int), 0644, proc_dointvec_minmax, false); 4711 set_table_entry(&table[11], "name", sd->name, 4712 CORENAME_MAX_SIZE, 0444, proc_dostring, false); 4713 /* &table[12] is terminator */ 4714 4715 return table; 4716} 4717 4718static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) 4719{ 4720 struct ctl_table *entry, *table; 4721 struct sched_domain *sd; 4722 int domain_num = 0, i; 4723 char buf[32]; 4724 4725 for_each_domain(cpu, sd) 4726 domain_num++; 4727 entry = table = sd_alloc_ctl_entry(domain_num + 1); 4728 if (table == NULL) 4729 return NULL; 4730 4731 i = 0; 4732 for_each_domain(cpu, sd) { 4733 snprintf(buf, 32, "domain%d", i); 4734 entry->procname = kstrdup(buf, GFP_KERNEL); 4735 entry->mode = 0555; 4736 entry->child = sd_alloc_ctl_domain_table(sd); 4737 entry++; 4738 i++; 4739 } 4740 return table; 4741} 4742 4743static struct ctl_table_header *sd_sysctl_header; 4744static void register_sched_domain_sysctl(void) 4745{ 4746 int i, cpu_num = num_possible_cpus(); 4747 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); 4748 char buf[32]; 4749 4750 WARN_ON(sd_ctl_dir[0].child); 4751 sd_ctl_dir[0].child = entry; 4752 4753 if (entry == NULL) 4754 return; 4755 4756 for_each_possible_cpu(i) { 4757 snprintf(buf, 32, "cpu%d", i); 4758 entry->procname = kstrdup(buf, GFP_KERNEL); 4759 entry->mode = 0555; 4760 entry->child = sd_alloc_ctl_cpu_table(i); 4761 entry++; 4762 } 4763 4764 WARN_ON(sd_sysctl_header); 4765 sd_sysctl_header = register_sysctl_table(sd_ctl_root); 4766} 4767 4768/* may be called multiple times per register */ 4769static void unregister_sched_domain_sysctl(void) 4770{ 4771 if (sd_sysctl_header) 4772 unregister_sysctl_table(sd_sysctl_header); 4773 sd_sysctl_header = NULL; 4774 if (sd_ctl_dir[0].child) 4775 sd_free_ctl_entry(&sd_ctl_dir[0].child); 4776} 4777#else 4778static void register_sched_domain_sysctl(void) 4779{ 4780} 4781static void unregister_sched_domain_sysctl(void) 4782{ 4783} 4784#endif 4785 4786static void set_rq_online(struct rq *rq) 4787{ 4788 if (!rq->online) { 4789 const struct sched_class *class; 4790 4791 cpumask_set_cpu(rq->cpu, rq->rd->online); 4792 rq->online = 1; 4793 4794 for_each_class(class) { 4795 if (class->rq_online) 4796 class->rq_online(rq); 4797 } 4798 } 4799} 4800 4801static void set_rq_offline(struct rq *rq) 4802{ 4803 if (rq->online) { 4804 const struct sched_class *class; 4805 4806 for_each_class(class) { 4807 if (class->rq_offline) 4808 class->rq_offline(rq); 4809 } 4810 4811 cpumask_clear_cpu(rq->cpu, rq->rd->online); 4812 rq->online = 0; 4813 } 4814} 4815 4816/* 4817 * migration_call - callback that gets triggered when a CPU is added. 4818 * Here we can start up the necessary migration thread for the new CPU. 4819 */ 4820static int 4821migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) 4822{ 4823 int cpu = (long)hcpu; 4824 unsigned long flags; 4825 struct rq *rq = cpu_rq(cpu); 4826 4827 switch (action & ~CPU_TASKS_FROZEN) { 4828 4829 case CPU_UP_PREPARE: 4830 rq->calc_load_update = calc_load_update; 4831 break; 4832 4833 case CPU_ONLINE: 4834 /* Update our root-domain */ 4835 raw_spin_lock_irqsave(&rq->lock, flags); 4836 if (rq->rd) { 4837 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 4838 4839 set_rq_online(rq); 4840 } 4841 raw_spin_unlock_irqrestore(&rq->lock, flags); 4842 break; 4843 4844#ifdef CONFIG_HOTPLUG_CPU 4845 case CPU_DYING: 4846 sched_ttwu_pending(); 4847 /* Update our root-domain */ 4848 raw_spin_lock_irqsave(&rq->lock, flags); 4849 if (rq->rd) { 4850 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 4851 set_rq_offline(rq); 4852 } 4853 migrate_tasks(cpu); 4854 BUG_ON(rq->nr_running != 1); /* the migration thread */ 4855 raw_spin_unlock_irqrestore(&rq->lock, flags); 4856 break; 4857 4858 case CPU_DEAD: 4859 calc_load_migrate(rq); 4860 break; 4861#endif 4862 } 4863 4864 update_max_interval(); 4865 4866 return NOTIFY_OK; 4867} 4868 4869/* 4870 * Register at high priority so that task migration (migrate_all_tasks) 4871 * happens before everything else. This has to be lower priority than 4872 * the notifier in the perf_event subsystem, though. 4873 */ 4874static struct notifier_block migration_notifier = { 4875 .notifier_call = migration_call, 4876 .priority = CPU_PRI_MIGRATION, 4877}; 4878 4879static int sched_cpu_active(struct notifier_block *nfb, 4880 unsigned long action, void *hcpu) 4881{ 4882 switch (action & ~CPU_TASKS_FROZEN) { 4883 case CPU_STARTING: 4884 case CPU_DOWN_FAILED: 4885 set_cpu_active((long)hcpu, true); 4886 return NOTIFY_OK; 4887 default: 4888 return NOTIFY_DONE; 4889 } 4890} 4891 4892static int sched_cpu_inactive(struct notifier_block *nfb, 4893 unsigned long action, void *hcpu) 4894{ 4895 switch (action & ~CPU_TASKS_FROZEN) { 4896 case CPU_DOWN_PREPARE: 4897 set_cpu_active((long)hcpu, false); 4898 return NOTIFY_OK; 4899 default: 4900 return NOTIFY_DONE; 4901 } 4902} 4903 4904static int __init migration_init(void) 4905{ 4906 void *cpu = (void *)(long)smp_processor_id(); 4907 int err; 4908 4909 /* Initialize migration for the boot CPU */ 4910 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); 4911 BUG_ON(err == NOTIFY_BAD); 4912 migration_call(&migration_notifier, CPU_ONLINE, cpu); 4913 register_cpu_notifier(&migration_notifier); 4914 4915 /* Register cpu active notifiers */ 4916 cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE); 4917 cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE); 4918 4919 return 0; 4920} 4921early_initcall(migration_init); 4922#endif 4923 4924#ifdef CONFIG_SMP 4925 4926static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ 4927 4928#ifdef CONFIG_SCHED_DEBUG 4929 4930static __read_mostly int sched_debug_enabled; 4931 4932static int __init sched_debug_setup(char *str) 4933{ 4934 sched_debug_enabled = 1; 4935 4936 return 0; 4937} 4938early_param("sched_debug", sched_debug_setup); 4939 4940static inline bool sched_debug(void) 4941{ 4942 return sched_debug_enabled; 4943} 4944 4945static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 4946 struct cpumask *groupmask) 4947{ 4948 struct sched_group *group = sd->groups; 4949 char str[256]; 4950 4951 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd)); 4952 cpumask_clear(groupmask); 4953 4954 printk(KERN_DEBUG "%*s domain %d: ", level, "", level); 4955 4956 if (!(sd->flags & SD_LOAD_BALANCE)) { 4957 printk("does not load-balance\n"); 4958 if (sd->parent) 4959 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" 4960 " has parent"); 4961 return -1; 4962 } 4963 4964 printk(KERN_CONT "span %s level %s\n", str, sd->name); 4965 4966 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { 4967 printk(KERN_ERR "ERROR: domain->span does not contain " 4968 "CPU%d\n", cpu); 4969 } 4970 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { 4971 printk(KERN_ERR "ERROR: domain->groups does not contain" 4972 " CPU%d\n", cpu); 4973 } 4974 4975 printk(KERN_DEBUG "%*s groups:", level + 1, ""); 4976 do { 4977 if (!group) { 4978 printk("\n"); 4979 printk(KERN_ERR "ERROR: group is NULL\n"); 4980 break; 4981 } 4982 4983 /* 4984 * Even though we initialize ->power to something semi-sane, 4985 * we leave power_orig unset. This allows us to detect if 4986 * domain iteration is still funny without causing /0 traps. 4987 */ 4988 if (!group->sgp->power_orig) { 4989 printk(KERN_CONT "\n"); 4990 printk(KERN_ERR "ERROR: domain->cpu_power not " 4991 "set\n"); 4992 break; 4993 } 4994 4995 if (!cpumask_weight(sched_group_cpus(group))) { 4996 printk(KERN_CONT "\n"); 4997 printk(KERN_ERR "ERROR: empty group\n"); 4998 break; 4999 } 5000 5001 if (!(sd->flags & SD_OVERLAP) && 5002 cpumask_intersects(groupmask, sched_group_cpus(group))) { 5003 printk(KERN_CONT "\n"); 5004 printk(KERN_ERR "ERROR: repeated CPUs\n"); 5005 break; 5006 } 5007 5008 cpumask_or(groupmask, groupmask, sched_group_cpus(group)); 5009 5010 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 5011 5012 printk(KERN_CONT " %s", str); 5013 if (group->sgp->power != SCHED_POWER_SCALE) { 5014 printk(KERN_CONT " (cpu_power = %d)", 5015 group->sgp->power); 5016 } 5017 5018 group = group->next; 5019 } while (group != sd->groups); 5020 printk(KERN_CONT "\n"); 5021 5022 if (!cpumask_equal(sched_domain_span(sd), groupmask)) 5023 printk(KERN_ERR "ERROR: groups don't span domain->span\n"); 5024 5025 if (sd->parent && 5026 !cpumask_subset(groupmask, sched_domain_span(sd->parent))) 5027 printk(KERN_ERR "ERROR: parent span is not a superset " 5028 "of domain->span\n"); 5029 return 0; 5030} 5031 5032static void sched_domain_debug(struct sched_domain *sd, int cpu) 5033{ 5034 int level = 0; 5035 5036 if (!sched_debug_enabled) 5037 return; 5038 5039 if (!sd) { 5040 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); 5041 return; 5042 } 5043 5044 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 5045 5046 for (;;) { 5047 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) 5048 break; 5049 level++; 5050 sd = sd->parent; 5051 if (!sd) 5052 break; 5053 } 5054} 5055#else /* !CONFIG_SCHED_DEBUG */ 5056# define sched_domain_debug(sd, cpu) do { } while (0) 5057static inline bool sched_debug(void) 5058{ 5059 return false; 5060} 5061#endif /* CONFIG_SCHED_DEBUG */ 5062 5063static int sd_degenerate(struct sched_domain *sd) 5064{ 5065 if (cpumask_weight(sched_domain_span(sd)) == 1) 5066 return 1; 5067 5068 /* Following flags need at least 2 groups */ 5069 if (sd->flags & (SD_LOAD_BALANCE | 5070 SD_BALANCE_NEWIDLE | 5071 SD_BALANCE_FORK | 5072 SD_BALANCE_EXEC | 5073 SD_SHARE_CPUPOWER | 5074 SD_SHARE_PKG_RESOURCES)) { 5075 if (sd->groups != sd->groups->next) 5076 return 0; 5077 } 5078 5079 /* Following flags don't use groups */ 5080 if (sd->flags & (SD_WAKE_AFFINE)) 5081 return 0; 5082 5083 return 1; 5084} 5085 5086static int 5087sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) 5088{ 5089 unsigned long cflags = sd->flags, pflags = parent->flags; 5090 5091 if (sd_degenerate(parent)) 5092 return 1; 5093 5094 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) 5095 return 0; 5096 5097 /* Flags needing groups don't count if only 1 group in parent */ 5098 if (parent->groups == parent->groups->next) { 5099 pflags &= ~(SD_LOAD_BALANCE | 5100 SD_BALANCE_NEWIDLE | 5101 SD_BALANCE_FORK | 5102 SD_BALANCE_EXEC | 5103 SD_SHARE_CPUPOWER | 5104 SD_SHARE_PKG_RESOURCES | 5105 SD_PREFER_SIBLING); 5106 if (nr_node_ids == 1) 5107 pflags &= ~SD_SERIALIZE; 5108 } 5109 if (~cflags & pflags) 5110 return 0; 5111 5112 return 1; 5113} 5114 5115static void free_rootdomain(struct rcu_head *rcu) 5116{ 5117 struct root_domain *rd = container_of(rcu, struct root_domain, rcu); 5118 5119 cpupri_cleanup(&rd->cpupri); 5120 free_cpumask_var(rd->rto_mask); 5121 free_cpumask_var(rd->online); 5122 free_cpumask_var(rd->span); 5123 kfree(rd); 5124} 5125 5126static void rq_attach_root(struct rq *rq, struct root_domain *rd) 5127{ 5128 struct root_domain *old_rd = NULL; 5129 unsigned long flags; 5130 5131 raw_spin_lock_irqsave(&rq->lock, flags); 5132 5133 if (rq->rd) { 5134 old_rd = rq->rd; 5135 5136 if (cpumask_test_cpu(rq->cpu, old_rd->online)) 5137 set_rq_offline(rq); 5138 5139 cpumask_clear_cpu(rq->cpu, old_rd->span); 5140 5141 /* 5142 * If we dont want to free the old_rt yet then 5143 * set old_rd to NULL to skip the freeing later 5144 * in this function: 5145 */ 5146 if (!atomic_dec_and_test(&old_rd->refcount)) 5147 old_rd = NULL; 5148 } 5149 5150 atomic_inc(&rd->refcount); 5151 rq->rd = rd; 5152 5153 cpumask_set_cpu(rq->cpu, rd->span); 5154 if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) 5155 set_rq_online(rq); 5156 5157 raw_spin_unlock_irqrestore(&rq->lock, flags); 5158 5159 if (old_rd) 5160 call_rcu_sched(&old_rd->rcu, free_rootdomain); 5161} 5162 5163static int init_rootdomain(struct root_domain *rd) 5164{ 5165 memset(rd, 0, sizeof(*rd)); 5166 5167 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) 5168 goto out; 5169 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) 5170 goto free_span; 5171 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) 5172 goto free_online; 5173 5174 if (cpupri_init(&rd->cpupri) != 0) 5175 goto free_rto_mask; 5176 return 0; 5177 5178free_rto_mask: 5179 free_cpumask_var(rd->rto_mask); 5180free_online: 5181 free_cpumask_var(rd->online); 5182free_span: 5183 free_cpumask_var(rd->span); 5184out: 5185 return -ENOMEM; 5186} 5187 5188/* 5189 * By default the system creates a single root-domain with all cpus as 5190 * members (mimicking the global state we have today). 5191 */ 5192struct root_domain def_root_domain; 5193 5194static void init_defrootdomain(void) 5195{ 5196 init_rootdomain(&def_root_domain); 5197 5198 atomic_set(&def_root_domain.refcount, 1); 5199} 5200 5201static struct root_domain *alloc_rootdomain(void) 5202{ 5203 struct root_domain *rd; 5204 5205 rd = kmalloc(sizeof(*rd), GFP_KERNEL); 5206 if (!rd) 5207 return NULL; 5208 5209 if (init_rootdomain(rd) != 0) { 5210 kfree(rd); 5211 return NULL; 5212 } 5213 5214 return rd; 5215} 5216 5217static void free_sched_groups(struct sched_group *sg, int free_sgp) 5218{ 5219 struct sched_group *tmp, *first; 5220 5221 if (!sg) 5222 return; 5223 5224 first = sg; 5225 do { 5226 tmp = sg->next; 5227 5228 if (free_sgp && atomic_dec_and_test(&sg->sgp->ref)) 5229 kfree(sg->sgp); 5230 5231 kfree(sg); 5232 sg = tmp; 5233 } while (sg != first); 5234} 5235 5236static void free_sched_domain(struct rcu_head *rcu) 5237{ 5238 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); 5239 5240 /* 5241 * If its an overlapping domain it has private groups, iterate and 5242 * nuke them all. 5243 */ 5244 if (sd->flags & SD_OVERLAP) { 5245 free_sched_groups(sd->groups, 1); 5246 } else if (atomic_dec_and_test(&sd->groups->ref)) { 5247 kfree(sd->groups->sgp); 5248 kfree(sd->groups); 5249 } 5250 kfree(sd); 5251} 5252 5253static void destroy_sched_domain(struct sched_domain *sd, int cpu) 5254{ 5255 call_rcu(&sd->rcu, free_sched_domain); 5256} 5257 5258static void destroy_sched_domains(struct sched_domain *sd, int cpu) 5259{ 5260 for (; sd; sd = sd->parent) 5261 destroy_sched_domain(sd, cpu); 5262} 5263 5264/* 5265 * Keep a special pointer to the highest sched_domain that has 5266 * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this 5267 * allows us to avoid some pointer chasing select_idle_sibling(). 5268 * 5269 * Also keep a unique ID per domain (we use the first cpu number in 5270 * the cpumask of the domain), this allows us to quickly tell if 5271 * two cpus are in the same cache domain, see cpus_share_cache(). 5272 */ 5273DEFINE_PER_CPU(struct sched_domain *, sd_llc); 5274DEFINE_PER_CPU(int, sd_llc_size); 5275DEFINE_PER_CPU(int, sd_llc_id); 5276DEFINE_PER_CPU(struct sched_domain *, sd_numa); 5277 5278static void update_top_cache_domain(int cpu) 5279{ 5280 struct sched_domain *sd; 5281 int id = cpu; 5282 int size = 1; 5283 5284 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); 5285 if (sd) { 5286 id = cpumask_first(sched_domain_span(sd)); 5287 size = cpumask_weight(sched_domain_span(sd)); 5288 } 5289 5290 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); 5291 per_cpu(sd_llc_size, cpu) = size; 5292 per_cpu(sd_llc_id, cpu) = id; 5293 5294 sd = lowest_flag_domain(cpu, SD_NUMA); 5295 rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); 5296} 5297 5298/* 5299 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 5300 * hold the hotplug lock. 5301 */ 5302static void 5303cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) 5304{ 5305 struct rq *rq = cpu_rq(cpu); 5306 struct sched_domain *tmp; 5307 5308 /* Remove the sched domains which do not contribute to scheduling. */ 5309 for (tmp = sd; tmp; ) { 5310 struct sched_domain *parent = tmp->parent; 5311 if (!parent) 5312 break; 5313 5314 if (sd_parent_degenerate(tmp, parent)) { 5315 tmp->parent = parent->parent; 5316 if (parent->parent) 5317 parent->parent->child = tmp; 5318 /* 5319 * Transfer SD_PREFER_SIBLING down in case of a 5320 * degenerate parent; the spans match for this 5321 * so the property transfers. 5322 */ 5323 if (parent->flags & SD_PREFER_SIBLING) 5324 tmp->flags |= SD_PREFER_SIBLING; 5325 destroy_sched_domain(parent, cpu); 5326 } else 5327 tmp = tmp->parent; 5328 } 5329 5330 if (sd && sd_degenerate(sd)) { 5331 tmp = sd; 5332 sd = sd->parent; 5333 destroy_sched_domain(tmp, cpu); 5334 if (sd) 5335 sd->child = NULL; 5336 } 5337 5338 sched_domain_debug(sd, cpu); 5339 5340 rq_attach_root(rq, rd); 5341 tmp = rq->sd; 5342 rcu_assign_pointer(rq->sd, sd); 5343 destroy_sched_domains(tmp, cpu); 5344 5345 update_top_cache_domain(cpu); 5346} 5347 5348/* cpus with isolated domains */ 5349static cpumask_var_t cpu_isolated_map; 5350 5351/* Setup the mask of cpus configured for isolated domains */ 5352static int __init isolated_cpu_setup(char *str) 5353{ 5354 alloc_bootmem_cpumask_var(&cpu_isolated_map); 5355 cpulist_parse(str, cpu_isolated_map); 5356 return 1; 5357} 5358 5359__setup("isolcpus=", isolated_cpu_setup); 5360 5361static const struct cpumask *cpu_cpu_mask(int cpu) 5362{ 5363 return cpumask_of_node(cpu_to_node(cpu)); 5364} 5365 5366struct sd_data { 5367 struct sched_domain **__percpu sd; 5368 struct sched_group **__percpu sg; 5369 struct sched_group_power **__percpu sgp; 5370}; 5371 5372struct s_data { 5373 struct sched_domain ** __percpu sd; 5374 struct root_domain *rd; 5375}; 5376 5377enum s_alloc { 5378 sa_rootdomain, 5379 sa_sd, 5380 sa_sd_storage, 5381 sa_none, 5382}; 5383 5384struct sched_domain_topology_level; 5385 5386typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); 5387typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); 5388 5389#define SDTL_OVERLAP 0x01 5390 5391struct sched_domain_topology_level { 5392 sched_domain_init_f init; 5393 sched_domain_mask_f mask; 5394 int flags; 5395 int numa_level; 5396 struct sd_data data; 5397}; 5398 5399/* 5400 * Build an iteration mask that can exclude certain CPUs from the upwards 5401 * domain traversal. 5402 * 5403 * Asymmetric node setups can result in situations where the domain tree is of 5404 * unequal depth, make sure to skip domains that already cover the entire 5405 * range. 5406 * 5407 * In that case build_sched_domains() will have terminated the iteration early 5408 * and our sibling sd spans will be empty. Domains should always include the 5409 * cpu they're built on, so check that. 5410 * 5411 */ 5412static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) 5413{ 5414 const struct cpumask *span = sched_domain_span(sd); 5415 struct sd_data *sdd = sd->private; 5416 struct sched_domain *sibling; 5417 int i; 5418 5419 for_each_cpu(i, span) { 5420 sibling = *per_cpu_ptr(sdd->sd, i); 5421 if (!cpumask_test_cpu(i, sched_domain_span(sibling))) 5422 continue; 5423 5424 cpumask_set_cpu(i, sched_group_mask(sg)); 5425 } 5426} 5427 5428/* 5429 * Return the canonical balance cpu for this group, this is the first cpu 5430 * of this group that's also in the iteration mask. 5431 */ 5432int group_balance_cpu(struct sched_group *sg) 5433{ 5434 return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); 5435} 5436 5437static int 5438build_overlap_sched_groups(struct sched_domain *sd, int cpu) 5439{ 5440 struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg; 5441 const struct cpumask *span = sched_domain_span(sd); 5442 struct cpumask *covered = sched_domains_tmpmask; 5443 struct sd_data *sdd = sd->private; 5444 struct sched_domain *child; 5445 int i; 5446 5447 cpumask_clear(covered); 5448 5449 for_each_cpu(i, span) { 5450 struct cpumask *sg_span; 5451 5452 if (cpumask_test_cpu(i, covered)) 5453 continue; 5454 5455 child = *per_cpu_ptr(sdd->sd, i); 5456 5457 /* See the comment near build_group_mask(). */ 5458 if (!cpumask_test_cpu(i, sched_domain_span(child))) 5459 continue; 5460 5461 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 5462 GFP_KERNEL, cpu_to_node(cpu)); 5463 5464 if (!sg) 5465 goto fail; 5466 5467 sg_span = sched_group_cpus(sg); 5468 if (child->child) { 5469 child = child->child; 5470 cpumask_copy(sg_span, sched_domain_span(child)); 5471 } else 5472 cpumask_set_cpu(i, sg_span); 5473 5474 cpumask_or(covered, covered, sg_span); 5475 5476 sg->sgp = *per_cpu_ptr(sdd->sgp, i); 5477 if (atomic_inc_return(&sg->sgp->ref) == 1) 5478 build_group_mask(sd, sg); 5479 5480 /* 5481 * Initialize sgp->power such that even if we mess up the 5482 * domains and no possible iteration will get us here, we won't 5483 * die on a /0 trap. 5484 */ 5485 sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span); 5486 5487 /* 5488 * Make sure the first group of this domain contains the 5489 * canonical balance cpu. Otherwise the sched_domain iteration 5490 * breaks. See update_sg_lb_stats(). 5491 */ 5492 if ((!groups && cpumask_test_cpu(cpu, sg_span)) || 5493 group_balance_cpu(sg) == cpu) 5494 groups = sg; 5495 5496 if (!first) 5497 first = sg; 5498 if (last) 5499 last->next = sg; 5500 last = sg; 5501 last->next = first; 5502 } 5503 sd->groups = groups; 5504 5505 return 0; 5506 5507fail: 5508 free_sched_groups(first, 0); 5509 5510 return -ENOMEM; 5511} 5512 5513static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) 5514{ 5515 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); 5516 struct sched_domain *child = sd->child; 5517 5518 if (child) 5519 cpu = cpumask_first(sched_domain_span(child)); 5520 5521 if (sg) { 5522 *sg = *per_cpu_ptr(sdd->sg, cpu); 5523 (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu); 5524 atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */ 5525 } 5526 5527 return cpu; 5528} 5529 5530/* 5531 * build_sched_groups will build a circular linked list of the groups 5532 * covered by the given span, and will set each group's ->cpumask correctly, 5533 * and ->cpu_power to 0. 5534 * 5535 * Assumes the sched_domain tree is fully constructed 5536 */ 5537static int 5538build_sched_groups(struct sched_domain *sd, int cpu) 5539{ 5540 struct sched_group *first = NULL, *last = NULL; 5541 struct sd_data *sdd = sd->private; 5542 const struct cpumask *span = sched_domain_span(sd); 5543 struct cpumask *covered; 5544 int i; 5545 5546 get_group(cpu, sdd, &sd->groups); 5547 atomic_inc(&sd->groups->ref); 5548 5549 if (cpu != cpumask_first(span)) 5550 return 0; 5551 5552 lockdep_assert_held(&sched_domains_mutex); 5553 covered = sched_domains_tmpmask; 5554 5555 cpumask_clear(covered); 5556 5557 for_each_cpu(i, span) { 5558 struct sched_group *sg; 5559 int group, j; 5560 5561 if (cpumask_test_cpu(i, covered)) 5562 continue; 5563 5564 group = get_group(i, sdd, &sg); 5565 cpumask_clear(sched_group_cpus(sg)); 5566 sg->sgp->power = 0; 5567 cpumask_setall(sched_group_mask(sg)); 5568 5569 for_each_cpu(j, span) { 5570 if (get_group(j, sdd, NULL) != group) 5571 continue; 5572 5573 cpumask_set_cpu(j, covered); 5574 cpumask_set_cpu(j, sched_group_cpus(sg)); 5575 } 5576 5577 if (!first) 5578 first = sg; 5579 if (last) 5580 last->next = sg; 5581 last = sg; 5582 } 5583 last->next = first; 5584 5585 return 0; 5586} 5587 5588/* 5589 * Initialize sched groups cpu_power. 5590 * 5591 * cpu_power indicates the capacity of sched group, which is used while 5592 * distributing the load between different sched groups in a sched domain. 5593 * Typically cpu_power for all the groups in a sched domain will be same unless 5594 * there are asymmetries in the topology. If there are asymmetries, group 5595 * having more cpu_power will pickup more load compared to the group having 5596 * less cpu_power. 5597 */ 5598static void init_sched_groups_power(int cpu, struct sched_domain *sd) 5599{ 5600 struct sched_group *sg = sd->groups; 5601 5602 WARN_ON(!sg); 5603 5604 do { 5605 sg->group_weight = cpumask_weight(sched_group_cpus(sg)); 5606 sg = sg->next; 5607 } while (sg != sd->groups); 5608 5609 if (cpu != group_balance_cpu(sg)) 5610 return; 5611 5612 update_group_power(sd, cpu); 5613 atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight); 5614} 5615 5616int __weak arch_sd_sibling_asym_packing(void) 5617{ 5618 return 0*SD_ASYM_PACKING; 5619} 5620 5621/* 5622 * Initializers for schedule domains 5623 * Non-inlined to reduce accumulated stack pressure in build_sched_domains() 5624 */ 5625 5626#ifdef CONFIG_SCHED_DEBUG 5627# define SD_INIT_NAME(sd, type) sd->name = #type 5628#else 5629# define SD_INIT_NAME(sd, type) do { } while (0) 5630#endif 5631 5632#define SD_INIT_FUNC(type) \ 5633static noinline struct sched_domain * \ 5634sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ 5635{ \ 5636 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \ 5637 *sd = SD_##type##_INIT; \ 5638 SD_INIT_NAME(sd, type); \ 5639 sd->private = &tl->data; \ 5640 return sd; \ 5641} 5642 5643SD_INIT_FUNC(CPU) 5644#ifdef CONFIG_SCHED_SMT 5645 SD_INIT_FUNC(SIBLING) 5646#endif 5647#ifdef CONFIG_SCHED_MC 5648 SD_INIT_FUNC(MC) 5649#endif 5650#ifdef CONFIG_SCHED_BOOK 5651 SD_INIT_FUNC(BOOK) 5652#endif 5653 5654static int default_relax_domain_level = -1; 5655int sched_domain_level_max; 5656 5657static int __init setup_relax_domain_level(char *str) 5658{ 5659 if (kstrtoint(str, 0, &default_relax_domain_level)) 5660 pr_warn("Unable to set relax_domain_level\n"); 5661 5662 return 1; 5663} 5664__setup("relax_domain_level=", setup_relax_domain_level); 5665 5666static void set_domain_attribute(struct sched_domain *sd, 5667 struct sched_domain_attr *attr) 5668{ 5669 int request; 5670 5671 if (!attr || attr->relax_domain_level < 0) { 5672 if (default_relax_domain_level < 0) 5673 return; 5674 else 5675 request = default_relax_domain_level; 5676 } else 5677 request = attr->relax_domain_level; 5678 if (request < sd->level) { 5679 /* turn off idle balance on this domain */ 5680 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); 5681 } else { 5682 /* turn on idle balance on this domain */ 5683 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); 5684 } 5685} 5686 5687static void __sdt_free(const struct cpumask *cpu_map); 5688static int __sdt_alloc(const struct cpumask *cpu_map); 5689 5690static void __free_domain_allocs(struct s_data *d, enum s_alloc what, 5691 const struct cpumask *cpu_map) 5692{ 5693 switch (what) { 5694 case sa_rootdomain: 5695 if (!atomic_read(&d->rd->refcount)) 5696 free_rootdomain(&d->rd->rcu); /* fall through */ 5697 case sa_sd: 5698 free_percpu(d->sd); /* fall through */ 5699 case sa_sd_storage: 5700 __sdt_free(cpu_map); /* fall through */ 5701 case sa_none: 5702 break; 5703 } 5704} 5705 5706static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, 5707 const struct cpumask *cpu_map) 5708{ 5709 memset(d, 0, sizeof(*d)); 5710 5711 if (__sdt_alloc(cpu_map)) 5712 return sa_sd_storage; 5713 d->sd = alloc_percpu(struct sched_domain *); 5714 if (!d->sd) 5715 return sa_sd_storage; 5716 d->rd = alloc_rootdomain(); 5717 if (!d->rd) 5718 return sa_sd; 5719 return sa_rootdomain; 5720} 5721 5722/* 5723 * NULL the sd_data elements we've used to build the sched_domain and 5724 * sched_group structure so that the subsequent __free_domain_allocs() 5725 * will not free the data we're using. 5726 */ 5727static void claim_allocations(int cpu, struct sched_domain *sd) 5728{ 5729 struct sd_data *sdd = sd->private; 5730 5731 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); 5732 *per_cpu_ptr(sdd->sd, cpu) = NULL; 5733 5734 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) 5735 *per_cpu_ptr(sdd->sg, cpu) = NULL; 5736 5737 if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref)) 5738 *per_cpu_ptr(sdd->sgp, cpu) = NULL; 5739} 5740 5741#ifdef CONFIG_SCHED_SMT 5742static const struct cpumask *cpu_smt_mask(int cpu) 5743{ 5744 return topology_thread_cpumask(cpu); 5745} 5746#endif 5747 5748/* 5749 * Topology list, bottom-up. 5750 */ 5751static struct sched_domain_topology_level default_topology[] = { 5752#ifdef CONFIG_SCHED_SMT 5753 { sd_init_SIBLING, cpu_smt_mask, }, 5754#endif 5755#ifdef CONFIG_SCHED_MC 5756 { sd_init_MC, cpu_coregroup_mask, }, 5757#endif 5758#ifdef CONFIG_SCHED_BOOK 5759 { sd_init_BOOK, cpu_book_mask, }, 5760#endif 5761 { sd_init_CPU, cpu_cpu_mask, }, 5762 { NULL, }, 5763}; 5764 5765static struct sched_domain_topology_level *sched_domain_topology = default_topology; 5766 5767#define for_each_sd_topology(tl) \ 5768 for (tl = sched_domain_topology; tl->init; tl++) 5769 5770#ifdef CONFIG_NUMA 5771 5772static int sched_domains_numa_levels; 5773static int *sched_domains_numa_distance; 5774static struct cpumask ***sched_domains_numa_masks; 5775static int sched_domains_curr_level; 5776 5777static inline int sd_local_flags(int level) 5778{ 5779 if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE) 5780 return 0; 5781 5782 return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; 5783} 5784 5785static struct sched_domain * 5786sd_numa_init(struct sched_domain_topology_level *tl, int cpu) 5787{ 5788 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); 5789 int level = tl->numa_level; 5790 int sd_weight = cpumask_weight( 5791 sched_domains_numa_masks[level][cpu_to_node(cpu)]); 5792 5793 *sd = (struct sched_domain){ 5794 .min_interval = sd_weight, 5795 .max_interval = 2*sd_weight, 5796 .busy_factor = 32, 5797 .imbalance_pct = 125, 5798 .cache_nice_tries = 2, 5799 .busy_idx = 3, 5800 .idle_idx = 2, 5801 .newidle_idx = 0, 5802 .wake_idx = 0, 5803 .forkexec_idx = 0, 5804 5805 .flags = 1*SD_LOAD_BALANCE 5806 | 1*SD_BALANCE_NEWIDLE 5807 | 0*SD_BALANCE_EXEC 5808 | 0*SD_BALANCE_FORK 5809 | 0*SD_BALANCE_WAKE 5810 | 0*SD_WAKE_AFFINE 5811 | 0*SD_SHARE_CPUPOWER 5812 | 0*SD_SHARE_PKG_RESOURCES 5813 | 1*SD_SERIALIZE 5814 | 0*SD_PREFER_SIBLING 5815 | 1*SD_NUMA 5816 | sd_local_flags(level) 5817 , 5818 .last_balance = jiffies, 5819 .balance_interval = sd_weight, 5820 }; 5821 SD_INIT_NAME(sd, NUMA); 5822 sd->private = &tl->data; 5823 5824 /* 5825 * Ugly hack to pass state to sd_numa_mask()... 5826 */ 5827 sched_domains_curr_level = tl->numa_level; 5828 5829 return sd; 5830} 5831 5832static const struct cpumask *sd_numa_mask(int cpu) 5833{ 5834 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; 5835} 5836 5837static void sched_numa_warn(const char *str) 5838{ 5839 static int done = false; 5840 int i,j; 5841 5842 if (done) 5843 return; 5844 5845 done = true; 5846 5847 printk(KERN_WARNING "ERROR: %s\n\n", str); 5848 5849 for (i = 0; i < nr_node_ids; i++) { 5850 printk(KERN_WARNING " "); 5851 for (j = 0; j < nr_node_ids; j++) 5852 printk(KERN_CONT "%02d ", node_distance(i,j)); 5853 printk(KERN_CONT "\n"); 5854 } 5855 printk(KERN_WARNING "\n"); 5856} 5857 5858static bool find_numa_distance(int distance) 5859{ 5860 int i; 5861 5862 if (distance == node_distance(0, 0)) 5863 return true; 5864 5865 for (i = 0; i < sched_domains_numa_levels; i++) { 5866 if (sched_domains_numa_distance[i] == distance) 5867 return true; 5868 } 5869 5870 return false; 5871} 5872 5873static void sched_init_numa(void) 5874{ 5875 int next_distance, curr_distance = node_distance(0, 0); 5876 struct sched_domain_topology_level *tl; 5877 int level = 0; 5878 int i, j, k; 5879 5880 sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); 5881 if (!sched_domains_numa_distance) 5882 return; 5883 5884 /* 5885 * O(nr_nodes^2) deduplicating selection sort -- in order to find the 5886 * unique distances in the node_distance() table. 5887 * 5888 * Assumes node_distance(0,j) includes all distances in 5889 * node_distance(i,j) in order to avoid cubic time. 5890 */ 5891 next_distance = curr_distance; 5892 for (i = 0; i < nr_node_ids; i++) { 5893 for (j = 0; j < nr_node_ids; j++) { 5894 for (k = 0; k < nr_node_ids; k++) { 5895 int distance = node_distance(i, k); 5896 5897 if (distance > curr_distance && 5898 (distance < next_distance || 5899 next_distance == curr_distance)) 5900 next_distance = distance; 5901 5902 /* 5903 * While not a strong assumption it would be nice to know 5904 * about cases where if node A is connected to B, B is not 5905 * equally connected to A. 5906 */ 5907 if (sched_debug() && node_distance(k, i) != distance) 5908 sched_numa_warn("Node-distance not symmetric"); 5909 5910 if (sched_debug() && i && !find_numa_distance(distance)) 5911 sched_numa_warn("Node-0 not representative"); 5912 } 5913 if (next_distance != curr_distance) { 5914 sched_domains_numa_distance[level++] = next_distance; 5915 sched_domains_numa_levels = level; 5916 curr_distance = next_distance; 5917 } else break; 5918 } 5919 5920 /* 5921 * In case of sched_debug() we verify the above assumption. 5922 */ 5923 if (!sched_debug()) 5924 break; 5925 } 5926 /* 5927 * 'level' contains the number of unique distances, excluding the 5928 * identity distance node_distance(i,i). 5929 * 5930 * The sched_domains_numa_distance[] array includes the actual distance 5931 * numbers. 5932 */ 5933 5934 /* 5935 * Here, we should temporarily reset sched_domains_numa_levels to 0. 5936 * If it fails to allocate memory for array sched_domains_numa_masks[][], 5937 * the array will contain less then 'level' members. This could be 5938 * dangerous when we use it to iterate array sched_domains_numa_masks[][] 5939 * in other functions. 5940 * 5941 * We reset it to 'level' at the end of this function. 5942 */ 5943 sched_domains_numa_levels = 0; 5944 5945 sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); 5946 if (!sched_domains_numa_masks) 5947 return; 5948 5949 /* 5950 * Now for each level, construct a mask per node which contains all 5951 * cpus of nodes that are that many hops away from us. 5952 */ 5953 for (i = 0; i < level; i++) { 5954 sched_domains_numa_masks[i] = 5955 kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); 5956 if (!sched_domains_numa_masks[i]) 5957 return; 5958 5959 for (j = 0; j < nr_node_ids; j++) { 5960 struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); 5961 if (!mask) 5962 return; 5963 5964 sched_domains_numa_masks[i][j] = mask; 5965 5966 for (k = 0; k < nr_node_ids; k++) { 5967 if (node_distance(j, k) > sched_domains_numa_distance[i]) 5968 continue; 5969 5970 cpumask_or(mask, mask, cpumask_of_node(k)); 5971 } 5972 } 5973 } 5974 5975 tl = kzalloc((ARRAY_SIZE(default_topology) + level) * 5976 sizeof(struct sched_domain_topology_level), GFP_KERNEL); 5977 if (!tl) 5978 return; 5979 5980 /* 5981 * Copy the default topology bits.. 5982 */ 5983 for (i = 0; default_topology[i].init; i++) 5984 tl[i] = default_topology[i]; 5985 5986 /* 5987 * .. and append 'j' levels of NUMA goodness. 5988 */ 5989 for (j = 0; j < level; i++, j++) { 5990 tl[i] = (struct sched_domain_topology_level){ 5991 .init = sd_numa_init, 5992 .mask = sd_numa_mask, 5993 .flags = SDTL_OVERLAP, 5994 .numa_level = j, 5995 }; 5996 } 5997 5998 sched_domain_topology = tl; 5999 6000 sched_domains_numa_levels = level; 6001} 6002 6003static void sched_domains_numa_masks_set(int cpu) 6004{ 6005 int i, j; 6006 int node = cpu_to_node(cpu); 6007 6008 for (i = 0; i < sched_domains_numa_levels; i++) { 6009 for (j = 0; j < nr_node_ids; j++) { 6010 if (node_distance(j, node) <= sched_domains_numa_distance[i]) 6011 cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); 6012 } 6013 } 6014} 6015 6016static void sched_domains_numa_masks_clear(int cpu) 6017{ 6018 int i, j; 6019 for (i = 0; i < sched_domains_numa_levels; i++) { 6020 for (j = 0; j < nr_node_ids; j++) 6021 cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); 6022 } 6023} 6024 6025/* 6026 * Update sched_domains_numa_masks[level][node] array when new cpus 6027 * are onlined. 6028 */ 6029static int sched_domains_numa_masks_update(struct notifier_block *nfb, 6030 unsigned long action, 6031 void *hcpu) 6032{ 6033 int cpu = (long)hcpu; 6034 6035 switch (action & ~CPU_TASKS_FROZEN) { 6036 case CPU_ONLINE: 6037 sched_domains_numa_masks_set(cpu); 6038 break; 6039 6040 case CPU_DEAD: 6041 sched_domains_numa_masks_clear(cpu); 6042 break; 6043 6044 default: 6045 return NOTIFY_DONE; 6046 } 6047 6048 return NOTIFY_OK; 6049} 6050#else 6051static inline void sched_init_numa(void) 6052{ 6053} 6054 6055static int sched_domains_numa_masks_update(struct notifier_block *nfb, 6056 unsigned long action, 6057 void *hcpu) 6058{ 6059 return 0; 6060} 6061#endif /* CONFIG_NUMA */ 6062 6063static int __sdt_alloc(const struct cpumask *cpu_map) 6064{ 6065 struct sched_domain_topology_level *tl; 6066 int j; 6067 6068 for_each_sd_topology(tl) { 6069 struct sd_data *sdd = &tl->data; 6070 6071 sdd->sd = alloc_percpu(struct sched_domain *); 6072 if (!sdd->sd) 6073 return -ENOMEM; 6074 6075 sdd->sg = alloc_percpu(struct sched_group *); 6076 if (!sdd->sg) 6077 return -ENOMEM; 6078 6079 sdd->sgp = alloc_percpu(struct sched_group_power *); 6080 if (!sdd->sgp) 6081 return -ENOMEM; 6082 6083 for_each_cpu(j, cpu_map) { 6084 struct sched_domain *sd; 6085 struct sched_group *sg; 6086 struct sched_group_power *sgp; 6087 6088 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), 6089 GFP_KERNEL, cpu_to_node(j)); 6090 if (!sd) 6091 return -ENOMEM; 6092 6093 *per_cpu_ptr(sdd->sd, j) = sd; 6094 6095 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 6096 GFP_KERNEL, cpu_to_node(j)); 6097 if (!sg) 6098 return -ENOMEM; 6099 6100 sg->next = sg; 6101 6102 *per_cpu_ptr(sdd->sg, j) = sg; 6103 6104 sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(), 6105 GFP_KERNEL, cpu_to_node(j)); 6106 if (!sgp) 6107 return -ENOMEM; 6108 6109 *per_cpu_ptr(sdd->sgp, j) = sgp; 6110 } 6111 } 6112 6113 return 0; 6114} 6115 6116static void __sdt_free(const struct cpumask *cpu_map) 6117{ 6118 struct sched_domain_topology_level *tl; 6119 int j; 6120 6121 for_each_sd_topology(tl) { 6122 struct sd_data *sdd = &tl->data; 6123 6124 for_each_cpu(j, cpu_map) { 6125 struct sched_domain *sd; 6126 6127 if (sdd->sd) { 6128 sd = *per_cpu_ptr(sdd->sd, j); 6129 if (sd && (sd->flags & SD_OVERLAP)) 6130 free_sched_groups(sd->groups, 0); 6131 kfree(*per_cpu_ptr(sdd->sd, j)); 6132 } 6133 6134 if (sdd->sg) 6135 kfree(*per_cpu_ptr(sdd->sg, j)); 6136 if (sdd->sgp) 6137 kfree(*per_cpu_ptr(sdd->sgp, j)); 6138 } 6139 free_percpu(sdd->sd); 6140 sdd->sd = NULL; 6141 free_percpu(sdd->sg); 6142 sdd->sg = NULL; 6143 free_percpu(sdd->sgp); 6144 sdd->sgp = NULL; 6145 } 6146} 6147 6148struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, 6149 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 6150 struct sched_domain *child, int cpu) 6151{ 6152 struct sched_domain *sd = tl->init(tl, cpu); 6153 if (!sd) 6154 return child; 6155 6156 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); 6157 if (child) { 6158 sd->level = child->level + 1; 6159 sched_domain_level_max = max(sched_domain_level_max, sd->level); 6160 child->parent = sd; 6161 sd->child = child; 6162 } 6163 set_domain_attribute(sd, attr); 6164 6165 return sd; 6166} 6167 6168/* 6169 * Build sched domains for a given set of cpus and attach the sched domains 6170 * to the individual cpus 6171 */ 6172static int build_sched_domains(const struct cpumask *cpu_map, 6173 struct sched_domain_attr *attr) 6174{ 6175 enum s_alloc alloc_state; 6176 struct sched_domain *sd; 6177 struct s_data d; 6178 int i, ret = -ENOMEM; 6179 6180 alloc_state = __visit_domain_allocation_hell(&d, cpu_map); 6181 if (alloc_state != sa_rootdomain) 6182 goto error; 6183 6184 /* Set up domains for cpus specified by the cpu_map. */ 6185 for_each_cpu(i, cpu_map) { 6186 struct sched_domain_topology_level *tl; 6187 6188 sd = NULL; 6189 for_each_sd_topology(tl) { 6190 sd = build_sched_domain(tl, cpu_map, attr, sd, i); 6191 if (tl == sched_domain_topology) 6192 *per_cpu_ptr(d.sd, i) = sd; 6193 if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) 6194 sd->flags |= SD_OVERLAP; 6195 if (cpumask_equal(cpu_map, sched_domain_span(sd))) 6196 break; 6197 } 6198 } 6199 6200 /* Build the groups for the domains */ 6201 for_each_cpu(i, cpu_map) { 6202 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { 6203 sd->span_weight = cpumask_weight(sched_domain_span(sd)); 6204 if (sd->flags & SD_OVERLAP) { 6205 if (build_overlap_sched_groups(sd, i)) 6206 goto error; 6207 } else { 6208 if (build_sched_groups(sd, i)) 6209 goto error; 6210 } 6211 } 6212 } 6213 6214 /* Calculate CPU power for physical packages and nodes */ 6215 for (i = nr_cpumask_bits-1; i >= 0; i--) { 6216 if (!cpumask_test_cpu(i, cpu_map)) 6217 continue; 6218 6219 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { 6220 claim_allocations(i, sd); 6221 init_sched_groups_power(i, sd); 6222 } 6223 } 6224 6225 /* Attach the domains */ 6226 rcu_read_lock(); 6227 for_each_cpu(i, cpu_map) { 6228 sd = *per_cpu_ptr(d.sd, i); 6229 cpu_attach_domain(sd, d.rd, i); 6230 } 6231 rcu_read_unlock(); 6232 6233 ret = 0; 6234error: 6235 __free_domain_allocs(&d, alloc_state, cpu_map); 6236 return ret; 6237} 6238 6239static cpumask_var_t *doms_cur; /* current sched domains */ 6240static int ndoms_cur; /* number of sched domains in 'doms_cur' */ 6241static struct sched_domain_attr *dattr_cur; 6242 /* attribues of custom domains in 'doms_cur' */ 6243 6244/* 6245 * Special case: If a kmalloc of a doms_cur partition (array of 6246 * cpumask) fails, then fallback to a single sched domain, 6247 * as determined by the single cpumask fallback_doms. 6248 */ 6249static cpumask_var_t fallback_doms; 6250 6251/* 6252 * arch_update_cpu_topology lets virtualized architectures update the 6253 * cpu core maps. It is supposed to return 1 if the topology changed 6254 * or 0 if it stayed the same. 6255 */ 6256int __attribute__((weak)) arch_update_cpu_topology(void) 6257{ 6258 return 0; 6259} 6260 6261cpumask_var_t *alloc_sched_domains(unsigned int ndoms) 6262{ 6263 int i; 6264 cpumask_var_t *doms; 6265 6266 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); 6267 if (!doms) 6268 return NULL; 6269 for (i = 0; i < ndoms; i++) { 6270 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { 6271 free_sched_domains(doms, i); 6272 return NULL; 6273 } 6274 } 6275 return doms; 6276} 6277 6278void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) 6279{ 6280 unsigned int i; 6281 for (i = 0; i < ndoms; i++) 6282 free_cpumask_var(doms[i]); 6283 kfree(doms); 6284} 6285 6286/* 6287 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 6288 * For now this just excludes isolated cpus, but could be used to 6289 * exclude other special cases in the future. 6290 */ 6291static int init_sched_domains(const struct cpumask *cpu_map) 6292{ 6293 int err; 6294 6295 arch_update_cpu_topology(); 6296 ndoms_cur = 1; 6297 doms_cur = alloc_sched_domains(ndoms_cur); 6298 if (!doms_cur) 6299 doms_cur = &fallback_doms; 6300 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); 6301 err = build_sched_domains(doms_cur[0], NULL); 6302 register_sched_domain_sysctl(); 6303 6304 return err; 6305} 6306 6307/* 6308 * Detach sched domains from a group of cpus specified in cpu_map 6309 * These cpus will now be attached to the NULL domain 6310 */ 6311static void detach_destroy_domains(const struct cpumask *cpu_map) 6312{ 6313 int i; 6314 6315 rcu_read_lock(); 6316 for_each_cpu(i, cpu_map) 6317 cpu_attach_domain(NULL, &def_root_domain, i); 6318 rcu_read_unlock(); 6319} 6320 6321/* handle null as "default" */ 6322static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, 6323 struct sched_domain_attr *new, int idx_new) 6324{ 6325 struct sched_domain_attr tmp; 6326 6327 /* fast path */ 6328 if (!new && !cur) 6329 return 1; 6330 6331 tmp = SD_ATTR_INIT; 6332 return !memcmp(cur ? (cur + idx_cur) : &tmp, 6333 new ? (new + idx_new) : &tmp, 6334 sizeof(struct sched_domain_attr)); 6335} 6336 6337/* 6338 * Partition sched domains as specified by the 'ndoms_new' 6339 * cpumasks in the array doms_new[] of cpumasks. This compares 6340 * doms_new[] to the current sched domain partitioning, doms_cur[]. 6341 * It destroys each deleted domain and builds each new domain. 6342 * 6343 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. 6344 * The masks don't intersect (don't overlap.) We should setup one 6345 * sched domain for each mask. CPUs not in any of the cpumasks will 6346 * not be load balanced. If the same cpumask appears both in the 6347 * current 'doms_cur' domains and in the new 'doms_new', we can leave 6348 * it as it is. 6349 * 6350 * The passed in 'doms_new' should be allocated using 6351 * alloc_sched_domains. This routine takes ownership of it and will 6352 * free_sched_domains it when done with it. If the caller failed the 6353 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, 6354 * and partition_sched_domains() will fallback to the single partition 6355 * 'fallback_doms', it also forces the domains to be rebuilt. 6356 * 6357 * If doms_new == NULL it will be replaced with cpu_online_mask. 6358 * ndoms_new == 0 is a special case for destroying existing domains, 6359 * and it will not create the default domain. 6360 * 6361 * Call with hotplug lock held 6362 */ 6363void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], 6364 struct sched_domain_attr *dattr_new) 6365{ 6366 int i, j, n; 6367 int new_topology; 6368 6369 mutex_lock(&sched_domains_mutex); 6370 6371 /* always unregister in case we don't destroy any domains */ 6372 unregister_sched_domain_sysctl(); 6373 6374 /* Let architecture update cpu core mappings. */ 6375 new_topology = arch_update_cpu_topology(); 6376 6377 n = doms_new ? ndoms_new : 0; 6378 6379 /* Destroy deleted domains */ 6380 for (i = 0; i < ndoms_cur; i++) { 6381 for (j = 0; j < n && !new_topology; j++) { 6382 if (cpumask_equal(doms_cur[i], doms_new[j]) 6383 && dattrs_equal(dattr_cur, i, dattr_new, j)) 6384 goto match1; 6385 } 6386 /* no match - a current sched domain not in new doms_new[] */ 6387 detach_destroy_domains(doms_cur[i]); 6388match1: 6389 ; 6390 } 6391 6392 n = ndoms_cur; 6393 if (doms_new == NULL) { 6394 n = 0; 6395 doms_new = &fallback_doms; 6396 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); 6397 WARN_ON_ONCE(dattr_new); 6398 } 6399 6400 /* Build new domains */ 6401 for (i = 0; i < ndoms_new; i++) { 6402 for (j = 0; j < n && !new_topology; j++) { 6403 if (cpumask_equal(doms_new[i], doms_cur[j]) 6404 && dattrs_equal(dattr_new, i, dattr_cur, j)) 6405 goto match2; 6406 } 6407 /* no match - add a new doms_new */ 6408 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); 6409match2: 6410 ; 6411 } 6412 6413 /* Remember the new sched domains */ 6414 if (doms_cur != &fallback_doms) 6415 free_sched_domains(doms_cur, ndoms_cur); 6416 kfree(dattr_cur); /* kfree(NULL) is safe */ 6417 doms_cur = doms_new; 6418 dattr_cur = dattr_new; 6419 ndoms_cur = ndoms_new; 6420 6421 register_sched_domain_sysctl(); 6422 6423 mutex_unlock(&sched_domains_mutex); 6424} 6425 6426static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */ 6427 6428/* 6429 * Update cpusets according to cpu_active mask. If cpusets are 6430 * disabled, cpuset_update_active_cpus() becomes a simple wrapper 6431 * around partition_sched_domains(). 6432 * 6433 * If we come here as part of a suspend/resume, don't touch cpusets because we 6434 * want to restore it back to its original state upon resume anyway. 6435 */ 6436static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, 6437 void *hcpu) 6438{ 6439 switch (action) { 6440 case CPU_ONLINE_FROZEN: 6441 case CPU_DOWN_FAILED_FROZEN: 6442 6443 /* 6444 * num_cpus_frozen tracks how many CPUs are involved in suspend 6445 * resume sequence. As long as this is not the last online 6446 * operation in the resume sequence, just build a single sched 6447 * domain, ignoring cpusets. 6448 */ 6449 num_cpus_frozen--; 6450 if (likely(num_cpus_frozen)) { 6451 partition_sched_domains(1, NULL, NULL); 6452 break; 6453 } 6454 6455 /* 6456 * This is the last CPU online operation. So fall through and 6457 * restore the original sched domains by considering the 6458 * cpuset configurations. 6459 */ 6460 6461 case CPU_ONLINE: 6462 case CPU_DOWN_FAILED: 6463 cpuset_update_active_cpus(true); 6464 break; 6465 default: 6466 return NOTIFY_DONE; 6467 } 6468 return NOTIFY_OK; 6469} 6470 6471static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, 6472 void *hcpu) 6473{ 6474 switch (action) { 6475 case CPU_DOWN_PREPARE: 6476 cpuset_update_active_cpus(false); 6477 break; 6478 case CPU_DOWN_PREPARE_FROZEN: 6479 num_cpus_frozen++; 6480 partition_sched_domains(1, NULL, NULL); 6481 break; 6482 default: 6483 return NOTIFY_DONE; 6484 } 6485 return NOTIFY_OK; 6486} 6487 6488void __init sched_init_smp(void) 6489{ 6490 cpumask_var_t non_isolated_cpus; 6491 6492 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 6493 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 6494 6495 sched_init_numa(); 6496 6497 get_online_cpus(); 6498 mutex_lock(&sched_domains_mutex); 6499 init_sched_domains(cpu_active_mask); 6500 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 6501 if (cpumask_empty(non_isolated_cpus)) 6502 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 6503 mutex_unlock(&sched_domains_mutex); 6504 put_online_cpus(); 6505 6506 hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE); 6507 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); 6508 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); 6509 6510 init_hrtick(); 6511 6512 /* Move init over to a non-isolated CPU */ 6513 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) 6514 BUG(); 6515 sched_init_granularity(); 6516 free_cpumask_var(non_isolated_cpus); 6517 6518 init_sched_rt_class(); 6519} 6520#else 6521void __init sched_init_smp(void) 6522{ 6523 sched_init_granularity(); 6524} 6525#endif /* CONFIG_SMP */ 6526 6527const_debug unsigned int sysctl_timer_migration = 1; 6528 6529int in_sched_functions(unsigned long addr) 6530{ 6531 return in_lock_functions(addr) || 6532 (addr >= (unsigned long)__sched_text_start 6533 && addr < (unsigned long)__sched_text_end); 6534} 6535 6536#ifdef CONFIG_CGROUP_SCHED 6537/* 6538 * Default task group. 6539 * Every task in system belongs to this group at bootup. 6540 */ 6541struct task_group root_task_group; 6542LIST_HEAD(task_groups); 6543#endif 6544 6545DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); 6546 6547void __init sched_init(void) 6548{ 6549 int i, j; 6550 unsigned long alloc_size = 0, ptr; 6551 6552#ifdef CONFIG_FAIR_GROUP_SCHED 6553 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 6554#endif 6555#ifdef CONFIG_RT_GROUP_SCHED 6556 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 6557#endif 6558#ifdef CONFIG_CPUMASK_OFFSTACK 6559 alloc_size += num_possible_cpus() * cpumask_size(); 6560#endif 6561 if (alloc_size) { 6562 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); 6563 6564#ifdef CONFIG_FAIR_GROUP_SCHED 6565 root_task_group.se = (struct sched_entity **)ptr; 6566 ptr += nr_cpu_ids * sizeof(void **); 6567 6568 root_task_group.cfs_rq = (struct cfs_rq **)ptr; 6569 ptr += nr_cpu_ids * sizeof(void **); 6570 6571#endif /* CONFIG_FAIR_GROUP_SCHED */ 6572#ifdef CONFIG_RT_GROUP_SCHED 6573 root_task_group.rt_se = (struct sched_rt_entity **)ptr; 6574 ptr += nr_cpu_ids * sizeof(void **); 6575 6576 root_task_group.rt_rq = (struct rt_rq **)ptr; 6577 ptr += nr_cpu_ids * sizeof(void **); 6578 6579#endif /* CONFIG_RT_GROUP_SCHED */ 6580#ifdef CONFIG_CPUMASK_OFFSTACK 6581 for_each_possible_cpu(i) { 6582 per_cpu(load_balance_mask, i) = (void *)ptr; 6583 ptr += cpumask_size(); 6584 } 6585#endif /* CONFIG_CPUMASK_OFFSTACK */ 6586 } 6587 6588#ifdef CONFIG_SMP 6589 init_defrootdomain(); 6590#endif 6591 6592 init_rt_bandwidth(&def_rt_bandwidth, 6593 global_rt_period(), global_rt_runtime()); 6594 6595#ifdef CONFIG_RT_GROUP_SCHED 6596 init_rt_bandwidth(&root_task_group.rt_bandwidth, 6597 global_rt_period(), global_rt_runtime()); 6598#endif /* CONFIG_RT_GROUP_SCHED */ 6599 6600#ifdef CONFIG_CGROUP_SCHED 6601 list_add(&root_task_group.list, &task_groups); 6602 INIT_LIST_HEAD(&root_task_group.children); 6603 INIT_LIST_HEAD(&root_task_group.siblings); 6604 autogroup_init(&init_task); 6605 6606#endif /* CONFIG_CGROUP_SCHED */ 6607 6608 for_each_possible_cpu(i) { 6609 struct rq *rq; 6610 6611 rq = cpu_rq(i); 6612 raw_spin_lock_init(&rq->lock); 6613 rq->nr_running = 0; 6614 rq->calc_load_active = 0; 6615 rq->calc_load_update = jiffies + LOAD_FREQ; 6616 init_cfs_rq(&rq->cfs); 6617 init_rt_rq(&rq->rt, rq); 6618#ifdef CONFIG_FAIR_GROUP_SCHED 6619 root_task_group.shares = ROOT_TASK_GROUP_LOAD; 6620 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 6621 /* 6622 * How much cpu bandwidth does root_task_group get? 6623 * 6624 * In case of task-groups formed thr' the cgroup filesystem, it 6625 * gets 100% of the cpu resources in the system. This overall 6626 * system cpu resource is divided among the tasks of 6627 * root_task_group and its child task-groups in a fair manner, 6628 * based on each entity's (task or task-group's) weight 6629 * (se->load.weight). 6630 * 6631 * In other words, if root_task_group has 10 tasks of weight 6632 * 1024) and two child groups A0 and A1 (of weight 1024 each), 6633 * then A0's share of the cpu resource is: 6634 * 6635 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% 6636 * 6637 * We achieve this by letting root_task_group's tasks sit 6638 * directly in rq->cfs (i.e root_task_group->se[] = NULL). 6639 */ 6640 init_cfs_bandwidth(&root_task_group.cfs_bandwidth); 6641 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); 6642#endif /* CONFIG_FAIR_GROUP_SCHED */ 6643 6644 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; 6645#ifdef CONFIG_RT_GROUP_SCHED 6646 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 6647 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); 6648#endif 6649 6650 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 6651 rq->cpu_load[j] = 0; 6652 6653 rq->last_load_update_tick = jiffies; 6654 6655#ifdef CONFIG_SMP 6656 rq->sd = NULL; 6657 rq->rd = NULL; 6658 rq->cpu_power = SCHED_POWER_SCALE; 6659 rq->post_schedule = 0; 6660 rq->active_balance = 0; 6661 rq->next_balance = jiffies; 6662 rq->push_cpu = 0; 6663 rq->cpu = i; 6664 rq->online = 0; 6665 rq->idle_stamp = 0; 6666 rq->avg_idle = 2*sysctl_sched_migration_cost; 6667 rq->max_idle_balance_cost = sysctl_sched_migration_cost; 6668 6669 INIT_LIST_HEAD(&rq->cfs_tasks); 6670 6671 rq_attach_root(rq, &def_root_domain); 6672#ifdef CONFIG_NO_HZ_COMMON 6673 rq->nohz_flags = 0; 6674#endif 6675#ifdef CONFIG_NO_HZ_FULL 6676 rq->last_sched_tick = 0; 6677#endif 6678#endif 6679 init_rq_hrtick(rq); 6680 atomic_set(&rq->nr_iowait, 0); 6681 } 6682 6683 set_load_weight(&init_task); 6684 6685#ifdef CONFIG_PREEMPT_NOTIFIERS 6686 INIT_HLIST_HEAD(&init_task.preempt_notifiers); 6687#endif 6688 6689#ifdef CONFIG_RT_MUTEXES 6690 plist_head_init(&init_task.pi_waiters); 6691#endif 6692 6693 /* 6694 * The boot idle thread does lazy MMU switching as well: 6695 */ 6696 atomic_inc(&init_mm.mm_count); 6697 enter_lazy_tlb(&init_mm, current); 6698 6699 /* 6700 * Make us the idle thread. Technically, schedule() should not be 6701 * called from this thread, however somewhere below it might be, 6702 * but because we are the idle thread, we just pick up running again 6703 * when this runqueue becomes "idle". 6704 */ 6705 init_idle(current, smp_processor_id()); 6706 6707 calc_load_update = jiffies + LOAD_FREQ; 6708 6709 /* 6710 * During early bootup we pretend to be a normal task: 6711 */ 6712 current->sched_class = &fair_sched_class; 6713 6714#ifdef CONFIG_SMP 6715 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); 6716 /* May be allocated at isolcpus cmdline parse time */ 6717 if (cpu_isolated_map == NULL) 6718 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 6719 idle_thread_set_boot_cpu(); 6720#endif 6721 init_sched_fair_class(); 6722 6723 scheduler_running = 1; 6724} 6725 6726#ifdef CONFIG_DEBUG_ATOMIC_SLEEP 6727static inline int preempt_count_equals(int preempt_offset) 6728{ 6729 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); 6730 6731 return (nested == preempt_offset); 6732} 6733 6734void __might_sleep(const char *file, int line, int preempt_offset) 6735{ 6736 static unsigned long prev_jiffy; /* ratelimiting */ 6737 6738 rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ 6739 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || 6740 system_state != SYSTEM_RUNNING || oops_in_progress) 6741 return; 6742 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 6743 return; 6744 prev_jiffy = jiffies; 6745 6746 printk(KERN_ERR 6747 "BUG: sleeping function called from invalid context at %s:%d\n", 6748 file, line); 6749 printk(KERN_ERR 6750 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", 6751 in_atomic(), irqs_disabled(), 6752 current->pid, current->comm); 6753 6754 debug_show_held_locks(current); 6755 if (irqs_disabled()) 6756 print_irqtrace_events(current); 6757 dump_stack(); 6758} 6759EXPORT_SYMBOL(__might_sleep); 6760#endif 6761 6762#ifdef CONFIG_MAGIC_SYSRQ 6763static void normalize_task(struct rq *rq, struct task_struct *p) 6764{ 6765 const struct sched_class *prev_class = p->sched_class; 6766 int old_prio = p->prio; 6767 int on_rq; 6768 6769 on_rq = p->on_rq; 6770 if (on_rq) 6771 dequeue_task(rq, p, 0); 6772 __setscheduler(rq, p, SCHED_NORMAL, 0); 6773 if (on_rq) { 6774 enqueue_task(rq, p, 0); 6775 resched_task(rq->curr); 6776 } 6777 6778 check_class_changed(rq, p, prev_class, old_prio); 6779} 6780 6781void normalize_rt_tasks(void) 6782{ 6783 struct task_struct *g, *p; 6784 unsigned long flags; 6785 struct rq *rq; 6786 6787 read_lock_irqsave(&tasklist_lock, flags); 6788 do_each_thread(g, p) { 6789 /* 6790 * Only normalize user tasks: 6791 */ 6792 if (!p->mm) 6793 continue; 6794 6795 p->se.exec_start = 0; 6796#ifdef CONFIG_SCHEDSTATS 6797 p->se.statistics.wait_start = 0; 6798 p->se.statistics.sleep_start = 0; 6799 p->se.statistics.block_start = 0; 6800#endif 6801 6802 if (!rt_task(p)) { 6803 /* 6804 * Renice negative nice level userspace 6805 * tasks back to 0: 6806 */ 6807 if (TASK_NICE(p) < 0 && p->mm) 6808 set_user_nice(p, 0); 6809 continue; 6810 } 6811 6812 raw_spin_lock(&p->pi_lock); 6813 rq = __task_rq_lock(p); 6814 6815 normalize_task(rq, p); 6816 6817 __task_rq_unlock(rq); 6818 raw_spin_unlock(&p->pi_lock); 6819 } while_each_thread(g, p); 6820 6821 read_unlock_irqrestore(&tasklist_lock, flags); 6822} 6823 6824#endif /* CONFIG_MAGIC_SYSRQ */ 6825 6826#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) 6827/* 6828 * These functions are only useful for the IA64 MCA handling, or kdb. 6829 * 6830 * They can only be called when the whole system has been 6831 * stopped - every CPU needs to be quiescent, and no scheduling 6832 * activity can take place. Using them for anything else would 6833 * be a serious bug, and as a result, they aren't even visible 6834 * under any other configuration. 6835 */ 6836 6837/** 6838 * curr_task - return the current task for a given cpu. 6839 * @cpu: the processor in question. 6840 * 6841 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 6842 * 6843 * Return: The current task for @cpu. 6844 */ 6845struct task_struct *curr_task(int cpu) 6846{ 6847 return cpu_curr(cpu); 6848} 6849 6850#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ 6851 6852#ifdef CONFIG_IA64 6853/** 6854 * set_curr_task - set the current task for a given cpu. 6855 * @cpu: the processor in question. 6856 * @p: the task pointer to set. 6857 * 6858 * Description: This function must only be used when non-maskable interrupts 6859 * are serviced on a separate stack. It allows the architecture to switch the 6860 * notion of the current task on a cpu in a non-blocking manner. This function 6861 * must be called with all CPU's synchronized, and interrupts disabled, the 6862 * and caller must save the original value of the current task (see 6863 * curr_task() above) and restore that value before reenabling interrupts and 6864 * re-starting the system. 6865 * 6866 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 6867 */ 6868void set_curr_task(int cpu, struct task_struct *p) 6869{ 6870 cpu_curr(cpu) = p; 6871} 6872 6873#endif 6874 6875#ifdef CONFIG_CGROUP_SCHED 6876/* task_group_lock serializes the addition/removal of task groups */ 6877static DEFINE_SPINLOCK(task_group_lock); 6878 6879static void free_sched_group(struct task_group *tg) 6880{ 6881 free_fair_sched_group(tg); 6882 free_rt_sched_group(tg); 6883 autogroup_free(tg); 6884 kfree(tg); 6885} 6886 6887/* allocate runqueue etc for a new task group */ 6888struct task_group *sched_create_group(struct task_group *parent) 6889{ 6890 struct task_group *tg; 6891 6892 tg = kzalloc(sizeof(*tg), GFP_KERNEL); 6893 if (!tg) 6894 return ERR_PTR(-ENOMEM); 6895 6896 if (!alloc_fair_sched_group(tg, parent)) 6897 goto err; 6898 6899 if (!alloc_rt_sched_group(tg, parent)) 6900 goto err; 6901 6902 return tg; 6903 6904err: 6905 free_sched_group(tg); 6906 return ERR_PTR(-ENOMEM); 6907} 6908 6909void sched_online_group(struct task_group *tg, struct task_group *parent) 6910{ 6911 unsigned long flags; 6912 6913 spin_lock_irqsave(&task_group_lock, flags); 6914 list_add_rcu(&tg->list, &task_groups); 6915 6916 WARN_ON(!parent); /* root should already exist */ 6917 6918 tg->parent = parent; 6919 INIT_LIST_HEAD(&tg->children); 6920 list_add_rcu(&tg->siblings, &parent->children); 6921 spin_unlock_irqrestore(&task_group_lock, flags); 6922} 6923 6924/* rcu callback to free various structures associated with a task group */ 6925static void free_sched_group_rcu(struct rcu_head *rhp) 6926{ 6927 /* now it should be safe to free those cfs_rqs */ 6928 free_sched_group(container_of(rhp, struct task_group, rcu)); 6929} 6930 6931/* Destroy runqueue etc associated with a task group */ 6932void sched_destroy_group(struct task_group *tg) 6933{ 6934 /* wait for possible concurrent references to cfs_rqs complete */ 6935 call_rcu(&tg->rcu, free_sched_group_rcu); 6936} 6937 6938void sched_offline_group(struct task_group *tg) 6939{ 6940 unsigned long flags; 6941 int i; 6942 6943 /* end participation in shares distribution */ 6944 for_each_possible_cpu(i) 6945 unregister_fair_sched_group(tg, i); 6946 6947 spin_lock_irqsave(&task_group_lock, flags); 6948 list_del_rcu(&tg->list); 6949 list_del_rcu(&tg->siblings); 6950 spin_unlock_irqrestore(&task_group_lock, flags); 6951} 6952 6953/* change task's runqueue when it moves between groups. 6954 * The caller of this function should have put the task in its new group 6955 * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to 6956 * reflect its new group. 6957 */ 6958void sched_move_task(struct task_struct *tsk) 6959{ 6960 struct task_group *tg; 6961 int on_rq, running; 6962 unsigned long flags; 6963 struct rq *rq; 6964 6965 rq = task_rq_lock(tsk, &flags); 6966 6967 running = task_current(rq, tsk); 6968 on_rq = tsk->on_rq; 6969 6970 if (on_rq) 6971 dequeue_task(rq, tsk, 0); 6972 if (unlikely(running)) 6973 tsk->sched_class->put_prev_task(rq, tsk); 6974 6975 tg = container_of(task_css_check(tsk, cpu_cgroup_subsys_id, 6976 lockdep_is_held(&tsk->sighand->siglock)), 6977 struct task_group, css); 6978 tg = autogroup_task_group(tsk, tg); 6979 tsk->sched_task_group = tg; 6980 6981#ifdef CONFIG_FAIR_GROUP_SCHED 6982 if (tsk->sched_class->task_move_group) 6983 tsk->sched_class->task_move_group(tsk, on_rq); 6984 else 6985#endif 6986 set_task_rq(tsk, task_cpu(tsk)); 6987 6988 if (unlikely(running)) 6989 tsk->sched_class->set_curr_task(rq); 6990 if (on_rq) 6991 enqueue_task(rq, tsk, 0); 6992 6993 task_rq_unlock(rq, tsk, &flags); 6994} 6995#endif /* CONFIG_CGROUP_SCHED */ 6996 6997#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) 6998static unsigned long to_ratio(u64 period, u64 runtime) 6999{ 7000 if (runtime == RUNTIME_INF) 7001 return 1ULL << 20; 7002 7003 return div64_u64(runtime << 20, period); 7004} 7005#endif 7006 7007#ifdef CONFIG_RT_GROUP_SCHED 7008/* 7009 * Ensure that the real time constraints are schedulable. 7010 */ 7011static DEFINE_MUTEX(rt_constraints_mutex); 7012 7013/* Must be called with tasklist_lock held */ 7014static inline int tg_has_rt_tasks(struct task_group *tg) 7015{ 7016 struct task_struct *g, *p; 7017 7018 do_each_thread(g, p) { 7019 if (rt_task(p) && task_rq(p)->rt.tg == tg) 7020 return 1; 7021 } while_each_thread(g, p); 7022 7023 return 0; 7024} 7025 7026struct rt_schedulable_data { 7027 struct task_group *tg; 7028 u64 rt_period; 7029 u64 rt_runtime; 7030}; 7031 7032static int tg_rt_schedulable(struct task_group *tg, void *data) 7033{ 7034 struct rt_schedulable_data *d = data; 7035 struct task_group *child; 7036 unsigned long total, sum = 0; 7037 u64 period, runtime; 7038 7039 period = ktime_to_ns(tg->rt_bandwidth.rt_period); 7040 runtime = tg->rt_bandwidth.rt_runtime; 7041 7042 if (tg == d->tg) { 7043 period = d->rt_period; 7044 runtime = d->rt_runtime; 7045 } 7046 7047 /* 7048 * Cannot have more runtime than the period. 7049 */ 7050 if (runtime > period && runtime != RUNTIME_INF) 7051 return -EINVAL; 7052 7053 /* 7054 * Ensure we don't starve existing RT tasks. 7055 */ 7056 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) 7057 return -EBUSY; 7058 7059 total = to_ratio(period, runtime); 7060 7061 /* 7062 * Nobody can have more than the global setting allows. 7063 */ 7064 if (total > to_ratio(global_rt_period(), global_rt_runtime())) 7065 return -EINVAL; 7066 7067 /* 7068 * The sum of our children's runtime should not exceed our own. 7069 */ 7070 list_for_each_entry_rcu(child, &tg->children, siblings) { 7071 period = ktime_to_ns(child->rt_bandwidth.rt_period); 7072 runtime = child->rt_bandwidth.rt_runtime; 7073 7074 if (child == d->tg) { 7075 period = d->rt_period; 7076 runtime = d->rt_runtime; 7077 } 7078 7079 sum += to_ratio(period, runtime); 7080 } 7081 7082 if (sum > total) 7083 return -EINVAL; 7084 7085 return 0; 7086} 7087 7088static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 7089{ 7090 int ret; 7091 7092 struct rt_schedulable_data data = { 7093 .tg = tg, 7094 .rt_period = period, 7095 .rt_runtime = runtime, 7096 }; 7097 7098 rcu_read_lock(); 7099 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data); 7100 rcu_read_unlock(); 7101 7102 return ret; 7103} 7104 7105static int tg_set_rt_bandwidth(struct task_group *tg, 7106 u64 rt_period, u64 rt_runtime) 7107{ 7108 int i, err = 0; 7109 7110 mutex_lock(&rt_constraints_mutex); 7111 read_lock(&tasklist_lock); 7112 err = __rt_schedulable(tg, rt_period, rt_runtime); 7113 if (err) 7114 goto unlock; 7115 7116 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); 7117 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); 7118 tg->rt_bandwidth.rt_runtime = rt_runtime; 7119 7120 for_each_possible_cpu(i) { 7121 struct rt_rq *rt_rq = tg->rt_rq[i]; 7122 7123 raw_spin_lock(&rt_rq->rt_runtime_lock); 7124 rt_rq->rt_runtime = rt_runtime; 7125 raw_spin_unlock(&rt_rq->rt_runtime_lock); 7126 } 7127 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); 7128unlock: 7129 read_unlock(&tasklist_lock); 7130 mutex_unlock(&rt_constraints_mutex); 7131 7132 return err; 7133} 7134 7135static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) 7136{ 7137 u64 rt_runtime, rt_period; 7138 7139 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); 7140 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; 7141 if (rt_runtime_us < 0) 7142 rt_runtime = RUNTIME_INF; 7143 7144 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 7145} 7146 7147static long sched_group_rt_runtime(struct task_group *tg) 7148{ 7149 u64 rt_runtime_us; 7150 7151 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) 7152 return -1; 7153 7154 rt_runtime_us = tg->rt_bandwidth.rt_runtime; 7155 do_div(rt_runtime_us, NSEC_PER_USEC); 7156 return rt_runtime_us; 7157} 7158 7159static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) 7160{ 7161 u64 rt_runtime, rt_period; 7162 7163 rt_period = (u64)rt_period_us * NSEC_PER_USEC; 7164 rt_runtime = tg->rt_bandwidth.rt_runtime; 7165 7166 if (rt_period == 0) 7167 return -EINVAL; 7168 7169 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 7170} 7171 7172static long sched_group_rt_period(struct task_group *tg) 7173{ 7174 u64 rt_period_us; 7175 7176 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); 7177 do_div(rt_period_us, NSEC_PER_USEC); 7178 return rt_period_us; 7179} 7180 7181static int sched_rt_global_constraints(void) 7182{ 7183 u64 runtime, period; 7184 int ret = 0; 7185 7186 if (sysctl_sched_rt_period <= 0) 7187 return -EINVAL; 7188 7189 runtime = global_rt_runtime(); 7190 period = global_rt_period(); 7191 7192 /* 7193 * Sanity check on the sysctl variables. 7194 */ 7195 if (runtime > period && runtime != RUNTIME_INF) 7196 return -EINVAL; 7197 7198 mutex_lock(&rt_constraints_mutex); 7199 read_lock(&tasklist_lock); 7200 ret = __rt_schedulable(NULL, 0, 0); 7201 read_unlock(&tasklist_lock); 7202 mutex_unlock(&rt_constraints_mutex); 7203 7204 return ret; 7205} 7206 7207static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) 7208{ 7209 /* Don't accept realtime tasks when there is no way for them to run */ 7210 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) 7211 return 0; 7212 7213 return 1; 7214} 7215 7216#else /* !CONFIG_RT_GROUP_SCHED */ 7217static int sched_rt_global_constraints(void) 7218{ 7219 unsigned long flags; 7220 int i; 7221 7222 if (sysctl_sched_rt_period <= 0) 7223 return -EINVAL; 7224 7225 /* 7226 * There's always some RT tasks in the root group 7227 * -- migration, kstopmachine etc.. 7228 */ 7229 if (sysctl_sched_rt_runtime == 0) 7230 return -EBUSY; 7231 7232 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 7233 for_each_possible_cpu(i) { 7234 struct rt_rq *rt_rq = &cpu_rq(i)->rt; 7235 7236 raw_spin_lock(&rt_rq->rt_runtime_lock); 7237 rt_rq->rt_runtime = global_rt_runtime(); 7238 raw_spin_unlock(&rt_rq->rt_runtime_lock); 7239 } 7240 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); 7241 7242 return 0; 7243} 7244#endif /* CONFIG_RT_GROUP_SCHED */ 7245 7246int sched_rr_handler(struct ctl_table *table, int write, 7247 void __user *buffer, size_t *lenp, 7248 loff_t *ppos) 7249{ 7250 int ret; 7251 static DEFINE_MUTEX(mutex); 7252 7253 mutex_lock(&mutex); 7254 ret = proc_dointvec(table, write, buffer, lenp, ppos); 7255 /* make sure that internally we keep jiffies */ 7256 /* also, writing zero resets timeslice to default */ 7257 if (!ret && write) { 7258 sched_rr_timeslice = sched_rr_timeslice <= 0 ? 7259 RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice); 7260 } 7261 mutex_unlock(&mutex); 7262 return ret; 7263} 7264 7265int sched_rt_handler(struct ctl_table *table, int write, 7266 void __user *buffer, size_t *lenp, 7267 loff_t *ppos) 7268{ 7269 int ret; 7270 int old_period, old_runtime; 7271 static DEFINE_MUTEX(mutex); 7272 7273 mutex_lock(&mutex); 7274 old_period = sysctl_sched_rt_period; 7275 old_runtime = sysctl_sched_rt_runtime; 7276 7277 ret = proc_dointvec(table, write, buffer, lenp, ppos); 7278 7279 if (!ret && write) { 7280 ret = sched_rt_global_constraints(); 7281 if (ret) { 7282 sysctl_sched_rt_period = old_period; 7283 sysctl_sched_rt_runtime = old_runtime; 7284 } else { 7285 def_rt_bandwidth.rt_runtime = global_rt_runtime(); 7286 def_rt_bandwidth.rt_period = 7287 ns_to_ktime(global_rt_period()); 7288 } 7289 } 7290 mutex_unlock(&mutex); 7291 7292 return ret; 7293} 7294 7295#ifdef CONFIG_CGROUP_SCHED 7296 7297static inline struct task_group *css_tg(struct cgroup_subsys_state *css) 7298{ 7299 return css ? container_of(css, struct task_group, css) : NULL; 7300} 7301 7302static struct cgroup_subsys_state * 7303cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) 7304{ 7305 struct task_group *parent = css_tg(parent_css); 7306 struct task_group *tg; 7307 7308 if (!parent) { 7309 /* This is early initialization for the top cgroup */ 7310 return &root_task_group.css; 7311 } 7312 7313 tg = sched_create_group(parent); 7314 if (IS_ERR(tg)) 7315 return ERR_PTR(-ENOMEM); 7316 7317 return &tg->css; 7318} 7319 7320static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) 7321{ 7322 struct task_group *tg = css_tg(css); 7323 struct task_group *parent = css_tg(css_parent(css)); 7324 7325 if (parent) 7326 sched_online_group(tg, parent); 7327 return 0; 7328} 7329 7330static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) 7331{ 7332 struct task_group *tg = css_tg(css); 7333 7334 sched_destroy_group(tg); 7335} 7336 7337static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) 7338{ 7339 struct task_group *tg = css_tg(css); 7340 7341 sched_offline_group(tg); 7342} 7343 7344static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, 7345 struct cgroup_taskset *tset) 7346{ 7347 struct task_struct *task; 7348 7349 cgroup_taskset_for_each(task, css, tset) { 7350#ifdef CONFIG_RT_GROUP_SCHED 7351 if (!sched_rt_can_attach(css_tg(css), task)) 7352 return -EINVAL; 7353#else 7354 /* We don't support RT-tasks being in separate groups */ 7355 if (task->sched_class != &fair_sched_class) 7356 return -EINVAL; 7357#endif 7358 } 7359 return 0; 7360} 7361 7362static void cpu_cgroup_attach(struct cgroup_subsys_state *css, 7363 struct cgroup_taskset *tset) 7364{ 7365 struct task_struct *task; 7366 7367 cgroup_taskset_for_each(task, css, tset) 7368 sched_move_task(task); 7369} 7370 7371static void cpu_cgroup_exit(struct cgroup_subsys_state *css, 7372 struct cgroup_subsys_state *old_css, 7373 struct task_struct *task) 7374{ 7375 /* 7376 * cgroup_exit() is called in the copy_process() failure path. 7377 * Ignore this case since the task hasn't ran yet, this avoids 7378 * trying to poke a half freed task state from generic code. 7379 */ 7380 if (!(task->flags & PF_EXITING)) 7381 return; 7382 7383 sched_move_task(task); 7384} 7385 7386#ifdef CONFIG_FAIR_GROUP_SCHED 7387static int cpu_shares_write_u64(struct cgroup_subsys_state *css, 7388 struct cftype *cftype, u64 shareval) 7389{ 7390 return sched_group_set_shares(css_tg(css), scale_load(shareval)); 7391} 7392 7393static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, 7394 struct cftype *cft) 7395{ 7396 struct task_group *tg = css_tg(css); 7397 7398 return (u64) scale_load_down(tg->shares); 7399} 7400 7401#ifdef CONFIG_CFS_BANDWIDTH 7402static DEFINE_MUTEX(cfs_constraints_mutex); 7403 7404const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ 7405const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ 7406 7407static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); 7408 7409static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) 7410{ 7411 int i, ret = 0, runtime_enabled, runtime_was_enabled; 7412 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 7413 7414 if (tg == &root_task_group) 7415 return -EINVAL; 7416 7417 /* 7418 * Ensure we have at some amount of bandwidth every period. This is 7419 * to prevent reaching a state of large arrears when throttled via 7420 * entity_tick() resulting in prolonged exit starvation. 7421 */ 7422 if (quota < min_cfs_quota_period || period < min_cfs_quota_period) 7423 return -EINVAL; 7424 7425 /* 7426 * Likewise, bound things on the otherside by preventing insane quota 7427 * periods. This also allows us to normalize in computing quota 7428 * feasibility. 7429 */ 7430 if (period > max_cfs_quota_period) 7431 return -EINVAL; 7432 7433 mutex_lock(&cfs_constraints_mutex); 7434 ret = __cfs_schedulable(tg, period, quota); 7435 if (ret) 7436 goto out_unlock; 7437 7438 runtime_enabled = quota != RUNTIME_INF; 7439 runtime_was_enabled = cfs_b->quota != RUNTIME_INF; 7440 account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled); 7441 raw_spin_lock_irq(&cfs_b->lock); 7442 cfs_b->period = ns_to_ktime(period); 7443 cfs_b->quota = quota; 7444 7445 __refill_cfs_bandwidth_runtime(cfs_b); 7446 /* restart the period timer (if active) to handle new period expiry */ 7447 if (runtime_enabled && cfs_b->timer_active) { 7448 /* force a reprogram */ 7449 cfs_b->timer_active = 0; 7450 __start_cfs_bandwidth(cfs_b); 7451 } 7452 raw_spin_unlock_irq(&cfs_b->lock); 7453 7454 for_each_possible_cpu(i) { 7455 struct cfs_rq *cfs_rq = tg->cfs_rq[i]; 7456 struct rq *rq = cfs_rq->rq; 7457 7458 raw_spin_lock_irq(&rq->lock); 7459 cfs_rq->runtime_enabled = runtime_enabled; 7460 cfs_rq->runtime_remaining = 0; 7461 7462 if (cfs_rq->throttled) 7463 unthrottle_cfs_rq(cfs_rq); 7464 raw_spin_unlock_irq(&rq->lock); 7465 } 7466out_unlock: 7467 mutex_unlock(&cfs_constraints_mutex); 7468 7469 return ret; 7470} 7471 7472int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) 7473{ 7474 u64 quota, period; 7475 7476 period = ktime_to_ns(tg->cfs_bandwidth.period); 7477 if (cfs_quota_us < 0) 7478 quota = RUNTIME_INF; 7479 else 7480 quota = (u64)cfs_quota_us * NSEC_PER_USEC; 7481 7482 return tg_set_cfs_bandwidth(tg, period, quota); 7483} 7484 7485long tg_get_cfs_quota(struct task_group *tg) 7486{ 7487 u64 quota_us; 7488 7489 if (tg->cfs_bandwidth.quota == RUNTIME_INF) 7490 return -1; 7491 7492 quota_us = tg->cfs_bandwidth.quota; 7493 do_div(quota_us, NSEC_PER_USEC); 7494 7495 return quota_us; 7496} 7497 7498int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) 7499{ 7500 u64 quota, period; 7501 7502 period = (u64)cfs_period_us * NSEC_PER_USEC; 7503 quota = tg->cfs_bandwidth.quota; 7504 7505 return tg_set_cfs_bandwidth(tg, period, quota); 7506} 7507 7508long tg_get_cfs_period(struct task_group *tg) 7509{ 7510 u64 cfs_period_us; 7511 7512 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period); 7513 do_div(cfs_period_us, NSEC_PER_USEC); 7514 7515 return cfs_period_us; 7516} 7517 7518static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css, 7519 struct cftype *cft) 7520{ 7521 return tg_get_cfs_quota(css_tg(css)); 7522} 7523 7524static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css, 7525 struct cftype *cftype, s64 cfs_quota_us) 7526{ 7527 return tg_set_cfs_quota(css_tg(css), cfs_quota_us); 7528} 7529 7530static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css, 7531 struct cftype *cft) 7532{ 7533 return tg_get_cfs_period(css_tg(css)); 7534} 7535 7536static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css, 7537 struct cftype *cftype, u64 cfs_period_us) 7538{ 7539 return tg_set_cfs_period(css_tg(css), cfs_period_us); 7540} 7541 7542struct cfs_schedulable_data { 7543 struct task_group *tg; 7544 u64 period, quota; 7545}; 7546 7547/* 7548 * normalize group quota/period to be quota/max_period 7549 * note: units are usecs 7550 */ 7551static u64 normalize_cfs_quota(struct task_group *tg, 7552 struct cfs_schedulable_data *d) 7553{ 7554 u64 quota, period; 7555 7556 if (tg == d->tg) { 7557 period = d->period; 7558 quota = d->quota; 7559 } else { 7560 period = tg_get_cfs_period(tg); 7561 quota = tg_get_cfs_quota(tg); 7562 } 7563 7564 /* note: these should typically be equivalent */ 7565 if (quota == RUNTIME_INF || quota == -1) 7566 return RUNTIME_INF; 7567 7568 return to_ratio(period, quota); 7569} 7570 7571static int tg_cfs_schedulable_down(struct task_group *tg, void *data) 7572{ 7573 struct cfs_schedulable_data *d = data; 7574 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 7575 s64 quota = 0, parent_quota = -1; 7576 7577 if (!tg->parent) { 7578 quota = RUNTIME_INF; 7579 } else { 7580 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth; 7581 7582 quota = normalize_cfs_quota(tg, d); 7583 parent_quota = parent_b->hierarchal_quota; 7584 7585 /* 7586 * ensure max(child_quota) <= parent_quota, inherit when no 7587 * limit is set 7588 */ 7589 if (quota == RUNTIME_INF) 7590 quota = parent_quota; 7591 else if (parent_quota != RUNTIME_INF && quota > parent_quota) 7592 return -EINVAL; 7593 } 7594 cfs_b->hierarchal_quota = quota; 7595 7596 return 0; 7597} 7598 7599static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) 7600{ 7601 int ret; 7602 struct cfs_schedulable_data data = { 7603 .tg = tg, 7604 .period = period, 7605 .quota = quota, 7606 }; 7607 7608 if (quota != RUNTIME_INF) { 7609 do_div(data.period, NSEC_PER_USEC); 7610 do_div(data.quota, NSEC_PER_USEC); 7611 } 7612 7613 rcu_read_lock(); 7614 ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data); 7615 rcu_read_unlock(); 7616 7617 return ret; 7618} 7619 7620static int cpu_stats_show(struct cgroup_subsys_state *css, struct cftype *cft, 7621 struct cgroup_map_cb *cb) 7622{ 7623 struct task_group *tg = css_tg(css); 7624 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 7625 7626 cb->fill(cb, "nr_periods", cfs_b->nr_periods); 7627 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); 7628 cb->fill(cb, "throttled_time", cfs_b->throttled_time); 7629 7630 return 0; 7631} 7632#endif /* CONFIG_CFS_BANDWIDTH */ 7633#endif /* CONFIG_FAIR_GROUP_SCHED */ 7634 7635#ifdef CONFIG_RT_GROUP_SCHED 7636static int cpu_rt_runtime_write(struct cgroup_subsys_state *css, 7637 struct cftype *cft, s64 val) 7638{ 7639 return sched_group_set_rt_runtime(css_tg(css), val); 7640} 7641 7642static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css, 7643 struct cftype *cft) 7644{ 7645 return sched_group_rt_runtime(css_tg(css)); 7646} 7647 7648static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css, 7649 struct cftype *cftype, u64 rt_period_us) 7650{ 7651 return sched_group_set_rt_period(css_tg(css), rt_period_us); 7652} 7653 7654static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css, 7655 struct cftype *cft) 7656{ 7657 return sched_group_rt_period(css_tg(css)); 7658} 7659#endif /* CONFIG_RT_GROUP_SCHED */ 7660 7661static struct cftype cpu_files[] = { 7662#ifdef CONFIG_FAIR_GROUP_SCHED 7663 { 7664 .name = "shares", 7665 .read_u64 = cpu_shares_read_u64, 7666 .write_u64 = cpu_shares_write_u64, 7667 }, 7668#endif 7669#ifdef CONFIG_CFS_BANDWIDTH 7670 { 7671 .name = "cfs_quota_us", 7672 .read_s64 = cpu_cfs_quota_read_s64, 7673 .write_s64 = cpu_cfs_quota_write_s64, 7674 }, 7675 { 7676 .name = "cfs_period_us", 7677 .read_u64 = cpu_cfs_period_read_u64, 7678 .write_u64 = cpu_cfs_period_write_u64, 7679 }, 7680 { 7681 .name = "stat", 7682 .read_map = cpu_stats_show, 7683 }, 7684#endif 7685#ifdef CONFIG_RT_GROUP_SCHED 7686 { 7687 .name = "rt_runtime_us", 7688 .read_s64 = cpu_rt_runtime_read, 7689 .write_s64 = cpu_rt_runtime_write, 7690 }, 7691 { 7692 .name = "rt_period_us", 7693 .read_u64 = cpu_rt_period_read_uint, 7694 .write_u64 = cpu_rt_period_write_uint, 7695 }, 7696#endif 7697 { } /* terminate */ 7698}; 7699 7700struct cgroup_subsys cpu_cgroup_subsys = { 7701 .name = "cpu", 7702 .css_alloc = cpu_cgroup_css_alloc, 7703 .css_free = cpu_cgroup_css_free, 7704 .css_online = cpu_cgroup_css_online, 7705 .css_offline = cpu_cgroup_css_offline, 7706 .can_attach = cpu_cgroup_can_attach, 7707 .attach = cpu_cgroup_attach, 7708 .exit = cpu_cgroup_exit, 7709 .subsys_id = cpu_cgroup_subsys_id, 7710 .base_cftypes = cpu_files, 7711 .early_init = 1, 7712}; 7713 7714#endif /* CONFIG_CGROUP_SCHED */ 7715 7716void dump_cpu_task(int cpu) 7717{ 7718 pr_info("Task dump for CPU %d:\n", cpu); 7719 sched_show_task(cpu_curr(cpu)); 7720} 7721