core.c revision e3baac47f0e82c4be632f4f97215bb93bf16b342
1/* 2 * kernel/sched/core.c 3 * 4 * Kernel scheduler and related syscalls 5 * 6 * Copyright (C) 1991-2002 Linus Torvalds 7 * 8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and 9 * make semaphores SMP safe 10 * 1998-11-19 Implemented schedule_timeout() and related stuff 11 * by Andrea Arcangeli 12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: 13 * hybrid priority-list and round-robin design with 14 * an array-switch method of distributing timeslices 15 * and per-CPU runqueues. Cleanups and useful suggestions 16 * by Davide Libenzi, preemptible kernel bits by Robert Love. 17 * 2003-09-03 Interactivity tuning by Con Kolivas. 18 * 2004-04-02 Scheduler domains code by Nick Piggin 19 * 2007-04-15 Work begun on replacing all interactivity tuning with a 20 * fair scheduling design by Con Kolivas. 21 * 2007-05-05 Load balancing (smp-nice) and other improvements 22 * by Peter Williams 23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith 24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri 25 * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, 26 * Thomas Gleixner, Mike Kravetz 27 */ 28 29#include <linux/mm.h> 30#include <linux/module.h> 31#include <linux/nmi.h> 32#include <linux/init.h> 33#include <linux/uaccess.h> 34#include <linux/highmem.h> 35#include <asm/mmu_context.h> 36#include <linux/interrupt.h> 37#include <linux/capability.h> 38#include <linux/completion.h> 39#include <linux/kernel_stat.h> 40#include <linux/debug_locks.h> 41#include <linux/perf_event.h> 42#include <linux/security.h> 43#include <linux/notifier.h> 44#include <linux/profile.h> 45#include <linux/freezer.h> 46#include <linux/vmalloc.h> 47#include <linux/blkdev.h> 48#include <linux/delay.h> 49#include <linux/pid_namespace.h> 50#include <linux/smp.h> 51#include <linux/threads.h> 52#include <linux/timer.h> 53#include <linux/rcupdate.h> 54#include <linux/cpu.h> 55#include <linux/cpuset.h> 56#include <linux/percpu.h> 57#include <linux/proc_fs.h> 58#include <linux/seq_file.h> 59#include <linux/sysctl.h> 60#include <linux/syscalls.h> 61#include <linux/times.h> 62#include <linux/tsacct_kern.h> 63#include <linux/kprobes.h> 64#include <linux/delayacct.h> 65#include <linux/unistd.h> 66#include <linux/pagemap.h> 67#include <linux/hrtimer.h> 68#include <linux/tick.h> 69#include <linux/debugfs.h> 70#include <linux/ctype.h> 71#include <linux/ftrace.h> 72#include <linux/slab.h> 73#include <linux/init_task.h> 74#include <linux/binfmts.h> 75#include <linux/context_tracking.h> 76#include <linux/compiler.h> 77 78#include <asm/switch_to.h> 79#include <asm/tlb.h> 80#include <asm/irq_regs.h> 81#include <asm/mutex.h> 82#ifdef CONFIG_PARAVIRT 83#include <asm/paravirt.h> 84#endif 85 86#include "sched.h" 87#include "../workqueue_internal.h" 88#include "../smpboot.h" 89 90#define CREATE_TRACE_POINTS 91#include <trace/events/sched.h> 92 93void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) 94{ 95 unsigned long delta; 96 ktime_t soft, hard, now; 97 98 for (;;) { 99 if (hrtimer_active(period_timer)) 100 break; 101 102 now = hrtimer_cb_get_time(period_timer); 103 hrtimer_forward(period_timer, now, period); 104 105 soft = hrtimer_get_softexpires(period_timer); 106 hard = hrtimer_get_expires(period_timer); 107 delta = ktime_to_ns(ktime_sub(hard, soft)); 108 __hrtimer_start_range_ns(period_timer, soft, delta, 109 HRTIMER_MODE_ABS_PINNED, 0); 110 } 111} 112 113DEFINE_MUTEX(sched_domains_mutex); 114DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 115 116static void update_rq_clock_task(struct rq *rq, s64 delta); 117 118void update_rq_clock(struct rq *rq) 119{ 120 s64 delta; 121 122 if (rq->skip_clock_update > 0) 123 return; 124 125 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; 126 rq->clock += delta; 127 update_rq_clock_task(rq, delta); 128} 129 130/* 131 * Debugging: various feature bits 132 */ 133 134#define SCHED_FEAT(name, enabled) \ 135 (1UL << __SCHED_FEAT_##name) * enabled | 136 137const_debug unsigned int sysctl_sched_features = 138#include "features.h" 139 0; 140 141#undef SCHED_FEAT 142 143#ifdef CONFIG_SCHED_DEBUG 144#define SCHED_FEAT(name, enabled) \ 145 #name , 146 147static const char * const sched_feat_names[] = { 148#include "features.h" 149}; 150 151#undef SCHED_FEAT 152 153static int sched_feat_show(struct seq_file *m, void *v) 154{ 155 int i; 156 157 for (i = 0; i < __SCHED_FEAT_NR; i++) { 158 if (!(sysctl_sched_features & (1UL << i))) 159 seq_puts(m, "NO_"); 160 seq_printf(m, "%s ", sched_feat_names[i]); 161 } 162 seq_puts(m, "\n"); 163 164 return 0; 165} 166 167#ifdef HAVE_JUMP_LABEL 168 169#define jump_label_key__true STATIC_KEY_INIT_TRUE 170#define jump_label_key__false STATIC_KEY_INIT_FALSE 171 172#define SCHED_FEAT(name, enabled) \ 173 jump_label_key__##enabled , 174 175struct static_key sched_feat_keys[__SCHED_FEAT_NR] = { 176#include "features.h" 177}; 178 179#undef SCHED_FEAT 180 181static void sched_feat_disable(int i) 182{ 183 if (static_key_enabled(&sched_feat_keys[i])) 184 static_key_slow_dec(&sched_feat_keys[i]); 185} 186 187static void sched_feat_enable(int i) 188{ 189 if (!static_key_enabled(&sched_feat_keys[i])) 190 static_key_slow_inc(&sched_feat_keys[i]); 191} 192#else 193static void sched_feat_disable(int i) { }; 194static void sched_feat_enable(int i) { }; 195#endif /* HAVE_JUMP_LABEL */ 196 197static int sched_feat_set(char *cmp) 198{ 199 int i; 200 int neg = 0; 201 202 if (strncmp(cmp, "NO_", 3) == 0) { 203 neg = 1; 204 cmp += 3; 205 } 206 207 for (i = 0; i < __SCHED_FEAT_NR; i++) { 208 if (strcmp(cmp, sched_feat_names[i]) == 0) { 209 if (neg) { 210 sysctl_sched_features &= ~(1UL << i); 211 sched_feat_disable(i); 212 } else { 213 sysctl_sched_features |= (1UL << i); 214 sched_feat_enable(i); 215 } 216 break; 217 } 218 } 219 220 return i; 221} 222 223static ssize_t 224sched_feat_write(struct file *filp, const char __user *ubuf, 225 size_t cnt, loff_t *ppos) 226{ 227 char buf[64]; 228 char *cmp; 229 int i; 230 231 if (cnt > 63) 232 cnt = 63; 233 234 if (copy_from_user(&buf, ubuf, cnt)) 235 return -EFAULT; 236 237 buf[cnt] = 0; 238 cmp = strstrip(buf); 239 240 i = sched_feat_set(cmp); 241 if (i == __SCHED_FEAT_NR) 242 return -EINVAL; 243 244 *ppos += cnt; 245 246 return cnt; 247} 248 249static int sched_feat_open(struct inode *inode, struct file *filp) 250{ 251 return single_open(filp, sched_feat_show, NULL); 252} 253 254static const struct file_operations sched_feat_fops = { 255 .open = sched_feat_open, 256 .write = sched_feat_write, 257 .read = seq_read, 258 .llseek = seq_lseek, 259 .release = single_release, 260}; 261 262static __init int sched_init_debug(void) 263{ 264 debugfs_create_file("sched_features", 0644, NULL, NULL, 265 &sched_feat_fops); 266 267 return 0; 268} 269late_initcall(sched_init_debug); 270#endif /* CONFIG_SCHED_DEBUG */ 271 272/* 273 * Number of tasks to iterate in a single balance run. 274 * Limited because this is done with IRQs disabled. 275 */ 276const_debug unsigned int sysctl_sched_nr_migrate = 32; 277 278/* 279 * period over which we average the RT time consumption, measured 280 * in ms. 281 * 282 * default: 1s 283 */ 284const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; 285 286/* 287 * period over which we measure -rt task cpu usage in us. 288 * default: 1s 289 */ 290unsigned int sysctl_sched_rt_period = 1000000; 291 292__read_mostly int scheduler_running; 293 294/* 295 * part of the period that we allow rt tasks to run in us. 296 * default: 0.95s 297 */ 298int sysctl_sched_rt_runtime = 950000; 299 300/* 301 * __task_rq_lock - lock the rq @p resides on. 302 */ 303static inline struct rq *__task_rq_lock(struct task_struct *p) 304 __acquires(rq->lock) 305{ 306 struct rq *rq; 307 308 lockdep_assert_held(&p->pi_lock); 309 310 for (;;) { 311 rq = task_rq(p); 312 raw_spin_lock(&rq->lock); 313 if (likely(rq == task_rq(p))) 314 return rq; 315 raw_spin_unlock(&rq->lock); 316 } 317} 318 319/* 320 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. 321 */ 322static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) 323 __acquires(p->pi_lock) 324 __acquires(rq->lock) 325{ 326 struct rq *rq; 327 328 for (;;) { 329 raw_spin_lock_irqsave(&p->pi_lock, *flags); 330 rq = task_rq(p); 331 raw_spin_lock(&rq->lock); 332 if (likely(rq == task_rq(p))) 333 return rq; 334 raw_spin_unlock(&rq->lock); 335 raw_spin_unlock_irqrestore(&p->pi_lock, *flags); 336 } 337} 338 339static void __task_rq_unlock(struct rq *rq) 340 __releases(rq->lock) 341{ 342 raw_spin_unlock(&rq->lock); 343} 344 345static inline void 346task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) 347 __releases(rq->lock) 348 __releases(p->pi_lock) 349{ 350 raw_spin_unlock(&rq->lock); 351 raw_spin_unlock_irqrestore(&p->pi_lock, *flags); 352} 353 354/* 355 * this_rq_lock - lock this runqueue and disable interrupts. 356 */ 357static struct rq *this_rq_lock(void) 358 __acquires(rq->lock) 359{ 360 struct rq *rq; 361 362 local_irq_disable(); 363 rq = this_rq(); 364 raw_spin_lock(&rq->lock); 365 366 return rq; 367} 368 369#ifdef CONFIG_SCHED_HRTICK 370/* 371 * Use HR-timers to deliver accurate preemption points. 372 */ 373 374static void hrtick_clear(struct rq *rq) 375{ 376 if (hrtimer_active(&rq->hrtick_timer)) 377 hrtimer_cancel(&rq->hrtick_timer); 378} 379 380/* 381 * High-resolution timer tick. 382 * Runs from hardirq context with interrupts disabled. 383 */ 384static enum hrtimer_restart hrtick(struct hrtimer *timer) 385{ 386 struct rq *rq = container_of(timer, struct rq, hrtick_timer); 387 388 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); 389 390 raw_spin_lock(&rq->lock); 391 update_rq_clock(rq); 392 rq->curr->sched_class->task_tick(rq, rq->curr, 1); 393 raw_spin_unlock(&rq->lock); 394 395 return HRTIMER_NORESTART; 396} 397 398#ifdef CONFIG_SMP 399 400static int __hrtick_restart(struct rq *rq) 401{ 402 struct hrtimer *timer = &rq->hrtick_timer; 403 ktime_t time = hrtimer_get_softexpires(timer); 404 405 return __hrtimer_start_range_ns(timer, time, 0, HRTIMER_MODE_ABS_PINNED, 0); 406} 407 408/* 409 * called from hardirq (IPI) context 410 */ 411static void __hrtick_start(void *arg) 412{ 413 struct rq *rq = arg; 414 415 raw_spin_lock(&rq->lock); 416 __hrtick_restart(rq); 417 rq->hrtick_csd_pending = 0; 418 raw_spin_unlock(&rq->lock); 419} 420 421/* 422 * Called to set the hrtick timer state. 423 * 424 * called with rq->lock held and irqs disabled 425 */ 426void hrtick_start(struct rq *rq, u64 delay) 427{ 428 struct hrtimer *timer = &rq->hrtick_timer; 429 ktime_t time = ktime_add_ns(timer->base->get_time(), delay); 430 431 hrtimer_set_expires(timer, time); 432 433 if (rq == this_rq()) { 434 __hrtick_restart(rq); 435 } else if (!rq->hrtick_csd_pending) { 436 smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); 437 rq->hrtick_csd_pending = 1; 438 } 439} 440 441static int 442hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) 443{ 444 int cpu = (int)(long)hcpu; 445 446 switch (action) { 447 case CPU_UP_CANCELED: 448 case CPU_UP_CANCELED_FROZEN: 449 case CPU_DOWN_PREPARE: 450 case CPU_DOWN_PREPARE_FROZEN: 451 case CPU_DEAD: 452 case CPU_DEAD_FROZEN: 453 hrtick_clear(cpu_rq(cpu)); 454 return NOTIFY_OK; 455 } 456 457 return NOTIFY_DONE; 458} 459 460static __init void init_hrtick(void) 461{ 462 hotcpu_notifier(hotplug_hrtick, 0); 463} 464#else 465/* 466 * Called to set the hrtick timer state. 467 * 468 * called with rq->lock held and irqs disabled 469 */ 470void hrtick_start(struct rq *rq, u64 delay) 471{ 472 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, 473 HRTIMER_MODE_REL_PINNED, 0); 474} 475 476static inline void init_hrtick(void) 477{ 478} 479#endif /* CONFIG_SMP */ 480 481static void init_rq_hrtick(struct rq *rq) 482{ 483#ifdef CONFIG_SMP 484 rq->hrtick_csd_pending = 0; 485 486 rq->hrtick_csd.flags = 0; 487 rq->hrtick_csd.func = __hrtick_start; 488 rq->hrtick_csd.info = rq; 489#endif 490 491 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 492 rq->hrtick_timer.function = hrtick; 493} 494#else /* CONFIG_SCHED_HRTICK */ 495static inline void hrtick_clear(struct rq *rq) 496{ 497} 498 499static inline void init_rq_hrtick(struct rq *rq) 500{ 501} 502 503static inline void init_hrtick(void) 504{ 505} 506#endif /* CONFIG_SCHED_HRTICK */ 507 508/* 509 * cmpxchg based fetch_or, macro so it works for different integer types 510 */ 511#define fetch_or(ptr, val) \ 512({ typeof(*(ptr)) __old, __val = *(ptr); \ 513 for (;;) { \ 514 __old = cmpxchg((ptr), __val, __val | (val)); \ 515 if (__old == __val) \ 516 break; \ 517 __val = __old; \ 518 } \ 519 __old; \ 520}) 521 522#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) 523/* 524 * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, 525 * this avoids any races wrt polling state changes and thereby avoids 526 * spurious IPIs. 527 */ 528static bool set_nr_and_not_polling(struct task_struct *p) 529{ 530 struct thread_info *ti = task_thread_info(p); 531 return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); 532} 533 534/* 535 * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. 536 * 537 * If this returns true, then the idle task promises to call 538 * sched_ttwu_pending() and reschedule soon. 539 */ 540static bool set_nr_if_polling(struct task_struct *p) 541{ 542 struct thread_info *ti = task_thread_info(p); 543 typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags); 544 545 for (;;) { 546 if (!(val & _TIF_POLLING_NRFLAG)) 547 return false; 548 if (val & _TIF_NEED_RESCHED) 549 return true; 550 old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); 551 if (old == val) 552 break; 553 val = old; 554 } 555 return true; 556} 557 558#else 559static bool set_nr_and_not_polling(struct task_struct *p) 560{ 561 set_tsk_need_resched(p); 562 return true; 563} 564 565#ifdef CONFIG_SMP 566static bool set_nr_if_polling(struct task_struct *p) 567{ 568 return false; 569} 570#endif 571#endif 572 573/* 574 * resched_task - mark a task 'to be rescheduled now'. 575 * 576 * On UP this means the setting of the need_resched flag, on SMP it 577 * might also involve a cross-CPU call to trigger the scheduler on 578 * the target CPU. 579 */ 580void resched_task(struct task_struct *p) 581{ 582 int cpu; 583 584 lockdep_assert_held(&task_rq(p)->lock); 585 586 if (test_tsk_need_resched(p)) 587 return; 588 589 cpu = task_cpu(p); 590 591 if (cpu == smp_processor_id()) { 592 set_tsk_need_resched(p); 593 set_preempt_need_resched(); 594 return; 595 } 596 597 if (set_nr_and_not_polling(p)) 598 smp_send_reschedule(cpu); 599 else 600 trace_sched_wake_idle_without_ipi(cpu); 601} 602 603void resched_cpu(int cpu) 604{ 605 struct rq *rq = cpu_rq(cpu); 606 unsigned long flags; 607 608 if (!raw_spin_trylock_irqsave(&rq->lock, flags)) 609 return; 610 resched_task(cpu_curr(cpu)); 611 raw_spin_unlock_irqrestore(&rq->lock, flags); 612} 613 614#ifdef CONFIG_SMP 615#ifdef CONFIG_NO_HZ_COMMON 616/* 617 * In the semi idle case, use the nearest busy cpu for migrating timers 618 * from an idle cpu. This is good for power-savings. 619 * 620 * We don't do similar optimization for completely idle system, as 621 * selecting an idle cpu will add more delays to the timers than intended 622 * (as that cpu's timer base may not be uptodate wrt jiffies etc). 623 */ 624int get_nohz_timer_target(int pinned) 625{ 626 int cpu = smp_processor_id(); 627 int i; 628 struct sched_domain *sd; 629 630 if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu)) 631 return cpu; 632 633 rcu_read_lock(); 634 for_each_domain(cpu, sd) { 635 for_each_cpu(i, sched_domain_span(sd)) { 636 if (!idle_cpu(i)) { 637 cpu = i; 638 goto unlock; 639 } 640 } 641 } 642unlock: 643 rcu_read_unlock(); 644 return cpu; 645} 646/* 647 * When add_timer_on() enqueues a timer into the timer wheel of an 648 * idle CPU then this timer might expire before the next timer event 649 * which is scheduled to wake up that CPU. In case of a completely 650 * idle system the next event might even be infinite time into the 651 * future. wake_up_idle_cpu() ensures that the CPU is woken up and 652 * leaves the inner idle loop so the newly added timer is taken into 653 * account when the CPU goes back to idle and evaluates the timer 654 * wheel for the next timer event. 655 */ 656static void wake_up_idle_cpu(int cpu) 657{ 658 struct rq *rq = cpu_rq(cpu); 659 660 if (cpu == smp_processor_id()) 661 return; 662 663 if (set_nr_and_not_polling(rq->idle)) 664 smp_send_reschedule(cpu); 665 else 666 trace_sched_wake_idle_without_ipi(cpu); 667} 668 669static bool wake_up_full_nohz_cpu(int cpu) 670{ 671 if (tick_nohz_full_cpu(cpu)) { 672 if (cpu != smp_processor_id() || 673 tick_nohz_tick_stopped()) 674 smp_send_reschedule(cpu); 675 return true; 676 } 677 678 return false; 679} 680 681void wake_up_nohz_cpu(int cpu) 682{ 683 if (!wake_up_full_nohz_cpu(cpu)) 684 wake_up_idle_cpu(cpu); 685} 686 687static inline bool got_nohz_idle_kick(void) 688{ 689 int cpu = smp_processor_id(); 690 691 if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu))) 692 return false; 693 694 if (idle_cpu(cpu) && !need_resched()) 695 return true; 696 697 /* 698 * We can't run Idle Load Balance on this CPU for this time so we 699 * cancel it and clear NOHZ_BALANCE_KICK 700 */ 701 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); 702 return false; 703} 704 705#else /* CONFIG_NO_HZ_COMMON */ 706 707static inline bool got_nohz_idle_kick(void) 708{ 709 return false; 710} 711 712#endif /* CONFIG_NO_HZ_COMMON */ 713 714#ifdef CONFIG_NO_HZ_FULL 715bool sched_can_stop_tick(void) 716{ 717 struct rq *rq; 718 719 rq = this_rq(); 720 721 /* Make sure rq->nr_running update is visible after the IPI */ 722 smp_rmb(); 723 724 /* More than one running task need preemption */ 725 if (rq->nr_running > 1) 726 return false; 727 728 return true; 729} 730#endif /* CONFIG_NO_HZ_FULL */ 731 732void sched_avg_update(struct rq *rq) 733{ 734 s64 period = sched_avg_period(); 735 736 while ((s64)(rq_clock(rq) - rq->age_stamp) > period) { 737 /* 738 * Inline assembly required to prevent the compiler 739 * optimising this loop into a divmod call. 740 * See __iter_div_u64_rem() for another example of this. 741 */ 742 asm("" : "+rm" (rq->age_stamp)); 743 rq->age_stamp += period; 744 rq->rt_avg /= 2; 745 } 746} 747 748#endif /* CONFIG_SMP */ 749 750#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ 751 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH))) 752/* 753 * Iterate task_group tree rooted at *from, calling @down when first entering a 754 * node and @up when leaving it for the final time. 755 * 756 * Caller must hold rcu_lock or sufficient equivalent. 757 */ 758int walk_tg_tree_from(struct task_group *from, 759 tg_visitor down, tg_visitor up, void *data) 760{ 761 struct task_group *parent, *child; 762 int ret; 763 764 parent = from; 765 766down: 767 ret = (*down)(parent, data); 768 if (ret) 769 goto out; 770 list_for_each_entry_rcu(child, &parent->children, siblings) { 771 parent = child; 772 goto down; 773 774up: 775 continue; 776 } 777 ret = (*up)(parent, data); 778 if (ret || parent == from) 779 goto out; 780 781 child = parent; 782 parent = parent->parent; 783 if (parent) 784 goto up; 785out: 786 return ret; 787} 788 789int tg_nop(struct task_group *tg, void *data) 790{ 791 return 0; 792} 793#endif 794 795static void set_load_weight(struct task_struct *p) 796{ 797 int prio = p->static_prio - MAX_RT_PRIO; 798 struct load_weight *load = &p->se.load; 799 800 /* 801 * SCHED_IDLE tasks get minimal weight: 802 */ 803 if (p->policy == SCHED_IDLE) { 804 load->weight = scale_load(WEIGHT_IDLEPRIO); 805 load->inv_weight = WMULT_IDLEPRIO; 806 return; 807 } 808 809 load->weight = scale_load(prio_to_weight[prio]); 810 load->inv_weight = prio_to_wmult[prio]; 811} 812 813static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 814{ 815 update_rq_clock(rq); 816 sched_info_queued(rq, p); 817 p->sched_class->enqueue_task(rq, p, flags); 818} 819 820static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 821{ 822 update_rq_clock(rq); 823 sched_info_dequeued(rq, p); 824 p->sched_class->dequeue_task(rq, p, flags); 825} 826 827void activate_task(struct rq *rq, struct task_struct *p, int flags) 828{ 829 if (task_contributes_to_load(p)) 830 rq->nr_uninterruptible--; 831 832 enqueue_task(rq, p, flags); 833} 834 835void deactivate_task(struct rq *rq, struct task_struct *p, int flags) 836{ 837 if (task_contributes_to_load(p)) 838 rq->nr_uninterruptible++; 839 840 dequeue_task(rq, p, flags); 841} 842 843static void update_rq_clock_task(struct rq *rq, s64 delta) 844{ 845/* 846 * In theory, the compile should just see 0 here, and optimize out the call 847 * to sched_rt_avg_update. But I don't trust it... 848 */ 849#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) 850 s64 steal = 0, irq_delta = 0; 851#endif 852#ifdef CONFIG_IRQ_TIME_ACCOUNTING 853 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; 854 855 /* 856 * Since irq_time is only updated on {soft,}irq_exit, we might run into 857 * this case when a previous update_rq_clock() happened inside a 858 * {soft,}irq region. 859 * 860 * When this happens, we stop ->clock_task and only update the 861 * prev_irq_time stamp to account for the part that fit, so that a next 862 * update will consume the rest. This ensures ->clock_task is 863 * monotonic. 864 * 865 * It does however cause some slight miss-attribution of {soft,}irq 866 * time, a more accurate solution would be to update the irq_time using 867 * the current rq->clock timestamp, except that would require using 868 * atomic ops. 869 */ 870 if (irq_delta > delta) 871 irq_delta = delta; 872 873 rq->prev_irq_time += irq_delta; 874 delta -= irq_delta; 875#endif 876#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING 877 if (static_key_false((¶virt_steal_rq_enabled))) { 878 steal = paravirt_steal_clock(cpu_of(rq)); 879 steal -= rq->prev_steal_time_rq; 880 881 if (unlikely(steal > delta)) 882 steal = delta; 883 884 rq->prev_steal_time_rq += steal; 885 delta -= steal; 886 } 887#endif 888 889 rq->clock_task += delta; 890 891#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) 892 if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) 893 sched_rt_avg_update(rq, irq_delta + steal); 894#endif 895} 896 897void sched_set_stop_task(int cpu, struct task_struct *stop) 898{ 899 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; 900 struct task_struct *old_stop = cpu_rq(cpu)->stop; 901 902 if (stop) { 903 /* 904 * Make it appear like a SCHED_FIFO task, its something 905 * userspace knows about and won't get confused about. 906 * 907 * Also, it will make PI more or less work without too 908 * much confusion -- but then, stop work should not 909 * rely on PI working anyway. 910 */ 911 sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m); 912 913 stop->sched_class = &stop_sched_class; 914 } 915 916 cpu_rq(cpu)->stop = stop; 917 918 if (old_stop) { 919 /* 920 * Reset it back to a normal scheduling class so that 921 * it can die in pieces. 922 */ 923 old_stop->sched_class = &rt_sched_class; 924 } 925} 926 927/* 928 * __normal_prio - return the priority that is based on the static prio 929 */ 930static inline int __normal_prio(struct task_struct *p) 931{ 932 return p->static_prio; 933} 934 935/* 936 * Calculate the expected normal priority: i.e. priority 937 * without taking RT-inheritance into account. Might be 938 * boosted by interactivity modifiers. Changes upon fork, 939 * setprio syscalls, and whenever the interactivity 940 * estimator recalculates. 941 */ 942static inline int normal_prio(struct task_struct *p) 943{ 944 int prio; 945 946 if (task_has_dl_policy(p)) 947 prio = MAX_DL_PRIO-1; 948 else if (task_has_rt_policy(p)) 949 prio = MAX_RT_PRIO-1 - p->rt_priority; 950 else 951 prio = __normal_prio(p); 952 return prio; 953} 954 955/* 956 * Calculate the current priority, i.e. the priority 957 * taken into account by the scheduler. This value might 958 * be boosted by RT tasks, or might be boosted by 959 * interactivity modifiers. Will be RT if the task got 960 * RT-boosted. If not then it returns p->normal_prio. 961 */ 962static int effective_prio(struct task_struct *p) 963{ 964 p->normal_prio = normal_prio(p); 965 /* 966 * If we are RT tasks or we were boosted to RT priority, 967 * keep the priority unchanged. Otherwise, update priority 968 * to the normal priority: 969 */ 970 if (!rt_prio(p->prio)) 971 return p->normal_prio; 972 return p->prio; 973} 974 975/** 976 * task_curr - is this task currently executing on a CPU? 977 * @p: the task in question. 978 * 979 * Return: 1 if the task is currently executing. 0 otherwise. 980 */ 981inline int task_curr(const struct task_struct *p) 982{ 983 return cpu_curr(task_cpu(p)) == p; 984} 985 986static inline void check_class_changed(struct rq *rq, struct task_struct *p, 987 const struct sched_class *prev_class, 988 int oldprio) 989{ 990 if (prev_class != p->sched_class) { 991 if (prev_class->switched_from) 992 prev_class->switched_from(rq, p); 993 p->sched_class->switched_to(rq, p); 994 } else if (oldprio != p->prio || dl_task(p)) 995 p->sched_class->prio_changed(rq, p, oldprio); 996} 997 998void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) 999{ 1000 const struct sched_class *class; 1001 1002 if (p->sched_class == rq->curr->sched_class) { 1003 rq->curr->sched_class->check_preempt_curr(rq, p, flags); 1004 } else { 1005 for_each_class(class) { 1006 if (class == rq->curr->sched_class) 1007 break; 1008 if (class == p->sched_class) { 1009 resched_task(rq->curr); 1010 break; 1011 } 1012 } 1013 } 1014 1015 /* 1016 * A queue event has occurred, and we're going to schedule. In 1017 * this case, we can save a useless back to back clock update. 1018 */ 1019 if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) 1020 rq->skip_clock_update = 1; 1021} 1022 1023#ifdef CONFIG_SMP 1024void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 1025{ 1026#ifdef CONFIG_SCHED_DEBUG 1027 /* 1028 * We should never call set_task_cpu() on a blocked task, 1029 * ttwu() will sort out the placement. 1030 */ 1031 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && 1032 !(task_preempt_count(p) & PREEMPT_ACTIVE)); 1033 1034#ifdef CONFIG_LOCKDEP 1035 /* 1036 * The caller should hold either p->pi_lock or rq->lock, when changing 1037 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. 1038 * 1039 * sched_move_task() holds both and thus holding either pins the cgroup, 1040 * see task_group(). 1041 * 1042 * Furthermore, all task_rq users should acquire both locks, see 1043 * task_rq_lock(). 1044 */ 1045 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || 1046 lockdep_is_held(&task_rq(p)->lock))); 1047#endif 1048#endif 1049 1050 trace_sched_migrate_task(p, new_cpu); 1051 1052 if (task_cpu(p) != new_cpu) { 1053 if (p->sched_class->migrate_task_rq) 1054 p->sched_class->migrate_task_rq(p, new_cpu); 1055 p->se.nr_migrations++; 1056 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); 1057 } 1058 1059 __set_task_cpu(p, new_cpu); 1060} 1061 1062static void __migrate_swap_task(struct task_struct *p, int cpu) 1063{ 1064 if (p->on_rq) { 1065 struct rq *src_rq, *dst_rq; 1066 1067 src_rq = task_rq(p); 1068 dst_rq = cpu_rq(cpu); 1069 1070 deactivate_task(src_rq, p, 0); 1071 set_task_cpu(p, cpu); 1072 activate_task(dst_rq, p, 0); 1073 check_preempt_curr(dst_rq, p, 0); 1074 } else { 1075 /* 1076 * Task isn't running anymore; make it appear like we migrated 1077 * it before it went to sleep. This means on wakeup we make the 1078 * previous cpu our targer instead of where it really is. 1079 */ 1080 p->wake_cpu = cpu; 1081 } 1082} 1083 1084struct migration_swap_arg { 1085 struct task_struct *src_task, *dst_task; 1086 int src_cpu, dst_cpu; 1087}; 1088 1089static int migrate_swap_stop(void *data) 1090{ 1091 struct migration_swap_arg *arg = data; 1092 struct rq *src_rq, *dst_rq; 1093 int ret = -EAGAIN; 1094 1095 src_rq = cpu_rq(arg->src_cpu); 1096 dst_rq = cpu_rq(arg->dst_cpu); 1097 1098 double_raw_lock(&arg->src_task->pi_lock, 1099 &arg->dst_task->pi_lock); 1100 double_rq_lock(src_rq, dst_rq); 1101 if (task_cpu(arg->dst_task) != arg->dst_cpu) 1102 goto unlock; 1103 1104 if (task_cpu(arg->src_task) != arg->src_cpu) 1105 goto unlock; 1106 1107 if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task))) 1108 goto unlock; 1109 1110 if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task))) 1111 goto unlock; 1112 1113 __migrate_swap_task(arg->src_task, arg->dst_cpu); 1114 __migrate_swap_task(arg->dst_task, arg->src_cpu); 1115 1116 ret = 0; 1117 1118unlock: 1119 double_rq_unlock(src_rq, dst_rq); 1120 raw_spin_unlock(&arg->dst_task->pi_lock); 1121 raw_spin_unlock(&arg->src_task->pi_lock); 1122 1123 return ret; 1124} 1125 1126/* 1127 * Cross migrate two tasks 1128 */ 1129int migrate_swap(struct task_struct *cur, struct task_struct *p) 1130{ 1131 struct migration_swap_arg arg; 1132 int ret = -EINVAL; 1133 1134 arg = (struct migration_swap_arg){ 1135 .src_task = cur, 1136 .src_cpu = task_cpu(cur), 1137 .dst_task = p, 1138 .dst_cpu = task_cpu(p), 1139 }; 1140 1141 if (arg.src_cpu == arg.dst_cpu) 1142 goto out; 1143 1144 /* 1145 * These three tests are all lockless; this is OK since all of them 1146 * will be re-checked with proper locks held further down the line. 1147 */ 1148 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu)) 1149 goto out; 1150 1151 if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task))) 1152 goto out; 1153 1154 if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task))) 1155 goto out; 1156 1157 trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu); 1158 ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg); 1159 1160out: 1161 return ret; 1162} 1163 1164struct migration_arg { 1165 struct task_struct *task; 1166 int dest_cpu; 1167}; 1168 1169static int migration_cpu_stop(void *data); 1170 1171/* 1172 * wait_task_inactive - wait for a thread to unschedule. 1173 * 1174 * If @match_state is nonzero, it's the @p->state value just checked and 1175 * not expected to change. If it changes, i.e. @p might have woken up, 1176 * then return zero. When we succeed in waiting for @p to be off its CPU, 1177 * we return a positive number (its total switch count). If a second call 1178 * a short while later returns the same number, the caller can be sure that 1179 * @p has remained unscheduled the whole time. 1180 * 1181 * The caller must ensure that the task *will* unschedule sometime soon, 1182 * else this function might spin for a *long* time. This function can't 1183 * be called with interrupts off, or it may introduce deadlock with 1184 * smp_call_function() if an IPI is sent by the same process we are 1185 * waiting to become inactive. 1186 */ 1187unsigned long wait_task_inactive(struct task_struct *p, long match_state) 1188{ 1189 unsigned long flags; 1190 int running, on_rq; 1191 unsigned long ncsw; 1192 struct rq *rq; 1193 1194 for (;;) { 1195 /* 1196 * We do the initial early heuristics without holding 1197 * any task-queue locks at all. We'll only try to get 1198 * the runqueue lock when things look like they will 1199 * work out! 1200 */ 1201 rq = task_rq(p); 1202 1203 /* 1204 * If the task is actively running on another CPU 1205 * still, just relax and busy-wait without holding 1206 * any locks. 1207 * 1208 * NOTE! Since we don't hold any locks, it's not 1209 * even sure that "rq" stays as the right runqueue! 1210 * But we don't care, since "task_running()" will 1211 * return false if the runqueue has changed and p 1212 * is actually now running somewhere else! 1213 */ 1214 while (task_running(rq, p)) { 1215 if (match_state && unlikely(p->state != match_state)) 1216 return 0; 1217 cpu_relax(); 1218 } 1219 1220 /* 1221 * Ok, time to look more closely! We need the rq 1222 * lock now, to be *sure*. If we're wrong, we'll 1223 * just go back and repeat. 1224 */ 1225 rq = task_rq_lock(p, &flags); 1226 trace_sched_wait_task(p); 1227 running = task_running(rq, p); 1228 on_rq = p->on_rq; 1229 ncsw = 0; 1230 if (!match_state || p->state == match_state) 1231 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ 1232 task_rq_unlock(rq, p, &flags); 1233 1234 /* 1235 * If it changed from the expected state, bail out now. 1236 */ 1237 if (unlikely(!ncsw)) 1238 break; 1239 1240 /* 1241 * Was it really running after all now that we 1242 * checked with the proper locks actually held? 1243 * 1244 * Oops. Go back and try again.. 1245 */ 1246 if (unlikely(running)) { 1247 cpu_relax(); 1248 continue; 1249 } 1250 1251 /* 1252 * It's not enough that it's not actively running, 1253 * it must be off the runqueue _entirely_, and not 1254 * preempted! 1255 * 1256 * So if it was still runnable (but just not actively 1257 * running right now), it's preempted, and we should 1258 * yield - it could be a while. 1259 */ 1260 if (unlikely(on_rq)) { 1261 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); 1262 1263 set_current_state(TASK_UNINTERRUPTIBLE); 1264 schedule_hrtimeout(&to, HRTIMER_MODE_REL); 1265 continue; 1266 } 1267 1268 /* 1269 * Ahh, all good. It wasn't running, and it wasn't 1270 * runnable, which means that it will never become 1271 * running in the future either. We're all done! 1272 */ 1273 break; 1274 } 1275 1276 return ncsw; 1277} 1278 1279/*** 1280 * kick_process - kick a running thread to enter/exit the kernel 1281 * @p: the to-be-kicked thread 1282 * 1283 * Cause a process which is running on another CPU to enter 1284 * kernel-mode, without any delay. (to get signals handled.) 1285 * 1286 * NOTE: this function doesn't have to take the runqueue lock, 1287 * because all it wants to ensure is that the remote task enters 1288 * the kernel. If the IPI races and the task has been migrated 1289 * to another CPU then no harm is done and the purpose has been 1290 * achieved as well. 1291 */ 1292void kick_process(struct task_struct *p) 1293{ 1294 int cpu; 1295 1296 preempt_disable(); 1297 cpu = task_cpu(p); 1298 if ((cpu != smp_processor_id()) && task_curr(p)) 1299 smp_send_reschedule(cpu); 1300 preempt_enable(); 1301} 1302EXPORT_SYMBOL_GPL(kick_process); 1303#endif /* CONFIG_SMP */ 1304 1305#ifdef CONFIG_SMP 1306/* 1307 * ->cpus_allowed is protected by both rq->lock and p->pi_lock 1308 */ 1309static int select_fallback_rq(int cpu, struct task_struct *p) 1310{ 1311 int nid = cpu_to_node(cpu); 1312 const struct cpumask *nodemask = NULL; 1313 enum { cpuset, possible, fail } state = cpuset; 1314 int dest_cpu; 1315 1316 /* 1317 * If the node that the cpu is on has been offlined, cpu_to_node() 1318 * will return -1. There is no cpu on the node, and we should 1319 * select the cpu on the other node. 1320 */ 1321 if (nid != -1) { 1322 nodemask = cpumask_of_node(nid); 1323 1324 /* Look for allowed, online CPU in same node. */ 1325 for_each_cpu(dest_cpu, nodemask) { 1326 if (!cpu_online(dest_cpu)) 1327 continue; 1328 if (!cpu_active(dest_cpu)) 1329 continue; 1330 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) 1331 return dest_cpu; 1332 } 1333 } 1334 1335 for (;;) { 1336 /* Any allowed, online CPU? */ 1337 for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) { 1338 if (!cpu_online(dest_cpu)) 1339 continue; 1340 if (!cpu_active(dest_cpu)) 1341 continue; 1342 goto out; 1343 } 1344 1345 switch (state) { 1346 case cpuset: 1347 /* No more Mr. Nice Guy. */ 1348 cpuset_cpus_allowed_fallback(p); 1349 state = possible; 1350 break; 1351 1352 case possible: 1353 do_set_cpus_allowed(p, cpu_possible_mask); 1354 state = fail; 1355 break; 1356 1357 case fail: 1358 BUG(); 1359 break; 1360 } 1361 } 1362 1363out: 1364 if (state != cpuset) { 1365 /* 1366 * Don't tell them about moving exiting tasks or 1367 * kernel threads (both mm NULL), since they never 1368 * leave kernel. 1369 */ 1370 if (p->mm && printk_ratelimit()) { 1371 printk_sched("process %d (%s) no longer affine to cpu%d\n", 1372 task_pid_nr(p), p->comm, cpu); 1373 } 1374 } 1375 1376 return dest_cpu; 1377} 1378 1379/* 1380 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. 1381 */ 1382static inline 1383int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) 1384{ 1385 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); 1386 1387 /* 1388 * In order not to call set_task_cpu() on a blocking task we need 1389 * to rely on ttwu() to place the task on a valid ->cpus_allowed 1390 * cpu. 1391 * 1392 * Since this is common to all placement strategies, this lives here. 1393 * 1394 * [ this allows ->select_task() to simply return task_cpu(p) and 1395 * not worry about this generic constraint ] 1396 */ 1397 if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) || 1398 !cpu_online(cpu))) 1399 cpu = select_fallback_rq(task_cpu(p), p); 1400 1401 return cpu; 1402} 1403 1404static void update_avg(u64 *avg, u64 sample) 1405{ 1406 s64 diff = sample - *avg; 1407 *avg += diff >> 3; 1408} 1409#endif 1410 1411static void 1412ttwu_stat(struct task_struct *p, int cpu, int wake_flags) 1413{ 1414#ifdef CONFIG_SCHEDSTATS 1415 struct rq *rq = this_rq(); 1416 1417#ifdef CONFIG_SMP 1418 int this_cpu = smp_processor_id(); 1419 1420 if (cpu == this_cpu) { 1421 schedstat_inc(rq, ttwu_local); 1422 schedstat_inc(p, se.statistics.nr_wakeups_local); 1423 } else { 1424 struct sched_domain *sd; 1425 1426 schedstat_inc(p, se.statistics.nr_wakeups_remote); 1427 rcu_read_lock(); 1428 for_each_domain(this_cpu, sd) { 1429 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { 1430 schedstat_inc(sd, ttwu_wake_remote); 1431 break; 1432 } 1433 } 1434 rcu_read_unlock(); 1435 } 1436 1437 if (wake_flags & WF_MIGRATED) 1438 schedstat_inc(p, se.statistics.nr_wakeups_migrate); 1439 1440#endif /* CONFIG_SMP */ 1441 1442 schedstat_inc(rq, ttwu_count); 1443 schedstat_inc(p, se.statistics.nr_wakeups); 1444 1445 if (wake_flags & WF_SYNC) 1446 schedstat_inc(p, se.statistics.nr_wakeups_sync); 1447 1448#endif /* CONFIG_SCHEDSTATS */ 1449} 1450 1451static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) 1452{ 1453 activate_task(rq, p, en_flags); 1454 p->on_rq = 1; 1455 1456 /* if a worker is waking up, notify workqueue */ 1457 if (p->flags & PF_WQ_WORKER) 1458 wq_worker_waking_up(p, cpu_of(rq)); 1459} 1460 1461/* 1462 * Mark the task runnable and perform wakeup-preemption. 1463 */ 1464static void 1465ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) 1466{ 1467 check_preempt_curr(rq, p, wake_flags); 1468 trace_sched_wakeup(p, true); 1469 1470 p->state = TASK_RUNNING; 1471#ifdef CONFIG_SMP 1472 if (p->sched_class->task_woken) 1473 p->sched_class->task_woken(rq, p); 1474 1475 if (rq->idle_stamp) { 1476 u64 delta = rq_clock(rq) - rq->idle_stamp; 1477 u64 max = 2*rq->max_idle_balance_cost; 1478 1479 update_avg(&rq->avg_idle, delta); 1480 1481 if (rq->avg_idle > max) 1482 rq->avg_idle = max; 1483 1484 rq->idle_stamp = 0; 1485 } 1486#endif 1487} 1488 1489static void 1490ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) 1491{ 1492#ifdef CONFIG_SMP 1493 if (p->sched_contributes_to_load) 1494 rq->nr_uninterruptible--; 1495#endif 1496 1497 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING); 1498 ttwu_do_wakeup(rq, p, wake_flags); 1499} 1500 1501/* 1502 * Called in case the task @p isn't fully descheduled from its runqueue, 1503 * in this case we must do a remote wakeup. Its a 'light' wakeup though, 1504 * since all we need to do is flip p->state to TASK_RUNNING, since 1505 * the task is still ->on_rq. 1506 */ 1507static int ttwu_remote(struct task_struct *p, int wake_flags) 1508{ 1509 struct rq *rq; 1510 int ret = 0; 1511 1512 rq = __task_rq_lock(p); 1513 if (p->on_rq) { 1514 /* check_preempt_curr() may use rq clock */ 1515 update_rq_clock(rq); 1516 ttwu_do_wakeup(rq, p, wake_flags); 1517 ret = 1; 1518 } 1519 __task_rq_unlock(rq); 1520 1521 return ret; 1522} 1523 1524#ifdef CONFIG_SMP 1525void sched_ttwu_pending(void) 1526{ 1527 struct rq *rq = this_rq(); 1528 struct llist_node *llist = llist_del_all(&rq->wake_list); 1529 struct task_struct *p; 1530 unsigned long flags; 1531 1532 if (!llist) 1533 return; 1534 1535 raw_spin_lock_irqsave(&rq->lock, flags); 1536 1537 while (llist) { 1538 p = llist_entry(llist, struct task_struct, wake_entry); 1539 llist = llist_next(llist); 1540 ttwu_do_activate(rq, p, 0); 1541 } 1542 1543 raw_spin_unlock_irqrestore(&rq->lock, flags); 1544} 1545 1546void scheduler_ipi(void) 1547{ 1548 /* 1549 * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting 1550 * TIF_NEED_RESCHED remotely (for the first time) will also send 1551 * this IPI. 1552 */ 1553 preempt_fold_need_resched(); 1554 1555 if (llist_empty(&this_rq()->wake_list) 1556 && !tick_nohz_full_cpu(smp_processor_id()) 1557 && !got_nohz_idle_kick()) 1558 return; 1559 1560 /* 1561 * Not all reschedule IPI handlers call irq_enter/irq_exit, since 1562 * traditionally all their work was done from the interrupt return 1563 * path. Now that we actually do some work, we need to make sure 1564 * we do call them. 1565 * 1566 * Some archs already do call them, luckily irq_enter/exit nest 1567 * properly. 1568 * 1569 * Arguably we should visit all archs and update all handlers, 1570 * however a fair share of IPIs are still resched only so this would 1571 * somewhat pessimize the simple resched case. 1572 */ 1573 irq_enter(); 1574 tick_nohz_full_check(); 1575 sched_ttwu_pending(); 1576 1577 /* 1578 * Check if someone kicked us for doing the nohz idle load balance. 1579 */ 1580 if (unlikely(got_nohz_idle_kick())) { 1581 this_rq()->idle_balance = 1; 1582 raise_softirq_irqoff(SCHED_SOFTIRQ); 1583 } 1584 irq_exit(); 1585} 1586 1587static void ttwu_queue_remote(struct task_struct *p, int cpu) 1588{ 1589 struct rq *rq = cpu_rq(cpu); 1590 1591 if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) { 1592 if (!set_nr_if_polling(rq->idle)) 1593 smp_send_reschedule(cpu); 1594 else 1595 trace_sched_wake_idle_without_ipi(cpu); 1596 } 1597} 1598 1599bool cpus_share_cache(int this_cpu, int that_cpu) 1600{ 1601 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); 1602} 1603#endif /* CONFIG_SMP */ 1604 1605static void ttwu_queue(struct task_struct *p, int cpu) 1606{ 1607 struct rq *rq = cpu_rq(cpu); 1608 1609#if defined(CONFIG_SMP) 1610 if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { 1611 sched_clock_cpu(cpu); /* sync clocks x-cpu */ 1612 ttwu_queue_remote(p, cpu); 1613 return; 1614 } 1615#endif 1616 1617 raw_spin_lock(&rq->lock); 1618 ttwu_do_activate(rq, p, 0); 1619 raw_spin_unlock(&rq->lock); 1620} 1621 1622/** 1623 * try_to_wake_up - wake up a thread 1624 * @p: the thread to be awakened 1625 * @state: the mask of task states that can be woken 1626 * @wake_flags: wake modifier flags (WF_*) 1627 * 1628 * Put it on the run-queue if it's not already there. The "current" 1629 * thread is always on the run-queue (except when the actual 1630 * re-schedule is in progress), and as such you're allowed to do 1631 * the simpler "current->state = TASK_RUNNING" to mark yourself 1632 * runnable without the overhead of this. 1633 * 1634 * Return: %true if @p was woken up, %false if it was already running. 1635 * or @state didn't match @p's state. 1636 */ 1637static int 1638try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) 1639{ 1640 unsigned long flags; 1641 int cpu, success = 0; 1642 1643 /* 1644 * If we are going to wake up a thread waiting for CONDITION we 1645 * need to ensure that CONDITION=1 done by the caller can not be 1646 * reordered with p->state check below. This pairs with mb() in 1647 * set_current_state() the waiting thread does. 1648 */ 1649 smp_mb__before_spinlock(); 1650 raw_spin_lock_irqsave(&p->pi_lock, flags); 1651 if (!(p->state & state)) 1652 goto out; 1653 1654 success = 1; /* we're going to change ->state */ 1655 cpu = task_cpu(p); 1656 1657 if (p->on_rq && ttwu_remote(p, wake_flags)) 1658 goto stat; 1659 1660#ifdef CONFIG_SMP 1661 /* 1662 * If the owning (remote) cpu is still in the middle of schedule() with 1663 * this task as prev, wait until its done referencing the task. 1664 */ 1665 while (p->on_cpu) 1666 cpu_relax(); 1667 /* 1668 * Pairs with the smp_wmb() in finish_lock_switch(). 1669 */ 1670 smp_rmb(); 1671 1672 p->sched_contributes_to_load = !!task_contributes_to_load(p); 1673 p->state = TASK_WAKING; 1674 1675 if (p->sched_class->task_waking) 1676 p->sched_class->task_waking(p); 1677 1678 cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); 1679 if (task_cpu(p) != cpu) { 1680 wake_flags |= WF_MIGRATED; 1681 set_task_cpu(p, cpu); 1682 } 1683#endif /* CONFIG_SMP */ 1684 1685 ttwu_queue(p, cpu); 1686stat: 1687 ttwu_stat(p, cpu, wake_flags); 1688out: 1689 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 1690 1691 return success; 1692} 1693 1694/** 1695 * try_to_wake_up_local - try to wake up a local task with rq lock held 1696 * @p: the thread to be awakened 1697 * 1698 * Put @p on the run-queue if it's not already there. The caller must 1699 * ensure that this_rq() is locked, @p is bound to this_rq() and not 1700 * the current task. 1701 */ 1702static void try_to_wake_up_local(struct task_struct *p) 1703{ 1704 struct rq *rq = task_rq(p); 1705 1706 if (WARN_ON_ONCE(rq != this_rq()) || 1707 WARN_ON_ONCE(p == current)) 1708 return; 1709 1710 lockdep_assert_held(&rq->lock); 1711 1712 if (!raw_spin_trylock(&p->pi_lock)) { 1713 raw_spin_unlock(&rq->lock); 1714 raw_spin_lock(&p->pi_lock); 1715 raw_spin_lock(&rq->lock); 1716 } 1717 1718 if (!(p->state & TASK_NORMAL)) 1719 goto out; 1720 1721 if (!p->on_rq) 1722 ttwu_activate(rq, p, ENQUEUE_WAKEUP); 1723 1724 ttwu_do_wakeup(rq, p, 0); 1725 ttwu_stat(p, smp_processor_id(), 0); 1726out: 1727 raw_spin_unlock(&p->pi_lock); 1728} 1729 1730/** 1731 * wake_up_process - Wake up a specific process 1732 * @p: The process to be woken up. 1733 * 1734 * Attempt to wake up the nominated process and move it to the set of runnable 1735 * processes. 1736 * 1737 * Return: 1 if the process was woken up, 0 if it was already running. 1738 * 1739 * It may be assumed that this function implies a write memory barrier before 1740 * changing the task state if and only if any tasks are woken up. 1741 */ 1742int wake_up_process(struct task_struct *p) 1743{ 1744 WARN_ON(task_is_stopped_or_traced(p)); 1745 return try_to_wake_up(p, TASK_NORMAL, 0); 1746} 1747EXPORT_SYMBOL(wake_up_process); 1748 1749int wake_up_state(struct task_struct *p, unsigned int state) 1750{ 1751 return try_to_wake_up(p, state, 0); 1752} 1753 1754/* 1755 * Perform scheduler related setup for a newly forked process p. 1756 * p is forked by current. 1757 * 1758 * __sched_fork() is basic setup used by init_idle() too: 1759 */ 1760static void __sched_fork(unsigned long clone_flags, struct task_struct *p) 1761{ 1762 p->on_rq = 0; 1763 1764 p->se.on_rq = 0; 1765 p->se.exec_start = 0; 1766 p->se.sum_exec_runtime = 0; 1767 p->se.prev_sum_exec_runtime = 0; 1768 p->se.nr_migrations = 0; 1769 p->se.vruntime = 0; 1770 INIT_LIST_HEAD(&p->se.group_node); 1771 1772#ifdef CONFIG_SCHEDSTATS 1773 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 1774#endif 1775 1776 RB_CLEAR_NODE(&p->dl.rb_node); 1777 hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1778 p->dl.dl_runtime = p->dl.runtime = 0; 1779 p->dl.dl_deadline = p->dl.deadline = 0; 1780 p->dl.dl_period = 0; 1781 p->dl.flags = 0; 1782 1783 INIT_LIST_HEAD(&p->rt.run_list); 1784 1785#ifdef CONFIG_PREEMPT_NOTIFIERS 1786 INIT_HLIST_HEAD(&p->preempt_notifiers); 1787#endif 1788 1789#ifdef CONFIG_NUMA_BALANCING 1790 if (p->mm && atomic_read(&p->mm->mm_users) == 1) { 1791 p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); 1792 p->mm->numa_scan_seq = 0; 1793 } 1794 1795 if (clone_flags & CLONE_VM) 1796 p->numa_preferred_nid = current->numa_preferred_nid; 1797 else 1798 p->numa_preferred_nid = -1; 1799 1800 p->node_stamp = 0ULL; 1801 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; 1802 p->numa_scan_period = sysctl_numa_balancing_scan_delay; 1803 p->numa_work.next = &p->numa_work; 1804 p->numa_faults_memory = NULL; 1805 p->numa_faults_buffer_memory = NULL; 1806 p->last_task_numa_placement = 0; 1807 p->last_sum_exec_runtime = 0; 1808 1809 INIT_LIST_HEAD(&p->numa_entry); 1810 p->numa_group = NULL; 1811#endif /* CONFIG_NUMA_BALANCING */ 1812} 1813 1814#ifdef CONFIG_NUMA_BALANCING 1815#ifdef CONFIG_SCHED_DEBUG 1816void set_numabalancing_state(bool enabled) 1817{ 1818 if (enabled) 1819 sched_feat_set("NUMA"); 1820 else 1821 sched_feat_set("NO_NUMA"); 1822} 1823#else 1824__read_mostly bool numabalancing_enabled; 1825 1826void set_numabalancing_state(bool enabled) 1827{ 1828 numabalancing_enabled = enabled; 1829} 1830#endif /* CONFIG_SCHED_DEBUG */ 1831 1832#ifdef CONFIG_PROC_SYSCTL 1833int sysctl_numa_balancing(struct ctl_table *table, int write, 1834 void __user *buffer, size_t *lenp, loff_t *ppos) 1835{ 1836 struct ctl_table t; 1837 int err; 1838 int state = numabalancing_enabled; 1839 1840 if (write && !capable(CAP_SYS_ADMIN)) 1841 return -EPERM; 1842 1843 t = *table; 1844 t.data = &state; 1845 err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); 1846 if (err < 0) 1847 return err; 1848 if (write) 1849 set_numabalancing_state(state); 1850 return err; 1851} 1852#endif 1853#endif 1854 1855/* 1856 * fork()/clone()-time setup: 1857 */ 1858int sched_fork(unsigned long clone_flags, struct task_struct *p) 1859{ 1860 unsigned long flags; 1861 int cpu = get_cpu(); 1862 1863 __sched_fork(clone_flags, p); 1864 /* 1865 * We mark the process as running here. This guarantees that 1866 * nobody will actually run it, and a signal or other external 1867 * event cannot wake it up and insert it on the runqueue either. 1868 */ 1869 p->state = TASK_RUNNING; 1870 1871 /* 1872 * Make sure we do not leak PI boosting priority to the child. 1873 */ 1874 p->prio = current->normal_prio; 1875 1876 /* 1877 * Revert to default priority/policy on fork if requested. 1878 */ 1879 if (unlikely(p->sched_reset_on_fork)) { 1880 if (task_has_dl_policy(p) || task_has_rt_policy(p)) { 1881 p->policy = SCHED_NORMAL; 1882 p->static_prio = NICE_TO_PRIO(0); 1883 p->rt_priority = 0; 1884 } else if (PRIO_TO_NICE(p->static_prio) < 0) 1885 p->static_prio = NICE_TO_PRIO(0); 1886 1887 p->prio = p->normal_prio = __normal_prio(p); 1888 set_load_weight(p); 1889 1890 /* 1891 * We don't need the reset flag anymore after the fork. It has 1892 * fulfilled its duty: 1893 */ 1894 p->sched_reset_on_fork = 0; 1895 } 1896 1897 if (dl_prio(p->prio)) { 1898 put_cpu(); 1899 return -EAGAIN; 1900 } else if (rt_prio(p->prio)) { 1901 p->sched_class = &rt_sched_class; 1902 } else { 1903 p->sched_class = &fair_sched_class; 1904 } 1905 1906 if (p->sched_class->task_fork) 1907 p->sched_class->task_fork(p); 1908 1909 /* 1910 * The child is not yet in the pid-hash so no cgroup attach races, 1911 * and the cgroup is pinned to this child due to cgroup_fork() 1912 * is ran before sched_fork(). 1913 * 1914 * Silence PROVE_RCU. 1915 */ 1916 raw_spin_lock_irqsave(&p->pi_lock, flags); 1917 set_task_cpu(p, cpu); 1918 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 1919 1920#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 1921 if (likely(sched_info_on())) 1922 memset(&p->sched_info, 0, sizeof(p->sched_info)); 1923#endif 1924#if defined(CONFIG_SMP) 1925 p->on_cpu = 0; 1926#endif 1927 init_task_preempt_count(p); 1928#ifdef CONFIG_SMP 1929 plist_node_init(&p->pushable_tasks, MAX_PRIO); 1930 RB_CLEAR_NODE(&p->pushable_dl_tasks); 1931#endif 1932 1933 put_cpu(); 1934 return 0; 1935} 1936 1937unsigned long to_ratio(u64 period, u64 runtime) 1938{ 1939 if (runtime == RUNTIME_INF) 1940 return 1ULL << 20; 1941 1942 /* 1943 * Doing this here saves a lot of checks in all 1944 * the calling paths, and returning zero seems 1945 * safe for them anyway. 1946 */ 1947 if (period == 0) 1948 return 0; 1949 1950 return div64_u64(runtime << 20, period); 1951} 1952 1953#ifdef CONFIG_SMP 1954inline struct dl_bw *dl_bw_of(int i) 1955{ 1956 return &cpu_rq(i)->rd->dl_bw; 1957} 1958 1959static inline int dl_bw_cpus(int i) 1960{ 1961 struct root_domain *rd = cpu_rq(i)->rd; 1962 int cpus = 0; 1963 1964 for_each_cpu_and(i, rd->span, cpu_active_mask) 1965 cpus++; 1966 1967 return cpus; 1968} 1969#else 1970inline struct dl_bw *dl_bw_of(int i) 1971{ 1972 return &cpu_rq(i)->dl.dl_bw; 1973} 1974 1975static inline int dl_bw_cpus(int i) 1976{ 1977 return 1; 1978} 1979#endif 1980 1981static inline 1982void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw) 1983{ 1984 dl_b->total_bw -= tsk_bw; 1985} 1986 1987static inline 1988void __dl_add(struct dl_bw *dl_b, u64 tsk_bw) 1989{ 1990 dl_b->total_bw += tsk_bw; 1991} 1992 1993static inline 1994bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) 1995{ 1996 return dl_b->bw != -1 && 1997 dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; 1998} 1999 2000/* 2001 * We must be sure that accepting a new task (or allowing changing the 2002 * parameters of an existing one) is consistent with the bandwidth 2003 * constraints. If yes, this function also accordingly updates the currently 2004 * allocated bandwidth to reflect the new situation. 2005 * 2006 * This function is called while holding p's rq->lock. 2007 */ 2008static int dl_overflow(struct task_struct *p, int policy, 2009 const struct sched_attr *attr) 2010{ 2011 2012 struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); 2013 u64 period = attr->sched_period ?: attr->sched_deadline; 2014 u64 runtime = attr->sched_runtime; 2015 u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0; 2016 int cpus, err = -1; 2017 2018 if (new_bw == p->dl.dl_bw) 2019 return 0; 2020 2021 /* 2022 * Either if a task, enters, leave, or stays -deadline but changes 2023 * its parameters, we may need to update accordingly the total 2024 * allocated bandwidth of the container. 2025 */ 2026 raw_spin_lock(&dl_b->lock); 2027 cpus = dl_bw_cpus(task_cpu(p)); 2028 if (dl_policy(policy) && !task_has_dl_policy(p) && 2029 !__dl_overflow(dl_b, cpus, 0, new_bw)) { 2030 __dl_add(dl_b, new_bw); 2031 err = 0; 2032 } else if (dl_policy(policy) && task_has_dl_policy(p) && 2033 !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) { 2034 __dl_clear(dl_b, p->dl.dl_bw); 2035 __dl_add(dl_b, new_bw); 2036 err = 0; 2037 } else if (!dl_policy(policy) && task_has_dl_policy(p)) { 2038 __dl_clear(dl_b, p->dl.dl_bw); 2039 err = 0; 2040 } 2041 raw_spin_unlock(&dl_b->lock); 2042 2043 return err; 2044} 2045 2046extern void init_dl_bw(struct dl_bw *dl_b); 2047 2048/* 2049 * wake_up_new_task - wake up a newly created task for the first time. 2050 * 2051 * This function will do some initial scheduler statistics housekeeping 2052 * that must be done for every newly created context, then puts the task 2053 * on the runqueue and wakes it. 2054 */ 2055void wake_up_new_task(struct task_struct *p) 2056{ 2057 unsigned long flags; 2058 struct rq *rq; 2059 2060 raw_spin_lock_irqsave(&p->pi_lock, flags); 2061#ifdef CONFIG_SMP 2062 /* 2063 * Fork balancing, do it here and not earlier because: 2064 * - cpus_allowed can change in the fork path 2065 * - any previously selected cpu might disappear through hotplug 2066 */ 2067 set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); 2068#endif 2069 2070 /* Initialize new task's runnable average */ 2071 init_task_runnable_average(p); 2072 rq = __task_rq_lock(p); 2073 activate_task(rq, p, 0); 2074 p->on_rq = 1; 2075 trace_sched_wakeup_new(p, true); 2076 check_preempt_curr(rq, p, WF_FORK); 2077#ifdef CONFIG_SMP 2078 if (p->sched_class->task_woken) 2079 p->sched_class->task_woken(rq, p); 2080#endif 2081 task_rq_unlock(rq, p, &flags); 2082} 2083 2084#ifdef CONFIG_PREEMPT_NOTIFIERS 2085 2086/** 2087 * preempt_notifier_register - tell me when current is being preempted & rescheduled 2088 * @notifier: notifier struct to register 2089 */ 2090void preempt_notifier_register(struct preempt_notifier *notifier) 2091{ 2092 hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); 2093} 2094EXPORT_SYMBOL_GPL(preempt_notifier_register); 2095 2096/** 2097 * preempt_notifier_unregister - no longer interested in preemption notifications 2098 * @notifier: notifier struct to unregister 2099 * 2100 * This is safe to call from within a preemption notifier. 2101 */ 2102void preempt_notifier_unregister(struct preempt_notifier *notifier) 2103{ 2104 hlist_del(¬ifier->link); 2105} 2106EXPORT_SYMBOL_GPL(preempt_notifier_unregister); 2107 2108static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2109{ 2110 struct preempt_notifier *notifier; 2111 2112 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) 2113 notifier->ops->sched_in(notifier, raw_smp_processor_id()); 2114} 2115 2116static void 2117fire_sched_out_preempt_notifiers(struct task_struct *curr, 2118 struct task_struct *next) 2119{ 2120 struct preempt_notifier *notifier; 2121 2122 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) 2123 notifier->ops->sched_out(notifier, next); 2124} 2125 2126#else /* !CONFIG_PREEMPT_NOTIFIERS */ 2127 2128static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2129{ 2130} 2131 2132static void 2133fire_sched_out_preempt_notifiers(struct task_struct *curr, 2134 struct task_struct *next) 2135{ 2136} 2137 2138#endif /* CONFIG_PREEMPT_NOTIFIERS */ 2139 2140/** 2141 * prepare_task_switch - prepare to switch tasks 2142 * @rq: the runqueue preparing to switch 2143 * @prev: the current task that is being switched out 2144 * @next: the task we are going to switch to. 2145 * 2146 * This is called with the rq lock held and interrupts off. It must 2147 * be paired with a subsequent finish_task_switch after the context 2148 * switch. 2149 * 2150 * prepare_task_switch sets up locking and calls architecture specific 2151 * hooks. 2152 */ 2153static inline void 2154prepare_task_switch(struct rq *rq, struct task_struct *prev, 2155 struct task_struct *next) 2156{ 2157 trace_sched_switch(prev, next); 2158 sched_info_switch(rq, prev, next); 2159 perf_event_task_sched_out(prev, next); 2160 fire_sched_out_preempt_notifiers(prev, next); 2161 prepare_lock_switch(rq, next); 2162 prepare_arch_switch(next); 2163} 2164 2165/** 2166 * finish_task_switch - clean up after a task-switch 2167 * @rq: runqueue associated with task-switch 2168 * @prev: the thread we just switched away from. 2169 * 2170 * finish_task_switch must be called after the context switch, paired 2171 * with a prepare_task_switch call before the context switch. 2172 * finish_task_switch will reconcile locking set up by prepare_task_switch, 2173 * and do any other architecture-specific cleanup actions. 2174 * 2175 * Note that we may have delayed dropping an mm in context_switch(). If 2176 * so, we finish that here outside of the runqueue lock. (Doing it 2177 * with the lock held can cause deadlocks; see schedule() for 2178 * details.) 2179 */ 2180static void finish_task_switch(struct rq *rq, struct task_struct *prev) 2181 __releases(rq->lock) 2182{ 2183 struct mm_struct *mm = rq->prev_mm; 2184 long prev_state; 2185 2186 rq->prev_mm = NULL; 2187 2188 /* 2189 * A task struct has one reference for the use as "current". 2190 * If a task dies, then it sets TASK_DEAD in tsk->state and calls 2191 * schedule one last time. The schedule call will never return, and 2192 * the scheduled task must drop that reference. 2193 * The test for TASK_DEAD must occur while the runqueue locks are 2194 * still held, otherwise prev could be scheduled on another cpu, die 2195 * there before we look at prev->state, and then the reference would 2196 * be dropped twice. 2197 * Manfred Spraul <manfred@colorfullife.com> 2198 */ 2199 prev_state = prev->state; 2200 vtime_task_switch(prev); 2201 finish_arch_switch(prev); 2202 perf_event_task_sched_in(prev, current); 2203 finish_lock_switch(rq, prev); 2204 finish_arch_post_lock_switch(); 2205 2206 fire_sched_in_preempt_notifiers(current); 2207 if (mm) 2208 mmdrop(mm); 2209 if (unlikely(prev_state == TASK_DEAD)) { 2210 if (prev->sched_class->task_dead) 2211 prev->sched_class->task_dead(prev); 2212 2213 /* 2214 * Remove function-return probe instances associated with this 2215 * task and put them back on the free list. 2216 */ 2217 kprobe_flush_task(prev); 2218 put_task_struct(prev); 2219 } 2220 2221 tick_nohz_task_switch(current); 2222} 2223 2224#ifdef CONFIG_SMP 2225 2226/* rq->lock is NOT held, but preemption is disabled */ 2227static inline void post_schedule(struct rq *rq) 2228{ 2229 if (rq->post_schedule) { 2230 unsigned long flags; 2231 2232 raw_spin_lock_irqsave(&rq->lock, flags); 2233 if (rq->curr->sched_class->post_schedule) 2234 rq->curr->sched_class->post_schedule(rq); 2235 raw_spin_unlock_irqrestore(&rq->lock, flags); 2236 2237 rq->post_schedule = 0; 2238 } 2239} 2240 2241#else 2242 2243static inline void post_schedule(struct rq *rq) 2244{ 2245} 2246 2247#endif 2248 2249/** 2250 * schedule_tail - first thing a freshly forked thread must call. 2251 * @prev: the thread we just switched away from. 2252 */ 2253asmlinkage __visible void schedule_tail(struct task_struct *prev) 2254 __releases(rq->lock) 2255{ 2256 struct rq *rq = this_rq(); 2257 2258 finish_task_switch(rq, prev); 2259 2260 /* 2261 * FIXME: do we need to worry about rq being invalidated by the 2262 * task_switch? 2263 */ 2264 post_schedule(rq); 2265 2266#ifdef __ARCH_WANT_UNLOCKED_CTXSW 2267 /* In this case, finish_task_switch does not reenable preemption */ 2268 preempt_enable(); 2269#endif 2270 if (current->set_child_tid) 2271 put_user(task_pid_vnr(current), current->set_child_tid); 2272} 2273 2274/* 2275 * context_switch - switch to the new MM and the new 2276 * thread's register state. 2277 */ 2278static inline void 2279context_switch(struct rq *rq, struct task_struct *prev, 2280 struct task_struct *next) 2281{ 2282 struct mm_struct *mm, *oldmm; 2283 2284 prepare_task_switch(rq, prev, next); 2285 2286 mm = next->mm; 2287 oldmm = prev->active_mm; 2288 /* 2289 * For paravirt, this is coupled with an exit in switch_to to 2290 * combine the page table reload and the switch backend into 2291 * one hypercall. 2292 */ 2293 arch_start_context_switch(prev); 2294 2295 if (!mm) { 2296 next->active_mm = oldmm; 2297 atomic_inc(&oldmm->mm_count); 2298 enter_lazy_tlb(oldmm, next); 2299 } else 2300 switch_mm(oldmm, mm, next); 2301 2302 if (!prev->mm) { 2303 prev->active_mm = NULL; 2304 rq->prev_mm = oldmm; 2305 } 2306 /* 2307 * Since the runqueue lock will be released by the next 2308 * task (which is an invalid locking op but in the case 2309 * of the scheduler it's an obvious special-case), so we 2310 * do an early lockdep release here: 2311 */ 2312#ifndef __ARCH_WANT_UNLOCKED_CTXSW 2313 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 2314#endif 2315 2316 context_tracking_task_switch(prev, next); 2317 /* Here we just switch the register state and the stack. */ 2318 switch_to(prev, next, prev); 2319 2320 barrier(); 2321 /* 2322 * this_rq must be evaluated again because prev may have moved 2323 * CPUs since it called schedule(), thus the 'rq' on its stack 2324 * frame will be invalid. 2325 */ 2326 finish_task_switch(this_rq(), prev); 2327} 2328 2329/* 2330 * nr_running and nr_context_switches: 2331 * 2332 * externally visible scheduler statistics: current number of runnable 2333 * threads, total number of context switches performed since bootup. 2334 */ 2335unsigned long nr_running(void) 2336{ 2337 unsigned long i, sum = 0; 2338 2339 for_each_online_cpu(i) 2340 sum += cpu_rq(i)->nr_running; 2341 2342 return sum; 2343} 2344 2345unsigned long long nr_context_switches(void) 2346{ 2347 int i; 2348 unsigned long long sum = 0; 2349 2350 for_each_possible_cpu(i) 2351 sum += cpu_rq(i)->nr_switches; 2352 2353 return sum; 2354} 2355 2356unsigned long nr_iowait(void) 2357{ 2358 unsigned long i, sum = 0; 2359 2360 for_each_possible_cpu(i) 2361 sum += atomic_read(&cpu_rq(i)->nr_iowait); 2362 2363 return sum; 2364} 2365 2366unsigned long nr_iowait_cpu(int cpu) 2367{ 2368 struct rq *this = cpu_rq(cpu); 2369 return atomic_read(&this->nr_iowait); 2370} 2371 2372#ifdef CONFIG_SMP 2373 2374/* 2375 * sched_exec - execve() is a valuable balancing opportunity, because at 2376 * this point the task has the smallest effective memory and cache footprint. 2377 */ 2378void sched_exec(void) 2379{ 2380 struct task_struct *p = current; 2381 unsigned long flags; 2382 int dest_cpu; 2383 2384 raw_spin_lock_irqsave(&p->pi_lock, flags); 2385 dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); 2386 if (dest_cpu == smp_processor_id()) 2387 goto unlock; 2388 2389 if (likely(cpu_active(dest_cpu))) { 2390 struct migration_arg arg = { p, dest_cpu }; 2391 2392 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2393 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); 2394 return; 2395 } 2396unlock: 2397 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2398} 2399 2400#endif 2401 2402DEFINE_PER_CPU(struct kernel_stat, kstat); 2403DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); 2404 2405EXPORT_PER_CPU_SYMBOL(kstat); 2406EXPORT_PER_CPU_SYMBOL(kernel_cpustat); 2407 2408/* 2409 * Return any ns on the sched_clock that have not yet been accounted in 2410 * @p in case that task is currently running. 2411 * 2412 * Called with task_rq_lock() held on @rq. 2413 */ 2414static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) 2415{ 2416 u64 ns = 0; 2417 2418 if (task_current(rq, p)) { 2419 update_rq_clock(rq); 2420 ns = rq_clock_task(rq) - p->se.exec_start; 2421 if ((s64)ns < 0) 2422 ns = 0; 2423 } 2424 2425 return ns; 2426} 2427 2428unsigned long long task_delta_exec(struct task_struct *p) 2429{ 2430 unsigned long flags; 2431 struct rq *rq; 2432 u64 ns = 0; 2433 2434 rq = task_rq_lock(p, &flags); 2435 ns = do_task_delta_exec(p, rq); 2436 task_rq_unlock(rq, p, &flags); 2437 2438 return ns; 2439} 2440 2441/* 2442 * Return accounted runtime for the task. 2443 * In case the task is currently running, return the runtime plus current's 2444 * pending runtime that have not been accounted yet. 2445 */ 2446unsigned long long task_sched_runtime(struct task_struct *p) 2447{ 2448 unsigned long flags; 2449 struct rq *rq; 2450 u64 ns = 0; 2451 2452#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) 2453 /* 2454 * 64-bit doesn't need locks to atomically read a 64bit value. 2455 * So we have a optimization chance when the task's delta_exec is 0. 2456 * Reading ->on_cpu is racy, but this is ok. 2457 * 2458 * If we race with it leaving cpu, we'll take a lock. So we're correct. 2459 * If we race with it entering cpu, unaccounted time is 0. This is 2460 * indistinguishable from the read occurring a few cycles earlier. 2461 */ 2462 if (!p->on_cpu) 2463 return p->se.sum_exec_runtime; 2464#endif 2465 2466 rq = task_rq_lock(p, &flags); 2467 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); 2468 task_rq_unlock(rq, p, &flags); 2469 2470 return ns; 2471} 2472 2473/* 2474 * This function gets called by the timer code, with HZ frequency. 2475 * We call it with interrupts disabled. 2476 */ 2477void scheduler_tick(void) 2478{ 2479 int cpu = smp_processor_id(); 2480 struct rq *rq = cpu_rq(cpu); 2481 struct task_struct *curr = rq->curr; 2482 2483 sched_clock_tick(); 2484 2485 raw_spin_lock(&rq->lock); 2486 update_rq_clock(rq); 2487 curr->sched_class->task_tick(rq, curr, 0); 2488 update_cpu_load_active(rq); 2489 raw_spin_unlock(&rq->lock); 2490 2491 perf_event_task_tick(); 2492 2493#ifdef CONFIG_SMP 2494 rq->idle_balance = idle_cpu(cpu); 2495 trigger_load_balance(rq); 2496#endif 2497 rq_last_tick_reset(rq); 2498} 2499 2500#ifdef CONFIG_NO_HZ_FULL 2501/** 2502 * scheduler_tick_max_deferment 2503 * 2504 * Keep at least one tick per second when a single 2505 * active task is running because the scheduler doesn't 2506 * yet completely support full dynticks environment. 2507 * 2508 * This makes sure that uptime, CFS vruntime, load 2509 * balancing, etc... continue to move forward, even 2510 * with a very low granularity. 2511 * 2512 * Return: Maximum deferment in nanoseconds. 2513 */ 2514u64 scheduler_tick_max_deferment(void) 2515{ 2516 struct rq *rq = this_rq(); 2517 unsigned long next, now = ACCESS_ONCE(jiffies); 2518 2519 next = rq->last_sched_tick + HZ; 2520 2521 if (time_before_eq(next, now)) 2522 return 0; 2523 2524 return jiffies_to_nsecs(next - now); 2525} 2526#endif 2527 2528notrace unsigned long get_parent_ip(unsigned long addr) 2529{ 2530 if (in_lock_functions(addr)) { 2531 addr = CALLER_ADDR2; 2532 if (in_lock_functions(addr)) 2533 addr = CALLER_ADDR3; 2534 } 2535 return addr; 2536} 2537 2538#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ 2539 defined(CONFIG_PREEMPT_TRACER)) 2540 2541void __kprobes preempt_count_add(int val) 2542{ 2543#ifdef CONFIG_DEBUG_PREEMPT 2544 /* 2545 * Underflow? 2546 */ 2547 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) 2548 return; 2549#endif 2550 __preempt_count_add(val); 2551#ifdef CONFIG_DEBUG_PREEMPT 2552 /* 2553 * Spinlock count overflowing soon? 2554 */ 2555 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= 2556 PREEMPT_MASK - 10); 2557#endif 2558 if (preempt_count() == val) { 2559 unsigned long ip = get_parent_ip(CALLER_ADDR1); 2560#ifdef CONFIG_DEBUG_PREEMPT 2561 current->preempt_disable_ip = ip; 2562#endif 2563 trace_preempt_off(CALLER_ADDR0, ip); 2564 } 2565} 2566EXPORT_SYMBOL(preempt_count_add); 2567 2568void __kprobes preempt_count_sub(int val) 2569{ 2570#ifdef CONFIG_DEBUG_PREEMPT 2571 /* 2572 * Underflow? 2573 */ 2574 if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) 2575 return; 2576 /* 2577 * Is the spinlock portion underflowing? 2578 */ 2579 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && 2580 !(preempt_count() & PREEMPT_MASK))) 2581 return; 2582#endif 2583 2584 if (preempt_count() == val) 2585 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 2586 __preempt_count_sub(val); 2587} 2588EXPORT_SYMBOL(preempt_count_sub); 2589 2590#endif 2591 2592/* 2593 * Print scheduling while atomic bug: 2594 */ 2595static noinline void __schedule_bug(struct task_struct *prev) 2596{ 2597 if (oops_in_progress) 2598 return; 2599 2600 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", 2601 prev->comm, prev->pid, preempt_count()); 2602 2603 debug_show_held_locks(prev); 2604 print_modules(); 2605 if (irqs_disabled()) 2606 print_irqtrace_events(prev); 2607#ifdef CONFIG_DEBUG_PREEMPT 2608 if (in_atomic_preempt_off()) { 2609 pr_err("Preemption disabled at:"); 2610 print_ip_sym(current->preempt_disable_ip); 2611 pr_cont("\n"); 2612 } 2613#endif 2614 dump_stack(); 2615 add_taint(TAINT_WARN, LOCKDEP_STILL_OK); 2616} 2617 2618/* 2619 * Various schedule()-time debugging checks and statistics: 2620 */ 2621static inline void schedule_debug(struct task_struct *prev) 2622{ 2623 /* 2624 * Test if we are atomic. Since do_exit() needs to call into 2625 * schedule() atomically, we ignore that path. Otherwise whine 2626 * if we are scheduling when we should not. 2627 */ 2628 if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD)) 2629 __schedule_bug(prev); 2630 rcu_sleep_check(); 2631 2632 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 2633 2634 schedstat_inc(this_rq(), sched_count); 2635} 2636 2637/* 2638 * Pick up the highest-prio task: 2639 */ 2640static inline struct task_struct * 2641pick_next_task(struct rq *rq, struct task_struct *prev) 2642{ 2643 const struct sched_class *class = &fair_sched_class; 2644 struct task_struct *p; 2645 2646 /* 2647 * Optimization: we know that if all tasks are in 2648 * the fair class we can call that function directly: 2649 */ 2650 if (likely(prev->sched_class == class && 2651 rq->nr_running == rq->cfs.h_nr_running)) { 2652 p = fair_sched_class.pick_next_task(rq, prev); 2653 if (unlikely(p == RETRY_TASK)) 2654 goto again; 2655 2656 /* assumes fair_sched_class->next == idle_sched_class */ 2657 if (unlikely(!p)) 2658 p = idle_sched_class.pick_next_task(rq, prev); 2659 2660 return p; 2661 } 2662 2663again: 2664 for_each_class(class) { 2665 p = class->pick_next_task(rq, prev); 2666 if (p) { 2667 if (unlikely(p == RETRY_TASK)) 2668 goto again; 2669 return p; 2670 } 2671 } 2672 2673 BUG(); /* the idle class will always have a runnable task */ 2674} 2675 2676/* 2677 * __schedule() is the main scheduler function. 2678 * 2679 * The main means of driving the scheduler and thus entering this function are: 2680 * 2681 * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. 2682 * 2683 * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return 2684 * paths. For example, see arch/x86/entry_64.S. 2685 * 2686 * To drive preemption between tasks, the scheduler sets the flag in timer 2687 * interrupt handler scheduler_tick(). 2688 * 2689 * 3. Wakeups don't really cause entry into schedule(). They add a 2690 * task to the run-queue and that's it. 2691 * 2692 * Now, if the new task added to the run-queue preempts the current 2693 * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets 2694 * called on the nearest possible occasion: 2695 * 2696 * - If the kernel is preemptible (CONFIG_PREEMPT=y): 2697 * 2698 * - in syscall or exception context, at the next outmost 2699 * preempt_enable(). (this might be as soon as the wake_up()'s 2700 * spin_unlock()!) 2701 * 2702 * - in IRQ context, return from interrupt-handler to 2703 * preemptible context 2704 * 2705 * - If the kernel is not preemptible (CONFIG_PREEMPT is not set) 2706 * then at the next: 2707 * 2708 * - cond_resched() call 2709 * - explicit schedule() call 2710 * - return from syscall or exception to user-space 2711 * - return from interrupt-handler to user-space 2712 */ 2713static void __sched __schedule(void) 2714{ 2715 struct task_struct *prev, *next; 2716 unsigned long *switch_count; 2717 struct rq *rq; 2718 int cpu; 2719 2720need_resched: 2721 preempt_disable(); 2722 cpu = smp_processor_id(); 2723 rq = cpu_rq(cpu); 2724 rcu_note_context_switch(cpu); 2725 prev = rq->curr; 2726 2727 schedule_debug(prev); 2728 2729 if (sched_feat(HRTICK)) 2730 hrtick_clear(rq); 2731 2732 /* 2733 * Make sure that signal_pending_state()->signal_pending() below 2734 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) 2735 * done by the caller to avoid the race with signal_wake_up(). 2736 */ 2737 smp_mb__before_spinlock(); 2738 raw_spin_lock_irq(&rq->lock); 2739 2740 switch_count = &prev->nivcsw; 2741 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 2742 if (unlikely(signal_pending_state(prev->state, prev))) { 2743 prev->state = TASK_RUNNING; 2744 } else { 2745 deactivate_task(rq, prev, DEQUEUE_SLEEP); 2746 prev->on_rq = 0; 2747 2748 /* 2749 * If a worker went to sleep, notify and ask workqueue 2750 * whether it wants to wake up a task to maintain 2751 * concurrency. 2752 */ 2753 if (prev->flags & PF_WQ_WORKER) { 2754 struct task_struct *to_wakeup; 2755 2756 to_wakeup = wq_worker_sleeping(prev, cpu); 2757 if (to_wakeup) 2758 try_to_wake_up_local(to_wakeup); 2759 } 2760 } 2761 switch_count = &prev->nvcsw; 2762 } 2763 2764 if (prev->on_rq || rq->skip_clock_update < 0) 2765 update_rq_clock(rq); 2766 2767 next = pick_next_task(rq, prev); 2768 clear_tsk_need_resched(prev); 2769 clear_preempt_need_resched(); 2770 rq->skip_clock_update = 0; 2771 2772 if (likely(prev != next)) { 2773 rq->nr_switches++; 2774 rq->curr = next; 2775 ++*switch_count; 2776 2777 context_switch(rq, prev, next); /* unlocks the rq */ 2778 /* 2779 * The context switch have flipped the stack from under us 2780 * and restored the local variables which were saved when 2781 * this task called schedule() in the past. prev == current 2782 * is still correct, but it can be moved to another cpu/rq. 2783 */ 2784 cpu = smp_processor_id(); 2785 rq = cpu_rq(cpu); 2786 } else 2787 raw_spin_unlock_irq(&rq->lock); 2788 2789 post_schedule(rq); 2790 2791 sched_preempt_enable_no_resched(); 2792 if (need_resched()) 2793 goto need_resched; 2794} 2795 2796static inline void sched_submit_work(struct task_struct *tsk) 2797{ 2798 if (!tsk->state || tsk_is_pi_blocked(tsk)) 2799 return; 2800 /* 2801 * If we are going to sleep and we have plugged IO queued, 2802 * make sure to submit it to avoid deadlocks. 2803 */ 2804 if (blk_needs_flush_plug(tsk)) 2805 blk_schedule_flush_plug(tsk); 2806} 2807 2808asmlinkage __visible void __sched schedule(void) 2809{ 2810 struct task_struct *tsk = current; 2811 2812 sched_submit_work(tsk); 2813 __schedule(); 2814} 2815EXPORT_SYMBOL(schedule); 2816 2817#ifdef CONFIG_CONTEXT_TRACKING 2818asmlinkage __visible void __sched schedule_user(void) 2819{ 2820 /* 2821 * If we come here after a random call to set_need_resched(), 2822 * or we have been woken up remotely but the IPI has not yet arrived, 2823 * we haven't yet exited the RCU idle mode. Do it here manually until 2824 * we find a better solution. 2825 */ 2826 user_exit(); 2827 schedule(); 2828 user_enter(); 2829} 2830#endif 2831 2832/** 2833 * schedule_preempt_disabled - called with preemption disabled 2834 * 2835 * Returns with preemption disabled. Note: preempt_count must be 1 2836 */ 2837void __sched schedule_preempt_disabled(void) 2838{ 2839 sched_preempt_enable_no_resched(); 2840 schedule(); 2841 preempt_disable(); 2842} 2843 2844#ifdef CONFIG_PREEMPT 2845/* 2846 * this is the entry point to schedule() from in-kernel preemption 2847 * off of preempt_enable. Kernel preemptions off return from interrupt 2848 * occur there and call schedule directly. 2849 */ 2850asmlinkage __visible void __sched notrace preempt_schedule(void) 2851{ 2852 /* 2853 * If there is a non-zero preempt_count or interrupts are disabled, 2854 * we do not want to preempt the current task. Just return.. 2855 */ 2856 if (likely(!preemptible())) 2857 return; 2858 2859 do { 2860 __preempt_count_add(PREEMPT_ACTIVE); 2861 __schedule(); 2862 __preempt_count_sub(PREEMPT_ACTIVE); 2863 2864 /* 2865 * Check again in case we missed a preemption opportunity 2866 * between schedule and now. 2867 */ 2868 barrier(); 2869 } while (need_resched()); 2870} 2871EXPORT_SYMBOL(preempt_schedule); 2872#endif /* CONFIG_PREEMPT */ 2873 2874/* 2875 * this is the entry point to schedule() from kernel preemption 2876 * off of irq context. 2877 * Note, that this is called and return with irqs disabled. This will 2878 * protect us against recursive calling from irq. 2879 */ 2880asmlinkage __visible void __sched preempt_schedule_irq(void) 2881{ 2882 enum ctx_state prev_state; 2883 2884 /* Catch callers which need to be fixed */ 2885 BUG_ON(preempt_count() || !irqs_disabled()); 2886 2887 prev_state = exception_enter(); 2888 2889 do { 2890 __preempt_count_add(PREEMPT_ACTIVE); 2891 local_irq_enable(); 2892 __schedule(); 2893 local_irq_disable(); 2894 __preempt_count_sub(PREEMPT_ACTIVE); 2895 2896 /* 2897 * Check again in case we missed a preemption opportunity 2898 * between schedule and now. 2899 */ 2900 barrier(); 2901 } while (need_resched()); 2902 2903 exception_exit(prev_state); 2904} 2905 2906int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, 2907 void *key) 2908{ 2909 return try_to_wake_up(curr->private, mode, wake_flags); 2910} 2911EXPORT_SYMBOL(default_wake_function); 2912 2913#ifdef CONFIG_RT_MUTEXES 2914 2915/* 2916 * rt_mutex_setprio - set the current priority of a task 2917 * @p: task 2918 * @prio: prio value (kernel-internal form) 2919 * 2920 * This function changes the 'effective' priority of a task. It does 2921 * not touch ->normal_prio like __setscheduler(). 2922 * 2923 * Used by the rt_mutex code to implement priority inheritance 2924 * logic. Call site only calls if the priority of the task changed. 2925 */ 2926void rt_mutex_setprio(struct task_struct *p, int prio) 2927{ 2928 int oldprio, on_rq, running, enqueue_flag = 0; 2929 struct rq *rq; 2930 const struct sched_class *prev_class; 2931 2932 BUG_ON(prio > MAX_PRIO); 2933 2934 rq = __task_rq_lock(p); 2935 2936 /* 2937 * Idle task boosting is a nono in general. There is one 2938 * exception, when PREEMPT_RT and NOHZ is active: 2939 * 2940 * The idle task calls get_next_timer_interrupt() and holds 2941 * the timer wheel base->lock on the CPU and another CPU wants 2942 * to access the timer (probably to cancel it). We can safely 2943 * ignore the boosting request, as the idle CPU runs this code 2944 * with interrupts disabled and will complete the lock 2945 * protected section without being interrupted. So there is no 2946 * real need to boost. 2947 */ 2948 if (unlikely(p == rq->idle)) { 2949 WARN_ON(p != rq->curr); 2950 WARN_ON(p->pi_blocked_on); 2951 goto out_unlock; 2952 } 2953 2954 trace_sched_pi_setprio(p, prio); 2955 p->pi_top_task = rt_mutex_get_top_task(p); 2956 oldprio = p->prio; 2957 prev_class = p->sched_class; 2958 on_rq = p->on_rq; 2959 running = task_current(rq, p); 2960 if (on_rq) 2961 dequeue_task(rq, p, 0); 2962 if (running) 2963 p->sched_class->put_prev_task(rq, p); 2964 2965 /* 2966 * Boosting condition are: 2967 * 1. -rt task is running and holds mutex A 2968 * --> -dl task blocks on mutex A 2969 * 2970 * 2. -dl task is running and holds mutex A 2971 * --> -dl task blocks on mutex A and could preempt the 2972 * running task 2973 */ 2974 if (dl_prio(prio)) { 2975 if (!dl_prio(p->normal_prio) || (p->pi_top_task && 2976 dl_entity_preempt(&p->pi_top_task->dl, &p->dl))) { 2977 p->dl.dl_boosted = 1; 2978 p->dl.dl_throttled = 0; 2979 enqueue_flag = ENQUEUE_REPLENISH; 2980 } else 2981 p->dl.dl_boosted = 0; 2982 p->sched_class = &dl_sched_class; 2983 } else if (rt_prio(prio)) { 2984 if (dl_prio(oldprio)) 2985 p->dl.dl_boosted = 0; 2986 if (oldprio < prio) 2987 enqueue_flag = ENQUEUE_HEAD; 2988 p->sched_class = &rt_sched_class; 2989 } else { 2990 if (dl_prio(oldprio)) 2991 p->dl.dl_boosted = 0; 2992 p->sched_class = &fair_sched_class; 2993 } 2994 2995 p->prio = prio; 2996 2997 if (running) 2998 p->sched_class->set_curr_task(rq); 2999 if (on_rq) 3000 enqueue_task(rq, p, enqueue_flag); 3001 3002 check_class_changed(rq, p, prev_class, oldprio); 3003out_unlock: 3004 __task_rq_unlock(rq); 3005} 3006#endif 3007 3008void set_user_nice(struct task_struct *p, long nice) 3009{ 3010 int old_prio, delta, on_rq; 3011 unsigned long flags; 3012 struct rq *rq; 3013 3014 if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) 3015 return; 3016 /* 3017 * We have to be careful, if called from sys_setpriority(), 3018 * the task might be in the middle of scheduling on another CPU. 3019 */ 3020 rq = task_rq_lock(p, &flags); 3021 /* 3022 * The RT priorities are set via sched_setscheduler(), but we still 3023 * allow the 'normal' nice value to be set - but as expected 3024 * it wont have any effect on scheduling until the task is 3025 * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR: 3026 */ 3027 if (task_has_dl_policy(p) || task_has_rt_policy(p)) { 3028 p->static_prio = NICE_TO_PRIO(nice); 3029 goto out_unlock; 3030 } 3031 on_rq = p->on_rq; 3032 if (on_rq) 3033 dequeue_task(rq, p, 0); 3034 3035 p->static_prio = NICE_TO_PRIO(nice); 3036 set_load_weight(p); 3037 old_prio = p->prio; 3038 p->prio = effective_prio(p); 3039 delta = p->prio - old_prio; 3040 3041 if (on_rq) { 3042 enqueue_task(rq, p, 0); 3043 /* 3044 * If the task increased its priority or is running and 3045 * lowered its priority, then reschedule its CPU: 3046 */ 3047 if (delta < 0 || (delta > 0 && task_running(rq, p))) 3048 resched_task(rq->curr); 3049 } 3050out_unlock: 3051 task_rq_unlock(rq, p, &flags); 3052} 3053EXPORT_SYMBOL(set_user_nice); 3054 3055/* 3056 * can_nice - check if a task can reduce its nice value 3057 * @p: task 3058 * @nice: nice value 3059 */ 3060int can_nice(const struct task_struct *p, const int nice) 3061{ 3062 /* convert nice value [19,-20] to rlimit style value [1,40] */ 3063 int nice_rlim = nice_to_rlimit(nice); 3064 3065 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || 3066 capable(CAP_SYS_NICE)); 3067} 3068 3069#ifdef __ARCH_WANT_SYS_NICE 3070 3071/* 3072 * sys_nice - change the priority of the current process. 3073 * @increment: priority increment 3074 * 3075 * sys_setpriority is a more generic, but much slower function that 3076 * does similar things. 3077 */ 3078SYSCALL_DEFINE1(nice, int, increment) 3079{ 3080 long nice, retval; 3081 3082 /* 3083 * Setpriority might change our priority at the same moment. 3084 * We don't have to worry. Conceptually one call occurs first 3085 * and we have a single winner. 3086 */ 3087 increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); 3088 nice = task_nice(current) + increment; 3089 3090 nice = clamp_val(nice, MIN_NICE, MAX_NICE); 3091 if (increment < 0 && !can_nice(current, nice)) 3092 return -EPERM; 3093 3094 retval = security_task_setnice(current, nice); 3095 if (retval) 3096 return retval; 3097 3098 set_user_nice(current, nice); 3099 return 0; 3100} 3101 3102#endif 3103 3104/** 3105 * task_prio - return the priority value of a given task. 3106 * @p: the task in question. 3107 * 3108 * Return: The priority value as seen by users in /proc. 3109 * RT tasks are offset by -200. Normal tasks are centered 3110 * around 0, value goes from -16 to +15. 3111 */ 3112int task_prio(const struct task_struct *p) 3113{ 3114 return p->prio - MAX_RT_PRIO; 3115} 3116 3117/** 3118 * idle_cpu - is a given cpu idle currently? 3119 * @cpu: the processor in question. 3120 * 3121 * Return: 1 if the CPU is currently idle. 0 otherwise. 3122 */ 3123int idle_cpu(int cpu) 3124{ 3125 struct rq *rq = cpu_rq(cpu); 3126 3127 if (rq->curr != rq->idle) 3128 return 0; 3129 3130 if (rq->nr_running) 3131 return 0; 3132 3133#ifdef CONFIG_SMP 3134 if (!llist_empty(&rq->wake_list)) 3135 return 0; 3136#endif 3137 3138 return 1; 3139} 3140 3141/** 3142 * idle_task - return the idle task for a given cpu. 3143 * @cpu: the processor in question. 3144 * 3145 * Return: The idle task for the cpu @cpu. 3146 */ 3147struct task_struct *idle_task(int cpu) 3148{ 3149 return cpu_rq(cpu)->idle; 3150} 3151 3152/** 3153 * find_process_by_pid - find a process with a matching PID value. 3154 * @pid: the pid in question. 3155 * 3156 * The task of @pid, if found. %NULL otherwise. 3157 */ 3158static struct task_struct *find_process_by_pid(pid_t pid) 3159{ 3160 return pid ? find_task_by_vpid(pid) : current; 3161} 3162 3163/* 3164 * This function initializes the sched_dl_entity of a newly becoming 3165 * SCHED_DEADLINE task. 3166 * 3167 * Only the static values are considered here, the actual runtime and the 3168 * absolute deadline will be properly calculated when the task is enqueued 3169 * for the first time with its new policy. 3170 */ 3171static void 3172__setparam_dl(struct task_struct *p, const struct sched_attr *attr) 3173{ 3174 struct sched_dl_entity *dl_se = &p->dl; 3175 3176 init_dl_task_timer(dl_se); 3177 dl_se->dl_runtime = attr->sched_runtime; 3178 dl_se->dl_deadline = attr->sched_deadline; 3179 dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; 3180 dl_se->flags = attr->sched_flags; 3181 dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); 3182 dl_se->dl_throttled = 0; 3183 dl_se->dl_new = 1; 3184 dl_se->dl_yielded = 0; 3185} 3186 3187static void __setscheduler_params(struct task_struct *p, 3188 const struct sched_attr *attr) 3189{ 3190 int policy = attr->sched_policy; 3191 3192 if (policy == -1) /* setparam */ 3193 policy = p->policy; 3194 3195 p->policy = policy; 3196 3197 if (dl_policy(policy)) 3198 __setparam_dl(p, attr); 3199 else if (fair_policy(policy)) 3200 p->static_prio = NICE_TO_PRIO(attr->sched_nice); 3201 3202 /* 3203 * __sched_setscheduler() ensures attr->sched_priority == 0 when 3204 * !rt_policy. Always setting this ensures that things like 3205 * getparam()/getattr() don't report silly values for !rt tasks. 3206 */ 3207 p->rt_priority = attr->sched_priority; 3208 p->normal_prio = normal_prio(p); 3209 set_load_weight(p); 3210} 3211 3212/* Actually do priority change: must hold pi & rq lock. */ 3213static void __setscheduler(struct rq *rq, struct task_struct *p, 3214 const struct sched_attr *attr) 3215{ 3216 __setscheduler_params(p, attr); 3217 3218 /* 3219 * If we get here, there was no pi waiters boosting the 3220 * task. It is safe to use the normal prio. 3221 */ 3222 p->prio = normal_prio(p); 3223 3224 if (dl_prio(p->prio)) 3225 p->sched_class = &dl_sched_class; 3226 else if (rt_prio(p->prio)) 3227 p->sched_class = &rt_sched_class; 3228 else 3229 p->sched_class = &fair_sched_class; 3230} 3231 3232static void 3233__getparam_dl(struct task_struct *p, struct sched_attr *attr) 3234{ 3235 struct sched_dl_entity *dl_se = &p->dl; 3236 3237 attr->sched_priority = p->rt_priority; 3238 attr->sched_runtime = dl_se->dl_runtime; 3239 attr->sched_deadline = dl_se->dl_deadline; 3240 attr->sched_period = dl_se->dl_period; 3241 attr->sched_flags = dl_se->flags; 3242} 3243 3244/* 3245 * This function validates the new parameters of a -deadline task. 3246 * We ask for the deadline not being zero, and greater or equal 3247 * than the runtime, as well as the period of being zero or 3248 * greater than deadline. Furthermore, we have to be sure that 3249 * user parameters are above the internal resolution of 1us (we 3250 * check sched_runtime only since it is always the smaller one) and 3251 * below 2^63 ns (we have to check both sched_deadline and 3252 * sched_period, as the latter can be zero). 3253 */ 3254static bool 3255__checkparam_dl(const struct sched_attr *attr) 3256{ 3257 /* deadline != 0 */ 3258 if (attr->sched_deadline == 0) 3259 return false; 3260 3261 /* 3262 * Since we truncate DL_SCALE bits, make sure we're at least 3263 * that big. 3264 */ 3265 if (attr->sched_runtime < (1ULL << DL_SCALE)) 3266 return false; 3267 3268 /* 3269 * Since we use the MSB for wrap-around and sign issues, make 3270 * sure it's not set (mind that period can be equal to zero). 3271 */ 3272 if (attr->sched_deadline & (1ULL << 63) || 3273 attr->sched_period & (1ULL << 63)) 3274 return false; 3275 3276 /* runtime <= deadline <= period (if period != 0) */ 3277 if ((attr->sched_period != 0 && 3278 attr->sched_period < attr->sched_deadline) || 3279 attr->sched_deadline < attr->sched_runtime) 3280 return false; 3281 3282 return true; 3283} 3284 3285/* 3286 * check the target process has a UID that matches the current process's 3287 */ 3288static bool check_same_owner(struct task_struct *p) 3289{ 3290 const struct cred *cred = current_cred(), *pcred; 3291 bool match; 3292 3293 rcu_read_lock(); 3294 pcred = __task_cred(p); 3295 match = (uid_eq(cred->euid, pcred->euid) || 3296 uid_eq(cred->euid, pcred->uid)); 3297 rcu_read_unlock(); 3298 return match; 3299} 3300 3301static int __sched_setscheduler(struct task_struct *p, 3302 const struct sched_attr *attr, 3303 bool user) 3304{ 3305 int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 : 3306 MAX_RT_PRIO - 1 - attr->sched_priority; 3307 int retval, oldprio, oldpolicy = -1, on_rq, running; 3308 int policy = attr->sched_policy; 3309 unsigned long flags; 3310 const struct sched_class *prev_class; 3311 struct rq *rq; 3312 int reset_on_fork; 3313 3314 /* may grab non-irq protected spin_locks */ 3315 BUG_ON(in_interrupt()); 3316recheck: 3317 /* double check policy once rq lock held */ 3318 if (policy < 0) { 3319 reset_on_fork = p->sched_reset_on_fork; 3320 policy = oldpolicy = p->policy; 3321 } else { 3322 reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK); 3323 3324 if (policy != SCHED_DEADLINE && 3325 policy != SCHED_FIFO && policy != SCHED_RR && 3326 policy != SCHED_NORMAL && policy != SCHED_BATCH && 3327 policy != SCHED_IDLE) 3328 return -EINVAL; 3329 } 3330 3331 if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK)) 3332 return -EINVAL; 3333 3334 /* 3335 * Valid priorities for SCHED_FIFO and SCHED_RR are 3336 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, 3337 * SCHED_BATCH and SCHED_IDLE is 0. 3338 */ 3339 if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) || 3340 (!p->mm && attr->sched_priority > MAX_RT_PRIO-1)) 3341 return -EINVAL; 3342 if ((dl_policy(policy) && !__checkparam_dl(attr)) || 3343 (rt_policy(policy) != (attr->sched_priority != 0))) 3344 return -EINVAL; 3345 3346 /* 3347 * Allow unprivileged RT tasks to decrease priority: 3348 */ 3349 if (user && !capable(CAP_SYS_NICE)) { 3350 if (fair_policy(policy)) { 3351 if (attr->sched_nice < task_nice(p) && 3352 !can_nice(p, attr->sched_nice)) 3353 return -EPERM; 3354 } 3355 3356 if (rt_policy(policy)) { 3357 unsigned long rlim_rtprio = 3358 task_rlimit(p, RLIMIT_RTPRIO); 3359 3360 /* can't set/change the rt policy */ 3361 if (policy != p->policy && !rlim_rtprio) 3362 return -EPERM; 3363 3364 /* can't increase priority */ 3365 if (attr->sched_priority > p->rt_priority && 3366 attr->sched_priority > rlim_rtprio) 3367 return -EPERM; 3368 } 3369 3370 /* 3371 * Can't set/change SCHED_DEADLINE policy at all for now 3372 * (safest behavior); in the future we would like to allow 3373 * unprivileged DL tasks to increase their relative deadline 3374 * or reduce their runtime (both ways reducing utilization) 3375 */ 3376 if (dl_policy(policy)) 3377 return -EPERM; 3378 3379 /* 3380 * Treat SCHED_IDLE as nice 20. Only allow a switch to 3381 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. 3382 */ 3383 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { 3384 if (!can_nice(p, task_nice(p))) 3385 return -EPERM; 3386 } 3387 3388 /* can't change other user's priorities */ 3389 if (!check_same_owner(p)) 3390 return -EPERM; 3391 3392 /* Normal users shall not reset the sched_reset_on_fork flag */ 3393 if (p->sched_reset_on_fork && !reset_on_fork) 3394 return -EPERM; 3395 } 3396 3397 if (user) { 3398 retval = security_task_setscheduler(p); 3399 if (retval) 3400 return retval; 3401 } 3402 3403 /* 3404 * make sure no PI-waiters arrive (or leave) while we are 3405 * changing the priority of the task: 3406 * 3407 * To be able to change p->policy safely, the appropriate 3408 * runqueue lock must be held. 3409 */ 3410 rq = task_rq_lock(p, &flags); 3411 3412 /* 3413 * Changing the policy of the stop threads its a very bad idea 3414 */ 3415 if (p == rq->stop) { 3416 task_rq_unlock(rq, p, &flags); 3417 return -EINVAL; 3418 } 3419 3420 /* 3421 * If not changing anything there's no need to proceed further, 3422 * but store a possible modification of reset_on_fork. 3423 */ 3424 if (unlikely(policy == p->policy)) { 3425 if (fair_policy(policy) && attr->sched_nice != task_nice(p)) 3426 goto change; 3427 if (rt_policy(policy) && attr->sched_priority != p->rt_priority) 3428 goto change; 3429 if (dl_policy(policy)) 3430 goto change; 3431 3432 p->sched_reset_on_fork = reset_on_fork; 3433 task_rq_unlock(rq, p, &flags); 3434 return 0; 3435 } 3436change: 3437 3438 if (user) { 3439#ifdef CONFIG_RT_GROUP_SCHED 3440 /* 3441 * Do not allow realtime tasks into groups that have no runtime 3442 * assigned. 3443 */ 3444 if (rt_bandwidth_enabled() && rt_policy(policy) && 3445 task_group(p)->rt_bandwidth.rt_runtime == 0 && 3446 !task_group_is_autogroup(task_group(p))) { 3447 task_rq_unlock(rq, p, &flags); 3448 return -EPERM; 3449 } 3450#endif 3451#ifdef CONFIG_SMP 3452 if (dl_bandwidth_enabled() && dl_policy(policy)) { 3453 cpumask_t *span = rq->rd->span; 3454 3455 /* 3456 * Don't allow tasks with an affinity mask smaller than 3457 * the entire root_domain to become SCHED_DEADLINE. We 3458 * will also fail if there's no bandwidth available. 3459 */ 3460 if (!cpumask_subset(span, &p->cpus_allowed) || 3461 rq->rd->dl_bw.bw == 0) { 3462 task_rq_unlock(rq, p, &flags); 3463 return -EPERM; 3464 } 3465 } 3466#endif 3467 } 3468 3469 /* recheck policy now with rq lock held */ 3470 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 3471 policy = oldpolicy = -1; 3472 task_rq_unlock(rq, p, &flags); 3473 goto recheck; 3474 } 3475 3476 /* 3477 * If setscheduling to SCHED_DEADLINE (or changing the parameters 3478 * of a SCHED_DEADLINE task) we need to check if enough bandwidth 3479 * is available. 3480 */ 3481 if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) { 3482 task_rq_unlock(rq, p, &flags); 3483 return -EBUSY; 3484 } 3485 3486 p->sched_reset_on_fork = reset_on_fork; 3487 oldprio = p->prio; 3488 3489 /* 3490 * Special case for priority boosted tasks. 3491 * 3492 * If the new priority is lower or equal (user space view) 3493 * than the current (boosted) priority, we just store the new 3494 * normal parameters and do not touch the scheduler class and 3495 * the runqueue. This will be done when the task deboost 3496 * itself. 3497 */ 3498 if (rt_mutex_check_prio(p, newprio)) { 3499 __setscheduler_params(p, attr); 3500 task_rq_unlock(rq, p, &flags); 3501 return 0; 3502 } 3503 3504 on_rq = p->on_rq; 3505 running = task_current(rq, p); 3506 if (on_rq) 3507 dequeue_task(rq, p, 0); 3508 if (running) 3509 p->sched_class->put_prev_task(rq, p); 3510 3511 prev_class = p->sched_class; 3512 __setscheduler(rq, p, attr); 3513 3514 if (running) 3515 p->sched_class->set_curr_task(rq); 3516 if (on_rq) { 3517 /* 3518 * We enqueue to tail when the priority of a task is 3519 * increased (user space view). 3520 */ 3521 enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0); 3522 } 3523 3524 check_class_changed(rq, p, prev_class, oldprio); 3525 task_rq_unlock(rq, p, &flags); 3526 3527 rt_mutex_adjust_pi(p); 3528 3529 return 0; 3530} 3531 3532static int _sched_setscheduler(struct task_struct *p, int policy, 3533 const struct sched_param *param, bool check) 3534{ 3535 struct sched_attr attr = { 3536 .sched_policy = policy, 3537 .sched_priority = param->sched_priority, 3538 .sched_nice = PRIO_TO_NICE(p->static_prio), 3539 }; 3540 3541 /* 3542 * Fixup the legacy SCHED_RESET_ON_FORK hack 3543 */ 3544 if (policy & SCHED_RESET_ON_FORK) { 3545 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; 3546 policy &= ~SCHED_RESET_ON_FORK; 3547 attr.sched_policy = policy; 3548 } 3549 3550 return __sched_setscheduler(p, &attr, check); 3551} 3552/** 3553 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. 3554 * @p: the task in question. 3555 * @policy: new policy. 3556 * @param: structure containing the new RT priority. 3557 * 3558 * Return: 0 on success. An error code otherwise. 3559 * 3560 * NOTE that the task may be already dead. 3561 */ 3562int sched_setscheduler(struct task_struct *p, int policy, 3563 const struct sched_param *param) 3564{ 3565 return _sched_setscheduler(p, policy, param, true); 3566} 3567EXPORT_SYMBOL_GPL(sched_setscheduler); 3568 3569int sched_setattr(struct task_struct *p, const struct sched_attr *attr) 3570{ 3571 return __sched_setscheduler(p, attr, true); 3572} 3573EXPORT_SYMBOL_GPL(sched_setattr); 3574 3575/** 3576 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. 3577 * @p: the task in question. 3578 * @policy: new policy. 3579 * @param: structure containing the new RT priority. 3580 * 3581 * Just like sched_setscheduler, only don't bother checking if the 3582 * current context has permission. For example, this is needed in 3583 * stop_machine(): we create temporary high priority worker threads, 3584 * but our caller might not have that capability. 3585 * 3586 * Return: 0 on success. An error code otherwise. 3587 */ 3588int sched_setscheduler_nocheck(struct task_struct *p, int policy, 3589 const struct sched_param *param) 3590{ 3591 return _sched_setscheduler(p, policy, param, false); 3592} 3593 3594static int 3595do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 3596{ 3597 struct sched_param lparam; 3598 struct task_struct *p; 3599 int retval; 3600 3601 if (!param || pid < 0) 3602 return -EINVAL; 3603 if (copy_from_user(&lparam, param, sizeof(struct sched_param))) 3604 return -EFAULT; 3605 3606 rcu_read_lock(); 3607 retval = -ESRCH; 3608 p = find_process_by_pid(pid); 3609 if (p != NULL) 3610 retval = sched_setscheduler(p, policy, &lparam); 3611 rcu_read_unlock(); 3612 3613 return retval; 3614} 3615 3616/* 3617 * Mimics kernel/events/core.c perf_copy_attr(). 3618 */ 3619static int sched_copy_attr(struct sched_attr __user *uattr, 3620 struct sched_attr *attr) 3621{ 3622 u32 size; 3623 int ret; 3624 3625 if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0)) 3626 return -EFAULT; 3627 3628 /* 3629 * zero the full structure, so that a short copy will be nice. 3630 */ 3631 memset(attr, 0, sizeof(*attr)); 3632 3633 ret = get_user(size, &uattr->size); 3634 if (ret) 3635 return ret; 3636 3637 if (size > PAGE_SIZE) /* silly large */ 3638 goto err_size; 3639 3640 if (!size) /* abi compat */ 3641 size = SCHED_ATTR_SIZE_VER0; 3642 3643 if (size < SCHED_ATTR_SIZE_VER0) 3644 goto err_size; 3645 3646 /* 3647 * If we're handed a bigger struct than we know of, 3648 * ensure all the unknown bits are 0 - i.e. new 3649 * user-space does not rely on any kernel feature 3650 * extensions we dont know about yet. 3651 */ 3652 if (size > sizeof(*attr)) { 3653 unsigned char __user *addr; 3654 unsigned char __user *end; 3655 unsigned char val; 3656 3657 addr = (void __user *)uattr + sizeof(*attr); 3658 end = (void __user *)uattr + size; 3659 3660 for (; addr < end; addr++) { 3661 ret = get_user(val, addr); 3662 if (ret) 3663 return ret; 3664 if (val) 3665 goto err_size; 3666 } 3667 size = sizeof(*attr); 3668 } 3669 3670 ret = copy_from_user(attr, uattr, size); 3671 if (ret) 3672 return -EFAULT; 3673 3674 /* 3675 * XXX: do we want to be lenient like existing syscalls; or do we want 3676 * to be strict and return an error on out-of-bounds values? 3677 */ 3678 attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE); 3679 3680 return 0; 3681 3682err_size: 3683 put_user(sizeof(*attr), &uattr->size); 3684 return -E2BIG; 3685} 3686 3687/** 3688 * sys_sched_setscheduler - set/change the scheduler policy and RT priority 3689 * @pid: the pid in question. 3690 * @policy: new policy. 3691 * @param: structure containing the new RT priority. 3692 * 3693 * Return: 0 on success. An error code otherwise. 3694 */ 3695SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, 3696 struct sched_param __user *, param) 3697{ 3698 /* negative values for policy are not valid */ 3699 if (policy < 0) 3700 return -EINVAL; 3701 3702 return do_sched_setscheduler(pid, policy, param); 3703} 3704 3705/** 3706 * sys_sched_setparam - set/change the RT priority of a thread 3707 * @pid: the pid in question. 3708 * @param: structure containing the new RT priority. 3709 * 3710 * Return: 0 on success. An error code otherwise. 3711 */ 3712SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) 3713{ 3714 return do_sched_setscheduler(pid, -1, param); 3715} 3716 3717/** 3718 * sys_sched_setattr - same as above, but with extended sched_attr 3719 * @pid: the pid in question. 3720 * @uattr: structure containing the extended parameters. 3721 * @flags: for future extension. 3722 */ 3723SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, 3724 unsigned int, flags) 3725{ 3726 struct sched_attr attr; 3727 struct task_struct *p; 3728 int retval; 3729 3730 if (!uattr || pid < 0 || flags) 3731 return -EINVAL; 3732 3733 retval = sched_copy_attr(uattr, &attr); 3734 if (retval) 3735 return retval; 3736 3737 if (attr.sched_policy < 0) 3738 return -EINVAL; 3739 3740 rcu_read_lock(); 3741 retval = -ESRCH; 3742 p = find_process_by_pid(pid); 3743 if (p != NULL) 3744 retval = sched_setattr(p, &attr); 3745 rcu_read_unlock(); 3746 3747 return retval; 3748} 3749 3750/** 3751 * sys_sched_getscheduler - get the policy (scheduling class) of a thread 3752 * @pid: the pid in question. 3753 * 3754 * Return: On success, the policy of the thread. Otherwise, a negative error 3755 * code. 3756 */ 3757SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) 3758{ 3759 struct task_struct *p; 3760 int retval; 3761 3762 if (pid < 0) 3763 return -EINVAL; 3764 3765 retval = -ESRCH; 3766 rcu_read_lock(); 3767 p = find_process_by_pid(pid); 3768 if (p) { 3769 retval = security_task_getscheduler(p); 3770 if (!retval) 3771 retval = p->policy 3772 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); 3773 } 3774 rcu_read_unlock(); 3775 return retval; 3776} 3777 3778/** 3779 * sys_sched_getparam - get the RT priority of a thread 3780 * @pid: the pid in question. 3781 * @param: structure containing the RT priority. 3782 * 3783 * Return: On success, 0 and the RT priority is in @param. Otherwise, an error 3784 * code. 3785 */ 3786SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) 3787{ 3788 struct sched_param lp = { .sched_priority = 0 }; 3789 struct task_struct *p; 3790 int retval; 3791 3792 if (!param || pid < 0) 3793 return -EINVAL; 3794 3795 rcu_read_lock(); 3796 p = find_process_by_pid(pid); 3797 retval = -ESRCH; 3798 if (!p) 3799 goto out_unlock; 3800 3801 retval = security_task_getscheduler(p); 3802 if (retval) 3803 goto out_unlock; 3804 3805 if (task_has_rt_policy(p)) 3806 lp.sched_priority = p->rt_priority; 3807 rcu_read_unlock(); 3808 3809 /* 3810 * This one might sleep, we cannot do it with a spinlock held ... 3811 */ 3812 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; 3813 3814 return retval; 3815 3816out_unlock: 3817 rcu_read_unlock(); 3818 return retval; 3819} 3820 3821static int sched_read_attr(struct sched_attr __user *uattr, 3822 struct sched_attr *attr, 3823 unsigned int usize) 3824{ 3825 int ret; 3826 3827 if (!access_ok(VERIFY_WRITE, uattr, usize)) 3828 return -EFAULT; 3829 3830 /* 3831 * If we're handed a smaller struct than we know of, 3832 * ensure all the unknown bits are 0 - i.e. old 3833 * user-space does not get uncomplete information. 3834 */ 3835 if (usize < sizeof(*attr)) { 3836 unsigned char *addr; 3837 unsigned char *end; 3838 3839 addr = (void *)attr + usize; 3840 end = (void *)attr + sizeof(*attr); 3841 3842 for (; addr < end; addr++) { 3843 if (*addr) 3844 return -EFBIG; 3845 } 3846 3847 attr->size = usize; 3848 } 3849 3850 ret = copy_to_user(uattr, attr, attr->size); 3851 if (ret) 3852 return -EFAULT; 3853 3854 return 0; 3855} 3856 3857/** 3858 * sys_sched_getattr - similar to sched_getparam, but with sched_attr 3859 * @pid: the pid in question. 3860 * @uattr: structure containing the extended parameters. 3861 * @size: sizeof(attr) for fwd/bwd comp. 3862 * @flags: for future extension. 3863 */ 3864SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, 3865 unsigned int, size, unsigned int, flags) 3866{ 3867 struct sched_attr attr = { 3868 .size = sizeof(struct sched_attr), 3869 }; 3870 struct task_struct *p; 3871 int retval; 3872 3873 if (!uattr || pid < 0 || size > PAGE_SIZE || 3874 size < SCHED_ATTR_SIZE_VER0 || flags) 3875 return -EINVAL; 3876 3877 rcu_read_lock(); 3878 p = find_process_by_pid(pid); 3879 retval = -ESRCH; 3880 if (!p) 3881 goto out_unlock; 3882 3883 retval = security_task_getscheduler(p); 3884 if (retval) 3885 goto out_unlock; 3886 3887 attr.sched_policy = p->policy; 3888 if (p->sched_reset_on_fork) 3889 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; 3890 if (task_has_dl_policy(p)) 3891 __getparam_dl(p, &attr); 3892 else if (task_has_rt_policy(p)) 3893 attr.sched_priority = p->rt_priority; 3894 else 3895 attr.sched_nice = task_nice(p); 3896 3897 rcu_read_unlock(); 3898 3899 retval = sched_read_attr(uattr, &attr, size); 3900 return retval; 3901 3902out_unlock: 3903 rcu_read_unlock(); 3904 return retval; 3905} 3906 3907long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) 3908{ 3909 cpumask_var_t cpus_allowed, new_mask; 3910 struct task_struct *p; 3911 int retval; 3912 3913 rcu_read_lock(); 3914 3915 p = find_process_by_pid(pid); 3916 if (!p) { 3917 rcu_read_unlock(); 3918 return -ESRCH; 3919 } 3920 3921 /* Prevent p going away */ 3922 get_task_struct(p); 3923 rcu_read_unlock(); 3924 3925 if (p->flags & PF_NO_SETAFFINITY) { 3926 retval = -EINVAL; 3927 goto out_put_task; 3928 } 3929 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { 3930 retval = -ENOMEM; 3931 goto out_put_task; 3932 } 3933 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { 3934 retval = -ENOMEM; 3935 goto out_free_cpus_allowed; 3936 } 3937 retval = -EPERM; 3938 if (!check_same_owner(p)) { 3939 rcu_read_lock(); 3940 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { 3941 rcu_read_unlock(); 3942 goto out_unlock; 3943 } 3944 rcu_read_unlock(); 3945 } 3946 3947 retval = security_task_setscheduler(p); 3948 if (retval) 3949 goto out_unlock; 3950 3951 3952 cpuset_cpus_allowed(p, cpus_allowed); 3953 cpumask_and(new_mask, in_mask, cpus_allowed); 3954 3955 /* 3956 * Since bandwidth control happens on root_domain basis, 3957 * if admission test is enabled, we only admit -deadline 3958 * tasks allowed to run on all the CPUs in the task's 3959 * root_domain. 3960 */ 3961#ifdef CONFIG_SMP 3962 if (task_has_dl_policy(p)) { 3963 const struct cpumask *span = task_rq(p)->rd->span; 3964 3965 if (dl_bandwidth_enabled() && !cpumask_subset(span, new_mask)) { 3966 retval = -EBUSY; 3967 goto out_unlock; 3968 } 3969 } 3970#endif 3971again: 3972 retval = set_cpus_allowed_ptr(p, new_mask); 3973 3974 if (!retval) { 3975 cpuset_cpus_allowed(p, cpus_allowed); 3976 if (!cpumask_subset(new_mask, cpus_allowed)) { 3977 /* 3978 * We must have raced with a concurrent cpuset 3979 * update. Just reset the cpus_allowed to the 3980 * cpuset's cpus_allowed 3981 */ 3982 cpumask_copy(new_mask, cpus_allowed); 3983 goto again; 3984 } 3985 } 3986out_unlock: 3987 free_cpumask_var(new_mask); 3988out_free_cpus_allowed: 3989 free_cpumask_var(cpus_allowed); 3990out_put_task: 3991 put_task_struct(p); 3992 return retval; 3993} 3994 3995static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, 3996 struct cpumask *new_mask) 3997{ 3998 if (len < cpumask_size()) 3999 cpumask_clear(new_mask); 4000 else if (len > cpumask_size()) 4001 len = cpumask_size(); 4002 4003 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; 4004} 4005 4006/** 4007 * sys_sched_setaffinity - set the cpu affinity of a process 4008 * @pid: pid of the process 4009 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 4010 * @user_mask_ptr: user-space pointer to the new cpu mask 4011 * 4012 * Return: 0 on success. An error code otherwise. 4013 */ 4014SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, 4015 unsigned long __user *, user_mask_ptr) 4016{ 4017 cpumask_var_t new_mask; 4018 int retval; 4019 4020 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) 4021 return -ENOMEM; 4022 4023 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); 4024 if (retval == 0) 4025 retval = sched_setaffinity(pid, new_mask); 4026 free_cpumask_var(new_mask); 4027 return retval; 4028} 4029 4030long sched_getaffinity(pid_t pid, struct cpumask *mask) 4031{ 4032 struct task_struct *p; 4033 unsigned long flags; 4034 int retval; 4035 4036 rcu_read_lock(); 4037 4038 retval = -ESRCH; 4039 p = find_process_by_pid(pid); 4040 if (!p) 4041 goto out_unlock; 4042 4043 retval = security_task_getscheduler(p); 4044 if (retval) 4045 goto out_unlock; 4046 4047 raw_spin_lock_irqsave(&p->pi_lock, flags); 4048 cpumask_and(mask, &p->cpus_allowed, cpu_active_mask); 4049 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 4050 4051out_unlock: 4052 rcu_read_unlock(); 4053 4054 return retval; 4055} 4056 4057/** 4058 * sys_sched_getaffinity - get the cpu affinity of a process 4059 * @pid: pid of the process 4060 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 4061 * @user_mask_ptr: user-space pointer to hold the current cpu mask 4062 * 4063 * Return: 0 on success. An error code otherwise. 4064 */ 4065SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, 4066 unsigned long __user *, user_mask_ptr) 4067{ 4068 int ret; 4069 cpumask_var_t mask; 4070 4071 if ((len * BITS_PER_BYTE) < nr_cpu_ids) 4072 return -EINVAL; 4073 if (len & (sizeof(unsigned long)-1)) 4074 return -EINVAL; 4075 4076 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 4077 return -ENOMEM; 4078 4079 ret = sched_getaffinity(pid, mask); 4080 if (ret == 0) { 4081 size_t retlen = min_t(size_t, len, cpumask_size()); 4082 4083 if (copy_to_user(user_mask_ptr, mask, retlen)) 4084 ret = -EFAULT; 4085 else 4086 ret = retlen; 4087 } 4088 free_cpumask_var(mask); 4089 4090 return ret; 4091} 4092 4093/** 4094 * sys_sched_yield - yield the current processor to other threads. 4095 * 4096 * This function yields the current CPU to other tasks. If there are no 4097 * other threads running on this CPU then this function will return. 4098 * 4099 * Return: 0. 4100 */ 4101SYSCALL_DEFINE0(sched_yield) 4102{ 4103 struct rq *rq = this_rq_lock(); 4104 4105 schedstat_inc(rq, yld_count); 4106 current->sched_class->yield_task(rq); 4107 4108 /* 4109 * Since we are going to call schedule() anyway, there's 4110 * no need to preempt or enable interrupts: 4111 */ 4112 __release(rq->lock); 4113 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 4114 do_raw_spin_unlock(&rq->lock); 4115 sched_preempt_enable_no_resched(); 4116 4117 schedule(); 4118 4119 return 0; 4120} 4121 4122static void __cond_resched(void) 4123{ 4124 __preempt_count_add(PREEMPT_ACTIVE); 4125 __schedule(); 4126 __preempt_count_sub(PREEMPT_ACTIVE); 4127} 4128 4129int __sched _cond_resched(void) 4130{ 4131 if (should_resched()) { 4132 __cond_resched(); 4133 return 1; 4134 } 4135 return 0; 4136} 4137EXPORT_SYMBOL(_cond_resched); 4138 4139/* 4140 * __cond_resched_lock() - if a reschedule is pending, drop the given lock, 4141 * call schedule, and on return reacquire the lock. 4142 * 4143 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level 4144 * operations here to prevent schedule() from being called twice (once via 4145 * spin_unlock(), once by hand). 4146 */ 4147int __cond_resched_lock(spinlock_t *lock) 4148{ 4149 int resched = should_resched(); 4150 int ret = 0; 4151 4152 lockdep_assert_held(lock); 4153 4154 if (spin_needbreak(lock) || resched) { 4155 spin_unlock(lock); 4156 if (resched) 4157 __cond_resched(); 4158 else 4159 cpu_relax(); 4160 ret = 1; 4161 spin_lock(lock); 4162 } 4163 return ret; 4164} 4165EXPORT_SYMBOL(__cond_resched_lock); 4166 4167int __sched __cond_resched_softirq(void) 4168{ 4169 BUG_ON(!in_softirq()); 4170 4171 if (should_resched()) { 4172 local_bh_enable(); 4173 __cond_resched(); 4174 local_bh_disable(); 4175 return 1; 4176 } 4177 return 0; 4178} 4179EXPORT_SYMBOL(__cond_resched_softirq); 4180 4181/** 4182 * yield - yield the current processor to other threads. 4183 * 4184 * Do not ever use this function, there's a 99% chance you're doing it wrong. 4185 * 4186 * The scheduler is at all times free to pick the calling task as the most 4187 * eligible task to run, if removing the yield() call from your code breaks 4188 * it, its already broken. 4189 * 4190 * Typical broken usage is: 4191 * 4192 * while (!event) 4193 * yield(); 4194 * 4195 * where one assumes that yield() will let 'the other' process run that will 4196 * make event true. If the current task is a SCHED_FIFO task that will never 4197 * happen. Never use yield() as a progress guarantee!! 4198 * 4199 * If you want to use yield() to wait for something, use wait_event(). 4200 * If you want to use yield() to be 'nice' for others, use cond_resched(). 4201 * If you still want to use yield(), do not! 4202 */ 4203void __sched yield(void) 4204{ 4205 set_current_state(TASK_RUNNING); 4206 sys_sched_yield(); 4207} 4208EXPORT_SYMBOL(yield); 4209 4210/** 4211 * yield_to - yield the current processor to another thread in 4212 * your thread group, or accelerate that thread toward the 4213 * processor it's on. 4214 * @p: target task 4215 * @preempt: whether task preemption is allowed or not 4216 * 4217 * It's the caller's job to ensure that the target task struct 4218 * can't go away on us before we can do any checks. 4219 * 4220 * Return: 4221 * true (>0) if we indeed boosted the target task. 4222 * false (0) if we failed to boost the target. 4223 * -ESRCH if there's no task to yield to. 4224 */ 4225int __sched yield_to(struct task_struct *p, bool preempt) 4226{ 4227 struct task_struct *curr = current; 4228 struct rq *rq, *p_rq; 4229 unsigned long flags; 4230 int yielded = 0; 4231 4232 local_irq_save(flags); 4233 rq = this_rq(); 4234 4235again: 4236 p_rq = task_rq(p); 4237 /* 4238 * If we're the only runnable task on the rq and target rq also 4239 * has only one task, there's absolutely no point in yielding. 4240 */ 4241 if (rq->nr_running == 1 && p_rq->nr_running == 1) { 4242 yielded = -ESRCH; 4243 goto out_irq; 4244 } 4245 4246 double_rq_lock(rq, p_rq); 4247 if (task_rq(p) != p_rq) { 4248 double_rq_unlock(rq, p_rq); 4249 goto again; 4250 } 4251 4252 if (!curr->sched_class->yield_to_task) 4253 goto out_unlock; 4254 4255 if (curr->sched_class != p->sched_class) 4256 goto out_unlock; 4257 4258 if (task_running(p_rq, p) || p->state) 4259 goto out_unlock; 4260 4261 yielded = curr->sched_class->yield_to_task(rq, p, preempt); 4262 if (yielded) { 4263 schedstat_inc(rq, yld_count); 4264 /* 4265 * Make p's CPU reschedule; pick_next_entity takes care of 4266 * fairness. 4267 */ 4268 if (preempt && rq != p_rq) 4269 resched_task(p_rq->curr); 4270 } 4271 4272out_unlock: 4273 double_rq_unlock(rq, p_rq); 4274out_irq: 4275 local_irq_restore(flags); 4276 4277 if (yielded > 0) 4278 schedule(); 4279 4280 return yielded; 4281} 4282EXPORT_SYMBOL_GPL(yield_to); 4283 4284/* 4285 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 4286 * that process accounting knows that this is a task in IO wait state. 4287 */ 4288void __sched io_schedule(void) 4289{ 4290 struct rq *rq = raw_rq(); 4291 4292 delayacct_blkio_start(); 4293 atomic_inc(&rq->nr_iowait); 4294 blk_flush_plug(current); 4295 current->in_iowait = 1; 4296 schedule(); 4297 current->in_iowait = 0; 4298 atomic_dec(&rq->nr_iowait); 4299 delayacct_blkio_end(); 4300} 4301EXPORT_SYMBOL(io_schedule); 4302 4303long __sched io_schedule_timeout(long timeout) 4304{ 4305 struct rq *rq = raw_rq(); 4306 long ret; 4307 4308 delayacct_blkio_start(); 4309 atomic_inc(&rq->nr_iowait); 4310 blk_flush_plug(current); 4311 current->in_iowait = 1; 4312 ret = schedule_timeout(timeout); 4313 current->in_iowait = 0; 4314 atomic_dec(&rq->nr_iowait); 4315 delayacct_blkio_end(); 4316 return ret; 4317} 4318 4319/** 4320 * sys_sched_get_priority_max - return maximum RT priority. 4321 * @policy: scheduling class. 4322 * 4323 * Return: On success, this syscall returns the maximum 4324 * rt_priority that can be used by a given scheduling class. 4325 * On failure, a negative error code is returned. 4326 */ 4327SYSCALL_DEFINE1(sched_get_priority_max, int, policy) 4328{ 4329 int ret = -EINVAL; 4330 4331 switch (policy) { 4332 case SCHED_FIFO: 4333 case SCHED_RR: 4334 ret = MAX_USER_RT_PRIO-1; 4335 break; 4336 case SCHED_DEADLINE: 4337 case SCHED_NORMAL: 4338 case SCHED_BATCH: 4339 case SCHED_IDLE: 4340 ret = 0; 4341 break; 4342 } 4343 return ret; 4344} 4345 4346/** 4347 * sys_sched_get_priority_min - return minimum RT priority. 4348 * @policy: scheduling class. 4349 * 4350 * Return: On success, this syscall returns the minimum 4351 * rt_priority that can be used by a given scheduling class. 4352 * On failure, a negative error code is returned. 4353 */ 4354SYSCALL_DEFINE1(sched_get_priority_min, int, policy) 4355{ 4356 int ret = -EINVAL; 4357 4358 switch (policy) { 4359 case SCHED_FIFO: 4360 case SCHED_RR: 4361 ret = 1; 4362 break; 4363 case SCHED_DEADLINE: 4364 case SCHED_NORMAL: 4365 case SCHED_BATCH: 4366 case SCHED_IDLE: 4367 ret = 0; 4368 } 4369 return ret; 4370} 4371 4372/** 4373 * sys_sched_rr_get_interval - return the default timeslice of a process. 4374 * @pid: pid of the process. 4375 * @interval: userspace pointer to the timeslice value. 4376 * 4377 * this syscall writes the default timeslice value of a given process 4378 * into the user-space timespec buffer. A value of '0' means infinity. 4379 * 4380 * Return: On success, 0 and the timeslice is in @interval. Otherwise, 4381 * an error code. 4382 */ 4383SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, 4384 struct timespec __user *, interval) 4385{ 4386 struct task_struct *p; 4387 unsigned int time_slice; 4388 unsigned long flags; 4389 struct rq *rq; 4390 int retval; 4391 struct timespec t; 4392 4393 if (pid < 0) 4394 return -EINVAL; 4395 4396 retval = -ESRCH; 4397 rcu_read_lock(); 4398 p = find_process_by_pid(pid); 4399 if (!p) 4400 goto out_unlock; 4401 4402 retval = security_task_getscheduler(p); 4403 if (retval) 4404 goto out_unlock; 4405 4406 rq = task_rq_lock(p, &flags); 4407 time_slice = 0; 4408 if (p->sched_class->get_rr_interval) 4409 time_slice = p->sched_class->get_rr_interval(rq, p); 4410 task_rq_unlock(rq, p, &flags); 4411 4412 rcu_read_unlock(); 4413 jiffies_to_timespec(time_slice, &t); 4414 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 4415 return retval; 4416 4417out_unlock: 4418 rcu_read_unlock(); 4419 return retval; 4420} 4421 4422static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; 4423 4424void sched_show_task(struct task_struct *p) 4425{ 4426 unsigned long free = 0; 4427 int ppid; 4428 unsigned state; 4429 4430 state = p->state ? __ffs(p->state) + 1 : 0; 4431 printk(KERN_INFO "%-15.15s %c", p->comm, 4432 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 4433#if BITS_PER_LONG == 32 4434 if (state == TASK_RUNNING) 4435 printk(KERN_CONT " running "); 4436 else 4437 printk(KERN_CONT " %08lx ", thread_saved_pc(p)); 4438#else 4439 if (state == TASK_RUNNING) 4440 printk(KERN_CONT " running task "); 4441 else 4442 printk(KERN_CONT " %016lx ", thread_saved_pc(p)); 4443#endif 4444#ifdef CONFIG_DEBUG_STACK_USAGE 4445 free = stack_not_used(p); 4446#endif 4447 rcu_read_lock(); 4448 ppid = task_pid_nr(rcu_dereference(p->real_parent)); 4449 rcu_read_unlock(); 4450 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, 4451 task_pid_nr(p), ppid, 4452 (unsigned long)task_thread_info(p)->flags); 4453 4454 print_worker_info(KERN_INFO, p); 4455 show_stack(p, NULL); 4456} 4457 4458void show_state_filter(unsigned long state_filter) 4459{ 4460 struct task_struct *g, *p; 4461 4462#if BITS_PER_LONG == 32 4463 printk(KERN_INFO 4464 " task PC stack pid father\n"); 4465#else 4466 printk(KERN_INFO 4467 " task PC stack pid father\n"); 4468#endif 4469 rcu_read_lock(); 4470 do_each_thread(g, p) { 4471 /* 4472 * reset the NMI-timeout, listing all files on a slow 4473 * console might take a lot of time: 4474 */ 4475 touch_nmi_watchdog(); 4476 if (!state_filter || (p->state & state_filter)) 4477 sched_show_task(p); 4478 } while_each_thread(g, p); 4479 4480 touch_all_softlockup_watchdogs(); 4481 4482#ifdef CONFIG_SCHED_DEBUG 4483 sysrq_sched_debug_show(); 4484#endif 4485 rcu_read_unlock(); 4486 /* 4487 * Only show locks if all tasks are dumped: 4488 */ 4489 if (!state_filter) 4490 debug_show_all_locks(); 4491} 4492 4493void init_idle_bootup_task(struct task_struct *idle) 4494{ 4495 idle->sched_class = &idle_sched_class; 4496} 4497 4498/** 4499 * init_idle - set up an idle thread for a given CPU 4500 * @idle: task in question 4501 * @cpu: cpu the idle task belongs to 4502 * 4503 * NOTE: this function does not set the idle thread's NEED_RESCHED 4504 * flag, to make booting more robust. 4505 */ 4506void init_idle(struct task_struct *idle, int cpu) 4507{ 4508 struct rq *rq = cpu_rq(cpu); 4509 unsigned long flags; 4510 4511 raw_spin_lock_irqsave(&rq->lock, flags); 4512 4513 __sched_fork(0, idle); 4514 idle->state = TASK_RUNNING; 4515 idle->se.exec_start = sched_clock(); 4516 4517 do_set_cpus_allowed(idle, cpumask_of(cpu)); 4518 /* 4519 * We're having a chicken and egg problem, even though we are 4520 * holding rq->lock, the cpu isn't yet set to this cpu so the 4521 * lockdep check in task_group() will fail. 4522 * 4523 * Similar case to sched_fork(). / Alternatively we could 4524 * use task_rq_lock() here and obtain the other rq->lock. 4525 * 4526 * Silence PROVE_RCU 4527 */ 4528 rcu_read_lock(); 4529 __set_task_cpu(idle, cpu); 4530 rcu_read_unlock(); 4531 4532 rq->curr = rq->idle = idle; 4533 idle->on_rq = 1; 4534#if defined(CONFIG_SMP) 4535 idle->on_cpu = 1; 4536#endif 4537 raw_spin_unlock_irqrestore(&rq->lock, flags); 4538 4539 /* Set the preempt count _outside_ the spinlocks! */ 4540 init_idle_preempt_count(idle, cpu); 4541 4542 /* 4543 * The idle tasks have their own, simple scheduling class: 4544 */ 4545 idle->sched_class = &idle_sched_class; 4546 ftrace_graph_init_idle_task(idle, cpu); 4547 vtime_init_idle(idle, cpu); 4548#if defined(CONFIG_SMP) 4549 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); 4550#endif 4551} 4552 4553#ifdef CONFIG_SMP 4554void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) 4555{ 4556 if (p->sched_class && p->sched_class->set_cpus_allowed) 4557 p->sched_class->set_cpus_allowed(p, new_mask); 4558 4559 cpumask_copy(&p->cpus_allowed, new_mask); 4560 p->nr_cpus_allowed = cpumask_weight(new_mask); 4561} 4562 4563/* 4564 * This is how migration works: 4565 * 4566 * 1) we invoke migration_cpu_stop() on the target CPU using 4567 * stop_one_cpu(). 4568 * 2) stopper starts to run (implicitly forcing the migrated thread 4569 * off the CPU) 4570 * 3) it checks whether the migrated task is still in the wrong runqueue. 4571 * 4) if it's in the wrong runqueue then the migration thread removes 4572 * it and puts it into the right queue. 4573 * 5) stopper completes and stop_one_cpu() returns and the migration 4574 * is done. 4575 */ 4576 4577/* 4578 * Change a given task's CPU affinity. Migrate the thread to a 4579 * proper CPU and schedule it away if the CPU it's executing on 4580 * is removed from the allowed bitmask. 4581 * 4582 * NOTE: the caller must have a valid reference to the task, the 4583 * task must not exit() & deallocate itself prematurely. The 4584 * call is not atomic; no spinlocks may be held. 4585 */ 4586int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) 4587{ 4588 unsigned long flags; 4589 struct rq *rq; 4590 unsigned int dest_cpu; 4591 int ret = 0; 4592 4593 rq = task_rq_lock(p, &flags); 4594 4595 if (cpumask_equal(&p->cpus_allowed, new_mask)) 4596 goto out; 4597 4598 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 4599 ret = -EINVAL; 4600 goto out; 4601 } 4602 4603 do_set_cpus_allowed(p, new_mask); 4604 4605 /* Can the task run on the task's current CPU? If so, we're done */ 4606 if (cpumask_test_cpu(task_cpu(p), new_mask)) 4607 goto out; 4608 4609 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); 4610 if (p->on_rq) { 4611 struct migration_arg arg = { p, dest_cpu }; 4612 /* Need help from migration thread: drop lock and wait. */ 4613 task_rq_unlock(rq, p, &flags); 4614 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 4615 tlb_migrate_finish(p->mm); 4616 return 0; 4617 } 4618out: 4619 task_rq_unlock(rq, p, &flags); 4620 4621 return ret; 4622} 4623EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); 4624 4625/* 4626 * Move (not current) task off this cpu, onto dest cpu. We're doing 4627 * this because either it can't run here any more (set_cpus_allowed() 4628 * away from this CPU, or CPU going down), or because we're 4629 * attempting to rebalance this task on exec (sched_exec). 4630 * 4631 * So we race with normal scheduler movements, but that's OK, as long 4632 * as the task is no longer on this CPU. 4633 * 4634 * Returns non-zero if task was successfully migrated. 4635 */ 4636static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 4637{ 4638 struct rq *rq_dest, *rq_src; 4639 int ret = 0; 4640 4641 if (unlikely(!cpu_active(dest_cpu))) 4642 return ret; 4643 4644 rq_src = cpu_rq(src_cpu); 4645 rq_dest = cpu_rq(dest_cpu); 4646 4647 raw_spin_lock(&p->pi_lock); 4648 double_rq_lock(rq_src, rq_dest); 4649 /* Already moved. */ 4650 if (task_cpu(p) != src_cpu) 4651 goto done; 4652 /* Affinity changed (again). */ 4653 if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) 4654 goto fail; 4655 4656 /* 4657 * If we're not on a rq, the next wake-up will ensure we're 4658 * placed properly. 4659 */ 4660 if (p->on_rq) { 4661 dequeue_task(rq_src, p, 0); 4662 set_task_cpu(p, dest_cpu); 4663 enqueue_task(rq_dest, p, 0); 4664 check_preempt_curr(rq_dest, p, 0); 4665 } 4666done: 4667 ret = 1; 4668fail: 4669 double_rq_unlock(rq_src, rq_dest); 4670 raw_spin_unlock(&p->pi_lock); 4671 return ret; 4672} 4673 4674#ifdef CONFIG_NUMA_BALANCING 4675/* Migrate current task p to target_cpu */ 4676int migrate_task_to(struct task_struct *p, int target_cpu) 4677{ 4678 struct migration_arg arg = { p, target_cpu }; 4679 int curr_cpu = task_cpu(p); 4680 4681 if (curr_cpu == target_cpu) 4682 return 0; 4683 4684 if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p))) 4685 return -EINVAL; 4686 4687 /* TODO: This is not properly updating schedstats */ 4688 4689 trace_sched_move_numa(p, curr_cpu, target_cpu); 4690 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); 4691} 4692 4693/* 4694 * Requeue a task on a given node and accurately track the number of NUMA 4695 * tasks on the runqueues 4696 */ 4697void sched_setnuma(struct task_struct *p, int nid) 4698{ 4699 struct rq *rq; 4700 unsigned long flags; 4701 bool on_rq, running; 4702 4703 rq = task_rq_lock(p, &flags); 4704 on_rq = p->on_rq; 4705 running = task_current(rq, p); 4706 4707 if (on_rq) 4708 dequeue_task(rq, p, 0); 4709 if (running) 4710 p->sched_class->put_prev_task(rq, p); 4711 4712 p->numa_preferred_nid = nid; 4713 4714 if (running) 4715 p->sched_class->set_curr_task(rq); 4716 if (on_rq) 4717 enqueue_task(rq, p, 0); 4718 task_rq_unlock(rq, p, &flags); 4719} 4720#endif 4721 4722/* 4723 * migration_cpu_stop - this will be executed by a highprio stopper thread 4724 * and performs thread migration by bumping thread off CPU then 4725 * 'pushing' onto another runqueue. 4726 */ 4727static int migration_cpu_stop(void *data) 4728{ 4729 struct migration_arg *arg = data; 4730 4731 /* 4732 * The original target cpu might have gone down and we might 4733 * be on another cpu but it doesn't matter. 4734 */ 4735 local_irq_disable(); 4736 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu); 4737 local_irq_enable(); 4738 return 0; 4739} 4740 4741#ifdef CONFIG_HOTPLUG_CPU 4742 4743/* 4744 * Ensures that the idle task is using init_mm right before its cpu goes 4745 * offline. 4746 */ 4747void idle_task_exit(void) 4748{ 4749 struct mm_struct *mm = current->active_mm; 4750 4751 BUG_ON(cpu_online(smp_processor_id())); 4752 4753 if (mm != &init_mm) { 4754 switch_mm(mm, &init_mm, current); 4755 finish_arch_post_lock_switch(); 4756 } 4757 mmdrop(mm); 4758} 4759 4760/* 4761 * Since this CPU is going 'away' for a while, fold any nr_active delta 4762 * we might have. Assumes we're called after migrate_tasks() so that the 4763 * nr_active count is stable. 4764 * 4765 * Also see the comment "Global load-average calculations". 4766 */ 4767static void calc_load_migrate(struct rq *rq) 4768{ 4769 long delta = calc_load_fold_active(rq); 4770 if (delta) 4771 atomic_long_add(delta, &calc_load_tasks); 4772} 4773 4774static void put_prev_task_fake(struct rq *rq, struct task_struct *prev) 4775{ 4776} 4777 4778static const struct sched_class fake_sched_class = { 4779 .put_prev_task = put_prev_task_fake, 4780}; 4781 4782static struct task_struct fake_task = { 4783 /* 4784 * Avoid pull_{rt,dl}_task() 4785 */ 4786 .prio = MAX_PRIO + 1, 4787 .sched_class = &fake_sched_class, 4788}; 4789 4790/* 4791 * Migrate all tasks from the rq, sleeping tasks will be migrated by 4792 * try_to_wake_up()->select_task_rq(). 4793 * 4794 * Called with rq->lock held even though we'er in stop_machine() and 4795 * there's no concurrency possible, we hold the required locks anyway 4796 * because of lock validation efforts. 4797 */ 4798static void migrate_tasks(unsigned int dead_cpu) 4799{ 4800 struct rq *rq = cpu_rq(dead_cpu); 4801 struct task_struct *next, *stop = rq->stop; 4802 int dest_cpu; 4803 4804 /* 4805 * Fudge the rq selection such that the below task selection loop 4806 * doesn't get stuck on the currently eligible stop task. 4807 * 4808 * We're currently inside stop_machine() and the rq is either stuck 4809 * in the stop_machine_cpu_stop() loop, or we're executing this code, 4810 * either way we should never end up calling schedule() until we're 4811 * done here. 4812 */ 4813 rq->stop = NULL; 4814 4815 /* 4816 * put_prev_task() and pick_next_task() sched 4817 * class method both need to have an up-to-date 4818 * value of rq->clock[_task] 4819 */ 4820 update_rq_clock(rq); 4821 4822 for ( ; ; ) { 4823 /* 4824 * There's this thread running, bail when that's the only 4825 * remaining thread. 4826 */ 4827 if (rq->nr_running == 1) 4828 break; 4829 4830 next = pick_next_task(rq, &fake_task); 4831 BUG_ON(!next); 4832 next->sched_class->put_prev_task(rq, next); 4833 4834 /* Find suitable destination for @next, with force if needed. */ 4835 dest_cpu = select_fallback_rq(dead_cpu, next); 4836 raw_spin_unlock(&rq->lock); 4837 4838 __migrate_task(next, dead_cpu, dest_cpu); 4839 4840 raw_spin_lock(&rq->lock); 4841 } 4842 4843 rq->stop = stop; 4844} 4845 4846#endif /* CONFIG_HOTPLUG_CPU */ 4847 4848#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) 4849 4850static struct ctl_table sd_ctl_dir[] = { 4851 { 4852 .procname = "sched_domain", 4853 .mode = 0555, 4854 }, 4855 {} 4856}; 4857 4858static struct ctl_table sd_ctl_root[] = { 4859 { 4860 .procname = "kernel", 4861 .mode = 0555, 4862 .child = sd_ctl_dir, 4863 }, 4864 {} 4865}; 4866 4867static struct ctl_table *sd_alloc_ctl_entry(int n) 4868{ 4869 struct ctl_table *entry = 4870 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); 4871 4872 return entry; 4873} 4874 4875static void sd_free_ctl_entry(struct ctl_table **tablep) 4876{ 4877 struct ctl_table *entry; 4878 4879 /* 4880 * In the intermediate directories, both the child directory and 4881 * procname are dynamically allocated and could fail but the mode 4882 * will always be set. In the lowest directory the names are 4883 * static strings and all have proc handlers. 4884 */ 4885 for (entry = *tablep; entry->mode; entry++) { 4886 if (entry->child) 4887 sd_free_ctl_entry(&entry->child); 4888 if (entry->proc_handler == NULL) 4889 kfree(entry->procname); 4890 } 4891 4892 kfree(*tablep); 4893 *tablep = NULL; 4894} 4895 4896static int min_load_idx = 0; 4897static int max_load_idx = CPU_LOAD_IDX_MAX-1; 4898 4899static void 4900set_table_entry(struct ctl_table *entry, 4901 const char *procname, void *data, int maxlen, 4902 umode_t mode, proc_handler *proc_handler, 4903 bool load_idx) 4904{ 4905 entry->procname = procname; 4906 entry->data = data; 4907 entry->maxlen = maxlen; 4908 entry->mode = mode; 4909 entry->proc_handler = proc_handler; 4910 4911 if (load_idx) { 4912 entry->extra1 = &min_load_idx; 4913 entry->extra2 = &max_load_idx; 4914 } 4915} 4916 4917static struct ctl_table * 4918sd_alloc_ctl_domain_table(struct sched_domain *sd) 4919{ 4920 struct ctl_table *table = sd_alloc_ctl_entry(14); 4921 4922 if (table == NULL) 4923 return NULL; 4924 4925 set_table_entry(&table[0], "min_interval", &sd->min_interval, 4926 sizeof(long), 0644, proc_doulongvec_minmax, false); 4927 set_table_entry(&table[1], "max_interval", &sd->max_interval, 4928 sizeof(long), 0644, proc_doulongvec_minmax, false); 4929 set_table_entry(&table[2], "busy_idx", &sd->busy_idx, 4930 sizeof(int), 0644, proc_dointvec_minmax, true); 4931 set_table_entry(&table[3], "idle_idx", &sd->idle_idx, 4932 sizeof(int), 0644, proc_dointvec_minmax, true); 4933 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, 4934 sizeof(int), 0644, proc_dointvec_minmax, true); 4935 set_table_entry(&table[5], "wake_idx", &sd->wake_idx, 4936 sizeof(int), 0644, proc_dointvec_minmax, true); 4937 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, 4938 sizeof(int), 0644, proc_dointvec_minmax, true); 4939 set_table_entry(&table[7], "busy_factor", &sd->busy_factor, 4940 sizeof(int), 0644, proc_dointvec_minmax, false); 4941 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, 4942 sizeof(int), 0644, proc_dointvec_minmax, false); 4943 set_table_entry(&table[9], "cache_nice_tries", 4944 &sd->cache_nice_tries, 4945 sizeof(int), 0644, proc_dointvec_minmax, false); 4946 set_table_entry(&table[10], "flags", &sd->flags, 4947 sizeof(int), 0644, proc_dointvec_minmax, false); 4948 set_table_entry(&table[11], "max_newidle_lb_cost", 4949 &sd->max_newidle_lb_cost, 4950 sizeof(long), 0644, proc_doulongvec_minmax, false); 4951 set_table_entry(&table[12], "name", sd->name, 4952 CORENAME_MAX_SIZE, 0444, proc_dostring, false); 4953 /* &table[13] is terminator */ 4954 4955 return table; 4956} 4957 4958static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) 4959{ 4960 struct ctl_table *entry, *table; 4961 struct sched_domain *sd; 4962 int domain_num = 0, i; 4963 char buf[32]; 4964 4965 for_each_domain(cpu, sd) 4966 domain_num++; 4967 entry = table = sd_alloc_ctl_entry(domain_num + 1); 4968 if (table == NULL) 4969 return NULL; 4970 4971 i = 0; 4972 for_each_domain(cpu, sd) { 4973 snprintf(buf, 32, "domain%d", i); 4974 entry->procname = kstrdup(buf, GFP_KERNEL); 4975 entry->mode = 0555; 4976 entry->child = sd_alloc_ctl_domain_table(sd); 4977 entry++; 4978 i++; 4979 } 4980 return table; 4981} 4982 4983static struct ctl_table_header *sd_sysctl_header; 4984static void register_sched_domain_sysctl(void) 4985{ 4986 int i, cpu_num = num_possible_cpus(); 4987 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); 4988 char buf[32]; 4989 4990 WARN_ON(sd_ctl_dir[0].child); 4991 sd_ctl_dir[0].child = entry; 4992 4993 if (entry == NULL) 4994 return; 4995 4996 for_each_possible_cpu(i) { 4997 snprintf(buf, 32, "cpu%d", i); 4998 entry->procname = kstrdup(buf, GFP_KERNEL); 4999 entry->mode = 0555; 5000 entry->child = sd_alloc_ctl_cpu_table(i); 5001 entry++; 5002 } 5003 5004 WARN_ON(sd_sysctl_header); 5005 sd_sysctl_header = register_sysctl_table(sd_ctl_root); 5006} 5007 5008/* may be called multiple times per register */ 5009static void unregister_sched_domain_sysctl(void) 5010{ 5011 if (sd_sysctl_header) 5012 unregister_sysctl_table(sd_sysctl_header); 5013 sd_sysctl_header = NULL; 5014 if (sd_ctl_dir[0].child) 5015 sd_free_ctl_entry(&sd_ctl_dir[0].child); 5016} 5017#else 5018static void register_sched_domain_sysctl(void) 5019{ 5020} 5021static void unregister_sched_domain_sysctl(void) 5022{ 5023} 5024#endif 5025 5026static void set_rq_online(struct rq *rq) 5027{ 5028 if (!rq->online) { 5029 const struct sched_class *class; 5030 5031 cpumask_set_cpu(rq->cpu, rq->rd->online); 5032 rq->online = 1; 5033 5034 for_each_class(class) { 5035 if (class->rq_online) 5036 class->rq_online(rq); 5037 } 5038 } 5039} 5040 5041static void set_rq_offline(struct rq *rq) 5042{ 5043 if (rq->online) { 5044 const struct sched_class *class; 5045 5046 for_each_class(class) { 5047 if (class->rq_offline) 5048 class->rq_offline(rq); 5049 } 5050 5051 cpumask_clear_cpu(rq->cpu, rq->rd->online); 5052 rq->online = 0; 5053 } 5054} 5055 5056/* 5057 * migration_call - callback that gets triggered when a CPU is added. 5058 * Here we can start up the necessary migration thread for the new CPU. 5059 */ 5060static int 5061migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) 5062{ 5063 int cpu = (long)hcpu; 5064 unsigned long flags; 5065 struct rq *rq = cpu_rq(cpu); 5066 5067 switch (action & ~CPU_TASKS_FROZEN) { 5068 5069 case CPU_UP_PREPARE: 5070 rq->calc_load_update = calc_load_update; 5071 break; 5072 5073 case CPU_ONLINE: 5074 /* Update our root-domain */ 5075 raw_spin_lock_irqsave(&rq->lock, flags); 5076 if (rq->rd) { 5077 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5078 5079 set_rq_online(rq); 5080 } 5081 raw_spin_unlock_irqrestore(&rq->lock, flags); 5082 break; 5083 5084#ifdef CONFIG_HOTPLUG_CPU 5085 case CPU_DYING: 5086 sched_ttwu_pending(); 5087 /* Update our root-domain */ 5088 raw_spin_lock_irqsave(&rq->lock, flags); 5089 if (rq->rd) { 5090 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5091 set_rq_offline(rq); 5092 } 5093 migrate_tasks(cpu); 5094 BUG_ON(rq->nr_running != 1); /* the migration thread */ 5095 raw_spin_unlock_irqrestore(&rq->lock, flags); 5096 break; 5097 5098 case CPU_DEAD: 5099 calc_load_migrate(rq); 5100 break; 5101#endif 5102 } 5103 5104 update_max_interval(); 5105 5106 return NOTIFY_OK; 5107} 5108 5109/* 5110 * Register at high priority so that task migration (migrate_all_tasks) 5111 * happens before everything else. This has to be lower priority than 5112 * the notifier in the perf_event subsystem, though. 5113 */ 5114static struct notifier_block migration_notifier = { 5115 .notifier_call = migration_call, 5116 .priority = CPU_PRI_MIGRATION, 5117}; 5118 5119static void __cpuinit set_cpu_rq_start_time(void) 5120{ 5121 int cpu = smp_processor_id(); 5122 struct rq *rq = cpu_rq(cpu); 5123 rq->age_stamp = sched_clock_cpu(cpu); 5124} 5125 5126static int sched_cpu_active(struct notifier_block *nfb, 5127 unsigned long action, void *hcpu) 5128{ 5129 switch (action & ~CPU_TASKS_FROZEN) { 5130 case CPU_STARTING: 5131 set_cpu_rq_start_time(); 5132 return NOTIFY_OK; 5133 case CPU_DOWN_FAILED: 5134 set_cpu_active((long)hcpu, true); 5135 return NOTIFY_OK; 5136 default: 5137 return NOTIFY_DONE; 5138 } 5139} 5140 5141static int sched_cpu_inactive(struct notifier_block *nfb, 5142 unsigned long action, void *hcpu) 5143{ 5144 unsigned long flags; 5145 long cpu = (long)hcpu; 5146 5147 switch (action & ~CPU_TASKS_FROZEN) { 5148 case CPU_DOWN_PREPARE: 5149 set_cpu_active(cpu, false); 5150 5151 /* explicitly allow suspend */ 5152 if (!(action & CPU_TASKS_FROZEN)) { 5153 struct dl_bw *dl_b = dl_bw_of(cpu); 5154 bool overflow; 5155 int cpus; 5156 5157 raw_spin_lock_irqsave(&dl_b->lock, flags); 5158 cpus = dl_bw_cpus(cpu); 5159 overflow = __dl_overflow(dl_b, cpus, 0, 0); 5160 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 5161 5162 if (overflow) 5163 return notifier_from_errno(-EBUSY); 5164 } 5165 return NOTIFY_OK; 5166 } 5167 5168 return NOTIFY_DONE; 5169} 5170 5171static int __init migration_init(void) 5172{ 5173 void *cpu = (void *)(long)smp_processor_id(); 5174 int err; 5175 5176 /* Initialize migration for the boot CPU */ 5177 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); 5178 BUG_ON(err == NOTIFY_BAD); 5179 migration_call(&migration_notifier, CPU_ONLINE, cpu); 5180 register_cpu_notifier(&migration_notifier); 5181 5182 /* Register cpu active notifiers */ 5183 cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE); 5184 cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE); 5185 5186 return 0; 5187} 5188early_initcall(migration_init); 5189#endif 5190 5191#ifdef CONFIG_SMP 5192 5193static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ 5194 5195#ifdef CONFIG_SCHED_DEBUG 5196 5197static __read_mostly int sched_debug_enabled; 5198 5199static int __init sched_debug_setup(char *str) 5200{ 5201 sched_debug_enabled = 1; 5202 5203 return 0; 5204} 5205early_param("sched_debug", sched_debug_setup); 5206 5207static inline bool sched_debug(void) 5208{ 5209 return sched_debug_enabled; 5210} 5211 5212static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 5213 struct cpumask *groupmask) 5214{ 5215 struct sched_group *group = sd->groups; 5216 char str[256]; 5217 5218 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd)); 5219 cpumask_clear(groupmask); 5220 5221 printk(KERN_DEBUG "%*s domain %d: ", level, "", level); 5222 5223 if (!(sd->flags & SD_LOAD_BALANCE)) { 5224 printk("does not load-balance\n"); 5225 if (sd->parent) 5226 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" 5227 " has parent"); 5228 return -1; 5229 } 5230 5231 printk(KERN_CONT "span %s level %s\n", str, sd->name); 5232 5233 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { 5234 printk(KERN_ERR "ERROR: domain->span does not contain " 5235 "CPU%d\n", cpu); 5236 } 5237 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { 5238 printk(KERN_ERR "ERROR: domain->groups does not contain" 5239 " CPU%d\n", cpu); 5240 } 5241 5242 printk(KERN_DEBUG "%*s groups:", level + 1, ""); 5243 do { 5244 if (!group) { 5245 printk("\n"); 5246 printk(KERN_ERR "ERROR: group is NULL\n"); 5247 break; 5248 } 5249 5250 /* 5251 * Even though we initialize ->capacity to something semi-sane, 5252 * we leave capacity_orig unset. This allows us to detect if 5253 * domain iteration is still funny without causing /0 traps. 5254 */ 5255 if (!group->sgc->capacity_orig) { 5256 printk(KERN_CONT "\n"); 5257 printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n"); 5258 break; 5259 } 5260 5261 if (!cpumask_weight(sched_group_cpus(group))) { 5262 printk(KERN_CONT "\n"); 5263 printk(KERN_ERR "ERROR: empty group\n"); 5264 break; 5265 } 5266 5267 if (!(sd->flags & SD_OVERLAP) && 5268 cpumask_intersects(groupmask, sched_group_cpus(group))) { 5269 printk(KERN_CONT "\n"); 5270 printk(KERN_ERR "ERROR: repeated CPUs\n"); 5271 break; 5272 } 5273 5274 cpumask_or(groupmask, groupmask, sched_group_cpus(group)); 5275 5276 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 5277 5278 printk(KERN_CONT " %s", str); 5279 if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { 5280 printk(KERN_CONT " (cpu_capacity = %d)", 5281 group->sgc->capacity); 5282 } 5283 5284 group = group->next; 5285 } while (group != sd->groups); 5286 printk(KERN_CONT "\n"); 5287 5288 if (!cpumask_equal(sched_domain_span(sd), groupmask)) 5289 printk(KERN_ERR "ERROR: groups don't span domain->span\n"); 5290 5291 if (sd->parent && 5292 !cpumask_subset(groupmask, sched_domain_span(sd->parent))) 5293 printk(KERN_ERR "ERROR: parent span is not a superset " 5294 "of domain->span\n"); 5295 return 0; 5296} 5297 5298static void sched_domain_debug(struct sched_domain *sd, int cpu) 5299{ 5300 int level = 0; 5301 5302 if (!sched_debug_enabled) 5303 return; 5304 5305 if (!sd) { 5306 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); 5307 return; 5308 } 5309 5310 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 5311 5312 for (;;) { 5313 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) 5314 break; 5315 level++; 5316 sd = sd->parent; 5317 if (!sd) 5318 break; 5319 } 5320} 5321#else /* !CONFIG_SCHED_DEBUG */ 5322# define sched_domain_debug(sd, cpu) do { } while (0) 5323static inline bool sched_debug(void) 5324{ 5325 return false; 5326} 5327#endif /* CONFIG_SCHED_DEBUG */ 5328 5329static int sd_degenerate(struct sched_domain *sd) 5330{ 5331 if (cpumask_weight(sched_domain_span(sd)) == 1) 5332 return 1; 5333 5334 /* Following flags need at least 2 groups */ 5335 if (sd->flags & (SD_LOAD_BALANCE | 5336 SD_BALANCE_NEWIDLE | 5337 SD_BALANCE_FORK | 5338 SD_BALANCE_EXEC | 5339 SD_SHARE_CPUCAPACITY | 5340 SD_SHARE_PKG_RESOURCES | 5341 SD_SHARE_POWERDOMAIN)) { 5342 if (sd->groups != sd->groups->next) 5343 return 0; 5344 } 5345 5346 /* Following flags don't use groups */ 5347 if (sd->flags & (SD_WAKE_AFFINE)) 5348 return 0; 5349 5350 return 1; 5351} 5352 5353static int 5354sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) 5355{ 5356 unsigned long cflags = sd->flags, pflags = parent->flags; 5357 5358 if (sd_degenerate(parent)) 5359 return 1; 5360 5361 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) 5362 return 0; 5363 5364 /* Flags needing groups don't count if only 1 group in parent */ 5365 if (parent->groups == parent->groups->next) { 5366 pflags &= ~(SD_LOAD_BALANCE | 5367 SD_BALANCE_NEWIDLE | 5368 SD_BALANCE_FORK | 5369 SD_BALANCE_EXEC | 5370 SD_SHARE_CPUCAPACITY | 5371 SD_SHARE_PKG_RESOURCES | 5372 SD_PREFER_SIBLING | 5373 SD_SHARE_POWERDOMAIN); 5374 if (nr_node_ids == 1) 5375 pflags &= ~SD_SERIALIZE; 5376 } 5377 if (~cflags & pflags) 5378 return 0; 5379 5380 return 1; 5381} 5382 5383static void free_rootdomain(struct rcu_head *rcu) 5384{ 5385 struct root_domain *rd = container_of(rcu, struct root_domain, rcu); 5386 5387 cpupri_cleanup(&rd->cpupri); 5388 cpudl_cleanup(&rd->cpudl); 5389 free_cpumask_var(rd->dlo_mask); 5390 free_cpumask_var(rd->rto_mask); 5391 free_cpumask_var(rd->online); 5392 free_cpumask_var(rd->span); 5393 kfree(rd); 5394} 5395 5396static void rq_attach_root(struct rq *rq, struct root_domain *rd) 5397{ 5398 struct root_domain *old_rd = NULL; 5399 unsigned long flags; 5400 5401 raw_spin_lock_irqsave(&rq->lock, flags); 5402 5403 if (rq->rd) { 5404 old_rd = rq->rd; 5405 5406 if (cpumask_test_cpu(rq->cpu, old_rd->online)) 5407 set_rq_offline(rq); 5408 5409 cpumask_clear_cpu(rq->cpu, old_rd->span); 5410 5411 /* 5412 * If we dont want to free the old_rd yet then 5413 * set old_rd to NULL to skip the freeing later 5414 * in this function: 5415 */ 5416 if (!atomic_dec_and_test(&old_rd->refcount)) 5417 old_rd = NULL; 5418 } 5419 5420 atomic_inc(&rd->refcount); 5421 rq->rd = rd; 5422 5423 cpumask_set_cpu(rq->cpu, rd->span); 5424 if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) 5425 set_rq_online(rq); 5426 5427 raw_spin_unlock_irqrestore(&rq->lock, flags); 5428 5429 if (old_rd) 5430 call_rcu_sched(&old_rd->rcu, free_rootdomain); 5431} 5432 5433static int init_rootdomain(struct root_domain *rd) 5434{ 5435 memset(rd, 0, sizeof(*rd)); 5436 5437 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) 5438 goto out; 5439 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) 5440 goto free_span; 5441 if (!alloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL)) 5442 goto free_online; 5443 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) 5444 goto free_dlo_mask; 5445 5446 init_dl_bw(&rd->dl_bw); 5447 if (cpudl_init(&rd->cpudl) != 0) 5448 goto free_dlo_mask; 5449 5450 if (cpupri_init(&rd->cpupri) != 0) 5451 goto free_rto_mask; 5452 return 0; 5453 5454free_rto_mask: 5455 free_cpumask_var(rd->rto_mask); 5456free_dlo_mask: 5457 free_cpumask_var(rd->dlo_mask); 5458free_online: 5459 free_cpumask_var(rd->online); 5460free_span: 5461 free_cpumask_var(rd->span); 5462out: 5463 return -ENOMEM; 5464} 5465 5466/* 5467 * By default the system creates a single root-domain with all cpus as 5468 * members (mimicking the global state we have today). 5469 */ 5470struct root_domain def_root_domain; 5471 5472static void init_defrootdomain(void) 5473{ 5474 init_rootdomain(&def_root_domain); 5475 5476 atomic_set(&def_root_domain.refcount, 1); 5477} 5478 5479static struct root_domain *alloc_rootdomain(void) 5480{ 5481 struct root_domain *rd; 5482 5483 rd = kmalloc(sizeof(*rd), GFP_KERNEL); 5484 if (!rd) 5485 return NULL; 5486 5487 if (init_rootdomain(rd) != 0) { 5488 kfree(rd); 5489 return NULL; 5490 } 5491 5492 return rd; 5493} 5494 5495static void free_sched_groups(struct sched_group *sg, int free_sgc) 5496{ 5497 struct sched_group *tmp, *first; 5498 5499 if (!sg) 5500 return; 5501 5502 first = sg; 5503 do { 5504 tmp = sg->next; 5505 5506 if (free_sgc && atomic_dec_and_test(&sg->sgc->ref)) 5507 kfree(sg->sgc); 5508 5509 kfree(sg); 5510 sg = tmp; 5511 } while (sg != first); 5512} 5513 5514static void free_sched_domain(struct rcu_head *rcu) 5515{ 5516 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); 5517 5518 /* 5519 * If its an overlapping domain it has private groups, iterate and 5520 * nuke them all. 5521 */ 5522 if (sd->flags & SD_OVERLAP) { 5523 free_sched_groups(sd->groups, 1); 5524 } else if (atomic_dec_and_test(&sd->groups->ref)) { 5525 kfree(sd->groups->sgc); 5526 kfree(sd->groups); 5527 } 5528 kfree(sd); 5529} 5530 5531static void destroy_sched_domain(struct sched_domain *sd, int cpu) 5532{ 5533 call_rcu(&sd->rcu, free_sched_domain); 5534} 5535 5536static void destroy_sched_domains(struct sched_domain *sd, int cpu) 5537{ 5538 for (; sd; sd = sd->parent) 5539 destroy_sched_domain(sd, cpu); 5540} 5541 5542/* 5543 * Keep a special pointer to the highest sched_domain that has 5544 * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this 5545 * allows us to avoid some pointer chasing select_idle_sibling(). 5546 * 5547 * Also keep a unique ID per domain (we use the first cpu number in 5548 * the cpumask of the domain), this allows us to quickly tell if 5549 * two cpus are in the same cache domain, see cpus_share_cache(). 5550 */ 5551DEFINE_PER_CPU(struct sched_domain *, sd_llc); 5552DEFINE_PER_CPU(int, sd_llc_size); 5553DEFINE_PER_CPU(int, sd_llc_id); 5554DEFINE_PER_CPU(struct sched_domain *, sd_numa); 5555DEFINE_PER_CPU(struct sched_domain *, sd_busy); 5556DEFINE_PER_CPU(struct sched_domain *, sd_asym); 5557 5558static void update_top_cache_domain(int cpu) 5559{ 5560 struct sched_domain *sd; 5561 struct sched_domain *busy_sd = NULL; 5562 int id = cpu; 5563 int size = 1; 5564 5565 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); 5566 if (sd) { 5567 id = cpumask_first(sched_domain_span(sd)); 5568 size = cpumask_weight(sched_domain_span(sd)); 5569 busy_sd = sd->parent; /* sd_busy */ 5570 } 5571 rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd); 5572 5573 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); 5574 per_cpu(sd_llc_size, cpu) = size; 5575 per_cpu(sd_llc_id, cpu) = id; 5576 5577 sd = lowest_flag_domain(cpu, SD_NUMA); 5578 rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); 5579 5580 sd = highest_flag_domain(cpu, SD_ASYM_PACKING); 5581 rcu_assign_pointer(per_cpu(sd_asym, cpu), sd); 5582} 5583 5584/* 5585 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 5586 * hold the hotplug lock. 5587 */ 5588static void 5589cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) 5590{ 5591 struct rq *rq = cpu_rq(cpu); 5592 struct sched_domain *tmp; 5593 5594 /* Remove the sched domains which do not contribute to scheduling. */ 5595 for (tmp = sd; tmp; ) { 5596 struct sched_domain *parent = tmp->parent; 5597 if (!parent) 5598 break; 5599 5600 if (sd_parent_degenerate(tmp, parent)) { 5601 tmp->parent = parent->parent; 5602 if (parent->parent) 5603 parent->parent->child = tmp; 5604 /* 5605 * Transfer SD_PREFER_SIBLING down in case of a 5606 * degenerate parent; the spans match for this 5607 * so the property transfers. 5608 */ 5609 if (parent->flags & SD_PREFER_SIBLING) 5610 tmp->flags |= SD_PREFER_SIBLING; 5611 destroy_sched_domain(parent, cpu); 5612 } else 5613 tmp = tmp->parent; 5614 } 5615 5616 if (sd && sd_degenerate(sd)) { 5617 tmp = sd; 5618 sd = sd->parent; 5619 destroy_sched_domain(tmp, cpu); 5620 if (sd) 5621 sd->child = NULL; 5622 } 5623 5624 sched_domain_debug(sd, cpu); 5625 5626 rq_attach_root(rq, rd); 5627 tmp = rq->sd; 5628 rcu_assign_pointer(rq->sd, sd); 5629 destroy_sched_domains(tmp, cpu); 5630 5631 update_top_cache_domain(cpu); 5632} 5633 5634/* cpus with isolated domains */ 5635static cpumask_var_t cpu_isolated_map; 5636 5637/* Setup the mask of cpus configured for isolated domains */ 5638static int __init isolated_cpu_setup(char *str) 5639{ 5640 alloc_bootmem_cpumask_var(&cpu_isolated_map); 5641 cpulist_parse(str, cpu_isolated_map); 5642 return 1; 5643} 5644 5645__setup("isolcpus=", isolated_cpu_setup); 5646 5647struct s_data { 5648 struct sched_domain ** __percpu sd; 5649 struct root_domain *rd; 5650}; 5651 5652enum s_alloc { 5653 sa_rootdomain, 5654 sa_sd, 5655 sa_sd_storage, 5656 sa_none, 5657}; 5658 5659/* 5660 * Build an iteration mask that can exclude certain CPUs from the upwards 5661 * domain traversal. 5662 * 5663 * Asymmetric node setups can result in situations where the domain tree is of 5664 * unequal depth, make sure to skip domains that already cover the entire 5665 * range. 5666 * 5667 * In that case build_sched_domains() will have terminated the iteration early 5668 * and our sibling sd spans will be empty. Domains should always include the 5669 * cpu they're built on, so check that. 5670 * 5671 */ 5672static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) 5673{ 5674 const struct cpumask *span = sched_domain_span(sd); 5675 struct sd_data *sdd = sd->private; 5676 struct sched_domain *sibling; 5677 int i; 5678 5679 for_each_cpu(i, span) { 5680 sibling = *per_cpu_ptr(sdd->sd, i); 5681 if (!cpumask_test_cpu(i, sched_domain_span(sibling))) 5682 continue; 5683 5684 cpumask_set_cpu(i, sched_group_mask(sg)); 5685 } 5686} 5687 5688/* 5689 * Return the canonical balance cpu for this group, this is the first cpu 5690 * of this group that's also in the iteration mask. 5691 */ 5692int group_balance_cpu(struct sched_group *sg) 5693{ 5694 return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); 5695} 5696 5697static int 5698build_overlap_sched_groups(struct sched_domain *sd, int cpu) 5699{ 5700 struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg; 5701 const struct cpumask *span = sched_domain_span(sd); 5702 struct cpumask *covered = sched_domains_tmpmask; 5703 struct sd_data *sdd = sd->private; 5704 struct sched_domain *child; 5705 int i; 5706 5707 cpumask_clear(covered); 5708 5709 for_each_cpu(i, span) { 5710 struct cpumask *sg_span; 5711 5712 if (cpumask_test_cpu(i, covered)) 5713 continue; 5714 5715 child = *per_cpu_ptr(sdd->sd, i); 5716 5717 /* See the comment near build_group_mask(). */ 5718 if (!cpumask_test_cpu(i, sched_domain_span(child))) 5719 continue; 5720 5721 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 5722 GFP_KERNEL, cpu_to_node(cpu)); 5723 5724 if (!sg) 5725 goto fail; 5726 5727 sg_span = sched_group_cpus(sg); 5728 if (child->child) { 5729 child = child->child; 5730 cpumask_copy(sg_span, sched_domain_span(child)); 5731 } else 5732 cpumask_set_cpu(i, sg_span); 5733 5734 cpumask_or(covered, covered, sg_span); 5735 5736 sg->sgc = *per_cpu_ptr(sdd->sgc, i); 5737 if (atomic_inc_return(&sg->sgc->ref) == 1) 5738 build_group_mask(sd, sg); 5739 5740 /* 5741 * Initialize sgc->capacity such that even if we mess up the 5742 * domains and no possible iteration will get us here, we won't 5743 * die on a /0 trap. 5744 */ 5745 sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); 5746 sg->sgc->capacity_orig = sg->sgc->capacity; 5747 5748 /* 5749 * Make sure the first group of this domain contains the 5750 * canonical balance cpu. Otherwise the sched_domain iteration 5751 * breaks. See update_sg_lb_stats(). 5752 */ 5753 if ((!groups && cpumask_test_cpu(cpu, sg_span)) || 5754 group_balance_cpu(sg) == cpu) 5755 groups = sg; 5756 5757 if (!first) 5758 first = sg; 5759 if (last) 5760 last->next = sg; 5761 last = sg; 5762 last->next = first; 5763 } 5764 sd->groups = groups; 5765 5766 return 0; 5767 5768fail: 5769 free_sched_groups(first, 0); 5770 5771 return -ENOMEM; 5772} 5773 5774static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) 5775{ 5776 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); 5777 struct sched_domain *child = sd->child; 5778 5779 if (child) 5780 cpu = cpumask_first(sched_domain_span(child)); 5781 5782 if (sg) { 5783 *sg = *per_cpu_ptr(sdd->sg, cpu); 5784 (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu); 5785 atomic_set(&(*sg)->sgc->ref, 1); /* for claim_allocations */ 5786 } 5787 5788 return cpu; 5789} 5790 5791/* 5792 * build_sched_groups will build a circular linked list of the groups 5793 * covered by the given span, and will set each group's ->cpumask correctly, 5794 * and ->cpu_capacity to 0. 5795 * 5796 * Assumes the sched_domain tree is fully constructed 5797 */ 5798static int 5799build_sched_groups(struct sched_domain *sd, int cpu) 5800{ 5801 struct sched_group *first = NULL, *last = NULL; 5802 struct sd_data *sdd = sd->private; 5803 const struct cpumask *span = sched_domain_span(sd); 5804 struct cpumask *covered; 5805 int i; 5806 5807 get_group(cpu, sdd, &sd->groups); 5808 atomic_inc(&sd->groups->ref); 5809 5810 if (cpu != cpumask_first(span)) 5811 return 0; 5812 5813 lockdep_assert_held(&sched_domains_mutex); 5814 covered = sched_domains_tmpmask; 5815 5816 cpumask_clear(covered); 5817 5818 for_each_cpu(i, span) { 5819 struct sched_group *sg; 5820 int group, j; 5821 5822 if (cpumask_test_cpu(i, covered)) 5823 continue; 5824 5825 group = get_group(i, sdd, &sg); 5826 cpumask_setall(sched_group_mask(sg)); 5827 5828 for_each_cpu(j, span) { 5829 if (get_group(j, sdd, NULL) != group) 5830 continue; 5831 5832 cpumask_set_cpu(j, covered); 5833 cpumask_set_cpu(j, sched_group_cpus(sg)); 5834 } 5835 5836 if (!first) 5837 first = sg; 5838 if (last) 5839 last->next = sg; 5840 last = sg; 5841 } 5842 last->next = first; 5843 5844 return 0; 5845} 5846 5847/* 5848 * Initialize sched groups cpu_capacity. 5849 * 5850 * cpu_capacity indicates the capacity of sched group, which is used while 5851 * distributing the load between different sched groups in a sched domain. 5852 * Typically cpu_capacity for all the groups in a sched domain will be same 5853 * unless there are asymmetries in the topology. If there are asymmetries, 5854 * group having more cpu_capacity will pickup more load compared to the 5855 * group having less cpu_capacity. 5856 */ 5857static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) 5858{ 5859 struct sched_group *sg = sd->groups; 5860 5861 WARN_ON(!sg); 5862 5863 do { 5864 sg->group_weight = cpumask_weight(sched_group_cpus(sg)); 5865 sg = sg->next; 5866 } while (sg != sd->groups); 5867 5868 if (cpu != group_balance_cpu(sg)) 5869 return; 5870 5871 update_group_capacity(sd, cpu); 5872 atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight); 5873} 5874 5875/* 5876 * Initializers for schedule domains 5877 * Non-inlined to reduce accumulated stack pressure in build_sched_domains() 5878 */ 5879 5880static int default_relax_domain_level = -1; 5881int sched_domain_level_max; 5882 5883static int __init setup_relax_domain_level(char *str) 5884{ 5885 if (kstrtoint(str, 0, &default_relax_domain_level)) 5886 pr_warn("Unable to set relax_domain_level\n"); 5887 5888 return 1; 5889} 5890__setup("relax_domain_level=", setup_relax_domain_level); 5891 5892static void set_domain_attribute(struct sched_domain *sd, 5893 struct sched_domain_attr *attr) 5894{ 5895 int request; 5896 5897 if (!attr || attr->relax_domain_level < 0) { 5898 if (default_relax_domain_level < 0) 5899 return; 5900 else 5901 request = default_relax_domain_level; 5902 } else 5903 request = attr->relax_domain_level; 5904 if (request < sd->level) { 5905 /* turn off idle balance on this domain */ 5906 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); 5907 } else { 5908 /* turn on idle balance on this domain */ 5909 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); 5910 } 5911} 5912 5913static void __sdt_free(const struct cpumask *cpu_map); 5914static int __sdt_alloc(const struct cpumask *cpu_map); 5915 5916static void __free_domain_allocs(struct s_data *d, enum s_alloc what, 5917 const struct cpumask *cpu_map) 5918{ 5919 switch (what) { 5920 case sa_rootdomain: 5921 if (!atomic_read(&d->rd->refcount)) 5922 free_rootdomain(&d->rd->rcu); /* fall through */ 5923 case sa_sd: 5924 free_percpu(d->sd); /* fall through */ 5925 case sa_sd_storage: 5926 __sdt_free(cpu_map); /* fall through */ 5927 case sa_none: 5928 break; 5929 } 5930} 5931 5932static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, 5933 const struct cpumask *cpu_map) 5934{ 5935 memset(d, 0, sizeof(*d)); 5936 5937 if (__sdt_alloc(cpu_map)) 5938 return sa_sd_storage; 5939 d->sd = alloc_percpu(struct sched_domain *); 5940 if (!d->sd) 5941 return sa_sd_storage; 5942 d->rd = alloc_rootdomain(); 5943 if (!d->rd) 5944 return sa_sd; 5945 return sa_rootdomain; 5946} 5947 5948/* 5949 * NULL the sd_data elements we've used to build the sched_domain and 5950 * sched_group structure so that the subsequent __free_domain_allocs() 5951 * will not free the data we're using. 5952 */ 5953static void claim_allocations(int cpu, struct sched_domain *sd) 5954{ 5955 struct sd_data *sdd = sd->private; 5956 5957 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); 5958 *per_cpu_ptr(sdd->sd, cpu) = NULL; 5959 5960 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) 5961 *per_cpu_ptr(sdd->sg, cpu) = NULL; 5962 5963 if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref)) 5964 *per_cpu_ptr(sdd->sgc, cpu) = NULL; 5965} 5966 5967#ifdef CONFIG_NUMA 5968static int sched_domains_numa_levels; 5969static int *sched_domains_numa_distance; 5970static struct cpumask ***sched_domains_numa_masks; 5971static int sched_domains_curr_level; 5972#endif 5973 5974/* 5975 * SD_flags allowed in topology descriptions. 5976 * 5977 * SD_SHARE_CPUCAPACITY - describes SMT topologies 5978 * SD_SHARE_PKG_RESOURCES - describes shared caches 5979 * SD_NUMA - describes NUMA topologies 5980 * SD_SHARE_POWERDOMAIN - describes shared power domain 5981 * 5982 * Odd one out: 5983 * SD_ASYM_PACKING - describes SMT quirks 5984 */ 5985#define TOPOLOGY_SD_FLAGS \ 5986 (SD_SHARE_CPUCAPACITY | \ 5987 SD_SHARE_PKG_RESOURCES | \ 5988 SD_NUMA | \ 5989 SD_ASYM_PACKING | \ 5990 SD_SHARE_POWERDOMAIN) 5991 5992static struct sched_domain * 5993sd_init(struct sched_domain_topology_level *tl, int cpu) 5994{ 5995 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); 5996 int sd_weight, sd_flags = 0; 5997 5998#ifdef CONFIG_NUMA 5999 /* 6000 * Ugly hack to pass state to sd_numa_mask()... 6001 */ 6002 sched_domains_curr_level = tl->numa_level; 6003#endif 6004 6005 sd_weight = cpumask_weight(tl->mask(cpu)); 6006 6007 if (tl->sd_flags) 6008 sd_flags = (*tl->sd_flags)(); 6009 if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS, 6010 "wrong sd_flags in topology description\n")) 6011 sd_flags &= ~TOPOLOGY_SD_FLAGS; 6012 6013 *sd = (struct sched_domain){ 6014 .min_interval = sd_weight, 6015 .max_interval = 2*sd_weight, 6016 .busy_factor = 32, 6017 .imbalance_pct = 125, 6018 6019 .cache_nice_tries = 0, 6020 .busy_idx = 0, 6021 .idle_idx = 0, 6022 .newidle_idx = 0, 6023 .wake_idx = 0, 6024 .forkexec_idx = 0, 6025 6026 .flags = 1*SD_LOAD_BALANCE 6027 | 1*SD_BALANCE_NEWIDLE 6028 | 1*SD_BALANCE_EXEC 6029 | 1*SD_BALANCE_FORK 6030 | 0*SD_BALANCE_WAKE 6031 | 1*SD_WAKE_AFFINE 6032 | 0*SD_SHARE_CPUCAPACITY 6033 | 0*SD_SHARE_PKG_RESOURCES 6034 | 0*SD_SERIALIZE 6035 | 0*SD_PREFER_SIBLING 6036 | 0*SD_NUMA 6037 | sd_flags 6038 , 6039 6040 .last_balance = jiffies, 6041 .balance_interval = sd_weight, 6042 .smt_gain = 0, 6043 .max_newidle_lb_cost = 0, 6044 .next_decay_max_lb_cost = jiffies, 6045#ifdef CONFIG_SCHED_DEBUG 6046 .name = tl->name, 6047#endif 6048 }; 6049 6050 /* 6051 * Convert topological properties into behaviour. 6052 */ 6053 6054 if (sd->flags & SD_SHARE_CPUCAPACITY) { 6055 sd->imbalance_pct = 110; 6056 sd->smt_gain = 1178; /* ~15% */ 6057 6058 } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { 6059 sd->imbalance_pct = 117; 6060 sd->cache_nice_tries = 1; 6061 sd->busy_idx = 2; 6062 6063#ifdef CONFIG_NUMA 6064 } else if (sd->flags & SD_NUMA) { 6065 sd->cache_nice_tries = 2; 6066 sd->busy_idx = 3; 6067 sd->idle_idx = 2; 6068 6069 sd->flags |= SD_SERIALIZE; 6070 if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) { 6071 sd->flags &= ~(SD_BALANCE_EXEC | 6072 SD_BALANCE_FORK | 6073 SD_WAKE_AFFINE); 6074 } 6075 6076#endif 6077 } else { 6078 sd->flags |= SD_PREFER_SIBLING; 6079 sd->cache_nice_tries = 1; 6080 sd->busy_idx = 2; 6081 sd->idle_idx = 1; 6082 } 6083 6084 sd->private = &tl->data; 6085 6086 return sd; 6087} 6088 6089/* 6090 * Topology list, bottom-up. 6091 */ 6092static struct sched_domain_topology_level default_topology[] = { 6093#ifdef CONFIG_SCHED_SMT 6094 { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, 6095#endif 6096#ifdef CONFIG_SCHED_MC 6097 { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, 6098#endif 6099 { cpu_cpu_mask, SD_INIT_NAME(DIE) }, 6100 { NULL, }, 6101}; 6102 6103struct sched_domain_topology_level *sched_domain_topology = default_topology; 6104 6105#define for_each_sd_topology(tl) \ 6106 for (tl = sched_domain_topology; tl->mask; tl++) 6107 6108void set_sched_topology(struct sched_domain_topology_level *tl) 6109{ 6110 sched_domain_topology = tl; 6111} 6112 6113#ifdef CONFIG_NUMA 6114 6115static const struct cpumask *sd_numa_mask(int cpu) 6116{ 6117 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; 6118} 6119 6120static void sched_numa_warn(const char *str) 6121{ 6122 static int done = false; 6123 int i,j; 6124 6125 if (done) 6126 return; 6127 6128 done = true; 6129 6130 printk(KERN_WARNING "ERROR: %s\n\n", str); 6131 6132 for (i = 0; i < nr_node_ids; i++) { 6133 printk(KERN_WARNING " "); 6134 for (j = 0; j < nr_node_ids; j++) 6135 printk(KERN_CONT "%02d ", node_distance(i,j)); 6136 printk(KERN_CONT "\n"); 6137 } 6138 printk(KERN_WARNING "\n"); 6139} 6140 6141static bool find_numa_distance(int distance) 6142{ 6143 int i; 6144 6145 if (distance == node_distance(0, 0)) 6146 return true; 6147 6148 for (i = 0; i < sched_domains_numa_levels; i++) { 6149 if (sched_domains_numa_distance[i] == distance) 6150 return true; 6151 } 6152 6153 return false; 6154} 6155 6156static void sched_init_numa(void) 6157{ 6158 int next_distance, curr_distance = node_distance(0, 0); 6159 struct sched_domain_topology_level *tl; 6160 int level = 0; 6161 int i, j, k; 6162 6163 sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); 6164 if (!sched_domains_numa_distance) 6165 return; 6166 6167 /* 6168 * O(nr_nodes^2) deduplicating selection sort -- in order to find the 6169 * unique distances in the node_distance() table. 6170 * 6171 * Assumes node_distance(0,j) includes all distances in 6172 * node_distance(i,j) in order to avoid cubic time. 6173 */ 6174 next_distance = curr_distance; 6175 for (i = 0; i < nr_node_ids; i++) { 6176 for (j = 0; j < nr_node_ids; j++) { 6177 for (k = 0; k < nr_node_ids; k++) { 6178 int distance = node_distance(i, k); 6179 6180 if (distance > curr_distance && 6181 (distance < next_distance || 6182 next_distance == curr_distance)) 6183 next_distance = distance; 6184 6185 /* 6186 * While not a strong assumption it would be nice to know 6187 * about cases where if node A is connected to B, B is not 6188 * equally connected to A. 6189 */ 6190 if (sched_debug() && node_distance(k, i) != distance) 6191 sched_numa_warn("Node-distance not symmetric"); 6192 6193 if (sched_debug() && i && !find_numa_distance(distance)) 6194 sched_numa_warn("Node-0 not representative"); 6195 } 6196 if (next_distance != curr_distance) { 6197 sched_domains_numa_distance[level++] = next_distance; 6198 sched_domains_numa_levels = level; 6199 curr_distance = next_distance; 6200 } else break; 6201 } 6202 6203 /* 6204 * In case of sched_debug() we verify the above assumption. 6205 */ 6206 if (!sched_debug()) 6207 break; 6208 } 6209 /* 6210 * 'level' contains the number of unique distances, excluding the 6211 * identity distance node_distance(i,i). 6212 * 6213 * The sched_domains_numa_distance[] array includes the actual distance 6214 * numbers. 6215 */ 6216 6217 /* 6218 * Here, we should temporarily reset sched_domains_numa_levels to 0. 6219 * If it fails to allocate memory for array sched_domains_numa_masks[][], 6220 * the array will contain less then 'level' members. This could be 6221 * dangerous when we use it to iterate array sched_domains_numa_masks[][] 6222 * in other functions. 6223 * 6224 * We reset it to 'level' at the end of this function. 6225 */ 6226 sched_domains_numa_levels = 0; 6227 6228 sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); 6229 if (!sched_domains_numa_masks) 6230 return; 6231 6232 /* 6233 * Now for each level, construct a mask per node which contains all 6234 * cpus of nodes that are that many hops away from us. 6235 */ 6236 for (i = 0; i < level; i++) { 6237 sched_domains_numa_masks[i] = 6238 kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); 6239 if (!sched_domains_numa_masks[i]) 6240 return; 6241 6242 for (j = 0; j < nr_node_ids; j++) { 6243 struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); 6244 if (!mask) 6245 return; 6246 6247 sched_domains_numa_masks[i][j] = mask; 6248 6249 for (k = 0; k < nr_node_ids; k++) { 6250 if (node_distance(j, k) > sched_domains_numa_distance[i]) 6251 continue; 6252 6253 cpumask_or(mask, mask, cpumask_of_node(k)); 6254 } 6255 } 6256 } 6257 6258 /* Compute default topology size */ 6259 for (i = 0; sched_domain_topology[i].mask; i++); 6260 6261 tl = kzalloc((i + level + 1) * 6262 sizeof(struct sched_domain_topology_level), GFP_KERNEL); 6263 if (!tl) 6264 return; 6265 6266 /* 6267 * Copy the default topology bits.. 6268 */ 6269 for (i = 0; sched_domain_topology[i].mask; i++) 6270 tl[i] = sched_domain_topology[i]; 6271 6272 /* 6273 * .. and append 'j' levels of NUMA goodness. 6274 */ 6275 for (j = 0; j < level; i++, j++) { 6276 tl[i] = (struct sched_domain_topology_level){ 6277 .mask = sd_numa_mask, 6278 .sd_flags = cpu_numa_flags, 6279 .flags = SDTL_OVERLAP, 6280 .numa_level = j, 6281 SD_INIT_NAME(NUMA) 6282 }; 6283 } 6284 6285 sched_domain_topology = tl; 6286 6287 sched_domains_numa_levels = level; 6288} 6289 6290static void sched_domains_numa_masks_set(int cpu) 6291{ 6292 int i, j; 6293 int node = cpu_to_node(cpu); 6294 6295 for (i = 0; i < sched_domains_numa_levels; i++) { 6296 for (j = 0; j < nr_node_ids; j++) { 6297 if (node_distance(j, node) <= sched_domains_numa_distance[i]) 6298 cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); 6299 } 6300 } 6301} 6302 6303static void sched_domains_numa_masks_clear(int cpu) 6304{ 6305 int i, j; 6306 for (i = 0; i < sched_domains_numa_levels; i++) { 6307 for (j = 0; j < nr_node_ids; j++) 6308 cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); 6309 } 6310} 6311 6312/* 6313 * Update sched_domains_numa_masks[level][node] array when new cpus 6314 * are onlined. 6315 */ 6316static int sched_domains_numa_masks_update(struct notifier_block *nfb, 6317 unsigned long action, 6318 void *hcpu) 6319{ 6320 int cpu = (long)hcpu; 6321 6322 switch (action & ~CPU_TASKS_FROZEN) { 6323 case CPU_ONLINE: 6324 sched_domains_numa_masks_set(cpu); 6325 break; 6326 6327 case CPU_DEAD: 6328 sched_domains_numa_masks_clear(cpu); 6329 break; 6330 6331 default: 6332 return NOTIFY_DONE; 6333 } 6334 6335 return NOTIFY_OK; 6336} 6337#else 6338static inline void sched_init_numa(void) 6339{ 6340} 6341 6342static int sched_domains_numa_masks_update(struct notifier_block *nfb, 6343 unsigned long action, 6344 void *hcpu) 6345{ 6346 return 0; 6347} 6348#endif /* CONFIG_NUMA */ 6349 6350static int __sdt_alloc(const struct cpumask *cpu_map) 6351{ 6352 struct sched_domain_topology_level *tl; 6353 int j; 6354 6355 for_each_sd_topology(tl) { 6356 struct sd_data *sdd = &tl->data; 6357 6358 sdd->sd = alloc_percpu(struct sched_domain *); 6359 if (!sdd->sd) 6360 return -ENOMEM; 6361 6362 sdd->sg = alloc_percpu(struct sched_group *); 6363 if (!sdd->sg) 6364 return -ENOMEM; 6365 6366 sdd->sgc = alloc_percpu(struct sched_group_capacity *); 6367 if (!sdd->sgc) 6368 return -ENOMEM; 6369 6370 for_each_cpu(j, cpu_map) { 6371 struct sched_domain *sd; 6372 struct sched_group *sg; 6373 struct sched_group_capacity *sgc; 6374 6375 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), 6376 GFP_KERNEL, cpu_to_node(j)); 6377 if (!sd) 6378 return -ENOMEM; 6379 6380 *per_cpu_ptr(sdd->sd, j) = sd; 6381 6382 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 6383 GFP_KERNEL, cpu_to_node(j)); 6384 if (!sg) 6385 return -ENOMEM; 6386 6387 sg->next = sg; 6388 6389 *per_cpu_ptr(sdd->sg, j) = sg; 6390 6391 sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(), 6392 GFP_KERNEL, cpu_to_node(j)); 6393 if (!sgc) 6394 return -ENOMEM; 6395 6396 *per_cpu_ptr(sdd->sgc, j) = sgc; 6397 } 6398 } 6399 6400 return 0; 6401} 6402 6403static void __sdt_free(const struct cpumask *cpu_map) 6404{ 6405 struct sched_domain_topology_level *tl; 6406 int j; 6407 6408 for_each_sd_topology(tl) { 6409 struct sd_data *sdd = &tl->data; 6410 6411 for_each_cpu(j, cpu_map) { 6412 struct sched_domain *sd; 6413 6414 if (sdd->sd) { 6415 sd = *per_cpu_ptr(sdd->sd, j); 6416 if (sd && (sd->flags & SD_OVERLAP)) 6417 free_sched_groups(sd->groups, 0); 6418 kfree(*per_cpu_ptr(sdd->sd, j)); 6419 } 6420 6421 if (sdd->sg) 6422 kfree(*per_cpu_ptr(sdd->sg, j)); 6423 if (sdd->sgc) 6424 kfree(*per_cpu_ptr(sdd->sgc, j)); 6425 } 6426 free_percpu(sdd->sd); 6427 sdd->sd = NULL; 6428 free_percpu(sdd->sg); 6429 sdd->sg = NULL; 6430 free_percpu(sdd->sgc); 6431 sdd->sgc = NULL; 6432 } 6433} 6434 6435struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, 6436 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 6437 struct sched_domain *child, int cpu) 6438{ 6439 struct sched_domain *sd = sd_init(tl, cpu); 6440 if (!sd) 6441 return child; 6442 6443 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); 6444 if (child) { 6445 sd->level = child->level + 1; 6446 sched_domain_level_max = max(sched_domain_level_max, sd->level); 6447 child->parent = sd; 6448 sd->child = child; 6449 } 6450 set_domain_attribute(sd, attr); 6451 6452 return sd; 6453} 6454 6455/* 6456 * Build sched domains for a given set of cpus and attach the sched domains 6457 * to the individual cpus 6458 */ 6459static int build_sched_domains(const struct cpumask *cpu_map, 6460 struct sched_domain_attr *attr) 6461{ 6462 enum s_alloc alloc_state; 6463 struct sched_domain *sd; 6464 struct s_data d; 6465 int i, ret = -ENOMEM; 6466 6467 alloc_state = __visit_domain_allocation_hell(&d, cpu_map); 6468 if (alloc_state != sa_rootdomain) 6469 goto error; 6470 6471 /* Set up domains for cpus specified by the cpu_map. */ 6472 for_each_cpu(i, cpu_map) { 6473 struct sched_domain_topology_level *tl; 6474 6475 sd = NULL; 6476 for_each_sd_topology(tl) { 6477 sd = build_sched_domain(tl, cpu_map, attr, sd, i); 6478 if (tl == sched_domain_topology) 6479 *per_cpu_ptr(d.sd, i) = sd; 6480 if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) 6481 sd->flags |= SD_OVERLAP; 6482 if (cpumask_equal(cpu_map, sched_domain_span(sd))) 6483 break; 6484 } 6485 } 6486 6487 /* Build the groups for the domains */ 6488 for_each_cpu(i, cpu_map) { 6489 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { 6490 sd->span_weight = cpumask_weight(sched_domain_span(sd)); 6491 if (sd->flags & SD_OVERLAP) { 6492 if (build_overlap_sched_groups(sd, i)) 6493 goto error; 6494 } else { 6495 if (build_sched_groups(sd, i)) 6496 goto error; 6497 } 6498 } 6499 } 6500 6501 /* Calculate CPU capacity for physical packages and nodes */ 6502 for (i = nr_cpumask_bits-1; i >= 0; i--) { 6503 if (!cpumask_test_cpu(i, cpu_map)) 6504 continue; 6505 6506 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { 6507 claim_allocations(i, sd); 6508 init_sched_groups_capacity(i, sd); 6509 } 6510 } 6511 6512 /* Attach the domains */ 6513 rcu_read_lock(); 6514 for_each_cpu(i, cpu_map) { 6515 sd = *per_cpu_ptr(d.sd, i); 6516 cpu_attach_domain(sd, d.rd, i); 6517 } 6518 rcu_read_unlock(); 6519 6520 ret = 0; 6521error: 6522 __free_domain_allocs(&d, alloc_state, cpu_map); 6523 return ret; 6524} 6525 6526static cpumask_var_t *doms_cur; /* current sched domains */ 6527static int ndoms_cur; /* number of sched domains in 'doms_cur' */ 6528static struct sched_domain_attr *dattr_cur; 6529 /* attribues of custom domains in 'doms_cur' */ 6530 6531/* 6532 * Special case: If a kmalloc of a doms_cur partition (array of 6533 * cpumask) fails, then fallback to a single sched domain, 6534 * as determined by the single cpumask fallback_doms. 6535 */ 6536static cpumask_var_t fallback_doms; 6537 6538/* 6539 * arch_update_cpu_topology lets virtualized architectures update the 6540 * cpu core maps. It is supposed to return 1 if the topology changed 6541 * or 0 if it stayed the same. 6542 */ 6543int __weak arch_update_cpu_topology(void) 6544{ 6545 return 0; 6546} 6547 6548cpumask_var_t *alloc_sched_domains(unsigned int ndoms) 6549{ 6550 int i; 6551 cpumask_var_t *doms; 6552 6553 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); 6554 if (!doms) 6555 return NULL; 6556 for (i = 0; i < ndoms; i++) { 6557 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { 6558 free_sched_domains(doms, i); 6559 return NULL; 6560 } 6561 } 6562 return doms; 6563} 6564 6565void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) 6566{ 6567 unsigned int i; 6568 for (i = 0; i < ndoms; i++) 6569 free_cpumask_var(doms[i]); 6570 kfree(doms); 6571} 6572 6573/* 6574 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 6575 * For now this just excludes isolated cpus, but could be used to 6576 * exclude other special cases in the future. 6577 */ 6578static int init_sched_domains(const struct cpumask *cpu_map) 6579{ 6580 int err; 6581 6582 arch_update_cpu_topology(); 6583 ndoms_cur = 1; 6584 doms_cur = alloc_sched_domains(ndoms_cur); 6585 if (!doms_cur) 6586 doms_cur = &fallback_doms; 6587 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); 6588 err = build_sched_domains(doms_cur[0], NULL); 6589 register_sched_domain_sysctl(); 6590 6591 return err; 6592} 6593 6594/* 6595 * Detach sched domains from a group of cpus specified in cpu_map 6596 * These cpus will now be attached to the NULL domain 6597 */ 6598static void detach_destroy_domains(const struct cpumask *cpu_map) 6599{ 6600 int i; 6601 6602 rcu_read_lock(); 6603 for_each_cpu(i, cpu_map) 6604 cpu_attach_domain(NULL, &def_root_domain, i); 6605 rcu_read_unlock(); 6606} 6607 6608/* handle null as "default" */ 6609static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, 6610 struct sched_domain_attr *new, int idx_new) 6611{ 6612 struct sched_domain_attr tmp; 6613 6614 /* fast path */ 6615 if (!new && !cur) 6616 return 1; 6617 6618 tmp = SD_ATTR_INIT; 6619 return !memcmp(cur ? (cur + idx_cur) : &tmp, 6620 new ? (new + idx_new) : &tmp, 6621 sizeof(struct sched_domain_attr)); 6622} 6623 6624/* 6625 * Partition sched domains as specified by the 'ndoms_new' 6626 * cpumasks in the array doms_new[] of cpumasks. This compares 6627 * doms_new[] to the current sched domain partitioning, doms_cur[]. 6628 * It destroys each deleted domain and builds each new domain. 6629 * 6630 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. 6631 * The masks don't intersect (don't overlap.) We should setup one 6632 * sched domain for each mask. CPUs not in any of the cpumasks will 6633 * not be load balanced. If the same cpumask appears both in the 6634 * current 'doms_cur' domains and in the new 'doms_new', we can leave 6635 * it as it is. 6636 * 6637 * The passed in 'doms_new' should be allocated using 6638 * alloc_sched_domains. This routine takes ownership of it and will 6639 * free_sched_domains it when done with it. If the caller failed the 6640 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, 6641 * and partition_sched_domains() will fallback to the single partition 6642 * 'fallback_doms', it also forces the domains to be rebuilt. 6643 * 6644 * If doms_new == NULL it will be replaced with cpu_online_mask. 6645 * ndoms_new == 0 is a special case for destroying existing domains, 6646 * and it will not create the default domain. 6647 * 6648 * Call with hotplug lock held 6649 */ 6650void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], 6651 struct sched_domain_attr *dattr_new) 6652{ 6653 int i, j, n; 6654 int new_topology; 6655 6656 mutex_lock(&sched_domains_mutex); 6657 6658 /* always unregister in case we don't destroy any domains */ 6659 unregister_sched_domain_sysctl(); 6660 6661 /* Let architecture update cpu core mappings. */ 6662 new_topology = arch_update_cpu_topology(); 6663 6664 n = doms_new ? ndoms_new : 0; 6665 6666 /* Destroy deleted domains */ 6667 for (i = 0; i < ndoms_cur; i++) { 6668 for (j = 0; j < n && !new_topology; j++) { 6669 if (cpumask_equal(doms_cur[i], doms_new[j]) 6670 && dattrs_equal(dattr_cur, i, dattr_new, j)) 6671 goto match1; 6672 } 6673 /* no match - a current sched domain not in new doms_new[] */ 6674 detach_destroy_domains(doms_cur[i]); 6675match1: 6676 ; 6677 } 6678 6679 n = ndoms_cur; 6680 if (doms_new == NULL) { 6681 n = 0; 6682 doms_new = &fallback_doms; 6683 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); 6684 WARN_ON_ONCE(dattr_new); 6685 } 6686 6687 /* Build new domains */ 6688 for (i = 0; i < ndoms_new; i++) { 6689 for (j = 0; j < n && !new_topology; j++) { 6690 if (cpumask_equal(doms_new[i], doms_cur[j]) 6691 && dattrs_equal(dattr_new, i, dattr_cur, j)) 6692 goto match2; 6693 } 6694 /* no match - add a new doms_new */ 6695 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); 6696match2: 6697 ; 6698 } 6699 6700 /* Remember the new sched domains */ 6701 if (doms_cur != &fallback_doms) 6702 free_sched_domains(doms_cur, ndoms_cur); 6703 kfree(dattr_cur); /* kfree(NULL) is safe */ 6704 doms_cur = doms_new; 6705 dattr_cur = dattr_new; 6706 ndoms_cur = ndoms_new; 6707 6708 register_sched_domain_sysctl(); 6709 6710 mutex_unlock(&sched_domains_mutex); 6711} 6712 6713static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */ 6714 6715/* 6716 * Update cpusets according to cpu_active mask. If cpusets are 6717 * disabled, cpuset_update_active_cpus() becomes a simple wrapper 6718 * around partition_sched_domains(). 6719 * 6720 * If we come here as part of a suspend/resume, don't touch cpusets because we 6721 * want to restore it back to its original state upon resume anyway. 6722 */ 6723static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, 6724 void *hcpu) 6725{ 6726 switch (action) { 6727 case CPU_ONLINE_FROZEN: 6728 case CPU_DOWN_FAILED_FROZEN: 6729 6730 /* 6731 * num_cpus_frozen tracks how many CPUs are involved in suspend 6732 * resume sequence. As long as this is not the last online 6733 * operation in the resume sequence, just build a single sched 6734 * domain, ignoring cpusets. 6735 */ 6736 num_cpus_frozen--; 6737 if (likely(num_cpus_frozen)) { 6738 partition_sched_domains(1, NULL, NULL); 6739 break; 6740 } 6741 6742 /* 6743 * This is the last CPU online operation. So fall through and 6744 * restore the original sched domains by considering the 6745 * cpuset configurations. 6746 */ 6747 6748 case CPU_ONLINE: 6749 case CPU_DOWN_FAILED: 6750 cpuset_update_active_cpus(true); 6751 break; 6752 default: 6753 return NOTIFY_DONE; 6754 } 6755 return NOTIFY_OK; 6756} 6757 6758static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, 6759 void *hcpu) 6760{ 6761 switch (action) { 6762 case CPU_DOWN_PREPARE: 6763 cpuset_update_active_cpus(false); 6764 break; 6765 case CPU_DOWN_PREPARE_FROZEN: 6766 num_cpus_frozen++; 6767 partition_sched_domains(1, NULL, NULL); 6768 break; 6769 default: 6770 return NOTIFY_DONE; 6771 } 6772 return NOTIFY_OK; 6773} 6774 6775void __init sched_init_smp(void) 6776{ 6777 cpumask_var_t non_isolated_cpus; 6778 6779 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 6780 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 6781 6782 sched_init_numa(); 6783 6784 /* 6785 * There's no userspace yet to cause hotplug operations; hence all the 6786 * cpu masks are stable and all blatant races in the below code cannot 6787 * happen. 6788 */ 6789 mutex_lock(&sched_domains_mutex); 6790 init_sched_domains(cpu_active_mask); 6791 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 6792 if (cpumask_empty(non_isolated_cpus)) 6793 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 6794 mutex_unlock(&sched_domains_mutex); 6795 6796 hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE); 6797 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); 6798 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); 6799 6800 init_hrtick(); 6801 6802 /* Move init over to a non-isolated CPU */ 6803 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) 6804 BUG(); 6805 sched_init_granularity(); 6806 free_cpumask_var(non_isolated_cpus); 6807 6808 init_sched_rt_class(); 6809 init_sched_dl_class(); 6810} 6811#else 6812void __init sched_init_smp(void) 6813{ 6814 sched_init_granularity(); 6815} 6816#endif /* CONFIG_SMP */ 6817 6818const_debug unsigned int sysctl_timer_migration = 1; 6819 6820int in_sched_functions(unsigned long addr) 6821{ 6822 return in_lock_functions(addr) || 6823 (addr >= (unsigned long)__sched_text_start 6824 && addr < (unsigned long)__sched_text_end); 6825} 6826 6827#ifdef CONFIG_CGROUP_SCHED 6828/* 6829 * Default task group. 6830 * Every task in system belongs to this group at bootup. 6831 */ 6832struct task_group root_task_group; 6833LIST_HEAD(task_groups); 6834#endif 6835 6836DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); 6837 6838void __init sched_init(void) 6839{ 6840 int i, j; 6841 unsigned long alloc_size = 0, ptr; 6842 6843#ifdef CONFIG_FAIR_GROUP_SCHED 6844 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 6845#endif 6846#ifdef CONFIG_RT_GROUP_SCHED 6847 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 6848#endif 6849#ifdef CONFIG_CPUMASK_OFFSTACK 6850 alloc_size += num_possible_cpus() * cpumask_size(); 6851#endif 6852 if (alloc_size) { 6853 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); 6854 6855#ifdef CONFIG_FAIR_GROUP_SCHED 6856 root_task_group.se = (struct sched_entity **)ptr; 6857 ptr += nr_cpu_ids * sizeof(void **); 6858 6859 root_task_group.cfs_rq = (struct cfs_rq **)ptr; 6860 ptr += nr_cpu_ids * sizeof(void **); 6861 6862#endif /* CONFIG_FAIR_GROUP_SCHED */ 6863#ifdef CONFIG_RT_GROUP_SCHED 6864 root_task_group.rt_se = (struct sched_rt_entity **)ptr; 6865 ptr += nr_cpu_ids * sizeof(void **); 6866 6867 root_task_group.rt_rq = (struct rt_rq **)ptr; 6868 ptr += nr_cpu_ids * sizeof(void **); 6869 6870#endif /* CONFIG_RT_GROUP_SCHED */ 6871#ifdef CONFIG_CPUMASK_OFFSTACK 6872 for_each_possible_cpu(i) { 6873 per_cpu(load_balance_mask, i) = (void *)ptr; 6874 ptr += cpumask_size(); 6875 } 6876#endif /* CONFIG_CPUMASK_OFFSTACK */ 6877 } 6878 6879 init_rt_bandwidth(&def_rt_bandwidth, 6880 global_rt_period(), global_rt_runtime()); 6881 init_dl_bandwidth(&def_dl_bandwidth, 6882 global_rt_period(), global_rt_runtime()); 6883 6884#ifdef CONFIG_SMP 6885 init_defrootdomain(); 6886#endif 6887 6888#ifdef CONFIG_RT_GROUP_SCHED 6889 init_rt_bandwidth(&root_task_group.rt_bandwidth, 6890 global_rt_period(), global_rt_runtime()); 6891#endif /* CONFIG_RT_GROUP_SCHED */ 6892 6893#ifdef CONFIG_CGROUP_SCHED 6894 list_add(&root_task_group.list, &task_groups); 6895 INIT_LIST_HEAD(&root_task_group.children); 6896 INIT_LIST_HEAD(&root_task_group.siblings); 6897 autogroup_init(&init_task); 6898 6899#endif /* CONFIG_CGROUP_SCHED */ 6900 6901 for_each_possible_cpu(i) { 6902 struct rq *rq; 6903 6904 rq = cpu_rq(i); 6905 raw_spin_lock_init(&rq->lock); 6906 rq->nr_running = 0; 6907 rq->calc_load_active = 0; 6908 rq->calc_load_update = jiffies + LOAD_FREQ; 6909 init_cfs_rq(&rq->cfs); 6910 init_rt_rq(&rq->rt, rq); 6911 init_dl_rq(&rq->dl, rq); 6912#ifdef CONFIG_FAIR_GROUP_SCHED 6913 root_task_group.shares = ROOT_TASK_GROUP_LOAD; 6914 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 6915 /* 6916 * How much cpu bandwidth does root_task_group get? 6917 * 6918 * In case of task-groups formed thr' the cgroup filesystem, it 6919 * gets 100% of the cpu resources in the system. This overall 6920 * system cpu resource is divided among the tasks of 6921 * root_task_group and its child task-groups in a fair manner, 6922 * based on each entity's (task or task-group's) weight 6923 * (se->load.weight). 6924 * 6925 * In other words, if root_task_group has 10 tasks of weight 6926 * 1024) and two child groups A0 and A1 (of weight 1024 each), 6927 * then A0's share of the cpu resource is: 6928 * 6929 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% 6930 * 6931 * We achieve this by letting root_task_group's tasks sit 6932 * directly in rq->cfs (i.e root_task_group->se[] = NULL). 6933 */ 6934 init_cfs_bandwidth(&root_task_group.cfs_bandwidth); 6935 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); 6936#endif /* CONFIG_FAIR_GROUP_SCHED */ 6937 6938 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; 6939#ifdef CONFIG_RT_GROUP_SCHED 6940 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); 6941#endif 6942 6943 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 6944 rq->cpu_load[j] = 0; 6945 6946 rq->last_load_update_tick = jiffies; 6947 6948#ifdef CONFIG_SMP 6949 rq->sd = NULL; 6950 rq->rd = NULL; 6951 rq->cpu_capacity = SCHED_CAPACITY_SCALE; 6952 rq->post_schedule = 0; 6953 rq->active_balance = 0; 6954 rq->next_balance = jiffies; 6955 rq->push_cpu = 0; 6956 rq->cpu = i; 6957 rq->online = 0; 6958 rq->idle_stamp = 0; 6959 rq->avg_idle = 2*sysctl_sched_migration_cost; 6960 rq->max_idle_balance_cost = sysctl_sched_migration_cost; 6961 6962 INIT_LIST_HEAD(&rq->cfs_tasks); 6963 6964 rq_attach_root(rq, &def_root_domain); 6965#ifdef CONFIG_NO_HZ_COMMON 6966 rq->nohz_flags = 0; 6967#endif 6968#ifdef CONFIG_NO_HZ_FULL 6969 rq->last_sched_tick = 0; 6970#endif 6971#endif 6972 init_rq_hrtick(rq); 6973 atomic_set(&rq->nr_iowait, 0); 6974 } 6975 6976 set_load_weight(&init_task); 6977 6978#ifdef CONFIG_PREEMPT_NOTIFIERS 6979 INIT_HLIST_HEAD(&init_task.preempt_notifiers); 6980#endif 6981 6982 /* 6983 * The boot idle thread does lazy MMU switching as well: 6984 */ 6985 atomic_inc(&init_mm.mm_count); 6986 enter_lazy_tlb(&init_mm, current); 6987 6988 /* 6989 * Make us the idle thread. Technically, schedule() should not be 6990 * called from this thread, however somewhere below it might be, 6991 * but because we are the idle thread, we just pick up running again 6992 * when this runqueue becomes "idle". 6993 */ 6994 init_idle(current, smp_processor_id()); 6995 6996 calc_load_update = jiffies + LOAD_FREQ; 6997 6998 /* 6999 * During early bootup we pretend to be a normal task: 7000 */ 7001 current->sched_class = &fair_sched_class; 7002 7003#ifdef CONFIG_SMP 7004 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); 7005 /* May be allocated at isolcpus cmdline parse time */ 7006 if (cpu_isolated_map == NULL) 7007 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 7008 idle_thread_set_boot_cpu(); 7009 set_cpu_rq_start_time(); 7010#endif 7011 init_sched_fair_class(); 7012 7013 scheduler_running = 1; 7014} 7015 7016#ifdef CONFIG_DEBUG_ATOMIC_SLEEP 7017static inline int preempt_count_equals(int preempt_offset) 7018{ 7019 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); 7020 7021 return (nested == preempt_offset); 7022} 7023 7024void __might_sleep(const char *file, int line, int preempt_offset) 7025{ 7026 static unsigned long prev_jiffy; /* ratelimiting */ 7027 7028 rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ 7029 if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && 7030 !is_idle_task(current)) || 7031 system_state != SYSTEM_RUNNING || oops_in_progress) 7032 return; 7033 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 7034 return; 7035 prev_jiffy = jiffies; 7036 7037 printk(KERN_ERR 7038 "BUG: sleeping function called from invalid context at %s:%d\n", 7039 file, line); 7040 printk(KERN_ERR 7041 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", 7042 in_atomic(), irqs_disabled(), 7043 current->pid, current->comm); 7044 7045 debug_show_held_locks(current); 7046 if (irqs_disabled()) 7047 print_irqtrace_events(current); 7048#ifdef CONFIG_DEBUG_PREEMPT 7049 if (!preempt_count_equals(preempt_offset)) { 7050 pr_err("Preemption disabled at:"); 7051 print_ip_sym(current->preempt_disable_ip); 7052 pr_cont("\n"); 7053 } 7054#endif 7055 dump_stack(); 7056} 7057EXPORT_SYMBOL(__might_sleep); 7058#endif 7059 7060#ifdef CONFIG_MAGIC_SYSRQ 7061static void normalize_task(struct rq *rq, struct task_struct *p) 7062{ 7063 const struct sched_class *prev_class = p->sched_class; 7064 struct sched_attr attr = { 7065 .sched_policy = SCHED_NORMAL, 7066 }; 7067 int old_prio = p->prio; 7068 int on_rq; 7069 7070 on_rq = p->on_rq; 7071 if (on_rq) 7072 dequeue_task(rq, p, 0); 7073 __setscheduler(rq, p, &attr); 7074 if (on_rq) { 7075 enqueue_task(rq, p, 0); 7076 resched_task(rq->curr); 7077 } 7078 7079 check_class_changed(rq, p, prev_class, old_prio); 7080} 7081 7082void normalize_rt_tasks(void) 7083{ 7084 struct task_struct *g, *p; 7085 unsigned long flags; 7086 struct rq *rq; 7087 7088 read_lock_irqsave(&tasklist_lock, flags); 7089 do_each_thread(g, p) { 7090 /* 7091 * Only normalize user tasks: 7092 */ 7093 if (!p->mm) 7094 continue; 7095 7096 p->se.exec_start = 0; 7097#ifdef CONFIG_SCHEDSTATS 7098 p->se.statistics.wait_start = 0; 7099 p->se.statistics.sleep_start = 0; 7100 p->se.statistics.block_start = 0; 7101#endif 7102 7103 if (!dl_task(p) && !rt_task(p)) { 7104 /* 7105 * Renice negative nice level userspace 7106 * tasks back to 0: 7107 */ 7108 if (task_nice(p) < 0 && p->mm) 7109 set_user_nice(p, 0); 7110 continue; 7111 } 7112 7113 raw_spin_lock(&p->pi_lock); 7114 rq = __task_rq_lock(p); 7115 7116 normalize_task(rq, p); 7117 7118 __task_rq_unlock(rq); 7119 raw_spin_unlock(&p->pi_lock); 7120 } while_each_thread(g, p); 7121 7122 read_unlock_irqrestore(&tasklist_lock, flags); 7123} 7124 7125#endif /* CONFIG_MAGIC_SYSRQ */ 7126 7127#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) 7128/* 7129 * These functions are only useful for the IA64 MCA handling, or kdb. 7130 * 7131 * They can only be called when the whole system has been 7132 * stopped - every CPU needs to be quiescent, and no scheduling 7133 * activity can take place. Using them for anything else would 7134 * be a serious bug, and as a result, they aren't even visible 7135 * under any other configuration. 7136 */ 7137 7138/** 7139 * curr_task - return the current task for a given cpu. 7140 * @cpu: the processor in question. 7141 * 7142 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 7143 * 7144 * Return: The current task for @cpu. 7145 */ 7146struct task_struct *curr_task(int cpu) 7147{ 7148 return cpu_curr(cpu); 7149} 7150 7151#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ 7152 7153#ifdef CONFIG_IA64 7154/** 7155 * set_curr_task - set the current task for a given cpu. 7156 * @cpu: the processor in question. 7157 * @p: the task pointer to set. 7158 * 7159 * Description: This function must only be used when non-maskable interrupts 7160 * are serviced on a separate stack. It allows the architecture to switch the 7161 * notion of the current task on a cpu in a non-blocking manner. This function 7162 * must be called with all CPU's synchronized, and interrupts disabled, the 7163 * and caller must save the original value of the current task (see 7164 * curr_task() above) and restore that value before reenabling interrupts and 7165 * re-starting the system. 7166 * 7167 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 7168 */ 7169void set_curr_task(int cpu, struct task_struct *p) 7170{ 7171 cpu_curr(cpu) = p; 7172} 7173 7174#endif 7175 7176#ifdef CONFIG_CGROUP_SCHED 7177/* task_group_lock serializes the addition/removal of task groups */ 7178static DEFINE_SPINLOCK(task_group_lock); 7179 7180static void free_sched_group(struct task_group *tg) 7181{ 7182 free_fair_sched_group(tg); 7183 free_rt_sched_group(tg); 7184 autogroup_free(tg); 7185 kfree(tg); 7186} 7187 7188/* allocate runqueue etc for a new task group */ 7189struct task_group *sched_create_group(struct task_group *parent) 7190{ 7191 struct task_group *tg; 7192 7193 tg = kzalloc(sizeof(*tg), GFP_KERNEL); 7194 if (!tg) 7195 return ERR_PTR(-ENOMEM); 7196 7197 if (!alloc_fair_sched_group(tg, parent)) 7198 goto err; 7199 7200 if (!alloc_rt_sched_group(tg, parent)) 7201 goto err; 7202 7203 return tg; 7204 7205err: 7206 free_sched_group(tg); 7207 return ERR_PTR(-ENOMEM); 7208} 7209 7210void sched_online_group(struct task_group *tg, struct task_group *parent) 7211{ 7212 unsigned long flags; 7213 7214 spin_lock_irqsave(&task_group_lock, flags); 7215 list_add_rcu(&tg->list, &task_groups); 7216 7217 WARN_ON(!parent); /* root should already exist */ 7218 7219 tg->parent = parent; 7220 INIT_LIST_HEAD(&tg->children); 7221 list_add_rcu(&tg->siblings, &parent->children); 7222 spin_unlock_irqrestore(&task_group_lock, flags); 7223} 7224 7225/* rcu callback to free various structures associated with a task group */ 7226static void free_sched_group_rcu(struct rcu_head *rhp) 7227{ 7228 /* now it should be safe to free those cfs_rqs */ 7229 free_sched_group(container_of(rhp, struct task_group, rcu)); 7230} 7231 7232/* Destroy runqueue etc associated with a task group */ 7233void sched_destroy_group(struct task_group *tg) 7234{ 7235 /* wait for possible concurrent references to cfs_rqs complete */ 7236 call_rcu(&tg->rcu, free_sched_group_rcu); 7237} 7238 7239void sched_offline_group(struct task_group *tg) 7240{ 7241 unsigned long flags; 7242 int i; 7243 7244 /* end participation in shares distribution */ 7245 for_each_possible_cpu(i) 7246 unregister_fair_sched_group(tg, i); 7247 7248 spin_lock_irqsave(&task_group_lock, flags); 7249 list_del_rcu(&tg->list); 7250 list_del_rcu(&tg->siblings); 7251 spin_unlock_irqrestore(&task_group_lock, flags); 7252} 7253 7254/* change task's runqueue when it moves between groups. 7255 * The caller of this function should have put the task in its new group 7256 * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to 7257 * reflect its new group. 7258 */ 7259void sched_move_task(struct task_struct *tsk) 7260{ 7261 struct task_group *tg; 7262 int on_rq, running; 7263 unsigned long flags; 7264 struct rq *rq; 7265 7266 rq = task_rq_lock(tsk, &flags); 7267 7268 running = task_current(rq, tsk); 7269 on_rq = tsk->on_rq; 7270 7271 if (on_rq) 7272 dequeue_task(rq, tsk, 0); 7273 if (unlikely(running)) 7274 tsk->sched_class->put_prev_task(rq, tsk); 7275 7276 tg = container_of(task_css_check(tsk, cpu_cgrp_id, 7277 lockdep_is_held(&tsk->sighand->siglock)), 7278 struct task_group, css); 7279 tg = autogroup_task_group(tsk, tg); 7280 tsk->sched_task_group = tg; 7281 7282#ifdef CONFIG_FAIR_GROUP_SCHED 7283 if (tsk->sched_class->task_move_group) 7284 tsk->sched_class->task_move_group(tsk, on_rq); 7285 else 7286#endif 7287 set_task_rq(tsk, task_cpu(tsk)); 7288 7289 if (unlikely(running)) 7290 tsk->sched_class->set_curr_task(rq); 7291 if (on_rq) 7292 enqueue_task(rq, tsk, 0); 7293 7294 task_rq_unlock(rq, tsk, &flags); 7295} 7296#endif /* CONFIG_CGROUP_SCHED */ 7297 7298#ifdef CONFIG_RT_GROUP_SCHED 7299/* 7300 * Ensure that the real time constraints are schedulable. 7301 */ 7302static DEFINE_MUTEX(rt_constraints_mutex); 7303 7304/* Must be called with tasklist_lock held */ 7305static inline int tg_has_rt_tasks(struct task_group *tg) 7306{ 7307 struct task_struct *g, *p; 7308 7309 do_each_thread(g, p) { 7310 if (rt_task(p) && task_rq(p)->rt.tg == tg) 7311 return 1; 7312 } while_each_thread(g, p); 7313 7314 return 0; 7315} 7316 7317struct rt_schedulable_data { 7318 struct task_group *tg; 7319 u64 rt_period; 7320 u64 rt_runtime; 7321}; 7322 7323static int tg_rt_schedulable(struct task_group *tg, void *data) 7324{ 7325 struct rt_schedulable_data *d = data; 7326 struct task_group *child; 7327 unsigned long total, sum = 0; 7328 u64 period, runtime; 7329 7330 period = ktime_to_ns(tg->rt_bandwidth.rt_period); 7331 runtime = tg->rt_bandwidth.rt_runtime; 7332 7333 if (tg == d->tg) { 7334 period = d->rt_period; 7335 runtime = d->rt_runtime; 7336 } 7337 7338 /* 7339 * Cannot have more runtime than the period. 7340 */ 7341 if (runtime > period && runtime != RUNTIME_INF) 7342 return -EINVAL; 7343 7344 /* 7345 * Ensure we don't starve existing RT tasks. 7346 */ 7347 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) 7348 return -EBUSY; 7349 7350 total = to_ratio(period, runtime); 7351 7352 /* 7353 * Nobody can have more than the global setting allows. 7354 */ 7355 if (total > to_ratio(global_rt_period(), global_rt_runtime())) 7356 return -EINVAL; 7357 7358 /* 7359 * The sum of our children's runtime should not exceed our own. 7360 */ 7361 list_for_each_entry_rcu(child, &tg->children, siblings) { 7362 period = ktime_to_ns(child->rt_bandwidth.rt_period); 7363 runtime = child->rt_bandwidth.rt_runtime; 7364 7365 if (child == d->tg) { 7366 period = d->rt_period; 7367 runtime = d->rt_runtime; 7368 } 7369 7370 sum += to_ratio(period, runtime); 7371 } 7372 7373 if (sum > total) 7374 return -EINVAL; 7375 7376 return 0; 7377} 7378 7379static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 7380{ 7381 int ret; 7382 7383 struct rt_schedulable_data data = { 7384 .tg = tg, 7385 .rt_period = period, 7386 .rt_runtime = runtime, 7387 }; 7388 7389 rcu_read_lock(); 7390 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data); 7391 rcu_read_unlock(); 7392 7393 return ret; 7394} 7395 7396static int tg_set_rt_bandwidth(struct task_group *tg, 7397 u64 rt_period, u64 rt_runtime) 7398{ 7399 int i, err = 0; 7400 7401 mutex_lock(&rt_constraints_mutex); 7402 read_lock(&tasklist_lock); 7403 err = __rt_schedulable(tg, rt_period, rt_runtime); 7404 if (err) 7405 goto unlock; 7406 7407 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); 7408 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); 7409 tg->rt_bandwidth.rt_runtime = rt_runtime; 7410 7411 for_each_possible_cpu(i) { 7412 struct rt_rq *rt_rq = tg->rt_rq[i]; 7413 7414 raw_spin_lock(&rt_rq->rt_runtime_lock); 7415 rt_rq->rt_runtime = rt_runtime; 7416 raw_spin_unlock(&rt_rq->rt_runtime_lock); 7417 } 7418 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); 7419unlock: 7420 read_unlock(&tasklist_lock); 7421 mutex_unlock(&rt_constraints_mutex); 7422 7423 return err; 7424} 7425 7426static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) 7427{ 7428 u64 rt_runtime, rt_period; 7429 7430 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); 7431 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; 7432 if (rt_runtime_us < 0) 7433 rt_runtime = RUNTIME_INF; 7434 7435 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 7436} 7437 7438static long sched_group_rt_runtime(struct task_group *tg) 7439{ 7440 u64 rt_runtime_us; 7441 7442 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) 7443 return -1; 7444 7445 rt_runtime_us = tg->rt_bandwidth.rt_runtime; 7446 do_div(rt_runtime_us, NSEC_PER_USEC); 7447 return rt_runtime_us; 7448} 7449 7450static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) 7451{ 7452 u64 rt_runtime, rt_period; 7453 7454 rt_period = (u64)rt_period_us * NSEC_PER_USEC; 7455 rt_runtime = tg->rt_bandwidth.rt_runtime; 7456 7457 if (rt_period == 0) 7458 return -EINVAL; 7459 7460 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 7461} 7462 7463static long sched_group_rt_period(struct task_group *tg) 7464{ 7465 u64 rt_period_us; 7466 7467 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); 7468 do_div(rt_period_us, NSEC_PER_USEC); 7469 return rt_period_us; 7470} 7471#endif /* CONFIG_RT_GROUP_SCHED */ 7472 7473#ifdef CONFIG_RT_GROUP_SCHED 7474static int sched_rt_global_constraints(void) 7475{ 7476 int ret = 0; 7477 7478 mutex_lock(&rt_constraints_mutex); 7479 read_lock(&tasklist_lock); 7480 ret = __rt_schedulable(NULL, 0, 0); 7481 read_unlock(&tasklist_lock); 7482 mutex_unlock(&rt_constraints_mutex); 7483 7484 return ret; 7485} 7486 7487static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) 7488{ 7489 /* Don't accept realtime tasks when there is no way for them to run */ 7490 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) 7491 return 0; 7492 7493 return 1; 7494} 7495 7496#else /* !CONFIG_RT_GROUP_SCHED */ 7497static int sched_rt_global_constraints(void) 7498{ 7499 unsigned long flags; 7500 int i, ret = 0; 7501 7502 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 7503 for_each_possible_cpu(i) { 7504 struct rt_rq *rt_rq = &cpu_rq(i)->rt; 7505 7506 raw_spin_lock(&rt_rq->rt_runtime_lock); 7507 rt_rq->rt_runtime = global_rt_runtime(); 7508 raw_spin_unlock(&rt_rq->rt_runtime_lock); 7509 } 7510 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); 7511 7512 return ret; 7513} 7514#endif /* CONFIG_RT_GROUP_SCHED */ 7515 7516static int sched_dl_global_constraints(void) 7517{ 7518 u64 runtime = global_rt_runtime(); 7519 u64 period = global_rt_period(); 7520 u64 new_bw = to_ratio(period, runtime); 7521 int cpu, ret = 0; 7522 unsigned long flags; 7523 7524 /* 7525 * Here we want to check the bandwidth not being set to some 7526 * value smaller than the currently allocated bandwidth in 7527 * any of the root_domains. 7528 * 7529 * FIXME: Cycling on all the CPUs is overdoing, but simpler than 7530 * cycling on root_domains... Discussion on different/better 7531 * solutions is welcome! 7532 */ 7533 for_each_possible_cpu(cpu) { 7534 struct dl_bw *dl_b = dl_bw_of(cpu); 7535 7536 raw_spin_lock_irqsave(&dl_b->lock, flags); 7537 if (new_bw < dl_b->total_bw) 7538 ret = -EBUSY; 7539 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 7540 7541 if (ret) 7542 break; 7543 } 7544 7545 return ret; 7546} 7547 7548static void sched_dl_do_global(void) 7549{ 7550 u64 new_bw = -1; 7551 int cpu; 7552 unsigned long flags; 7553 7554 def_dl_bandwidth.dl_period = global_rt_period(); 7555 def_dl_bandwidth.dl_runtime = global_rt_runtime(); 7556 7557 if (global_rt_runtime() != RUNTIME_INF) 7558 new_bw = to_ratio(global_rt_period(), global_rt_runtime()); 7559 7560 /* 7561 * FIXME: As above... 7562 */ 7563 for_each_possible_cpu(cpu) { 7564 struct dl_bw *dl_b = dl_bw_of(cpu); 7565 7566 raw_spin_lock_irqsave(&dl_b->lock, flags); 7567 dl_b->bw = new_bw; 7568 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 7569 } 7570} 7571 7572static int sched_rt_global_validate(void) 7573{ 7574 if (sysctl_sched_rt_period <= 0) 7575 return -EINVAL; 7576 7577 if ((sysctl_sched_rt_runtime != RUNTIME_INF) && 7578 (sysctl_sched_rt_runtime > sysctl_sched_rt_period)) 7579 return -EINVAL; 7580 7581 return 0; 7582} 7583 7584static void sched_rt_do_global(void) 7585{ 7586 def_rt_bandwidth.rt_runtime = global_rt_runtime(); 7587 def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); 7588} 7589 7590int sched_rt_handler(struct ctl_table *table, int write, 7591 void __user *buffer, size_t *lenp, 7592 loff_t *ppos) 7593{ 7594 int old_period, old_runtime; 7595 static DEFINE_MUTEX(mutex); 7596 int ret; 7597 7598 mutex_lock(&mutex); 7599 old_period = sysctl_sched_rt_period; 7600 old_runtime = sysctl_sched_rt_runtime; 7601 7602 ret = proc_dointvec(table, write, buffer, lenp, ppos); 7603 7604 if (!ret && write) { 7605 ret = sched_rt_global_validate(); 7606 if (ret) 7607 goto undo; 7608 7609 ret = sched_rt_global_constraints(); 7610 if (ret) 7611 goto undo; 7612 7613 ret = sched_dl_global_constraints(); 7614 if (ret) 7615 goto undo; 7616 7617 sched_rt_do_global(); 7618 sched_dl_do_global(); 7619 } 7620 if (0) { 7621undo: 7622 sysctl_sched_rt_period = old_period; 7623 sysctl_sched_rt_runtime = old_runtime; 7624 } 7625 mutex_unlock(&mutex); 7626 7627 return ret; 7628} 7629 7630int sched_rr_handler(struct ctl_table *table, int write, 7631 void __user *buffer, size_t *lenp, 7632 loff_t *ppos) 7633{ 7634 int ret; 7635 static DEFINE_MUTEX(mutex); 7636 7637 mutex_lock(&mutex); 7638 ret = proc_dointvec(table, write, buffer, lenp, ppos); 7639 /* make sure that internally we keep jiffies */ 7640 /* also, writing zero resets timeslice to default */ 7641 if (!ret && write) { 7642 sched_rr_timeslice = sched_rr_timeslice <= 0 ? 7643 RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice); 7644 } 7645 mutex_unlock(&mutex); 7646 return ret; 7647} 7648 7649#ifdef CONFIG_CGROUP_SCHED 7650 7651static inline struct task_group *css_tg(struct cgroup_subsys_state *css) 7652{ 7653 return css ? container_of(css, struct task_group, css) : NULL; 7654} 7655 7656static struct cgroup_subsys_state * 7657cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) 7658{ 7659 struct task_group *parent = css_tg(parent_css); 7660 struct task_group *tg; 7661 7662 if (!parent) { 7663 /* This is early initialization for the top cgroup */ 7664 return &root_task_group.css; 7665 } 7666 7667 tg = sched_create_group(parent); 7668 if (IS_ERR(tg)) 7669 return ERR_PTR(-ENOMEM); 7670 7671 return &tg->css; 7672} 7673 7674static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) 7675{ 7676 struct task_group *tg = css_tg(css); 7677 struct task_group *parent = css_tg(css_parent(css)); 7678 7679 if (parent) 7680 sched_online_group(tg, parent); 7681 return 0; 7682} 7683 7684static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) 7685{ 7686 struct task_group *tg = css_tg(css); 7687 7688 sched_destroy_group(tg); 7689} 7690 7691static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) 7692{ 7693 struct task_group *tg = css_tg(css); 7694 7695 sched_offline_group(tg); 7696} 7697 7698static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, 7699 struct cgroup_taskset *tset) 7700{ 7701 struct task_struct *task; 7702 7703 cgroup_taskset_for_each(task, tset) { 7704#ifdef CONFIG_RT_GROUP_SCHED 7705 if (!sched_rt_can_attach(css_tg(css), task)) 7706 return -EINVAL; 7707#else 7708 /* We don't support RT-tasks being in separate groups */ 7709 if (task->sched_class != &fair_sched_class) 7710 return -EINVAL; 7711#endif 7712 } 7713 return 0; 7714} 7715 7716static void cpu_cgroup_attach(struct cgroup_subsys_state *css, 7717 struct cgroup_taskset *tset) 7718{ 7719 struct task_struct *task; 7720 7721 cgroup_taskset_for_each(task, tset) 7722 sched_move_task(task); 7723} 7724 7725static void cpu_cgroup_exit(struct cgroup_subsys_state *css, 7726 struct cgroup_subsys_state *old_css, 7727 struct task_struct *task) 7728{ 7729 /* 7730 * cgroup_exit() is called in the copy_process() failure path. 7731 * Ignore this case since the task hasn't ran yet, this avoids 7732 * trying to poke a half freed task state from generic code. 7733 */ 7734 if (!(task->flags & PF_EXITING)) 7735 return; 7736 7737 sched_move_task(task); 7738} 7739 7740#ifdef CONFIG_FAIR_GROUP_SCHED 7741static int cpu_shares_write_u64(struct cgroup_subsys_state *css, 7742 struct cftype *cftype, u64 shareval) 7743{ 7744 return sched_group_set_shares(css_tg(css), scale_load(shareval)); 7745} 7746 7747static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, 7748 struct cftype *cft) 7749{ 7750 struct task_group *tg = css_tg(css); 7751 7752 return (u64) scale_load_down(tg->shares); 7753} 7754 7755#ifdef CONFIG_CFS_BANDWIDTH 7756static DEFINE_MUTEX(cfs_constraints_mutex); 7757 7758const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ 7759const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ 7760 7761static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); 7762 7763static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) 7764{ 7765 int i, ret = 0, runtime_enabled, runtime_was_enabled; 7766 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 7767 7768 if (tg == &root_task_group) 7769 return -EINVAL; 7770 7771 /* 7772 * Ensure we have at some amount of bandwidth every period. This is 7773 * to prevent reaching a state of large arrears when throttled via 7774 * entity_tick() resulting in prolonged exit starvation. 7775 */ 7776 if (quota < min_cfs_quota_period || period < min_cfs_quota_period) 7777 return -EINVAL; 7778 7779 /* 7780 * Likewise, bound things on the otherside by preventing insane quota 7781 * periods. This also allows us to normalize in computing quota 7782 * feasibility. 7783 */ 7784 if (period > max_cfs_quota_period) 7785 return -EINVAL; 7786 7787 mutex_lock(&cfs_constraints_mutex); 7788 ret = __cfs_schedulable(tg, period, quota); 7789 if (ret) 7790 goto out_unlock; 7791 7792 runtime_enabled = quota != RUNTIME_INF; 7793 runtime_was_enabled = cfs_b->quota != RUNTIME_INF; 7794 /* 7795 * If we need to toggle cfs_bandwidth_used, off->on must occur 7796 * before making related changes, and on->off must occur afterwards 7797 */ 7798 if (runtime_enabled && !runtime_was_enabled) 7799 cfs_bandwidth_usage_inc(); 7800 raw_spin_lock_irq(&cfs_b->lock); 7801 cfs_b->period = ns_to_ktime(period); 7802 cfs_b->quota = quota; 7803 7804 __refill_cfs_bandwidth_runtime(cfs_b); 7805 /* restart the period timer (if active) to handle new period expiry */ 7806 if (runtime_enabled && cfs_b->timer_active) { 7807 /* force a reprogram */ 7808 cfs_b->timer_active = 0; 7809 __start_cfs_bandwidth(cfs_b); 7810 } 7811 raw_spin_unlock_irq(&cfs_b->lock); 7812 7813 for_each_possible_cpu(i) { 7814 struct cfs_rq *cfs_rq = tg->cfs_rq[i]; 7815 struct rq *rq = cfs_rq->rq; 7816 7817 raw_spin_lock_irq(&rq->lock); 7818 cfs_rq->runtime_enabled = runtime_enabled; 7819 cfs_rq->runtime_remaining = 0; 7820 7821 if (cfs_rq->throttled) 7822 unthrottle_cfs_rq(cfs_rq); 7823 raw_spin_unlock_irq(&rq->lock); 7824 } 7825 if (runtime_was_enabled && !runtime_enabled) 7826 cfs_bandwidth_usage_dec(); 7827out_unlock: 7828 mutex_unlock(&cfs_constraints_mutex); 7829 7830 return ret; 7831} 7832 7833int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) 7834{ 7835 u64 quota, period; 7836 7837 period = ktime_to_ns(tg->cfs_bandwidth.period); 7838 if (cfs_quota_us < 0) 7839 quota = RUNTIME_INF; 7840 else 7841 quota = (u64)cfs_quota_us * NSEC_PER_USEC; 7842 7843 return tg_set_cfs_bandwidth(tg, period, quota); 7844} 7845 7846long tg_get_cfs_quota(struct task_group *tg) 7847{ 7848 u64 quota_us; 7849 7850 if (tg->cfs_bandwidth.quota == RUNTIME_INF) 7851 return -1; 7852 7853 quota_us = tg->cfs_bandwidth.quota; 7854 do_div(quota_us, NSEC_PER_USEC); 7855 7856 return quota_us; 7857} 7858 7859int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) 7860{ 7861 u64 quota, period; 7862 7863 period = (u64)cfs_period_us * NSEC_PER_USEC; 7864 quota = tg->cfs_bandwidth.quota; 7865 7866 return tg_set_cfs_bandwidth(tg, period, quota); 7867} 7868 7869long tg_get_cfs_period(struct task_group *tg) 7870{ 7871 u64 cfs_period_us; 7872 7873 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period); 7874 do_div(cfs_period_us, NSEC_PER_USEC); 7875 7876 return cfs_period_us; 7877} 7878 7879static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css, 7880 struct cftype *cft) 7881{ 7882 return tg_get_cfs_quota(css_tg(css)); 7883} 7884 7885static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css, 7886 struct cftype *cftype, s64 cfs_quota_us) 7887{ 7888 return tg_set_cfs_quota(css_tg(css), cfs_quota_us); 7889} 7890 7891static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css, 7892 struct cftype *cft) 7893{ 7894 return tg_get_cfs_period(css_tg(css)); 7895} 7896 7897static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css, 7898 struct cftype *cftype, u64 cfs_period_us) 7899{ 7900 return tg_set_cfs_period(css_tg(css), cfs_period_us); 7901} 7902 7903struct cfs_schedulable_data { 7904 struct task_group *tg; 7905 u64 period, quota; 7906}; 7907 7908/* 7909 * normalize group quota/period to be quota/max_period 7910 * note: units are usecs 7911 */ 7912static u64 normalize_cfs_quota(struct task_group *tg, 7913 struct cfs_schedulable_data *d) 7914{ 7915 u64 quota, period; 7916 7917 if (tg == d->tg) { 7918 period = d->period; 7919 quota = d->quota; 7920 } else { 7921 period = tg_get_cfs_period(tg); 7922 quota = tg_get_cfs_quota(tg); 7923 } 7924 7925 /* note: these should typically be equivalent */ 7926 if (quota == RUNTIME_INF || quota == -1) 7927 return RUNTIME_INF; 7928 7929 return to_ratio(period, quota); 7930} 7931 7932static int tg_cfs_schedulable_down(struct task_group *tg, void *data) 7933{ 7934 struct cfs_schedulable_data *d = data; 7935 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 7936 s64 quota = 0, parent_quota = -1; 7937 7938 if (!tg->parent) { 7939 quota = RUNTIME_INF; 7940 } else { 7941 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth; 7942 7943 quota = normalize_cfs_quota(tg, d); 7944 parent_quota = parent_b->hierarchal_quota; 7945 7946 /* 7947 * ensure max(child_quota) <= parent_quota, inherit when no 7948 * limit is set 7949 */ 7950 if (quota == RUNTIME_INF) 7951 quota = parent_quota; 7952 else if (parent_quota != RUNTIME_INF && quota > parent_quota) 7953 return -EINVAL; 7954 } 7955 cfs_b->hierarchal_quota = quota; 7956 7957 return 0; 7958} 7959 7960static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) 7961{ 7962 int ret; 7963 struct cfs_schedulable_data data = { 7964 .tg = tg, 7965 .period = period, 7966 .quota = quota, 7967 }; 7968 7969 if (quota != RUNTIME_INF) { 7970 do_div(data.period, NSEC_PER_USEC); 7971 do_div(data.quota, NSEC_PER_USEC); 7972 } 7973 7974 rcu_read_lock(); 7975 ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data); 7976 rcu_read_unlock(); 7977 7978 return ret; 7979} 7980 7981static int cpu_stats_show(struct seq_file *sf, void *v) 7982{ 7983 struct task_group *tg = css_tg(seq_css(sf)); 7984 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 7985 7986 seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods); 7987 seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled); 7988 seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time); 7989 7990 return 0; 7991} 7992#endif /* CONFIG_CFS_BANDWIDTH */ 7993#endif /* CONFIG_FAIR_GROUP_SCHED */ 7994 7995#ifdef CONFIG_RT_GROUP_SCHED 7996static int cpu_rt_runtime_write(struct cgroup_subsys_state *css, 7997 struct cftype *cft, s64 val) 7998{ 7999 return sched_group_set_rt_runtime(css_tg(css), val); 8000} 8001 8002static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css, 8003 struct cftype *cft) 8004{ 8005 return sched_group_rt_runtime(css_tg(css)); 8006} 8007 8008static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css, 8009 struct cftype *cftype, u64 rt_period_us) 8010{ 8011 return sched_group_set_rt_period(css_tg(css), rt_period_us); 8012} 8013 8014static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css, 8015 struct cftype *cft) 8016{ 8017 return sched_group_rt_period(css_tg(css)); 8018} 8019#endif /* CONFIG_RT_GROUP_SCHED */ 8020 8021static struct cftype cpu_files[] = { 8022#ifdef CONFIG_FAIR_GROUP_SCHED 8023 { 8024 .name = "shares", 8025 .read_u64 = cpu_shares_read_u64, 8026 .write_u64 = cpu_shares_write_u64, 8027 }, 8028#endif 8029#ifdef CONFIG_CFS_BANDWIDTH 8030 { 8031 .name = "cfs_quota_us", 8032 .read_s64 = cpu_cfs_quota_read_s64, 8033 .write_s64 = cpu_cfs_quota_write_s64, 8034 }, 8035 { 8036 .name = "cfs_period_us", 8037 .read_u64 = cpu_cfs_period_read_u64, 8038 .write_u64 = cpu_cfs_period_write_u64, 8039 }, 8040 { 8041 .name = "stat", 8042 .seq_show = cpu_stats_show, 8043 }, 8044#endif 8045#ifdef CONFIG_RT_GROUP_SCHED 8046 { 8047 .name = "rt_runtime_us", 8048 .read_s64 = cpu_rt_runtime_read, 8049 .write_s64 = cpu_rt_runtime_write, 8050 }, 8051 { 8052 .name = "rt_period_us", 8053 .read_u64 = cpu_rt_period_read_uint, 8054 .write_u64 = cpu_rt_period_write_uint, 8055 }, 8056#endif 8057 { } /* terminate */ 8058}; 8059 8060struct cgroup_subsys cpu_cgrp_subsys = { 8061 .css_alloc = cpu_cgroup_css_alloc, 8062 .css_free = cpu_cgroup_css_free, 8063 .css_online = cpu_cgroup_css_online, 8064 .css_offline = cpu_cgroup_css_offline, 8065 .can_attach = cpu_cgroup_can_attach, 8066 .attach = cpu_cgroup_attach, 8067 .exit = cpu_cgroup_exit, 8068 .base_cftypes = cpu_files, 8069 .early_init = 1, 8070}; 8071 8072#endif /* CONFIG_CGROUP_SCHED */ 8073 8074void dump_cpu_task(int cpu) 8075{ 8076 pr_info("Task dump for CPU %d:\n", cpu); 8077 sched_show_task(cpu_curr(cpu)); 8078} 8079