core.c revision 3105b86a9fee7d2c2e76edb53bbbc4027599628f
1/* 2 * kernel/sched/core.c 3 * 4 * Kernel scheduler and related syscalls 5 * 6 * Copyright (C) 1991-2002 Linus Torvalds 7 * 8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and 9 * make semaphores SMP safe 10 * 1998-11-19 Implemented schedule_timeout() and related stuff 11 * by Andrea Arcangeli 12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: 13 * hybrid priority-list and round-robin design with 14 * an array-switch method of distributing timeslices 15 * and per-CPU runqueues. Cleanups and useful suggestions 16 * by Davide Libenzi, preemptible kernel bits by Robert Love. 17 * 2003-09-03 Interactivity tuning by Con Kolivas. 18 * 2004-04-02 Scheduler domains code by Nick Piggin 19 * 2007-04-15 Work begun on replacing all interactivity tuning with a 20 * fair scheduling design by Con Kolivas. 21 * 2007-05-05 Load balancing (smp-nice) and other improvements 22 * by Peter Williams 23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith 24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri 25 * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, 26 * Thomas Gleixner, Mike Kravetz 27 */ 28 29#include <linux/mm.h> 30#include <linux/module.h> 31#include <linux/nmi.h> 32#include <linux/init.h> 33#include <linux/uaccess.h> 34#include <linux/highmem.h> 35#include <asm/mmu_context.h> 36#include <linux/interrupt.h> 37#include <linux/capability.h> 38#include <linux/completion.h> 39#include <linux/kernel_stat.h> 40#include <linux/debug_locks.h> 41#include <linux/perf_event.h> 42#include <linux/security.h> 43#include <linux/notifier.h> 44#include <linux/profile.h> 45#include <linux/freezer.h> 46#include <linux/vmalloc.h> 47#include <linux/blkdev.h> 48#include <linux/delay.h> 49#include <linux/pid_namespace.h> 50#include <linux/smp.h> 51#include <linux/threads.h> 52#include <linux/timer.h> 53#include <linux/rcupdate.h> 54#include <linux/cpu.h> 55#include <linux/cpuset.h> 56#include <linux/percpu.h> 57#include <linux/proc_fs.h> 58#include <linux/seq_file.h> 59#include <linux/sysctl.h> 60#include <linux/syscalls.h> 61#include <linux/times.h> 62#include <linux/tsacct_kern.h> 63#include <linux/kprobes.h> 64#include <linux/delayacct.h> 65#include <linux/unistd.h> 66#include <linux/pagemap.h> 67#include <linux/hrtimer.h> 68#include <linux/tick.h> 69#include <linux/debugfs.h> 70#include <linux/ctype.h> 71#include <linux/ftrace.h> 72#include <linux/slab.h> 73#include <linux/init_task.h> 74#include <linux/binfmts.h> 75 76#include <asm/switch_to.h> 77#include <asm/tlb.h> 78#include <asm/irq_regs.h> 79#include <asm/mutex.h> 80#ifdef CONFIG_PARAVIRT 81#include <asm/paravirt.h> 82#endif 83 84#include "sched.h" 85#include "../workqueue_sched.h" 86#include "../smpboot.h" 87 88#define CREATE_TRACE_POINTS 89#include <trace/events/sched.h> 90 91void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) 92{ 93 unsigned long delta; 94 ktime_t soft, hard, now; 95 96 for (;;) { 97 if (hrtimer_active(period_timer)) 98 break; 99 100 now = hrtimer_cb_get_time(period_timer); 101 hrtimer_forward(period_timer, now, period); 102 103 soft = hrtimer_get_softexpires(period_timer); 104 hard = hrtimer_get_expires(period_timer); 105 delta = ktime_to_ns(ktime_sub(hard, soft)); 106 __hrtimer_start_range_ns(period_timer, soft, delta, 107 HRTIMER_MODE_ABS_PINNED, 0); 108 } 109} 110 111DEFINE_MUTEX(sched_domains_mutex); 112DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 113 114static void update_rq_clock_task(struct rq *rq, s64 delta); 115 116void 
update_rq_clock(struct rq *rq) 117{ 118 s64 delta; 119 120 if (rq->skip_clock_update > 0) 121 return; 122 123 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; 124 rq->clock += delta; 125 update_rq_clock_task(rq, delta); 126} 127 128/* 129 * Debugging: various feature bits 130 */ 131 132#define SCHED_FEAT(name, enabled) \ 133 (1UL << __SCHED_FEAT_##name) * enabled | 134 135const_debug unsigned int sysctl_sched_features = 136#include "features.h" 137 0; 138 139#undef SCHED_FEAT 140 141#ifdef CONFIG_SCHED_DEBUG 142#define SCHED_FEAT(name, enabled) \ 143 #name , 144 145static const char * const sched_feat_names[] = { 146#include "features.h" 147}; 148 149#undef SCHED_FEAT 150 151static int sched_feat_show(struct seq_file *m, void *v) 152{ 153 int i; 154 155 for (i = 0; i < __SCHED_FEAT_NR; i++) { 156 if (!(sysctl_sched_features & (1UL << i))) 157 seq_puts(m, "NO_"); 158 seq_printf(m, "%s ", sched_feat_names[i]); 159 } 160 seq_puts(m, "\n"); 161 162 return 0; 163} 164 165#ifdef HAVE_JUMP_LABEL 166 167#define jump_label_key__true STATIC_KEY_INIT_TRUE 168#define jump_label_key__false STATIC_KEY_INIT_FALSE 169 170#define SCHED_FEAT(name, enabled) \ 171 jump_label_key__##enabled , 172 173struct static_key sched_feat_keys[__SCHED_FEAT_NR] = { 174#include "features.h" 175}; 176 177#undef SCHED_FEAT 178 179static void sched_feat_disable(int i) 180{ 181 if (static_key_enabled(&sched_feat_keys[i])) 182 static_key_slow_dec(&sched_feat_keys[i]); 183} 184 185static void sched_feat_enable(int i) 186{ 187 if (!static_key_enabled(&sched_feat_keys[i])) 188 static_key_slow_inc(&sched_feat_keys[i]); 189} 190#else 191static void sched_feat_disable(int i) { }; 192static void sched_feat_enable(int i) { }; 193#endif /* HAVE_JUMP_LABEL */ 194 195static int sched_feat_set(char *cmp) 196{ 197 int i; 198 int neg = 0; 199 200 if (strncmp(cmp, "NO_", 3) == 0) { 201 neg = 1; 202 cmp += 3; 203 } 204 205 for (i = 0; i < __SCHED_FEAT_NR; i++) { 206 if (strcmp(cmp, sched_feat_names[i]) == 0) { 207 if (neg) { 208 sysctl_sched_features &= ~(1UL << i); 209 sched_feat_disable(i); 210 } else { 211 sysctl_sched_features |= (1UL << i); 212 sched_feat_enable(i); 213 } 214 break; 215 } 216 } 217 218 return i; 219} 220 221static ssize_t 222sched_feat_write(struct file *filp, const char __user *ubuf, 223 size_t cnt, loff_t *ppos) 224{ 225 char buf[64]; 226 char *cmp; 227 int i; 228 229 if (cnt > 63) 230 cnt = 63; 231 232 if (copy_from_user(&buf, ubuf, cnt)) 233 return -EFAULT; 234 235 buf[cnt] = 0; 236 cmp = strstrip(buf); 237 238 i = sched_feat_set(cmp); 239 if (i == __SCHED_FEAT_NR) 240 return -EINVAL; 241 242 *ppos += cnt; 243 244 return cnt; 245} 246 247static int sched_feat_open(struct inode *inode, struct file *filp) 248{ 249 return single_open(filp, sched_feat_show, NULL); 250} 251 252static const struct file_operations sched_feat_fops = { 253 .open = sched_feat_open, 254 .write = sched_feat_write, 255 .read = seq_read, 256 .llseek = seq_lseek, 257 .release = single_release, 258}; 259 260static __init int sched_init_debug(void) 261{ 262 debugfs_create_file("sched_features", 0644, NULL, NULL, 263 &sched_feat_fops); 264 265 return 0; 266} 267late_initcall(sched_init_debug); 268#endif /* CONFIG_SCHED_DEBUG */ 269 270/* 271 * Number of tasks to iterate in a single balance run. 272 * Limited because this is done with IRQs disabled. 273 */ 274const_debug unsigned int sysctl_sched_nr_migrate = 32; 275 276/* 277 * period over which we average the RT time consumption, measured 278 * in ms. 
279 * 280 * default: 1s 281 */ 282const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; 283 284/* 285 * period over which we measure -rt task cpu usage in us. 286 * default: 1s 287 */ 288unsigned int sysctl_sched_rt_period = 1000000; 289 290__read_mostly int scheduler_running; 291 292/* 293 * part of the period that we allow rt tasks to run in us. 294 * default: 0.95s 295 */ 296int sysctl_sched_rt_runtime = 950000; 297 298 299 300/* 301 * __task_rq_lock - lock the rq @p resides on. 302 */ 303static inline struct rq *__task_rq_lock(struct task_struct *p) 304 __acquires(rq->lock) 305{ 306 struct rq *rq; 307 308 lockdep_assert_held(&p->pi_lock); 309 310 for (;;) { 311 rq = task_rq(p); 312 raw_spin_lock(&rq->lock); 313 if (likely(rq == task_rq(p))) 314 return rq; 315 raw_spin_unlock(&rq->lock); 316 } 317} 318 319/* 320 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. 321 */ 322static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) 323 __acquires(p->pi_lock) 324 __acquires(rq->lock) 325{ 326 struct rq *rq; 327 328 for (;;) { 329 raw_spin_lock_irqsave(&p->pi_lock, *flags); 330 rq = task_rq(p); 331 raw_spin_lock(&rq->lock); 332 if (likely(rq == task_rq(p))) 333 return rq; 334 raw_spin_unlock(&rq->lock); 335 raw_spin_unlock_irqrestore(&p->pi_lock, *flags); 336 } 337} 338 339static void __task_rq_unlock(struct rq *rq) 340 __releases(rq->lock) 341{ 342 raw_spin_unlock(&rq->lock); 343} 344 345static inline void 346task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) 347 __releases(rq->lock) 348 __releases(p->pi_lock) 349{ 350 raw_spin_unlock(&rq->lock); 351 raw_spin_unlock_irqrestore(&p->pi_lock, *flags); 352} 353 354/* 355 * this_rq_lock - lock this runqueue and disable interrupts. 356 */ 357static struct rq *this_rq_lock(void) 358 __acquires(rq->lock) 359{ 360 struct rq *rq; 361 362 local_irq_disable(); 363 rq = this_rq(); 364 raw_spin_lock(&rq->lock); 365 366 return rq; 367} 368 369#ifdef CONFIG_SCHED_HRTICK 370/* 371 * Use HR-timers to deliver accurate preemption points. 372 * 373 * Its all a bit involved since we cannot program an hrt while holding the 374 * rq->lock. So what we do is store a state in in rq->hrtick_* and ask for a 375 * reschedule event. 376 * 377 * When we get rescheduled we reprogram the hrtick_timer outside of the 378 * rq->lock. 379 */ 380 381static void hrtick_clear(struct rq *rq) 382{ 383 if (hrtimer_active(&rq->hrtick_timer)) 384 hrtimer_cancel(&rq->hrtick_timer); 385} 386 387/* 388 * High-resolution timer tick. 389 * Runs from hardirq context with interrupts disabled. 390 */ 391static enum hrtimer_restart hrtick(struct hrtimer *timer) 392{ 393 struct rq *rq = container_of(timer, struct rq, hrtick_timer); 394 395 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); 396 397 raw_spin_lock(&rq->lock); 398 update_rq_clock(rq); 399 rq->curr->sched_class->task_tick(rq, rq->curr, 1); 400 raw_spin_unlock(&rq->lock); 401 402 return HRTIMER_NORESTART; 403} 404 405#ifdef CONFIG_SMP 406/* 407 * called from hardirq (IPI) context 408 */ 409static void __hrtick_start(void *arg) 410{ 411 struct rq *rq = arg; 412 413 raw_spin_lock(&rq->lock); 414 hrtimer_restart(&rq->hrtick_timer); 415 rq->hrtick_csd_pending = 0; 416 raw_spin_unlock(&rq->lock); 417} 418 419/* 420 * Called to set the hrtick timer state. 
421 * 422 * called with rq->lock held and irqs disabled 423 */ 424void hrtick_start(struct rq *rq, u64 delay) 425{ 426 struct hrtimer *timer = &rq->hrtick_timer; 427 ktime_t time = ktime_add_ns(timer->base->get_time(), delay); 428 429 hrtimer_set_expires(timer, time); 430 431 if (rq == this_rq()) { 432 hrtimer_restart(timer); 433 } else if (!rq->hrtick_csd_pending) { 434 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0); 435 rq->hrtick_csd_pending = 1; 436 } 437} 438 439static int 440hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) 441{ 442 int cpu = (int)(long)hcpu; 443 444 switch (action) { 445 case CPU_UP_CANCELED: 446 case CPU_UP_CANCELED_FROZEN: 447 case CPU_DOWN_PREPARE: 448 case CPU_DOWN_PREPARE_FROZEN: 449 case CPU_DEAD: 450 case CPU_DEAD_FROZEN: 451 hrtick_clear(cpu_rq(cpu)); 452 return NOTIFY_OK; 453 } 454 455 return NOTIFY_DONE; 456} 457 458static __init void init_hrtick(void) 459{ 460 hotcpu_notifier(hotplug_hrtick, 0); 461} 462#else 463/* 464 * Called to set the hrtick timer state. 465 * 466 * called with rq->lock held and irqs disabled 467 */ 468void hrtick_start(struct rq *rq, u64 delay) 469{ 470 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, 471 HRTIMER_MODE_REL_PINNED, 0); 472} 473 474static inline void init_hrtick(void) 475{ 476} 477#endif /* CONFIG_SMP */ 478 479static void init_rq_hrtick(struct rq *rq) 480{ 481#ifdef CONFIG_SMP 482 rq->hrtick_csd_pending = 0; 483 484 rq->hrtick_csd.flags = 0; 485 rq->hrtick_csd.func = __hrtick_start; 486 rq->hrtick_csd.info = rq; 487#endif 488 489 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 490 rq->hrtick_timer.function = hrtick; 491} 492#else /* CONFIG_SCHED_HRTICK */ 493static inline void hrtick_clear(struct rq *rq) 494{ 495} 496 497static inline void init_rq_hrtick(struct rq *rq) 498{ 499} 500 501static inline void init_hrtick(void) 502{ 503} 504#endif /* CONFIG_SCHED_HRTICK */ 505 506/* 507 * resched_task - mark a task 'to be rescheduled now'. 508 * 509 * On UP this means the setting of the need_resched flag, on SMP it 510 * might also involve a cross-CPU call to trigger the scheduler on 511 * the target CPU. 512 */ 513#ifdef CONFIG_SMP 514 515#ifndef tsk_is_polling 516#define tsk_is_polling(t) 0 517#endif 518 519void resched_task(struct task_struct *p) 520{ 521 int cpu; 522 523 assert_raw_spin_locked(&task_rq(p)->lock); 524 525 if (test_tsk_need_resched(p)) 526 return; 527 528 set_tsk_need_resched(p); 529 530 cpu = task_cpu(p); 531 if (cpu == smp_processor_id()) 532 return; 533 534 /* NEED_RESCHED must be visible before we test polling */ 535 smp_mb(); 536 if (!tsk_is_polling(p)) 537 smp_send_reschedule(cpu); 538} 539 540void resched_cpu(int cpu) 541{ 542 struct rq *rq = cpu_rq(cpu); 543 unsigned long flags; 544 545 if (!raw_spin_trylock_irqsave(&rq->lock, flags)) 546 return; 547 resched_task(cpu_curr(cpu)); 548 raw_spin_unlock_irqrestore(&rq->lock, flags); 549} 550 551#ifdef CONFIG_NO_HZ 552/* 553 * In the semi idle case, use the nearest busy cpu for migrating timers 554 * from an idle cpu. This is good for power-savings. 555 * 556 * We don't do similar optimization for completely idle system, as 557 * selecting an idle cpu will add more delays to the timers than intended 558 * (as that cpu's timer base may not be uptodate wrt jiffies etc). 
559 */ 560int get_nohz_timer_target(void) 561{ 562 int cpu = smp_processor_id(); 563 int i; 564 struct sched_domain *sd; 565 566 rcu_read_lock(); 567 for_each_domain(cpu, sd) { 568 for_each_cpu(i, sched_domain_span(sd)) { 569 if (!idle_cpu(i)) { 570 cpu = i; 571 goto unlock; 572 } 573 } 574 } 575unlock: 576 rcu_read_unlock(); 577 return cpu; 578} 579/* 580 * When add_timer_on() enqueues a timer into the timer wheel of an 581 * idle CPU then this timer might expire before the next timer event 582 * which is scheduled to wake up that CPU. In case of a completely 583 * idle system the next event might even be infinite time into the 584 * future. wake_up_idle_cpu() ensures that the CPU is woken up and 585 * leaves the inner idle loop so the newly added timer is taken into 586 * account when the CPU goes back to idle and evaluates the timer 587 * wheel for the next timer event. 588 */ 589void wake_up_idle_cpu(int cpu) 590{ 591 struct rq *rq = cpu_rq(cpu); 592 593 if (cpu == smp_processor_id()) 594 return; 595 596 /* 597 * This is safe, as this function is called with the timer 598 * wheel base lock of (cpu) held. When the CPU is on the way 599 * to idle and has not yet set rq->curr to idle then it will 600 * be serialized on the timer wheel base lock and take the new 601 * timer into account automatically. 602 */ 603 if (rq->curr != rq->idle) 604 return; 605 606 /* 607 * We can set TIF_RESCHED on the idle task of the other CPU 608 * lockless. The worst case is that the other CPU runs the 609 * idle task through an additional NOOP schedule() 610 */ 611 set_tsk_need_resched(rq->idle); 612 613 /* NEED_RESCHED must be visible before we test polling */ 614 smp_mb(); 615 if (!tsk_is_polling(rq->idle)) 616 smp_send_reschedule(cpu); 617} 618 619static inline bool got_nohz_idle_kick(void) 620{ 621 int cpu = smp_processor_id(); 622 return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); 623} 624 625#else /* CONFIG_NO_HZ */ 626 627static inline bool got_nohz_idle_kick(void) 628{ 629 return false; 630} 631 632#endif /* CONFIG_NO_HZ */ 633 634void sched_avg_update(struct rq *rq) 635{ 636 s64 period = sched_avg_period(); 637 638 while ((s64)(rq->clock - rq->age_stamp) > period) { 639 /* 640 * Inline assembly required to prevent the compiler 641 * optimising this loop into a divmod call. 642 * See __iter_div_u64_rem() for another example of this. 643 */ 644 asm("" : "+rm" (rq->age_stamp)); 645 rq->age_stamp += period; 646 rq->rt_avg /= 2; 647 } 648} 649 650#else /* !CONFIG_SMP */ 651void resched_task(struct task_struct *p) 652{ 653 assert_raw_spin_locked(&task_rq(p)->lock); 654 set_tsk_need_resched(p); 655} 656#endif /* CONFIG_SMP */ 657 658#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ 659 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH))) 660/* 661 * Iterate task_group tree rooted at *from, calling @down when first entering a 662 * node and @up when leaving it for the final time. 663 * 664 * Caller must hold rcu_lock or sufficient equivalent. 
665 */ 666int walk_tg_tree_from(struct task_group *from, 667 tg_visitor down, tg_visitor up, void *data) 668{ 669 struct task_group *parent, *child; 670 int ret; 671 672 parent = from; 673 674down: 675 ret = (*down)(parent, data); 676 if (ret) 677 goto out; 678 list_for_each_entry_rcu(child, &parent->children, siblings) { 679 parent = child; 680 goto down; 681 682up: 683 continue; 684 } 685 ret = (*up)(parent, data); 686 if (ret || parent == from) 687 goto out; 688 689 child = parent; 690 parent = parent->parent; 691 if (parent) 692 goto up; 693out: 694 return ret; 695} 696 697int tg_nop(struct task_group *tg, void *data) 698{ 699 return 0; 700} 701#endif 702 703static void set_load_weight(struct task_struct *p) 704{ 705 int prio = p->static_prio - MAX_RT_PRIO; 706 struct load_weight *load = &p->se.load; 707 708 /* 709 * SCHED_IDLE tasks get minimal weight: 710 */ 711 if (p->policy == SCHED_IDLE) { 712 load->weight = scale_load(WEIGHT_IDLEPRIO); 713 load->inv_weight = WMULT_IDLEPRIO; 714 return; 715 } 716 717 load->weight = scale_load(prio_to_weight[prio]); 718 load->inv_weight = prio_to_wmult[prio]; 719} 720 721static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 722{ 723 update_rq_clock(rq); 724 sched_info_queued(p); 725 p->sched_class->enqueue_task(rq, p, flags); 726} 727 728static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 729{ 730 update_rq_clock(rq); 731 sched_info_dequeued(p); 732 p->sched_class->dequeue_task(rq, p, flags); 733} 734 735void activate_task(struct rq *rq, struct task_struct *p, int flags) 736{ 737 if (task_contributes_to_load(p)) 738 rq->nr_uninterruptible--; 739 740 enqueue_task(rq, p, flags); 741} 742 743void deactivate_task(struct rq *rq, struct task_struct *p, int flags) 744{ 745 if (task_contributes_to_load(p)) 746 rq->nr_uninterruptible++; 747 748 dequeue_task(rq, p, flags); 749} 750 751static void update_rq_clock_task(struct rq *rq, s64 delta) 752{ 753/* 754 * In theory, the compile should just see 0 here, and optimize out the call 755 * to sched_rt_avg_update. But I don't trust it... 756 */ 757#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) 758 s64 steal = 0, irq_delta = 0; 759#endif 760#ifdef CONFIG_IRQ_TIME_ACCOUNTING 761 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; 762 763 /* 764 * Since irq_time is only updated on {soft,}irq_exit, we might run into 765 * this case when a previous update_rq_clock() happened inside a 766 * {soft,}irq region. 767 * 768 * When this happens, we stop ->clock_task and only update the 769 * prev_irq_time stamp to account for the part that fit, so that a next 770 * update will consume the rest. This ensures ->clock_task is 771 * monotonic. 772 * 773 * It does however cause some slight miss-attribution of {soft,}irq 774 * time, a more accurate solution would be to update the irq_time using 775 * the current rq->clock timestamp, except that would require using 776 * atomic ops. 
777 */ 778 if (irq_delta > delta) 779 irq_delta = delta; 780 781 rq->prev_irq_time += irq_delta; 782 delta -= irq_delta; 783#endif 784#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING 785 if (static_key_false((¶virt_steal_rq_enabled))) { 786 u64 st; 787 788 steal = paravirt_steal_clock(cpu_of(rq)); 789 steal -= rq->prev_steal_time_rq; 790 791 if (unlikely(steal > delta)) 792 steal = delta; 793 794 st = steal_ticks(steal); 795 steal = st * TICK_NSEC; 796 797 rq->prev_steal_time_rq += steal; 798 799 delta -= steal; 800 } 801#endif 802 803 rq->clock_task += delta; 804 805#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) 806 if ((irq_delta + steal) && sched_feat(NONTASK_POWER)) 807 sched_rt_avg_update(rq, irq_delta + steal); 808#endif 809} 810 811void sched_set_stop_task(int cpu, struct task_struct *stop) 812{ 813 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; 814 struct task_struct *old_stop = cpu_rq(cpu)->stop; 815 816 if (stop) { 817 /* 818 * Make it appear like a SCHED_FIFO task, its something 819 * userspace knows about and won't get confused about. 820 * 821 * Also, it will make PI more or less work without too 822 * much confusion -- but then, stop work should not 823 * rely on PI working anyway. 824 */ 825 sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m); 826 827 stop->sched_class = &stop_sched_class; 828 } 829 830 cpu_rq(cpu)->stop = stop; 831 832 if (old_stop) { 833 /* 834 * Reset it back to a normal scheduling class so that 835 * it can die in pieces. 836 */ 837 old_stop->sched_class = &rt_sched_class; 838 } 839} 840 841/* 842 * __normal_prio - return the priority that is based on the static prio 843 */ 844static inline int __normal_prio(struct task_struct *p) 845{ 846 return p->static_prio; 847} 848 849/* 850 * Calculate the expected normal priority: i.e. priority 851 * without taking RT-inheritance into account. Might be 852 * boosted by interactivity modifiers. Changes upon fork, 853 * setprio syscalls, and whenever the interactivity 854 * estimator recalculates. 855 */ 856static inline int normal_prio(struct task_struct *p) 857{ 858 int prio; 859 860 if (task_has_rt_policy(p)) 861 prio = MAX_RT_PRIO-1 - p->rt_priority; 862 else 863 prio = __normal_prio(p); 864 return prio; 865} 866 867/* 868 * Calculate the current priority, i.e. the priority 869 * taken into account by the scheduler. This value might 870 * be boosted by RT tasks, or might be boosted by 871 * interactivity modifiers. Will be RT if the task got 872 * RT-boosted. If not then it returns p->normal_prio. 873 */ 874static int effective_prio(struct task_struct *p) 875{ 876 p->normal_prio = normal_prio(p); 877 /* 878 * If we are RT tasks or we were boosted to RT priority, 879 * keep the priority unchanged. Otherwise, update priority 880 * to the normal priority: 881 */ 882 if (!rt_prio(p->prio)) 883 return p->normal_prio; 884 return p->prio; 885} 886 887/** 888 * task_curr - is this task currently executing on a CPU? 889 * @p: the task in question. 
890 */ 891inline int task_curr(const struct task_struct *p) 892{ 893 return cpu_curr(task_cpu(p)) == p; 894} 895 896static inline void check_class_changed(struct rq *rq, struct task_struct *p, 897 const struct sched_class *prev_class, 898 int oldprio) 899{ 900 if (prev_class != p->sched_class) { 901 if (prev_class->switched_from) 902 prev_class->switched_from(rq, p); 903 p->sched_class->switched_to(rq, p); 904 } else if (oldprio != p->prio) 905 p->sched_class->prio_changed(rq, p, oldprio); 906} 907 908void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) 909{ 910 const struct sched_class *class; 911 912 if (p->sched_class == rq->curr->sched_class) { 913 rq->curr->sched_class->check_preempt_curr(rq, p, flags); 914 } else { 915 for_each_class(class) { 916 if (class == rq->curr->sched_class) 917 break; 918 if (class == p->sched_class) { 919 resched_task(rq->curr); 920 break; 921 } 922 } 923 } 924 925 /* 926 * A queue event has occurred, and we're going to schedule. In 927 * this case, we can save a useless back to back clock update. 928 */ 929 if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) 930 rq->skip_clock_update = 1; 931} 932 933#ifdef CONFIG_SMP 934void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 935{ 936#ifdef CONFIG_SCHED_DEBUG 937 /* 938 * We should never call set_task_cpu() on a blocked task, 939 * ttwu() will sort out the placement. 940 */ 941 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && 942 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); 943 944#ifdef CONFIG_LOCKDEP 945 /* 946 * The caller should hold either p->pi_lock or rq->lock, when changing 947 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. 948 * 949 * sched_move_task() holds both and thus holding either pins the cgroup, 950 * see task_group(). 951 * 952 * Furthermore, all task_rq users should acquire both locks, see 953 * task_rq_lock(). 954 */ 955 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || 956 lockdep_is_held(&task_rq(p)->lock))); 957#endif 958#endif 959 960 trace_sched_migrate_task(p, new_cpu); 961 962 if (task_cpu(p) != new_cpu) { 963 p->se.nr_migrations++; 964 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); 965 } 966 967 __set_task_cpu(p, new_cpu); 968} 969 970struct migration_arg { 971 struct task_struct *task; 972 int dest_cpu; 973}; 974 975static int migration_cpu_stop(void *data); 976 977/* 978 * wait_task_inactive - wait for a thread to unschedule. 979 * 980 * If @match_state is nonzero, it's the @p->state value just checked and 981 * not expected to change. If it changes, i.e. @p might have woken up, 982 * then return zero. When we succeed in waiting for @p to be off its CPU, 983 * we return a positive number (its total switch count). If a second call 984 * a short while later returns the same number, the caller can be sure that 985 * @p has remained unscheduled the whole time. 986 * 987 * The caller must ensure that the task *will* unschedule sometime soon, 988 * else this function might spin for a *long* time. This function can't 989 * be called with interrupts off, or it may introduce deadlock with 990 * smp_call_function() if an IPI is sent by the same process we are 991 * waiting to become inactive. 
992 */ 993unsigned long wait_task_inactive(struct task_struct *p, long match_state) 994{ 995 unsigned long flags; 996 int running, on_rq; 997 unsigned long ncsw; 998 struct rq *rq; 999 1000 for (;;) { 1001 /* 1002 * We do the initial early heuristics without holding 1003 * any task-queue locks at all. We'll only try to get 1004 * the runqueue lock when things look like they will 1005 * work out! 1006 */ 1007 rq = task_rq(p); 1008 1009 /* 1010 * If the task is actively running on another CPU 1011 * still, just relax and busy-wait without holding 1012 * any locks. 1013 * 1014 * NOTE! Since we don't hold any locks, it's not 1015 * even sure that "rq" stays as the right runqueue! 1016 * But we don't care, since "task_running()" will 1017 * return false if the runqueue has changed and p 1018 * is actually now running somewhere else! 1019 */ 1020 while (task_running(rq, p)) { 1021 if (match_state && unlikely(p->state != match_state)) 1022 return 0; 1023 cpu_relax(); 1024 } 1025 1026 /* 1027 * Ok, time to look more closely! We need the rq 1028 * lock now, to be *sure*. If we're wrong, we'll 1029 * just go back and repeat. 1030 */ 1031 rq = task_rq_lock(p, &flags); 1032 trace_sched_wait_task(p); 1033 running = task_running(rq, p); 1034 on_rq = p->on_rq; 1035 ncsw = 0; 1036 if (!match_state || p->state == match_state) 1037 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ 1038 task_rq_unlock(rq, p, &flags); 1039 1040 /* 1041 * If it changed from the expected state, bail out now. 1042 */ 1043 if (unlikely(!ncsw)) 1044 break; 1045 1046 /* 1047 * Was it really running after all now that we 1048 * checked with the proper locks actually held? 1049 * 1050 * Oops. Go back and try again.. 1051 */ 1052 if (unlikely(running)) { 1053 cpu_relax(); 1054 continue; 1055 } 1056 1057 /* 1058 * It's not enough that it's not actively running, 1059 * it must be off the runqueue _entirely_, and not 1060 * preempted! 1061 * 1062 * So if it was still runnable (but just not actively 1063 * running right now), it's preempted, and we should 1064 * yield - it could be a while. 1065 */ 1066 if (unlikely(on_rq)) { 1067 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); 1068 1069 set_current_state(TASK_UNINTERRUPTIBLE); 1070 schedule_hrtimeout(&to, HRTIMER_MODE_REL); 1071 continue; 1072 } 1073 1074 /* 1075 * Ahh, all good. It wasn't running, and it wasn't 1076 * runnable, which means that it will never become 1077 * running in the future either. We're all done! 1078 */ 1079 break; 1080 } 1081 1082 return ncsw; 1083} 1084 1085/*** 1086 * kick_process - kick a running thread to enter/exit the kernel 1087 * @p: the to-be-kicked thread 1088 * 1089 * Cause a process which is running on another CPU to enter 1090 * kernel-mode, without any delay. (to get signals handled.) 1091 * 1092 * NOTE: this function doesn't have to take the runqueue lock, 1093 * because all it wants to ensure is that the remote task enters 1094 * the kernel. If the IPI races and the task has been migrated 1095 * to another CPU then no harm is done and the purpose has been 1096 * achieved as well. 
1097 */ 1098void kick_process(struct task_struct *p) 1099{ 1100 int cpu; 1101 1102 preempt_disable(); 1103 cpu = task_cpu(p); 1104 if ((cpu != smp_processor_id()) && task_curr(p)) 1105 smp_send_reschedule(cpu); 1106 preempt_enable(); 1107} 1108EXPORT_SYMBOL_GPL(kick_process); 1109#endif /* CONFIG_SMP */ 1110 1111#ifdef CONFIG_SMP 1112/* 1113 * ->cpus_allowed is protected by both rq->lock and p->pi_lock 1114 */ 1115static int select_fallback_rq(int cpu, struct task_struct *p) 1116{ 1117 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); 1118 enum { cpuset, possible, fail } state = cpuset; 1119 int dest_cpu; 1120 1121 /* Look for allowed, online CPU in same node. */ 1122 for_each_cpu(dest_cpu, nodemask) { 1123 if (!cpu_online(dest_cpu)) 1124 continue; 1125 if (!cpu_active(dest_cpu)) 1126 continue; 1127 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) 1128 return dest_cpu; 1129 } 1130 1131 for (;;) { 1132 /* Any allowed, online CPU? */ 1133 for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) { 1134 if (!cpu_online(dest_cpu)) 1135 continue; 1136 if (!cpu_active(dest_cpu)) 1137 continue; 1138 goto out; 1139 } 1140 1141 switch (state) { 1142 case cpuset: 1143 /* No more Mr. Nice Guy. */ 1144 cpuset_cpus_allowed_fallback(p); 1145 state = possible; 1146 break; 1147 1148 case possible: 1149 do_set_cpus_allowed(p, cpu_possible_mask); 1150 state = fail; 1151 break; 1152 1153 case fail: 1154 BUG(); 1155 break; 1156 } 1157 } 1158 1159out: 1160 if (state != cpuset) { 1161 /* 1162 * Don't tell them about moving exiting tasks or 1163 * kernel threads (both mm NULL), since they never 1164 * leave kernel. 1165 */ 1166 if (p->mm && printk_ratelimit()) { 1167 printk_sched("process %d (%s) no longer affine to cpu%d\n", 1168 task_pid_nr(p), p->comm, cpu); 1169 } 1170 } 1171 1172 return dest_cpu; 1173} 1174 1175/* 1176 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. 1177 */ 1178static inline 1179int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) 1180{ 1181 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); 1182 1183 /* 1184 * In order not to call set_task_cpu() on a blocking task we need 1185 * to rely on ttwu() to place the task on a valid ->cpus_allowed 1186 * cpu. 1187 * 1188 * Since this is common to all placement strategies, this lives here. 
1189 * 1190 * [ this allows ->select_task() to simply return task_cpu(p) and 1191 * not worry about this generic constraint ] 1192 */ 1193 if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) || 1194 !cpu_online(cpu))) 1195 cpu = select_fallback_rq(task_cpu(p), p); 1196 1197 return cpu; 1198} 1199 1200static void update_avg(u64 *avg, u64 sample) 1201{ 1202 s64 diff = sample - *avg; 1203 *avg += diff >> 3; 1204} 1205#endif 1206 1207static void 1208ttwu_stat(struct task_struct *p, int cpu, int wake_flags) 1209{ 1210#ifdef CONFIG_SCHEDSTATS 1211 struct rq *rq = this_rq(); 1212 1213#ifdef CONFIG_SMP 1214 int this_cpu = smp_processor_id(); 1215 1216 if (cpu == this_cpu) { 1217 schedstat_inc(rq, ttwu_local); 1218 schedstat_inc(p, se.statistics.nr_wakeups_local); 1219 } else { 1220 struct sched_domain *sd; 1221 1222 schedstat_inc(p, se.statistics.nr_wakeups_remote); 1223 rcu_read_lock(); 1224 for_each_domain(this_cpu, sd) { 1225 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { 1226 schedstat_inc(sd, ttwu_wake_remote); 1227 break; 1228 } 1229 } 1230 rcu_read_unlock(); 1231 } 1232 1233 if (wake_flags & WF_MIGRATED) 1234 schedstat_inc(p, se.statistics.nr_wakeups_migrate); 1235 1236#endif /* CONFIG_SMP */ 1237 1238 schedstat_inc(rq, ttwu_count); 1239 schedstat_inc(p, se.statistics.nr_wakeups); 1240 1241 if (wake_flags & WF_SYNC) 1242 schedstat_inc(p, se.statistics.nr_wakeups_sync); 1243 1244#endif /* CONFIG_SCHEDSTATS */ 1245} 1246 1247static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) 1248{ 1249 activate_task(rq, p, en_flags); 1250 p->on_rq = 1; 1251 1252 /* if a worker is waking up, notify workqueue */ 1253 if (p->flags & PF_WQ_WORKER) 1254 wq_worker_waking_up(p, cpu_of(rq)); 1255} 1256 1257/* 1258 * Mark the task runnable and perform wakeup-preemption. 1259 */ 1260static void 1261ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) 1262{ 1263 trace_sched_wakeup(p, true); 1264 check_preempt_curr(rq, p, wake_flags); 1265 1266 p->state = TASK_RUNNING; 1267#ifdef CONFIG_SMP 1268 if (p->sched_class->task_woken) 1269 p->sched_class->task_woken(rq, p); 1270 1271 if (rq->idle_stamp) { 1272 u64 delta = rq->clock - rq->idle_stamp; 1273 u64 max = 2*sysctl_sched_migration_cost; 1274 1275 if (delta > max) 1276 rq->avg_idle = max; 1277 else 1278 update_avg(&rq->avg_idle, delta); 1279 rq->idle_stamp = 0; 1280 } 1281#endif 1282} 1283 1284static void 1285ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) 1286{ 1287#ifdef CONFIG_SMP 1288 if (p->sched_contributes_to_load) 1289 rq->nr_uninterruptible--; 1290#endif 1291 1292 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING); 1293 ttwu_do_wakeup(rq, p, wake_flags); 1294} 1295 1296/* 1297 * Called in case the task @p isn't fully descheduled from its runqueue, 1298 * in this case we must do a remote wakeup. Its a 'light' wakeup though, 1299 * since all we need to do is flip p->state to TASK_RUNNING, since 1300 * the task is still ->on_rq. 
1301 */ 1302static int ttwu_remote(struct task_struct *p, int wake_flags) 1303{ 1304 struct rq *rq; 1305 int ret = 0; 1306 1307 rq = __task_rq_lock(p); 1308 if (p->on_rq) { 1309 ttwu_do_wakeup(rq, p, wake_flags); 1310 ret = 1; 1311 } 1312 __task_rq_unlock(rq); 1313 1314 return ret; 1315} 1316 1317#ifdef CONFIG_SMP 1318static void sched_ttwu_pending(void) 1319{ 1320 struct rq *rq = this_rq(); 1321 struct llist_node *llist = llist_del_all(&rq->wake_list); 1322 struct task_struct *p; 1323 1324 raw_spin_lock(&rq->lock); 1325 1326 while (llist) { 1327 p = llist_entry(llist, struct task_struct, wake_entry); 1328 llist = llist_next(llist); 1329 ttwu_do_activate(rq, p, 0); 1330 } 1331 1332 raw_spin_unlock(&rq->lock); 1333} 1334 1335void scheduler_ipi(void) 1336{ 1337 if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) 1338 return; 1339 1340 /* 1341 * Not all reschedule IPI handlers call irq_enter/irq_exit, since 1342 * traditionally all their work was done from the interrupt return 1343 * path. Now that we actually do some work, we need to make sure 1344 * we do call them. 1345 * 1346 * Some archs already do call them, luckily irq_enter/exit nest 1347 * properly. 1348 * 1349 * Arguably we should visit all archs and update all handlers, 1350 * however a fair share of IPIs are still resched only so this would 1351 * somewhat pessimize the simple resched case. 1352 */ 1353 irq_enter(); 1354 sched_ttwu_pending(); 1355 1356 /* 1357 * Check if someone kicked us for doing the nohz idle load balance. 1358 */ 1359 if (unlikely(got_nohz_idle_kick() && !need_resched())) { 1360 this_rq()->idle_balance = 1; 1361 raise_softirq_irqoff(SCHED_SOFTIRQ); 1362 } 1363 irq_exit(); 1364} 1365 1366static void ttwu_queue_remote(struct task_struct *p, int cpu) 1367{ 1368 if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) 1369 smp_send_reschedule(cpu); 1370} 1371 1372bool cpus_share_cache(int this_cpu, int that_cpu) 1373{ 1374 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); 1375} 1376#endif /* CONFIG_SMP */ 1377 1378static void ttwu_queue(struct task_struct *p, int cpu) 1379{ 1380 struct rq *rq = cpu_rq(cpu); 1381 1382#if defined(CONFIG_SMP) 1383 if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { 1384 sched_clock_cpu(cpu); /* sync clocks x-cpu */ 1385 ttwu_queue_remote(p, cpu); 1386 return; 1387 } 1388#endif 1389 1390 raw_spin_lock(&rq->lock); 1391 ttwu_do_activate(rq, p, 0); 1392 raw_spin_unlock(&rq->lock); 1393} 1394 1395/** 1396 * try_to_wake_up - wake up a thread 1397 * @p: the thread to be awakened 1398 * @state: the mask of task states that can be woken 1399 * @wake_flags: wake modifier flags (WF_*) 1400 * 1401 * Put it on the run-queue if it's not already there. The "current" 1402 * thread is always on the run-queue (except when the actual 1403 * re-schedule is in progress), and as such you're allowed to do 1404 * the simpler "current->state = TASK_RUNNING" to mark yourself 1405 * runnable without the overhead of this. 1406 * 1407 * Returns %true if @p was woken up, %false if it was already running 1408 * or @state didn't match @p's state. 
1409 */ 1410static int 1411try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) 1412{ 1413 unsigned long flags; 1414 int cpu, success = 0; 1415 1416 smp_wmb(); 1417 raw_spin_lock_irqsave(&p->pi_lock, flags); 1418 if (!(p->state & state)) 1419 goto out; 1420 1421 success = 1; /* we're going to change ->state */ 1422 cpu = task_cpu(p); 1423 1424 if (p->on_rq && ttwu_remote(p, wake_flags)) 1425 goto stat; 1426 1427#ifdef CONFIG_SMP 1428 /* 1429 * If the owning (remote) cpu is still in the middle of schedule() with 1430 * this task as prev, wait until its done referencing the task. 1431 */ 1432 while (p->on_cpu) 1433 cpu_relax(); 1434 /* 1435 * Pairs with the smp_wmb() in finish_lock_switch(). 1436 */ 1437 smp_rmb(); 1438 1439 p->sched_contributes_to_load = !!task_contributes_to_load(p); 1440 p->state = TASK_WAKING; 1441 1442 if (p->sched_class->task_waking) 1443 p->sched_class->task_waking(p); 1444 1445 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); 1446 if (task_cpu(p) != cpu) { 1447 wake_flags |= WF_MIGRATED; 1448 set_task_cpu(p, cpu); 1449 } 1450#endif /* CONFIG_SMP */ 1451 1452 ttwu_queue(p, cpu); 1453stat: 1454 ttwu_stat(p, cpu, wake_flags); 1455out: 1456 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 1457 1458 return success; 1459} 1460 1461/** 1462 * try_to_wake_up_local - try to wake up a local task with rq lock held 1463 * @p: the thread to be awakened 1464 * 1465 * Put @p on the run-queue if it's not already there. The caller must 1466 * ensure that this_rq() is locked, @p is bound to this_rq() and not 1467 * the current task. 1468 */ 1469static void try_to_wake_up_local(struct task_struct *p) 1470{ 1471 struct rq *rq = task_rq(p); 1472 1473 BUG_ON(rq != this_rq()); 1474 BUG_ON(p == current); 1475 lockdep_assert_held(&rq->lock); 1476 1477 if (!raw_spin_trylock(&p->pi_lock)) { 1478 raw_spin_unlock(&rq->lock); 1479 raw_spin_lock(&p->pi_lock); 1480 raw_spin_lock(&rq->lock); 1481 } 1482 1483 if (!(p->state & TASK_NORMAL)) 1484 goto out; 1485 1486 if (!p->on_rq) 1487 ttwu_activate(rq, p, ENQUEUE_WAKEUP); 1488 1489 ttwu_do_wakeup(rq, p, 0); 1490 ttwu_stat(p, smp_processor_id(), 0); 1491out: 1492 raw_spin_unlock(&p->pi_lock); 1493} 1494 1495/** 1496 * wake_up_process - Wake up a specific process 1497 * @p: The process to be woken up. 1498 * 1499 * Attempt to wake up the nominated process and move it to the set of runnable 1500 * processes. Returns 1 if the process was woken up, 0 if it was already 1501 * running. 1502 * 1503 * It may be assumed that this function implies a write memory barrier before 1504 * changing the task state if and only if any tasks are woken up. 1505 */ 1506int wake_up_process(struct task_struct *p) 1507{ 1508 return try_to_wake_up(p, TASK_ALL, 0); 1509} 1510EXPORT_SYMBOL(wake_up_process); 1511 1512int wake_up_state(struct task_struct *p, unsigned int state) 1513{ 1514 return try_to_wake_up(p, state, 0); 1515} 1516 1517/* 1518 * Perform scheduler related setup for a newly forked process p. 1519 * p is forked by current. 
1520 * 1521 * __sched_fork() is basic setup used by init_idle() too: 1522 */ 1523static void __sched_fork(struct task_struct *p) 1524{ 1525 p->on_rq = 0; 1526 1527 p->se.on_rq = 0; 1528 p->se.exec_start = 0; 1529 p->se.sum_exec_runtime = 0; 1530 p->se.prev_sum_exec_runtime = 0; 1531 p->se.nr_migrations = 0; 1532 p->se.vruntime = 0; 1533 INIT_LIST_HEAD(&p->se.group_node); 1534 1535#ifdef CONFIG_SCHEDSTATS 1536 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 1537#endif 1538 1539 INIT_LIST_HEAD(&p->rt.run_list); 1540 1541#ifdef CONFIG_PREEMPT_NOTIFIERS 1542 INIT_HLIST_HEAD(&p->preempt_notifiers); 1543#endif 1544 1545#ifdef CONFIG_NUMA_BALANCING 1546 if (p->mm && atomic_read(&p->mm->mm_users) == 1) { 1547 p->mm->numa_next_scan = jiffies; 1548 p->mm->numa_next_reset = jiffies; 1549 p->mm->numa_scan_seq = 0; 1550 } 1551 1552 p->node_stamp = 0ULL; 1553 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; 1554 p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0; 1555 p->numa_scan_period = sysctl_numa_balancing_scan_delay; 1556 p->numa_work.next = &p->numa_work; 1557#endif /* CONFIG_NUMA_BALANCING */ 1558} 1559 1560#ifdef CONFIG_NUMA_BALANCING 1561#ifdef CONFIG_SCHED_DEBUG 1562void set_numabalancing_state(bool enabled) 1563{ 1564 if (enabled) 1565 sched_feat_set("NUMA"); 1566 else 1567 sched_feat_set("NO_NUMA"); 1568} 1569#else 1570__read_mostly bool numabalancing_enabled; 1571 1572void set_numabalancing_state(bool enabled) 1573{ 1574 numabalancing_enabled = enabled; 1575} 1576#endif /* CONFIG_SCHED_DEBUG */ 1577#endif /* CONFIG_NUMA_BALANCING */ 1578 1579/* 1580 * fork()/clone()-time setup: 1581 */ 1582void sched_fork(struct task_struct *p) 1583{ 1584 unsigned long flags; 1585 int cpu = get_cpu(); 1586 1587 __sched_fork(p); 1588 /* 1589 * We mark the process as running here. This guarantees that 1590 * nobody will actually run it, and a signal or other external 1591 * event cannot wake it up and insert it on the runqueue either. 1592 */ 1593 p->state = TASK_RUNNING; 1594 1595 /* 1596 * Make sure we do not leak PI boosting priority to the child. 1597 */ 1598 p->prio = current->normal_prio; 1599 1600 /* 1601 * Revert to default priority/policy on fork if requested. 1602 */ 1603 if (unlikely(p->sched_reset_on_fork)) { 1604 if (task_has_rt_policy(p)) { 1605 p->policy = SCHED_NORMAL; 1606 p->static_prio = NICE_TO_PRIO(0); 1607 p->rt_priority = 0; 1608 } else if (PRIO_TO_NICE(p->static_prio) < 0) 1609 p->static_prio = NICE_TO_PRIO(0); 1610 1611 p->prio = p->normal_prio = __normal_prio(p); 1612 set_load_weight(p); 1613 1614 /* 1615 * We don't need the reset flag anymore after the fork. It has 1616 * fulfilled its duty: 1617 */ 1618 p->sched_reset_on_fork = 0; 1619 } 1620 1621 if (!rt_prio(p->prio)) 1622 p->sched_class = &fair_sched_class; 1623 1624 if (p->sched_class->task_fork) 1625 p->sched_class->task_fork(p); 1626 1627 /* 1628 * The child is not yet in the pid-hash so no cgroup attach races, 1629 * and the cgroup is pinned to this child due to cgroup_fork() 1630 * is ran before sched_fork(). 1631 * 1632 * Silence PROVE_RCU. 1633 */ 1634 raw_spin_lock_irqsave(&p->pi_lock, flags); 1635 set_task_cpu(p, cpu); 1636 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 1637 1638#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 1639 if (likely(sched_info_on())) 1640 memset(&p->sched_info, 0, sizeof(p->sched_info)); 1641#endif 1642#if defined(CONFIG_SMP) 1643 p->on_cpu = 0; 1644#endif 1645#ifdef CONFIG_PREEMPT_COUNT 1646 /* Want to start with kernel preemption disabled. 
*/ 1647 task_thread_info(p)->preempt_count = 1; 1648#endif 1649#ifdef CONFIG_SMP 1650 plist_node_init(&p->pushable_tasks, MAX_PRIO); 1651#endif 1652 1653 put_cpu(); 1654} 1655 1656/* 1657 * wake_up_new_task - wake up a newly created task for the first time. 1658 * 1659 * This function will do some initial scheduler statistics housekeeping 1660 * that must be done for every newly created context, then puts the task 1661 * on the runqueue and wakes it. 1662 */ 1663void wake_up_new_task(struct task_struct *p) 1664{ 1665 unsigned long flags; 1666 struct rq *rq; 1667 1668 raw_spin_lock_irqsave(&p->pi_lock, flags); 1669#ifdef CONFIG_SMP 1670 /* 1671 * Fork balancing, do it here and not earlier because: 1672 * - cpus_allowed can change in the fork path 1673 * - any previously selected cpu might disappear through hotplug 1674 */ 1675 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); 1676#endif 1677 1678 rq = __task_rq_lock(p); 1679 activate_task(rq, p, 0); 1680 p->on_rq = 1; 1681 trace_sched_wakeup_new(p, true); 1682 check_preempt_curr(rq, p, WF_FORK); 1683#ifdef CONFIG_SMP 1684 if (p->sched_class->task_woken) 1685 p->sched_class->task_woken(rq, p); 1686#endif 1687 task_rq_unlock(rq, p, &flags); 1688} 1689 1690#ifdef CONFIG_PREEMPT_NOTIFIERS 1691 1692/** 1693 * preempt_notifier_register - tell me when current is being preempted & rescheduled 1694 * @notifier: notifier struct to register 1695 */ 1696void preempt_notifier_register(struct preempt_notifier *notifier) 1697{ 1698 hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); 1699} 1700EXPORT_SYMBOL_GPL(preempt_notifier_register); 1701 1702/** 1703 * preempt_notifier_unregister - no longer interested in preemption notifications 1704 * @notifier: notifier struct to unregister 1705 * 1706 * This is safe to call from within a preemption notifier. 1707 */ 1708void preempt_notifier_unregister(struct preempt_notifier *notifier) 1709{ 1710 hlist_del(¬ifier->link); 1711} 1712EXPORT_SYMBOL_GPL(preempt_notifier_unregister); 1713 1714static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 1715{ 1716 struct preempt_notifier *notifier; 1717 struct hlist_node *node; 1718 1719 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) 1720 notifier->ops->sched_in(notifier, raw_smp_processor_id()); 1721} 1722 1723static void 1724fire_sched_out_preempt_notifiers(struct task_struct *curr, 1725 struct task_struct *next) 1726{ 1727 struct preempt_notifier *notifier; 1728 struct hlist_node *node; 1729 1730 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) 1731 notifier->ops->sched_out(notifier, next); 1732} 1733 1734#else /* !CONFIG_PREEMPT_NOTIFIERS */ 1735 1736static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 1737{ 1738} 1739 1740static void 1741fire_sched_out_preempt_notifiers(struct task_struct *curr, 1742 struct task_struct *next) 1743{ 1744} 1745 1746#endif /* CONFIG_PREEMPT_NOTIFIERS */ 1747 1748/** 1749 * prepare_task_switch - prepare to switch tasks 1750 * @rq: the runqueue preparing to switch 1751 * @prev: the current task that is being switched out 1752 * @next: the task we are going to switch to. 1753 * 1754 * This is called with the rq lock held and interrupts off. It must 1755 * be paired with a subsequent finish_task_switch after the context 1756 * switch. 1757 * 1758 * prepare_task_switch sets up locking and calls architecture specific 1759 * hooks. 
1760 */ 1761static inline void 1762prepare_task_switch(struct rq *rq, struct task_struct *prev, 1763 struct task_struct *next) 1764{ 1765 trace_sched_switch(prev, next); 1766 sched_info_switch(prev, next); 1767 perf_event_task_sched_out(prev, next); 1768 fire_sched_out_preempt_notifiers(prev, next); 1769 prepare_lock_switch(rq, next); 1770 prepare_arch_switch(next); 1771} 1772 1773/** 1774 * finish_task_switch - clean up after a task-switch 1775 * @rq: runqueue associated with task-switch 1776 * @prev: the thread we just switched away from. 1777 * 1778 * finish_task_switch must be called after the context switch, paired 1779 * with a prepare_task_switch call before the context switch. 1780 * finish_task_switch will reconcile locking set up by prepare_task_switch, 1781 * and do any other architecture-specific cleanup actions. 1782 * 1783 * Note that we may have delayed dropping an mm in context_switch(). If 1784 * so, we finish that here outside of the runqueue lock. (Doing it 1785 * with the lock held can cause deadlocks; see schedule() for 1786 * details.) 1787 */ 1788static void finish_task_switch(struct rq *rq, struct task_struct *prev) 1789 __releases(rq->lock) 1790{ 1791 struct mm_struct *mm = rq->prev_mm; 1792 long prev_state; 1793 1794 rq->prev_mm = NULL; 1795 1796 /* 1797 * A task struct has one reference for the use as "current". 1798 * If a task dies, then it sets TASK_DEAD in tsk->state and calls 1799 * schedule one last time. The schedule call will never return, and 1800 * the scheduled task must drop that reference. 1801 * The test for TASK_DEAD must occur while the runqueue locks are 1802 * still held, otherwise prev could be scheduled on another cpu, die 1803 * there before we look at prev->state, and then the reference would 1804 * be dropped twice. 1805 * Manfred Spraul <manfred@colorfullife.com> 1806 */ 1807 prev_state = prev->state; 1808 vtime_task_switch(prev); 1809 finish_arch_switch(prev); 1810 perf_event_task_sched_in(prev, current); 1811 finish_lock_switch(rq, prev); 1812 finish_arch_post_lock_switch(); 1813 1814 fire_sched_in_preempt_notifiers(current); 1815 if (mm) 1816 mmdrop(mm); 1817 if (unlikely(prev_state == TASK_DEAD)) { 1818 /* 1819 * Remove function-return probe instances associated with this 1820 * task and put them back on the free list. 1821 */ 1822 kprobe_flush_task(prev); 1823 put_task_struct(prev); 1824 } 1825} 1826 1827#ifdef CONFIG_SMP 1828 1829/* assumes rq->lock is held */ 1830static inline void pre_schedule(struct rq *rq, struct task_struct *prev) 1831{ 1832 if (prev->sched_class->pre_schedule) 1833 prev->sched_class->pre_schedule(rq, prev); 1834} 1835 1836/* rq->lock is NOT held, but preemption is disabled */ 1837static inline void post_schedule(struct rq *rq) 1838{ 1839 if (rq->post_schedule) { 1840 unsigned long flags; 1841 1842 raw_spin_lock_irqsave(&rq->lock, flags); 1843 if (rq->curr->sched_class->post_schedule) 1844 rq->curr->sched_class->post_schedule(rq); 1845 raw_spin_unlock_irqrestore(&rq->lock, flags); 1846 1847 rq->post_schedule = 0; 1848 } 1849} 1850 1851#else 1852 1853static inline void pre_schedule(struct rq *rq, struct task_struct *p) 1854{ 1855} 1856 1857static inline void post_schedule(struct rq *rq) 1858{ 1859} 1860 1861#endif 1862 1863/** 1864 * schedule_tail - first thing a freshly forked thread must call. 1865 * @prev: the thread we just switched away from. 
1866 */ 1867asmlinkage void schedule_tail(struct task_struct *prev) 1868 __releases(rq->lock) 1869{ 1870 struct rq *rq = this_rq(); 1871 1872 finish_task_switch(rq, prev); 1873 1874 /* 1875 * FIXME: do we need to worry about rq being invalidated by the 1876 * task_switch? 1877 */ 1878 post_schedule(rq); 1879 1880#ifdef __ARCH_WANT_UNLOCKED_CTXSW 1881 /* In this case, finish_task_switch does not reenable preemption */ 1882 preempt_enable(); 1883#endif 1884 if (current->set_child_tid) 1885 put_user(task_pid_vnr(current), current->set_child_tid); 1886} 1887 1888/* 1889 * context_switch - switch to the new MM and the new 1890 * thread's register state. 1891 */ 1892static inline void 1893context_switch(struct rq *rq, struct task_struct *prev, 1894 struct task_struct *next) 1895{ 1896 struct mm_struct *mm, *oldmm; 1897 1898 prepare_task_switch(rq, prev, next); 1899 1900 mm = next->mm; 1901 oldmm = prev->active_mm; 1902 /* 1903 * For paravirt, this is coupled with an exit in switch_to to 1904 * combine the page table reload and the switch backend into 1905 * one hypercall. 1906 */ 1907 arch_start_context_switch(prev); 1908 1909 if (!mm) { 1910 next->active_mm = oldmm; 1911 atomic_inc(&oldmm->mm_count); 1912 enter_lazy_tlb(oldmm, next); 1913 } else 1914 switch_mm(oldmm, mm, next); 1915 1916 if (!prev->mm) { 1917 prev->active_mm = NULL; 1918 rq->prev_mm = oldmm; 1919 } 1920 /* 1921 * Since the runqueue lock will be released by the next 1922 * task (which is an invalid locking op but in the case 1923 * of the scheduler it's an obvious special-case), so we 1924 * do an early lockdep release here: 1925 */ 1926#ifndef __ARCH_WANT_UNLOCKED_CTXSW 1927 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 1928#endif 1929 1930 /* Here we just switch the register state and the stack. */ 1931 rcu_switch(prev, next); 1932 switch_to(prev, next, prev); 1933 1934 barrier(); 1935 /* 1936 * this_rq must be evaluated again because prev may have moved 1937 * CPUs since it called schedule(), thus the 'rq' on its stack 1938 * frame will be invalid. 1939 */ 1940 finish_task_switch(this_rq(), prev); 1941} 1942 1943/* 1944 * nr_running, nr_uninterruptible and nr_context_switches: 1945 * 1946 * externally visible scheduler statistics: current number of runnable 1947 * threads, current number of uninterruptible-sleeping threads, total 1948 * number of context switches performed since bootup. 1949 */ 1950unsigned long nr_running(void) 1951{ 1952 unsigned long i, sum = 0; 1953 1954 for_each_online_cpu(i) 1955 sum += cpu_rq(i)->nr_running; 1956 1957 return sum; 1958} 1959 1960unsigned long nr_uninterruptible(void) 1961{ 1962 unsigned long i, sum = 0; 1963 1964 for_each_possible_cpu(i) 1965 sum += cpu_rq(i)->nr_uninterruptible; 1966 1967 /* 1968 * Since we read the counters lockless, it might be slightly 1969 * inaccurate. 
Do not allow it to go below zero though: 1970 */ 1971 if (unlikely((long)sum < 0)) 1972 sum = 0; 1973 1974 return sum; 1975} 1976 1977unsigned long long nr_context_switches(void) 1978{ 1979 int i; 1980 unsigned long long sum = 0; 1981 1982 for_each_possible_cpu(i) 1983 sum += cpu_rq(i)->nr_switches; 1984 1985 return sum; 1986} 1987 1988unsigned long nr_iowait(void) 1989{ 1990 unsigned long i, sum = 0; 1991 1992 for_each_possible_cpu(i) 1993 sum += atomic_read(&cpu_rq(i)->nr_iowait); 1994 1995 return sum; 1996} 1997 1998unsigned long nr_iowait_cpu(int cpu) 1999{ 2000 struct rq *this = cpu_rq(cpu); 2001 return atomic_read(&this->nr_iowait); 2002} 2003 2004unsigned long this_cpu_load(void) 2005{ 2006 struct rq *this = this_rq(); 2007 return this->cpu_load[0]; 2008} 2009 2010 2011/* 2012 * Global load-average calculations 2013 * 2014 * We take a distributed and async approach to calculating the global load-avg 2015 * in order to minimize overhead. 2016 * 2017 * The global load average is an exponentially decaying average of nr_running + 2018 * nr_uninterruptible. 2019 * 2020 * Once every LOAD_FREQ: 2021 * 2022 * nr_active = 0; 2023 * for_each_possible_cpu(cpu) 2024 * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible; 2025 * 2026 * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n) 2027 * 2028 * Due to a number of reasons the above turns in the mess below: 2029 * 2030 * - for_each_possible_cpu() is prohibitively expensive on machines with 2031 * serious number of cpus, therefore we need to take a distributed approach 2032 * to calculating nr_active. 2033 * 2034 * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 2035 * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } 2036 * 2037 * So assuming nr_active := 0 when we start out -- true per definition, we 2038 * can simply take per-cpu deltas and fold those into a global accumulate 2039 * to obtain the same result. See calc_load_fold_active(). 2040 * 2041 * Furthermore, in order to avoid synchronizing all per-cpu delta folding 2042 * across the machine, we assume 10 ticks is sufficient time for every 2043 * cpu to have completed this task. 2044 * 2045 * This places an upper-bound on the IRQ-off latency of the machine. Then 2046 * again, being late doesn't loose the delta, just wrecks the sample. 2047 * 2048 * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because 2049 * this would add another cross-cpu cacheline miss and atomic operation 2050 * to the wakeup path. Instead we increment on whatever cpu the task ran 2051 * when it went into uninterruptible state and decrement on whatever cpu 2052 * did the wakeup. This means that only the sum of nr_uninterruptible over 2053 * all cpus yields the correct result. 2054 * 2055 * This covers the NO_HZ=n code, for extra head-aches, see the comment below. 2056 */ 2057 2058/* Variables and functions for calc_load */ 2059static atomic_long_t calc_load_tasks; 2060static unsigned long calc_load_update; 2061unsigned long avenrun[3]; 2062EXPORT_SYMBOL(avenrun); /* should be removed */ 2063 2064/** 2065 * get_avenrun - get the load average array 2066 * @loads: pointer to dest load array 2067 * @offset: offset to add 2068 * @shift: shift count to shift the result left 2069 * 2070 * These values are estimates at best, so no need for locking. 
2071 */ 2072void get_avenrun(unsigned long *loads, unsigned long offset, int shift) 2073{ 2074 loads[0] = (avenrun[0] + offset) << shift; 2075 loads[1] = (avenrun[1] + offset) << shift; 2076 loads[2] = (avenrun[2] + offset) << shift; 2077} 2078 2079static long calc_load_fold_active(struct rq *this_rq) 2080{ 2081 long nr_active, delta = 0; 2082 2083 nr_active = this_rq->nr_running; 2084 nr_active += (long) this_rq->nr_uninterruptible; 2085 2086 if (nr_active != this_rq->calc_load_active) { 2087 delta = nr_active - this_rq->calc_load_active; 2088 this_rq->calc_load_active = nr_active; 2089 } 2090 2091 return delta; 2092} 2093 2094/* 2095 * a1 = a0 * e + a * (1 - e) 2096 */ 2097static unsigned long 2098calc_load(unsigned long load, unsigned long exp, unsigned long active) 2099{ 2100 load *= exp; 2101 load += active * (FIXED_1 - exp); 2102 load += 1UL << (FSHIFT - 1); 2103 return load >> FSHIFT; 2104} 2105 2106#ifdef CONFIG_NO_HZ 2107/* 2108 * Handle NO_HZ for the global load-average. 2109 * 2110 * Since the above described distributed algorithm to compute the global 2111 * load-average relies on per-cpu sampling from the tick, it is affected by 2112 * NO_HZ. 2113 * 2114 * The basic idea is to fold the nr_active delta into a global idle-delta upon 2115 * entering NO_HZ state such that we can include this as an 'extra' cpu delta 2116 * when we read the global state. 2117 * 2118 * Obviously reality has to ruin such a delightfully simple scheme: 2119 * 2120 * - When we go NO_HZ idle during the window, we can negate our sample 2121 * contribution, causing under-accounting. 2122 * 2123 * We avoid this by keeping two idle-delta counters and flipping them 2124 * when the window starts, thus separating old and new NO_HZ load. 2125 * 2126 * The only trick is the slight shift in index flip for read vs write. 2127 * 2128 * 0s 5s 10s 15s 2129 * +10 +10 +10 +10 2130 * |-|-----------|-|-----------|-|-----------|-| 2131 * r:0 0 1 1 0 0 1 1 0 2132 * w:0 1 1 0 0 1 1 0 0 2133 * 2134 * This ensures we'll fold the old idle contribution in this window while 2135 * accumulating the new one. 2136 * 2137 * - When we wake up from NO_HZ idle during the window, we push up our 2138 * contribution, since we effectively move our sample point to a known 2139 * busy state. 2140 * 2141 * This is solved by pushing the window forward, and thus skipping the 2142 * sample, for this cpu (effectively using the idle-delta for this cpu which 2143 * was in effect at the time the window opened). This also solves the issue 2144 * of having to deal with a cpu having been in NOHZ idle for multiple 2145 * LOAD_FREQ intervals. 2146 * 2147 * When making the ILB scale, we should try to pull this in as well. 2148 */ 2149static atomic_long_t calc_load_idle[2]; 2150static int calc_load_idx; 2151 2152static inline int calc_load_write_idx(void) 2153{ 2154 int idx = calc_load_idx; 2155 2156 /* 2157 * See calc_global_nohz(), if we observe the new index, we also 2158 * need to observe the new update time. 2159 */ 2160 smp_rmb(); 2161 2162 /* 2163 * If the folding window started, make sure we start writing in the 2164 * next idle-delta. 2165 */ 2166 if (!time_before(jiffies, calc_load_update)) 2167 idx++; 2168 2169 return idx & 1; 2170} 2171 2172static inline int calc_load_read_idx(void) 2173{ 2174 return calc_load_idx & 1; 2175} 2176 2177void calc_load_enter_idle(void) 2178{ 2179 struct rq *this_rq = this_rq(); 2180 long delta; 2181 2182 /* 2183 * We're going into NOHZ mode; if there's any pending delta, fold it 2184 * into the pending idle delta.
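 *
 * For instance (illustrative numbers only, added by the editor): a cpu whose
 * calc_load_active is 2 goes idle with nr_running == nr_uninterruptible == 0.
 * calc_load_fold_active() then returns -2 and zeroes calc_load_active, and
 * the -2 is parked in calc_load_idle[] until calc_global_load() folds it
 * into calc_load_tasks, so the sleeping cpu never needs to be woken just
 * for accounting.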
2185 */ 2186 delta = calc_load_fold_active(this_rq); 2187 if (delta) { 2188 int idx = calc_load_write_idx(); 2189 atomic_long_add(delta, &calc_load_idle[idx]); 2190 } 2191} 2192 2193void calc_load_exit_idle(void) 2194{ 2195 struct rq *this_rq = this_rq(); 2196 2197 /* 2198 * If we're still before the sample window, we're done. 2199 */ 2200 if (time_before(jiffies, this_rq->calc_load_update)) 2201 return; 2202 2203 /* 2204 * We woke inside or after the sample window; this means we're already 2205 * accounted through the nohz accounting, so skip the entire deal and 2206 * sync up for the next window. 2207 */ 2208 this_rq->calc_load_update = calc_load_update; 2209 if (time_before(jiffies, this_rq->calc_load_update + 10)) 2210 this_rq->calc_load_update += LOAD_FREQ; 2211} 2212 2213static long calc_load_fold_idle(void) 2214{ 2215 int idx = calc_load_read_idx(); 2216 long delta = 0; 2217 2218 if (atomic_long_read(&calc_load_idle[idx])) 2219 delta = atomic_long_xchg(&calc_load_idle[idx], 0); 2220 2221 return delta; 2222} 2223 2224/** 2225 * fixed_power_int - compute: x^n, in O(log n) time 2226 * 2227 * @x: base of the power 2228 * @frac_bits: fractional bits of @x 2229 * @n: power to raise @x to. 2230 * 2231 * By exploiting the relation between the definition of the natural power 2232 * function: x^n := x*x*...*x (x multiplied by itself n times), and 2233 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, 2234 * (where: n_i \elem {0, 1}, the binary vector representing n), 2235 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is 2236 * of course trivially computable in O(log_2 n), the length of our binary 2237 * vector. 2238 */ 2239static unsigned long 2240fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) 2241{ 2242 unsigned long result = 1UL << frac_bits; 2243 2244 if (n) for (;;) { 2245 if (n & 1) { 2246 result *= x; 2247 result += 1UL << (frac_bits - 1); 2248 result >>= frac_bits; 2249 } 2250 n >>= 1; 2251 if (!n) 2252 break; 2253 x *= x; 2254 x += 1UL << (frac_bits - 1); 2255 x >>= frac_bits; 2256 } 2257 2258 return result; 2259} 2260 2261/* 2262 * a1 = a0 * e + a * (1 - e) 2263 * 2264 * a2 = a1 * e + a * (1 - e) 2265 * = (a0 * e + a * (1 - e)) * e + a * (1 - e) 2266 * = a0 * e^2 + a * (1 - e) * (1 + e) 2267 * 2268 * a3 = a2 * e + a * (1 - e) 2269 * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) 2270 * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) 2271 * 2272 * ... 2273 * 2274 * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] 2275 * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) 2276 * = a0 * e^n + a * (1 - e^n) 2277 * 2278 * [1] application of the geometric series: 2279 * 2280 * n 1 - x^(n+1) 2281 * S_n := \Sum x^i = ------------- 2282 * i=0 1 - x 2283 */ 2284static unsigned long 2285calc_load_n(unsigned long load, unsigned long exp, 2286 unsigned long active, unsigned int n) 2287{ 2288 2289 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); 2290} 2291 2292/* 2293 * NO_HZ can leave us missing all per-cpu ticks calling 2294 * calc_load_account_active(), but since an idle CPU folds its delta into 2295 * calc_load_idle[] per calc_load_enter_idle(), all we need to do is fold 2296 * in the pending idle delta if our idle period crossed a load cycle boundary. 2297 * 2298 * Once we've updated the global active value, we need to apply the exponential 2299 * weights adjusted to the number of cycles missed.
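 *
 * Worked example (editor's illustration; assumes HZ = 1000, so LOAD_FREQ =
 * 5*HZ+1 = 5001): if we get here 13000 jiffies after calc_load_update, then
 * delta = 13000 - 10 = 12990 and n = 1 + 12990/5001 = 3, so each avenrun[]
 * entry is advanced by e^3 in a single calc_load_n() call rather than by
 * three separate calc_load() iterations.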
2300 */ 2301static void calc_global_nohz(void) 2302{ 2303 long delta, active, n; 2304 2305 if (!time_before(jiffies, calc_load_update + 10)) { 2306 /* 2307 * Catch up: fold in however many windows we are still behind. 2308 */ 2309 delta = jiffies - calc_load_update - 10; 2310 n = 1 + (delta / LOAD_FREQ); 2311 2312 active = atomic_long_read(&calc_load_tasks); 2313 active = active > 0 ? active * FIXED_1 : 0; 2314 2315 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); 2316 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); 2317 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); 2318 2319 calc_load_update += n * LOAD_FREQ; 2320 } 2321 2322 /* 2323 * Flip the idle index... 2324 * 2325 * Make sure we first write the new time then flip the index, so that 2326 * calc_load_write_idx() will see the new time when it reads the new 2327 * index; this avoids a double flip messing things up. 2328 */ 2329 smp_wmb(); 2330 calc_load_idx++; 2331} 2332#else /* !CONFIG_NO_HZ */ 2333 2334static inline long calc_load_fold_idle(void) { return 0; } 2335static inline void calc_global_nohz(void) { } 2336 2337#endif /* CONFIG_NO_HZ */ 2338 2339/* 2340 * calc_global_load - update the avenrun load estimates 10 ticks after the 2341 * CPUs have updated calc_load_tasks. 2342 */ 2343void calc_global_load(unsigned long ticks) 2344{ 2345 long active, delta; 2346 2347 if (time_before(jiffies, calc_load_update + 10)) 2348 return; 2349 2350 /* 2351 * Fold the 'old' idle-delta to include all NO_HZ cpus. 2352 */ 2353 delta = calc_load_fold_idle(); 2354 if (delta) 2355 atomic_long_add(delta, &calc_load_tasks); 2356 2357 active = atomic_long_read(&calc_load_tasks); 2358 active = active > 0 ? active * FIXED_1 : 0; 2359 2360 avenrun[0] = calc_load(avenrun[0], EXP_1, active); 2361 avenrun[1] = calc_load(avenrun[1], EXP_5, active); 2362 avenrun[2] = calc_load(avenrun[2], EXP_15, active); 2363 2364 calc_load_update += LOAD_FREQ; 2365 2366 /* 2367 * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. 2368 */ 2369 calc_global_nohz(); 2370} 2371 2372/* 2373 * Called from update_cpu_load_active() to periodically update this CPU's 2374 * active count. 2375 */ 2376static void calc_load_account_active(struct rq *this_rq) 2377{ 2378 long delta; 2379 2380 if (time_before(jiffies, this_rq->calc_load_update)) 2381 return; 2382 2383 delta = calc_load_fold_active(this_rq); 2384 if (delta) 2385 atomic_long_add(delta, &calc_load_tasks); 2386 2387 this_rq->calc_load_update += LOAD_FREQ; 2388} 2389 2390/* 2391 * End of global load-average stuff 2392 */ 2393 2394/* 2395 * The exact cpuload at various idx values, calculated at every tick would be 2396 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load 2397 * 2398 * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called 2399 * on nth tick when cpu may be busy, then we have: 2400 * load = ((2^idx - 1) / 2^idx)^(n-1) * load 2401 * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load 2402 * 2403 * decay_load_missed() below does efficient calculation of 2404 * load = ((2^idx - 1) / 2^idx)^(n-1) * load 2405 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load 2406 * 2407 * The calculation is approximated on a 128 point scale. 2408 * degrade_zero_ticks is the number of ticks after which load at any 2409 * particular idx is approximated to be zero. 2410 * degrade_factor is a precomputed table, a row for each load idx. 2411 * Each column corresponds to degradation factor for a power of two ticks, 2412 * based on 128 point scale.
2413 * Example: 2414 * row 2, col 3 (=12) says that the degradation at load idx 2 after 2415 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). 2416 * 2417 * With these power-of-2 load factors, we can degrade the load n times 2418 * by looking at the 1 bits in n and doing as many mult/shifts, instead of 2419 * the n mult/shifts needed by the exact degradation. 2420 */ 2421#define DEGRADE_SHIFT 7 2422static const unsigned char 2423 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; 2424static const unsigned char 2425 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { 2426 {0, 0, 0, 0, 0, 0, 0, 0}, 2427 {64, 32, 8, 0, 0, 0, 0, 0}, 2428 {96, 72, 40, 12, 1, 0, 0}, 2429 {112, 98, 75, 43, 15, 1, 0}, 2430 {120, 112, 98, 76, 45, 16, 2} }; 2431 2432/* 2433 * Update cpu_load for any missed ticks due to tickless idle. The backlog 2434 * accumulates while the CPU is idle, so we just decay the old load without 2435 * adding any new load. 2436 */ 2437static unsigned long 2438decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) 2439{ 2440 int j = 0; 2441 2442 if (!missed_updates) 2443 return load; 2444 2445 if (missed_updates >= degrade_zero_ticks[idx]) 2446 return 0; 2447 2448 if (idx == 1) 2449 return load >> missed_updates; 2450 2451 while (missed_updates) { 2452 if (missed_updates % 2) 2453 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; 2454 2455 missed_updates >>= 1; 2456 j++; 2457 } 2458 return load; 2459} 2460 2461/* 2462 * Update rq->cpu_load[] statistics. This function is usually called every 2463 * scheduler tick (TICK_NSEC). With tickless idle this will not be called 2464 * every tick. We fix it up based on jiffies. 2465 */ 2466static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, 2467 unsigned long pending_updates) 2468{ 2469 int i, scale; 2470 2471 this_rq->nr_load_updates++; 2472 2473 /* Update our load: */ 2474 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ 2475 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { 2476 unsigned long old_load, new_load; 2477 2478 /* scale is effectively 1 << i now, and >> i divides by scale */ 2479 2480 old_load = this_rq->cpu_load[i]; 2481 old_load = decay_load_missed(old_load, pending_updates - 1, i); 2482 new_load = this_load; 2483 /* 2484 * Round up the averaging division if load is increasing. This 2485 * prevents us from getting stuck on 9 if the load is 10, for 2486 * example. 2487 */ 2488 if (new_load > old_load) 2489 new_load += scale - 1; 2490 2491 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; 2492 } 2493 2494 sched_avg_update(this_rq); 2495} 2496 2497#ifdef CONFIG_NO_HZ 2498/* 2499 * There is no sane way to deal with nohz on smp when using jiffies because the 2500 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading 2501 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. 2502 * 2503 * Therefore we cannot use the delta approach from the regular tick since that 2504 * would seriously skew the load calculation. However we'll make do for those 2505 * updates happening while idle (nohz_idle_balance) or coming out of idle 2506 * (tick_nohz_idle_exit). 2507 * 2508 * This means we might still be one tick off for nohz periods. 2509 */ 2510 2511/* 2512 * Called from nohz_idle_balance() to update the load ratings before doing the 2513 * idle balance.
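 *
 * For example (editor's illustration): if this cpu was tickless for 9
 * jiffies, pending_updates = 9 and __update_cpu_load() first decays each
 * cpu_load[i] over the 8 missed updates; per the degrade_factor table
 * above, at idx 2 that is a single multiply by 12/128 (~ (3/4)^8),
 * because 8 has exactly one bit set.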
2514 */ 2515void update_idle_cpu_load(struct rq *this_rq) 2516{ 2517 unsigned long curr_jiffies = ACCESS_ONCE(jiffies); 2518 unsigned long load = this_rq->load.weight; 2519 unsigned long pending_updates; 2520 2521 /* 2522 * bail if there's load or we're actually up-to-date. 2523 */ 2524 if (load || curr_jiffies == this_rq->last_load_update_tick) 2525 return; 2526 2527 pending_updates = curr_jiffies - this_rq->last_load_update_tick; 2528 this_rq->last_load_update_tick = curr_jiffies; 2529 2530 __update_cpu_load(this_rq, load, pending_updates); 2531} 2532 2533/* 2534 * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. 2535 */ 2536void update_cpu_load_nohz(void) 2537{ 2538 struct rq *this_rq = this_rq(); 2539 unsigned long curr_jiffies = ACCESS_ONCE(jiffies); 2540 unsigned long pending_updates; 2541 2542 if (curr_jiffies == this_rq->last_load_update_tick) 2543 return; 2544 2545 raw_spin_lock(&this_rq->lock); 2546 pending_updates = curr_jiffies - this_rq->last_load_update_tick; 2547 if (pending_updates) { 2548 this_rq->last_load_update_tick = curr_jiffies; 2549 /* 2550 * We were idle, this means load 0, the current load might be 2551 * !0 due to remote wakeups and the sort. 2552 */ 2553 __update_cpu_load(this_rq, 0, pending_updates); 2554 } 2555 raw_spin_unlock(&this_rq->lock); 2556} 2557#endif /* CONFIG_NO_HZ */ 2558 2559/* 2560 * Called from scheduler_tick() 2561 */ 2562static void update_cpu_load_active(struct rq *this_rq) 2563{ 2564 /* 2565 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). 2566 */ 2567 this_rq->last_load_update_tick = jiffies; 2568 __update_cpu_load(this_rq, this_rq->load.weight, 1); 2569 2570 calc_load_account_active(this_rq); 2571} 2572 2573#ifdef CONFIG_SMP 2574 2575/* 2576 * sched_exec - execve() is a valuable balancing opportunity, because at 2577 * this point the task has the smallest effective memory and cache footprint. 2578 */ 2579void sched_exec(void) 2580{ 2581 struct task_struct *p = current; 2582 unsigned long flags; 2583 int dest_cpu; 2584 2585 raw_spin_lock_irqsave(&p->pi_lock, flags); 2586 dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); 2587 if (dest_cpu == smp_processor_id()) 2588 goto unlock; 2589 2590 if (likely(cpu_active(dest_cpu))) { 2591 struct migration_arg arg = { p, dest_cpu }; 2592 2593 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2594 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); 2595 return; 2596 } 2597unlock: 2598 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2599} 2600 2601#endif 2602 2603DEFINE_PER_CPU(struct kernel_stat, kstat); 2604DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); 2605 2606EXPORT_PER_CPU_SYMBOL(kstat); 2607EXPORT_PER_CPU_SYMBOL(kernel_cpustat); 2608 2609/* 2610 * Return any ns on the sched_clock that have not yet been accounted in 2611 * @p in case that task is currently running. 2612 * 2613 * Called with task_rq_lock() held on @rq. 2614 */ 2615static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) 2616{ 2617 u64 ns = 0; 2618 2619 if (task_current(rq, p)) { 2620 update_rq_clock(rq); 2621 ns = rq->clock_task - p->se.exec_start; 2622 if ((s64)ns < 0) 2623 ns = 0; 2624 } 2625 2626 return ns; 2627} 2628 2629unsigned long long task_delta_exec(struct task_struct *p) 2630{ 2631 unsigned long flags; 2632 struct rq *rq; 2633 u64 ns = 0; 2634 2635 rq = task_rq_lock(p, &flags); 2636 ns = do_task_delta_exec(p, rq); 2637 task_rq_unlock(rq, p, &flags); 2638 2639 return ns; 2640} 2641 2642/* 2643 * Return accounted runtime for the task. 
In case the task is currently running, return the runtime plus current's 2645 * pending runtime that has not been accounted yet. 2646 */ 2647unsigned long long task_sched_runtime(struct task_struct *p) 2648{ 2649 unsigned long flags; 2650 struct rq *rq; 2651 u64 ns = 0; 2652 2653 rq = task_rq_lock(p, &flags); 2654 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); 2655 task_rq_unlock(rq, p, &flags); 2656 2657 return ns; 2658} 2659 2660/* 2661 * This function gets called by the timer code, with HZ frequency. 2662 * We call it with interrupts disabled. 2663 */ 2664void scheduler_tick(void) 2665{ 2666 int cpu = smp_processor_id(); 2667 struct rq *rq = cpu_rq(cpu); 2668 struct task_struct *curr = rq->curr; 2669 2670 sched_clock_tick(); 2671 2672 raw_spin_lock(&rq->lock); 2673 update_rq_clock(rq); 2674 update_cpu_load_active(rq); 2675 curr->sched_class->task_tick(rq, curr, 0); 2676 raw_spin_unlock(&rq->lock); 2677 2678 perf_event_task_tick(); 2679 2680#ifdef CONFIG_SMP 2681 rq->idle_balance = idle_cpu(cpu); 2682 trigger_load_balance(rq, cpu); 2683#endif 2684} 2685 2686notrace unsigned long get_parent_ip(unsigned long addr) 2687{ 2688 if (in_lock_functions(addr)) { 2689 addr = CALLER_ADDR2; 2690 if (in_lock_functions(addr)) 2691 addr = CALLER_ADDR3; 2692 } 2693 return addr; 2694} 2695 2696#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ 2697 defined(CONFIG_PREEMPT_TRACER)) 2698 2699void __kprobes add_preempt_count(int val) 2700{ 2701#ifdef CONFIG_DEBUG_PREEMPT 2702 /* 2703 * Underflow? 2704 */ 2705 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) 2706 return; 2707#endif 2708 preempt_count() += val; 2709#ifdef CONFIG_DEBUG_PREEMPT 2710 /* 2711 * Spinlock count overflowing soon? 2712 */ 2713 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= 2714 PREEMPT_MASK - 10); 2715#endif 2716 if (preempt_count() == val) 2717 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 2718} 2719EXPORT_SYMBOL(add_preempt_count); 2720 2721void __kprobes sub_preempt_count(int val) 2722{ 2723#ifdef CONFIG_DEBUG_PREEMPT 2724 /* 2725 * Underflow? 2726 */ 2727 if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) 2728 return; 2729 /* 2730 * Is the spinlock portion underflowing? 2731 */ 2732 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && 2733 !(preempt_count() & PREEMPT_MASK))) 2734 return; 2735#endif 2736 2737 if (preempt_count() == val) 2738 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 2739 preempt_count() -= val; 2740} 2741EXPORT_SYMBOL(sub_preempt_count); 2742 2743#endif 2744 2745/* 2746 * Print scheduling while atomic bug: 2747 */ 2748static noinline void __schedule_bug(struct task_struct *prev) 2749{ 2750 if (oops_in_progress) 2751 return; 2752 2753 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", 2754 prev->comm, prev->pid, preempt_count()); 2755 2756 debug_show_held_locks(prev); 2757 print_modules(); 2758 if (irqs_disabled()) 2759 print_irqtrace_events(prev); 2760 dump_stack(); 2761 add_taint(TAINT_WARN); 2762} 2763 2764/* 2765 * Various schedule()-time debugging checks and statistics: 2766 */ 2767static inline void schedule_debug(struct task_struct *prev) 2768{ 2769 /* 2770 * Test if we are atomic. Since do_exit() needs to call into 2771 * schedule() atomically, we ignore that path for now. 2772 * Otherwise, whine if we are scheduling when we should not be.
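 *
 * (Editor's note, for illustration: "atomic" here means preempt_count()
 * holds anything beyond the single preempt_disable() that __schedule()
 * itself just took; in_atomic_preempt_off() compares against exactly
 * that one level, modulo PREEMPT_ACTIVE.)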
2773 */ 2774 if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) 2775 __schedule_bug(prev); 2776 rcu_sleep_check(); 2777 2778 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 2779 2780 schedstat_inc(this_rq(), sched_count); 2781} 2782 2783static void put_prev_task(struct rq *rq, struct task_struct *prev) 2784{ 2785 if (prev->on_rq || rq->skip_clock_update < 0) 2786 update_rq_clock(rq); 2787 prev->sched_class->put_prev_task(rq, prev); 2788} 2789 2790/* 2791 * Pick up the highest-prio task: 2792 */ 2793static inline struct task_struct * 2794pick_next_task(struct rq *rq) 2795{ 2796 const struct sched_class *class; 2797 struct task_struct *p; 2798 2799 /* 2800 * Optimization: we know that if all tasks are in 2801 * the fair class we can call that function directly: 2802 */ 2803 if (likely(rq->nr_running == rq->cfs.h_nr_running)) { 2804 p = fair_sched_class.pick_next_task(rq); 2805 if (likely(p)) 2806 return p; 2807 } 2808 2809 for_each_class(class) { 2810 p = class->pick_next_task(rq); 2811 if (p) 2812 return p; 2813 } 2814 2815 BUG(); /* the idle class will always have a runnable task */ 2816} 2817 2818/* 2819 * __schedule() is the main scheduler function. 2820 * 2821 * The main means of driving the scheduler and thus entering this function are: 2822 * 2823 * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. 2824 * 2825 * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return 2826 * paths. For example, see arch/x86/entry_64.S. 2827 * 2828 * To drive preemption between tasks, the scheduler sets the flag in the 2829 * timer interrupt handler, scheduler_tick(). 2830 * 2831 * 3. Wakeups don't really cause entry into schedule(). They add a 2832 * task to the run-queue and that's it. 2833 * 2834 * Now, if the new task added to the run-queue preempts the current 2835 * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets 2836 * called on the nearest possible occasion: 2837 * 2838 * - If the kernel is preemptible (CONFIG_PREEMPT=y): 2839 * 2840 * - in syscall or exception context, at the next outermost 2841 * preempt_enable(). (this might be as soon as the wake_up()'s 2842 * spin_unlock()!) 2843 * 2844 * - in IRQ context, return from interrupt-handler to 2845 * preemptible context 2846 * 2847 * - If the kernel is not preemptible (CONFIG_PREEMPT is not set) 2848 * then at the next: 2849 * 2850 * - cond_resched() call 2851 * - explicit schedule() call 2852 * - return from syscall or exception to user-space 2853 * - return from interrupt-handler to user-space 2854 */ 2855static void __sched __schedule(void) 2856{ 2857 struct task_struct *prev, *next; 2858 unsigned long *switch_count; 2859 struct rq *rq; 2860 int cpu; 2861 2862need_resched: 2863 preempt_disable(); 2864 cpu = smp_processor_id(); 2865 rq = cpu_rq(cpu); 2866 rcu_note_context_switch(cpu); 2867 prev = rq->curr; 2868 2869 schedule_debug(prev); 2870 2871 if (sched_feat(HRTICK)) 2872 hrtick_clear(rq); 2873 2874 raw_spin_lock_irq(&rq->lock); 2875 2876 switch_count = &prev->nivcsw; 2877 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 2878 if (unlikely(signal_pending_state(prev->state, prev))) { 2879 prev->state = TASK_RUNNING; 2880 } else { 2881 deactivate_task(rq, prev, DEQUEUE_SLEEP); 2882 prev->on_rq = 0; 2883 2884 /* 2885 * If a worker went to sleep, notify and ask workqueue 2886 * whether it wants to wake up a task to maintain 2887 * concurrency.
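 *
 * (For example -- editor's illustration -- if the last runnable worker
 * of a workqueue pool blocks here, wq_worker_sleeping() can hand back an
 * idle worker for try_to_wake_up_local() so pending work items keep
 * making progress.)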
2888 */ 2889 if (prev->flags & PF_WQ_WORKER) { 2890 struct task_struct *to_wakeup; 2891 2892 to_wakeup = wq_worker_sleeping(prev, cpu); 2893 if (to_wakeup) 2894 try_to_wake_up_local(to_wakeup); 2895 } 2896 } 2897 switch_count = &prev->nvcsw; 2898 } 2899 2900 pre_schedule(rq, prev); 2901 2902 if (unlikely(!rq->nr_running)) 2903 idle_balance(cpu, rq); 2904 2905 put_prev_task(rq, prev); 2906 next = pick_next_task(rq); 2907 clear_tsk_need_resched(prev); 2908 rq->skip_clock_update = 0; 2909 2910 if (likely(prev != next)) { 2911 rq->nr_switches++; 2912 rq->curr = next; 2913 ++*switch_count; 2914 2915 context_switch(rq, prev, next); /* unlocks the rq */ 2916 /* 2917 * The context switch has flipped the stack from under us 2918 * and restored the local variables which were saved when 2919 * this task called schedule() in the past. prev == current 2920 * is still correct, but prev may have moved to another cpu/rq. 2921 */ 2922 cpu = smp_processor_id(); 2923 rq = cpu_rq(cpu); 2924 } else 2925 raw_spin_unlock_irq(&rq->lock); 2926 2927 post_schedule(rq); 2928 2929 sched_preempt_enable_no_resched(); 2930 if (need_resched()) 2931 goto need_resched; 2932} 2933 2934static inline void sched_submit_work(struct task_struct *tsk) 2935{ 2936 if (!tsk->state || tsk_is_pi_blocked(tsk)) 2937 return; 2938 /* 2939 * If we are going to sleep and we have plugged IO queued, 2940 * make sure to submit it to avoid deadlocks. 2941 */ 2942 if (blk_needs_flush_plug(tsk)) 2943 blk_schedule_flush_plug(tsk); 2944} 2945 2946asmlinkage void __sched schedule(void) 2947{ 2948 struct task_struct *tsk = current; 2949 2950 sched_submit_work(tsk); 2951 __schedule(); 2952} 2953EXPORT_SYMBOL(schedule); 2954 2955#ifdef CONFIG_RCU_USER_QS 2956asmlinkage void __sched schedule_user(void) 2957{ 2958 /* 2959 * If we come here after a random call to set_need_resched(), 2960 * or we have been woken up remotely but the IPI has not yet arrived, 2961 * we haven't yet exited the RCU idle mode. Do it here manually until 2962 * we find a better solution. 2963 */ 2964 rcu_user_exit(); 2965 schedule(); 2966 rcu_user_enter(); 2967} 2968#endif 2969 2970/** 2971 * schedule_preempt_disabled - called with preemption disabled 2972 * 2973 * Returns with preemption disabled. Note: preempt_count must be 1 2974 */ 2975void __sched schedule_preempt_disabled(void) 2976{ 2977 sched_preempt_enable_no_resched(); 2978 schedule(); 2979 preempt_disable(); 2980} 2981 2982#ifdef CONFIG_MUTEX_SPIN_ON_OWNER 2983 2984static inline bool owner_running(struct mutex *lock, struct task_struct *owner) 2985{ 2986 if (lock->owner != owner) 2987 return false; 2988 2989 /* 2990 * Ensure we emit the owner->on_cpu dereference _after_ checking 2991 * that lock->owner still matches owner. If that fails, owner might 2992 * point to free()d memory; if it still matches, the rcu_read_lock() 2993 * ensures the memory stays valid. 2994 */ 2995 barrier(); 2996 2997 return owner->on_cpu; 2998} 2999 3000/* 3001 * Look out! "owner" is an entirely speculative pointer 3002 * access and not reliable. 3003 */ 3004int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) 3005{ 3006 if (!sched_feat(OWNER_SPIN)) 3007 return 0; 3008 3009 rcu_read_lock(); 3010 while (owner_running(lock, owner)) { 3011 if (need_resched()) 3012 break; 3013 3014 arch_mutex_cpu_relax(); 3015 } 3016 rcu_read_unlock(); 3017 3018 /* 3019 * We break out of the loop above on need_resched() and when the 3020 * owner changed, which is a sign for heavy contention. Return 3021 * success only when lock->owner is NULL.
3022 */ 3023 return lock->owner == NULL; 3024} 3025#endif 3026 3027#ifdef CONFIG_PREEMPT 3028/* 3029 * this is the entry point to schedule() from in-kernel preemption 3030 * off of preempt_enable. Kernel preemption off of return-from-interrupt 3031 * is handled by preempt_schedule_irq() below, which calls schedule() directly. 3032 */ 3033asmlinkage void __sched notrace preempt_schedule(void) 3034{ 3035 struct thread_info *ti = current_thread_info(); 3036 3037 /* 3038 * If there is a non-zero preempt_count or interrupts are disabled, 3039 * we do not want to preempt the current task. Just return. 3040 */ 3041 if (likely(ti->preempt_count || irqs_disabled())) 3042 return; 3043 3044 do { 3045 add_preempt_count_notrace(PREEMPT_ACTIVE); 3046 __schedule(); 3047 sub_preempt_count_notrace(PREEMPT_ACTIVE); 3048 3049 /* 3050 * Check again in case we missed a preemption opportunity 3051 * between schedule and now. 3052 */ 3053 barrier(); 3054 } while (need_resched()); 3055} 3056EXPORT_SYMBOL(preempt_schedule); 3057 3058/* 3059 * this is the entry point to schedule() from kernel preemption 3060 * off of irq context. 3061 * Note that this is called and returns with irqs disabled. This 3062 * protects us against recursive calls from irq context. 3063 */ 3064asmlinkage void __sched preempt_schedule_irq(void) 3065{ 3066 struct thread_info *ti = current_thread_info(); 3067 3068 /* Catch callers which need to be fixed */ 3069 BUG_ON(ti->preempt_count || !irqs_disabled()); 3070 3071 rcu_user_exit(); 3072 do { 3073 add_preempt_count(PREEMPT_ACTIVE); 3074 local_irq_enable(); 3075 __schedule(); 3076 local_irq_disable(); 3077 sub_preempt_count(PREEMPT_ACTIVE); 3078 3079 /* 3080 * Check again in case we missed a preemption opportunity 3081 * between schedule and now. 3082 */ 3083 barrier(); 3084 } while (need_resched()); 3085} 3086 3087#endif /* CONFIG_PREEMPT */ 3088 3089int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, 3090 void *key) 3091{ 3092 return try_to_wake_up(curr->private, mode, wake_flags); 3093} 3094EXPORT_SYMBOL(default_wake_function); 3095 3096/* 3097 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just 3098 * wake everything up. If it's an exclusive wakeup (nr_exclusive is a small 3099 * positive number) then we wake all the non-exclusive tasks and one exclusive task. 3100 * 3101 * There are circumstances in which we can try to wake a task which has already 3102 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns 3103 * zero in this (rare) case, and we handle it by continuing to scan the queue. 3104 */ 3105static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, 3106 int nr_exclusive, int wake_flags, void *key) 3107{ 3108 wait_queue_t *curr, *next; 3109 3110 list_for_each_entry_safe(curr, next, &q->task_list, task_list) { 3111 unsigned flags = curr->flags; 3112 3113 if (curr->func(curr, mode, wake_flags, key) && 3114 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) 3115 break; 3116 } 3117} 3118 3119/** 3120 * __wake_up - wake up threads blocked on a waitqueue. 3121 * @q: the waitqueue 3122 * @mode: which threads 3123 * @nr_exclusive: how many wake-one or wake-many threads to wake up 3124 * @key: is directly passed to the wakeup function 3125 * 3126 * It may be assumed that this function implies a write memory barrier before 3127 * changing the task state if and only if any tasks are woken up.
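 *
 * Typical usage (editor's sketch; wake_up() and friends are the
 * <linux/wait.h> wrappers that pass TASK_NORMAL and nr_exclusive == 1):
 *
 *   DECLARE_WAIT_QUEUE_HEAD(my_wq);
 *
 *   sleeper:  wait_event(my_wq, condition != 0);
 *   waker:    condition = 1; wake_up(&my_wq);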
3128 */ 3129void __wake_up(wait_queue_head_t *q, unsigned int mode, 3130 int nr_exclusive, void *key) 3131{ 3132 unsigned long flags; 3133 3134 spin_lock_irqsave(&q->lock, flags); 3135 __wake_up_common(q, mode, nr_exclusive, 0, key); 3136 spin_unlock_irqrestore(&q->lock, flags); 3137} 3138EXPORT_SYMBOL(__wake_up); 3139 3140/* 3141 * Same as __wake_up but called with the spinlock in wait_queue_head_t held. 3142 */ 3143void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr) 3144{ 3145 __wake_up_common(q, mode, nr, 0, NULL); 3146} 3147EXPORT_SYMBOL_GPL(__wake_up_locked); 3148 3149void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) 3150{ 3151 __wake_up_common(q, mode, 1, 0, key); 3152} 3153EXPORT_SYMBOL_GPL(__wake_up_locked_key); 3154 3155/** 3156 * __wake_up_sync_key - wake up threads blocked on a waitqueue. 3157 * @q: the waitqueue 3158 * @mode: which threads 3159 * @nr_exclusive: how many wake-one or wake-many threads to wake up 3160 * @key: opaque value to be passed to wakeup targets 3161 * 3162 * The sync wakeup differs in that the waker knows that it will schedule 3163 * away soon, so while the target thread will be woken up, it will not 3164 * be migrated to another CPU - i.e. the two threads are 'synchronized' 3165 * with each other. This can prevent needless bouncing between CPUs. 3166 * 3167 * On UP it can prevent extra preemption. 3168 * 3169 * It may be assumed that this function implies a write memory barrier before 3170 * changing the task state if and only if any tasks are woken up. 3171 */ 3172void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, 3173 int nr_exclusive, void *key) 3174{ 3175 unsigned long flags; 3176 int wake_flags = WF_SYNC; 3177 3178 if (unlikely(!q)) 3179 return; 3180 3181 if (unlikely(!nr_exclusive)) 3182 wake_flags = 0; 3183 3184 spin_lock_irqsave(&q->lock, flags); 3185 __wake_up_common(q, mode, nr_exclusive, wake_flags, key); 3186 spin_unlock_irqrestore(&q->lock, flags); 3187} 3188EXPORT_SYMBOL_GPL(__wake_up_sync_key); 3189 3190/* 3191 * __wake_up_sync - see __wake_up_sync_key() 3192 */ 3193void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) 3194{ 3195 __wake_up_sync_key(q, mode, nr_exclusive, NULL); 3196} 3197EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ 3198 3199/** 3200 * complete: - signals a single thread waiting on this completion 3201 * @x: holds the state of this particular completion 3202 * 3203 * This will wake up a single thread waiting on this completion. Threads will be 3204 * awakened in the same order in which they were queued. 3205 * 3206 * See also complete_all(), wait_for_completion() and related routines. 3207 * 3208 * It may be assumed that this function implies a write memory barrier before 3209 * changing the task state if and only if any tasks are woken up. 3210 */ 3211void complete(struct completion *x) 3212{ 3213 unsigned long flags; 3214 3215 spin_lock_irqsave(&x->wait.lock, flags); 3216 x->done++; 3217 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); 3218 spin_unlock_irqrestore(&x->wait.lock, flags); 3219} 3220EXPORT_SYMBOL(complete); 3221 3222/** 3223 * complete_all: - signals all threads waiting on this completion 3224 * @x: holds the state of this particular completion 3225 * 3226 * This will wake up all threads waiting on this particular completion event. 3227 * 3228 * It may be assumed that this function implies a write memory barrier before 3229 * changing the task state if and only if any tasks are woken up.
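 *
 * Typical pattern (editor's sketch; see <linux/completion.h>):
 *
 *   DECLARE_COMPLETION_ONSTACK(done);
 *
 *   waiter:     wait_for_completion(&done);
 *   signaller:  complete(&done);      - or complete_all() to release
 *                                       every current and future waiter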
3230 */ 3231void complete_all(struct completion *x) 3232{ 3233 unsigned long flags; 3234 3235 spin_lock_irqsave(&x->wait.lock, flags); 3236 x->done += UINT_MAX/2; 3237 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); 3238 spin_unlock_irqrestore(&x->wait.lock, flags); 3239} 3240EXPORT_SYMBOL(complete_all); 3241 3242static inline long __sched 3243do_wait_for_common(struct completion *x, long timeout, int state) 3244{ 3245 if (!x->done) { 3246 DECLARE_WAITQUEUE(wait, current); 3247 3248 __add_wait_queue_tail_exclusive(&x->wait, &wait); 3249 do { 3250 if (signal_pending_state(state, current)) { 3251 timeout = -ERESTARTSYS; 3252 break; 3253 } 3254 __set_current_state(state); 3255 spin_unlock_irq(&x->wait.lock); 3256 timeout = schedule_timeout(timeout); 3257 spin_lock_irq(&x->wait.lock); 3258 } while (!x->done && timeout); 3259 __remove_wait_queue(&x->wait, &wait); 3260 if (!x->done) 3261 return timeout; 3262 } 3263 x->done--; 3264 return timeout ?: 1; 3265} 3266 3267static long __sched 3268wait_for_common(struct completion *x, long timeout, int state) 3269{ 3270 might_sleep(); 3271 3272 spin_lock_irq(&x->wait.lock); 3273 timeout = do_wait_for_common(x, timeout, state); 3274 spin_unlock_irq(&x->wait.lock); 3275 return timeout; 3276} 3277 3278/** 3279 * wait_for_completion: - waits for completion of a task 3280 * @x: holds the state of this particular completion 3281 * 3282 * This waits to be signaled for completion of a specific task. It is NOT 3283 * interruptible and there is no timeout. 3284 * 3285 * See also similar routines (i.e. wait_for_completion_timeout()) with timeout 3286 * and interrupt capability. Also see complete(). 3287 */ 3288void __sched wait_for_completion(struct completion *x) 3289{ 3290 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); 3291} 3292EXPORT_SYMBOL(wait_for_completion); 3293 3294/** 3295 * wait_for_completion_timeout: - waits for completion of a task (w/timeout) 3296 * @x: holds the state of this particular completion 3297 * @timeout: timeout value in jiffies 3298 * 3299 * This waits for either a completion of a specific task to be signaled or for a 3300 * specified timeout to expire. The timeout is in jiffies. It is not 3301 * interruptible. 3302 * 3303 * The return value is 0 if timed out, and positive (at least 1, or number of 3304 * jiffies left till timeout) if completed. 3305 */ 3306unsigned long __sched 3307wait_for_completion_timeout(struct completion *x, unsigned long timeout) 3308{ 3309 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); 3310} 3311EXPORT_SYMBOL(wait_for_completion_timeout); 3312 3313/** 3314 * wait_for_completion_interruptible: - waits for completion of a task (w/intr) 3315 * @x: holds the state of this particular completion 3316 * 3317 * This waits for completion of a specific task to be signaled. It is 3318 * interruptible. 3319 * 3320 * The return value is -ERESTARTSYS if interrupted, 0 if completed. 3321 */ 3322int __sched wait_for_completion_interruptible(struct completion *x) 3323{ 3324 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); 3325 if (t == -ERESTARTSYS) 3326 return t; 3327 return 0; 3328} 3329EXPORT_SYMBOL(wait_for_completion_interruptible); 3330 3331/** 3332 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) 3333 * @x: holds the state of this particular completion 3334 * @timeout: timeout value in jiffies 3335 * 3336 * This waits for either a completion of a specific task to be signaled or for a 3337 * specified timeout to expire. 
It is interruptible. The timeout is in jiffies. 3338 * 3339 * The return value is -ERESTARTSYS if interrupted, 0 if timed out, 3340 * positive (at least 1, or number of jiffies left till timeout) if completed. 3341 */ 3342long __sched 3343wait_for_completion_interruptible_timeout(struct completion *x, 3344 unsigned long timeout) 3345{ 3346 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); 3347} 3348EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); 3349 3350/** 3351 * wait_for_completion_killable: - waits for completion of a task (killable) 3352 * @x: holds the state of this particular completion 3353 * 3354 * This waits to be signaled for completion of a specific task. It can be 3355 * interrupted by a kill signal. 3356 * 3357 * The return value is -ERESTARTSYS if interrupted, 0 if completed. 3358 */ 3359int __sched wait_for_completion_killable(struct completion *x) 3360{ 3361 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); 3362 if (t == -ERESTARTSYS) 3363 return t; 3364 return 0; 3365} 3366EXPORT_SYMBOL(wait_for_completion_killable); 3367 3368/** 3369 * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable)) 3370 * @x: holds the state of this particular completion 3371 * @timeout: timeout value in jiffies 3372 * 3373 * This waits for either a completion of a specific task to be 3374 * signaled or for a specified timeout to expire. It can be 3375 * interrupted by a kill signal. The timeout is in jiffies. 3376 * 3377 * The return value is -ERESTARTSYS if interrupted, 0 if timed out, 3378 * positive (at least 1, or number of jiffies left till timeout) if completed. 3379 */ 3380long __sched 3381wait_for_completion_killable_timeout(struct completion *x, 3382 unsigned long timeout) 3383{ 3384 return wait_for_common(x, timeout, TASK_KILLABLE); 3385} 3386EXPORT_SYMBOL(wait_for_completion_killable_timeout); 3387 3388/** 3389 * try_wait_for_completion - try to decrement a completion without blocking 3390 * @x: completion structure 3391 * 3392 * Returns: 0 if a decrement cannot be done without blocking 3393 * 1 if a decrement succeeded. 3394 * 3395 * If a completion is being used as a counting completion, 3396 * attempt to decrement the counter without blocking. This 3397 * enables us to avoid waiting if the resource the completion 3398 * is protecting is not available. 3399 */ 3400bool try_wait_for_completion(struct completion *x) 3401{ 3402 unsigned long flags; 3403 int ret = 1; 3404 3405 spin_lock_irqsave(&x->wait.lock, flags); 3406 if (!x->done) 3407 ret = 0; 3408 else 3409 x->done--; 3410 spin_unlock_irqrestore(&x->wait.lock, flags); 3411 return ret; 3412} 3413EXPORT_SYMBOL(try_wait_for_completion); 3414 3415/** 3416 * completion_done - Test to see if a completion has any waiters 3417 * @x: completion structure 3418 * 3419 * Returns: 0 if there are waiters (wait_for_completion() in progress) 3420 * 1 if there are no waiters. 
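 *
 * (Editor's note: unlike try_wait_for_completion() above, this only peeks
 * at x->done and never consumes it, so a caller can poll for completion
 * without disturbing the count.)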
3421 * 3422 */ 3423bool completion_done(struct completion *x) 3424{ 3425 unsigned long flags; 3426 int ret = 1; 3427 3428 spin_lock_irqsave(&x->wait.lock, flags); 3429 if (!x->done) 3430 ret = 0; 3431 spin_unlock_irqrestore(&x->wait.lock, flags); 3432 return ret; 3433} 3434EXPORT_SYMBOL(completion_done); 3435 3436static long __sched 3437sleep_on_common(wait_queue_head_t *q, int state, long timeout) 3438{ 3439 unsigned long flags; 3440 wait_queue_t wait; 3441 3442 init_waitqueue_entry(&wait, current); 3443 3444 __set_current_state(state); 3445 3446 spin_lock_irqsave(&q->lock, flags); 3447 __add_wait_queue(q, &wait); 3448 spin_unlock(&q->lock); 3449 timeout = schedule_timeout(timeout); 3450 spin_lock_irq(&q->lock); 3451 __remove_wait_queue(q, &wait); 3452 spin_unlock_irqrestore(&q->lock, flags); 3453 3454 return timeout; 3455} 3456 3457void __sched interruptible_sleep_on(wait_queue_head_t *q) 3458{ 3459 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); 3460} 3461EXPORT_SYMBOL(interruptible_sleep_on); 3462 3463long __sched 3464interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) 3465{ 3466 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout); 3467} 3468EXPORT_SYMBOL(interruptible_sleep_on_timeout); 3469 3470void __sched sleep_on(wait_queue_head_t *q) 3471{ 3472 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); 3473} 3474EXPORT_SYMBOL(sleep_on); 3475 3476long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) 3477{ 3478 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout); 3479} 3480EXPORT_SYMBOL(sleep_on_timeout); 3481 3482#ifdef CONFIG_RT_MUTEXES 3483 3484/* 3485 * rt_mutex_setprio - set the current priority of a task 3486 * @p: task 3487 * @prio: prio value (kernel-internal form) 3488 * 3489 * This function changes the 'effective' priority of a task. It does 3490 * not touch ->normal_prio like __setscheduler(). 3491 * 3492 * Used by the rt_mutex code to implement priority inheritance logic. 3493 */ 3494void rt_mutex_setprio(struct task_struct *p, int prio) 3495{ 3496 int oldprio, on_rq, running; 3497 struct rq *rq; 3498 const struct sched_class *prev_class; 3499 3500 BUG_ON(prio < 0 || prio > MAX_PRIO); 3501 3502 rq = __task_rq_lock(p); 3503 3504 /* 3505 * Idle task boosting is a no-no in general. There is one 3506 * exception, when PREEMPT_RT and NOHZ are active: 3507 * 3508 * The idle task calls get_next_timer_interrupt() and holds 3509 * the timer wheel base->lock on the CPU and another CPU wants 3510 * to access the timer (probably to cancel it). We can safely 3511 * ignore the boosting request, as the idle CPU runs this code 3512 * with interrupts disabled and will complete the lock 3513 * protected section without being interrupted. So there is no 3514 * real need to boost. 3515 */ 3516 if (unlikely(p == rq->idle)) { 3517 WARN_ON(p != rq->curr); 3518 WARN_ON(p->pi_blocked_on); 3519 goto out_unlock; 3520 } 3521 3522 trace_sched_pi_setprio(p, prio); 3523 oldprio = p->prio; 3524 prev_class = p->sched_class; 3525 on_rq = p->on_rq; 3526 running = task_current(rq, p); 3527 if (on_rq) 3528 dequeue_task(rq, p, 0); 3529 if (running) 3530 p->sched_class->put_prev_task(rq, p); 3531 3532 if (rt_prio(prio)) 3533 p->sched_class = &rt_sched_class; 3534 else 3535 p->sched_class = &fair_sched_class; 3536 3537 p->prio = prio; 3538 3539 if (running) 3540 p->sched_class->set_curr_task(rq); 3541 if (on_rq) 3542 enqueue_task(rq, p, oldprio < prio ?
ENQUEUE_HEAD : 0); 3543 3544 check_class_changed(rq, p, prev_class, oldprio); 3545out_unlock: 3546 __task_rq_unlock(rq); 3547} 3548#endif 3549void set_user_nice(struct task_struct *p, long nice) 3550{ 3551 int old_prio, delta, on_rq; 3552 unsigned long flags; 3553 struct rq *rq; 3554 3555 if (TASK_NICE(p) == nice || nice < -20 || nice > 19) 3556 return; 3557 /* 3558 * We have to be careful, if called from sys_setpriority(), 3559 * the task might be in the middle of scheduling on another CPU. 3560 */ 3561 rq = task_rq_lock(p, &flags); 3562 /* 3563 * The RT priorities are set via sched_setscheduler(), but we still 3564 * allow the 'normal' nice value to be set - but as expected 3565 * it won't have any effect on scheduling until the task leaves 3566 * SCHED_FIFO/SCHED_RR: 3567 */ 3568 if (task_has_rt_policy(p)) { 3569 p->static_prio = NICE_TO_PRIO(nice); 3570 goto out_unlock; 3571 } 3572 on_rq = p->on_rq; 3573 if (on_rq) 3574 dequeue_task(rq, p, 0); 3575 3576 p->static_prio = NICE_TO_PRIO(nice); 3577 set_load_weight(p); 3578 old_prio = p->prio; 3579 p->prio = effective_prio(p); 3580 delta = p->prio - old_prio; 3581 3582 if (on_rq) { 3583 enqueue_task(rq, p, 0); 3584 /* 3585 * If the task increased its priority or is running and 3586 * lowered its priority, then reschedule its CPU: 3587 */ 3588 if (delta < 0 || (delta > 0 && task_running(rq, p))) 3589 resched_task(rq->curr); 3590 } 3591out_unlock: 3592 task_rq_unlock(rq, p, &flags); 3593} 3594EXPORT_SYMBOL(set_user_nice); 3595 3596/* 3597 * can_nice - check if a task can reduce its nice value 3598 * @p: task 3599 * @nice: nice value 3600 */ 3601int can_nice(const struct task_struct *p, const int nice) 3602{ 3603 /* convert nice value [19,-20] to rlimit style value [1,40] */ 3604 int nice_rlim = 20 - nice; 3605 3606 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || 3607 capable(CAP_SYS_NICE)); 3608} 3609 3610#ifdef __ARCH_WANT_SYS_NICE 3611 3612/* 3613 * sys_nice - change the priority of the current process. 3614 * @increment: priority increment 3615 * 3616 * sys_setpriority is a more generic, but much slower function that 3617 * does similar things. 3618 */ 3619SYSCALL_DEFINE1(nice, int, increment) 3620{ 3621 long nice, retval; 3622 3623 /* 3624 * Setpriority might change our priority at the same moment. 3625 * We don't have to worry. Conceptually one call occurs first 3626 * and we have a single winner. 3627 */ 3628 if (increment < -40) 3629 increment = -40; 3630 if (increment > 40) 3631 increment = 40; 3632 3633 nice = TASK_NICE(current) + increment; 3634 if (nice < -20) 3635 nice = -20; 3636 if (nice > 19) 3637 nice = 19; 3638 3639 if (increment < 0 && !can_nice(current, nice)) 3640 return -EPERM; 3641 3642 retval = security_task_setnice(current, nice); 3643 if (retval) 3644 return retval; 3645 3646 set_user_nice(current, nice); 3647 return 0; 3648} 3649 3650#endif 3651 3652/** 3653 * task_prio - return the priority value of a given task. 3654 * @p: the task in question. 3655 * 3656 * This is the priority value as seen by users in /proc: 3657 * RT tasks map to -100 (highest priority) up to -1; normal 3658 * tasks map to 0..39, i.e. their nice value plus 20. 3659 */ 3660int task_prio(const struct task_struct *p) 3661{ 3662 return p->prio - MAX_RT_PRIO; 3663} 3664 3665/** 3666 * task_nice - return the nice value of a given task. 3667 * @p: the task in question. 3668 */ 3669int task_nice(const struct task_struct *p) 3670{ 3671 return TASK_NICE(p); 3672} 3673EXPORT_SYMBOL(task_nice); 3674 3675/** 3676 * idle_cpu - is a given cpu idle currently?
3677 * @cpu: the processor in question. 3678 */ 3679int idle_cpu(int cpu) 3680{ 3681 struct rq *rq = cpu_rq(cpu); 3682 3683 if (rq->curr != rq->idle) 3684 return 0; 3685 3686 if (rq->nr_running) 3687 return 0; 3688 3689#ifdef CONFIG_SMP 3690 if (!llist_empty(&rq->wake_list)) 3691 return 0; 3692#endif 3693 3694 return 1; 3695} 3696 3697/** 3698 * idle_task - return the idle task for a given cpu. 3699 * @cpu: the processor in question. 3700 */ 3701struct task_struct *idle_task(int cpu) 3702{ 3703 return cpu_rq(cpu)->idle; 3704} 3705 3706/** 3707 * find_process_by_pid - find a process with a matching PID value. 3708 * @pid: the pid in question. 3709 */ 3710static struct task_struct *find_process_by_pid(pid_t pid) 3711{ 3712 return pid ? find_task_by_vpid(pid) : current; 3713} 3714 3715/* Actually do priority change: must hold rq lock. */ 3716static void 3717__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) 3718{ 3719 p->policy = policy; 3720 p->rt_priority = prio; 3721 p->normal_prio = normal_prio(p); 3722 /* we are holding p->pi_lock already */ 3723 p->prio = rt_mutex_getprio(p); 3724 if (rt_prio(p->prio)) 3725 p->sched_class = &rt_sched_class; 3726 else 3727 p->sched_class = &fair_sched_class; 3728 set_load_weight(p); 3729} 3730 3731/* 3732 * check the target process has a UID that matches the current process's 3733 */ 3734static bool check_same_owner(struct task_struct *p) 3735{ 3736 const struct cred *cred = current_cred(), *pcred; 3737 bool match; 3738 3739 rcu_read_lock(); 3740 pcred = __task_cred(p); 3741 match = (uid_eq(cred->euid, pcred->euid) || 3742 uid_eq(cred->euid, pcred->uid)); 3743 rcu_read_unlock(); 3744 return match; 3745} 3746 3747static int __sched_setscheduler(struct task_struct *p, int policy, 3748 const struct sched_param *param, bool user) 3749{ 3750 int retval, oldprio, oldpolicy = -1, on_rq, running; 3751 unsigned long flags; 3752 const struct sched_class *prev_class; 3753 struct rq *rq; 3754 int reset_on_fork; 3755 3756 /* may grab non-irq protected spin_locks */ 3757 BUG_ON(in_interrupt()); 3758recheck: 3759 /* double check policy once rq lock held */ 3760 if (policy < 0) { 3761 reset_on_fork = p->sched_reset_on_fork; 3762 policy = oldpolicy = p->policy; 3763 } else { 3764 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); 3765 policy &= ~SCHED_RESET_ON_FORK; 3766 3767 if (policy != SCHED_FIFO && policy != SCHED_RR && 3768 policy != SCHED_NORMAL && policy != SCHED_BATCH && 3769 policy != SCHED_IDLE) 3770 return -EINVAL; 3771 } 3772 3773 /* 3774 * Valid priorities for SCHED_FIFO and SCHED_RR are 3775 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, 3776 * SCHED_BATCH and SCHED_IDLE is 0. 3777 */ 3778 if (param->sched_priority < 0 || 3779 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || 3780 (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) 3781 return -EINVAL; 3782 if (rt_policy(policy) != (param->sched_priority != 0)) 3783 return -EINVAL; 3784 3785 /* 3786 * Allow unprivileged RT tasks to decrease priority: 3787 */ 3788 if (user && !capable(CAP_SYS_NICE)) { 3789 if (rt_policy(policy)) { 3790 unsigned long rlim_rtprio = 3791 task_rlimit(p, RLIMIT_RTPRIO); 3792 3793 /* can't set/change the rt policy */ 3794 if (policy != p->policy && !rlim_rtprio) 3795 return -EPERM; 3796 3797 /* can't increase priority */ 3798 if (param->sched_priority > p->rt_priority && 3799 param->sched_priority > rlim_rtprio) 3800 return -EPERM; 3801 } 3802 3803 /* 3804 * Treat SCHED_IDLE as nice 20. 
Only allow a switch to 3805 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. 3806 */ 3807 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { 3808 if (!can_nice(p, TASK_NICE(p))) 3809 return -EPERM; 3810 } 3811 3812 /* can't change other user's priorities */ 3813 if (!check_same_owner(p)) 3814 return -EPERM; 3815 3816 /* Normal users shall not reset the sched_reset_on_fork flag */ 3817 if (p->sched_reset_on_fork && !reset_on_fork) 3818 return -EPERM; 3819 } 3820 3821 if (user) { 3822 retval = security_task_setscheduler(p); 3823 if (retval) 3824 return retval; 3825 } 3826 3827 /* 3828 * make sure no PI-waiters arrive (or leave) while we are 3829 * changing the priority of the task: 3830 * 3831 * To be able to change p->policy safely, the appropriate 3832 * runqueue lock must be held. 3833 */ 3834 rq = task_rq_lock(p, &flags); 3835 3836 /* 3837 * Changing the policy of the stop threads is a very bad idea 3838 */ 3839 if (p == rq->stop) { 3840 task_rq_unlock(rq, p, &flags); 3841 return -EINVAL; 3842 } 3843 3844 /* 3845 * If not changing anything there's no need to proceed further: 3846 */ 3847 if (unlikely(policy == p->policy && (!rt_policy(policy) || 3848 param->sched_priority == p->rt_priority))) { 3849 task_rq_unlock(rq, p, &flags); 3850 return 0; 3851 } 3852 3853#ifdef CONFIG_RT_GROUP_SCHED 3854 if (user) { 3855 /* 3856 * Do not allow realtime tasks into groups that have no runtime 3857 * assigned. 3858 */ 3859 if (rt_bandwidth_enabled() && rt_policy(policy) && 3860 task_group(p)->rt_bandwidth.rt_runtime == 0 && 3861 !task_group_is_autogroup(task_group(p))) { 3862 task_rq_unlock(rq, p, &flags); 3863 return -EPERM; 3864 } 3865 } 3866#endif 3867 3868 /* recheck policy now with rq lock held */ 3869 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 3870 policy = oldpolicy = -1; 3871 task_rq_unlock(rq, p, &flags); 3872 goto recheck; 3873 } 3874 on_rq = p->on_rq; 3875 running = task_current(rq, p); 3876 if (on_rq) 3877 dequeue_task(rq, p, 0); 3878 if (running) 3879 p->sched_class->put_prev_task(rq, p); 3880 3881 p->sched_reset_on_fork = reset_on_fork; 3882 3883 oldprio = p->prio; 3884 prev_class = p->sched_class; 3885 __setscheduler(rq, p, policy, param->sched_priority); 3886 3887 if (running) 3888 p->sched_class->set_curr_task(rq); 3889 if (on_rq) 3890 enqueue_task(rq, p, 0); 3891 3892 check_class_changed(rq, p, prev_class, oldprio); 3893 task_rq_unlock(rq, p, &flags); 3894 3895 rt_mutex_adjust_pi(p); 3896 3897 return 0; 3898} 3899 3900/** 3901 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. 3902 * @p: the task in question. 3903 * @policy: new policy. 3904 * @param: structure containing the new RT priority. 3905 * 3906 * NOTE that the task may already be dead. 3907 */ 3908int sched_setscheduler(struct task_struct *p, int policy, 3909 const struct sched_param *param) 3910{ 3911 return __sched_setscheduler(p, policy, param, true); 3912} 3913EXPORT_SYMBOL_GPL(sched_setscheduler); 3914 3915/** 3916 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. 3917 * @p: the task in question. 3918 * @policy: new policy. 3919 * @param: structure containing the new RT priority. 3920 * 3921 * Just like sched_setscheduler, only don't bother checking if the 3922 * current context has permission. For example, this is needed in 3923 * stop_machine(): we create temporary high priority worker threads, 3924 * but our caller might not have that capability.
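 *
 * Sketch of the kind of in-kernel call site this enables (editor's
 * illustration, not a specific caller):
 *
 *   struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
 *   sched_setscheduler_nocheck(kthread, SCHED_FIFO, &param);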
3925 */ 3926int sched_setscheduler_nocheck(struct task_struct *p, int policy, 3927 const struct sched_param *param) 3928{ 3929 return __sched_setscheduler(p, policy, param, false); 3930} 3931 3932static int 3933do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 3934{ 3935 struct sched_param lparam; 3936 struct task_struct *p; 3937 int retval; 3938 3939 if (!param || pid < 0) 3940 return -EINVAL; 3941 if (copy_from_user(&lparam, param, sizeof(struct sched_param))) 3942 return -EFAULT; 3943 3944 rcu_read_lock(); 3945 retval = -ESRCH; 3946 p = find_process_by_pid(pid); 3947 if (p != NULL) 3948 retval = sched_setscheduler(p, policy, &lparam); 3949 rcu_read_unlock(); 3950 3951 return retval; 3952} 3953 3954/** 3955 * sys_sched_setscheduler - set/change the scheduler policy and RT priority 3956 * @pid: the pid in question. 3957 * @policy: new policy. 3958 * @param: structure containing the new RT priority. 3959 */ 3960SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, 3961 struct sched_param __user *, param) 3962{ 3963 /* negative values for policy are not valid */ 3964 if (policy < 0) 3965 return -EINVAL; 3966 3967 return do_sched_setscheduler(pid, policy, param); 3968} 3969 3970/** 3971 * sys_sched_setparam - set/change the RT priority of a thread 3972 * @pid: the pid in question. 3973 * @param: structure containing the new RT priority. 3974 */ 3975SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) 3976{ 3977 return do_sched_setscheduler(pid, -1, param); 3978} 3979 3980/** 3981 * sys_sched_getscheduler - get the policy (scheduling class) of a thread 3982 * @pid: the pid in question. 3983 */ 3984SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) 3985{ 3986 struct task_struct *p; 3987 int retval; 3988 3989 if (pid < 0) 3990 return -EINVAL; 3991 3992 retval = -ESRCH; 3993 rcu_read_lock(); 3994 p = find_process_by_pid(pid); 3995 if (p) { 3996 retval = security_task_getscheduler(p); 3997 if (!retval) 3998 retval = p->policy 3999 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); 4000 } 4001 rcu_read_unlock(); 4002 return retval; 4003} 4004 4005/** 4006 * sys_sched_getparam - get the RT priority of a thread 4007 * @pid: the pid in question. 4008 * @param: structure containing the RT priority. 4009 */ 4010SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) 4011{ 4012 struct sched_param lp; 4013 struct task_struct *p; 4014 int retval; 4015 4016 if (!param || pid < 0) 4017 return -EINVAL; 4018 4019 rcu_read_lock(); 4020 p = find_process_by_pid(pid); 4021 retval = -ESRCH; 4022 if (!p) 4023 goto out_unlock; 4024 4025 retval = security_task_getscheduler(p); 4026 if (retval) 4027 goto out_unlock; 4028 4029 lp.sched_priority = p->rt_priority; 4030 rcu_read_unlock(); 4031 4032 /* 4033 * This one might sleep, we cannot do it with a spinlock held ... 4034 */ 4035 retval = copy_to_user(param, &lp, sizeof(*param)) ? 
-EFAULT : 0; 4036 4037 return retval; 4038 4039out_unlock: 4040 rcu_read_unlock(); 4041 return retval; 4042} 4043 4044long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) 4045{ 4046 cpumask_var_t cpus_allowed, new_mask; 4047 struct task_struct *p; 4048 int retval; 4049 4050 get_online_cpus(); 4051 rcu_read_lock(); 4052 4053 p = find_process_by_pid(pid); 4054 if (!p) { 4055 rcu_read_unlock(); 4056 put_online_cpus(); 4057 return -ESRCH; 4058 } 4059 4060 /* Prevent p going away */ 4061 get_task_struct(p); 4062 rcu_read_unlock(); 4063 4064 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { 4065 retval = -ENOMEM; 4066 goto out_put_task; 4067 } 4068 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { 4069 retval = -ENOMEM; 4070 goto out_free_cpus_allowed; 4071 } 4072 retval = -EPERM; 4073 if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE)) 4074 goto out_unlock; 4075 4076 retval = security_task_setscheduler(p); 4077 if (retval) 4078 goto out_unlock; 4079 4080 cpuset_cpus_allowed(p, cpus_allowed); 4081 cpumask_and(new_mask, in_mask, cpus_allowed); 4082again: 4083 retval = set_cpus_allowed_ptr(p, new_mask); 4084 4085 if (!retval) { 4086 cpuset_cpus_allowed(p, cpus_allowed); 4087 if (!cpumask_subset(new_mask, cpus_allowed)) { 4088 /* 4089 * We must have raced with a concurrent cpuset 4090 * update. Just reset the cpus_allowed to the 4091 * cpuset's cpus_allowed 4092 */ 4093 cpumask_copy(new_mask, cpus_allowed); 4094 goto again; 4095 } 4096 } 4097out_unlock: 4098 free_cpumask_var(new_mask); 4099out_free_cpus_allowed: 4100 free_cpumask_var(cpus_allowed); 4101out_put_task: 4102 put_task_struct(p); 4103 put_online_cpus(); 4104 return retval; 4105} 4106 4107static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, 4108 struct cpumask *new_mask) 4109{ 4110 if (len < cpumask_size()) 4111 cpumask_clear(new_mask); 4112 else if (len > cpumask_size()) 4113 len = cpumask_size(); 4114 4115 return copy_from_user(new_mask, user_mask_ptr, len) ? 
-EFAULT : 0; 4116} 4117 4118/** 4119 * sys_sched_setaffinity - set the cpu affinity of a process 4120 * @pid: pid of the process 4121 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 4122 * @user_mask_ptr: user-space pointer to the new cpu mask 4123 */ 4124SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, 4125 unsigned long __user *, user_mask_ptr) 4126{ 4127 cpumask_var_t new_mask; 4128 int retval; 4129 4130 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) 4131 return -ENOMEM; 4132 4133 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); 4134 if (retval == 0) 4135 retval = sched_setaffinity(pid, new_mask); 4136 free_cpumask_var(new_mask); 4137 return retval; 4138} 4139 4140long sched_getaffinity(pid_t pid, struct cpumask *mask) 4141{ 4142 struct task_struct *p; 4143 unsigned long flags; 4144 int retval; 4145 4146 get_online_cpus(); 4147 rcu_read_lock(); 4148 4149 retval = -ESRCH; 4150 p = find_process_by_pid(pid); 4151 if (!p) 4152 goto out_unlock; 4153 4154 retval = security_task_getscheduler(p); 4155 if (retval) 4156 goto out_unlock; 4157 4158 raw_spin_lock_irqsave(&p->pi_lock, flags); 4159 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 4160 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 4161 4162out_unlock: 4163 rcu_read_unlock(); 4164 put_online_cpus(); 4165 4166 return retval; 4167} 4168 4169/** 4170 * sys_sched_getaffinity - get the cpu affinity of a process 4171 * @pid: pid of the process 4172 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 4173 * @user_mask_ptr: user-space pointer to hold the current cpu mask 4174 */ 4175SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, 4176 unsigned long __user *, user_mask_ptr) 4177{ 4178 int ret; 4179 cpumask_var_t mask; 4180 4181 if ((len * BITS_PER_BYTE) < nr_cpu_ids) 4182 return -EINVAL; 4183 if (len & (sizeof(unsigned long)-1)) 4184 return -EINVAL; 4185 4186 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 4187 return -ENOMEM; 4188 4189 ret = sched_getaffinity(pid, mask); 4190 if (ret == 0) { 4191 size_t retlen = min_t(size_t, len, cpumask_size()); 4192 4193 if (copy_to_user(user_mask_ptr, mask, retlen)) 4194 ret = -EFAULT; 4195 else 4196 ret = retlen; 4197 } 4198 free_cpumask_var(mask); 4199 4200 return ret; 4201} 4202 4203/** 4204 * sys_sched_yield - yield the current processor to other threads. 4205 * 4206 * This function yields the current CPU to other tasks. If there are no 4207 * other threads running on this CPU then this function will return. 
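 *
 * User space normally reaches this through the sched_yield(2) wrapper,
 * e.g. (illustrative only; see the yield() comment further down for why
 * spinning on it is almost always wrong):
 *
 *	sched_yield();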
4208 */ 4209SYSCALL_DEFINE0(sched_yield) 4210{ 4211 struct rq *rq = this_rq_lock(); 4212 4213 schedstat_inc(rq, yld_count); 4214 current->sched_class->yield_task(rq); 4215 4216 /* 4217 * Since we are going to call schedule() anyway, there's 4218 * no need to preempt or enable interrupts: 4219 */ 4220 __release(rq->lock); 4221 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 4222 do_raw_spin_unlock(&rq->lock); 4223 sched_preempt_enable_no_resched(); 4224 4225 schedule(); 4226 4227 return 0; 4228} 4229 4230static inline int should_resched(void) 4231{ 4232 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE); 4233} 4234 4235static void __cond_resched(void) 4236{ 4237 add_preempt_count(PREEMPT_ACTIVE); 4238 __schedule(); 4239 sub_preempt_count(PREEMPT_ACTIVE); 4240} 4241 4242int __sched _cond_resched(void) 4243{ 4244 if (should_resched()) { 4245 __cond_resched(); 4246 return 1; 4247 } 4248 return 0; 4249} 4250EXPORT_SYMBOL(_cond_resched); 4251 4252/* 4253 * __cond_resched_lock() - if a reschedule is pending, drop the given lock, 4254 * call schedule, and on return reacquire the lock. 4255 * 4256 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level 4257 * operations here to prevent schedule() from being called twice (once via 4258 * spin_unlock(), once by hand). 4259 */ 4260int __cond_resched_lock(spinlock_t *lock) 4261{ 4262 int resched = should_resched(); 4263 int ret = 0; 4264 4265 lockdep_assert_held(lock); 4266 4267 if (spin_needbreak(lock) || resched) { 4268 spin_unlock(lock); 4269 if (resched) 4270 __cond_resched(); 4271 else 4272 cpu_relax(); 4273 ret = 1; 4274 spin_lock(lock); 4275 } 4276 return ret; 4277} 4278EXPORT_SYMBOL(__cond_resched_lock); 4279 4280int __sched __cond_resched_softirq(void) 4281{ 4282 BUG_ON(!in_softirq()); 4283 4284 if (should_resched()) { 4285 local_bh_enable(); 4286 __cond_resched(); 4287 local_bh_disable(); 4288 return 1; 4289 } 4290 return 0; 4291} 4292EXPORT_SYMBOL(__cond_resched_softirq); 4293 4294/** 4295 * yield - yield the current processor to other threads. 4296 * 4297 * Do not ever use this function, there's a 99% chance you're doing it wrong. 4298 * 4299 * The scheduler is at all times free to pick the calling task as the most 4300 * eligible task to run, if removing the yield() call from your code breaks 4301 * it, its already broken. 4302 * 4303 * Typical broken usage is: 4304 * 4305 * while (!event) 4306 * yield(); 4307 * 4308 * where one assumes that yield() will let 'the other' process run that will 4309 * make event true. If the current task is a SCHED_FIFO task that will never 4310 * happen. Never use yield() as a progress guarantee!! 4311 * 4312 * If you want to use yield() to wait for something, use wait_event(). 4313 * If you want to use yield() to be 'nice' for others, use cond_resched(). 4314 * If you still want to use yield(), do not! 4315 */ 4316void __sched yield(void) 4317{ 4318 set_current_state(TASK_RUNNING); 4319 sys_sched_yield(); 4320} 4321EXPORT_SYMBOL(yield); 4322 4323/** 4324 * yield_to - yield the current processor to another thread in 4325 * your thread group, or accelerate that thread toward the 4326 * processor it's on. 4327 * @p: target task 4328 * @preempt: whether task preemption is allowed or not 4329 * 4330 * It's the caller's job to ensure that the target task struct 4331 * can't go away on us before we can do any checks. 4332 * 4333 * Returns true if we indeed boosted the target task. 
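 *
 * Illustrative use (not taken from any particular caller): a hypervisor
 * that notices vcpu A spinning on a lock held by the task backing vcpu B
 * could call yield_to(vcpu_b_task, true) to accelerate B and break the
 * spin; 'vcpu_b_task' is a hypothetical task pointer here.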
4334 */ 4335bool __sched yield_to(struct task_struct *p, bool preempt) 4336{ 4337 struct task_struct *curr = current; 4338 struct rq *rq, *p_rq; 4339 unsigned long flags; 4340 bool yielded = 0; 4341 4342 local_irq_save(flags); 4343 rq = this_rq(); 4344 4345again: 4346 p_rq = task_rq(p); 4347 double_rq_lock(rq, p_rq); 4348 while (task_rq(p) != p_rq) { 4349 double_rq_unlock(rq, p_rq); 4350 goto again; 4351 } 4352 4353 if (!curr->sched_class->yield_to_task) 4354 goto out; 4355 4356 if (curr->sched_class != p->sched_class) 4357 goto out; 4358 4359 if (task_running(p_rq, p) || p->state) 4360 goto out; 4361 4362 yielded = curr->sched_class->yield_to_task(rq, p, preempt); 4363 if (yielded) { 4364 schedstat_inc(rq, yld_count); 4365 /* 4366 * Make p's CPU reschedule; pick_next_entity takes care of 4367 * fairness. 4368 */ 4369 if (preempt && rq != p_rq) 4370 resched_task(p_rq->curr); 4371 } 4372 4373out: 4374 double_rq_unlock(rq, p_rq); 4375 local_irq_restore(flags); 4376 4377 if (yielded) 4378 schedule(); 4379 4380 return yielded; 4381} 4382EXPORT_SYMBOL_GPL(yield_to); 4383 4384/* 4385 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 4386 * that process accounting knows that this is a task in IO wait state. 4387 */ 4388void __sched io_schedule(void) 4389{ 4390 struct rq *rq = raw_rq(); 4391 4392 delayacct_blkio_start(); 4393 atomic_inc(&rq->nr_iowait); 4394 blk_flush_plug(current); 4395 current->in_iowait = 1; 4396 schedule(); 4397 current->in_iowait = 0; 4398 atomic_dec(&rq->nr_iowait); 4399 delayacct_blkio_end(); 4400} 4401EXPORT_SYMBOL(io_schedule); 4402 4403long __sched io_schedule_timeout(long timeout) 4404{ 4405 struct rq *rq = raw_rq(); 4406 long ret; 4407 4408 delayacct_blkio_start(); 4409 atomic_inc(&rq->nr_iowait); 4410 blk_flush_plug(current); 4411 current->in_iowait = 1; 4412 ret = schedule_timeout(timeout); 4413 current->in_iowait = 0; 4414 atomic_dec(&rq->nr_iowait); 4415 delayacct_blkio_end(); 4416 return ret; 4417} 4418 4419/** 4420 * sys_sched_get_priority_max - return maximum RT priority. 4421 * @policy: scheduling class. 4422 * 4423 * this syscall returns the maximum rt_priority that can be used 4424 * by a given scheduling class. 4425 */ 4426SYSCALL_DEFINE1(sched_get_priority_max, int, policy) 4427{ 4428 int ret = -EINVAL; 4429 4430 switch (policy) { 4431 case SCHED_FIFO: 4432 case SCHED_RR: 4433 ret = MAX_USER_RT_PRIO-1; 4434 break; 4435 case SCHED_NORMAL: 4436 case SCHED_BATCH: 4437 case SCHED_IDLE: 4438 ret = 0; 4439 break; 4440 } 4441 return ret; 4442} 4443 4444/** 4445 * sys_sched_get_priority_min - return minimum RT priority. 4446 * @policy: scheduling class. 4447 * 4448 * this syscall returns the minimum rt_priority that can be used 4449 * by a given scheduling class. 4450 */ 4451SYSCALL_DEFINE1(sched_get_priority_min, int, policy) 4452{ 4453 int ret = -EINVAL; 4454 4455 switch (policy) { 4456 case SCHED_FIFO: 4457 case SCHED_RR: 4458 ret = 1; 4459 break; 4460 case SCHED_NORMAL: 4461 case SCHED_BATCH: 4462 case SCHED_IDLE: 4463 ret = 0; 4464 } 4465 return ret; 4466} 4467 4468/** 4469 * sys_sched_rr_get_interval - return the default timeslice of a process. 4470 * @pid: pid of the process. 4471 * @interval: userspace pointer to the timeslice value. 4472 * 4473 * this syscall writes the default timeslice value of a given process 4474 * into the user-space timespec buffer. A value of '0' means infinity. 
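 *
 * Illustrative user-space query (a sketch; pid 0 names the calling
 * process):
 *
 *	struct timespec ts;
 *
 *	if (sched_rr_get_interval(0, &ts) == 0)
 *		printf("timeslice: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);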
4475 */ 4476SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, 4477 struct timespec __user *, interval) 4478{ 4479 struct task_struct *p; 4480 unsigned int time_slice; 4481 unsigned long flags; 4482 struct rq *rq; 4483 int retval; 4484 struct timespec t; 4485 4486 if (pid < 0) 4487 return -EINVAL; 4488 4489 retval = -ESRCH; 4490 rcu_read_lock(); 4491 p = find_process_by_pid(pid); 4492 if (!p) 4493 goto out_unlock; 4494 4495 retval = security_task_getscheduler(p); 4496 if (retval) 4497 goto out_unlock; 4498 4499 rq = task_rq_lock(p, &flags); 4500 time_slice = p->sched_class->get_rr_interval(rq, p); 4501 task_rq_unlock(rq, p, &flags); 4502 4503 rcu_read_unlock(); 4504 jiffies_to_timespec(time_slice, &t); 4505 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 4506 return retval; 4507 4508out_unlock: 4509 rcu_read_unlock(); 4510 return retval; 4511} 4512 4513static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; 4514 4515void sched_show_task(struct task_struct *p) 4516{ 4517 unsigned long free = 0; 4518 unsigned state; 4519 4520 state = p->state ? __ffs(p->state) + 1 : 0; 4521 printk(KERN_INFO "%-15.15s %c", p->comm, 4522 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 4523#if BITS_PER_LONG == 32 4524 if (state == TASK_RUNNING) 4525 printk(KERN_CONT " running "); 4526 else 4527 printk(KERN_CONT " %08lx ", thread_saved_pc(p)); 4528#else 4529 if (state == TASK_RUNNING) 4530 printk(KERN_CONT " running task "); 4531 else 4532 printk(KERN_CONT " %016lx ", thread_saved_pc(p)); 4533#endif 4534#ifdef CONFIG_DEBUG_STACK_USAGE 4535 free = stack_not_used(p); 4536#endif 4537 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, 4538 task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)), 4539 (unsigned long)task_thread_info(p)->flags); 4540 4541 show_stack(p, NULL); 4542} 4543 4544void show_state_filter(unsigned long state_filter) 4545{ 4546 struct task_struct *g, *p; 4547 4548#if BITS_PER_LONG == 32 4549 printk(KERN_INFO 4550 " task PC stack pid father\n"); 4551#else 4552 printk(KERN_INFO 4553 " task PC stack pid father\n"); 4554#endif 4555 rcu_read_lock(); 4556 do_each_thread(g, p) { 4557 /* 4558 * reset the NMI-timeout, listing all files on a slow 4559 * console might take a lot of time: 4560 */ 4561 touch_nmi_watchdog(); 4562 if (!state_filter || (p->state & state_filter)) 4563 sched_show_task(p); 4564 } while_each_thread(g, p); 4565 4566 touch_all_softlockup_watchdogs(); 4567 4568#ifdef CONFIG_SCHED_DEBUG 4569 sysrq_sched_debug_show(); 4570#endif 4571 rcu_read_unlock(); 4572 /* 4573 * Only show locks if all tasks are dumped: 4574 */ 4575 if (!state_filter) 4576 debug_show_all_locks(); 4577} 4578 4579void __cpuinit init_idle_bootup_task(struct task_struct *idle) 4580{ 4581 idle->sched_class = &idle_sched_class; 4582} 4583 4584/** 4585 * init_idle - set up an idle thread for a given CPU 4586 * @idle: task in question 4587 * @cpu: cpu the idle task belongs to 4588 * 4589 * NOTE: this function does not set the idle thread's NEED_RESCHED 4590 * flag, to make booting more robust. 
4591 */ 4592void __cpuinit init_idle(struct task_struct *idle, int cpu) 4593{ 4594 struct rq *rq = cpu_rq(cpu); 4595 unsigned long flags; 4596 4597 raw_spin_lock_irqsave(&rq->lock, flags); 4598 4599 __sched_fork(idle); 4600 idle->state = TASK_RUNNING; 4601 idle->se.exec_start = sched_clock(); 4602 4603 do_set_cpus_allowed(idle, cpumask_of(cpu)); 4604 /* 4605 * We're having a chicken and egg problem, even though we are 4606 * holding rq->lock, the cpu isn't yet set to this cpu so the 4607 * lockdep check in task_group() will fail. 4608 * 4609 * Similar case to sched_fork(). / Alternatively we could 4610 * use task_rq_lock() here and obtain the other rq->lock. 4611 * 4612 * Silence PROVE_RCU 4613 */ 4614 rcu_read_lock(); 4615 __set_task_cpu(idle, cpu); 4616 rcu_read_unlock(); 4617 4618 rq->curr = rq->idle = idle; 4619#if defined(CONFIG_SMP) 4620 idle->on_cpu = 1; 4621#endif 4622 raw_spin_unlock_irqrestore(&rq->lock, flags); 4623 4624 /* Set the preempt count _outside_ the spinlocks! */ 4625 task_thread_info(idle)->preempt_count = 0; 4626 4627 /* 4628 * The idle tasks have their own, simple scheduling class: 4629 */ 4630 idle->sched_class = &idle_sched_class; 4631 ftrace_graph_init_idle_task(idle, cpu); 4632#if defined(CONFIG_SMP) 4633 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); 4634#endif 4635} 4636 4637#ifdef CONFIG_SMP 4638void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) 4639{ 4640 if (p->sched_class && p->sched_class->set_cpus_allowed) 4641 p->sched_class->set_cpus_allowed(p, new_mask); 4642 4643 cpumask_copy(&p->cpus_allowed, new_mask); 4644 p->nr_cpus_allowed = cpumask_weight(new_mask); 4645} 4646 4647/* 4648 * This is how migration works: 4649 * 4650 * 1) we invoke migration_cpu_stop() on the target CPU using 4651 * stop_one_cpu(). 4652 * 2) stopper starts to run (implicitly forcing the migrated thread 4653 * off the CPU) 4654 * 3) it checks whether the migrated task is still in the wrong runqueue. 4655 * 4) if it's in the wrong runqueue then the migration thread removes 4656 * it and puts it into the right queue. 4657 * 5) stopper completes and stop_one_cpu() returns and the migration 4658 * is done. 4659 */ 4660 4661/* 4662 * Change a given task's CPU affinity. Migrate the thread to a 4663 * proper CPU and schedule it away if the CPU it's executing on 4664 * is removed from the allowed bitmask. 4665 * 4666 * NOTE: the caller must have a valid reference to the task, the 4667 * task must not exit() & deallocate itself prematurely. The 4668 * call is not atomic; no spinlocks may be held. 4669 */ 4670int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) 4671{ 4672 unsigned long flags; 4673 struct rq *rq; 4674 unsigned int dest_cpu; 4675 int ret = 0; 4676 4677 rq = task_rq_lock(p, &flags); 4678 4679 if (cpumask_equal(&p->cpus_allowed, new_mask)) 4680 goto out; 4681 4682 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 4683 ret = -EINVAL; 4684 goto out; 4685 } 4686 4687 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) { 4688 ret = -EINVAL; 4689 goto out; 4690 } 4691 4692 do_set_cpus_allowed(p, new_mask); 4693 4694 /* Can the task run on the task's current CPU? If so, we're done */ 4695 if (cpumask_test_cpu(task_cpu(p), new_mask)) 4696 goto out; 4697 4698 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); 4699 if (p->on_rq) { 4700 struct migration_arg arg = { p, dest_cpu }; 4701 /* Need help from migration thread: drop lock and wait. 
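 * The stopper runs migration_cpu_stop() -> __migrate_task(), which
 * re-checks task_cpu() and the allowed mask under both rq locks, so the
 * task cannot end up on a CPU that the (possibly changed) mask forbids.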
*/ 4702 task_rq_unlock(rq, p, &flags); 4703 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 4704 tlb_migrate_finish(p->mm); 4705 return 0; 4706 } 4707out: 4708 task_rq_unlock(rq, p, &flags); 4709 4710 return ret; 4711} 4712EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); 4713 4714/* 4715 * Move (not current) task off this cpu, onto dest cpu. We're doing 4716 * this because either it can't run here any more (set_cpus_allowed() 4717 * away from this CPU, or CPU going down), or because we're 4718 * attempting to rebalance this task on exec (sched_exec). 4719 * 4720 * So we race with normal scheduler movements, but that's OK, as long 4721 * as the task is no longer on this CPU. 4722 * 4723 * Returns non-zero if task was successfully migrated. 4724 */ 4725static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 4726{ 4727 struct rq *rq_dest, *rq_src; 4728 int ret = 0; 4729 4730 if (unlikely(!cpu_active(dest_cpu))) 4731 return ret; 4732 4733 rq_src = cpu_rq(src_cpu); 4734 rq_dest = cpu_rq(dest_cpu); 4735 4736 raw_spin_lock(&p->pi_lock); 4737 double_rq_lock(rq_src, rq_dest); 4738 /* Already moved. */ 4739 if (task_cpu(p) != src_cpu) 4740 goto done; 4741 /* Affinity changed (again). */ 4742 if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) 4743 goto fail; 4744 4745 /* 4746 * If we're not on a rq, the next wake-up will ensure we're 4747 * placed properly. 4748 */ 4749 if (p->on_rq) { 4750 dequeue_task(rq_src, p, 0); 4751 set_task_cpu(p, dest_cpu); 4752 enqueue_task(rq_dest, p, 0); 4753 check_preempt_curr(rq_dest, p, 0); 4754 } 4755done: 4756 ret = 1; 4757fail: 4758 double_rq_unlock(rq_src, rq_dest); 4759 raw_spin_unlock(&p->pi_lock); 4760 return ret; 4761} 4762 4763/* 4764 * migration_cpu_stop - this will be executed by a highprio stopper thread 4765 * and performs thread migration by bumping thread off CPU then 4766 * 'pushing' onto another runqueue. 4767 */ 4768static int migration_cpu_stop(void *data) 4769{ 4770 struct migration_arg *arg = data; 4771 4772 /* 4773 * The original target cpu might have gone down and we might 4774 * be on another cpu but it doesn't matter. 4775 */ 4776 local_irq_disable(); 4777 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu); 4778 local_irq_enable(); 4779 return 0; 4780} 4781 4782#ifdef CONFIG_HOTPLUG_CPU 4783 4784/* 4785 * Ensures that the idle task is using init_mm right before its cpu goes 4786 * offline. 4787 */ 4788void idle_task_exit(void) 4789{ 4790 struct mm_struct *mm = current->active_mm; 4791 4792 BUG_ON(cpu_online(smp_processor_id())); 4793 4794 if (mm != &init_mm) 4795 switch_mm(mm, &init_mm, current); 4796 mmdrop(mm); 4797} 4798 4799/* 4800 * Since this CPU is going 'away' for a while, fold any nr_active delta 4801 * we might have. Assumes we're called after migrate_tasks() so that the 4802 * nr_active count is stable. 4803 * 4804 * Also see the comment "Global load-average calculations". 4805 */ 4806static void calc_load_migrate(struct rq *rq) 4807{ 4808 long delta = calc_load_fold_active(rq); 4809 if (delta) 4810 atomic_long_add(delta, &calc_load_tasks); 4811} 4812 4813/* 4814 * Migrate all tasks from the rq, sleeping tasks will be migrated by 4815 * try_to_wake_up()->select_task_rq(). 4816 * 4817 * Called with rq->lock held even though we'er in stop_machine() and 4818 * there's no concurrency possible, we hold the required locks anyway 4819 * because of lock validation efforts. 
4820 */ 4821static void migrate_tasks(unsigned int dead_cpu) 4822{ 4823 struct rq *rq = cpu_rq(dead_cpu); 4824 struct task_struct *next, *stop = rq->stop; 4825 int dest_cpu; 4826 4827 /* 4828 * Fudge the rq selection such that the below task selection loop 4829 * doesn't get stuck on the currently eligible stop task. 4830 * 4831 * We're currently inside stop_machine() and the rq is either stuck 4832 * in the stop_machine_cpu_stop() loop, or we're executing this code, 4833 * either way we should never end up calling schedule() until we're 4834 * done here. 4835 */ 4836 rq->stop = NULL; 4837 4838 for ( ; ; ) { 4839 /* 4840 * There's this thread running, bail when that's the only 4841 * remaining thread. 4842 */ 4843 if (rq->nr_running == 1) 4844 break; 4845 4846 next = pick_next_task(rq); 4847 BUG_ON(!next); 4848 next->sched_class->put_prev_task(rq, next); 4849 4850 /* Find suitable destination for @next, with force if needed. */ 4851 dest_cpu = select_fallback_rq(dead_cpu, next); 4852 raw_spin_unlock(&rq->lock); 4853 4854 __migrate_task(next, dead_cpu, dest_cpu); 4855 4856 raw_spin_lock(&rq->lock); 4857 } 4858 4859 rq->stop = stop; 4860} 4861 4862#endif /* CONFIG_HOTPLUG_CPU */ 4863 4864#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) 4865 4866static struct ctl_table sd_ctl_dir[] = { 4867 { 4868 .procname = "sched_domain", 4869 .mode = 0555, 4870 }, 4871 {} 4872}; 4873 4874static struct ctl_table sd_ctl_root[] = { 4875 { 4876 .procname = "kernel", 4877 .mode = 0555, 4878 .child = sd_ctl_dir, 4879 }, 4880 {} 4881}; 4882 4883static struct ctl_table *sd_alloc_ctl_entry(int n) 4884{ 4885 struct ctl_table *entry = 4886 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); 4887 4888 return entry; 4889} 4890 4891static void sd_free_ctl_entry(struct ctl_table **tablep) 4892{ 4893 struct ctl_table *entry; 4894 4895 /* 4896 * In the intermediate directories, both the child directory and 4897 * procname are dynamically allocated and could fail but the mode 4898 * will always be set. In the lowest directory the names are 4899 * static strings and all have proc handlers. 
4900 */ 4901 for (entry = *tablep; entry->mode; entry++) { 4902 if (entry->child) 4903 sd_free_ctl_entry(&entry->child); 4904 if (entry->proc_handler == NULL) 4905 kfree(entry->procname); 4906 } 4907 4908 kfree(*tablep); 4909 *tablep = NULL; 4910} 4911 4912static int min_load_idx = 0; 4913static int max_load_idx = CPU_LOAD_IDX_MAX; 4914 4915static void 4916set_table_entry(struct ctl_table *entry, 4917 const char *procname, void *data, int maxlen, 4918 umode_t mode, proc_handler *proc_handler, 4919 bool load_idx) 4920{ 4921 entry->procname = procname; 4922 entry->data = data; 4923 entry->maxlen = maxlen; 4924 entry->mode = mode; 4925 entry->proc_handler = proc_handler; 4926 4927 if (load_idx) { 4928 entry->extra1 = &min_load_idx; 4929 entry->extra2 = &max_load_idx; 4930 } 4931} 4932 4933static struct ctl_table * 4934sd_alloc_ctl_domain_table(struct sched_domain *sd) 4935{ 4936 struct ctl_table *table = sd_alloc_ctl_entry(13); 4937 4938 if (table == NULL) 4939 return NULL; 4940 4941 set_table_entry(&table[0], "min_interval", &sd->min_interval, 4942 sizeof(long), 0644, proc_doulongvec_minmax, false); 4943 set_table_entry(&table[1], "max_interval", &sd->max_interval, 4944 sizeof(long), 0644, proc_doulongvec_minmax, false); 4945 set_table_entry(&table[2], "busy_idx", &sd->busy_idx, 4946 sizeof(int), 0644, proc_dointvec_minmax, true); 4947 set_table_entry(&table[3], "idle_idx", &sd->idle_idx, 4948 sizeof(int), 0644, proc_dointvec_minmax, true); 4949 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, 4950 sizeof(int), 0644, proc_dointvec_minmax, true); 4951 set_table_entry(&table[5], "wake_idx", &sd->wake_idx, 4952 sizeof(int), 0644, proc_dointvec_minmax, true); 4953 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, 4954 sizeof(int), 0644, proc_dointvec_minmax, true); 4955 set_table_entry(&table[7], "busy_factor", &sd->busy_factor, 4956 sizeof(int), 0644, proc_dointvec_minmax, false); 4957 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, 4958 sizeof(int), 0644, proc_dointvec_minmax, false); 4959 set_table_entry(&table[9], "cache_nice_tries", 4960 &sd->cache_nice_tries, 4961 sizeof(int), 0644, proc_dointvec_minmax, false); 4962 set_table_entry(&table[10], "flags", &sd->flags, 4963 sizeof(int), 0644, proc_dointvec_minmax, false); 4964 set_table_entry(&table[11], "name", sd->name, 4965 CORENAME_MAX_SIZE, 0444, proc_dostring, false); 4966 /* &table[12] is terminator */ 4967 4968 return table; 4969} 4970 4971static ctl_table *sd_alloc_ctl_cpu_table(int cpu) 4972{ 4973 struct ctl_table *entry, *table; 4974 struct sched_domain *sd; 4975 int domain_num = 0, i; 4976 char buf[32]; 4977 4978 for_each_domain(cpu, sd) 4979 domain_num++; 4980 entry = table = sd_alloc_ctl_entry(domain_num + 1); 4981 if (table == NULL) 4982 return NULL; 4983 4984 i = 0; 4985 for_each_domain(cpu, sd) { 4986 snprintf(buf, 32, "domain%d", i); 4987 entry->procname = kstrdup(buf, GFP_KERNEL); 4988 entry->mode = 0555; 4989 entry->child = sd_alloc_ctl_domain_table(sd); 4990 entry++; 4991 i++; 4992 } 4993 return table; 4994} 4995 4996static struct ctl_table_header *sd_sysctl_header; 4997static void register_sched_domain_sysctl(void) 4998{ 4999 int i, cpu_num = num_possible_cpus(); 5000 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); 5001 char buf[32]; 5002 5003 WARN_ON(sd_ctl_dir[0].child); 5004 sd_ctl_dir[0].child = entry; 5005 5006 if (entry == NULL) 5007 return; 5008 5009 for_each_possible_cpu(i) { 5010 snprintf(buf, 32, "cpu%d", i); 5011 entry->procname = kstrdup(buf, GFP_KERNEL); 5012 
entry->mode = 0555; 5013 entry->child = sd_alloc_ctl_cpu_table(i); 5014 entry++; 5015 } 5016 5017 WARN_ON(sd_sysctl_header); 5018 sd_sysctl_header = register_sysctl_table(sd_ctl_root); 5019} 5020 5021/* may be called multiple times per register */ 5022static void unregister_sched_domain_sysctl(void) 5023{ 5024 if (sd_sysctl_header) 5025 unregister_sysctl_table(sd_sysctl_header); 5026 sd_sysctl_header = NULL; 5027 if (sd_ctl_dir[0].child) 5028 sd_free_ctl_entry(&sd_ctl_dir[0].child); 5029} 5030#else 5031static void register_sched_domain_sysctl(void) 5032{ 5033} 5034static void unregister_sched_domain_sysctl(void) 5035{ 5036} 5037#endif 5038 5039static void set_rq_online(struct rq *rq) 5040{ 5041 if (!rq->online) { 5042 const struct sched_class *class; 5043 5044 cpumask_set_cpu(rq->cpu, rq->rd->online); 5045 rq->online = 1; 5046 5047 for_each_class(class) { 5048 if (class->rq_online) 5049 class->rq_online(rq); 5050 } 5051 } 5052} 5053 5054static void set_rq_offline(struct rq *rq) 5055{ 5056 if (rq->online) { 5057 const struct sched_class *class; 5058 5059 for_each_class(class) { 5060 if (class->rq_offline) 5061 class->rq_offline(rq); 5062 } 5063 5064 cpumask_clear_cpu(rq->cpu, rq->rd->online); 5065 rq->online = 0; 5066 } 5067} 5068 5069/* 5070 * migration_call - callback that gets triggered when a CPU is added. 5071 * Here we can start up the necessary migration thread for the new CPU. 5072 */ 5073static int __cpuinit 5074migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) 5075{ 5076 int cpu = (long)hcpu; 5077 unsigned long flags; 5078 struct rq *rq = cpu_rq(cpu); 5079 5080 switch (action & ~CPU_TASKS_FROZEN) { 5081 5082 case CPU_UP_PREPARE: 5083 rq->calc_load_update = calc_load_update; 5084 break; 5085 5086 case CPU_ONLINE: 5087 /* Update our root-domain */ 5088 raw_spin_lock_irqsave(&rq->lock, flags); 5089 if (rq->rd) { 5090 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5091 5092 set_rq_online(rq); 5093 } 5094 raw_spin_unlock_irqrestore(&rq->lock, flags); 5095 break; 5096 5097#ifdef CONFIG_HOTPLUG_CPU 5098 case CPU_DYING: 5099 sched_ttwu_pending(); 5100 /* Update our root-domain */ 5101 raw_spin_lock_irqsave(&rq->lock, flags); 5102 if (rq->rd) { 5103 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5104 set_rq_offline(rq); 5105 } 5106 migrate_tasks(cpu); 5107 BUG_ON(rq->nr_running != 1); /* the migration thread */ 5108 raw_spin_unlock_irqrestore(&rq->lock, flags); 5109 break; 5110 5111 case CPU_DEAD: 5112 calc_load_migrate(rq); 5113 break; 5114#endif 5115 } 5116 5117 update_max_interval(); 5118 5119 return NOTIFY_OK; 5120} 5121 5122/* 5123 * Register at high priority so that task migration (migrate_all_tasks) 5124 * happens before everything else. This has to be lower priority than 5125 * the notifier in the perf_event subsystem, though. 
5126 */ 5127static struct notifier_block __cpuinitdata migration_notifier = { 5128 .notifier_call = migration_call, 5129 .priority = CPU_PRI_MIGRATION, 5130}; 5131 5132static int __cpuinit sched_cpu_active(struct notifier_block *nfb, 5133 unsigned long action, void *hcpu) 5134{ 5135 switch (action & ~CPU_TASKS_FROZEN) { 5136 case CPU_STARTING: 5137 case CPU_DOWN_FAILED: 5138 set_cpu_active((long)hcpu, true); 5139 return NOTIFY_OK; 5140 default: 5141 return NOTIFY_DONE; 5142 } 5143} 5144 5145static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb, 5146 unsigned long action, void *hcpu) 5147{ 5148 switch (action & ~CPU_TASKS_FROZEN) { 5149 case CPU_DOWN_PREPARE: 5150 set_cpu_active((long)hcpu, false); 5151 return NOTIFY_OK; 5152 default: 5153 return NOTIFY_DONE; 5154 } 5155} 5156 5157static int __init migration_init(void) 5158{ 5159 void *cpu = (void *)(long)smp_processor_id(); 5160 int err; 5161 5162 /* Initialize migration for the boot CPU */ 5163 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); 5164 BUG_ON(err == NOTIFY_BAD); 5165 migration_call(&migration_notifier, CPU_ONLINE, cpu); 5166 register_cpu_notifier(&migration_notifier); 5167 5168 /* Register cpu active notifiers */ 5169 cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE); 5170 cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE); 5171 5172 return 0; 5173} 5174early_initcall(migration_init); 5175#endif 5176 5177#ifdef CONFIG_SMP 5178 5179static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ 5180 5181#ifdef CONFIG_SCHED_DEBUG 5182 5183static __read_mostly int sched_debug_enabled; 5184 5185static int __init sched_debug_setup(char *str) 5186{ 5187 sched_debug_enabled = 1; 5188 5189 return 0; 5190} 5191early_param("sched_debug", sched_debug_setup); 5192 5193static inline bool sched_debug(void) 5194{ 5195 return sched_debug_enabled; 5196} 5197 5198static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 5199 struct cpumask *groupmask) 5200{ 5201 struct sched_group *group = sd->groups; 5202 char str[256]; 5203 5204 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd)); 5205 cpumask_clear(groupmask); 5206 5207 printk(KERN_DEBUG "%*s domain %d: ", level, "", level); 5208 5209 if (!(sd->flags & SD_LOAD_BALANCE)) { 5210 printk("does not load-balance\n"); 5211 if (sd->parent) 5212 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" 5213 " has parent"); 5214 return -1; 5215 } 5216 5217 printk(KERN_CONT "span %s level %s\n", str, sd->name); 5218 5219 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { 5220 printk(KERN_ERR "ERROR: domain->span does not contain " 5221 "CPU%d\n", cpu); 5222 } 5223 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { 5224 printk(KERN_ERR "ERROR: domain->groups does not contain" 5225 " CPU%d\n", cpu); 5226 } 5227 5228 printk(KERN_DEBUG "%*s groups:", level + 1, ""); 5229 do { 5230 if (!group) { 5231 printk("\n"); 5232 printk(KERN_ERR "ERROR: group is NULL\n"); 5233 break; 5234 } 5235 5236 /* 5237 * Even though we initialize ->power to something semi-sane, 5238 * we leave power_orig unset. This allows us to detect if 5239 * domain iteration is still funny without causing /0 traps. 
5240 */ 5241 if (!group->sgp->power_orig) { 5242 printk(KERN_CONT "\n"); 5243 printk(KERN_ERR "ERROR: domain->cpu_power not " 5244 "set\n"); 5245 break; 5246 } 5247 5248 if (!cpumask_weight(sched_group_cpus(group))) { 5249 printk(KERN_CONT "\n"); 5250 printk(KERN_ERR "ERROR: empty group\n"); 5251 break; 5252 } 5253 5254 if (!(sd->flags & SD_OVERLAP) && 5255 cpumask_intersects(groupmask, sched_group_cpus(group))) { 5256 printk(KERN_CONT "\n"); 5257 printk(KERN_ERR "ERROR: repeated CPUs\n"); 5258 break; 5259 } 5260 5261 cpumask_or(groupmask, groupmask, sched_group_cpus(group)); 5262 5263 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 5264 5265 printk(KERN_CONT " %s", str); 5266 if (group->sgp->power != SCHED_POWER_SCALE) { 5267 printk(KERN_CONT " (cpu_power = %d)", 5268 group->sgp->power); 5269 } 5270 5271 group = group->next; 5272 } while (group != sd->groups); 5273 printk(KERN_CONT "\n"); 5274 5275 if (!cpumask_equal(sched_domain_span(sd), groupmask)) 5276 printk(KERN_ERR "ERROR: groups don't span domain->span\n"); 5277 5278 if (sd->parent && 5279 !cpumask_subset(groupmask, sched_domain_span(sd->parent))) 5280 printk(KERN_ERR "ERROR: parent span is not a superset " 5281 "of domain->span\n"); 5282 return 0; 5283} 5284 5285static void sched_domain_debug(struct sched_domain *sd, int cpu) 5286{ 5287 int level = 0; 5288 5289 if (!sched_debug_enabled) 5290 return; 5291 5292 if (!sd) { 5293 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); 5294 return; 5295 } 5296 5297 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 5298 5299 for (;;) { 5300 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) 5301 break; 5302 level++; 5303 sd = sd->parent; 5304 if (!sd) 5305 break; 5306 } 5307} 5308#else /* !CONFIG_SCHED_DEBUG */ 5309# define sched_domain_debug(sd, cpu) do { } while (0) 5310static inline bool sched_debug(void) 5311{ 5312 return false; 5313} 5314#endif /* CONFIG_SCHED_DEBUG */ 5315 5316static int sd_degenerate(struct sched_domain *sd) 5317{ 5318 if (cpumask_weight(sched_domain_span(sd)) == 1) 5319 return 1; 5320 5321 /* Following flags need at least 2 groups */ 5322 if (sd->flags & (SD_LOAD_BALANCE | 5323 SD_BALANCE_NEWIDLE | 5324 SD_BALANCE_FORK | 5325 SD_BALANCE_EXEC | 5326 SD_SHARE_CPUPOWER | 5327 SD_SHARE_PKG_RESOURCES)) { 5328 if (sd->groups != sd->groups->next) 5329 return 0; 5330 } 5331 5332 /* Following flags don't use groups */ 5333 if (sd->flags & (SD_WAKE_AFFINE)) 5334 return 0; 5335 5336 return 1; 5337} 5338 5339static int 5340sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) 5341{ 5342 unsigned long cflags = sd->flags, pflags = parent->flags; 5343 5344 if (sd_degenerate(parent)) 5345 return 1; 5346 5347 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) 5348 return 0; 5349 5350 /* Flags needing groups don't count if only 1 group in parent */ 5351 if (parent->groups == parent->groups->next) { 5352 pflags &= ~(SD_LOAD_BALANCE | 5353 SD_BALANCE_NEWIDLE | 5354 SD_BALANCE_FORK | 5355 SD_BALANCE_EXEC | 5356 SD_SHARE_CPUPOWER | 5357 SD_SHARE_PKG_RESOURCES); 5358 if (nr_node_ids == 1) 5359 pflags &= ~SD_SERIALIZE; 5360 } 5361 if (~cflags & pflags) 5362 return 0; 5363 5364 return 1; 5365} 5366 5367static void free_rootdomain(struct rcu_head *rcu) 5368{ 5369 struct root_domain *rd = container_of(rcu, struct root_domain, rcu); 5370 5371 cpupri_cleanup(&rd->cpupri); 5372 free_cpumask_var(rd->rto_mask); 5373 free_cpumask_var(rd->online); 5374 free_cpumask_var(rd->span); 5375 kfree(rd); 5376} 5377 
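/*
 * Informal lifecycle note: each rq pins its root_domain via rd->refcount.
 * rq_attach_root() below switches a rq over to a new rd and drops the old
 * reference; the final put defers the actual free to free_rootdomain()
 * above through call_rcu_sched(), presumably so that code dereferencing
 * rq->rd from an RCU-sched read side never sees it freed underneath.
 */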
5378static void rq_attach_root(struct rq *rq, struct root_domain *rd) 5379{ 5380 struct root_domain *old_rd = NULL; 5381 unsigned long flags; 5382 5383 raw_spin_lock_irqsave(&rq->lock, flags); 5384 5385 if (rq->rd) { 5386 old_rd = rq->rd; 5387 5388 if (cpumask_test_cpu(rq->cpu, old_rd->online)) 5389 set_rq_offline(rq); 5390 5391 cpumask_clear_cpu(rq->cpu, old_rd->span); 5392 5393 /* 5394 * If we dont want to free the old_rt yet then 5395 * set old_rd to NULL to skip the freeing later 5396 * in this function: 5397 */ 5398 if (!atomic_dec_and_test(&old_rd->refcount)) 5399 old_rd = NULL; 5400 } 5401 5402 atomic_inc(&rd->refcount); 5403 rq->rd = rd; 5404 5405 cpumask_set_cpu(rq->cpu, rd->span); 5406 if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) 5407 set_rq_online(rq); 5408 5409 raw_spin_unlock_irqrestore(&rq->lock, flags); 5410 5411 if (old_rd) 5412 call_rcu_sched(&old_rd->rcu, free_rootdomain); 5413} 5414 5415static int init_rootdomain(struct root_domain *rd) 5416{ 5417 memset(rd, 0, sizeof(*rd)); 5418 5419 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) 5420 goto out; 5421 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) 5422 goto free_span; 5423 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) 5424 goto free_online; 5425 5426 if (cpupri_init(&rd->cpupri) != 0) 5427 goto free_rto_mask; 5428 return 0; 5429 5430free_rto_mask: 5431 free_cpumask_var(rd->rto_mask); 5432free_online: 5433 free_cpumask_var(rd->online); 5434free_span: 5435 free_cpumask_var(rd->span); 5436out: 5437 return -ENOMEM; 5438} 5439 5440/* 5441 * By default the system creates a single root-domain with all cpus as 5442 * members (mimicking the global state we have today). 5443 */ 5444struct root_domain def_root_domain; 5445 5446static void init_defrootdomain(void) 5447{ 5448 init_rootdomain(&def_root_domain); 5449 5450 atomic_set(&def_root_domain.refcount, 1); 5451} 5452 5453static struct root_domain *alloc_rootdomain(void) 5454{ 5455 struct root_domain *rd; 5456 5457 rd = kmalloc(sizeof(*rd), GFP_KERNEL); 5458 if (!rd) 5459 return NULL; 5460 5461 if (init_rootdomain(rd) != 0) { 5462 kfree(rd); 5463 return NULL; 5464 } 5465 5466 return rd; 5467} 5468 5469static void free_sched_groups(struct sched_group *sg, int free_sgp) 5470{ 5471 struct sched_group *tmp, *first; 5472 5473 if (!sg) 5474 return; 5475 5476 first = sg; 5477 do { 5478 tmp = sg->next; 5479 5480 if (free_sgp && atomic_dec_and_test(&sg->sgp->ref)) 5481 kfree(sg->sgp); 5482 5483 kfree(sg); 5484 sg = tmp; 5485 } while (sg != first); 5486} 5487 5488static void free_sched_domain(struct rcu_head *rcu) 5489{ 5490 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); 5491 5492 /* 5493 * If its an overlapping domain it has private groups, iterate and 5494 * nuke them all. 5495 */ 5496 if (sd->flags & SD_OVERLAP) { 5497 free_sched_groups(sd->groups, 1); 5498 } else if (atomic_dec_and_test(&sd->groups->ref)) { 5499 kfree(sd->groups->sgp); 5500 kfree(sd->groups); 5501 } 5502 kfree(sd); 5503} 5504 5505static void destroy_sched_domain(struct sched_domain *sd, int cpu) 5506{ 5507 call_rcu(&sd->rcu, free_sched_domain); 5508} 5509 5510static void destroy_sched_domains(struct sched_domain *sd, int cpu) 5511{ 5512 for (; sd; sd = sd->parent) 5513 destroy_sched_domain(sd, cpu); 5514} 5515 5516/* 5517 * Keep a special pointer to the highest sched_domain that has 5518 * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this 5519 * allows us to avoid some pointer chasing select_idle_sibling(). 
5520 * 5521 * Also keep a unique ID per domain (we use the first cpu number in 5522 * the cpumask of the domain), this allows us to quickly tell if 5523 * two cpus are in the same cache domain, see cpus_share_cache(). 5524 */ 5525DEFINE_PER_CPU(struct sched_domain *, sd_llc); 5526DEFINE_PER_CPU(int, sd_llc_id); 5527 5528static void update_top_cache_domain(int cpu) 5529{ 5530 struct sched_domain *sd; 5531 int id = cpu; 5532 5533 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); 5534 if (sd) 5535 id = cpumask_first(sched_domain_span(sd)); 5536 5537 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); 5538 per_cpu(sd_llc_id, cpu) = id; 5539} 5540 5541/* 5542 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 5543 * hold the hotplug lock. 5544 */ 5545static void 5546cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) 5547{ 5548 struct rq *rq = cpu_rq(cpu); 5549 struct sched_domain *tmp; 5550 5551 /* Remove the sched domains which do not contribute to scheduling. */ 5552 for (tmp = sd; tmp; ) { 5553 struct sched_domain *parent = tmp->parent; 5554 if (!parent) 5555 break; 5556 5557 if (sd_parent_degenerate(tmp, parent)) { 5558 tmp->parent = parent->parent; 5559 if (parent->parent) 5560 parent->parent->child = tmp; 5561 destroy_sched_domain(parent, cpu); 5562 } else 5563 tmp = tmp->parent; 5564 } 5565 5566 if (sd && sd_degenerate(sd)) { 5567 tmp = sd; 5568 sd = sd->parent; 5569 destroy_sched_domain(tmp, cpu); 5570 if (sd) 5571 sd->child = NULL; 5572 } 5573 5574 sched_domain_debug(sd, cpu); 5575 5576 rq_attach_root(rq, rd); 5577 tmp = rq->sd; 5578 rcu_assign_pointer(rq->sd, sd); 5579 destroy_sched_domains(tmp, cpu); 5580 5581 update_top_cache_domain(cpu); 5582} 5583 5584/* cpus with isolated domains */ 5585static cpumask_var_t cpu_isolated_map; 5586 5587/* Setup the mask of cpus configured for isolated domains */ 5588static int __init isolated_cpu_setup(char *str) 5589{ 5590 alloc_bootmem_cpumask_var(&cpu_isolated_map); 5591 cpulist_parse(str, cpu_isolated_map); 5592 return 1; 5593} 5594 5595__setup("isolcpus=", isolated_cpu_setup); 5596 5597static const struct cpumask *cpu_cpu_mask(int cpu) 5598{ 5599 return cpumask_of_node(cpu_to_node(cpu)); 5600} 5601 5602struct sd_data { 5603 struct sched_domain **__percpu sd; 5604 struct sched_group **__percpu sg; 5605 struct sched_group_power **__percpu sgp; 5606}; 5607 5608struct s_data { 5609 struct sched_domain ** __percpu sd; 5610 struct root_domain *rd; 5611}; 5612 5613enum s_alloc { 5614 sa_rootdomain, 5615 sa_sd, 5616 sa_sd_storage, 5617 sa_none, 5618}; 5619 5620struct sched_domain_topology_level; 5621 5622typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); 5623typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); 5624 5625#define SDTL_OVERLAP 0x01 5626 5627struct sched_domain_topology_level { 5628 sched_domain_init_f init; 5629 sched_domain_mask_f mask; 5630 int flags; 5631 int numa_level; 5632 struct sd_data data; 5633}; 5634 5635/* 5636 * Build an iteration mask that can exclude certain CPUs from the upwards 5637 * domain traversal. 5638 * 5639 * Asymmetric node setups can result in situations where the domain tree is of 5640 * unequal depth, make sure to skip domains that already cover the entire 5641 * range. 5642 * 5643 * In that case build_sched_domains() will have terminated the iteration early 5644 * and our sibling sd spans will be empty. Domains should always include the 5645 * cpu they're built on, so check that. 
5646 * 5647 */ 5648static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) 5649{ 5650 const struct cpumask *span = sched_domain_span(sd); 5651 struct sd_data *sdd = sd->private; 5652 struct sched_domain *sibling; 5653 int i; 5654 5655 for_each_cpu(i, span) { 5656 sibling = *per_cpu_ptr(sdd->sd, i); 5657 if (!cpumask_test_cpu(i, sched_domain_span(sibling))) 5658 continue; 5659 5660 cpumask_set_cpu(i, sched_group_mask(sg)); 5661 } 5662} 5663 5664/* 5665 * Return the canonical balance cpu for this group, this is the first cpu 5666 * of this group that's also in the iteration mask. 5667 */ 5668int group_balance_cpu(struct sched_group *sg) 5669{ 5670 return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); 5671} 5672 5673static int 5674build_overlap_sched_groups(struct sched_domain *sd, int cpu) 5675{ 5676 struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg; 5677 const struct cpumask *span = sched_domain_span(sd); 5678 struct cpumask *covered = sched_domains_tmpmask; 5679 struct sd_data *sdd = sd->private; 5680 struct sched_domain *child; 5681 int i; 5682 5683 cpumask_clear(covered); 5684 5685 for_each_cpu(i, span) { 5686 struct cpumask *sg_span; 5687 5688 if (cpumask_test_cpu(i, covered)) 5689 continue; 5690 5691 child = *per_cpu_ptr(sdd->sd, i); 5692 5693 /* See the comment near build_group_mask(). */ 5694 if (!cpumask_test_cpu(i, sched_domain_span(child))) 5695 continue; 5696 5697 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 5698 GFP_KERNEL, cpu_to_node(cpu)); 5699 5700 if (!sg) 5701 goto fail; 5702 5703 sg_span = sched_group_cpus(sg); 5704 if (child->child) { 5705 child = child->child; 5706 cpumask_copy(sg_span, sched_domain_span(child)); 5707 } else 5708 cpumask_set_cpu(i, sg_span); 5709 5710 cpumask_or(covered, covered, sg_span); 5711 5712 sg->sgp = *per_cpu_ptr(sdd->sgp, i); 5713 if (atomic_inc_return(&sg->sgp->ref) == 1) 5714 build_group_mask(sd, sg); 5715 5716 /* 5717 * Initialize sgp->power such that even if we mess up the 5718 * domains and no possible iteration will get us here, we won't 5719 * die on a /0 trap. 5720 */ 5721 sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span); 5722 5723 /* 5724 * Make sure the first group of this domain contains the 5725 * canonical balance cpu. Otherwise the sched_domain iteration 5726 * breaks. See update_sg_lb_stats(). 5727 */ 5728 if ((!groups && cpumask_test_cpu(cpu, sg_span)) || 5729 group_balance_cpu(sg) == cpu) 5730 groups = sg; 5731 5732 if (!first) 5733 first = sg; 5734 if (last) 5735 last->next = sg; 5736 last = sg; 5737 last->next = first; 5738 } 5739 sd->groups = groups; 5740 5741 return 0; 5742 5743fail: 5744 free_sched_groups(first, 0); 5745 5746 return -ENOMEM; 5747} 5748 5749static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) 5750{ 5751 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); 5752 struct sched_domain *child = sd->child; 5753 5754 if (child) 5755 cpu = cpumask_first(sched_domain_span(child)); 5756 5757 if (sg) { 5758 *sg = *per_cpu_ptr(sdd->sg, cpu); 5759 (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu); 5760 atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */ 5761 } 5762 5763 return cpu; 5764} 5765 5766/* 5767 * build_sched_groups will build a circular linked list of the groups 5768 * covered by the given span, and will set each group's ->cpumask correctly, 5769 * and ->cpu_power to 0. 
5770 * 5771 * Assumes the sched_domain tree is fully constructed 5772 */ 5773static int 5774build_sched_groups(struct sched_domain *sd, int cpu) 5775{ 5776 struct sched_group *first = NULL, *last = NULL; 5777 struct sd_data *sdd = sd->private; 5778 const struct cpumask *span = sched_domain_span(sd); 5779 struct cpumask *covered; 5780 int i; 5781 5782 get_group(cpu, sdd, &sd->groups); 5783 atomic_inc(&sd->groups->ref); 5784 5785 if (cpu != cpumask_first(sched_domain_span(sd))) 5786 return 0; 5787 5788 lockdep_assert_held(&sched_domains_mutex); 5789 covered = sched_domains_tmpmask; 5790 5791 cpumask_clear(covered); 5792 5793 for_each_cpu(i, span) { 5794 struct sched_group *sg; 5795 int group = get_group(i, sdd, &sg); 5796 int j; 5797 5798 if (cpumask_test_cpu(i, covered)) 5799 continue; 5800 5801 cpumask_clear(sched_group_cpus(sg)); 5802 sg->sgp->power = 0; 5803 cpumask_setall(sched_group_mask(sg)); 5804 5805 for_each_cpu(j, span) { 5806 if (get_group(j, sdd, NULL) != group) 5807 continue; 5808 5809 cpumask_set_cpu(j, covered); 5810 cpumask_set_cpu(j, sched_group_cpus(sg)); 5811 } 5812 5813 if (!first) 5814 first = sg; 5815 if (last) 5816 last->next = sg; 5817 last = sg; 5818 } 5819 last->next = first; 5820 5821 return 0; 5822} 5823 5824/* 5825 * Initialize sched groups cpu_power. 5826 * 5827 * cpu_power indicates the capacity of sched group, which is used while 5828 * distributing the load between different sched groups in a sched domain. 5829 * Typically cpu_power for all the groups in a sched domain will be same unless 5830 * there are asymmetries in the topology. If there are asymmetries, group 5831 * having more cpu_power will pickup more load compared to the group having 5832 * less cpu_power. 5833 */ 5834static void init_sched_groups_power(int cpu, struct sched_domain *sd) 5835{ 5836 struct sched_group *sg = sd->groups; 5837 5838 WARN_ON(!sd || !sg); 5839 5840 do { 5841 sg->group_weight = cpumask_weight(sched_group_cpus(sg)); 5842 sg = sg->next; 5843 } while (sg != sd->groups); 5844 5845 if (cpu != group_balance_cpu(sg)) 5846 return; 5847 5848 update_group_power(sd, cpu); 5849 atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight); 5850} 5851 5852int __weak arch_sd_sibling_asym_packing(void) 5853{ 5854 return 0*SD_ASYM_PACKING; 5855} 5856 5857/* 5858 * Initializers for schedule domains 5859 * Non-inlined to reduce accumulated stack pressure in build_sched_domains() 5860 */ 5861 5862#ifdef CONFIG_SCHED_DEBUG 5863# define SD_INIT_NAME(sd, type) sd->name = #type 5864#else 5865# define SD_INIT_NAME(sd, type) do { } while (0) 5866#endif 5867 5868#define SD_INIT_FUNC(type) \ 5869static noinline struct sched_domain * \ 5870sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ 5871{ \ 5872 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \ 5873 *sd = SD_##type##_INIT; \ 5874 SD_INIT_NAME(sd, type); \ 5875 sd->private = &tl->data; \ 5876 return sd; \ 5877} 5878 5879SD_INIT_FUNC(CPU) 5880#ifdef CONFIG_SCHED_SMT 5881 SD_INIT_FUNC(SIBLING) 5882#endif 5883#ifdef CONFIG_SCHED_MC 5884 SD_INIT_FUNC(MC) 5885#endif 5886#ifdef CONFIG_SCHED_BOOK 5887 SD_INIT_FUNC(BOOK) 5888#endif 5889 5890static int default_relax_domain_level = -1; 5891int sched_domain_level_max; 5892 5893static int __init setup_relax_domain_level(char *str) 5894{ 5895 if (kstrtoint(str, 0, &default_relax_domain_level)) 5896 pr_warn("Unable to set relax_domain_level\n"); 5897 5898 return 1; 5899} 5900__setup("relax_domain_level=", setup_relax_domain_level); 5901 5902static void set_domain_attribute(struct sched_domain 
*sd, 5903 struct sched_domain_attr *attr) 5904{ 5905 int request; 5906 5907 if (!attr || attr->relax_domain_level < 0) { 5908 if (default_relax_domain_level < 0) 5909 return; 5910 else 5911 request = default_relax_domain_level; 5912 } else 5913 request = attr->relax_domain_level; 5914 if (request < sd->level) { 5915 /* turn off idle balance on this domain */ 5916 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); 5917 } else { 5918 /* turn on idle balance on this domain */ 5919 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); 5920 } 5921} 5922 5923static void __sdt_free(const struct cpumask *cpu_map); 5924static int __sdt_alloc(const struct cpumask *cpu_map); 5925 5926static void __free_domain_allocs(struct s_data *d, enum s_alloc what, 5927 const struct cpumask *cpu_map) 5928{ 5929 switch (what) { 5930 case sa_rootdomain: 5931 if (!atomic_read(&d->rd->refcount)) 5932 free_rootdomain(&d->rd->rcu); /* fall through */ 5933 case sa_sd: 5934 free_percpu(d->sd); /* fall through */ 5935 case sa_sd_storage: 5936 __sdt_free(cpu_map); /* fall through */ 5937 case sa_none: 5938 break; 5939 } 5940} 5941 5942static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, 5943 const struct cpumask *cpu_map) 5944{ 5945 memset(d, 0, sizeof(*d)); 5946 5947 if (__sdt_alloc(cpu_map)) 5948 return sa_sd_storage; 5949 d->sd = alloc_percpu(struct sched_domain *); 5950 if (!d->sd) 5951 return sa_sd_storage; 5952 d->rd = alloc_rootdomain(); 5953 if (!d->rd) 5954 return sa_sd; 5955 return sa_rootdomain; 5956} 5957 5958/* 5959 * NULL the sd_data elements we've used to build the sched_domain and 5960 * sched_group structure so that the subsequent __free_domain_allocs() 5961 * will not free the data we're using. 5962 */ 5963static void claim_allocations(int cpu, struct sched_domain *sd) 5964{ 5965 struct sd_data *sdd = sd->private; 5966 5967 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); 5968 *per_cpu_ptr(sdd->sd, cpu) = NULL; 5969 5970 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) 5971 *per_cpu_ptr(sdd->sg, cpu) = NULL; 5972 5973 if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref)) 5974 *per_cpu_ptr(sdd->sgp, cpu) = NULL; 5975} 5976 5977#ifdef CONFIG_SCHED_SMT 5978static const struct cpumask *cpu_smt_mask(int cpu) 5979{ 5980 return topology_thread_cpumask(cpu); 5981} 5982#endif 5983 5984/* 5985 * Topology list, bottom-up. 
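 * With every option enabled this reads, bottom to top: SMT siblings,
 * cores sharing a package (MC), books, and finally one CPU level per
 * NUMA node (cpu_cpu_mask() above maps to cpumask_of_node()).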
5986 */ 5987static struct sched_domain_topology_level default_topology[] = { 5988#ifdef CONFIG_SCHED_SMT 5989 { sd_init_SIBLING, cpu_smt_mask, }, 5990#endif 5991#ifdef CONFIG_SCHED_MC 5992 { sd_init_MC, cpu_coregroup_mask, }, 5993#endif 5994#ifdef CONFIG_SCHED_BOOK 5995 { sd_init_BOOK, cpu_book_mask, }, 5996#endif 5997 { sd_init_CPU, cpu_cpu_mask, }, 5998 { NULL, }, 5999}; 6000 6001static struct sched_domain_topology_level *sched_domain_topology = default_topology; 6002 6003#ifdef CONFIG_NUMA 6004 6005static int sched_domains_numa_levels; 6006static int *sched_domains_numa_distance; 6007static struct cpumask ***sched_domains_numa_masks; 6008static int sched_domains_curr_level; 6009 6010static inline int sd_local_flags(int level) 6011{ 6012 if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE) 6013 return 0; 6014 6015 return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; 6016} 6017 6018static struct sched_domain * 6019sd_numa_init(struct sched_domain_topology_level *tl, int cpu) 6020{ 6021 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); 6022 int level = tl->numa_level; 6023 int sd_weight = cpumask_weight( 6024 sched_domains_numa_masks[level][cpu_to_node(cpu)]); 6025 6026 *sd = (struct sched_domain){ 6027 .min_interval = sd_weight, 6028 .max_interval = 2*sd_weight, 6029 .busy_factor = 32, 6030 .imbalance_pct = 125, 6031 .cache_nice_tries = 2, 6032 .busy_idx = 3, 6033 .idle_idx = 2, 6034 .newidle_idx = 0, 6035 .wake_idx = 0, 6036 .forkexec_idx = 0, 6037 6038 .flags = 1*SD_LOAD_BALANCE 6039 | 1*SD_BALANCE_NEWIDLE 6040 | 0*SD_BALANCE_EXEC 6041 | 0*SD_BALANCE_FORK 6042 | 0*SD_BALANCE_WAKE 6043 | 0*SD_WAKE_AFFINE 6044 | 0*SD_SHARE_CPUPOWER 6045 | 0*SD_SHARE_PKG_RESOURCES 6046 | 1*SD_SERIALIZE 6047 | 0*SD_PREFER_SIBLING 6048 | sd_local_flags(level) 6049 , 6050 .last_balance = jiffies, 6051 .balance_interval = sd_weight, 6052 }; 6053 SD_INIT_NAME(sd, NUMA); 6054 sd->private = &tl->data; 6055 6056 /* 6057 * Ugly hack to pass state to sd_numa_mask()... 6058 */ 6059 sched_domains_curr_level = tl->numa_level; 6060 6061 return sd; 6062} 6063 6064static const struct cpumask *sd_numa_mask(int cpu) 6065{ 6066 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; 6067} 6068 6069static void sched_numa_warn(const char *str) 6070{ 6071 static int done = false; 6072 int i,j; 6073 6074 if (done) 6075 return; 6076 6077 done = true; 6078 6079 printk(KERN_WARNING "ERROR: %s\n\n", str); 6080 6081 for (i = 0; i < nr_node_ids; i++) { 6082 printk(KERN_WARNING " "); 6083 for (j = 0; j < nr_node_ids; j++) 6084 printk(KERN_CONT "%02d ", node_distance(i,j)); 6085 printk(KERN_CONT "\n"); 6086 } 6087 printk(KERN_WARNING "\n"); 6088} 6089 6090static bool find_numa_distance(int distance) 6091{ 6092 int i; 6093 6094 if (distance == node_distance(0, 0)) 6095 return true; 6096 6097 for (i = 0; i < sched_domains_numa_levels; i++) { 6098 if (sched_domains_numa_distance[i] == distance) 6099 return true; 6100 } 6101 6102 return false; 6103} 6104 6105static void sched_init_numa(void) 6106{ 6107 int next_distance, curr_distance = node_distance(0, 0); 6108 struct sched_domain_topology_level *tl; 6109 int level = 0; 6110 int i, j, k; 6111 6112 sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); 6113 if (!sched_domains_numa_distance) 6114 return; 6115 6116 /* 6117 * O(nr_nodes^2) deduplicating selection sort -- in order to find the 6118 * unique distances in the node_distance() table. 
6119 * 6120 * Assumes node_distance(0,j) includes all distances in 6121 * node_distance(i,j) in order to avoid cubic time. 6122 */ 6123 next_distance = curr_distance; 6124 for (i = 0; i < nr_node_ids; i++) { 6125 for (j = 0; j < nr_node_ids; j++) { 6126 for (k = 0; k < nr_node_ids; k++) { 6127 int distance = node_distance(i, k); 6128 6129 if (distance > curr_distance && 6130 (distance < next_distance || 6131 next_distance == curr_distance)) 6132 next_distance = distance; 6133 6134 /* 6135 * While not a strong assumption it would be nice to know 6136 * about cases where if node A is connected to B, B is not 6137 * equally connected to A. 6138 */ 6139 if (sched_debug() && node_distance(k, i) != distance) 6140 sched_numa_warn("Node-distance not symmetric"); 6141 6142 if (sched_debug() && i && !find_numa_distance(distance)) 6143 sched_numa_warn("Node-0 not representative"); 6144 } 6145 if (next_distance != curr_distance) { 6146 sched_domains_numa_distance[level++] = next_distance; 6147 sched_domains_numa_levels = level; 6148 curr_distance = next_distance; 6149 } else break; 6150 } 6151 6152 /* 6153 * In case of sched_debug() we verify the above assumption. 6154 */ 6155 if (!sched_debug()) 6156 break; 6157 } 6158 /* 6159 * 'level' contains the number of unique distances, excluding the 6160 * identity distance node_distance(i,i). 6161 * 6162 * The sched_domains_nume_distance[] array includes the actual distance 6163 * numbers. 6164 */ 6165 6166 /* 6167 * Here, we should temporarily reset sched_domains_numa_levels to 0. 6168 * If it fails to allocate memory for array sched_domains_numa_masks[][], 6169 * the array will contain less then 'level' members. This could be 6170 * dangerous when we use it to iterate array sched_domains_numa_masks[][] 6171 * in other functions. 6172 * 6173 * We reset it to 'level' at the end of this function. 6174 */ 6175 sched_domains_numa_levels = 0; 6176 6177 sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); 6178 if (!sched_domains_numa_masks) 6179 return; 6180 6181 /* 6182 * Now for each level, construct a mask per node which contains all 6183 * cpus of nodes that are that many hops away from us. 6184 */ 6185 for (i = 0; i < level; i++) { 6186 sched_domains_numa_masks[i] = 6187 kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); 6188 if (!sched_domains_numa_masks[i]) 6189 return; 6190 6191 for (j = 0; j < nr_node_ids; j++) { 6192 struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); 6193 if (!mask) 6194 return; 6195 6196 sched_domains_numa_masks[i][j] = mask; 6197 6198 for (k = 0; k < nr_node_ids; k++) { 6199 if (node_distance(j, k) > sched_domains_numa_distance[i]) 6200 continue; 6201 6202 cpumask_or(mask, mask, cpumask_of_node(k)); 6203 } 6204 } 6205 } 6206 6207 tl = kzalloc((ARRAY_SIZE(default_topology) + level) * 6208 sizeof(struct sched_domain_topology_level), GFP_KERNEL); 6209 if (!tl) 6210 return; 6211 6212 /* 6213 * Copy the default topology bits.. 6214 */ 6215 for (i = 0; default_topology[i].init; i++) 6216 tl[i] = default_topology[i]; 6217 6218 /* 6219 * .. and append 'j' levels of NUMA goodness. 
6220 */ 6221 for (j = 0; j < level; i++, j++) { 6222 tl[i] = (struct sched_domain_topology_level){ 6223 .init = sd_numa_init, 6224 .mask = sd_numa_mask, 6225 .flags = SDTL_OVERLAP, 6226 .numa_level = j, 6227 }; 6228 } 6229 6230 sched_domain_topology = tl; 6231 6232 sched_domains_numa_levels = level; 6233} 6234 6235static void sched_domains_numa_masks_set(int cpu) 6236{ 6237 int i, j; 6238 int node = cpu_to_node(cpu); 6239 6240 for (i = 0; i < sched_domains_numa_levels; i++) { 6241 for (j = 0; j < nr_node_ids; j++) { 6242 if (node_distance(j, node) <= sched_domains_numa_distance[i]) 6243 cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); 6244 } 6245 } 6246} 6247 6248static void sched_domains_numa_masks_clear(int cpu) 6249{ 6250 int i, j; 6251 for (i = 0; i < sched_domains_numa_levels; i++) { 6252 for (j = 0; j < nr_node_ids; j++) 6253 cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); 6254 } 6255} 6256 6257/* 6258 * Update sched_domains_numa_masks[level][node] array when new cpus 6259 * are onlined. 6260 */ 6261static int sched_domains_numa_masks_update(struct notifier_block *nfb, 6262 unsigned long action, 6263 void *hcpu) 6264{ 6265 int cpu = (long)hcpu; 6266 6267 switch (action & ~CPU_TASKS_FROZEN) { 6268 case CPU_ONLINE: 6269 sched_domains_numa_masks_set(cpu); 6270 break; 6271 6272 case CPU_DEAD: 6273 sched_domains_numa_masks_clear(cpu); 6274 break; 6275 6276 default: 6277 return NOTIFY_DONE; 6278 } 6279 6280 return NOTIFY_OK; 6281} 6282#else 6283static inline void sched_init_numa(void) 6284{ 6285} 6286 6287static int sched_domains_numa_masks_update(struct notifier_block *nfb, 6288 unsigned long action, 6289 void *hcpu) 6290{ 6291 return 0; 6292} 6293#endif /* CONFIG_NUMA */ 6294 6295static int __sdt_alloc(const struct cpumask *cpu_map) 6296{ 6297 struct sched_domain_topology_level *tl; 6298 int j; 6299 6300 for (tl = sched_domain_topology; tl->init; tl++) { 6301 struct sd_data *sdd = &tl->data; 6302 6303 sdd->sd = alloc_percpu(struct sched_domain *); 6304 if (!sdd->sd) 6305 return -ENOMEM; 6306 6307 sdd->sg = alloc_percpu(struct sched_group *); 6308 if (!sdd->sg) 6309 return -ENOMEM; 6310 6311 sdd->sgp = alloc_percpu(struct sched_group_power *); 6312 if (!sdd->sgp) 6313 return -ENOMEM; 6314 6315 for_each_cpu(j, cpu_map) { 6316 struct sched_domain *sd; 6317 struct sched_group *sg; 6318 struct sched_group_power *sgp; 6319 6320 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), 6321 GFP_KERNEL, cpu_to_node(j)); 6322 if (!sd) 6323 return -ENOMEM; 6324 6325 *per_cpu_ptr(sdd->sd, j) = sd; 6326 6327 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 6328 GFP_KERNEL, cpu_to_node(j)); 6329 if (!sg) 6330 return -ENOMEM; 6331 6332 sg->next = sg; 6333 6334 *per_cpu_ptr(sdd->sg, j) = sg; 6335 6336 sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(), 6337 GFP_KERNEL, cpu_to_node(j)); 6338 if (!sgp) 6339 return -ENOMEM; 6340 6341 *per_cpu_ptr(sdd->sgp, j) = sgp; 6342 } 6343 } 6344 6345 return 0; 6346} 6347 6348static void __sdt_free(const struct cpumask *cpu_map) 6349{ 6350 struct sched_domain_topology_level *tl; 6351 int j; 6352 6353 for (tl = sched_domain_topology; tl->init; tl++) { 6354 struct sd_data *sdd = &tl->data; 6355 6356 for_each_cpu(j, cpu_map) { 6357 struct sched_domain *sd; 6358 6359 if (sdd->sd) { 6360 sd = *per_cpu_ptr(sdd->sd, j); 6361 if (sd && (sd->flags & SD_OVERLAP)) 6362 free_sched_groups(sd->groups, 0); 6363 kfree(*per_cpu_ptr(sdd->sd, j)); 6364 } 6365 6366 if (sdd->sg) 6367 kfree(*per_cpu_ptr(sdd->sg, j)); 6368 if (sdd->sgp) 
6369 kfree(*per_cpu_ptr(sdd->sgp, j)); 6370 } 6371 free_percpu(sdd->sd); 6372 sdd->sd = NULL; 6373 free_percpu(sdd->sg); 6374 sdd->sg = NULL; 6375 free_percpu(sdd->sgp); 6376 sdd->sgp = NULL; 6377 } 6378} 6379 6380struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, 6381 struct s_data *d, const struct cpumask *cpu_map, 6382 struct sched_domain_attr *attr, struct sched_domain *child, 6383 int cpu) 6384{ 6385 struct sched_domain *sd = tl->init(tl, cpu); 6386 if (!sd) 6387 return child; 6388 6389 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); 6390 if (child) { 6391 sd->level = child->level + 1; 6392 sched_domain_level_max = max(sched_domain_level_max, sd->level); 6393 child->parent = sd; 6394 } 6395 sd->child = child; 6396 set_domain_attribute(sd, attr); 6397 6398 return sd; 6399} 6400 6401/* 6402 * Build sched domains for a given set of cpus and attach the sched domains 6403 * to the individual cpus 6404 */ 6405static int build_sched_domains(const struct cpumask *cpu_map, 6406 struct sched_domain_attr *attr) 6407{ 6408 enum s_alloc alloc_state = sa_none; 6409 struct sched_domain *sd; 6410 struct s_data d; 6411 int i, ret = -ENOMEM; 6412 6413 alloc_state = __visit_domain_allocation_hell(&d, cpu_map); 6414 if (alloc_state != sa_rootdomain) 6415 goto error; 6416 6417 /* Set up domains for cpus specified by the cpu_map. */ 6418 for_each_cpu(i, cpu_map) { 6419 struct sched_domain_topology_level *tl; 6420 6421 sd = NULL; 6422 for (tl = sched_domain_topology; tl->init; tl++) { 6423 sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); 6424 if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) 6425 sd->flags |= SD_OVERLAP; 6426 if (cpumask_equal(cpu_map, sched_domain_span(sd))) 6427 break; 6428 } 6429 6430 while (sd->child) 6431 sd = sd->child; 6432 6433 *per_cpu_ptr(d.sd, i) = sd; 6434 } 6435 6436 /* Build the groups for the domains */ 6437 for_each_cpu(i, cpu_map) { 6438 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { 6439 sd->span_weight = cpumask_weight(sched_domain_span(sd)); 6440 if (sd->flags & SD_OVERLAP) { 6441 if (build_overlap_sched_groups(sd, i)) 6442 goto error; 6443 } else { 6444 if (build_sched_groups(sd, i)) 6445 goto error; 6446 } 6447 } 6448 } 6449 6450 /* Calculate CPU power for physical packages and nodes */ 6451 for (i = nr_cpumask_bits-1; i >= 0; i--) { 6452 if (!cpumask_test_cpu(i, cpu_map)) 6453 continue; 6454 6455 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { 6456 claim_allocations(i, sd); 6457 init_sched_groups_power(i, sd); 6458 } 6459 } 6460 6461 /* Attach the domains */ 6462 rcu_read_lock(); 6463 for_each_cpu(i, cpu_map) { 6464 sd = *per_cpu_ptr(d.sd, i); 6465 cpu_attach_domain(sd, d.rd, i); 6466 } 6467 rcu_read_unlock(); 6468 6469 ret = 0; 6470error: 6471 __free_domain_allocs(&d, alloc_state, cpu_map); 6472 return ret; 6473} 6474 6475static cpumask_var_t *doms_cur; /* current sched domains */ 6476static int ndoms_cur; /* number of sched domains in 'doms_cur' */ 6477static struct sched_domain_attr *dattr_cur; 6478 /* attribues of custom domains in 'doms_cur' */ 6479 6480/* 6481 * Special case: If a kmalloc of a doms_cur partition (array of 6482 * cpumask) fails, then fallback to a single sched domain, 6483 * as determined by the single cpumask fallback_doms. 6484 */ 6485static cpumask_var_t fallback_doms; 6486 6487/* 6488 * arch_update_cpu_topology lets virtualized architectures update the 6489 * cpu core maps. It is supposed to return 1 if the topology changed 6490 * or 0 if it stayed the same. 
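 * The weak definition below acts as the default for architectures
 * whose cpu topology never changes at run time; an architecture such
 * as s390 is expected to override it with a real implementation.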
6491 */ 6492int __attribute__((weak)) arch_update_cpu_topology(void) 6493{ 6494 return 0; 6495} 6496 6497cpumask_var_t *alloc_sched_domains(unsigned int ndoms) 6498{ 6499 int i; 6500 cpumask_var_t *doms; 6501 6502 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); 6503 if (!doms) 6504 return NULL; 6505 for (i = 0; i < ndoms; i++) { 6506 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { 6507 free_sched_domains(doms, i); 6508 return NULL; 6509 } 6510 } 6511 return doms; 6512} 6513 6514void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) 6515{ 6516 unsigned int i; 6517 for (i = 0; i < ndoms; i++) 6518 free_cpumask_var(doms[i]); 6519 kfree(doms); 6520} 6521 6522/* 6523 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 6524 * For now this just excludes isolated cpus, but could be used to 6525 * exclude other special cases in the future. 6526 */ 6527static int init_sched_domains(const struct cpumask *cpu_map) 6528{ 6529 int err; 6530 6531 arch_update_cpu_topology(); 6532 ndoms_cur = 1; 6533 doms_cur = alloc_sched_domains(ndoms_cur); 6534 if (!doms_cur) 6535 doms_cur = &fallback_doms; 6536 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); 6537 err = build_sched_domains(doms_cur[0], NULL); 6538 register_sched_domain_sysctl(); 6539 6540 return err; 6541} 6542 6543/* 6544 * Detach sched domains from a group of cpus specified in cpu_map 6545 * These cpus will now be attached to the NULL domain 6546 */ 6547static void detach_destroy_domains(const struct cpumask *cpu_map) 6548{ 6549 int i; 6550 6551 rcu_read_lock(); 6552 for_each_cpu(i, cpu_map) 6553 cpu_attach_domain(NULL, &def_root_domain, i); 6554 rcu_read_unlock(); 6555} 6556 6557/* handle null as "default" */ 6558static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, 6559 struct sched_domain_attr *new, int idx_new) 6560{ 6561 struct sched_domain_attr tmp; 6562 6563 /* fast path */ 6564 if (!new && !cur) 6565 return 1; 6566 6567 tmp = SD_ATTR_INIT; 6568 return !memcmp(cur ? (cur + idx_cur) : &tmp, 6569 new ? (new + idx_new) : &tmp, 6570 sizeof(struct sched_domain_attr)); 6571} 6572 6573/* 6574 * Partition sched domains as specified by the 'ndoms_new' 6575 * cpumasks in the array doms_new[] of cpumasks. This compares 6576 * doms_new[] to the current sched domain partitioning, doms_cur[]. 6577 * It destroys each deleted domain and builds each new domain. 6578 * 6579 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. 6580 * The masks don't intersect (don't overlap.) We should setup one 6581 * sched domain for each mask. CPUs not in any of the cpumasks will 6582 * not be load balanced. If the same cpumask appears both in the 6583 * current 'doms_cur' domains and in the new 'doms_new', we can leave 6584 * it as it is. 6585 * 6586 * The passed in 'doms_new' should be allocated using 6587 * alloc_sched_domains. This routine takes ownership of it and will 6588 * free_sched_domains it when done with it. If the caller failed the 6589 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, 6590 * and partition_sched_domains() will fallback to the single partition 6591 * 'fallback_doms', it also forces the domains to be rebuilt. 6592 * 6593 * If doms_new == NULL it will be replaced with cpu_online_mask. 6594 * ndoms_new == 0 is a special case for destroying existing domains, 6595 * and it will not create the default domain. 
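 *
 * A typical caller therefore looks roughly like this (sketch only;
 * ndoms, doms and dattr stand for the caller's own variables):
 *
 *	doms = alloc_sched_domains(ndoms);
 *	... fill each doms[i], plus an optional dattr array ...
 *	partition_sched_domains(ndoms, doms, dattr);
 *
 * and it must not free doms afterwards, since ownership passes to
 * this function.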
6596 * 6597 * Call with hotplug lock held 6598 */ 6599void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], 6600 struct sched_domain_attr *dattr_new) 6601{ 6602 int i, j, n; 6603 int new_topology; 6604 6605 mutex_lock(&sched_domains_mutex); 6606 6607 /* always unregister in case we don't destroy any domains */ 6608 unregister_sched_domain_sysctl(); 6609 6610 /* Let architecture update cpu core mappings. */ 6611 new_topology = arch_update_cpu_topology(); 6612 6613 n = doms_new ? ndoms_new : 0; 6614 6615 /* Destroy deleted domains */ 6616 for (i = 0; i < ndoms_cur; i++) { 6617 for (j = 0; j < n && !new_topology; j++) { 6618 if (cpumask_equal(doms_cur[i], doms_new[j]) 6619 && dattrs_equal(dattr_cur, i, dattr_new, j)) 6620 goto match1; 6621 } 6622 /* no match - a current sched domain not in new doms_new[] */ 6623 detach_destroy_domains(doms_cur[i]); 6624match1: 6625 ; 6626 } 6627 6628 if (doms_new == NULL) { 6629 ndoms_cur = 0; 6630 doms_new = &fallback_doms; 6631 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); 6632 WARN_ON_ONCE(dattr_new); 6633 } 6634 6635 /* Build new domains */ 6636 for (i = 0; i < ndoms_new; i++) { 6637 for (j = 0; j < ndoms_cur && !new_topology; j++) { 6638 if (cpumask_equal(doms_new[i], doms_cur[j]) 6639 && dattrs_equal(dattr_new, i, dattr_cur, j)) 6640 goto match2; 6641 } 6642 /* no match - add a new doms_new */ 6643 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); 6644match2: 6645 ; 6646 } 6647 6648 /* Remember the new sched domains */ 6649 if (doms_cur != &fallback_doms) 6650 free_sched_domains(doms_cur, ndoms_cur); 6651 kfree(dattr_cur); /* kfree(NULL) is safe */ 6652 doms_cur = doms_new; 6653 dattr_cur = dattr_new; 6654 ndoms_cur = ndoms_new; 6655 6656 register_sched_domain_sysctl(); 6657 6658 mutex_unlock(&sched_domains_mutex); 6659} 6660 6661static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */ 6662 6663/* 6664 * Update cpusets according to cpu_active mask. If cpusets are 6665 * disabled, cpuset_update_active_cpus() becomes a simple wrapper 6666 * around partition_sched_domains(). 6667 * 6668 * If we come here as part of a suspend/resume, don't touch cpusets because we 6669 * want to restore it back to its original state upon resume anyway. 6670 */ 6671static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, 6672 void *hcpu) 6673{ 6674 switch (action) { 6675 case CPU_ONLINE_FROZEN: 6676 case CPU_DOWN_FAILED_FROZEN: 6677 6678 /* 6679 * num_cpus_frozen tracks how many CPUs are involved in suspend 6680 * resume sequence. As long as this is not the last online 6681 * operation in the resume sequence, just build a single sched 6682 * domain, ignoring cpusets. 6683 */ 6684 num_cpus_frozen--; 6685 if (likely(num_cpus_frozen)) { 6686 partition_sched_domains(1, NULL, NULL); 6687 break; 6688 } 6689 6690 /* 6691 * This is the last CPU online operation. So fall through and 6692 * restore the original sched domains by considering the 6693 * cpuset configurations. 
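 *
 * Illustrative walk-through (4-CPU system, assuming only the boot
 * CPU stays online across suspend): suspend generates three
 * CPU_DOWN_PREPARE_FROZEN events, leaving num_cpus_frozen == 3.
 * On resume, the first two CPU_ONLINE_FROZEN events only rebuild
 * the single fallback domain; the third drops the counter to zero
 * and falls through to cpuset_update_active_cpus().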
6694 */ 6695 6696 case CPU_ONLINE: 6697 case CPU_DOWN_FAILED: 6698 cpuset_update_active_cpus(true); 6699 break; 6700 default: 6701 return NOTIFY_DONE; 6702 } 6703 return NOTIFY_OK; 6704} 6705 6706static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, 6707 void *hcpu) 6708{ 6709 switch (action) { 6710 case CPU_DOWN_PREPARE: 6711 cpuset_update_active_cpus(false); 6712 break; 6713 case CPU_DOWN_PREPARE_FROZEN: 6714 num_cpus_frozen++; 6715 partition_sched_domains(1, NULL, NULL); 6716 break; 6717 default: 6718 return NOTIFY_DONE; 6719 } 6720 return NOTIFY_OK; 6721} 6722 6723void __init sched_init_smp(void) 6724{ 6725 cpumask_var_t non_isolated_cpus; 6726 6727 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 6728 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 6729 6730 sched_init_numa(); 6731 6732 get_online_cpus(); 6733 mutex_lock(&sched_domains_mutex); 6734 init_sched_domains(cpu_active_mask); 6735 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 6736 if (cpumask_empty(non_isolated_cpus)) 6737 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 6738 mutex_unlock(&sched_domains_mutex); 6739 put_online_cpus(); 6740 6741 hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE); 6742 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); 6743 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); 6744 6745 /* RT runtime code needs to handle some hotplug events */ 6746 hotcpu_notifier(update_runtime, 0); 6747 6748 init_hrtick(); 6749 6750 /* Move init over to a non-isolated CPU */ 6751 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) 6752 BUG(); 6753 sched_init_granularity(); 6754 free_cpumask_var(non_isolated_cpus); 6755 6756 init_sched_rt_class(); 6757} 6758#else 6759void __init sched_init_smp(void) 6760{ 6761 sched_init_granularity(); 6762} 6763#endif /* CONFIG_SMP */ 6764 6765const_debug unsigned int sysctl_timer_migration = 1; 6766 6767int in_sched_functions(unsigned long addr) 6768{ 6769 return in_lock_functions(addr) || 6770 (addr >= (unsigned long)__sched_text_start 6771 && addr < (unsigned long)__sched_text_end); 6772} 6773 6774#ifdef CONFIG_CGROUP_SCHED 6775struct task_group root_task_group; 6776LIST_HEAD(task_groups); 6777#endif 6778 6779DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 6780 6781void __init sched_init(void) 6782{ 6783 int i, j; 6784 unsigned long alloc_size = 0, ptr; 6785 6786#ifdef CONFIG_FAIR_GROUP_SCHED 6787 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 6788#endif 6789#ifdef CONFIG_RT_GROUP_SCHED 6790 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 6791#endif 6792#ifdef CONFIG_CPUMASK_OFFSTACK 6793 alloc_size += num_possible_cpus() * cpumask_size(); 6794#endif 6795 if (alloc_size) { 6796 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); 6797 6798#ifdef CONFIG_FAIR_GROUP_SCHED 6799 root_task_group.se = (struct sched_entity **)ptr; 6800 ptr += nr_cpu_ids * sizeof(void **); 6801 6802 root_task_group.cfs_rq = (struct cfs_rq **)ptr; 6803 ptr += nr_cpu_ids * sizeof(void **); 6804 6805#endif /* CONFIG_FAIR_GROUP_SCHED */ 6806#ifdef CONFIG_RT_GROUP_SCHED 6807 root_task_group.rt_se = (struct sched_rt_entity **)ptr; 6808 ptr += nr_cpu_ids * sizeof(void **); 6809 6810 root_task_group.rt_rq = (struct rt_rq **)ptr; 6811 ptr += nr_cpu_ids * sizeof(void **); 6812 6813#endif /* CONFIG_RT_GROUP_SCHED */ 6814#ifdef CONFIG_CPUMASK_OFFSTACK 6815 for_each_possible_cpu(i) { 6816 per_cpu(load_balance_tmpmask, i) = (void *)ptr; 6817 ptr += cpumask_size(); 6818 } 6819#endif /* 
CONFIG_CPUMASK_OFFSTACK */ 6820 } 6821 6822#ifdef CONFIG_SMP 6823 init_defrootdomain(); 6824#endif 6825 6826 init_rt_bandwidth(&def_rt_bandwidth, 6827 global_rt_period(), global_rt_runtime()); 6828 6829#ifdef CONFIG_RT_GROUP_SCHED 6830 init_rt_bandwidth(&root_task_group.rt_bandwidth, 6831 global_rt_period(), global_rt_runtime()); 6832#endif /* CONFIG_RT_GROUP_SCHED */ 6833 6834#ifdef CONFIG_CGROUP_SCHED 6835 list_add(&root_task_group.list, &task_groups); 6836 INIT_LIST_HEAD(&root_task_group.children); 6837 INIT_LIST_HEAD(&root_task_group.siblings); 6838 autogroup_init(&init_task); 6839 6840#endif /* CONFIG_CGROUP_SCHED */ 6841 6842#ifdef CONFIG_CGROUP_CPUACCT 6843 root_cpuacct.cpustat = &kernel_cpustat; 6844 root_cpuacct.cpuusage = alloc_percpu(u64); 6845 /* Too early, not expected to fail */ 6846 BUG_ON(!root_cpuacct.cpuusage); 6847#endif 6848 for_each_possible_cpu(i) { 6849 struct rq *rq; 6850 6851 rq = cpu_rq(i); 6852 raw_spin_lock_init(&rq->lock); 6853 rq->nr_running = 0; 6854 rq->calc_load_active = 0; 6855 rq->calc_load_update = jiffies + LOAD_FREQ; 6856 init_cfs_rq(&rq->cfs); 6857 init_rt_rq(&rq->rt, rq); 6858#ifdef CONFIG_FAIR_GROUP_SCHED 6859 root_task_group.shares = ROOT_TASK_GROUP_LOAD; 6860 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 6861 /* 6862 * How much cpu bandwidth does root_task_group get? 6863 * 6864 * In case of task-groups formed thr' the cgroup filesystem, it 6865 * gets 100% of the cpu resources in the system. This overall 6866 * system cpu resource is divided among the tasks of 6867 * root_task_group and its child task-groups in a fair manner, 6868 * based on each entity's (task or task-group's) weight 6869 * (se->load.weight). 6870 * 6871 * In other words, if root_task_group has 10 tasks of weight 6872 * 1024) and two child groups A0 and A1 (of weight 1024 each), 6873 * then A0's share of the cpu resource is: 6874 * 6875 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% 6876 * 6877 * We achieve this by letting root_task_group's tasks sit 6878 * directly in rq->cfs (i.e root_task_group->se[] = NULL). 
6879 */ 6880 init_cfs_bandwidth(&root_task_group.cfs_bandwidth); 6881 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); 6882#endif /* CONFIG_FAIR_GROUP_SCHED */ 6883 6884 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; 6885#ifdef CONFIG_RT_GROUP_SCHED 6886 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 6887 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); 6888#endif 6889 6890 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 6891 rq->cpu_load[j] = 0; 6892 6893 rq->last_load_update_tick = jiffies; 6894 6895#ifdef CONFIG_SMP 6896 rq->sd = NULL; 6897 rq->rd = NULL; 6898 rq->cpu_power = SCHED_POWER_SCALE; 6899 rq->post_schedule = 0; 6900 rq->active_balance = 0; 6901 rq->next_balance = jiffies; 6902 rq->push_cpu = 0; 6903 rq->cpu = i; 6904 rq->online = 0; 6905 rq->idle_stamp = 0; 6906 rq->avg_idle = 2*sysctl_sched_migration_cost; 6907 6908 INIT_LIST_HEAD(&rq->cfs_tasks); 6909 6910 rq_attach_root(rq, &def_root_domain); 6911#ifdef CONFIG_NO_HZ 6912 rq->nohz_flags = 0; 6913#endif 6914#endif 6915 init_rq_hrtick(rq); 6916 atomic_set(&rq->nr_iowait, 0); 6917 } 6918 6919 set_load_weight(&init_task); 6920 6921#ifdef CONFIG_PREEMPT_NOTIFIERS 6922 INIT_HLIST_HEAD(&init_task.preempt_notifiers); 6923#endif 6924 6925#ifdef CONFIG_RT_MUTEXES 6926 plist_head_init(&init_task.pi_waiters); 6927#endif 6928 6929 /* 6930 * The boot idle thread does lazy MMU switching as well: 6931 */ 6932 atomic_inc(&init_mm.mm_count); 6933 enter_lazy_tlb(&init_mm, current); 6934 6935 /* 6936 * Make us the idle thread. Technically, schedule() should not be 6937 * called from this thread, however somewhere below it might be, 6938 * but because we are the idle thread, we just pick up running again 6939 * when this runqueue becomes "idle". 6940 */ 6941 init_idle(current, smp_processor_id()); 6942 6943 calc_load_update = jiffies + LOAD_FREQ; 6944 6945 /* 6946 * During early bootup we pretend to be a normal task: 6947 */ 6948 current->sched_class = &fair_sched_class; 6949 6950#ifdef CONFIG_SMP 6951 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); 6952 /* May be allocated at isolcpus cmdline parse time */ 6953 if (cpu_isolated_map == NULL) 6954 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 6955 idle_thread_set_boot_cpu(); 6956#endif 6957 init_sched_fair_class(); 6958 6959 scheduler_running = 1; 6960} 6961 6962#ifdef CONFIG_DEBUG_ATOMIC_SLEEP 6963static inline int preempt_count_equals(int preempt_offset) 6964{ 6965 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); 6966 6967 return (nested == preempt_offset); 6968} 6969 6970void __might_sleep(const char *file, int line, int preempt_offset) 6971{ 6972 static unsigned long prev_jiffy; /* ratelimiting */ 6973 6974 rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. 
*/ 6975 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || 6976 system_state != SYSTEM_RUNNING || oops_in_progress) 6977 return; 6978 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 6979 return; 6980 prev_jiffy = jiffies; 6981 6982 printk(KERN_ERR 6983 "BUG: sleeping function called from invalid context at %s:%d\n", 6984 file, line); 6985 printk(KERN_ERR 6986 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", 6987 in_atomic(), irqs_disabled(), 6988 current->pid, current->comm); 6989 6990 debug_show_held_locks(current); 6991 if (irqs_disabled()) 6992 print_irqtrace_events(current); 6993 dump_stack(); 6994} 6995EXPORT_SYMBOL(__might_sleep); 6996#endif 6997 6998#ifdef CONFIG_MAGIC_SYSRQ 6999static void normalize_task(struct rq *rq, struct task_struct *p) 7000{ 7001 const struct sched_class *prev_class = p->sched_class; 7002 int old_prio = p->prio; 7003 int on_rq; 7004 7005 on_rq = p->on_rq; 7006 if (on_rq) 7007 dequeue_task(rq, p, 0); 7008 __setscheduler(rq, p, SCHED_NORMAL, 0); 7009 if (on_rq) { 7010 enqueue_task(rq, p, 0); 7011 resched_task(rq->curr); 7012 } 7013 7014 check_class_changed(rq, p, prev_class, old_prio); 7015} 7016 7017void normalize_rt_tasks(void) 7018{ 7019 struct task_struct *g, *p; 7020 unsigned long flags; 7021 struct rq *rq; 7022 7023 read_lock_irqsave(&tasklist_lock, flags); 7024 do_each_thread(g, p) { 7025 /* 7026 * Only normalize user tasks: 7027 */ 7028 if (!p->mm) 7029 continue; 7030 7031 p->se.exec_start = 0; 7032#ifdef CONFIG_SCHEDSTATS 7033 p->se.statistics.wait_start = 0; 7034 p->se.statistics.sleep_start = 0; 7035 p->se.statistics.block_start = 0; 7036#endif 7037 7038 if (!rt_task(p)) { 7039 /* 7040 * Renice negative nice level userspace 7041 * tasks back to 0: 7042 */ 7043 if (TASK_NICE(p) < 0 && p->mm) 7044 set_user_nice(p, 0); 7045 continue; 7046 } 7047 7048 raw_spin_lock(&p->pi_lock); 7049 rq = __task_rq_lock(p); 7050 7051 normalize_task(rq, p); 7052 7053 __task_rq_unlock(rq); 7054 raw_spin_unlock(&p->pi_lock); 7055 } while_each_thread(g, p); 7056 7057 read_unlock_irqrestore(&tasklist_lock, flags); 7058} 7059 7060#endif /* CONFIG_MAGIC_SYSRQ */ 7061 7062#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) 7063/* 7064 * These functions are only useful for the IA64 MCA handling, or kdb. 7065 * 7066 * They can only be called when the whole system has been 7067 * stopped - every CPU needs to be quiescent, and no scheduling 7068 * activity can take place. Using them for anything else would 7069 * be a serious bug, and as a result, they aren't even visible 7070 * under any other configuration. 7071 */ 7072 7073/** 7074 * curr_task - return the current task for a given cpu. 7075 * @cpu: the processor in question. 7076 * 7077 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 7078 */ 7079struct task_struct *curr_task(int cpu) 7080{ 7081 return cpu_curr(cpu); 7082} 7083 7084#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ 7085 7086#ifdef CONFIG_IA64 7087/** 7088 * set_curr_task - set the current task for a given cpu. 7089 * @cpu: the processor in question. 7090 * @p: the task pointer to set. 7091 * 7092 * Description: This function must only be used when non-maskable interrupts 7093 * are serviced on a separate stack. It allows the architecture to switch the 7094 * notion of the current task on a cpu in a non-blocking manner. 
This function 7095 * must be called with all CPU's synchronized, and interrupts disabled, the 7096 * and caller must save the original value of the current task (see 7097 * curr_task() above) and restore that value before reenabling interrupts and 7098 * re-starting the system. 7099 * 7100 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 7101 */ 7102void set_curr_task(int cpu, struct task_struct *p) 7103{ 7104 cpu_curr(cpu) = p; 7105} 7106 7107#endif 7108 7109#ifdef CONFIG_CGROUP_SCHED 7110/* task_group_lock serializes the addition/removal of task groups */ 7111static DEFINE_SPINLOCK(task_group_lock); 7112 7113static void free_sched_group(struct task_group *tg) 7114{ 7115 free_fair_sched_group(tg); 7116 free_rt_sched_group(tg); 7117 autogroup_free(tg); 7118 kfree(tg); 7119} 7120 7121/* allocate runqueue etc for a new task group */ 7122struct task_group *sched_create_group(struct task_group *parent) 7123{ 7124 struct task_group *tg; 7125 unsigned long flags; 7126 7127 tg = kzalloc(sizeof(*tg), GFP_KERNEL); 7128 if (!tg) 7129 return ERR_PTR(-ENOMEM); 7130 7131 if (!alloc_fair_sched_group(tg, parent)) 7132 goto err; 7133 7134 if (!alloc_rt_sched_group(tg, parent)) 7135 goto err; 7136 7137 spin_lock_irqsave(&task_group_lock, flags); 7138 list_add_rcu(&tg->list, &task_groups); 7139 7140 WARN_ON(!parent); /* root should already exist */ 7141 7142 tg->parent = parent; 7143 INIT_LIST_HEAD(&tg->children); 7144 list_add_rcu(&tg->siblings, &parent->children); 7145 spin_unlock_irqrestore(&task_group_lock, flags); 7146 7147 return tg; 7148 7149err: 7150 free_sched_group(tg); 7151 return ERR_PTR(-ENOMEM); 7152} 7153 7154/* rcu callback to free various structures associated with a task group */ 7155static void free_sched_group_rcu(struct rcu_head *rhp) 7156{ 7157 /* now it should be safe to free those cfs_rqs */ 7158 free_sched_group(container_of(rhp, struct task_group, rcu)); 7159} 7160 7161/* Destroy runqueue etc associated with a task group */ 7162void sched_destroy_group(struct task_group *tg) 7163{ 7164 unsigned long flags; 7165 int i; 7166 7167 /* end participation in shares distribution */ 7168 for_each_possible_cpu(i) 7169 unregister_fair_sched_group(tg, i); 7170 7171 spin_lock_irqsave(&task_group_lock, flags); 7172 list_del_rcu(&tg->list); 7173 list_del_rcu(&tg->siblings); 7174 spin_unlock_irqrestore(&task_group_lock, flags); 7175 7176 /* wait for possible concurrent references to cfs_rqs complete */ 7177 call_rcu(&tg->rcu, free_sched_group_rcu); 7178} 7179 7180/* change task's runqueue when it moves between groups. 7181 * The caller of this function should have put the task in its new group 7182 * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to 7183 * reflect its new group. 
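 *
 * Within this file it is driven by the cgroup attach and exit
 * callbacks further down; the autogroup code moves tasks through it
 * as well.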
7184 */ 7185void sched_move_task(struct task_struct *tsk) 7186{ 7187 struct task_group *tg; 7188 int on_rq, running; 7189 unsigned long flags; 7190 struct rq *rq; 7191 7192 rq = task_rq_lock(tsk, &flags); 7193 7194 running = task_current(rq, tsk); 7195 on_rq = tsk->on_rq; 7196 7197 if (on_rq) 7198 dequeue_task(rq, tsk, 0); 7199 if (unlikely(running)) 7200 tsk->sched_class->put_prev_task(rq, tsk); 7201 7202 tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id, 7203 lockdep_is_held(&tsk->sighand->siglock)), 7204 struct task_group, css); 7205 tg = autogroup_task_group(tsk, tg); 7206 tsk->sched_task_group = tg; 7207 7208#ifdef CONFIG_FAIR_GROUP_SCHED 7209 if (tsk->sched_class->task_move_group) 7210 tsk->sched_class->task_move_group(tsk, on_rq); 7211 else 7212#endif 7213 set_task_rq(tsk, task_cpu(tsk)); 7214 7215 if (unlikely(running)) 7216 tsk->sched_class->set_curr_task(rq); 7217 if (on_rq) 7218 enqueue_task(rq, tsk, 0); 7219 7220 task_rq_unlock(rq, tsk, &flags); 7221} 7222#endif /* CONFIG_CGROUP_SCHED */ 7223 7224#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) 7225static unsigned long to_ratio(u64 period, u64 runtime) 7226{ 7227 if (runtime == RUNTIME_INF) 7228 return 1ULL << 20; 7229 7230 return div64_u64(runtime << 20, period); 7231} 7232#endif 7233 7234#ifdef CONFIG_RT_GROUP_SCHED 7235/* 7236 * Ensure that the real time constraints are schedulable. 7237 */ 7238static DEFINE_MUTEX(rt_constraints_mutex); 7239 7240/* Must be called with tasklist_lock held */ 7241static inline int tg_has_rt_tasks(struct task_group *tg) 7242{ 7243 struct task_struct *g, *p; 7244 7245 do_each_thread(g, p) { 7246 if (rt_task(p) && task_rq(p)->rt.tg == tg) 7247 return 1; 7248 } while_each_thread(g, p); 7249 7250 return 0; 7251} 7252 7253struct rt_schedulable_data { 7254 struct task_group *tg; 7255 u64 rt_period; 7256 u64 rt_runtime; 7257}; 7258 7259static int tg_rt_schedulable(struct task_group *tg, void *data) 7260{ 7261 struct rt_schedulable_data *d = data; 7262 struct task_group *child; 7263 unsigned long total, sum = 0; 7264 u64 period, runtime; 7265 7266 period = ktime_to_ns(tg->rt_bandwidth.rt_period); 7267 runtime = tg->rt_bandwidth.rt_runtime; 7268 7269 if (tg == d->tg) { 7270 period = d->rt_period; 7271 runtime = d->rt_runtime; 7272 } 7273 7274 /* 7275 * Cannot have more runtime than the period. 7276 */ 7277 if (runtime > period && runtime != RUNTIME_INF) 7278 return -EINVAL; 7279 7280 /* 7281 * Ensure we don't starve existing RT tasks. 7282 */ 7283 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) 7284 return -EBUSY; 7285 7286 total = to_ratio(period, runtime); 7287 7288 /* 7289 * Nobody can have more than the global setting allows. 7290 */ 7291 if (total > to_ratio(global_rt_period(), global_rt_runtime())) 7292 return -EINVAL; 7293 7294 /* 7295 * The sum of our children's runtime should not exceed our own. 
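 *
 * Worked example (made-up values): a group allowed 950000us of
 * runtime per 1000000us period gets total ~= 0.95 << 20 from
 * to_ratio(). Two children each configured for 500000us/1000000us
 * contribute 0.5 << 20 apiece, so their sum (1.0 << 20) exceeds
 * total and the request is rejected with -EINVAL below.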
7296 */ 7297 list_for_each_entry_rcu(child, &tg->children, siblings) { 7298 period = ktime_to_ns(child->rt_bandwidth.rt_period); 7299 runtime = child->rt_bandwidth.rt_runtime; 7300 7301 if (child == d->tg) { 7302 period = d->rt_period; 7303 runtime = d->rt_runtime; 7304 } 7305 7306 sum += to_ratio(period, runtime); 7307 } 7308 7309 if (sum > total) 7310 return -EINVAL; 7311 7312 return 0; 7313} 7314 7315static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 7316{ 7317 int ret; 7318 7319 struct rt_schedulable_data data = { 7320 .tg = tg, 7321 .rt_period = period, 7322 .rt_runtime = runtime, 7323 }; 7324 7325 rcu_read_lock(); 7326 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data); 7327 rcu_read_unlock(); 7328 7329 return ret; 7330} 7331 7332static int tg_set_rt_bandwidth(struct task_group *tg, 7333 u64 rt_period, u64 rt_runtime) 7334{ 7335 int i, err = 0; 7336 7337 mutex_lock(&rt_constraints_mutex); 7338 read_lock(&tasklist_lock); 7339 err = __rt_schedulable(tg, rt_period, rt_runtime); 7340 if (err) 7341 goto unlock; 7342 7343 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); 7344 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); 7345 tg->rt_bandwidth.rt_runtime = rt_runtime; 7346 7347 for_each_possible_cpu(i) { 7348 struct rt_rq *rt_rq = tg->rt_rq[i]; 7349 7350 raw_spin_lock(&rt_rq->rt_runtime_lock); 7351 rt_rq->rt_runtime = rt_runtime; 7352 raw_spin_unlock(&rt_rq->rt_runtime_lock); 7353 } 7354 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); 7355unlock: 7356 read_unlock(&tasklist_lock); 7357 mutex_unlock(&rt_constraints_mutex); 7358 7359 return err; 7360} 7361 7362int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) 7363{ 7364 u64 rt_runtime, rt_period; 7365 7366 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); 7367 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; 7368 if (rt_runtime_us < 0) 7369 rt_runtime = RUNTIME_INF; 7370 7371 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 7372} 7373 7374long sched_group_rt_runtime(struct task_group *tg) 7375{ 7376 u64 rt_runtime_us; 7377 7378 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) 7379 return -1; 7380 7381 rt_runtime_us = tg->rt_bandwidth.rt_runtime; 7382 do_div(rt_runtime_us, NSEC_PER_USEC); 7383 return rt_runtime_us; 7384} 7385 7386int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) 7387{ 7388 u64 rt_runtime, rt_period; 7389 7390 rt_period = (u64)rt_period_us * NSEC_PER_USEC; 7391 rt_runtime = tg->rt_bandwidth.rt_runtime; 7392 7393 if (rt_period == 0) 7394 return -EINVAL; 7395 7396 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 7397} 7398 7399long sched_group_rt_period(struct task_group *tg) 7400{ 7401 u64 rt_period_us; 7402 7403 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); 7404 do_div(rt_period_us, NSEC_PER_USEC); 7405 return rt_period_us; 7406} 7407 7408static int sched_rt_global_constraints(void) 7409{ 7410 u64 runtime, period; 7411 int ret = 0; 7412 7413 if (sysctl_sched_rt_period <= 0) 7414 return -EINVAL; 7415 7416 runtime = global_rt_runtime(); 7417 period = global_rt_period(); 7418 7419 /* 7420 * Sanity check on the sysctl variables. 
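 *
 * For instance, the usual sched_rt_runtime_us=950000 together with
 * sched_rt_period_us=1000000 passes, while requesting more runtime
 * than period (say 2000000us in a 1000000us period) is rejected,
 * unless runtime is RUNTIME_INF (sysctl value -1, i.e. unlimited).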
7421 */ 7422 if (runtime > period && runtime != RUNTIME_INF) 7423 return -EINVAL; 7424 7425 mutex_lock(&rt_constraints_mutex); 7426 read_lock(&tasklist_lock); 7427 ret = __rt_schedulable(NULL, 0, 0); 7428 read_unlock(&tasklist_lock); 7429 mutex_unlock(&rt_constraints_mutex); 7430 7431 return ret; 7432} 7433 7434int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) 7435{ 7436 /* Don't accept realtime tasks when there is no way for them to run */ 7437 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) 7438 return 0; 7439 7440 return 1; 7441} 7442 7443#else /* !CONFIG_RT_GROUP_SCHED */ 7444static int sched_rt_global_constraints(void) 7445{ 7446 unsigned long flags; 7447 int i; 7448 7449 if (sysctl_sched_rt_period <= 0) 7450 return -EINVAL; 7451 7452 /* 7453 * There's always some RT tasks in the root group 7454 * -- migration, kstopmachine etc.. 7455 */ 7456 if (sysctl_sched_rt_runtime == 0) 7457 return -EBUSY; 7458 7459 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 7460 for_each_possible_cpu(i) { 7461 struct rt_rq *rt_rq = &cpu_rq(i)->rt; 7462 7463 raw_spin_lock(&rt_rq->rt_runtime_lock); 7464 rt_rq->rt_runtime = global_rt_runtime(); 7465 raw_spin_unlock(&rt_rq->rt_runtime_lock); 7466 } 7467 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); 7468 7469 return 0; 7470} 7471#endif /* CONFIG_RT_GROUP_SCHED */ 7472 7473int sched_rt_handler(struct ctl_table *table, int write, 7474 void __user *buffer, size_t *lenp, 7475 loff_t *ppos) 7476{ 7477 int ret; 7478 int old_period, old_runtime; 7479 static DEFINE_MUTEX(mutex); 7480 7481 mutex_lock(&mutex); 7482 old_period = sysctl_sched_rt_period; 7483 old_runtime = sysctl_sched_rt_runtime; 7484 7485 ret = proc_dointvec(table, write, buffer, lenp, ppos); 7486 7487 if (!ret && write) { 7488 ret = sched_rt_global_constraints(); 7489 if (ret) { 7490 sysctl_sched_rt_period = old_period; 7491 sysctl_sched_rt_runtime = old_runtime; 7492 } else { 7493 def_rt_bandwidth.rt_runtime = global_rt_runtime(); 7494 def_rt_bandwidth.rt_period = 7495 ns_to_ktime(global_rt_period()); 7496 } 7497 } 7498 mutex_unlock(&mutex); 7499 7500 return ret; 7501} 7502 7503#ifdef CONFIG_CGROUP_SCHED 7504 7505/* return corresponding task_group object of a cgroup */ 7506static inline struct task_group *cgroup_tg(struct cgroup *cgrp) 7507{ 7508 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id), 7509 struct task_group, css); 7510} 7511 7512static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp) 7513{ 7514 struct task_group *tg, *parent; 7515 7516 if (!cgrp->parent) { 7517 /* This is early initialization for the top cgroup */ 7518 return &root_task_group.css; 7519 } 7520 7521 parent = cgroup_tg(cgrp->parent); 7522 tg = sched_create_group(parent); 7523 if (IS_ERR(tg)) 7524 return ERR_PTR(-ENOMEM); 7525 7526 return &tg->css; 7527} 7528 7529static void cpu_cgroup_destroy(struct cgroup *cgrp) 7530{ 7531 struct task_group *tg = cgroup_tg(cgrp); 7532 7533 sched_destroy_group(tg); 7534} 7535 7536static int cpu_cgroup_can_attach(struct cgroup *cgrp, 7537 struct cgroup_taskset *tset) 7538{ 7539 struct task_struct *task; 7540 7541 cgroup_taskset_for_each(task, cgrp, tset) { 7542#ifdef CONFIG_RT_GROUP_SCHED 7543 if (!sched_rt_can_attach(cgroup_tg(cgrp), task)) 7544 return -EINVAL; 7545#else 7546 /* We don't support RT-tasks being in separate groups */ 7547 if (task->sched_class != &fair_sched_class) 7548 return -EINVAL; 7549#endif 7550 } 7551 return 0; 7552} 7553 7554static void cpu_cgroup_attach(struct cgroup 
*cgrp, 7555 struct cgroup_taskset *tset) 7556{ 7557 struct task_struct *task; 7558 7559 cgroup_taskset_for_each(task, cgrp, tset) 7560 sched_move_task(task); 7561} 7562 7563static void 7564cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, 7565 struct task_struct *task) 7566{ 7567 /* 7568 * cgroup_exit() is called in the copy_process() failure path. 7569 * Ignore this case since the task hasn't ran yet, this avoids 7570 * trying to poke a half freed task state from generic code. 7571 */ 7572 if (!(task->flags & PF_EXITING)) 7573 return; 7574 7575 sched_move_task(task); 7576} 7577 7578#ifdef CONFIG_FAIR_GROUP_SCHED 7579static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, 7580 u64 shareval) 7581{ 7582 return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval)); 7583} 7584 7585static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) 7586{ 7587 struct task_group *tg = cgroup_tg(cgrp); 7588 7589 return (u64) scale_load_down(tg->shares); 7590} 7591 7592#ifdef CONFIG_CFS_BANDWIDTH 7593static DEFINE_MUTEX(cfs_constraints_mutex); 7594 7595const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ 7596const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ 7597 7598static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); 7599 7600static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) 7601{ 7602 int i, ret = 0, runtime_enabled, runtime_was_enabled; 7603 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 7604 7605 if (tg == &root_task_group) 7606 return -EINVAL; 7607 7608 /* 7609 * Ensure we have at some amount of bandwidth every period. This is 7610 * to prevent reaching a state of large arrears when throttled via 7611 * entity_tick() resulting in prolonged exit starvation. 7612 */ 7613 if (quota < min_cfs_quota_period || period < min_cfs_quota_period) 7614 return -EINVAL; 7615 7616 /* 7617 * Likewise, bound things on the otherside by preventing insane quota 7618 * periods. This also allows us to normalize in computing quota 7619 * feasibility. 
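 *
 * Together with the lower bound above, and given the constants
 * defined earlier (min_cfs_quota_period = 1ms, max_cfs_quota_period
 * = 1s), this means, for instance, that quota = 500us or
 * period = 2s is refused, while a common period = 100ms with
 * quota = 50ms configuration is accepted.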
7620 */ 7621 if (period > max_cfs_quota_period) 7622 return -EINVAL; 7623 7624 mutex_lock(&cfs_constraints_mutex); 7625 ret = __cfs_schedulable(tg, period, quota); 7626 if (ret) 7627 goto out_unlock; 7628 7629 runtime_enabled = quota != RUNTIME_INF; 7630 runtime_was_enabled = cfs_b->quota != RUNTIME_INF; 7631 account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled); 7632 raw_spin_lock_irq(&cfs_b->lock); 7633 cfs_b->period = ns_to_ktime(period); 7634 cfs_b->quota = quota; 7635 7636 __refill_cfs_bandwidth_runtime(cfs_b); 7637 /* restart the period timer (if active) to handle new period expiry */ 7638 if (runtime_enabled && cfs_b->timer_active) { 7639 /* force a reprogram */ 7640 cfs_b->timer_active = 0; 7641 __start_cfs_bandwidth(cfs_b); 7642 } 7643 raw_spin_unlock_irq(&cfs_b->lock); 7644 7645 for_each_possible_cpu(i) { 7646 struct cfs_rq *cfs_rq = tg->cfs_rq[i]; 7647 struct rq *rq = cfs_rq->rq; 7648 7649 raw_spin_lock_irq(&rq->lock); 7650 cfs_rq->runtime_enabled = runtime_enabled; 7651 cfs_rq->runtime_remaining = 0; 7652 7653 if (cfs_rq->throttled) 7654 unthrottle_cfs_rq(cfs_rq); 7655 raw_spin_unlock_irq(&rq->lock); 7656 } 7657out_unlock: 7658 mutex_unlock(&cfs_constraints_mutex); 7659 7660 return ret; 7661} 7662 7663int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) 7664{ 7665 u64 quota, period; 7666 7667 period = ktime_to_ns(tg->cfs_bandwidth.period); 7668 if (cfs_quota_us < 0) 7669 quota = RUNTIME_INF; 7670 else 7671 quota = (u64)cfs_quota_us * NSEC_PER_USEC; 7672 7673 return tg_set_cfs_bandwidth(tg, period, quota); 7674} 7675 7676long tg_get_cfs_quota(struct task_group *tg) 7677{ 7678 u64 quota_us; 7679 7680 if (tg->cfs_bandwidth.quota == RUNTIME_INF) 7681 return -1; 7682 7683 quota_us = tg->cfs_bandwidth.quota; 7684 do_div(quota_us, NSEC_PER_USEC); 7685 7686 return quota_us; 7687} 7688 7689int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) 7690{ 7691 u64 quota, period; 7692 7693 period = (u64)cfs_period_us * NSEC_PER_USEC; 7694 quota = tg->cfs_bandwidth.quota; 7695 7696 return tg_set_cfs_bandwidth(tg, period, quota); 7697} 7698 7699long tg_get_cfs_period(struct task_group *tg) 7700{ 7701 u64 cfs_period_us; 7702 7703 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period); 7704 do_div(cfs_period_us, NSEC_PER_USEC); 7705 7706 return cfs_period_us; 7707} 7708 7709static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft) 7710{ 7711 return tg_get_cfs_quota(cgroup_tg(cgrp)); 7712} 7713 7714static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype, 7715 s64 cfs_quota_us) 7716{ 7717 return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us); 7718} 7719 7720static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft) 7721{ 7722 return tg_get_cfs_period(cgroup_tg(cgrp)); 7723} 7724 7725static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype, 7726 u64 cfs_period_us) 7727{ 7728 return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us); 7729} 7730 7731struct cfs_schedulable_data { 7732 struct task_group *tg; 7733 u64 period, quota; 7734}; 7735 7736/* 7737 * normalize group quota/period to be quota/max_period 7738 * note: units are usecs 7739 */ 7740static u64 normalize_cfs_quota(struct task_group *tg, 7741 struct cfs_schedulable_data *d) 7742{ 7743 u64 quota, period; 7744 7745 if (tg == d->tg) { 7746 period = d->period; 7747 quota = d->quota; 7748 } else { 7749 period = tg_get_cfs_period(tg); 7750 quota = tg_get_cfs_quota(tg); 7751 } 7752 7753 /* note: these should typically be equivalent */ 7754 
if (quota == RUNTIME_INF || quota == -1) 7755 return RUNTIME_INF; 7756 7757 return to_ratio(period, quota); 7758} 7759 7760static int tg_cfs_schedulable_down(struct task_group *tg, void *data) 7761{ 7762 struct cfs_schedulable_data *d = data; 7763 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 7764 s64 quota = 0, parent_quota = -1; 7765 7766 if (!tg->parent) { 7767 quota = RUNTIME_INF; 7768 } else { 7769 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth; 7770 7771 quota = normalize_cfs_quota(tg, d); 7772 parent_quota = parent_b->hierarchal_quota; 7773 7774 /* 7775 * ensure max(child_quota) <= parent_quota, inherit when no 7776 * limit is set 7777 */ 7778 if (quota == RUNTIME_INF) 7779 quota = parent_quota; 7780 else if (parent_quota != RUNTIME_INF && quota > parent_quota) 7781 return -EINVAL; 7782 } 7783 cfs_b->hierarchal_quota = quota; 7784 7785 return 0; 7786} 7787 7788static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) 7789{ 7790 int ret; 7791 struct cfs_schedulable_data data = { 7792 .tg = tg, 7793 .period = period, 7794 .quota = quota, 7795 }; 7796 7797 if (quota != RUNTIME_INF) { 7798 do_div(data.period, NSEC_PER_USEC); 7799 do_div(data.quota, NSEC_PER_USEC); 7800 } 7801 7802 rcu_read_lock(); 7803 ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data); 7804 rcu_read_unlock(); 7805 7806 return ret; 7807} 7808 7809static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, 7810 struct cgroup_map_cb *cb) 7811{ 7812 struct task_group *tg = cgroup_tg(cgrp); 7813 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 7814 7815 cb->fill(cb, "nr_periods", cfs_b->nr_periods); 7816 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); 7817 cb->fill(cb, "throttled_time", cfs_b->throttled_time); 7818 7819 return 0; 7820} 7821#endif /* CONFIG_CFS_BANDWIDTH */ 7822#endif /* CONFIG_FAIR_GROUP_SCHED */ 7823 7824#ifdef CONFIG_RT_GROUP_SCHED 7825static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, 7826 s64 val) 7827{ 7828 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val); 7829} 7830 7831static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft) 7832{ 7833 return sched_group_rt_runtime(cgroup_tg(cgrp)); 7834} 7835 7836static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, 7837 u64 rt_period_us) 7838{ 7839 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); 7840} 7841 7842static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) 7843{ 7844 return sched_group_rt_period(cgroup_tg(cgrp)); 7845} 7846#endif /* CONFIG_RT_GROUP_SCHED */ 7847 7848static struct cftype cpu_files[] = { 7849#ifdef CONFIG_FAIR_GROUP_SCHED 7850 { 7851 .name = "shares", 7852 .read_u64 = cpu_shares_read_u64, 7853 .write_u64 = cpu_shares_write_u64, 7854 }, 7855#endif 7856#ifdef CONFIG_CFS_BANDWIDTH 7857 { 7858 .name = "cfs_quota_us", 7859 .read_s64 = cpu_cfs_quota_read_s64, 7860 .write_s64 = cpu_cfs_quota_write_s64, 7861 }, 7862 { 7863 .name = "cfs_period_us", 7864 .read_u64 = cpu_cfs_period_read_u64, 7865 .write_u64 = cpu_cfs_period_write_u64, 7866 }, 7867 { 7868 .name = "stat", 7869 .read_map = cpu_stats_show, 7870 }, 7871#endif 7872#ifdef CONFIG_RT_GROUP_SCHED 7873 { 7874 .name = "rt_runtime_us", 7875 .read_s64 = cpu_rt_runtime_read, 7876 .write_s64 = cpu_rt_runtime_write, 7877 }, 7878 { 7879 .name = "rt_period_us", 7880 .read_u64 = cpu_rt_period_read_uint, 7881 .write_u64 = cpu_rt_period_write_uint, 7882 }, 7883#endif 7884 { } /* terminate */ 7885}; 7886 7887struct cgroup_subsys cpu_cgroup_subsys = { 
7888 .name = "cpu", 7889 .create = cpu_cgroup_create, 7890 .destroy = cpu_cgroup_destroy, 7891 .can_attach = cpu_cgroup_can_attach, 7892 .attach = cpu_cgroup_attach, 7893 .exit = cpu_cgroup_exit, 7894 .subsys_id = cpu_cgroup_subsys_id, 7895 .base_cftypes = cpu_files, 7896 .early_init = 1, 7897}; 7898 7899#endif /* CONFIG_CGROUP_SCHED */ 7900 7901#ifdef CONFIG_CGROUP_CPUACCT 7902 7903/* 7904 * CPU accounting code for task groups. 7905 * 7906 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh 7907 * (balbir@in.ibm.com). 7908 */ 7909 7910struct cpuacct root_cpuacct; 7911 7912/* create a new cpu accounting group */ 7913static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp) 7914{ 7915 struct cpuacct *ca; 7916 7917 if (!cgrp->parent) 7918 return &root_cpuacct.css; 7919 7920 ca = kzalloc(sizeof(*ca), GFP_KERNEL); 7921 if (!ca) 7922 goto out; 7923 7924 ca->cpuusage = alloc_percpu(u64); 7925 if (!ca->cpuusage) 7926 goto out_free_ca; 7927 7928 ca->cpustat = alloc_percpu(struct kernel_cpustat); 7929 if (!ca->cpustat) 7930 goto out_free_cpuusage; 7931 7932 return &ca->css; 7933 7934out_free_cpuusage: 7935 free_percpu(ca->cpuusage); 7936out_free_ca: 7937 kfree(ca); 7938out: 7939 return ERR_PTR(-ENOMEM); 7940} 7941 7942/* destroy an existing cpu accounting group */ 7943static void cpuacct_destroy(struct cgroup *cgrp) 7944{ 7945 struct cpuacct *ca = cgroup_ca(cgrp); 7946 7947 free_percpu(ca->cpustat); 7948 free_percpu(ca->cpuusage); 7949 kfree(ca); 7950} 7951 7952static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) 7953{ 7954 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 7955 u64 data; 7956 7957#ifndef CONFIG_64BIT 7958 /* 7959 * Take rq->lock to make 64-bit read safe on 32-bit platforms. 7960 */ 7961 raw_spin_lock_irq(&cpu_rq(cpu)->lock); 7962 data = *cpuusage; 7963 raw_spin_unlock_irq(&cpu_rq(cpu)->lock); 7964#else 7965 data = *cpuusage; 7966#endif 7967 7968 return data; 7969} 7970 7971static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) 7972{ 7973 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 7974 7975#ifndef CONFIG_64BIT 7976 /* 7977 * Take rq->lock to make 64-bit write safe on 32-bit platforms. 
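 *
 * (A 32-bit store cannot update both halves of the u64 atomically,
 * so without the lock a concurrent cpuacct_cpuusage_read() could
 * observe a torn value.)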
7978 */ 7979 raw_spin_lock_irq(&cpu_rq(cpu)->lock); 7980 *cpuusage = val; 7981 raw_spin_unlock_irq(&cpu_rq(cpu)->lock); 7982#else 7983 *cpuusage = val; 7984#endif 7985} 7986 7987/* return total cpu usage (in nanoseconds) of a group */ 7988static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) 7989{ 7990 struct cpuacct *ca = cgroup_ca(cgrp); 7991 u64 totalcpuusage = 0; 7992 int i; 7993 7994 for_each_present_cpu(i) 7995 totalcpuusage += cpuacct_cpuusage_read(ca, i); 7996 7997 return totalcpuusage; 7998} 7999 8000static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, 8001 u64 reset) 8002{ 8003 struct cpuacct *ca = cgroup_ca(cgrp); 8004 int err = 0; 8005 int i; 8006 8007 if (reset) { 8008 err = -EINVAL; 8009 goto out; 8010 } 8011 8012 for_each_present_cpu(i) 8013 cpuacct_cpuusage_write(ca, i, 0); 8014 8015out: 8016 return err; 8017} 8018 8019static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, 8020 struct seq_file *m) 8021{ 8022 struct cpuacct *ca = cgroup_ca(cgroup); 8023 u64 percpu; 8024 int i; 8025 8026 for_each_present_cpu(i) { 8027 percpu = cpuacct_cpuusage_read(ca, i); 8028 seq_printf(m, "%llu ", (unsigned long long) percpu); 8029 } 8030 seq_printf(m, "\n"); 8031 return 0; 8032} 8033 8034static const char *cpuacct_stat_desc[] = { 8035 [CPUACCT_STAT_USER] = "user", 8036 [CPUACCT_STAT_SYSTEM] = "system", 8037}; 8038 8039static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, 8040 struct cgroup_map_cb *cb) 8041{ 8042 struct cpuacct *ca = cgroup_ca(cgrp); 8043 int cpu; 8044 s64 val = 0; 8045 8046 for_each_online_cpu(cpu) { 8047 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); 8048 val += kcpustat->cpustat[CPUTIME_USER]; 8049 val += kcpustat->cpustat[CPUTIME_NICE]; 8050 } 8051 val = cputime64_to_clock_t(val); 8052 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); 8053 8054 val = 0; 8055 for_each_online_cpu(cpu) { 8056 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); 8057 val += kcpustat->cpustat[CPUTIME_SYSTEM]; 8058 val += kcpustat->cpustat[CPUTIME_IRQ]; 8059 val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; 8060 } 8061 8062 val = cputime64_to_clock_t(val); 8063 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); 8064 8065 return 0; 8066} 8067 8068static struct cftype files[] = { 8069 { 8070 .name = "usage", 8071 .read_u64 = cpuusage_read, 8072 .write_u64 = cpuusage_write, 8073 }, 8074 { 8075 .name = "usage_percpu", 8076 .read_seq_string = cpuacct_percpu_seq_read, 8077 }, 8078 { 8079 .name = "stat", 8080 .read_map = cpuacct_stats_show, 8081 }, 8082 { } /* terminate */ 8083}; 8084 8085/* 8086 * charge this task's execution time to its accounting group. 8087 * 8088 * called with rq->lock held. 8089 */ 8090void cpuacct_charge(struct task_struct *tsk, u64 cputime) 8091{ 8092 struct cpuacct *ca; 8093 int cpu; 8094 8095 if (unlikely(!cpuacct_subsys.active)) 8096 return; 8097 8098 cpu = task_cpu(tsk); 8099 8100 rcu_read_lock(); 8101 8102 ca = task_ca(tsk); 8103 8104 for (; ca; ca = parent_ca(ca)) { 8105 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 8106 *cpuusage += cputime; 8107 } 8108 8109 rcu_read_unlock(); 8110} 8111 8112struct cgroup_subsys cpuacct_subsys = { 8113 .name = "cpuacct", 8114 .create = cpuacct_create, 8115 .destroy = cpuacct_destroy, 8116 .subsys_id = cpuacct_subsys_id, 8117 .base_cftypes = files, 8118}; 8119#endif /* CONFIG_CGROUP_CPUACCT */ 8120