core.c revision 6201b4d61fbf194df6371fb3376c5026cb8f5eec
1/*
2 *  kernel/sched/core.c
3 *
4 *  Kernel scheduler and related syscalls
5 *
6 *  Copyright (C) 1991-2002  Linus Torvalds
7 *
8 *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
9 *		make semaphores SMP safe
10 *  1998-11-19	Implemented schedule_timeout() and related stuff
11 *		by Andrea Arcangeli
12 *  2002-01-04	New ultra-scalable O(1) scheduler by Ingo Molnar:
13 *		hybrid priority-list and round-robin design with
14 *		an array-switch method of distributing timeslices
15 *		and per-CPU runqueues.  Cleanups and useful suggestions
16 *		by Davide Libenzi, preemptible kernel bits by Robert Love.
17 *  2003-09-03	Interactivity tuning by Con Kolivas.
18 *  2004-04-02	Scheduler domains code by Nick Piggin
19 *  2007-04-15  Work begun on replacing all interactivity tuning with a
20 *              fair scheduling design by Con Kolivas.
21 *  2007-05-05  Load balancing (smp-nice) and other improvements
22 *              by Peter Williams
23 *  2007-05-06  Interactivity improvements to CFS by Mike Galbraith
24 *  2007-07-01  Group scheduling enhancements by Srivatsa Vaddagiri
25 *  2007-11-29  RT balancing improvements by Steven Rostedt, Gregory Haskins,
26 *              Thomas Gleixner, Mike Kravetz
27 */
28
29#include <linux/mm.h>
30#include <linux/module.h>
31#include <linux/nmi.h>
32#include <linux/init.h>
33#include <linux/uaccess.h>
34#include <linux/highmem.h>
35#include <asm/mmu_context.h>
36#include <linux/interrupt.h>
37#include <linux/capability.h>
38#include <linux/completion.h>
39#include <linux/kernel_stat.h>
40#include <linux/debug_locks.h>
41#include <linux/perf_event.h>
42#include <linux/security.h>
43#include <linux/notifier.h>
44#include <linux/profile.h>
45#include <linux/freezer.h>
46#include <linux/vmalloc.h>
47#include <linux/blkdev.h>
48#include <linux/delay.h>
49#include <linux/pid_namespace.h>
50#include <linux/smp.h>
51#include <linux/threads.h>
52#include <linux/timer.h>
53#include <linux/rcupdate.h>
54#include <linux/cpu.h>
55#include <linux/cpuset.h>
56#include <linux/percpu.h>
57#include <linux/proc_fs.h>
58#include <linux/seq_file.h>
59#include <linux/sysctl.h>
60#include <linux/syscalls.h>
61#include <linux/times.h>
62#include <linux/tsacct_kern.h>
63#include <linux/kprobes.h>
64#include <linux/delayacct.h>
65#include <linux/unistd.h>
66#include <linux/pagemap.h>
67#include <linux/hrtimer.h>
68#include <linux/tick.h>
69#include <linux/debugfs.h>
70#include <linux/ctype.h>
71#include <linux/ftrace.h>
72#include <linux/slab.h>
73#include <linux/init_task.h>
74#include <linux/binfmts.h>
75#include <linux/context_tracking.h>
76
77#include <asm/switch_to.h>
78#include <asm/tlb.h>
79#include <asm/irq_regs.h>
80#include <asm/mutex.h>
81#ifdef CONFIG_PARAVIRT
82#include <asm/paravirt.h>
83#endif
84
85#include "sched.h"
86#include "../workqueue_internal.h"
87#include "../smpboot.h"
88
89#define CREATE_TRACE_POINTS
90#include <trace/events/sched.h>
91
92void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
93{
94	unsigned long delta;
95	ktime_t soft, hard, now;
96
97	for (;;) {
98		if (hrtimer_active(period_timer))
99			break;
100
101		now = hrtimer_cb_get_time(period_timer);
102		hrtimer_forward(period_timer, now, period);
103
104		soft = hrtimer_get_softexpires(period_timer);
105		hard = hrtimer_get_expires(period_timer);
106		delta = ktime_to_ns(ktime_sub(hard, soft));
107		__hrtimer_start_range_ns(period_timer, soft, delta,
108					 HRTIMER_MODE_ABS_PINNED, 0);
109	}
110}
111
112DEFINE_MUTEX(sched_domains_mutex);
113DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
114
115static void update_rq_clock_task(struct rq *rq, s64 delta);
116
117void update_rq_clock(struct rq *rq)
118{
119	s64 delta;
120
121	if (rq->skip_clock_update > 0)
122		return;
123
124	delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
125	rq->clock += delta;
126	update_rq_clock_task(rq, delta);
127}
128
129/*
130 * Debugging: various feature bits
131 */
132
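/*
 * Note on the construct below: the SCHED_FEAT() definition turns each line
 * of "features.h" into a term of the form (1UL << __SCHED_FEAT_<name>) *
 * <default>, so the include expands to an OR of the bits whose default is
 * 1, terminated by the trailing 0.  The net result is the default feature
 * bitmask.
 */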
133#define SCHED_FEAT(name, enabled)	\
134	(1UL << __SCHED_FEAT_##name) * enabled |
135
136const_debug unsigned int sysctl_sched_features =
137#include "features.h"
138	0;
139
140#undef SCHED_FEAT
141
142#ifdef CONFIG_SCHED_DEBUG
143#define SCHED_FEAT(name, enabled)	\
144	#name ,
145
146static const char * const sched_feat_names[] = {
147#include "features.h"
148};
149
150#undef SCHED_FEAT
151
152static int sched_feat_show(struct seq_file *m, void *v)
153{
154	int i;
155
156	for (i = 0; i < __SCHED_FEAT_NR; i++) {
157		if (!(sysctl_sched_features & (1UL << i)))
158			seq_puts(m, "NO_");
159		seq_printf(m, "%s ", sched_feat_names[i]);
160	}
161	seq_puts(m, "\n");
162
163	return 0;
164}
165
166#ifdef HAVE_JUMP_LABEL
167
168#define jump_label_key__true  STATIC_KEY_INIT_TRUE
169#define jump_label_key__false STATIC_KEY_INIT_FALSE
170
171#define SCHED_FEAT(name, enabled)	\
172	jump_label_key__##enabled ,
173
174struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
175#include "features.h"
176};
177
178#undef SCHED_FEAT
179
180static void sched_feat_disable(int i)
181{
182	if (static_key_enabled(&sched_feat_keys[i]))
183		static_key_slow_dec(&sched_feat_keys[i]);
184}
185
186static void sched_feat_enable(int i)
187{
188	if (!static_key_enabled(&sched_feat_keys[i]))
189		static_key_slow_inc(&sched_feat_keys[i]);
190}
191#else
192static void sched_feat_disable(int i) { };
193static void sched_feat_enable(int i) { };
194#endif /* HAVE_JUMP_LABEL */
195
196static int sched_feat_set(char *cmp)
197{
198	int i;
199	int neg = 0;
200
201	if (strncmp(cmp, "NO_", 3) == 0) {
202		neg = 1;
203		cmp += 3;
204	}
205
206	for (i = 0; i < __SCHED_FEAT_NR; i++) {
207		if (strcmp(cmp, sched_feat_names[i]) == 0) {
208			if (neg) {
209				sysctl_sched_features &= ~(1UL << i);
210				sched_feat_disable(i);
211			} else {
212				sysctl_sched_features |= (1UL << i);
213				sched_feat_enable(i);
214			}
215			break;
216		}
217	}
218
219	return i;
220}
221
222static ssize_t
223sched_feat_write(struct file *filp, const char __user *ubuf,
224		size_t cnt, loff_t *ppos)
225{
226	char buf[64];
227	char *cmp;
228	int i;
229
230	if (cnt > 63)
231		cnt = 63;
232
233	if (copy_from_user(&buf, ubuf, cnt))
234		return -EFAULT;
235
236	buf[cnt] = 0;
237	cmp = strstrip(buf);
238
239	i = sched_feat_set(cmp);
240	if (i == __SCHED_FEAT_NR)
241		return -EINVAL;
242
243	*ppos += cnt;
244
245	return cnt;
246}
247
248static int sched_feat_open(struct inode *inode, struct file *filp)
249{
250	return single_open(filp, sched_feat_show, NULL);
251}
252
253static const struct file_operations sched_feat_fops = {
254	.open		= sched_feat_open,
255	.write		= sched_feat_write,
256	.read		= seq_read,
257	.llseek		= seq_lseek,
258	.release	= single_release,
259};
260
261static __init int sched_init_debug(void)
262{
263	debugfs_create_file("sched_features", 0644, NULL, NULL,
264			&sched_feat_fops);
265
266	return 0;
267}
268late_initcall(sched_init_debug);
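/*
 * The resulting debugfs file is used roughly as follows (illustrative
 * shell session; the feature names are examples taken from features.h and
 * may differ between configs):
 *
 *	# cat /sys/kernel/debug/sched_features
 *	GENTLE_FAIR_SLEEPERS START_DEBIT ... NO_NUMA
 *	# echo NO_TTWU_QUEUE > /sys/kernel/debug/sched_features
 *
 * Writing a feature name enables it, writing the name with a "NO_" prefix
 * disables it; unknown names make the write fail with -EINVAL.
 */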
269#endif /* CONFIG_SCHED_DEBUG */
270
271/*
272 * Number of tasks to iterate in a single balance run.
273 * Limited because this is done with IRQs disabled.
274 */
275const_debug unsigned int sysctl_sched_nr_migrate = 32;
276
277/*
278 * period over which we average the RT time consumption, measured
279 * in ms.
280 *
281 * default: 1s
282 */
283const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
284
285/*
286 * period over which we measure -rt task cpu usage in us.
287 * default: 1s
288 */
289unsigned int sysctl_sched_rt_period = 1000000;
290
291__read_mostly int scheduler_running;
292
293/*
294 * part of the period that we allow rt tasks to run in us.
295 * default: 0.95s
296 */
297int sysctl_sched_rt_runtime = 950000;
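/*
 * With the two defaults above, realtime tasks may consume at most
 * 950000 / 1000000 = 95% of each 1 s period; the remaining 5% is left for
 * non-RT tasks.  Setting sysctl_sched_rt_runtime to -1 removes the limit
 * (RUNTIME_INF).
 */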
298
299/*
300 * __task_rq_lock - lock the rq @p resides on.
301 */
302static inline struct rq *__task_rq_lock(struct task_struct *p)
303	__acquires(rq->lock)
304{
305	struct rq *rq;
306
307	lockdep_assert_held(&p->pi_lock);
308
309	for (;;) {
310		rq = task_rq(p);
311		raw_spin_lock(&rq->lock);
312		if (likely(rq == task_rq(p)))
313			return rq;
314		raw_spin_unlock(&rq->lock);
315	}
316}
317
318/*
319 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
320 */
321static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
322	__acquires(p->pi_lock)
323	__acquires(rq->lock)
324{
325	struct rq *rq;
326
327	for (;;) {
328		raw_spin_lock_irqsave(&p->pi_lock, *flags);
329		rq = task_rq(p);
330		raw_spin_lock(&rq->lock);
331		if (likely(rq == task_rq(p)))
332			return rq;
333		raw_spin_unlock(&rq->lock);
334		raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
335	}
336}
337
338static void __task_rq_unlock(struct rq *rq)
339	__releases(rq->lock)
340{
341	raw_spin_unlock(&rq->lock);
342}
343
344static inline void
345task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
346	__releases(rq->lock)
347	__releases(p->pi_lock)
348{
349	raw_spin_unlock(&rq->lock);
350	raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
351}
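/*
 * Typical usage pattern of the two helpers above (illustrative sketch; the
 * critical-section body is hypothetical):
 *
 *	unsigned long flags;
 *	struct rq *rq;
 *
 *	rq = task_rq_lock(p, &flags);
 *	... p is pinned to rq here; inspect/modify its scheduling state ...
 *	task_rq_unlock(rq, p, &flags);
 *
 * Holding both p->pi_lock and rq->lock guarantees that p cannot be
 * migrated or woken onto another runqueue for the duration.
 */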
352
353/*
354 * this_rq_lock - lock this runqueue and disable interrupts.
355 */
356static struct rq *this_rq_lock(void)
357	__acquires(rq->lock)
358{
359	struct rq *rq;
360
361	local_irq_disable();
362	rq = this_rq();
363	raw_spin_lock(&rq->lock);
364
365	return rq;
366}
367
368#ifdef CONFIG_SCHED_HRTICK
369/*
370 * Use HR-timers to deliver accurate preemption points.
371 */
372
373static void hrtick_clear(struct rq *rq)
374{
375	if (hrtimer_active(&rq->hrtick_timer))
376		hrtimer_cancel(&rq->hrtick_timer);
377}
378
379/*
380 * High-resolution timer tick.
381 * Runs from hardirq context with interrupts disabled.
382 */
383static enum hrtimer_restart hrtick(struct hrtimer *timer)
384{
385	struct rq *rq = container_of(timer, struct rq, hrtick_timer);
386
387	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
388
389	raw_spin_lock(&rq->lock);
390	update_rq_clock(rq);
391	rq->curr->sched_class->task_tick(rq, rq->curr, 1);
392	raw_spin_unlock(&rq->lock);
393
394	return HRTIMER_NORESTART;
395}
396
397#ifdef CONFIG_SMP
398
399static int __hrtick_restart(struct rq *rq)
400{
401	struct hrtimer *timer = &rq->hrtick_timer;
402	ktime_t time = hrtimer_get_softexpires(timer);
403
404	return __hrtimer_start_range_ns(timer, time, 0, HRTIMER_MODE_ABS_PINNED, 0);
405}
406
407/*
408 * called from hardirq (IPI) context
409 */
410static void __hrtick_start(void *arg)
411{
412	struct rq *rq = arg;
413
414	raw_spin_lock(&rq->lock);
415	__hrtick_restart(rq);
416	rq->hrtick_csd_pending = 0;
417	raw_spin_unlock(&rq->lock);
418}
419
420/*
421 * Called to set the hrtick timer state.
422 *
423 * called with rq->lock held and irqs disabled
424 */
425void hrtick_start(struct rq *rq, u64 delay)
426{
427	struct hrtimer *timer = &rq->hrtick_timer;
428	ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
429
430	hrtimer_set_expires(timer, time);
431
432	if (rq == this_rq()) {
433		__hrtick_restart(rq);
434	} else if (!rq->hrtick_csd_pending) {
435		__smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
436		rq->hrtick_csd_pending = 1;
437	}
438}
439
440static int
441hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
442{
443	int cpu = (int)(long)hcpu;
444
445	switch (action) {
446	case CPU_UP_CANCELED:
447	case CPU_UP_CANCELED_FROZEN:
448	case CPU_DOWN_PREPARE:
449	case CPU_DOWN_PREPARE_FROZEN:
450	case CPU_DEAD:
451	case CPU_DEAD_FROZEN:
452		hrtick_clear(cpu_rq(cpu));
453		return NOTIFY_OK;
454	}
455
456	return NOTIFY_DONE;
457}
458
459static __init void init_hrtick(void)
460{
461	hotcpu_notifier(hotplug_hrtick, 0);
462}
463#else
464/*
465 * Called to set the hrtick timer state.
466 *
467 * called with rq->lock held and irqs disabled
468 */
469void hrtick_start(struct rq *rq, u64 delay)
470{
471	__hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
472			HRTIMER_MODE_REL_PINNED, 0);
473}
474
475static inline void init_hrtick(void)
476{
477}
478#endif /* CONFIG_SMP */
479
480static void init_rq_hrtick(struct rq *rq)
481{
482#ifdef CONFIG_SMP
483	rq->hrtick_csd_pending = 0;
484
485	rq->hrtick_csd.flags = 0;
486	rq->hrtick_csd.func = __hrtick_start;
487	rq->hrtick_csd.info = rq;
488#endif
489
490	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
491	rq->hrtick_timer.function = hrtick;
492}
493#else	/* CONFIG_SCHED_HRTICK */
494static inline void hrtick_clear(struct rq *rq)
495{
496}
497
498static inline void init_rq_hrtick(struct rq *rq)
499{
500}
501
502static inline void init_hrtick(void)
503{
504}
505#endif	/* CONFIG_SCHED_HRTICK */
506
507/*
508 * resched_task - mark a task 'to be rescheduled now'.
509 *
510 * On UP this means the setting of the need_resched flag, on SMP it
511 * might also involve a cross-CPU call to trigger the scheduler on
512 * the target CPU.
513 */
514void resched_task(struct task_struct *p)
515{
516	int cpu;
517
518	lockdep_assert_held(&task_rq(p)->lock);
519
520	if (test_tsk_need_resched(p))
521		return;
522
523	set_tsk_need_resched(p);
524
525	cpu = task_cpu(p);
526	if (cpu == smp_processor_id()) {
527		set_preempt_need_resched();
528		return;
529	}
530
531	/* NEED_RESCHED must be visible before we test polling */
532	smp_mb();
533	if (!tsk_is_polling(p))
534		smp_send_reschedule(cpu);
535}
536
537void resched_cpu(int cpu)
538{
539	struct rq *rq = cpu_rq(cpu);
540	unsigned long flags;
541
542	if (!raw_spin_trylock_irqsave(&rq->lock, flags))
543		return;
544	resched_task(cpu_curr(cpu));
545	raw_spin_unlock_irqrestore(&rq->lock, flags);
546}
547
548#ifdef CONFIG_SMP
549#ifdef CONFIG_NO_HZ_COMMON
550/*
551 * In the semi idle case, use the nearest busy cpu for migrating timers
552 * from an idle cpu.  This is good for power-savings.
553 *
554 * We don't do a similar optimization for a completely idle system, as
555 * selecting an idle cpu would add more delay to the timers than intended
556 * (that cpu's timer base may not be up to date with respect to jiffies etc.).
557 */
558int get_nohz_timer_target(int pinned)
559{
560	int cpu = smp_processor_id();
561	int i;
562	struct sched_domain *sd;
563
564	if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu))
565		return cpu;
566
567	rcu_read_lock();
568	for_each_domain(cpu, sd) {
569		for_each_cpu(i, sched_domain_span(sd)) {
570			if (!idle_cpu(i)) {
571				cpu = i;
572				goto unlock;
573			}
574		}
575	}
576unlock:
577	rcu_read_unlock();
578	return cpu;
579}
580/*
581 * When add_timer_on() enqueues a timer into the timer wheel of an
582 * idle CPU then this timer might expire before the next timer event
583 * which is scheduled to wake up that CPU. In case of a completely
584 * idle system the next event might even be infinite time into the
585 * future. wake_up_idle_cpu() ensures that the CPU is woken up and
586 * leaves the inner idle loop so the newly added timer is taken into
587 * account when the CPU goes back to idle and evaluates the timer
588 * wheel for the next timer event.
589 */
590static void wake_up_idle_cpu(int cpu)
591{
592	struct rq *rq = cpu_rq(cpu);
593
594	if (cpu == smp_processor_id())
595		return;
596
597	/*
598	 * This is safe, as this function is called with the timer
599	 * wheel base lock of (cpu) held. When the CPU is on the way
600	 * to idle and has not yet set rq->curr to idle then it will
601	 * be serialized on the timer wheel base lock and take the new
602	 * timer into account automatically.
603	 */
604	if (rq->curr != rq->idle)
605		return;
606
607	/*
608	 * We can set TIF_RESCHED on the idle task of the other CPU
609	 * locklessly. The worst case is that the other CPU runs the
610	 * idle task through an additional NOOP schedule().
611	 */
612	set_tsk_need_resched(rq->idle);
613
614	/* NEED_RESCHED must be visible before we test polling */
615	smp_mb();
616	if (!tsk_is_polling(rq->idle))
617		smp_send_reschedule(cpu);
618}
619
620static bool wake_up_full_nohz_cpu(int cpu)
621{
622	if (tick_nohz_full_cpu(cpu)) {
623		if (cpu != smp_processor_id() ||
624		    tick_nohz_tick_stopped())
625			smp_send_reschedule(cpu);
626		return true;
627	}
628
629	return false;
630}
631
632void wake_up_nohz_cpu(int cpu)
633{
634	if (!wake_up_full_nohz_cpu(cpu))
635		wake_up_idle_cpu(cpu);
636}
637
638static inline bool got_nohz_idle_kick(void)
639{
640	int cpu = smp_processor_id();
641
642	if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))
643		return false;
644
645	if (idle_cpu(cpu) && !need_resched())
646		return true;
647
648	/*
649	 * We can't run the idle load balance on this CPU at this time, so we
650	 * cancel it and clear NOHZ_BALANCE_KICK.
651	 */
652	clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
653	return false;
654}
655
656#else /* CONFIG_NO_HZ_COMMON */
657
658static inline bool got_nohz_idle_kick(void)
659{
660	return false;
661}
662
663#endif /* CONFIG_NO_HZ_COMMON */
664
665#ifdef CONFIG_NO_HZ_FULL
666bool sched_can_stop_tick(void)
667{
668	struct rq *rq;
669
670	rq = this_rq();
671
672	/* Make sure rq->nr_running update is visible after the IPI */
673	smp_rmb();
674
675	/* More than one running task needs preemption */
676	if (rq->nr_running > 1)
677		return false;
678
679	return true;
680}
681#endif /* CONFIG_NO_HZ_FULL */
682
683void sched_avg_update(struct rq *rq)
684{
685	s64 period = sched_avg_period();
686
687	while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {
688		/*
689		 * Inline assembly required to prevent the compiler
690		 * optimising this loop into a divmod call.
691		 * See __iter_div_u64_rem() for another example of this.
692		 */
693		asm("" : "+rm" (rq->age_stamp));
694		rq->age_stamp += period;
695		rq->rt_avg /= 2;
696	}
697}
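/*
 * In other words: for every sched_avg_period() (derived from
 * sysctl_sched_time_avg above) that has elapsed since rq->age_stamp,
 * rq->rt_avg is halved, giving a geometric decay of the accumulated
 * RT/IRQ time rather than a hard window reset.
 */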
698
699#endif /* CONFIG_SMP */
700
701#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
702			(defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
703/*
704 * Iterate task_group tree rooted at *from, calling @down when first entering a
705 * node and @up when leaving it for the final time.
706 *
707 * Caller must hold the RCU read lock or a sufficient equivalent.
708 */
709int walk_tg_tree_from(struct task_group *from,
710			     tg_visitor down, tg_visitor up, void *data)
711{
712	struct task_group *parent, *child;
713	int ret;
714
715	parent = from;
716
717down:
718	ret = (*down)(parent, data);
719	if (ret)
720		goto out;
721	list_for_each_entry_rcu(child, &parent->children, siblings) {
722		parent = child;
723		goto down;
724
725up:
726		continue;
727	}
728	ret = (*up)(parent, data);
729	if (ret || parent == from)
730		goto out;
731
732	child = parent;
733	parent = parent->parent;
734	if (parent)
735		goto up;
736out:
737	return ret;
738}
739
740int tg_nop(struct task_group *tg, void *data)
741{
742	return 0;
743}
744#endif
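/*
 * Illustrative sketch of a walk_tg_tree_from() caller (the visitor and the
 * counter are hypothetical, not taken from this file):
 *
 *	static int tg_count_one(struct task_group *tg, void *data)
 *	{
 *		(*(int *)data)++;
 *		return 0;
 *	}
 *
 *	int n = 0;
 *	rcu_read_lock();
 *	walk_tg_tree_from(&root_task_group, tg_count_one, tg_nop, &n);
 *	rcu_read_unlock();
 *
 * A non-zero return from @down or @up aborts the walk and is propagated
 * back to the caller.
 */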
745
746static void set_load_weight(struct task_struct *p)
747{
748	int prio = p->static_prio - MAX_RT_PRIO;
749	struct load_weight *load = &p->se.load;
750
751	/*
752	 * SCHED_IDLE tasks get minimal weight:
753	 */
754	if (p->policy == SCHED_IDLE) {
755		load->weight = scale_load(WEIGHT_IDLEPRIO);
756		load->inv_weight = WMULT_IDLEPRIO;
757		return;
758	}
759
760	load->weight = scale_load(prio_to_weight[prio]);
761	load->inv_weight = prio_to_wmult[prio];
762}
763
764static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
765{
766	update_rq_clock(rq);
767	sched_info_queued(rq, p);
768	p->sched_class->enqueue_task(rq, p, flags);
769}
770
771static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
772{
773	update_rq_clock(rq);
774	sched_info_dequeued(rq, p);
775	p->sched_class->dequeue_task(rq, p, flags);
776}
777
778void activate_task(struct rq *rq, struct task_struct *p, int flags)
779{
780	if (task_contributes_to_load(p))
781		rq->nr_uninterruptible--;
782
783	enqueue_task(rq, p, flags);
784}
785
786void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
787{
788	if (task_contributes_to_load(p))
789		rq->nr_uninterruptible++;
790
791	dequeue_task(rq, p, flags);
792}
793
794static void update_rq_clock_task(struct rq *rq, s64 delta)
795{
796/*
797 * In theory, the compiler should just see 0 here, and optimize out the call
798 * to sched_rt_avg_update. But I don't trust it...
799 */
800#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
801	s64 steal = 0, irq_delta = 0;
802#endif
803#ifdef CONFIG_IRQ_TIME_ACCOUNTING
804	irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
805
806	/*
807	 * Since irq_time is only updated on {soft,}irq_exit, we might run into
808	 * this case when a previous update_rq_clock() happened inside a
809	 * {soft,}irq region.
810	 *
811	 * When this happens, we stop ->clock_task and only update the
812	 * prev_irq_time stamp to account for the part that fit, so that a next
813	 * update will consume the rest. This ensures ->clock_task is
814	 * monotonic.
815	 *
816	 * It does, however, cause some slight misattribution of {soft,}irq
817	 * time; a more accurate solution would be to update the irq_time using
818	 * the current rq->clock timestamp, except that would require using
819	 * atomic ops.
820	 */
821	if (irq_delta > delta)
822		irq_delta = delta;
823
824	rq->prev_irq_time += irq_delta;
825	delta -= irq_delta;
826#endif
827#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
828	if (static_key_false((&paravirt_steal_rq_enabled))) {
829		u64 st;
830
831		steal = paravirt_steal_clock(cpu_of(rq));
832		steal -= rq->prev_steal_time_rq;
833
834		if (unlikely(steal > delta))
835			steal = delta;
836
837		st = steal_ticks(steal);
838		steal = st * TICK_NSEC;
839
840		rq->prev_steal_time_rq += steal;
841
842		delta -= steal;
843	}
844#endif
845
846	rq->clock_task += delta;
847
848#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
849	if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
850		sched_rt_avg_update(rq, irq_delta + steal);
851#endif
852}
853
854void sched_set_stop_task(int cpu, struct task_struct *stop)
855{
856	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
857	struct task_struct *old_stop = cpu_rq(cpu)->stop;
858
859	if (stop) {
860		/*
861		 * Make it appear like a SCHED_FIFO task; it's something
862		 * userspace knows about and won't get confused about.
863		 *
864		 * Also, it will make PI more or less work without too
865		 * much confusion -- but then, stop work should not
866		 * rely on PI working anyway.
867		 */
868		sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
869
870		stop->sched_class = &stop_sched_class;
871	}
872
873	cpu_rq(cpu)->stop = stop;
874
875	if (old_stop) {
876		/*
877		 * Reset it back to a normal scheduling class so that
878		 * it can die in pieces.
879		 */
880		old_stop->sched_class = &rt_sched_class;
881	}
882}
883
884/*
885 * __normal_prio - return the priority that is based on the static prio
886 */
887static inline int __normal_prio(struct task_struct *p)
888{
889	return p->static_prio;
890}
891
892/*
893 * Calculate the expected normal priority: i.e. priority
894 * without taking RT-inheritance into account. Might be
895 * boosted by interactivity modifiers. Changes upon fork,
896 * setprio syscalls, and whenever the interactivity
897 * estimator recalculates.
898 */
899static inline int normal_prio(struct task_struct *p)
900{
901	int prio;
902
903	if (task_has_dl_policy(p))
904		prio = MAX_DL_PRIO-1;
905	else if (task_has_rt_policy(p))
906		prio = MAX_RT_PRIO-1 - p->rt_priority;
907	else
908		prio = __normal_prio(p);
909	return prio;
910}
911
912/*
913 * Calculate the current priority, i.e. the priority
914 * taken into account by the scheduler. This value might
915 * be boosted by RT tasks, or might be boosted by
916 * interactivity modifiers. Will be RT if the task got
917 * RT-boosted. If not then it returns p->normal_prio.
918 */
919static int effective_prio(struct task_struct *p)
920{
921	p->normal_prio = normal_prio(p);
922	/*
923	 * If we are RT tasks or we were boosted to RT priority,
924	 * keep the priority unchanged. Otherwise, update priority
925	 * to the normal priority:
926	 */
927	if (!rt_prio(p->prio))
928		return p->normal_prio;
929	return p->prio;
930}
931
932/**
933 * task_curr - is this task currently executing on a CPU?
934 * @p: the task in question.
935 *
936 * Return: 1 if the task is currently executing. 0 otherwise.
937 */
938inline int task_curr(const struct task_struct *p)
939{
940	return cpu_curr(task_cpu(p)) == p;
941}
942
943static inline void check_class_changed(struct rq *rq, struct task_struct *p,
944				       const struct sched_class *prev_class,
945				       int oldprio)
946{
947	if (prev_class != p->sched_class) {
948		if (prev_class->switched_from)
949			prev_class->switched_from(rq, p);
950		p->sched_class->switched_to(rq, p);
951	} else if (oldprio != p->prio || dl_task(p))
952		p->sched_class->prio_changed(rq, p, oldprio);
953}
954
955void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
956{
957	const struct sched_class *class;
958
959	if (p->sched_class == rq->curr->sched_class) {
960		rq->curr->sched_class->check_preempt_curr(rq, p, flags);
961	} else {
962		for_each_class(class) {
963			if (class == rq->curr->sched_class)
964				break;
965			if (class == p->sched_class) {
966				resched_task(rq->curr);
967				break;
968			}
969		}
970	}
971
972	/*
973	 * A queue event has occurred, and we're going to schedule.  In
974	 * this case, we can save a useless back-to-back clock update.
975	 */
976	if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
977		rq->skip_clock_update = 1;
978}
979
980#ifdef CONFIG_SMP
981void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
982{
983#ifdef CONFIG_SCHED_DEBUG
984	/*
985	 * We should never call set_task_cpu() on a blocked task,
986	 * ttwu() will sort out the placement.
987	 */
988	WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
989			!(task_preempt_count(p) & PREEMPT_ACTIVE));
990
991#ifdef CONFIG_LOCKDEP
992	/*
993	 * The caller should hold either p->pi_lock or rq->lock when changing
994	 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
995	 *
996	 * sched_move_task() holds both and thus holding either pins the cgroup,
997	 * see task_group().
998	 *
999	 * Furthermore, all task_rq users should acquire both locks, see
1000	 * task_rq_lock().
1001	 */
1002	WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
1003				      lockdep_is_held(&task_rq(p)->lock)));
1004#endif
1005#endif
1006
1007	trace_sched_migrate_task(p, new_cpu);
1008
1009	if (task_cpu(p) != new_cpu) {
1010		if (p->sched_class->migrate_task_rq)
1011			p->sched_class->migrate_task_rq(p, new_cpu);
1012		p->se.nr_migrations++;
1013		perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
1014	}
1015
1016	__set_task_cpu(p, new_cpu);
1017}
1018
1019static void __migrate_swap_task(struct task_struct *p, int cpu)
1020{
1021	if (p->on_rq) {
1022		struct rq *src_rq, *dst_rq;
1023
1024		src_rq = task_rq(p);
1025		dst_rq = cpu_rq(cpu);
1026
1027		deactivate_task(src_rq, p, 0);
1028		set_task_cpu(p, cpu);
1029		activate_task(dst_rq, p, 0);
1030		check_preempt_curr(dst_rq, p, 0);
1031	} else {
1032		/*
1033		 * Task isn't running anymore; make it appear like we migrated
1034		 * it before it went to sleep. This means on wakeup we make the
1035		 * previous cpu our target instead of where it really is.
1036		 */
1037		p->wake_cpu = cpu;
1038	}
1039}
1040
1041struct migration_swap_arg {
1042	struct task_struct *src_task, *dst_task;
1043	int src_cpu, dst_cpu;
1044};
1045
1046static int migrate_swap_stop(void *data)
1047{
1048	struct migration_swap_arg *arg = data;
1049	struct rq *src_rq, *dst_rq;
1050	int ret = -EAGAIN;
1051
1052	src_rq = cpu_rq(arg->src_cpu);
1053	dst_rq = cpu_rq(arg->dst_cpu);
1054
1055	double_raw_lock(&arg->src_task->pi_lock,
1056			&arg->dst_task->pi_lock);
1057	double_rq_lock(src_rq, dst_rq);
1058	if (task_cpu(arg->dst_task) != arg->dst_cpu)
1059		goto unlock;
1060
1061	if (task_cpu(arg->src_task) != arg->src_cpu)
1062		goto unlock;
1063
1064	if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task)))
1065		goto unlock;
1066
1067	if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task)))
1068		goto unlock;
1069
1070	__migrate_swap_task(arg->src_task, arg->dst_cpu);
1071	__migrate_swap_task(arg->dst_task, arg->src_cpu);
1072
1073	ret = 0;
1074
1075unlock:
1076	double_rq_unlock(src_rq, dst_rq);
1077	raw_spin_unlock(&arg->dst_task->pi_lock);
1078	raw_spin_unlock(&arg->src_task->pi_lock);
1079
1080	return ret;
1081}
1082
1083/*
1084 * Cross migrate two tasks
1085 */
1086int migrate_swap(struct task_struct *cur, struct task_struct *p)
1087{
1088	struct migration_swap_arg arg;
1089	int ret = -EINVAL;
1090
1091	arg = (struct migration_swap_arg){
1092		.src_task = cur,
1093		.src_cpu = task_cpu(cur),
1094		.dst_task = p,
1095		.dst_cpu = task_cpu(p),
1096	};
1097
1098	if (arg.src_cpu == arg.dst_cpu)
1099		goto out;
1100
1101	/*
1102	 * These three tests are all lockless; this is OK since all of them
1103	 * will be re-checked with proper locks held further down the line.
1104	 */
1105	if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
1106		goto out;
1107
1108	if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task)))
1109		goto out;
1110
1111	if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))
1112		goto out;
1113
1114	trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
1115	ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
1116
1117out:
1118	return ret;
1119}
1120
1121struct migration_arg {
1122	struct task_struct *task;
1123	int dest_cpu;
1124};
1125
1126static int migration_cpu_stop(void *data);
1127
1128/*
1129 * wait_task_inactive - wait for a thread to unschedule.
1130 *
1131 * If @match_state is nonzero, it's the @p->state value just checked and
1132 * not expected to change.  If it changes, i.e. @p might have woken up,
1133 * then return zero.  When we succeed in waiting for @p to be off its CPU,
1134 * we return a positive number (its total switch count).  If a second call
1135 * a short while later returns the same number, the caller can be sure that
1136 * @p has remained unscheduled the whole time.
1137 *
1138 * The caller must ensure that the task *will* unschedule sometime soon,
1139 * else this function might spin for a *long* time. This function can't
1140 * be called with interrupts off, or it may introduce deadlock with
1141 * smp_call_function() if an IPI is sent by the same process we are
1142 * waiting to become inactive.
1143 */
1144unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1145{
1146	unsigned long flags;
1147	int running, on_rq;
1148	unsigned long ncsw;
1149	struct rq *rq;
1150
1151	for (;;) {
1152		/*
1153		 * We do the initial early heuristics without holding
1154		 * any task-queue locks at all. We'll only try to get
1155		 * the runqueue lock when things look like they will
1156		 * work out!
1157		 */
1158		rq = task_rq(p);
1159
1160		/*
1161		 * If the task is actively running on another CPU
1162		 * still, just relax and busy-wait without holding
1163		 * any locks.
1164		 *
1165		 * NOTE! Since we don't hold any locks, it's not
1166		 * even sure that "rq" stays as the right runqueue!
1167		 * But we don't care, since "task_running()" will
1168		 * return false if the runqueue has changed and p
1169		 * is actually now running somewhere else!
1170		 */
1171		while (task_running(rq, p)) {
1172			if (match_state && unlikely(p->state != match_state))
1173				return 0;
1174			cpu_relax();
1175		}
1176
1177		/*
1178		 * Ok, time to look more closely! We need the rq
1179		 * lock now, to be *sure*. If we're wrong, we'll
1180		 * just go back and repeat.
1181		 */
1182		rq = task_rq_lock(p, &flags);
1183		trace_sched_wait_task(p);
1184		running = task_running(rq, p);
1185		on_rq = p->on_rq;
1186		ncsw = 0;
1187		if (!match_state || p->state == match_state)
1188			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
1189		task_rq_unlock(rq, p, &flags);
1190
1191		/*
1192		 * If it changed from the expected state, bail out now.
1193		 */
1194		if (unlikely(!ncsw))
1195			break;
1196
1197		/*
1198		 * Was it really running after all now that we
1199		 * checked with the proper locks actually held?
1200		 *
1201		 * Oops. Go back and try again..
1202		 */
1203		if (unlikely(running)) {
1204			cpu_relax();
1205			continue;
1206		}
1207
1208		/*
1209		 * It's not enough that it's not actively running,
1210		 * it must be off the runqueue _entirely_, and not
1211		 * preempted!
1212		 *
1213		 * So if it was still runnable (but just not actively
1214		 * running right now), it's preempted, and we should
1215		 * yield - it could be a while.
1216		 */
1217		if (unlikely(on_rq)) {
1218			ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
1219
1220			set_current_state(TASK_UNINTERRUPTIBLE);
1221			schedule_hrtimeout(&to, HRTIMER_MODE_REL);
1222			continue;
1223		}
1224
1225		/*
1226		 * Ahh, all good. It wasn't running, and it wasn't
1227		 * runnable, which means that it will never become
1228		 * running in the future either. We're all done!
1229		 */
1230		break;
1231	}
1232
1233	return ncsw;
1234}
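/*
 * Illustrative use of the switch-count contract described above (the
 * caller and the state it waits for are hypothetical):
 *
 *	unsigned long ncsw;
 *
 *	ncsw = wait_task_inactive(p, TASK_UNINTERRUPTIBLE);
 *	if (ncsw && wait_task_inactive(p, TASK_UNINTERRUPTIBLE) == ncsw)
 *		;	/* @p stayed off its CPU the whole time */
 *
 * A return of 0 means @p's state no longer matched and the caller must
 * re-evaluate.
 */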
1235
1236/***
1237 * kick_process - kick a running thread to enter/exit the kernel
1238 * @p: the to-be-kicked thread
1239 *
1240 * Cause a process which is running on another CPU to enter
1241 * kernel-mode, without any delay. (to get signals handled.)
1242 *
1243 * NOTE: this function doesn't have to take the runqueue lock,
1244 * because all it wants to ensure is that the remote task enters
1245 * the kernel. If the IPI races and the task has been migrated
1246 * to another CPU then no harm is done and the purpose has been
1247 * achieved as well.
1248 */
1249void kick_process(struct task_struct *p)
1250{
1251	int cpu;
1252
1253	preempt_disable();
1254	cpu = task_cpu(p);
1255	if ((cpu != smp_processor_id()) && task_curr(p))
1256		smp_send_reschedule(cpu);
1257	preempt_enable();
1258}
1259EXPORT_SYMBOL_GPL(kick_process);
1260#endif /* CONFIG_SMP */
1261
1262#ifdef CONFIG_SMP
1263/*
1264 * ->cpus_allowed is protected by both rq->lock and p->pi_lock
1265 */
1266static int select_fallback_rq(int cpu, struct task_struct *p)
1267{
1268	int nid = cpu_to_node(cpu);
1269	const struct cpumask *nodemask = NULL;
1270	enum { cpuset, possible, fail } state = cpuset;
1271	int dest_cpu;
1272
1273	/*
1274	 * If the node that the cpu is on has been offlined, cpu_to_node()
1275	 * will return -1. There is no cpu on the node, so we should
1276	 * select a cpu on another node instead.
1277	 */
1278	if (nid != -1) {
1279		nodemask = cpumask_of_node(nid);
1280
1281		/* Look for allowed, online CPU in same node. */
1282		for_each_cpu(dest_cpu, nodemask) {
1283			if (!cpu_online(dest_cpu))
1284				continue;
1285			if (!cpu_active(dest_cpu))
1286				continue;
1287			if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
1288				return dest_cpu;
1289		}
1290	}
1291
1292	for (;;) {
1293		/* Any allowed, online CPU? */
1294		for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
1295			if (!cpu_online(dest_cpu))
1296				continue;
1297			if (!cpu_active(dest_cpu))
1298				continue;
1299			goto out;
1300		}
1301
1302		switch (state) {
1303		case cpuset:
1304			/* No more Mr. Nice Guy. */
1305			cpuset_cpus_allowed_fallback(p);
1306			state = possible;
1307			break;
1308
1309		case possible:
1310			do_set_cpus_allowed(p, cpu_possible_mask);
1311			state = fail;
1312			break;
1313
1314		case fail:
1315			BUG();
1316			break;
1317		}
1318	}
1319
1320out:
1321	if (state != cpuset) {
1322		/*
1323		 * Don't tell them about moving exiting tasks or
1324		 * kernel threads (both mm NULL), since they never
1325		 * leave kernel.
1326		 */
1327		if (p->mm && printk_ratelimit()) {
1328			printk_sched("process %d (%s) no longer affine to cpu%d\n",
1329					task_pid_nr(p), p->comm, cpu);
1330		}
1331	}
1332
1333	return dest_cpu;
1334}
1335
1336/*
1337 * The caller (fork, wakeup) owns p->pi_lock; ->cpus_allowed is therefore stable.
1338 */
1339static inline
1340int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
1341{
1342	cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
1343
1344	/*
1345	 * In order not to call set_task_cpu() on a blocking task we need
1346	 * to rely on ttwu() to place the task on a valid ->cpus_allowed
1347	 * cpu.
1348	 *
1349	 * Since this is common to all placement strategies, this lives here.
1350	 *
1351	 * [ this allows ->select_task_rq() to simply return task_cpu(p) and
1352	 *   not worry about this generic constraint ]
1353	 */
1354	if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
1355		     !cpu_online(cpu)))
1356		cpu = select_fallback_rq(task_cpu(p), p);
1357
1358	return cpu;
1359}
1360
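/*
 * Exponentially weighted moving average with a 1/8 weight for each new
 * sample: avg += (sample - avg) / 8.  Used below to track rq->avg_idle
 * across wakeups.
 */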
1361static void update_avg(u64 *avg, u64 sample)
1362{
1363	s64 diff = sample - *avg;
1364	*avg += diff >> 3;
1365}
1366#endif
1367
1368static void
1369ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
1370{
1371#ifdef CONFIG_SCHEDSTATS
1372	struct rq *rq = this_rq();
1373
1374#ifdef CONFIG_SMP
1375	int this_cpu = smp_processor_id();
1376
1377	if (cpu == this_cpu) {
1378		schedstat_inc(rq, ttwu_local);
1379		schedstat_inc(p, se.statistics.nr_wakeups_local);
1380	} else {
1381		struct sched_domain *sd;
1382
1383		schedstat_inc(p, se.statistics.nr_wakeups_remote);
1384		rcu_read_lock();
1385		for_each_domain(this_cpu, sd) {
1386			if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
1387				schedstat_inc(sd, ttwu_wake_remote);
1388				break;
1389			}
1390		}
1391		rcu_read_unlock();
1392	}
1393
1394	if (wake_flags & WF_MIGRATED)
1395		schedstat_inc(p, se.statistics.nr_wakeups_migrate);
1396
1397#endif /* CONFIG_SMP */
1398
1399	schedstat_inc(rq, ttwu_count);
1400	schedstat_inc(p, se.statistics.nr_wakeups);
1401
1402	if (wake_flags & WF_SYNC)
1403		schedstat_inc(p, se.statistics.nr_wakeups_sync);
1404
1405#endif /* CONFIG_SCHEDSTATS */
1406}
1407
1408static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
1409{
1410	activate_task(rq, p, en_flags);
1411	p->on_rq = 1;
1412
1413	/* if a worker is waking up, notify workqueue */
1414	if (p->flags & PF_WQ_WORKER)
1415		wq_worker_waking_up(p, cpu_of(rq));
1416}
1417
1418/*
1419 * Mark the task runnable and perform wakeup-preemption.
1420 */
1421static void
1422ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1423{
1424	check_preempt_curr(rq, p, wake_flags);
1425	trace_sched_wakeup(p, true);
1426
1427	p->state = TASK_RUNNING;
1428#ifdef CONFIG_SMP
1429	if (p->sched_class->task_woken)
1430		p->sched_class->task_woken(rq, p);
1431
1432	if (rq->idle_stamp) {
1433		u64 delta = rq_clock(rq) - rq->idle_stamp;
1434		u64 max = 2*rq->max_idle_balance_cost;
1435
1436		update_avg(&rq->avg_idle, delta);
1437
1438		if (rq->avg_idle > max)
1439			rq->avg_idle = max;
1440
1441		rq->idle_stamp = 0;
1442	}
1443#endif
1444}
1445
1446static void
1447ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
1448{
1449#ifdef CONFIG_SMP
1450	if (p->sched_contributes_to_load)
1451		rq->nr_uninterruptible--;
1452#endif
1453
1454	ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
1455	ttwu_do_wakeup(rq, p, wake_flags);
1456}
1457
1458/*
1459 * Called in case the task @p isn't fully descheduled from its runqueue;
1460 * in this case we must do a remote wakeup. It's a 'light' wakeup though,
1461 * since all we need to do is flip p->state to TASK_RUNNING, because
1462 * the task is still ->on_rq.
1463 */
1464static int ttwu_remote(struct task_struct *p, int wake_flags)
1465{
1466	struct rq *rq;
1467	int ret = 0;
1468
1469	rq = __task_rq_lock(p);
1470	if (p->on_rq) {
1471		/* check_preempt_curr() may use rq clock */
1472		update_rq_clock(rq);
1473		ttwu_do_wakeup(rq, p, wake_flags);
1474		ret = 1;
1475	}
1476	__task_rq_unlock(rq);
1477
1478	return ret;
1479}
1480
1481#ifdef CONFIG_SMP
1482static void sched_ttwu_pending(void)
1483{
1484	struct rq *rq = this_rq();
1485	struct llist_node *llist = llist_del_all(&rq->wake_list);
1486	struct task_struct *p;
1487
1488	raw_spin_lock(&rq->lock);
1489
1490	while (llist) {
1491		p = llist_entry(llist, struct task_struct, wake_entry);
1492		llist = llist_next(llist);
1493		ttwu_do_activate(rq, p, 0);
1494	}
1495
1496	raw_spin_unlock(&rq->lock);
1497}
1498
1499void scheduler_ipi(void)
1500{
1501	/*
1502	 * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
1503	 * TIF_NEED_RESCHED remotely (for the first time) will also send
1504	 * this IPI.
1505	 */
1506	preempt_fold_need_resched();
1507
1508	if (llist_empty(&this_rq()->wake_list)
1509			&& !tick_nohz_full_cpu(smp_processor_id())
1510			&& !got_nohz_idle_kick())
1511		return;
1512
1513	/*
1514	 * Not all reschedule IPI handlers call irq_enter/irq_exit, since
1515	 * traditionally all their work was done from the interrupt return
1516	 * path. Now that we actually do some work, we need to make sure
1517	 * we do call them.
1518	 *
1519	 * Some archs already do call them, luckily irq_enter/exit nest
1520	 * properly.
1521	 *
1522	 * Arguably we should visit all archs and update all handlers,
1523	 * however a fair share of IPIs are still resched only so this would
1524	 * somewhat pessimize the simple resched case.
1525	 */
1526	irq_enter();
1527	tick_nohz_full_check();
1528	sched_ttwu_pending();
1529
1530	/*
1531	 * Check if someone kicked us for doing the nohz idle load balance.
1532	 */
1533	if (unlikely(got_nohz_idle_kick())) {
1534		this_rq()->idle_balance = 1;
1535		raise_softirq_irqoff(SCHED_SOFTIRQ);
1536	}
1537	irq_exit();
1538}
1539
1540static void ttwu_queue_remote(struct task_struct *p, int cpu)
1541{
1542	if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
1543		smp_send_reschedule(cpu);
1544}
1545
1546bool cpus_share_cache(int this_cpu, int that_cpu)
1547{
1548	return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
1549}
1550#endif /* CONFIG_SMP */
1551
1552static void ttwu_queue(struct task_struct *p, int cpu)
1553{
1554	struct rq *rq = cpu_rq(cpu);
1555
1556#if defined(CONFIG_SMP)
1557	if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
1558		sched_clock_cpu(cpu); /* sync clocks x-cpu */
1559		ttwu_queue_remote(p, cpu);
1560		return;
1561	}
1562#endif
1563
1564	raw_spin_lock(&rq->lock);
1565	ttwu_do_activate(rq, p, 0);
1566	raw_spin_unlock(&rq->lock);
1567}
1568
1569/**
1570 * try_to_wake_up - wake up a thread
1571 * @p: the thread to be awakened
1572 * @state: the mask of task states that can be woken
1573 * @wake_flags: wake modifier flags (WF_*)
1574 *
1575 * Put it on the run-queue if it's not already there. The "current"
1576 * thread is always on the run-queue (except when the actual
1577 * re-schedule is in progress), and as such you're allowed to do
1578 * the simpler "current->state = TASK_RUNNING" to mark yourself
1579 * runnable without the overhead of this.
1580 *
1581 * Return: %true if @p was woken up, %false if it was already running
1582 * or @state didn't match @p's state.
1583 */
1584static int
1585try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1586{
1587	unsigned long flags;
1588	int cpu, success = 0;
1589
1590	/*
1591	 * If we are going to wake up a thread waiting for CONDITION we
1592	 * need to ensure that CONDITION=1 done by the caller cannot be
1593	 * reordered with p->state check below. This pairs with mb() in
1594	 * set_current_state() the waiting thread does.
1595	 */
1596	smp_mb__before_spinlock();
1597	raw_spin_lock_irqsave(&p->pi_lock, flags);
1598	if (!(p->state & state))
1599		goto out;
1600
1601	success = 1; /* we're going to change ->state */
1602	cpu = task_cpu(p);
1603
1604	if (p->on_rq && ttwu_remote(p, wake_flags))
1605		goto stat;
1606
1607#ifdef CONFIG_SMP
1608	/*
1609	 * If the owning (remote) cpu is still in the middle of schedule() with
1610	 * this task as prev, wait until it's done referencing the task.
1611	 */
1612	while (p->on_cpu)
1613		cpu_relax();
1614	/*
1615	 * Pairs with the smp_wmb() in finish_lock_switch().
1616	 */
1617	smp_rmb();
1618
1619	p->sched_contributes_to_load = !!task_contributes_to_load(p);
1620	p->state = TASK_WAKING;
1621
1622	if (p->sched_class->task_waking)
1623		p->sched_class->task_waking(p);
1624
1625	cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
1626	if (task_cpu(p) != cpu) {
1627		wake_flags |= WF_MIGRATED;
1628		set_task_cpu(p, cpu);
1629	}
1630#endif /* CONFIG_SMP */
1631
1632	ttwu_queue(p, cpu);
1633stat:
1634	ttwu_stat(p, cpu, wake_flags);
1635out:
1636	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
1637
1638	return success;
1639}
1640
1641/**
1642 * try_to_wake_up_local - try to wake up a local task with rq lock held
1643 * @p: the thread to be awakened
1644 *
1645 * Put @p on the run-queue if it's not already there. The caller must
1646 * ensure that this_rq() is locked, @p is bound to this_rq() and not
1647 * the current task.
1648 */
1649static void try_to_wake_up_local(struct task_struct *p)
1650{
1651	struct rq *rq = task_rq(p);
1652
1653	if (WARN_ON_ONCE(rq != this_rq()) ||
1654	    WARN_ON_ONCE(p == current))
1655		return;
1656
1657	lockdep_assert_held(&rq->lock);
1658
1659	if (!raw_spin_trylock(&p->pi_lock)) {
1660		raw_spin_unlock(&rq->lock);
1661		raw_spin_lock(&p->pi_lock);
1662		raw_spin_lock(&rq->lock);
1663	}
1664
1665	if (!(p->state & TASK_NORMAL))
1666		goto out;
1667
1668	if (!p->on_rq)
1669		ttwu_activate(rq, p, ENQUEUE_WAKEUP);
1670
1671	ttwu_do_wakeup(rq, p, 0);
1672	ttwu_stat(p, smp_processor_id(), 0);
1673out:
1674	raw_spin_unlock(&p->pi_lock);
1675}
1676
1677/**
1678 * wake_up_process - Wake up a specific process
1679 * @p: The process to be woken up.
1680 *
1681 * Attempt to wake up the nominated process and move it to the set of runnable
1682 * processes.
1683 *
1684 * Return: 1 if the process was woken up, 0 if it was already running.
1685 *
1686 * It may be assumed that this function implies a write memory barrier before
1687 * changing the task state if and only if any tasks are woken up.
1688 */
1689int wake_up_process(struct task_struct *p)
1690{
1691	WARN_ON(task_is_stopped_or_traced(p));
1692	return try_to_wake_up(p, TASK_NORMAL, 0);
1693}
1694EXPORT_SYMBOL(wake_up_process);
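/*
 * Canonical pairing with the ordering comment in try_to_wake_up()
 * (illustrative sketch; CONDITION is a hypothetical shared flag):
 *
 *	sleeper:					waker:
 *	set_current_state(TASK_UNINTERRUPTIBLE);	CONDITION = 1;
 *	if (!CONDITION)					wake_up_process(p);
 *		schedule();
 *	__set_current_state(TASK_RUNNING);
 *
 * The barrier implied by set_current_state() together with
 * smp_mb__before_spinlock() in try_to_wake_up() ensures the sleeper cannot
 * both miss the CONDITION store and miss the wakeup.
 */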
1695
1696int wake_up_state(struct task_struct *p, unsigned int state)
1697{
1698	return try_to_wake_up(p, state, 0);
1699}
1700
1701/*
1702 * Perform scheduler related setup for a newly forked process p.
1703 * p is forked by current.
1704 *
1705 * __sched_fork() is basic setup used by init_idle() too:
1706 */
1707static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
1708{
1709	p->on_rq			= 0;
1710
1711	p->se.on_rq			= 0;
1712	p->se.exec_start		= 0;
1713	p->se.sum_exec_runtime		= 0;
1714	p->se.prev_sum_exec_runtime	= 0;
1715	p->se.nr_migrations		= 0;
1716	p->se.vruntime			= 0;
1717	INIT_LIST_HEAD(&p->se.group_node);
1718
1719#ifdef CONFIG_SCHEDSTATS
1720	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
1721#endif
1722
1723	RB_CLEAR_NODE(&p->dl.rb_node);
1724	hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1725	p->dl.dl_runtime = p->dl.runtime = 0;
1726	p->dl.dl_deadline = p->dl.deadline = 0;
1727	p->dl.dl_period = 0;
1728	p->dl.flags = 0;
1729
1730	INIT_LIST_HEAD(&p->rt.run_list);
1731
1732#ifdef CONFIG_PREEMPT_NOTIFIERS
1733	INIT_HLIST_HEAD(&p->preempt_notifiers);
1734#endif
1735
1736#ifdef CONFIG_NUMA_BALANCING
1737	if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
1738		p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
1739		p->mm->numa_scan_seq = 0;
1740	}
1741
1742	if (clone_flags & CLONE_VM)
1743		p->numa_preferred_nid = current->numa_preferred_nid;
1744	else
1745		p->numa_preferred_nid = -1;
1746
1747	p->node_stamp = 0ULL;
1748	p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
1749	p->numa_scan_period = sysctl_numa_balancing_scan_delay;
1750	p->numa_work.next = &p->numa_work;
1751	p->numa_faults = NULL;
1752	p->numa_faults_buffer = NULL;
1753
1754	INIT_LIST_HEAD(&p->numa_entry);
1755	p->numa_group = NULL;
1756#endif /* CONFIG_NUMA_BALANCING */
1757}
1758
1759#ifdef CONFIG_NUMA_BALANCING
1760#ifdef CONFIG_SCHED_DEBUG
1761void set_numabalancing_state(bool enabled)
1762{
1763	if (enabled)
1764		sched_feat_set("NUMA");
1765	else
1766		sched_feat_set("NO_NUMA");
1767}
1768#else
1769__read_mostly bool numabalancing_enabled;
1770
1771void set_numabalancing_state(bool enabled)
1772{
1773	numabalancing_enabled = enabled;
1774}
1775#endif /* CONFIG_SCHED_DEBUG */
1776
1777#ifdef CONFIG_PROC_SYSCTL
1778int sysctl_numa_balancing(struct ctl_table *table, int write,
1779			 void __user *buffer, size_t *lenp, loff_t *ppos)
1780{
1781	struct ctl_table t;
1782	int err;
1783	int state = numabalancing_enabled;
1784
1785	if (write && !capable(CAP_SYS_ADMIN))
1786		return -EPERM;
1787
1788	t = *table;
1789	t.data = &state;
1790	err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
1791	if (err < 0)
1792		return err;
1793	if (write)
1794		set_numabalancing_state(state);
1795	return err;
1796}
1797#endif
1798#endif
1799
1800/*
1801 * fork()/clone()-time setup:
1802 */
1803int sched_fork(unsigned long clone_flags, struct task_struct *p)
1804{
1805	unsigned long flags;
1806	int cpu = get_cpu();
1807
1808	__sched_fork(clone_flags, p);
1809	/*
1810	 * We mark the process as running here. This guarantees that
1811	 * nobody will actually run it, and a signal or other external
1812	 * event cannot wake it up and insert it on the runqueue either.
1813	 */
1814	p->state = TASK_RUNNING;
1815
1816	/*
1817	 * Make sure we do not leak PI boosting priority to the child.
1818	 */
1819	p->prio = current->normal_prio;
1820
1821	/*
1822	 * Revert to default priority/policy on fork if requested.
1823	 */
1824	if (unlikely(p->sched_reset_on_fork)) {
1825		if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
1826			p->policy = SCHED_NORMAL;
1827			p->static_prio = NICE_TO_PRIO(0);
1828			p->rt_priority = 0;
1829		} else if (PRIO_TO_NICE(p->static_prio) < 0)
1830			p->static_prio = NICE_TO_PRIO(0);
1831
1832		p->prio = p->normal_prio = __normal_prio(p);
1833		set_load_weight(p);
1834
1835		/*
1836		 * We don't need the reset flag anymore after the fork. It has
1837		 * fulfilled its duty:
1838		 */
1839		p->sched_reset_on_fork = 0;
1840	}
1841
1842	if (dl_prio(p->prio)) {
1843		put_cpu();
1844		return -EAGAIN;
1845	} else if (rt_prio(p->prio)) {
1846		p->sched_class = &rt_sched_class;
1847	} else {
1848		p->sched_class = &fair_sched_class;
1849	}
1850
1851	if (p->sched_class->task_fork)
1852		p->sched_class->task_fork(p);
1853
1854	/*
1855	 * The child is not yet in the pid-hash so no cgroup attach races,
1856	 * and the cgroup is pinned to this child because cgroup_fork()
1857	 * is run before sched_fork().
1858	 *
1859	 * Silence PROVE_RCU.
1860	 */
1861	raw_spin_lock_irqsave(&p->pi_lock, flags);
1862	set_task_cpu(p, cpu);
1863	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
1864
1865#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1866	if (likely(sched_info_on()))
1867		memset(&p->sched_info, 0, sizeof(p->sched_info));
1868#endif
1869#if defined(CONFIG_SMP)
1870	p->on_cpu = 0;
1871#endif
1872	init_task_preempt_count(p);
1873#ifdef CONFIG_SMP
1874	plist_node_init(&p->pushable_tasks, MAX_PRIO);
1875	RB_CLEAR_NODE(&p->pushable_dl_tasks);
1876#endif
1877
1878	put_cpu();
1879	return 0;
1880}
1881
1882unsigned long to_ratio(u64 period, u64 runtime)
1883{
1884	if (runtime == RUNTIME_INF)
1885		return 1ULL << 20;
1886
1887	/*
1888	 * Doing this here saves a lot of checks in all
1889	 * the calling paths, and returning zero seems
1890	 * safe for them anyway.
1891	 */
1892	if (period == 0)
1893		return 0;
1894
1895	return div64_u64(runtime << 20, period);
1896}
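/*
 * Example: a runtime of 10 ms out of a 100 ms period maps to
 * (10 << 20) / 100 = 104857, i.e. roughly 10% of the 1 << 20 scale that
 * the deadline bandwidth accounting below works with.
 */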
1897
1898#ifdef CONFIG_SMP
1899inline struct dl_bw *dl_bw_of(int i)
1900{
1901	return &cpu_rq(i)->rd->dl_bw;
1902}
1903
1904static inline int dl_bw_cpus(int i)
1905{
1906	struct root_domain *rd = cpu_rq(i)->rd;
1907	int cpus = 0;
1908
1909	for_each_cpu_and(i, rd->span, cpu_active_mask)
1910		cpus++;
1911
1912	return cpus;
1913}
1914#else
1915inline struct dl_bw *dl_bw_of(int i)
1916{
1917	return &cpu_rq(i)->dl.dl_bw;
1918}
1919
1920static inline int dl_bw_cpus(int i)
1921{
1922	return 1;
1923}
1924#endif
1925
1926static inline
1927void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
1928{
1929	dl_b->total_bw -= tsk_bw;
1930}
1931
1932static inline
1933void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
1934{
1935	dl_b->total_bw += tsk_bw;
1936}
1937
1938static inline
1939bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
1940{
1941	return dl_b->bw != -1 &&
1942	       dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
1943}
1944
1945/*
1946 * We must be sure that accepting a new task (or allowing changing the
1947 * parameters of an existing one) is consistent with the bandwidth
1948 * constraints. If yes, this function also accordingly updates the currently
1949 * allocated bandwidth to reflect the new situation.
1950 *
1951 * This function is called while holding p's rq->lock.
1952 */
1953static int dl_overflow(struct task_struct *p, int policy,
1954		       const struct sched_attr *attr)
1955{
1956
1957	struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
1958	u64 period = attr->sched_period;
1959	u64 runtime = attr->sched_runtime;
1960	u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
1961	int cpus, err = -1;
1962
1963	if (new_bw == p->dl.dl_bw)
1964		return 0;
1965
1966	/*
1967	 * Whether a task enters, leaves, or stays -deadline but changes
1968	 * its parameters, we may need to update the total allocated
1969	 * bandwidth of the container accordingly.
1970	 */
1971	raw_spin_lock(&dl_b->lock);
1972	cpus = dl_bw_cpus(task_cpu(p));
1973	if (dl_policy(policy) && !task_has_dl_policy(p) &&
1974	    !__dl_overflow(dl_b, cpus, 0, new_bw)) {
1975		__dl_add(dl_b, new_bw);
1976		err = 0;
1977	} else if (dl_policy(policy) && task_has_dl_policy(p) &&
1978		   !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
1979		__dl_clear(dl_b, p->dl.dl_bw);
1980		__dl_add(dl_b, new_bw);
1981		err = 0;
1982	} else if (!dl_policy(policy) && task_has_dl_policy(p)) {
1983		__dl_clear(dl_b, p->dl.dl_bw);
1984		err = 0;
1985	}
1986	raw_spin_unlock(&dl_b->lock);
1987
1988	return err;
1989}
1990
1991extern void init_dl_bw(struct dl_bw *dl_b);
1992
1993/*
1994 * wake_up_new_task - wake up a newly created task for the first time.
1995 *
1996 * This function will do some initial scheduler statistics housekeeping
1997 * that must be done for every newly created context, then puts the task
1998 * on the runqueue and wakes it.
1999 */
2000void wake_up_new_task(struct task_struct *p)
2001{
2002	unsigned long flags;
2003	struct rq *rq;
2004
2005	raw_spin_lock_irqsave(&p->pi_lock, flags);
2006#ifdef CONFIG_SMP
2007	/*
2008	 * Fork balancing, do it here and not earlier because:
2009	 *  - cpus_allowed can change in the fork path
2010	 *  - any previously selected cpu might disappear through hotplug
2011	 */
2012	set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
2013#endif
2014
2015	/* Initialize new task's runnable average */
2016	init_task_runnable_average(p);
2017	rq = __task_rq_lock(p);
2018	activate_task(rq, p, 0);
2019	p->on_rq = 1;
2020	trace_sched_wakeup_new(p, true);
2021	check_preempt_curr(rq, p, WF_FORK);
2022#ifdef CONFIG_SMP
2023	if (p->sched_class->task_woken)
2024		p->sched_class->task_woken(rq, p);
2025#endif
2026	task_rq_unlock(rq, p, &flags);
2027}
2028
2029#ifdef CONFIG_PREEMPT_NOTIFIERS
2030
2031/**
2032 * preempt_notifier_register - tell me when current is being preempted & rescheduled
2033 * @notifier: notifier struct to register
2034 */
2035void preempt_notifier_register(struct preempt_notifier *notifier)
2036{
2037	hlist_add_head(&notifier->link, &current->preempt_notifiers);
2038}
2039EXPORT_SYMBOL_GPL(preempt_notifier_register);
2040
2041/**
2042 * preempt_notifier_unregister - no longer interested in preemption notifications
2043 * @notifier: notifier struct to unregister
2044 *
2045 * This is safe to call from within a preemption notifier.
2046 */
2047void preempt_notifier_unregister(struct preempt_notifier *notifier)
2048{
2049	hlist_del(&notifier->link);
2050}
2051EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
2052
2053static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2054{
2055	struct preempt_notifier *notifier;
2056
2057	hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
2058		notifier->ops->sched_in(notifier, raw_smp_processor_id());
2059}
2060
2061static void
2062fire_sched_out_preempt_notifiers(struct task_struct *curr,
2063				 struct task_struct *next)
2064{
2065	struct preempt_notifier *notifier;
2066
2067	hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
2068		notifier->ops->sched_out(notifier, next);
2069}
2070
2071#else /* !CONFIG_PREEMPT_NOTIFIERS */
2072
2073static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2074{
2075}
2076
2077static void
2078fire_sched_out_preempt_notifiers(struct task_struct *curr,
2079				 struct task_struct *next)
2080{
2081}
2082
2083#endif /* CONFIG_PREEMPT_NOTIFIERS */
2084
2085/**
2086 * prepare_task_switch - prepare to switch tasks
2087 * @rq: the runqueue preparing to switch
2088 * @prev: the current task that is being switched out
2089 * @next: the task we are going to switch to.
2090 *
2091 * This is called with the rq lock held and interrupts off. It must
2092 * be paired with a subsequent finish_task_switch after the context
2093 * switch.
2094 *
2095 * prepare_task_switch sets up locking and calls architecture specific
2096 * hooks.
2097 */
2098static inline void
2099prepare_task_switch(struct rq *rq, struct task_struct *prev,
2100		    struct task_struct *next)
2101{
2102	trace_sched_switch(prev, next);
2103	sched_info_switch(rq, prev, next);
2104	perf_event_task_sched_out(prev, next);
2105	fire_sched_out_preempt_notifiers(prev, next);
2106	prepare_lock_switch(rq, next);
2107	prepare_arch_switch(next);
2108}
2109
2110/**
2111 * finish_task_switch - clean up after a task-switch
2112 * @rq: runqueue associated with task-switch
2113 * @prev: the thread we just switched away from.
2114 *
2115 * finish_task_switch must be called after the context switch, paired
2116 * with a prepare_task_switch call before the context switch.
2117 * finish_task_switch will reconcile locking set up by prepare_task_switch,
2118 * and do any other architecture-specific cleanup actions.
2119 *
2120 * Note that we may have delayed dropping an mm in context_switch(). If
2121 * so, we finish that here outside of the runqueue lock. (Doing it
2122 * with the lock held can cause deadlocks; see schedule() for
2123 * details.)
2124 */
2125static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2126	__releases(rq->lock)
2127{
2128	struct mm_struct *mm = rq->prev_mm;
2129	long prev_state;
2130
2131	rq->prev_mm = NULL;
2132
2133	/*
2134	 * A task struct has one reference for its use as "current".
2135	 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
2136	 * schedule one last time. The schedule call will never return, and
2137	 * the scheduled task must drop that reference.
2138	 * The test for TASK_DEAD must occur while the runqueue locks are
2139	 * still held, otherwise prev could be scheduled on another cpu, die
2140	 * there before we look at prev->state, and then the reference would
2141	 * be dropped twice.
2142	 *		Manfred Spraul <manfred@colorfullife.com>
2143	 */
2144	prev_state = prev->state;
2145	vtime_task_switch(prev);
2146	finish_arch_switch(prev);
2147	perf_event_task_sched_in(prev, current);
2148	finish_lock_switch(rq, prev);
2149	finish_arch_post_lock_switch();
2150
2151	fire_sched_in_preempt_notifiers(current);
2152	if (mm)
2153		mmdrop(mm);
2154	if (unlikely(prev_state == TASK_DEAD)) {
2155		task_numa_free(prev);
2156
2157		if (prev->sched_class->task_dead)
2158			prev->sched_class->task_dead(prev);
2159
2160		/*
2161		 * Remove function-return probe instances associated with this
2162		 * task and put them back on the free list.
2163		 */
2164		kprobe_flush_task(prev);
2165		put_task_struct(prev);
2166	}
2167
2168	tick_nohz_task_switch(current);
2169}
2170
2171#ifdef CONFIG_SMP
2172
2173/* assumes rq->lock is held */
2174static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
2175{
2176	if (prev->sched_class->pre_schedule)
2177		prev->sched_class->pre_schedule(rq, prev);
2178}
2179
2180/* rq->lock is NOT held, but preemption is disabled */
2181static inline void post_schedule(struct rq *rq)
2182{
2183	if (rq->post_schedule) {
2184		unsigned long flags;
2185
2186		raw_spin_lock_irqsave(&rq->lock, flags);
2187		if (rq->curr->sched_class->post_schedule)
2188			rq->curr->sched_class->post_schedule(rq);
2189		raw_spin_unlock_irqrestore(&rq->lock, flags);
2190
2191		rq->post_schedule = 0;
2192	}
2193}
2194
2195#else
2196
2197static inline void pre_schedule(struct rq *rq, struct task_struct *p)
2198{
2199}
2200
2201static inline void post_schedule(struct rq *rq)
2202{
2203}
2204
2205#endif
2206
2207/**
2208 * schedule_tail - first thing a freshly forked thread must call.
2209 * @prev: the thread we just switched away from.
2210 */
2211asmlinkage void schedule_tail(struct task_struct *prev)
2212	__releases(rq->lock)
2213{
2214	struct rq *rq = this_rq();
2215
2216	finish_task_switch(rq, prev);
2217
2218	/*
2219	 * FIXME: do we need to worry about rq being invalidated by the
2220	 * task_switch?
2221	 */
2222	post_schedule(rq);
2223
2224#ifdef __ARCH_WANT_UNLOCKED_CTXSW
2225	/* In this case, finish_task_switch does not reenable preemption */
2226	preempt_enable();
2227#endif
2228	if (current->set_child_tid)
2229		put_user(task_pid_vnr(current), current->set_child_tid);
2230}
2231
2232/*
2233 * context_switch - switch to the new MM and the new
2234 * thread's register state.
2235 */
2236static inline void
2237context_switch(struct rq *rq, struct task_struct *prev,
2238	       struct task_struct *next)
2239{
2240	struct mm_struct *mm, *oldmm;
2241
2242	prepare_task_switch(rq, prev, next);
2243
2244	mm = next->mm;
2245	oldmm = prev->active_mm;
2246	/*
2247	 * For paravirt, this is coupled with an exit in switch_to to
2248	 * combine the page table reload and the switch backend into
2249	 * one hypercall.
2250	 */
2251	arch_start_context_switch(prev);
2252
2253	if (!mm) {
2254		next->active_mm = oldmm;
2255		atomic_inc(&oldmm->mm_count);
2256		enter_lazy_tlb(oldmm, next);
2257	} else
2258		switch_mm(oldmm, mm, next);
2259
2260	if (!prev->mm) {
2261		prev->active_mm = NULL;
2262		rq->prev_mm = oldmm;
2263	}
2264	/*
2265	 * The runqueue lock will be released by the next task (which
2266	 * is an invalid locking op but in the case of the scheduler
2267	 * it's an obvious special case), so we do an early lockdep
2268	 * release here:
2269	 */
2270#ifndef __ARCH_WANT_UNLOCKED_CTXSW
2271	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
2272#endif
2273
2274	context_tracking_task_switch(prev, next);
2275	/* Here we just switch the register state and the stack. */
2276	switch_to(prev, next, prev);
2277
2278	barrier();
2279	/*
2280	 * this_rq must be evaluated again because prev may have moved
2281	 * CPUs since it called schedule(), thus the 'rq' on its stack
2282	 * frame will be invalid.
2283	 */
2284	finish_task_switch(this_rq(), prev);
2285}
2286
2287/*
2288 * nr_running and nr_context_switches:
2289 *
2290 * externally visible scheduler statistics: current number of runnable
2291 * threads, total number of context switches performed since bootup.
2292 */
2293unsigned long nr_running(void)
2294{
2295	unsigned long i, sum = 0;
2296
2297	for_each_online_cpu(i)
2298		sum += cpu_rq(i)->nr_running;
2299
2300	return sum;
2301}
2302
2303unsigned long long nr_context_switches(void)
2304{
2305	int i;
2306	unsigned long long sum = 0;
2307
2308	for_each_possible_cpu(i)
2309		sum += cpu_rq(i)->nr_switches;
2310
2311	return sum;
2312}
2313
2314unsigned long nr_iowait(void)
2315{
2316	unsigned long i, sum = 0;
2317
2318	for_each_possible_cpu(i)
2319		sum += atomic_read(&cpu_rq(i)->nr_iowait);
2320
2321	return sum;
2322}
2323
2324unsigned long nr_iowait_cpu(int cpu)
2325{
2326	struct rq *this = cpu_rq(cpu);
2327	return atomic_read(&this->nr_iowait);
2328}
2329
2330#ifdef CONFIG_SMP
2331
2332/*
2333 * sched_exec - execve() is a valuable balancing opportunity, because at
2334 * this point the task has the smallest effective memory and cache footprint.
2335 */
2336void sched_exec(void)
2337{
2338	struct task_struct *p = current;
2339	unsigned long flags;
2340	int dest_cpu;
2341
2342	raw_spin_lock_irqsave(&p->pi_lock, flags);
2343	dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
2344	if (dest_cpu == smp_processor_id())
2345		goto unlock;
2346
2347	if (likely(cpu_active(dest_cpu))) {
2348		struct migration_arg arg = { p, dest_cpu };
2349
2350		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2351		stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
2352		return;
2353	}
2354unlock:
2355	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2356}
2357
2358#endif
2359
2360DEFINE_PER_CPU(struct kernel_stat, kstat);
2361DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
2362
2363EXPORT_PER_CPU_SYMBOL(kstat);
2364EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
2365
2366/*
2367 * Return any ns on the sched_clock that have not yet been accounted in
2368 * @p in case that task is currently running.
2369 *
2370 * Called with task_rq_lock() held on @rq.
2371 */
2372static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
2373{
2374	u64 ns = 0;
2375
2376	if (task_current(rq, p)) {
2377		update_rq_clock(rq);
2378		ns = rq_clock_task(rq) - p->se.exec_start;
2379		if ((s64)ns < 0)
2380			ns = 0;
2381	}
2382
2383	return ns;
2384}
2385
2386unsigned long long task_delta_exec(struct task_struct *p)
2387{
2388	unsigned long flags;
2389	struct rq *rq;
2390	u64 ns = 0;
2391
2392	rq = task_rq_lock(p, &flags);
2393	ns = do_task_delta_exec(p, rq);
2394	task_rq_unlock(rq, p, &flags);
2395
2396	return ns;
2397}
2398
2399/*
2400 * Return accounted runtime for the task.
2401 * In case the task is currently running, return the runtime plus current's
2402 * pending runtime that has not been accounted yet.
2403 */
2404unsigned long long task_sched_runtime(struct task_struct *p)
2405{
2406	unsigned long flags;
2407	struct rq *rq;
2408	u64 ns = 0;
2409
2410#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
2411	 * 64-bit doesn't need locks to atomically read a 64-bit value.
2412	 * So we have an optimization chance when the task's delta_exec is 0.
2413	 * Reading ->on_cpu is racy, but this is ok.
2414	 *
2415	 * If we race with it leaving the CPU, we'll take a lock, so we're correct.
2416	 * If we race with it entering the CPU, unaccounted time is 0. This is
2417	 * indistinguishable from the read occurring a few cycles earlier.
2418	 * indistinguishable from the read occurring a few cycles earlier.
2419	 */
2420	if (!p->on_cpu)
2421		return p->se.sum_exec_runtime;
2422#endif
2423
2424	rq = task_rq_lock(p, &flags);
2425	ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
2426	task_rq_unlock(rq, p, &flags);
2427
2428	return ns;
2429}
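
/*
 * Illustrative sketch (not part of this file): sampling a task's
 * cumulative CPU time from elsewhere in the kernel. The value is in
 * nanoseconds and, per the comment above, includes any not-yet-accounted
 * runtime if @p happens to be running:
 *
 *	unsigned long long ns = task_sched_runtime(p);
 *
 *	pr_debug("%s/%d has run for %llu ns\n", p->comm, p->pid, ns);
 */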
2430
2431/*
2432 * This function gets called by the timer code, with HZ frequency.
2433 * We call it with interrupts disabled.
2434 */
2435void scheduler_tick(void)
2436{
2437	int cpu = smp_processor_id();
2438	struct rq *rq = cpu_rq(cpu);
2439	struct task_struct *curr = rq->curr;
2440
2441	sched_clock_tick();
2442
2443	raw_spin_lock(&rq->lock);
2444	update_rq_clock(rq);
2445	curr->sched_class->task_tick(rq, curr, 0);
2446	update_cpu_load_active(rq);
2447	raw_spin_unlock(&rq->lock);
2448
2449	perf_event_task_tick();
2450
2451#ifdef CONFIG_SMP
2452	rq->idle_balance = idle_cpu(cpu);
2453	trigger_load_balance(rq);
2454#endif
2455	rq_last_tick_reset(rq);
2456}
2457
2458#ifdef CONFIG_NO_HZ_FULL
2459/**
2460 * scheduler_tick_max_deferment
2461 *
2462 * Keep at least one tick per second when a single
2463 * active task is running because the scheduler doesn't
2464 * yet completely support a full dynticks environment.
2465 *
2466 * This makes sure that uptime, CFS vruntime, load
2467 * balancing, etc... continue to move forward, even
2468 * with a very low granularity.
2469 *
2470 * Return: Maximum deferment in nanoseconds.
2471 */
2472u64 scheduler_tick_max_deferment(void)
2473{
2474	struct rq *rq = this_rq();
2475	unsigned long next, now = ACCESS_ONCE(jiffies);
2476
2477	next = rq->last_sched_tick + HZ;
2478
2479	if (time_before_eq(next, now))
2480		return 0;
2481
2482	return jiffies_to_nsecs(next - now);
2483}
2484#endif
2485
2486notrace unsigned long get_parent_ip(unsigned long addr)
2487{
2488	if (in_lock_functions(addr)) {
2489		addr = CALLER_ADDR2;
2490		if (in_lock_functions(addr))
2491			addr = CALLER_ADDR3;
2492	}
2493	return addr;
2494}
2495
2496#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
2497				defined(CONFIG_PREEMPT_TRACER))
2498
2499void __kprobes preempt_count_add(int val)
2500{
2501#ifdef CONFIG_DEBUG_PREEMPT
2502	/*
2503	 * Underflow?
2504	 */
2505	if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
2506		return;
2507#endif
2508	__preempt_count_add(val);
2509#ifdef CONFIG_DEBUG_PREEMPT
2510	/*
2511	 * Spinlock count overflowing soon?
2512	 */
2513	DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
2514				PREEMPT_MASK - 10);
2515#endif
2516	if (preempt_count() == val)
2517		trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
2518}
2519EXPORT_SYMBOL(preempt_count_add);
2520
2521void __kprobes preempt_count_sub(int val)
2522{
2523#ifdef CONFIG_DEBUG_PREEMPT
2524	/*
2525	 * Underflow?
2526	 */
2527	if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
2528		return;
2529	/*
2530	 * Is the spinlock portion underflowing?
2531	 */
2532	if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
2533			!(preempt_count() & PREEMPT_MASK)))
2534		return;
2535#endif
2536
2537	if (preempt_count() == val)
2538		trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
2539	__preempt_count_sub(val);
2540}
2541EXPORT_SYMBOL(preempt_count_sub);
2542
2543#endif
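
/*
 * Illustrative sketch (not part of this file): with this debug config,
 * the usual nesting helpers expand to these functions -
 * preempt_disable() boils down to preempt_count_add(1) and
 * preempt_enable() to preempt_count_sub(1) plus a resched check - e.g.
 * around a preemption-off critical section (my_percpu_work() is a
 * made-up helper):
 *
 *	preempt_disable();
 *	my_percpu_work();
 *	preempt_enable();
 */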
2544
2545/*
2546 * Print scheduling while atomic bug:
2547 */
2548static noinline void __schedule_bug(struct task_struct *prev)
2549{
2550	if (oops_in_progress)
2551		return;
2552
2553	printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
2554		prev->comm, prev->pid, preempt_count());
2555
2556	debug_show_held_locks(prev);
2557	print_modules();
2558	if (irqs_disabled())
2559		print_irqtrace_events(prev);
2560	dump_stack();
2561	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
2562}
2563
2564/*
2565 * Various schedule()-time debugging checks and statistics:
2566 */
2567static inline void schedule_debug(struct task_struct *prev)
2568{
2569	/*
2570	 * Test if we are atomic. Since do_exit() needs to call into
2571	 * schedule() atomically, we ignore that path. Otherwise whine
2572	 * if we are scheduling when we should not.
2573	 */
2574	if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD))
2575		__schedule_bug(prev);
2576	rcu_sleep_check();
2577
2578	profile_hit(SCHED_PROFILING, __builtin_return_address(0));
2579
2580	schedstat_inc(this_rq(), sched_count);
2581}
2582
2583static void put_prev_task(struct rq *rq, struct task_struct *prev)
2584{
2585	if (prev->on_rq || rq->skip_clock_update < 0)
2586		update_rq_clock(rq);
2587	prev->sched_class->put_prev_task(rq, prev);
2588}
2589
2590/*
2591 * Pick up the highest-prio task:
2592 */
2593static inline struct task_struct *
2594pick_next_task(struct rq *rq)
2595{
2596	const struct sched_class *class;
2597	struct task_struct *p;
2598
2599	/*
2600	 * Optimization: we know that if all tasks are in
2601	 * the fair class we can call that function directly:
2602	 */
2603	if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
2604		p = fair_sched_class.pick_next_task(rq);
2605		if (likely(p))
2606			return p;
2607	}
2608
2609	for_each_class(class) {
2610		p = class->pick_next_task(rq);
2611		if (p)
2612			return p;
2613	}
2614
2615	BUG(); /* the idle class will always have a runnable task */
2616}
2617
2618/*
2619 * __schedule() is the main scheduler function.
2620 *
2621 * The main means of driving the scheduler and thus entering this function are:
2622 *
2623 *   1. Explicit blocking: mutex, semaphore, waitqueue, etc.
2624 *
2625 *   2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
2626 *      paths. For example, see arch/x86/entry_64.S.
2627 *
2628 *      To drive preemption between tasks, the scheduler sets the flag in timer
2629 *      interrupt handler scheduler_tick().
2630 *
2631 *   3. Wakeups don't really cause entry into schedule(). They add a
2632 *      task to the run-queue and that's it.
2633 *
2634 *      Now, if the new task added to the run-queue preempts the current
2635 *      task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
2636 *      called on the nearest possible occasion:
2637 *
2638 *       - If the kernel is preemptible (CONFIG_PREEMPT=y):
2639 *
2640 *         - in syscall or exception context, at the next outermost
2641 *           preempt_enable(). (this might be as soon as the wake_up()'s
2642 *           spin_unlock()!)
2643 *
2644 *         - in IRQ context, return from interrupt-handler to
2645 *           preemptible context
2646 *
2647 *       - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
2648 *         then at the next:
2649 *
2650 *          - cond_resched() call
2651 *          - explicit schedule() call
2652 *          - return from syscall or exception to user-space
2653 *          - return from interrupt-handler to user-space
2654 */
2655static void __sched __schedule(void)
2656{
2657	struct task_struct *prev, *next;
2658	unsigned long *switch_count;
2659	struct rq *rq;
2660	int cpu;
2661
2662need_resched:
2663	preempt_disable();
2664	cpu = smp_processor_id();
2665	rq = cpu_rq(cpu);
2666	rcu_note_context_switch(cpu);
2667	prev = rq->curr;
2668
2669	schedule_debug(prev);
2670
2671	if (sched_feat(HRTICK))
2672		hrtick_clear(rq);
2673
2674	/*
2675	 * Make sure that signal_pending_state()->signal_pending() below
2676	 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
2677	 * done by the caller to avoid the race with signal_wake_up().
2678	 */
2679	smp_mb__before_spinlock();
2680	raw_spin_lock_irq(&rq->lock);
2681
2682	switch_count = &prev->nivcsw;
2683	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
2684		if (unlikely(signal_pending_state(prev->state, prev))) {
2685			prev->state = TASK_RUNNING;
2686		} else {
2687			deactivate_task(rq, prev, DEQUEUE_SLEEP);
2688			prev->on_rq = 0;
2689
2690			/*
2691			 * If a worker went to sleep, notify and ask workqueue
2692			 * whether it wants to wake up a task to maintain
2693			 * concurrency.
2694			 */
2695			if (prev->flags & PF_WQ_WORKER) {
2696				struct task_struct *to_wakeup;
2697
2698				to_wakeup = wq_worker_sleeping(prev, cpu);
2699				if (to_wakeup)
2700					try_to_wake_up_local(to_wakeup);
2701			}
2702		}
2703		switch_count = &prev->nvcsw;
2704	}
2705
2706	pre_schedule(rq, prev);
2707
2708	if (unlikely(!rq->nr_running))
2709		idle_balance(cpu, rq);
2710
2711	put_prev_task(rq, prev);
2712	next = pick_next_task(rq);
2713	clear_tsk_need_resched(prev);
2714	clear_preempt_need_resched();
2715	rq->skip_clock_update = 0;
2716
2717	if (likely(prev != next)) {
2718		rq->nr_switches++;
2719		rq->curr = next;
2720		++*switch_count;
2721
2722		context_switch(rq, prev, next); /* unlocks the rq */
2723		/*
2724		 * The context switch has flipped the stack from under us
2725		 * and restored the local variables which were saved when
2726		 * this task called schedule() in the past. prev == current
2727		 * is still correct, but it may have moved to another cpu/rq.
2728		 */
2729		cpu = smp_processor_id();
2730		rq = cpu_rq(cpu);
2731	} else
2732		raw_spin_unlock_irq(&rq->lock);
2733
2734	post_schedule(rq);
2735
2736	sched_preempt_enable_no_resched();
2737	if (need_resched())
2738		goto need_resched;
2739}
2740
2741static inline void sched_submit_work(struct task_struct *tsk)
2742{
2743	if (!tsk->state || tsk_is_pi_blocked(tsk))
2744		return;
2745	/*
2746	 * If we are going to sleep and we have plugged IO queued,
2747	 * make sure to submit it to avoid deadlocks.
2748	 */
2749	if (blk_needs_flush_plug(tsk))
2750		blk_schedule_flush_plug(tsk);
2751}
2752
2753asmlinkage void __sched schedule(void)
2754{
2755	struct task_struct *tsk = current;
2756
2757	sched_submit_work(tsk);
2758	__schedule();
2759}
2760EXPORT_SYMBOL(schedule);
2761
2762#ifdef CONFIG_CONTEXT_TRACKING
2763asmlinkage void __sched schedule_user(void)
2764{
2765	/*
2766	 * If we come here after a random call to set_need_resched(),
2767	 * or we have been woken up remotely but the IPI has not yet arrived,
2768	 * we haven't yet exited the RCU idle mode. Do it here manually until
2769	 * we find a better solution.
2770	 */
2771	user_exit();
2772	schedule();
2773	user_enter();
2774}
2775#endif
2776
2777/**
2778 * schedule_preempt_disabled - called with preemption disabled
2779 *
2780 * Returns with preemption disabled. Note: preempt_count must be 1
2781 */
2782void __sched schedule_preempt_disabled(void)
2783{
2784	sched_preempt_enable_no_resched();
2785	schedule();
2786	preempt_disable();
2787}
2788
2789#ifdef CONFIG_PREEMPT
2790/*
2791 * This is the entry point to schedule() for in-kernel preemption
2792 * triggered by preempt_enable(). Kernel preemption off the
2793 * return-from-interrupt path enters via preempt_schedule_irq() below instead.
2794 */
2795asmlinkage void __sched notrace preempt_schedule(void)
2796{
2797	/*
2798	 * If there is a non-zero preempt_count or interrupts are disabled,
2799	 * we do not want to preempt the current task. Just return.
2800	 */
2801	if (likely(!preemptible()))
2802		return;
2803
2804	do {
2805		__preempt_count_add(PREEMPT_ACTIVE);
2806		__schedule();
2807		__preempt_count_sub(PREEMPT_ACTIVE);
2808
2809		/*
2810		 * Check again in case we missed a preemption opportunity
2811		 * between schedule and now.
2812		 */
2813		barrier();
2814	} while (need_resched());
2815}
2816EXPORT_SYMBOL(preempt_schedule);
2817#endif /* CONFIG_PREEMPT */
2818
2819/*
2820 * This is the entry point to schedule() from kernel preemption
2821 * off of IRQ context.
2822 * Note that this is called and returns with IRQs disabled. This
2823 * protects us against recursive calls from IRQ context.
2824 */
2825asmlinkage void __sched preempt_schedule_irq(void)
2826{
2827	enum ctx_state prev_state;
2828
2829	/* Catch callers which need to be fixed */
2830	BUG_ON(preempt_count() || !irqs_disabled());
2831
2832	prev_state = exception_enter();
2833
2834	do {
2835		__preempt_count_add(PREEMPT_ACTIVE);
2836		local_irq_enable();
2837		__schedule();
2838		local_irq_disable();
2839		__preempt_count_sub(PREEMPT_ACTIVE);
2840
2841		/*
2842		 * Check again in case we missed a preemption opportunity
2843		 * between schedule and now.
2844		 */
2845		barrier();
2846	} while (need_resched());
2847
2848	exception_exit(prev_state);
2849}
2850
2851int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
2852			  void *key)
2853{
2854	return try_to_wake_up(curr->private, mode, wake_flags);
2855}
2856EXPORT_SYMBOL(default_wake_function);
2857
2858static long __sched
2859sleep_on_common(wait_queue_head_t *q, int state, long timeout)
2860{
2861	unsigned long flags;
2862	wait_queue_t wait;
2863
2864	init_waitqueue_entry(&wait, current);
2865
2866	__set_current_state(state);
2867
2868	spin_lock_irqsave(&q->lock, flags);
2869	__add_wait_queue(q, &wait);
2870	spin_unlock(&q->lock);
2871	timeout = schedule_timeout(timeout);
2872	spin_lock_irq(&q->lock);
2873	__remove_wait_queue(q, &wait);
2874	spin_unlock_irqrestore(&q->lock, flags);
2875
2876	return timeout;
2877}
2878
2879void __sched interruptible_sleep_on(wait_queue_head_t *q)
2880{
2881	sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
2882}
2883EXPORT_SYMBOL(interruptible_sleep_on);
2884
2885long __sched
2886interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
2887{
2888	return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
2889}
2890EXPORT_SYMBOL(interruptible_sleep_on_timeout);
2891
2892void __sched sleep_on(wait_queue_head_t *q)
2893{
2894	sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
2895}
2896EXPORT_SYMBOL(sleep_on);
2897
2898long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
2899{
2900	return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
2901}
2902EXPORT_SYMBOL(sleep_on_timeout);
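
/*
 * Illustrative sketch (not part of this file): the sleep_on() family is
 * inherently racy (the condition can change between the caller's check
 * and the sleep), so new code is expected to use the wait_event()
 * helpers instead. "my_wq" and "my_cond" are made-up names:
 *
 *	static DECLARE_WAIT_QUEUE_HEAD(my_wq);
 *	static int my_cond;
 *
 *	The sleeper:
 *		wait_event_interruptible(my_wq, my_cond != 0);
 *
 *	The waker:
 *		my_cond = 1;
 *		wake_up_interruptible(&my_wq);
 */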
2903
2904#ifdef CONFIG_RT_MUTEXES
2905
2906/*
2907 * rt_mutex_setprio - set the current priority of a task
2908 * @p: task
2909 * @prio: prio value (kernel-internal form)
2910 *
2911 * This function changes the 'effective' priority of a task. It does
2912 * not touch ->normal_prio like __setscheduler().
2913 *
2914 * Used by the rt_mutex code to implement priority inheritance logic.
2915 */
2916void rt_mutex_setprio(struct task_struct *p, int prio)
2917{
2918	int oldprio, on_rq, running, enqueue_flag = 0;
2919	struct rq *rq;
2920	const struct sched_class *prev_class;
2921
2922	BUG_ON(prio > MAX_PRIO);
2923
2924	rq = __task_rq_lock(p);
2925
2926	/*
2927	 * Idle task boosting is a no-no in general. There is one
2928	 * exception, when PREEMPT_RT and NOHZ are active:
2929	 *
2930	 * The idle task calls get_next_timer_interrupt() and holds
2931	 * the timer wheel base->lock on the CPU and another CPU wants
2932	 * to access the timer (probably to cancel it). We can safely
2933	 * ignore the boosting request, as the idle CPU runs this code
2934	 * with interrupts disabled and will complete the lock
2935	 * protected section without being interrupted. So there is no
2936	 * real need to boost.
2937	 */
2938	if (unlikely(p == rq->idle)) {
2939		WARN_ON(p != rq->curr);
2940		WARN_ON(p->pi_blocked_on);
2941		goto out_unlock;
2942	}
2943
2944	trace_sched_pi_setprio(p, prio);
2945	p->pi_top_task = rt_mutex_get_top_task(p);
2946	oldprio = p->prio;
2947	prev_class = p->sched_class;
2948	on_rq = p->on_rq;
2949	running = task_current(rq, p);
2950	if (on_rq)
2951		dequeue_task(rq, p, 0);
2952	if (running)
2953		p->sched_class->put_prev_task(rq, p);
2954
2955	/*
2956	 * Boosting conditions are:
2957	 * 1. -rt task is running and holds mutex A
2958	 *      --> -dl task blocks on mutex A
2959	 *
2960	 * 2. -dl task is running and holds mutex A
2961	 *      --> -dl task blocks on mutex A and could preempt the
2962	 *          running task
2963	 */
2964	if (dl_prio(prio)) {
2965		if (!dl_prio(p->normal_prio) || (p->pi_top_task &&
2966			dl_entity_preempt(&p->pi_top_task->dl, &p->dl))) {
2967			p->dl.dl_boosted = 1;
2968			p->dl.dl_throttled = 0;
2969			enqueue_flag = ENQUEUE_REPLENISH;
2970		} else
2971			p->dl.dl_boosted = 0;
2972		p->sched_class = &dl_sched_class;
2973	} else if (rt_prio(prio)) {
2974		if (dl_prio(oldprio))
2975			p->dl.dl_boosted = 0;
2976		if (oldprio < prio)
2977			enqueue_flag = ENQUEUE_HEAD;
2978		p->sched_class = &rt_sched_class;
2979	} else {
2980		if (dl_prio(oldprio))
2981			p->dl.dl_boosted = 0;
2982		p->sched_class = &fair_sched_class;
2983	}
2984
2985	p->prio = prio;
2986
2987	if (running)
2988		p->sched_class->set_curr_task(rq);
2989	if (on_rq)
2990		enqueue_task(rq, p, enqueue_flag);
2991
2992	check_class_changed(rq, p, prev_class, oldprio);
2993out_unlock:
2994	__task_rq_unlock(rq);
2995}
2996#endif
2997
2998void set_user_nice(struct task_struct *p, long nice)
2999{
3000	int old_prio, delta, on_rq;
3001	unsigned long flags;
3002	struct rq *rq;
3003
3004	if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3005		return;
3006	/*
3007	 * We have to be careful, if called from sys_setpriority(),
3008	 * the task might be in the middle of scheduling on another CPU.
3009	 */
3010	rq = task_rq_lock(p, &flags);
3011	/*
3012	 * The RT priorities are set via sched_setscheduler(), but we still
3013	 * allow the 'normal' nice value to be set - but as expected
3014	 * it won't have any effect on scheduling as long as the task has a
3015	 * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR policy:
3016	 */
3017	if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
3018		p->static_prio = NICE_TO_PRIO(nice);
3019		goto out_unlock;
3020	}
3021	on_rq = p->on_rq;
3022	if (on_rq)
3023		dequeue_task(rq, p, 0);
3024
3025	p->static_prio = NICE_TO_PRIO(nice);
3026	set_load_weight(p);
3027	old_prio = p->prio;
3028	p->prio = effective_prio(p);
3029	delta = p->prio - old_prio;
3030
3031	if (on_rq) {
3032		enqueue_task(rq, p, 0);
3033		/*
3034		 * If the task increased its priority or is running and
3035		 * lowered its priority, then reschedule its CPU:
3036		 */
3037		if (delta < 0 || (delta > 0 && task_running(rq, p)))
3038			resched_task(rq->curr);
3039	}
3040out_unlock:
3041	task_rq_unlock(rq, p, &flags);
3042}
3043EXPORT_SYMBOL(set_user_nice);
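
/*
 * Illustrative sketch (not part of this file): in-kernel users usually
 * call set_user_nice() right after creating a kthread, before waking it
 * up ("my_thread_fn", "my_worker" and the nice value are made up):
 *
 *	struct task_struct *tsk = kthread_create(my_thread_fn, NULL, "my_worker");
 *
 *	if (!IS_ERR(tsk)) {
 *		set_user_nice(tsk, -5);
 *		wake_up_process(tsk);
 *	}
 */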
3044
3045/*
3046 * can_nice - check if a task can reduce its nice value
3047 * @p: task
3048 * @nice: nice value
3049 */
3050int can_nice(const struct task_struct *p, const int nice)
3051{
3052	/* convert nice value [19,-20] to rlimit style value [1,40] */
3053	int nice_rlim = 20 - nice;
3054
3055	return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
3056		capable(CAP_SYS_NICE));
3057}
3058
3059#ifdef __ARCH_WANT_SYS_NICE
3060
3061/*
3062 * sys_nice - change the priority of the current process.
3063 * @increment: priority increment
3064 *
3065 * sys_setpriority is a more generic, but much slower function that
3066 * does similar things.
3067 */
3068SYSCALL_DEFINE1(nice, int, increment)
3069{
3070	long nice, retval;
3071
3072	/*
3073	 * Setpriority might change our priority at the same moment.
3074	 * We don't have to worry. Conceptually one call occurs first
3075	 * and we have a single winner.
3076	 */
3077	if (increment < -40)
3078		increment = -40;
3079	if (increment > 40)
3080		increment = 40;
3081
3082	nice = TASK_NICE(current) + increment;
3083	if (nice < -20)
3084		nice = -20;
3085	if (nice > 19)
3086		nice = 19;
3087
3088	if (increment < 0 && !can_nice(current, nice))
3089		return -EPERM;
3090
3091	retval = security_task_setnice(current, nice);
3092	if (retval)
3093		return retval;
3094
3095	set_user_nice(current, nice);
3096	return 0;
3097}
3098
3099#endif
3100
3101/**
3102 * task_prio - return the priority value of a given task.
3103 * @p: the task in question.
3104 *
3105 * Return: The priority value as seen by users in /proc.
3106 * RT tasks are offset by -200. Normal tasks are centered
3107 * around 0, value goes from -16 to +15.
3108 */
3109int task_prio(const struct task_struct *p)
3110{
3111	return p->prio - MAX_RT_PRIO;
3112}
3113
3114/**
3115 * task_nice - return the nice value of a given task.
3116 * @p: the task in question.
3117 *
3118 * Return: The nice value [ -20 ... 0 ... 19 ].
3119 */
3120int task_nice(const struct task_struct *p)
3121{
3122	return TASK_NICE(p);
3123}
3124EXPORT_SYMBOL(task_nice);
3125
3126/**
3127 * idle_cpu - is a given cpu idle currently?
3128 * @cpu: the processor in question.
3129 *
3130 * Return: 1 if the CPU is currently idle. 0 otherwise.
3131 */
3132int idle_cpu(int cpu)
3133{
3134	struct rq *rq = cpu_rq(cpu);
3135
3136	if (rq->curr != rq->idle)
3137		return 0;
3138
3139	if (rq->nr_running)
3140		return 0;
3141
3142#ifdef CONFIG_SMP
3143	if (!llist_empty(&rq->wake_list))
3144		return 0;
3145#endif
3146
3147	return 1;
3148}
3149
3150/**
3151 * idle_task - return the idle task for a given cpu.
3152 * @cpu: the processor in question.
3153 *
3154 * Return: The idle task for the cpu @cpu.
3155 */
3156struct task_struct *idle_task(int cpu)
3157{
3158	return cpu_rq(cpu)->idle;
3159}
3160
3161/**
3162 * find_process_by_pid - find a process with a matching PID value.
3163 * @pid: the pid in question.
3164 *
3165 * Return: The task of @pid, if found. %NULL otherwise.
3166 */
3167static struct task_struct *find_process_by_pid(pid_t pid)
3168{
3169	return pid ? find_task_by_vpid(pid) : current;
3170}
3171
3172/*
3173 * This function initializes the sched_dl_entity of a task that is
3174 * becoming SCHED_DEADLINE.
3175 *
3176 * Only the static values are considered here, the actual runtime and the
3177 * absolute deadline will be properly calculated when the task is enqueued
3178 * for the first time with its new policy.
3179 */
3180static void
3181__setparam_dl(struct task_struct *p, const struct sched_attr *attr)
3182{
3183	struct sched_dl_entity *dl_se = &p->dl;
3184
3185	init_dl_task_timer(dl_se);
3186	dl_se->dl_runtime = attr->sched_runtime;
3187	dl_se->dl_deadline = attr->sched_deadline;
3188	dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
3189	dl_se->flags = attr->sched_flags;
3190	dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
3191	dl_se->dl_throttled = 0;
3192	dl_se->dl_new = 1;
3193}
3194
3195/* Actually do priority change: must hold pi & rq lock. */
3196static void __setscheduler(struct rq *rq, struct task_struct *p,
3197			   const struct sched_attr *attr)
3198{
3199	int policy = attr->sched_policy;
3200
3201	if (policy == -1) /* setparam */
3202		policy = p->policy;
3203
3204	p->policy = policy;
3205
3206	if (dl_policy(policy))
3207		__setparam_dl(p, attr);
3208	else if (fair_policy(policy))
3209		p->static_prio = NICE_TO_PRIO(attr->sched_nice);
3210
3211	/*
3212	 * __sched_setscheduler() ensures attr->sched_priority == 0 when
3213	 * !rt_policy. Always setting this ensures that things like
3214	 * getparam()/getattr() don't report silly values for !rt tasks.
3215	 */
3216	p->rt_priority = attr->sched_priority;
3217
3218	p->normal_prio = normal_prio(p);
3219	p->prio = rt_mutex_getprio(p);
3220
3221	if (dl_prio(p->prio))
3222		p->sched_class = &dl_sched_class;
3223	else if (rt_prio(p->prio))
3224		p->sched_class = &rt_sched_class;
3225	else
3226		p->sched_class = &fair_sched_class;
3227
3228	set_load_weight(p);
3229}
3230
3231static void
3232__getparam_dl(struct task_struct *p, struct sched_attr *attr)
3233{
3234	struct sched_dl_entity *dl_se = &p->dl;
3235
3236	attr->sched_priority = p->rt_priority;
3237	attr->sched_runtime = dl_se->dl_runtime;
3238	attr->sched_deadline = dl_se->dl_deadline;
3239	attr->sched_period = dl_se->dl_period;
3240	attr->sched_flags = dl_se->flags;
3241}
3242
3243/*
3244 * This function validates the new parameters of a -deadline task.
3245 * We require the deadline to be non-zero and greater than or equal
3246 * to the runtime, and the period to be either zero or greater than
3247 * or equal to the deadline. Furthermore, we have to be sure that
3248 * user parameters are above the internal resolution (1us); we
3249 * check sched_runtime only since it is always the smaller one.
3250 */
3251static bool
3252__checkparam_dl(const struct sched_attr *attr)
3253{
3254	return attr && attr->sched_deadline != 0 &&
3255		(attr->sched_period == 0 ||
3256		(s64)(attr->sched_period   - attr->sched_deadline) >= 0) &&
3257		(s64)(attr->sched_deadline - attr->sched_runtime ) >= 0  &&
3258		attr->sched_runtime >= (2 << (DL_SCALE - 1));
3259}
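
/*
 * Illustrative sketch (not part of this file): a parameter set that
 * passes the checks above - runtime <= deadline <= period, all well
 * above the 1us resolution (10ms of runtime every 100ms, with a 30ms
 * relative deadline):
 *
 *	struct sched_attr attr = {
 *		.size		= sizeof(attr),
 *		.sched_policy	= SCHED_DEADLINE,
 *		.sched_runtime	=  10 * NSEC_PER_MSEC,
 *		.sched_deadline	=  30 * NSEC_PER_MSEC,
 *		.sched_period	= 100 * NSEC_PER_MSEC,
 *	};
 */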
3260
3261/*
3262 * check the target process has a UID that matches the current process's
3263 */
3264static bool check_same_owner(struct task_struct *p)
3265{
3266	const struct cred *cred = current_cred(), *pcred;
3267	bool match;
3268
3269	rcu_read_lock();
3270	pcred = __task_cred(p);
3271	match = (uid_eq(cred->euid, pcred->euid) ||
3272		 uid_eq(cred->euid, pcred->uid));
3273	rcu_read_unlock();
3274	return match;
3275}
3276
3277static int __sched_setscheduler(struct task_struct *p,
3278				const struct sched_attr *attr,
3279				bool user)
3280{
3281	int retval, oldprio, oldpolicy = -1, on_rq, running;
3282	int policy = attr->sched_policy;
3283	unsigned long flags;
3284	const struct sched_class *prev_class;
3285	struct rq *rq;
3286	int reset_on_fork;
3287
3288	/* may grab non-irq protected spin_locks */
3289	BUG_ON(in_interrupt());
3290recheck:
3291	/* double check policy once rq lock held */
3292	if (policy < 0) {
3293		reset_on_fork = p->sched_reset_on_fork;
3294		policy = oldpolicy = p->policy;
3295	} else {
3296		reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
3297
3298		if (policy != SCHED_DEADLINE &&
3299				policy != SCHED_FIFO && policy != SCHED_RR &&
3300				policy != SCHED_NORMAL && policy != SCHED_BATCH &&
3301				policy != SCHED_IDLE)
3302			return -EINVAL;
3303	}
3304
3305	if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK))
3306		return -EINVAL;
3307
3308	/*
3309	 * Valid priorities for SCHED_FIFO and SCHED_RR are
3310	 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
3311	 * SCHED_BATCH and SCHED_IDLE is 0.
3312	 */
3313	if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
3314	    (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
3315		return -EINVAL;
3316	if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
3317	    (rt_policy(policy) != (attr->sched_priority != 0)))
3318		return -EINVAL;
3319
3320	/*
3321	 * Allow unprivileged RT tasks to decrease priority:
3322	 */
3323	if (user && !capable(CAP_SYS_NICE)) {
3324		if (fair_policy(policy)) {
3325			if (attr->sched_nice < TASK_NICE(p) &&
3326			    !can_nice(p, attr->sched_nice))
3327				return -EPERM;
3328		}
3329
3330		if (rt_policy(policy)) {
3331			unsigned long rlim_rtprio =
3332					task_rlimit(p, RLIMIT_RTPRIO);
3333
3334			/* can't set/change the rt policy */
3335			if (policy != p->policy && !rlim_rtprio)
3336				return -EPERM;
3337
3338			/* can't increase priority */
3339			if (attr->sched_priority > p->rt_priority &&
3340			    attr->sched_priority > rlim_rtprio)
3341				return -EPERM;
3342		}
3343
3344		/*
3345		 * Treat SCHED_IDLE as nice 20. Only allow a switch to
3346		 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
3347		 */
3348		if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
3349			if (!can_nice(p, TASK_NICE(p)))
3350				return -EPERM;
3351		}
3352
3353		/* can't change other user's priorities */
3354		if (!check_same_owner(p))
3355			return -EPERM;
3356
3357		/* Normal users shall not reset the sched_reset_on_fork flag */
3358		if (p->sched_reset_on_fork && !reset_on_fork)
3359			return -EPERM;
3360	}
3361
3362	if (user) {
3363		retval = security_task_setscheduler(p);
3364		if (retval)
3365			return retval;
3366	}
3367
3368	/*
3369	 * make sure no PI-waiters arrive (or leave) while we are
3370	 * changing the priority of the task:
3371	 *
3372	 * To be able to change p->policy safely, the appropriate
3373	 * runqueue lock must be held.
3374	 */
3375	rq = task_rq_lock(p, &flags);
3376
3377	/*
3378	 * Changing the policy of the stop threads is a very bad idea
3379	 */
3380	if (p == rq->stop) {
3381		task_rq_unlock(rq, p, &flags);
3382		return -EINVAL;
3383	}
3384
3385	/*
3386	 * If not changing anything there's no need to proceed further:
3387	 */
3388	if (unlikely(policy == p->policy)) {
3389		if (fair_policy(policy) && attr->sched_nice != TASK_NICE(p))
3390			goto change;
3391		if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
3392			goto change;
3393		if (dl_policy(policy))
3394			goto change;
3395
3396		task_rq_unlock(rq, p, &flags);
3397		return 0;
3398	}
3399change:
3400
3401	if (user) {
3402#ifdef CONFIG_RT_GROUP_SCHED
3403		/*
3404		 * Do not allow realtime tasks into groups that have no runtime
3405		 * assigned.
3406		 */
3407		if (rt_bandwidth_enabled() && rt_policy(policy) &&
3408				task_group(p)->rt_bandwidth.rt_runtime == 0 &&
3409				!task_group_is_autogroup(task_group(p))) {
3410			task_rq_unlock(rq, p, &flags);
3411			return -EPERM;
3412		}
3413#endif
3414#ifdef CONFIG_SMP
3415		if (dl_bandwidth_enabled() && dl_policy(policy)) {
3416			cpumask_t *span = rq->rd->span;
3417
3418			/*
3419			 * Don't allow tasks with an affinity mask smaller than
3420			 * the entire root_domain to become SCHED_DEADLINE. We
3421			 * will also fail if there's no bandwidth available.
3422			 */
3423			if (!cpumask_subset(span, &p->cpus_allowed) ||
3424			    rq->rd->dl_bw.bw == 0) {
3425				task_rq_unlock(rq, p, &flags);
3426				return -EPERM;
3427			}
3428		}
3429#endif
3430	}
3431
3432	/* recheck policy now with rq lock held */
3433	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
3434		policy = oldpolicy = -1;
3435		task_rq_unlock(rq, p, &flags);
3436		goto recheck;
3437	}
3438
3439	/*
3440	 * If setscheduling to SCHED_DEADLINE (or changing the parameters
3441	 * of a SCHED_DEADLINE task) we need to check if enough bandwidth
3442	 * is available.
3443	 */
3444	if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) {
3445		task_rq_unlock(rq, p, &flags);
3446		return -EBUSY;
3447	}
3448
3449	on_rq = p->on_rq;
3450	running = task_current(rq, p);
3451	if (on_rq)
3452		dequeue_task(rq, p, 0);
3453	if (running)
3454		p->sched_class->put_prev_task(rq, p);
3455
3456	p->sched_reset_on_fork = reset_on_fork;
3457
3458	oldprio = p->prio;
3459	prev_class = p->sched_class;
3460	__setscheduler(rq, p, attr);
3461
3462	if (running)
3463		p->sched_class->set_curr_task(rq);
3464	if (on_rq)
3465		enqueue_task(rq, p, 0);
3466
3467	check_class_changed(rq, p, prev_class, oldprio);
3468	task_rq_unlock(rq, p, &flags);
3469
3470	rt_mutex_adjust_pi(p);
3471
3472	return 0;
3473}
3474
3475static int _sched_setscheduler(struct task_struct *p, int policy,
3476			       const struct sched_param *param, bool check)
3477{
3478	struct sched_attr attr = {
3479		.sched_policy   = policy,
3480		.sched_priority = param->sched_priority,
3481		.sched_nice	= PRIO_TO_NICE(p->static_prio),
3482	};
3483
3484	/*
3485	 * Fixup the legacy SCHED_RESET_ON_FORK hack
3486	 */
3487	if (policy & SCHED_RESET_ON_FORK) {
3488		attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
3489		policy &= ~SCHED_RESET_ON_FORK;
3490		attr.sched_policy = policy;
3491	}
3492
3493	return __sched_setscheduler(p, &attr, check);
3494}
3495/**
3496 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
3497 * @p: the task in question.
3498 * @policy: new policy.
3499 * @param: structure containing the new RT priority.
3500 *
3501 * Return: 0 on success. An error code otherwise.
3502 *
3503 * NOTE that the task may already be dead.
3504 */
3505int sched_setscheduler(struct task_struct *p, int policy,
3506		       const struct sched_param *param)
3507{
3508	return _sched_setscheduler(p, policy, param, true);
3509}
3510EXPORT_SYMBOL_GPL(sched_setscheduler);
3511
3512int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
3513{
3514	return __sched_setscheduler(p, attr, true);
3515}
3516EXPORT_SYMBOL_GPL(sched_setattr);
3517
3518/**
3519 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
3520 * @p: the task in question.
3521 * @policy: new policy.
3522 * @param: structure containing the new RT priority.
3523 *
3524 * Just like sched_setscheduler, only don't bother checking if the
3525 * current context has permission.  For example, this is needed in
3526 * stop_machine(): we create temporary high priority worker threads,
3527 * but our caller might not have that capability.
3528 *
3529 * Return: 0 on success. An error code otherwise.
3530 */
3531int sched_setscheduler_nocheck(struct task_struct *p, int policy,
3532			       const struct sched_param *param)
3533{
3534	return _sched_setscheduler(p, policy, param, false);
3535}
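
/*
 * Illustrative sketch (not part of this file): a typical in-kernel
 * caller boosts a freshly created worker thread to SCHED_FIFO, skipping
 * the permission checks ("tsk" is the kthread's task_struct):
 *
 *	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
 *
 *	sched_setscheduler_nocheck(tsk, SCHED_FIFO, &param);
 */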
3536
3537static int
3538do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
3539{
3540	struct sched_param lparam;
3541	struct task_struct *p;
3542	int retval;
3543
3544	if (!param || pid < 0)
3545		return -EINVAL;
3546	if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
3547		return -EFAULT;
3548
3549	rcu_read_lock();
3550	retval = -ESRCH;
3551	p = find_process_by_pid(pid);
3552	if (p != NULL)
3553		retval = sched_setscheduler(p, policy, &lparam);
3554	rcu_read_unlock();
3555
3556	return retval;
3557}
3558
3559/*
3560 * Mimics kernel/events/core.c perf_copy_attr().
3561 */
3562static int sched_copy_attr(struct sched_attr __user *uattr,
3563			   struct sched_attr *attr)
3564{
3565	u32 size;
3566	int ret;
3567
3568	if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
3569		return -EFAULT;
3570
3571	/*
3572	 * zero the full structure, so that a short copy will be nice.
3573	 */
3574	memset(attr, 0, sizeof(*attr));
3575
3576	ret = get_user(size, &uattr->size);
3577	if (ret)
3578		return ret;
3579
3580	if (size > PAGE_SIZE)	/* silly large */
3581		goto err_size;
3582
3583	if (!size)		/* abi compat */
3584		size = SCHED_ATTR_SIZE_VER0;
3585
3586	if (size < SCHED_ATTR_SIZE_VER0)
3587		goto err_size;
3588
3589	/*
3590	 * If we're handed a bigger struct than we know of,
3591	 * ensure all the unknown bits are 0 - i.e. new
3592	 * user-space does not rely on any kernel feature
3593	 * extensions we don't know about yet.
3594	 */
3595	if (size > sizeof(*attr)) {
3596		unsigned char __user *addr;
3597		unsigned char __user *end;
3598		unsigned char val;
3599
3600		addr = (void __user *)uattr + sizeof(*attr);
3601		end  = (void __user *)uattr + size;
3602
3603		for (; addr < end; addr++) {
3604			ret = get_user(val, addr);
3605			if (ret)
3606				return ret;
3607			if (val)
3608				goto err_size;
3609		}
3610		size = sizeof(*attr);
3611	}
3612
3613	ret = copy_from_user(attr, uattr, size);
3614	if (ret)
3615		return -EFAULT;
3616
3617	/*
3618	 * XXX: do we want to be lenient like existing syscalls; or do we want
3619	 * to be strict and return an error on out-of-bounds values?
3620	 */
3621	attr->sched_nice = clamp(attr->sched_nice, -20, 19);
3622
3623out:
3624	return ret;
3625
3626err_size:
3627	put_user(sizeof(*attr), &uattr->size);
3628	ret = -E2BIG;
3629	goto out;
3630}
3631
3632/**
3633 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
3634 * @pid: the pid in question.
3635 * @policy: new policy.
3636 * @param: structure containing the new RT priority.
3637 *
3638 * Return: 0 on success. An error code otherwise.
3639 */
3640SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
3641		struct sched_param __user *, param)
3642{
3643	/* negative values for policy are not valid */
3644	if (policy < 0)
3645		return -EINVAL;
3646
3647	return do_sched_setscheduler(pid, policy, param);
3648}
3649
3650/**
3651 * sys_sched_setparam - set/change the RT priority of a thread
3652 * @pid: the pid in question.
3653 * @param: structure containing the new RT priority.
3654 *
3655 * Return: 0 on success. An error code otherwise.
3656 */
3657SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
3658{
3659	return do_sched_setscheduler(pid, -1, param);
3660}
3661
3662/**
3663 * sys_sched_setattr - same as above, but with extended sched_attr
3664 * @pid: the pid in question.
3665 * @uattr: structure containing the extended parameters.
3666 */
3667SYSCALL_DEFINE2(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr)
3668{
3669	struct sched_attr attr;
3670	struct task_struct *p;
3671	int retval;
3672
3673	if (!uattr || pid < 0)
3674		return -EINVAL;
3675
3676	if (sched_copy_attr(uattr, &attr))
3677		return -EFAULT;
3678
3679	rcu_read_lock();
3680	retval = -ESRCH;
3681	p = find_process_by_pid(pid);
3682	if (p != NULL)
3683		retval = sched_setattr(p, &attr);
3684	rcu_read_unlock();
3685
3686	return retval;
3687}
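
/*
 * Illustrative sketch (not part of this file): there is no glibc
 * wrapper for this syscall, so user space invokes it via syscall(),
 * supplying its own definition of struct sched_attr that mirrors the
 * kernel's. The values are the 10ms/30ms/100ms example from above,
 * and pid 0 means "the calling thread":
 *
 *	struct sched_attr attr = {
 *		.size		= sizeof(attr),
 *		.sched_policy	= SCHED_DEADLINE,
 *		.sched_runtime	=  10 * 1000 * 1000,
 *		.sched_deadline	=  30 * 1000 * 1000,
 *		.sched_period	= 100 * 1000 * 1000,
 *	};
 *
 *	if (syscall(__NR_sched_setattr, 0, &attr))
 *		perror("sched_setattr");
 */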
3688
3689/**
3690 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
3691 * @pid: the pid in question.
3692 *
3693 * Return: On success, the policy of the thread. Otherwise, a negative error
3694 * code.
3695 */
3696SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
3697{
3698	struct task_struct *p;
3699	int retval;
3700
3701	if (pid < 0)
3702		return -EINVAL;
3703
3704	retval = -ESRCH;
3705	rcu_read_lock();
3706	p = find_process_by_pid(pid);
3707	if (p) {
3708		retval = security_task_getscheduler(p);
3709		if (!retval)
3710			retval = p->policy
3711				| (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
3712	}
3713	rcu_read_unlock();
3714	return retval;
3715}
3716
3717/**
3718 * sys_sched_getparam - get the RT priority of a thread
3719 * @pid: the pid in question.
3720 * @param: structure containing the RT priority.
3721 *
3722 * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
3723 * code.
3724 */
3725SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
3726{
3727	struct sched_param lp;
3728	struct task_struct *p;
3729	int retval;
3730
3731	if (!param || pid < 0)
3732		return -EINVAL;
3733
3734	rcu_read_lock();
3735	p = find_process_by_pid(pid);
3736	retval = -ESRCH;
3737	if (!p)
3738		goto out_unlock;
3739
3740	retval = security_task_getscheduler(p);
3741	if (retval)
3742		goto out_unlock;
3743
3744	if (task_has_dl_policy(p)) {
3745		retval = -EINVAL;
3746		goto out_unlock;
3747	}
3748	lp.sched_priority = p->rt_priority;
3749	rcu_read_unlock();
3750
3751	/*
3752	 * This one might sleep; we cannot do it with a spinlock held ...
3753	 */
3754	retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
3755
3756	return retval;
3757
3758out_unlock:
3759	rcu_read_unlock();
3760	return retval;
3761}
3762
3763static int sched_read_attr(struct sched_attr __user *uattr,
3764			   struct sched_attr *attr,
3765			   unsigned int usize)
3766{
3767	int ret;
3768
3769	if (!access_ok(VERIFY_WRITE, uattr, usize))
3770		return -EFAULT;
3771
3772	/*
3773	 * If we're handed a smaller struct than we know of,
3774	 * ensure all the unknown bits are 0 - i.e. old
3775	 * user-space does not get incomplete information.
3776	 */
3777	if (usize < sizeof(*attr)) {
3778		unsigned char *addr;
3779		unsigned char *end;
3780
3781		addr = (void *)attr + usize;
3782		end  = (void *)attr + sizeof(*attr);
3783
3784		for (; addr < end; addr++) {
3785			if (*addr)
3786				goto err_size;
3787		}
3788
3789		attr->size = usize;
3790	}
3791
3792	ret = copy_to_user(uattr, attr, usize);
3793	if (ret)
3794		return -EFAULT;
3795
3796out:
3797	return ret;
3798
3799err_size:
3800	ret = -E2BIG;
3801	goto out;
3802}
3803
3804/**
3805 * sys_sched_getattr - similar to sched_getparam, but with sched_attr
3806 * @pid: the pid in question.
3807 * @uattr: structure containing the extended parameters.
3808 * @size: sizeof(attr) for fwd/bwd comp.
3809 */
3810SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
3811		unsigned int, size)
3812{
3813	struct sched_attr attr = {
3814		.size = sizeof(struct sched_attr),
3815	};
3816	struct task_struct *p;
3817	int retval;
3818
3819	if (!uattr || pid < 0 || size > PAGE_SIZE ||
3820	    size < SCHED_ATTR_SIZE_VER0)
3821		return -EINVAL;
3822
3823	rcu_read_lock();
3824	p = find_process_by_pid(pid);
3825	retval = -ESRCH;
3826	if (!p)
3827		goto out_unlock;
3828
3829	retval = security_task_getscheduler(p);
3830	if (retval)
3831		goto out_unlock;
3832
3833	attr.sched_policy = p->policy;
3834	if (p->sched_reset_on_fork)
3835		attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
3836	if (task_has_dl_policy(p))
3837		__getparam_dl(p, &attr);
3838	else if (task_has_rt_policy(p))
3839		attr.sched_priority = p->rt_priority;
3840	else
3841		attr.sched_nice = TASK_NICE(p);
3842
3843	rcu_read_unlock();
3844
3845	retval = sched_read_attr(uattr, &attr, size);
3846	return retval;
3847
3848out_unlock:
3849	rcu_read_unlock();
3850	return retval;
3851}
3852
3853long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
3854{
3855	cpumask_var_t cpus_allowed, new_mask;
3856	struct task_struct *p;
3857	int retval;
3858
3859	rcu_read_lock();
3860
3861	p = find_process_by_pid(pid);
3862	if (!p) {
3863		rcu_read_unlock();
3864		return -ESRCH;
3865	}
3866
3867	/* Prevent p going away */
3868	get_task_struct(p);
3869	rcu_read_unlock();
3870
3871	if (p->flags & PF_NO_SETAFFINITY) {
3872		retval = -EINVAL;
3873		goto out_put_task;
3874	}
3875	if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
3876		retval = -ENOMEM;
3877		goto out_put_task;
3878	}
3879	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
3880		retval = -ENOMEM;
3881		goto out_free_cpus_allowed;
3882	}
3883	retval = -EPERM;
3884	if (!check_same_owner(p)) {
3885		rcu_read_lock();
3886		if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
3887			rcu_read_unlock();
3888			goto out_unlock;
3889		}
3890		rcu_read_unlock();
3891	}
3892
3893	retval = security_task_setscheduler(p);
3894	if (retval)
3895		goto out_unlock;
3896
3897
3898	cpuset_cpus_allowed(p, cpus_allowed);
3899	cpumask_and(new_mask, in_mask, cpus_allowed);
3900
3901	/*
3902	 * Since bandwidth control happens on a root_domain basis,
3903	 * if the admission test is enabled we only admit -deadline
3904	 * tasks that are allowed to run on all the CPUs in the task's
3905	 * root_domain.
3906	 */
3907#ifdef CONFIG_SMP
3908	if (task_has_dl_policy(p)) {
3909		const struct cpumask *span = task_rq(p)->rd->span;
3910
3911		if (dl_bandwidth_enabled() && !cpumask_subset(span, new_mask)) {
3912			retval = -EBUSY;
3913			goto out_unlock;
3914		}
3915	}
3916#endif
3917again:
3918	retval = set_cpus_allowed_ptr(p, new_mask);
3919
3920	if (!retval) {
3921		cpuset_cpus_allowed(p, cpus_allowed);
3922		if (!cpumask_subset(new_mask, cpus_allowed)) {
3923			/*
3924			 * We must have raced with a concurrent cpuset
3925			 * update. Just reset the cpus_allowed to the
3926			 * cpuset's cpus_allowed
3927			 */
3928			cpumask_copy(new_mask, cpus_allowed);
3929			goto again;
3930		}
3931	}
3932out_unlock:
3933	free_cpumask_var(new_mask);
3934out_free_cpus_allowed:
3935	free_cpumask_var(cpus_allowed);
3936out_put_task:
3937	put_task_struct(p);
3938	return retval;
3939}
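
/*
 * Illustrative sketch (not part of this file): in-kernel users that
 * want to restrict a task's affinity do not go through this syscall
 * path but call set_cpus_allowed_ptr() (or kthread_bind() for a not
 * yet woken kthread) directly, e.g. pinning "tsk" to CPU 2:
 *
 *	cpumask_var_t mask;
 *
 *	if (alloc_cpumask_var(&mask, GFP_KERNEL)) {
 *		cpumask_clear(mask);
 *		cpumask_set_cpu(2, mask);
 *		set_cpus_allowed_ptr(tsk, mask);
 *		free_cpumask_var(mask);
 *	}
 */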
3940
3941static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
3942			     struct cpumask *new_mask)
3943{
3944	if (len < cpumask_size())
3945		cpumask_clear(new_mask);
3946	else if (len > cpumask_size())
3947		len = cpumask_size();
3948
3949	return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
3950}
3951
3952/**
3953 * sys_sched_setaffinity - set the cpu affinity of a process
3954 * @pid: pid of the process
3955 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
3956 * @user_mask_ptr: user-space pointer to the new cpu mask
3957 *
3958 * Return: 0 on success. An error code otherwise.
3959 */
3960SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
3961		unsigned long __user *, user_mask_ptr)
3962{
3963	cpumask_var_t new_mask;
3964	int retval;
3965
3966	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
3967		return -ENOMEM;
3968
3969	retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
3970	if (retval == 0)
3971		retval = sched_setaffinity(pid, new_mask);
3972	free_cpumask_var(new_mask);
3973	return retval;
3974}
3975
3976long sched_getaffinity(pid_t pid, struct cpumask *mask)
3977{
3978	struct task_struct *p;
3979	unsigned long flags;
3980	int retval;
3981
3982	rcu_read_lock();
3983
3984	retval = -ESRCH;
3985	p = find_process_by_pid(pid);
3986	if (!p)
3987		goto out_unlock;
3988
3989	retval = security_task_getscheduler(p);
3990	if (retval)
3991		goto out_unlock;
3992
3993	raw_spin_lock_irqsave(&p->pi_lock, flags);
3994	cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
3995	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3996
3997out_unlock:
3998	rcu_read_unlock();
3999
4000	return retval;
4001}
4002
4003/**
4004 * sys_sched_getaffinity - get the cpu affinity of a process
4005 * @pid: pid of the process
4006 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4007 * @user_mask_ptr: user-space pointer to hold the current cpu mask
4008 *
4009 * Return: 0 on success. An error code otherwise.
4010 */
4011SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
4012		unsigned long __user *, user_mask_ptr)
4013{
4014	int ret;
4015	cpumask_var_t mask;
4016
4017	if ((len * BITS_PER_BYTE) < nr_cpu_ids)
4018		return -EINVAL;
4019	if (len & (sizeof(unsigned long)-1))
4020		return -EINVAL;
4021
4022	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
4023		return -ENOMEM;
4024
4025	ret = sched_getaffinity(pid, mask);
4026	if (ret == 0) {
4027		size_t retlen = min_t(size_t, len, cpumask_size());
4028
4029		if (copy_to_user(user_mask_ptr, mask, retlen))
4030			ret = -EFAULT;
4031		else
4032			ret = retlen;
4033	}
4034	free_cpumask_var(mask);
4035
4036	return ret;
4037}
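
/*
 * Illustrative user-space sketch (not part of this file): note that on
 * success the raw syscall returns the number of bytes copied into the
 * user mask, while glibc's sched_getaffinity() wrapper hides this and
 * returns 0:
 *
 *	cpu_set_t set;
 *
 *	long ret = syscall(__NR_sched_getaffinity, 0, sizeof(set), &set);
 *	if (ret > 0)
 *		printf("%ld bytes of affinity mask copied\n", ret);
 */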
4038
4039/**
4040 * sys_sched_yield - yield the current processor to other threads.
4041 *
4042 * This function yields the current CPU to other tasks. If there are no
4043 * other threads running on this CPU then this function will return.
4044 *
4045 * Return: 0.
4046 */
4047SYSCALL_DEFINE0(sched_yield)
4048{
4049	struct rq *rq = this_rq_lock();
4050
4051	schedstat_inc(rq, yld_count);
4052	current->sched_class->yield_task(rq);
4053
4054	/*
4055	 * Since we are going to call schedule() anyway, there's
4056	 * no need to preempt or enable interrupts:
4057	 */
4058	__release(rq->lock);
4059	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
4060	do_raw_spin_unlock(&rq->lock);
4061	sched_preempt_enable_no_resched();
4062
4063	schedule();
4064
4065	return 0;
4066}
4067
4068static void __cond_resched(void)
4069{
4070	__preempt_count_add(PREEMPT_ACTIVE);
4071	__schedule();
4072	__preempt_count_sub(PREEMPT_ACTIVE);
4073}
4074
4075int __sched _cond_resched(void)
4076{
4077	if (should_resched()) {
4078		__cond_resched();
4079		return 1;
4080	}
4081	return 0;
4082}
4083EXPORT_SYMBOL(_cond_resched);
4084
4085/*
4086 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
4087 * call schedule, and on return reacquire the lock.
4088 *
4089 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
4090 * operations here to prevent schedule() from being called twice (once via
4091 * spin_unlock(), once by hand).
4092 */
4093int __cond_resched_lock(spinlock_t *lock)
4094{
4095	int resched = should_resched();
4096	int ret = 0;
4097
4098	lockdep_assert_held(lock);
4099
4100	if (spin_needbreak(lock) || resched) {
4101		spin_unlock(lock);
4102		if (resched)
4103			__cond_resched();
4104		else
4105			cpu_relax();
4106		ret = 1;
4107		spin_lock(lock);
4108	}
4109	return ret;
4110}
4111EXPORT_SYMBOL(__cond_resched_lock);
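
/*
 * Illustrative sketch (not part of this file): the typical caller does
 * a long walk under a spinlock and drops it periodically to keep
 * latency down ("my_lock", nr_items and process_one_item() are made up):
 *
 *	spin_lock(&my_lock);
 *	for (i = 0; i < nr_items; i++) {
 *		process_one_item(i);
 *		cond_resched_lock(&my_lock);
 *	}
 *	spin_unlock(&my_lock);
 */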
4112
4113int __sched __cond_resched_softirq(void)
4114{
4115	BUG_ON(!in_softirq());
4116
4117	if (should_resched()) {
4118		local_bh_enable();
4119		__cond_resched();
4120		local_bh_disable();
4121		return 1;
4122	}
4123	return 0;
4124}
4125EXPORT_SYMBOL(__cond_resched_softirq);
4126
4127/**
4128 * yield - yield the current processor to other threads.
4129 *
4130 * Do not ever use this function; there's a 99% chance you're doing it wrong.
4131 *
4132 * The scheduler is at all times free to pick the calling task as the most
4133 * eligible task to run; if removing the yield() call from your code breaks
4134 * it, it's already broken.
4135 *
4136 * Typical broken usage is:
4137 *
4138 * while (!event)
4139 * 	yield();
4140 *
4141 * where one assumes that yield() will let 'the other' process run that will
4142 * make event true. If the current task is a SCHED_FIFO task, that will never
4143 * happen. Never use yield() as a progress guarantee!!
4144 *
4145 * If you want to use yield() to wait for something, use wait_event().
4146 * If you want to use yield() to be 'nice' for others, use cond_resched().
4147 * If you still want to use yield(), do not!
4148 */
4149void __sched yield(void)
4150{
4151	set_current_state(TASK_RUNNING);
4152	sys_sched_yield();
4153}
4154EXPORT_SYMBOL(yield);
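/*
 * Example: a minimal in-kernel sketch of the wait_event() pattern that the
 * comment above recommends instead of a yield() loop.  my_wq and my_event
 * are illustrative names, not existing symbols:
 *
 *	static DECLARE_WAIT_QUEUE_HEAD(my_wq);
 *	static int my_event;
 *
 *	The waiter sleeps until the condition becomes true:
 *
 *		wait_event(my_wq, my_event);
 *
 *	and whoever makes the condition true wakes it up:
 *
 *		my_event = 1;
 *		wake_up(&my_wq);
 */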
4155
4156/**
4157 * yield_to - yield the current processor to another thread in
4158 * your thread group, or accelerate that thread toward the
4159 * processor it's on.
4160 * @p: target task
4161 * @preempt: whether task preemption is allowed or not
4162 *
4163 * It's the caller's job to ensure that the target task struct
4164 * can't go away on us before we can do any checks.
4165 *
4166 * Return:
4167 *	true (>0) if we indeed boosted the target task.
4168 *	false (0) if we failed to boost the target.
4169 *	-ESRCH if there's no task to yield to.
4170 */
4171bool __sched yield_to(struct task_struct *p, bool preempt)
4172{
4173	struct task_struct *curr = current;
4174	struct rq *rq, *p_rq;
4175	unsigned long flags;
4176	int yielded = 0;
4177
4178	local_irq_save(flags);
4179	rq = this_rq();
4180
4181again:
4182	p_rq = task_rq(p);
4183	/*
4184	 * If we're the only runnable task on the rq and target rq also
4185	 * has only one task, there's absolutely no point in yielding.
4186	 */
4187	if (rq->nr_running == 1 && p_rq->nr_running == 1) {
4188		yielded = -ESRCH;
4189		goto out_irq;
4190	}
4191
4192	double_rq_lock(rq, p_rq);
4193	if (task_rq(p) != p_rq) {
4194		double_rq_unlock(rq, p_rq);
4195		goto again;
4196	}
4197
4198	if (!curr->sched_class->yield_to_task)
4199		goto out_unlock;
4200
4201	if (curr->sched_class != p->sched_class)
4202		goto out_unlock;
4203
4204	if (task_running(p_rq, p) || p->state)
4205		goto out_unlock;
4206
4207	yielded = curr->sched_class->yield_to_task(rq, p, preempt);
4208	if (yielded) {
4209		schedstat_inc(rq, yld_count);
4210		/*
4211		 * Make p's CPU reschedule; pick_next_entity takes care of
4212		 * fairness.
4213		 */
4214		if (preempt && rq != p_rq)
4215			resched_task(p_rq->curr);
4216	}
4217
4218out_unlock:
4219	double_rq_unlock(rq, p_rq);
4220out_irq:
4221	local_irq_restore(flags);
4222
4223	if (yielded > 0)
4224		schedule();
4225
4226	return yielded;
4227}
4228EXPORT_SYMBOL_GPL(yield_to);
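/*
 * Example: a sketch of a typical yield_to() call site (e.g. a virtualization
 * host directing cycles at a lock-holder vcpu task).  The caller pins the
 * target with a task reference, as the comment above requires; 'target' is
 * an illustrative variable, not an existing symbol:
 *
 *	get_task_struct(target);
 *	if (yield_to(target, false))
 *		pr_debug("gave up the CPU in favour of the target task\n");
 *	put_task_struct(target);
 */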
4229
4230/*
4231 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
4232 * that process accounting knows that this is a task in IO wait state.
4233 */
4234void __sched io_schedule(void)
4235{
4236	struct rq *rq = raw_rq();
4237
4238	delayacct_blkio_start();
4239	atomic_inc(&rq->nr_iowait);
4240	blk_flush_plug(current);
4241	current->in_iowait = 1;
4242	schedule();
4243	current->in_iowait = 0;
4244	atomic_dec(&rq->nr_iowait);
4245	delayacct_blkio_end();
4246}
4247EXPORT_SYMBOL(io_schedule);
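/*
 * Example: a sketch of the usual io_schedule() pattern -- sleep in
 * TASK_UNINTERRUPTIBLE state while waiting for an I/O completion so the
 * time is accounted as iowait.  'wq' and 'io_done' are illustrative:
 *
 *	DEFINE_WAIT(wait);
 *
 *	prepare_to_wait(&wq, &wait, TASK_UNINTERRUPTIBLE);
 *	if (!io_done)
 *		io_schedule();
 *	finish_wait(&wq, &wait);
 */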
4248
4249long __sched io_schedule_timeout(long timeout)
4250{
4251	struct rq *rq = raw_rq();
4252	long ret;
4253
4254	delayacct_blkio_start();
4255	atomic_inc(&rq->nr_iowait);
4256	blk_flush_plug(current);
4257	current->in_iowait = 1;
4258	ret = schedule_timeout(timeout);
4259	current->in_iowait = 0;
4260	atomic_dec(&rq->nr_iowait);
4261	delayacct_blkio_end();
4262	return ret;
4263}
4264
4265/**
4266 * sys_sched_get_priority_max - return maximum RT priority.
4267 * @policy: scheduling class.
4268 *
4269 * Return: On success, this syscall returns the maximum
4270 * rt_priority that can be used by a given scheduling class.
4271 * On failure, a negative error code is returned.
4272 */
4273SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
4274{
4275	int ret = -EINVAL;
4276
4277	switch (policy) {
4278	case SCHED_FIFO:
4279	case SCHED_RR:
4280		ret = MAX_USER_RT_PRIO-1;
4281		break;
4282	case SCHED_DEADLINE:
4283	case SCHED_NORMAL:
4284	case SCHED_BATCH:
4285	case SCHED_IDLE:
4286		ret = 0;
4287		break;
4288	}
4289	return ret;
4290}
4291
4292/**
4293 * sys_sched_get_priority_min - return minimum RT priority.
4294 * @policy: scheduling class.
4295 *
4296 * Return: On success, this syscall returns the minimum
4297 * rt_priority that can be used by a given scheduling class.
4298 * On failure, a negative error code is returned.
4299 */
4300SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
4301{
4302	int ret = -EINVAL;
4303
4304	switch (policy) {
4305	case SCHED_FIFO:
4306	case SCHED_RR:
4307		ret = 1;
4308		break;
4309	case SCHED_DEADLINE:
4310	case SCHED_NORMAL:
4311	case SCHED_BATCH:
4312	case SCHED_IDLE:
4313		ret = 0;
4314	}
4315	return ret;
4316}
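/*
 * Example: from user space the two syscalls above bound the priority range
 * accepted by sched_setscheduler() for each policy:
 *
 *	#include <sched.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		printf("SCHED_FIFO:  %d..%d\n",
 *		       sched_get_priority_min(SCHED_FIFO),
 *		       sched_get_priority_max(SCHED_FIFO));
 *		printf("SCHED_OTHER: %d..%d\n",
 *		       sched_get_priority_min(SCHED_OTHER),
 *		       sched_get_priority_max(SCHED_OTHER));
 *		return 0;
 *	}
 *
 * On Linux this typically prints 1..99 for SCHED_FIFO (MAX_USER_RT_PRIO-1)
 * and 0..0 for SCHED_OTHER (the user-space name for SCHED_NORMAL).
 */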
4317
4318/**
4319 * sys_sched_rr_get_interval - return the default timeslice of a process.
4320 * @pid: pid of the process.
4321 * @interval: userspace pointer to the timeslice value.
4322 *
4323 * This syscall writes the default timeslice value of a given process
4324 * into the user-space timespec buffer. A value of '0' means infinity.
4325 *
4326 * Return: On success, 0 and the timeslice is in @interval. Otherwise,
4327 * an error code.
4328 */
4329SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
4330		struct timespec __user *, interval)
4331{
4332	struct task_struct *p;
4333	unsigned int time_slice;
4334	unsigned long flags;
4335	struct rq *rq;
4336	int retval;
4337	struct timespec t;
4338
4339	if (pid < 0)
4340		return -EINVAL;
4341
4342	retval = -ESRCH;
4343	rcu_read_lock();
4344	p = find_process_by_pid(pid);
4345	if (!p)
4346		goto out_unlock;
4347
4348	retval = security_task_getscheduler(p);
4349	if (retval)
4350		goto out_unlock;
4351
4352	rq = task_rq_lock(p, &flags);
4353	time_slice = 0;
4354	if (p->sched_class->get_rr_interval)
4355		time_slice = p->sched_class->get_rr_interval(rq, p);
4356	task_rq_unlock(rq, p, &flags);
4357
4358	rcu_read_unlock();
4359	jiffies_to_timespec(time_slice, &t);
4360	retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4361	return retval;
4362
4363out_unlock:
4364	rcu_read_unlock();
4365	return retval;
4366}
4367
4368static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
4369
4370void sched_show_task(struct task_struct *p)
4371{
4372	unsigned long free = 0;
4373	int ppid;
4374	unsigned state;
4375
4376	state = p->state ? __ffs(p->state) + 1 : 0;
4377	printk(KERN_INFO "%-15.15s %c", p->comm,
4378		state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
4379#if BITS_PER_LONG == 32
4380	if (state == TASK_RUNNING)
4381		printk(KERN_CONT " running  ");
4382	else
4383		printk(KERN_CONT " %08lx ", thread_saved_pc(p));
4384#else
4385	if (state == TASK_RUNNING)
4386		printk(KERN_CONT "  running task    ");
4387	else
4388		printk(KERN_CONT " %016lx ", thread_saved_pc(p));
4389#endif
4390#ifdef CONFIG_DEBUG_STACK_USAGE
4391	free = stack_not_used(p);
4392#endif
4393	rcu_read_lock();
4394	ppid = task_pid_nr(rcu_dereference(p->real_parent));
4395	rcu_read_unlock();
4396	printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
4397		task_pid_nr(p), ppid,
4398		(unsigned long)task_thread_info(p)->flags);
4399
4400	print_worker_info(KERN_INFO, p);
4401	show_stack(p, NULL);
4402}
4403
4404void show_state_filter(unsigned long state_filter)
4405{
4406	struct task_struct *g, *p;
4407
4408#if BITS_PER_LONG == 32
4409	printk(KERN_INFO
4410		"  task                PC stack   pid father\n");
4411#else
4412	printk(KERN_INFO
4413		"  task                        PC stack   pid father\n");
4414#endif
4415	rcu_read_lock();
4416	do_each_thread(g, p) {
4417		/*
4418		 * Reset the NMI watchdog timeout; listing all tasks on a
4419		 * slow console might take a lot of time:
4420		 */
4421		touch_nmi_watchdog();
4422		if (!state_filter || (p->state & state_filter))
4423			sched_show_task(p);
4424	} while_each_thread(g, p);
4425
4426	touch_all_softlockup_watchdogs();
4427
4428#ifdef CONFIG_SCHED_DEBUG
4429	sysrq_sched_debug_show();
4430#endif
4431	rcu_read_unlock();
4432	/*
4433	 * Only show locks if all tasks are dumped:
4434	 */
4435	if (!state_filter)
4436		debug_show_all_locks();
4437}
4438
4439void init_idle_bootup_task(struct task_struct *idle)
4440{
4441	idle->sched_class = &idle_sched_class;
4442}
4443
4444/**
4445 * init_idle - set up an idle thread for a given CPU
4446 * @idle: task in question
4447 * @cpu: cpu the idle task belongs to
4448 *
4449 * NOTE: this function does not set the idle thread's NEED_RESCHED
4450 * flag, to make booting more robust.
4451 */
4452void init_idle(struct task_struct *idle, int cpu)
4453{
4454	struct rq *rq = cpu_rq(cpu);
4455	unsigned long flags;
4456
4457	raw_spin_lock_irqsave(&rq->lock, flags);
4458
4459	__sched_fork(0, idle);
4460	idle->state = TASK_RUNNING;
4461	idle->se.exec_start = sched_clock();
4462
4463	do_set_cpus_allowed(idle, cpumask_of(cpu));
4464	/*
4465	 * We have a chicken-and-egg problem here: even though we are
4466	 * holding rq->lock, the task's cpu isn't set to this cpu yet, so
4467	 * the lockdep check in task_group() will fail.
4468	 *
4469	 * This is a similar case to sched_fork(). Alternatively we could
4470	 * use task_rq_lock() here and obtain the other rq->lock.
4471	 *
4472	 * Silence PROVE_RCU
4473	 */
4474	rcu_read_lock();
4475	__set_task_cpu(idle, cpu);
4476	rcu_read_unlock();
4477
4478	rq->curr = rq->idle = idle;
4479#if defined(CONFIG_SMP)
4480	idle->on_cpu = 1;
4481#endif
4482	raw_spin_unlock_irqrestore(&rq->lock, flags);
4483
4484	/* Set the preempt count _outside_ the spinlocks! */
4485	init_idle_preempt_count(idle, cpu);
4486
4487	/*
4488	 * The idle tasks have their own, simple scheduling class:
4489	 */
4490	idle->sched_class = &idle_sched_class;
4491	ftrace_graph_init_idle_task(idle, cpu);
4492	vtime_init_idle(idle, cpu);
4493#if defined(CONFIG_SMP)
4494	sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
4495#endif
4496}
4497
4498#ifdef CONFIG_SMP
4499void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
4500{
4501	if (p->sched_class && p->sched_class->set_cpus_allowed)
4502		p->sched_class->set_cpus_allowed(p, new_mask);
4503
4504	cpumask_copy(&p->cpus_allowed, new_mask);
4505	p->nr_cpus_allowed = cpumask_weight(new_mask);
4506}
4507
4508/*
4509 * This is how migration works:
4510 *
4511 * 1) we invoke migration_cpu_stop() on the target CPU using
4512 *    stop_one_cpu().
4513 * 2) stopper starts to run (implicitly forcing the migrated thread
4514 *    off the CPU)
4515 * 3) it checks whether the migrated task is still in the wrong runqueue.
4516 * 4) if it's in the wrong runqueue then the migration thread removes
4517 *    it and puts it into the right queue.
4518 * 5) stopper completes and stop_one_cpu() returns and the migration
4519 *    is done.
4520 */
4521
4522/*
4523 * Change a given task's CPU affinity. Migrate the thread to a
4524 * proper CPU and schedule it away if the CPU it's executing on
4525 * is removed from the allowed bitmask.
4526 *
4527 * NOTE: the caller must have a valid reference to the task; the
4528 * task must not exit() & deallocate itself prematurely. The
4529 * call is not atomic; no spinlocks may be held.
4530 */
4531int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
4532{
4533	unsigned long flags;
4534	struct rq *rq;
4535	unsigned int dest_cpu;
4536	int ret = 0;
4537
4538	rq = task_rq_lock(p, &flags);
4539
4540	if (cpumask_equal(&p->cpus_allowed, new_mask))
4541		goto out;
4542
4543	if (!cpumask_intersects(new_mask, cpu_active_mask)) {
4544		ret = -EINVAL;
4545		goto out;
4546	}
4547
4548	do_set_cpus_allowed(p, new_mask);
4549
4550	/* Can the task run on the task's current CPU? If so, we're done */
4551	if (cpumask_test_cpu(task_cpu(p), new_mask))
4552		goto out;
4553
4554	dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
4555	if (p->on_rq) {
4556		struct migration_arg arg = { p, dest_cpu };
4557		/* Need help from migration thread: drop lock and wait. */
4558		task_rq_unlock(rq, p, &flags);
4559		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
4560		tlb_migrate_finish(p->mm);
4561		return 0;
4562	}
4563out:
4564	task_rq_unlock(rq, p, &flags);
4565
4566	return ret;
4567}
4568EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
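/*
 * Example: a sketch of restricting a kernel thread to a single CPU using
 * the helper above ('tsk' is an illustrative task pointer and CPU 2 is
 * assumed to be in cpu_active_mask):
 *
 *	if (set_cpus_allowed_ptr(tsk, cpumask_of(2)))
 *		pr_warn("could not restrict task to CPU 2\n");
 */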
4569
4570/*
4571 * Move a (non-current) task off this cpu, onto the dest cpu. We're doing
4572 * this because either it can't run here any more (set_cpus_allowed()
4573 * away from this CPU, or CPU going down), or because we're
4574 * attempting to rebalance this task on exec (sched_exec).
4575 *
4576 * So we race with normal scheduler movements, but that's OK, as long
4577 * as the task is no longer on this CPU.
4578 *
4579 * Returns non-zero if task was successfully migrated.
4580 */
4581static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4582{
4583	struct rq *rq_dest, *rq_src;
4584	int ret = 0;
4585
4586	if (unlikely(!cpu_active(dest_cpu)))
4587		return ret;
4588
4589	rq_src = cpu_rq(src_cpu);
4590	rq_dest = cpu_rq(dest_cpu);
4591
4592	raw_spin_lock(&p->pi_lock);
4593	double_rq_lock(rq_src, rq_dest);
4594	/* Already moved. */
4595	if (task_cpu(p) != src_cpu)
4596		goto done;
4597	/* Affinity changed (again). */
4598	if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
4599		goto fail;
4600
4601	/*
4602	 * If we're not on a rq, the next wake-up will ensure we're
4603	 * placed properly.
4604	 */
4605	if (p->on_rq) {
4606		dequeue_task(rq_src, p, 0);
4607		set_task_cpu(p, dest_cpu);
4608		enqueue_task(rq_dest, p, 0);
4609		check_preempt_curr(rq_dest, p, 0);
4610	}
4611done:
4612	ret = 1;
4613fail:
4614	double_rq_unlock(rq_src, rq_dest);
4615	raw_spin_unlock(&p->pi_lock);
4616	return ret;
4617}
4618
4619#ifdef CONFIG_NUMA_BALANCING
4620/* Migrate current task p to target_cpu */
4621int migrate_task_to(struct task_struct *p, int target_cpu)
4622{
4623	struct migration_arg arg = { p, target_cpu };
4624	int curr_cpu = task_cpu(p);
4625
4626	if (curr_cpu == target_cpu)
4627		return 0;
4628
4629	if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p)))
4630		return -EINVAL;
4631
4632	/* TODO: This is not properly updating schedstats */
4633
4634	trace_sched_move_numa(p, curr_cpu, target_cpu);
4635	return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
4636}
4637
4638/*
4639 * Requeue a task on a given node and accurately track the number of NUMA
4640 * tasks on the runqueues
4641 */
4642void sched_setnuma(struct task_struct *p, int nid)
4643{
4644	struct rq *rq;
4645	unsigned long flags;
4646	bool on_rq, running;
4647
4648	rq = task_rq_lock(p, &flags);
4649	on_rq = p->on_rq;
4650	running = task_current(rq, p);
4651
4652	if (on_rq)
4653		dequeue_task(rq, p, 0);
4654	if (running)
4655		p->sched_class->put_prev_task(rq, p);
4656
4657	p->numa_preferred_nid = nid;
4658
4659	if (running)
4660		p->sched_class->set_curr_task(rq);
4661	if (on_rq)
4662		enqueue_task(rq, p, 0);
4663	task_rq_unlock(rq, p, &flags);
4664}
4665#endif
4666
4667/*
4668 * migration_cpu_stop - this will be executed by a highprio stopper thread
4669 * and performs thread migration by bumping the thread off its CPU
4670 * and then 'pushing' it onto another runqueue.
4671 */
4672static int migration_cpu_stop(void *data)
4673{
4674	struct migration_arg *arg = data;
4675
4676	/*
4677	 * The original target cpu might have gone down and we might
4678	 * be on another cpu but it doesn't matter.
4679	 */
4680	local_irq_disable();
4681	__migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
4682	local_irq_enable();
4683	return 0;
4684}
4685
4686#ifdef CONFIG_HOTPLUG_CPU
4687
4688/*
4689 * Ensures that the idle task is using init_mm right before its cpu goes
4690 * offline.
4691 */
4692void idle_task_exit(void)
4693{
4694	struct mm_struct *mm = current->active_mm;
4695
4696	BUG_ON(cpu_online(smp_processor_id()));
4697
4698	if (mm != &init_mm)
4699		switch_mm(mm, &init_mm, current);
4700	mmdrop(mm);
4701}
4702
4703/*
4704 * Since this CPU is going 'away' for a while, fold any nr_active delta
4705 * we might have. Assumes we're called after migrate_tasks() so that the
4706 * nr_active count is stable.
4707 *
4708 * Also see the comment "Global load-average calculations".
4709 */
4710static void calc_load_migrate(struct rq *rq)
4711{
4712	long delta = calc_load_fold_active(rq);
4713	if (delta)
4714		atomic_long_add(delta, &calc_load_tasks);
4715}
4716
4717/*
4718 * Migrate all tasks from the rq; sleeping tasks will be migrated by
4719 * try_to_wake_up()->select_task_rq().
4720 *
4721 * Called with rq->lock held even though we're in stop_machine() and
4722 * there's no concurrency possible; we hold the required locks anyway
4723 * because of lock validation efforts.
4724 */
4725static void migrate_tasks(unsigned int dead_cpu)
4726{
4727	struct rq *rq = cpu_rq(dead_cpu);
4728	struct task_struct *next, *stop = rq->stop;
4729	int dest_cpu;
4730
4731	/*
4732	 * Fudge the rq selection such that the below task selection loop
4733	 * doesn't get stuck on the currently eligible stop task.
4734	 *
4735	 * We're currently inside stop_machine() and the rq is either stuck
4736	 * in the stop_machine_cpu_stop() loop, or we're executing this code;
4737	 * either way we should never end up calling schedule() until we're
4738	 * done here.
4739	 */
4740	rq->stop = NULL;
4741
4742	/*
4743	 * The put_prev_task() and pick_next_task() sched
4744	 * class methods both need an up-to-date
4745	 * value of rq->clock[_task].
4746	 */
4747	update_rq_clock(rq);
4748
4749	for ( ; ; ) {
4750		/*
4751		 * There's this thread running; bail when that's the only
4752		 * remaining thread.
4753		 */
4754		if (rq->nr_running == 1)
4755			break;
4756
4757		next = pick_next_task(rq);
4758		BUG_ON(!next);
4759		next->sched_class->put_prev_task(rq, next);
4760
4761		/* Find suitable destination for @next, with force if needed. */
4762		dest_cpu = select_fallback_rq(dead_cpu, next);
4763		raw_spin_unlock(&rq->lock);
4764
4765		__migrate_task(next, dead_cpu, dest_cpu);
4766
4767		raw_spin_lock(&rq->lock);
4768	}
4769
4770	rq->stop = stop;
4771}
4772
4773#endif /* CONFIG_HOTPLUG_CPU */
4774
4775#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
4776
4777static struct ctl_table sd_ctl_dir[] = {
4778	{
4779		.procname	= "sched_domain",
4780		.mode		= 0555,
4781	},
4782	{}
4783};
4784
4785static struct ctl_table sd_ctl_root[] = {
4786	{
4787		.procname	= "kernel",
4788		.mode		= 0555,
4789		.child		= sd_ctl_dir,
4790	},
4791	{}
4792};
4793
4794static struct ctl_table *sd_alloc_ctl_entry(int n)
4795{
4796	struct ctl_table *entry =
4797		kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
4798
4799	return entry;
4800}
4801
4802static void sd_free_ctl_entry(struct ctl_table **tablep)
4803{
4804	struct ctl_table *entry;
4805
4806	/*
4807	 * In the intermediate directories, both the child directory and
4808	 * procname are dynamically allocated (and their allocation could have
4809	 * failed), but the mode will always be set. In the lowest directory
4810	 * the names are static strings and all entries have proc handlers.
4811	 */
4812	for (entry = *tablep; entry->mode; entry++) {
4813		if (entry->child)
4814			sd_free_ctl_entry(&entry->child);
4815		if (entry->proc_handler == NULL)
4816			kfree(entry->procname);
4817	}
4818
4819	kfree(*tablep);
4820	*tablep = NULL;
4821}
4822
4823static int min_load_idx = 0;
4824static int max_load_idx = CPU_LOAD_IDX_MAX-1;
4825
4826static void
4827set_table_entry(struct ctl_table *entry,
4828		const char *procname, void *data, int maxlen,
4829		umode_t mode, proc_handler *proc_handler,
4830		bool load_idx)
4831{
4832	entry->procname = procname;
4833	entry->data = data;
4834	entry->maxlen = maxlen;
4835	entry->mode = mode;
4836	entry->proc_handler = proc_handler;
4837
4838	if (load_idx) {
4839		entry->extra1 = &min_load_idx;
4840		entry->extra2 = &max_load_idx;
4841	}
4842}
4843
4844static struct ctl_table *
4845sd_alloc_ctl_domain_table(struct sched_domain *sd)
4846{
4847	struct ctl_table *table = sd_alloc_ctl_entry(13);
4848
4849	if (table == NULL)
4850		return NULL;
4851
4852	set_table_entry(&table[0], "min_interval", &sd->min_interval,
4853		sizeof(long), 0644, proc_doulongvec_minmax, false);
4854	set_table_entry(&table[1], "max_interval", &sd->max_interval,
4855		sizeof(long), 0644, proc_doulongvec_minmax, false);
4856	set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
4857		sizeof(int), 0644, proc_dointvec_minmax, true);
4858	set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
4859		sizeof(int), 0644, proc_dointvec_minmax, true);
4860	set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
4861		sizeof(int), 0644, proc_dointvec_minmax, true);
4862	set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
4863		sizeof(int), 0644, proc_dointvec_minmax, true);
4864	set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
4865		sizeof(int), 0644, proc_dointvec_minmax, true);
4866	set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
4867		sizeof(int), 0644, proc_dointvec_minmax, false);
4868	set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
4869		sizeof(int), 0644, proc_dointvec_minmax, false);
4870	set_table_entry(&table[9], "cache_nice_tries",
4871		&sd->cache_nice_tries,
4872		sizeof(int), 0644, proc_dointvec_minmax, false);
4873	set_table_entry(&table[10], "flags", &sd->flags,
4874		sizeof(int), 0644, proc_dointvec_minmax, false);
4875	set_table_entry(&table[11], "name", sd->name,
4876		CORENAME_MAX_SIZE, 0444, proc_dostring, false);
4877	/* &table[12] is terminator */
4878
4879	return table;
4880}
4881
4882static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
4883{
4884	struct ctl_table *entry, *table;
4885	struct sched_domain *sd;
4886	int domain_num = 0, i;
4887	char buf[32];
4888
4889	for_each_domain(cpu, sd)
4890		domain_num++;
4891	entry = table = sd_alloc_ctl_entry(domain_num + 1);
4892	if (table == NULL)
4893		return NULL;
4894
4895	i = 0;
4896	for_each_domain(cpu, sd) {
4897		snprintf(buf, 32, "domain%d", i);
4898		entry->procname = kstrdup(buf, GFP_KERNEL);
4899		entry->mode = 0555;
4900		entry->child = sd_alloc_ctl_domain_table(sd);
4901		entry++;
4902		i++;
4903	}
4904	return table;
4905}
4906
4907static struct ctl_table_header *sd_sysctl_header;
4908static void register_sched_domain_sysctl(void)
4909{
4910	int i, cpu_num = num_possible_cpus();
4911	struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
4912	char buf[32];
4913
4914	WARN_ON(sd_ctl_dir[0].child);
4915	sd_ctl_dir[0].child = entry;
4916
4917	if (entry == NULL)
4918		return;
4919
4920	for_each_possible_cpu(i) {
4921		snprintf(buf, 32, "cpu%d", i);
4922		entry->procname = kstrdup(buf, GFP_KERNEL);
4923		entry->mode = 0555;
4924		entry->child = sd_alloc_ctl_cpu_table(i);
4925		entry++;
4926	}
4927
4928	WARN_ON(sd_sysctl_header);
4929	sd_sysctl_header = register_sysctl_table(sd_ctl_root);
4930}
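/*
 * The tables built above end up under /proc/sys/kernel/sched_domain/ as,
 * for example:
 *
 *	/proc/sys/kernel/sched_domain/cpu0/domain0/min_interval
 *	/proc/sys/kernel/sched_domain/cpu0/domain0/imbalance_pct
 *	/proc/sys/kernel/sched_domain/cpu1/domain1/flags
 *
 * with one cpuN directory per possible CPU and one domainM directory per
 * sched_domain level attached to that CPU.
 */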
4931
4932/* may be called multiple times per register */
4933static void unregister_sched_domain_sysctl(void)
4934{
4935	if (sd_sysctl_header)
4936		unregister_sysctl_table(sd_sysctl_header);
4937	sd_sysctl_header = NULL;
4938	if (sd_ctl_dir[0].child)
4939		sd_free_ctl_entry(&sd_ctl_dir[0].child);
4940}
4941#else
4942static void register_sched_domain_sysctl(void)
4943{
4944}
4945static void unregister_sched_domain_sysctl(void)
4946{
4947}
4948#endif
4949
4950static void set_rq_online(struct rq *rq)
4951{
4952	if (!rq->online) {
4953		const struct sched_class *class;
4954
4955		cpumask_set_cpu(rq->cpu, rq->rd->online);
4956		rq->online = 1;
4957
4958		for_each_class(class) {
4959			if (class->rq_online)
4960				class->rq_online(rq);
4961		}
4962	}
4963}
4964
4965static void set_rq_offline(struct rq *rq)
4966{
4967	if (rq->online) {
4968		const struct sched_class *class;
4969
4970		for_each_class(class) {
4971			if (class->rq_offline)
4972				class->rq_offline(rq);
4973		}
4974
4975		cpumask_clear_cpu(rq->cpu, rq->rd->online);
4976		rq->online = 0;
4977	}
4978}
4979
4980/*
4981 * migration_call - callback that gets triggered when a CPU is added.
4982 * Here we can start up the necessary migration thread for the new CPU.
4983 */
4984static int
4985migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
4986{
4987	int cpu = (long)hcpu;
4988	unsigned long flags;
4989	struct rq *rq = cpu_rq(cpu);
4990
4991	switch (action & ~CPU_TASKS_FROZEN) {
4992
4993	case CPU_UP_PREPARE:
4994		rq->calc_load_update = calc_load_update;
4995		break;
4996
4997	case CPU_ONLINE:
4998		/* Update our root-domain */
4999		raw_spin_lock_irqsave(&rq->lock, flags);
5000		if (rq->rd) {
5001			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5002
5003			set_rq_online(rq);
5004		}
5005		raw_spin_unlock_irqrestore(&rq->lock, flags);
5006		break;
5007
5008#ifdef CONFIG_HOTPLUG_CPU
5009	case CPU_DYING:
5010		sched_ttwu_pending();
5011		/* Update our root-domain */
5012		raw_spin_lock_irqsave(&rq->lock, flags);
5013		if (rq->rd) {
5014			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5015			set_rq_offline(rq);
5016		}
5017		migrate_tasks(cpu);
5018		BUG_ON(rq->nr_running != 1); /* the migration thread */
5019		raw_spin_unlock_irqrestore(&rq->lock, flags);
5020		break;
5021
5022	case CPU_DEAD:
5023		calc_load_migrate(rq);
5024		break;
5025#endif
5026	}
5027
5028	update_max_interval();
5029
5030	return NOTIFY_OK;
5031}
5032
5033/*
5034 * Register at high priority so that task migration (migrate_all_tasks)
5035 * happens before everything else.  This has to be lower priority than
5036 * the notifier in the perf_event subsystem, though.
5037 */
5038static struct notifier_block migration_notifier = {
5039	.notifier_call = migration_call,
5040	.priority = CPU_PRI_MIGRATION,
5041};
5042
5043static int sched_cpu_active(struct notifier_block *nfb,
5044				      unsigned long action, void *hcpu)
5045{
5046	switch (action & ~CPU_TASKS_FROZEN) {
5047	case CPU_STARTING:
5048	case CPU_DOWN_FAILED:
5049		set_cpu_active((long)hcpu, true);
5050		return NOTIFY_OK;
5051	default:
5052		return NOTIFY_DONE;
5053	}
5054}
5055
5056static int sched_cpu_inactive(struct notifier_block *nfb,
5057					unsigned long action, void *hcpu)
5058{
5059	unsigned long flags;
5060	long cpu = (long)hcpu;
5061
5062	switch (action & ~CPU_TASKS_FROZEN) {
5063	case CPU_DOWN_PREPARE:
5064		set_cpu_active(cpu, false);
5065
5066		/* explicitly allow suspend */
5067		if (!(action & CPU_TASKS_FROZEN)) {
5068			struct dl_bw *dl_b = dl_bw_of(cpu);
5069			bool overflow;
5070			int cpus;
5071
5072			raw_spin_lock_irqsave(&dl_b->lock, flags);
5073			cpus = dl_bw_cpus(cpu);
5074			overflow = __dl_overflow(dl_b, cpus, 0, 0);
5075			raw_spin_unlock_irqrestore(&dl_b->lock, flags);
5076
5077			if (overflow)
5078				return notifier_from_errno(-EBUSY);
5079		}
5080		return NOTIFY_OK;
5081	}
5082
5083	return NOTIFY_DONE;
5084}
5085
5086static int __init migration_init(void)
5087{
5088	void *cpu = (void *)(long)smp_processor_id();
5089	int err;
5090
5091	/* Initialize migration for the boot CPU */
5092	err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5093	BUG_ON(err == NOTIFY_BAD);
5094	migration_call(&migration_notifier, CPU_ONLINE, cpu);
5095	register_cpu_notifier(&migration_notifier);
5096
5097	/* Register cpu active notifiers */
5098	cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
5099	cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
5100
5101	return 0;
5102}
5103early_initcall(migration_init);
5104#endif
5105
5106#ifdef CONFIG_SMP
5107
5108static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
5109
5110#ifdef CONFIG_SCHED_DEBUG
5111
5112static __read_mostly int sched_debug_enabled;
5113
5114static int __init sched_debug_setup(char *str)
5115{
5116	sched_debug_enabled = 1;
5117
5118	return 0;
5119}
5120early_param("sched_debug", sched_debug_setup);
5121
5122static inline bool sched_debug(void)
5123{
5124	return sched_debug_enabled;
5125}
5126
5127static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5128				  struct cpumask *groupmask)
5129{
5130	struct sched_group *group = sd->groups;
5131	char str[256];
5132
5133	cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
5134	cpumask_clear(groupmask);
5135
5136	printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
5137
5138	if (!(sd->flags & SD_LOAD_BALANCE)) {
5139		printk("does not load-balance\n");
5140		if (sd->parent)
5141			printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5142					" has parent");
5143		return -1;
5144	}
5145
5146	printk(KERN_CONT "span %s level %s\n", str, sd->name);
5147
5148	if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
5149		printk(KERN_ERR "ERROR: domain->span does not contain "
5150				"CPU%d\n", cpu);
5151	}
5152	if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
5153		printk(KERN_ERR "ERROR: domain->groups does not contain"
5154				" CPU%d\n", cpu);
5155	}
5156
5157	printk(KERN_DEBUG "%*s groups:", level + 1, "");
5158	do {
5159		if (!group) {
5160			printk("\n");
5161			printk(KERN_ERR "ERROR: group is NULL\n");
5162			break;
5163		}
5164
5165		/*
5166		 * Even though we initialize ->power to something semi-sane,
5167		 * we leave power_orig unset. This allows us to detect if
5168		 * domain iteration is still funny without causing /0 traps.
5169		 */
5170		if (!group->sgp->power_orig) {
5171			printk(KERN_CONT "\n");
5172			printk(KERN_ERR "ERROR: domain->cpu_power not "
5173					"set\n");
5174			break;
5175		}
5176
5177		if (!cpumask_weight(sched_group_cpus(group))) {
5178			printk(KERN_CONT "\n");
5179			printk(KERN_ERR "ERROR: empty group\n");
5180			break;
5181		}
5182
5183		if (!(sd->flags & SD_OVERLAP) &&
5184		    cpumask_intersects(groupmask, sched_group_cpus(group))) {
5185			printk(KERN_CONT "\n");
5186			printk(KERN_ERR "ERROR: repeated CPUs\n");
5187			break;
5188		}
5189
5190		cpumask_or(groupmask, groupmask, sched_group_cpus(group));
5191
5192		cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
5193
5194		printk(KERN_CONT " %s", str);
5195		if (group->sgp->power != SCHED_POWER_SCALE) {
5196			printk(KERN_CONT " (cpu_power = %d)",
5197				group->sgp->power);
5198		}
5199
5200		group = group->next;
5201	} while (group != sd->groups);
5202	printk(KERN_CONT "\n");
5203
5204	if (!cpumask_equal(sched_domain_span(sd), groupmask))
5205		printk(KERN_ERR "ERROR: groups don't span domain->span\n");
5206
5207	if (sd->parent &&
5208	    !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
5209		printk(KERN_ERR "ERROR: parent span is not a superset "
5210			"of domain->span\n");
5211	return 0;
5212}
5213
5214static void sched_domain_debug(struct sched_domain *sd, int cpu)
5215{
5216	int level = 0;
5217
5218	if (!sched_debug_enabled)
5219		return;
5220
5221	if (!sd) {
5222		printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
5223		return;
5224	}
5225
5226	printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
5227
5228	for (;;) {
5229		if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
5230			break;
5231		level++;
5232		sd = sd->parent;
5233		if (!sd)
5234			break;
5235	}
5236}
5237#else /* !CONFIG_SCHED_DEBUG */
5238# define sched_domain_debug(sd, cpu) do { } while (0)
5239static inline bool sched_debug(void)
5240{
5241	return false;
5242}
5243#endif /* CONFIG_SCHED_DEBUG */
5244
5245static int sd_degenerate(struct sched_domain *sd)
5246{
5247	if (cpumask_weight(sched_domain_span(sd)) == 1)
5248		return 1;
5249
5250	/* Following flags need at least 2 groups */
5251	if (sd->flags & (SD_LOAD_BALANCE |
5252			 SD_BALANCE_NEWIDLE |
5253			 SD_BALANCE_FORK |
5254			 SD_BALANCE_EXEC |
5255			 SD_SHARE_CPUPOWER |
5256			 SD_SHARE_PKG_RESOURCES)) {
5257		if (sd->groups != sd->groups->next)
5258			return 0;
5259	}
5260
5261	/* Following flags don't use groups */
5262	if (sd->flags & (SD_WAKE_AFFINE))
5263		return 0;
5264
5265	return 1;
5266}
5267
5268static int
5269sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5270{
5271	unsigned long cflags = sd->flags, pflags = parent->flags;
5272
5273	if (sd_degenerate(parent))
5274		return 1;
5275
5276	if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
5277		return 0;
5278
5279	/* Flags needing groups don't count if only 1 group in parent */
5280	if (parent->groups == parent->groups->next) {
5281		pflags &= ~(SD_LOAD_BALANCE |
5282				SD_BALANCE_NEWIDLE |
5283				SD_BALANCE_FORK |
5284				SD_BALANCE_EXEC |
5285				SD_SHARE_CPUPOWER |
5286				SD_SHARE_PKG_RESOURCES |
5287				SD_PREFER_SIBLING);
5288		if (nr_node_ids == 1)
5289			pflags &= ~SD_SERIALIZE;
5290	}
5291	if (~cflags & pflags)
5292		return 0;
5293
5294	return 1;
5295}
5296
5297static void free_rootdomain(struct rcu_head *rcu)
5298{
5299	struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
5300
5301	cpupri_cleanup(&rd->cpupri);
5302	cpudl_cleanup(&rd->cpudl);
5303	free_cpumask_var(rd->dlo_mask);
5304	free_cpumask_var(rd->rto_mask);
5305	free_cpumask_var(rd->online);
5306	free_cpumask_var(rd->span);
5307	kfree(rd);
5308}
5309
5310static void rq_attach_root(struct rq *rq, struct root_domain *rd)
5311{
5312	struct root_domain *old_rd = NULL;
5313	unsigned long flags;
5314
5315	raw_spin_lock_irqsave(&rq->lock, flags);
5316
5317	if (rq->rd) {
5318		old_rd = rq->rd;
5319
5320		if (cpumask_test_cpu(rq->cpu, old_rd->online))
5321			set_rq_offline(rq);
5322
5323		cpumask_clear_cpu(rq->cpu, old_rd->span);
5324
5325		/*
5326		 * If we don't want to free the old_rd yet then
5327		 * set old_rd to NULL to skip the freeing later
5328		 * in this function:
5329		 */
5330		if (!atomic_dec_and_test(&old_rd->refcount))
5331			old_rd = NULL;
5332	}
5333
5334	atomic_inc(&rd->refcount);
5335	rq->rd = rd;
5336
5337	cpumask_set_cpu(rq->cpu, rd->span);
5338	if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
5339		set_rq_online(rq);
5340
5341	raw_spin_unlock_irqrestore(&rq->lock, flags);
5342
5343	if (old_rd)
5344		call_rcu_sched(&old_rd->rcu, free_rootdomain);
5345}
5346
5347static int init_rootdomain(struct root_domain *rd)
5348{
5349	memset(rd, 0, sizeof(*rd));
5350
5351	if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
5352		goto out;
5353	if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
5354		goto free_span;
5355	if (!alloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
5356		goto free_online;
5357	if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
5358		goto free_dlo_mask;
5359
5360	init_dl_bw(&rd->dl_bw);
5361	if (cpudl_init(&rd->cpudl) != 0)
5362		goto free_dlo_mask;
5363
5364	if (cpupri_init(&rd->cpupri) != 0)
5365		goto free_rto_mask;
5366	return 0;
5367
5368free_rto_mask:
5369	free_cpumask_var(rd->rto_mask);
5370free_dlo_mask:
5371	free_cpumask_var(rd->dlo_mask);
5372free_online:
5373	free_cpumask_var(rd->online);
5374free_span:
5375	free_cpumask_var(rd->span);
5376out:
5377	return -ENOMEM;
5378}
5379
5380/*
5381 * By default the system creates a single root-domain with all cpus as
5382 * members (mimicking the global state we have today).
5383 */
5384struct root_domain def_root_domain;
5385
5386static void init_defrootdomain(void)
5387{
5388	init_rootdomain(&def_root_domain);
5389
5390	atomic_set(&def_root_domain.refcount, 1);
5391}
5392
5393static struct root_domain *alloc_rootdomain(void)
5394{
5395	struct root_domain *rd;
5396
5397	rd = kmalloc(sizeof(*rd), GFP_KERNEL);
5398	if (!rd)
5399		return NULL;
5400
5401	if (init_rootdomain(rd) != 0) {
5402		kfree(rd);
5403		return NULL;
5404	}
5405
5406	return rd;
5407}
5408
5409static void free_sched_groups(struct sched_group *sg, int free_sgp)
5410{
5411	struct sched_group *tmp, *first;
5412
5413	if (!sg)
5414		return;
5415
5416	first = sg;
5417	do {
5418		tmp = sg->next;
5419
5420		if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
5421			kfree(sg->sgp);
5422
5423		kfree(sg);
5424		sg = tmp;
5425	} while (sg != first);
5426}
5427
5428static void free_sched_domain(struct rcu_head *rcu)
5429{
5430	struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
5431
5432	/*
5433	 * If it's an overlapping domain it has private groups; iterate and
5434	 * nuke them all.
5435	 */
5436	if (sd->flags & SD_OVERLAP) {
5437		free_sched_groups(sd->groups, 1);
5438	} else if (atomic_dec_and_test(&sd->groups->ref)) {
5439		kfree(sd->groups->sgp);
5440		kfree(sd->groups);
5441	}
5442	kfree(sd);
5443}
5444
5445static void destroy_sched_domain(struct sched_domain *sd, int cpu)
5446{
5447	call_rcu(&sd->rcu, free_sched_domain);
5448}
5449
5450static void destroy_sched_domains(struct sched_domain *sd, int cpu)
5451{
5452	for (; sd; sd = sd->parent)
5453		destroy_sched_domain(sd, cpu);
5454}
5455
5456/*
5457 * Keep a special pointer to the highest sched_domain that has
5458 * SD_SHARE_PKG_RESOURCES set (Last Level Cache Domain); this
5459 * allows us to avoid some pointer chasing in select_idle_sibling().
5460 *
5461 * Also keep a unique ID per domain (we use the first cpu number in
5462 * the cpumask of the domain); this allows us to quickly tell if
5463 * two cpus are in the same cache domain, see cpus_share_cache().
5464 */
5465DEFINE_PER_CPU(struct sched_domain *, sd_llc);
5466DEFINE_PER_CPU(int, sd_llc_size);
5467DEFINE_PER_CPU(int, sd_llc_id);
5468DEFINE_PER_CPU(struct sched_domain *, sd_numa);
5469DEFINE_PER_CPU(struct sched_domain *, sd_busy);
5470DEFINE_PER_CPU(struct sched_domain *, sd_asym);
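/*
 * A sketch of how the cached id is intended to be used: two cpus share
 * their last-level cache iff they report the same sd_llc_id, which is
 * essentially what cpus_share_cache() checks:
 *
 *	share_llc = per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
 */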
5471
5472static void update_top_cache_domain(int cpu)
5473{
5474	struct sched_domain *sd;
5475	struct sched_domain *busy_sd = NULL;
5476	int id = cpu;
5477	int size = 1;
5478
5479	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
5480	if (sd) {
5481		id = cpumask_first(sched_domain_span(sd));
5482		size = cpumask_weight(sched_domain_span(sd));
5483		busy_sd = sd->parent; /* sd_busy */
5484	}
5485	rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd);
5486
5487	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
5488	per_cpu(sd_llc_size, cpu) = size;
5489	per_cpu(sd_llc_id, cpu) = id;
5490
5491	sd = lowest_flag_domain(cpu, SD_NUMA);
5492	rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
5493
5494	sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
5495	rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
5496}
5497
5498/*
5499 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
5500 * hold the hotplug lock.
5501 */
5502static void
5503cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
5504{
5505	struct rq *rq = cpu_rq(cpu);
5506	struct sched_domain *tmp;
5507
5508	/* Remove the sched domains which do not contribute to scheduling. */
5509	for (tmp = sd; tmp; ) {
5510		struct sched_domain *parent = tmp->parent;
5511		if (!parent)
5512			break;
5513
5514		if (sd_parent_degenerate(tmp, parent)) {
5515			tmp->parent = parent->parent;
5516			if (parent->parent)
5517				parent->parent->child = tmp;
5518			/*
5519			 * Transfer SD_PREFER_SIBLING down in case of a
5520			 * degenerate parent; the spans match for this
5521			 * degenerate parent; the spans match in this case,
5522			 */
5523			if (parent->flags & SD_PREFER_SIBLING)
5524				tmp->flags |= SD_PREFER_SIBLING;
5525			destroy_sched_domain(parent, cpu);
5526		} else
5527			tmp = tmp->parent;
5528	}
5529
5530	if (sd && sd_degenerate(sd)) {
5531		tmp = sd;
5532		sd = sd->parent;
5533		destroy_sched_domain(tmp, cpu);
5534		if (sd)
5535			sd->child = NULL;
5536	}
5537
5538	sched_domain_debug(sd, cpu);
5539
5540	rq_attach_root(rq, rd);
5541	tmp = rq->sd;
5542	rcu_assign_pointer(rq->sd, sd);
5543	destroy_sched_domains(tmp, cpu);
5544
5545	update_top_cache_domain(cpu);
5546}
5547
5548/* cpus with isolated domains */
5549static cpumask_var_t cpu_isolated_map;
5550
5551/* Setup the mask of cpus configured for isolated domains */
5552static int __init isolated_cpu_setup(char *str)
5553{
5554	alloc_bootmem_cpumask_var(&cpu_isolated_map);
5555	cpulist_parse(str, cpu_isolated_map);
5556	return 1;
5557}
5558
5559__setup("isolcpus=", isolated_cpu_setup);
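/*
 * For example, booting with "isolcpus=2,3" (or "isolcpus=2-3") keeps CPUs
 * 2 and 3 out of the sched domains built below, so tasks only run there
 * when explicitly affined, e.g. via sched_setaffinity().
 */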
5560
5561static const struct cpumask *cpu_cpu_mask(int cpu)
5562{
5563	return cpumask_of_node(cpu_to_node(cpu));
5564}
5565
5566struct sd_data {
5567	struct sched_domain **__percpu sd;
5568	struct sched_group **__percpu sg;
5569	struct sched_group_power **__percpu sgp;
5570};
5571
5572struct s_data {
5573	struct sched_domain ** __percpu sd;
5574	struct root_domain	*rd;
5575};
5576
5577enum s_alloc {
5578	sa_rootdomain,
5579	sa_sd,
5580	sa_sd_storage,
5581	sa_none,
5582};
5583
5584struct sched_domain_topology_level;
5585
5586typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
5587typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
5588
5589#define SDTL_OVERLAP	0x01
5590
5591struct sched_domain_topology_level {
5592	sched_domain_init_f init;
5593	sched_domain_mask_f mask;
5594	int		    flags;
5595	int		    numa_level;
5596	struct sd_data      data;
5597};
5598
5599/*
5600 * Build an iteration mask that can exclude certain CPUs from the upwards
5601 * domain traversal.
5602 *
5603 * Asymmetric node setups can result in situations where the domain tree is of
5604 * unequal depth; make sure to skip domains that already cover the entire
5605 * range.
5606 *
5607 * In that case build_sched_domains() will have terminated the iteration early
5608 * and our sibling sd spans will be empty. Domains should always include the
5609 * cpu they're built on, so check that.
5610 *
5611 */
5612static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
5613{
5614	const struct cpumask *span = sched_domain_span(sd);
5615	struct sd_data *sdd = sd->private;
5616	struct sched_domain *sibling;
5617	int i;
5618
5619	for_each_cpu(i, span) {
5620		sibling = *per_cpu_ptr(sdd->sd, i);
5621		if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
5622			continue;
5623
5624		cpumask_set_cpu(i, sched_group_mask(sg));
5625	}
5626}
5627
5628/*
5629 * Return the canonical balance cpu for this group; this is the first cpu
5630 * of this group that's also in the iteration mask.
5631 */
5632int group_balance_cpu(struct sched_group *sg)
5633{
5634	return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
5635}
5636
5637static int
5638build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5639{
5640	struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
5641	const struct cpumask *span = sched_domain_span(sd);
5642	struct cpumask *covered = sched_domains_tmpmask;
5643	struct sd_data *sdd = sd->private;
5644	struct sched_domain *child;
5645	int i;
5646
5647	cpumask_clear(covered);
5648
5649	for_each_cpu(i, span) {
5650		struct cpumask *sg_span;
5651
5652		if (cpumask_test_cpu(i, covered))
5653			continue;
5654
5655		child = *per_cpu_ptr(sdd->sd, i);
5656
5657		/* See the comment near build_group_mask(). */
5658		if (!cpumask_test_cpu(i, sched_domain_span(child)))
5659			continue;
5660
5661		sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
5662				GFP_KERNEL, cpu_to_node(cpu));
5663
5664		if (!sg)
5665			goto fail;
5666
5667		sg_span = sched_group_cpus(sg);
5668		if (child->child) {
5669			child = child->child;
5670			cpumask_copy(sg_span, sched_domain_span(child));
5671		} else
5672			cpumask_set_cpu(i, sg_span);
5673
5674		cpumask_or(covered, covered, sg_span);
5675
5676		sg->sgp = *per_cpu_ptr(sdd->sgp, i);
5677		if (atomic_inc_return(&sg->sgp->ref) == 1)
5678			build_group_mask(sd, sg);
5679
5680		/*
5681		 * Initialize sgp->power such that even if we mess up the
5682		 * domains and no possible iteration will get us here, we won't
5683		 * die on a /0 trap.
5684		 */
5685		sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
5686		sg->sgp->power_orig = sg->sgp->power;
5687
5688		/*
5689		 * Make sure the first group of this domain contains the
5690		 * canonical balance cpu. Otherwise the sched_domain iteration
5691		 * breaks. See update_sg_lb_stats().
5692		 */
5693		if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
5694		    group_balance_cpu(sg) == cpu)
5695			groups = sg;
5696
5697		if (!first)
5698			first = sg;
5699		if (last)
5700			last->next = sg;
5701		last = sg;
5702		last->next = first;
5703	}
5704	sd->groups = groups;
5705
5706	return 0;
5707
5708fail:
5709	free_sched_groups(first, 0);
5710
5711	return -ENOMEM;
5712}
5713
5714static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
5715{
5716	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
5717	struct sched_domain *child = sd->child;
5718
5719	if (child)
5720		cpu = cpumask_first(sched_domain_span(child));
5721
5722	if (sg) {
5723		*sg = *per_cpu_ptr(sdd->sg, cpu);
5724		(*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
5725		atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */
5726	}
5727
5728	return cpu;
5729}
5730
5731/*
5732 * build_sched_groups will build a circular linked list of the groups
5733 * covered by the given span, set each group's ->cpumask correctly, and
5734 * initialize each group's ->cpu_power to 0.
5735 *
5736 * Assumes the sched_domain tree is fully constructed
5737 */
5738static int
5739build_sched_groups(struct sched_domain *sd, int cpu)
5740{
5741	struct sched_group *first = NULL, *last = NULL;
5742	struct sd_data *sdd = sd->private;
5743	const struct cpumask *span = sched_domain_span(sd);
5744	struct cpumask *covered;
5745	int i;
5746
5747	get_group(cpu, sdd, &sd->groups);
5748	atomic_inc(&sd->groups->ref);
5749
5750	if (cpu != cpumask_first(span))
5751		return 0;
5752
5753	lockdep_assert_held(&sched_domains_mutex);
5754	covered = sched_domains_tmpmask;
5755
5756	cpumask_clear(covered);
5757
5758	for_each_cpu(i, span) {
5759		struct sched_group *sg;
5760		int group, j;
5761
5762		if (cpumask_test_cpu(i, covered))
5763			continue;
5764
5765		group = get_group(i, sdd, &sg);
5766		cpumask_clear(sched_group_cpus(sg));
5767		sg->sgp->power = 0;
5768		cpumask_setall(sched_group_mask(sg));
5769
5770		for_each_cpu(j, span) {
5771			if (get_group(j, sdd, NULL) != group)
5772				continue;
5773
5774			cpumask_set_cpu(j, covered);
5775			cpumask_set_cpu(j, sched_group_cpus(sg));
5776		}
5777
5778		if (!first)
5779			first = sg;
5780		if (last)
5781			last->next = sg;
5782		last = sg;
5783	}
5784	last->next = first;
5785
5786	return 0;
5787}
5788
5789/*
5790 * Initialize sched groups cpu_power.
5791 *
5792 * cpu_power indicates the capacity of sched group, which is used while
5793 * distributing the load between different sched groups in a sched domain.
5794 * Typically cpu_power for all the groups in a sched domain will be same unless
5795 * there are asymmetries in the topology. If there are asymmetries, group
5796 * having more cpu_power will pickup more load compared to the group having
5797 * less cpu_power.
5798 */
5799static void init_sched_groups_power(int cpu, struct sched_domain *sd)
5800{
5801	struct sched_group *sg = sd->groups;
5802
5803	WARN_ON(!sg);
5804
5805	do {
5806		sg->group_weight = cpumask_weight(sched_group_cpus(sg));
5807		sg = sg->next;
5808	} while (sg != sd->groups);
5809
5810	if (cpu != group_balance_cpu(sg))
5811		return;
5812
5813	update_group_power(sd, cpu);
5814	atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
5815}
5816
5817int __weak arch_sd_sibling_asym_packing(void)
5818{
5819       return 0*SD_ASYM_PACKING;
5820}
5821
5822/*
5823 * Initializers for schedule domains
5824 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
5825 */
5826
5827#ifdef CONFIG_SCHED_DEBUG
5828# define SD_INIT_NAME(sd, type)		sd->name = #type
5829#else
5830# define SD_INIT_NAME(sd, type)		do { } while (0)
5831#endif
5832
5833#define SD_INIT_FUNC(type)						\
5834static noinline struct sched_domain *					\
5835sd_init_##type(struct sched_domain_topology_level *tl, int cpu) 	\
5836{									\
5837	struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);	\
5838	*sd = SD_##type##_INIT;						\
5839	SD_INIT_NAME(sd, type);						\
5840	sd->private = &tl->data;					\
5841	return sd;							\
5842}
5843
5844SD_INIT_FUNC(CPU)
5845#ifdef CONFIG_SCHED_SMT
5846 SD_INIT_FUNC(SIBLING)
5847#endif
5848#ifdef CONFIG_SCHED_MC
5849 SD_INIT_FUNC(MC)
5850#endif
5851#ifdef CONFIG_SCHED_BOOK
5852 SD_INIT_FUNC(BOOK)
5853#endif
5854
5855static int default_relax_domain_level = -1;
5856int sched_domain_level_max;
5857
5858static int __init setup_relax_domain_level(char *str)
5859{
5860	if (kstrtoint(str, 0, &default_relax_domain_level))
5861		pr_warn("Unable to set relax_domain_level\n");
5862
5863	return 1;
5864}
5865__setup("relax_domain_level=", setup_relax_domain_level);
5866
5867static void set_domain_attribute(struct sched_domain *sd,
5868				 struct sched_domain_attr *attr)
5869{
5870	int request;
5871
5872	if (!attr || attr->relax_domain_level < 0) {
5873		if (default_relax_domain_level < 0)
5874			return;
5875		else
5876			request = default_relax_domain_level;
5877	} else
5878		request = attr->relax_domain_level;
5879	if (request < sd->level) {
5880		/* turn off idle balance on this domain */
5881		sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
5882	} else {
5883		/* turn on idle balance on this domain */
5884		sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
5885	}
5886}
5887
5888static void __sdt_free(const struct cpumask *cpu_map);
5889static int __sdt_alloc(const struct cpumask *cpu_map);
5890
5891static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
5892				 const struct cpumask *cpu_map)
5893{
5894	switch (what) {
5895	case sa_rootdomain:
5896		if (!atomic_read(&d->rd->refcount))
5897			free_rootdomain(&d->rd->rcu); /* fall through */
5898	case sa_sd:
5899		free_percpu(d->sd); /* fall through */
5900	case sa_sd_storage:
5901		__sdt_free(cpu_map); /* fall through */
5902	case sa_none:
5903		break;
5904	}
5905}
5906
5907static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
5908						   const struct cpumask *cpu_map)
5909{
5910	memset(d, 0, sizeof(*d));
5911
5912	if (__sdt_alloc(cpu_map))
5913		return sa_sd_storage;
5914	d->sd = alloc_percpu(struct sched_domain *);
5915	if (!d->sd)
5916		return sa_sd_storage;
5917	d->rd = alloc_rootdomain();
5918	if (!d->rd)
5919		return sa_sd;
5920	return sa_rootdomain;
5921}
5922
5923/*
5924 * NULL the sd_data elements we've used to build the sched_domain and
5925 * sched_group structure so that the subsequent __free_domain_allocs()
5926 * will not free the data we're using.
5927 */
5928static void claim_allocations(int cpu, struct sched_domain *sd)
5929{
5930	struct sd_data *sdd = sd->private;
5931
5932	WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
5933	*per_cpu_ptr(sdd->sd, cpu) = NULL;
5934
5935	if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
5936		*per_cpu_ptr(sdd->sg, cpu) = NULL;
5937
5938	if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
5939		*per_cpu_ptr(sdd->sgp, cpu) = NULL;
5940}
5941
5942#ifdef CONFIG_SCHED_SMT
5943static const struct cpumask *cpu_smt_mask(int cpu)
5944{
5945	return topology_thread_cpumask(cpu);
5946}
5947#endif
5948
5949/*
5950 * Topology list, bottom-up.
5951 */
5952static struct sched_domain_topology_level default_topology[] = {
5953#ifdef CONFIG_SCHED_SMT
5954	{ sd_init_SIBLING, cpu_smt_mask, },
5955#endif
5956#ifdef CONFIG_SCHED_MC
5957	{ sd_init_MC, cpu_coregroup_mask, },
5958#endif
5959#ifdef CONFIG_SCHED_BOOK
5960	{ sd_init_BOOK, cpu_book_mask, },
5961#endif
5962	{ sd_init_CPU, cpu_cpu_mask, },
5963	{ NULL, },
5964};
5965
5966static struct sched_domain_topology_level *sched_domain_topology = default_topology;
5967
5968#define for_each_sd_topology(tl)			\
5969	for (tl = sched_domain_topology; tl->init; tl++)
5970
5971#ifdef CONFIG_NUMA
5972
5973static int sched_domains_numa_levels;
5974static int *sched_domains_numa_distance;
5975static struct cpumask ***sched_domains_numa_masks;
5976static int sched_domains_curr_level;
5977
5978static inline int sd_local_flags(int level)
5979{
5980	if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE)
5981		return 0;
5982
5983	return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
5984}
5985
5986static struct sched_domain *
5987sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
5988{
5989	struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
5990	int level = tl->numa_level;
5991	int sd_weight = cpumask_weight(
5992			sched_domains_numa_masks[level][cpu_to_node(cpu)]);
5993
5994	*sd = (struct sched_domain){
5995		.min_interval		= sd_weight,
5996		.max_interval		= 2*sd_weight,
5997		.busy_factor		= 32,
5998		.imbalance_pct		= 125,
5999		.cache_nice_tries	= 2,
6000		.busy_idx		= 3,
6001		.idle_idx		= 2,
6002		.newidle_idx		= 0,
6003		.wake_idx		= 0,
6004		.forkexec_idx		= 0,
6005
6006		.flags			= 1*SD_LOAD_BALANCE
6007					| 1*SD_BALANCE_NEWIDLE
6008					| 0*SD_BALANCE_EXEC
6009					| 0*SD_BALANCE_FORK
6010					| 0*SD_BALANCE_WAKE
6011					| 0*SD_WAKE_AFFINE
6012					| 0*SD_SHARE_CPUPOWER
6013					| 0*SD_SHARE_PKG_RESOURCES
6014					| 1*SD_SERIALIZE
6015					| 0*SD_PREFER_SIBLING
6016					| 1*SD_NUMA
6017					| sd_local_flags(level)
6018					,
6019		.last_balance		= jiffies,
6020		.balance_interval	= sd_weight,
6021	};
6022	SD_INIT_NAME(sd, NUMA);
6023	sd->private = &tl->data;
6024
6025	/*
6026	 * Ugly hack to pass state to sd_numa_mask()...
6027	 */
6028	sched_domains_curr_level = tl->numa_level;
6029
6030	return sd;
6031}
6032
6033static const struct cpumask *sd_numa_mask(int cpu)
6034{
6035	return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
6036}
6037
6038static void sched_numa_warn(const char *str)
6039{
6040	static int done = false;
6041	int i,j;
6042
6043	if (done)
6044		return;
6045
6046	done = true;
6047
6048	printk(KERN_WARNING "ERROR: %s\n\n", str);
6049
6050	for (i = 0; i < nr_node_ids; i++) {
6051		printk(KERN_WARNING "  ");
6052		for (j = 0; j < nr_node_ids; j++)
6053			printk(KERN_CONT "%02d ", node_distance(i,j));
6054		printk(KERN_CONT "\n");
6055	}
6056	printk(KERN_WARNING "\n");
6057}
6058
6059static bool find_numa_distance(int distance)
6060{
6061	int i;
6062
6063	if (distance == node_distance(0, 0))
6064		return true;
6065
6066	for (i = 0; i < sched_domains_numa_levels; i++) {
6067		if (sched_domains_numa_distance[i] == distance)
6068			return true;
6069	}
6070
6071	return false;
6072}
6073
6074static void sched_init_numa(void)
6075{
6076	int next_distance, curr_distance = node_distance(0, 0);
6077	struct sched_domain_topology_level *tl;
6078	int level = 0;
6079	int i, j, k;
6080
6081	sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
6082	if (!sched_domains_numa_distance)
6083		return;
6084
6085	/*
6086	 * O(nr_nodes^2) deduplicating selection sort -- in order to find the
6087	 * unique distances in the node_distance() table.
6088	 *
6089	 * Assumes node_distance(0,j) includes all distances in
6090	 * node_distance(i,j) in order to avoid cubic time.
6091	 */
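	/*
	 * For example, on a hypothetical 4-node box with the (symmetric)
	 * distance table
	 *
	 *	10 20 20 30
	 *	20 10 20 20
	 *	20 20 10 20
	 *	30 20 20 10
	 *
	 * the loop below finds the unique non-local distances {20, 30}, so
	 * two NUMA levels get appended to the default topology further down.
	 */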
6092	next_distance = curr_distance;
6093	for (i = 0; i < nr_node_ids; i++) {
6094		for (j = 0; j < nr_node_ids; j++) {
6095			for (k = 0; k < nr_node_ids; k++) {
6096				int distance = node_distance(i, k);
6097
6098				if (distance > curr_distance &&
6099				    (distance < next_distance ||
6100				     next_distance == curr_distance))
6101					next_distance = distance;
6102
6103				/*
6104				 * While not a strong assumption, it would be nice to
6105				 * know about cases where node A is connected to B but
6106				 * B is not equally connected to A.
6107				 */
6108				if (sched_debug() && node_distance(k, i) != distance)
6109					sched_numa_warn("Node-distance not symmetric");
6110
6111				if (sched_debug() && i && !find_numa_distance(distance))
6112					sched_numa_warn("Node-0 not representative");
6113			}
6114			if (next_distance != curr_distance) {
6115				sched_domains_numa_distance[level++] = next_distance;
6116				sched_domains_numa_levels = level;
6117				curr_distance = next_distance;
6118			} else break;
6119		}
6120
6121		/*
6122		 * In case of sched_debug() we verify the above assumption.
6123		 */
6124		if (!sched_debug())
6125			break;
6126	}
6127	/*
6128	 * 'level' contains the number of unique distances, excluding the
6129	 * identity distance node_distance(i,i).
6130	 *
6131	 * The sched_domains_numa_distance[] array includes the actual distance
6132	 * numbers.
6133	 */
6134
6135	/*
6136	 * Temporarily reset sched_domains_numa_levels to 0 here. If the
6137	 * allocation of the sched_domains_numa_masks[][] array below fails,
6138	 * the array will contain fewer than 'level' members. That would be
6139	 * dangerous for the other functions that iterate over
6140	 * sched_domains_numa_masks[][].
6141	 *
6142	 * We reset it to 'level' at the end of this function.
6143	 */
6144	sched_domains_numa_levels = 0;
6145
6146	sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
6147	if (!sched_domains_numa_masks)
6148		return;
6149
6150	/*
6151	 * Now for each level, construct a mask per node which contains all
6152	 * cpus of nodes that are that many hops away from us.
6153	 */
6154	for (i = 0; i < level; i++) {
6155		sched_domains_numa_masks[i] =
6156			kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
6157		if (!sched_domains_numa_masks[i])
6158			return;
6159
6160		for (j = 0; j < nr_node_ids; j++) {
6161			struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
6162			if (!mask)
6163				return;
6164
6165			sched_domains_numa_masks[i][j] = mask;
6166
6167			for (k = 0; k < nr_node_ids; k++) {
6168				if (node_distance(j, k) > sched_domains_numa_distance[i])
6169					continue;
6170
6171				cpumask_or(mask, mask, cpumask_of_node(k));
6172			}
6173		}
6174	}
6175
6176	tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
6177			sizeof(struct sched_domain_topology_level), GFP_KERNEL);
6178	if (!tl)
6179		return;
6180
6181	/*
6182	 * Copy the default topology bits..
6183	 */
6184	for (i = 0; default_topology[i].init; i++)
6185		tl[i] = default_topology[i];
6186
6187	/*
6188	 * .. and append 'level' levels of NUMA goodness.
6189	 */
6190	for (j = 0; j < level; i++, j++) {
6191		tl[i] = (struct sched_domain_topology_level){
6192			.init = sd_numa_init,
6193			.mask = sd_numa_mask,
6194			.flags = SDTL_OVERLAP,
6195			.numa_level = j,
6196		};
6197	}
6198
6199	sched_domain_topology = tl;
6200
6201	sched_domains_numa_levels = level;
6202}
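
/*
 * Worked example (hypothetical numbers): consider a four-node machine
 * with this node_distance() table:
 *
 *   10 20 20 30
 *   20 10 30 20
 *   20 30 10 20
 *   30 20 20 10
 *
 * The deduplicating pass above finds the unique non-local distances
 * {20, 30}, so level == 2.  sched_domains_numa_masks[0][n] then spans
 * the CPUs of all nodes within distance 20 of node n, while
 * sched_domains_numa_masks[1][n] spans nodes within distance 30, i.e.
 * the whole machine, and two NUMA topology levels are appended to the
 * default topology.
 */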
6203
6204static void sched_domains_numa_masks_set(int cpu)
6205{
6206	int i, j;
6207	int node = cpu_to_node(cpu);
6208
6209	for (i = 0; i < sched_domains_numa_levels; i++) {
6210		for (j = 0; j < nr_node_ids; j++) {
6211			if (node_distance(j, node) <= sched_domains_numa_distance[i])
6212				cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
6213		}
6214	}
6215}
6216
6217static void sched_domains_numa_masks_clear(int cpu)
6218{
6219	int i, j;
6220	for (i = 0; i < sched_domains_numa_levels; i++) {
6221		for (j = 0; j < nr_node_ids; j++)
6222			cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
6223	}
6224}
6225
6226/*
6227 * Update sched_domains_numa_masks[level][node] array when new cpus
6228 * are onlined.
6229 */
6230static int sched_domains_numa_masks_update(struct notifier_block *nfb,
6231					   unsigned long action,
6232					   void *hcpu)
6233{
6234	int cpu = (long)hcpu;
6235
6236	switch (action & ~CPU_TASKS_FROZEN) {
6237	case CPU_ONLINE:
6238		sched_domains_numa_masks_set(cpu);
6239		break;
6240
6241	case CPU_DEAD:
6242		sched_domains_numa_masks_clear(cpu);
6243		break;
6244
6245	default:
6246		return NOTIFY_DONE;
6247	}
6248
6249	return NOTIFY_OK;
6250}
6251#else
6252static inline void sched_init_numa(void)
6253{
6254}
6255
6256static int sched_domains_numa_masks_update(struct notifier_block *nfb,
6257					   unsigned long action,
6258					   void *hcpu)
6259{
6260	return 0;
6261}
6262#endif /* CONFIG_NUMA */
6263
6264static int __sdt_alloc(const struct cpumask *cpu_map)
6265{
6266	struct sched_domain_topology_level *tl;
6267	int j;
6268
6269	for_each_sd_topology(tl) {
6270		struct sd_data *sdd = &tl->data;
6271
6272		sdd->sd = alloc_percpu(struct sched_domain *);
6273		if (!sdd->sd)
6274			return -ENOMEM;
6275
6276		sdd->sg = alloc_percpu(struct sched_group *);
6277		if (!sdd->sg)
6278			return -ENOMEM;
6279
6280		sdd->sgp = alloc_percpu(struct sched_group_power *);
6281		if (!sdd->sgp)
6282			return -ENOMEM;
6283
6284		for_each_cpu(j, cpu_map) {
6285			struct sched_domain *sd;
6286			struct sched_group *sg;
6287			struct sched_group_power *sgp;
6288
6289			sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
6290					GFP_KERNEL, cpu_to_node(j));
6291			if (!sd)
6292				return -ENOMEM;
6293
6294			*per_cpu_ptr(sdd->sd, j) = sd;
6295
6296			sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
6297					GFP_KERNEL, cpu_to_node(j));
6298			if (!sg)
6299				return -ENOMEM;
6300
6301			sg->next = sg;
6302
6303			*per_cpu_ptr(sdd->sg, j) = sg;
6304
6305			sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(),
6306					GFP_KERNEL, cpu_to_node(j));
6307			if (!sgp)
6308				return -ENOMEM;
6309
6310			*per_cpu_ptr(sdd->sgp, j) = sgp;
6311		}
6312	}
6313
6314	return 0;
6315}
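
/*
 * Note (summary of the allocation scheme above): after __sdt_alloc(),
 * each topology level's sd_data holds three per-cpu pointer arrays
 * (sd, sg, sgp), and every CPU in cpu_map owns a zeroed sched_domain,
 * sched_group and sched_group_power allocated on its own node.
 * claim_allocations() in build_sched_domains() later NULLs the per-cpu
 * pointers of the instances actually wired into the domain tree, so
 * that __sdt_free() only releases the unused leftovers.
 */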
6316
6317static void __sdt_free(const struct cpumask *cpu_map)
6318{
6319	struct sched_domain_topology_level *tl;
6320	int j;
6321
6322	for_each_sd_topology(tl) {
6323		struct sd_data *sdd = &tl->data;
6324
6325		for_each_cpu(j, cpu_map) {
6326			struct sched_domain *sd;
6327
6328			if (sdd->sd) {
6329				sd = *per_cpu_ptr(sdd->sd, j);
6330				if (sd && (sd->flags & SD_OVERLAP))
6331					free_sched_groups(sd->groups, 0);
6332				kfree(*per_cpu_ptr(sdd->sd, j));
6333			}
6334
6335			if (sdd->sg)
6336				kfree(*per_cpu_ptr(sdd->sg, j));
6337			if (sdd->sgp)
6338				kfree(*per_cpu_ptr(sdd->sgp, j));
6339		}
6340		free_percpu(sdd->sd);
6341		sdd->sd = NULL;
6342		free_percpu(sdd->sg);
6343		sdd->sg = NULL;
6344		free_percpu(sdd->sgp);
6345		sdd->sgp = NULL;
6346	}
6347}
6348
6349struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6350		const struct cpumask *cpu_map, struct sched_domain_attr *attr,
6351		struct sched_domain *child, int cpu)
6352{
6353	struct sched_domain *sd = tl->init(tl, cpu);
6354	if (!sd)
6355		return child;
6356
6357	cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
6358	if (child) {
6359		sd->level = child->level + 1;
6360		sched_domain_level_max = max(sched_domain_level_max, sd->level);
6361		child->parent = sd;
6362		sd->child = child;
6363	}
6364	set_domain_attribute(sd, attr);
6365
6366	return sd;
6367}
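
/*
 * Illustrative only: for a CPU in a hypothetical two-socket NUMA box
 * with SMT, walking the topology levels through build_sched_domain()
 * yields a parent/child chain along the lines of
 *
 *   SMT (siblings) -> MC (socket) -> NUMA (whole machine)
 *
 * where each domain's span is the intersection of cpu_map with that
 * level's mask and ->level increases towards the root.  The exact set
 * of levels depends on default_topology[] and the machine at hand.
 */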
6368
6369/*
6370 * Build sched domains for a given set of cpus and attach the sched domains
6371 * to the individual cpus
6372 */
6373static int build_sched_domains(const struct cpumask *cpu_map,
6374			       struct sched_domain_attr *attr)
6375{
6376	enum s_alloc alloc_state;
6377	struct sched_domain *sd;
6378	struct s_data d;
6379	int i, ret = -ENOMEM;
6380
6381	alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
6382	if (alloc_state != sa_rootdomain)
6383		goto error;
6384
6385	/* Set up domains for cpus specified by the cpu_map. */
6386	for_each_cpu(i, cpu_map) {
6387		struct sched_domain_topology_level *tl;
6388
6389		sd = NULL;
6390		for_each_sd_topology(tl) {
6391			sd = build_sched_domain(tl, cpu_map, attr, sd, i);
6392			if (tl == sched_domain_topology)
6393				*per_cpu_ptr(d.sd, i) = sd;
6394			if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
6395				sd->flags |= SD_OVERLAP;
6396			if (cpumask_equal(cpu_map, sched_domain_span(sd)))
6397				break;
6398		}
6399	}
6400
6401	/* Build the groups for the domains */
6402	for_each_cpu(i, cpu_map) {
6403		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
6404			sd->span_weight = cpumask_weight(sched_domain_span(sd));
6405			if (sd->flags & SD_OVERLAP) {
6406				if (build_overlap_sched_groups(sd, i))
6407					goto error;
6408			} else {
6409				if (build_sched_groups(sd, i))
6410					goto error;
6411			}
6412		}
6413	}
6414
6415	/* Calculate CPU power for physical packages and nodes */
6416	for (i = nr_cpumask_bits-1; i >= 0; i--) {
6417		if (!cpumask_test_cpu(i, cpu_map))
6418			continue;
6419
6420		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
6421			claim_allocations(i, sd);
6422			init_sched_groups_power(i, sd);
6423		}
6424	}
6425
6426	/* Attach the domains */
6427	rcu_read_lock();
6428	for_each_cpu(i, cpu_map) {
6429		sd = *per_cpu_ptr(d.sd, i);
6430		cpu_attach_domain(sd, d.rd, i);
6431	}
6432	rcu_read_unlock();
6433
6434	ret = 0;
6435error:
6436	__free_domain_allocs(&d, alloc_state, cpu_map);
6437	return ret;
6438}
6439
6440static cpumask_var_t *doms_cur;	/* current sched domains */
6441static int ndoms_cur;		/* number of sched domains in 'doms_cur' */
6442static struct sched_domain_attr *dattr_cur;
6443				/* attributes of custom domains in 'doms_cur' */
6444
6445/*
6446 * Special case: If a kmalloc of a doms_cur partition (array of
6447 * cpumask) fails, then fallback to a single sched domain,
6448 * as determined by the single cpumask fallback_doms.
6449 */
6450static cpumask_var_t fallback_doms;
6451
6452/*
6453 * arch_update_cpu_topology lets virtualized architectures update the
6454 * cpu core maps. It is supposed to return 1 if the topology changed
6455 * or 0 if it stayed the same.
6456 */
6457int __attribute__((weak)) arch_update_cpu_topology(void)
6458{
6459	return 0;
6460}
6461
6462cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
6463{
6464	int i;
6465	cpumask_var_t *doms;
6466
6467	doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
6468	if (!doms)
6469		return NULL;
6470	for (i = 0; i < ndoms; i++) {
6471		if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
6472			free_sched_domains(doms, i);
6473			return NULL;
6474		}
6475	}
6476	return doms;
6477}
6478
6479void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
6480{
6481	unsigned int i;
6482	for (i = 0; i < ndoms; i++)
6483		free_cpumask_var(doms[i]);
6484	kfree(doms);
6485}
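
/*
 * Usage sketch (illustrative only): a caller building two exclusive
 * partitions would do roughly:
 *
 *	cpumask_var_t *doms = alloc_sched_domains(2);
 *
 *	if (doms) {
 *		... fill doms[0] and doms[1] with non-overlapping masks ...
 *		partition_sched_domains(2, doms, NULL);
 *	}
 *
 * partition_sched_domains() takes ownership of 'doms' and will free it
 * with free_sched_domains() once it is replaced; on allocation failure
 * the caller may instead pass doms_new == NULL && ndoms_new == 1 (see
 * the comment above partition_sched_domains() below).
 */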
6486
6487/*
6488 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
6489 * For now this just excludes isolated cpus, but could be used to
6490 * exclude other special cases in the future.
6491 */
6492static int init_sched_domains(const struct cpumask *cpu_map)
6493{
6494	int err;
6495
6496	arch_update_cpu_topology();
6497	ndoms_cur = 1;
6498	doms_cur = alloc_sched_domains(ndoms_cur);
6499	if (!doms_cur)
6500		doms_cur = &fallback_doms;
6501	cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
6502	err = build_sched_domains(doms_cur[0], NULL);
6503	register_sched_domain_sysctl();
6504
6505	return err;
6506}
6507
6508/*
6509 * Detach sched domains from a group of cpus specified in cpu_map
6510 * These cpus will now be attached to the NULL domain
6511 */
6512static void detach_destroy_domains(const struct cpumask *cpu_map)
6513{
6514	int i;
6515
6516	rcu_read_lock();
6517	for_each_cpu(i, cpu_map)
6518		cpu_attach_domain(NULL, &def_root_domain, i);
6519	rcu_read_unlock();
6520}
6521
6522/* handle null as "default" */
6523static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
6524			struct sched_domain_attr *new, int idx_new)
6525{
6526	struct sched_domain_attr tmp;
6527
6528	/* fast path */
6529	if (!new && !cur)
6530		return 1;
6531
6532	tmp = SD_ATTR_INIT;
6533	return !memcmp(cur ? (cur + idx_cur) : &tmp,
6534			new ? (new + idx_new) : &tmp,
6535			sizeof(struct sched_domain_attr));
6536}
6537
6538/*
6539 * Partition sched domains as specified by the 'ndoms_new'
6540 * cpumasks in the array doms_new[] of cpumasks. This compares
6541 * doms_new[] to the current sched domain partitioning, doms_cur[].
6542 * It destroys each deleted domain and builds each new domain.
6543 *
6544 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
6545 * The masks don't intersect (don't overlap). We set up one
6546 * sched domain for each mask. CPUs not in any of the cpumasks will
6547 * not be load balanced. If the same cpumask appears both in the
6548 * current 'doms_cur' domains and in the new 'doms_new', we can leave
6549 * it as it is.
6550 *
6551 * The passed-in 'doms_new' should be allocated using
6552 * alloc_sched_domains().  This routine takes ownership of it and will
6553 * free_sched_domains() it when done with it. If the caller failed the
6554 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
6555 * and partition_sched_domains() will fall back to the single partition
6556 * 'fallback_doms'; this also forces the domains to be rebuilt.
6557 *
6558 * If doms_new == NULL it will be replaced with cpu_online_mask.
6559 * ndoms_new == 0 is a special case for destroying existing domains,
6560 * and it will not create the default domain.
6561 *
6562 * Call with hotplug lock held
6563 */
6564void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
6565			     struct sched_domain_attr *dattr_new)
6566{
6567	int i, j, n;
6568	int new_topology;
6569
6570	mutex_lock(&sched_domains_mutex);
6571
6572	/* always unregister in case we don't destroy any domains */
6573	unregister_sched_domain_sysctl();
6574
6575	/* Let architecture update cpu core mappings. */
6576	new_topology = arch_update_cpu_topology();
6577
6578	n = doms_new ? ndoms_new : 0;
6579
6580	/* Destroy deleted domains */
6581	for (i = 0; i < ndoms_cur; i++) {
6582		for (j = 0; j < n && !new_topology; j++) {
6583			if (cpumask_equal(doms_cur[i], doms_new[j])
6584			    && dattrs_equal(dattr_cur, i, dattr_new, j))
6585				goto match1;
6586		}
6587		/* no match - a current sched domain not in new doms_new[] */
6588		detach_destroy_domains(doms_cur[i]);
6589match1:
6590		;
6591	}
6592
6593	n = ndoms_cur;
6594	if (doms_new == NULL) {
6595		n = 0;
6596		doms_new = &fallback_doms;
6597		cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
6598		WARN_ON_ONCE(dattr_new);
6599	}
6600
6601	/* Build new domains */
6602	for (i = 0; i < ndoms_new; i++) {
6603		for (j = 0; j < n && !new_topology; j++) {
6604			if (cpumask_equal(doms_new[i], doms_cur[j])
6605			    && dattrs_equal(dattr_new, i, dattr_cur, j))
6606				goto match2;
6607		}
6608		/* no match - add a new doms_new */
6609		build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
6610match2:
6611		;
6612	}
6613
6614	/* Remember the new sched domains */
6615	if (doms_cur != &fallback_doms)
6616		free_sched_domains(doms_cur, ndoms_cur);
6617	kfree(dattr_cur);	/* kfree(NULL) is safe */
6618	doms_cur = doms_new;
6619	dattr_cur = dattr_new;
6620	ndoms_cur = ndoms_new;
6621
6622	register_sched_domain_sysctl();
6623
6624	mutex_unlock(&sched_domains_mutex);
6625}
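
/*
 * Illustrative example of the match1/match2 logic above (hypothetical
 * masks): with doms_cur = {A, B}, doms_new = {B, C} and an unchanged
 * topology:
 *
 *   - A has no match in doms_new -> detach_destroy_domains(A)
 *   - B matches                  -> left untouched
 *   - C has no match in doms_cur -> build_sched_domains(C)
 *
 * If arch_update_cpu_topology() reported a change, the matching loops
 * are skipped entirely: every current domain is destroyed and every new
 * one rebuilt.
 */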
6626
6627static int num_cpus_frozen;	/* used to mark begin/end of suspend/resume */
6628
6629/*
6630 * Update cpusets according to cpu_active mask.  If cpusets are
6631 * disabled, cpuset_update_active_cpus() becomes a simple wrapper
6632 * around partition_sched_domains().
6633 *
6634 * If we come here as part of a suspend/resume, don't touch cpusets because we
6635 * want to restore them to their original state upon resume anyway.
6636 */
6637static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
6638			     void *hcpu)
6639{
6640	switch (action) {
6641	case CPU_ONLINE_FROZEN:
6642	case CPU_DOWN_FAILED_FROZEN:
6643
6644		/*
6645		 * num_cpus_frozen tracks how many CPUs are involved in the
6646		 * suspend/resume sequence. As long as this is not the last online
6647		 * operation in the resume sequence, just build a single sched
6648		 * domain, ignoring cpusets.
6649		 */
6650		num_cpus_frozen--;
6651		if (likely(num_cpus_frozen)) {
6652			partition_sched_domains(1, NULL, NULL);
6653			break;
6654		}
6655
6656		/*
6657		 * This is the last CPU online operation. So fall through and
6658		 * restore the original sched domains by considering the
6659		 * cpuset configurations.
6660		 */
6661
6662	case CPU_ONLINE:
6663	case CPU_DOWN_FAILED:
6664		cpuset_update_active_cpus(true);
6665		break;
6666	default:
6667		return NOTIFY_DONE;
6668	}
6669	return NOTIFY_OK;
6670}
6671
6672static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
6673			       void *hcpu)
6674{
6675	switch (action) {
6676	case CPU_DOWN_PREPARE:
6677		cpuset_update_active_cpus(false);
6678		break;
6679	case CPU_DOWN_PREPARE_FROZEN:
6680		num_cpus_frozen++;
6681		partition_sched_domains(1, NULL, NULL);
6682		break;
6683	default:
6684		return NOTIFY_DONE;
6685	}
6686	return NOTIFY_OK;
6687}
6688
6689void __init sched_init_smp(void)
6690{
6691	cpumask_var_t non_isolated_cpus;
6692
6693	alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
6694	alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
6695
6696	sched_init_numa();
6697
6698	/*
6699	 * There's no userspace yet to cause hotplug operations; hence all the
6700	 * cpu masks are stable and all blatant races in the code below cannot
6701	 * happen.
6702	 */
6703	mutex_lock(&sched_domains_mutex);
6704	init_sched_domains(cpu_active_mask);
6705	cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
6706	if (cpumask_empty(non_isolated_cpus))
6707		cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
6708	mutex_unlock(&sched_domains_mutex);
6709
6710	hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
6711	hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
6712	hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
6713
6714	init_hrtick();
6715
6716	/* Move init over to a non-isolated CPU */
6717	if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
6718		BUG();
6719	sched_init_granularity();
6720	free_cpumask_var(non_isolated_cpus);
6721
6722	init_sched_rt_class();
6723	init_sched_dl_class();
6724}
6725#else
6726void __init sched_init_smp(void)
6727{
6728	sched_init_granularity();
6729}
6730#endif /* CONFIG_SMP */
6731
6732const_debug unsigned int sysctl_timer_migration = 1;
6733
6734int in_sched_functions(unsigned long addr)
6735{
6736	return in_lock_functions(addr) ||
6737		(addr >= (unsigned long)__sched_text_start
6738		&& addr < (unsigned long)__sched_text_end);
6739}
6740
6741#ifdef CONFIG_CGROUP_SCHED
6742/*
6743 * Default task group.
6744 * Every task in system belongs to this group at bootup.
6745 */
6746struct task_group root_task_group;
6747LIST_HEAD(task_groups);
6748#endif
6749
6750DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
6751
6752void __init sched_init(void)
6753{
6754	int i, j;
6755	unsigned long alloc_size = 0, ptr;
6756
6757#ifdef CONFIG_FAIR_GROUP_SCHED
6758	alloc_size += 2 * nr_cpu_ids * sizeof(void **);
6759#endif
6760#ifdef CONFIG_RT_GROUP_SCHED
6761	alloc_size += 2 * nr_cpu_ids * sizeof(void **);
6762#endif
6763#ifdef CONFIG_CPUMASK_OFFSTACK
6764	alloc_size += num_possible_cpus() * cpumask_size();
6765#endif
6766	if (alloc_size) {
6767		ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
6768
6769#ifdef CONFIG_FAIR_GROUP_SCHED
6770		root_task_group.se = (struct sched_entity **)ptr;
6771		ptr += nr_cpu_ids * sizeof(void **);
6772
6773		root_task_group.cfs_rq = (struct cfs_rq **)ptr;
6774		ptr += nr_cpu_ids * sizeof(void **);
6775
6776#endif /* CONFIG_FAIR_GROUP_SCHED */
6777#ifdef CONFIG_RT_GROUP_SCHED
6778		root_task_group.rt_se = (struct sched_rt_entity **)ptr;
6779		ptr += nr_cpu_ids * sizeof(void **);
6780
6781		root_task_group.rt_rq = (struct rt_rq **)ptr;
6782		ptr += nr_cpu_ids * sizeof(void **);
6783
6784#endif /* CONFIG_RT_GROUP_SCHED */
6785#ifdef CONFIG_CPUMASK_OFFSTACK
6786		for_each_possible_cpu(i) {
6787			per_cpu(load_balance_mask, i) = (void *)ptr;
6788			ptr += cpumask_size();
6789		}
6790#endif /* CONFIG_CPUMASK_OFFSTACK */
6791	}
6792
6793	init_rt_bandwidth(&def_rt_bandwidth,
6794			global_rt_period(), global_rt_runtime());
6795	init_dl_bandwidth(&def_dl_bandwidth,
6796			global_rt_period(), global_rt_runtime());
6797
6798#ifdef CONFIG_SMP
6799	init_defrootdomain();
6800#endif
6801
6802#ifdef CONFIG_RT_GROUP_SCHED
6803	init_rt_bandwidth(&root_task_group.rt_bandwidth,
6804			global_rt_period(), global_rt_runtime());
6805#endif /* CONFIG_RT_GROUP_SCHED */
6806
6807#ifdef CONFIG_CGROUP_SCHED
6808	list_add(&root_task_group.list, &task_groups);
6809	INIT_LIST_HEAD(&root_task_group.children);
6810	INIT_LIST_HEAD(&root_task_group.siblings);
6811	autogroup_init(&init_task);
6812
6813#endif /* CONFIG_CGROUP_SCHED */
6814
6815	for_each_possible_cpu(i) {
6816		struct rq *rq;
6817
6818		rq = cpu_rq(i);
6819		raw_spin_lock_init(&rq->lock);
6820		rq->nr_running = 0;
6821		rq->calc_load_active = 0;
6822		rq->calc_load_update = jiffies + LOAD_FREQ;
6823		init_cfs_rq(&rq->cfs);
6824		init_rt_rq(&rq->rt, rq);
6825		init_dl_rq(&rq->dl, rq);
6826#ifdef CONFIG_FAIR_GROUP_SCHED
6827		root_task_group.shares = ROOT_TASK_GROUP_LOAD;
6828		INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6829		/*
6830		 * How much cpu bandwidth does root_task_group get?
6831		 *
6832		 * In case of task-groups formed through the cgroup filesystem, it
6833		 * gets 100% of the cpu resources in the system. This overall
6834		 * system cpu resource is divided among the tasks of
6835		 * root_task_group and its child task-groups in a fair manner,
6836		 * based on each entity's (task or task-group's) weight
6837		 * (se->load.weight).
6838		 *
6839		 * In other words, if root_task_group has 10 tasks of weight
6840		 * 1024 and two child groups A0 and A1 (of weight 1024 each),
6841		 * then A0's share of the cpu resource is:
6842		 *
6843		 *	A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
6844		 *
6845		 * We achieve this by letting root_task_group's tasks sit
6846		 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
6847		 */
6848		init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
6849		init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
6850#endif /* CONFIG_FAIR_GROUP_SCHED */
6851
6852		rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
6853#ifdef CONFIG_RT_GROUP_SCHED
6854		INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
6855		init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
6856#endif
6857
6858		for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6859			rq->cpu_load[j] = 0;
6860
6861		rq->last_load_update_tick = jiffies;
6862
6863#ifdef CONFIG_SMP
6864		rq->sd = NULL;
6865		rq->rd = NULL;
6866		rq->cpu_power = SCHED_POWER_SCALE;
6867		rq->post_schedule = 0;
6868		rq->active_balance = 0;
6869		rq->next_balance = jiffies;
6870		rq->push_cpu = 0;
6871		rq->cpu = i;
6872		rq->online = 0;
6873		rq->idle_stamp = 0;
6874		rq->avg_idle = 2*sysctl_sched_migration_cost;
6875		rq->max_idle_balance_cost = sysctl_sched_migration_cost;
6876
6877		INIT_LIST_HEAD(&rq->cfs_tasks);
6878
6879		rq_attach_root(rq, &def_root_domain);
6880#ifdef CONFIG_NO_HZ_COMMON
6881		rq->nohz_flags = 0;
6882#endif
6883#ifdef CONFIG_NO_HZ_FULL
6884		rq->last_sched_tick = 0;
6885#endif
6886#endif
6887		init_rq_hrtick(rq);
6888		atomic_set(&rq->nr_iowait, 0);
6889	}
6890
6891	set_load_weight(&init_task);
6892
6893#ifdef CONFIG_PREEMPT_NOTIFIERS
6894	INIT_HLIST_HEAD(&init_task.preempt_notifiers);
6895#endif
6896
6897	/*
6898	 * The boot idle thread does lazy MMU switching as well:
6899	 */
6900	atomic_inc(&init_mm.mm_count);
6901	enter_lazy_tlb(&init_mm, current);
6902
6903	/*
6904	 * Make us the idle thread. Technically, schedule() should not be
6905	 * called from this thread; however, somewhere below it might be,
6906	 * and because we are the idle thread, we just pick up running again
6907	 * when this runqueue becomes "idle".
6908	 */
6909	init_idle(current, smp_processor_id());
6910
6911	calc_load_update = jiffies + LOAD_FREQ;
6912
6913	/*
6914	 * During early bootup we pretend to be a normal task:
6915	 */
6916	current->sched_class = &fair_sched_class;
6917
6918#ifdef CONFIG_SMP
6919	zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
6920	/* May be allocated at isolcpus cmdline parse time */
6921	if (cpu_isolated_map == NULL)
6922		zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
6923	idle_thread_set_boot_cpu();
6924#endif
6925	init_sched_fair_class();
6926
6927	scheduler_running = 1;
6928}
6929
6930#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
6931static inline int preempt_count_equals(int preempt_offset)
6932{
6933	int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
6934
6935	return (nested == preempt_offset);
6936}
6937
6938void __might_sleep(const char *file, int line, int preempt_offset)
6939{
6940	static unsigned long prev_jiffy;	/* ratelimiting */
6941
6942	rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
6943	if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
6944	    system_state != SYSTEM_RUNNING || oops_in_progress)
6945		return;
6946	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
6947		return;
6948	prev_jiffy = jiffies;
6949
6950	printk(KERN_ERR
6951		"BUG: sleeping function called from invalid context at %s:%d\n",
6952			file, line);
6953	printk(KERN_ERR
6954		"in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
6955			in_atomic(), irqs_disabled(),
6956			current->pid, current->comm);
6957
6958	debug_show_held_locks(current);
6959	if (irqs_disabled())
6960		print_irqtrace_events(current);
6961	dump_stack();
6962}
6963EXPORT_SYMBOL(__might_sleep);
6964#endif
6965
6966#ifdef CONFIG_MAGIC_SYSRQ
6967static void normalize_task(struct rq *rq, struct task_struct *p)
6968{
6969	const struct sched_class *prev_class = p->sched_class;
6970	struct sched_attr attr = {
6971		.sched_policy = SCHED_NORMAL,
6972	};
6973	int old_prio = p->prio;
6974	int on_rq;
6975
6976	on_rq = p->on_rq;
6977	if (on_rq)
6978		dequeue_task(rq, p, 0);
6979	__setscheduler(rq, p, &attr);
6980	if (on_rq) {
6981		enqueue_task(rq, p, 0);
6982		resched_task(rq->curr);
6983	}
6984
6985	check_class_changed(rq, p, prev_class, old_prio);
6986}
6987
6988void normalize_rt_tasks(void)
6989{
6990	struct task_struct *g, *p;
6991	unsigned long flags;
6992	struct rq *rq;
6993
6994	read_lock_irqsave(&tasklist_lock, flags);
6995	do_each_thread(g, p) {
6996		/*
6997		 * Only normalize user tasks:
6998		 */
6999		if (!p->mm)
7000			continue;
7001
7002		p->se.exec_start		= 0;
7003#ifdef CONFIG_SCHEDSTATS
7004		p->se.statistics.wait_start	= 0;
7005		p->se.statistics.sleep_start	= 0;
7006		p->se.statistics.block_start	= 0;
7007#endif
7008
7009		if (!dl_task(p) && !rt_task(p)) {
7010			/*
7011			 * Renice negative nice level userspace
7012			 * tasks back to 0:
7013			 */
7014			if (TASK_NICE(p) < 0 && p->mm)
7015				set_user_nice(p, 0);
7016			continue;
7017		}
7018
7019		raw_spin_lock(&p->pi_lock);
7020		rq = __task_rq_lock(p);
7021
7022		normalize_task(rq, p);
7023
7024		__task_rq_unlock(rq);
7025		raw_spin_unlock(&p->pi_lock);
7026	} while_each_thread(g, p);
7027
7028	read_unlock_irqrestore(&tasklist_lock, flags);
7029}
7030
7031#endif /* CONFIG_MAGIC_SYSRQ */
7032
7033#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
7034/*
7035 * These functions are only useful for the IA64 MCA handling, or kdb.
7036 *
7037 * They can only be called when the whole system has been
7038 * stopped - every CPU needs to be quiescent, and no scheduling
7039 * activity can take place. Using them for anything else would
7040 * be a serious bug, and as a result, they aren't even visible
7041 * under any other configuration.
7042 */
7043
7044/**
7045 * curr_task - return the current task for a given cpu.
7046 * @cpu: the processor in question.
7047 *
7048 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
7049 *
7050 * Return: The current task for @cpu.
7051 */
7052struct task_struct *curr_task(int cpu)
7053{
7054	return cpu_curr(cpu);
7055}
7056
7057#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
7058
7059#ifdef CONFIG_IA64
7060/**
7061 * set_curr_task - set the current task for a given cpu.
7062 * @cpu: the processor in question.
7063 * @p: the task pointer to set.
7064 *
7065 * Description: This function must only be used when non-maskable interrupts
7066 * are serviced on a separate stack. It allows the architecture to switch the
7067 * notion of the current task on a cpu in a non-blocking manner. This function
7068 * must be called with all CPUs synchronized and interrupts disabled; the
7069 * caller must save the original value of the current task (see
7070 * curr_task() above) and restore that value before reenabling interrupts and
7071 * re-starting the system.
7072 *
7073 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
7074 */
7075void set_curr_task(int cpu, struct task_struct *p)
7076{
7077	cpu_curr(cpu) = p;
7078}
7079
7080#endif
7081
7082#ifdef CONFIG_CGROUP_SCHED
7083/* task_group_lock serializes the addition/removal of task groups */
7084static DEFINE_SPINLOCK(task_group_lock);
7085
7086static void free_sched_group(struct task_group *tg)
7087{
7088	free_fair_sched_group(tg);
7089	free_rt_sched_group(tg);
7090	autogroup_free(tg);
7091	kfree(tg);
7092}
7093
7094/* allocate runqueue etc for a new task group */
7095struct task_group *sched_create_group(struct task_group *parent)
7096{
7097	struct task_group *tg;
7098
7099	tg = kzalloc(sizeof(*tg), GFP_KERNEL);
7100	if (!tg)
7101		return ERR_PTR(-ENOMEM);
7102
7103	if (!alloc_fair_sched_group(tg, parent))
7104		goto err;
7105
7106	if (!alloc_rt_sched_group(tg, parent))
7107		goto err;
7108
7109	return tg;
7110
7111err:
7112	free_sched_group(tg);
7113	return ERR_PTR(-ENOMEM);
7114}
7115
7116void sched_online_group(struct task_group *tg, struct task_group *parent)
7117{
7118	unsigned long flags;
7119
7120	spin_lock_irqsave(&task_group_lock, flags);
7121	list_add_rcu(&tg->list, &task_groups);
7122
7123	WARN_ON(!parent); /* root should already exist */
7124
7125	tg->parent = parent;
7126	INIT_LIST_HEAD(&tg->children);
7127	list_add_rcu(&tg->siblings, &parent->children);
7128	spin_unlock_irqrestore(&task_group_lock, flags);
7129}
7130
7131/* rcu callback to free various structures associated with a task group */
7132static void free_sched_group_rcu(struct rcu_head *rhp)
7133{
7134	/* now it should be safe to free those cfs_rqs */
7135	free_sched_group(container_of(rhp, struct task_group, rcu));
7136}
7137
7138/* Destroy runqueue etc associated with a task group */
7139void sched_destroy_group(struct task_group *tg)
7140{
7141	/* wait for possible concurrent references to cfs_rqs to complete */
7142	call_rcu(&tg->rcu, free_sched_group_rcu);
7143}
7144
7145void sched_offline_group(struct task_group *tg)
7146{
7147	unsigned long flags;
7148	int i;
7149
7150	/* end participation in shares distribution */
7151	for_each_possible_cpu(i)
7152		unregister_fair_sched_group(tg, i);
7153
7154	spin_lock_irqsave(&task_group_lock, flags);
7155	list_del_rcu(&tg->list);
7156	list_del_rcu(&tg->siblings);
7157	spin_unlock_irqrestore(&task_group_lock, flags);
7158}
7159
7160/* Change a task's runqueue when it moves between groups.
7161 *	The caller of this function should have put the task in its new group
7162 *	by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
7163 *	reflect its new group.
7164 */
7165void sched_move_task(struct task_struct *tsk)
7166{
7167	struct task_group *tg;
7168	int on_rq, running;
7169	unsigned long flags;
7170	struct rq *rq;
7171
7172	rq = task_rq_lock(tsk, &flags);
7173
7174	running = task_current(rq, tsk);
7175	on_rq = tsk->on_rq;
7176
7177	if (on_rq)
7178		dequeue_task(rq, tsk, 0);
7179	if (unlikely(running))
7180		tsk->sched_class->put_prev_task(rq, tsk);
7181
7182	tg = container_of(task_css_check(tsk, cpu_cgroup_subsys_id,
7183				lockdep_is_held(&tsk->sighand->siglock)),
7184			  struct task_group, css);
7185	tg = autogroup_task_group(tsk, tg);
7186	tsk->sched_task_group = tg;
7187
7188#ifdef CONFIG_FAIR_GROUP_SCHED
7189	if (tsk->sched_class->task_move_group)
7190		tsk->sched_class->task_move_group(tsk, on_rq);
7191	else
7192#endif
7193		set_task_rq(tsk, task_cpu(tsk));
7194
7195	if (unlikely(running))
7196		tsk->sched_class->set_curr_task(rq);
7197	if (on_rq)
7198		enqueue_task(rq, tsk, 0);
7199
7200	task_rq_unlock(rq, tsk, &flags);
7201}
7202#endif /* CONFIG_CGROUP_SCHED */
7203
7204#ifdef CONFIG_RT_GROUP_SCHED
7205/*
7206 * Ensure that the real time constraints are schedulable.
7207 */
7208static DEFINE_MUTEX(rt_constraints_mutex);
7209
7210/* Must be called with tasklist_lock held */
7211static inline int tg_has_rt_tasks(struct task_group *tg)
7212{
7213	struct task_struct *g, *p;
7214
7215	do_each_thread(g, p) {
7216		if (rt_task(p) && task_rq(p)->rt.tg == tg)
7217			return 1;
7218	} while_each_thread(g, p);
7219
7220	return 0;
7221}
7222
7223struct rt_schedulable_data {
7224	struct task_group *tg;
7225	u64 rt_period;
7226	u64 rt_runtime;
7227};
7228
7229static int tg_rt_schedulable(struct task_group *tg, void *data)
7230{
7231	struct rt_schedulable_data *d = data;
7232	struct task_group *child;
7233	unsigned long total, sum = 0;
7234	u64 period, runtime;
7235
7236	period = ktime_to_ns(tg->rt_bandwidth.rt_period);
7237	runtime = tg->rt_bandwidth.rt_runtime;
7238
7239	if (tg == d->tg) {
7240		period = d->rt_period;
7241		runtime = d->rt_runtime;
7242	}
7243
7244	/*
7245	 * Cannot have more runtime than the period.
7246	 */
7247	if (runtime > period && runtime != RUNTIME_INF)
7248		return -EINVAL;
7249
7250	/*
7251	 * Ensure we don't starve existing RT tasks.
7252	 */
7253	if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
7254		return -EBUSY;
7255
7256	total = to_ratio(period, runtime);
7257
7258	/*
7259	 * Nobody can have more than the global setting allows.
7260	 */
7261	if (total > to_ratio(global_rt_period(), global_rt_runtime()))
7262		return -EINVAL;
7263
7264	/*
7265	 * The sum of our children's runtime should not exceed our own.
7266	 */
7267	list_for_each_entry_rcu(child, &tg->children, siblings) {
7268		period = ktime_to_ns(child->rt_bandwidth.rt_period);
7269		runtime = child->rt_bandwidth.rt_runtime;
7270
7271		if (child == d->tg) {
7272			period = d->rt_period;
7273			runtime = d->rt_runtime;
7274		}
7275
7276		sum += to_ratio(period, runtime);
7277	}
7278
7279	if (sum > total)
7280		return -EINVAL;
7281
7282	return 0;
7283}
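
/*
 * Worked example: with the global default of 950000us runtime per
 * 1000000us period, a group requesting 500000us/1000000us passes the
 * global check (0.5 <= 0.95).  If two of its children then each request
 * 300000us/1000000us, the children's combined ratio (0.6) exceeds the
 * parent's 0.5 and tg_rt_schedulable() rejects the offending update
 * with -EINVAL.
 */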
7284
7285static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
7286{
7287	int ret;
7288
7289	struct rt_schedulable_data data = {
7290		.tg = tg,
7291		.rt_period = period,
7292		.rt_runtime = runtime,
7293	};
7294
7295	rcu_read_lock();
7296	ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
7297	rcu_read_unlock();
7298
7299	return ret;
7300}
7301
7302static int tg_set_rt_bandwidth(struct task_group *tg,
7303		u64 rt_period, u64 rt_runtime)
7304{
7305	int i, err = 0;
7306
7307	mutex_lock(&rt_constraints_mutex);
7308	read_lock(&tasklist_lock);
7309	err = __rt_schedulable(tg, rt_period, rt_runtime);
7310	if (err)
7311		goto unlock;
7312
7313	raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
7314	tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
7315	tg->rt_bandwidth.rt_runtime = rt_runtime;
7316
7317	for_each_possible_cpu(i) {
7318		struct rt_rq *rt_rq = tg->rt_rq[i];
7319
7320		raw_spin_lock(&rt_rq->rt_runtime_lock);
7321		rt_rq->rt_runtime = rt_runtime;
7322		raw_spin_unlock(&rt_rq->rt_runtime_lock);
7323	}
7324	raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
7325unlock:
7326	read_unlock(&tasklist_lock);
7327	mutex_unlock(&rt_constraints_mutex);
7328
7329	return err;
7330}
7331
7332static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
7333{
7334	u64 rt_runtime, rt_period;
7335
7336	rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
7337	rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
7338	if (rt_runtime_us < 0)
7339		rt_runtime = RUNTIME_INF;
7340
7341	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
7342}
7343
7344static long sched_group_rt_runtime(struct task_group *tg)
7345{
7346	u64 rt_runtime_us;
7347
7348	if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
7349		return -1;
7350
7351	rt_runtime_us = tg->rt_bandwidth.rt_runtime;
7352	do_div(rt_runtime_us, NSEC_PER_USEC);
7353	return rt_runtime_us;
7354}
7355
7356static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
7357{
7358	u64 rt_runtime, rt_period;
7359
7360	rt_period = (u64)rt_period_us * NSEC_PER_USEC;
7361	rt_runtime = tg->rt_bandwidth.rt_runtime;
7362
7363	if (rt_period == 0)
7364		return -EINVAL;
7365
7366	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
7367}
7368
7369static long sched_group_rt_period(struct task_group *tg)
7370{
7371	u64 rt_period_us;
7372
7373	rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
7374	do_div(rt_period_us, NSEC_PER_USEC);
7375	return rt_period_us;
7376}
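
/*
 * Note: the helpers above convert between the nanosecond/ktime values
 * stored in tg->rt_bandwidth and the microsecond values exposed to
 * userspace.  E.g. writing 500000 to a group's rt_runtime_us file
 * stores 500000 * NSEC_PER_USEC = 500000000ns, and a stored period of
 * 1000000000ns reads back as 1000000us; a negative runtime maps to
 * RUNTIME_INF (unlimited).
 */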
7377#endif /* CONFIG_RT_GROUP_SCHED */
7378
7379#ifdef CONFIG_RT_GROUP_SCHED
7380static int sched_rt_global_constraints(void)
7381{
7382	int ret = 0;
7383
7384	mutex_lock(&rt_constraints_mutex);
7385	read_lock(&tasklist_lock);
7386	ret = __rt_schedulable(NULL, 0, 0);
7387	read_unlock(&tasklist_lock);
7388	mutex_unlock(&rt_constraints_mutex);
7389
7390	return ret;
7391}
7392
7393static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
7394{
7395	/* Don't accept realtime tasks when there is no way for them to run */
7396	if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
7397		return 0;
7398
7399	return 1;
7400}
7401
7402#else /* !CONFIG_RT_GROUP_SCHED */
7403static int sched_rt_global_constraints(void)
7404{
7405	unsigned long flags;
7406	int i, ret = 0;
7407
7408	raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
7409	for_each_possible_cpu(i) {
7410		struct rt_rq *rt_rq = &cpu_rq(i)->rt;
7411
7412		raw_spin_lock(&rt_rq->rt_runtime_lock);
7413		rt_rq->rt_runtime = global_rt_runtime();
7414		raw_spin_unlock(&rt_rq->rt_runtime_lock);
7415	}
7416	raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
7417
7418	return ret;
7419}
7420#endif /* CONFIG_RT_GROUP_SCHED */
7421
7422static int sched_dl_global_constraints(void)
7423{
7424	u64 runtime = global_rt_runtime();
7425	u64 period = global_rt_period();
7426	u64 new_bw = to_ratio(period, runtime);
7427	int cpu, ret = 0;
7428
7429	/*
7430	 * Here we want to check the bandwidth not being set to some
7431	 * value smaller than the currently allocated bandwidth in
7432	 * any of the root_domains.
7433	 *
7434	 * FIXME: Cycling over all the CPUs is overkill, but simpler than
7435	 * cycling on root_domains... Discussion on different/better
7436	 * solutions is welcome!
7437	 */
7438	for_each_possible_cpu(cpu) {
7439		struct dl_bw *dl_b = dl_bw_of(cpu);
7440
7441		raw_spin_lock(&dl_b->lock);
7442		if (new_bw < dl_b->total_bw)
7443			ret = -EBUSY;
7444		raw_spin_unlock(&dl_b->lock);
7445
7446		if (ret)
7447			break;
7448	}
7449
7450	return ret;
7451}
7452
7453static void sched_dl_do_global(void)
7454{
7455	u64 new_bw = -1;
7456	int cpu;
7457
7458	def_dl_bandwidth.dl_period = global_rt_period();
7459	def_dl_bandwidth.dl_runtime = global_rt_runtime();
7460
7461	if (global_rt_runtime() != RUNTIME_INF)
7462		new_bw = to_ratio(global_rt_period(), global_rt_runtime());
7463
7464	/*
7465	 * FIXME: As above...
7466	 */
7467	for_each_possible_cpu(cpu) {
7468		struct dl_bw *dl_b = dl_bw_of(cpu);
7469
7470		raw_spin_lock(&dl_b->lock);
7471		dl_b->bw = new_bw;
7472		raw_spin_unlock(&dl_b->lock);
7473	}
7474}
7475
7476static int sched_rt_global_validate(void)
7477{
7478	if (sysctl_sched_rt_period <= 0)
7479		return -EINVAL;
7480
7481	if (sysctl_sched_rt_runtime > sysctl_sched_rt_period)
7482		return -EINVAL;
7483
7484	return 0;
7485}
7486
7487static void sched_rt_do_global(void)
7488{
7489	def_rt_bandwidth.rt_runtime = global_rt_runtime();
7490	def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
7491}
7492
7493int sched_rt_handler(struct ctl_table *table, int write,
7494		void __user *buffer, size_t *lenp,
7495		loff_t *ppos)
7496{
7497	int old_period, old_runtime;
7498	static DEFINE_MUTEX(mutex);
7499	int ret;
7500
7501	mutex_lock(&mutex);
7502	old_period = sysctl_sched_rt_period;
7503	old_runtime = sysctl_sched_rt_runtime;
7504
7505	ret = proc_dointvec(table, write, buffer, lenp, ppos);
7506
7507	if (!ret && write) {
7508		ret = sched_rt_global_validate();
7509		if (ret)
7510			goto undo;
7511
7512		ret = sched_rt_global_constraints();
7513		if (ret)
7514			goto undo;
7515
7516		ret = sched_dl_global_constraints();
7517		if (ret)
7518			goto undo;
7519
7520		sched_rt_do_global();
7521		sched_dl_do_global();
7522	}
7523	if (0) {
7524undo:
7525		sysctl_sched_rt_period = old_period;
7526		sysctl_sched_rt_runtime = old_runtime;
7527	}
7528	mutex_unlock(&mutex);
7529
7530	return ret;
7531}
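
/*
 * Usage sketch (illustrative only): this handler services writes to
 * /proc/sys/kernel/sched_rt_period_us and sched_rt_runtime_us, e.g.
 *
 *	# echo 1000000 > /proc/sys/kernel/sched_rt_period_us
 *	# echo 900000  > /proc/sys/kernel/sched_rt_runtime_us
 *
 * which limits RT (and, via sched_dl_do_global(), deadline) bandwidth
 * to 90% of each period.  A write that fails sched_rt_global_validate()
 * or either constraints check is rolled back to the old values.
 */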
7532
7533int sched_rr_handler(struct ctl_table *table, int write,
7534		void __user *buffer, size_t *lenp,
7535		loff_t *ppos)
7536{
7537	int ret;
7538	static DEFINE_MUTEX(mutex);
7539
7540	mutex_lock(&mutex);
7541	ret = proc_dointvec(table, write, buffer, lenp, ppos);
7542	/* make sure that internally we keep jiffies */
7543	/* also, writing zero resets timeslice to default */
7544	if (!ret && write) {
7545		sched_rr_timeslice = sched_rr_timeslice <= 0 ?
7546			RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);
7547	}
7548	mutex_unlock(&mutex);
7549	return ret;
7550}
7551
7552#ifdef CONFIG_CGROUP_SCHED
7553
7554static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
7555{
7556	return css ? container_of(css, struct task_group, css) : NULL;
7557}
7558
7559static struct cgroup_subsys_state *
7560cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
7561{
7562	struct task_group *parent = css_tg(parent_css);
7563	struct task_group *tg;
7564
7565	if (!parent) {
7566		/* This is early initialization for the top cgroup */
7567		return &root_task_group.css;
7568	}
7569
7570	tg = sched_create_group(parent);
7571	if (IS_ERR(tg))
7572		return ERR_PTR(-ENOMEM);
7573
7574	return &tg->css;
7575}
7576
7577static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
7578{
7579	struct task_group *tg = css_tg(css);
7580	struct task_group *parent = css_tg(css_parent(css));
7581
7582	if (parent)
7583		sched_online_group(tg, parent);
7584	return 0;
7585}
7586
7587static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
7588{
7589	struct task_group *tg = css_tg(css);
7590
7591	sched_destroy_group(tg);
7592}
7593
7594static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
7595{
7596	struct task_group *tg = css_tg(css);
7597
7598	sched_offline_group(tg);
7599}
7600
7601static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
7602				 struct cgroup_taskset *tset)
7603{
7604	struct task_struct *task;
7605
7606	cgroup_taskset_for_each(task, css, tset) {
7607#ifdef CONFIG_RT_GROUP_SCHED
7608		if (!sched_rt_can_attach(css_tg(css), task))
7609			return -EINVAL;
7610#else
7611		/* We don't support RT-tasks being in separate groups */
7612		if (task->sched_class != &fair_sched_class)
7613			return -EINVAL;
7614#endif
7615	}
7616	return 0;
7617}
7618
7619static void cpu_cgroup_attach(struct cgroup_subsys_state *css,
7620			      struct cgroup_taskset *tset)
7621{
7622	struct task_struct *task;
7623
7624	cgroup_taskset_for_each(task, css, tset)
7625		sched_move_task(task);
7626}
7627
7628static void cpu_cgroup_exit(struct cgroup_subsys_state *css,
7629			    struct cgroup_subsys_state *old_css,
7630			    struct task_struct *task)
7631{
7632	/*
7633	 * cgroup_exit() is called in the copy_process() failure path.
7634	 * Ignore this case since the task hasn't run yet; this avoids
7635	 * trying to poke a half freed task state from generic code.
7636	 */
7637	if (!(task->flags & PF_EXITING))
7638		return;
7639
7640	sched_move_task(task);
7641}
7642
7643#ifdef CONFIG_FAIR_GROUP_SCHED
7644static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
7645				struct cftype *cftype, u64 shareval)
7646{
7647	return sched_group_set_shares(css_tg(css), scale_load(shareval));
7648}
7649
7650static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
7651			       struct cftype *cft)
7652{
7653	struct task_group *tg = css_tg(css);
7654
7655	return (u64) scale_load_down(tg->shares);
7656}
7657
7658#ifdef CONFIG_CFS_BANDWIDTH
7659static DEFINE_MUTEX(cfs_constraints_mutex);
7660
7661const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
7662const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
7663
7664static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
7665
7666static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7667{
7668	int i, ret = 0, runtime_enabled, runtime_was_enabled;
7669	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7670
7671	if (tg == &root_task_group)
7672		return -EINVAL;
7673
7674	/*
7675	 * Ensure we have at least some amount of bandwidth every period.  This is
7676	 * to prevent reaching a state of large arrears when throttled via
7677	 * entity_tick() resulting in prolonged exit starvation.
7678	 */
7679	if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
7680		return -EINVAL;
7681
7682	/*
7683	 * Likewise, bound things on the other side by preventing insane quota
7684	 * periods.  This also allows us to normalize in computing quota
7685	 * feasibility.
7686	 */
7687	if (period > max_cfs_quota_period)
7688		return -EINVAL;
7689
7690	mutex_lock(&cfs_constraints_mutex);
7691	ret = __cfs_schedulable(tg, period, quota);
7692	if (ret)
7693		goto out_unlock;
7694
7695	runtime_enabled = quota != RUNTIME_INF;
7696	runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
7697	/*
7698	 * If we need to toggle cfs_bandwidth_used, off->on must occur
7699	 * before making related changes, and on->off must occur afterwards
7700	 */
7701	if (runtime_enabled && !runtime_was_enabled)
7702		cfs_bandwidth_usage_inc();
7703	raw_spin_lock_irq(&cfs_b->lock);
7704	cfs_b->period = ns_to_ktime(period);
7705	cfs_b->quota = quota;
7706
7707	__refill_cfs_bandwidth_runtime(cfs_b);
7708	/* restart the period timer (if active) to handle new period expiry */
7709	if (runtime_enabled && cfs_b->timer_active) {
7710		/* force a reprogram */
7711		cfs_b->timer_active = 0;
7712		__start_cfs_bandwidth(cfs_b);
7713	}
7714	raw_spin_unlock_irq(&cfs_b->lock);
7715
7716	for_each_possible_cpu(i) {
7717		struct cfs_rq *cfs_rq = tg->cfs_rq[i];
7718		struct rq *rq = cfs_rq->rq;
7719
7720		raw_spin_lock_irq(&rq->lock);
7721		cfs_rq->runtime_enabled = runtime_enabled;
7722		cfs_rq->runtime_remaining = 0;
7723
7724		if (cfs_rq->throttled)
7725			unthrottle_cfs_rq(cfs_rq);
7726		raw_spin_unlock_irq(&rq->lock);
7727	}
7728	if (runtime_was_enabled && !runtime_enabled)
7729		cfs_bandwidth_usage_dec();
7730out_unlock:
7731	mutex_unlock(&cfs_constraints_mutex);
7732
7733	return ret;
7734}
7735
7736int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
7737{
7738	u64 quota, period;
7739
7740	period = ktime_to_ns(tg->cfs_bandwidth.period);
7741	if (cfs_quota_us < 0)
7742		quota = RUNTIME_INF;
7743	else
7744		quota = (u64)cfs_quota_us * NSEC_PER_USEC;
7745
7746	return tg_set_cfs_bandwidth(tg, period, quota);
7747}
7748
7749long tg_get_cfs_quota(struct task_group *tg)
7750{
7751	u64 quota_us;
7752
7753	if (tg->cfs_bandwidth.quota == RUNTIME_INF)
7754		return -1;
7755
7756	quota_us = tg->cfs_bandwidth.quota;
7757	do_div(quota_us, NSEC_PER_USEC);
7758
7759	return quota_us;
7760}
7761
7762int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
7763{
7764	u64 quota, period;
7765
7766	period = (u64)cfs_period_us * NSEC_PER_USEC;
7767	quota = tg->cfs_bandwidth.quota;
7768
7769	return tg_set_cfs_bandwidth(tg, period, quota);
7770}
7771
7772long tg_get_cfs_period(struct task_group *tg)
7773{
7774	u64 cfs_period_us;
7775
7776	cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
7777	do_div(cfs_period_us, NSEC_PER_USEC);
7778
7779	return cfs_period_us;
7780}
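
/*
 * Usage sketch (illustrative only): via the cgroup files declared in
 * cpu_files[] below, e.g.
 *
 *	# echo 100000 > cpu.cfs_period_us
 *	# echo  50000 > cpu.cfs_quota_us
 *
 * the group is limited to 50ms of CPU time per 100ms period, i.e.
 * roughly half of one CPU's worth of bandwidth; writing -1 to
 * cpu.cfs_quota_us removes the limit (quota becomes RUNTIME_INF).
 */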
7781
7782static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
7783				  struct cftype *cft)
7784{
7785	return tg_get_cfs_quota(css_tg(css));
7786}
7787
7788static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
7789				   struct cftype *cftype, s64 cfs_quota_us)
7790{
7791	return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
7792}
7793
7794static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
7795				   struct cftype *cft)
7796{
7797	return tg_get_cfs_period(css_tg(css));
7798}
7799
7800static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
7801				    struct cftype *cftype, u64 cfs_period_us)
7802{
7803	return tg_set_cfs_period(css_tg(css), cfs_period_us);
7804}
7805
7806struct cfs_schedulable_data {
7807	struct task_group *tg;
7808	u64 period, quota;
7809};
7810
7811/*
7812 * normalize group quota/period to be quota/max_period
7813 * note: units are usecs
7814 */
7815static u64 normalize_cfs_quota(struct task_group *tg,
7816			       struct cfs_schedulable_data *d)
7817{
7818	u64 quota, period;
7819
7820	if (tg == d->tg) {
7821		period = d->period;
7822		quota = d->quota;
7823	} else {
7824		period = tg_get_cfs_period(tg);
7825		quota = tg_get_cfs_quota(tg);
7826	}
7827
7828	/* note: these should typically be equivalent */
7829	if (quota == RUNTIME_INF || quota == -1)
7830		return RUNTIME_INF;
7831
7832	return to_ratio(period, quota);
7833}
7834
7835static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
7836{
7837	struct cfs_schedulable_data *d = data;
7838	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7839	s64 quota = 0, parent_quota = -1;
7840
7841	if (!tg->parent) {
7842		quota = RUNTIME_INF;
7843	} else {
7844		struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
7845
7846		quota = normalize_cfs_quota(tg, d);
7847		parent_quota = parent_b->hierarchal_quota;
7848
7849		/*
7850		 * ensure max(child_quota) <= parent_quota, inherit when no
7851		 * limit is set
7852		 */
7853		if (quota == RUNTIME_INF)
7854			quota = parent_quota;
7855		else if (parent_quota != RUNTIME_INF && quota > parent_quota)
7856			return -EINVAL;
7857	}
7858	cfs_b->hierarchal_quota = quota;
7859
7860	return 0;
7861}
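
/*
 * Worked example (hypothetical values): a parent with a quota/period of
 * 100000us/100000us normalizes to a ratio of 1.0.  A child requesting
 * 50000us/100000us (0.5) is accepted, while a child requesting
 * 300000us/100000us (3.0) exceeds the parent's ratio and makes
 * __cfs_schedulable() fail with -EINVAL.  A child with no limit set
 * simply inherits the parent's quota for the purpose of this check.
 */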
7862
7863static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
7864{
7865	int ret;
7866	struct cfs_schedulable_data data = {
7867		.tg = tg,
7868		.period = period,
7869		.quota = quota,
7870	};
7871
7872	if (quota != RUNTIME_INF) {
7873		do_div(data.period, NSEC_PER_USEC);
7874		do_div(data.quota, NSEC_PER_USEC);
7875	}
7876
7877	rcu_read_lock();
7878	ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
7879	rcu_read_unlock();
7880
7881	return ret;
7882}
7883
7884static int cpu_stats_show(struct seq_file *sf, void *v)
7885{
7886	struct task_group *tg = css_tg(seq_css(sf));
7887	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7888
7889	seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
7890	seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
7891	seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
7892
7893	return 0;
7894}
7895#endif /* CONFIG_CFS_BANDWIDTH */
7896#endif /* CONFIG_FAIR_GROUP_SCHED */
7897
7898#ifdef CONFIG_RT_GROUP_SCHED
7899static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
7900				struct cftype *cft, s64 val)
7901{
7902	return sched_group_set_rt_runtime(css_tg(css), val);
7903}
7904
7905static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,
7906			       struct cftype *cft)
7907{
7908	return sched_group_rt_runtime(css_tg(css));
7909}
7910
7911static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,
7912				    struct cftype *cftype, u64 rt_period_us)
7913{
7914	return sched_group_set_rt_period(css_tg(css), rt_period_us);
7915}
7916
7917static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
7918				   struct cftype *cft)
7919{
7920	return sched_group_rt_period(css_tg(css));
7921}
7922#endif /* CONFIG_RT_GROUP_SCHED */
7923
7924static struct cftype cpu_files[] = {
7925#ifdef CONFIG_FAIR_GROUP_SCHED
7926	{
7927		.name = "shares",
7928		.read_u64 = cpu_shares_read_u64,
7929		.write_u64 = cpu_shares_write_u64,
7930	},
7931#endif
7932#ifdef CONFIG_CFS_BANDWIDTH
7933	{
7934		.name = "cfs_quota_us",
7935		.read_s64 = cpu_cfs_quota_read_s64,
7936		.write_s64 = cpu_cfs_quota_write_s64,
7937	},
7938	{
7939		.name = "cfs_period_us",
7940		.read_u64 = cpu_cfs_period_read_u64,
7941		.write_u64 = cpu_cfs_period_write_u64,
7942	},
7943	{
7944		.name = "stat",
7945		.seq_show = cpu_stats_show,
7946	},
7947#endif
7948#ifdef CONFIG_RT_GROUP_SCHED
7949	{
7950		.name = "rt_runtime_us",
7951		.read_s64 = cpu_rt_runtime_read,
7952		.write_s64 = cpu_rt_runtime_write,
7953	},
7954	{
7955		.name = "rt_period_us",
7956		.read_u64 = cpu_rt_period_read_uint,
7957		.write_u64 = cpu_rt_period_write_uint,
7958	},
7959#endif
7960	{ }	/* terminate */
7961};
7962
7963struct cgroup_subsys cpu_cgroup_subsys = {
7964	.name		= "cpu",
7965	.css_alloc	= cpu_cgroup_css_alloc,
7966	.css_free	= cpu_cgroup_css_free,
7967	.css_online	= cpu_cgroup_css_online,
7968	.css_offline	= cpu_cgroup_css_offline,
7969	.can_attach	= cpu_cgroup_can_attach,
7970	.attach		= cpu_cgroup_attach,
7971	.exit		= cpu_cgroup_exit,
7972	.subsys_id	= cpu_cgroup_subsys_id,
7973	.base_cftypes	= cpu_files,
7974	.early_init	= 1,
7975};
7976
7977#endif	/* CONFIG_CGROUP_SCHED */
7978
7979void dump_cpu_task(int cpu)
7980{
7981	pr_info("Task dump for CPU %d:\n", cpu);
7982	sched_show_task(cpu_curr(cpu));
7983}
7984