tree_plugin.h revision 78e4bc34e5d966cfd95f1238565afc399d56225c
1/*
2 * Read-Copy Update mechanism for mutual exclusion (tree-based version)
3 * Internal non-public definitions that provide either classic
4 * or preemptible semantics.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 *
20 * Copyright Red Hat, 2009
21 * Copyright IBM Corporation, 2009
22 *
23 * Author: Ingo Molnar <mingo@elte.hu>
24 *	   Paul E. McKenney <paulmck@linux.vnet.ibm.com>
25 */
26
27#include <linux/delay.h>
28#include <linux/gfp.h>
29#include <linux/oom.h>
30#include <linux/smpboot.h>
31#include "../time/tick-internal.h"
32
33#define RCU_KTHREAD_PRIO 1
34
35#ifdef CONFIG_RCU_BOOST
36#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
37#else
38#define RCU_BOOST_PRIO RCU_KTHREAD_PRIO
39#endif
40
41#ifdef CONFIG_RCU_NOCB_CPU
42static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
43static bool have_rcu_nocb_mask;	    /* Was rcu_nocb_mask allocated? */
44static bool __read_mostly rcu_nocb_poll;    /* Offload kthreads are to poll. */
45static char __initdata nocb_buf[NR_CPUS * 5];
46#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
47
48/*
49 * Check the RCU kernel configuration parameters and print informative
50 * messages about anything out of the ordinary.  If you like #ifdef, you
51 * will love this function.
52 */
53static void __init rcu_bootup_announce_oddness(void)
54{
55#ifdef CONFIG_RCU_TRACE
56	pr_info("\tRCU debugfs-based tracing is enabled.\n");
57#endif
58#if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32)
59	pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n",
60	       CONFIG_RCU_FANOUT);
61#endif
62#ifdef CONFIG_RCU_FANOUT_EXACT
63	pr_info("\tHierarchical RCU autobalancing is disabled.\n");
64#endif
65#ifdef CONFIG_RCU_FAST_NO_HZ
66	pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n");
67#endif
68#ifdef CONFIG_PROVE_RCU
69	pr_info("\tRCU lockdep checking is enabled.\n");
70#endif
71#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE
72	pr_info("\tRCU torture testing starts during boot.\n");
73#endif
74#if defined(CONFIG_TREE_PREEMPT_RCU) && defined(CONFIG_RCU_CPU_STALL_VERBOSE)
75	pr_info("\tDump stacks of tasks blocking RCU-preempt GP.\n");
76#endif
77#if defined(CONFIG_RCU_CPU_STALL_INFO)
78	pr_info("\tAdditional per-CPU info printed with stalls.\n");
79#endif
80#if NUM_RCU_LVL_4 != 0
81	pr_info("\tFour-level hierarchy is enabled.\n");
82#endif
83	if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF)
84		pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
85	if (nr_cpu_ids != NR_CPUS)
86		pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
87#ifdef CONFIG_RCU_NOCB_CPU
88#ifndef CONFIG_RCU_NOCB_CPU_NONE
89	if (!have_rcu_nocb_mask) {
90		zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL);
91		have_rcu_nocb_mask = true;
92	}
93#ifdef CONFIG_RCU_NOCB_CPU_ZERO
94	pr_info("\tOffload RCU callbacks from CPU 0\n");
95	cpumask_set_cpu(0, rcu_nocb_mask);
96#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */
97#ifdef CONFIG_RCU_NOCB_CPU_ALL
98	pr_info("\tOffload RCU callbacks from all CPUs\n");
99	cpumask_copy(rcu_nocb_mask, cpu_possible_mask);
100#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */
101#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */
102	if (have_rcu_nocb_mask) {
103		if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
104			pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n");
105			cpumask_and(rcu_nocb_mask, cpu_possible_mask,
106				    rcu_nocb_mask);
107		}
108		cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask);
109		pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf);
110		if (rcu_nocb_poll)
111			pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
112	}
113#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
114}
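/*
 * Illustrative note, not part of the original file: the no-CBs mask
 * handled above is normally populated from the boot command line.  For
 * example, a (hypothetical) command line containing:
 *
 *	rcu_nocbs=1-3 rcu_nocb_poll
 *
 * offloads callbacks from CPUs 1-3 and asks the offload kthreads to
 * poll for work rather than waiting to be awakened.  The resulting
 * mask is what rcu_bootup_announce_oddness() prints via nocb_buf.
 */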
115
116#ifdef CONFIG_TREE_PREEMPT_RCU
117
118RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
119static struct rcu_state *rcu_state = &rcu_preempt_state;
120
121static int rcu_preempted_readers_exp(struct rcu_node *rnp);
122
123/*
124 * Tell them what RCU they are running.
125 */
126static void __init rcu_bootup_announce(void)
127{
128	pr_info("Preemptible hierarchical RCU implementation.\n");
129	rcu_bootup_announce_oddness();
130}
131
132/*
133 * Return the number of RCU-preempt batches processed thus far
134 * for debug and statistics.
135 */
136long rcu_batches_completed_preempt(void)
137{
138	return rcu_preempt_state.completed;
139}
140EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt);
141
142/*
143 * Return the number of RCU batches processed thus far for debug & stats.
144 */
145long rcu_batches_completed(void)
146{
147	return rcu_batches_completed_preempt();
148}
149EXPORT_SYMBOL_GPL(rcu_batches_completed);
150
151/*
152 * Force a quiescent state for preemptible RCU.
153 */
154void rcu_force_quiescent_state(void)
155{
156	force_quiescent_state(&rcu_preempt_state);
157}
158EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
159
160/*
161 * Record a preemptible-RCU quiescent state for the specified CPU.  Note
162 * that this just means that the task currently running on the CPU is
163 * not in a quiescent state.  There might be any number of tasks blocked
164 * while in an RCU read-side critical section.
165 *
166 * Unlike the other rcu_*_qs() functions, callers to this function
167 * must disable irqs in order to protect the assignment to
168 * ->rcu_read_unlock_special.
169 */
170static void rcu_preempt_qs(int cpu)
171{
172	struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
173
174	if (rdp->passed_quiesce == 0)
175		trace_rcu_grace_period(TPS("rcu_preempt"), rdp->gpnum, TPS("cpuqs"));
176	rdp->passed_quiesce = 1;
177	current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
178}
179
180/*
181 * We have entered the scheduler, and the current task might soon be
182 * context-switched away from.  If this task is in an RCU read-side
183 * critical section, we will no longer be able to rely on the CPU to
184 * record that fact, so we enqueue the task on the blkd_tasks list.
185 * The task will dequeue itself when it exits the outermost enclosing
186 * RCU read-side critical section.  Therefore, the current grace period
187 * cannot be permitted to complete until the blkd_tasks list entries
188 * predating the current grace period drain, in other words, until
189 * rnp->gp_tasks becomes NULL.
190 *
191 * Caller must disable preemption.
192 */
193static void rcu_preempt_note_context_switch(int cpu)
194{
195	struct task_struct *t = current;
196	unsigned long flags;
197	struct rcu_data *rdp;
198	struct rcu_node *rnp;
199
200	if (t->rcu_read_lock_nesting > 0 &&
201	    (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
202
203		/* Possibly blocking in an RCU read-side critical section. */
204		rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu);
205		rnp = rdp->mynode;
206		raw_spin_lock_irqsave(&rnp->lock, flags);
207		t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
208		t->rcu_blocked_node = rnp;
209
210		/*
211		 * If this CPU has already checked in, then this task
212		 * will hold up the next grace period rather than the
213		 * current grace period.  Queue the task accordingly.
214		 * If the task is queued for the current grace period
215		 * (i.e., this CPU has not yet passed through a quiescent
216		 * state for the current grace period), then as long
217		 * as that task remains queued, the current grace period
218		 * cannot end.  Note that there is some uncertainty as
219		 * to exactly when the current grace period started.
220		 * We take a conservative approach, which can result
221		 * in unnecessarily waiting on tasks that started very
222		 * slightly after the current grace period began.  C'est
223		 * la vie!!!
224		 *
225		 * But first, note that the current CPU must still be
226		 * on line!
227		 */
228		WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0);
229		WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
230		if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) {
231			list_add(&t->rcu_node_entry, rnp->gp_tasks->prev);
232			rnp->gp_tasks = &t->rcu_node_entry;
233#ifdef CONFIG_RCU_BOOST
234			if (rnp->boost_tasks != NULL)
235				rnp->boost_tasks = rnp->gp_tasks;
236#endif /* #ifdef CONFIG_RCU_BOOST */
237		} else {
238			list_add(&t->rcu_node_entry, &rnp->blkd_tasks);
239			if (rnp->qsmask & rdp->grpmask)
240				rnp->gp_tasks = &t->rcu_node_entry;
241		}
242		trace_rcu_preempt_task(rdp->rsp->name,
243				       t->pid,
244				       (rnp->qsmask & rdp->grpmask)
245				       ? rnp->gpnum
246				       : rnp->gpnum + 1);
247		raw_spin_unlock_irqrestore(&rnp->lock, flags);
248	} else if (t->rcu_read_lock_nesting < 0 &&
249		   t->rcu_read_unlock_special) {
250
251		/*
252		 * Complete exit from RCU read-side critical section on
253		 * behalf of preempted instance of __rcu_read_unlock().
254		 */
255		rcu_read_unlock_special(t);
256	}
257
258	/*
259	 * Either we were not in an RCU read-side critical section to
260	 * begin with, or we have now recorded that critical section
261	 * globally.  Either way, we can now note a quiescent state
262	 * for this CPU.  Again, if we were in an RCU read-side critical
263	 * section, and if that critical section was blocking the current
264	 * grace period, then the fact that the task has been enqueued
265	 * means that we continue to block the current grace period.
266	 */
267	local_irq_save(flags);
268	rcu_preempt_qs(cpu);
269	local_irq_restore(flags);
270}
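/*
 * Illustrative reader sketch, not part of the original file: a
 * preemptible-RCU read-side critical section that may be preempted.
 * If the scheduler switches away between rcu_read_lock() and
 * rcu_read_unlock(), rcu_preempt_note_context_switch() above queues
 * the task on its rcu_node's ->blkd_tasks list, and the grace period
 * waits until the task removes itself in rcu_read_unlock_special().
 * The identifiers gp and process_item() are hypothetical:
 *
 *	rcu_read_lock();
 *	p = rcu_dereference(gp);
 *	if (p)
 *		process_item(p);	// Preemption may occur here.
 *	rcu_read_unlock();		// Dequeues the task if it blocked.
 */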
271
272/*
273 * Check for preempted RCU readers blocking the current grace period
274 * for the specified rcu_node structure.  If the caller needs a reliable
275 * answer, it must hold the rcu_node's ->lock.
276 */
277static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
278{
279	return rnp->gp_tasks != NULL;
280}
281
282/*
283 * Record a quiescent state for all tasks that were previously queued
284 * on the specified rcu_node structure and that were blocking the current
285 * RCU grace period.  The caller must hold the specified rnp->lock with
286 * irqs disabled, and this lock is released upon return, but irqs remain
287 * disabled.
288 */
289static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
290	__releases(rnp->lock)
291{
292	unsigned long mask;
293	struct rcu_node *rnp_p;
294
295	if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
296		raw_spin_unlock_irqrestore(&rnp->lock, flags);
297		return;  /* Still need more quiescent states! */
298	}
299
300	rnp_p = rnp->parent;
301	if (rnp_p == NULL) {
302		/*
303		 * Either there is only one rcu_node in the tree,
304		 * or tasks were kicked up to root rcu_node due to
305		 * CPUs going offline.
306		 */
307		rcu_report_qs_rsp(&rcu_preempt_state, flags);
308		return;
309	}
310
311	/* Report up the rest of the hierarchy. */
312	mask = rnp->grpmask;
313	raw_spin_unlock(&rnp->lock);	/* irqs remain disabled. */
314	raw_spin_lock(&rnp_p->lock);	/* irqs already disabled. */
315	rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags);
316}
317
318/*
319 * Advance a ->blkd_tasks-list pointer to the next entry, returning
320 * NULL instead if the pointer is at the end of the list.
321 */
322static struct list_head *rcu_next_node_entry(struct task_struct *t,
323					     struct rcu_node *rnp)
324{
325	struct list_head *np;
326
327	np = t->rcu_node_entry.next;
328	if (np == &rnp->blkd_tasks)
329		np = NULL;
330	return np;
331}
332
333/*
334 * Handle special cases during rcu_read_unlock(), such as needing to
335 * notify RCU core processing or task having blocked during the RCU
336 * read-side critical section.
337 */
338void rcu_read_unlock_special(struct task_struct *t)
339{
340	int empty;
341	int empty_exp;
342	int empty_exp_now;
343	unsigned long flags;
344	struct list_head *np;
345#ifdef CONFIG_RCU_BOOST
346	struct rt_mutex *rbmp = NULL;
347#endif /* #ifdef CONFIG_RCU_BOOST */
348	struct rcu_node *rnp;
349	int special;
350
351	/* NMI handlers cannot block and cannot safely manipulate state. */
352	if (in_nmi())
353		return;
354
355	local_irq_save(flags);
356
357	/*
358	 * If RCU core is waiting for this CPU to exit critical section,
359	 * let it know that we have done so.
360	 */
361	special = t->rcu_read_unlock_special;
362	if (special & RCU_READ_UNLOCK_NEED_QS) {
363		rcu_preempt_qs(smp_processor_id());
364	}
365
366	/* Hardware IRQ handlers cannot block. */
367	if (in_irq() || in_serving_softirq()) {
368		local_irq_restore(flags);
369		return;
370	}
371
372	/* Clean up if blocked during RCU read-side critical section. */
373	if (special & RCU_READ_UNLOCK_BLOCKED) {
374		t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED;
375
376		/*
377		 * Remove this task from the list it blocked on.  The
378		 * task can migrate while we acquire the lock, but at
379		 * most one time.  So at most two passes through loop.
380		 */
381		for (;;) {
382			rnp = t->rcu_blocked_node;
383			raw_spin_lock(&rnp->lock);  /* irqs already disabled. */
384			if (rnp == t->rcu_blocked_node)
385				break;
386			raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
387		}
388		empty = !rcu_preempt_blocked_readers_cgp(rnp);
389		empty_exp = !rcu_preempted_readers_exp(rnp);
390		smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
391		np = rcu_next_node_entry(t, rnp);
392		list_del_init(&t->rcu_node_entry);
393		t->rcu_blocked_node = NULL;
394		trace_rcu_unlock_preempted_task(TPS("rcu_preempt"),
395						rnp->gpnum, t->pid);
396		if (&t->rcu_node_entry == rnp->gp_tasks)
397			rnp->gp_tasks = np;
398		if (&t->rcu_node_entry == rnp->exp_tasks)
399			rnp->exp_tasks = np;
400#ifdef CONFIG_RCU_BOOST
401		if (&t->rcu_node_entry == rnp->boost_tasks)
402			rnp->boost_tasks = np;
403		/* Snapshot/clear ->rcu_boost_mutex with rcu_node lock held. */
404		if (t->rcu_boost_mutex) {
405			rbmp = t->rcu_boost_mutex;
406			t->rcu_boost_mutex = NULL;
407		}
408#endif /* #ifdef CONFIG_RCU_BOOST */
409
410		/*
411		 * If this was the last task on the current list, and if
412		 * we aren't waiting on any CPUs, report the quiescent state.
413		 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock,
414		 * so we must take a snapshot of the expedited state.
415		 */
416		empty_exp_now = !rcu_preempted_readers_exp(rnp);
417		if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) {
418			trace_rcu_quiescent_state_report(TPS("preempt_rcu"),
419							 rnp->gpnum,
420							 0, rnp->qsmask,
421							 rnp->level,
422							 rnp->grplo,
423							 rnp->grphi,
424							 !!rnp->gp_tasks);
425			rcu_report_unblock_qs_rnp(rnp, flags);
426		} else {
427			raw_spin_unlock_irqrestore(&rnp->lock, flags);
428		}
429
430#ifdef CONFIG_RCU_BOOST
431		/* Unboost if we were boosted. */
432		if (rbmp)
433			rt_mutex_unlock(rbmp);
434#endif /* #ifdef CONFIG_RCU_BOOST */
435
436		/*
437		 * If this was the last task on the expedited lists,
438		 * then we need to report up the rcu_node hierarchy.
439		 */
440		if (!empty_exp && empty_exp_now)
441			rcu_report_exp_rnp(&rcu_preempt_state, rnp, true);
442	} else {
443		local_irq_restore(flags);
444	}
445}
446
447#ifdef CONFIG_RCU_CPU_STALL_VERBOSE
448
449/*
450 * Dump detailed information for all tasks blocking the current RCU
451 * grace period on the specified rcu_node structure.
452 */
453static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
454{
455	unsigned long flags;
456	struct task_struct *t;
457
458	raw_spin_lock_irqsave(&rnp->lock, flags);
459	if (!rcu_preempt_blocked_readers_cgp(rnp)) {
460		raw_spin_unlock_irqrestore(&rnp->lock, flags);
461		return;
462	}
463	t = list_entry(rnp->gp_tasks,
464		       struct task_struct, rcu_node_entry);
465	list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
466		sched_show_task(t);
467	raw_spin_unlock_irqrestore(&rnp->lock, flags);
468}
469
470/*
471 * Dump detailed information for all tasks blocking the current RCU
472 * grace period.
473 */
474static void rcu_print_detail_task_stall(struct rcu_state *rsp)
475{
476	struct rcu_node *rnp = rcu_get_root(rsp);
477
478	rcu_print_detail_task_stall_rnp(rnp);
479	rcu_for_each_leaf_node(rsp, rnp)
480		rcu_print_detail_task_stall_rnp(rnp);
481}
482
483#else /* #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
484
485static void rcu_print_detail_task_stall(struct rcu_state *rsp)
486{
487}
488
489#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
490
491#ifdef CONFIG_RCU_CPU_STALL_INFO
492
493static void rcu_print_task_stall_begin(struct rcu_node *rnp)
494{
495	pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):",
496	       rnp->level, rnp->grplo, rnp->grphi);
497}
498
499static void rcu_print_task_stall_end(void)
500{
501	pr_cont("\n");
502}
503
504#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
505
506static void rcu_print_task_stall_begin(struct rcu_node *rnp)
507{
508}
509
510static void rcu_print_task_stall_end(void)
511{
512}
513
514#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */
515
516/*
517 * Scan the current list of tasks blocked within RCU read-side critical
518 * sections, printing out the tid of each.
519 */
520static int rcu_print_task_stall(struct rcu_node *rnp)
521{
522	struct task_struct *t;
523	int ndetected = 0;
524
525	if (!rcu_preempt_blocked_readers_cgp(rnp))
526		return 0;
527	rcu_print_task_stall_begin(rnp);
528	t = list_entry(rnp->gp_tasks,
529		       struct task_struct, rcu_node_entry);
530	list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
531		pr_cont(" P%d", t->pid);
532		ndetected++;
533	}
534	rcu_print_task_stall_end();
535	return ndetected;
536}
537
538/*
539 * Check that the list of blocked tasks for the newly completed grace
540 * period is in fact empty.  It is a serious bug to complete a grace
541 * period that still has RCU readers blocked!  This function must be
542 * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock
543 * must be held by the caller.
544 *
545 * Also, if there are blocked tasks on the list, they automatically
546 * block the newly created grace period, so set up ->gp_tasks accordingly.
547 */
548static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
549{
550	WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
551	if (!list_empty(&rnp->blkd_tasks))
552		rnp->gp_tasks = rnp->blkd_tasks.next;
553	WARN_ON_ONCE(rnp->qsmask);
554}
555
556#ifdef CONFIG_HOTPLUG_CPU
557
558/*
559 * Handle tasklist migration for the case in which all CPUs covered by the
560 * specified rcu_node have gone offline.  Move them up to the root
561 * rcu_node.  The reason for not just moving them to the immediate
562 * parent is to remove the need for rcu_read_unlock_special() to
563 * make more than two attempts to acquire the target rcu_node's lock.
564 * Returns a bitmask indicating whether tasks queued on the specified
565 * rcu_node structure were blocking the current normal grace period
566 * (RCU_OFL_TASKS_NORM_GP) and/or the current expedited grace period
567 * (RCU_OFL_TASKS_EXP_GP).  Returns zero if no queued tasks were
568 * blocking either type of grace period.
569 *
570 * The caller must hold rnp->lock with irqs disabled.
571 */
572static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
573				     struct rcu_node *rnp,
574				     struct rcu_data *rdp)
575{
576	struct list_head *lp;
577	struct list_head *lp_root;
578	int retval = 0;
579	struct rcu_node *rnp_root = rcu_get_root(rsp);
580	struct task_struct *t;
581
582	if (rnp == rnp_root) {
583		WARN_ONCE(1, "Last CPU thought to be offlined?");
584		return 0;  /* Shouldn't happen: at least one CPU online. */
585	}
586
587	/* If we are on an internal node, complain bitterly. */
588	WARN_ON_ONCE(rnp != rdp->mynode);
589
590	/*
591	 * Move tasks up to root rcu_node.  Don't try to get fancy for
592	 * this corner-case operation -- just put this node's tasks
593	 * at the head of the root node's list, and update the root node's
594	 * ->gp_tasks and ->exp_tasks pointers to those of this node's,
595	 * if non-NULL.  This might result in waiting for more tasks than
596	 * absolutely necessary, but this is a good performance/complexity
597	 * tradeoff.
598	 */
599	if (rcu_preempt_blocked_readers_cgp(rnp) && rnp->qsmask == 0)
600		retval |= RCU_OFL_TASKS_NORM_GP;
601	if (rcu_preempted_readers_exp(rnp))
602		retval |= RCU_OFL_TASKS_EXP_GP;
603	lp = &rnp->blkd_tasks;
604	lp_root = &rnp_root->blkd_tasks;
605	while (!list_empty(lp)) {
606		t = list_entry(lp->next, typeof(*t), rcu_node_entry);
607		raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
608		list_del(&t->rcu_node_entry);
609		t->rcu_blocked_node = rnp_root;
610		list_add(&t->rcu_node_entry, lp_root);
611		if (&t->rcu_node_entry == rnp->gp_tasks)
612			rnp_root->gp_tasks = rnp->gp_tasks;
613		if (&t->rcu_node_entry == rnp->exp_tasks)
614			rnp_root->exp_tasks = rnp->exp_tasks;
615#ifdef CONFIG_RCU_BOOST
616		if (&t->rcu_node_entry == rnp->boost_tasks)
617			rnp_root->boost_tasks = rnp->boost_tasks;
618#endif /* #ifdef CONFIG_RCU_BOOST */
619		raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
620	}
621
622	rnp->gp_tasks = NULL;
623	rnp->exp_tasks = NULL;
624#ifdef CONFIG_RCU_BOOST
625	rnp->boost_tasks = NULL;
626	/*
627	 * In the case where the root rcu_node is being boosted but this
628	 * leaf rcu_node is not, make sure that we boost the tasks blocking
629	 * the current grace period.
630	 */
631	raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
632	if (rnp_root->boost_tasks != NULL &&
633	    rnp_root->boost_tasks != rnp_root->gp_tasks &&
634	    rnp_root->boost_tasks != rnp_root->exp_tasks)
635		rnp_root->boost_tasks = rnp_root->gp_tasks;
636	raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
637#endif /* #ifdef CONFIG_RCU_BOOST */
638
639	return retval;
640}
641
642#endif /* #ifdef CONFIG_HOTPLUG_CPU */
643
644/*
645 * Check for a quiescent state from the current CPU.  When a task blocks,
646 * the task is recorded in the corresponding CPU's rcu_node structure,
647 * which is checked elsewhere.
648 *
649 * Caller must disable hard irqs.
650 */
651static void rcu_preempt_check_callbacks(int cpu)
652{
653	struct task_struct *t = current;
654
655	if (t->rcu_read_lock_nesting == 0) {
656		rcu_preempt_qs(cpu);
657		return;
658	}
659	if (t->rcu_read_lock_nesting > 0 &&
660	    per_cpu(rcu_preempt_data, cpu).qs_pending)
661		t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
662}
663
664#ifdef CONFIG_RCU_BOOST
665
666static void rcu_preempt_do_callbacks(void)
667{
668	rcu_do_batch(&rcu_preempt_state, this_cpu_ptr(&rcu_preempt_data));
669}
670
671#endif /* #ifdef CONFIG_RCU_BOOST */
672
673/*
674 * Queue a preemptible-RCU callback for invocation after a grace period.
675 */
676void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
677{
678	__call_rcu(head, func, &rcu_preempt_state, -1, 0);
679}
680EXPORT_SYMBOL_GPL(call_rcu);
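/*
 * Illustrative usage sketch, not part of the original file: the classic
 * call_rcu() pattern for deferred freeing, assuming a caller-defined
 * structure with an embedded rcu_head.  struct foo, foo_reclaim(), and
 * remove_foo() are hypothetical names:
 *
 *	struct foo {
 *		struct list_head list;
 *		struct rcu_head rcu;
 *	};
 *
 *	static void foo_reclaim(struct rcu_head *rhp)
 *	{
 *		kfree(container_of(rhp, struct foo, rcu));
 *	}
 *
 *	static void remove_foo(struct foo *p)	// Caller holds update-side lock.
 *	{
 *		list_del_rcu(&p->list);
 *		call_rcu(&p->rcu, foo_reclaim);	// foo_reclaim() runs after a GP.
 *	}
 */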
681
682/*
683 * Queue an RCU callback for lazy invocation after a grace period.
684 * This will likely be later named something like "call_rcu_lazy()",
685 * but this change will require some way of tagging the lazy RCU
686 * callbacks in the list of pending callbacks.  Until then, this
687 * function may only be called from __kfree_rcu().
688 */
689void kfree_call_rcu(struct rcu_head *head,
690		    void (*func)(struct rcu_head *rcu))
691{
692	__call_rcu(head, func, &rcu_preempt_state, -1, 1);
693}
694EXPORT_SYMBOL_GPL(kfree_call_rcu);
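/*
 * Illustrative note, not part of the original file: callers do not
 * invoke kfree_call_rcu() directly; they use the kfree_rcu() wrapper,
 * which encodes the rcu_head offset and reaches this function via
 * __kfree_rcu().  Assuming the hypothetical struct foo from the
 * call_rcu() sketch above, the reclaim path collapses to:
 *
 *	list_del_rcu(&p->list);
 *	kfree_rcu(p, rcu);	// Roughly call_rcu() plus kfree().
 */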
695
696/**
697 * synchronize_rcu - wait until a grace period has elapsed.
698 *
699 * Control will return to the caller some time after a full grace
700 * period has elapsed, in other words after all currently executing RCU
701 * read-side critical sections have completed.  Note, however, that
702 * upon return from synchronize_rcu(), the caller might well be executing
703 * concurrently with new RCU read-side critical sections that began while
704 * synchronize_rcu() was waiting.  RCU read-side critical sections are
705 * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
706 *
707 * See the description of synchronize_sched() for more detailed information
708 * on memory ordering guarantees.
709 */
710void synchronize_rcu(void)
711{
712	rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
713			   !lock_is_held(&rcu_lock_map) &&
714			   !lock_is_held(&rcu_sched_lock_map),
715			   "Illegal synchronize_rcu() in RCU read-side critical section");
716	if (!rcu_scheduler_active)
717		return;
718	if (rcu_expedited)
719		synchronize_rcu_expedited();
720	else
721		wait_rcu_gp(call_rcu);
722}
723EXPORT_SYMBOL_GPL(synchronize_rcu);
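/*
 * Illustrative updater sketch, not part of the original file: the
 * unpublish-then-wait pattern that synchronize_rcu() supports.  The
 * identifiers gp, gp_lock, and struct foo are hypothetical:
 *
 *	struct foo *old, *new;
 *
 *	spin_lock(&gp_lock);
 *	old = rcu_dereference_protected(gp, lockdep_is_held(&gp_lock));
 *	rcu_assign_pointer(gp, new);		// Publish the replacement.
 *	spin_unlock(&gp_lock);
 *
 *	synchronize_rcu();	// Wait for pre-existing readers of "old".
 *	kfree(old);		// Safe: no reader can still reference it.
 */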
724
725static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
726static unsigned long sync_rcu_preempt_exp_count;
727static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
728
729/*
730 * Return non-zero if there are any tasks in RCU read-side critical
731 * sections blocking the current preemptible-RCU expedited grace period.
732 * If there is no preemptible-RCU expedited grace period currently in
733 * progress, returns zero unconditionally.
734 */
735static int rcu_preempted_readers_exp(struct rcu_node *rnp)
736{
737	return rnp->exp_tasks != NULL;
738}
739
740/*
741 * Return non-zero if there is no RCU expedited grace period in progress
742 * for the specified rcu_node structure, in other words, if all CPUs and
743 * tasks covered by the specified rcu_node structure have done their bit
744 * for the current expedited grace period.  Works only for preemptible
745 * RCU -- other RCU implementations use other means.
746 *
747 * Caller must hold sync_rcu_preempt_exp_mutex.
748 */
749static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
750{
751	return !rcu_preempted_readers_exp(rnp) &&
752	       ACCESS_ONCE(rnp->expmask) == 0;
753}
754
755/*
756 * Report the exit from RCU read-side critical section for the last task
757 * that queued itself during or before the current expedited preemptible-RCU
758 * grace period.  This event is reported either to the rcu_node structure on
759 * which the task was queued or to one of that rcu_node structure's ancestors,
760 * recursively up the tree.  (Calm down, calm down, we do the recursion
761 * iteratively!)
762 *
763 * Most callers will set the "wake" flag, but the task initiating the
764 * expedited grace period need not wake itself.
765 *
766 * Caller must hold sync_rcu_preempt_exp_mutex.
767 */
768static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
769			       bool wake)
770{
771	unsigned long flags;
772	unsigned long mask;
773
774	raw_spin_lock_irqsave(&rnp->lock, flags);
775	for (;;) {
776		if (!sync_rcu_preempt_exp_done(rnp)) {
777			raw_spin_unlock_irqrestore(&rnp->lock, flags);
778			break;
779		}
780		if (rnp->parent == NULL) {
781			raw_spin_unlock_irqrestore(&rnp->lock, flags);
782			if (wake) {
783				smp_mb(); /* EGP done before wake_up(). */
784				wake_up(&sync_rcu_preempt_exp_wq);
785			}
786			break;
787		}
788		mask = rnp->grpmask;
789		raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
790		rnp = rnp->parent;
791		raw_spin_lock(&rnp->lock); /* irqs already disabled */
792		rnp->expmask &= ~mask;
793	}
794}
795
796/*
797 * Snapshot the tasks blocking the newly started preemptible-RCU expedited
798 * grace period for the specified rcu_node structure.  If there are no such
799 * tasks, report it up the rcu_node hierarchy.
800 *
801 * Caller must hold sync_rcu_preempt_exp_mutex and must exclude
802 * CPU hotplug operations.
803 */
804static void
805sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
806{
807	unsigned long flags;
808	int must_wait = 0;
809
810	raw_spin_lock_irqsave(&rnp->lock, flags);
811	if (list_empty(&rnp->blkd_tasks)) {
812		raw_spin_unlock_irqrestore(&rnp->lock, flags);
813	} else {
814		rnp->exp_tasks = rnp->blkd_tasks.next;
815		rcu_initiate_boost(rnp, flags);  /* releases rnp->lock */
816		must_wait = 1;
817	}
818	if (!must_wait)
819		rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */
820}
821
822/**
823 * synchronize_rcu_expedited - Brute-force RCU grace period
824 *
825 * Wait for an RCU-preempt grace period, but expedite it.  The basic
826 * idea is to invoke synchronize_sched_expedited() to push all the tasks to
827 * the ->blkd_tasks lists and wait for this list to drain.  This consumes
828 * significant time on all CPUs and is unfriendly to real-time workloads,
829 * so it is not recommended for any sort of common-case code.
830 * In fact, if you are using synchronize_rcu_expedited() in a loop,
831 * please restructure your code to batch your updates, and then use a
832 * single synchronize_rcu() instead.
833 *
834 * Note that it is illegal to call this function while holding any lock
835 * that is acquired by a CPU-hotplug notifier.  And yes, it is also illegal
836 * to call this function from a CPU-hotplug notifier.  Failing to observe
837 * these restrictions will result in deadlock.
838 */
839void synchronize_rcu_expedited(void)
840{
841	unsigned long flags;
842	struct rcu_node *rnp;
843	struct rcu_state *rsp = &rcu_preempt_state;
844	unsigned long snap;
845	int trycount = 0;
846
847	smp_mb(); /* Caller's modifications seen first by other CPUs. */
848	snap = ACCESS_ONCE(sync_rcu_preempt_exp_count) + 1;
849	smp_mb(); /* Above access cannot bleed into critical section. */
850
851	/*
852	 * Block CPU-hotplug operations.  This means that any CPU-hotplug
853	 * operation that finds an rcu_node structure with tasks in the
854	 * process of being boosted will know that all tasks blocking
855	 * this expedited grace period will already be in the process of
856	 * being boosted.  This simplifies the process of moving tasks
857	 * from leaf to root rcu_node structures.
858	 */
859	get_online_cpus();
860
861	/*
862	 * Acquire lock, falling back to synchronize_rcu() if too many
863	 * lock-acquisition failures.  Of course, if someone does the
864	 * expedited grace period for us, just leave.
865	 */
866	while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) {
867		if (ULONG_CMP_LT(snap,
868		    ACCESS_ONCE(sync_rcu_preempt_exp_count))) {
869			put_online_cpus();
870			goto mb_ret; /* Others did our work for us. */
871		}
872		if (trycount++ < 10) {
873			udelay(trycount * num_online_cpus());
874		} else {
875			put_online_cpus();
876			wait_rcu_gp(call_rcu);
877			return;
878		}
879	}
880	if (ULONG_CMP_LT(snap, ACCESS_ONCE(sync_rcu_preempt_exp_count))) {
881		put_online_cpus();
882		goto unlock_mb_ret; /* Others did our work for us. */
883	}
884
885	/* force all RCU readers onto ->blkd_tasks lists. */
886	synchronize_sched_expedited();
887
888	/* Initialize ->expmask for all non-leaf rcu_node structures. */
889	rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) {
890		raw_spin_lock_irqsave(&rnp->lock, flags);
891		rnp->expmask = rnp->qsmaskinit;
892		raw_spin_unlock_irqrestore(&rnp->lock, flags);
893	}
894
895	/* Snapshot current state of ->blkd_tasks lists. */
896	rcu_for_each_leaf_node(rsp, rnp)
897		sync_rcu_preempt_exp_init(rsp, rnp);
898	if (NUM_RCU_NODES > 1)
899		sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp));
900
901	put_online_cpus();
902
903	/* Wait for snapshotted ->blkd_tasks lists to drain. */
904	rnp = rcu_get_root(rsp);
905	wait_event(sync_rcu_preempt_exp_wq,
906		   sync_rcu_preempt_exp_done(rnp));
907
908	/* Clean up and exit. */
909	smp_mb(); /* ensure expedited GP seen before counter increment. */
910	ACCESS_ONCE(sync_rcu_preempt_exp_count)++;
911unlock_mb_ret:
912	mutex_unlock(&sync_rcu_preempt_exp_mutex);
913mb_ret:
914	smp_mb(); /* ensure subsequent action seen after grace period. */
915}
916EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
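/*
 * Illustrative note, not part of the original file, expanding on the
 * batching advice in the header comment above.  update_element() and n
 * are hypothetical:
 *
 *	// Unfriendly: one expedited grace period per update.
 *	for (i = 0; i < n; i++) {
 *		update_element(i);
 *		synchronize_rcu_expedited();
 *	}
 *
 *	// Preferred: batch the updates, then wait a single time.
 *	for (i = 0; i < n; i++)
 *		update_element(i);
 *	synchronize_rcu();
 */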
917
918/**
919 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
920 *
921 * Note that this primitive does not necessarily wait for an RCU grace period
922 * to complete.  For example, if there are no RCU callbacks queued anywhere
923 * in the system, then rcu_barrier() is within its rights to return
924 * immediately, without waiting for anything, much less an RCU grace period.
925 */
926void rcu_barrier(void)
927{
928	_rcu_barrier(&rcu_preempt_state);
929}
930EXPORT_SYMBOL_GPL(rcu_barrier);
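/*
 * Illustrative usage sketch, not part of the original file: a module
 * that posts callbacks with call_rcu() must wait for them before
 * unloading, because pending callbacks may reference module code or
 * data.  my_module_exit(), unregister_my_hooks(), and my_cache are
 * hypothetical:
 *
 *	static void __exit my_module_exit(void)
 *	{
 *		unregister_my_hooks();	// Stop posting new callbacks.
 *		rcu_barrier();		// Wait for callbacks already posted.
 *		kmem_cache_destroy(my_cache);
 *	}
 */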
931
932/*
933 * Initialize preemptible RCU's state structures.
934 */
935static void __init __rcu_init_preempt(void)
936{
937	rcu_init_one(&rcu_preempt_state, &rcu_preempt_data);
938}
939
940/*
941 * Check for a task exiting while in a preemptible-RCU read-side
942 * critical section, clean up if so.  No need to issue warnings,
943 * as debug_check_no_locks_held() already does this if lockdep
944 * is enabled.
945 */
946void exit_rcu(void)
947{
948	struct task_struct *t = current;
949
950	if (likely(list_empty(&current->rcu_node_entry)))
951		return;
952	t->rcu_read_lock_nesting = 1;
953	barrier();
954	t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED;
955	__rcu_read_unlock();
956}
957
958#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
959
960static struct rcu_state *rcu_state = &rcu_sched_state;
961
962/*
963 * Tell them what RCU they are running.
964 */
965static void __init rcu_bootup_announce(void)
966{
967	pr_info("Hierarchical RCU implementation.\n");
968	rcu_bootup_announce_oddness();
969}
970
971/*
972 * Return the number of RCU batches processed thus far for debug & stats.
973 */
974long rcu_batches_completed(void)
975{
976	return rcu_batches_completed_sched();
977}
978EXPORT_SYMBOL_GPL(rcu_batches_completed);
979
980/*
981 * Force a quiescent state for RCU, which, because there is no preemptible
982 * RCU, becomes the same as rcu-sched.
983 */
984void rcu_force_quiescent_state(void)
985{
986	rcu_sched_force_quiescent_state();
987}
988EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
989
990/*
991 * Because preemptible RCU does not exist, we never have to check for
992 * CPUs being in quiescent states.
993 */
994static void rcu_preempt_note_context_switch(int cpu)
995{
996}
997
998/*
999 * Because preemptible RCU does not exist, there are never any preempted
1000 * RCU readers.
1001 */
1002static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
1003{
1004	return 0;
1005}
1006
1007#ifdef CONFIG_HOTPLUG_CPU
1008
1009/* Because preemptible RCU does not exist, no quieting of tasks. */
1010static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
1011{
1012	raw_spin_unlock_irqrestore(&rnp->lock, flags);
1013}
1014
1015#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1016
1017/*
1018 * Because preemptible RCU does not exist, we never have to check for
1019 * tasks blocked within RCU read-side critical sections.
1020 */
1021static void rcu_print_detail_task_stall(struct rcu_state *rsp)
1022{
1023}
1024
1025/*
1026 * Because preemptible RCU does not exist, we never have to check for
1027 * tasks blocked within RCU read-side critical sections.
1028 */
1029static int rcu_print_task_stall(struct rcu_node *rnp)
1030{
1031	return 0;
1032}
1033
1034/*
1035 * Because there is no preemptible RCU, there can be no readers blocked,
1036 * so there is no need to check for blocked tasks.  So check only for
1037 * bogus qsmask values.
1038 */
1039static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
1040{
1041	WARN_ON_ONCE(rnp->qsmask);
1042}
1043
1044#ifdef CONFIG_HOTPLUG_CPU
1045
1046/*
1047 * Because preemptible RCU does not exist, it never needs to migrate
1048 * tasks that were blocked within RCU read-side critical sections, and
1049 * such non-existent tasks cannot possibly have been blocking the current
1050 * grace period.
1051 */
1052static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
1053				     struct rcu_node *rnp,
1054				     struct rcu_data *rdp)
1055{
1056	return 0;
1057}
1058
1059#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1060
1061/*
1062 * Because preemptible RCU does not exist, it never has any callbacks
1063 * to check.
1064 */
1065static void rcu_preempt_check_callbacks(int cpu)
1066{
1067}
1068
1069/*
1070 * Queue an RCU callback for lazy invocation after a grace period.
1071 * This will likely be later named something like "call_rcu_lazy()",
1072 * but this change will require some way of tagging the lazy RCU
1073 * callbacks in the list of pending callbacks.  Until then, this
1074 * function may only be called from __kfree_rcu().
1075 *
1076 * Because there is no preemptible RCU, we use RCU-sched instead.
1077 */
1078void kfree_call_rcu(struct rcu_head *head,
1079		    void (*func)(struct rcu_head *rcu))
1080{
1081	__call_rcu(head, func, &rcu_sched_state, -1, 1);
1082}
1083EXPORT_SYMBOL_GPL(kfree_call_rcu);
1084
1085/*
1086 * Wait for an rcu-preempt grace period, but make it happen quickly.
1087 * Because preemptible RCU does not exist, map onto rcu-sched.
1088 */
1089void synchronize_rcu_expedited(void)
1090{
1091	synchronize_sched_expedited();
1092}
1093EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
1094
1095#ifdef CONFIG_HOTPLUG_CPU
1096
1097/*
1098 * Because preemptible RCU does not exist, there is never any need to
1099 * report on tasks preempted in RCU read-side critical sections during
1100 * expedited RCU grace periods.
1101 */
1102static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
1103			       bool wake)
1104{
1105}
1106
1107#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1108
1109/*
1110 * Because preemptible RCU does not exist, rcu_barrier() is just
1111 * another name for rcu_barrier_sched().
1112 */
1113void rcu_barrier(void)
1114{
1115	rcu_barrier_sched();
1116}
1117EXPORT_SYMBOL_GPL(rcu_barrier);
1118
1119/*
1120 * Because preemptible RCU does not exist, it need not be initialized.
1121 */
1122static void __init __rcu_init_preempt(void)
1123{
1124}
1125
1126/*
1127 * Because preemptible RCU does not exist, tasks cannot possibly exit
1128 * while in preemptible RCU read-side critical sections.
1129 */
1130void exit_rcu(void)
1131{
1132}
1133
1134#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
1135
1136#ifdef CONFIG_RCU_BOOST
1137
1138#include "../locking/rtmutex_common.h"
1139
1140#ifdef CONFIG_RCU_TRACE
1141
1142static void rcu_initiate_boost_trace(struct rcu_node *rnp)
1143{
1144	if (list_empty(&rnp->blkd_tasks))
1145		rnp->n_balk_blkd_tasks++;
1146	else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL)
1147		rnp->n_balk_exp_gp_tasks++;
1148	else if (rnp->gp_tasks != NULL && rnp->boost_tasks != NULL)
1149		rnp->n_balk_boost_tasks++;
1150	else if (rnp->gp_tasks != NULL && rnp->qsmask != 0)
1151		rnp->n_balk_notblocked++;
1152	else if (rnp->gp_tasks != NULL &&
1153		 ULONG_CMP_LT(jiffies, rnp->boost_time))
1154		rnp->n_balk_notyet++;
1155	else
1156		rnp->n_balk_nos++;
1157}
1158
1159#else /* #ifdef CONFIG_RCU_TRACE */
1160
1161static void rcu_initiate_boost_trace(struct rcu_node *rnp)
1162{
1163}
1164
1165#endif /* #else #ifdef CONFIG_RCU_TRACE */
1166
1167static void rcu_wake_cond(struct task_struct *t, int status)
1168{
1169	/*
1170	 * If the thread is yielding, only wake it when this
1171	 * is invoked from idle
1172	 */
1173	if (status != RCU_KTHREAD_YIELDING || is_idle_task(current))
1174		wake_up_process(t);
1175}
1176
1177/*
1178 * Carry out RCU priority boosting on the task indicated by ->exp_tasks
1179 * or ->boost_tasks, advancing the pointer to the next task in the
1180 * ->blkd_tasks list.
1181 *
1182 * Note that irqs must be enabled: boosting the task can block.
1183 * Returns 1 if there are more tasks needing to be boosted.
1184 */
1185static int rcu_boost(struct rcu_node *rnp)
1186{
1187	unsigned long flags;
1188	struct rt_mutex mtx;
1189	struct task_struct *t;
1190	struct list_head *tb;
1191
1192	if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL)
1193		return 0;  /* Nothing left to boost. */
1194
1195	raw_spin_lock_irqsave(&rnp->lock, flags);
1196
1197	/*
1198	 * Recheck under the lock: all tasks in need of boosting
1199	 * might exit their RCU read-side critical sections on their own.
1200	 */
1201	if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) {
1202		raw_spin_unlock_irqrestore(&rnp->lock, flags);
1203		return 0;
1204	}
1205
1206	/*
1207	 * Preferentially boost tasks blocking expedited grace periods.
1208	 * This cannot starve the normal grace periods because a second
1209	 * expedited grace period must boost all blocked tasks, including
1210	 * those blocking the pre-existing normal grace period.
1211	 */
1212	if (rnp->exp_tasks != NULL) {
1213		tb = rnp->exp_tasks;
1214		rnp->n_exp_boosts++;
1215	} else {
1216		tb = rnp->boost_tasks;
1217		rnp->n_normal_boosts++;
1218	}
1219	rnp->n_tasks_boosted++;
1220
1221	/*
1222	 * We boost task t by manufacturing an rt_mutex that appears to
1223	 * be held by task t.  We leave a pointer to that rt_mutex where
1224	 * task t can find it, and task t will release the mutex when it
1225	 * exits its outermost RCU read-side critical section.  Then
1226	 * simply acquiring this artificial rt_mutex will boost task
1227	 * t's priority.  (Thanks to tglx for suggesting this approach!)
1228	 *
1229	 * Note that task t must acquire rnp->lock to remove itself from
1230	 * the ->blkd_tasks list, which it will do from exit() if from
1231	 * nowhere else.  We therefore are guaranteed that task t will
1232	 * stay around at least until we drop rnp->lock.  Note that
1233	 * rnp->lock also resolves races between our priority boosting
1234	 * and task t's exiting its outermost RCU read-side critical
1235	 * section.
1236	 */
1237	t = container_of(tb, struct task_struct, rcu_node_entry);
1238	rt_mutex_init_proxy_locked(&mtx, t);
1239	t->rcu_boost_mutex = &mtx;
1240	raw_spin_unlock_irqrestore(&rnp->lock, flags);
1241	rt_mutex_lock(&mtx);  /* Side effect: boosts task t's priority. */
1242	rt_mutex_unlock(&mtx);  /* Keep lockdep happy. */
1243
1244	return ACCESS_ONCE(rnp->exp_tasks) != NULL ||
1245	       ACCESS_ONCE(rnp->boost_tasks) != NULL;
1246}
1247
1248/*
1249 * Priority-boosting kthread.  One per leaf rcu_node and one for the
1250 * root rcu_node.
1251 */
1252static int rcu_boost_kthread(void *arg)
1253{
1254	struct rcu_node *rnp = (struct rcu_node *)arg;
1255	int spincnt = 0;
1256	int more2boost;
1257
1258	trace_rcu_utilization(TPS("Start boost kthread@init"));
1259	for (;;) {
1260		rnp->boost_kthread_status = RCU_KTHREAD_WAITING;
1261		trace_rcu_utilization(TPS("End boost kthread@rcu_wait"));
1262		rcu_wait(rnp->boost_tasks || rnp->exp_tasks);
1263		trace_rcu_utilization(TPS("Start boost kthread@rcu_wait"));
1264		rnp->boost_kthread_status = RCU_KTHREAD_RUNNING;
1265		more2boost = rcu_boost(rnp);
1266		if (more2boost)
1267			spincnt++;
1268		else
1269			spincnt = 0;
1270		if (spincnt > 10) {
1271			rnp->boost_kthread_status = RCU_KTHREAD_YIELDING;
1272			trace_rcu_utilization(TPS("End boost kthread@rcu_yield"));
1273			schedule_timeout_interruptible(2);
1274			trace_rcu_utilization(TPS("Start boost kthread@rcu_yield"));
1275			spincnt = 0;
1276		}
1277	}
1278	/* NOTREACHED */
1279	trace_rcu_utilization(TPS("End boost kthread@notreached"));
1280	return 0;
1281}
1282
1283/*
1284 * Check to see if it is time to start boosting RCU readers that are
1285 * blocking the current grace period, and, if so, tell the per-rcu_node
1286 * kthread to start boosting them.  If there is an expedited grace
1287 * period in progress, it is always time to boost.
1288 *
1289 * The caller must hold rnp->lock, which this function releases.
1290 * The ->boost_kthread_task is immortal, so we don't need to worry
1291 * about it going away.
1292 */
1293static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1294{
1295	struct task_struct *t;
1296
1297	if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {
1298		rnp->n_balk_exp_gp_tasks++;
1299		raw_spin_unlock_irqrestore(&rnp->lock, flags);
1300		return;
1301	}
1302	if (rnp->exp_tasks != NULL ||
1303	    (rnp->gp_tasks != NULL &&
1304	     rnp->boost_tasks == NULL &&
1305	     rnp->qsmask == 0 &&
1306	     ULONG_CMP_GE(jiffies, rnp->boost_time))) {
1307		if (rnp->exp_tasks == NULL)
1308			rnp->boost_tasks = rnp->gp_tasks;
1309		raw_spin_unlock_irqrestore(&rnp->lock, flags);
1310		t = rnp->boost_kthread_task;
1311		if (t)
1312			rcu_wake_cond(t, rnp->boost_kthread_status);
1313	} else {
1314		rcu_initiate_boost_trace(rnp);
1315		raw_spin_unlock_irqrestore(&rnp->lock, flags);
1316	}
1317}
1318
1319/*
1320 * Wake up the per-CPU kthread to invoke RCU callbacks.
1321 */
1322static void invoke_rcu_callbacks_kthread(void)
1323{
1324	unsigned long flags;
1325
1326	local_irq_save(flags);
1327	__this_cpu_write(rcu_cpu_has_work, 1);
1328	if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
1329	    current != __this_cpu_read(rcu_cpu_kthread_task)) {
1330		rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task),
1331			      __this_cpu_read(rcu_cpu_kthread_status));
1332	}
1333	local_irq_restore(flags);
1334}
1335
1336/*
1337 * Is the current CPU running the RCU-callbacks kthread?
1338 * Caller must have preemption disabled.
1339 */
1340static bool rcu_is_callbacks_kthread(void)
1341{
1342	return __this_cpu_read(rcu_cpu_kthread_task) == current;
1343}
1344
1345#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
1346
1347/*
1348 * Do priority-boost accounting for the start of a new grace period.
1349 */
1350static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1351{
1352	rnp->boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
1353}
1354
1355/*
1356 * Create an RCU-boost kthread for the specified node if one does not
1357 * already exist.  We only create this kthread for preemptible RCU.
1358 * Returns zero if all is well, a negated errno otherwise.
1359 */
1360static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1361						 struct rcu_node *rnp)
1362{
1363	int rnp_index = rnp - &rsp->node[0];
1364	unsigned long flags;
1365	struct sched_param sp;
1366	struct task_struct *t;
1367
1368	if (&rcu_preempt_state != rsp)
1369		return 0;
1370
1371	if (!rcu_scheduler_fully_active || rnp->qsmaskinit == 0)
1372		return 0;
1373
1374	rsp->boost = 1;
1375	if (rnp->boost_kthread_task != NULL)
1376		return 0;
1377	t = kthread_create(rcu_boost_kthread, (void *)rnp,
1378			   "rcub/%d", rnp_index);
1379	if (IS_ERR(t))
1380		return PTR_ERR(t);
1381	raw_spin_lock_irqsave(&rnp->lock, flags);
1382	rnp->boost_kthread_task = t;
1383	raw_spin_unlock_irqrestore(&rnp->lock, flags);
1384	sp.sched_priority = RCU_BOOST_PRIO;
1385	sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1386	wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
1387	return 0;
1388}
1389
1390static void rcu_kthread_do_work(void)
1391{
1392	rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data));
1393	rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data));
1394	rcu_preempt_do_callbacks();
1395}
1396
1397static void rcu_cpu_kthread_setup(unsigned int cpu)
1398{
1399	struct sched_param sp;
1400
1401	sp.sched_priority = RCU_KTHREAD_PRIO;
1402	sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
1403}
1404
1405static void rcu_cpu_kthread_park(unsigned int cpu)
1406{
1407	per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
1408}
1409
1410static int rcu_cpu_kthread_should_run(unsigned int cpu)
1411{
1412	return __this_cpu_read(rcu_cpu_has_work);
1413}
1414
1415/*
1416 * Per-CPU kernel thread that invokes RCU callbacks.  This replaces the
1417 * RCU softirq used in flavors and configurations of RCU that do not
1418 * support RCU priority boosting.
1419 */
1420static void rcu_cpu_kthread(unsigned int cpu)
1421{
1422	unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
1423	char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
1424	int spincnt;
1425
1426	for (spincnt = 0; spincnt < 10; spincnt++) {
1427		trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
1428		local_bh_disable();
1429		*statusp = RCU_KTHREAD_RUNNING;
1430		this_cpu_inc(rcu_cpu_kthread_loops);
1431		local_irq_disable();
1432		work = *workp;
1433		*workp = 0;
1434		local_irq_enable();
1435		if (work)
1436			rcu_kthread_do_work();
1437		local_bh_enable();
1438		if (*workp == 0) {
1439			trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
1440			*statusp = RCU_KTHREAD_WAITING;
1441			return;
1442		}
1443	}
1444	*statusp = RCU_KTHREAD_YIELDING;
1445	trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
1446	schedule_timeout_interruptible(2);
1447	trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
1448	*statusp = RCU_KTHREAD_WAITING;
1449}
1450
1451/*
1452 * Set the per-rcu_node kthread's affinity to cover all CPUs that are
1453 * served by the rcu_node in question.  The CPU hotplug lock is still
1454 * held, so the value of rnp->qsmaskinit will be stable.
1455 *
1456 * We don't include outgoingcpu in the affinity set; use -1 if there is
1457 * no outgoing CPU.  If there are no CPUs left in the affinity set,
1458 * this function allows the kthread to execute on any CPU.
1459 */
1460static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1461{
1462	struct task_struct *t = rnp->boost_kthread_task;
1463	unsigned long mask = rnp->qsmaskinit;
1464	cpumask_var_t cm;
1465	int cpu;
1466
1467	if (!t)
1468		return;
1469	if (!zalloc_cpumask_var(&cm, GFP_KERNEL))
1470		return;
1471	for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1)
1472		if ((mask & 0x1) && cpu != outgoingcpu)
1473			cpumask_set_cpu(cpu, cm);
1474	if (cpumask_weight(cm) == 0) {
1475		cpumask_setall(cm);
1476		for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++)
1477			cpumask_clear_cpu(cpu, cm);
1478		WARN_ON_ONCE(cpumask_weight(cm) == 0);
1479	}
1480	set_cpus_allowed_ptr(t, cm);
1481	free_cpumask_var(cm);
1482}
1483
1484static struct smp_hotplug_thread rcu_cpu_thread_spec = {
1485	.store			= &rcu_cpu_kthread_task,
1486	.thread_should_run	= rcu_cpu_kthread_should_run,
1487	.thread_fn		= rcu_cpu_kthread,
1488	.thread_comm		= "rcuc/%u",
1489	.setup			= rcu_cpu_kthread_setup,
1490	.park			= rcu_cpu_kthread_park,
1491};
1492
1493/*
1494 * Spawn all kthreads -- called as soon as the scheduler is running.
1495 */
1496static int __init rcu_spawn_kthreads(void)
1497{
1498	struct rcu_node *rnp;
1499	int cpu;
1500
1501	rcu_scheduler_fully_active = 1;
1502	for_each_possible_cpu(cpu)
1503		per_cpu(rcu_cpu_has_work, cpu) = 0;
1504	BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
1505	rnp = rcu_get_root(rcu_state);
1506	(void)rcu_spawn_one_boost_kthread(rcu_state, rnp);
1507	if (NUM_RCU_NODES > 1) {
1508		rcu_for_each_leaf_node(rcu_state, rnp)
1509			(void)rcu_spawn_one_boost_kthread(rcu_state, rnp);
1510	}
1511	return 0;
1512}
1513early_initcall(rcu_spawn_kthreads);
1514
1515static void rcu_prepare_kthreads(int cpu)
1516{
1517	struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
1518	struct rcu_node *rnp = rdp->mynode;
1519
1520	/* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
1521	if (rcu_scheduler_fully_active)
1522		(void)rcu_spawn_one_boost_kthread(rcu_state, rnp);
1523}
1524
1525#else /* #ifdef CONFIG_RCU_BOOST */
1526
1527static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1528{
1529	raw_spin_unlock_irqrestore(&rnp->lock, flags);
1530}
1531
1532static void invoke_rcu_callbacks_kthread(void)
1533{
1534	WARN_ON_ONCE(1);
1535}
1536
1537static bool rcu_is_callbacks_kthread(void)
1538{
1539	return false;
1540}
1541
1542static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1543{
1544}
1545
1546static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1547{
1548}
1549
1550static int __init rcu_scheduler_really_started(void)
1551{
1552	rcu_scheduler_fully_active = 1;
1553	return 0;
1554}
1555early_initcall(rcu_scheduler_really_started);
1556
1557static void rcu_prepare_kthreads(int cpu)
1558{
1559}
1560
1561#endif /* #else #ifdef CONFIG_RCU_BOOST */
1562
1563#if !defined(CONFIG_RCU_FAST_NO_HZ)
1564
1565/*
1566 * Check to see if any future RCU-related work will need to be done
1567 * by the current CPU, even if none need be done immediately, returning
1568 * 1 if so.  This function is part of the RCU implementation; it is -not-
1569 * an exported member of the RCU API.
1570 *
1571 * Because we do not have RCU_FAST_NO_HZ, just check whether this CPU needs
1572 * any flavor of RCU.
1573 */
1574int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
1575{
1576	*delta_jiffies = ULONG_MAX;
1577	return rcu_cpu_has_callbacks(cpu, NULL);
1578}
1579
1580/*
1581 * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
1582 * after it.
1583 */
1584static void rcu_cleanup_after_idle(int cpu)
1585{
1586}
1587
1588/*
1589 * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n,
1590 * is nothing.
1591 */
1592static void rcu_prepare_for_idle(int cpu)
1593{
1594}
1595
1596/*
1597 * Don't bother keeping a running count of the number of RCU callbacks
1598 * posted because CONFIG_RCU_FAST_NO_HZ=n.
1599 */
1600static void rcu_idle_count_callbacks_posted(void)
1601{
1602}
1603
1604#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
1605
1606/*
1607 * This code is invoked when a CPU goes idle, at which point we want
1608 * to have the CPU do everything required for RCU so that it can enter
1609 * the energy-efficient dyntick-idle mode.  This is handled by a
1610 * state machine implemented by rcu_prepare_for_idle() below.
1611 *
1612 * The following two preprocessor symbols control this state machine:
1613 *
1614 * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted
1615 *	to sleep in dyntick-idle mode with RCU callbacks pending.  This
1616 *	is sized to be roughly one RCU grace period.  Those energy-efficiency
1617 *	benchmarkers who might otherwise be tempted to set this to a large
1618 *	number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your
1619 *	system.  And if you are -that- concerned about energy efficiency,
1620 *	just power the system down and be done with it!
1621 * RCU_IDLE_LAZY_GP_DELAY gives the number of jiffies that a CPU is
1622 *	permitted to sleep in dyntick-idle mode with only lazy RCU
1623 *	callbacks pending.  Setting this too high can OOM your system.
1624 *
1625 * The values below work well in practice.  If future workloads require
1626 * adjustment, they can be converted into kernel config parameters, though
1627 * making the state machine smarter might be a better option.
1628 */
1629#define RCU_IDLE_GP_DELAY 4		/* Roughly one grace period. */
1630#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ)	/* Roughly six seconds. */
1631
1632static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY;
1633module_param(rcu_idle_gp_delay, int, 0644);
1634static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY;
1635module_param(rcu_idle_lazy_gp_delay, int, 0644);
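/*
 * Illustrative note, not part of the original file: because the two
 * delays above are writable module parameters (mode 0644), they can be
 * tuned without rebuilding, either on the boot command line (the
 * kernel-parameters documentation lists them under the "rcutree."
 * prefix) or at run time through sysfs, for example:
 *
 *	echo 8 > /sys/module/<module>/parameters/rcu_idle_gp_delay
 *
 * where <module> is whatever parameter namespace this translation unit
 * has in the running kernel.
 */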
1636
1637extern int tick_nohz_enabled;
1638
1639/*
1640 * Try to advance callbacks for all flavors of RCU on the current CPU, but
1641 * only if it has been a while since the last time we did so.  Afterwards,
1642 * if there are any callbacks ready for immediate invocation, return true.
1643 */
1644static bool rcu_try_advance_all_cbs(void)
1645{
1646	bool cbs_ready = false;
1647	struct rcu_data *rdp;
1648	struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
1649	struct rcu_node *rnp;
1650	struct rcu_state *rsp;
1651
1652	/* Exit early if we advanced recently. */
1653	if (jiffies == rdtp->last_advance_all)
1654		return false;
1655	rdtp->last_advance_all = jiffies;
1656
1657	for_each_rcu_flavor(rsp) {
1658		rdp = this_cpu_ptr(rsp->rda);
1659		rnp = rdp->mynode;
1660
1661		/*
1662		 * Don't bother checking unless a grace period has
1663		 * completed since we last checked and there are
1664		 * callbacks not yet ready to invoke.
1665		 */
1666		if (rdp->completed != rnp->completed &&
1667		    rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL])
1668			note_gp_changes(rsp, rdp);
1669
1670		if (cpu_has_callbacks_ready_to_invoke(rdp))
1671			cbs_ready = true;
1672	}
1673	return cbs_ready;
1674}
1675
1676/*
1677 * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
1678 * to invoke.  If the CPU has callbacks, try to advance them.  Tell the
1679 * caller to set the timeout based on whether or not there are non-lazy
1680 * callbacks.
1681 *
1682 * The caller must have disabled interrupts.
1683 */
1684int rcu_needs_cpu(int cpu, unsigned long *dj)
1685{
1686	struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1687
1688	/* Snapshot to detect later posting of non-lazy callback. */
1689	rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
1690
1691	/* If no callbacks, RCU doesn't need the CPU. */
1692	if (!rcu_cpu_has_callbacks(cpu, &rdtp->all_lazy)) {
1693		*dj = ULONG_MAX;
1694		return 0;
1695	}
1696
1697	/* Attempt to advance callbacks. */
1698	if (rcu_try_advance_all_cbs()) {
1699		/* Some ready to invoke, so initiate later invocation. */
1700		invoke_rcu_core();
1701		return 1;
1702	}
1703	rdtp->last_accelerate = jiffies;
1704
1705	/* Request timer delay depending on laziness, and round. */
1706	if (!rdtp->all_lazy) {
1707		*dj = round_up(rcu_idle_gp_delay + jiffies,
1708			       rcu_idle_gp_delay) - jiffies;
1709	} else {
1710		*dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;
1711	}
1712	return 0;
1713}
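
/*
 * Worked example of the rounding above, assuming rcu_idle_gp_delay == 4
 * and jiffies == 1003 on a CPU with at least one non-lazy callback:
 *
 *	*dj = round_up(4 + 1003, 4) - 1003 = 1008 - 1003 = 5
 *
 * so the CPU sleeps five jiffies and wakes on a four-jiffy boundary,
 * which tends to batch the wakeups of nearby idle CPUs.
 */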
1714
1715/*
1716 * Prepare a CPU for idle from an RCU perspective.  The first major task
1717 * is to sense whether nohz mode has been enabled or disabled via sysfs.
1718 * The second major task is to check to see if a non-lazy callback has
1719 * arrived at a CPU that previously had only lazy callbacks.  The third
1720 * major task is to accelerate (that is, assign grace-period numbers to)
1721 * any recently arrived callbacks.
1722 *
1723 * The caller must have disabled interrupts.
1724 */
1725static void rcu_prepare_for_idle(int cpu)
1726{
1727	struct rcu_data *rdp;
1728	struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1729	struct rcu_node *rnp;
1730	struct rcu_state *rsp;
1731	int tne;
1732
1733	/* Handle nohz enablement switches conservatively. */
1734	tne = ACCESS_ONCE(tick_nohz_enabled);
1735	if (tne != rdtp->tick_nohz_enabled_snap) {
1736		if (rcu_cpu_has_callbacks(cpu, NULL))
1737			invoke_rcu_core(); /* force nohz to see update. */
1738		rdtp->tick_nohz_enabled_snap = tne;
1739		return;
1740	}
1741	if (!tne)
1742		return;
1743
1744	/* If this is a no-CBs CPU, no callbacks, just return. */
1745	if (rcu_is_nocb_cpu(cpu))
1746		return;
1747
1748	/*
1749	 * If a non-lazy callback arrived at a CPU having only lazy
1750	 * callbacks, invoke RCU core for the side-effect of recalculating
1751	 * idle duration on re-entry to idle.
1752	 */
1753	if (rdtp->all_lazy &&
1754	    rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) {
1755		rdtp->all_lazy = false;
1756		rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
1757		invoke_rcu_core();
1758		return;
1759	}
1760
1761	/*
1762	 * If we have not yet accelerated this jiffy, accelerate all
1763	 * callbacks on this CPU.
1764	 */
1765	if (rdtp->last_accelerate == jiffies)
1766		return;
1767	rdtp->last_accelerate = jiffies;
1768	for_each_rcu_flavor(rsp) {
1769		rdp = per_cpu_ptr(rsp->rda, cpu);
1770		if (!*rdp->nxttail[RCU_DONE_TAIL])
1771			continue;
1772		rnp = rdp->mynode;
1773		raw_spin_lock(&rnp->lock); /* irqs already disabled. */
1774		rcu_accelerate_cbs(rsp, rnp, rdp);
1775		raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1776	}
1777}
1778
1779/*
1780 * Clean up for exit from idle.  Attempt to advance callbacks based on
1781 * any grace periods that elapsed while the CPU was idle, and if any
1782 * callbacks are now ready to invoke, initiate invocation.
1783 */
1784static void rcu_cleanup_after_idle(int cpu)
1785{
1786
1787	if (rcu_is_nocb_cpu(cpu))
1788		return;
1789	if (rcu_try_advance_all_cbs())
1790		invoke_rcu_core();
1791}
1792
1793/*
1794 * Keep a running count of the number of non-lazy callbacks posted
1795 * on this CPU.  This running counter (which is never decremented) allows
1796 * rcu_prepare_for_idle() to detect when something out of the idle loop
1797 * posts a callback, even if an equal number of callbacks are invoked.
1798 * Of course, callbacks should only be posted from within a trace event
1799 * designed to be called from idle or from within RCU_NONIDLE().
1800 */
1801static void rcu_idle_count_callbacks_posted(void)
1802{
1803	__this_cpu_add(rcu_dynticks.nonlazy_posted, 1);
1804}
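
/*
 * Illustrative sketch of the RCU_NONIDLE() usage mentioned above, using
 * a hypothetical rcu_head my_head and callback my_cb: code running from
 * the idle loop must wrap its call_rcu() invocation so that RCU briefly
 * exits its idle bookkeeping:
 *
 *	RCU_NONIDLE(call_rcu(&my_head, my_cb));
 *
 * The nonlazy_posted counter incremented above is what later lets
 * rcu_prepare_for_idle() notice that such a callback arrived.
 */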
1805
1806/*
1807 * Data for flushing lazy RCU callbacks at OOM time.
1808 */
1809static atomic_t oom_callback_count;
1810static DECLARE_WAIT_QUEUE_HEAD(oom_callback_wq);
1811
1812/*
1813 * RCU OOM callback -- decrement the outstanding count and deliver the
1814 * wake-up if we are the last one.
1815 */
1816static void rcu_oom_callback(struct rcu_head *rhp)
1817{
1818	if (atomic_dec_and_test(&oom_callback_count))
1819		wake_up(&oom_callback_wq);
1820}
1821
1822/*
1823 * Post an rcu_oom_notify callback on the current CPU if it has at
1824 * least one lazy callback.  This will unnecessarily post callbacks
1825 * to CPUs that already have a non-lazy callback at the end of their
1826 * callback list, but this is an infrequent operation, so accept some
1827 * extra overhead to keep things simple.
1828 */
1829static void rcu_oom_notify_cpu(void *unused)
1830{
1831	struct rcu_state *rsp;
1832	struct rcu_data *rdp;
1833
1834	for_each_rcu_flavor(rsp) {
1835		rdp = __this_cpu_ptr(rsp->rda);
1836		if (rdp->qlen_lazy != 0) {
1837			atomic_inc(&oom_callback_count);
1838			rsp->call(&rdp->oom_head, rcu_oom_callback);
1839		}
1840	}
1841}
1842
1843/*
1844 * If low on memory, ensure that each CPU has a non-lazy callback.
1845 * This will wake up CPUs that have only lazy callbacks, in turn
1846 * ensuring that they free up the corresponding memory in a timely manner.
1847 * Because an uncertain amount of memory will be freed in some uncertain
1848 * timeframe, we do not claim to have freed anything.
1849 */
1850static int rcu_oom_notify(struct notifier_block *self,
1851			  unsigned long notused, void *nfreed)
1852{
1853	int cpu;
1854
1855	/* Wait for callbacks from earlier instance to complete. */
1856	wait_event(oom_callback_wq, atomic_read(&oom_callback_count) == 0);
1857	smp_mb(); /* Ensure callback reuse happens after callback invocation. */
1858
1859	/*
1860	 * Prevent premature wakeup: ensure that all increments happen
1861	 * before there is a chance of the counter reaching zero.
1862	 */
1863	atomic_set(&oom_callback_count, 1);
1864
1865	get_online_cpus();
1866	for_each_online_cpu(cpu) {
1867		smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1);
1868		cond_resched();
1869	}
1870	put_online_cpus();
1871
1872	/* Unconditionally decrement: no need to wake ourselves up. */
1873	atomic_dec(&oom_callback_count);
1874
1875	return NOTIFY_OK;
1876}
1877
1878static struct notifier_block rcu_oom_nb = {
1879	.notifier_call = rcu_oom_notify
1880};
1881
1882static int __init rcu_register_oom_notifier(void)
1883{
1884	register_oom_notifier(&rcu_oom_nb);
1885	return 0;
1886}
1887early_initcall(rcu_register_oom_notifier);
1888
1889#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
1890
1891#ifdef CONFIG_RCU_CPU_STALL_INFO
1892
1893#ifdef CONFIG_RCU_FAST_NO_HZ
1894
1895static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
1896{
1897	struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1898	unsigned long nlpd = rdtp->nonlazy_posted - rdtp->nonlazy_posted_snap;
1899
1900	sprintf(cp, "last_accelerate: %04lx/%04lx, nonlazy_posted: %ld, %c%c",
1901		rdtp->last_accelerate & 0xffff, jiffies & 0xffff,
1902		ulong2long(nlpd),
1903		rdtp->all_lazy ? 'L' : '.',
1904		rdtp->tick_nohz_enabled_snap ? '.' : 'D');
1905}
1906
1907#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
1908
1909static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
1910{
1911	*cp = '\0';
1912}
1913
1914#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */
1915
1916/* Initiate the stall-info list. */
1917static void print_cpu_stall_info_begin(void)
1918{
1919	pr_cont("\n");
1920}
1921
1922/*
1923 * Print out diagnostic information for the specified stalled CPU.
1924 *
1925 * If the specified CPU is aware of the current RCU grace period
1926 * (flavor specified by rsp), then print the number of scheduling
1927 * clock interrupts the CPU has taken during the time that it has
1928 * been aware.  Otherwise, print the number of RCU grace periods
1929 * that this CPU is ignorant of, for example, "1" if the CPU was
1930 * aware of the previous grace period.
1931 *
1932 * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info.
1933 */
1934static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
1935{
1936	char fast_no_hz[72];
1937	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1938	struct rcu_dynticks *rdtp = rdp->dynticks;
1939	char *ticks_title;
1940	unsigned long ticks_value;
1941
1942	if (rsp->gpnum == rdp->gpnum) {
1943		ticks_title = "ticks this GP";
1944		ticks_value = rdp->ticks_this_gp;
1945	} else {
1946		ticks_title = "GPs behind";
1947		ticks_value = rsp->gpnum - rdp->gpnum;
1948	}
1949	print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
1950	pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n",
1951	       cpu, ticks_value, ticks_title,
1952	       atomic_read(&rdtp->dynticks) & 0xfff,
1953	       rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
1954	       rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
1955	       fast_no_hz);
1956}
1957
1958/* Terminate the stall-info list. */
1959static void print_cpu_stall_info_end(void)
1960{
1961	pr_err("\t");
1962}
1963
1964/* Zero ->ticks_this_gp for all flavors of RCU. */
1965static void zero_cpu_stall_ticks(struct rcu_data *rdp)
1966{
1967	rdp->ticks_this_gp = 0;
1968	rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id());
1969}
1970
1971/* Increment ->ticks_this_gp for all flavors of RCU. */
1972static void increment_cpu_stall_ticks(void)
1973{
1974	struct rcu_state *rsp;
1975
1976	for_each_rcu_flavor(rsp)
1977		__this_cpu_ptr(rsp->rda)->ticks_this_gp++;
1978}
1979
1980#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
1981
1982static void print_cpu_stall_info_begin(void)
1983{
1984	pr_cont(" {");
1985}
1986
1987static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
1988{
1989	pr_cont(" %d", cpu);
1990}
1991
1992static void print_cpu_stall_info_end(void)
1993{
1994	pr_cont("} ");
1995}
1996
1997static void zero_cpu_stall_ticks(struct rcu_data *rdp)
1998{
1999}
2000
2001static void increment_cpu_stall_ticks(void)
2002{
2003}
2004
2005#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */
2006
2007#ifdef CONFIG_RCU_NOCB_CPU
2008
2009/*
2010 * Offload callback processing from the boot-time-specified set of CPUs
2011 * given by rcu_nocb_mask.  For each CPU in the set, there is a
2012 * kthread created that pulls the callbacks from the corresponding CPU,
2013 * waits for a grace period to elapse, and invokes the callbacks.
2014 * The no-CBs CPUs do a wake_up() on their kthread when they insert
2015 * a callback into any empty list, unless the rcu_nocb_poll boot parameter
2016 * has been specified, in which case each kthread actively polls its
2017 * CPU.  (Which isn't so great for energy efficiency, but which does
2018 * reduce RCU's overhead on that CPU.)
2019 *
2020 * This is intended to be used in conjunction with Frederic Weisbecker's
2021 * adaptive-idle work, which would seriously reduce OS jitter on CPUs
2022 * running CPU-bound user-mode computations.
2023 *
2024 * Offloading of callback processing could also in theory be used as
2025 * an energy-efficiency measure because CPUs with no RCU callbacks
2026 * queued are more aggressive about entering dyntick-idle mode.
2027 */
2028
2029
2030/* Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. */
2031static int __init rcu_nocb_setup(char *str)
2032{
2033	alloc_bootmem_cpumask_var(&rcu_nocb_mask);
2034	have_rcu_nocb_mask = true;
2035	cpulist_parse(str, rcu_nocb_mask);
2036	return 1;
2037}
2038__setup("rcu_nocbs=", rcu_nocb_setup);
2039
2040static int __init parse_rcu_nocb_poll(char *arg)
2041{
2042	rcu_nocb_poll = 1;
2043	return 0;
2044}
2045early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
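
/*
 * Illustrative boot-time usage combining the two parameters above:
 * offload callbacks for CPUs 1-7 and have the rcuo kthreads poll
 * rather than wait to be awakened:
 *
 *	rcu_nocbs=1-7 rcu_nocb_poll
 */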
2046
2047/*
2048 * Do any no-CBs CPUs need another grace period?
2049 *
2050 * Interrupts must be disabled.  If the caller does not hold the root
2051 * rcu_node structure's ->lock, the results are advisory only.
2052 */
2053static int rcu_nocb_needs_gp(struct rcu_state *rsp)
2054{
2055	struct rcu_node *rnp = rcu_get_root(rsp);
2056
2057	return rnp->need_future_gp[(ACCESS_ONCE(rnp->completed) + 1) & 0x1];
2058}
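
/*
 * Worked example of the indexing above: ->need_future_gp[] has two
 * elements, indexed by the bottom bit of the grace-period number.  If
 * ->completed is 5, the expression reads need_future_gp[(5 + 1) & 0x1],
 * that is, element 0, which counts requests for the grace period that
 * follows the most recently completed one.
 */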
2059
2060/*
2061 * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
2062 * grace period.
2063 */
2064static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
2065{
2066	wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]);
2067}
2068
2069/*
2070 * Set the root rcu_node structure's ->need_future_gp field
2071 * based on the sum of those of all rcu_node structures.  This does
2072 * double-count the root rcu_node structure's requests, but this
2073 * is necessary to handle the possibility of a rcu_nocb_kthread()
2074 * having awakened during the time that the rcu_node structures
2075 * were being updated for the end of the previous grace period.
2076 */
2077static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
2078{
2079	rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq;
2080}
2081
2082static void rcu_init_one_nocb(struct rcu_node *rnp)
2083{
2084	init_waitqueue_head(&rnp->nocb_gp_wq[0]);
2085	init_waitqueue_head(&rnp->nocb_gp_wq[1]);
2086}
2087
2088/* Is the specified CPU a no-CBs CPU? */
2089bool rcu_is_nocb_cpu(int cpu)
2090{
2091	if (have_rcu_nocb_mask)
2092		return cpumask_test_cpu(cpu, rcu_nocb_mask);
2093	return false;
2094}
2095
2096/*
2097 * Enqueue the specified string of rcu_head structures onto the specified
2098 * CPU's no-CBs lists.  The CPU is specified by rdp, the head of the
2099 * string by rhp, and the tail of the string by rhtp.  The non-lazy/lazy
2100 * counts are supplied by rhcount and rhcount_lazy.
2101 *
2102 * If warranted, also wake up the kthread servicing this CPUs queues.
2103 */
2104static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
2105				    struct rcu_head *rhp,
2106				    struct rcu_head **rhtp,
2107				    int rhcount, int rhcount_lazy)
2108{
2109	int len;
2110	struct rcu_head **old_rhpp;
2111	struct task_struct *t;
2112
2113	/* Enqueue the callback on the nocb list and update counts. */
2114	old_rhpp = xchg(&rdp->nocb_tail, rhtp);
2115	ACCESS_ONCE(*old_rhpp) = rhp;
2116	atomic_long_add(rhcount, &rdp->nocb_q_count);
2117	atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy);
2118
2119	/* If we are not being polled and there is a kthread, awaken it ... */
2120	t = ACCESS_ONCE(rdp->nocb_kthread);
2121	if (rcu_nocb_poll || !t) {
2122		trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2123				    TPS("WakeNotPoll"));
2124		return;
2125	}
2126	len = atomic_long_read(&rdp->nocb_q_count);
2127	if (old_rhpp == &rdp->nocb_head) {
2128		wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */
2129		rdp->qlen_last_fqs_check = 0;
2130		trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmpty"));
2131	} else if (len > rdp->qlen_last_fqs_check + qhimark) {
2132		wake_up_process(t); /* ... or if many callbacks queued. */
2133		rdp->qlen_last_fqs_check = LONG_MAX / 2;
2134		trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf"));
2135	} else {
2136		trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeNot"));
2137	}
2138	return;
2139}
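
/*
 * Summary of the wakeup policy above: the "rcuo" kthread is awakened
 * when the enqueue makes the list non-empty (the xchg() returned the
 * address of ->nocb_head itself), kicked via wake_up_process() when the
 * queue length exceeds ->qlen_last_fqs_check by more than qhimark, and
 * otherwise left alone, with a tracepoint recording each outcome.
 */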
2140
2141/*
2142 * This is a helper for __call_rcu(), which invokes this when the normal
2143 * callback queue is inoperable.  If this is not a no-CBs CPU, this
2144 * function returns failure back to __call_rcu(), which can complain
2145 * appropriately.
2146 *
2147 * Otherwise, this function queues the callback where the corresponding
2148 * "rcuo" kthread can find it.
2149 */
2150static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
2151			    bool lazy)
2152{
2153
2154	if (!rcu_is_nocb_cpu(rdp->cpu))
2155		return 0;
2156	__call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy);
2157	if (__is_kfree_rcu_offset((unsigned long)rhp->func))
2158		trace_rcu_kfree_callback(rdp->rsp->name, rhp,
2159					 (unsigned long)rhp->func,
2160					 -atomic_long_read(&rdp->nocb_q_count_lazy),
2161					 -atomic_long_read(&rdp->nocb_q_count));
2162	else
2163		trace_rcu_callback(rdp->rsp->name, rhp,
2164				   -atomic_long_read(&rdp->nocb_q_count_lazy),
2165				   -atomic_long_read(&rdp->nocb_q_count));
2166	return 1;
2167}
2168
2169/*
2170 * Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is
2171 * not a no-CBs CPU.
2172 */
2173static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2174						     struct rcu_data *rdp)
2175{
2176	long ql = rsp->qlen;
2177	long qll = rsp->qlen_lazy;
2178
2179	/* If this is not a no-CBs CPU, tell the caller to do it the old way. */
2180	if (!rcu_is_nocb_cpu(smp_processor_id()))
2181		return 0;
2182	rsp->qlen = 0;
2183	rsp->qlen_lazy = 0;
2184
2185	/* First, enqueue the donelist, if any.  This preserves CB ordering. */
2186	if (rsp->orphan_donelist != NULL) {
2187		__call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist,
2188					rsp->orphan_donetail, ql, qll);
2189		ql = qll = 0;
2190		rsp->orphan_donelist = NULL;
2191		rsp->orphan_donetail = &rsp->orphan_donelist;
2192	}
2193	if (rsp->orphan_nxtlist != NULL) {
2194		__call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist,
2195					rsp->orphan_nxttail, ql, qll);
2196		ql = qll = 0;
2197		rsp->orphan_nxtlist = NULL;
2198		rsp->orphan_nxttail = &rsp->orphan_nxtlist;
2199	}
2200	return 1;
2201}
2202
2203/*
2204 * If necessary, kick off a new grace period, and either way wait
2205 * for a subsequent grace period to complete.
2206 */
2207static void rcu_nocb_wait_gp(struct rcu_data *rdp)
2208{
2209	unsigned long c;
2210	bool d;
2211	unsigned long flags;
2212	struct rcu_node *rnp = rdp->mynode;
2213
2214	raw_spin_lock_irqsave(&rnp->lock, flags);
2215	c = rcu_start_future_gp(rnp, rdp);
2216	raw_spin_unlock_irqrestore(&rnp->lock, flags);
2217
2218	/*
2219	 * Wait for the grace period.  Do so interruptibly to avoid messing
2220	 * up the load average.
2221	 */
2222	trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait"));
2223	for (;;) {
2224		wait_event_interruptible(
2225			rnp->nocb_gp_wq[c & 0x1],
2226			(d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c)));
2227		if (likely(d))
2228			break;
2229		flush_signals(current);
2230		trace_rcu_future_gp(rnp, rdp, c, TPS("ResumeWait"));
2231	}
2232	trace_rcu_future_gp(rnp, rdp, c, TPS("EndWait"));
2233	smp_mb(); /* Ensure that CB invocation happens after GP end. */
2234}
2235
2236/*
2237 * Per-rcu_data kthread, but only for no-CBs CPUs.  Each kthread invokes
2238 * callbacks queued by the corresponding no-CBs CPU.
2239 */
2240static int rcu_nocb_kthread(void *arg)
2241{
2242	int c, cl;
2243	bool firsttime = 1;
2244	struct rcu_head *list;
2245	struct rcu_head *next;
2246	struct rcu_head **tail;
2247	struct rcu_data *rdp = arg;
2248
2249	/* Each pass through this loop invokes one batch of callbacks */
2250	for (;;) {
2251		/* If not polling, wait for next batch of callbacks. */
2252		if (!rcu_nocb_poll) {
2253			trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2254					    TPS("Sleep"));
2255			wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head);
2256			/* Memory barrier provided by xchg() below. */
2257		} else if (firsttime) {
2258			firsttime = 0;
2259			trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2260					    TPS("Poll"));
2261		}
2262		list = ACCESS_ONCE(rdp->nocb_head);
2263		if (!list) {
2264			if (!rcu_nocb_poll)
2265				trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2266						    TPS("WokeEmpty"));
2267			schedule_timeout_interruptible(1);
2268			flush_signals(current);
2269			continue;
2270		}
2271		firsttime = 1;
2272		trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2273				    TPS("WokeNonEmpty"));
2274
2275		/*
2276		 * Extract queued callbacks, update counts, and wait
2277		 * for a grace period to elapse.
2278		 */
2279		ACCESS_ONCE(rdp->nocb_head) = NULL;
2280		tail = xchg(&rdp->nocb_tail, &rdp->nocb_head);
2281		c = atomic_long_xchg(&rdp->nocb_q_count, 0);
2282		cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0);
2283		ACCESS_ONCE(rdp->nocb_p_count) += c;
2284		ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl;
2285		rcu_nocb_wait_gp(rdp);
2286
2287		/* Each pass through the following loop invokes a callback. */
2288		trace_rcu_batch_start(rdp->rsp->name, cl, c, -1);
2289		c = cl = 0;
2290		while (list) {
2291			next = list->next;
2292			/* Wait for enqueuing to complete, if needed. */
2293			while (next == NULL && &list->next != tail) {
2294				trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2295						    TPS("WaitQueue"));
2296				schedule_timeout_interruptible(1);
2297				trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2298						    TPS("WokeQueue"));
2299				next = list->next;
2300			}
2301			debug_rcu_head_unqueue(list);
2302			local_bh_disable();
2303			if (__rcu_reclaim(rdp->rsp->name, list))
2304				cl++;
2305			c++;
2306			local_bh_enable();
2307			list = next;
2308		}
2309		trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1);
2310		ACCESS_ONCE(rdp->nocb_p_count) -= c;
2311		ACCESS_ONCE(rdp->nocb_p_count_lazy) -= cl;
2312		rdp->n_nocbs_invoked += c;
2313	}
2314	return 0;
2315}
2316
2317/* Initialize per-rcu_data variables for no-CBs CPUs. */
2318static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
2319{
2320	rdp->nocb_tail = &rdp->nocb_head;
2321	init_waitqueue_head(&rdp->nocb_wq);
2322}
2323
2324/* Create a kthread for each RCU flavor for each no-CBs CPU. */
2325static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
2326{
2327	int cpu;
2328	struct rcu_data *rdp;
2329	struct task_struct *t;
2330
2331	if (rcu_nocb_mask == NULL)
2332		return;
2333	for_each_cpu(cpu, rcu_nocb_mask) {
2334		rdp = per_cpu_ptr(rsp->rda, cpu);
2335		t = kthread_run(rcu_nocb_kthread, rdp,
2336				"rcuo%c/%d", rsp->abbr, cpu);
2337		BUG_ON(IS_ERR(t));
2338		ACCESS_ONCE(rdp->nocb_kthread) = t;
2339	}
2340}
2341
2342/* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */
2343static bool init_nocb_callback_list(struct rcu_data *rdp)
2344{
2345	if (rcu_nocb_mask == NULL ||
2346	    !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask))
2347		return false;
2348	rdp->nxttail[RCU_NEXT_TAIL] = NULL;
2349	return true;
2350}
2351
2352#else /* #ifdef CONFIG_RCU_NOCB_CPU */
2353
2354static int rcu_nocb_needs_gp(struct rcu_state *rsp)
2355{
2356	return 0;
2357}
2358
2359static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
2360{
2361}
2362
2363static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
2364{
2365}
2366
2367static void rcu_init_one_nocb(struct rcu_node *rnp)
2368{
2369}
2370
2371static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
2372			    bool lazy)
2373{
2374	return 0;
2375}
2376
2377static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2378						     struct rcu_data *rdp)
2379{
2380	return 0;
2381}
2382
2383static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
2384{
2385}
2386
2387static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
2388{
2389}
2390
2391static bool init_nocb_callback_list(struct rcu_data *rdp)
2392{
2393	return false;
2394}
2395
2396#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
2397
2398/*
2399 * An adaptive-ticks CPU can potentially execute in kernel mode for an
2400 * arbitrarily long period of time with the scheduling-clock tick turned
2401 * off.  RCU will be paying attention to this CPU because it is in the
2402 * kernel, but the CPU cannot be guaranteed to be executing the RCU state
2403 * machine because the scheduling-clock tick has been disabled.  Therefore,
2404 * if an adaptive-ticks CPU is failing to respond to the current grace
2405 * period and has not be idle from an RCU perspective, kick it.
2406 */
2407static void rcu_kick_nohz_cpu(int cpu)
2408{
2409#ifdef CONFIG_NO_HZ_FULL
2410	if (tick_nohz_full_cpu(cpu))
2411		smp_send_reschedule(cpu);
2412#endif /* #ifdef CONFIG_NO_HZ_FULL */
2413}
2414
2415
2416#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
2417
2418/*
2419 * Define RCU flavor that holds sysidle state.  This needs to be the
2420 * most active flavor of RCU.
2421 */
2422#ifdef CONFIG_PREEMPT_RCU
2423static struct rcu_state *rcu_sysidle_state = &rcu_preempt_state;
2424#else /* #ifdef CONFIG_PREEMPT_RCU */
2425static struct rcu_state *rcu_sysidle_state = &rcu_sched_state;
2426#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
2427
2428static int full_sysidle_state;		/* Current system-idle state. */
2429#define RCU_SYSIDLE_NOT		0	/* Some CPU is not idle. */
2430#define RCU_SYSIDLE_SHORT	1	/* All CPUs idle for brief period. */
2431#define RCU_SYSIDLE_LONG	2	/* All CPUs idle for long enough. */
2432#define RCU_SYSIDLE_FULL	3	/* All CPUs idle, ready for sysidle. */
2433#define RCU_SYSIDLE_FULL_NOTED	4	/* Actually entered sysidle state. */
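
/*
 * Summary of the state machine defined above: when all non-timekeeping
 * CPUs are idle, the state advances NOT -> SHORT -> LONG -> FULL, with
 * roughly rcu_sysidle_delay() jiffies between steps, and the
 * timekeeping CPU then moves FULL -> FULL_NOTED from rcu_sys_is_idle().
 * A non-timekeeping CPU exiting idle resets the state to NOT via
 * rcu_sysidle_force_exit(), and a force-quiescent-state scan that finds
 * a non-idle CPU does the same via rcu_sysidle_cancel().
 */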
2434
2435/*
2436 * Invoked to note exit from irq or task transition to idle.  Note that
2437 * usermode execution does -not- count as idle here!  After all, we want
2438 * to detect full-system idle states, not RCU quiescent states and grace
2439 * periods.  The caller must have disabled interrupts.
2440 */
2441static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq)
2442{
2443	unsigned long j;
2444
2445	/* Adjust nesting, check for fully idle. */
2446	if (irq) {
2447		rdtp->dynticks_idle_nesting--;
2448		WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0);
2449		if (rdtp->dynticks_idle_nesting != 0)
2450			return;  /* Still not fully idle. */
2451	} else {
2452		if ((rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) ==
2453		    DYNTICK_TASK_NEST_VALUE) {
2454			rdtp->dynticks_idle_nesting = 0;
2455		} else {
2456			rdtp->dynticks_idle_nesting -= DYNTICK_TASK_NEST_VALUE;
2457			WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0);
2458			return;  /* Still not fully idle. */
2459		}
2460	}
2461
2462	/* Record start of fully idle period. */
2463	j = jiffies;
2464	ACCESS_ONCE(rdtp->dynticks_idle_jiffies) = j;
2465	smp_mb__before_atomic_inc();
2466	atomic_inc(&rdtp->dynticks_idle);
2467	smp_mb__after_atomic_inc();
2468	WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1);
2469}
2470
2471/*
2472 * Unconditionally force exit from full system-idle state.  This is
2473 * invoked when a normal CPU exits idle, but must be called separately
2474 * for the timekeeping CPU (tick_do_timer_cpu).  The reason for this
2475 * is that the timekeeping CPU is permitted to take scheduling-clock
2476 * interrupts while the system is in system-idle state, and of course
2477 * rcu_sysidle_exit() has no way of distinguishing a scheduling-clock
2478 * interrupt from any other type of interrupt.
2479 */
2480void rcu_sysidle_force_exit(void)
2481{
2482	int oldstate = ACCESS_ONCE(full_sysidle_state);
2483	int newoldstate;
2484
2485	/*
2486	 * Each pass through the following loop attempts to exit full
2487	 * system-idle state.  If contention proves to be a problem,
2488	 * a trylock-based contention tree could be used here.
2489	 */
2490	while (oldstate > RCU_SYSIDLE_SHORT) {
2491		newoldstate = cmpxchg(&full_sysidle_state,
2492				      oldstate, RCU_SYSIDLE_NOT);
2493		if (oldstate == newoldstate &&
2494		    oldstate == RCU_SYSIDLE_FULL_NOTED) {
2495			rcu_kick_nohz_cpu(tick_do_timer_cpu);
2496			return; /* We cleared it, done! */
2497		}
2498		oldstate = newoldstate;
2499	}
2500	smp_mb(); /* Order initial oldstate fetch vs. later non-idle work. */
2501}
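
/*
 * Note on the loop above: cmpxchg() returns the value that was actually
 * present, so newoldstate == oldstate means the exchange succeeded.  The
 * timekeeping CPU is kicked only when the state being cleared was
 * RCU_SYSIDLE_FULL_NOTED, the only state in which sysidle had actually
 * been entered.
 */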
2502
2503/*
2504 * Invoked to note entry to irq or task transition from idle.  Note that
2505 * usermode execution does -not- count as idle here!  The caller must
2506 * have disabled interrupts.
2507 */
2508static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq)
2509{
2510	/* Adjust nesting, check for already non-idle. */
2511	if (irq) {
2512		rdtp->dynticks_idle_nesting++;
2513		WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0);
2514		if (rdtp->dynticks_idle_nesting != 1)
2515			return; /* Already non-idle. */
2516	} else {
2517		/*
2518		 * Allow for irq misnesting.  Yes, it really is possible
2519		 * to enter an irq handler then never leave it, and maybe
2520		 * also vice versa.  Handle both possibilities.
2521		 */
2522		if (rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) {
2523			rdtp->dynticks_idle_nesting += DYNTICK_TASK_NEST_VALUE;
2524			WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0);
2525			return; /* Already non-idle. */
2526		} else {
2527			rdtp->dynticks_idle_nesting = DYNTICK_TASK_EXIT_IDLE;
2528		}
2529	}
2530
2531	/* Record end of idle period. */
2532	smp_mb__before_atomic_inc();
2533	atomic_inc(&rdtp->dynticks_idle);
2534	smp_mb__after_atomic_inc();
2535	WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1));
2536
2537	/*
2538	 * If we are the timekeeping CPU, we are permitted to be non-idle
2539	 * during a system-idle state.  This must be the case, because
2540	 * the timekeeping CPU has to take scheduling-clock interrupts
2541	 * during the time that the system is transitioning to full
2542	 * system-idle state.  This means that the timekeeping CPU must
2543	 * invoke rcu_sysidle_force_exit() directly if it does anything
2544	 * more than take a scheduling-clock interrupt.
2545	 */
2546	if (smp_processor_id() == tick_do_timer_cpu)
2547		return;
2548
2549	/* Update system-idle state: We are clearly no longer fully idle! */
2550	rcu_sysidle_force_exit();
2551}
2552
2553/*
2554 * Check to see if the current CPU is idle.  Note that usermode execution
2555 * does not count as idle.  The caller must have disabled interrupts.
2556 */
2557static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
2558				  unsigned long *maxj)
2559{
2560	int cur;
2561	unsigned long j;
2562	struct rcu_dynticks *rdtp = rdp->dynticks;
2563
2564	/*
2565	 * If some other CPU has already reported non-idle, if this is
2566	 * not the flavor of RCU that tracks sysidle state, or if this
2567	 * is an offline or the timekeeping CPU, nothing to do.
2568	 */
2569	if (!*isidle || rdp->rsp != rcu_sysidle_state ||
2570	    cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu)
2571		return;
2572	if (rcu_gp_in_progress(rdp->rsp))
2573		WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu);
2574
2575	/* Pick up current idle and NMI-nesting counter and check. */
2576	cur = atomic_read(&rdtp->dynticks_idle);
2577	if (cur & 0x1) {
2578		*isidle = false; /* We are not idle! */
2579		return;
2580	}
2581	smp_mb(); /* Read counters before timestamps. */
2582
2583	/* Pick up timestamps. */
2584	j = ACCESS_ONCE(rdtp->dynticks_idle_jiffies);
2585	/* If this CPU entered idle more recently, update maxj timestamp. */
2586	if (ULONG_CMP_LT(*maxj, j))
2587		*maxj = j;
2588}
2589
2590/*
2591 * Is this the flavor of RCU that is handling full-system idle?
2592 */
2593static bool is_sysidle_rcu_state(struct rcu_state *rsp)
2594{
2595	return rsp == rcu_sysidle_state;
2596}
2597
2598/*
2599 * Bind the grace-period kthread for the sysidle flavor of RCU to the
2600 * timekeeping CPU.
2601 */
2602static void rcu_bind_gp_kthread(void)
2603{
2604	int cpu = ACCESS_ONCE(tick_do_timer_cpu);
2605
2606	if (cpu < 0 || cpu >= nr_cpu_ids)
2607		return;
2608	if (raw_smp_processor_id() != cpu)
2609		set_cpus_allowed_ptr(current, cpumask_of(cpu));
2610}
2611
2612/*
2613 * Return a delay in jiffies based on the number of CPUs, rcu_node
2614 * leaf fanout, and jiffies tick rate.  The idea is to allow larger
2615 * systems more time to transition to full-idle state in order to
2616 * avoid the cache thrashing that would otherwise occur on the state variable.
2617 * Really small systems (fewer than a couple of tens of CPUs) should
2618 * instead use a single global atomically incremented counter, and later
2619 * versions of this will automatically reconfigure themselves accordingly.
2620 */
2621static unsigned long rcu_sysidle_delay(void)
2622{
2623	if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL)
2624		return 0;
2625	return DIV_ROUND_UP(nr_cpu_ids * HZ, rcu_fanout_leaf * 1000);
2626}
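
/*
 * Worked example of the computation above, assuming nr_cpu_ids == 256,
 * rcu_fanout_leaf == 16, and HZ == 1000:
 *
 *	DIV_ROUND_UP(256 * 1000, 16 * 1000) = 16
 *
 * so such a system allows 16 jiffies between sysidle state transitions,
 * while systems at or below CONFIG_NO_HZ_FULL_SYSIDLE_SMALL CPUs
 * transition immediately.
 */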
2627
2628/*
2629 * Advance the full-system-idle state.  This is invoked when all of
2630 * the non-timekeeping CPUs are idle.
2631 */
2632static void rcu_sysidle(unsigned long j)
2633{
2634	/* Check the current state. */
2635	switch (ACCESS_ONCE(full_sysidle_state)) {
2636	case RCU_SYSIDLE_NOT:
2637
2638		/* First time all are idle, so note a short idle period. */
2639		ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_SHORT;
2640		break;
2641
2642	case RCU_SYSIDLE_SHORT:
2643
2644		/*
2645		 * Idle for a bit, time to advance to next state?
2646		 * cmpxchg failure means race with non-idle, let them win.
2647		 */
2648		if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay()))
2649			(void)cmpxchg(&full_sysidle_state,
2650				      RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG);
2651		break;
2652
2653	case RCU_SYSIDLE_LONG:
2654
2655		/*
2656		 * Do an additional check pass before advancing to full.
2657		 * cmpxchg failure means race with non-idle, let them win.
2658		 */
2659		if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay()))
2660			(void)cmpxchg(&full_sysidle_state,
2661				      RCU_SYSIDLE_LONG, RCU_SYSIDLE_FULL);
2662		break;
2663
2664	default:
2665		break;
2666	}
2667}
2668
2669/*
2670 * Found a non-idle non-timekeeping CPU, so kick the system-idle state
2671 * back to the beginning.
2672 */
2673static void rcu_sysidle_cancel(void)
2674{
2675	smp_mb();
2676	ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT;
2677}
2678
2679/*
2680 * Update the sysidle state based on the results of a force-quiescent-state
2681 * scan of the CPUs' dyntick-idle state.
2682 */
2683static void rcu_sysidle_report(struct rcu_state *rsp, int isidle,
2684			       unsigned long maxj, bool gpkt)
2685{
2686	if (rsp != rcu_sysidle_state)
2687		return;  /* Wrong flavor, ignore. */
2688	if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL)
2689		return;  /* Running state machine from timekeeping CPU. */
2690	if (isidle)
2691		rcu_sysidle(maxj);    /* More idle! */
2692	else
2693		rcu_sysidle_cancel(); /* Idle is over. */
2694}
2695
2696/*
2697 * Wrapper for rcu_sysidle_report() when called from the grace-period
2698 * kthread's context.
2699 */
2700static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
2701				  unsigned long maxj)
2702{
2703	rcu_sysidle_report(rsp, isidle, maxj, true);
2704}
2705
2706/* Callback and function for forcing an RCU grace period. */
2707struct rcu_sysidle_head {
2708	struct rcu_head rh;
2709	int inuse;
2710};
2711
2712static void rcu_sysidle_cb(struct rcu_head *rhp)
2713{
2714	struct rcu_sysidle_head *rshp;
2715
2716	/*
2717	 * The following memory barrier is needed to replace the
2718	 * memory barriers that would normally be in the memory
2719	 * allocator.
2720	 */
2721	smp_mb();  /* grace period precedes setting inuse. */
2722
2723	rshp = container_of(rhp, struct rcu_sysidle_head, rh);
2724	ACCESS_ONCE(rshp->inuse) = 0;
2725}
2726
2727/*
2728 * Check to see if the system is fully idle, other than the timekeeping CPU.
2729 * The caller must have disabled interrupts.
2730 */
2731bool rcu_sys_is_idle(void)
2732{
2733	static struct rcu_sysidle_head rsh;
2734	int rss = ACCESS_ONCE(full_sysidle_state);
2735
2736	if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu))
2737		return false;
2738
2739	/* Handle small-system case by doing a full scan of CPUs. */
2740	if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) {
2741		int oldrss = rss - 1;
2742
2743		/*
2744		 * One pass to advance to each state up to _FULL.
2745		 * Give up if any pass fails to advance the state.
2746		 */
2747		while (rss < RCU_SYSIDLE_FULL && oldrss < rss) {
2748			int cpu;
2749			bool isidle = true;
2750			unsigned long maxj = jiffies - ULONG_MAX / 4;
2751			struct rcu_data *rdp;
2752
2753			/* Scan all the CPUs looking for nonidle CPUs. */
2754			for_each_possible_cpu(cpu) {
2755				rdp = per_cpu_ptr(rcu_sysidle_state->rda, cpu);
2756				rcu_sysidle_check_cpu(rdp, &isidle, &maxj);
2757				if (!isidle)
2758					break;
2759			}
2760			rcu_sysidle_report(rcu_sysidle_state,
2761					   isidle, maxj, false);
2762			oldrss = rss;
2763			rss = ACCESS_ONCE(full_sysidle_state);
2764		}
2765	}
2766
2767	/* If this is the first observation of an idle period, record it. */
2768	if (rss == RCU_SYSIDLE_FULL) {
2769		rss = cmpxchg(&full_sysidle_state,
2770			      RCU_SYSIDLE_FULL, RCU_SYSIDLE_FULL_NOTED);
2771		return rss == RCU_SYSIDLE_FULL;
2772	}
2773
2774	smp_mb(); /* ensure rss load happens before later caller actions. */
2775
2776	/* If already fully idle, tell the caller (in case of races). */
2777	if (rss == RCU_SYSIDLE_FULL_NOTED)
2778		return true;
2779
2780	/*
2781	 * If we aren't there yet, and a grace period is not in flight,
2782	 * initiate a grace period.  Either way, tell the caller that
2783	 * we are not there yet.  We use an xchg() rather than an assignment
2784	 * to make up for the memory barriers that would otherwise be
2785	 * provided by the memory allocator.
2786	 */
2787	if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL &&
2788	    !rcu_gp_in_progress(rcu_sysidle_state) &&
2789	    !rsh.inuse && xchg(&rsh.inuse, 1) == 0)
2790		call_rcu(&rsh.rh, rcu_sysidle_cb);
2791	return false;
2792}
2793
2794/*
2795 * Initialize dynticks sysidle state for CPUs coming online.
2796 */
2797static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
2798{
2799	rdtp->dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE;
2800}
2801
2802#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
2803
2804static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq)
2805{
2806}
2807
2808static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq)
2809{
2810}
2811
2812static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
2813				  unsigned long *maxj)
2814{
2815}
2816
2817static bool is_sysidle_rcu_state(struct rcu_state *rsp)
2818{
2819	return false;
2820}
2821
2822static void rcu_bind_gp_kthread(void)
2823{
2824}
2825
2826static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
2827				  unsigned long maxj)
2828{
2829}
2830
2831static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
2832{
2833}
2834
2835#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
2836