blk-cgroup.c revision a637120e49021d197e9578cba545bbaa459cbb51
1/*
2 * Common Block IO controller cgroup interface
3 *
4 * Based on ideas and code from CFQ, CFS and BFQ:
5 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
6 *
7 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
8 *		      Paolo Valente <paolo.valente@unimore.it>
9 *
10 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
11 * 	              Nauman Rafique <nauman@google.com>
12 */
13#include <linux/ioprio.h>
14#include <linux/kdev_t.h>
15#include <linux/module.h>
16#include <linux/err.h>
17#include <linux/blkdev.h>
18#include <linux/slab.h>
19#include <linux/genhd.h>
20#include <linux/delay.h>
21#include <linux/atomic.h>
22#include "blk-cgroup.h"
23#include "blk.h"
24
25#define MAX_KEY_LEN 100
26
27static DEFINE_MUTEX(blkcg_pol_mutex);
28
29struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT };
30EXPORT_SYMBOL_GPL(blkcg_root);
31
32static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
33
34struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup)
35{
36	return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
37			    struct blkcg, css);
38}
39EXPORT_SYMBOL_GPL(cgroup_to_blkcg);
40
41static struct blkcg *task_blkcg(struct task_struct *tsk)
42{
43	return container_of(task_subsys_state(tsk, blkio_subsys_id),
44			    struct blkcg, css);
45}
46
47struct blkcg *bio_blkcg(struct bio *bio)
48{
49	if (bio && bio->bi_css)
50		return container_of(bio->bi_css, struct blkcg, css);
51	return task_blkcg(current);
52}
53EXPORT_SYMBOL_GPL(bio_blkcg);
54
55static bool blkcg_policy_enabled(struct request_queue *q,
56				 const struct blkcg_policy *pol)
57{
58	return pol && test_bit(pol->plid, q->blkcg_pols);
59}
60
61/**
62 * blkg_free - free a blkg
63 * @blkg: blkg to free
64 *
65 * Free @blkg which may be partially allocated.
66 */
67static void blkg_free(struct blkcg_gq *blkg)
68{
69	int i;
70
71	if (!blkg)
72		return;
73
74	for (i = 0; i < BLKCG_MAX_POLS; i++) {
75		struct blkcg_policy *pol = blkcg_policy[i];
76		struct blkg_policy_data *pd = blkg->pd[i];
77
78		if (!pd)
79			continue;
80
81		if (pol && pol->pd_exit_fn)
82			pol->pd_exit_fn(blkg);
83
84		kfree(pd);
85	}
86
87	kfree(blkg);
88}
89
90/**
91 * blkg_alloc - allocate a blkg
92 * @blkcg: block cgroup the new blkg is associated with
93 * @q: request_queue the new blkg is associated with
94 *
95 * Allocate a new blkg associating @blkcg and @q.
96 */
97static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q)
98{
99	struct blkcg_gq *blkg;
100	int i;
101
102	/* alloc and init base part */
103	blkg = kzalloc_node(sizeof(*blkg), GFP_ATOMIC, q->node);
104	if (!blkg)
105		return NULL;
106
107	blkg->q = q;
108	INIT_LIST_HEAD(&blkg->q_node);
109	blkg->blkcg = blkcg;
110	blkg->refcnt = 1;
111
112	for (i = 0; i < BLKCG_MAX_POLS; i++) {
113		struct blkcg_policy *pol = blkcg_policy[i];
114		struct blkg_policy_data *pd;
115
116		if (!blkcg_policy_enabled(q, pol))
117			continue;
118
119		/* alloc per-policy data and attach it to blkg */
120		pd = kzalloc_node(pol->pd_size, GFP_ATOMIC, q->node);
121		if (!pd) {
122			blkg_free(blkg);
123			return NULL;
124		}
125
126		blkg->pd[i] = pd;
127		pd->blkg = blkg;
128	}
129
130	/* invoke per-policy init */
131	for (i = 0; i < BLKCG_MAX_POLS; i++) {
132		struct blkcg_policy *pol = blkcg_policy[i];
133
134		if (blkcg_policy_enabled(blkg->q, pol))
135			pol->pd_init_fn(blkg);
136	}
137
138	return blkg;
139}
140
141static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
142				      struct request_queue *q)
143{
144	struct blkcg_gq *blkg;
145
146	blkg = rcu_dereference(blkcg->blkg_hint);
147	if (blkg && blkg->q == q)
148		return blkg;
149
150	/*
151	 * Hint didn't match.  Look up from the radix tree.  Note that we
152	 * may not be holding queue_lock and thus are not sure whether
153	 * @blkg from blkg_tree has already been removed or not, so we
154	 * can't update hint to the lookup result.  Leave it to the caller.
155	 */
156	blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id);
157	if (blkg && blkg->q == q)
158		return blkg;
159
160	return NULL;
161}
162
163/**
164 * blkg_lookup - lookup blkg for the specified blkcg - q pair
165 * @blkcg: blkcg of interest
166 * @q: request_queue of interest
167 *
168 * Lookup blkg for the @blkcg - @q pair.  This function should be called
169 * under RCU read lock and is guaranteed to return %NULL if @q is bypassing
170 * - see blk_queue_bypass_start() for details.
171 */
172struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q)
173{
174	WARN_ON_ONCE(!rcu_read_lock_held());
175
176	if (unlikely(blk_queue_bypass(q)))
177		return NULL;
178	return __blkg_lookup(blkcg, q);
179}
180EXPORT_SYMBOL_GPL(blkg_lookup);
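
/*
 * Illustrative sketch, not part of the original file: a policy's hot path
 * would typically resolve its per-group data for a bio roughly as below,
 * under the RCU read lock required by blkg_lookup().  The "foo_lookup_pd"
 * name is hypothetical; bio_blkcg(), blkg_lookup() and the pd[] access are
 * the interfaces being demonstrated.
 */
static struct blkg_policy_data *foo_lookup_pd(struct request_queue *q,
					      struct bio *bio,
					      const struct blkcg_policy *pol)
{
	struct blkcg_gq *blkg;

	WARN_ON_ONCE(!rcu_read_lock_held());

	/* NULL if @q is bypassing or no blkg has been created yet */
	blkg = blkg_lookup(bio_blkcg(bio), q);
	if (!blkg)
		return NULL;
	return blkg->pd[pol->plid];
}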
181
182static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
183					     struct request_queue *q)
184	__releases(q->queue_lock) __acquires(q->queue_lock)
185{
186	struct blkcg_gq *blkg;
187	int ret;
188
189	WARN_ON_ONCE(!rcu_read_lock_held());
190	lockdep_assert_held(q->queue_lock);
191
192	/* lookup and update hint on success, see __blkg_lookup() for details */
193	blkg = __blkg_lookup(blkcg, q);
194	if (blkg) {
195		rcu_assign_pointer(blkcg->blkg_hint, blkg);
196		return blkg;
197	}
198
199	/* blkg holds a reference to blkcg */
200	if (!css_tryget(&blkcg->css))
201		return ERR_PTR(-EINVAL);
202
203	/* allocate */
204	ret = -ENOMEM;
205	blkg = blkg_alloc(blkcg, q);
206	if (unlikely(!blkg))
207		goto err_put;
208
209	/* insert */
210	ret = radix_tree_preload(GFP_ATOMIC);
211	if (ret)
212		goto err_free;
213
214	spin_lock(&blkcg->lock);
215	ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
216	if (likely(!ret)) {
217		hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
218		list_add(&blkg->q_node, &q->blkg_list);
219	}
220	spin_unlock(&blkcg->lock);
221
222	radix_tree_preload_end();
223
224	if (!ret)
225		return blkg;
226err_free:
227	blkg_free(blkg);
228err_put:
229	css_put(&blkcg->css);
230	return ERR_PTR(ret);
231}
232
233struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
234				    struct request_queue *q)
235{
236	/*
237	 * This could be the first entry point of the blkcg implementation and
238	 * we shouldn't allow anything to go through for a bypassing queue.
239	 */
240	if (unlikely(blk_queue_bypass(q)))
241		return ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY);
242	return __blkg_lookup_create(blkcg, q);
243}
244EXPORT_SYMBOL_GPL(blkg_lookup_create);
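
/*
 * Illustrative sketch, not part of the original file: creation has to
 * happen with both the RCU read lock and @q->queue_lock held, and the
 * returned blkg is only pinned while queue_lock stays held.  The
 * "foo_charge_bio" name and the per-group work are hypothetical.
 */
static void foo_charge_bio(struct request_queue *q, struct bio *bio)
{
	struct blkcg *blkcg;
	struct blkcg_gq *blkg;

	rcu_read_lock();
	blkcg = bio_blkcg(bio);

	spin_lock_irq(q->queue_lock);

	blkg = blkg_lookup_create(blkcg, q);
	if (!IS_ERR(blkg)) {
		/* @blkg is stable here; update per-group state and stats */
	}

	spin_unlock_irq(q->queue_lock);
	rcu_read_unlock();
}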
245
246static void blkg_destroy(struct blkcg_gq *blkg)
247{
248	struct request_queue *q = blkg->q;
249	struct blkcg *blkcg = blkg->blkcg;
250
251	lockdep_assert_held(q->queue_lock);
252	lockdep_assert_held(&blkcg->lock);
253
254	/* Something wrong if we are trying to remove same group twice */
255	WARN_ON_ONCE(list_empty(&blkg->q_node));
256	WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));
257
258	radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
259	list_del_init(&blkg->q_node);
260	hlist_del_init_rcu(&blkg->blkcg_node);
261
262	/*
263	 * Both setting the lookup hint to @blkg and clearing it are done
264	 * under queue_lock.  If it's not pointing to @blkg now, it never
265	 * will.  Hint assignment itself can race safely.
266	 */
267	if (rcu_dereference_raw(blkcg->blkg_hint) == blkg)
268		rcu_assign_pointer(blkcg->blkg_hint, NULL);
269
270	/*
271	 * Put the reference taken at the time of creation so that when all
272	 * queues are gone, group can be destroyed.
273	 */
274	blkg_put(blkg);
275}
276
277/**
278 * blkg_destroy_all - destroy all blkgs associated with a request_queue
279 * @q: request_queue of interest
280 *
281 * Destroy all blkgs associated with @q.
282 */
283static void blkg_destroy_all(struct request_queue *q)
284{
285	struct blkcg_gq *blkg, *n;
286
287	lockdep_assert_held(q->queue_lock);
288
289	list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
290		struct blkcg *blkcg = blkg->blkcg;
291
292		spin_lock(&blkcg->lock);
293		blkg_destroy(blkg);
294		spin_unlock(&blkcg->lock);
295	}
296}
297
298static void blkg_rcu_free(struct rcu_head *rcu_head)
299{
300	blkg_free(container_of(rcu_head, struct blkcg_gq, rcu_head));
301}
302
303void __blkg_release(struct blkcg_gq *blkg)
304{
305	/* release the extra blkcg reference this blkg has been holding */
306	css_put(&blkg->blkcg->css);
307
308	/*
309	 * A group is freed in an RCU manner.  But holding an RCU lock does not
310	 * mean that one can access all the fields of blkg and assume these
311	 * are valid. For example, don't try to follow throtl_data and
312	 * request queue links.
313	 *
314	 * Having a reference to blkg under an RCU lock allows access only to
315	 * group-local values like group stats and group rate limits.
316	 */
317	call_rcu(&blkg->rcu_head, blkg_rcu_free);
318}
319EXPORT_SYMBOL_GPL(__blkg_release);
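
/*
 * Illustrative sketch, not part of the original file: __blkg_release() is
 * the slow path of blkg_put().  A policy that caches a blkg pointer in an
 * object of its own would pin the blkg with blkg_get() when installing the
 * pointer and drop it with blkg_put() on teardown, both under queue_lock.
 * "struct foo_ref" and the helpers are hypothetical.
 */
struct foo_ref {
	struct blkcg_gq *blkg;
};

static void foo_ref_init(struct foo_ref *ref, struct blkcg_gq *blkg)
{
	lockdep_assert_held(blkg->q->queue_lock);

	ref->blkg = blkg;
	blkg_get(blkg);			/* pinned for the lifetime of @ref */
}

static void foo_ref_exit(struct foo_ref *ref)
{
	lockdep_assert_held(ref->blkg->q->queue_lock);

	blkg_put(ref->blkg);		/* last put frees the blkg via RCU */
	ref->blkg = NULL;
}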
320
321static int blkcg_reset_stats(struct cgroup *cgroup, struct cftype *cftype,
322			     u64 val)
323{
324	struct blkcg *blkcg = cgroup_to_blkcg(cgroup);
325	struct blkcg_gq *blkg;
326	struct hlist_node *n;
327	int i;
328
329	mutex_lock(&blkcg_pol_mutex);
330	spin_lock_irq(&blkcg->lock);
331
332	/*
333	 * Note that stat reset is racy - it doesn't synchronize against
334	 * stat updates.  This is a debug feature which shouldn't exist
335	 * anyway.  If you get hit by a race, retry.
336	 */
337	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
338		for (i = 0; i < BLKCG_MAX_POLS; i++) {
339			struct blkcg_policy *pol = blkcg_policy[i];
340
341			if (blkcg_policy_enabled(blkg->q, pol) &&
342			    pol->pd_reset_stats_fn)
343				pol->pd_reset_stats_fn(blkg);
344		}
345	}
346
347	spin_unlock_irq(&blkcg->lock);
348	mutex_unlock(&blkcg_pol_mutex);
349	return 0;
350}
351
352static const char *blkg_dev_name(struct blkcg_gq *blkg)
353{
354	/* some drivers (floppy) instantiate a queue w/o disk registered */
355	if (blkg->q->backing_dev_info.dev)
356		return dev_name(blkg->q->backing_dev_info.dev);
357	return NULL;
358}
359
360/**
361 * blkcg_print_blkgs - helper for printing per-blkg data
362 * @sf: seq_file to print to
363 * @blkcg: blkcg of interest
364 * @prfill: fill function to print out a blkg
365 * @pol: policy in question
366 * @data: data to be passed to @prfill
367 * @show_total: to print out sum of prfill return values or not
368 *
369 * This function invokes @prfill on each blkg of @blkcg if pd for the
370 * policy specified by @pol exists.  @prfill is invoked with @sf, the
371 * policy data and @data.  If @show_total is %true, the sum of the return
372 * values from @prfill is printed with "Total" label at the end.
373 *
374 * This is to be used to construct print functions for
375 * cftype->read_seq_string method.
376 */
377void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
378		       u64 (*prfill)(struct seq_file *,
379				     struct blkg_policy_data *, int),
380		       const struct blkcg_policy *pol, int data,
381		       bool show_total)
382{
383	struct blkcg_gq *blkg;
384	struct hlist_node *n;
385	u64 total = 0;
386
387	spin_lock_irq(&blkcg->lock);
388	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node)
389		if (blkcg_policy_enabled(blkg->q, pol))
390			total += prfill(sf, blkg->pd[pol->plid], data);
391	spin_unlock_irq(&blkcg->lock);
392
393	if (show_total)
394		seq_printf(sf, "Total %llu\n", (unsigned long long)total);
395}
396EXPORT_SYMBOL_GPL(blkcg_print_blkgs);
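
/*
 * Illustrative sketch, not part of the original file: a policy would hook
 * this helper into a cftype->read_seq_string method roughly as below.  The
 * hypothetical "foo" policy keeps a blkg_stat and a limit in its per-group
 * data; the cgroup_to_blkcg(), blkg_prfill_stat() and blkcg_print_blkgs()
 * calls are the interfaces being demonstrated, everything named foo is
 * made up.
 */
struct foo_group {
	struct blkg_policy_data	pd;		/* must be the first member */
	struct blkg_stat	serviced;	/* hypothetical stat */
	u64			limit;		/* hypothetical limit */
};

static struct blkcg_policy blkcg_policy_foo;

static int foo_print_serviced(struct cgroup *cgrp, struct cftype *cft,
			      struct seq_file *sf)
{
	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);

	blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat, &blkcg_policy_foo,
			  offsetof(struct foo_group, serviced), true);
	return 0;
}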
397
398/**
399 * __blkg_prfill_u64 - prfill helper for a single u64 value
400 * @sf: seq_file to print to
401 * @pd: policy private data of interest
402 * @v: value to print
403 *
404 * Print @v to @sf for the device associated with @pd.
405 */
406u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
407{
408	const char *dname = blkg_dev_name(pd->blkg);
409
410	if (!dname)
411		return 0;
412
413	seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v);
414	return v;
415}
416EXPORT_SYMBOL_GPL(__blkg_prfill_u64);
417
418/**
419 * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat
420 * @sf: seq_file to print to
421 * @pd: policy private data of interest
422 * @rwstat: rwstat to print
423 *
424 * Print @rwstat to @sf for the device associated with @pd.
425 */
426u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
427			 const struct blkg_rwstat *rwstat)
428{
429	static const char *rwstr[] = {
430		[BLKG_RWSTAT_READ]	= "Read",
431		[BLKG_RWSTAT_WRITE]	= "Write",
432		[BLKG_RWSTAT_SYNC]	= "Sync",
433		[BLKG_RWSTAT_ASYNC]	= "Async",
434	};
435	const char *dname = blkg_dev_name(pd->blkg);
436	u64 v;
437	int i;
438
439	if (!dname)
440		return 0;
441
442	for (i = 0; i < BLKG_RWSTAT_NR; i++)
443		seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
444			   (unsigned long long)rwstat->cnt[i]);
445
446	v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE];
447	seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
448	return v;
449}
450
451/**
452 * blkg_prfill_stat - prfill callback for blkg_stat
453 * @sf: seq_file to print to
454 * @pd: policy private data of interest
455 * @off: offset to the blkg_stat in @pd
456 *
457 * prfill callback for printing a blkg_stat.
458 */
459u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off)
460{
461	return __blkg_prfill_u64(sf, pd, blkg_stat_read((void *)pd + off));
462}
463EXPORT_SYMBOL_GPL(blkg_prfill_stat);
464
465/**
466 * blkg_prfill_rwstat - prfill callback for blkg_rwstat
467 * @sf: seq_file to print to
468 * @pd: policy private data of interest
469 * @off: offset to the blkg_rwstat in @pd
470 *
471 * prfill callback for printing a blkg_rwstat.
472 */
473u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
474		       int off)
475{
476	struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd + off);
477
478	return __blkg_prfill_rwstat(sf, pd, &rwstat);
479}
480EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);
481
482/**
483 * blkg_conf_prep - parse and prepare for per-blkg config update
484 * @blkcg: target block cgroup
485 * @pol: target policy
486 * @input: input string
487 * @ctx: blkg_conf_ctx to be filled
488 *
489 * Parse per-blkg config update from @input and initialize @ctx with the
490 * result.  @ctx->blkg points to the blkg to be updated and @ctx->v the new
491 * value.  This function returns with RCU read lock and queue lock held and
492 * must be paired with blkg_conf_finish().
493 */
494int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
495		   const char *input, struct blkg_conf_ctx *ctx)
496	__acquires(rcu) __acquires(disk->queue->queue_lock)
497{
498	struct gendisk *disk;
499	struct blkcg_gq *blkg;
500	unsigned int major, minor;
501	unsigned long long v;
502	int part, ret;
503
504	if (sscanf(input, "%u:%u %llu", &major, &minor, &v) != 3)
505		return -EINVAL;
506
507	disk = get_gendisk(MKDEV(major, minor), &part);
508	if (!disk || part)
509		return -EINVAL;
510
511	rcu_read_lock();
512	spin_lock_irq(disk->queue->queue_lock);
513
514	if (blkcg_policy_enabled(disk->queue, pol))
515		blkg = blkg_lookup_create(blkcg, disk->queue);
516	else
517		blkg = ERR_PTR(-EINVAL);
518
519	if (IS_ERR(blkg)) {
520		ret = PTR_ERR(blkg);
521		rcu_read_unlock();
522		spin_unlock_irq(disk->queue->queue_lock);
523		put_disk(disk);
524		/*
525		 * If queue was bypassing, we should retry.  Do so after a
526		 * short msleep().  It isn't strictly necessary but queue
527		 * can be bypassing for some time and it's always nice to
528		 * avoid busy looping.
529		 */
530		if (ret == -EBUSY) {
531			msleep(10);
532			ret = restart_syscall();
533		}
534		return ret;
535	}
536
537	ctx->disk = disk;
538	ctx->blkg = blkg;
539	ctx->v = v;
540	return 0;
541}
542EXPORT_SYMBOL_GPL(blkg_conf_prep);
543
544/**
545 * blkg_conf_finish - finish up per-blkg config update
546 * @ctx: blkg_conf_ctx initialized by blkg_conf_prep()
547 *
548 * Finish up after per-blkg config update.  This function must be paired
549 * with blkg_conf_prep().
550 */
551void blkg_conf_finish(struct blkg_conf_ctx *ctx)
552	__releases(ctx->disk->queue->queue_lock) __releases(rcu)
553{
554	spin_unlock_irq(ctx->disk->queue->queue_lock);
555	rcu_read_unlock();
556	put_disk(ctx->disk);
557}
558EXPORT_SYMBOL_GPL(blkg_conf_finish);
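
/*
 * Illustrative sketch, not part of the original file: a policy's
 * cftype->write_string method would pair the two helpers above roughly as
 * below, reusing the hypothetical "foo" policy and per-group limit
 * sketched earlier.
 */
static int foo_set_limit(struct cgroup *cgrp, struct cftype *cft,
			 const char *buf)
{
	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
	struct blkg_conf_ctx ctx;
	struct foo_group *fg;
	int ret;

	/* parses "MAJ:MIN VAL" and returns with RCU and queue_lock held */
	ret = blkg_conf_prep(blkcg, &blkcg_policy_foo, buf, &ctx);
	if (ret)
		return ret;

	fg = container_of(ctx.blkg->pd[blkcg_policy_foo.plid],
			  struct foo_group, pd);
	fg->limit = ctx.v;

	blkg_conf_finish(&ctx);
	return 0;
}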
559
560struct cftype blkcg_files[] = {
561	{
562		.name = "reset_stats",
563		.write_u64 = blkcg_reset_stats,
564	},
565	{ }	/* terminate */
566};
567
568/**
569 * blkcg_pre_destroy - cgroup pre_destroy callback
570 * @cgroup: cgroup of interest
571 *
572 * This function is called when @cgroup is about to go away; it shoots
573 * down all blkgs associated with @cgroup.  blkgs should be removed while
574 * holding both q and blkcg locks.  As the blkcg lock is nested inside the
575 * q lock, this function performs reverse double lock dancing.
576 *
577 * This is the blkcg counterpart of ioc_release_fn().
578 */
579static int blkcg_pre_destroy(struct cgroup *cgroup)
580{
581	struct blkcg *blkcg = cgroup_to_blkcg(cgroup);
582
583	spin_lock_irq(&blkcg->lock);
584
585	while (!hlist_empty(&blkcg->blkg_list)) {
586		struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first,
587						struct blkcg_gq, blkcg_node);
588		struct request_queue *q = blkg->q;
589
590		if (spin_trylock(q->queue_lock)) {
591			blkg_destroy(blkg);
592			spin_unlock(q->queue_lock);
593		} else {
594			spin_unlock_irq(&blkcg->lock);
595			cpu_relax();
596			spin_lock_irq(&blkcg->lock);
597		}
598	}
599
600	spin_unlock_irq(&blkcg->lock);
601	return 0;
602}
603
604static void blkcg_destroy(struct cgroup *cgroup)
605{
606	struct blkcg *blkcg = cgroup_to_blkcg(cgroup);
607
608	if (blkcg != &blkcg_root)
609		kfree(blkcg);
610}
611
612static struct cgroup_subsys_state *blkcg_create(struct cgroup *cgroup)
613{
614	static atomic64_t id_seq = ATOMIC64_INIT(0);
615	struct blkcg *blkcg;
616	struct cgroup *parent = cgroup->parent;
617
618	if (!parent) {
619		blkcg = &blkcg_root;
620		goto done;
621	}
622
623	blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
624	if (!blkcg)
625		return ERR_PTR(-ENOMEM);
626
627	blkcg->cfq_weight = CFQ_WEIGHT_DEFAULT;
628	blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */
629done:
630	spin_lock_init(&blkcg->lock);
631	INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC);
632	INIT_HLIST_HEAD(&blkcg->blkg_list);
633
634	return &blkcg->css;
635}
636
637/**
638 * blkcg_init_queue - initialize blkcg part of request queue
639 * @q: request_queue to initialize
640 *
641 * Called from blk_alloc_queue_node(). Responsible for initializing blkcg
642 * part of new request_queue @q.
643 *
644 * RETURNS:
645 * 0 on success, -errno on failure.
646 */
647int blkcg_init_queue(struct request_queue *q)
648{
649	might_sleep();
650
651	return blk_throtl_init(q);
652}
653
654/**
655 * blkcg_drain_queue - drain blkcg part of request_queue
656 * @q: request_queue to drain
657 *
658 * Called from blk_drain_queue().  Responsible for draining blkcg part.
659 */
660void blkcg_drain_queue(struct request_queue *q)
661{
662	lockdep_assert_held(q->queue_lock);
663
664	blk_throtl_drain(q);
665}
666
667/**
668 * blkcg_exit_queue - exit and release blkcg part of request_queue
669 * @q: request_queue being released
670 *
671 * Called from blk_release_queue().  Responsible for exiting blkcg part.
672 */
673void blkcg_exit_queue(struct request_queue *q)
674{
675	spin_lock_irq(q->queue_lock);
676	blkg_destroy_all(q);
677	spin_unlock_irq(q->queue_lock);
678
679	blk_throtl_exit(q);
680}
681
682/*
683 * We cannot support shared io contexts, as we have no means to support
684 * two tasks with the same ioc in two different groups without major rework
685 * of the main cic data structures.  For now we allow a task to change
686 * its cgroup only if it's the only owner of its ioc.
687 */
688static int blkcg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
689{
690	struct task_struct *task;
691	struct io_context *ioc;
692	int ret = 0;
693
694	/* task_lock() is needed to avoid races with exit_io_context() */
695	cgroup_taskset_for_each(task, cgrp, tset) {
696		task_lock(task);
697		ioc = task->io_context;
698		if (ioc && atomic_read(&ioc->nr_tasks) > 1)
699			ret = -EINVAL;
700		task_unlock(task);
701		if (ret)
702			break;
703	}
704	return ret;
705}
706
707struct cgroup_subsys blkio_subsys = {
708	.name = "blkio",
709	.create = blkcg_create,
710	.can_attach = blkcg_can_attach,
711	.pre_destroy = blkcg_pre_destroy,
712	.destroy = blkcg_destroy,
713	.subsys_id = blkio_subsys_id,
714	.base_cftypes = blkcg_files,
715	.module = THIS_MODULE,
716};
717EXPORT_SYMBOL_GPL(blkio_subsys);
718
719/**
720 * blkcg_activate_policy - activate a blkcg policy on a request_queue
721 * @q: request_queue of interest
722 * @pol: blkcg policy to activate
723 *
724 * Activate @pol on @q.  Requires %GFP_KERNEL context.  @q goes through
725 * bypass mode to populate its blkgs with policy_data for @pol.
726 *
727 * Activation happens with @q bypassed, so nobody would be accessing blkgs
728 * from IO path.  Update of each blkg is protected by both queue and blkcg
729 * locks so that holding either lock and testing blkcg_policy_enabled() is
730 * always enough for dereferencing policy data.
731 *
732 * The caller is responsible for synchronizing [de]activations and policy
733 * [un]registrations.  Returns 0 on success, -errno on failure.
734 */
735int blkcg_activate_policy(struct request_queue *q,
736			  const struct blkcg_policy *pol)
737{
738	LIST_HEAD(pds);
739	struct blkcg_gq *blkg;
740	struct blkg_policy_data *pd, *n;
741	int cnt = 0, ret;
742
743	if (blkcg_policy_enabled(q, pol))
744		return 0;
745
746	blk_queue_bypass_start(q);
747
748	/* make sure the root blkg exists and count the existing blkgs */
749	spin_lock_irq(q->queue_lock);
750
751	rcu_read_lock();
752	blkg = __blkg_lookup_create(&blkcg_root, q);
753	rcu_read_unlock();
754
755	if (IS_ERR(blkg)) {
756		ret = PTR_ERR(blkg);
757		goto out_unlock;
758	}
759	q->root_blkg = blkg;
760
761	list_for_each_entry(blkg, &q->blkg_list, q_node)
762		cnt++;
763
764	spin_unlock_irq(q->queue_lock);
765
766	/* allocate policy_data for all existing blkgs */
767	while (cnt--) {
768		pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node);
769		if (!pd) {
770			ret = -ENOMEM;
771			goto out_free;
772		}
773		list_add_tail(&pd->alloc_node, &pds);
774	}
775
776	/*
777	 * Install the allocated pds.  With @q bypassing, no new blkg
778	 * should have been created while the queue lock was dropped.
779	 */
780	spin_lock_irq(q->queue_lock);
781
782	list_for_each_entry(blkg, &q->blkg_list, q_node) {
783		if (WARN_ON(list_empty(&pds))) {
784			/* umm... this shouldn't happen, just abort */
785			ret = -ENOMEM;
786			goto out_unlock;
787		}
788		pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node);
789		list_del_init(&pd->alloc_node);
790
791		/* grab blkcg lock too while installing @pd on @blkg */
792		spin_lock(&blkg->blkcg->lock);
793
794		blkg->pd[pol->plid] = pd;
795		pd->blkg = blkg;
796		pol->pd_init_fn(blkg);
797
798		spin_unlock(&blkg->blkcg->lock);
799	}
800
801	__set_bit(pol->plid, q->blkcg_pols);
802	ret = 0;
803out_unlock:
804	spin_unlock_irq(q->queue_lock);
805out_free:
806	blk_queue_bypass_end(q);
807	list_for_each_entry_safe(pd, n, &pds, alloc_node)
808		kfree(pd);
809	return ret;
810}
811EXPORT_SYMBOL_GPL(blkcg_activate_policy);
812
813/**
814 * blkcg_deactivate_policy - deactivate a blkcg policy on a request_queue
815 * @q: request_queue of interest
816 * @pol: blkcg policy to deactivate
817 *
818 * Deactivate @pol on @q.  Follows the same synchronization rules as
819 * blkcg_activate_policy().
820 */
821void blkcg_deactivate_policy(struct request_queue *q,
822			     const struct blkcg_policy *pol)
823{
824	struct blkcg_gq *blkg;
825
826	if (!blkcg_policy_enabled(q, pol))
827		return;
828
829	blk_queue_bypass_start(q);
830	spin_lock_irq(q->queue_lock);
831
832	__clear_bit(pol->plid, q->blkcg_pols);
833
834	/* if no policy is left, no need for blkgs - shoot them down */
835	if (bitmap_empty(q->blkcg_pols, BLKCG_MAX_POLS))
836		blkg_destroy_all(q);
837
838	list_for_each_entry(blkg, &q->blkg_list, q_node) {
839		/* grab blkcg lock too while removing @pd from @blkg */
840		spin_lock(&blkg->blkcg->lock);
841
842		if (pol->pd_exit_fn)
843			pol->pd_exit_fn(blkg);
844
845		kfree(blkg->pd[pol->plid]);
846		blkg->pd[pol->plid] = NULL;
847
848		spin_unlock(&blkg->blkcg->lock);
849	}
850
851	spin_unlock_irq(q->queue_lock);
852	blk_queue_bypass_end(q);
853}
854EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);
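
/*
 * Illustrative sketch, not part of the original file: a policy owner (an
 * elevator, for example) would activate its policy while setting up a
 * queue and deactivate it on teardown.  Both calls may sleep and rely on
 * the caller to keep them away from policy [un]registration.  The "foo"
 * names are hypothetical.
 */
static int foo_init_queue(struct request_queue *q)
{
	/* populates a foo pd in every existing and future blkg of @q */
	return blkcg_activate_policy(q, &blkcg_policy_foo);
}

static void foo_exit_queue(struct request_queue *q)
{
	/* frees the foo pds and clears the enabled bit for @q */
	blkcg_deactivate_policy(q, &blkcg_policy_foo);
}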
855
856/**
857 * blkcg_policy_register - register a blkcg policy
858 * @pol: blkcg policy to register
859 *
860 * Register @pol with blkcg core.  Might sleep and @pol may be modified on
861 * successful registration.  Returns 0 on success and -errno on failure.
862 */
863int blkcg_policy_register(struct blkcg_policy *pol)
864{
865	int i, ret;
866
867	if (WARN_ON(pol->pd_size < sizeof(struct blkg_policy_data)))
868		return -EINVAL;
869
870	mutex_lock(&blkcg_pol_mutex);
871
872	/* find an empty slot */
873	ret = -ENOSPC;
874	for (i = 0; i < BLKCG_MAX_POLS; i++)
875		if (!blkcg_policy[i])
876			break;
877	if (i >= BLKCG_MAX_POLS)
878		goto out_unlock;
879
880	/* register and update blkgs */
881	pol->plid = i;
882	blkcg_policy[i] = pol;
883
884	/* everything is in place, add intf files for the new policy */
885	if (pol->cftypes)
886		WARN_ON(cgroup_add_cftypes(&blkio_subsys, pol->cftypes));
887	ret = 0;
888out_unlock:
889	mutex_unlock(&blkcg_pol_mutex);
890	return ret;
891}
892EXPORT_SYMBOL_GPL(blkcg_policy_register);
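
/*
 * Illustrative sketch, not part of the original file: tying the
 * hypothetical "foo" policy together.  A real policy would register once
 * at init time (e.g. via module_init()) and unregister on exit; the cgroup
 * file names below show up under the "blkio." prefix.
 */
static void foo_pd_init(struct blkcg_gq *blkg)
{
	struct foo_group *fg = container_of(blkg->pd[blkcg_policy_foo.plid],
					    struct foo_group, pd);

	fg->limit = 0;			/* hypothetical default: no limit */
}

static struct cftype foo_files[] = {
	{
		.name = "foo.serviced",
		.read_seq_string = foo_print_serviced,
	},
	{
		.name = "foo.limit",
		.write_string = foo_set_limit,
	},
	{ }	/* terminate */
};

static struct blkcg_policy blkcg_policy_foo = {
	.pd_size	= sizeof(struct foo_group),
	.cftypes	= foo_files,
	.pd_init_fn	= foo_pd_init,
};

static int __init foo_policy_init(void)
{
	return blkcg_policy_register(&blkcg_policy_foo);
}

static void __exit foo_policy_exit(void)
{
	blkcg_policy_unregister(&blkcg_policy_foo);
}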
893
894/**
895 * blkcg_policy_unregister - unregister a blkcg policy
896 * @pol: blkcg policy to unregister
897 *
898 * Undo blkcg_policy_register(@pol).  Might sleep.
899 */
900void blkcg_policy_unregister(struct blkcg_policy *pol)
901{
902	mutex_lock(&blkcg_pol_mutex);
903
904	if (WARN_ON(blkcg_policy[pol->plid] != pol))
905		goto out_unlock;
906
907	/* kill the intf files first */
908	if (pol->cftypes)
909		cgroup_rm_cftypes(&blkio_subsys, pol->cftypes);
910
911	/* unregister and update blkgs */
912	blkcg_policy[pol->plid] = NULL;
913out_unlock:
914	mutex_unlock(&blkcg_pol_mutex);
915}
916EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
917