blk-cgroup.c revision a637120e49021d197e9578cba545bbaa459cbb51
/*
 * Common Block IO controller cgroup interface
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *                    Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 *                    Nauman Rafique <nauman@google.com>
 */
#include <linux/ioprio.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/genhd.h>
#include <linux/delay.h>
#include <linux/atomic.h>
#include "blk-cgroup.h"
#include "blk.h"

#define MAX_KEY_LEN 100

static DEFINE_MUTEX(blkcg_pol_mutex);

struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT };
EXPORT_SYMBOL_GPL(blkcg_root);

static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];

struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup)
{
        return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
                            struct blkcg, css);
}
EXPORT_SYMBOL_GPL(cgroup_to_blkcg);

static struct blkcg *task_blkcg(struct task_struct *tsk)
{
        return container_of(task_subsys_state(tsk, blkio_subsys_id),
                            struct blkcg, css);
}

struct blkcg *bio_blkcg(struct bio *bio)
{
        if (bio && bio->bi_css)
                return container_of(bio->bi_css, struct blkcg, css);
        return task_blkcg(current);
}
EXPORT_SYMBOL_GPL(bio_blkcg);

static bool blkcg_policy_enabled(struct request_queue *q,
                                 const struct blkcg_policy *pol)
{
        return pol && test_bit(pol->plid, q->blkcg_pols);
}

/**
 * blkg_free - free a blkg
 * @blkg: blkg to free
 *
 * Free @blkg which may be partially allocated.
 */
static void blkg_free(struct blkcg_gq *blkg)
{
        int i;

        if (!blkg)
                return;

        for (i = 0; i < BLKCG_MAX_POLS; i++) {
                struct blkcg_policy *pol = blkcg_policy[i];
                struct blkg_policy_data *pd = blkg->pd[i];

                if (!pd)
                        continue;

                if (pol && pol->pd_exit_fn)
                        pol->pd_exit_fn(blkg);

                kfree(pd);
        }

        kfree(blkg);
}

/**
 * blkg_alloc - allocate a blkg
 * @blkcg: block cgroup the new blkg is associated with
 * @q: request_queue the new blkg is associated with
 *
 * Allocate a new blkg associating @blkcg and @q.
 */
static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q)
{
        struct blkcg_gq *blkg;
        int i;

        /* alloc and init base part */
        blkg = kzalloc_node(sizeof(*blkg), GFP_ATOMIC, q->node);
        if (!blkg)
                return NULL;

        blkg->q = q;
        INIT_LIST_HEAD(&blkg->q_node);
        blkg->blkcg = blkcg;
        blkg->refcnt = 1;

        for (i = 0; i < BLKCG_MAX_POLS; i++) {
                struct blkcg_policy *pol = blkcg_policy[i];
                struct blkg_policy_data *pd;

                if (!blkcg_policy_enabled(q, pol))
                        continue;

                /* alloc per-policy data and attach it to blkg */
                pd = kzalloc_node(pol->pd_size, GFP_ATOMIC, q->node);
                if (!pd) {
                        blkg_free(blkg);
                        return NULL;
                }

                blkg->pd[i] = pd;
                pd->blkg = blkg;
        }

        /* invoke per-policy init */
        for (i = 0; i < BLKCG_MAX_POLS; i++) {
                struct blkcg_policy *pol = blkcg_policy[i];

                if (blkcg_policy_enabled(blkg->q, pol))
                        pol->pd_init_fn(blkg);
        }

        return blkg;
}

static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
                                      struct request_queue *q)
{
        struct blkcg_gq *blkg;

        blkg = rcu_dereference(blkcg->blkg_hint);
        if (blkg && blkg->q == q)
                return blkg;

        /*
         * Hint didn't match.  Look up from the radix tree.  Note that we
         * may not be holding queue_lock and thus are not sure whether
         * @blkg from blkg_tree has already been removed or not, so we
         * can't update hint to the lookup result.  Leave it to the caller.
         */
        blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id);
        if (blkg && blkg->q == q)
                return blkg;

        return NULL;
}

/**
 * blkg_lookup - lookup blkg for the specified blkcg - q pair
 * @blkcg: blkcg of interest
 * @q: request_queue of interest
 *
 * Lookup blkg for the @blkcg - @q pair.  This function should be called
 * under RCU read lock and is guaranteed to return %NULL if @q is bypassing
 * - see blk_queue_bypass_start() for details.
 */
struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q)
{
        WARN_ON_ONCE(!rcu_read_lock_held());

        if (unlikely(blk_queue_bypass(q)))
                return NULL;
        return __blkg_lookup(blkcg, q);
}
EXPORT_SYMBOL_GPL(blkg_lookup);
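
/*
 * Illustrative sketch (editorial, not part of the original file): a policy's
 * hot path would typically pair bio_blkcg() with blkg_lookup() under RCU,
 * roughly as follows, with "pol" standing for that policy's blkcg_policy:
 *
 *	rcu_read_lock();
 *	blkcg = bio_blkcg(bio);
 *	blkg = blkg_lookup(blkcg, q);
 *	if (blkg)
 *		pd = blkg->pd[pol->plid];
 *	rcu_read_unlock();
 *
 * A %NULL return only means that the blkg doesn't exist yet or that @q is
 * bypassing; creation is left to blkg_lookup_create(), which additionally
 * requires queue_lock.
 */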

static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
                                             struct request_queue *q)
        __releases(q->queue_lock) __acquires(q->queue_lock)
{
        struct blkcg_gq *blkg;
        int ret;

        WARN_ON_ONCE(!rcu_read_lock_held());
        lockdep_assert_held(q->queue_lock);

        /* lookup and update hint on success, see __blkg_lookup() for details */
        blkg = __blkg_lookup(blkcg, q);
        if (blkg) {
                rcu_assign_pointer(blkcg->blkg_hint, blkg);
                return blkg;
        }

        /* blkg holds a reference to blkcg */
        if (!css_tryget(&blkcg->css))
                return ERR_PTR(-EINVAL);

        /* allocate */
        ret = -ENOMEM;
        blkg = blkg_alloc(blkcg, q);
        if (unlikely(!blkg))
                goto err_put;

        /* insert */
        ret = radix_tree_preload(GFP_ATOMIC);
        if (ret)
                goto err_free;

        spin_lock(&blkcg->lock);
        ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
        if (likely(!ret)) {
                hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
                list_add(&blkg->q_node, &q->blkg_list);
        }
        spin_unlock(&blkcg->lock);

        radix_tree_preload_end();

        if (!ret)
                return blkg;
err_free:
        blkg_free(blkg);
err_put:
        css_put(&blkcg->css);
        return ERR_PTR(ret);
}

struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
                                    struct request_queue *q)
{
        /*
         * This could be the first entry point of blkcg implementation and
         * we shouldn't allow anything to go through for a bypassing queue.
         */
        if (unlikely(blk_queue_bypass(q)))
                return ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY);
        return __blkg_lookup_create(blkcg, q);
}
EXPORT_SYMBOL_GPL(blkg_lookup_create);
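
/*
 * Illustrative sketch (editorial, not from this file): creation paths hold
 * both the RCU read lock and queue_lock around blkg_lookup_create() and
 * must be prepared for an ERR_PTR() return, e.g.
 *
 *	spin_lock_irq(q->queue_lock);
 *	rcu_read_lock();
 *	blkg = blkg_lookup_create(blkcg, q);
 *	if (!IS_ERR(blkg))
 *		pd = blkg->pd[pol->plid];
 *	rcu_read_unlock();
 *	spin_unlock_irq(q->queue_lock);
 *
 * -EBUSY indicates a bypassing queue and is typically handled by retrying,
 * as blkg_conf_prep() below does.
 */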

static void blkg_destroy(struct blkcg_gq *blkg)
{
        struct request_queue *q = blkg->q;
        struct blkcg *blkcg = blkg->blkcg;

        lockdep_assert_held(q->queue_lock);
        lockdep_assert_held(&blkcg->lock);

        /* Something wrong if we are trying to remove same group twice */
        WARN_ON_ONCE(list_empty(&blkg->q_node));
        WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));

        radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
        list_del_init(&blkg->q_node);
        hlist_del_init_rcu(&blkg->blkcg_node);

        /*
         * Both setting lookup hint to and clearing it from @blkg are done
         * under queue_lock.  If it's not pointing to @blkg now, it never
         * will.  Hint assignment itself can race safely.
         */
        if (rcu_dereference_raw(blkcg->blkg_hint) == blkg)
                rcu_assign_pointer(blkcg->blkg_hint, NULL);

        /*
         * Put the reference taken at the time of creation so that when all
         * queues are gone, group can be destroyed.
         */
        blkg_put(blkg);
}

/**
 * blkg_destroy_all - destroy all blkgs associated with a request_queue
 * @q: request_queue of interest
 *
 * Destroy all blkgs associated with @q.
 */
static void blkg_destroy_all(struct request_queue *q)
{
        struct blkcg_gq *blkg, *n;

        lockdep_assert_held(q->queue_lock);

        list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
                struct blkcg *blkcg = blkg->blkcg;

                spin_lock(&blkcg->lock);
                blkg_destroy(blkg);
                spin_unlock(&blkcg->lock);
        }
}

static void blkg_rcu_free(struct rcu_head *rcu_head)
{
        blkg_free(container_of(rcu_head, struct blkcg_gq, rcu_head));
}

void __blkg_release(struct blkcg_gq *blkg)
{
        /* release the extra blkcg reference this blkg has been holding */
        css_put(&blkg->blkcg->css);

        /*
         * A group is freed in RCU manner.  But having an RCU lock does not
         * mean that one can access all the fields of blkg and assume these
         * are valid.  For example, don't try to follow throtl_data and
         * request queue links.
         *
         * Having a reference to blkg under RCU allows access only to
         * values local to groups like group stats and group rate limits.
         */
        call_rcu(&blkg->rcu_head, blkg_rcu_free);
}
EXPORT_SYMBOL_GPL(__blkg_release);

static int blkcg_reset_stats(struct cgroup *cgroup, struct cftype *cftype,
                             u64 val)
{
        struct blkcg *blkcg = cgroup_to_blkcg(cgroup);
        struct blkcg_gq *blkg;
        struct hlist_node *n;
        int i;

        mutex_lock(&blkcg_pol_mutex);
        spin_lock_irq(&blkcg->lock);

        /*
         * Note that stat reset is racy - it doesn't synchronize against
         * stat updates.  This is a debug feature which shouldn't exist
         * anyway.  If you get hit by a race, retry.
         */
        hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
                for (i = 0; i < BLKCG_MAX_POLS; i++) {
                        struct blkcg_policy *pol = blkcg_policy[i];

                        if (blkcg_policy_enabled(blkg->q, pol) &&
                            pol->pd_reset_stats_fn)
                                pol->pd_reset_stats_fn(blkg);
                }
        }

        spin_unlock_irq(&blkcg->lock);
        mutex_unlock(&blkcg_pol_mutex);
        return 0;
}

static const char *blkg_dev_name(struct blkcg_gq *blkg)
{
        /* some drivers (floppy) instantiate a queue w/o disk registered */
        if (blkg->q->backing_dev_info.dev)
                return dev_name(blkg->q->backing_dev_info.dev);
        return NULL;
}

/**
 * blkcg_print_blkgs - helper for printing per-blkg data
 * @sf: seq_file to print to
 * @blkcg: blkcg of interest
 * @prfill: fill function to print out a blkg
 * @pol: policy in question
 * @data: data to be passed to @prfill
 * @show_total: to print out sum of prfill return values or not
 *
 * This function invokes @prfill on each blkg of @blkcg if pd for the
 * policy specified by @pol exists.  @prfill is invoked with @sf, the
 * policy data and @data.  If @show_total is %true, the sum of the return
 * values from @prfill is printed with "Total" label at the end.
 *
 * This is to be used to construct print functions for
 * cftype->read_seq_string method.
 */
void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
                       u64 (*prfill)(struct seq_file *,
                                     struct blkg_policy_data *, int),
                       const struct blkcg_policy *pol, int data,
                       bool show_total)
{
        struct blkcg_gq *blkg;
        struct hlist_node *n;
        u64 total = 0;

        spin_lock_irq(&blkcg->lock);
        hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node)
                if (blkcg_policy_enabled(blkg->q, pol))
                        total += prfill(sf, blkg->pd[pol->plid], data);
        spin_unlock_irq(&blkcg->lock);

        if (show_total)
                seq_printf(sf, "Total %llu\n", (unsigned long long)total);
}
EXPORT_SYMBOL_GPL(blkcg_print_blkgs);

/**
 * __blkg_prfill_u64 - prfill helper for a single u64 value
 * @sf: seq_file to print to
 * @pd: policy private data of interest
 * @v: value to print
 *
 * Print @v to @sf for the device associated with @pd.
 */
u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
{
        const char *dname = blkg_dev_name(pd->blkg);

        if (!dname)
                return 0;

        seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v);
        return v;
}
EXPORT_SYMBOL_GPL(__blkg_prfill_u64);

/**
 * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat
 * @sf: seq_file to print to
 * @pd: policy private data of interest
 * @rwstat: rwstat to print
 *
 * Print @rwstat to @sf for the device associated with @pd.
 */
u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
                         const struct blkg_rwstat *rwstat)
{
        static const char *rwstr[] = {
                [BLKG_RWSTAT_READ]      = "Read",
                [BLKG_RWSTAT_WRITE]     = "Write",
                [BLKG_RWSTAT_SYNC]      = "Sync",
                [BLKG_RWSTAT_ASYNC]     = "Async",
        };
        const char *dname = blkg_dev_name(pd->blkg);
        u64 v;
        int i;

        if (!dname)
                return 0;

        for (i = 0; i < BLKG_RWSTAT_NR; i++)
                seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
                           (unsigned long long)rwstat->cnt[i]);

        v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE];
        seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
        return v;
}

/**
 * blkg_prfill_stat - prfill callback for blkg_stat
 * @sf: seq_file to print to
 * @pd: policy private data of interest
 * @off: offset to the blkg_stat in @pd
 *
 * prfill callback for printing a blkg_stat.
 */
u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off)
{
        return __blkg_prfill_u64(sf, pd, blkg_stat_read((void *)pd + off));
}
EXPORT_SYMBOL_GPL(blkg_prfill_stat);

/**
 * blkg_prfill_rwstat - prfill callback for blkg_rwstat
 * @sf: seq_file to print to
 * @pd: policy private data of interest
 * @off: offset to the blkg_rwstat in @pd
 *
 * prfill callback for printing a blkg_rwstat.
 */
u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
                       int off)
{
        struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd + off);

        return __blkg_prfill_rwstat(sf, pd, &rwstat);
}
EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);
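
/*
 * Illustrative sketch (editorial, not from this file): a policy would wire
 * these helpers into a cftype read_seq_string method along these lines;
 * blkcg_policy_foo and struct foo_pd are hypothetical names:
 *
 *	static int foo_print_rwstat(struct cgroup *cgrp, struct cftype *cft,
 *				    struct seq_file *sf)
 *	{
 *		blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp),
 *				  blkg_prfill_rwstat, &blkcg_policy_foo,
 *				  offsetof(struct foo_pd, rwstat), true);
 *		return 0;
 *	}
 *
 * The offset (often carried in cft->private) tells blkg_prfill_rwstat()
 * where the blkg_rwstat lives inside the policy data.
 */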

/**
 * blkg_conf_prep - parse and prepare for per-blkg config update
 * @blkcg: target block cgroup
 * @pol: target policy
 * @input: input string
 * @ctx: blkg_conf_ctx to be filled
 *
 * Parse per-blkg config update from @input and initialize @ctx with the
 * result.  @ctx->blkg points to the blkg to be updated and @ctx->v the new
 * value.  This function returns with RCU read lock and queue lock held and
 * must be paired with blkg_conf_finish().
 */
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
                   const char *input, struct blkg_conf_ctx *ctx)
        __acquires(rcu) __acquires(disk->queue->queue_lock)
{
        struct gendisk *disk;
        struct blkcg_gq *blkg;
        unsigned int major, minor;
        unsigned long long v;
        int part, ret;

        if (sscanf(input, "%u:%u %llu", &major, &minor, &v) != 3)
                return -EINVAL;

        disk = get_gendisk(MKDEV(major, minor), &part);
        if (!disk || part)
                return -EINVAL;

        rcu_read_lock();
        spin_lock_irq(disk->queue->queue_lock);

        if (blkcg_policy_enabled(disk->queue, pol))
                blkg = blkg_lookup_create(blkcg, disk->queue);
        else
                blkg = ERR_PTR(-EINVAL);

        if (IS_ERR(blkg)) {
                ret = PTR_ERR(blkg);
                rcu_read_unlock();
                spin_unlock_irq(disk->queue->queue_lock);
                put_disk(disk);
                /*
                 * If queue was bypassing, we should retry.  Do so after a
                 * short msleep().  It isn't strictly necessary but queue
                 * can be bypassing for some time and it's always nice to
                 * avoid busy looping.
                 */
                if (ret == -EBUSY) {
                        msleep(10);
                        ret = restart_syscall();
                }
                return ret;
        }

        ctx->disk = disk;
        ctx->blkg = blkg;
        ctx->v = v;
        return 0;
}
EXPORT_SYMBOL_GPL(blkg_conf_prep);

/**
 * blkg_conf_finish - finish up per-blkg config update
 * @ctx: blkg_conf_ctx initialized by blkg_conf_prep()
 *
 * Finish up after per-blkg config update.  This function must be paired
 * with blkg_conf_prep().
 */
void blkg_conf_finish(struct blkg_conf_ctx *ctx)
        __releases(ctx->disk->queue->queue_lock) __releases(rcu)
{
        spin_unlock_irq(ctx->disk->queue->queue_lock);
        rcu_read_unlock();
        put_disk(ctx->disk);
}
EXPORT_SYMBOL_GPL(blkg_conf_finish);
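
/*
 * Illustrative sketch (editorial, not from this file): a cftype write_string
 * method would typically bracket its update with the pair above, e.g. with
 * a hypothetical policy "foo":
 *
 *	static int foo_set_limit(struct cgroup *cgrp, struct cftype *cft,
 *				 const char *buf)
 *	{
 *		struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
 *		struct blkg_conf_ctx ctx;
 *		int ret;
 *
 *		ret = blkg_conf_prep(blkcg, &blkcg_policy_foo, buf, &ctx);
 *		if (ret)
 *			return ret;
 *
 *		... update ctx.blkg's policy data using ctx.v ...
 *
 *		blkg_conf_finish(&ctx);
 *		return 0;
 *	}
 */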

struct cftype blkcg_files[] = {
        {
                .name = "reset_stats",
                .write_u64 = blkcg_reset_stats,
        },
        { }     /* terminate */
};

/**
 * blkcg_pre_destroy - cgroup pre_destroy callback
 * @cgroup: cgroup of interest
 *
 * This function is called when @cgroup is about to go away and responsible
 * for shooting down all blkgs associated with @cgroup.  blkgs should be
 * removed while holding both q and blkcg locks.  As blkcg lock is nested
 * inside q lock, this function performs reverse double lock dancing.
 *
 * This is the blkcg counterpart of ioc_release_fn().
 */
static int blkcg_pre_destroy(struct cgroup *cgroup)
{
        struct blkcg *blkcg = cgroup_to_blkcg(cgroup);

        spin_lock_irq(&blkcg->lock);

        while (!hlist_empty(&blkcg->blkg_list)) {
                struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first,
                                                    struct blkcg_gq, blkcg_node);
                struct request_queue *q = blkg->q;

                if (spin_trylock(q->queue_lock)) {
                        blkg_destroy(blkg);
                        spin_unlock(q->queue_lock);
                } else {
                        spin_unlock_irq(&blkcg->lock);
                        cpu_relax();
                        spin_lock_irq(&blkcg->lock);
                }
        }

        spin_unlock_irq(&blkcg->lock);
        return 0;
}

static void blkcg_destroy(struct cgroup *cgroup)
{
        struct blkcg *blkcg = cgroup_to_blkcg(cgroup);

        if (blkcg != &blkcg_root)
                kfree(blkcg);
}

static struct cgroup_subsys_state *blkcg_create(struct cgroup *cgroup)
{
        static atomic64_t id_seq = ATOMIC64_INIT(0);
        struct blkcg *blkcg;
        struct cgroup *parent = cgroup->parent;

        if (!parent) {
                blkcg = &blkcg_root;
                goto done;
        }

        blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
        if (!blkcg)
                return ERR_PTR(-ENOMEM);

        blkcg->cfq_weight = CFQ_WEIGHT_DEFAULT;
        blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */
done:
        spin_lock_init(&blkcg->lock);
        INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC);
        INIT_HLIST_HEAD(&blkcg->blkg_list);

        return &blkcg->css;
}

/**
 * blkcg_init_queue - initialize blkcg part of request queue
 * @q: request_queue to initialize
 *
 * Called from blk_alloc_queue_node().  Responsible for initializing blkcg
 * part of new request_queue @q.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int blkcg_init_queue(struct request_queue *q)
{
        might_sleep();

        return blk_throtl_init(q);
}

/**
 * blkcg_drain_queue - drain blkcg part of request_queue
 * @q: request_queue to drain
 *
 * Called from blk_drain_queue().  Responsible for draining blkcg part.
 */
void blkcg_drain_queue(struct request_queue *q)
{
        lockdep_assert_held(q->queue_lock);

        blk_throtl_drain(q);
}

/**
 * blkcg_exit_queue - exit and release blkcg part of request_queue
 * @q: request_queue being released
 *
 * Called from blk_release_queue().  Responsible for exiting blkcg part.
 */
void blkcg_exit_queue(struct request_queue *q)
{
        spin_lock_irq(q->queue_lock);
        blkg_destroy_all(q);
        spin_unlock_irq(q->queue_lock);

        blk_throtl_exit(q);
}

/*
 * We cannot support shared io contexts, as we have no means to support
 * two tasks with the same ioc in two different groups without major rework
 * of the main cic data structures.  For now we allow a task to change
 * its cgroup only if it's the only owner of its ioc.
 */
static int blkcg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
{
        struct task_struct *task;
        struct io_context *ioc;
        int ret = 0;

        /* task_lock() is needed to avoid races with exit_io_context() */
        cgroup_taskset_for_each(task, cgrp, tset) {
                task_lock(task);
                ioc = task->io_context;
                if (ioc && atomic_read(&ioc->nr_tasks) > 1)
                        ret = -EINVAL;
                task_unlock(task);
                if (ret)
                        break;
        }
        return ret;
}

struct cgroup_subsys blkio_subsys = {
        .name = "blkio",
        .create = blkcg_create,
        .can_attach = blkcg_can_attach,
        .pre_destroy = blkcg_pre_destroy,
        .destroy = blkcg_destroy,
        .subsys_id = blkio_subsys_id,
        .base_cftypes = blkcg_files,
        .module = THIS_MODULE,
};
EXPORT_SYMBOL_GPL(blkio_subsys);

/**
 * blkcg_activate_policy - activate a blkcg policy on a request_queue
 * @q: request_queue of interest
 * @pol: blkcg policy to activate
 *
 * Activate @pol on @q.  Requires %GFP_KERNEL context.  @q goes through
 * bypass mode to populate its blkgs with policy_data for @pol.
 *
 * Activation happens with @q bypassed, so nobody would be accessing blkgs
 * from IO path.  Update of each blkg is protected by both queue and blkcg
 * locks so that holding either lock and testing blkcg_policy_enabled() is
 * always enough for dereferencing policy data.
 *
 * The caller is responsible for synchronizing [de]activations and policy
 * [un]registrations.  Returns 0 on success, -errno on failure.
 */
int blkcg_activate_policy(struct request_queue *q,
                          const struct blkcg_policy *pol)
{
        LIST_HEAD(pds);
        struct blkcg_gq *blkg;
        struct blkg_policy_data *pd, *n;
        int cnt = 0, ret;

        if (blkcg_policy_enabled(q, pol))
                return 0;

        blk_queue_bypass_start(q);

        /* make sure the root blkg exists and count the existing blkgs */
        spin_lock_irq(q->queue_lock);

        rcu_read_lock();
        blkg = __blkg_lookup_create(&blkcg_root, q);
        rcu_read_unlock();

        if (IS_ERR(blkg)) {
                ret = PTR_ERR(blkg);
                goto out_unlock;
        }
        q->root_blkg = blkg;

        list_for_each_entry(blkg, &q->blkg_list, q_node)
                cnt++;

        spin_unlock_irq(q->queue_lock);

        /* allocate policy_data for all existing blkgs */
        while (cnt--) {
                pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node);
                if (!pd) {
                        ret = -ENOMEM;
                        goto out_free;
                }
                list_add_tail(&pd->alloc_node, &pds);
        }

        /*
         * Install the allocated pds.  With @q bypassing, no new blkg
         * should have been created while the queue lock was dropped.
         */
        spin_lock_irq(q->queue_lock);

        list_for_each_entry(blkg, &q->blkg_list, q_node) {
                if (WARN_ON(list_empty(&pds))) {
                        /* umm... this shouldn't happen, just abort */
                        ret = -ENOMEM;
                        goto out_unlock;
                }
                pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node);
                list_del_init(&pd->alloc_node);

                /* grab blkcg lock too while installing @pd on @blkg */
                spin_lock(&blkg->blkcg->lock);

                blkg->pd[pol->plid] = pd;
                pd->blkg = blkg;
                pol->pd_init_fn(blkg);

                spin_unlock(&blkg->blkcg->lock);
        }

        __set_bit(pol->plid, q->blkcg_pols);
        ret = 0;
out_unlock:
        spin_unlock_irq(q->queue_lock);
out_free:
        blk_queue_bypass_end(q);
        list_for_each_entry_safe(pd, n, &pds, alloc_node)
                kfree(pd);
        return ret;
}
EXPORT_SYMBOL_GPL(blkcg_activate_policy);
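
/*
 * Illustrative sketch (editorial, not from this file): a policy enables
 * itself on a queue from its own per-queue init path and undoes it on
 * teardown, e.g. with a hypothetical blkcg_policy_foo:
 *
 *	ret = blkcg_activate_policy(q, &blkcg_policy_foo);
 *	if (ret)
 *		return ret;
 *	...
 *	blkcg_deactivate_policy(q, &blkcg_policy_foo);
 *
 * Both calls put @q through bypass mode and may sleep, so they need
 * %GFP_KERNEL context as noted above.
 */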

/**
 * blkcg_deactivate_policy - deactivate a blkcg policy on a request_queue
 * @q: request_queue of interest
 * @pol: blkcg policy to deactivate
 *
 * Deactivate @pol on @q.  Follows the same synchronization rules as
 * blkcg_activate_policy().
 */
void blkcg_deactivate_policy(struct request_queue *q,
                             const struct blkcg_policy *pol)
{
        struct blkcg_gq *blkg;

        if (!blkcg_policy_enabled(q, pol))
                return;

        blk_queue_bypass_start(q);
        spin_lock_irq(q->queue_lock);

        __clear_bit(pol->plid, q->blkcg_pols);

        /* if no policy is left, no need for blkgs - shoot them down */
        if (bitmap_empty(q->blkcg_pols, BLKCG_MAX_POLS))
                blkg_destroy_all(q);

        list_for_each_entry(blkg, &q->blkg_list, q_node) {
                /* grab blkcg lock too while removing @pd from @blkg */
                spin_lock(&blkg->blkcg->lock);

                if (pol->pd_exit_fn)
                        pol->pd_exit_fn(blkg);

                kfree(blkg->pd[pol->plid]);
                blkg->pd[pol->plid] = NULL;

                spin_unlock(&blkg->blkcg->lock);
        }

        spin_unlock_irq(q->queue_lock);
        blk_queue_bypass_end(q);
}
EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);

/**
 * blkcg_policy_register - register a blkcg policy
 * @pol: blkcg policy to register
 *
 * Register @pol with blkcg core.  Might sleep and @pol may be modified on
 * successful registration.  Returns 0 on success and -errno on failure.
 */
int blkcg_policy_register(struct blkcg_policy *pol)
{
        int i, ret;

        if (WARN_ON(pol->pd_size < sizeof(struct blkg_policy_data)))
                return -EINVAL;

        mutex_lock(&blkcg_pol_mutex);

        /* find an empty slot */
        ret = -ENOSPC;
        for (i = 0; i < BLKCG_MAX_POLS; i++)
                if (!blkcg_policy[i])
                        break;
        if (i >= BLKCG_MAX_POLS)
                goto out_unlock;

        /* register and update blkgs */
        pol->plid = i;
        blkcg_policy[i] = pol;

        /* everything is in place, add intf files for the new policy */
        if (pol->cftypes)
                WARN_ON(cgroup_add_cftypes(&blkio_subsys, pol->cftypes));
        ret = 0;
out_unlock:
        mutex_unlock(&blkcg_pol_mutex);
        return ret;
}
EXPORT_SYMBOL_GPL(blkcg_policy_register);
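
/*
 * Illustrative sketch (editorial, not from this file): a policy module
 * defines a blkcg_policy whose per-group data begins with a
 * struct blkg_policy_data and registers it at init time.  All "foo" names
 * below are hypothetical:
 *
 *	struct foo_group {
 *		struct blkg_policy_data pd;	(must be the first member)
 *		u64 limit;
 *	};
 *
 *	static struct blkcg_policy blkcg_policy_foo = {
 *		.pd_size		= sizeof(struct foo_group),
 *		.cftypes		= foo_files,
 *		.pd_init_fn		= foo_pd_init,
 *		.pd_exit_fn		= foo_pd_exit,
 *		.pd_reset_stats_fn	= foo_pd_reset_stats,
 *	};
 *
 *	blkcg_policy_register(&blkcg_policy_foo);
 *	...
 *	blkcg_policy_unregister(&blkcg_policy_foo);
 */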

/**
 * blkcg_policy_unregister - unregister a blkcg policy
 * @pol: blkcg policy to unregister
 *
 * Undo blkcg_policy_register(@pol).  Might sleep.
 */
void blkcg_policy_unregister(struct blkcg_policy *pol)
{
        mutex_lock(&blkcg_pol_mutex);

        if (WARN_ON(blkcg_policy[pol->plid] != pol))
                goto out_unlock;

        /* kill the intf files first */
        if (pol->cftypes)
                cgroup_rm_cftypes(&blkio_subsys, pol->cftypes);

        /* unregister and update blkgs */
        blkcg_policy[pol->plid] = NULL;
out_unlock:
        mutex_unlock(&blkcg_pol_mutex);
}
EXPORT_SYMBOL_GPL(blkcg_policy_unregister);