blk-cgroup.c revision f9fcc2d3919b8eb575b3cee9274feefafb641bca
/*
 * Common Block IO controller cgroup interface
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *		      Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 *		      Nauman Rafique <nauman@google.com>
 */
#include <linux/ioprio.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/genhd.h>
#include <linux/delay.h>
#include <linux/atomic.h>
#include "blk-cgroup.h"
#include "blk.h"

#define MAX_KEY_LEN 100

static DEFINE_MUTEX(blkcg_pol_mutex);

struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT };
EXPORT_SYMBOL_GPL(blkcg_root);

static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];

struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup)
{
	return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
			    struct blkcg, css);
}
EXPORT_SYMBOL_GPL(cgroup_to_blkcg);

static struct blkcg *task_blkcg(struct task_struct *tsk)
{
	return container_of(task_subsys_state(tsk, blkio_subsys_id),
			    struct blkcg, css);
}

struct blkcg *bio_blkcg(struct bio *bio)
{
	if (bio && bio->bi_css)
		return container_of(bio->bi_css, struct blkcg, css);
	return task_blkcg(current);
}
EXPORT_SYMBOL_GPL(bio_blkcg);

static bool blkcg_policy_enabled(struct request_queue *q,
				 const struct blkcg_policy *pol)
{
	return pol && test_bit(pol->plid, q->blkcg_pols);
}

/**
 * blkg_free - free a blkg
 * @blkg: blkg to free
 *
 * Free @blkg which may be partially allocated.
 */
static void blkg_free(struct blkcg_gq *blkg)
{
	int i;

	if (!blkg)
		return;

	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];
		struct blkg_policy_data *pd = blkg->pd[i];

		if (!pd)
			continue;

		if (pol && pol->pd_exit_fn)
			pol->pd_exit_fn(blkg);

		kfree(pd);
	}

	kfree(blkg);
}
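
/*
 * Illustrative sketch, not part of the original file: blkg_free() above and
 * blkg_alloc() below treat each blkg->pd[] entry as an opaque, policy-private
 * area of pol->pd_size bytes with struct blkg_policy_data at its start.  A
 * hypothetical policy would therefore lay out its per-blkg data as below and
 * set .pd_size = sizeof(struct foo_group); blkcg_policy_register() checks
 * that pd_size is at least sizeof(struct blkg_policy_data).
 */
struct foo_group {				/* hypothetical example */
	struct blkg_policy_data	pd;		/* must be the first member */
	u64			foo_sectors;	/* policy-private stats ... */
	u64			foo_limit;	/* ... and configuration */
};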

/**
 * blkg_alloc - allocate a blkg
 * @blkcg: block cgroup the new blkg is associated with
 * @q: request_queue the new blkg is associated with
 *
 * Allocate a new blkg associating @blkcg and @q.
 */
static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q)
{
	struct blkcg_gq *blkg;
	int i;

	/* alloc and init base part */
	blkg = kzalloc_node(sizeof(*blkg), GFP_ATOMIC, q->node);
	if (!blkg)
		return NULL;

	blkg->q = q;
	INIT_LIST_HEAD(&blkg->q_node);
	blkg->blkcg = blkcg;
	blkg->refcnt = 1;

	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];
		struct blkg_policy_data *pd;

		if (!blkcg_policy_enabled(q, pol))
			continue;

		/* alloc per-policy data and attach it to blkg */
		pd = kzalloc_node(pol->pd_size, GFP_ATOMIC, q->node);
		if (!pd) {
			blkg_free(blkg);
			return NULL;
		}

		blkg->pd[i] = pd;
		pd->blkg = blkg;
	}

	/* invoke per-policy init */
	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];

		if (blkcg_policy_enabled(blkg->q, pol))
			pol->pd_init_fn(blkg);
	}

	return blkg;
}

static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
				      struct request_queue *q)
{
	struct blkcg_gq *blkg;
	struct hlist_node *n;

	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node)
		if (blkg->q == q)
			return blkg;
	return NULL;
}

/**
 * blkg_lookup - lookup blkg for the specified blkcg - q pair
 * @blkcg: blkcg of interest
 * @q: request_queue of interest
 *
 * Lookup blkg for the @blkcg - @q pair. This function should be called
 * under RCU read lock and is guaranteed to return %NULL if @q is bypassing
 * - see blk_queue_bypass_start() for details.
 */
struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q)
{
	WARN_ON_ONCE(!rcu_read_lock_held());

	if (unlikely(blk_queue_bypass(q)))
		return NULL;
	return __blkg_lookup(blkcg, q);
}
EXPORT_SYMBOL_GPL(blkg_lookup);
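
/*
 * Illustrative usage sketch, not part of the original file: blkg_lookup()
 * must run under rcu_read_lock() and may return NULL (no blkg yet, or @q is
 * bypassing), so hot-path callers either fall back to the root group or use
 * blkg_lookup_create() below, which additionally needs the queue lock.
 */
static bool example_blkg_exists(struct bio *bio, struct request_queue *q)
{
	struct blkcg_gq *blkg;
	bool exists;

	rcu_read_lock();
	blkg = blkg_lookup(bio_blkcg(bio), q);
	exists = blkg != NULL;
	rcu_read_unlock();

	return exists;
}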

static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
					     struct request_queue *q)
	__releases(q->queue_lock) __acquires(q->queue_lock)
{
	struct blkcg_gq *blkg;

	WARN_ON_ONCE(!rcu_read_lock_held());
	lockdep_assert_held(q->queue_lock);

	blkg = __blkg_lookup(blkcg, q);
	if (blkg)
		return blkg;

	/* blkg holds a reference to blkcg */
	if (!css_tryget(&blkcg->css))
		return ERR_PTR(-EINVAL);

	/*
	 * Allocate and initialize.
	 */
	blkg = blkg_alloc(blkcg, q);

	/* did alloc fail? */
	if (unlikely(!blkg)) {
		blkg = ERR_PTR(-ENOMEM);
		goto out;
	}

	/* insert */
	spin_lock(&blkcg->lock);
	hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
	list_add(&blkg->q_node, &q->blkg_list);
	spin_unlock(&blkcg->lock);
out:
	return blkg;
}

struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
				    struct request_queue *q)
{
	/*
	 * This could be the first entry point of the blkcg implementation
	 * and we shouldn't allow anything to go through for a bypassing
	 * queue.
	 */
	if (unlikely(blk_queue_bypass(q)))
		return ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY);
	return __blkg_lookup_create(blkcg, q);
}
EXPORT_SYMBOL_GPL(blkg_lookup_create);

static void blkg_destroy(struct blkcg_gq *blkg)
{
	struct request_queue *q = blkg->q;
	struct blkcg *blkcg = blkg->blkcg;

	lockdep_assert_held(q->queue_lock);
	lockdep_assert_held(&blkcg->lock);

	/* Something wrong if we are trying to remove same group twice */
	WARN_ON_ONCE(list_empty(&blkg->q_node));
	WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));
	list_del_init(&blkg->q_node);
	hlist_del_init_rcu(&blkg->blkcg_node);

	/*
	 * Put the reference taken at the time of creation so that when all
	 * queues are gone, group can be destroyed.
	 */
	blkg_put(blkg);
}

/**
 * blkg_destroy_all - destroy all blkgs associated with a request_queue
 * @q: request_queue of interest
 *
 * Destroy all blkgs associated with @q.
 */
static void blkg_destroy_all(struct request_queue *q)
{
	struct blkcg_gq *blkg, *n;

	lockdep_assert_held(q->queue_lock);

	list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
		struct blkcg *blkcg = blkg->blkcg;

		spin_lock(&blkcg->lock);
		blkg_destroy(blkg);
		spin_unlock(&blkcg->lock);
	}
}

static void blkg_rcu_free(struct rcu_head *rcu_head)
{
	blkg_free(container_of(rcu_head, struct blkcg_gq, rcu_head));
}

void __blkg_release(struct blkcg_gq *blkg)
{
	/* release the extra blkcg reference this blkg has been holding */
	css_put(&blkg->blkcg->css);

	/*
	 * A group is freed in rcu manner. But having an rcu lock does not
	 * mean that one can access all the fields of blkg and assume these
	 * are valid. For example, don't try to follow throtl_data and
	 * request queue links.
	 *
	 * Having a reference to blkg under an rcu allows access to only
	 * values local to groups like group stats and group rate limits.
	 */
	call_rcu(&blkg->rcu_head, blkg_rcu_free);
}
EXPORT_SYMBOL_GPL(__blkg_release);
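
/*
 * Illustrative sketch, not part of the original file: a policy that wants to
 * hold on to a blkg beyond the RCU read section pins it with blkg_get(), the
 * counterpart in blk-cgroup.h of the blkg_put() used by blkg_destroy() above.
 * blkg->refcnt is manipulated under the queue lock; dropping the last
 * reference lands in __blkg_release(), which frees the group via RCU.
 */
static void example_pin_blkg(struct blkcg_gq *blkg)
{
	lockdep_assert_held(blkg->q->queue_lock);
	blkg_get(blkg);			/* paired with a later blkg_put() */
}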

static int blkcg_reset_stats(struct cgroup *cgroup, struct cftype *cftype,
			     u64 val)
{
	struct blkcg *blkcg = cgroup_to_blkcg(cgroup);
	struct blkcg_gq *blkg;
	struct hlist_node *n;
	int i;

	mutex_lock(&blkcg_pol_mutex);
	spin_lock_irq(&blkcg->lock);

	/*
	 * Note that stat reset is racy - it doesn't synchronize against
	 * stat updates. This is a debug feature which shouldn't exist
	 * anyway. If you get hit by a race, retry.
	 */
	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
		for (i = 0; i < BLKCG_MAX_POLS; i++) {
			struct blkcg_policy *pol = blkcg_policy[i];

			if (blkcg_policy_enabled(blkg->q, pol) &&
			    pol->pd_reset_stats_fn)
				pol->pd_reset_stats_fn(blkg);
		}
	}

	spin_unlock_irq(&blkcg->lock);
	mutex_unlock(&blkcg_pol_mutex);
	return 0;
}

static const char *blkg_dev_name(struct blkcg_gq *blkg)
{
	/* some drivers (floppy) instantiate a queue w/o disk registered */
	if (blkg->q->backing_dev_info.dev)
		return dev_name(blkg->q->backing_dev_info.dev);
	return NULL;
}

/**
 * blkcg_print_blkgs - helper for printing per-blkg data
 * @sf: seq_file to print to
 * @blkcg: blkcg of interest
 * @prfill: fill function to print out a blkg
 * @pol: policy in question
 * @data: data to be passed to @prfill
 * @show_total: to print out sum of prfill return values or not
 *
 * This function invokes @prfill on each blkg of @blkcg if pd for the
 * policy specified by @pol exists. @prfill is invoked with @sf, the
 * policy data and @data. If @show_total is %true, the sum of the return
 * values from @prfill is printed with "Total" label at the end.
 *
 * This is to be used to construct print functions for
 * cftype->read_seq_string method.
 */
void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
		       u64 (*prfill)(struct seq_file *,
				     struct blkg_policy_data *, int),
		       const struct blkcg_policy *pol, int data,
		       bool show_total)
{
	struct blkcg_gq *blkg;
	struct hlist_node *n;
	u64 total = 0;

	spin_lock_irq(&blkcg->lock);
	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node)
		if (blkcg_policy_enabled(blkg->q, pol))
			total += prfill(sf, blkg->pd[pol->plid], data);
	spin_unlock_irq(&blkcg->lock);

	if (show_total)
		seq_printf(sf, "Total %llu\n", (unsigned long long)total);
}
EXPORT_SYMBOL_GPL(blkcg_print_blkgs);

/**
 * __blkg_prfill_u64 - prfill helper for a single u64 value
 * @sf: seq_file to print to
 * @pd: policy private data of interest
 * @v: value to print
 *
 * Print @v to @sf for the device associated with @pd.
 */
u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
{
	const char *dname = blkg_dev_name(pd->blkg);

	if (!dname)
		return 0;

	seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v);
	return v;
}
EXPORT_SYMBOL_GPL(__blkg_prfill_u64);
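
/*
 * Illustrative usage sketch, not part of the original file: a policy turns
 * the helpers above into a cftype->read_seq_string handler. Everything named
 * "example_*" or "foo_*" is hypothetical (see the foo_group sketch near
 * blkg_free()); cfq-iosched and blk-throttle are the real users.
 */
static struct blkcg_policy example_policy;	/* fully defined in a later sketch */

static u64 example_prfill_limit(struct seq_file *sf,
				struct blkg_policy_data *pd, int off)
{
	struct foo_group *fg = container_of(pd, struct foo_group, pd);

	return __blkg_prfill_u64(sf, pd, fg->foo_limit);
}

static int example_print_limit(struct cgroup *cgrp, struct cftype *cft,
			       struct seq_file *sf)
{
	blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), example_prfill_limit,
			  &example_policy, 0, false);
	return 0;
}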

/**
 * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat
 * @sf: seq_file to print to
 * @pd: policy private data of interest
 * @rwstat: rwstat to print
 *
 * Print @rwstat to @sf for the device associated with @pd.
 */
u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
			 const struct blkg_rwstat *rwstat)
{
	static const char *rwstr[] = {
		[BLKG_RWSTAT_READ]	= "Read",
		[BLKG_RWSTAT_WRITE]	= "Write",
		[BLKG_RWSTAT_SYNC]	= "Sync",
		[BLKG_RWSTAT_ASYNC]	= "Async",
	};
	const char *dname = blkg_dev_name(pd->blkg);
	u64 v;
	int i;

	if (!dname)
		return 0;

	for (i = 0; i < BLKG_RWSTAT_NR; i++)
		seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
			   (unsigned long long)rwstat->cnt[i]);

	v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE];
	seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
	return v;
}

/**
 * blkg_prfill_stat - prfill callback for blkg_stat
 * @sf: seq_file to print to
 * @pd: policy private data of interest
 * @off: offset to the blkg_stat in @pd
 *
 * prfill callback for printing a blkg_stat.
 */
u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off)
{
	return __blkg_prfill_u64(sf, pd, blkg_stat_read((void *)pd + off));
}
EXPORT_SYMBOL_GPL(blkg_prfill_stat);

/**
 * blkg_prfill_rwstat - prfill callback for blkg_rwstat
 * @sf: seq_file to print to
 * @pd: policy private data of interest
 * @off: offset to the blkg_rwstat in @pd
 *
 * prfill callback for printing a blkg_rwstat.
 */
u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
		       int off)
{
	struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd + off);

	return __blkg_prfill_rwstat(sf, pd, &rwstat);
}
EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);

/**
 * blkg_conf_prep - parse and prepare for per-blkg config update
 * @blkcg: target block cgroup
 * @pol: target policy
 * @input: input string
 * @ctx: blkg_conf_ctx to be filled
 *
 * Parse per-blkg config update from @input and initialize @ctx with the
 * result. @ctx->blkg points to the blkg to be updated and @ctx->v the new
 * value. This function returns with RCU read lock and queue lock held and
 * must be paired with blkg_conf_finish().
 */
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
		   const char *input, struct blkg_conf_ctx *ctx)
	__acquires(rcu) __acquires(disk->queue->queue_lock)
{
	struct gendisk *disk;
	struct blkcg_gq *blkg;
	unsigned int major, minor;
	unsigned long long v;
	int part, ret;

	if (sscanf(input, "%u:%u %llu", &major, &minor, &v) != 3)
		return -EINVAL;

	disk = get_gendisk(MKDEV(major, minor), &part);
	if (!disk || part)
		return -EINVAL;

	rcu_read_lock();
	spin_lock_irq(disk->queue->queue_lock);

	if (blkcg_policy_enabled(disk->queue, pol))
		blkg = blkg_lookup_create(blkcg, disk->queue);
	else
		blkg = ERR_PTR(-EINVAL);

	if (IS_ERR(blkg)) {
		ret = PTR_ERR(blkg);
		rcu_read_unlock();
		spin_unlock_irq(disk->queue->queue_lock);
		put_disk(disk);
		/*
		 * If queue was bypassing, we should retry. Do so after a
		 * short msleep(). It isn't strictly necessary but queue
		 * can be bypassing for some time and it's always nice to
		 * avoid busy looping.
		 */
		if (ret == -EBUSY) {
			msleep(10);
			ret = restart_syscall();
		}
		return ret;
	}

	ctx->disk = disk;
	ctx->blkg = blkg;
	ctx->v = v;
	return 0;
}
EXPORT_SYMBOL_GPL(blkg_conf_prep);

/**
 * blkg_conf_finish - finish up per-blkg config update
 * @ctx: blkg_conf_ctx initialized by blkg_conf_prep()
 *
 * Finish up after per-blkg config update. This function must be paired
 * with blkg_conf_prep().
 */
void blkg_conf_finish(struct blkg_conf_ctx *ctx)
	__releases(ctx->disk->queue->queue_lock) __releases(rcu)
{
	spin_unlock_irq(ctx->disk->queue->queue_lock);
	rcu_read_unlock();
	put_disk(ctx->disk);
}
EXPORT_SYMBOL_GPL(blkg_conf_finish);
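
/*
 * Illustrative usage sketch, not part of the original file: a
 * cftype->write_string handler pairing blkg_conf_prep() with
 * blkg_conf_finish(). "example_policy" and "foo_group" are the hypothetical
 * bits introduced in the earlier sketches; cfq's weight_device handler has
 * the same shape.
 */
static int example_set_limit(struct cgroup *cgrp, struct cftype *cft,
			     const char *buf)
{
	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
	struct blkg_conf_ctx ctx;
	struct foo_group *fg;
	int ret;

	/* parses "MAJ:MIN VAL" and returns with RCU and queue lock held */
	ret = blkg_conf_prep(blkcg, &example_policy, buf, &ctx);
	if (ret)
		return ret;

	fg = container_of(ctx.blkg->pd[example_policy.plid],
			  struct foo_group, pd);
	fg->foo_limit = ctx.v;

	blkg_conf_finish(&ctx);		/* drops the locks taken by prep */
	return 0;
}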

struct cftype blkcg_files[] = {
	{
		.name = "reset_stats",
		.write_u64 = blkcg_reset_stats,
	},
	{ }	/* terminate */
};

/**
 * blkcg_pre_destroy - cgroup pre_destroy callback
 * @cgroup: cgroup of interest
 *
 * This function is called when @cgroup is about to go away and is
 * responsible for shooting down all blkgs associated with @cgroup.
 * blkgs should be removed while holding both q and blkcg locks. As blkcg
 * lock is nested inside q lock, this function performs reverse double
 * lock dancing.
 *
 * This is the blkcg counterpart of ioc_release_fn().
 */
static int blkcg_pre_destroy(struct cgroup *cgroup)
{
	struct blkcg *blkcg = cgroup_to_blkcg(cgroup);

	spin_lock_irq(&blkcg->lock);

	while (!hlist_empty(&blkcg->blkg_list)) {
		struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first,
						    struct blkcg_gq, blkcg_node);
		struct request_queue *q = blkg->q;

		if (spin_trylock(q->queue_lock)) {
			blkg_destroy(blkg);
			spin_unlock(q->queue_lock);
		} else {
			spin_unlock_irq(&blkcg->lock);
			cpu_relax();
			spin_lock_irq(&blkcg->lock);
		}
	}

	spin_unlock_irq(&blkcg->lock);
	return 0;
}

static void blkcg_destroy(struct cgroup *cgroup)
{
	struct blkcg *blkcg = cgroup_to_blkcg(cgroup);

	if (blkcg != &blkcg_root)
		kfree(blkcg);
}

static struct cgroup_subsys_state *blkcg_create(struct cgroup *cgroup)
{
	static atomic64_t id_seq = ATOMIC64_INIT(0);
	struct blkcg *blkcg;
	struct cgroup *parent = cgroup->parent;

	if (!parent) {
		blkcg = &blkcg_root;
		goto done;
	}

	blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
	if (!blkcg)
		return ERR_PTR(-ENOMEM);

	blkcg->cfq_weight = CFQ_WEIGHT_DEFAULT;
	blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */
done:
	spin_lock_init(&blkcg->lock);
	INIT_HLIST_HEAD(&blkcg->blkg_list);

	return &blkcg->css;
}

/**
 * blkcg_init_queue - initialize blkcg part of request queue
 * @q: request_queue to initialize
 *
 * Called from blk_alloc_queue_node(). Responsible for initializing blkcg
 * part of new request_queue @q.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int blkcg_init_queue(struct request_queue *q)
{
	might_sleep();

	return blk_throtl_init(q);
}

/**
 * blkcg_drain_queue - drain blkcg part of request_queue
 * @q: request_queue to drain
 *
 * Called from blk_drain_queue(). Responsible for draining blkcg part.
 */
void blkcg_drain_queue(struct request_queue *q)
{
	lockdep_assert_held(q->queue_lock);

	blk_throtl_drain(q);
}

/**
 * blkcg_exit_queue - exit and release blkcg part of request_queue
 * @q: request_queue being released
 *
 * Called from blk_release_queue(). Responsible for exiting blkcg part.
 */
void blkcg_exit_queue(struct request_queue *q)
{
	spin_lock_irq(q->queue_lock);
	blkg_destroy_all(q);
	spin_unlock_irq(q->queue_lock);

	blk_throtl_exit(q);
}

/*
 * We cannot support shared io contexts, as we have no means to support
 * two tasks with the same ioc in two different groups without major rework
 * of the main cic data structures. For now we allow a task to change
 * its cgroup only if it's the only owner of its ioc.
 */
static int blkcg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct io_context *ioc;
	int ret = 0;

	/* task_lock() is needed to avoid races with exit_io_context() */
	cgroup_taskset_for_each(task, cgrp, tset) {
		task_lock(task);
		ioc = task->io_context;
		if (ioc && atomic_read(&ioc->nr_tasks) > 1)
			ret = -EINVAL;
		task_unlock(task);
		if (ret)
			break;
	}
	return ret;
}

struct cgroup_subsys blkio_subsys = {
	.name = "blkio",
	.create = blkcg_create,
	.can_attach = blkcg_can_attach,
	.pre_destroy = blkcg_pre_destroy,
	.destroy = blkcg_destroy,
	.subsys_id = blkio_subsys_id,
	.base_cftypes = blkcg_files,
	.module = THIS_MODULE,
};
EXPORT_SYMBOL_GPL(blkio_subsys);

/**
 * blkcg_activate_policy - activate a blkcg policy on a request_queue
 * @q: request_queue of interest
 * @pol: blkcg policy to activate
 *
 * Activate @pol on @q. Requires %GFP_KERNEL context. @q goes through
 * bypass mode to populate its blkgs with policy_data for @pol.
 *
 * Activation happens with @q bypassed, so nobody would be accessing blkgs
 * from IO path. Update of each blkg is protected by both queue and blkcg
 * locks so that holding either lock and testing blkcg_policy_enabled() is
 * always enough for dereferencing policy data.
 *
 * The caller is responsible for synchronizing [de]activations and policy
 * [un]registrations. Returns 0 on success, -errno on failure.
 */
int blkcg_activate_policy(struct request_queue *q,
			  const struct blkcg_policy *pol)
{
	LIST_HEAD(pds);
	struct blkcg_gq *blkg;
	struct blkg_policy_data *pd, *n;
	int cnt = 0, ret;

	if (blkcg_policy_enabled(q, pol))
		return 0;

	blk_queue_bypass_start(q);

	/* make sure the root blkg exists and count the existing blkgs */
	spin_lock_irq(q->queue_lock);

	rcu_read_lock();
	blkg = __blkg_lookup_create(&blkcg_root, q);
	rcu_read_unlock();

	if (IS_ERR(blkg)) {
		ret = PTR_ERR(blkg);
		goto out_unlock;
	}
	q->root_blkg = blkg;

	list_for_each_entry(blkg, &q->blkg_list, q_node)
		cnt++;

	spin_unlock_irq(q->queue_lock);

	/* allocate policy_data for all existing blkgs */
	while (cnt--) {
		pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node);
		if (!pd) {
			ret = -ENOMEM;
			goto out_free;
		}
		list_add_tail(&pd->alloc_node, &pds);
	}

	/*
	 * Install the allocated pds. With @q bypassing, no new blkg
	 * should have been created while the queue lock was dropped.
	 */
	spin_lock_irq(q->queue_lock);

	list_for_each_entry(blkg, &q->blkg_list, q_node) {
		if (WARN_ON(list_empty(&pds))) {
			/* umm... this shouldn't happen, just abort */
			ret = -ENOMEM;
			goto out_unlock;
		}
		pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node);
		list_del_init(&pd->alloc_node);

		/* grab blkcg lock too while installing @pd on @blkg */
		spin_lock(&blkg->blkcg->lock);

		blkg->pd[pol->plid] = pd;
		pd->blkg = blkg;
		pol->pd_init_fn(blkg);

		spin_unlock(&blkg->blkcg->lock);
	}

	__set_bit(pol->plid, q->blkcg_pols);
	ret = 0;
out_unlock:
	spin_unlock_irq(q->queue_lock);
out_free:
	blk_queue_bypass_end(q);
	list_for_each_entry_safe(pd, n, &pds, alloc_node)
		kfree(pd);
	return ret;
}
EXPORT_SYMBOL_GPL(blkcg_activate_policy);

/**
 * blkcg_deactivate_policy - deactivate a blkcg policy on a request_queue
 * @q: request_queue of interest
 * @pol: blkcg policy to deactivate
 *
 * Deactivate @pol on @q. Follows the same synchronization rules as
 * blkcg_activate_policy().
 */
void blkcg_deactivate_policy(struct request_queue *q,
			     const struct blkcg_policy *pol)
{
	struct blkcg_gq *blkg;

	if (!blkcg_policy_enabled(q, pol))
		return;

	blk_queue_bypass_start(q);
	spin_lock_irq(q->queue_lock);

	__clear_bit(pol->plid, q->blkcg_pols);

	/* if no policy is left, no need for blkgs - shoot them down */
	if (bitmap_empty(q->blkcg_pols, BLKCG_MAX_POLS))
		blkg_destroy_all(q);

	list_for_each_entry(blkg, &q->blkg_list, q_node) {
		/* grab blkcg lock too while removing @pd from @blkg */
		spin_lock(&blkg->blkcg->lock);

		if (pol->pd_exit_fn)
			pol->pd_exit_fn(blkg);

		kfree(blkg->pd[pol->plid]);
		blkg->pd[pol->plid] = NULL;

		spin_unlock(&blkg->blkcg->lock);
	}

	spin_unlock_irq(q->queue_lock);
	blk_queue_bypass_end(q);
}
EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);

/**
 * blkcg_policy_register - register a blkcg policy
 * @pol: blkcg policy to register
 *
 * Register @pol with blkcg core. Might sleep and @pol may be modified on
 * successful registration. Returns 0 on success and -errno on failure.
 */
int blkcg_policy_register(struct blkcg_policy *pol)
{
	int i, ret;

	if (WARN_ON(pol->pd_size < sizeof(struct blkg_policy_data)))
		return -EINVAL;

	mutex_lock(&blkcg_pol_mutex);

	/* find an empty slot */
	ret = -ENOSPC;
	for (i = 0; i < BLKCG_MAX_POLS; i++)
		if (!blkcg_policy[i])
			break;
	if (i >= BLKCG_MAX_POLS)
		goto out_unlock;

	/* register and update blkgs */
	pol->plid = i;
	blkcg_policy[i] = pol;

	/* everything is in place, add intf files for the new policy */
	if (pol->cftypes)
		WARN_ON(cgroup_add_cftypes(&blkio_subsys, pol->cftypes));
	ret = 0;
out_unlock:
	mutex_unlock(&blkcg_pol_mutex);
	return ret;
}
EXPORT_SYMBOL_GPL(blkcg_policy_register);
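
/*
 * Illustrative sketch, not part of the original file: a minimal hypothetical
 * policy tying the pieces together. pd_size must cover the embedded
 * blkg_policy_data (see the foo_group sketch near blkg_free()), pd_init_fn
 * runs from blkg_alloc() and blkcg_activate_policy(), and plid is assigned by
 * blkcg_policy_register(). A real policy would also list its cgroup files in
 * .cftypes and enable itself per-queue with blkcg_activate_policy() from its
 * elevator/driver init path.
 */
static void example_pd_init(struct blkcg_gq *blkg)
{
	struct foo_group *fg = container_of(blkg->pd[example_policy.plid],
					    struct foo_group, pd);

	fg->foo_limit = 0;
}

static struct blkcg_policy example_policy = {
	.pd_size	= sizeof(struct foo_group),
	.cftypes	= NULL,			/* would list the cgroup files */
	.pd_init_fn	= example_pd_init,
};

static int __init example_policy_init(void)
{
	return blkcg_policy_register(&example_policy);
}

static void __exit example_policy_exit(void)
{
	blkcg_policy_unregister(&example_policy);
}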

/**
 * blkcg_policy_unregister - unregister a blkcg policy
 * @pol: blkcg policy to unregister
 *
 * Undo blkcg_policy_register(@pol). Might sleep.
 */
void blkcg_policy_unregister(struct blkcg_policy *pol)
{
	mutex_lock(&blkcg_pol_mutex);

	if (WARN_ON(blkcg_policy[pol->plid] != pol))
		goto out_unlock;

	/* kill the intf files first */
	if (pol->cftypes)
		cgroup_rm_cftypes(&blkio_subsys, pol->cftypes);

	/* unregister and update blkgs */
	blkcg_policy[pol->plid] = NULL;
out_unlock:
	mutex_unlock(&blkcg_pol_mutex);
}
EXPORT_SYMBOL_GPL(blkcg_policy_unregister);