blk-cgroup.c revision 155fead9b6347ead90e0b0396cb108a6ba6126c6
/*
 * Common Block IO controller cgroup interface
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *                    Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 *                    Nauman Rafique <nauman@google.com>
 */
#include <linux/ioprio.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/genhd.h>
#include <linux/delay.h>
#include <linux/atomic.h>
#include "blk-cgroup.h"
#include "blk.h"

#define MAX_KEY_LEN 100

static DEFINE_SPINLOCK(blkio_list_lock);
static LIST_HEAD(blkio_list);

static DEFINE_MUTEX(all_q_mutex);
static LIST_HEAD(all_q_list);

/* List of groups pending per cpu stats allocation */
static DEFINE_SPINLOCK(alloc_list_lock);
static LIST_HEAD(alloc_list);

static void blkio_stat_alloc_fn(struct work_struct *);
static DECLARE_DELAYED_WORK(blkio_stat_alloc_work, blkio_stat_alloc_fn);

struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
EXPORT_SYMBOL_GPL(blkio_root_cgroup);

static struct blkio_policy_type *blkio_policy[BLKIO_NR_POLICIES];

struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
{
        return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
                            struct blkio_cgroup, css);
}
EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);

static struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk)
{
        return container_of(task_subsys_state(tsk, blkio_subsys_id),
                            struct blkio_cgroup, css);
}

struct blkio_cgroup *bio_blkio_cgroup(struct bio *bio)
{
        if (bio && bio->bi_css)
                return container_of(bio->bi_css, struct blkio_cgroup, css);
        return task_blkio_cgroup(current);
}
EXPORT_SYMBOL_GPL(bio_blkio_cgroup);

/*
 * Worker for allocating per cpu stats for blk groups. This is scheduled on
 * the system_nrt_wq once there are some groups on the alloc_list waiting
 * for allocation.
 */
static void blkio_stat_alloc_fn(struct work_struct *work)
{
        static void *pcpu_stats[BLKIO_NR_POLICIES];
        struct delayed_work *dwork = to_delayed_work(work);
        struct blkio_group *blkg;
        int i;
        bool empty = false;

alloc_stats:
        for (i = 0; i < BLKIO_NR_POLICIES; i++) {
                if (pcpu_stats[i] != NULL)
                        continue;

                pcpu_stats[i] = alloc_percpu(struct blkio_group_stats_cpu);

                /* Allocation failed. Try again after some time. */
                if (pcpu_stats[i] == NULL) {
                        queue_delayed_work(system_nrt_wq, dwork,
                                           msecs_to_jiffies(10));
                        return;
                }
        }

        spin_lock_irq(&blkio_list_lock);
        spin_lock(&alloc_list_lock);

        /* cgroup got deleted or queue exited. */
        if (!list_empty(&alloc_list)) {
                blkg = list_first_entry(&alloc_list, struct blkio_group,
                                        alloc_node);
                for (i = 0; i < BLKIO_NR_POLICIES; i++) {
                        struct blkg_policy_data *pd = blkg->pd[i];

                        if (blkio_policy[i] && pd && !pd->stats_cpu)
                                swap(pd->stats_cpu, pcpu_stats[i]);
                }

                list_del_init(&blkg->alloc_node);
        }

        empty = list_empty(&alloc_list);

        spin_unlock(&alloc_list_lock);
        spin_unlock_irq(&blkio_list_lock);

        if (!empty)
                goto alloc_stats;
}

/**
 * blkg_free - free a blkg
 * @blkg: blkg to free
 *
 * Free @blkg which may be partially allocated.
 */
static void blkg_free(struct blkio_group *blkg)
{
        int i;

        if (!blkg)
                return;

        for (i = 0; i < BLKIO_NR_POLICIES; i++) {
                struct blkio_policy_type *pol = blkio_policy[i];
                struct blkg_policy_data *pd = blkg->pd[i];

                if (!pd)
                        continue;

                if (pol && pol->ops.blkio_exit_group_fn)
                        pol->ops.blkio_exit_group_fn(blkg);

                free_percpu(pd->stats_cpu);
                kfree(pd);
        }

        kfree(blkg);
}

/**
 * blkg_alloc - allocate a blkg
 * @blkcg: block cgroup the new blkg is associated with
 * @q: request_queue the new blkg is associated with
 *
 * Allocate a new blkg associating @blkcg and @q.
 */
static struct blkio_group *blkg_alloc(struct blkio_cgroup *blkcg,
                                      struct request_queue *q)
{
        struct blkio_group *blkg;
        int i;

        /* alloc and init base part */
        blkg = kzalloc_node(sizeof(*blkg), GFP_ATOMIC, q->node);
        if (!blkg)
                return NULL;

        blkg->q = q;
        INIT_LIST_HEAD(&blkg->q_node);
        INIT_LIST_HEAD(&blkg->alloc_node);
        blkg->blkcg = blkcg;
        blkg->refcnt = 1;
        cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));

        for (i = 0; i < BLKIO_NR_POLICIES; i++) {
                struct blkio_policy_type *pol = blkio_policy[i];
                struct blkg_policy_data *pd;

                if (!pol)
                        continue;

                /* alloc per-policy data and attach it to blkg */
                pd = kzalloc_node(sizeof(*pd) + pol->pdata_size, GFP_ATOMIC,
                                  q->node);
                if (!pd) {
                        blkg_free(blkg);
                        return NULL;
                }

                blkg->pd[i] = pd;
                pd->blkg = blkg;
        }

        /* invoke per-policy init */
        for (i = 0; i < BLKIO_NR_POLICIES; i++) {
                struct blkio_policy_type *pol = blkio_policy[i];

                if (pol)
                        pol->ops.blkio_init_group_fn(blkg);
        }

        return blkg;
}

struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
                                       struct request_queue *q,
                                       bool for_root)
        __releases(q->queue_lock) __acquires(q->queue_lock)
{
        struct blkio_group *blkg;

        WARN_ON_ONCE(!rcu_read_lock_held());
        lockdep_assert_held(q->queue_lock);

        /*
         * This could be the first entry point of blkcg implementation and
         * we shouldn't allow anything to go through for a bypassing queue.
         * The following can be removed if blkg lookup is guaranteed to
         * fail on a bypassing queue.
         */
        if (unlikely(blk_queue_bypass(q)) && !for_root)
                return ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY);

        blkg = blkg_lookup(blkcg, q);
        if (blkg)
                return blkg;

        /* blkg holds a reference to blkcg */
        if (!css_tryget(&blkcg->css))
                return ERR_PTR(-EINVAL);

        /*
         * Allocate and initialize.
         */
        blkg = blkg_alloc(blkcg, q);

        /* did alloc fail? */
        if (unlikely(!blkg)) {
                blkg = ERR_PTR(-ENOMEM);
                goto out;
        }

        /* insert */
        spin_lock(&blkcg->lock);
        hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
        list_add(&blkg->q_node, &q->blkg_list);
        spin_unlock(&blkcg->lock);

        spin_lock(&alloc_list_lock);
        list_add(&blkg->alloc_node, &alloc_list);
        /* Queue per cpu stat allocation from worker thread. */
        queue_delayed_work(system_nrt_wq, &blkio_stat_alloc_work, 0);
        spin_unlock(&alloc_list_lock);
out:
        return blkg;
}
EXPORT_SYMBOL_GPL(blkg_lookup_create);
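/*
 * Illustrative sketch (not part of the original file): the calling
 * convention blkg_lookup_create() establishes above.  It must run under
 * both rcu_read_lock() and the queue lock, and callers must be prepared
 * for an ERR_PTR() return (-EBUSY while @q is bypassing, -EINVAL if the
 * queue is dead or the blkcg is going away, -ENOMEM on allocation
 * failure).  The function name and the pr_info() are hypothetical.
 */
static int __maybe_unused blkg_lookup_create_example(struct blkio_cgroup *blkcg,
                                                     struct request_queue *q)
{
        struct blkio_group *blkg;
        int ret = 0;

        rcu_read_lock();
        spin_lock_irq(q->queue_lock);

        blkg = blkg_lookup_create(blkcg, q, false);
        if (IS_ERR(blkg))
                ret = PTR_ERR(blkg);    /* e.g. retry later on -EBUSY */
        else
                pr_info("blkg created for %s\n", blkg->path);

        spin_unlock_irq(q->queue_lock);
        rcu_read_unlock();

        return ret;
}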
/* called under rcu_read_lock(). */
struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
                                struct request_queue *q)
{
        struct blkio_group *blkg;
        struct hlist_node *n;

        hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node)
                if (blkg->q == q)
                        return blkg;
        return NULL;
}
EXPORT_SYMBOL_GPL(blkg_lookup);

static void blkg_destroy(struct blkio_group *blkg)
{
        struct request_queue *q = blkg->q;
        struct blkio_cgroup *blkcg = blkg->blkcg;

        lockdep_assert_held(q->queue_lock);
        lockdep_assert_held(&blkcg->lock);

        /* Something wrong if we are trying to remove same group twice */
        WARN_ON_ONCE(list_empty(&blkg->q_node));
        WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));
        list_del_init(&blkg->q_node);
        hlist_del_init_rcu(&blkg->blkcg_node);

        spin_lock(&alloc_list_lock);
        list_del_init(&blkg->alloc_node);
        spin_unlock(&alloc_list_lock);

        /*
         * Put the reference taken at the time of creation so that when all
         * queues are gone, group can be destroyed.
         */
        blkg_put(blkg);
}

/*
 * XXX: This updates blkg policy data in-place for root blkg, which is
 * necessary across elevator switch and policy registration as root blkgs
 * aren't shot down. This broken and racy implementation is temporary.
 * Eventually, blkg shoot down will be replaced by proper in-place update.
 */
void update_root_blkg_pd(struct request_queue *q, enum blkio_policy_id plid)
{
        struct blkio_policy_type *pol = blkio_policy[plid];
        struct blkio_group *blkg = blkg_lookup(&blkio_root_cgroup, q);
        struct blkg_policy_data *pd;

        if (!blkg)
                return;

        kfree(blkg->pd[plid]);
        blkg->pd[plid] = NULL;

        if (!pol)
                return;

        pd = kzalloc(sizeof(*pd) + pol->pdata_size, GFP_KERNEL);
        WARN_ON_ONCE(!pd);

        pd->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu);
        WARN_ON_ONCE(!pd->stats_cpu);

        blkg->pd[plid] = pd;
        pd->blkg = blkg;
        pol->ops.blkio_init_group_fn(blkg);
}
EXPORT_SYMBOL_GPL(update_root_blkg_pd);

/**
 * blkg_destroy_all - destroy all blkgs associated with a request_queue
 * @q: request_queue of interest
 * @destroy_root: whether to destroy root blkg or not
 *
 * Destroy blkgs associated with @q. If @destroy_root is %true, all are
 * destroyed; otherwise, root blkg is left alone.
 */
void blkg_destroy_all(struct request_queue *q, bool destroy_root)
{
        struct blkio_group *blkg, *n;

        spin_lock_irq(q->queue_lock);

        list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
                struct blkio_cgroup *blkcg = blkg->blkcg;

                /* skip root? */
                if (!destroy_root && blkg->blkcg == &blkio_root_cgroup)
                        continue;

                spin_lock(&blkcg->lock);
                blkg_destroy(blkg);
                spin_unlock(&blkcg->lock);
        }

        spin_unlock_irq(q->queue_lock);
}
EXPORT_SYMBOL_GPL(blkg_destroy_all);

static void blkg_rcu_free(struct rcu_head *rcu_head)
{
        blkg_free(container_of(rcu_head, struct blkio_group, rcu_head));
}

void __blkg_release(struct blkio_group *blkg)
{
        /* release the extra blkcg reference this blkg has been holding */
        css_put(&blkg->blkcg->css);

        /*
         * A group is freed in rcu manner. But having an rcu lock does not
         * mean that one can access all the fields of blkg and assume these
         * are valid. For example, don't try to follow throtl_data and
         * request queue links.
         *
         * Having a reference to blkg under an rcu allows access to only
         * values local to groups like group stats and group rate limits.
         */
        call_rcu(&blkg->rcu_head, blkg_rcu_free);
}
EXPORT_SYMBOL_GPL(__blkg_release);

static void blkio_reset_stats_cpu(struct blkio_group *blkg, int plid)
{
        struct blkg_policy_data *pd = blkg->pd[plid];
        int cpu;

        if (pd->stats_cpu == NULL)
                return;

        for_each_possible_cpu(cpu) {
                struct blkio_group_stats_cpu *sc =
                        per_cpu_ptr(pd->stats_cpu, cpu);

                blkg_rwstat_reset(&sc->service_bytes);
                blkg_rwstat_reset(&sc->serviced);
        }
}

static int
blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
{
        struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
        struct blkio_group *blkg;
        struct hlist_node *n;

        spin_lock(&blkio_list_lock);
        spin_lock_irq(&blkcg->lock);

        /*
         * Note that stat reset is racy - it doesn't synchronize against
         * stat updates. This is a debug feature which shouldn't exist
         * anyway. If you get hit by a race, retry.
         */
        hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
                struct blkio_policy_type *pol;

                list_for_each_entry(pol, &blkio_list, list) {
                        blkio_reset_stats_cpu(blkg, pol->plid);

                        if (pol->ops.blkio_reset_group_stats_fn)
                                pol->ops.blkio_reset_group_stats_fn(blkg);
                }
        }

        spin_unlock_irq(&blkcg->lock);
        spin_unlock(&blkio_list_lock);
        return 0;
}

static const char *blkg_dev_name(struct blkio_group *blkg)
{
        /* some drivers (floppy) instantiate a queue w/o disk registered */
        if (blkg->q->backing_dev_info.dev)
                return dev_name(blkg->q->backing_dev_info.dev);
        return NULL;
}

/**
 * blkcg_print_blkgs - helper for printing per-blkg data
 * @sf: seq_file to print to
 * @blkcg: blkcg of interest
 * @prfill: fill function to print out a blkg
 * @pol: policy in question
 * @data: data to be passed to @prfill
 * @show_total: to print out sum of prfill return values or not
 *
 * This function invokes @prfill on each blkg of @blkcg if pd for the
 * policy specified by @pol exists. @prfill is invoked with @sf, the
 * policy data and @data. If @show_total is %true, the sum of the return
 * values from @prfill is printed with "Total" label at the end.
 *
 * This is to be used to construct print functions for
 * cftype->read_seq_string method.
 */
void blkcg_print_blkgs(struct seq_file *sf, struct blkio_cgroup *blkcg,
                       u64 (*prfill)(struct seq_file *, struct blkg_policy_data *, int),
                       int pol, int data, bool show_total)
{
        struct blkio_group *blkg;
        struct hlist_node *n;
        u64 total = 0;

        spin_lock_irq(&blkcg->lock);
        hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node)
                if (blkg->pd[pol])
                        total += prfill(sf, blkg->pd[pol], data);
        spin_unlock_irq(&blkcg->lock);

        if (show_total)
                seq_printf(sf, "Total %llu\n", (unsigned long long)total);
}
EXPORT_SYMBOL_GPL(blkcg_print_blkgs);

/**
 * __blkg_prfill_u64 - prfill helper for a single u64 value
 * @sf: seq_file to print to
 * @pd: policy data of interest
 * @v: value to print
 *
 * Print @v to @sf for the device associated with @pd.
 */
u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
{
        const char *dname = blkg_dev_name(pd->blkg);

        if (!dname)
                return 0;

        seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v);
        return v;
}
EXPORT_SYMBOL_GPL(__blkg_prfill_u64);

/**
 * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat
 * @sf: seq_file to print to
 * @pd: policy data of interest
 * @rwstat: rwstat to print
 *
 * Print @rwstat to @sf for the device associated with @pd.
 */
u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
                         const struct blkg_rwstat *rwstat)
{
        static const char *rwstr[] = {
                [BLKG_RWSTAT_READ]      = "Read",
                [BLKG_RWSTAT_WRITE]     = "Write",
                [BLKG_RWSTAT_SYNC]      = "Sync",
                [BLKG_RWSTAT_ASYNC]     = "Async",
        };
        const char *dname = blkg_dev_name(pd->blkg);
        u64 v;
        int i;

        if (!dname)
                return 0;

        for (i = 0; i < BLKG_RWSTAT_NR; i++)
                seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
                           (unsigned long long)rwstat->cnt[i]);

        v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE];
        seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
        return v;
}

static u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd,
                            int off)
{
        return __blkg_prfill_u64(sf, pd,
                                 blkg_stat_read((void *)pd->pdata + off));
}

static u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
                              int off)
{
        struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd->pdata + off);

        return __blkg_prfill_rwstat(sf, pd, &rwstat);
}

/* print blkg_stat specified by BLKCG_STAT_PRIV() */
int blkcg_print_stat(struct cgroup *cgrp, struct cftype *cft,
                     struct seq_file *sf)
{
        struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);

        blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat,
                          BLKCG_STAT_POL(cft->private),
                          BLKCG_STAT_OFF(cft->private), false);
        return 0;
}
EXPORT_SYMBOL_GPL(blkcg_print_stat);

/* print blkg_rwstat specified by BLKCG_STAT_PRIV() */
int blkcg_print_rwstat(struct cgroup *cgrp, struct cftype *cft,
                       struct seq_file *sf)
{
        struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);

        blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat,
                          BLKCG_STAT_POL(cft->private),
                          BLKCG_STAT_OFF(cft->private), true);
        return 0;
}
EXPORT_SYMBOL_GPL(blkcg_print_rwstat);
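/*
 * Illustrative sketch (not part of the original file): how a policy
 * would build a cftype->read_seq_string handler on top of
 * blkcg_print_blkgs() and __blkg_prfill_u64().  "struct example_pdata",
 * the function names and the use of BLKIO_POLICY_PROP as the plid are
 * hypothetical; the policy's pdata_size would be
 * sizeof(struct example_pdata) so that pd->pdata holds this layout.
 */
struct example_pdata {
        u64 nr_dispatched;              /* hypothetical per-blkg counter */
};

static u64 example_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
                          int off)
{
        struct example_pdata *epd = (void *)pd->pdata;

        return __blkg_prfill_u64(sf, pd, epd->nr_dispatched);
}

static int __maybe_unused example_print_nr_dispatched(struct cgroup *cgrp,
                                                      struct cftype *cft,
                                                      struct seq_file *sf)
{
        struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);

        /* one "<dev> <value>" line per blkg, plus a trailing "Total" line */
        blkcg_print_blkgs(sf, blkcg, example_prfill, BLKIO_POLICY_PROP, 0,
                          true);
        return 0;
}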
/**
 * blkg_conf_prep - parse and prepare for per-blkg config update
 * @blkcg: target block cgroup
 * @input: input string
 * @ctx: blkg_conf_ctx to be filled
 *
 * Parse per-blkg config update from @input and initialize @ctx with the
 * result. @ctx->blkg points to the blkg to be updated and @ctx->v the new
 * value. This function returns with RCU read locked and must be paired
 * with blkg_conf_finish().
 */
int blkg_conf_prep(struct blkio_cgroup *blkcg, const char *input,
                   struct blkg_conf_ctx *ctx)
        __acquires(rcu)
{
        struct gendisk *disk;
        struct blkio_group *blkg;
        unsigned int major, minor;
        unsigned long long v;
        int part, ret;

        if (sscanf(input, "%u:%u %llu", &major, &minor, &v) != 3)
                return -EINVAL;

        disk = get_gendisk(MKDEV(major, minor), &part);
        if (!disk || part)
                return -EINVAL;

        rcu_read_lock();

        spin_lock_irq(disk->queue->queue_lock);
        blkg = blkg_lookup_create(blkcg, disk->queue, false);
        spin_unlock_irq(disk->queue->queue_lock);

        if (IS_ERR(blkg)) {
                ret = PTR_ERR(blkg);
                rcu_read_unlock();
                put_disk(disk);
                /*
                 * If queue was bypassing, we should retry. Do so after a
                 * short msleep(). It isn't strictly necessary but queue
                 * can be bypassing for some time and it's always nice to
                 * avoid busy looping.
                 */
                if (ret == -EBUSY) {
                        msleep(10);
                        ret = restart_syscall();
                }
                return ret;
        }

        ctx->disk = disk;
        ctx->blkg = blkg;
        ctx->v = v;
        return 0;
}
EXPORT_SYMBOL_GPL(blkg_conf_prep);

/**
 * blkg_conf_finish - finish up per-blkg config update
 * @ctx: blkg_conf_ctx initialized by blkg_conf_prep()
 *
 * Finish up after per-blkg config update. This function must be paired
 * with blkg_conf_prep().
 */
void blkg_conf_finish(struct blkg_conf_ctx *ctx)
        __releases(rcu)
{
        rcu_read_unlock();
        put_disk(ctx->disk);
}
EXPORT_SYMBOL_GPL(blkg_conf_finish);
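/*
 * Illustrative sketch (not part of the original file): the intended
 * pairing of blkg_conf_prep() and blkg_conf_finish() in a policy's
 * cftype->write_string handler.  The handler name and what is done with
 * the parsed value are hypothetical.
 */
static int __maybe_unused example_conf_write(struct cgroup *cgrp,
                                             struct cftype *cft,
                                             const char *buf)
{
        struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
        struct blkg_conf_ctx ctx;
        int ret;

        /* parses "MAJ:MIN VAL", creates the blkg and leaves RCU read locked */
        ret = blkg_conf_prep(blkcg, buf, &ctx);
        if (ret)
                return ret;

        /*
         * ctx.blkg is the group for the named device and ctx.v the parsed
         * value; a real policy would validate it and store it in its
         * per-blkg policy data here.
         */
        pr_debug("%s: new value %llu\n", ctx.blkg->path,
                 (unsigned long long)ctx.v);

        blkg_conf_finish(&ctx); /* drops the RCU read lock, puts the disk */
        return 0;
}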
struct cftype blkio_files[] = {
        {
                .name = "reset_stats",
                .write_u64 = blkiocg_reset_stats,
        },
        { }     /* terminate */
};

/**
 * blkiocg_pre_destroy - cgroup pre_destroy callback
 * @cgroup: cgroup of interest
 *
 * This function is called when @cgroup is about to go away and is
 * responsible for shooting down all blkgs associated with @cgroup.
 * blkgs should be removed while holding both q and blkcg locks.  As
 * blkcg lock is nested inside q lock, this function performs reverse
 * double lock dancing.
 *
 * This is the blkcg counterpart of ioc_release_fn().
 */
static int blkiocg_pre_destroy(struct cgroup *cgroup)
{
        struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);

        spin_lock_irq(&blkcg->lock);

        while (!hlist_empty(&blkcg->blkg_list)) {
                struct blkio_group *blkg = hlist_entry(blkcg->blkg_list.first,
                                        struct blkio_group, blkcg_node);
                struct request_queue *q = blkg->q;

                if (spin_trylock(q->queue_lock)) {
                        blkg_destroy(blkg);
                        spin_unlock(q->queue_lock);
                } else {
                        spin_unlock_irq(&blkcg->lock);
                        cpu_relax();
                        spin_lock_irq(&blkcg->lock);
                }
        }

        spin_unlock_irq(&blkcg->lock);
        return 0;
}

static void blkiocg_destroy(struct cgroup *cgroup)
{
        struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);

        if (blkcg != &blkio_root_cgroup)
                kfree(blkcg);
}

static struct cgroup_subsys_state *blkiocg_create(struct cgroup *cgroup)
{
        static atomic64_t id_seq = ATOMIC64_INIT(0);
        struct blkio_cgroup *blkcg;
        struct cgroup *parent = cgroup->parent;

        if (!parent) {
                blkcg = &blkio_root_cgroup;
                goto done;
        }

        blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
        if (!blkcg)
                return ERR_PTR(-ENOMEM);

        blkcg->weight = BLKIO_WEIGHT_DEFAULT;
        blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */
done:
        spin_lock_init(&blkcg->lock);
        INIT_HLIST_HEAD(&blkcg->blkg_list);

        return &blkcg->css;
}

/**
 * blkcg_init_queue - initialize blkcg part of request queue
 * @q: request_queue to initialize
 *
 * Called from blk_alloc_queue_node(). Responsible for initializing blkcg
 * part of new request_queue @q.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int blkcg_init_queue(struct request_queue *q)
{
        int ret;

        might_sleep();

        ret = blk_throtl_init(q);
        if (ret)
                return ret;

        mutex_lock(&all_q_mutex);
        INIT_LIST_HEAD(&q->all_q_node);
        list_add_tail(&q->all_q_node, &all_q_list);
        mutex_unlock(&all_q_mutex);

        return 0;
}

/**
 * blkcg_drain_queue - drain blkcg part of request_queue
 * @q: request_queue to drain
 *
 * Called from blk_drain_queue(). Responsible for draining blkcg part.
 */
void blkcg_drain_queue(struct request_queue *q)
{
        lockdep_assert_held(q->queue_lock);

        blk_throtl_drain(q);
}

/**
 * blkcg_exit_queue - exit and release blkcg part of request_queue
 * @q: request_queue being released
 *
 * Called from blk_release_queue(). Responsible for exiting blkcg part.
 */
void blkcg_exit_queue(struct request_queue *q)
{
        mutex_lock(&all_q_mutex);
        list_del_init(&q->all_q_node);
        mutex_unlock(&all_q_mutex);

        blkg_destroy_all(q, true);

        blk_throtl_exit(q);
}

/*
 * We cannot support shared io contexts, as we have no means to support
 * two tasks with the same ioc in two different groups without major rework
 * of the main cic data structures. For now we allow a task to change
 * its cgroup only if it's the only owner of its ioc.
 */
static int blkiocg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
{
        struct task_struct *task;
        struct io_context *ioc;
        int ret = 0;

        /* task_lock() is needed to avoid races with exit_io_context() */
        cgroup_taskset_for_each(task, cgrp, tset) {
                task_lock(task);
                ioc = task->io_context;
                if (ioc && atomic_read(&ioc->nr_tasks) > 1)
                        ret = -EINVAL;
                task_unlock(task);
                if (ret)
                        break;
        }
        return ret;
}

static void blkcg_bypass_start(void)
        __acquires(&all_q_mutex)
{
        struct request_queue *q;

        mutex_lock(&all_q_mutex);

        list_for_each_entry(q, &all_q_list, all_q_node) {
                blk_queue_bypass_start(q);
                blkg_destroy_all(q, false);
        }
}

static void blkcg_bypass_end(void)
        __releases(&all_q_mutex)
{
        struct request_queue *q;

        list_for_each_entry(q, &all_q_list, all_q_node)
                blk_queue_bypass_end(q);

        mutex_unlock(&all_q_mutex);
}

struct cgroup_subsys blkio_subsys = {
        .name = "blkio",
        .create = blkiocg_create,
        .can_attach = blkiocg_can_attach,
        .pre_destroy = blkiocg_pre_destroy,
        .destroy = blkiocg_destroy,
        .subsys_id = blkio_subsys_id,
        .base_cftypes = blkio_files,
        .module = THIS_MODULE,
};
EXPORT_SYMBOL_GPL(blkio_subsys);

void blkio_policy_register(struct blkio_policy_type *blkiop)
{
        struct request_queue *q;

        blkcg_bypass_start();
        spin_lock(&blkio_list_lock);

        BUG_ON(blkio_policy[blkiop->plid]);
        blkio_policy[blkiop->plid] = blkiop;
        list_add_tail(&blkiop->list, &blkio_list);

        spin_unlock(&blkio_list_lock);
        list_for_each_entry(q, &all_q_list, all_q_node)
                update_root_blkg_pd(q, blkiop->plid);
        blkcg_bypass_end();

        if (blkiop->cftypes)
                WARN_ON(cgroup_add_cftypes(&blkio_subsys, blkiop->cftypes));
}
EXPORT_SYMBOL_GPL(blkio_policy_register);

void blkio_policy_unregister(struct blkio_policy_type *blkiop)
{
        struct request_queue *q;

        if (blkiop->cftypes)
                cgroup_rm_cftypes(&blkio_subsys, blkiop->cftypes);

        blkcg_bypass_start();
        spin_lock(&blkio_list_lock);

        BUG_ON(blkio_policy[blkiop->plid] != blkiop);
        blkio_policy[blkiop->plid] = NULL;
        list_del_init(&blkiop->list);

        spin_unlock(&blkio_list_lock);
        list_for_each_entry(q, &all_q_list, all_q_node)
                update_root_blkg_pd(q, blkiop->plid);
        blkcg_bypass_end();
}
EXPORT_SYMBOL_GPL(blkio_policy_unregister);
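/*
 * Illustrative sketch (not part of the original file): the minimal shape
 * of a policy hooking into this interface.  Only fields consumed above
 * are shown (ops.blkio_init_group_fn, plid, pdata_size); a policy with
 * per-cgroup files would also set .cftypes, which registration adds via
 * cgroup_add_cftypes().  The names, the empty init callback and the
 * reuse of BLKIO_POLICY_PROP are hypothetical; struct example_pdata is
 * from the sketch further up.
 */
static void example_init_group(struct blkio_group *blkg)
{
        /* blkg->pd[plid]->pdata is zeroed and ready for the policy here */
}

static struct blkio_policy_type example_policy = {
        .ops = {
                .blkio_init_group_fn    = example_init_group,
        },
        .plid           = BLKIO_POLICY_PROP,
        .pdata_size     = sizeof(struct example_pdata),
};

/* wired up via module_init()/module_exit() in the policy's own module */
static int __init example_policy_init(void)
{
        /* makes the policy visible to blkg_alloc()/update_root_blkg_pd() */
        blkio_policy_register(&example_policy);
        return 0;
}

static void __exit example_policy_exit(void)
{
        blkio_policy_unregister(&example_policy);
}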