blk-cgroup.c revision 1adaf3dde37a8b9b59ea59c5f58fed7761178383
/*
 * Common Block IO controller cgroup interface
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *		      Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 * 	              Nauman Rafique <nauman@google.com>
 */
#include <linux/ioprio.h>
#include <linux/seq_file.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/genhd.h>
#include <linux/delay.h>
#include "blk-cgroup.h"
#include "blk.h"

#define MAX_KEY_LEN 100

static DEFINE_SPINLOCK(blkio_list_lock);
static LIST_HEAD(blkio_list);

static DEFINE_MUTEX(all_q_mutex);
static LIST_HEAD(all_q_list);

struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
EXPORT_SYMBOL_GPL(blkio_root_cgroup);

static struct blkio_policy_type *blkio_policy[BLKIO_NR_POLICIES];

static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
						  struct cgroup *);
static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *,
			      struct cgroup_taskset *);
static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *,
			   struct cgroup_taskset *);
static int blkiocg_pre_destroy(struct cgroup_subsys *, struct cgroup *);
static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *);
static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);

/* for encoding cft->private value on file */
#define BLKIOFILE_PRIVATE(x, val)	(((x) << 16) | (val))
/* What policy owns the file, proportional or throttle */
#define BLKIOFILE_POLICY(val)		(((val) >> 16) & 0xffff)
#define BLKIOFILE_ATTR(val)		((val) & 0xffff)

struct cgroup_subsys blkio_subsys = {
	.name = "blkio",
	.create = blkiocg_create,
	.can_attach = blkiocg_can_attach,
	.attach = blkiocg_attach,
	.pre_destroy = blkiocg_pre_destroy,
	.destroy = blkiocg_destroy,
	.populate = blkiocg_populate,
	.subsys_id = blkio_subsys_id,
	.module = THIS_MODULE,
};
EXPORT_SYMBOL_GPL(blkio_subsys);

struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
{
	return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
			    struct blkio_cgroup, css);
}
EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);

struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk)
{
	return container_of(task_subsys_state(tsk, blkio_subsys_id),
			    struct blkio_cgroup, css);
}
EXPORT_SYMBOL_GPL(task_blkio_cgroup);

static inline void
blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight)
{
	struct blkio_policy_type *blkiop;

	list_for_each_entry(blkiop, &blkio_list, list) {
		/* If this policy does not own the blkg, do not send updates */
		if (blkiop->plid != blkg->plid)
			continue;
		if (blkiop->ops.blkio_update_group_weight_fn)
			blkiop->ops.blkio_update_group_weight_fn(blkg->q,
								 blkg, weight);
	}
}

static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps,
					  int fileid)
{
	struct blkio_policy_type *blkiop;

	list_for_each_entry(blkiop, &blkio_list, list) {

		/* If this policy does not own the blkg, do not send updates */
		if (blkiop->plid != blkg->plid)
			continue;

		if (fileid == BLKIO_THROTL_read_bps_device
		    && blkiop->ops.blkio_update_group_read_bps_fn)
			blkiop->ops.blkio_update_group_read_bps_fn(blkg->q,
								   blkg, bps);

		if (fileid == BLKIO_THROTL_write_bps_device
		    && blkiop->ops.blkio_update_group_write_bps_fn)
			blkiop->ops.blkio_update_group_write_bps_fn(blkg->q,
								    blkg, bps);
	}
}

static inline void blkio_update_group_iops(struct blkio_group *blkg,
					   unsigned int iops, int fileid)
{
	struct blkio_policy_type *blkiop;

	list_for_each_entry(blkiop, &blkio_list, list) {

		/* If this policy does not own the blkg, do not send updates */
		if (blkiop->plid != blkg->plid)
			continue;

		if (fileid == BLKIO_THROTL_read_iops_device
		    && blkiop->ops.blkio_update_group_read_iops_fn)
			blkiop->ops.blkio_update_group_read_iops_fn(blkg->q,
								    blkg, iops);

		if (fileid == BLKIO_THROTL_write_iops_device
		    && blkiop->ops.blkio_update_group_write_iops_fn)
			blkiop->ops.blkio_update_group_write_iops_fn(blkg->q,
								     blkg, iops);
	}
}

/*
 * Add to the appropriate stat variable depending on the request type.
 * This should be called with the blkg->stats_lock held.
 */
static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction,
			   bool sync)
{
	if (direction)
		stat[BLKIO_STAT_WRITE] += add;
	else
		stat[BLKIO_STAT_READ] += add;
	if (sync)
		stat[BLKIO_STAT_SYNC] += add;
	else
		stat[BLKIO_STAT_ASYNC] += add;
}

/*
 * Decrements the appropriate stat variable if non-zero depending on the
 * request type. Panics on value being zero.
 * This should be called with the blkg->stats_lock held.
 */
static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync)
{
	if (direction) {
		BUG_ON(stat[BLKIO_STAT_WRITE] == 0);
		stat[BLKIO_STAT_WRITE]--;
	} else {
		BUG_ON(stat[BLKIO_STAT_READ] == 0);
		stat[BLKIO_STAT_READ]--;
	}
	if (sync) {
		BUG_ON(stat[BLKIO_STAT_SYNC] == 0);
		stat[BLKIO_STAT_SYNC]--;
	} else {
		BUG_ON(stat[BLKIO_STAT_ASYNC] == 0);
		stat[BLKIO_STAT_ASYNC]--;
	}
}

#ifdef CONFIG_DEBUG_BLK_CGROUP
/* This should be called with the blkg->stats_lock held. */
static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
					    struct blkio_group *curr_blkg)
{
	if (blkio_blkg_waiting(&blkg->stats))
		return;
	if (blkg == curr_blkg)
		return;
	blkg->stats.start_group_wait_time = sched_clock();
	blkio_mark_blkg_waiting(&blkg->stats);
}

/* This should be called with the blkg->stats_lock held. */
static void blkio_update_group_wait_time(struct blkio_group_stats *stats)
{
	unsigned long long now;

	if (!blkio_blkg_waiting(stats))
		return;

	now = sched_clock();
	if (time_after64(now, stats->start_group_wait_time))
		stats->group_wait_time += now - stats->start_group_wait_time;
	blkio_clear_blkg_waiting(stats);
}

/* This should be called with the blkg->stats_lock held. */
static void blkio_end_empty_time(struct blkio_group_stats *stats)
{
	unsigned long long now;

	if (!blkio_blkg_empty(stats))
		return;

	now = sched_clock();
	if (time_after64(now, stats->start_empty_time))
		stats->empty_time += now - stats->start_empty_time;
	blkio_clear_blkg_empty(stats);
}

void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
{
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	BUG_ON(blkio_blkg_idling(&blkg->stats));
	blkg->stats.start_idle_time = sched_clock();
	blkio_mark_blkg_idling(&blkg->stats);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats);

void blkiocg_update_idle_time_stats(struct blkio_group *blkg)
{
	unsigned long flags;
	unsigned long long now;
	struct blkio_group_stats *stats;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	stats = &blkg->stats;
	if (blkio_blkg_idling(stats)) {
		now = sched_clock();
		if (time_after64(now, stats->start_idle_time))
			stats->idle_time += now - stats->start_idle_time;
		blkio_clear_blkg_idling(stats);
	}
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats);

void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg)
{
	unsigned long flags;
	struct blkio_group_stats *stats;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	stats = &blkg->stats;
	stats->avg_queue_size_sum +=
			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] +
			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE];
	stats->avg_queue_size_samples++;
	blkio_update_group_wait_time(stats);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats);

void blkiocg_set_start_empty_time(struct blkio_group *blkg)
{
	unsigned long flags;
	struct blkio_group_stats *stats;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	stats = &blkg->stats;

	if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] ||
			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) {
		spin_unlock_irqrestore(&blkg->stats_lock, flags);
		return;
	}

	/*
	 * group is already marked empty. This can happen if cfqq got new
	 * request in parent group and moved to this group while being added
	 * to service tree. Just ignore the event and move on.
	 */
	if (blkio_blkg_empty(stats)) {
		spin_unlock_irqrestore(&blkg->stats_lock, flags);
		return;
	}

	stats->start_empty_time = sched_clock();
	blkio_mark_blkg_empty(stats);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time);

void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
				  unsigned long dequeue)
{
	blkg->stats.dequeue += dequeue;
}
EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats);
#else
static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg,
					struct blkio_group *curr_blkg) {}
static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {}
#endif

void blkiocg_update_io_add_stats(struct blkio_group *blkg,
				 struct blkio_group *curr_blkg, bool direction,
				 bool sync)
{
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction,
			sync);
	blkio_end_empty_time(&blkg->stats);
	blkio_set_start_group_wait_time(blkg, curr_blkg);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats);

void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
				    bool direction, bool sync)
{
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	blkio_check_and_dec_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED],
				 direction, sync);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);

void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time,
				   unsigned long unaccounted_time)
{
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	blkg->stats.time += time;
#ifdef CONFIG_DEBUG_BLK_CGROUP
	blkg->stats.unaccounted_time += unaccounted_time;
#endif
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);

/*
 * should be called under rcu read lock or queue lock to make sure blkg pointer
 * is valid.
 */
void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
				   uint64_t bytes, bool direction, bool sync)
{
	struct blkio_group_stats_cpu *stats_cpu;
	unsigned long flags;

	/*
	 * Disabling interrupts to provide mutual exclusion between two
	 * writes on same cpu. It probably is not needed for 64bit. Not
	 * optimizing that case yet.
	 */
	local_irq_save(flags);

	stats_cpu = this_cpu_ptr(blkg->stats_cpu);

	u64_stats_update_begin(&stats_cpu->syncp);
	stats_cpu->sectors += bytes >> 9;
	blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICED],
			1, direction, sync);
	blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICE_BYTES],
			bytes, direction, sync);
	u64_stats_update_end(&stats_cpu->syncp);
	local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);

void blkiocg_update_completion_stats(struct blkio_group *blkg,
	uint64_t start_time, uint64_t io_start_time, bool direction, bool sync)
{
	struct blkio_group_stats *stats;
	unsigned long flags;
	unsigned long long now = sched_clock();

	spin_lock_irqsave(&blkg->stats_lock, flags);
	stats = &blkg->stats;
	if (time_after64(now, io_start_time))
		blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME],
				now - io_start_time, direction, sync);
	if (time_after64(io_start_time, start_time))
		blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME],
				io_start_time - start_time, direction, sync);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);

/* Merged stats are per cpu. */
void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
				    bool sync)
{
	struct blkio_group_stats_cpu *stats_cpu;
	unsigned long flags;

	/*
	 * Disabling interrupts to provide mutual exclusion between two
	 * writes on same cpu. It probably is not needed for 64bit. Not
	 * optimizing that case yet.
	 */
	local_irq_save(flags);

	stats_cpu = this_cpu_ptr(blkg->stats_cpu);

	u64_stats_update_begin(&stats_cpu->syncp);
	blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_MERGED], 1,
			direction, sync);
	u64_stats_update_end(&stats_cpu->syncp);
	local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);

/**
 * blkg_free - free a blkg
 * @blkg: blkg to free
 *
 * Free @blkg which may be partially allocated.
 */
static void blkg_free(struct blkio_group *blkg)
{
	if (blkg) {
		free_percpu(blkg->stats_cpu);
		kfree(blkg->pd);
		kfree(blkg);
	}
}

/**
 * blkg_alloc - allocate a blkg
 * @blkcg: block cgroup the new blkg is associated with
 * @q: request_queue the new blkg is associated with
 * @pol: policy the new blkg is associated with
 *
 * Allocate a new blkg associating @blkcg and @q for @pol.
 *
 * FIXME: Should be called with queue locked but currently isn't due to
 * percpu stat breakage.
 */
static struct blkio_group *blkg_alloc(struct blkio_cgroup *blkcg,
				      struct request_queue *q,
				      struct blkio_policy_type *pol)
{
	struct blkio_group *blkg;

	/* alloc and init base part */
	blkg = kzalloc_node(sizeof(*blkg), GFP_ATOMIC, q->node);
	if (!blkg)
		return NULL;

	spin_lock_init(&blkg->stats_lock);
	rcu_assign_pointer(blkg->q, q);
	blkg->blkcg = blkcg;
	blkg->plid = pol->plid;
	blkg->refcnt = 1;
	cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));

	/* alloc per-policy data */
	blkg->pd = kzalloc_node(sizeof(*blkg->pd) + pol->pdata_size, GFP_ATOMIC,
				q->node);
	if (!blkg->pd) {
		blkg_free(blkg);
		return NULL;
	}

	/* broken, read comment in the callsite */
	blkg->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu);
	if (!blkg->stats_cpu) {
		blkg_free(blkg);
		return NULL;
	}

	/* attach pd to blkg and invoke per-policy init */
	blkg->pd->blkg = blkg;
	pol->ops.blkio_init_group_fn(blkg);
	return blkg;
}

struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
				       struct request_queue *q,
				       enum blkio_policy_id plid,
				       bool for_root)
	__releases(q->queue_lock) __acquires(q->queue_lock)
{
	struct blkio_policy_type *pol = blkio_policy[plid];
	struct blkio_group *blkg, *new_blkg;

	WARN_ON_ONCE(!rcu_read_lock_held());
	lockdep_assert_held(q->queue_lock);

	/*
	 * This could be the first entry point of blkcg implementation and
	 * we shouldn't allow anything to go through for a bypassing queue.
	 * The following can be removed if blkg lookup is guaranteed to
	 * fail on a bypassing queue.
	 */
	if (unlikely(blk_queue_bypass(q)) && !for_root)
		return ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY);

	blkg = blkg_lookup(blkcg, q, plid);
	if (blkg)
		return blkg;

	/* blkg holds a reference to blkcg */
	if (!css_tryget(&blkcg->css))
		return ERR_PTR(-EINVAL);

	/*
	 * Allocate and initialize.
	 *
	 * FIXME: The following is broken. Percpu memory allocation
	 * requires %GFP_KERNEL context and can't be performed from IO
	 * path. Allocation here should inherently be atomic and the
	 * following lock dancing can be removed once the broken percpu
	 * allocation is fixed.
	 */
	spin_unlock_irq(q->queue_lock);
	rcu_read_unlock();

	new_blkg = blkg_alloc(blkcg, q, pol);

	rcu_read_lock();
	spin_lock_irq(q->queue_lock);

	/* did bypass get turned on in between? */
	if (unlikely(blk_queue_bypass(q)) && !for_root) {
		blkg = ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY);
		goto out;
	}

	/* did someone beat us to it? */
	blkg = blkg_lookup(blkcg, q, plid);
	if (unlikely(blkg))
		goto out;

	/* did alloc fail? */
	if (unlikely(!new_blkg)) {
		blkg = ERR_PTR(-ENOMEM);
		goto out;
	}

	/* insert */
	spin_lock(&blkcg->lock);
	swap(blkg, new_blkg);
	hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
	pol->ops.blkio_link_group_fn(q, blkg);
	spin_unlock(&blkcg->lock);
out:
	blkg_free(new_blkg);
	return blkg;
}
EXPORT_SYMBOL_GPL(blkg_lookup_create);

static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
{
	hlist_del_init_rcu(&blkg->blkcg_node);
}

/*
 * returns 0 if blkio_group was still on cgroup list. Otherwise returns 1
 * indicating that blk_group was unhashed by the time we got to it.
 */
int blkiocg_del_blkio_group(struct blkio_group *blkg)
{
	struct blkio_cgroup *blkcg = blkg->blkcg;
	unsigned long flags;
	int ret = 1;

	spin_lock_irqsave(&blkcg->lock, flags);
	if (!hlist_unhashed(&blkg->blkcg_node)) {
		__blkiocg_del_blkio_group(blkg);
		ret = 0;
	}
	spin_unlock_irqrestore(&blkcg->lock, flags);

	return ret;
}
EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group);

/* called under rcu_read_lock(). */
struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
				struct request_queue *q,
				enum blkio_policy_id plid)
{
	struct blkio_group *blkg;
	struct hlist_node *n;

	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node)
		if (blkg->q == q && blkg->plid == plid)
			return blkg;
	return NULL;
}
EXPORT_SYMBOL_GPL(blkg_lookup);

void blkg_destroy_all(struct request_queue *q)
{
	struct blkio_policy_type *pol;

	while (true) {
		bool done = true;

		spin_lock(&blkio_list_lock);
		spin_lock_irq(q->queue_lock);

		/*
		 * clear_queue_fn() might return with non-empty group list
		 * if it raced cgroup removal and lost. cgroup removal is
		 * guaranteed to make forward progress and retrying after a
		 * while is enough. This ugliness is scheduled to be
		 * removed after locking update.
		 */
		list_for_each_entry(pol, &blkio_list, list)
			if (!pol->ops.blkio_clear_queue_fn(q))
				done = false;

		spin_unlock_irq(q->queue_lock);
		spin_unlock(&blkio_list_lock);

		if (done)
			break;

		msleep(10);	/* just some random duration I like */
	}
}

static void blkg_rcu_free(struct rcu_head *rcu_head)
{
	blkg_free(container_of(rcu_head, struct blkio_group, rcu_head));
}

void __blkg_release(struct blkio_group *blkg)
{
	/* release the extra blkcg reference this blkg has been holding */
	css_put(&blkg->blkcg->css);

	/*
	 * A group is freed in rcu manner. But having an rcu lock does not
	 * mean that one can access all the fields of blkg and assume these
	 * are valid. For example, don't try to follow throtl_data and
	 * request queue links.
	 *
	 * Having a reference to blkg under an rcu allows access to only
	 * values local to groups like group stats and group rate limits.
	 */
	call_rcu(&blkg->rcu_head, blkg_rcu_free);
}
EXPORT_SYMBOL_GPL(__blkg_release);

static void blkio_reset_stats_cpu(struct blkio_group *blkg)
{
	struct blkio_group_stats_cpu *stats_cpu;
	int i, j, k;
	/*
	 * Note: On 64 bit arch this should not be an issue. This has the
	 * possibility of returning some inconsistent value on 32bit arch
	 * as 64bit update on 32bit is non atomic. Taking care of this
	 * corner case makes code very complicated, like sending IPIs to
	 * cpus, taking care of stats of offline cpus etc.
	 *
	 * reset stats is anyway more of a debug feature and this sounds a
	 * corner case. So I am not complicating the code yet until and
	 * unless this becomes a real issue.
	 */
	for_each_possible_cpu(i) {
		stats_cpu = per_cpu_ptr(blkg->stats_cpu, i);
		stats_cpu->sectors = 0;
		for (j = 0; j < BLKIO_STAT_CPU_NR; j++)
			for (k = 0; k < BLKIO_STAT_TOTAL; k++)
				stats_cpu->stat_arr_cpu[j][k] = 0;
	}
}

static int
blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
{
	struct blkio_cgroup *blkcg;
	struct blkio_group *blkg;
	struct blkio_group_stats *stats;
	struct hlist_node *n;
	uint64_t queued[BLKIO_STAT_TOTAL];
	int i;
#ifdef CONFIG_DEBUG_BLK_CGROUP
	bool idling, waiting, empty;
	unsigned long long now = sched_clock();
#endif

	blkcg = cgroup_to_blkio_cgroup(cgroup);
	spin_lock_irq(&blkcg->lock);
	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
		spin_lock(&blkg->stats_lock);
		stats = &blkg->stats;
#ifdef CONFIG_DEBUG_BLK_CGROUP
		idling = blkio_blkg_idling(stats);
		waiting = blkio_blkg_waiting(stats);
		empty = blkio_blkg_empty(stats);
#endif
		for (i = 0; i < BLKIO_STAT_TOTAL; i++)
			queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i];
		memset(stats, 0, sizeof(struct blkio_group_stats));
		for (i = 0; i < BLKIO_STAT_TOTAL; i++)
			stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i];
#ifdef CONFIG_DEBUG_BLK_CGROUP
		if (idling) {
			blkio_mark_blkg_idling(stats);
			stats->start_idle_time = now;
		}
		if (waiting) {
			blkio_mark_blkg_waiting(stats);
			stats->start_group_wait_time = now;
		}
		if (empty) {
			blkio_mark_blkg_empty(stats);
			stats->start_empty_time = now;
		}
#endif
		spin_unlock(&blkg->stats_lock);

		/* Reset Per cpu stats which don't take blkg->stats_lock */
		blkio_reset_stats_cpu(blkg);
	}

	spin_unlock_irq(&blkcg->lock);
	return 0;
}

static void blkio_get_key_name(enum stat_sub_type type, const char *dname,
			       char *str, int chars_left, bool diskname_only)
{
	snprintf(str, chars_left, "%s", dname);
	chars_left -= strlen(str);
	if (chars_left <= 0) {
		printk(KERN_WARNING
			"Possibly incorrect cgroup stat display format");
		return;
	}
	if (diskname_only)
		return;
	switch (type) {
	case BLKIO_STAT_READ:
		strlcat(str, " Read", chars_left);
		break;
	case BLKIO_STAT_WRITE:
		strlcat(str, " Write", chars_left);
		break;
	case BLKIO_STAT_SYNC:
		strlcat(str, " Sync", chars_left);
		break;
	case BLKIO_STAT_ASYNC:
		strlcat(str, " Async", chars_left);
		break;
	case BLKIO_STAT_TOTAL:
		strlcat(str, " Total", chars_left);
		break;
	default:
		strlcat(str, " Invalid", chars_left);
	}
}

static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val,
				struct cgroup_map_cb *cb, const char *dname)
{
	blkio_get_key_name(0, dname, str, chars_left, true);
	cb->fill(cb, str, val);
	return val;
}

static uint64_t blkio_read_stat_cpu(struct blkio_group *blkg,
			enum stat_type_cpu type, enum stat_sub_type sub_type)
{
	int cpu;
	struct blkio_group_stats_cpu *stats_cpu;
	u64 val = 0, tval;

	for_each_possible_cpu(cpu) {
		unsigned int start;
		stats_cpu = per_cpu_ptr(blkg->stats_cpu, cpu);

		do {
			start = u64_stats_fetch_begin(&stats_cpu->syncp);
			if (type == BLKIO_STAT_CPU_SECTORS)
				tval = stats_cpu->sectors;
			else
				tval = stats_cpu->stat_arr_cpu[type][sub_type];
		} while (u64_stats_fetch_retry(&stats_cpu->syncp, start));

		val += tval;
	}

	return val;
}

static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg,
				   struct cgroup_map_cb *cb, const char *dname,
				   enum stat_type_cpu type)
{
	uint64_t disk_total, val;
	char key_str[MAX_KEY_LEN];
	enum stat_sub_type sub_type;

	if (type == BLKIO_STAT_CPU_SECTORS) {
		val = blkio_read_stat_cpu(blkg, type, 0);
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, val, cb,
				       dname);
	}

	for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
	     sub_type++) {
		blkio_get_key_name(sub_type, dname, key_str, MAX_KEY_LEN,
				   false);
		val = blkio_read_stat_cpu(blkg, type, sub_type);
		cb->fill(cb, key_str, val);
	}

	disk_total = blkio_read_stat_cpu(blkg, type, BLKIO_STAT_READ) +
		     blkio_read_stat_cpu(blkg, type, BLKIO_STAT_WRITE);

	blkio_get_key_name(BLKIO_STAT_TOTAL, dname, key_str, MAX_KEY_LEN,
			   false);
	cb->fill(cb, key_str, disk_total);
	return disk_total;
}

/* This should be called with blkg->stats_lock held */
static uint64_t blkio_get_stat(struct blkio_group *blkg,
			       struct cgroup_map_cb *cb, const char *dname,
			       enum stat_type type)
{
	uint64_t disk_total;
	char key_str[MAX_KEY_LEN];
	enum stat_sub_type sub_type;

	if (type == BLKIO_STAT_TIME)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					blkg->stats.time, cb, dname);
#ifdef CONFIG_DEBUG_BLK_CGROUP
	if (type == BLKIO_STAT_UNACCOUNTED_TIME)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					blkg->stats.unaccounted_time, cb, dname);
	if (type == BLKIO_STAT_AVG_QUEUE_SIZE) {
		uint64_t sum = blkg->stats.avg_queue_size_sum;
		uint64_t samples = blkg->stats.avg_queue_size_samples;
		if (samples)
			do_div(sum, samples);
		else
			sum = 0;
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					sum, cb, dname);
	}
	if (type == BLKIO_STAT_GROUP_WAIT_TIME)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					blkg->stats.group_wait_time, cb, dname);
	if (type == BLKIO_STAT_IDLE_TIME)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					blkg->stats.idle_time, cb, dname);
	if (type == BLKIO_STAT_EMPTY_TIME)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					blkg->stats.empty_time, cb, dname);
	if (type == BLKIO_STAT_DEQUEUE)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					blkg->stats.dequeue, cb, dname);
#endif

	for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
	     sub_type++) {
		blkio_get_key_name(sub_type, dname, key_str, MAX_KEY_LEN,
				   false);
		cb->fill(cb, key_str, blkg->stats.stat_arr[type][sub_type]);
	}
	disk_total = blkg->stats.stat_arr[type][BLKIO_STAT_READ] +
			blkg->stats.stat_arr[type][BLKIO_STAT_WRITE];
	blkio_get_key_name(BLKIO_STAT_TOTAL, dname, key_str, MAX_KEY_LEN,
			   false);
	cb->fill(cb, key_str, disk_total);
	return disk_total;
}

static int blkio_policy_parse_and_set(char *buf, enum blkio_policy_id plid,
				      int fileid, struct blkio_cgroup *blkcg)
{
	struct gendisk *disk = NULL;
	struct blkio_group *blkg = NULL;
	char *s[4], *p, *major_s = NULL, *minor_s = NULL;
	unsigned long major, minor;
	int i = 0, ret = -EINVAL;
	int part;
	dev_t dev;
	u64 temp;

	memset(s, 0, sizeof(s));

	while ((p = strsep(&buf, " ")) != NULL) {
		if (!*p)
			continue;

		s[i++] = p;

		/* Prevent inputting too many things */
		if (i == 3)
			break;
	}

	if (i != 2)
		goto out;

	p = strsep(&s[0], ":");
	if (p != NULL)
		major_s = p;
	else
		goto out;

	minor_s = s[0];
	if (!minor_s)
		goto out;

	if (strict_strtoul(major_s, 10, &major))
		goto out;

	if (strict_strtoul(minor_s, 10, &minor))
		goto out;

	dev = MKDEV(major, minor);

	if (strict_strtoull(s[1], 10, &temp))
		goto out;

	disk = get_gendisk(dev, &part);
	if (!disk || part)
		goto out;

	rcu_read_lock();

	spin_lock_irq(disk->queue->queue_lock);
	blkg = blkg_lookup_create(blkcg, disk->queue, plid, false);
	spin_unlock_irq(disk->queue->queue_lock);

	if (IS_ERR(blkg)) {
		ret = PTR_ERR(blkg);
		goto out_unlock;
	}

	switch (plid) {
	case BLKIO_POLICY_PROP:
		if ((temp < BLKIO_WEIGHT_MIN && temp > 0) ||
		     temp > BLKIO_WEIGHT_MAX)
			goto out_unlock;

		blkg->conf.weight = temp;
		blkio_update_group_weight(blkg, temp ?: blkcg->weight);
		break;
	case BLKIO_POLICY_THROTL:
		switch (fileid) {
		case BLKIO_THROTL_read_bps_device:
			blkg->conf.bps[READ] = temp;
			blkio_update_group_bps(blkg, temp ?: -1, fileid);
			break;
		case BLKIO_THROTL_write_bps_device:
			blkg->conf.bps[WRITE] = temp;
			blkio_update_group_bps(blkg, temp ?: -1, fileid);
			break;
		case BLKIO_THROTL_read_iops_device:
			if (temp > THROTL_IOPS_MAX)
				goto out_unlock;
			blkg->conf.iops[READ] = temp;
			blkio_update_group_iops(blkg, temp ?: -1, fileid);
			break;
		case BLKIO_THROTL_write_iops_device:
			if (temp > THROTL_IOPS_MAX)
				goto out_unlock;
			blkg->conf.iops[WRITE] = temp;
			blkio_update_group_iops(blkg, temp ?: -1, fileid);
			break;
		}
		break;
	default:
		BUG();
	}
	ret = 0;
out_unlock:
	rcu_read_unlock();
out:
	put_disk(disk);

	/*
	 * If queue was bypassing, we should retry.  Do so after a short
	 * msleep().  It isn't strictly necessary but queue can be
	 * bypassing for some time and it's always nice to avoid busy
	 * looping.
	 */
	if (ret == -EBUSY) {
		msleep(10);
		return restart_syscall();
	}
	return ret;
}

static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft,
			      const char *buffer)
{
	int ret = 0;
	char *buf;
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
	int fileid = BLKIOFILE_ATTR(cft->private);

	buf = kstrdup(buffer, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	ret = blkio_policy_parse_and_set(buf, plid, fileid, blkcg);
	kfree(buf);
	return ret;
}

static const char *blkg_dev_name(struct blkio_group *blkg)
{
	/* some drivers (floppy) instantiate a queue w/o disk registered */
	if (blkg->q->backing_dev_info.dev)
		return dev_name(blkg->q->backing_dev_info.dev);
	return NULL;
}

static void blkio_print_group_conf(struct cftype *cft, struct blkio_group *blkg,
				   struct seq_file *m)
{
	const char *dname = blkg_dev_name(blkg);
	int fileid = BLKIOFILE_ATTR(cft->private);
	int rw = WRITE;

	if (!dname)
		return;

	switch (blkg->plid) {
	case BLKIO_POLICY_PROP:
		if (blkg->conf.weight)
			seq_printf(m, "%s\t%u\n",
				   dname, blkg->conf.weight);
		break;
	case BLKIO_POLICY_THROTL:
		switch (fileid) {
		case BLKIO_THROTL_read_bps_device:
			rw = READ;
			/* fall through */
		case BLKIO_THROTL_write_bps_device:
			if (blkg->conf.bps[rw])
				seq_printf(m, "%s\t%llu\n",
					   dname, blkg->conf.bps[rw]);
			break;
		case BLKIO_THROTL_read_iops_device:
			rw = READ;
			/* fall through */
		case BLKIO_THROTL_write_iops_device:
			if (blkg->conf.iops[rw])
				seq_printf(m, "%s\t%u\n",
					   dname, blkg->conf.iops[rw]);
			break;
		}
		break;
	default:
		BUG();
	}
}

/* cgroup files which read their data from policy nodes end up here */
static void blkio_read_conf(struct cftype *cft, struct blkio_cgroup *blkcg,
			    struct seq_file *m)
{
	struct blkio_group *blkg;
	struct hlist_node *n;

	spin_lock_irq(&blkcg->lock);
	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node)
		if (BLKIOFILE_POLICY(cft->private) == blkg->plid)
			blkio_print_group_conf(cft, blkg, m);
	spin_unlock_irq(&blkcg->lock);
}

static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft,
			     struct seq_file *m)
{
	struct blkio_cgroup *blkcg;
	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
	int name = BLKIOFILE_ATTR(cft->private);

	blkcg = cgroup_to_blkio_cgroup(cgrp);

	switch (plid) {
	case BLKIO_POLICY_PROP:
		switch (name) {
		case BLKIO_PROP_weight_device:
			blkio_read_conf(cft, blkcg, m);
			return 0;
		default:
			BUG();
		}
		break;
	case BLKIO_POLICY_THROTL:
		switch (name) {
		case BLKIO_THROTL_read_bps_device:
		case BLKIO_THROTL_write_bps_device:
		case BLKIO_THROTL_read_iops_device:
		case BLKIO_THROTL_write_iops_device:
			blkio_read_conf(cft, blkcg, m);
			return 0;
		default:
			BUG();
		}
		break;
	default:
		BUG();
	}

	return 0;
}

static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg,
				 struct cftype *cft, struct cgroup_map_cb *cb,
				 enum stat_type type, bool show_total, bool pcpu)
{
	struct blkio_group *blkg;
	struct hlist_node *n;
	uint64_t cgroup_total = 0;

	rcu_read_lock();
	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
		const char *dname = blkg_dev_name(blkg);

		if (!dname || BLKIOFILE_POLICY(cft->private) != blkg->plid)
			continue;
		if (pcpu)
			cgroup_total += blkio_get_stat_cpu(blkg, cb, dname,
							   type);
		else {
			spin_lock_irq(&blkg->stats_lock);
			cgroup_total += blkio_get_stat(blkg, cb, dname, type);
			spin_unlock_irq(&blkg->stats_lock);
		}
	}
	if (show_total)
		cb->fill(cb, "Total", cgroup_total);
	rcu_read_unlock();
	return 0;
}

/* All map kind of cgroup file get serviced by this function */
static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft,
				 struct cgroup_map_cb *cb)
{
	struct blkio_cgroup *blkcg;
	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
	int name = BLKIOFILE_ATTR(cft->private);

	blkcg = cgroup_to_blkio_cgroup(cgrp);

	switch (plid) {
	case BLKIO_POLICY_PROP:
		switch (name) {
		case BLKIO_PROP_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_TIME, 0, 0);
		case BLKIO_PROP_sectors:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_CPU_SECTORS, 0, 1);
		case BLKIO_PROP_io_service_bytes:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1);
		case BLKIO_PROP_io_serviced:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_CPU_SERVICED, 1, 1);
		case BLKIO_PROP_io_service_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_SERVICE_TIME, 1, 0);
		case BLKIO_PROP_io_wait_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_WAIT_TIME, 1, 0);
		case BLKIO_PROP_io_merged:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_CPU_MERGED, 1, 1);
		case BLKIO_PROP_io_queued:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_QUEUED, 1, 0);
#ifdef CONFIG_DEBUG_BLK_CGROUP
		case BLKIO_PROP_unaccounted_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_UNACCOUNTED_TIME, 0, 0);
		case BLKIO_PROP_dequeue:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_DEQUEUE, 0, 0);
		case BLKIO_PROP_avg_queue_size:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_AVG_QUEUE_SIZE, 0, 0);
		case BLKIO_PROP_group_wait_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_GROUP_WAIT_TIME, 0, 0);
		case BLKIO_PROP_idle_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_IDLE_TIME, 0, 0);
		case BLKIO_PROP_empty_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_EMPTY_TIME, 0, 0);
#endif
		default:
			BUG();
		}
		break;
	case BLKIO_POLICY_THROTL:
		switch (name) {
		case BLKIO_THROTL_io_service_bytes:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1);
		case BLKIO_THROTL_io_serviced:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_CPU_SERVICED, 1, 1);
		default:
			BUG();
		}
		break;
	default:
		BUG();
	}

	return 0;
}

static int blkio_weight_write(struct blkio_cgroup *blkcg, int plid, u64 val)
{
	struct blkio_group *blkg;
	struct hlist_node *n;

	if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
		return -EINVAL;

	spin_lock(&blkio_list_lock);
	spin_lock_irq(&blkcg->lock);
	blkcg->weight = (unsigned int)val;

	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node)
		if (blkg->plid == plid && !blkg->conf.weight)
			blkio_update_group_weight(blkg, blkcg->weight);

	spin_unlock_irq(&blkcg->lock);
	spin_unlock(&blkio_list_lock);
	return 0;
}

static u64 blkiocg_file_read_u64(struct cgroup *cgrp, struct cftype *cft)
{
	struct blkio_cgroup *blkcg;
	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
	int name = BLKIOFILE_ATTR(cft->private);

	blkcg = cgroup_to_blkio_cgroup(cgrp);

	switch (plid) {
	case BLKIO_POLICY_PROP:
		switch (name) {
		case BLKIO_PROP_weight:
			return (u64)blkcg->weight;
		}
		break;
	default:
		BUG();
	}
	return 0;
}

static int
blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
{
	struct blkio_cgroup *blkcg;
	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
	int name = BLKIOFILE_ATTR(cft->private);

	blkcg = cgroup_to_blkio_cgroup(cgrp);

	switch (plid) {
	case BLKIO_POLICY_PROP:
		switch (name) {
		case BLKIO_PROP_weight:
			return blkio_weight_write(blkcg, plid, val);
		}
		break;
	default:
		BUG();
	}

	return 0;
}

struct cftype blkio_files[] = {
	{
		.name = "weight_device",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_weight_device),
		.read_seq_string = blkiocg_file_read,
		.write_string = blkiocg_file_write,
		.max_write_len = 256,
	},
	{
		.name = "weight",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_weight),
		.read_u64 = blkiocg_file_read_u64,
		.write_u64 = blkiocg_file_write_u64,
	},
	{
		.name = "time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_time),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "sectors",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_sectors),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "io_service_bytes",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_io_service_bytes),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "io_serviced",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_io_serviced),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "io_service_time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_io_service_time),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "io_wait_time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_io_wait_time),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "io_merged",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_io_merged),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "io_queued",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_io_queued),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "reset_stats",
		.write_u64 = blkiocg_reset_stats,
	},
#ifdef CONFIG_BLK_DEV_THROTTLING
	{
		.name = "throttle.read_bps_device",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				BLKIO_THROTL_read_bps_device),
		.read_seq_string = blkiocg_file_read,
		.write_string = blkiocg_file_write,
		.max_write_len = 256,
	},

	{
		.name = "throttle.write_bps_device",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				BLKIO_THROTL_write_bps_device),
		.read_seq_string = blkiocg_file_read,
		.write_string = blkiocg_file_write,
		.max_write_len = 256,
	},

	{
		.name = "throttle.read_iops_device",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				BLKIO_THROTL_read_iops_device),
		.read_seq_string = blkiocg_file_read,
		.write_string = blkiocg_file_write,
		.max_write_len = 256,
	},

	{
		.name = "throttle.write_iops_device",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				BLKIO_THROTL_write_iops_device),
		.read_seq_string = blkiocg_file_read,
		.write_string = blkiocg_file_write,
		.max_write_len = 256,
	},
	{
		.name = "throttle.io_service_bytes",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				BLKIO_THROTL_io_service_bytes),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "throttle.io_serviced",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				BLKIO_THROTL_io_serviced),
		.read_map = blkiocg_file_read_map,
	},
#endif /* CONFIG_BLK_DEV_THROTTLING */

#ifdef CONFIG_DEBUG_BLK_CGROUP
	{
		.name = "avg_queue_size",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_avg_queue_size),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "group_wait_time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_group_wait_time),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "idle_time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_idle_time),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "empty_time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_empty_time),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "dequeue",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_dequeue),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "unaccounted_time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_unaccounted_time),
		.read_map = blkiocg_file_read_map,
	},
#endif
};

static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
	return cgroup_add_files(cgroup, subsys, blkio_files,
				ARRAY_SIZE(blkio_files));
}

static int blkiocg_pre_destroy(struct cgroup_subsys *subsys,
			       struct cgroup *cgroup)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
	unsigned long flags;
	struct blkio_group *blkg;
	struct request_queue *q;
	struct blkio_policy_type *blkiop;

	rcu_read_lock();

	do {
		spin_lock_irqsave(&blkcg->lock, flags);

		if (hlist_empty(&blkcg->blkg_list)) {
			spin_unlock_irqrestore(&blkcg->lock, flags);
			break;
		}

		blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group,
				   blkcg_node);
		q = rcu_dereference(blkg->q);
		__blkiocg_del_blkio_group(blkg);

		spin_unlock_irqrestore(&blkcg->lock, flags);

		/*
		 * This blkio_group is being unlinked as associated cgroup is
		 * going away. Let all the IO controlling policies know about
		 * this event.
		 */
		spin_lock(&blkio_list_lock);
		list_for_each_entry(blkiop, &blkio_list, list) {
			if (blkiop->plid != blkg->plid)
				continue;
			blkiop->ops.blkio_unlink_group_fn(q, blkg);
		}
		spin_unlock(&blkio_list_lock);
	} while (1);

	rcu_read_unlock();

	return 0;
}

static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);

	if (blkcg != &blkio_root_cgroup)
		kfree(blkcg);
}

static struct cgroup_subsys_state *
blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
	struct blkio_cgroup *blkcg;
	struct cgroup *parent = cgroup->parent;

	if (!parent) {
		blkcg = &blkio_root_cgroup;
		goto done;
	}

	blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
	if (!blkcg)
		return ERR_PTR(-ENOMEM);

	blkcg->weight = BLKIO_WEIGHT_DEFAULT;
done:
	spin_lock_init(&blkcg->lock);
	INIT_HLIST_HEAD(&blkcg->blkg_list);

	return &blkcg->css;
}

/**
 * blkcg_init_queue - initialize blkcg part of request queue
 * @q: request_queue to initialize
 *
 * Called from blk_alloc_queue_node(). Responsible for initializing blkcg
 * part of new request_queue @q.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int blkcg_init_queue(struct request_queue *q)
{
	int ret;

	might_sleep();

	ret = blk_throtl_init(q);
	if (ret)
		return ret;

	mutex_lock(&all_q_mutex);
	INIT_LIST_HEAD(&q->all_q_node);
	list_add_tail(&q->all_q_node, &all_q_list);
	mutex_unlock(&all_q_mutex);

	return 0;
}

/**
 * blkcg_drain_queue - drain blkcg part of request_queue
 * @q: request_queue to drain
 *
 * Called from blk_drain_queue(). Responsible for draining blkcg part.
 */
void blkcg_drain_queue(struct request_queue *q)
{
	lockdep_assert_held(q->queue_lock);

	blk_throtl_drain(q);
}

/**
 * blkcg_exit_queue - exit and release blkcg part of request_queue
 * @q: request_queue being released
 *
 * Called from blk_release_queue(). Responsible for exiting blkcg part.
 */
void blkcg_exit_queue(struct request_queue *q)
{
	mutex_lock(&all_q_mutex);
	list_del_init(&q->all_q_node);
	mutex_unlock(&all_q_mutex);

	blk_throtl_exit(q);
}

/*
 * We cannot support shared io contexts, as we have no means to support
 * two tasks with the same ioc in two different groups without major rework
 * of the main cic data structures. For now we allow a task to change
 * its cgroup only if it's the only owner of its ioc.
 */
static int blkiocg_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
			      struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct io_context *ioc;
	int ret = 0;

	/* task_lock() is needed to avoid races with exit_io_context() */
	cgroup_taskset_for_each(task, cgrp, tset) {
		task_lock(task);
		ioc = task->io_context;
		if (ioc && atomic_read(&ioc->nr_tasks) > 1)
			ret = -EINVAL;
		task_unlock(task);
		if (ret)
			break;
	}
	return ret;
}

static void blkiocg_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
			   struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct io_context *ioc;

	cgroup_taskset_for_each(task, cgrp, tset) {
		/* we don't lose anything even if ioc allocation fails */
		ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
		if (ioc) {
			ioc_cgroup_changed(ioc);
			put_io_context(ioc);
		}
	}
}

static void blkcg_bypass_start(void)
	__acquires(&all_q_mutex)
{
	struct request_queue *q;

	mutex_lock(&all_q_mutex);

	list_for_each_entry(q, &all_q_list, all_q_node) {
		blk_queue_bypass_start(q);
		blkg_destroy_all(q);
	}
}

static void blkcg_bypass_end(void)
	__releases(&all_q_mutex)
{
	struct request_queue *q;

	list_for_each_entry(q, &all_q_list, all_q_node)
		blk_queue_bypass_end(q);

	mutex_unlock(&all_q_mutex);
}

void blkio_policy_register(struct blkio_policy_type *blkiop)
{
	blkcg_bypass_start();
	spin_lock(&blkio_list_lock);

	BUG_ON(blkio_policy[blkiop->plid]);
	blkio_policy[blkiop->plid] = blkiop;
	list_add_tail(&blkiop->list, &blkio_list);

	spin_unlock(&blkio_list_lock);
	blkcg_bypass_end();
}
EXPORT_SYMBOL_GPL(blkio_policy_register);

void blkio_policy_unregister(struct blkio_policy_type *blkiop)
{
	blkcg_bypass_start();
	spin_lock(&blkio_list_lock);

	BUG_ON(blkio_policy[blkiop->plid] != blkiop);
	blkio_policy[blkiop->plid] = NULL;
	list_del_init(&blkiop->list);

	spin_unlock(&blkio_list_lock);
	blkcg_bypass_end();
}
EXPORT_SYMBOL_GPL(blkio_policy_unregister);
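
For reference, the call sites above (blkg_alloc(), blkg_lookup_create(), blkg_destroy_all(), blkiocg_pre_destroy(), and the weight/bps/iops update helpers) imply the shape of a policy that plugs into this interface: it fills in a struct blkio_policy_type with a policy ID, the size of its per-group data, and the ops this file invokes, then calls blkio_policy_register(). The sketch below is illustrative only; the example_* names are hypothetical, the callback signatures are inferred from how this file calls pol->ops and blkiop->ops, and the authoritative definitions live in blk-cgroup.h.

/*
 * Illustrative sketch -- not part of blk-cgroup.c.  Callback signatures
 * are inferred from the call sites in this file; see blk-cgroup.h for
 * the real struct blkio_policy_type / ops definitions.
 */
#include <linux/module.h>
#include "blk-cgroup.h"

struct example_grp {
	/* per-group private data; the core allocates pdata_size bytes */
	unsigned int weight;
};

/* invoked from blkg_alloc() once blkg->pd has been attached */
static void example_init_group(struct blkio_group *blkg)
{
}

/* invoked from blkg_lookup_create() under q->queue_lock */
static void example_link_group(struct request_queue *q,
			       struct blkio_group *blkg)
{
}

/* invoked from blkiocg_pre_destroy() when the owning cgroup goes away */
static void example_unlink_group(struct request_queue *q,
				 struct blkio_group *blkg)
{
}

/* invoked from blkg_destroy_all(); return true once no groups remain on @q */
static bool example_clear_queue(struct request_queue *q)
{
	return true;
}

static struct blkio_policy_type example_policy = {
	.ops = {
		.blkio_init_group_fn	= example_init_group,
		.blkio_link_group_fn	= example_link_group,
		.blkio_unlink_group_fn	= example_unlink_group,
		.blkio_clear_queue_fn	= example_clear_queue,
	},
	.plid		= BLKIO_POLICY_PROP,	/* whichever slot the policy owns */
	.pdata_size	= sizeof(struct example_grp),
};

static int __init example_policy_init(void)
{
	blkio_policy_register(&example_policy);
	return 0;
}

static void __exit example_policy_exit(void)
{
	blkio_policy_unregister(&example_policy);
}

module_init(example_policy_init);
module_exit(example_policy_exit);
MODULE_LICENSE("GPL");

Configuration then reaches such a policy through the cftype handlers above: writing, say, "8:16 1000" (an example major:minor and value) to blkio.weight_device lands in blkio_policy_parse_and_set(), which looks up or creates the blkg for that disk and forwards the new value through blkio_update_group_weight() to the policy that owns the group.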