blk-cgroup.c revision 72e06c255181537d0b3e1f657a9ed81655d745b1
1/* 2 * Common Block IO controller cgroup interface 3 * 4 * Based on ideas and code from CFQ, CFS and BFQ: 5 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> 6 * 7 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> 8 * Paolo Valente <paolo.valente@unimore.it> 9 * 10 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com> 11 * Nauman Rafique <nauman@google.com> 12 */ 13#include <linux/ioprio.h> 14#include <linux/seq_file.h> 15#include <linux/kdev_t.h> 16#include <linux/module.h> 17#include <linux/err.h> 18#include <linux/blkdev.h> 19#include <linux/slab.h> 20#include <linux/genhd.h> 21#include <linux/delay.h> 22#include "blk-cgroup.h" 23 24#define MAX_KEY_LEN 100 25 26static DEFINE_SPINLOCK(blkio_list_lock); 27static LIST_HEAD(blkio_list); 28 29struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT }; 30EXPORT_SYMBOL_GPL(blkio_root_cgroup); 31 32static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *, 33 struct cgroup *); 34static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *, 35 struct cgroup_taskset *); 36static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *, 37 struct cgroup_taskset *); 38static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *); 39static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *); 40 41/* for encoding cft->private value on file */ 42#define BLKIOFILE_PRIVATE(x, val) (((x) << 16) | (val)) 43/* What policy owns the file, proportional or throttle */ 44#define BLKIOFILE_POLICY(val) (((val) >> 16) & 0xffff) 45#define BLKIOFILE_ATTR(val) ((val) & 0xffff) 46 47struct cgroup_subsys blkio_subsys = { 48 .name = "blkio", 49 .create = blkiocg_create, 50 .can_attach = blkiocg_can_attach, 51 .attach = blkiocg_attach, 52 .destroy = blkiocg_destroy, 53 .populate = blkiocg_populate, 54 .subsys_id = blkio_subsys_id, 55 .use_id = 1, 56 .module = THIS_MODULE, 57}; 58EXPORT_SYMBOL_GPL(blkio_subsys); 59 60static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg, 61 struct blkio_policy_node *pn) 62{ 63 list_add(&pn->node, &blkcg->policy_list); 64} 65 66static inline bool cftype_blkg_same_policy(struct cftype *cft, 67 struct blkio_group *blkg) 68{ 69 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); 70 71 if (blkg->plid == plid) 72 return 1; 73 74 return 0; 75} 76 77/* Determines if policy node matches cgroup file being accessed */ 78static inline bool pn_matches_cftype(struct cftype *cft, 79 struct blkio_policy_node *pn) 80{ 81 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); 82 int fileid = BLKIOFILE_ATTR(cft->private); 83 84 return (plid == pn->plid && fileid == pn->fileid); 85} 86 87/* Must be called with blkcg->lock held */ 88static inline void blkio_policy_delete_node(struct blkio_policy_node *pn) 89{ 90 list_del(&pn->node); 91} 92 93/* Must be called with blkcg->lock held */ 94static struct blkio_policy_node * 95blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev, 96 enum blkio_policy_id plid, int fileid) 97{ 98 struct blkio_policy_node *pn; 99 100 list_for_each_entry(pn, &blkcg->policy_list, node) { 101 if (pn->dev == dev && pn->plid == plid && pn->fileid == fileid) 102 return pn; 103 } 104 105 return NULL; 106} 107 108struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) 109{ 110 return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id), 111 struct blkio_cgroup, css); 112} 113EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup); 114 115struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk) 116{ 117 return container_of(task_subsys_state(tsk, blkio_subsys_id), 118 struct blkio_cgroup, css); 119} 120EXPORT_SYMBOL_GPL(task_blkio_cgroup); 121 122static inline void 123blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight) 124{ 125 struct blkio_policy_type *blkiop; 126 127 list_for_each_entry(blkiop, &blkio_list, list) { 128 /* If this policy does not own the blkg, do not send updates */ 129 if (blkiop->plid != blkg->plid) 130 continue; 131 if (blkiop->ops.blkio_update_group_weight_fn) 132 blkiop->ops.blkio_update_group_weight_fn(blkg->key, 133 blkg, weight); 134 } 135} 136 137static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps, 138 int fileid) 139{ 140 struct blkio_policy_type *blkiop; 141 142 list_for_each_entry(blkiop, &blkio_list, list) { 143 144 /* If this policy does not own the blkg, do not send updates */ 145 if (blkiop->plid != blkg->plid) 146 continue; 147 148 if (fileid == BLKIO_THROTL_read_bps_device 149 && blkiop->ops.blkio_update_group_read_bps_fn) 150 blkiop->ops.blkio_update_group_read_bps_fn(blkg->key, 151 blkg, bps); 152 153 if (fileid == BLKIO_THROTL_write_bps_device 154 && blkiop->ops.blkio_update_group_write_bps_fn) 155 blkiop->ops.blkio_update_group_write_bps_fn(blkg->key, 156 blkg, bps); 157 } 158} 159 160static inline void blkio_update_group_iops(struct blkio_group *blkg, 161 unsigned int iops, int fileid) 162{ 163 struct blkio_policy_type *blkiop; 164 165 list_for_each_entry(blkiop, &blkio_list, list) { 166 167 /* If this policy does not own the blkg, do not send updates */ 168 if (blkiop->plid != blkg->plid) 169 continue; 170 171 if (fileid == BLKIO_THROTL_read_iops_device 172 && blkiop->ops.blkio_update_group_read_iops_fn) 173 blkiop->ops.blkio_update_group_read_iops_fn(blkg->key, 174 blkg, iops); 175 176 if (fileid == BLKIO_THROTL_write_iops_device 177 && blkiop->ops.blkio_update_group_write_iops_fn) 178 blkiop->ops.blkio_update_group_write_iops_fn(blkg->key, 179 blkg,iops); 180 } 181} 182 183/* 184 * Add to the appropriate stat variable depending on the request type. 185 * This should be called with the blkg->stats_lock held. 186 */ 187static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction, 188 bool sync) 189{ 190 if (direction) 191 stat[BLKIO_STAT_WRITE] += add; 192 else 193 stat[BLKIO_STAT_READ] += add; 194 if (sync) 195 stat[BLKIO_STAT_SYNC] += add; 196 else 197 stat[BLKIO_STAT_ASYNC] += add; 198} 199 200/* 201 * Decrements the appropriate stat variable if non-zero depending on the 202 * request type. Panics on value being zero. 203 * This should be called with the blkg->stats_lock held. 204 */ 205static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync) 206{ 207 if (direction) { 208 BUG_ON(stat[BLKIO_STAT_WRITE] == 0); 209 stat[BLKIO_STAT_WRITE]--; 210 } else { 211 BUG_ON(stat[BLKIO_STAT_READ] == 0); 212 stat[BLKIO_STAT_READ]--; 213 } 214 if (sync) { 215 BUG_ON(stat[BLKIO_STAT_SYNC] == 0); 216 stat[BLKIO_STAT_SYNC]--; 217 } else { 218 BUG_ON(stat[BLKIO_STAT_ASYNC] == 0); 219 stat[BLKIO_STAT_ASYNC]--; 220 } 221} 222 223#ifdef CONFIG_DEBUG_BLK_CGROUP 224/* This should be called with the blkg->stats_lock held. */ 225static void blkio_set_start_group_wait_time(struct blkio_group *blkg, 226 struct blkio_group *curr_blkg) 227{ 228 if (blkio_blkg_waiting(&blkg->stats)) 229 return; 230 if (blkg == curr_blkg) 231 return; 232 blkg->stats.start_group_wait_time = sched_clock(); 233 blkio_mark_blkg_waiting(&blkg->stats); 234} 235 236/* This should be called with the blkg->stats_lock held. */ 237static void blkio_update_group_wait_time(struct blkio_group_stats *stats) 238{ 239 unsigned long long now; 240 241 if (!blkio_blkg_waiting(stats)) 242 return; 243 244 now = sched_clock(); 245 if (time_after64(now, stats->start_group_wait_time)) 246 stats->group_wait_time += now - stats->start_group_wait_time; 247 blkio_clear_blkg_waiting(stats); 248} 249 250/* This should be called with the blkg->stats_lock held. */ 251static void blkio_end_empty_time(struct blkio_group_stats *stats) 252{ 253 unsigned long long now; 254 255 if (!blkio_blkg_empty(stats)) 256 return; 257 258 now = sched_clock(); 259 if (time_after64(now, stats->start_empty_time)) 260 stats->empty_time += now - stats->start_empty_time; 261 blkio_clear_blkg_empty(stats); 262} 263 264void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) 265{ 266 unsigned long flags; 267 268 spin_lock_irqsave(&blkg->stats_lock, flags); 269 BUG_ON(blkio_blkg_idling(&blkg->stats)); 270 blkg->stats.start_idle_time = sched_clock(); 271 blkio_mark_blkg_idling(&blkg->stats); 272 spin_unlock_irqrestore(&blkg->stats_lock, flags); 273} 274EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats); 275 276void blkiocg_update_idle_time_stats(struct blkio_group *blkg) 277{ 278 unsigned long flags; 279 unsigned long long now; 280 struct blkio_group_stats *stats; 281 282 spin_lock_irqsave(&blkg->stats_lock, flags); 283 stats = &blkg->stats; 284 if (blkio_blkg_idling(stats)) { 285 now = sched_clock(); 286 if (time_after64(now, stats->start_idle_time)) 287 stats->idle_time += now - stats->start_idle_time; 288 blkio_clear_blkg_idling(stats); 289 } 290 spin_unlock_irqrestore(&blkg->stats_lock, flags); 291} 292EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats); 293 294void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg) 295{ 296 unsigned long flags; 297 struct blkio_group_stats *stats; 298 299 spin_lock_irqsave(&blkg->stats_lock, flags); 300 stats = &blkg->stats; 301 stats->avg_queue_size_sum += 302 stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] + 303 stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]; 304 stats->avg_queue_size_samples++; 305 blkio_update_group_wait_time(stats); 306 spin_unlock_irqrestore(&blkg->stats_lock, flags); 307} 308EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats); 309 310void blkiocg_set_start_empty_time(struct blkio_group *blkg) 311{ 312 unsigned long flags; 313 struct blkio_group_stats *stats; 314 315 spin_lock_irqsave(&blkg->stats_lock, flags); 316 stats = &blkg->stats; 317 318 if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] || 319 stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) { 320 spin_unlock_irqrestore(&blkg->stats_lock, flags); 321 return; 322 } 323 324 /* 325 * group is already marked empty. This can happen if cfqq got new 326 * request in parent group and moved to this group while being added 327 * to service tree. Just ignore the event and move on. 328 */ 329 if(blkio_blkg_empty(stats)) { 330 spin_unlock_irqrestore(&blkg->stats_lock, flags); 331 return; 332 } 333 334 stats->start_empty_time = sched_clock(); 335 blkio_mark_blkg_empty(stats); 336 spin_unlock_irqrestore(&blkg->stats_lock, flags); 337} 338EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time); 339 340void blkiocg_update_dequeue_stats(struct blkio_group *blkg, 341 unsigned long dequeue) 342{ 343 blkg->stats.dequeue += dequeue; 344} 345EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats); 346#else 347static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg, 348 struct blkio_group *curr_blkg) {} 349static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {} 350#endif 351 352void blkiocg_update_io_add_stats(struct blkio_group *blkg, 353 struct blkio_group *curr_blkg, bool direction, 354 bool sync) 355{ 356 unsigned long flags; 357 358 spin_lock_irqsave(&blkg->stats_lock, flags); 359 blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction, 360 sync); 361 blkio_end_empty_time(&blkg->stats); 362 blkio_set_start_group_wait_time(blkg, curr_blkg); 363 spin_unlock_irqrestore(&blkg->stats_lock, flags); 364} 365EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats); 366 367void blkiocg_update_io_remove_stats(struct blkio_group *blkg, 368 bool direction, bool sync) 369{ 370 unsigned long flags; 371 372 spin_lock_irqsave(&blkg->stats_lock, flags); 373 blkio_check_and_dec_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 374 direction, sync); 375 spin_unlock_irqrestore(&blkg->stats_lock, flags); 376} 377EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats); 378 379void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time, 380 unsigned long unaccounted_time) 381{ 382 unsigned long flags; 383 384 spin_lock_irqsave(&blkg->stats_lock, flags); 385 blkg->stats.time += time; 386#ifdef CONFIG_DEBUG_BLK_CGROUP 387 blkg->stats.unaccounted_time += unaccounted_time; 388#endif 389 spin_unlock_irqrestore(&blkg->stats_lock, flags); 390} 391EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used); 392 393/* 394 * should be called under rcu read lock or queue lock to make sure blkg pointer 395 * is valid. 396 */ 397void blkiocg_update_dispatch_stats(struct blkio_group *blkg, 398 uint64_t bytes, bool direction, bool sync) 399{ 400 struct blkio_group_stats_cpu *stats_cpu; 401 unsigned long flags; 402 403 /* 404 * Disabling interrupts to provide mutual exclusion between two 405 * writes on same cpu. It probably is not needed for 64bit. Not 406 * optimizing that case yet. 407 */ 408 local_irq_save(flags); 409 410 stats_cpu = this_cpu_ptr(blkg->stats_cpu); 411 412 u64_stats_update_begin(&stats_cpu->syncp); 413 stats_cpu->sectors += bytes >> 9; 414 blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICED], 415 1, direction, sync); 416 blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICE_BYTES], 417 bytes, direction, sync); 418 u64_stats_update_end(&stats_cpu->syncp); 419 local_irq_restore(flags); 420} 421EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats); 422 423void blkiocg_update_completion_stats(struct blkio_group *blkg, 424 uint64_t start_time, uint64_t io_start_time, bool direction, bool sync) 425{ 426 struct blkio_group_stats *stats; 427 unsigned long flags; 428 unsigned long long now = sched_clock(); 429 430 spin_lock_irqsave(&blkg->stats_lock, flags); 431 stats = &blkg->stats; 432 if (time_after64(now, io_start_time)) 433 blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME], 434 now - io_start_time, direction, sync); 435 if (time_after64(io_start_time, start_time)) 436 blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME], 437 io_start_time - start_time, direction, sync); 438 spin_unlock_irqrestore(&blkg->stats_lock, flags); 439} 440EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats); 441 442/* Merged stats are per cpu. */ 443void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction, 444 bool sync) 445{ 446 struct blkio_group_stats_cpu *stats_cpu; 447 unsigned long flags; 448 449 /* 450 * Disabling interrupts to provide mutual exclusion between two 451 * writes on same cpu. It probably is not needed for 64bit. Not 452 * optimizing that case yet. 453 */ 454 local_irq_save(flags); 455 456 stats_cpu = this_cpu_ptr(blkg->stats_cpu); 457 458 u64_stats_update_begin(&stats_cpu->syncp); 459 blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_MERGED], 1, 460 direction, sync); 461 u64_stats_update_end(&stats_cpu->syncp); 462 local_irq_restore(flags); 463} 464EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats); 465 466/* 467 * This function allocates the per cpu stats for blkio_group. Should be called 468 * from sleepable context as alloc_per_cpu() requires that. 469 */ 470int blkio_alloc_blkg_stats(struct blkio_group *blkg) 471{ 472 /* Allocate memory for per cpu stats */ 473 blkg->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu); 474 if (!blkg->stats_cpu) 475 return -ENOMEM; 476 return 0; 477} 478EXPORT_SYMBOL_GPL(blkio_alloc_blkg_stats); 479 480void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, 481 struct blkio_group *blkg, void *key, dev_t dev, 482 enum blkio_policy_id plid) 483{ 484 unsigned long flags; 485 486 spin_lock_irqsave(&blkcg->lock, flags); 487 spin_lock_init(&blkg->stats_lock); 488 rcu_assign_pointer(blkg->key, key); 489 blkg->blkcg_id = css_id(&blkcg->css); 490 hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); 491 blkg->plid = plid; 492 spin_unlock_irqrestore(&blkcg->lock, flags); 493 /* Need to take css reference ? */ 494 cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path)); 495 blkg->dev = dev; 496} 497EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group); 498 499static void __blkiocg_del_blkio_group(struct blkio_group *blkg) 500{ 501 hlist_del_init_rcu(&blkg->blkcg_node); 502 blkg->blkcg_id = 0; 503} 504 505/* 506 * returns 0 if blkio_group was still on cgroup list. Otherwise returns 1 507 * indicating that blk_group was unhashed by the time we got to it. 508 */ 509int blkiocg_del_blkio_group(struct blkio_group *blkg) 510{ 511 struct blkio_cgroup *blkcg; 512 unsigned long flags; 513 struct cgroup_subsys_state *css; 514 int ret = 1; 515 516 rcu_read_lock(); 517 css = css_lookup(&blkio_subsys, blkg->blkcg_id); 518 if (css) { 519 blkcg = container_of(css, struct blkio_cgroup, css); 520 spin_lock_irqsave(&blkcg->lock, flags); 521 if (!hlist_unhashed(&blkg->blkcg_node)) { 522 __blkiocg_del_blkio_group(blkg); 523 ret = 0; 524 } 525 spin_unlock_irqrestore(&blkcg->lock, flags); 526 } 527 528 rcu_read_unlock(); 529 return ret; 530} 531EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group); 532 533/* called under rcu_read_lock(). */ 534struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) 535{ 536 struct blkio_group *blkg; 537 struct hlist_node *n; 538 void *__key; 539 540 hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) { 541 __key = blkg->key; 542 if (__key == key) 543 return blkg; 544 } 545 546 return NULL; 547} 548EXPORT_SYMBOL_GPL(blkiocg_lookup_group); 549 550void blkg_destroy_all(struct request_queue *q) 551{ 552 struct blkio_policy_type *pol; 553 554 while (true) { 555 bool done = true; 556 557 spin_lock(&blkio_list_lock); 558 spin_lock_irq(q->queue_lock); 559 560 /* 561 * clear_queue_fn() might return with non-empty group list 562 * if it raced cgroup removal and lost. cgroup removal is 563 * guaranteed to make forward progress and retrying after a 564 * while is enough. This ugliness is scheduled to be 565 * removed after locking update. 566 */ 567 list_for_each_entry(pol, &blkio_list, list) 568 if (!pol->ops.blkio_clear_queue_fn(q)) 569 done = false; 570 571 spin_unlock_irq(q->queue_lock); 572 spin_unlock(&blkio_list_lock); 573 574 if (done) 575 break; 576 577 msleep(10); /* just some random duration I like */ 578 } 579} 580 581static void blkio_reset_stats_cpu(struct blkio_group *blkg) 582{ 583 struct blkio_group_stats_cpu *stats_cpu; 584 int i, j, k; 585 /* 586 * Note: On 64 bit arch this should not be an issue. This has the 587 * possibility of returning some inconsistent value on 32bit arch 588 * as 64bit update on 32bit is non atomic. Taking care of this 589 * corner case makes code very complicated, like sending IPIs to 590 * cpus, taking care of stats of offline cpus etc. 591 * 592 * reset stats is anyway more of a debug feature and this sounds a 593 * corner case. So I am not complicating the code yet until and 594 * unless this becomes a real issue. 595 */ 596 for_each_possible_cpu(i) { 597 stats_cpu = per_cpu_ptr(blkg->stats_cpu, i); 598 stats_cpu->sectors = 0; 599 for(j = 0; j < BLKIO_STAT_CPU_NR; j++) 600 for (k = 0; k < BLKIO_STAT_TOTAL; k++) 601 stats_cpu->stat_arr_cpu[j][k] = 0; 602 } 603} 604 605static int 606blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val) 607{ 608 struct blkio_cgroup *blkcg; 609 struct blkio_group *blkg; 610 struct blkio_group_stats *stats; 611 struct hlist_node *n; 612 uint64_t queued[BLKIO_STAT_TOTAL]; 613 int i; 614#ifdef CONFIG_DEBUG_BLK_CGROUP 615 bool idling, waiting, empty; 616 unsigned long long now = sched_clock(); 617#endif 618 619 blkcg = cgroup_to_blkio_cgroup(cgroup); 620 spin_lock_irq(&blkcg->lock); 621 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { 622 spin_lock(&blkg->stats_lock); 623 stats = &blkg->stats; 624#ifdef CONFIG_DEBUG_BLK_CGROUP 625 idling = blkio_blkg_idling(stats); 626 waiting = blkio_blkg_waiting(stats); 627 empty = blkio_blkg_empty(stats); 628#endif 629 for (i = 0; i < BLKIO_STAT_TOTAL; i++) 630 queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i]; 631 memset(stats, 0, sizeof(struct blkio_group_stats)); 632 for (i = 0; i < BLKIO_STAT_TOTAL; i++) 633 stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i]; 634#ifdef CONFIG_DEBUG_BLK_CGROUP 635 if (idling) { 636 blkio_mark_blkg_idling(stats); 637 stats->start_idle_time = now; 638 } 639 if (waiting) { 640 blkio_mark_blkg_waiting(stats); 641 stats->start_group_wait_time = now; 642 } 643 if (empty) { 644 blkio_mark_blkg_empty(stats); 645 stats->start_empty_time = now; 646 } 647#endif 648 spin_unlock(&blkg->stats_lock); 649 650 /* Reset Per cpu stats which don't take blkg->stats_lock */ 651 blkio_reset_stats_cpu(blkg); 652 } 653 654 spin_unlock_irq(&blkcg->lock); 655 return 0; 656} 657 658static void blkio_get_key_name(enum stat_sub_type type, dev_t dev, char *str, 659 int chars_left, bool diskname_only) 660{ 661 snprintf(str, chars_left, "%d:%d", MAJOR(dev), MINOR(dev)); 662 chars_left -= strlen(str); 663 if (chars_left <= 0) { 664 printk(KERN_WARNING 665 "Possibly incorrect cgroup stat display format"); 666 return; 667 } 668 if (diskname_only) 669 return; 670 switch (type) { 671 case BLKIO_STAT_READ: 672 strlcat(str, " Read", chars_left); 673 break; 674 case BLKIO_STAT_WRITE: 675 strlcat(str, " Write", chars_left); 676 break; 677 case BLKIO_STAT_SYNC: 678 strlcat(str, " Sync", chars_left); 679 break; 680 case BLKIO_STAT_ASYNC: 681 strlcat(str, " Async", chars_left); 682 break; 683 case BLKIO_STAT_TOTAL: 684 strlcat(str, " Total", chars_left); 685 break; 686 default: 687 strlcat(str, " Invalid", chars_left); 688 } 689} 690 691static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val, 692 struct cgroup_map_cb *cb, dev_t dev) 693{ 694 blkio_get_key_name(0, dev, str, chars_left, true); 695 cb->fill(cb, str, val); 696 return val; 697} 698 699 700static uint64_t blkio_read_stat_cpu(struct blkio_group *blkg, 701 enum stat_type_cpu type, enum stat_sub_type sub_type) 702{ 703 int cpu; 704 struct blkio_group_stats_cpu *stats_cpu; 705 u64 val = 0, tval; 706 707 for_each_possible_cpu(cpu) { 708 unsigned int start; 709 stats_cpu = per_cpu_ptr(blkg->stats_cpu, cpu); 710 711 do { 712 start = u64_stats_fetch_begin(&stats_cpu->syncp); 713 if (type == BLKIO_STAT_CPU_SECTORS) 714 tval = stats_cpu->sectors; 715 else 716 tval = stats_cpu->stat_arr_cpu[type][sub_type]; 717 } while(u64_stats_fetch_retry(&stats_cpu->syncp, start)); 718 719 val += tval; 720 } 721 722 return val; 723} 724 725static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg, 726 struct cgroup_map_cb *cb, dev_t dev, enum stat_type_cpu type) 727{ 728 uint64_t disk_total, val; 729 char key_str[MAX_KEY_LEN]; 730 enum stat_sub_type sub_type; 731 732 if (type == BLKIO_STAT_CPU_SECTORS) { 733 val = blkio_read_stat_cpu(blkg, type, 0); 734 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, val, cb, dev); 735 } 736 737 for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL; 738 sub_type++) { 739 blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false); 740 val = blkio_read_stat_cpu(blkg, type, sub_type); 741 cb->fill(cb, key_str, val); 742 } 743 744 disk_total = blkio_read_stat_cpu(blkg, type, BLKIO_STAT_READ) + 745 blkio_read_stat_cpu(blkg, type, BLKIO_STAT_WRITE); 746 747 blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false); 748 cb->fill(cb, key_str, disk_total); 749 return disk_total; 750} 751 752/* This should be called with blkg->stats_lock held */ 753static uint64_t blkio_get_stat(struct blkio_group *blkg, 754 struct cgroup_map_cb *cb, dev_t dev, enum stat_type type) 755{ 756 uint64_t disk_total; 757 char key_str[MAX_KEY_LEN]; 758 enum stat_sub_type sub_type; 759 760 if (type == BLKIO_STAT_TIME) 761 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, 762 blkg->stats.time, cb, dev); 763#ifdef CONFIG_DEBUG_BLK_CGROUP 764 if (type == BLKIO_STAT_UNACCOUNTED_TIME) 765 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, 766 blkg->stats.unaccounted_time, cb, dev); 767 if (type == BLKIO_STAT_AVG_QUEUE_SIZE) { 768 uint64_t sum = blkg->stats.avg_queue_size_sum; 769 uint64_t samples = blkg->stats.avg_queue_size_samples; 770 if (samples) 771 do_div(sum, samples); 772 else 773 sum = 0; 774 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, sum, cb, dev); 775 } 776 if (type == BLKIO_STAT_GROUP_WAIT_TIME) 777 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, 778 blkg->stats.group_wait_time, cb, dev); 779 if (type == BLKIO_STAT_IDLE_TIME) 780 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, 781 blkg->stats.idle_time, cb, dev); 782 if (type == BLKIO_STAT_EMPTY_TIME) 783 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, 784 blkg->stats.empty_time, cb, dev); 785 if (type == BLKIO_STAT_DEQUEUE) 786 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, 787 blkg->stats.dequeue, cb, dev); 788#endif 789 790 for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL; 791 sub_type++) { 792 blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false); 793 cb->fill(cb, key_str, blkg->stats.stat_arr[type][sub_type]); 794 } 795 disk_total = blkg->stats.stat_arr[type][BLKIO_STAT_READ] + 796 blkg->stats.stat_arr[type][BLKIO_STAT_WRITE]; 797 blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false); 798 cb->fill(cb, key_str, disk_total); 799 return disk_total; 800} 801 802static int blkio_policy_parse_and_set(char *buf, 803 struct blkio_policy_node *newpn, enum blkio_policy_id plid, int fileid) 804{ 805 struct gendisk *disk = NULL; 806 char *s[4], *p, *major_s = NULL, *minor_s = NULL; 807 unsigned long major, minor; 808 int i = 0, ret = -EINVAL; 809 int part; 810 dev_t dev; 811 u64 temp; 812 813 memset(s, 0, sizeof(s)); 814 815 while ((p = strsep(&buf, " ")) != NULL) { 816 if (!*p) 817 continue; 818 819 s[i++] = p; 820 821 /* Prevent from inputing too many things */ 822 if (i == 3) 823 break; 824 } 825 826 if (i != 2) 827 goto out; 828 829 p = strsep(&s[0], ":"); 830 if (p != NULL) 831 major_s = p; 832 else 833 goto out; 834 835 minor_s = s[0]; 836 if (!minor_s) 837 goto out; 838 839 if (strict_strtoul(major_s, 10, &major)) 840 goto out; 841 842 if (strict_strtoul(minor_s, 10, &minor)) 843 goto out; 844 845 dev = MKDEV(major, minor); 846 847 if (strict_strtoull(s[1], 10, &temp)) 848 goto out; 849 850 /* For rule removal, do not check for device presence. */ 851 if (temp) { 852 disk = get_gendisk(dev, &part); 853 if (!disk || part) { 854 ret = -ENODEV; 855 goto out; 856 } 857 } 858 859 newpn->dev = dev; 860 861 switch (plid) { 862 case BLKIO_POLICY_PROP: 863 if ((temp < BLKIO_WEIGHT_MIN && temp > 0) || 864 temp > BLKIO_WEIGHT_MAX) 865 goto out; 866 867 newpn->plid = plid; 868 newpn->fileid = fileid; 869 newpn->val.weight = temp; 870 break; 871 case BLKIO_POLICY_THROTL: 872 switch(fileid) { 873 case BLKIO_THROTL_read_bps_device: 874 case BLKIO_THROTL_write_bps_device: 875 newpn->plid = plid; 876 newpn->fileid = fileid; 877 newpn->val.bps = temp; 878 break; 879 case BLKIO_THROTL_read_iops_device: 880 case BLKIO_THROTL_write_iops_device: 881 if (temp > THROTL_IOPS_MAX) 882 goto out; 883 884 newpn->plid = plid; 885 newpn->fileid = fileid; 886 newpn->val.iops = (unsigned int)temp; 887 break; 888 } 889 break; 890 default: 891 BUG(); 892 } 893 ret = 0; 894out: 895 put_disk(disk); 896 return ret; 897} 898 899unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, 900 dev_t dev) 901{ 902 struct blkio_policy_node *pn; 903 unsigned long flags; 904 unsigned int weight; 905 906 spin_lock_irqsave(&blkcg->lock, flags); 907 908 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_PROP, 909 BLKIO_PROP_weight_device); 910 if (pn) 911 weight = pn->val.weight; 912 else 913 weight = blkcg->weight; 914 915 spin_unlock_irqrestore(&blkcg->lock, flags); 916 917 return weight; 918} 919EXPORT_SYMBOL_GPL(blkcg_get_weight); 920 921uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev) 922{ 923 struct blkio_policy_node *pn; 924 unsigned long flags; 925 uint64_t bps = -1; 926 927 spin_lock_irqsave(&blkcg->lock, flags); 928 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, 929 BLKIO_THROTL_read_bps_device); 930 if (pn) 931 bps = pn->val.bps; 932 spin_unlock_irqrestore(&blkcg->lock, flags); 933 934 return bps; 935} 936 937uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev) 938{ 939 struct blkio_policy_node *pn; 940 unsigned long flags; 941 uint64_t bps = -1; 942 943 spin_lock_irqsave(&blkcg->lock, flags); 944 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, 945 BLKIO_THROTL_write_bps_device); 946 if (pn) 947 bps = pn->val.bps; 948 spin_unlock_irqrestore(&blkcg->lock, flags); 949 950 return bps; 951} 952 953unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, dev_t dev) 954{ 955 struct blkio_policy_node *pn; 956 unsigned long flags; 957 unsigned int iops = -1; 958 959 spin_lock_irqsave(&blkcg->lock, flags); 960 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, 961 BLKIO_THROTL_read_iops_device); 962 if (pn) 963 iops = pn->val.iops; 964 spin_unlock_irqrestore(&blkcg->lock, flags); 965 966 return iops; 967} 968 969unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, dev_t dev) 970{ 971 struct blkio_policy_node *pn; 972 unsigned long flags; 973 unsigned int iops = -1; 974 975 spin_lock_irqsave(&blkcg->lock, flags); 976 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, 977 BLKIO_THROTL_write_iops_device); 978 if (pn) 979 iops = pn->val.iops; 980 spin_unlock_irqrestore(&blkcg->lock, flags); 981 982 return iops; 983} 984 985/* Checks whether user asked for deleting a policy rule */ 986static bool blkio_delete_rule_command(struct blkio_policy_node *pn) 987{ 988 switch(pn->plid) { 989 case BLKIO_POLICY_PROP: 990 if (pn->val.weight == 0) 991 return 1; 992 break; 993 case BLKIO_POLICY_THROTL: 994 switch(pn->fileid) { 995 case BLKIO_THROTL_read_bps_device: 996 case BLKIO_THROTL_write_bps_device: 997 if (pn->val.bps == 0) 998 return 1; 999 break; 1000 case BLKIO_THROTL_read_iops_device: 1001 case BLKIO_THROTL_write_iops_device: 1002 if (pn->val.iops == 0) 1003 return 1; 1004 } 1005 break; 1006 default: 1007 BUG(); 1008 } 1009 1010 return 0; 1011} 1012 1013static void blkio_update_policy_rule(struct blkio_policy_node *oldpn, 1014 struct blkio_policy_node *newpn) 1015{ 1016 switch(oldpn->plid) { 1017 case BLKIO_POLICY_PROP: 1018 oldpn->val.weight = newpn->val.weight; 1019 break; 1020 case BLKIO_POLICY_THROTL: 1021 switch(newpn->fileid) { 1022 case BLKIO_THROTL_read_bps_device: 1023 case BLKIO_THROTL_write_bps_device: 1024 oldpn->val.bps = newpn->val.bps; 1025 break; 1026 case BLKIO_THROTL_read_iops_device: 1027 case BLKIO_THROTL_write_iops_device: 1028 oldpn->val.iops = newpn->val.iops; 1029 } 1030 break; 1031 default: 1032 BUG(); 1033 } 1034} 1035 1036/* 1037 * Some rules/values in blkg have changed. Propagate those to respective 1038 * policies. 1039 */ 1040static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg, 1041 struct blkio_group *blkg, struct blkio_policy_node *pn) 1042{ 1043 unsigned int weight, iops; 1044 u64 bps; 1045 1046 switch(pn->plid) { 1047 case BLKIO_POLICY_PROP: 1048 weight = pn->val.weight ? pn->val.weight : 1049 blkcg->weight; 1050 blkio_update_group_weight(blkg, weight); 1051 break; 1052 case BLKIO_POLICY_THROTL: 1053 switch(pn->fileid) { 1054 case BLKIO_THROTL_read_bps_device: 1055 case BLKIO_THROTL_write_bps_device: 1056 bps = pn->val.bps ? pn->val.bps : (-1); 1057 blkio_update_group_bps(blkg, bps, pn->fileid); 1058 break; 1059 case BLKIO_THROTL_read_iops_device: 1060 case BLKIO_THROTL_write_iops_device: 1061 iops = pn->val.iops ? pn->val.iops : (-1); 1062 blkio_update_group_iops(blkg, iops, pn->fileid); 1063 break; 1064 } 1065 break; 1066 default: 1067 BUG(); 1068 } 1069} 1070 1071/* 1072 * A policy node rule has been updated. Propagate this update to all the 1073 * block groups which might be affected by this update. 1074 */ 1075static void blkio_update_policy_node_blkg(struct blkio_cgroup *blkcg, 1076 struct blkio_policy_node *pn) 1077{ 1078 struct blkio_group *blkg; 1079 struct hlist_node *n; 1080 1081 spin_lock(&blkio_list_lock); 1082 spin_lock_irq(&blkcg->lock); 1083 1084 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { 1085 if (pn->dev != blkg->dev || pn->plid != blkg->plid) 1086 continue; 1087 blkio_update_blkg_policy(blkcg, blkg, pn); 1088 } 1089 1090 spin_unlock_irq(&blkcg->lock); 1091 spin_unlock(&blkio_list_lock); 1092} 1093 1094static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft, 1095 const char *buffer) 1096{ 1097 int ret = 0; 1098 char *buf; 1099 struct blkio_policy_node *newpn, *pn; 1100 struct blkio_cgroup *blkcg; 1101 int keep_newpn = 0; 1102 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); 1103 int fileid = BLKIOFILE_ATTR(cft->private); 1104 1105 buf = kstrdup(buffer, GFP_KERNEL); 1106 if (!buf) 1107 return -ENOMEM; 1108 1109 newpn = kzalloc(sizeof(*newpn), GFP_KERNEL); 1110 if (!newpn) { 1111 ret = -ENOMEM; 1112 goto free_buf; 1113 } 1114 1115 ret = blkio_policy_parse_and_set(buf, newpn, plid, fileid); 1116 if (ret) 1117 goto free_newpn; 1118 1119 blkcg = cgroup_to_blkio_cgroup(cgrp); 1120 1121 spin_lock_irq(&blkcg->lock); 1122 1123 pn = blkio_policy_search_node(blkcg, newpn->dev, plid, fileid); 1124 if (!pn) { 1125 if (!blkio_delete_rule_command(newpn)) { 1126 blkio_policy_insert_node(blkcg, newpn); 1127 keep_newpn = 1; 1128 } 1129 spin_unlock_irq(&blkcg->lock); 1130 goto update_io_group; 1131 } 1132 1133 if (blkio_delete_rule_command(newpn)) { 1134 blkio_policy_delete_node(pn); 1135 kfree(pn); 1136 spin_unlock_irq(&blkcg->lock); 1137 goto update_io_group; 1138 } 1139 spin_unlock_irq(&blkcg->lock); 1140 1141 blkio_update_policy_rule(pn, newpn); 1142 1143update_io_group: 1144 blkio_update_policy_node_blkg(blkcg, newpn); 1145 1146free_newpn: 1147 if (!keep_newpn) 1148 kfree(newpn); 1149free_buf: 1150 kfree(buf); 1151 return ret; 1152} 1153 1154static void 1155blkio_print_policy_node(struct seq_file *m, struct blkio_policy_node *pn) 1156{ 1157 switch(pn->plid) { 1158 case BLKIO_POLICY_PROP: 1159 if (pn->fileid == BLKIO_PROP_weight_device) 1160 seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), 1161 MINOR(pn->dev), pn->val.weight); 1162 break; 1163 case BLKIO_POLICY_THROTL: 1164 switch(pn->fileid) { 1165 case BLKIO_THROTL_read_bps_device: 1166 case BLKIO_THROTL_write_bps_device: 1167 seq_printf(m, "%u:%u\t%llu\n", MAJOR(pn->dev), 1168 MINOR(pn->dev), pn->val.bps); 1169 break; 1170 case BLKIO_THROTL_read_iops_device: 1171 case BLKIO_THROTL_write_iops_device: 1172 seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), 1173 MINOR(pn->dev), pn->val.iops); 1174 break; 1175 } 1176 break; 1177 default: 1178 BUG(); 1179 } 1180} 1181 1182/* cgroup files which read their data from policy nodes end up here */ 1183static void blkio_read_policy_node_files(struct cftype *cft, 1184 struct blkio_cgroup *blkcg, struct seq_file *m) 1185{ 1186 struct blkio_policy_node *pn; 1187 1188 if (!list_empty(&blkcg->policy_list)) { 1189 spin_lock_irq(&blkcg->lock); 1190 list_for_each_entry(pn, &blkcg->policy_list, node) { 1191 if (!pn_matches_cftype(cft, pn)) 1192 continue; 1193 blkio_print_policy_node(m, pn); 1194 } 1195 spin_unlock_irq(&blkcg->lock); 1196 } 1197} 1198 1199static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft, 1200 struct seq_file *m) 1201{ 1202 struct blkio_cgroup *blkcg; 1203 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); 1204 int name = BLKIOFILE_ATTR(cft->private); 1205 1206 blkcg = cgroup_to_blkio_cgroup(cgrp); 1207 1208 switch(plid) { 1209 case BLKIO_POLICY_PROP: 1210 switch(name) { 1211 case BLKIO_PROP_weight_device: 1212 blkio_read_policy_node_files(cft, blkcg, m); 1213 return 0; 1214 default: 1215 BUG(); 1216 } 1217 break; 1218 case BLKIO_POLICY_THROTL: 1219 switch(name){ 1220 case BLKIO_THROTL_read_bps_device: 1221 case BLKIO_THROTL_write_bps_device: 1222 case BLKIO_THROTL_read_iops_device: 1223 case BLKIO_THROTL_write_iops_device: 1224 blkio_read_policy_node_files(cft, blkcg, m); 1225 return 0; 1226 default: 1227 BUG(); 1228 } 1229 break; 1230 default: 1231 BUG(); 1232 } 1233 1234 return 0; 1235} 1236 1237static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg, 1238 struct cftype *cft, struct cgroup_map_cb *cb, 1239 enum stat_type type, bool show_total, bool pcpu) 1240{ 1241 struct blkio_group *blkg; 1242 struct hlist_node *n; 1243 uint64_t cgroup_total = 0; 1244 1245 rcu_read_lock(); 1246 hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) { 1247 if (blkg->dev) { 1248 if (!cftype_blkg_same_policy(cft, blkg)) 1249 continue; 1250 if (pcpu) 1251 cgroup_total += blkio_get_stat_cpu(blkg, cb, 1252 blkg->dev, type); 1253 else { 1254 spin_lock_irq(&blkg->stats_lock); 1255 cgroup_total += blkio_get_stat(blkg, cb, 1256 blkg->dev, type); 1257 spin_unlock_irq(&blkg->stats_lock); 1258 } 1259 } 1260 } 1261 if (show_total) 1262 cb->fill(cb, "Total", cgroup_total); 1263 rcu_read_unlock(); 1264 return 0; 1265} 1266 1267/* All map kind of cgroup file get serviced by this function */ 1268static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft, 1269 struct cgroup_map_cb *cb) 1270{ 1271 struct blkio_cgroup *blkcg; 1272 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); 1273 int name = BLKIOFILE_ATTR(cft->private); 1274 1275 blkcg = cgroup_to_blkio_cgroup(cgrp); 1276 1277 switch(plid) { 1278 case BLKIO_POLICY_PROP: 1279 switch(name) { 1280 case BLKIO_PROP_time: 1281 return blkio_read_blkg_stats(blkcg, cft, cb, 1282 BLKIO_STAT_TIME, 0, 0); 1283 case BLKIO_PROP_sectors: 1284 return blkio_read_blkg_stats(blkcg, cft, cb, 1285 BLKIO_STAT_CPU_SECTORS, 0, 1); 1286 case BLKIO_PROP_io_service_bytes: 1287 return blkio_read_blkg_stats(blkcg, cft, cb, 1288 BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1); 1289 case BLKIO_PROP_io_serviced: 1290 return blkio_read_blkg_stats(blkcg, cft, cb, 1291 BLKIO_STAT_CPU_SERVICED, 1, 1); 1292 case BLKIO_PROP_io_service_time: 1293 return blkio_read_blkg_stats(blkcg, cft, cb, 1294 BLKIO_STAT_SERVICE_TIME, 1, 0); 1295 case BLKIO_PROP_io_wait_time: 1296 return blkio_read_blkg_stats(blkcg, cft, cb, 1297 BLKIO_STAT_WAIT_TIME, 1, 0); 1298 case BLKIO_PROP_io_merged: 1299 return blkio_read_blkg_stats(blkcg, cft, cb, 1300 BLKIO_STAT_CPU_MERGED, 1, 1); 1301 case BLKIO_PROP_io_queued: 1302 return blkio_read_blkg_stats(blkcg, cft, cb, 1303 BLKIO_STAT_QUEUED, 1, 0); 1304#ifdef CONFIG_DEBUG_BLK_CGROUP 1305 case BLKIO_PROP_unaccounted_time: 1306 return blkio_read_blkg_stats(blkcg, cft, cb, 1307 BLKIO_STAT_UNACCOUNTED_TIME, 0, 0); 1308 case BLKIO_PROP_dequeue: 1309 return blkio_read_blkg_stats(blkcg, cft, cb, 1310 BLKIO_STAT_DEQUEUE, 0, 0); 1311 case BLKIO_PROP_avg_queue_size: 1312 return blkio_read_blkg_stats(blkcg, cft, cb, 1313 BLKIO_STAT_AVG_QUEUE_SIZE, 0, 0); 1314 case BLKIO_PROP_group_wait_time: 1315 return blkio_read_blkg_stats(blkcg, cft, cb, 1316 BLKIO_STAT_GROUP_WAIT_TIME, 0, 0); 1317 case BLKIO_PROP_idle_time: 1318 return blkio_read_blkg_stats(blkcg, cft, cb, 1319 BLKIO_STAT_IDLE_TIME, 0, 0); 1320 case BLKIO_PROP_empty_time: 1321 return blkio_read_blkg_stats(blkcg, cft, cb, 1322 BLKIO_STAT_EMPTY_TIME, 0, 0); 1323#endif 1324 default: 1325 BUG(); 1326 } 1327 break; 1328 case BLKIO_POLICY_THROTL: 1329 switch(name){ 1330 case BLKIO_THROTL_io_service_bytes: 1331 return blkio_read_blkg_stats(blkcg, cft, cb, 1332 BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1); 1333 case BLKIO_THROTL_io_serviced: 1334 return blkio_read_blkg_stats(blkcg, cft, cb, 1335 BLKIO_STAT_CPU_SERVICED, 1, 1); 1336 default: 1337 BUG(); 1338 } 1339 break; 1340 default: 1341 BUG(); 1342 } 1343 1344 return 0; 1345} 1346 1347static int blkio_weight_write(struct blkio_cgroup *blkcg, u64 val) 1348{ 1349 struct blkio_group *blkg; 1350 struct hlist_node *n; 1351 struct blkio_policy_node *pn; 1352 1353 if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX) 1354 return -EINVAL; 1355 1356 spin_lock(&blkio_list_lock); 1357 spin_lock_irq(&blkcg->lock); 1358 blkcg->weight = (unsigned int)val; 1359 1360 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { 1361 pn = blkio_policy_search_node(blkcg, blkg->dev, 1362 BLKIO_POLICY_PROP, BLKIO_PROP_weight_device); 1363 if (pn) 1364 continue; 1365 1366 blkio_update_group_weight(blkg, blkcg->weight); 1367 } 1368 spin_unlock_irq(&blkcg->lock); 1369 spin_unlock(&blkio_list_lock); 1370 return 0; 1371} 1372 1373static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) { 1374 struct blkio_cgroup *blkcg; 1375 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); 1376 int name = BLKIOFILE_ATTR(cft->private); 1377 1378 blkcg = cgroup_to_blkio_cgroup(cgrp); 1379 1380 switch(plid) { 1381 case BLKIO_POLICY_PROP: 1382 switch(name) { 1383 case BLKIO_PROP_weight: 1384 return (u64)blkcg->weight; 1385 } 1386 break; 1387 default: 1388 BUG(); 1389 } 1390 return 0; 1391} 1392 1393static int 1394blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) 1395{ 1396 struct blkio_cgroup *blkcg; 1397 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); 1398 int name = BLKIOFILE_ATTR(cft->private); 1399 1400 blkcg = cgroup_to_blkio_cgroup(cgrp); 1401 1402 switch(plid) { 1403 case BLKIO_POLICY_PROP: 1404 switch(name) { 1405 case BLKIO_PROP_weight: 1406 return blkio_weight_write(blkcg, val); 1407 } 1408 break; 1409 default: 1410 BUG(); 1411 } 1412 1413 return 0; 1414} 1415 1416struct cftype blkio_files[] = { 1417 { 1418 .name = "weight_device", 1419 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, 1420 BLKIO_PROP_weight_device), 1421 .read_seq_string = blkiocg_file_read, 1422 .write_string = blkiocg_file_write, 1423 .max_write_len = 256, 1424 }, 1425 { 1426 .name = "weight", 1427 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, 1428 BLKIO_PROP_weight), 1429 .read_u64 = blkiocg_file_read_u64, 1430 .write_u64 = blkiocg_file_write_u64, 1431 }, 1432 { 1433 .name = "time", 1434 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, 1435 BLKIO_PROP_time), 1436 .read_map = blkiocg_file_read_map, 1437 }, 1438 { 1439 .name = "sectors", 1440 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, 1441 BLKIO_PROP_sectors), 1442 .read_map = blkiocg_file_read_map, 1443 }, 1444 { 1445 .name = "io_service_bytes", 1446 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, 1447 BLKIO_PROP_io_service_bytes), 1448 .read_map = blkiocg_file_read_map, 1449 }, 1450 { 1451 .name = "io_serviced", 1452 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, 1453 BLKIO_PROP_io_serviced), 1454 .read_map = blkiocg_file_read_map, 1455 }, 1456 { 1457 .name = "io_service_time", 1458 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, 1459 BLKIO_PROP_io_service_time), 1460 .read_map = blkiocg_file_read_map, 1461 }, 1462 { 1463 .name = "io_wait_time", 1464 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, 1465 BLKIO_PROP_io_wait_time), 1466 .read_map = blkiocg_file_read_map, 1467 }, 1468 { 1469 .name = "io_merged", 1470 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, 1471 BLKIO_PROP_io_merged), 1472 .read_map = blkiocg_file_read_map, 1473 }, 1474 { 1475 .name = "io_queued", 1476 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, 1477 BLKIO_PROP_io_queued), 1478 .read_map = blkiocg_file_read_map, 1479 }, 1480 { 1481 .name = "reset_stats", 1482 .write_u64 = blkiocg_reset_stats, 1483 }, 1484#ifdef CONFIG_BLK_DEV_THROTTLING 1485 { 1486 .name = "throttle.read_bps_device", 1487 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, 1488 BLKIO_THROTL_read_bps_device), 1489 .read_seq_string = blkiocg_file_read, 1490 .write_string = blkiocg_file_write, 1491 .max_write_len = 256, 1492 }, 1493 1494 { 1495 .name = "throttle.write_bps_device", 1496 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, 1497 BLKIO_THROTL_write_bps_device), 1498 .read_seq_string = blkiocg_file_read, 1499 .write_string = blkiocg_file_write, 1500 .max_write_len = 256, 1501 }, 1502 1503 { 1504 .name = "throttle.read_iops_device", 1505 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, 1506 BLKIO_THROTL_read_iops_device), 1507 .read_seq_string = blkiocg_file_read, 1508 .write_string = blkiocg_file_write, 1509 .max_write_len = 256, 1510 }, 1511 1512 { 1513 .name = "throttle.write_iops_device", 1514 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, 1515 BLKIO_THROTL_write_iops_device), 1516 .read_seq_string = blkiocg_file_read, 1517 .write_string = blkiocg_file_write, 1518 .max_write_len = 256, 1519 }, 1520 { 1521 .name = "throttle.io_service_bytes", 1522 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, 1523 BLKIO_THROTL_io_service_bytes), 1524 .read_map = blkiocg_file_read_map, 1525 }, 1526 { 1527 .name = "throttle.io_serviced", 1528 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, 1529 BLKIO_THROTL_io_serviced), 1530 .read_map = blkiocg_file_read_map, 1531 }, 1532#endif /* CONFIG_BLK_DEV_THROTTLING */ 1533 1534#ifdef CONFIG_DEBUG_BLK_CGROUP 1535 { 1536 .name = "avg_queue_size", 1537 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, 1538 BLKIO_PROP_avg_queue_size), 1539 .read_map = blkiocg_file_read_map, 1540 }, 1541 { 1542 .name = "group_wait_time", 1543 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, 1544 BLKIO_PROP_group_wait_time), 1545 .read_map = blkiocg_file_read_map, 1546 }, 1547 { 1548 .name = "idle_time", 1549 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, 1550 BLKIO_PROP_idle_time), 1551 .read_map = blkiocg_file_read_map, 1552 }, 1553 { 1554 .name = "empty_time", 1555 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, 1556 BLKIO_PROP_empty_time), 1557 .read_map = blkiocg_file_read_map, 1558 }, 1559 { 1560 .name = "dequeue", 1561 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, 1562 BLKIO_PROP_dequeue), 1563 .read_map = blkiocg_file_read_map, 1564 }, 1565 { 1566 .name = "unaccounted_time", 1567 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, 1568 BLKIO_PROP_unaccounted_time), 1569 .read_map = blkiocg_file_read_map, 1570 }, 1571#endif 1572}; 1573 1574static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup) 1575{ 1576 return cgroup_add_files(cgroup, subsys, blkio_files, 1577 ARRAY_SIZE(blkio_files)); 1578} 1579 1580static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup) 1581{ 1582 struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); 1583 unsigned long flags; 1584 struct blkio_group *blkg; 1585 void *key; 1586 struct blkio_policy_type *blkiop; 1587 struct blkio_policy_node *pn, *pntmp; 1588 1589 rcu_read_lock(); 1590 do { 1591 spin_lock_irqsave(&blkcg->lock, flags); 1592 1593 if (hlist_empty(&blkcg->blkg_list)) { 1594 spin_unlock_irqrestore(&blkcg->lock, flags); 1595 break; 1596 } 1597 1598 blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group, 1599 blkcg_node); 1600 key = rcu_dereference(blkg->key); 1601 __blkiocg_del_blkio_group(blkg); 1602 1603 spin_unlock_irqrestore(&blkcg->lock, flags); 1604 1605 /* 1606 * This blkio_group is being unlinked as associated cgroup is 1607 * going away. Let all the IO controlling policies know about 1608 * this event. 1609 */ 1610 spin_lock(&blkio_list_lock); 1611 list_for_each_entry(blkiop, &blkio_list, list) { 1612 if (blkiop->plid != blkg->plid) 1613 continue; 1614 blkiop->ops.blkio_unlink_group_fn(key, blkg); 1615 } 1616 spin_unlock(&blkio_list_lock); 1617 } while (1); 1618 1619 list_for_each_entry_safe(pn, pntmp, &blkcg->policy_list, node) { 1620 blkio_policy_delete_node(pn); 1621 kfree(pn); 1622 } 1623 1624 free_css_id(&blkio_subsys, &blkcg->css); 1625 rcu_read_unlock(); 1626 if (blkcg != &blkio_root_cgroup) 1627 kfree(blkcg); 1628} 1629 1630static struct cgroup_subsys_state * 1631blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup) 1632{ 1633 struct blkio_cgroup *blkcg; 1634 struct cgroup *parent = cgroup->parent; 1635 1636 if (!parent) { 1637 blkcg = &blkio_root_cgroup; 1638 goto done; 1639 } 1640 1641 blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); 1642 if (!blkcg) 1643 return ERR_PTR(-ENOMEM); 1644 1645 blkcg->weight = BLKIO_WEIGHT_DEFAULT; 1646done: 1647 spin_lock_init(&blkcg->lock); 1648 INIT_HLIST_HEAD(&blkcg->blkg_list); 1649 1650 INIT_LIST_HEAD(&blkcg->policy_list); 1651 return &blkcg->css; 1652} 1653 1654/* 1655 * We cannot support shared io contexts, as we have no mean to support 1656 * two tasks with the same ioc in two different groups without major rework 1657 * of the main cic data structures. For now we allow a task to change 1658 * its cgroup only if it's the only owner of its ioc. 1659 */ 1660static int blkiocg_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 1661 struct cgroup_taskset *tset) 1662{ 1663 struct task_struct *task; 1664 struct io_context *ioc; 1665 int ret = 0; 1666 1667 /* task_lock() is needed to avoid races with exit_io_context() */ 1668 cgroup_taskset_for_each(task, cgrp, tset) { 1669 task_lock(task); 1670 ioc = task->io_context; 1671 if (ioc && atomic_read(&ioc->nr_tasks) > 1) 1672 ret = -EINVAL; 1673 task_unlock(task); 1674 if (ret) 1675 break; 1676 } 1677 return ret; 1678} 1679 1680static void blkiocg_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 1681 struct cgroup_taskset *tset) 1682{ 1683 struct task_struct *task; 1684 struct io_context *ioc; 1685 1686 cgroup_taskset_for_each(task, cgrp, tset) { 1687 /* we don't lose anything even if ioc allocation fails */ 1688 ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); 1689 if (ioc) { 1690 ioc_cgroup_changed(ioc); 1691 put_io_context(ioc); 1692 } 1693 } 1694} 1695 1696void blkio_policy_register(struct blkio_policy_type *blkiop) 1697{ 1698 spin_lock(&blkio_list_lock); 1699 list_add_tail(&blkiop->list, &blkio_list); 1700 spin_unlock(&blkio_list_lock); 1701} 1702EXPORT_SYMBOL_GPL(blkio_policy_register); 1703 1704void blkio_policy_unregister(struct blkio_policy_type *blkiop) 1705{ 1706 spin_lock(&blkio_list_lock); 1707 list_del_init(&blkiop->list); 1708 spin_unlock(&blkio_list_lock); 1709} 1710EXPORT_SYMBOL_GPL(blkio_policy_unregister); 1711