blk-cgroup.c revision 676f7c8f84d15e94065841529016da5ab92e901b
/*
 * Common Block IO controller cgroup interface
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *		      Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 *		      Nauman Rafique <nauman@google.com>
 */
#include <linux/ioprio.h>
#include <linux/seq_file.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include "blk-cgroup.h"
#include <linux/genhd.h>

#define MAX_KEY_LEN 100

static DEFINE_SPINLOCK(blkio_list_lock);
static LIST_HEAD(blkio_list);

struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
EXPORT_SYMBOL_GPL(blkio_root_cgroup);

/* for encoding cft->private value on file */
#define BLKIOFILE_PRIVATE(x, val)	(((x) << 16) | (val))
/* What policy owns the file, proportional or throttle */
#define BLKIOFILE_POLICY(val)		(((val) >> 16) & 0xffff)
#define BLKIOFILE_ATTR(val)		((val) & 0xffff)

static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg,
					    struct blkio_policy_node *pn)
{
	list_add(&pn->node, &blkcg->policy_list);
}

static inline bool cftype_blkg_same_policy(struct cftype *cft,
					   struct blkio_group *blkg)
{
	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);

	if (blkg->plid == plid)
		return 1;

	return 0;
}

/* Determines if policy node matches cgroup file being accessed */
static inline bool pn_matches_cftype(struct cftype *cft,
				     struct blkio_policy_node *pn)
{
	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
	int fileid = BLKIOFILE_ATTR(cft->private);

	return (plid == pn->plid && fileid == pn->fileid);
}

/* Must be called with blkcg->lock held */
static inline void blkio_policy_delete_node(struct blkio_policy_node *pn)
{
	list_del(&pn->node);
}

/* Must be called with blkcg->lock held */
static struct blkio_policy_node *
blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev,
			 enum blkio_policy_id plid, int fileid)
{
	struct blkio_policy_node *pn;

	list_for_each_entry(pn, &blkcg->policy_list, node) {
		if (pn->dev == dev && pn->plid == plid && pn->fileid == fileid)
			return pn;
	}

	return NULL;
}

struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
{
	return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
			    struct blkio_cgroup, css);
}
EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);

struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk)
{
	return container_of(task_subsys_state(tsk, blkio_subsys_id),
			    struct blkio_cgroup, css);
}
EXPORT_SYMBOL_GPL(task_blkio_cgroup);

static inline void
blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight)
{
	struct blkio_policy_type *blkiop;

	list_for_each_entry(blkiop, &blkio_list, list) {
		/* If this policy does not own the blkg, do not send updates */
		if (blkiop->plid != blkg->plid)
			continue;
		if (blkiop->ops.blkio_update_group_weight_fn)
			blkiop->ops.blkio_update_group_weight_fn(blkg->key,
								 blkg, weight);
	}
}

static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps,
					  int fileid)
{
	struct blkio_policy_type *blkiop;

	list_for_each_entry(blkiop, &blkio_list, list) {

		/* If this policy does not own the blkg, do
not send updates */ 122 if (blkiop->plid != blkg->plid) 123 continue; 124 125 if (fileid == BLKIO_THROTL_read_bps_device 126 && blkiop->ops.blkio_update_group_read_bps_fn) 127 blkiop->ops.blkio_update_group_read_bps_fn(blkg->key, 128 blkg, bps); 129 130 if (fileid == BLKIO_THROTL_write_bps_device 131 && blkiop->ops.blkio_update_group_write_bps_fn) 132 blkiop->ops.blkio_update_group_write_bps_fn(blkg->key, 133 blkg, bps); 134 } 135} 136 137static inline void blkio_update_group_iops(struct blkio_group *blkg, 138 unsigned int iops, int fileid) 139{ 140 struct blkio_policy_type *blkiop; 141 142 list_for_each_entry(blkiop, &blkio_list, list) { 143 144 /* If this policy does not own the blkg, do not send updates */ 145 if (blkiop->plid != blkg->plid) 146 continue; 147 148 if (fileid == BLKIO_THROTL_read_iops_device 149 && blkiop->ops.blkio_update_group_read_iops_fn) 150 blkiop->ops.blkio_update_group_read_iops_fn(blkg->key, 151 blkg, iops); 152 153 if (fileid == BLKIO_THROTL_write_iops_device 154 && blkiop->ops.blkio_update_group_write_iops_fn) 155 blkiop->ops.blkio_update_group_write_iops_fn(blkg->key, 156 blkg,iops); 157 } 158} 159 160/* 161 * Add to the appropriate stat variable depending on the request type. 162 * This should be called with the blkg->stats_lock held. 163 */ 164static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction, 165 bool sync) 166{ 167 if (direction) 168 stat[BLKIO_STAT_WRITE] += add; 169 else 170 stat[BLKIO_STAT_READ] += add; 171 if (sync) 172 stat[BLKIO_STAT_SYNC] += add; 173 else 174 stat[BLKIO_STAT_ASYNC] += add; 175} 176 177/* 178 * Decrements the appropriate stat variable if non-zero depending on the 179 * request type. Panics on value being zero. 180 * This should be called with the blkg->stats_lock held. 181 */ 182static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync) 183{ 184 if (direction) { 185 BUG_ON(stat[BLKIO_STAT_WRITE] == 0); 186 stat[BLKIO_STAT_WRITE]--; 187 } else { 188 BUG_ON(stat[BLKIO_STAT_READ] == 0); 189 stat[BLKIO_STAT_READ]--; 190 } 191 if (sync) { 192 BUG_ON(stat[BLKIO_STAT_SYNC] == 0); 193 stat[BLKIO_STAT_SYNC]--; 194 } else { 195 BUG_ON(stat[BLKIO_STAT_ASYNC] == 0); 196 stat[BLKIO_STAT_ASYNC]--; 197 } 198} 199 200#ifdef CONFIG_DEBUG_BLK_CGROUP 201/* This should be called with the blkg->stats_lock held. */ 202static void blkio_set_start_group_wait_time(struct blkio_group *blkg, 203 struct blkio_group *curr_blkg) 204{ 205 if (blkio_blkg_waiting(&blkg->stats)) 206 return; 207 if (blkg == curr_blkg) 208 return; 209 blkg->stats.start_group_wait_time = sched_clock(); 210 blkio_mark_blkg_waiting(&blkg->stats); 211} 212 213/* This should be called with the blkg->stats_lock held. */ 214static void blkio_update_group_wait_time(struct blkio_group_stats *stats) 215{ 216 unsigned long long now; 217 218 if (!blkio_blkg_waiting(stats)) 219 return; 220 221 now = sched_clock(); 222 if (time_after64(now, stats->start_group_wait_time)) 223 stats->group_wait_time += now - stats->start_group_wait_time; 224 blkio_clear_blkg_waiting(stats); 225} 226 227/* This should be called with the blkg->stats_lock held. 
*/ 228static void blkio_end_empty_time(struct blkio_group_stats *stats) 229{ 230 unsigned long long now; 231 232 if (!blkio_blkg_empty(stats)) 233 return; 234 235 now = sched_clock(); 236 if (time_after64(now, stats->start_empty_time)) 237 stats->empty_time += now - stats->start_empty_time; 238 blkio_clear_blkg_empty(stats); 239} 240 241void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) 242{ 243 unsigned long flags; 244 245 spin_lock_irqsave(&blkg->stats_lock, flags); 246 BUG_ON(blkio_blkg_idling(&blkg->stats)); 247 blkg->stats.start_idle_time = sched_clock(); 248 blkio_mark_blkg_idling(&blkg->stats); 249 spin_unlock_irqrestore(&blkg->stats_lock, flags); 250} 251EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats); 252 253void blkiocg_update_idle_time_stats(struct blkio_group *blkg) 254{ 255 unsigned long flags; 256 unsigned long long now; 257 struct blkio_group_stats *stats; 258 259 spin_lock_irqsave(&blkg->stats_lock, flags); 260 stats = &blkg->stats; 261 if (blkio_blkg_idling(stats)) { 262 now = sched_clock(); 263 if (time_after64(now, stats->start_idle_time)) 264 stats->idle_time += now - stats->start_idle_time; 265 blkio_clear_blkg_idling(stats); 266 } 267 spin_unlock_irqrestore(&blkg->stats_lock, flags); 268} 269EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats); 270 271void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg) 272{ 273 unsigned long flags; 274 struct blkio_group_stats *stats; 275 276 spin_lock_irqsave(&blkg->stats_lock, flags); 277 stats = &blkg->stats; 278 stats->avg_queue_size_sum += 279 stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] + 280 stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]; 281 stats->avg_queue_size_samples++; 282 blkio_update_group_wait_time(stats); 283 spin_unlock_irqrestore(&blkg->stats_lock, flags); 284} 285EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats); 286 287void blkiocg_set_start_empty_time(struct blkio_group *blkg) 288{ 289 unsigned long flags; 290 struct blkio_group_stats *stats; 291 292 spin_lock_irqsave(&blkg->stats_lock, flags); 293 stats = &blkg->stats; 294 295 if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] || 296 stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) { 297 spin_unlock_irqrestore(&blkg->stats_lock, flags); 298 return; 299 } 300 301 /* 302 * group is already marked empty. This can happen if cfqq got new 303 * request in parent group and moved to this group while being added 304 * to service tree. Just ignore the event and move on. 
305 */ 306 if(blkio_blkg_empty(stats)) { 307 spin_unlock_irqrestore(&blkg->stats_lock, flags); 308 return; 309 } 310 311 stats->start_empty_time = sched_clock(); 312 blkio_mark_blkg_empty(stats); 313 spin_unlock_irqrestore(&blkg->stats_lock, flags); 314} 315EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time); 316 317void blkiocg_update_dequeue_stats(struct blkio_group *blkg, 318 unsigned long dequeue) 319{ 320 blkg->stats.dequeue += dequeue; 321} 322EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats); 323#else 324static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg, 325 struct blkio_group *curr_blkg) {} 326static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {} 327#endif 328 329void blkiocg_update_io_add_stats(struct blkio_group *blkg, 330 struct blkio_group *curr_blkg, bool direction, 331 bool sync) 332{ 333 unsigned long flags; 334 335 spin_lock_irqsave(&blkg->stats_lock, flags); 336 blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction, 337 sync); 338 blkio_end_empty_time(&blkg->stats); 339 blkio_set_start_group_wait_time(blkg, curr_blkg); 340 spin_unlock_irqrestore(&blkg->stats_lock, flags); 341} 342EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats); 343 344void blkiocg_update_io_remove_stats(struct blkio_group *blkg, 345 bool direction, bool sync) 346{ 347 unsigned long flags; 348 349 spin_lock_irqsave(&blkg->stats_lock, flags); 350 blkio_check_and_dec_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 351 direction, sync); 352 spin_unlock_irqrestore(&blkg->stats_lock, flags); 353} 354EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats); 355 356void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time, 357 unsigned long unaccounted_time) 358{ 359 unsigned long flags; 360 361 spin_lock_irqsave(&blkg->stats_lock, flags); 362 blkg->stats.time += time; 363#ifdef CONFIG_DEBUG_BLK_CGROUP 364 blkg->stats.unaccounted_time += unaccounted_time; 365#endif 366 spin_unlock_irqrestore(&blkg->stats_lock, flags); 367} 368EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used); 369 370/* 371 * should be called under rcu read lock or queue lock to make sure blkg pointer 372 * is valid. 373 */ 374void blkiocg_update_dispatch_stats(struct blkio_group *blkg, 375 uint64_t bytes, bool direction, bool sync) 376{ 377 struct blkio_group_stats_cpu *stats_cpu; 378 unsigned long flags; 379 380 /* 381 * Disabling interrupts to provide mutual exclusion between two 382 * writes on same cpu. It probably is not needed for 64bit. Not 383 * optimizing that case yet. 
384 */ 385 local_irq_save(flags); 386 387 stats_cpu = this_cpu_ptr(blkg->stats_cpu); 388 389 u64_stats_update_begin(&stats_cpu->syncp); 390 stats_cpu->sectors += bytes >> 9; 391 blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICED], 392 1, direction, sync); 393 blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICE_BYTES], 394 bytes, direction, sync); 395 u64_stats_update_end(&stats_cpu->syncp); 396 local_irq_restore(flags); 397} 398EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats); 399 400void blkiocg_update_completion_stats(struct blkio_group *blkg, 401 uint64_t start_time, uint64_t io_start_time, bool direction, bool sync) 402{ 403 struct blkio_group_stats *stats; 404 unsigned long flags; 405 unsigned long long now = sched_clock(); 406 407 spin_lock_irqsave(&blkg->stats_lock, flags); 408 stats = &blkg->stats; 409 if (time_after64(now, io_start_time)) 410 blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME], 411 now - io_start_time, direction, sync); 412 if (time_after64(io_start_time, start_time)) 413 blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME], 414 io_start_time - start_time, direction, sync); 415 spin_unlock_irqrestore(&blkg->stats_lock, flags); 416} 417EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats); 418 419/* Merged stats are per cpu. */ 420void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction, 421 bool sync) 422{ 423 struct blkio_group_stats_cpu *stats_cpu; 424 unsigned long flags; 425 426 /* 427 * Disabling interrupts to provide mutual exclusion between two 428 * writes on same cpu. It probably is not needed for 64bit. Not 429 * optimizing that case yet. 430 */ 431 local_irq_save(flags); 432 433 stats_cpu = this_cpu_ptr(blkg->stats_cpu); 434 435 u64_stats_update_begin(&stats_cpu->syncp); 436 blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_MERGED], 1, 437 direction, sync); 438 u64_stats_update_end(&stats_cpu->syncp); 439 local_irq_restore(flags); 440} 441EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats); 442 443/* 444 * This function allocates the per cpu stats for blkio_group. Should be called 445 * from sleepable context as alloc_per_cpu() requires that. 446 */ 447int blkio_alloc_blkg_stats(struct blkio_group *blkg) 448{ 449 /* Allocate memory for per cpu stats */ 450 blkg->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu); 451 if (!blkg->stats_cpu) 452 return -ENOMEM; 453 return 0; 454} 455EXPORT_SYMBOL_GPL(blkio_alloc_blkg_stats); 456 457void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, 458 struct blkio_group *blkg, void *key, dev_t dev, 459 enum blkio_policy_id plid) 460{ 461 unsigned long flags; 462 463 spin_lock_irqsave(&blkcg->lock, flags); 464 spin_lock_init(&blkg->stats_lock); 465 rcu_assign_pointer(blkg->key, key); 466 blkg->blkcg_id = css_id(&blkcg->css); 467 hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); 468 blkg->plid = plid; 469 spin_unlock_irqrestore(&blkcg->lock, flags); 470 /* Need to take css reference ? */ 471 cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path)); 472 blkg->dev = dev; 473} 474EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group); 475 476static void __blkiocg_del_blkio_group(struct blkio_group *blkg) 477{ 478 hlist_del_init_rcu(&blkg->blkcg_node); 479 blkg->blkcg_id = 0; 480} 481 482/* 483 * returns 0 if blkio_group was still on cgroup list. Otherwise returns 1 484 * indicating that blk_group was unhashed by the time we got to it. 
485 */ 486int blkiocg_del_blkio_group(struct blkio_group *blkg) 487{ 488 struct blkio_cgroup *blkcg; 489 unsigned long flags; 490 struct cgroup_subsys_state *css; 491 int ret = 1; 492 493 rcu_read_lock(); 494 css = css_lookup(&blkio_subsys, blkg->blkcg_id); 495 if (css) { 496 blkcg = container_of(css, struct blkio_cgroup, css); 497 spin_lock_irqsave(&blkcg->lock, flags); 498 if (!hlist_unhashed(&blkg->blkcg_node)) { 499 __blkiocg_del_blkio_group(blkg); 500 ret = 0; 501 } 502 spin_unlock_irqrestore(&blkcg->lock, flags); 503 } 504 505 rcu_read_unlock(); 506 return ret; 507} 508EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group); 509 510/* called under rcu_read_lock(). */ 511struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) 512{ 513 struct blkio_group *blkg; 514 struct hlist_node *n; 515 void *__key; 516 517 hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) { 518 __key = blkg->key; 519 if (__key == key) 520 return blkg; 521 } 522 523 return NULL; 524} 525EXPORT_SYMBOL_GPL(blkiocg_lookup_group); 526 527static void blkio_reset_stats_cpu(struct blkio_group *blkg) 528{ 529 struct blkio_group_stats_cpu *stats_cpu; 530 int i, j, k; 531 /* 532 * Note: On 64 bit arch this should not be an issue. This has the 533 * possibility of returning some inconsistent value on 32bit arch 534 * as 64bit update on 32bit is non atomic. Taking care of this 535 * corner case makes code very complicated, like sending IPIs to 536 * cpus, taking care of stats of offline cpus etc. 537 * 538 * reset stats is anyway more of a debug feature and this sounds a 539 * corner case. So I am not complicating the code yet until and 540 * unless this becomes a real issue. 541 */ 542 for_each_possible_cpu(i) { 543 stats_cpu = per_cpu_ptr(blkg->stats_cpu, i); 544 stats_cpu->sectors = 0; 545 for(j = 0; j < BLKIO_STAT_CPU_NR; j++) 546 for (k = 0; k < BLKIO_STAT_TOTAL; k++) 547 stats_cpu->stat_arr_cpu[j][k] = 0; 548 } 549} 550 551static int 552blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val) 553{ 554 struct blkio_cgroup *blkcg; 555 struct blkio_group *blkg; 556 struct blkio_group_stats *stats; 557 struct hlist_node *n; 558 uint64_t queued[BLKIO_STAT_TOTAL]; 559 int i; 560#ifdef CONFIG_DEBUG_BLK_CGROUP 561 bool idling, waiting, empty; 562 unsigned long long now = sched_clock(); 563#endif 564 565 blkcg = cgroup_to_blkio_cgroup(cgroup); 566 spin_lock_irq(&blkcg->lock); 567 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { 568 spin_lock(&blkg->stats_lock); 569 stats = &blkg->stats; 570#ifdef CONFIG_DEBUG_BLK_CGROUP 571 idling = blkio_blkg_idling(stats); 572 waiting = blkio_blkg_waiting(stats); 573 empty = blkio_blkg_empty(stats); 574#endif 575 for (i = 0; i < BLKIO_STAT_TOTAL; i++) 576 queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i]; 577 memset(stats, 0, sizeof(struct blkio_group_stats)); 578 for (i = 0; i < BLKIO_STAT_TOTAL; i++) 579 stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i]; 580#ifdef CONFIG_DEBUG_BLK_CGROUP 581 if (idling) { 582 blkio_mark_blkg_idling(stats); 583 stats->start_idle_time = now; 584 } 585 if (waiting) { 586 blkio_mark_blkg_waiting(stats); 587 stats->start_group_wait_time = now; 588 } 589 if (empty) { 590 blkio_mark_blkg_empty(stats); 591 stats->start_empty_time = now; 592 } 593#endif 594 spin_unlock(&blkg->stats_lock); 595 596 /* Reset Per cpu stats which don't take blkg->stats_lock */ 597 blkio_reset_stats_cpu(blkg); 598 } 599 600 spin_unlock_irq(&blkcg->lock); 601 return 0; 602} 603 604static void blkio_get_key_name(enum stat_sub_type 
type, dev_t dev, char *str, 605 int chars_left, bool diskname_only) 606{ 607 snprintf(str, chars_left, "%d:%d", MAJOR(dev), MINOR(dev)); 608 chars_left -= strlen(str); 609 if (chars_left <= 0) { 610 printk(KERN_WARNING 611 "Possibly incorrect cgroup stat display format"); 612 return; 613 } 614 if (diskname_only) 615 return; 616 switch (type) { 617 case BLKIO_STAT_READ: 618 strlcat(str, " Read", chars_left); 619 break; 620 case BLKIO_STAT_WRITE: 621 strlcat(str, " Write", chars_left); 622 break; 623 case BLKIO_STAT_SYNC: 624 strlcat(str, " Sync", chars_left); 625 break; 626 case BLKIO_STAT_ASYNC: 627 strlcat(str, " Async", chars_left); 628 break; 629 case BLKIO_STAT_TOTAL: 630 strlcat(str, " Total", chars_left); 631 break; 632 default: 633 strlcat(str, " Invalid", chars_left); 634 } 635} 636 637static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val, 638 struct cgroup_map_cb *cb, dev_t dev) 639{ 640 blkio_get_key_name(0, dev, str, chars_left, true); 641 cb->fill(cb, str, val); 642 return val; 643} 644 645 646static uint64_t blkio_read_stat_cpu(struct blkio_group *blkg, 647 enum stat_type_cpu type, enum stat_sub_type sub_type) 648{ 649 int cpu; 650 struct blkio_group_stats_cpu *stats_cpu; 651 u64 val = 0, tval; 652 653 for_each_possible_cpu(cpu) { 654 unsigned int start; 655 stats_cpu = per_cpu_ptr(blkg->stats_cpu, cpu); 656 657 do { 658 start = u64_stats_fetch_begin(&stats_cpu->syncp); 659 if (type == BLKIO_STAT_CPU_SECTORS) 660 tval = stats_cpu->sectors; 661 else 662 tval = stats_cpu->stat_arr_cpu[type][sub_type]; 663 } while(u64_stats_fetch_retry(&stats_cpu->syncp, start)); 664 665 val += tval; 666 } 667 668 return val; 669} 670 671static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg, 672 struct cgroup_map_cb *cb, dev_t dev, enum stat_type_cpu type) 673{ 674 uint64_t disk_total, val; 675 char key_str[MAX_KEY_LEN]; 676 enum stat_sub_type sub_type; 677 678 if (type == BLKIO_STAT_CPU_SECTORS) { 679 val = blkio_read_stat_cpu(blkg, type, 0); 680 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, val, cb, dev); 681 } 682 683 for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL; 684 sub_type++) { 685 blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false); 686 val = blkio_read_stat_cpu(blkg, type, sub_type); 687 cb->fill(cb, key_str, val); 688 } 689 690 disk_total = blkio_read_stat_cpu(blkg, type, BLKIO_STAT_READ) + 691 blkio_read_stat_cpu(blkg, type, BLKIO_STAT_WRITE); 692 693 blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false); 694 cb->fill(cb, key_str, disk_total); 695 return disk_total; 696} 697 698/* This should be called with blkg->stats_lock held */ 699static uint64_t blkio_get_stat(struct blkio_group *blkg, 700 struct cgroup_map_cb *cb, dev_t dev, enum stat_type type) 701{ 702 uint64_t disk_total; 703 char key_str[MAX_KEY_LEN]; 704 enum stat_sub_type sub_type; 705 706 if (type == BLKIO_STAT_TIME) 707 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, 708 blkg->stats.time, cb, dev); 709#ifdef CONFIG_DEBUG_BLK_CGROUP 710 if (type == BLKIO_STAT_UNACCOUNTED_TIME) 711 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, 712 blkg->stats.unaccounted_time, cb, dev); 713 if (type == BLKIO_STAT_AVG_QUEUE_SIZE) { 714 uint64_t sum = blkg->stats.avg_queue_size_sum; 715 uint64_t samples = blkg->stats.avg_queue_size_samples; 716 if (samples) 717 do_div(sum, samples); 718 else 719 sum = 0; 720 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, sum, cb, dev); 721 } 722 if (type == BLKIO_STAT_GROUP_WAIT_TIME) 723 return blkio_fill_stat(key_str, MAX_KEY_LEN - 
1, 724 blkg->stats.group_wait_time, cb, dev); 725 if (type == BLKIO_STAT_IDLE_TIME) 726 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, 727 blkg->stats.idle_time, cb, dev); 728 if (type == BLKIO_STAT_EMPTY_TIME) 729 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, 730 blkg->stats.empty_time, cb, dev); 731 if (type == BLKIO_STAT_DEQUEUE) 732 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, 733 blkg->stats.dequeue, cb, dev); 734#endif 735 736 for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL; 737 sub_type++) { 738 blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false); 739 cb->fill(cb, key_str, blkg->stats.stat_arr[type][sub_type]); 740 } 741 disk_total = blkg->stats.stat_arr[type][BLKIO_STAT_READ] + 742 blkg->stats.stat_arr[type][BLKIO_STAT_WRITE]; 743 blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false); 744 cb->fill(cb, key_str, disk_total); 745 return disk_total; 746} 747 748static int blkio_policy_parse_and_set(char *buf, 749 struct blkio_policy_node *newpn, enum blkio_policy_id plid, int fileid) 750{ 751 struct gendisk *disk = NULL; 752 char *s[4], *p, *major_s = NULL, *minor_s = NULL; 753 unsigned long major, minor; 754 int i = 0, ret = -EINVAL; 755 int part; 756 dev_t dev; 757 u64 temp; 758 759 memset(s, 0, sizeof(s)); 760 761 while ((p = strsep(&buf, " ")) != NULL) { 762 if (!*p) 763 continue; 764 765 s[i++] = p; 766 767 /* Prevent from inputing too many things */ 768 if (i == 3) 769 break; 770 } 771 772 if (i != 2) 773 goto out; 774 775 p = strsep(&s[0], ":"); 776 if (p != NULL) 777 major_s = p; 778 else 779 goto out; 780 781 minor_s = s[0]; 782 if (!minor_s) 783 goto out; 784 785 if (strict_strtoul(major_s, 10, &major)) 786 goto out; 787 788 if (strict_strtoul(minor_s, 10, &minor)) 789 goto out; 790 791 dev = MKDEV(major, minor); 792 793 if (strict_strtoull(s[1], 10, &temp)) 794 goto out; 795 796 /* For rule removal, do not check for device presence. 
*/ 797 if (temp) { 798 disk = get_gendisk(dev, &part); 799 if (!disk || part) { 800 ret = -ENODEV; 801 goto out; 802 } 803 } 804 805 newpn->dev = dev; 806 807 switch (plid) { 808 case BLKIO_POLICY_PROP: 809 if ((temp < BLKIO_WEIGHT_MIN && temp > 0) || 810 temp > BLKIO_WEIGHT_MAX) 811 goto out; 812 813 newpn->plid = plid; 814 newpn->fileid = fileid; 815 newpn->val.weight = temp; 816 break; 817 case BLKIO_POLICY_THROTL: 818 switch(fileid) { 819 case BLKIO_THROTL_read_bps_device: 820 case BLKIO_THROTL_write_bps_device: 821 newpn->plid = plid; 822 newpn->fileid = fileid; 823 newpn->val.bps = temp; 824 break; 825 case BLKIO_THROTL_read_iops_device: 826 case BLKIO_THROTL_write_iops_device: 827 if (temp > THROTL_IOPS_MAX) 828 goto out; 829 830 newpn->plid = plid; 831 newpn->fileid = fileid; 832 newpn->val.iops = (unsigned int)temp; 833 break; 834 } 835 break; 836 default: 837 BUG(); 838 } 839 ret = 0; 840out: 841 put_disk(disk); 842 return ret; 843} 844 845unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, 846 dev_t dev) 847{ 848 struct blkio_policy_node *pn; 849 unsigned long flags; 850 unsigned int weight; 851 852 spin_lock_irqsave(&blkcg->lock, flags); 853 854 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_PROP, 855 BLKIO_PROP_weight_device); 856 if (pn) 857 weight = pn->val.weight; 858 else 859 weight = blkcg->weight; 860 861 spin_unlock_irqrestore(&blkcg->lock, flags); 862 863 return weight; 864} 865EXPORT_SYMBOL_GPL(blkcg_get_weight); 866 867uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev) 868{ 869 struct blkio_policy_node *pn; 870 unsigned long flags; 871 uint64_t bps = -1; 872 873 spin_lock_irqsave(&blkcg->lock, flags); 874 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, 875 BLKIO_THROTL_read_bps_device); 876 if (pn) 877 bps = pn->val.bps; 878 spin_unlock_irqrestore(&blkcg->lock, flags); 879 880 return bps; 881} 882 883uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev) 884{ 885 struct blkio_policy_node *pn; 886 unsigned long flags; 887 uint64_t bps = -1; 888 889 spin_lock_irqsave(&blkcg->lock, flags); 890 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, 891 BLKIO_THROTL_write_bps_device); 892 if (pn) 893 bps = pn->val.bps; 894 spin_unlock_irqrestore(&blkcg->lock, flags); 895 896 return bps; 897} 898 899unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, dev_t dev) 900{ 901 struct blkio_policy_node *pn; 902 unsigned long flags; 903 unsigned int iops = -1; 904 905 spin_lock_irqsave(&blkcg->lock, flags); 906 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, 907 BLKIO_THROTL_read_iops_device); 908 if (pn) 909 iops = pn->val.iops; 910 spin_unlock_irqrestore(&blkcg->lock, flags); 911 912 return iops; 913} 914 915unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, dev_t dev) 916{ 917 struct blkio_policy_node *pn; 918 unsigned long flags; 919 unsigned int iops = -1; 920 921 spin_lock_irqsave(&blkcg->lock, flags); 922 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, 923 BLKIO_THROTL_write_iops_device); 924 if (pn) 925 iops = pn->val.iops; 926 spin_unlock_irqrestore(&blkcg->lock, flags); 927 928 return iops; 929} 930 931/* Checks whether user asked for deleting a policy rule */ 932static bool blkio_delete_rule_command(struct blkio_policy_node *pn) 933{ 934 switch(pn->plid) { 935 case BLKIO_POLICY_PROP: 936 if (pn->val.weight == 0) 937 return 1; 938 break; 939 case BLKIO_POLICY_THROTL: 940 switch(pn->fileid) { 941 case BLKIO_THROTL_read_bps_device: 942 case 
BLKIO_THROTL_write_bps_device: 943 if (pn->val.bps == 0) 944 return 1; 945 break; 946 case BLKIO_THROTL_read_iops_device: 947 case BLKIO_THROTL_write_iops_device: 948 if (pn->val.iops == 0) 949 return 1; 950 } 951 break; 952 default: 953 BUG(); 954 } 955 956 return 0; 957} 958 959static void blkio_update_policy_rule(struct blkio_policy_node *oldpn, 960 struct blkio_policy_node *newpn) 961{ 962 switch(oldpn->plid) { 963 case BLKIO_POLICY_PROP: 964 oldpn->val.weight = newpn->val.weight; 965 break; 966 case BLKIO_POLICY_THROTL: 967 switch(newpn->fileid) { 968 case BLKIO_THROTL_read_bps_device: 969 case BLKIO_THROTL_write_bps_device: 970 oldpn->val.bps = newpn->val.bps; 971 break; 972 case BLKIO_THROTL_read_iops_device: 973 case BLKIO_THROTL_write_iops_device: 974 oldpn->val.iops = newpn->val.iops; 975 } 976 break; 977 default: 978 BUG(); 979 } 980} 981 982/* 983 * Some rules/values in blkg have changed. Propagate those to respective 984 * policies. 985 */ 986static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg, 987 struct blkio_group *blkg, struct blkio_policy_node *pn) 988{ 989 unsigned int weight, iops; 990 u64 bps; 991 992 switch(pn->plid) { 993 case BLKIO_POLICY_PROP: 994 weight = pn->val.weight ? pn->val.weight : 995 blkcg->weight; 996 blkio_update_group_weight(blkg, weight); 997 break; 998 case BLKIO_POLICY_THROTL: 999 switch(pn->fileid) { 1000 case BLKIO_THROTL_read_bps_device: 1001 case BLKIO_THROTL_write_bps_device: 1002 bps = pn->val.bps ? pn->val.bps : (-1); 1003 blkio_update_group_bps(blkg, bps, pn->fileid); 1004 break; 1005 case BLKIO_THROTL_read_iops_device: 1006 case BLKIO_THROTL_write_iops_device: 1007 iops = pn->val.iops ? pn->val.iops : (-1); 1008 blkio_update_group_iops(blkg, iops, pn->fileid); 1009 break; 1010 } 1011 break; 1012 default: 1013 BUG(); 1014 } 1015} 1016 1017/* 1018 * A policy node rule has been updated. Propagate this update to all the 1019 * block groups which might be affected by this update. 
1020 */ 1021static void blkio_update_policy_node_blkg(struct blkio_cgroup *blkcg, 1022 struct blkio_policy_node *pn) 1023{ 1024 struct blkio_group *blkg; 1025 struct hlist_node *n; 1026 1027 spin_lock(&blkio_list_lock); 1028 spin_lock_irq(&blkcg->lock); 1029 1030 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { 1031 if (pn->dev != blkg->dev || pn->plid != blkg->plid) 1032 continue; 1033 blkio_update_blkg_policy(blkcg, blkg, pn); 1034 } 1035 1036 spin_unlock_irq(&blkcg->lock); 1037 spin_unlock(&blkio_list_lock); 1038} 1039 1040static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft, 1041 const char *buffer) 1042{ 1043 int ret = 0; 1044 char *buf; 1045 struct blkio_policy_node *newpn, *pn; 1046 struct blkio_cgroup *blkcg; 1047 int keep_newpn = 0; 1048 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); 1049 int fileid = BLKIOFILE_ATTR(cft->private); 1050 1051 buf = kstrdup(buffer, GFP_KERNEL); 1052 if (!buf) 1053 return -ENOMEM; 1054 1055 newpn = kzalloc(sizeof(*newpn), GFP_KERNEL); 1056 if (!newpn) { 1057 ret = -ENOMEM; 1058 goto free_buf; 1059 } 1060 1061 ret = blkio_policy_parse_and_set(buf, newpn, plid, fileid); 1062 if (ret) 1063 goto free_newpn; 1064 1065 blkcg = cgroup_to_blkio_cgroup(cgrp); 1066 1067 spin_lock_irq(&blkcg->lock); 1068 1069 pn = blkio_policy_search_node(blkcg, newpn->dev, plid, fileid); 1070 if (!pn) { 1071 if (!blkio_delete_rule_command(newpn)) { 1072 blkio_policy_insert_node(blkcg, newpn); 1073 keep_newpn = 1; 1074 } 1075 spin_unlock_irq(&blkcg->lock); 1076 goto update_io_group; 1077 } 1078 1079 if (blkio_delete_rule_command(newpn)) { 1080 blkio_policy_delete_node(pn); 1081 kfree(pn); 1082 spin_unlock_irq(&blkcg->lock); 1083 goto update_io_group; 1084 } 1085 spin_unlock_irq(&blkcg->lock); 1086 1087 blkio_update_policy_rule(pn, newpn); 1088 1089update_io_group: 1090 blkio_update_policy_node_blkg(blkcg, newpn); 1091 1092free_newpn: 1093 if (!keep_newpn) 1094 kfree(newpn); 1095free_buf: 1096 kfree(buf); 1097 return ret; 1098} 1099 1100static void 1101blkio_print_policy_node(struct seq_file *m, struct blkio_policy_node *pn) 1102{ 1103 switch(pn->plid) { 1104 case BLKIO_POLICY_PROP: 1105 if (pn->fileid == BLKIO_PROP_weight_device) 1106 seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), 1107 MINOR(pn->dev), pn->val.weight); 1108 break; 1109 case BLKIO_POLICY_THROTL: 1110 switch(pn->fileid) { 1111 case BLKIO_THROTL_read_bps_device: 1112 case BLKIO_THROTL_write_bps_device: 1113 seq_printf(m, "%u:%u\t%llu\n", MAJOR(pn->dev), 1114 MINOR(pn->dev), pn->val.bps); 1115 break; 1116 case BLKIO_THROTL_read_iops_device: 1117 case BLKIO_THROTL_write_iops_device: 1118 seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), 1119 MINOR(pn->dev), pn->val.iops); 1120 break; 1121 } 1122 break; 1123 default: 1124 BUG(); 1125 } 1126} 1127 1128/* cgroup files which read their data from policy nodes end up here */ 1129static void blkio_read_policy_node_files(struct cftype *cft, 1130 struct blkio_cgroup *blkcg, struct seq_file *m) 1131{ 1132 struct blkio_policy_node *pn; 1133 1134 if (!list_empty(&blkcg->policy_list)) { 1135 spin_lock_irq(&blkcg->lock); 1136 list_for_each_entry(pn, &blkcg->policy_list, node) { 1137 if (!pn_matches_cftype(cft, pn)) 1138 continue; 1139 blkio_print_policy_node(m, pn); 1140 } 1141 spin_unlock_irq(&blkcg->lock); 1142 } 1143} 1144 1145static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft, 1146 struct seq_file *m) 1147{ 1148 struct blkio_cgroup *blkcg; 1149 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); 1150 int name = 
BLKIOFILE_ATTR(cft->private); 1151 1152 blkcg = cgroup_to_blkio_cgroup(cgrp); 1153 1154 switch(plid) { 1155 case BLKIO_POLICY_PROP: 1156 switch(name) { 1157 case BLKIO_PROP_weight_device: 1158 blkio_read_policy_node_files(cft, blkcg, m); 1159 return 0; 1160 default: 1161 BUG(); 1162 } 1163 break; 1164 case BLKIO_POLICY_THROTL: 1165 switch(name){ 1166 case BLKIO_THROTL_read_bps_device: 1167 case BLKIO_THROTL_write_bps_device: 1168 case BLKIO_THROTL_read_iops_device: 1169 case BLKIO_THROTL_write_iops_device: 1170 blkio_read_policy_node_files(cft, blkcg, m); 1171 return 0; 1172 default: 1173 BUG(); 1174 } 1175 break; 1176 default: 1177 BUG(); 1178 } 1179 1180 return 0; 1181} 1182 1183static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg, 1184 struct cftype *cft, struct cgroup_map_cb *cb, 1185 enum stat_type type, bool show_total, bool pcpu) 1186{ 1187 struct blkio_group *blkg; 1188 struct hlist_node *n; 1189 uint64_t cgroup_total = 0; 1190 1191 rcu_read_lock(); 1192 hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) { 1193 if (blkg->dev) { 1194 if (!cftype_blkg_same_policy(cft, blkg)) 1195 continue; 1196 if (pcpu) 1197 cgroup_total += blkio_get_stat_cpu(blkg, cb, 1198 blkg->dev, type); 1199 else { 1200 spin_lock_irq(&blkg->stats_lock); 1201 cgroup_total += blkio_get_stat(blkg, cb, 1202 blkg->dev, type); 1203 spin_unlock_irq(&blkg->stats_lock); 1204 } 1205 } 1206 } 1207 if (show_total) 1208 cb->fill(cb, "Total", cgroup_total); 1209 rcu_read_unlock(); 1210 return 0; 1211} 1212 1213/* All map kind of cgroup file get serviced by this function */ 1214static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft, 1215 struct cgroup_map_cb *cb) 1216{ 1217 struct blkio_cgroup *blkcg; 1218 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); 1219 int name = BLKIOFILE_ATTR(cft->private); 1220 1221 blkcg = cgroup_to_blkio_cgroup(cgrp); 1222 1223 switch(plid) { 1224 case BLKIO_POLICY_PROP: 1225 switch(name) { 1226 case BLKIO_PROP_time: 1227 return blkio_read_blkg_stats(blkcg, cft, cb, 1228 BLKIO_STAT_TIME, 0, 0); 1229 case BLKIO_PROP_sectors: 1230 return blkio_read_blkg_stats(blkcg, cft, cb, 1231 BLKIO_STAT_CPU_SECTORS, 0, 1); 1232 case BLKIO_PROP_io_service_bytes: 1233 return blkio_read_blkg_stats(blkcg, cft, cb, 1234 BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1); 1235 case BLKIO_PROP_io_serviced: 1236 return blkio_read_blkg_stats(blkcg, cft, cb, 1237 BLKIO_STAT_CPU_SERVICED, 1, 1); 1238 case BLKIO_PROP_io_service_time: 1239 return blkio_read_blkg_stats(blkcg, cft, cb, 1240 BLKIO_STAT_SERVICE_TIME, 1, 0); 1241 case BLKIO_PROP_io_wait_time: 1242 return blkio_read_blkg_stats(blkcg, cft, cb, 1243 BLKIO_STAT_WAIT_TIME, 1, 0); 1244 case BLKIO_PROP_io_merged: 1245 return blkio_read_blkg_stats(blkcg, cft, cb, 1246 BLKIO_STAT_CPU_MERGED, 1, 1); 1247 case BLKIO_PROP_io_queued: 1248 return blkio_read_blkg_stats(blkcg, cft, cb, 1249 BLKIO_STAT_QUEUED, 1, 0); 1250#ifdef CONFIG_DEBUG_BLK_CGROUP 1251 case BLKIO_PROP_unaccounted_time: 1252 return blkio_read_blkg_stats(blkcg, cft, cb, 1253 BLKIO_STAT_UNACCOUNTED_TIME, 0, 0); 1254 case BLKIO_PROP_dequeue: 1255 return blkio_read_blkg_stats(blkcg, cft, cb, 1256 BLKIO_STAT_DEQUEUE, 0, 0); 1257 case BLKIO_PROP_avg_queue_size: 1258 return blkio_read_blkg_stats(blkcg, cft, cb, 1259 BLKIO_STAT_AVG_QUEUE_SIZE, 0, 0); 1260 case BLKIO_PROP_group_wait_time: 1261 return blkio_read_blkg_stats(blkcg, cft, cb, 1262 BLKIO_STAT_GROUP_WAIT_TIME, 0, 0); 1263 case BLKIO_PROP_idle_time: 1264 return blkio_read_blkg_stats(blkcg, cft, cb, 1265 BLKIO_STAT_IDLE_TIME, 
0, 0); 1266 case BLKIO_PROP_empty_time: 1267 return blkio_read_blkg_stats(blkcg, cft, cb, 1268 BLKIO_STAT_EMPTY_TIME, 0, 0); 1269#endif 1270 default: 1271 BUG(); 1272 } 1273 break; 1274 case BLKIO_POLICY_THROTL: 1275 switch(name){ 1276 case BLKIO_THROTL_io_service_bytes: 1277 return blkio_read_blkg_stats(blkcg, cft, cb, 1278 BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1); 1279 case BLKIO_THROTL_io_serviced: 1280 return blkio_read_blkg_stats(blkcg, cft, cb, 1281 BLKIO_STAT_CPU_SERVICED, 1, 1); 1282 default: 1283 BUG(); 1284 } 1285 break; 1286 default: 1287 BUG(); 1288 } 1289 1290 return 0; 1291} 1292 1293static int blkio_weight_write(struct blkio_cgroup *blkcg, u64 val) 1294{ 1295 struct blkio_group *blkg; 1296 struct hlist_node *n; 1297 struct blkio_policy_node *pn; 1298 1299 if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX) 1300 return -EINVAL; 1301 1302 spin_lock(&blkio_list_lock); 1303 spin_lock_irq(&blkcg->lock); 1304 blkcg->weight = (unsigned int)val; 1305 1306 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { 1307 pn = blkio_policy_search_node(blkcg, blkg->dev, 1308 BLKIO_POLICY_PROP, BLKIO_PROP_weight_device); 1309 if (pn) 1310 continue; 1311 1312 blkio_update_group_weight(blkg, blkcg->weight); 1313 } 1314 spin_unlock_irq(&blkcg->lock); 1315 spin_unlock(&blkio_list_lock); 1316 return 0; 1317} 1318 1319static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) { 1320 struct blkio_cgroup *blkcg; 1321 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); 1322 int name = BLKIOFILE_ATTR(cft->private); 1323 1324 blkcg = cgroup_to_blkio_cgroup(cgrp); 1325 1326 switch(plid) { 1327 case BLKIO_POLICY_PROP: 1328 switch(name) { 1329 case BLKIO_PROP_weight: 1330 return (u64)blkcg->weight; 1331 } 1332 break; 1333 default: 1334 BUG(); 1335 } 1336 return 0; 1337} 1338 1339static int 1340blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) 1341{ 1342 struct blkio_cgroup *blkcg; 1343 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); 1344 int name = BLKIOFILE_ATTR(cft->private); 1345 1346 blkcg = cgroup_to_blkio_cgroup(cgrp); 1347 1348 switch(plid) { 1349 case BLKIO_POLICY_PROP: 1350 switch(name) { 1351 case BLKIO_PROP_weight: 1352 return blkio_weight_write(blkcg, val); 1353 } 1354 break; 1355 default: 1356 BUG(); 1357 } 1358 1359 return 0; 1360} 1361 1362struct cftype blkio_files[] = { 1363 { 1364 .name = "weight_device", 1365 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, 1366 BLKIO_PROP_weight_device), 1367 .read_seq_string = blkiocg_file_read, 1368 .write_string = blkiocg_file_write, 1369 .max_write_len = 256, 1370 }, 1371 { 1372 .name = "weight", 1373 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, 1374 BLKIO_PROP_weight), 1375 .read_u64 = blkiocg_file_read_u64, 1376 .write_u64 = blkiocg_file_write_u64, 1377 }, 1378 { 1379 .name = "time", 1380 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, 1381 BLKIO_PROP_time), 1382 .read_map = blkiocg_file_read_map, 1383 }, 1384 { 1385 .name = "sectors", 1386 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, 1387 BLKIO_PROP_sectors), 1388 .read_map = blkiocg_file_read_map, 1389 }, 1390 { 1391 .name = "io_service_bytes", 1392 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, 1393 BLKIO_PROP_io_service_bytes), 1394 .read_map = blkiocg_file_read_map, 1395 }, 1396 { 1397 .name = "io_serviced", 1398 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, 1399 BLKIO_PROP_io_serviced), 1400 .read_map = blkiocg_file_read_map, 1401 }, 1402 { 1403 .name = "io_service_time", 1404 .private = 
BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, 1405 BLKIO_PROP_io_service_time), 1406 .read_map = blkiocg_file_read_map, 1407 }, 1408 { 1409 .name = "io_wait_time", 1410 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, 1411 BLKIO_PROP_io_wait_time), 1412 .read_map = blkiocg_file_read_map, 1413 }, 1414 { 1415 .name = "io_merged", 1416 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, 1417 BLKIO_PROP_io_merged), 1418 .read_map = blkiocg_file_read_map, 1419 }, 1420 { 1421 .name = "io_queued", 1422 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, 1423 BLKIO_PROP_io_queued), 1424 .read_map = blkiocg_file_read_map, 1425 }, 1426 { 1427 .name = "reset_stats", 1428 .write_u64 = blkiocg_reset_stats, 1429 }, 1430#ifdef CONFIG_BLK_DEV_THROTTLING 1431 { 1432 .name = "throttle.read_bps_device", 1433 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, 1434 BLKIO_THROTL_read_bps_device), 1435 .read_seq_string = blkiocg_file_read, 1436 .write_string = blkiocg_file_write, 1437 .max_write_len = 256, 1438 }, 1439 1440 { 1441 .name = "throttle.write_bps_device", 1442 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, 1443 BLKIO_THROTL_write_bps_device), 1444 .read_seq_string = blkiocg_file_read, 1445 .write_string = blkiocg_file_write, 1446 .max_write_len = 256, 1447 }, 1448 1449 { 1450 .name = "throttle.read_iops_device", 1451 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, 1452 BLKIO_THROTL_read_iops_device), 1453 .read_seq_string = blkiocg_file_read, 1454 .write_string = blkiocg_file_write, 1455 .max_write_len = 256, 1456 }, 1457 1458 { 1459 .name = "throttle.write_iops_device", 1460 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, 1461 BLKIO_THROTL_write_iops_device), 1462 .read_seq_string = blkiocg_file_read, 1463 .write_string = blkiocg_file_write, 1464 .max_write_len = 256, 1465 }, 1466 { 1467 .name = "throttle.io_service_bytes", 1468 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, 1469 BLKIO_THROTL_io_service_bytes), 1470 .read_map = blkiocg_file_read_map, 1471 }, 1472 { 1473 .name = "throttle.io_serviced", 1474 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, 1475 BLKIO_THROTL_io_serviced), 1476 .read_map = blkiocg_file_read_map, 1477 }, 1478#endif /* CONFIG_BLK_DEV_THROTTLING */ 1479 1480#ifdef CONFIG_DEBUG_BLK_CGROUP 1481 { 1482 .name = "avg_queue_size", 1483 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, 1484 BLKIO_PROP_avg_queue_size), 1485 .read_map = blkiocg_file_read_map, 1486 }, 1487 { 1488 .name = "group_wait_time", 1489 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, 1490 BLKIO_PROP_group_wait_time), 1491 .read_map = blkiocg_file_read_map, 1492 }, 1493 { 1494 .name = "idle_time", 1495 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, 1496 BLKIO_PROP_idle_time), 1497 .read_map = blkiocg_file_read_map, 1498 }, 1499 { 1500 .name = "empty_time", 1501 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, 1502 BLKIO_PROP_empty_time), 1503 .read_map = blkiocg_file_read_map, 1504 }, 1505 { 1506 .name = "dequeue", 1507 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, 1508 BLKIO_PROP_dequeue), 1509 .read_map = blkiocg_file_read_map, 1510 }, 1511 { 1512 .name = "unaccounted_time", 1513 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, 1514 BLKIO_PROP_unaccounted_time), 1515 .read_map = blkiocg_file_read_map, 1516 }, 1517#endif 1518}; 1519 1520static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup) 1521{ 1522 return cgroup_add_files(cgroup, subsys, blkio_files, 1523 ARRAY_SIZE(blkio_files)); 1524} 1525 1526static void blkiocg_destroy(struct cgroup *cgroup) 1527{ 1528 struct blkio_cgroup *blkcg = 
cgroup_to_blkio_cgroup(cgroup);
	unsigned long flags;
	struct blkio_group *blkg;
	void *key;
	struct blkio_policy_type *blkiop;
	struct blkio_policy_node *pn, *pntmp;

	rcu_read_lock();
	do {
		spin_lock_irqsave(&blkcg->lock, flags);

		if (hlist_empty(&blkcg->blkg_list)) {
			spin_unlock_irqrestore(&blkcg->lock, flags);
			break;
		}

		blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group,
				   blkcg_node);
		key = rcu_dereference(blkg->key);
		__blkiocg_del_blkio_group(blkg);

		spin_unlock_irqrestore(&blkcg->lock, flags);

		/*
		 * This blkio_group is being unlinked as associated cgroup is
		 * going away. Let all the IO controlling policies know about
		 * this event.
		 */
		spin_lock(&blkio_list_lock);
		list_for_each_entry(blkiop, &blkio_list, list) {
			if (blkiop->plid != blkg->plid)
				continue;
			blkiop->ops.blkio_unlink_group_fn(key, blkg);
		}
		spin_unlock(&blkio_list_lock);
	} while (1);

	list_for_each_entry_safe(pn, pntmp, &blkcg->policy_list, node) {
		blkio_policy_delete_node(pn);
		kfree(pn);
	}

	free_css_id(&blkio_subsys, &blkcg->css);
	rcu_read_unlock();
	if (blkcg != &blkio_root_cgroup)
		kfree(blkcg);
}

static struct cgroup_subsys_state *blkiocg_create(struct cgroup *cgroup)
{
	struct blkio_cgroup *blkcg;
	struct cgroup *parent = cgroup->parent;

	if (!parent) {
		blkcg = &blkio_root_cgroup;
		goto done;
	}

	blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
	if (!blkcg)
		return ERR_PTR(-ENOMEM);

	blkcg->weight = BLKIO_WEIGHT_DEFAULT;
done:
	spin_lock_init(&blkcg->lock);
	INIT_HLIST_HEAD(&blkcg->blkg_list);

	INIT_LIST_HEAD(&blkcg->policy_list);
	return &blkcg->css;
}

/*
 * We cannot support shared io contexts, as we have no means to support
 * two tasks with the same ioc in two different groups without major rework
 * of the main cic data structures. For now we allow a task to change
 * its cgroup only if it's the only owner of its ioc.
 */
static int blkiocg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct io_context *ioc;
	int ret = 0;

	/* task_lock() is needed to avoid races with exit_io_context() */
	cgroup_taskset_for_each(task, cgrp, tset) {
		task_lock(task);
		ioc = task->io_context;
		if (ioc && atomic_read(&ioc->nr_tasks) > 1)
			ret = -EINVAL;
		task_unlock(task);
		if (ret)
			break;
	}
	return ret;
}

static void blkiocg_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct io_context *ioc;

	cgroup_taskset_for_each(task, cgrp, tset) {
		/* we don't lose anything even if ioc allocation fails */
		ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
		if (ioc) {
			ioc_cgroup_changed(ioc);
			put_io_context(ioc);
		}
	}
}

struct cgroup_subsys blkio_subsys = {
	.name = "blkio",
	.create = blkiocg_create,
	.can_attach = blkiocg_can_attach,
	.attach = blkiocg_attach,
	.destroy = blkiocg_destroy,
	.populate = blkiocg_populate,
#ifdef CONFIG_BLK_CGROUP
	/* note: blkio_subsys_id is otherwise defined in blk-cgroup.h */
	.subsys_id = blkio_subsys_id,
#endif
	.use_id = 1,
	.module = THIS_MODULE,
};
EXPORT_SYMBOL_GPL(blkio_subsys);

void blkio_policy_register(struct blkio_policy_type *blkiop)
{
	spin_lock(&blkio_list_lock);
	list_add_tail(&blkiop->list, &blkio_list);
	spin_unlock(&blkio_list_lock);
}
EXPORT_SYMBOL_GPL(blkio_policy_register);

void blkio_policy_unregister(struct blkio_policy_type *blkiop)
{
	spin_lock(&blkio_list_lock);
	list_del_init(&blkiop->list);
	spin_unlock(&blkio_list_lock);
}
EXPORT_SYMBOL_GPL(blkio_policy_unregister);

static int __init init_cgroup_blkio(void)
{
	return cgroup_load_subsys(&blkio_subsys);
}

static void __exit exit_cgroup_blkio(void)
{
	cgroup_unload_subsys(&blkio_subsys);
}

module_init(init_cgroup_blkio);
module_exit(exit_cgroup_blkio);
MODULE_LICENSE("GPL");
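
The policy registration interface above is easiest to see from the consumer side. Below is a minimal, illustrative sketch (not part of blk-cgroup.c) of how a hypothetical proportional-weight policy could hook into this framework. The structure and callback names (struct blkio_policy_type, .plid, .ops.blkio_unlink_group_fn, .ops.blkio_update_group_weight_fn, BLKIO_POLICY_PROP, blkio_policy_register()/blkio_policy_unregister()) follow their use in the code above; everything prefixed demo_ is invented for illustration, and the callback signatures are inferred from the call sites in blkio_update_group_weight() and blkiocg_destroy().

/* Illustrative sketch only; demo_* names are hypothetical. */
static void demo_unlink_group(void *key, struct blkio_group *blkg)
{
	/*
	 * @key is whatever the policy passed to blkiocg_add_blkio_group()
	 * (typically its per-queue private data); tear down the policy-side
	 * state for this group here.
	 */
}

static void demo_update_weight(void *key, struct blkio_group *blkg,
			       unsigned int weight)
{
	/* React to writes to blkio.weight / blkio.weight_device. */
}

static struct blkio_policy_type demo_policy = {
	.ops = {
		.blkio_unlink_group_fn		= demo_unlink_group,
		.blkio_update_group_weight_fn	= demo_update_weight,
	},
	.plid	= BLKIO_POLICY_PROP,
};

static int __init demo_init(void)
{
	blkio_policy_register(&demo_policy);
	return 0;
}

static void __exit demo_exit(void)
{
	blkio_policy_unregister(&demo_policy);
}

A real policy would additionally allocate one struct blkio_group per cgroup/device pair, call blkio_alloc_blkg_stats() and blkiocg_add_blkio_group() with its queue pointer as the lookup key, and later find the group again via blkiocg_lookup_group() under rcu_read_lock(), which is what the exported helpers above are provided for.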