1/* 2 * block queue tracing parse application 3 * 4 * Copyright (C) 2005 Jens Axboe <axboe@suse.de> 5 * Copyright (C) 2006 Jens Axboe <axboe@kernel.dk> 6 * 7 * This program is free software; you can redistribute it and/or modify 8 * it under the terms of the GNU General Public License as published by 9 * the Free Software Foundation; either version 2 of the License, or 10 * (at your option) any later version. 11 * 12 * This program is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 * GNU General Public License for more details. 16 * 17 * You should have received a copy of the GNU General Public License 18 * along with this program; if not, write to the Free Software 19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 20 * 21 */ 22#include <sys/types.h> 23#include <sys/stat.h> 24#include <unistd.h> 25#include <stdio.h> 26#include <fcntl.h> 27#include <stdlib.h> 28#include <string.h> 29#include <getopt.h> 30#include <errno.h> 31#include <signal.h> 32#include <locale.h> 33#include <libgen.h> 34 35#include "blktrace.h" 36#include "rbtree.h" 37#include "jhash.h" 38 39static char blkparse_version[] = "1.0.1"; 40 41struct skip_info { 42 unsigned long start, end; 43 struct skip_info *prev, *next; 44}; 45 46struct per_dev_info { 47 dev_t dev; 48 char *name; 49 50 int backwards; 51 unsigned long long events; 52 unsigned long long first_reported_time; 53 unsigned long long last_reported_time; 54 unsigned long long last_read_time; 55 struct io_stats io_stats; 56 unsigned long skips; 57 unsigned long long seq_skips; 58 unsigned int max_depth[2]; 59 unsigned int cur_depth[2]; 60 61 struct rb_root rb_track; 62 63 int nfiles; 64 int ncpus; 65 66 unsigned long *cpu_map; 67 unsigned int cpu_map_max; 68 69 struct per_cpu_info *cpus; 70}; 71 72/* 73 * some duplicated effort here, we can unify this hash and the ppi hash later 74 */ 75struct process_pid_map { 76 pid_t pid; 77 char comm[16]; 78 struct process_pid_map *hash_next, *list_next; 79}; 80 81#define PPM_HASH_SHIFT (8) 82#define PPM_HASH_SIZE (1 << PPM_HASH_SHIFT) 83#define PPM_HASH_MASK (PPM_HASH_SIZE - 1) 84static struct process_pid_map *ppm_hash_table[PPM_HASH_SIZE]; 85 86struct per_process_info { 87 struct process_pid_map *ppm; 88 struct io_stats io_stats; 89 struct per_process_info *hash_next, *list_next; 90 int more_than_one; 91 92 /* 93 * individual io stats 94 */ 95 unsigned long long longest_allocation_wait[2]; 96 unsigned long long longest_dispatch_wait[2]; 97 unsigned long long longest_completion_wait[2]; 98}; 99 100#define PPI_HASH_SHIFT (8) 101#define PPI_HASH_SIZE (1 << PPI_HASH_SHIFT) 102#define PPI_HASH_MASK (PPI_HASH_SIZE - 1) 103static struct per_process_info *ppi_hash_table[PPI_HASH_SIZE]; 104static struct per_process_info *ppi_list; 105static int ppi_list_entries; 106 107static struct option l_opts[] = { 108 { 109 .name = "act-mask", 110 .has_arg = required_argument, 111 .flag = NULL, 112 .val = 'a' 113 }, 114 { 115 .name = "set-mask", 116 .has_arg = required_argument, 117 .flag = NULL, 118 .val = 'A' 119 }, 120 { 121 .name = "batch", 122 .has_arg = required_argument, 123 .flag = NULL, 124 .val = 'b' 125 }, 126 { 127 .name = "input-directory", 128 .has_arg = required_argument, 129 .flag = NULL, 130 .val = 'D' 131 }, 132 { 133 .name = "dump-binary", 134 .has_arg = required_argument, 135 .flag = NULL, 136 .val = 'd' 137 }, 138 { 139 .name = "format", 140 .has_arg = required_argument, 141 .flag = NULL, 142 .val = 'f' 143 }, 144 { 145 .name = "format-spec", 146 .has_arg = required_argument, 147 .flag = NULL, 148 .val = 'F' 149 }, 150 { 151 .name = "hash-by-name", 152 .has_arg = no_argument, 153 .flag = NULL, 154 .val = 'h' 155 }, 156 { 157 .name = "input", 158 .has_arg = required_argument, 159 .flag = NULL, 160 .val = 'i' 161 }, 162 { 163 .name = "no-msgs", 164 .has_arg = no_argument, 165 .flag = NULL, 166 .val = 'M' 167 }, 168 { 169 .name = "output", 170 .has_arg = required_argument, 171 .flag = NULL, 172 .val = 'o' 173 }, 174 { 175 .name = "no-text-output", 176 .has_arg = no_argument, 177 .flag = NULL, 178 .val = 'O' 179 }, 180 { 181 .name = "quiet", 182 .has_arg = no_argument, 183 .flag = NULL, 184 .val = 'q' 185 }, 186 { 187 .name = "per-program-stats", 188 .has_arg = no_argument, 189 .flag = NULL, 190 .val = 's' 191 }, 192 { 193 .name = "track-ios", 194 .has_arg = no_argument, 195 .flag = NULL, 196 .val = 't' 197 }, 198 { 199 .name = "stopwatch", 200 .has_arg = required_argument, 201 .flag = NULL, 202 .val = 'w' 203 }, 204 { 205 .name = "verbose", 206 .has_arg = no_argument, 207 .flag = NULL, 208 .val = 'v' 209 }, 210 { 211 .name = "version", 212 .has_arg = no_argument, 213 .flag = NULL, 214 .val = 'V' 215 }, 216 { 217 .name = NULL, 218 } 219}; 220 221/* 222 * for sorting the displayed output 223 */ 224struct trace { 225 struct blk_io_trace *bit; 226 struct rb_node rb_node; 227 struct trace *next; 228 unsigned long read_sequence; 229}; 230 231static struct rb_root rb_sort_root; 232static unsigned long rb_sort_entries; 233 234static struct trace *trace_list; 235 236/* 237 * allocation cache 238 */ 239static struct blk_io_trace *bit_alloc_list; 240static struct trace *t_alloc_list; 241 242/* 243 * for tracking individual ios 244 */ 245struct io_track { 246 struct rb_node rb_node; 247 248 struct process_pid_map *ppm; 249 __u64 sector; 250 unsigned long long allocation_time; 251 unsigned long long queue_time; 252 unsigned long long dispatch_time; 253 unsigned long long completion_time; 254}; 255 256static int ndevices; 257static struct per_dev_info *devices; 258static char *get_dev_name(struct per_dev_info *, char *, int); 259static int trace_rb_insert_last(struct per_dev_info *, struct trace *); 260 261FILE *ofp = NULL; 262static char *output_name; 263static char *input_dir; 264 265static unsigned long long genesis_time; 266static unsigned long long last_allowed_time; 267static unsigned long long stopwatch_start; /* start from zero by default */ 268static unsigned long long stopwatch_end = -1ULL; /* "infinity" */ 269static unsigned long read_sequence; 270 271static int per_process_stats; 272static int per_device_and_cpu_stats = 1; 273static int track_ios; 274static int ppi_hash_by_pid = 1; 275static int verbose; 276static unsigned int act_mask = -1U; 277static int stats_printed; 278static int bin_output_msgs = 1; 279int data_is_native = -1; 280 281static FILE *dump_fp; 282static char *dump_binary; 283 284static unsigned int t_alloc_cache; 285static unsigned int bit_alloc_cache; 286 287#define RB_BATCH_DEFAULT (512) 288static unsigned int rb_batch = RB_BATCH_DEFAULT; 289 290static int pipeline; 291static char *pipename; 292 293static int text_output = 1; 294 295#define is_done() (*(volatile int *)(&done)) 296static volatile int done; 297 298struct timespec abs_start_time; 299static unsigned long long start_timestamp; 300 301static int have_drv_data = 0; 302 303#define JHASH_RANDOM (0x3af5f2ee) 304 305#define CPUS_PER_LONG (8 * sizeof(unsigned long)) 306#define CPU_IDX(cpu) ((cpu) / CPUS_PER_LONG) 307#define CPU_BIT(cpu) ((cpu) & (CPUS_PER_LONG - 1)) 308 309static void output_binary(void *buf, int len) 310{ 311 if (dump_binary) { 312 size_t n = fwrite(buf, len, 1, dump_fp); 313 if (n != 1) { 314 perror(dump_binary); 315 fclose(dump_fp); 316 dump_binary = NULL; 317 } 318 } 319} 320 321static void resize_cpu_info(struct per_dev_info *pdi, int cpu) 322{ 323 struct per_cpu_info *cpus = pdi->cpus; 324 int ncpus = pdi->ncpus; 325 int new_count = cpu + 1; 326 int new_space, size; 327 char *new_start; 328 329 size = new_count * sizeof(struct per_cpu_info); 330 cpus = realloc(cpus, size); 331 if (!cpus) { 332 char name[20]; 333 fprintf(stderr, "Out of memory, CPU info for device %s (%d)\n", 334 get_dev_name(pdi, name, sizeof(name)), size); 335 exit(1); 336 } 337 338 new_start = (char *)cpus + (ncpus * sizeof(struct per_cpu_info)); 339 new_space = (new_count - ncpus) * sizeof(struct per_cpu_info); 340 memset(new_start, 0, new_space); 341 342 pdi->ncpus = new_count; 343 pdi->cpus = cpus; 344 345 for (new_count = 0; new_count < pdi->ncpus; new_count++) { 346 struct per_cpu_info *pci = &pdi->cpus[new_count]; 347 348 if (!pci->fd) { 349 pci->fd = -1; 350 memset(&pci->rb_last, 0, sizeof(pci->rb_last)); 351 pci->rb_last_entries = 0; 352 pci->last_sequence = -1; 353 } 354 } 355} 356 357static struct per_cpu_info *get_cpu_info(struct per_dev_info *pdi, int cpu) 358{ 359 struct per_cpu_info *pci; 360 361 if (cpu >= pdi->ncpus) 362 resize_cpu_info(pdi, cpu); 363 364 pci = &pdi->cpus[cpu]; 365 pci->cpu = cpu; 366 return pci; 367} 368 369 370static int resize_devices(char *name) 371{ 372 int size = (ndevices + 1) * sizeof(struct per_dev_info); 373 374 devices = realloc(devices, size); 375 if (!devices) { 376 fprintf(stderr, "Out of memory, device %s (%d)\n", name, size); 377 return 1; 378 } 379 memset(&devices[ndevices], 0, sizeof(struct per_dev_info)); 380 devices[ndevices].name = name; 381 ndevices++; 382 return 0; 383} 384 385static struct per_dev_info *get_dev_info(dev_t dev) 386{ 387 struct per_dev_info *pdi; 388 int i; 389 390 for (i = 0; i < ndevices; i++) { 391 if (!devices[i].dev) 392 devices[i].dev = dev; 393 if (devices[i].dev == dev) 394 return &devices[i]; 395 } 396 397 if (resize_devices(NULL)) 398 return NULL; 399 400 pdi = &devices[ndevices - 1]; 401 pdi->dev = dev; 402 pdi->first_reported_time = 0; 403 pdi->last_read_time = 0; 404 405 return pdi; 406} 407 408static void insert_skip(struct per_cpu_info *pci, unsigned long start, 409 unsigned long end) 410{ 411 struct skip_info *sip; 412 413 for (sip = pci->skips_tail; sip != NULL; sip = sip->prev) { 414 if (end == (sip->start - 1)) { 415 sip->start = start; 416 return; 417 } else if (start == (sip->end + 1)) { 418 sip->end = end; 419 return; 420 } 421 } 422 423 sip = malloc(sizeof(struct skip_info)); 424 sip->start = start; 425 sip->end = end; 426 sip->prev = sip->next = NULL; 427 if (pci->skips_tail == NULL) 428 pci->skips_head = pci->skips_tail = sip; 429 else { 430 sip->prev = pci->skips_tail; 431 pci->skips_tail->next = sip; 432 pci->skips_tail = sip; 433 } 434} 435 436static void remove_sip(struct per_cpu_info *pci, struct skip_info *sip) 437{ 438 if (sip->prev == NULL) { 439 if (sip->next == NULL) 440 pci->skips_head = pci->skips_tail = NULL; 441 else { 442 pci->skips_head = sip->next; 443 sip->next->prev = NULL; 444 } 445 } else if (sip->next == NULL) { 446 pci->skips_tail = sip->prev; 447 sip->prev->next = NULL; 448 } else { 449 sip->prev->next = sip->next; 450 sip->next->prev = sip->prev; 451 } 452 453 sip->prev = sip->next = NULL; 454 free(sip); 455} 456 457#define IN_SKIP(sip,seq) (((sip)->start <= (seq)) && ((seq) <= sip->end)) 458static int check_current_skips(struct per_cpu_info *pci, unsigned long seq) 459{ 460 struct skip_info *sip; 461 462 for (sip = pci->skips_tail; sip != NULL; sip = sip->prev) { 463 if (IN_SKIP(sip, seq)) { 464 if (sip->start == seq) { 465 if (sip->end == seq) 466 remove_sip(pci, sip); 467 else 468 sip->start += 1; 469 } else if (sip->end == seq) 470 sip->end -= 1; 471 else { 472 sip->end = seq - 1; 473 insert_skip(pci, seq + 1, sip->end); 474 } 475 return 1; 476 } 477 } 478 479 return 0; 480} 481 482static void collect_pdi_skips(struct per_dev_info *pdi) 483{ 484 struct skip_info *sip; 485 int cpu; 486 487 pdi->skips = 0; 488 pdi->seq_skips = 0; 489 490 for (cpu = 0; cpu < pdi->ncpus; cpu++) { 491 struct per_cpu_info *pci = &pdi->cpus[cpu]; 492 493 for (sip = pci->skips_head; sip != NULL; sip = sip->next) { 494 pdi->skips++; 495 pdi->seq_skips += (sip->end - sip->start + 1); 496 if (verbose) 497 fprintf(stderr,"(%d,%d): skipping %lu -> %lu\n", 498 MAJOR(pdi->dev), MINOR(pdi->dev), 499 sip->start, sip->end); 500 } 501 } 502} 503 504static void cpu_mark_online(struct per_dev_info *pdi, unsigned int cpu) 505{ 506 if (cpu >= pdi->cpu_map_max || !pdi->cpu_map) { 507 int new_max = (cpu + CPUS_PER_LONG) & ~(CPUS_PER_LONG - 1); 508 unsigned long *map = malloc(new_max / sizeof(long)); 509 510 memset(map, 0, new_max / sizeof(long)); 511 512 if (pdi->cpu_map) { 513 memcpy(map, pdi->cpu_map, pdi->cpu_map_max / sizeof(long)); 514 free(pdi->cpu_map); 515 } 516 517 pdi->cpu_map = map; 518 pdi->cpu_map_max = new_max; 519 } 520 521 pdi->cpu_map[CPU_IDX(cpu)] |= (1UL << CPU_BIT(cpu)); 522} 523 524static inline void cpu_mark_offline(struct per_dev_info *pdi, int cpu) 525{ 526 pdi->cpu_map[CPU_IDX(cpu)] &= ~(1UL << CPU_BIT(cpu)); 527} 528 529static inline int cpu_is_online(struct per_dev_info *pdi, int cpu) 530{ 531 return (pdi->cpu_map[CPU_IDX(cpu)] & (1UL << CPU_BIT(cpu))) != 0; 532} 533 534static inline int ppm_hash_pid(pid_t pid) 535{ 536 return jhash_1word(pid, JHASH_RANDOM) & PPM_HASH_MASK; 537} 538 539static struct process_pid_map *find_ppm(pid_t pid) 540{ 541 const int hash_idx = ppm_hash_pid(pid); 542 struct process_pid_map *ppm; 543 544 ppm = ppm_hash_table[hash_idx]; 545 while (ppm) { 546 if (ppm->pid == pid) 547 return ppm; 548 549 ppm = ppm->hash_next; 550 } 551 552 return NULL; 553} 554 555static struct process_pid_map *add_ppm_hash(pid_t pid, const char *name) 556{ 557 const int hash_idx = ppm_hash_pid(pid); 558 struct process_pid_map *ppm; 559 560 ppm = find_ppm(pid); 561 if (!ppm) { 562 ppm = malloc(sizeof(*ppm)); 563 memset(ppm, 0, sizeof(*ppm)); 564 ppm->pid = pid; 565 strcpy(ppm->comm, name); 566 ppm->hash_next = ppm_hash_table[hash_idx]; 567 ppm_hash_table[hash_idx] = ppm; 568 } 569 570 return ppm; 571} 572 573static void handle_notify(struct blk_io_trace *bit) 574{ 575 void *payload = (caddr_t) bit + sizeof(*bit); 576 __u32 two32[2]; 577 578 switch (bit->action) { 579 case BLK_TN_PROCESS: 580 add_ppm_hash(bit->pid, payload); 581 break; 582 583 case BLK_TN_TIMESTAMP: 584 if (bit->pdu_len != sizeof(two32)) 585 return; 586 memcpy(two32, payload, sizeof(two32)); 587 if (!data_is_native) { 588 two32[0] = be32_to_cpu(two32[0]); 589 two32[1] = be32_to_cpu(two32[1]); 590 } 591 start_timestamp = bit->time; 592 abs_start_time.tv_sec = two32[0]; 593 abs_start_time.tv_nsec = two32[1]; 594 if (abs_start_time.tv_nsec < 0) { 595 abs_start_time.tv_sec--; 596 abs_start_time.tv_nsec += 1000000000; 597 } 598 599 break; 600 601 case BLK_TN_MESSAGE: 602 if (bit->pdu_len > 0) { 603 char msg[bit->pdu_len+1]; 604 605 memcpy(msg, (char *)payload, bit->pdu_len); 606 msg[bit->pdu_len] = '\0'; 607 608 fprintf(ofp, 609 "%3d,%-3d %2d %8s %5d.%09lu %5u %2s %3s %s\n", 610 MAJOR(bit->device), MINOR(bit->device), 611 bit->cpu, "0", (int) SECONDS(bit->time), 612 (unsigned long) NANO_SECONDS(bit->time), 613 0, "m", "N", msg); 614 } 615 break; 616 617 default: 618 /* Ignore unknown notify events */ 619 ; 620 } 621} 622 623char *find_process_name(pid_t pid) 624{ 625 struct process_pid_map *ppm = find_ppm(pid); 626 627 if (ppm) 628 return ppm->comm; 629 630 return NULL; 631} 632 633static inline int ppi_hash_pid(pid_t pid) 634{ 635 return jhash_1word(pid, JHASH_RANDOM) & PPI_HASH_MASK; 636} 637 638static inline int ppi_hash_name(const char *name) 639{ 640 return jhash(name, 16, JHASH_RANDOM) & PPI_HASH_MASK; 641} 642 643static inline int ppi_hash(struct per_process_info *ppi) 644{ 645 struct process_pid_map *ppm = ppi->ppm; 646 647 if (ppi_hash_by_pid) 648 return ppi_hash_pid(ppm->pid); 649 650 return ppi_hash_name(ppm->comm); 651} 652 653static inline void add_ppi_to_hash(struct per_process_info *ppi) 654{ 655 const int hash_idx = ppi_hash(ppi); 656 657 ppi->hash_next = ppi_hash_table[hash_idx]; 658 ppi_hash_table[hash_idx] = ppi; 659} 660 661static inline void add_ppi_to_list(struct per_process_info *ppi) 662{ 663 ppi->list_next = ppi_list; 664 ppi_list = ppi; 665 ppi_list_entries++; 666} 667 668static struct per_process_info *find_ppi_by_name(char *name) 669{ 670 const int hash_idx = ppi_hash_name(name); 671 struct per_process_info *ppi; 672 673 ppi = ppi_hash_table[hash_idx]; 674 while (ppi) { 675 struct process_pid_map *ppm = ppi->ppm; 676 677 if (!strcmp(ppm->comm, name)) 678 return ppi; 679 680 ppi = ppi->hash_next; 681 } 682 683 return NULL; 684} 685 686static struct per_process_info *find_ppi_by_pid(pid_t pid) 687{ 688 const int hash_idx = ppi_hash_pid(pid); 689 struct per_process_info *ppi; 690 691 ppi = ppi_hash_table[hash_idx]; 692 while (ppi) { 693 struct process_pid_map *ppm = ppi->ppm; 694 695 if (ppm->pid == pid) 696 return ppi; 697 698 ppi = ppi->hash_next; 699 } 700 701 return NULL; 702} 703 704static struct per_process_info *find_ppi(pid_t pid) 705{ 706 struct per_process_info *ppi; 707 char *name; 708 709 if (ppi_hash_by_pid) 710 return find_ppi_by_pid(pid); 711 712 name = find_process_name(pid); 713 if (!name) 714 return NULL; 715 716 ppi = find_ppi_by_name(name); 717 if (ppi && ppi->ppm->pid != pid) 718 ppi->more_than_one = 1; 719 720 return ppi; 721} 722 723/* 724 * struct trace and blktrace allocation cache, we do potentially 725 * millions of mallocs for these structures while only using at most 726 * a few thousand at the time 727 */ 728static inline void t_free(struct trace *t) 729{ 730 if (t_alloc_cache < 1024) { 731 t->next = t_alloc_list; 732 t_alloc_list = t; 733 t_alloc_cache++; 734 } else 735 free(t); 736} 737 738static inline struct trace *t_alloc(void) 739{ 740 struct trace *t = t_alloc_list; 741 742 if (t) { 743 t_alloc_list = t->next; 744 t_alloc_cache--; 745 return t; 746 } 747 748 return malloc(sizeof(*t)); 749} 750 751static inline void bit_free(struct blk_io_trace *bit) 752{ 753 if (bit_alloc_cache < 1024 && !bit->pdu_len) { 754 /* 755 * abuse a 64-bit field for a next pointer for the free item 756 */ 757 bit->time = (__u64) (unsigned long) bit_alloc_list; 758 bit_alloc_list = (struct blk_io_trace *) bit; 759 bit_alloc_cache++; 760 } else 761 free(bit); 762} 763 764static inline struct blk_io_trace *bit_alloc(void) 765{ 766 struct blk_io_trace *bit = bit_alloc_list; 767 768 if (bit) { 769 bit_alloc_list = (struct blk_io_trace *) (unsigned long) \ 770 bit->time; 771 bit_alloc_cache--; 772 return bit; 773 } 774 775 return malloc(sizeof(*bit)); 776} 777 778static inline void __put_trace_last(struct per_dev_info *pdi, struct trace *t) 779{ 780 struct per_cpu_info *pci = get_cpu_info(pdi, t->bit->cpu); 781 782 rb_erase(&t->rb_node, &pci->rb_last); 783 pci->rb_last_entries--; 784 785 bit_free(t->bit); 786 t_free(t); 787} 788 789static void put_trace(struct per_dev_info *pdi, struct trace *t) 790{ 791 rb_erase(&t->rb_node, &rb_sort_root); 792 rb_sort_entries--; 793 794 trace_rb_insert_last(pdi, t); 795} 796 797static inline int trace_rb_insert(struct trace *t, struct rb_root *root) 798{ 799 struct rb_node **p = &root->rb_node; 800 struct rb_node *parent = NULL; 801 struct trace *__t; 802 803 while (*p) { 804 parent = *p; 805 806 __t = rb_entry(parent, struct trace, rb_node); 807 808 if (t->bit->time < __t->bit->time) 809 p = &(*p)->rb_left; 810 else if (t->bit->time > __t->bit->time) 811 p = &(*p)->rb_right; 812 else if (t->bit->device < __t->bit->device) 813 p = &(*p)->rb_left; 814 else if (t->bit->device > __t->bit->device) 815 p = &(*p)->rb_right; 816 else if (t->bit->sequence < __t->bit->sequence) 817 p = &(*p)->rb_left; 818 else /* >= sequence */ 819 p = &(*p)->rb_right; 820 } 821 822 rb_link_node(&t->rb_node, parent, p); 823 rb_insert_color(&t->rb_node, root); 824 return 0; 825} 826 827static inline int trace_rb_insert_sort(struct trace *t) 828{ 829 if (!trace_rb_insert(t, &rb_sort_root)) { 830 rb_sort_entries++; 831 return 0; 832 } 833 834 return 1; 835} 836 837static int trace_rb_insert_last(struct per_dev_info *pdi, struct trace *t) 838{ 839 struct per_cpu_info *pci = get_cpu_info(pdi, t->bit->cpu); 840 841 if (trace_rb_insert(t, &pci->rb_last)) 842 return 1; 843 844 pci->rb_last_entries++; 845 846 if (pci->rb_last_entries > rb_batch * pdi->nfiles) { 847 struct rb_node *n = rb_first(&pci->rb_last); 848 849 t = rb_entry(n, struct trace, rb_node); 850 __put_trace_last(pdi, t); 851 } 852 853 return 0; 854} 855 856static struct trace *trace_rb_find(dev_t device, unsigned long sequence, 857 struct rb_root *root, int order) 858{ 859 struct rb_node *n = root->rb_node; 860 struct rb_node *prev = NULL; 861 struct trace *__t; 862 863 while (n) { 864 __t = rb_entry(n, struct trace, rb_node); 865 prev = n; 866 867 if (device < __t->bit->device) 868 n = n->rb_left; 869 else if (device > __t->bit->device) 870 n = n->rb_right; 871 else if (sequence < __t->bit->sequence) 872 n = n->rb_left; 873 else if (sequence > __t->bit->sequence) 874 n = n->rb_right; 875 else 876 return __t; 877 } 878 879 /* 880 * hack - the list may not be sequence ordered because some 881 * events don't have sequence and time matched. so we end up 882 * being a little off in the rb lookup here, because we don't 883 * know the time we are looking for. compensate by browsing 884 * a little ahead from the last entry to find the match 885 */ 886 if (order && prev) { 887 int max = 5; 888 889 while (((n = rb_next(prev)) != NULL) && max--) { 890 __t = rb_entry(n, struct trace, rb_node); 891 892 if (__t->bit->device == device && 893 __t->bit->sequence == sequence) 894 return __t; 895 896 prev = n; 897 } 898 } 899 900 return NULL; 901} 902 903static inline struct trace *trace_rb_find_last(struct per_dev_info *pdi, 904 struct per_cpu_info *pci, 905 unsigned long seq) 906{ 907 return trace_rb_find(pdi->dev, seq, &pci->rb_last, 0); 908} 909 910static inline int track_rb_insert(struct per_dev_info *pdi,struct io_track *iot) 911{ 912 struct rb_node **p = &pdi->rb_track.rb_node; 913 struct rb_node *parent = NULL; 914 struct io_track *__iot; 915 916 while (*p) { 917 parent = *p; 918 __iot = rb_entry(parent, struct io_track, rb_node); 919 920 if (iot->sector < __iot->sector) 921 p = &(*p)->rb_left; 922 else if (iot->sector > __iot->sector) 923 p = &(*p)->rb_right; 924 else { 925 fprintf(stderr, 926 "sector alias (%Lu) on device %d,%d!\n", 927 (unsigned long long) iot->sector, 928 MAJOR(pdi->dev), MINOR(pdi->dev)); 929 return 1; 930 } 931 } 932 933 rb_link_node(&iot->rb_node, parent, p); 934 rb_insert_color(&iot->rb_node, &pdi->rb_track); 935 return 0; 936} 937 938static struct io_track *__find_track(struct per_dev_info *pdi, __u64 sector) 939{ 940 struct rb_node *n = pdi->rb_track.rb_node; 941 struct io_track *__iot; 942 943 while (n) { 944 __iot = rb_entry(n, struct io_track, rb_node); 945 946 if (sector < __iot->sector) 947 n = n->rb_left; 948 else if (sector > __iot->sector) 949 n = n->rb_right; 950 else 951 return __iot; 952 } 953 954 return NULL; 955} 956 957static struct io_track *find_track(struct per_dev_info *pdi, pid_t pid, 958 __u64 sector) 959{ 960 struct io_track *iot; 961 962 iot = __find_track(pdi, sector); 963 if (!iot) { 964 iot = malloc(sizeof(*iot)); 965 iot->ppm = find_ppm(pid); 966 if (!iot->ppm) 967 iot->ppm = add_ppm_hash(pid, "unknown"); 968 iot->sector = sector; 969 track_rb_insert(pdi, iot); 970 } 971 972 return iot; 973} 974 975static void log_track_frontmerge(struct per_dev_info *pdi, 976 struct blk_io_trace *t) 977{ 978 struct io_track *iot; 979 980 if (!track_ios) 981 return; 982 983 iot = __find_track(pdi, t->sector + t_sec(t)); 984 if (!iot) { 985 if (verbose) 986 fprintf(stderr, "merge not found for (%d,%d): %llu\n", 987 MAJOR(pdi->dev), MINOR(pdi->dev), 988 (unsigned long long) t->sector + t_sec(t)); 989 return; 990 } 991 992 rb_erase(&iot->rb_node, &pdi->rb_track); 993 iot->sector -= t_sec(t); 994 track_rb_insert(pdi, iot); 995} 996 997static void log_track_getrq(struct per_dev_info *pdi, struct blk_io_trace *t) 998{ 999 struct io_track *iot; 1000 1001 if (!track_ios) 1002 return; 1003 1004 iot = find_track(pdi, t->pid, t->sector); 1005 iot->allocation_time = t->time; 1006} 1007 1008static inline int is_remapper(struct per_dev_info *pdi) 1009{ 1010 int major = MAJOR(pdi->dev); 1011 1012 return (major == 253 || major == 9); 1013} 1014 1015/* 1016 * for md/dm setups, the interesting cycle is Q -> C. So track queueing 1017 * time here, as dispatch time 1018 */ 1019static void log_track_queue(struct per_dev_info *pdi, struct blk_io_trace *t) 1020{ 1021 struct io_track *iot; 1022 1023 if (!track_ios) 1024 return; 1025 if (!is_remapper(pdi)) 1026 return; 1027 1028 iot = find_track(pdi, t->pid, t->sector); 1029 iot->dispatch_time = t->time; 1030} 1031 1032/* 1033 * return time between rq allocation and insertion 1034 */ 1035static unsigned long long log_track_insert(struct per_dev_info *pdi, 1036 struct blk_io_trace *t) 1037{ 1038 unsigned long long elapsed; 1039 struct io_track *iot; 1040 1041 if (!track_ios) 1042 return -1; 1043 1044 iot = find_track(pdi, t->pid, t->sector); 1045 iot->queue_time = t->time; 1046 1047 if (!iot->allocation_time) 1048 return -1; 1049 1050 elapsed = iot->queue_time - iot->allocation_time; 1051 1052 if (per_process_stats) { 1053 struct per_process_info *ppi = find_ppi(iot->ppm->pid); 1054 int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0; 1055 1056 if (ppi && elapsed > ppi->longest_allocation_wait[w]) 1057 ppi->longest_allocation_wait[w] = elapsed; 1058 } 1059 1060 return elapsed; 1061} 1062 1063/* 1064 * return time between queue and issue 1065 */ 1066static unsigned long long log_track_issue(struct per_dev_info *pdi, 1067 struct blk_io_trace *t) 1068{ 1069 unsigned long long elapsed; 1070 struct io_track *iot; 1071 1072 if (!track_ios) 1073 return -1; 1074 if ((t->action & BLK_TC_ACT(BLK_TC_FS)) == 0) 1075 return -1; 1076 1077 iot = __find_track(pdi, t->sector); 1078 if (!iot) { 1079 if (verbose) 1080 fprintf(stderr, "issue not found for (%d,%d): %llu\n", 1081 MAJOR(pdi->dev), MINOR(pdi->dev), 1082 (unsigned long long) t->sector); 1083 return -1; 1084 } 1085 1086 iot->dispatch_time = t->time; 1087 elapsed = iot->dispatch_time - iot->queue_time; 1088 1089 if (per_process_stats) { 1090 struct per_process_info *ppi = find_ppi(iot->ppm->pid); 1091 int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0; 1092 1093 if (ppi && elapsed > ppi->longest_dispatch_wait[w]) 1094 ppi->longest_dispatch_wait[w] = elapsed; 1095 } 1096 1097 return elapsed; 1098} 1099 1100/* 1101 * return time between dispatch and complete 1102 */ 1103static unsigned long long log_track_complete(struct per_dev_info *pdi, 1104 struct blk_io_trace *t) 1105{ 1106 unsigned long long elapsed; 1107 struct io_track *iot; 1108 1109 if (!track_ios) 1110 return -1; 1111 1112 iot = __find_track(pdi, t->sector); 1113 if (!iot) { 1114 if (verbose) 1115 fprintf(stderr,"complete not found for (%d,%d): %llu\n", 1116 MAJOR(pdi->dev), MINOR(pdi->dev), 1117 (unsigned long long) t->sector); 1118 return -1; 1119 } 1120 1121 iot->completion_time = t->time; 1122 elapsed = iot->completion_time - iot->dispatch_time; 1123 1124 if (per_process_stats) { 1125 struct per_process_info *ppi = find_ppi(iot->ppm->pid); 1126 int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0; 1127 1128 if (ppi && elapsed > ppi->longest_completion_wait[w]) 1129 ppi->longest_completion_wait[w] = elapsed; 1130 } 1131 1132 /* 1133 * kill the trace, we don't need it after completion 1134 */ 1135 rb_erase(&iot->rb_node, &pdi->rb_track); 1136 free(iot); 1137 1138 return elapsed; 1139} 1140 1141 1142static struct io_stats *find_process_io_stats(pid_t pid) 1143{ 1144 struct per_process_info *ppi = find_ppi(pid); 1145 1146 if (!ppi) { 1147 ppi = malloc(sizeof(*ppi)); 1148 memset(ppi, 0, sizeof(*ppi)); 1149 ppi->ppm = find_ppm(pid); 1150 if (!ppi->ppm) 1151 ppi->ppm = add_ppm_hash(pid, "unknown"); 1152 add_ppi_to_hash(ppi); 1153 add_ppi_to_list(ppi); 1154 } 1155 1156 return &ppi->io_stats; 1157} 1158 1159static char *get_dev_name(struct per_dev_info *pdi, char *buffer, int size) 1160{ 1161 if (pdi->name) 1162 snprintf(buffer, size, "%s", pdi->name); 1163 else 1164 snprintf(buffer, size, "%d,%d",MAJOR(pdi->dev),MINOR(pdi->dev)); 1165 return buffer; 1166} 1167 1168static void check_time(struct per_dev_info *pdi, struct blk_io_trace *bit) 1169{ 1170 unsigned long long this = bit->time; 1171 unsigned long long last = pdi->last_reported_time; 1172 1173 pdi->backwards = (this < last) ? 'B' : ' '; 1174 pdi->last_reported_time = this; 1175} 1176 1177static inline void __account_m(struct io_stats *ios, struct blk_io_trace *t, 1178 int rw) 1179{ 1180 if (rw) { 1181 ios->mwrites++; 1182 ios->mwrite_kb += t_kb(t); 1183 } else { 1184 ios->mreads++; 1185 ios->mread_kb += t_kb(t); 1186 } 1187} 1188 1189static inline void account_m(struct blk_io_trace *t, struct per_cpu_info *pci, 1190 int rw) 1191{ 1192 __account_m(&pci->io_stats, t, rw); 1193 1194 if (per_process_stats) { 1195 struct io_stats *ios = find_process_io_stats(t->pid); 1196 1197 __account_m(ios, t, rw); 1198 } 1199} 1200 1201static inline void __account_pc_queue(struct io_stats *ios, 1202 struct blk_io_trace *t, int rw) 1203{ 1204 if (rw) { 1205 ios->qwrites_pc++; 1206 ios->qwrite_kb_pc += t_kb(t); 1207 } else { 1208 ios->qreads_pc++; 1209 ios->qread_kb += t_kb(t); 1210 } 1211} 1212 1213static inline void account_pc_queue(struct blk_io_trace *t, 1214 struct per_cpu_info *pci, int rw) 1215{ 1216 __account_pc_queue(&pci->io_stats, t, rw); 1217 1218 if (per_process_stats) { 1219 struct io_stats *ios = find_process_io_stats(t->pid); 1220 1221 __account_pc_queue(ios, t, rw); 1222 } 1223} 1224 1225static inline void __account_pc_issue(struct io_stats *ios, int rw, 1226 unsigned int bytes) 1227{ 1228 if (rw) { 1229 ios->iwrites_pc++; 1230 ios->iwrite_kb_pc += bytes >> 10; 1231 } else { 1232 ios->ireads_pc++; 1233 ios->iread_kb_pc += bytes >> 10; 1234 } 1235} 1236 1237static inline void account_pc_issue(struct blk_io_trace *t, 1238 struct per_cpu_info *pci, int rw) 1239{ 1240 __account_pc_issue(&pci->io_stats, rw, t->bytes); 1241 1242 if (per_process_stats) { 1243 struct io_stats *ios = find_process_io_stats(t->pid); 1244 1245 __account_pc_issue(ios, rw, t->bytes); 1246 } 1247} 1248 1249static inline void __account_pc_requeue(struct io_stats *ios, 1250 struct blk_io_trace *t, int rw) 1251{ 1252 if (rw) { 1253 ios->wrqueue_pc++; 1254 ios->iwrite_kb_pc -= t_kb(t); 1255 } else { 1256 ios->rrqueue_pc++; 1257 ios->iread_kb_pc -= t_kb(t); 1258 } 1259} 1260 1261static inline void account_pc_requeue(struct blk_io_trace *t, 1262 struct per_cpu_info *pci, int rw) 1263{ 1264 __account_pc_requeue(&pci->io_stats, t, rw); 1265 1266 if (per_process_stats) { 1267 struct io_stats *ios = find_process_io_stats(t->pid); 1268 1269 __account_pc_requeue(ios, t, rw); 1270 } 1271} 1272 1273static inline void __account_pc_c(struct io_stats *ios, int rw) 1274{ 1275 if (rw) 1276 ios->cwrites_pc++; 1277 else 1278 ios->creads_pc++; 1279} 1280 1281static inline void account_pc_c(struct blk_io_trace *t, 1282 struct per_cpu_info *pci, int rw) 1283{ 1284 __account_pc_c(&pci->io_stats, rw); 1285 1286 if (per_process_stats) { 1287 struct io_stats *ios = find_process_io_stats(t->pid); 1288 1289 __account_pc_c(ios, rw); 1290 } 1291} 1292 1293static inline void __account_queue(struct io_stats *ios, struct blk_io_trace *t, 1294 int rw) 1295{ 1296 if (rw) { 1297 ios->qwrites++; 1298 ios->qwrite_kb += t_kb(t); 1299 } else { 1300 ios->qreads++; 1301 ios->qread_kb += t_kb(t); 1302 } 1303} 1304 1305static inline void account_queue(struct blk_io_trace *t, 1306 struct per_cpu_info *pci, int rw) 1307{ 1308 __account_queue(&pci->io_stats, t, rw); 1309 1310 if (per_process_stats) { 1311 struct io_stats *ios = find_process_io_stats(t->pid); 1312 1313 __account_queue(ios, t, rw); 1314 } 1315} 1316 1317static inline void __account_c(struct io_stats *ios, int rw, int bytes) 1318{ 1319 if (rw) { 1320 ios->cwrites++; 1321 ios->cwrite_kb += bytes >> 10; 1322 } else { 1323 ios->creads++; 1324 ios->cread_kb += bytes >> 10; 1325 } 1326} 1327 1328static inline void account_c(struct blk_io_trace *t, struct per_cpu_info *pci, 1329 int rw, int bytes) 1330{ 1331 __account_c(&pci->io_stats, rw, bytes); 1332 1333 if (per_process_stats) { 1334 struct io_stats *ios = find_process_io_stats(t->pid); 1335 1336 __account_c(ios, rw, bytes); 1337 } 1338} 1339 1340static inline void __account_issue(struct io_stats *ios, int rw, 1341 unsigned int bytes) 1342{ 1343 if (rw) { 1344 ios->iwrites++; 1345 ios->iwrite_kb += bytes >> 10; 1346 } else { 1347 ios->ireads++; 1348 ios->iread_kb += bytes >> 10; 1349 } 1350} 1351 1352static inline void account_issue(struct blk_io_trace *t, 1353 struct per_cpu_info *pci, int rw) 1354{ 1355 __account_issue(&pci->io_stats, rw, t->bytes); 1356 1357 if (per_process_stats) { 1358 struct io_stats *ios = find_process_io_stats(t->pid); 1359 1360 __account_issue(ios, rw, t->bytes); 1361 } 1362} 1363 1364static inline void __account_unplug(struct io_stats *ios, int timer) 1365{ 1366 if (timer) 1367 ios->timer_unplugs++; 1368 else 1369 ios->io_unplugs++; 1370} 1371 1372static inline void account_unplug(struct blk_io_trace *t, 1373 struct per_cpu_info *pci, int timer) 1374{ 1375 __account_unplug(&pci->io_stats, timer); 1376 1377 if (per_process_stats) { 1378 struct io_stats *ios = find_process_io_stats(t->pid); 1379 1380 __account_unplug(ios, timer); 1381 } 1382} 1383 1384static inline void __account_requeue(struct io_stats *ios, 1385 struct blk_io_trace *t, int rw) 1386{ 1387 if (rw) { 1388 ios->wrqueue++; 1389 ios->iwrite_kb -= t_kb(t); 1390 } else { 1391 ios->rrqueue++; 1392 ios->iread_kb -= t_kb(t); 1393 } 1394} 1395 1396static inline void account_requeue(struct blk_io_trace *t, 1397 struct per_cpu_info *pci, int rw) 1398{ 1399 __account_requeue(&pci->io_stats, t, rw); 1400 1401 if (per_process_stats) { 1402 struct io_stats *ios = find_process_io_stats(t->pid); 1403 1404 __account_requeue(ios, t, rw); 1405 } 1406} 1407 1408static void log_complete(struct per_dev_info *pdi, struct per_cpu_info *pci, 1409 struct blk_io_trace *t, char *act) 1410{ 1411 process_fmt(act, pci, t, log_track_complete(pdi, t), 0, NULL); 1412} 1413 1414static void log_insert(struct per_dev_info *pdi, struct per_cpu_info *pci, 1415 struct blk_io_trace *t, char *act) 1416{ 1417 process_fmt(act, pci, t, log_track_insert(pdi, t), 0, NULL); 1418} 1419 1420static void log_queue(struct per_cpu_info *pci, struct blk_io_trace *t, 1421 char *act) 1422{ 1423 process_fmt(act, pci, t, -1, 0, NULL); 1424} 1425 1426static void log_issue(struct per_dev_info *pdi, struct per_cpu_info *pci, 1427 struct blk_io_trace *t, char *act) 1428{ 1429 process_fmt(act, pci, t, log_track_issue(pdi, t), 0, NULL); 1430} 1431 1432static void log_merge(struct per_dev_info *pdi, struct per_cpu_info *pci, 1433 struct blk_io_trace *t, char *act) 1434{ 1435 if (act[0] == 'F') 1436 log_track_frontmerge(pdi, t); 1437 1438 process_fmt(act, pci, t, -1ULL, 0, NULL); 1439} 1440 1441static void log_action(struct per_cpu_info *pci, struct blk_io_trace *t, 1442 char *act) 1443{ 1444 process_fmt(act, pci, t, -1ULL, 0, NULL); 1445} 1446 1447static void log_generic(struct per_cpu_info *pci, struct blk_io_trace *t, 1448 char *act) 1449{ 1450 process_fmt(act, pci, t, -1ULL, 0, NULL); 1451} 1452 1453static void log_unplug(struct per_cpu_info *pci, struct blk_io_trace *t, 1454 char *act) 1455{ 1456 process_fmt(act, pci, t, -1ULL, 0, NULL); 1457} 1458 1459static void log_split(struct per_cpu_info *pci, struct blk_io_trace *t, 1460 char *act) 1461{ 1462 process_fmt(act, pci, t, -1ULL, 0, NULL); 1463} 1464 1465static void log_pc(struct per_cpu_info *pci, struct blk_io_trace *t, char *act) 1466{ 1467 unsigned char *buf = (unsigned char *) t + sizeof(*t); 1468 1469 process_fmt(act, pci, t, -1ULL, t->pdu_len, buf); 1470} 1471 1472static void dump_trace_pc(struct blk_io_trace *t, struct per_dev_info *pdi, 1473 struct per_cpu_info *pci) 1474{ 1475 int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0; 1476 int act = t->action & 0xffff; 1477 1478 switch (act) { 1479 case __BLK_TA_QUEUE: 1480 log_generic(pci, t, "Q"); 1481 account_pc_queue(t, pci, w); 1482 break; 1483 case __BLK_TA_GETRQ: 1484 log_generic(pci, t, "G"); 1485 break; 1486 case __BLK_TA_SLEEPRQ: 1487 log_generic(pci, t, "S"); 1488 break; 1489 case __BLK_TA_REQUEUE: 1490 /* 1491 * can happen if we miss traces, don't let it go 1492 * below zero 1493 */ 1494 if (pdi->cur_depth[w]) 1495 pdi->cur_depth[w]--; 1496 account_pc_requeue(t, pci, w); 1497 log_generic(pci, t, "R"); 1498 break; 1499 case __BLK_TA_ISSUE: 1500 account_pc_issue(t, pci, w); 1501 pdi->cur_depth[w]++; 1502 if (pdi->cur_depth[w] > pdi->max_depth[w]) 1503 pdi->max_depth[w] = pdi->cur_depth[w]; 1504 log_pc(pci, t, "D"); 1505 break; 1506 case __BLK_TA_COMPLETE: 1507 if (pdi->cur_depth[w]) 1508 pdi->cur_depth[w]--; 1509 log_pc(pci, t, "C"); 1510 account_pc_c(t, pci, w); 1511 break; 1512 case __BLK_TA_INSERT: 1513 log_pc(pci, t, "I"); 1514 break; 1515 default: 1516 fprintf(stderr, "Bad pc action %x\n", act); 1517 break; 1518 } 1519} 1520 1521static void dump_trace_fs(struct blk_io_trace *t, struct per_dev_info *pdi, 1522 struct per_cpu_info *pci) 1523{ 1524 int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0; 1525 int act = t->action & 0xffff; 1526 1527 switch (act) { 1528 case __BLK_TA_QUEUE: 1529 log_track_queue(pdi, t); 1530 account_queue(t, pci, w); 1531 log_queue(pci, t, "Q"); 1532 break; 1533 case __BLK_TA_INSERT: 1534 log_insert(pdi, pci, t, "I"); 1535 break; 1536 case __BLK_TA_BACKMERGE: 1537 account_m(t, pci, w); 1538 log_merge(pdi, pci, t, "M"); 1539 break; 1540 case __BLK_TA_FRONTMERGE: 1541 account_m(t, pci, w); 1542 log_merge(pdi, pci, t, "F"); 1543 break; 1544 case __BLK_TA_GETRQ: 1545 log_track_getrq(pdi, t); 1546 log_generic(pci, t, "G"); 1547 break; 1548 case __BLK_TA_SLEEPRQ: 1549 log_generic(pci, t, "S"); 1550 break; 1551 case __BLK_TA_REQUEUE: 1552 /* 1553 * can happen if we miss traces, don't let it go 1554 * below zero 1555 */ 1556 if (pdi->cur_depth[w]) 1557 pdi->cur_depth[w]--; 1558 account_requeue(t, pci, w); 1559 log_queue(pci, t, "R"); 1560 break; 1561 case __BLK_TA_ISSUE: 1562 account_issue(t, pci, w); 1563 pdi->cur_depth[w]++; 1564 if (pdi->cur_depth[w] > pdi->max_depth[w]) 1565 pdi->max_depth[w] = pdi->cur_depth[w]; 1566 log_issue(pdi, pci, t, "D"); 1567 break; 1568 case __BLK_TA_COMPLETE: 1569 if (pdi->cur_depth[w]) 1570 pdi->cur_depth[w]--; 1571 account_c(t, pci, w, t->bytes); 1572 log_complete(pdi, pci, t, "C"); 1573 break; 1574 case __BLK_TA_PLUG: 1575 log_action(pci, t, "P"); 1576 break; 1577 case __BLK_TA_UNPLUG_IO: 1578 account_unplug(t, pci, 0); 1579 log_unplug(pci, t, "U"); 1580 break; 1581 case __BLK_TA_UNPLUG_TIMER: 1582 account_unplug(t, pci, 1); 1583 log_unplug(pci, t, "UT"); 1584 break; 1585 case __BLK_TA_SPLIT: 1586 log_split(pci, t, "X"); 1587 break; 1588 case __BLK_TA_BOUNCE: 1589 log_generic(pci, t, "B"); 1590 break; 1591 case __BLK_TA_REMAP: 1592 log_generic(pci, t, "A"); 1593 break; 1594 case __BLK_TA_DRV_DATA: 1595 have_drv_data = 1; 1596 /* dump to binary file only */ 1597 break; 1598 default: 1599 fprintf(stderr, "Bad fs action %x\n", t->action); 1600 break; 1601 } 1602} 1603 1604static void dump_trace(struct blk_io_trace *t, struct per_cpu_info *pci, 1605 struct per_dev_info *pdi) 1606{ 1607 if (text_output) { 1608 if (t->action == BLK_TN_MESSAGE) 1609 handle_notify(t); 1610 else if (t->action & BLK_TC_ACT(BLK_TC_PC)) 1611 dump_trace_pc(t, pdi, pci); 1612 else 1613 dump_trace_fs(t, pdi, pci); 1614 } 1615 1616 if (!pdi->events) 1617 pdi->first_reported_time = t->time; 1618 1619 pdi->events++; 1620 1621 if (bin_output_msgs || 1622 !(t->action & BLK_TC_ACT(BLK_TC_NOTIFY) && 1623 t->action == BLK_TN_MESSAGE)) 1624 output_binary(t, sizeof(*t) + t->pdu_len); 1625} 1626 1627/* 1628 * print in a proper way, not too small and not too big. if more than 1629 * 1000,000K, turn into M and so on 1630 */ 1631static char *size_cnv(char *dst, unsigned long long num, int in_kb) 1632{ 1633 char suff[] = { '\0', 'K', 'M', 'G', 'P' }; 1634 unsigned int i = 0; 1635 1636 if (in_kb) 1637 i++; 1638 1639 while (num > 1000 * 1000ULL && (i < sizeof(suff) - 1)) { 1640 i++; 1641 num /= 1000; 1642 } 1643 1644 sprintf(dst, "%'8Lu%c", num, suff[i]); 1645 return dst; 1646} 1647 1648static void dump_io_stats(struct per_dev_info *pdi, struct io_stats *ios, 1649 char *msg) 1650{ 1651 static char x[256], y[256]; 1652 1653 fprintf(ofp, "%s\n", msg); 1654 1655 fprintf(ofp, " Reads Queued: %s, %siB\t", size_cnv(x, ios->qreads, 0), size_cnv(y, ios->qread_kb, 1)); 1656 fprintf(ofp, " Writes Queued: %s, %siB\n", size_cnv(x, ios->qwrites, 0), size_cnv(y, ios->qwrite_kb, 1)); 1657 fprintf(ofp, " Read Dispatches: %s, %siB\t", size_cnv(x, ios->ireads, 0), size_cnv(y, ios->iread_kb, 1)); 1658 fprintf(ofp, " Write Dispatches: %s, %siB\n", size_cnv(x, ios->iwrites, 0), size_cnv(y, ios->iwrite_kb, 1)); 1659 fprintf(ofp, " Reads Requeued: %s\t\t", size_cnv(x, ios->rrqueue, 0)); 1660 fprintf(ofp, " Writes Requeued: %s\n", size_cnv(x, ios->wrqueue, 0)); 1661 fprintf(ofp, " Reads Completed: %s, %siB\t", size_cnv(x, ios->creads, 0), size_cnv(y, ios->cread_kb, 1)); 1662 fprintf(ofp, " Writes Completed: %s, %siB\n", size_cnv(x, ios->cwrites, 0), size_cnv(y, ios->cwrite_kb, 1)); 1663 fprintf(ofp, " Read Merges: %s, %siB\t", size_cnv(x, ios->mreads, 0), size_cnv(y, ios->mread_kb, 1)); 1664 fprintf(ofp, " Write Merges: %s, %siB\n", size_cnv(x, ios->mwrites, 0), size_cnv(y, ios->mwrite_kb, 1)); 1665 if (pdi) { 1666 fprintf(ofp, " Read depth: %'8u%8c\t", pdi->max_depth[0], ' '); 1667 fprintf(ofp, " Write depth: %'8u\n", pdi->max_depth[1]); 1668 } 1669 if (ios->qreads_pc || ios->qwrites_pc || ios->ireads_pc || ios->iwrites_pc || 1670 ios->rrqueue_pc || ios->wrqueue_pc || ios->creads_pc || ios->cwrites_pc) { 1671 fprintf(ofp, " PC Reads Queued: %s, %siB\t", size_cnv(x, ios->qreads_pc, 0), size_cnv(y, ios->qread_kb_pc, 1)); 1672 fprintf(ofp, " PC Writes Queued: %s, %siB\n", size_cnv(x, ios->qwrites_pc, 0), size_cnv(y, ios->qwrite_kb_pc, 1)); 1673 fprintf(ofp, " PC Read Disp.: %s, %siB\t", size_cnv(x, ios->ireads_pc, 0), size_cnv(y, ios->iread_kb_pc, 1)); 1674 fprintf(ofp, " PC Write Disp.: %s, %siB\n", size_cnv(x, ios->iwrites_pc, 0), size_cnv(y, ios->iwrite_kb_pc, 1)); 1675 fprintf(ofp, " PC Reads Req.: %s\t\t", size_cnv(x, ios->rrqueue_pc, 0)); 1676 fprintf(ofp, " PC Writes Req.: %s\n", size_cnv(x, ios->wrqueue_pc, 0)); 1677 fprintf(ofp, " PC Reads Compl.: %s\t\t", size_cnv(x, ios->creads_pc, 0)); 1678 fprintf(ofp, " PC Writes Compl.: %s\n", size_cnv(x, ios->cwrites, 0)); 1679 } 1680 fprintf(ofp, " IO unplugs: %'8lu%8c\t", ios->io_unplugs, ' '); 1681 fprintf(ofp, " Timer unplugs: %'8lu\n", ios->timer_unplugs); 1682} 1683 1684static void dump_wait_stats(struct per_process_info *ppi) 1685{ 1686 unsigned long rawait = ppi->longest_allocation_wait[0] / 1000; 1687 unsigned long rdwait = ppi->longest_dispatch_wait[0] / 1000; 1688 unsigned long rcwait = ppi->longest_completion_wait[0] / 1000; 1689 unsigned long wawait = ppi->longest_allocation_wait[1] / 1000; 1690 unsigned long wdwait = ppi->longest_dispatch_wait[1] / 1000; 1691 unsigned long wcwait = ppi->longest_completion_wait[1] / 1000; 1692 1693 fprintf(ofp, " Allocation wait: %'8lu%8c\t", rawait, ' '); 1694 fprintf(ofp, " Allocation wait: %'8lu\n", wawait); 1695 fprintf(ofp, " Dispatch wait: %'8lu%8c\t", rdwait, ' '); 1696 fprintf(ofp, " Dispatch wait: %'8lu\n", wdwait); 1697 fprintf(ofp, " Completion wait: %'8lu%8c\t", rcwait, ' '); 1698 fprintf(ofp, " Completion wait: %'8lu\n", wcwait); 1699} 1700 1701static int ppi_name_compare(const void *p1, const void *p2) 1702{ 1703 struct per_process_info *ppi1 = *((struct per_process_info **) p1); 1704 struct per_process_info *ppi2 = *((struct per_process_info **) p2); 1705 int res; 1706 1707 res = strverscmp(ppi1->ppm->comm, ppi2->ppm->comm); 1708 if (!res) 1709 res = ppi1->ppm->pid > ppi2->ppm->pid; 1710 1711 return res; 1712} 1713 1714static void sort_process_list(void) 1715{ 1716 struct per_process_info **ppis; 1717 struct per_process_info *ppi; 1718 int i = 0; 1719 1720 ppis = malloc(ppi_list_entries * sizeof(struct per_process_info *)); 1721 1722 ppi = ppi_list; 1723 while (ppi) { 1724 ppis[i++] = ppi; 1725 ppi = ppi->list_next; 1726 } 1727 1728 qsort(ppis, ppi_list_entries, sizeof(ppi), ppi_name_compare); 1729 1730 i = ppi_list_entries - 1; 1731 ppi_list = NULL; 1732 while (i >= 0) { 1733 ppi = ppis[i]; 1734 1735 ppi->list_next = ppi_list; 1736 ppi_list = ppi; 1737 i--; 1738 } 1739 1740 free(ppis); 1741} 1742 1743static void show_process_stats(void) 1744{ 1745 struct per_process_info *ppi; 1746 1747 sort_process_list(); 1748 1749 ppi = ppi_list; 1750 while (ppi) { 1751 struct process_pid_map *ppm = ppi->ppm; 1752 char name[64]; 1753 1754 if (ppi->more_than_one) 1755 sprintf(name, "%s (%u, ...)", ppm->comm, ppm->pid); 1756 else 1757 sprintf(name, "%s (%u)", ppm->comm, ppm->pid); 1758 1759 dump_io_stats(NULL, &ppi->io_stats, name); 1760 dump_wait_stats(ppi); 1761 ppi = ppi->list_next; 1762 } 1763 1764 fprintf(ofp, "\n"); 1765} 1766 1767static void show_device_and_cpu_stats(void) 1768{ 1769 struct per_dev_info *pdi; 1770 struct per_cpu_info *pci; 1771 struct io_stats total, *ios; 1772 unsigned long long rrate, wrate, msec; 1773 int i, j, pci_events; 1774 char line[3 + 8/*cpu*/ + 2 + 32/*dev*/ + 3]; 1775 char name[32]; 1776 double ratio; 1777 1778 for (pdi = devices, i = 0; i < ndevices; i++, pdi++) { 1779 1780 memset(&total, 0, sizeof(total)); 1781 pci_events = 0; 1782 1783 if (i > 0) 1784 fprintf(ofp, "\n"); 1785 1786 for (pci = pdi->cpus, j = 0; j < pdi->ncpus; j++, pci++) { 1787 if (!pci->nelems) 1788 continue; 1789 1790 ios = &pci->io_stats; 1791 total.qreads += ios->qreads; 1792 total.qwrites += ios->qwrites; 1793 total.creads += ios->creads; 1794 total.cwrites += ios->cwrites; 1795 total.mreads += ios->mreads; 1796 total.mwrites += ios->mwrites; 1797 total.ireads += ios->ireads; 1798 total.iwrites += ios->iwrites; 1799 total.rrqueue += ios->rrqueue; 1800 total.wrqueue += ios->wrqueue; 1801 total.qread_kb += ios->qread_kb; 1802 total.qwrite_kb += ios->qwrite_kb; 1803 total.cread_kb += ios->cread_kb; 1804 total.cwrite_kb += ios->cwrite_kb; 1805 total.iread_kb += ios->iread_kb; 1806 total.iwrite_kb += ios->iwrite_kb; 1807 total.mread_kb += ios->mread_kb; 1808 total.mwrite_kb += ios->mwrite_kb; 1809 1810 total.qreads_pc += ios->qreads_pc; 1811 total.qwrites_pc += ios->qwrites_pc; 1812 total.creads_pc += ios->creads_pc; 1813 total.cwrites_pc += ios->cwrites_pc; 1814 total.ireads_pc += ios->ireads_pc; 1815 total.iwrites_pc += ios->iwrites_pc; 1816 total.rrqueue_pc += ios->rrqueue_pc; 1817 total.wrqueue_pc += ios->wrqueue_pc; 1818 total.qread_kb_pc += ios->qread_kb_pc; 1819 total.qwrite_kb_pc += ios->qwrite_kb_pc; 1820 total.iread_kb_pc += ios->iread_kb_pc; 1821 total.iwrite_kb_pc += ios->iwrite_kb_pc; 1822 1823 total.timer_unplugs += ios->timer_unplugs; 1824 total.io_unplugs += ios->io_unplugs; 1825 1826 snprintf(line, sizeof(line) - 1, "CPU%d (%s):", 1827 j, get_dev_name(pdi, name, sizeof(name))); 1828 dump_io_stats(pdi, ios, line); 1829 pci_events++; 1830 } 1831 1832 if (pci_events > 1) { 1833 fprintf(ofp, "\n"); 1834 snprintf(line, sizeof(line) - 1, "Total (%s):", 1835 get_dev_name(pdi, name, sizeof(name))); 1836 dump_io_stats(NULL, &total, line); 1837 } 1838 1839 wrate = rrate = 0; 1840 msec = (pdi->last_reported_time - pdi->first_reported_time) / 1000000; 1841 if (msec) { 1842 rrate = 1000 * total.cread_kb / msec; 1843 wrate = 1000 * total.cwrite_kb / msec; 1844 } 1845 1846 fprintf(ofp, "\nThroughput (R/W): %'LuKiB/s / %'LuKiB/s\n", 1847 rrate, wrate); 1848 fprintf(ofp, "Events (%s): %'Lu entries\n", 1849 get_dev_name(pdi, line, sizeof(line)), pdi->events); 1850 1851 collect_pdi_skips(pdi); 1852 if (!pdi->skips && !pdi->events) 1853 ratio = 0.0; 1854 else 1855 ratio = 100.0 * ((double)pdi->seq_skips / 1856 (double)(pdi->events + pdi->seq_skips)); 1857 fprintf(ofp, "Skips: %'lu forward (%'llu - %5.1lf%%)\n", 1858 pdi->skips, pdi->seq_skips, ratio); 1859 } 1860} 1861 1862static void find_genesis(void) 1863{ 1864 struct trace *t = trace_list; 1865 1866 genesis_time = -1ULL; 1867 while (t != NULL) { 1868 if (t->bit->time < genesis_time) 1869 genesis_time = t->bit->time; 1870 1871 t = t->next; 1872 } 1873 1874 /* The time stamp record will usually be the first 1875 * record in the trace, but not always. 1876 */ 1877 if (start_timestamp 1878 && start_timestamp != genesis_time) { 1879 long delta = genesis_time - start_timestamp; 1880 1881 abs_start_time.tv_sec += SECONDS(delta); 1882 abs_start_time.tv_nsec += NANO_SECONDS(delta); 1883 if (abs_start_time.tv_nsec < 0) { 1884 abs_start_time.tv_nsec += 1000000000; 1885 abs_start_time.tv_sec -= 1; 1886 } else 1887 if (abs_start_time.tv_nsec > 1000000000) { 1888 abs_start_time.tv_nsec -= 1000000000; 1889 abs_start_time.tv_sec += 1; 1890 } 1891 } 1892} 1893 1894static inline int check_stopwatch(struct blk_io_trace *bit) 1895{ 1896 if (bit->time < stopwatch_end && 1897 bit->time >= stopwatch_start) 1898 return 0; 1899 1900 return 1; 1901} 1902 1903/* 1904 * return youngest entry read 1905 */ 1906static int sort_entries(unsigned long long *youngest) 1907{ 1908 struct per_dev_info *pdi = NULL; 1909 struct per_cpu_info *pci = NULL; 1910 struct trace *t; 1911 1912 if (!genesis_time) 1913 find_genesis(); 1914 1915 *youngest = 0; 1916 while ((t = trace_list) != NULL) { 1917 struct blk_io_trace *bit = t->bit; 1918 1919 trace_list = t->next; 1920 1921 bit->time -= genesis_time; 1922 1923 if (bit->time < *youngest || !*youngest) 1924 *youngest = bit->time; 1925 1926 if (!pdi || pdi->dev != bit->device) { 1927 pdi = get_dev_info(bit->device); 1928 pci = NULL; 1929 } 1930 1931 if (!pci || pci->cpu != bit->cpu) 1932 pci = get_cpu_info(pdi, bit->cpu); 1933 1934 if (bit->sequence < pci->smallest_seq_read) 1935 pci->smallest_seq_read = bit->sequence; 1936 1937 if (check_stopwatch(bit)) { 1938 bit_free(bit); 1939 t_free(t); 1940 continue; 1941 } 1942 1943 if (trace_rb_insert_sort(t)) 1944 return -1; 1945 } 1946 1947 return 0; 1948} 1949 1950/* 1951 * to continue, we must have traces from all online cpus in the tree 1952 */ 1953static int check_cpu_map(struct per_dev_info *pdi) 1954{ 1955 unsigned long *cpu_map; 1956 struct rb_node *n; 1957 struct trace *__t; 1958 unsigned int i; 1959 int ret, cpu; 1960 1961 /* 1962 * create a map of the cpus we have traces for 1963 */ 1964 cpu_map = malloc(pdi->cpu_map_max / sizeof(long)); 1965 n = rb_first(&rb_sort_root); 1966 while (n) { 1967 __t = rb_entry(n, struct trace, rb_node); 1968 cpu = __t->bit->cpu; 1969 1970 cpu_map[CPU_IDX(cpu)] |= (1UL << CPU_BIT(cpu)); 1971 n = rb_next(n); 1972 } 1973 1974 /* 1975 * we can't continue if pdi->cpu_map has entries set that we don't 1976 * have in the sort rbtree. the opposite is not a problem, though 1977 */ 1978 ret = 0; 1979 for (i = 0; i < pdi->cpu_map_max / CPUS_PER_LONG; i++) { 1980 if (pdi->cpu_map[i] & ~(cpu_map[i])) { 1981 ret = 1; 1982 break; 1983 } 1984 } 1985 1986 free(cpu_map); 1987 return ret; 1988} 1989 1990static int check_sequence(struct per_dev_info *pdi, struct trace *t, int force) 1991{ 1992 struct blk_io_trace *bit = t->bit; 1993 unsigned long expected_sequence; 1994 struct per_cpu_info *pci; 1995 struct trace *__t; 1996 1997 pci = get_cpu_info(pdi, bit->cpu); 1998 expected_sequence = pci->last_sequence + 1; 1999 2000 if (!expected_sequence) { 2001 /* 2002 * 1 should be the first entry, just allow it 2003 */ 2004 if (bit->sequence == 1) 2005 return 0; 2006 if (bit->sequence == pci->smallest_seq_read) 2007 return 0; 2008 2009 return check_cpu_map(pdi); 2010 } 2011 2012 if (bit->sequence == expected_sequence) 2013 return 0; 2014 2015 /* 2016 * we may not have seen that sequence yet. if we are not doing 2017 * the final run, break and wait for more entries. 2018 */ 2019 if (expected_sequence < pci->smallest_seq_read) { 2020 __t = trace_rb_find_last(pdi, pci, expected_sequence); 2021 if (!__t) 2022 goto skip; 2023 2024 __put_trace_last(pdi, __t); 2025 return 0; 2026 } else if (!force) { 2027 return 1; 2028 } else { 2029skip: 2030 if (check_current_skips(pci, bit->sequence)) 2031 return 0; 2032 2033 if (expected_sequence < bit->sequence) 2034 insert_skip(pci, expected_sequence, bit->sequence - 1); 2035 return 0; 2036 } 2037} 2038 2039static void show_entries_rb(int force) 2040{ 2041 struct per_dev_info *pdi = NULL; 2042 struct per_cpu_info *pci = NULL; 2043 struct blk_io_trace *bit; 2044 struct rb_node *n; 2045 struct trace *t; 2046 2047 while ((n = rb_first(&rb_sort_root)) != NULL) { 2048 if (is_done() && !force && !pipeline) 2049 break; 2050 2051 t = rb_entry(n, struct trace, rb_node); 2052 bit = t->bit; 2053 2054 if (read_sequence - t->read_sequence < 1 && !force) 2055 break; 2056 2057 if (!pdi || pdi->dev != bit->device) { 2058 pdi = get_dev_info(bit->device); 2059 pci = NULL; 2060 } 2061 2062 if (!pdi) { 2063 fprintf(stderr, "Unknown device ID? (%d,%d)\n", 2064 MAJOR(bit->device), MINOR(bit->device)); 2065 break; 2066 } 2067 2068 if (check_sequence(pdi, t, force)) 2069 break; 2070 2071 if (!force && bit->time > last_allowed_time) 2072 break; 2073 2074 check_time(pdi, bit); 2075 2076 if (!pci || pci->cpu != bit->cpu) 2077 pci = get_cpu_info(pdi, bit->cpu); 2078 2079 pci->last_sequence = bit->sequence; 2080 2081 pci->nelems++; 2082 2083 if (bit->action & (act_mask << BLK_TC_SHIFT)) 2084 dump_trace(bit, pci, pdi); 2085 2086 put_trace(pdi, t); 2087 } 2088} 2089 2090static int read_data(int fd, void *buffer, int bytes, int block, int *fdblock) 2091{ 2092 int ret, bytes_left, fl; 2093 void *p; 2094 2095 if (block != *fdblock) { 2096 fl = fcntl(fd, F_GETFL); 2097 2098 if (!block) { 2099 *fdblock = 0; 2100 fcntl(fd, F_SETFL, fl | O_NONBLOCK); 2101 } else { 2102 *fdblock = 1; 2103 fcntl(fd, F_SETFL, fl & ~O_NONBLOCK); 2104 } 2105 } 2106 2107 bytes_left = bytes; 2108 p = buffer; 2109 while (bytes_left > 0) { 2110 ret = read(fd, p, bytes_left); 2111 if (!ret) 2112 return 1; 2113 else if (ret < 0) { 2114 if (errno != EAGAIN) { 2115 perror("read"); 2116 return -1; 2117 } 2118 2119 /* 2120 * never do partial reads. we can return if we 2121 * didn't read anything and we should not block, 2122 * otherwise wait for data 2123 */ 2124 if ((bytes_left == bytes) && !block) 2125 return 1; 2126 2127 usleep(10); 2128 continue; 2129 } else { 2130 p += ret; 2131 bytes_left -= ret; 2132 } 2133 } 2134 2135 return 0; 2136} 2137 2138static inline __u16 get_pdulen(struct blk_io_trace *bit) 2139{ 2140 if (data_is_native) 2141 return bit->pdu_len; 2142 2143 return __bswap_16(bit->pdu_len); 2144} 2145 2146static inline __u32 get_magic(struct blk_io_trace *bit) 2147{ 2148 if (data_is_native) 2149 return bit->magic; 2150 2151 return __bswap_32(bit->magic); 2152} 2153 2154static int read_events(int fd, int always_block, int *fdblock) 2155{ 2156 struct per_dev_info *pdi = NULL; 2157 unsigned int events = 0; 2158 2159 while (!is_done() && events < rb_batch) { 2160 struct blk_io_trace *bit; 2161 struct trace *t; 2162 int pdu_len, should_block, ret; 2163 __u32 magic; 2164 2165 bit = bit_alloc(); 2166 2167 should_block = !events || always_block; 2168 2169 ret = read_data(fd, bit, sizeof(*bit), should_block, fdblock); 2170 if (ret) { 2171 bit_free(bit); 2172 if (!events && ret < 0) 2173 events = ret; 2174 break; 2175 } 2176 2177 /* 2178 * look at first trace to check whether we need to convert 2179 * data in the future 2180 */ 2181 if (data_is_native == -1 && check_data_endianness(bit->magic)) 2182 break; 2183 2184 magic = get_magic(bit); 2185 if ((magic & 0xffffff00) != BLK_IO_TRACE_MAGIC) { 2186 fprintf(stderr, "Bad magic %x\n", magic); 2187 break; 2188 } 2189 2190 pdu_len = get_pdulen(bit); 2191 if (pdu_len) { 2192 void *ptr = realloc(bit, sizeof(*bit) + pdu_len); 2193 2194 if (read_data(fd, ptr + sizeof(*bit), pdu_len, 1, fdblock)) { 2195 bit_free(ptr); 2196 break; 2197 } 2198 2199 bit = ptr; 2200 } 2201 2202 trace_to_cpu(bit); 2203 2204 if (verify_trace(bit)) { 2205 bit_free(bit); 2206 continue; 2207 } 2208 2209 /* 2210 * not a real trace, so grab and handle it here 2211 */ 2212 if (bit->action & BLK_TC_ACT(BLK_TC_NOTIFY) && bit->action != BLK_TN_MESSAGE) { 2213 handle_notify(bit); 2214 output_binary(bit, sizeof(*bit) + bit->pdu_len); 2215 continue; 2216 } 2217 2218 t = t_alloc(); 2219 memset(t, 0, sizeof(*t)); 2220 t->bit = bit; 2221 t->read_sequence = read_sequence; 2222 2223 t->next = trace_list; 2224 trace_list = t; 2225 2226 if (!pdi || pdi->dev != bit->device) 2227 pdi = get_dev_info(bit->device); 2228 2229 if (bit->time > pdi->last_read_time) 2230 pdi->last_read_time = bit->time; 2231 2232 events++; 2233 } 2234 2235 return events; 2236} 2237 2238/* 2239 * Managing input streams 2240 */ 2241 2242struct ms_stream { 2243 struct ms_stream *next; 2244 struct trace *first, *last; 2245 struct per_dev_info *pdi; 2246 unsigned int cpu; 2247}; 2248 2249#define MS_HASH(d, c) ((MAJOR(d) & 0xff) ^ (MINOR(d) & 0xff) ^ (cpu & 0xff)) 2250 2251struct ms_stream *ms_head; 2252struct ms_stream *ms_hash[256]; 2253 2254static void ms_sort(struct ms_stream *msp); 2255static int ms_prime(struct ms_stream *msp); 2256 2257static inline struct trace *ms_peek(struct ms_stream *msp) 2258{ 2259 return (msp == NULL) ? NULL : msp->first; 2260} 2261 2262static inline __u64 ms_peek_time(struct ms_stream *msp) 2263{ 2264 return ms_peek(msp)->bit->time; 2265} 2266 2267static inline void ms_resort(struct ms_stream *msp) 2268{ 2269 if (msp->next && ms_peek_time(msp) > ms_peek_time(msp->next)) { 2270 ms_head = msp->next; 2271 msp->next = NULL; 2272 ms_sort(msp); 2273 } 2274} 2275 2276static inline void ms_deq(struct ms_stream *msp) 2277{ 2278 msp->first = msp->first->next; 2279 if (!msp->first) { 2280 msp->last = NULL; 2281 if (!ms_prime(msp)) { 2282 ms_head = msp->next; 2283 msp->next = NULL; 2284 return; 2285 } 2286 } 2287 2288 ms_resort(msp); 2289} 2290 2291static void ms_sort(struct ms_stream *msp) 2292{ 2293 __u64 msp_t = ms_peek_time(msp); 2294 struct ms_stream *this_msp = ms_head; 2295 2296 if (this_msp == NULL) 2297 ms_head = msp; 2298 else if (msp_t < ms_peek_time(this_msp)) { 2299 msp->next = this_msp; 2300 ms_head = msp; 2301 } 2302 else { 2303 while (this_msp->next && ms_peek_time(this_msp->next) < msp_t) 2304 this_msp = this_msp->next; 2305 2306 msp->next = this_msp->next; 2307 this_msp->next = msp; 2308 } 2309} 2310 2311static int ms_prime(struct ms_stream *msp) 2312{ 2313 __u32 magic; 2314 unsigned int i; 2315 struct trace *t; 2316 struct per_dev_info *pdi = msp->pdi; 2317 struct per_cpu_info *pci = get_cpu_info(pdi, msp->cpu); 2318 struct blk_io_trace *bit = NULL; 2319 int ret, pdu_len, ndone = 0; 2320 2321 for (i = 0; !is_done() && pci->fd >= 0 && i < rb_batch; i++) { 2322 bit = bit_alloc(); 2323 ret = read_data(pci->fd, bit, sizeof(*bit), 1, &pci->fdblock); 2324 if (ret) 2325 goto err; 2326 2327 if (data_is_native == -1 && check_data_endianness(bit->magic)) 2328 goto err; 2329 2330 magic = get_magic(bit); 2331 if ((magic & 0xffffff00) != BLK_IO_TRACE_MAGIC) { 2332 fprintf(stderr, "Bad magic %x\n", magic); 2333 goto err; 2334 2335 } 2336 2337 pdu_len = get_pdulen(bit); 2338 if (pdu_len) { 2339 void *ptr = realloc(bit, sizeof(*bit) + pdu_len); 2340 ret = read_data(pci->fd, ptr + sizeof(*bit), pdu_len, 2341 1, &pci->fdblock); 2342 if (ret) { 2343 free(ptr); 2344 bit = NULL; 2345 goto err; 2346 } 2347 2348 bit = ptr; 2349 } 2350 2351 trace_to_cpu(bit); 2352 if (verify_trace(bit)) 2353 goto err; 2354 2355 if (bit->action & BLK_TC_ACT(BLK_TC_NOTIFY) && bit->action != BLK_TN_MESSAGE) { 2356 handle_notify(bit); 2357 output_binary(bit, sizeof(*bit) + bit->pdu_len); 2358 bit_free(bit); 2359 2360 i -= 1; 2361 continue; 2362 } 2363 2364 if (bit->time > pdi->last_read_time) 2365 pdi->last_read_time = bit->time; 2366 2367 t = t_alloc(); 2368 memset(t, 0, sizeof(*t)); 2369 t->bit = bit; 2370 2371 if (msp->first == NULL) 2372 msp->first = msp->last = t; 2373 else { 2374 msp->last->next = t; 2375 msp->last = t; 2376 } 2377 2378 ndone++; 2379 } 2380 2381 return ndone; 2382 2383err: 2384 if (bit) bit_free(bit); 2385 2386 cpu_mark_offline(pdi, pci->cpu); 2387 close(pci->fd); 2388 pci->fd = -1; 2389 2390 return ndone; 2391} 2392 2393static struct ms_stream *ms_alloc(struct per_dev_info *pdi, int cpu) 2394{ 2395 struct ms_stream *msp = malloc(sizeof(*msp)); 2396 2397 msp->next = NULL; 2398 msp->first = msp->last = NULL; 2399 msp->pdi = pdi; 2400 msp->cpu = cpu; 2401 2402 if (ms_prime(msp)) 2403 ms_sort(msp); 2404 2405 return msp; 2406} 2407 2408static int setup_file(struct per_dev_info *pdi, int cpu) 2409{ 2410 int len = 0; 2411 struct stat st; 2412 char *p, *dname; 2413 struct per_cpu_info *pci = get_cpu_info(pdi, cpu); 2414 2415 pci->cpu = cpu; 2416 pci->fdblock = -1; 2417 2418 p = strdup(pdi->name); 2419 dname = dirname(p); 2420 if (strcmp(dname, ".")) { 2421 input_dir = dname; 2422 p = strdup(pdi->name); 2423 strcpy(pdi->name, basename(p)); 2424 } 2425 free(p); 2426 2427 if (input_dir) 2428 len = sprintf(pci->fname, "%s/", input_dir); 2429 2430 snprintf(pci->fname + len, sizeof(pci->fname)-1-len, 2431 "%s.blktrace.%d", pdi->name, pci->cpu); 2432 if (stat(pci->fname, &st) < 0) 2433 return 0; 2434 if (!st.st_size) 2435 return 1; 2436 2437 pci->fd = open(pci->fname, O_RDONLY); 2438 if (pci->fd < 0) { 2439 perror(pci->fname); 2440 return 0; 2441 } 2442 2443 printf("Input file %s added\n", pci->fname); 2444 cpu_mark_online(pdi, pci->cpu); 2445 2446 pdi->nfiles++; 2447 ms_alloc(pdi, pci->cpu); 2448 2449 return 1; 2450} 2451 2452static int handle(struct ms_stream *msp) 2453{ 2454 struct trace *t; 2455 struct per_dev_info *pdi; 2456 struct per_cpu_info *pci; 2457 struct blk_io_trace *bit; 2458 2459 t = ms_peek(msp); 2460 2461 bit = t->bit; 2462 pdi = msp->pdi; 2463 pci = get_cpu_info(pdi, msp->cpu); 2464 pci->nelems++; 2465 bit->time -= genesis_time; 2466 2467 if (t->bit->time > stopwatch_end) 2468 return 0; 2469 2470 pdi->last_reported_time = bit->time; 2471 if ((bit->action & (act_mask << BLK_TC_SHIFT))&& 2472 t->bit->time >= stopwatch_start) 2473 dump_trace(bit, pci, pdi); 2474 2475 ms_deq(msp); 2476 2477 if (text_output) 2478 trace_rb_insert_last(pdi, t); 2479 else { 2480 bit_free(t->bit); 2481 t_free(t); 2482 } 2483 2484 return 1; 2485} 2486 2487/* 2488 * Check if we need to sanitize the name. We allow 'foo', or if foo.blktrace.X 2489 * is given, then strip back down to 'foo' to avoid missing files. 2490 */ 2491static int name_fixup(char *name) 2492{ 2493 char *b; 2494 2495 if (!name) 2496 return 1; 2497 2498 b = strstr(name, ".blktrace."); 2499 if (b) 2500 *b = '\0'; 2501 2502 return 0; 2503} 2504 2505static int do_file(void) 2506{ 2507 int i, cpu, ret; 2508 struct per_dev_info *pdi; 2509 2510 /* 2511 * first prepare all files for reading 2512 */ 2513 for (i = 0; i < ndevices; i++) { 2514 pdi = &devices[i]; 2515 ret = name_fixup(pdi->name); 2516 if (ret) 2517 return ret; 2518 2519 for (cpu = 0; setup_file(pdi, cpu); cpu++) 2520 ; 2521 } 2522 2523 /* 2524 * Get the initial time stamp 2525 */ 2526 if (ms_head) 2527 genesis_time = ms_peek_time(ms_head); 2528 2529 /* 2530 * Keep processing traces while any are left 2531 */ 2532 while (!is_done() && ms_head && handle(ms_head)) 2533 ; 2534 2535 return 0; 2536} 2537 2538static void do_pipe(int fd) 2539{ 2540 unsigned long long youngest; 2541 int events, fdblock; 2542 2543 last_allowed_time = -1ULL; 2544 fdblock = -1; 2545 while ((events = read_events(fd, 0, &fdblock)) > 0) { 2546 read_sequence++; 2547 2548#if 0 2549 smallest_seq_read = -1U; 2550#endif 2551 2552 if (sort_entries(&youngest)) 2553 break; 2554 2555 if (youngest > stopwatch_end) 2556 break; 2557 2558 show_entries_rb(0); 2559 } 2560 2561 if (rb_sort_entries) 2562 show_entries_rb(1); 2563} 2564 2565static int do_fifo(void) 2566{ 2567 int fd; 2568 2569 if (!strcmp(pipename, "-")) 2570 fd = dup(STDIN_FILENO); 2571 else 2572 fd = open(pipename, O_RDONLY); 2573 2574 if (fd == -1) { 2575 perror("dup stdin"); 2576 return -1; 2577 } 2578 2579 do_pipe(fd); 2580 close(fd); 2581 return 0; 2582} 2583 2584static void show_stats(void) 2585{ 2586 if (!ofp) 2587 return; 2588 if (stats_printed) 2589 return; 2590 2591 stats_printed = 1; 2592 2593 if (per_process_stats) 2594 show_process_stats(); 2595 2596 if (per_device_and_cpu_stats) 2597 show_device_and_cpu_stats(); 2598 2599 fflush(ofp); 2600} 2601 2602static void handle_sigint(__attribute__((__unused__)) int sig) 2603{ 2604 done = 1; 2605} 2606 2607/* 2608 * Extract start and duration times from a string, allowing 2609 * us to specify a time interval of interest within a trace. 2610 * Format: "duration" (start is zero) or "start:duration". 2611 */ 2612static int find_stopwatch_interval(char *string) 2613{ 2614 double value; 2615 char *sp; 2616 2617 value = strtod(string, &sp); 2618 if (sp == string) { 2619 fprintf(stderr,"Invalid stopwatch timer: %s\n", string); 2620 return 1; 2621 } 2622 if (*sp == ':') { 2623 stopwatch_start = DOUBLE_TO_NANO_ULL(value); 2624 string = sp + 1; 2625 value = strtod(string, &sp); 2626 if (sp == string || *sp != '\0') { 2627 fprintf(stderr,"Invalid stopwatch duration time: %s\n", 2628 string); 2629 return 1; 2630 } 2631 } else if (*sp != '\0') { 2632 fprintf(stderr,"Invalid stopwatch start timer: %s\n", string); 2633 return 1; 2634 } 2635 stopwatch_end = DOUBLE_TO_NANO_ULL(value); 2636 if (stopwatch_end <= stopwatch_start) { 2637 fprintf(stderr, "Invalid stopwatch interval: %Lu -> %Lu\n", 2638 stopwatch_start, stopwatch_end); 2639 return 1; 2640 } 2641 2642 return 0; 2643} 2644 2645static int is_pipe(const char *str) 2646{ 2647 struct stat st; 2648 2649 if (!strcmp(str, "-")) 2650 return 1; 2651 if (!stat(str, &st) && S_ISFIFO(st.st_mode)) 2652 return 1; 2653 2654 return 0; 2655} 2656 2657#define S_OPTS "a:A:b:D:d:f:F:hi:o:Oqstw:vVM" 2658static char usage_str[] = "\n\n" \ 2659 "-i <file> | --input=<file>\n" \ 2660 "[ -a <action field> | --act-mask=<action field> ]\n" \ 2661 "[ -A <action mask> | --set-mask=<action mask> ]\n" \ 2662 "[ -b <traces> | --batch=<traces> ]\n" \ 2663 "[ -d <file> | --dump-binary=<file> ]\n" \ 2664 "[ -D <dir> | --input-directory=<dir> ]\n" \ 2665 "[ -f <format> | --format=<format> ]\n" \ 2666 "[ -F <spec> | --format-spec=<spec> ]\n" \ 2667 "[ -h | --hash-by-name ]\n" \ 2668 "[ -o <file> | --output=<file> ]\n" \ 2669 "[ -O | --no-text-output ]\n" \ 2670 "[ -q | --quiet ]\n" \ 2671 "[ -s | --per-program-stats ]\n" \ 2672 "[ -t | --track-ios ]\n" \ 2673 "[ -w <time> | --stopwatch=<time> ]\n" \ 2674 "[ -M | --no-msgs\n" \ 2675 "[ -v | --verbose ]\n" \ 2676 "[ -V | --version ]\n\n" \ 2677 "\t-b stdin read batching\n" \ 2678 "\t-d Output file. If specified, binary data is written to file\n" \ 2679 "\t-D Directory to prepend to input file names\n" \ 2680 "\t-f Output format. Customize the output format. The format field\n" \ 2681 "\t identifies can be found in the documentation\n" \ 2682 "\t-F Format specification. Can be found in the documentation\n" \ 2683 "\t-h Hash processes by name, not pid\n" \ 2684 "\t-i Input file containing trace data, or '-' for stdin\n" \ 2685 "\t-o Output file. If not given, output is stdout\n" \ 2686 "\t-O Do NOT output text data\n" \ 2687 "\t-q Quiet. Don't display any stats at the end of the trace\n" \ 2688 "\t-s Show per-program io statistics\n" \ 2689 "\t-t Track individual ios. Will tell you the time a request took\n" \ 2690 "\t to get queued, to get dispatched, and to get completed\n" \ 2691 "\t-w Only parse data between the given time interval in seconds.\n" \ 2692 "\t If 'start' isn't given, blkparse defaults the start time to 0\n" \ 2693 "\t-M Do not output messages to binary file\n" \ 2694 "\t-v More verbose for marginal errors\n" \ 2695 "\t-V Print program version info\n\n"; 2696 2697static void usage(char *prog) 2698{ 2699 fprintf(stderr, "Usage: %s %s %s", prog, blkparse_version, usage_str); 2700} 2701 2702int main(int argc, char *argv[]) 2703{ 2704 int i, c, ret, mode; 2705 int act_mask_tmp = 0; 2706 char *ofp_buffer = NULL; 2707 char *bin_ofp_buffer = NULL; 2708 2709 while ((c = getopt_long(argc, argv, S_OPTS, l_opts, NULL)) != -1) { 2710 switch (c) { 2711 case 'a': 2712 i = find_mask_map(optarg); 2713 if (i < 0) { 2714 fprintf(stderr,"Invalid action mask %s\n", 2715 optarg); 2716 return 1; 2717 } 2718 act_mask_tmp |= i; 2719 break; 2720 2721 case 'A': 2722 if ((sscanf(optarg, "%x", &i) != 1) || 2723 !valid_act_opt(i)) { 2724 fprintf(stderr, 2725 "Invalid set action mask %s/0x%x\n", 2726 optarg, i); 2727 return 1; 2728 } 2729 act_mask_tmp = i; 2730 break; 2731 case 'i': 2732 if (is_pipe(optarg) && !pipeline) { 2733 pipeline = 1; 2734 pipename = strdup(optarg); 2735 } else if (resize_devices(optarg) != 0) 2736 return 1; 2737 break; 2738 case 'D': 2739 input_dir = optarg; 2740 break; 2741 case 'o': 2742 output_name = optarg; 2743 break; 2744 case 'O': 2745 text_output = 0; 2746 break; 2747 case 'b': 2748 rb_batch = atoi(optarg); 2749 if (rb_batch <= 0) 2750 rb_batch = RB_BATCH_DEFAULT; 2751 break; 2752 case 's': 2753 per_process_stats = 1; 2754 break; 2755 case 't': 2756 track_ios = 1; 2757 break; 2758 case 'q': 2759 per_device_and_cpu_stats = 0; 2760 break; 2761 case 'w': 2762 if (find_stopwatch_interval(optarg) != 0) 2763 return 1; 2764 break; 2765 case 'f': 2766 set_all_format_specs(optarg); 2767 break; 2768 case 'F': 2769 if (add_format_spec(optarg) != 0) 2770 return 1; 2771 break; 2772 case 'h': 2773 ppi_hash_by_pid = 0; 2774 break; 2775 case 'v': 2776 verbose++; 2777 break; 2778 case 'V': 2779 printf("%s version %s\n", argv[0], blkparse_version); 2780 return 0; 2781 case 'd': 2782 dump_binary = optarg; 2783 break; 2784 case 'M': 2785 bin_output_msgs = 0; 2786 break; 2787 default: 2788 usage(argv[0]); 2789 return 1; 2790 } 2791 } 2792 2793 while (optind < argc) { 2794 if (is_pipe(argv[optind]) && !pipeline) { 2795 pipeline = 1; 2796 pipename = strdup(argv[optind]); 2797 } else if (resize_devices(argv[optind]) != 0) 2798 return 1; 2799 optind++; 2800 } 2801 2802 if (!pipeline && !ndevices) { 2803 usage(argv[0]); 2804 return 1; 2805 } 2806 2807 if (act_mask_tmp != 0) 2808 act_mask = act_mask_tmp; 2809 2810 memset(&rb_sort_root, 0, sizeof(rb_sort_root)); 2811 2812 signal(SIGINT, handle_sigint); 2813 signal(SIGHUP, handle_sigint); 2814 signal(SIGTERM, handle_sigint); 2815 2816 setlocale(LC_NUMERIC, "en_US"); 2817 2818 if (text_output) { 2819 if (!output_name) { 2820 ofp = fdopen(STDOUT_FILENO, "w"); 2821 mode = _IOLBF; 2822 } else { 2823 char ofname[128]; 2824 2825 snprintf(ofname, sizeof(ofname) - 1, "%s", output_name); 2826 ofp = fopen(ofname, "w"); 2827 mode = _IOFBF; 2828 } 2829 2830 if (!ofp) { 2831 perror("fopen"); 2832 return 1; 2833 } 2834 2835 ofp_buffer = malloc(4096); 2836 if (setvbuf(ofp, ofp_buffer, mode, 4096)) { 2837 perror("setvbuf"); 2838 return 1; 2839 } 2840 } 2841 2842 if (dump_binary) { 2843 dump_fp = fopen(dump_binary, "w"); 2844 if (!dump_fp) { 2845 perror(dump_binary); 2846 dump_binary = NULL; 2847 return 1; 2848 } 2849 bin_ofp_buffer = malloc(128 * 1024); 2850 if (setvbuf(dump_fp, bin_ofp_buffer, _IOFBF, 128 * 1024)) { 2851 perror("setvbuf binary"); 2852 return 1; 2853 } 2854 } 2855 2856 if (pipeline) 2857 ret = do_fifo(); 2858 else 2859 ret = do_file(); 2860 2861 if (!ret) 2862 show_stats(); 2863 2864 if (have_drv_data && !dump_binary) 2865 printf("\ndiscarded traces containing low-level device driver " 2866 "specific data (only available in binary output)\n"); 2867 2868 if (ofp_buffer) { 2869 fflush(ofp); 2870 free(ofp_buffer); 2871 } 2872 if (bin_ofp_buffer) { 2873 fflush(dump_fp); 2874 free(bin_ofp_buffer); 2875 } 2876 return ret; 2877} 2878