taskstats.c revision 60063497a95e716c9a689af3be2687d261f115b4
1/* 2 * taskstats.c - Export per-task statistics to userland 3 * 4 * Copyright (C) Shailabh Nagar, IBM Corp. 2006 5 * (C) Balbir Singh, IBM Corp. 2006 6 * 7 * This program is free software; you can redistribute it and/or modify 8 * it under the terms of the GNU General Public License as published by 9 * the Free Software Foundation; either version 2 of the License, or 10 * (at your option) any later version. 11 * 12 * This program is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 * GNU General Public License for more details. 16 * 17 */ 18 19#include <linux/kernel.h> 20#include <linux/taskstats_kern.h> 21#include <linux/tsacct_kern.h> 22#include <linux/delayacct.h> 23#include <linux/cpumask.h> 24#include <linux/percpu.h> 25#include <linux/slab.h> 26#include <linux/cgroupstats.h> 27#include <linux/cgroup.h> 28#include <linux/fs.h> 29#include <linux/file.h> 30#include <net/genetlink.h> 31#include <linux/atomic.h> 32 33/* 34 * Maximum length of a cpumask that can be specified in 35 * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute 36 */ 37#define TASKSTATS_CPUMASK_MAXLEN (100+6*NR_CPUS) 38 39static DEFINE_PER_CPU(__u32, taskstats_seqnum); 40static int family_registered; 41struct kmem_cache *taskstats_cache; 42 43static struct genl_family family = { 44 .id = GENL_ID_GENERATE, 45 .name = TASKSTATS_GENL_NAME, 46 .version = TASKSTATS_GENL_VERSION, 47 .maxattr = TASKSTATS_CMD_ATTR_MAX, 48}; 49 50static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = { 51 [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, 52 [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, 53 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, 54 [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; 55 56static const struct nla_policy cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] = { 57 [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 }, 58}; 59 60struct listener { 61 struct list_head list; 62 pid_t pid; 63 char valid; 64}; 65 66struct listener_list { 67 struct rw_semaphore sem; 68 struct list_head list; 69}; 70static DEFINE_PER_CPU(struct listener_list, listener_array); 71 72enum actions { 73 REGISTER, 74 DEREGISTER, 75 CPU_DONT_CARE 76}; 77 78static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, 79 size_t size) 80{ 81 struct sk_buff *skb; 82 void *reply; 83 84 /* 85 * If new attributes are added, please revisit this allocation 86 */ 87 skb = genlmsg_new(size, GFP_KERNEL); 88 if (!skb) 89 return -ENOMEM; 90 91 if (!info) { 92 int seq = this_cpu_inc_return(taskstats_seqnum) - 1; 93 94 reply = genlmsg_put(skb, 0, seq, &family, 0, cmd); 95 } else 96 reply = genlmsg_put_reply(skb, info, &family, 0, cmd); 97 if (reply == NULL) { 98 nlmsg_free(skb); 99 return -EINVAL; 100 } 101 102 *skbp = skb; 103 return 0; 104} 105 106/* 107 * Send taskstats data in @skb to listener with nl_pid @pid 108 */ 109static int send_reply(struct sk_buff *skb, struct genl_info *info) 110{ 111 struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb)); 112 void *reply = genlmsg_data(genlhdr); 113 int rc; 114 115 rc = genlmsg_end(skb, reply); 116 if (rc < 0) { 117 nlmsg_free(skb); 118 return rc; 119 } 120 121 return genlmsg_reply(skb, info); 122} 123 124/* 125 * Send taskstats data in @skb to listeners registered for @cpu's exit data 126 */ 127static void send_cpu_listeners(struct sk_buff *skb, 128 struct listener_list *listeners) 129{ 130 struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb)); 131 struct listener *s, *tmp; 132 struct sk_buff *skb_next, *skb_cur = skb; 133 void *reply = genlmsg_data(genlhdr); 134 int rc, delcount = 0; 135 136 rc = genlmsg_end(skb, reply); 137 if (rc < 0) { 138 nlmsg_free(skb); 139 return; 140 } 141 142 rc = 0; 143 down_read(&listeners->sem); 144 list_for_each_entry(s, &listeners->list, list) { 145 skb_next = NULL; 146 if (!list_is_last(&s->list, &listeners->list)) { 147 skb_next = skb_clone(skb_cur, GFP_KERNEL); 148 if (!skb_next) 149 break; 150 } 151 rc = genlmsg_unicast(&init_net, skb_cur, s->pid); 152 if (rc == -ECONNREFUSED) { 153 s->valid = 0; 154 delcount++; 155 } 156 skb_cur = skb_next; 157 } 158 up_read(&listeners->sem); 159 160 if (skb_cur) 161 nlmsg_free(skb_cur); 162 163 if (!delcount) 164 return; 165 166 /* Delete invalidated entries */ 167 down_write(&listeners->sem); 168 list_for_each_entry_safe(s, tmp, &listeners->list, list) { 169 if (!s->valid) { 170 list_del(&s->list); 171 kfree(s); 172 } 173 } 174 up_write(&listeners->sem); 175} 176 177static void fill_stats(struct task_struct *tsk, struct taskstats *stats) 178{ 179 memset(stats, 0, sizeof(*stats)); 180 /* 181 * Each accounting subsystem adds calls to its functions to 182 * fill in relevant parts of struct taskstsats as follows 183 * 184 * per-task-foo(stats, tsk); 185 */ 186 187 delayacct_add_tsk(stats, tsk); 188 189 /* fill in basic acct fields */ 190 stats->version = TASKSTATS_VERSION; 191 stats->nvcsw = tsk->nvcsw; 192 stats->nivcsw = tsk->nivcsw; 193 bacct_add_tsk(stats, tsk); 194 195 /* fill in extended acct fields */ 196 xacct_add_tsk(stats, tsk); 197} 198 199static int fill_stats_for_pid(pid_t pid, struct taskstats *stats) 200{ 201 struct task_struct *tsk; 202 203 rcu_read_lock(); 204 tsk = find_task_by_vpid(pid); 205 if (tsk) 206 get_task_struct(tsk); 207 rcu_read_unlock(); 208 if (!tsk) 209 return -ESRCH; 210 fill_stats(tsk, stats); 211 put_task_struct(tsk); 212 return 0; 213} 214 215static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats) 216{ 217 struct task_struct *tsk, *first; 218 unsigned long flags; 219 int rc = -ESRCH; 220 221 /* 222 * Add additional stats from live tasks except zombie thread group 223 * leaders who are already counted with the dead tasks 224 */ 225 rcu_read_lock(); 226 first = find_task_by_vpid(tgid); 227 228 if (!first || !lock_task_sighand(first, &flags)) 229 goto out; 230 231 if (first->signal->stats) 232 memcpy(stats, first->signal->stats, sizeof(*stats)); 233 else 234 memset(stats, 0, sizeof(*stats)); 235 236 tsk = first; 237 do { 238 if (tsk->exit_state) 239 continue; 240 /* 241 * Accounting subsystem can call its functions here to 242 * fill in relevant parts of struct taskstsats as follows 243 * 244 * per-task-foo(stats, tsk); 245 */ 246 delayacct_add_tsk(stats, tsk); 247 248 stats->nvcsw += tsk->nvcsw; 249 stats->nivcsw += tsk->nivcsw; 250 } while_each_thread(first, tsk); 251 252 unlock_task_sighand(first, &flags); 253 rc = 0; 254out: 255 rcu_read_unlock(); 256 257 stats->version = TASKSTATS_VERSION; 258 /* 259 * Accounting subsystems can also add calls here to modify 260 * fields of taskstats. 261 */ 262 return rc; 263} 264 265static void fill_tgid_exit(struct task_struct *tsk) 266{ 267 unsigned long flags; 268 269 spin_lock_irqsave(&tsk->sighand->siglock, flags); 270 if (!tsk->signal->stats) 271 goto ret; 272 273 /* 274 * Each accounting subsystem calls its functions here to 275 * accumalate its per-task stats for tsk, into the per-tgid structure 276 * 277 * per-task-foo(tsk->signal->stats, tsk); 278 */ 279 delayacct_add_tsk(tsk->signal->stats, tsk); 280ret: 281 spin_unlock_irqrestore(&tsk->sighand->siglock, flags); 282 return; 283} 284 285static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd) 286{ 287 struct listener_list *listeners; 288 struct listener *s, *tmp, *s2; 289 unsigned int cpu; 290 291 if (!cpumask_subset(mask, cpu_possible_mask)) 292 return -EINVAL; 293 294 s = NULL; 295 if (isadd == REGISTER) { 296 for_each_cpu(cpu, mask) { 297 if (!s) 298 s = kmalloc_node(sizeof(struct listener), 299 GFP_KERNEL, cpu_to_node(cpu)); 300 if (!s) 301 goto cleanup; 302 s->pid = pid; 303 INIT_LIST_HEAD(&s->list); 304 s->valid = 1; 305 306 listeners = &per_cpu(listener_array, cpu); 307 down_write(&listeners->sem); 308 list_for_each_entry_safe(s2, tmp, &listeners->list, list) { 309 if (s2->pid == pid) 310 goto next_cpu; 311 } 312 list_add(&s->list, &listeners->list); 313 s = NULL; 314next_cpu: 315 up_write(&listeners->sem); 316 } 317 kfree(s); 318 return 0; 319 } 320 321 /* Deregister or cleanup */ 322cleanup: 323 for_each_cpu(cpu, mask) { 324 listeners = &per_cpu(listener_array, cpu); 325 down_write(&listeners->sem); 326 list_for_each_entry_safe(s, tmp, &listeners->list, list) { 327 if (s->pid == pid) { 328 list_del(&s->list); 329 kfree(s); 330 break; 331 } 332 } 333 up_write(&listeners->sem); 334 } 335 return 0; 336} 337 338static int parse(struct nlattr *na, struct cpumask *mask) 339{ 340 char *data; 341 int len; 342 int ret; 343 344 if (na == NULL) 345 return 1; 346 len = nla_len(na); 347 if (len > TASKSTATS_CPUMASK_MAXLEN) 348 return -E2BIG; 349 if (len < 1) 350 return -EINVAL; 351 data = kmalloc(len, GFP_KERNEL); 352 if (!data) 353 return -ENOMEM; 354 nla_strlcpy(data, na, len); 355 ret = cpulist_parse(data, mask); 356 kfree(data); 357 return ret; 358} 359 360#if defined(CONFIG_64BIT) && !defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) 361#define TASKSTATS_NEEDS_PADDING 1 362#endif 363 364static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) 365{ 366 struct nlattr *na, *ret; 367 int aggr; 368 369 aggr = (type == TASKSTATS_TYPE_PID) 370 ? TASKSTATS_TYPE_AGGR_PID 371 : TASKSTATS_TYPE_AGGR_TGID; 372 373 /* 374 * The taskstats structure is internally aligned on 8 byte 375 * boundaries but the layout of the aggregrate reply, with 376 * two NLA headers and the pid (each 4 bytes), actually 377 * force the entire structure to be unaligned. This causes 378 * the kernel to issue unaligned access warnings on some 379 * architectures like ia64. Unfortunately, some software out there 380 * doesn't properly unroll the NLA packet and assumes that the start 381 * of the taskstats structure will always be 20 bytes from the start 382 * of the netlink payload. Aligning the start of the taskstats 383 * structure breaks this software, which we don't want. So, for now 384 * the alignment only happens on architectures that require it 385 * and those users will have to update to fixed versions of those 386 * packages. Space is reserved in the packet only when needed. 387 * This ifdef should be removed in several years e.g. 2012 once 388 * we can be confident that fixed versions are installed on most 389 * systems. We add the padding before the aggregate since the 390 * aggregate is already a defined type. 391 */ 392#ifdef TASKSTATS_NEEDS_PADDING 393 if (nla_put(skb, TASKSTATS_TYPE_NULL, 0, NULL) < 0) 394 goto err; 395#endif 396 na = nla_nest_start(skb, aggr); 397 if (!na) 398 goto err; 399 400 if (nla_put(skb, type, sizeof(pid), &pid) < 0) 401 goto err; 402 ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats)); 403 if (!ret) 404 goto err; 405 nla_nest_end(skb, na); 406 407 return nla_data(ret); 408err: 409 return NULL; 410} 411 412static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) 413{ 414 int rc = 0; 415 struct sk_buff *rep_skb; 416 struct cgroupstats *stats; 417 struct nlattr *na; 418 size_t size; 419 u32 fd; 420 struct file *file; 421 int fput_needed; 422 423 na = info->attrs[CGROUPSTATS_CMD_ATTR_FD]; 424 if (!na) 425 return -EINVAL; 426 427 fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]); 428 file = fget_light(fd, &fput_needed); 429 if (!file) 430 return 0; 431 432 size = nla_total_size(sizeof(struct cgroupstats)); 433 434 rc = prepare_reply(info, CGROUPSTATS_CMD_NEW, &rep_skb, 435 size); 436 if (rc < 0) 437 goto err; 438 439 na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS, 440 sizeof(struct cgroupstats)); 441 stats = nla_data(na); 442 memset(stats, 0, sizeof(*stats)); 443 444 rc = cgroupstats_build(stats, file->f_dentry); 445 if (rc < 0) { 446 nlmsg_free(rep_skb); 447 goto err; 448 } 449 450 rc = send_reply(rep_skb, info); 451 452err: 453 fput_light(file, fput_needed); 454 return rc; 455} 456 457static int cmd_attr_register_cpumask(struct genl_info *info) 458{ 459 cpumask_var_t mask; 460 int rc; 461 462 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 463 return -ENOMEM; 464 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask); 465 if (rc < 0) 466 goto out; 467 rc = add_del_listener(info->snd_pid, mask, REGISTER); 468out: 469 free_cpumask_var(mask); 470 return rc; 471} 472 473static int cmd_attr_deregister_cpumask(struct genl_info *info) 474{ 475 cpumask_var_t mask; 476 int rc; 477 478 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 479 return -ENOMEM; 480 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask); 481 if (rc < 0) 482 goto out; 483 rc = add_del_listener(info->snd_pid, mask, DEREGISTER); 484out: 485 free_cpumask_var(mask); 486 return rc; 487} 488 489static size_t taskstats_packet_size(void) 490{ 491 size_t size; 492 493 size = nla_total_size(sizeof(u32)) + 494 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); 495#ifdef TASKSTATS_NEEDS_PADDING 496 size += nla_total_size(0); /* Padding for alignment */ 497#endif 498 return size; 499} 500 501static int cmd_attr_pid(struct genl_info *info) 502{ 503 struct taskstats *stats; 504 struct sk_buff *rep_skb; 505 size_t size; 506 u32 pid; 507 int rc; 508 509 size = taskstats_packet_size(); 510 511 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); 512 if (rc < 0) 513 return rc; 514 515 rc = -EINVAL; 516 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); 517 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid); 518 if (!stats) 519 goto err; 520 521 rc = fill_stats_for_pid(pid, stats); 522 if (rc < 0) 523 goto err; 524 return send_reply(rep_skb, info); 525err: 526 nlmsg_free(rep_skb); 527 return rc; 528} 529 530static int cmd_attr_tgid(struct genl_info *info) 531{ 532 struct taskstats *stats; 533 struct sk_buff *rep_skb; 534 size_t size; 535 u32 tgid; 536 int rc; 537 538 size = taskstats_packet_size(); 539 540 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); 541 if (rc < 0) 542 return rc; 543 544 rc = -EINVAL; 545 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); 546 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid); 547 if (!stats) 548 goto err; 549 550 rc = fill_stats_for_tgid(tgid, stats); 551 if (rc < 0) 552 goto err; 553 return send_reply(rep_skb, info); 554err: 555 nlmsg_free(rep_skb); 556 return rc; 557} 558 559static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) 560{ 561 if (info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK]) 562 return cmd_attr_register_cpumask(info); 563 else if (info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK]) 564 return cmd_attr_deregister_cpumask(info); 565 else if (info->attrs[TASKSTATS_CMD_ATTR_PID]) 566 return cmd_attr_pid(info); 567 else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) 568 return cmd_attr_tgid(info); 569 else 570 return -EINVAL; 571} 572 573static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk) 574{ 575 struct signal_struct *sig = tsk->signal; 576 struct taskstats *stats; 577 578 if (sig->stats || thread_group_empty(tsk)) 579 goto ret; 580 581 /* No problem if kmem_cache_zalloc() fails */ 582 stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL); 583 584 spin_lock_irq(&tsk->sighand->siglock); 585 if (!sig->stats) { 586 sig->stats = stats; 587 stats = NULL; 588 } 589 spin_unlock_irq(&tsk->sighand->siglock); 590 591 if (stats) 592 kmem_cache_free(taskstats_cache, stats); 593ret: 594 return sig->stats; 595} 596 597/* Send pid data out on exit */ 598void taskstats_exit(struct task_struct *tsk, int group_dead) 599{ 600 int rc; 601 struct listener_list *listeners; 602 struct taskstats *stats; 603 struct sk_buff *rep_skb; 604 size_t size; 605 int is_thread_group; 606 607 if (!family_registered) 608 return; 609 610 /* 611 * Size includes space for nested attributes 612 */ 613 size = taskstats_packet_size(); 614 615 is_thread_group = !!taskstats_tgid_alloc(tsk); 616 if (is_thread_group) { 617 /* PID + STATS + TGID + STATS */ 618 size = 2 * size; 619 /* fill the tsk->signal->stats structure */ 620 fill_tgid_exit(tsk); 621 } 622 623 listeners = __this_cpu_ptr(&listener_array); 624 if (list_empty(&listeners->list)) 625 return; 626 627 rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, size); 628 if (rc < 0) 629 return; 630 631 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, tsk->pid); 632 if (!stats) 633 goto err; 634 635 fill_stats(tsk, stats); 636 637 /* 638 * Doesn't matter if tsk is the leader or the last group member leaving 639 */ 640 if (!is_thread_group || !group_dead) 641 goto send; 642 643 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid); 644 if (!stats) 645 goto err; 646 647 memcpy(stats, tsk->signal->stats, sizeof(*stats)); 648 649send: 650 send_cpu_listeners(rep_skb, listeners); 651 return; 652err: 653 nlmsg_free(rep_skb); 654} 655 656static struct genl_ops taskstats_ops = { 657 .cmd = TASKSTATS_CMD_GET, 658 .doit = taskstats_user_cmd, 659 .policy = taskstats_cmd_get_policy, 660}; 661 662static struct genl_ops cgroupstats_ops = { 663 .cmd = CGROUPSTATS_CMD_GET, 664 .doit = cgroupstats_user_cmd, 665 .policy = cgroupstats_cmd_get_policy, 666}; 667 668/* Needed early in initialization */ 669void __init taskstats_init_early(void) 670{ 671 unsigned int i; 672 673 taskstats_cache = KMEM_CACHE(taskstats, SLAB_PANIC); 674 for_each_possible_cpu(i) { 675 INIT_LIST_HEAD(&(per_cpu(listener_array, i).list)); 676 init_rwsem(&(per_cpu(listener_array, i).sem)); 677 } 678} 679 680static int __init taskstats_init(void) 681{ 682 int rc; 683 684 rc = genl_register_family(&family); 685 if (rc) 686 return rc; 687 688 rc = genl_register_ops(&family, &taskstats_ops); 689 if (rc < 0) 690 goto err; 691 692 rc = genl_register_ops(&family, &cgroupstats_ops); 693 if (rc < 0) 694 goto err_cgroup_ops; 695 696 family_registered = 1; 697 pr_info("registered taskstats version %d\n", TASKSTATS_GENL_VERSION); 698 return 0; 699err_cgroup_ops: 700 genl_unregister_ops(&family, &taskstats_ops); 701err: 702 genl_unregister_family(&family); 703 return rc; 704} 705 706/* 707 * late initcall ensures initialization of statistics collection 708 * mechanisms precedes initialization of the taskstats interface 709 */ 710late_initcall(taskstats_init); 711