taskstats.c revision 4a279ff1ea1cf325775ada983035123fcdc8e986
/*
 * taskstats.c - Export per-task statistics to userland
 *
 * Copyright (C) Shailabh Nagar, IBM Corp. 2006
 *           (C) Balbir Singh,   IBM Corp. 2006
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 */

#include <linux/kernel.h>
#include <linux/taskstats_kern.h>
#include <linux/tsacct_kern.h>
#include <linux/delayacct.h>
#include <linux/cpumask.h>
#include <linux/percpu.h>
#include <net/genetlink.h>
#include <asm/atomic.h>

/*
 * Maximum length of a cpumask that can be specified in
 * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute
 */
#define TASKSTATS_CPUMASK_MAXLEN	(100+6*NR_CPUS)

static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 };
static int family_registered;
kmem_cache_t *taskstats_cache;

static struct genl_family family = {
	.id		= GENL_ID_GENERATE,
	.name		= TASKSTATS_GENL_NAME,
	.version	= TASKSTATS_GENL_VERSION,
	.maxattr	= TASKSTATS_CMD_ATTR_MAX,
};

static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1]
__read_mostly = {
	[TASKSTATS_CMD_ATTR_PID]  = { .type = NLA_U32 },
	[TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 },
	[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK]  = { .type = NLA_STRING },
	[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },
};

struct listener {
	struct list_head list;
	pid_t pid;
	char valid;
};

struct listener_list {
	struct rw_semaphore sem;
	struct list_head list;
};
static DEFINE_PER_CPU(struct listener_list, listener_array);

enum actions {
	REGISTER,
	DEREGISTER,
	CPU_DONT_CARE
};

static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
			void **replyp, size_t size)
{
	struct sk_buff *skb;
	void *reply;

	/*
	 * If new attributes are added, please revisit this allocation
	 */
	size = nlmsg_total_size(genlmsg_total_size(size));
	skb = nlmsg_new(size, GFP_KERNEL);
	if (!skb)
		return -ENOMEM;

	if (!info) {
		int seq = get_cpu_var(taskstats_seqnum)++;
		put_cpu_var(taskstats_seqnum);

		reply = genlmsg_put(skb, 0, seq,
				family.id, 0, 0,
				cmd, family.version);
	} else
		reply = genlmsg_put(skb, info->snd_pid, info->snd_seq,
				family.id, 0, 0,
				cmd, family.version);
	if (reply == NULL) {
		nlmsg_free(skb);
		return -EINVAL;
	}

	*skbp = skb;
	*replyp = reply;
	return 0;
}

/*
 * Send taskstats data in @skb to listener with nl_pid @pid
 */
static int send_reply(struct sk_buff *skb, pid_t pid)
{
	struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
	void *reply = genlmsg_data(genlhdr);
	int rc;

	rc = genlmsg_end(skb, reply);
	if (rc < 0) {
		nlmsg_free(skb);
		return rc;
	}

	return genlmsg_unicast(skb, pid);
}
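/*
 * For reference, a finished reply as built by prepare_reply() and pushed
 * out by send_reply() or send_cpu_listeners() is laid out as:
 *
 *	struct nlmsghdr		(seq is info->snd_seq when answering a
 *				 TASKSTATS_CMD_GET request, or this cpu's
 *				 taskstats_seqnum for unsolicited exit records)
 *	struct genlmsghdr	(cmd = TASKSTATS_CMD_NEW in both callers)
 *	attributes		(nested payload appended by the callers below)
 */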
/*
 * Send taskstats data in @skb to listeners registered for @cpu's exit data
 */
static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
{
	struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
	struct listener_list *listeners;
	struct listener *s, *tmp;
	struct sk_buff *skb_next, *skb_cur = skb;
	void *reply = genlmsg_data(genlhdr);
	int rc, delcount = 0;

	rc = genlmsg_end(skb, reply);
	if (rc < 0) {
		nlmsg_free(skb);
		return;
	}

	rc = 0;
	listeners = &per_cpu(listener_array, cpu);
	down_read(&listeners->sem);
	list_for_each_entry(s, &listeners->list, list) {
		skb_next = NULL;
		if (!list_is_last(&s->list, &listeners->list)) {
			skb_next = skb_clone(skb_cur, GFP_KERNEL);
			if (!skb_next)
				break;
		}
		rc = genlmsg_unicast(skb_cur, s->pid);
		if (rc == -ECONNREFUSED) {
			s->valid = 0;
			delcount++;
		}
		skb_cur = skb_next;
	}
	up_read(&listeners->sem);

	if (skb_cur)
		nlmsg_free(skb_cur);

	if (!delcount)
		return;

	/* Delete invalidated entries */
	down_write(&listeners->sem);
	list_for_each_entry_safe(s, tmp, &listeners->list, list) {
		if (!s->valid) {
			list_del(&s->list);
			kfree(s);
		}
	}
	up_write(&listeners->sem);
}

static int fill_pid(pid_t pid, struct task_struct *tsk,
		struct taskstats *stats)
{
	int rc = 0;

	if (!tsk) {
		rcu_read_lock();
		tsk = find_task_by_pid(pid);
		if (tsk)
			get_task_struct(tsk);
		rcu_read_unlock();
		if (!tsk)
			return -ESRCH;
	} else
		get_task_struct(tsk);

	/*
	 * Each accounting subsystem adds calls to its functions to
	 * fill in relevant parts of struct taskstats as follows
	 *
	 *	per-task-foo(stats, tsk);
	 */

	delayacct_add_tsk(stats, tsk);

	/* fill in basic acct fields */
	stats->version = TASKSTATS_VERSION;
	bacct_add_tsk(stats, tsk);

	/* fill in extended acct fields */
	xacct_add_tsk(stats, tsk);

	/* Define err: label here if needed */
	put_task_struct(tsk);
	return rc;
}

static int fill_tgid(pid_t tgid, struct task_struct *first,
		struct taskstats *stats)
{
	struct task_struct *tsk;
	unsigned long flags;
	int rc = -ESRCH;

	/*
	 * Add additional stats from live tasks except zombie thread group
	 * leaders who are already counted with the dead tasks
	 */
	rcu_read_lock();
	if (!first)
		first = find_task_by_pid(tgid);

	if (!first || !lock_task_sighand(first, &flags))
		goto out;

	if (first->signal->stats)
		memcpy(stats, first->signal->stats, sizeof(*stats));

	tsk = first;
	do {
		if (tsk->exit_state)
			continue;
		/*
		 * Accounting subsystem can call its functions here to
		 * fill in relevant parts of struct taskstats as follows
		 *
		 *	per-task-foo(stats, tsk);
		 */
		delayacct_add_tsk(stats, tsk);

	} while_each_thread(first, tsk);

	unlock_task_sighand(first, &flags);
	rc = 0;
out:
	rcu_read_unlock();

	stats->version = TASKSTATS_VERSION;
	/*
	 * Accounting subsystems can also add calls here to modify
	 * fields of taskstats.
	 */
	return rc;
}
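/*
 * Note on aggregation in fill_tgid(): tsk->signal->stats already holds the
 * accumulated stats of group members that exited earlier (filled in by
 * fill_tgid_exit() below), so it is copied in as the baseline and only
 * live threads are added on top; threads with a nonzero exit_state are
 * skipped to avoid counting an exiting member twice.
 */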
static void fill_tgid_exit(struct task_struct *tsk)
{
	unsigned long flags;

	spin_lock_irqsave(&tsk->sighand->siglock, flags);
	if (!tsk->signal->stats)
		goto ret;

	/*
	 * Each accounting subsystem calls its functions here to
	 * accumulate its per-task stats for tsk, into the per-tgid structure
	 *
	 *	per-task-foo(tsk->signal->stats, tsk);
	 */
	delayacct_add_tsk(tsk->signal->stats, tsk);
ret:
	spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
	return;
}

static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd)
{
	struct listener_list *listeners;
	struct listener *s, *tmp;
	unsigned int cpu;
	cpumask_t mask = *maskp;

	if (!cpus_subset(mask, cpu_possible_map))
		return -EINVAL;

	if (isadd == REGISTER) {
		for_each_cpu_mask(cpu, mask) {
			s = kmalloc_node(sizeof(struct listener), GFP_KERNEL,
					 cpu_to_node(cpu));
			if (!s)
				goto cleanup;
			s->pid = pid;
			INIT_LIST_HEAD(&s->list);
			s->valid = 1;

			listeners = &per_cpu(listener_array, cpu);
			down_write(&listeners->sem);
			list_add(&s->list, &listeners->list);
			up_write(&listeners->sem);
		}
		return 0;
	}

	/* Deregister or cleanup */
cleanup:
	for_each_cpu_mask(cpu, mask) {
		listeners = &per_cpu(listener_array, cpu);
		down_write(&listeners->sem);
		list_for_each_entry_safe(s, tmp, &listeners->list, list) {
			if (s->pid == pid) {
				list_del(&s->list);
				kfree(s);
				break;
			}
		}
		up_write(&listeners->sem);
	}
	return 0;
}

static int parse(struct nlattr *na, cpumask_t *mask)
{
	char *data;
	int len;
	int ret;

	if (na == NULL)
		return 1;
	len = nla_len(na);
	if (len > TASKSTATS_CPUMASK_MAXLEN)
		return -E2BIG;
	if (len < 1)
		return -EINVAL;
	data = kmalloc(len, GFP_KERNEL);
	if (!data)
		return -ENOMEM;
	nla_strlcpy(data, na, len);
	ret = cpulist_parse(data, *mask);
	kfree(data);
	return ret;
}

static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
{
	int rc = 0;
	struct sk_buff *rep_skb;
	struct taskstats stats;
	void *reply;
	size_t size;
	struct nlattr *na;
	cpumask_t mask;

	rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask);
	if (rc < 0)
		return rc;
	if (rc == 0)
		return add_del_listener(info->snd_pid, &mask, REGISTER);

	rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], &mask);
	if (rc < 0)
		return rc;
	if (rc == 0)
		return add_del_listener(info->snd_pid, &mask, DEREGISTER);

	/*
	 * Size includes space for nested attributes
	 */
	size = nla_total_size(sizeof(u32)) +
		nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);

	memset(&stats, 0, sizeof(stats));
	rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
	if (rc < 0)
		return rc;

	if (info->attrs[TASKSTATS_CMD_ATTR_PID]) {
		u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]);
		rc = fill_pid(pid, NULL, &stats);
		if (rc < 0)
			goto err;

		na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID);
		NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, pid);
		NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
				stats);
	} else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) {
		u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
		rc = fill_tgid(tgid, NULL, &stats);
		if (rc < 0)
			goto err;

		na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID);
		NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, tgid);
		NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
				stats);
	} else {
		rc = -EINVAL;
		goto err;
	}

	nla_nest_end(rep_skb, na);

	return send_reply(rep_skb, info->snd_pid);

nla_put_failure:
	rc = genlmsg_cancel(rep_skb, reply);
err:
	nlmsg_free(rep_skb);
	return rc;
}
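/*
 * A successful TASKSTATS_CMD_GET reply built by taskstats_user_cmd() above
 * therefore nests its payload as:
 *
 *	TASKSTATS_TYPE_AGGR_PID (or TASKSTATS_TYPE_AGGR_TGID)
 *		TASKSTATS_TYPE_PID  (or TASKSTATS_TYPE_TGID)	u32
 *		TASKSTATS_TYPE_STATS				struct taskstats
 */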
void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu)
{
	struct listener_list *listeners;
	struct taskstats *tmp;
	/*
	 * This is the cpu on which the task is exiting currently and will
	 * be the one for which the exit event is sent, even if the cpu
	 * on which this function is running changes later.
	 */
	*mycpu = raw_smp_processor_id();

	*ptidstats = NULL;
	tmp = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL);
	if (!tmp)
		return;

	listeners = &per_cpu(listener_array, *mycpu);
	down_read(&listeners->sem);
	if (!list_empty(&listeners->list)) {
		*ptidstats = tmp;
		tmp = NULL;
	}
	up_read(&listeners->sem);
	kfree(tmp);
}

/* Send pid data out on exit */
void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
			int group_dead, unsigned int mycpu)
{
	int rc;
	struct sk_buff *rep_skb;
	void *reply;
	size_t size;
	int is_thread_group;
	struct nlattr *na;

	if (!family_registered)
		return;

	/*
	 * Size includes space for nested attributes
	 */
	size = nla_total_size(sizeof(u32)) +
		nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);

	is_thread_group = (tsk->signal->stats != NULL);
	if (is_thread_group) {
		/* PID + STATS + TGID + STATS */
		size = 2 * size;
		/* fill the tsk->signal->stats structure */
		fill_tgid_exit(tsk);
	}

	if (!tidstats)
		return;

	rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
	if (rc < 0)
		goto ret;

	rc = fill_pid(tsk->pid, tsk, tidstats);
	if (rc < 0)
		goto err_skb;

	na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID);
	NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, (u32)tsk->pid);
	NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
			*tidstats);
	nla_nest_end(rep_skb, na);

	if (!is_thread_group)
		goto send;

	/*
	 * Doesn't matter if tsk is the leader or the last group member leaving
	 */
	if (!group_dead)
		goto send;

	na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID);
	NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, (u32)tsk->tgid);
	/* No locking needed for tsk->signal->stats since group is dead */
	NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
			*tsk->signal->stats);
	nla_nest_end(rep_skb, na);

send:
	send_cpu_listeners(rep_skb, mycpu);
	return;

nla_put_failure:
	genlmsg_cancel(rep_skb, reply);
err_skb:
	nlmsg_free(rep_skb);
ret:
	return;
}
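/*
 * Summary of the exit path above: taskstats_exit_send() always emits a
 * per-pid record for the exiting task (provided a listener is registered
 * on its cpu, which is what taskstats_exit_alloc() checked); a second,
 * per-tgid record carrying the accumulated tsk->signal->stats is appended
 * only when the last member of a thread group exits (group_dead).
 */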
static struct genl_ops taskstats_ops = {
	.cmd		= TASKSTATS_CMD_GET,
	.doit		= taskstats_user_cmd,
	.policy		= taskstats_cmd_get_policy,
};

/* Needed early in initialization */
void __init taskstats_init_early(void)
{
	unsigned int i;

	taskstats_cache = kmem_cache_create("taskstats_cache",
						sizeof(struct taskstats),
						0, SLAB_PANIC, NULL, NULL);
	for_each_possible_cpu(i) {
		INIT_LIST_HEAD(&(per_cpu(listener_array, i).list));
		init_rwsem(&(per_cpu(listener_array, i).sem));
	}
}

static int __init taskstats_init(void)
{
	int rc;

	rc = genl_register_family(&family);
	if (rc)
		return rc;

	rc = genl_register_ops(&family, &taskstats_ops);
	if (rc < 0)
		goto err;

	family_registered = 1;
	return 0;
err:
	genl_unregister_family(&family);
	return rc;
}

/*
 * late initcall ensures initialization of statistics collection
 * mechanisms precedes initialization of the taskstats interface
 */
late_initcall(taskstats_init);
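Appendix: example userspace client

A minimal sketch of a userspace query client, illustrative rather than part of taskstats.c. It resolves the dynamic genetlink family id for TASKSTATS_GENL_NAME via the genetlink controller, sends TASKSTATS_CMD_GET with TASKSTATS_CMD_ATTR_PID, and unpacks the nested reply shown above. The raw-netlink plumbing is modeled on the kernel's getdelays.c accounting example; struct msgtemplate and the GENLMSG_DATA/NLA_DATA macros are local conveniences, not kernel API, and error handling is pared down for brevity.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/genetlink.h>
#include <linux/taskstats.h>

/* Local helpers: skip the netlink+genetlink headers / an attribute header */
#define GENLMSG_DATA(nlh)	((void *)((char *)NLMSG_DATA(nlh) + GENL_HDRLEN))
#define NLA_DATA(na)		((void *)((char *)(na) + NLA_HDRLEN))

struct msgtemplate {
	struct nlmsghdr n;
	struct genlmsghdr g;
	char buf[1024];
};

/* Build and send one genetlink request carrying a single attribute */
static int send_cmd(int sd, __u16 nlmsg_type, __u8 genl_cmd,
		    __u16 nla_type, void *nla_data, int nla_len)
{
	struct sockaddr_nl sa = { .nl_family = AF_NETLINK };
	struct msgtemplate msg;
	struct nlattr *na;

	memset(&msg, 0, sizeof(msg));
	msg.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN);
	msg.n.nlmsg_type = nlmsg_type;
	msg.n.nlmsg_flags = NLM_F_REQUEST;
	msg.n.nlmsg_pid = getpid();
	msg.g.cmd = genl_cmd;
	msg.g.version = 0x1;

	na = (struct nlattr *)((char *)&msg + msg.n.nlmsg_len);
	na->nla_type = nla_type;
	na->nla_len = NLA_HDRLEN + nla_len;
	memcpy(NLA_DATA(na), nla_data, nla_len);
	msg.n.nlmsg_len += NLMSG_ALIGN(na->nla_len);

	return sendto(sd, &msg, msg.n.nlmsg_len, 0,
		      (struct sockaddr *)&sa, sizeof(sa));
}

int main(int argc, char *argv[])
{
	struct msgtemplate msg;
	struct nlattr *na;
	struct taskstats *ts;
	__u16 family_id;
	__u32 pid;
	int sd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <pid>\n", argv[0]);
		return 1;
	}
	pid = atoi(argv[1]);

	sd = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC);
	if (sd < 0) {
		perror("socket");
		return 1;
	}

	/* Resolve the dynamic family id from the genetlink controller;
	 * the reply carries CTRL_ATTR_FAMILY_NAME then CTRL_ATTR_FAMILY_ID */
	send_cmd(sd, GENL_ID_CTRL, CTRL_CMD_GETFAMILY, CTRL_ATTR_FAMILY_NAME,
		 (void *)TASKSTATS_GENL_NAME, strlen(TASKSTATS_GENL_NAME) + 1);
	if (recv(sd, &msg, sizeof(msg), 0) < 0 ||
	    msg.n.nlmsg_type == NLMSG_ERROR) {
		fprintf(stderr, "cannot resolve taskstats family\n");
		return 1;
	}
	na = GENLMSG_DATA(&msg);			/* FAMILY_NAME */
	na = (void *)((char *)na + NLA_ALIGN(na->nla_len));
	family_id = *(__u16 *)NLA_DATA(na);		/* FAMILY_ID */

	/* Query one pid; the kernel answers with TASKSTATS_CMD_NEW */
	send_cmd(sd, family_id, TASKSTATS_CMD_GET, TASKSTATS_CMD_ATTR_PID,
		 &pid, sizeof(pid));
	if (recv(sd, &msg, sizeof(msg), 0) < 0 ||
	    msg.n.nlmsg_type == NLMSG_ERROR) {
		fprintf(stderr, "no stats for pid %u\n", pid);
		return 1;
	}

	/* Walk AGGR_PID -> { PID, STATS }, as nested by taskstats_user_cmd() */
	na = GENLMSG_DATA(&msg);			/* TYPE_AGGR_PID */
	na = (void *)((char *)na + NLA_HDRLEN);		/* TYPE_PID */
	na = (void *)((char *)na + NLA_ALIGN(na->nla_len));
	if (na->nla_type == TASKSTATS_TYPE_STATS) {
		ts = NLA_DATA(na);
		printf("pid %u: version %u comm %s\n",
		       pid, ts->version, ts->ac_comm);
	}
	close(sd);
	return 0;
}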