taskstats.c revision bb129994c3bff9c5e8df91f05d7e9b6402fbd83f
/*
 * taskstats.c - Export per-task statistics to userland
 *
 * Copyright (C) Shailabh Nagar, IBM Corp. 2006
 *           (C) Balbir Singh,   IBM Corp. 2006
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 */

#include <linux/kernel.h>
#include <linux/taskstats_kern.h>
#include <linux/delayacct.h>
#include <linux/cpumask.h>
#include <linux/percpu.h>
#include <net/genetlink.h>
#include <asm/atomic.h>

/*
 * Maximum length of a cpumask that can be specified in
 * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute
 */
#define TASKSTATS_CPUMASK_MAXLEN	(100+6*NR_CPUS)

static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 };
static int family_registered;
kmem_cache_t *taskstats_cache;

static struct genl_family family = {
	.id		= GENL_ID_GENERATE,
	.name		= TASKSTATS_GENL_NAME,
	.version	= TASKSTATS_GENL_VERSION,
	.maxattr	= TASKSTATS_CMD_ATTR_MAX,
};

static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1]
__read_mostly = {
	[TASKSTATS_CMD_ATTR_PID]  = { .type = NLA_U32 },
	[TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 },
	[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING },
	[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },
};

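/*
 * One registered exit-data listener.  An entry is only marked invalid
 * (rather than deleted in place) when a unicast to it fails, because
 * send_cpu_listeners() walks the list under a read lock; invalidated
 * entries are reaped afterwards under the write lock.
 */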
struct listener {
	struct list_head list;
	pid_t pid;
	char valid;
};

struct listener_list {
	struct rw_semaphore sem;
	struct list_head list;
};
static DEFINE_PER_CPU(struct listener_list, listener_array);

enum actions {
	REGISTER,
	DEREGISTER,
	CPU_DONT_CARE
};

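/*
 * Allocate a reply skb and write its genetlink header for @cmd.
 * Kernel-generated messages (@info == NULL), such as exit events, use a
 * per-cpu sequence number; replies to userspace requests echo the
 * sender's pid and sequence number from @info.
 */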
static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
			void **replyp, size_t size)
{
	struct sk_buff *skb;
	void *reply;

	/*
	 * If new attributes are added, please revisit this allocation
	 */
	skb = nlmsg_new(size);
	if (!skb)
		return -ENOMEM;

	if (!info) {
		int seq = get_cpu_var(taskstats_seqnum)++;
		put_cpu_var(taskstats_seqnum);

		reply = genlmsg_put(skb, 0, seq,
				family.id, 0, 0,
				cmd, family.version);
	} else
		reply = genlmsg_put(skb, info->snd_pid, info->snd_seq,
				family.id, 0, 0,
				cmd, family.version);
	if (reply == NULL) {
		nlmsg_free(skb);
		return -EINVAL;
	}

	*skbp = skb;
	*replyp = reply;
	return 0;
}

/*
 * Send taskstats data in @skb to listener with nl_pid @pid
 */
static int send_reply(struct sk_buff *skb, pid_t pid)
{
	struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
	void *reply = genlmsg_data(genlhdr);
	int rc;

	rc = genlmsg_end(skb, reply);
	if (rc < 0) {
		nlmsg_free(skb);
		return rc;
	}

	return genlmsg_unicast(skb, pid);
}

/*
 * Send taskstats data in @skb to listeners registered for @cpu's exit data
 */
static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
{
	struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
	struct listener_list *listeners;
	struct listener *s, *tmp;
	struct sk_buff *skb_next, *skb_cur = skb;
	void *reply = genlmsg_data(genlhdr);
	int rc, ret, delcount = 0;

	rc = genlmsg_end(skb, reply);
	if (rc < 0) {
		nlmsg_free(skb);
		return rc;
	}

	rc = 0;
	listeners = &per_cpu(listener_array, cpu);
	down_read(&listeners->sem);
	list_for_each_entry_safe(s, tmp, &listeners->list, list) {
		skb_next = NULL;
		if (!list_is_last(&s->list, &listeners->list)) {
			skb_next = skb_clone(skb_cur, GFP_KERNEL);
			if (!skb_next) {
				nlmsg_free(skb_cur);
				rc = -ENOMEM;
				break;
			}
		}
		ret = genlmsg_unicast(skb_cur, s->pid);
		if (ret == -ECONNREFUSED) {
			s->valid = 0;
			delcount++;
			rc = ret;
		}
		skb_cur = skb_next;
	}
	up_read(&listeners->sem);

	if (!delcount)
		return rc;

	/* Delete invalidated entries */
	down_write(&listeners->sem);
	list_for_each_entry_safe(s, tmp, &listeners->list, list) {
		if (!s->valid) {
			list_del(&s->list);
			kfree(s);
		}
	}
	up_write(&listeners->sem);
	return rc;
}

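/*
 * Fill @stats for a single task.  If @pidtsk is NULL the task is looked
 * up by @pid under tasklist_lock; either way a reference to the task is
 * taken and dropped again before returning.
 */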
static int fill_pid(pid_t pid, struct task_struct *pidtsk,
		struct taskstats *stats)
{
	int rc;
	struct task_struct *tsk = pidtsk;

	if (!pidtsk) {
		read_lock(&tasklist_lock);
		tsk = find_task_by_pid(pid);
		if (!tsk) {
			read_unlock(&tasklist_lock);
			return -ESRCH;
		}
		get_task_struct(tsk);
		read_unlock(&tasklist_lock);
	} else
		get_task_struct(tsk);

	/*
	 * Each accounting subsystem adds calls to its functions to
	 * fill in relevant parts of struct taskstats as follows
	 *
	 *	rc = per-task-foo(stats, tsk);
	 *	if (rc)
	 *		goto err;
	 */

	rc = delayacct_add_tsk(stats, tsk);
	stats->version = TASKSTATS_VERSION;

	/* Define err: label here if needed */
	put_task_struct(tsk);
	return rc;
}

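/*
 * Fill @stats with accumulated statistics for the whole thread group:
 * start from the totals already saved for dead tasks in signal->stats,
 * then add the stats of each live thread.
 */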
static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk,
		struct taskstats *stats)
{
	struct task_struct *tsk, *first;
	unsigned long flags;

	/*
	 * Add additional stats from live tasks, except zombie thread group
	 * leaders who are already counted with the dead tasks
	 */
	first = tgidtsk;
	if (!first) {
		read_lock(&tasklist_lock);
		first = find_task_by_pid(tgid);
		if (!first) {
			read_unlock(&tasklist_lock);
			return -ESRCH;
		}
		get_task_struct(first);
		read_unlock(&tasklist_lock);
	} else
		get_task_struct(first);

	/* Start with stats from dead tasks */
	spin_lock_irqsave(&first->signal->stats_lock, flags);
	if (first->signal->stats)
		memcpy(stats, first->signal->stats, sizeof(*stats));
	spin_unlock_irqrestore(&first->signal->stats_lock, flags);

	tsk = first;
	read_lock(&tasklist_lock);
	do {
		if (tsk->exit_state == EXIT_ZOMBIE && thread_group_leader(tsk))
			continue;
		/*
		 * Accounting subsystem can call its functions here to
		 * fill in relevant parts of struct taskstats as follows
		 *
		 *	per-task-foo(stats, tsk);
		 */
		delayacct_add_tsk(stats, tsk);
	} while_each_thread(first, tsk);
	read_unlock(&tasklist_lock);
	stats->version = TASKSTATS_VERSION;

	/*
	 * Accounting subsystems can also add calls here to modify
	 * fields of taskstats.
	 */

	return 0;
}

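/*
 * Accumulate the exiting task's stats into its thread group's
 * signal->stats, if that structure has been allocated.
 */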
static void fill_tgid_exit(struct task_struct *tsk)
{
	unsigned long flags;

	spin_lock_irqsave(&tsk->signal->stats_lock, flags);
	if (!tsk->signal->stats)
		goto ret;

	/*
	 * Each accounting subsystem calls its functions here to
	 * accumulate its per-task stats for tsk into the per-tgid structure
	 *
	 *	per-task-foo(tsk->signal->stats, tsk);
	 */
	delayacct_add_tsk(tsk->signal->stats, tsk);
ret:
	spin_unlock_irqrestore(&tsk->signal->stats_lock, flags);
	return;
}

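/*
 * Register or deregister @pid as a listener on every cpu in @maskp.
 * If an allocation fails partway through REGISTER, the entries added so
 * far are torn down by falling through to the deregistration loop,
 * which removes @pid from every cpu in the mask.
 */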
static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd)
{
	struct listener_list *listeners;
	struct listener *s, *tmp;
	unsigned int cpu;
	cpumask_t mask = *maskp;

	if (!cpus_subset(mask, cpu_possible_map))
		return -EINVAL;

	if (isadd == REGISTER) {
		for_each_cpu_mask(cpu, mask) {
			s = kmalloc_node(sizeof(struct listener), GFP_KERNEL,
					 cpu_to_node(cpu));
			if (!s)
				goto cleanup;
			s->pid = pid;
			INIT_LIST_HEAD(&s->list);
			s->valid = 1;

			listeners = &per_cpu(listener_array, cpu);
			down_write(&listeners->sem);
			list_add(&s->list, &listeners->list);
			up_write(&listeners->sem);
		}
		return 0;
	}

	/* Deregister or cleanup */
cleanup:
	for_each_cpu_mask(cpu, mask) {
		listeners = &per_cpu(listener_array, cpu);
		down_write(&listeners->sem);
		list_for_each_entry_safe(s, tmp, &listeners->list, list) {
			if (s->pid == pid) {
				list_del(&s->list);
				kfree(s);
				break;
			}
		}
		up_write(&listeners->sem);
	}
	return 0;
}

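/*
 * Parse a cpumask attribute into @mask.  The attribute payload is a
 * cpulist string as accepted by cpulist_parse(), e.g. "0-3,7".
 * Returns 1 if the attribute is absent, 0 on success, or a negative
 * errno on failure; callers use the 1/0 distinction to tell a missing
 * attribute from a successfully parsed one.
 */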
static int parse(struct nlattr *na, cpumask_t *mask)
{
	char *data;
	int len;
	int ret;

	if (na == NULL)
		return 1;
	len = nla_len(na);
	if (len > TASKSTATS_CPUMASK_MAXLEN)
		return -E2BIG;
	if (len < 1)
		return -EINVAL;
	data = kmalloc(len, GFP_KERNEL);
	if (!data)
		return -ENOMEM;
	nla_strlcpy(data, na, len);
	ret = cpulist_parse(data, *mask);
	kfree(data);
	return ret;
}

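/*
 * Handle a TASKSTATS_CMD_GET request from userspace: either register or
 * deregister the sender as a per-cpu exit listener, or reply with the
 * stats of the pid/tgid named in the request.  See getdelays.c under
 * Documentation/accounting/ for a sample userspace client, assuming it
 * is present in this tree.
 */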
static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
{
	int rc = 0;
	struct sk_buff *rep_skb;
	struct taskstats stats;
	void *reply;
	size_t size;
	struct nlattr *na;
	cpumask_t mask;

	rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask);
	if (rc < 0)
		return rc;
	if (rc == 0)
		return add_del_listener(info->snd_pid, &mask, REGISTER);

	rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], &mask);
	if (rc < 0)
		return rc;
	if (rc == 0)
		return add_del_listener(info->snd_pid, &mask, DEREGISTER);

	/*
	 * Size includes space for nested attributes
	 */
	size = nla_total_size(sizeof(u32)) +
		nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);

	memset(&stats, 0, sizeof(stats));
	rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
	if (rc < 0)
		return rc;

	if (info->attrs[TASKSTATS_CMD_ATTR_PID]) {
		u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]);
		rc = fill_pid(pid, NULL, &stats);
		if (rc < 0)
			goto err;

		na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID);
		NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, pid);
		NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
				stats);
	} else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) {
		u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
		rc = fill_tgid(tgid, NULL, &stats);
		if (rc < 0)
			goto err;

		na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID);
		NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, tgid);
		NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
				stats);
	} else {
		rc = -EINVAL;
		goto err;
	}

	nla_nest_end(rep_skb, na);

	return send_reply(rep_skb, info->snd_pid);

nla_put_failure:
	return genlmsg_cancel(rep_skb, reply);
err:
	nlmsg_free(rep_skb);
	return rc;
}

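/*
 * Record the cpu the task is exiting on and allocate its exit-event
 * buffer.  The buffer is freed again, leaving *ptidstats NULL, when no
 * listener is registered on that cpu, so the common no-listener case
 * pays only for an allocation and a list check.
 */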
void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu)
{
	struct listener_list *listeners;
	struct taskstats *tmp;
	/*
	 * This is the cpu on which the task is exiting currently and will
	 * be the one for which the exit event is sent, even if the cpu
	 * on which this function is running changes later.
	 */
	*mycpu = raw_smp_processor_id();

	*ptidstats = NULL;
	tmp = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL);
	if (!tmp)
		return;

	listeners = &per_cpu(listener_array, *mycpu);
	down_read(&listeners->sem);
	if (!list_empty(&listeners->list)) {
		*ptidstats = tmp;
		tmp = NULL;
	}
	up_read(&listeners->sem);
	kfree(tmp);
}

/* Send pid data out on exit */
void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
			int group_dead, unsigned int mycpu)
{
	int rc;
	struct sk_buff *rep_skb;
	void *reply;
	size_t size;
	int is_thread_group;
	struct nlattr *na;
	unsigned long flags;

	if (!family_registered || !tidstats)
		return;

	spin_lock_irqsave(&tsk->signal->stats_lock, flags);
	is_thread_group = tsk->signal->stats ? 1 : 0;
	spin_unlock_irqrestore(&tsk->signal->stats_lock, flags);

	rc = 0;
	/*
	 * Size includes space for nested attributes
	 */
	size = nla_total_size(sizeof(u32)) +
		nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);

	if (is_thread_group)
		size = 2 * size;	/* PID + STATS + TGID + STATS */

	rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
	if (rc < 0)
		goto ret;

	rc = fill_pid(tsk->pid, tsk, tidstats);
	if (rc < 0)
		goto err_skb;

	na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID);
	NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, (u32)tsk->pid);
	NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
			*tidstats);
	nla_nest_end(rep_skb, na);

	if (!is_thread_group)
		goto send;

	/*
	 * tsk has/had a thread group, so fill in the tsk->signal->stats
	 * structure.  It doesn't matter whether tsk is the leader or the
	 * last group member leaving.
	 */
	fill_tgid_exit(tsk);
	if (!group_dead)
		goto send;

	na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID);
	NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, (u32)tsk->tgid);
	/* No locking needed for tsk->signal->stats since group is dead */
	NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
			*tsk->signal->stats);
	nla_nest_end(rep_skb, na);

send:
	send_cpu_listeners(rep_skb, mycpu);
	return;

nla_put_failure:
	genlmsg_cancel(rep_skb, reply);
	goto ret;
err_skb:
	nlmsg_free(rep_skb);
ret:
	return;
}

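/*
 * A single TASKSTATS_CMD_GET operation covers both stats queries and
 * listener (de)registration; taskstats_user_cmd() dispatches on which
 * attribute is present in the request.
 */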
static struct genl_ops taskstats_ops = {
	.cmd		= TASKSTATS_CMD_GET,
	.doit		= taskstats_user_cmd,
	.policy		= taskstats_cmd_get_policy,
};

/* Needed early in initialization */
void __init taskstats_init_early(void)
{
	unsigned int i;

	taskstats_cache = kmem_cache_create("taskstats_cache",
						sizeof(struct taskstats),
						0, SLAB_PANIC, NULL, NULL);
	for_each_possible_cpu(i) {
		INIT_LIST_HEAD(&(per_cpu(listener_array, i).list));
		init_rwsem(&(per_cpu(listener_array, i).sem));
	}
}

static int __init taskstats_init(void)
{
	int rc;

	rc = genl_register_family(&family);
	if (rc)
		return rc;

	rc = genl_register_ops(&family, &taskstats_ops);
	if (rc < 0)
		goto err;

	family_registered = 1;
	return 0;
err:
	genl_unregister_family(&family);
	return rc;
}

/*
 * late initcall ensures initialization of statistics collection
 * mechanisms precedes initialization of the taskstats interface
 */
late_initcall(taskstats_init);