taskstats.c revision 0ae646845b603e9df5711084436d389f8371ffb3
/*
 * taskstats.c - Export per-task statistics to userland
 *
 * Copyright (C) Shailabh Nagar, IBM Corp. 2006
 *           (C) Balbir Singh,   IBM Corp. 2006
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 */

#include <linux/kernel.h>
#include <linux/taskstats_kern.h>
#include <linux/delayacct.h>
#include <linux/cpumask.h>
#include <linux/percpu.h>
#include <net/genetlink.h>
#include <asm/atomic.h>

/*
 * Maximum length of a cpumask that can be specified in
 * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute
 */
#define TASKSTATS_CPUMASK_MAXLEN	(100+6*NR_CPUS)

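/*
 * Per-cpu netlink sequence number for kernel-initiated messages: exit
 * events have no originating request whose sequence number could be
 * echoed back (see prepare_reply() below).
 */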
static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 };
static int family_registered;
kmem_cache_t *taskstats_cache;

static struct genl_family family = {
	.id		= GENL_ID_GENERATE,
	.name		= TASKSTATS_GENL_NAME,
	.version	= TASKSTATS_GENL_VERSION,
	.maxattr	= TASKSTATS_CMD_ATTR_MAX,
};

static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1]
__read_mostly = {
	[TASKSTATS_CMD_ATTR_PID]  = { .type = NLA_U32 },
	[TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 },
	[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING },
	[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },
};

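/*
 * One listener entry per registered pid on a given cpu's list.  "valid"
 * is cleared when a send fails with -ECONNREFUSED, so the entry can be
 * reaped later under the write side of the semaphore (see
 * send_cpu_listeners()).
 */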
struct listener {
	struct list_head list;
	pid_t pid;
	char valid;
};

struct listener_list {
	struct rw_semaphore sem;
	struct list_head list;
};
static DEFINE_PER_CPU(struct listener_list, listener_array);

enum actions {
	REGISTER,
	DEREGISTER,
	CPU_DONT_CARE
};

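/*
 * Allocate a sk_buff and begin a generic netlink message in it.  With a
 * genl_info this is a reply to a user request; with info == NULL it is a
 * kernel-initiated message stamped with this cpu's taskstats_seqnum.
 */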
static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
			void **replyp, size_t size)
{
	struct sk_buff *skb;
	void *reply;

	/*
	 * If new attributes are added, please revisit this allocation
	 */
	skb = nlmsg_new(genlmsg_total_size(size), GFP_KERNEL);
	if (!skb)
		return -ENOMEM;

	if (!info) {
		int seq = get_cpu_var(taskstats_seqnum)++;
		put_cpu_var(taskstats_seqnum);

		reply = genlmsg_put(skb, 0, seq,
				family.id, 0, 0,
				cmd, family.version);
	} else
		reply = genlmsg_put(skb, info->snd_pid, info->snd_seq,
				family.id, 0, 0,
				cmd, family.version);
	if (reply == NULL) {
		nlmsg_free(skb);
		return -EINVAL;
	}

	*skbp = skb;
	*replyp = reply;
	return 0;
}

/*
 * Send taskstats data in @skb to listener with nl_pid @pid
 */
static int send_reply(struct sk_buff *skb, pid_t pid)
{
	struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
	void *reply = genlmsg_data(genlhdr);
	int rc;

	rc = genlmsg_end(skb, reply);
	if (rc < 0) {
		nlmsg_free(skb);
		return rc;
	}

	return genlmsg_unicast(skb, pid);
}

/*
 * Send taskstats data in @skb to listeners registered for @cpu's exit data
 */
static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
{
	struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
	struct listener_list *listeners;
	struct listener *s, *tmp;
	struct sk_buff *skb_next, *skb_cur = skb;
	void *reply = genlmsg_data(genlhdr);
	int rc, delcount = 0;

	rc = genlmsg_end(skb, reply);
	if (rc < 0) {
		nlmsg_free(skb);
		return;
	}

	rc = 0;
	listeners = &per_cpu(listener_array, cpu);
	down_read(&listeners->sem);
	list_for_each_entry(s, &listeners->list, list) {
		skb_next = NULL;
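		/*
		 * Every listener but the last gets a clone; the final
		 * genlmsg_unicast() consumes skb_cur itself.
		 */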
		if (!list_is_last(&s->list, &listeners->list)) {
			skb_next = skb_clone(skb_cur, GFP_KERNEL);
			if (!skb_next)
				break;
		}
		rc = genlmsg_unicast(skb_cur, s->pid);
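		/*
		 * -ECONNREFUSED means the listener's socket is gone: mark
		 * the entry so it is deleted below under the write lock.
		 */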
		if (rc == -ECONNREFUSED) {
			s->valid = 0;
			delcount++;
		}
		skb_cur = skb_next;
	}
	up_read(&listeners->sem);

	if (skb_cur)
		nlmsg_free(skb_cur);

	if (!delcount)
		return;

	/* Delete invalidated entries */
	down_write(&listeners->sem);
	list_for_each_entry_safe(s, tmp, &listeners->list, list) {
		if (!s->valid) {
			list_del(&s->list);
			kfree(s);
		}
	}
	up_write(&listeners->sem);
}

static int fill_pid(pid_t pid, struct task_struct *pidtsk,
		struct taskstats *stats)
{
	int rc = 0;
	struct task_struct *tsk = pidtsk;

	if (!pidtsk) {
		read_lock(&tasklist_lock);
		tsk = find_task_by_pid(pid);
		if (!tsk) {
			read_unlock(&tasklist_lock);
			return -ESRCH;
		}
		get_task_struct(tsk);
		read_unlock(&tasklist_lock);
	} else
		get_task_struct(tsk);

	/*
	 * Each accounting subsystem adds calls to its functions to
	 * fill in relevant parts of struct taskstats as follows
	 *
	 *	per-task-foo(stats, tsk);
	 */

	delayacct_add_tsk(stats, tsk);
	stats->version = TASKSTATS_VERSION;

	/* Define err: label here if needed */
	put_task_struct(tsk);
	return rc;
}

static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk,
		struct taskstats *stats)
{
	struct task_struct *tsk, *first;
	unsigned long flags;

	/*
	 * Add additional stats from live tasks except zombie thread group
	 * leaders who are already counted with the dead tasks
	 */
	first = tgidtsk;
	if (!first) {
		read_lock(&tasklist_lock);
		first = find_task_by_pid(tgid);
		if (!first) {
			read_unlock(&tasklist_lock);
			return -ESRCH;
		}
		get_task_struct(first);
		read_unlock(&tasklist_lock);
	} else
		get_task_struct(first);

	/* Start with stats from dead tasks */
	spin_lock_irqsave(&first->signal->stats_lock, flags);
	if (first->signal->stats)
		memcpy(stats, first->signal->stats, sizeof(*stats));
	spin_unlock_irqrestore(&first->signal->stats_lock, flags);

	tsk = first;
	read_lock(&tasklist_lock);
	do {
		if (tsk->exit_state == EXIT_ZOMBIE && thread_group_leader(tsk))
			continue;
		/*
		 * Accounting subsystem can call its functions here to
		 * fill in relevant parts of struct taskstats as follows
		 *
		 *	per-task-foo(stats, tsk);
		 */
		delayacct_add_tsk(stats, tsk);

	} while_each_thread(first, tsk);
	read_unlock(&tasklist_lock);
	stats->version = TASKSTATS_VERSION;

	/*
	 * Accounting subsystems can also add calls here to modify
	 * fields of taskstats.
	 */

	return 0;
}

static void fill_tgid_exit(struct task_struct *tsk)
{
	unsigned long flags;

	spin_lock_irqsave(&tsk->signal->stats_lock, flags);
	if (!tsk->signal->stats)
		goto ret;

	/*
	 * Each accounting subsystem calls its functions here to
	 * accumulate its per-task stats for tsk, into the per-tgid structure
	 *
	 *	per-task-foo(tsk->signal->stats, tsk);
	 */
	delayacct_add_tsk(tsk->signal->stats, tsk);
ret:
	spin_unlock_irqrestore(&tsk->signal->stats_lock, flags);
	return;
}

static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd)
{
	struct listener_list *listeners;
	struct listener *s, *tmp;
	unsigned int cpu;
	cpumask_t mask = *maskp;

	if (!cpus_subset(mask, cpu_possible_map))
		return -EINVAL;

	if (isadd == REGISTER) {
		for_each_cpu_mask(cpu, mask) {
			s = kmalloc_node(sizeof(struct listener), GFP_KERNEL,
					 cpu_to_node(cpu));
			if (!s)
				goto cleanup;
			s->pid = pid;
			INIT_LIST_HEAD(&s->list);
			s->valid = 1;

			listeners = &per_cpu(listener_array, cpu);
			down_write(&listeners->sem);
			list_add(&s->list, &listeners->list);
			up_write(&listeners->sem);
		}
		return 0;
	}

	/* Deregister or cleanup */
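	/*
	 * A failed REGISTER above also falls through here; cpus whose
	 * entries were never added are simply not found in the lists.
	 */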
cleanup:
	for_each_cpu_mask(cpu, mask) {
		listeners = &per_cpu(listener_array, cpu);
		down_write(&listeners->sem);
		list_for_each_entry_safe(s, tmp, &listeners->list, list) {
			if (s->pid == pid) {
				list_del(&s->list);
				kfree(s);
				break;
			}
		}
		up_write(&listeners->sem);
	}
	return 0;
}

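/*
 * Parse a REGISTER/DEREGISTER_CPUMASK attribute into *mask.  The mask
 * arrives as a cpulist string, e.g. "0-3,8".  Returns 0 on success, 1 if
 * the attribute is absent, and a negative errno on error.
 */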
static int parse(struct nlattr *na, cpumask_t *mask)
{
	char *data;
	int len;
	int ret;

	if (na == NULL)
		return 1;
	len = nla_len(na);
	if (len > TASKSTATS_CPUMASK_MAXLEN)
		return -E2BIG;
	if (len < 1)
		return -EINVAL;
	data = kmalloc(len, GFP_KERNEL);
	if (!data)
		return -ENOMEM;
	nla_strlcpy(data, na, len);
	ret = cpulist_parse(data, *mask);
	kfree(data);
	return ret;
}

static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
{
	int rc = 0;
	struct sk_buff *rep_skb;
	struct taskstats stats;
	void *reply;
	size_t size;
	struct nlattr *na;
	cpumask_t mask;

	rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask);
	if (rc < 0)
		return rc;
	if (rc == 0)
		return add_del_listener(info->snd_pid, &mask, REGISTER);

	rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], &mask);
	if (rc < 0)
		return rc;
	if (rc == 0)
		return add_del_listener(info->snd_pid, &mask, DEREGISTER);
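	/*
	 * Neither cpumask attribute was supplied: fall through to handle a
	 * one-shot TASKSTATS_CMD_ATTR_PID/TGID query.
	 */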

	/*
	 * Size includes space for nested attributes
	 */
	size = nla_total_size(sizeof(u32)) +
		nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);

	memset(&stats, 0, sizeof(stats));
	rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
	if (rc < 0)
		return rc;

	if (info->attrs[TASKSTATS_CMD_ATTR_PID]) {
		u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]);
		rc = fill_pid(pid, NULL, &stats);
		if (rc < 0)
			goto err;

		na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID);
		NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, pid);
		NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
				stats);
	} else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) {
		u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
		rc = fill_tgid(tgid, NULL, &stats);
		if (rc < 0)
			goto err;

		na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID);
		NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, tgid);
		NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
				stats);
	} else {
		rc = -EINVAL;
		goto err;
	}

	nla_nest_end(rep_skb, na);

	return send_reply(rep_skb, info->snd_pid);

nla_put_failure:
	return genlmsg_cancel(rep_skb, reply);
err:
	nlmsg_free(rep_skb);
	return rc;
}

void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu)
{
	struct listener_list *listeners;
	struct taskstats *tmp;
	/*
	 * This is the cpu on which the task is currently exiting, and it is
	 * the one whose exit event will be sent, even if the cpu this
	 * function runs on changes later.
	 */
	*mycpu = raw_smp_processor_id();

	*ptidstats = NULL;
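	/*
	 * Allocate speculatively; the buffer is handed to the caller only
	 * if this cpu actually has listeners, and freed otherwise.
	 */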
	tmp = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL);
	if (!tmp)
		return;

	listeners = &per_cpu(listener_array, *mycpu);
	down_read(&listeners->sem);
	if (!list_empty(&listeners->list)) {
		*ptidstats = tmp;
		tmp = NULL;
	}
	up_read(&listeners->sem);
	kfree(tmp);
}

/* Send pid data out on exit */
void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
			int group_dead, unsigned int mycpu)
{
	int rc;
	struct sk_buff *rep_skb;
	void *reply;
	size_t size;
	int is_thread_group;
	struct nlattr *na;
	unsigned long flags;

	if (!family_registered || !tidstats)
		return;

	spin_lock_irqsave(&tsk->signal->stats_lock, flags);
	is_thread_group = tsk->signal->stats ? 1 : 0;
	spin_unlock_irqrestore(&tsk->signal->stats_lock, flags);

	rc = 0;
	/*
	 * Size includes space for nested attributes
	 */
	size = nla_total_size(sizeof(u32)) +
		nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);

	if (is_thread_group)
		size = 2 * size;	/* PID + STATS + TGID + STATS */

	rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
	if (rc < 0)
		goto ret;

	rc = fill_pid(tsk->pid, tsk, tidstats);
	if (rc < 0)
		goto err_skb;

	na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID);
	NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, (u32)tsk->pid);
	NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
			*tidstats);
	nla_nest_end(rep_skb, na);

	if (!is_thread_group)
		goto send;

	/*
	 * tsk has/had a thread group, so fill the tsk->signal->stats
	 * structure.  It doesn't matter whether tsk is the leader or the
	 * last group member leaving.
	 */

	fill_tgid_exit(tsk);
	if (!group_dead)
		goto send;

	na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID);
	NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, (u32)tsk->tgid);
	/* No locking needed for tsk->signal->stats since group is dead */
	NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
			*tsk->signal->stats);
	nla_nest_end(rep_skb, na);

send:
	send_cpu_listeners(rep_skb, mycpu);
	return;

nla_put_failure:
	genlmsg_cancel(rep_skb, reply);
	goto ret;
err_skb:
	nlmsg_free(rep_skb);
ret:
	return;
}

static struct genl_ops taskstats_ops = {
	.cmd		= TASKSTATS_CMD_GET,
	.doit		= taskstats_user_cmd,
	.policy		= taskstats_cmd_get_policy,
};
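
/*
 * Illustrative reply layout for a TASKSTATS_CMD_GET pid query, as
 * constructed in taskstats_user_cmd() above:
 *
 *	TASKSTATS_CMD_NEW
 *	  TASKSTATS_TYPE_AGGR_PID (nested)
 *	    TASKSTATS_TYPE_PID    (u32)
 *	    TASKSTATS_TYPE_STATS  (struct taskstats)
 *
 * Registering for a cpu's exit data instead uses TASKSTATS_CMD_GET with
 * TASKSTATS_CMD_ATTR_REGISTER_CPUMASK set to a cpulist string, e.g. "0-3".
 */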

/* Needed early in initialization */
void __init taskstats_init_early(void)
{
	unsigned int i;

	taskstats_cache = kmem_cache_create("taskstats_cache",
						sizeof(struct taskstats),
						0, SLAB_PANIC, NULL, NULL);
	for_each_possible_cpu(i) {
		INIT_LIST_HEAD(&(per_cpu(listener_array, i).list));
		init_rwsem(&(per_cpu(listener_array, i).sem));
	}
}

static int __init taskstats_init(void)
{
	int rc;

	rc = genl_register_family(&family);
	if (rc)
		return rc;

	rc = genl_register_ops(&family, &taskstats_ops);
	if (rc < 0)
		goto err;

	family_registered = 1;
	return 0;
err:
	genl_unregister_family(&family);
	return rc;
}

/*
 * A late initcall ensures that the statistics-collection mechanisms are
 * initialized before the taskstats interface itself is registered.
 */
late_initcall(taskstats_init);