sch_api.c revision 16ebb5e0b36ceadc8186f71d68b0c4fa4b6e781b
/*
 * net/sched/sch_api.c	Packet scheduler API.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *
 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/hrtimer.h>
#include <linux/lockdep.h>

#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>

static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new);
static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			 struct Qdisc *q, unsigned long cl, int event);

/*

   Short review.
   -------------

   This file consists of two interrelated parts:

   1. the queueing discipline manager frontend.
   2. the traffic class manager frontend.

   Generally, a queueing discipline ("qdisc") is a black box
   that can enqueue packets and dequeue them (when the
   device is ready to send something) in an order and at times
   determined by the algorithm hidden inside it.

   Qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all packets into "traffic classes"
     using "packet classifiers" (see cls_api.c).

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them, and so on recursively.

   The goal of the routines in this file is to translate the
   information supplied by the user in the form of handles
   into a form more intelligible to the kernel, to perform some
   sanity checks and the parts of the work common to all qdiscs,
   and to provide rtnetlink notifications.

   All the real intelligent work is done inside the qdisc modules.



   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns an skb to send. It is allowed to return NULL,
   but that does not mean the queue is empty; it only means the
   discipline does not want to send anything right now.
   The queue is really empty if q->q.qlen == 0.
   For complicated disciplines with multiple queues, q->q is not
   the real packet queue, but q->q.qlen must nevertheless be valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code:
   NET_XMIT_DROP 	- this packet was dropped.
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN	 	- this packet was probably enqueued, but another one was dropped.
     Expected action: back off or ignore.
   NET_XMIT_POLICED	- dropped by policing.
     Expected action: back off or report an error to real-time applications.

   Auxiliary routines:

   ---peek

   like dequeue, but without removing the packet from the queue.

   ---reset

   returns the qdisc to its initial state: purges all buffers and clears
   all timers and counters (except statistics).

   ---init

   initializes a newly created qdisc.

   ---destroy

   destroys the resources allocated by init and during the lifetime of the qdisc.

   ---change

   changes qdisc parameters.
 */
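
/*
 * Illustrative sketch (not part of this file): under the contract above,
 * a minimal "queue"-category discipline could implement enqueue/dequeue
 * roughly as below, keeping q->q.qlen valid at all times. The functions
 * prefixed "example_" are hypothetical; the helpers they call are the
 * standard pfifo-style ones.
 *
 *	static int example_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 *	{
 *		if (skb_queue_len(&sch->q) < qdisc_dev(sch)->tx_queue_len)
 *			return qdisc_enqueue_tail(skb, sch);	// 0 on success
 *		return qdisc_reshape_fail(skb, sch);	// drops; NET_XMIT_DROP
 *	}
 *
 *	static struct sk_buff *example_dequeue(struct Qdisc *sch)
 *	{
 *		return qdisc_dequeue_head(sch);	// NULL when nothing to send
 *	}
 */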

/* Protects the list of registered TC modules. It is a pure SMP lock. */
static DEFINE_RWLOCK(qdisc_mod_lock);


/************************************************
 *	Queueing disciplines manipulation.	*
 ************************************************/


/* The list of all installed queueing disciplines. */

static struct Qdisc_ops *qdisc_base;

/* Register/unregister queueing discipline */

int register_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int rc = -EEXIST;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (!strcmp(qops->id, q->id))
			goto out;

	if (qops->enqueue == NULL)
		qops->enqueue = noop_qdisc_ops.enqueue;
	if (qops->peek == NULL) {
		if (qops->dequeue == NULL) {
			qops->peek = noop_qdisc_ops.peek;
		} else {
			rc = -EINVAL;
			goto out;
		}
	}
	if (qops->dequeue == NULL)
		qops->dequeue = noop_qdisc_ops.dequeue;

	qops->next = NULL;
	*qp = qops;
	rc = 0;
out:
	write_unlock(&qdisc_mod_lock);
	return rc;
}
EXPORT_SYMBOL(register_qdisc);
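
/*
 * Typical usage (a sketch; "example_qdisc_ops" and the module function
 * names are hypothetical): a qdisc module registers its ops on load and
 * unregisters them on unload.
 *
 *	static struct Qdisc_ops example_qdisc_ops;	// .id, .enqueue, ... filled in
 *
 *	static int __init example_module_init(void)
 *	{
 *		return register_qdisc(&example_qdisc_ops);
 *	}
 *
 *	static void __exit example_module_exit(void)
 *	{
 *		unregister_qdisc(&example_qdisc_ops);
 *	}
 *	module_init(example_module_init);
 *	module_exit(example_module_exit);
 */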

int unregister_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int err = -ENOENT;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (q == qops)
			break;
	if (q) {
		*qp = q->next;
		q->next = NULL;
		err = 0;
	}
	write_unlock(&qdisc_mod_lock);
	return err;
}
EXPORT_SYMBOL(unregister_qdisc);

/* We know the handle. Find the qdisc among all qdiscs attached to the
   device (the root qdisc, all its children, children of children, etc.)
 */

static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
{
	struct Qdisc *q;

	if (!(root->flags & TCQ_F_BUILTIN) &&
	    root->handle == handle)
		return root;

	list_for_each_entry(q, &root->list, list) {
		if (q->handle == handle)
			return q;
	}
	return NULL;
}

static void qdisc_list_add(struct Qdisc *q)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
		list_add_tail(&q->list, &qdisc_root_sleeping(q)->list);
}

void qdisc_list_del(struct Qdisc *q)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
		list_del(&q->list);
}
EXPORT_SYMBOL(qdisc_list_del);

struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
	unsigned int i;
	struct Qdisc *q;

	for (i = 0; i < dev->num_tx_queues; i++) {
		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
		struct Qdisc *txq_root = txq->qdisc_sleeping;

		q = qdisc_match_from_root(txq_root, handle);
		if (q)
			goto out;
	}

	q = qdisc_match_from_root(dev->rx_queue.qdisc_sleeping, handle);
out:
	return q;
}

static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
	unsigned long cl;
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = p->ops->cl_ops;

	if (cops == NULL)
		return NULL;
	cl = cops->get(p, classid);

	if (cl == 0)
		return NULL;
	leaf = cops->leaf(p, cl);
	cops->put(p, cl);
	return leaf;
}

/* Find queueing discipline by name */

static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
{
	struct Qdisc_ops *q = NULL;

	if (kind) {
		read_lock(&qdisc_mod_lock);
		for (q = qdisc_base; q; q = q->next) {
			if (nla_strcmp(kind, q->id) == 0) {
				if (!try_module_get(q->owner))
					q = NULL;
				break;
			}
		}
		read_unlock(&qdisc_mod_lock);
	}
	return q;
}

static struct qdisc_rate_table *qdisc_rtab_list;

struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
{
	struct qdisc_rate_table *rtab;

	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
		if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
			rtab->refcnt++;
			return rtab;
		}
	}

	if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
	    nla_len(tab) != TC_RTAB_SIZE)
		return NULL;

	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
	if (rtab) {
		rtab->rate = *r;
		rtab->refcnt = 1;
		memcpy(rtab->data, nla_data(tab), TC_RTAB_SIZE);
		rtab->next = qdisc_rtab_list;
		qdisc_rtab_list = rtab;
	}
	return rtab;
}
EXPORT_SYMBOL(qdisc_get_rtab);

void qdisc_put_rtab(struct qdisc_rate_table *tab)
{
	struct qdisc_rate_table *rtab, **rtabp;

	if (!tab || --tab->refcnt)
		return;

	for (rtabp = &qdisc_rtab_list; (rtab = *rtabp) != NULL; rtabp = &rtab->next) {
		if (rtab == tab) {
			*rtabp = rtab->next;
			kfree(rtab);
			return;
		}
	}
}
EXPORT_SYMBOL(qdisc_put_rtab);
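
/*
 * Usage sketch (attribute and option names follow sch_tbf, but treat this
 * as illustrative): a rate-based qdisc obtains its rate table while
 * parsing options and releases it when torn down.
 *
 *	struct qdisc_rate_table *rtab;
 *
 *	rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB]);
 *	if (rtab == NULL)
 *		return -EINVAL;		// bad ratespec or missing/short table
 *	...
 *	qdisc_put_rtab(rtab);		// in ->destroy()
 */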

static LIST_HEAD(qdisc_stab_list);
static DEFINE_SPINLOCK(qdisc_stab_lock);

static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
	[TCA_STAB_DATA] = { .type = NLA_BINARY },
};

static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
{
	struct nlattr *tb[TCA_STAB_MAX + 1];
	struct qdisc_size_table *stab;
	struct tc_sizespec *s;
	unsigned int tsize = 0;
	u16 *tab = NULL;
	int err;

	err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
	if (err < 0)
		return ERR_PTR(err);
	if (!tb[TCA_STAB_BASE])
		return ERR_PTR(-EINVAL);

	s = nla_data(tb[TCA_STAB_BASE]);

	if (s->tsize > 0) {
		if (!tb[TCA_STAB_DATA])
			return ERR_PTR(-EINVAL);
		tab = nla_data(tb[TCA_STAB_DATA]);
		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
	}

	if (!s || tsize != s->tsize || (!tab && tsize > 0))
		return ERR_PTR(-EINVAL);

	spin_lock(&qdisc_stab_lock);

	list_for_each_entry(stab, &qdisc_stab_list, list) {
		if (memcmp(&stab->szopts, s, sizeof(*s)))
			continue;
		if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
			continue;
		stab->refcnt++;
		spin_unlock(&qdisc_stab_lock);
		return stab;
	}

	spin_unlock(&qdisc_stab_lock);

	stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
	if (!stab)
		return ERR_PTR(-ENOMEM);

	stab->refcnt = 1;
	stab->szopts = *s;
	if (tsize > 0)
		memcpy(stab->data, tab, tsize * sizeof(u16));

	spin_lock(&qdisc_stab_lock);
	list_add_tail(&stab->list, &qdisc_stab_list);
	spin_unlock(&qdisc_stab_lock);

	return stab;
}

void qdisc_put_stab(struct qdisc_size_table *tab)
{
	if (!tab)
		return;

	spin_lock(&qdisc_stab_lock);

	if (--tab->refcnt == 0) {
		list_del(&tab->list);
		kfree(tab);
	}

	spin_unlock(&qdisc_stab_lock);
}
EXPORT_SYMBOL(qdisc_put_stab);

static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
{
	struct nlattr *nest;

	nest = nla_nest_start(skb, TCA_STAB);
	if (nest == NULL)
		goto nla_put_failure;
	NLA_PUT(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts);
	nla_nest_end(skb, nest);

	return skb->len;

nla_put_failure:
	return -1;
}

void qdisc_calculate_pkt_len(struct sk_buff *skb, struct qdisc_size_table *stab)
{
	int pkt_len, slot;

	pkt_len = skb->len + stab->szopts.overhead;
	if (unlikely(!stab->szopts.tsize))
		goto out;

	slot = pkt_len + stab->szopts.cell_align;
	if (unlikely(slot < 0))
		slot = 0;

	slot >>= stab->szopts.cell_log;
	if (likely(slot < stab->szopts.tsize))
		pkt_len = stab->data[slot];
	else
		pkt_len = stab->data[stab->szopts.tsize - 1] *
				(slot / stab->szopts.tsize) +
				stab->data[slot % stab->szopts.tsize];

	pkt_len <<= stab->szopts.size_log;
out:
	if (unlikely(pkt_len < 1))
		pkt_len = 1;
	qdisc_skb_cb(skb)->pkt_len = pkt_len;
}
EXPORT_SYMBOL(qdisc_calculate_pkt_len);
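
/*
 * Worked example (numbers invented for illustration): with overhead = 4,
 * cell_align = -1, cell_log = 6, size_log = 6 and tsize = 512, a 60-byte
 * skb gives pkt_len = 60 + 4 = 64, slot = (64 - 1) >> 6 = 0, and the
 * qdisc-visible length becomes stab->data[0] << 6 (64 if data[0] == 1).
 * This is how ATM/ADSL-style cell padding is modelled via "tc ... stab".
 */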

void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc)
{
	if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
		printk(KERN_WARNING
		       "%s: %s qdisc %X: is non-work-conserving?\n",
		       txt, qdisc->ops->id, qdisc->handle >> 16);
		qdisc->flags |= TCQ_F_WARN_NONWC;
	}
}
EXPORT_SYMBOL(qdisc_warn_nonwc);

static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
{
	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
						 timer);

	wd->qdisc->flags &= ~TCQ_F_THROTTLED;
	__netif_schedule(qdisc_root(wd->qdisc));

	return HRTIMER_NORESTART;
}

void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
{
	hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
	wd->timer.function = qdisc_watchdog;
	wd->qdisc = qdisc;
}
EXPORT_SYMBOL(qdisc_watchdog_init);

void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
{
	ktime_t time;

	if (test_bit(__QDISC_STATE_DEACTIVATED,
		     &qdisc_root_sleeping(wd->qdisc)->state))
		return;

	wd->qdisc->flags |= TCQ_F_THROTTLED;
	time = ktime_set(0, 0);
	time = ktime_add_ns(time, PSCHED_TICKS2NS(expires));
	hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
}
EXPORT_SYMBOL(qdisc_watchdog_schedule);

void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
	hrtimer_cancel(&wd->timer);
	wd->qdisc->flags &= ~TCQ_F_THROTTLED;
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
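
/*
 * Usage sketch (private-struct and variable names hypothetical): a shaping
 * qdisc arms the watchdog from ->dequeue() when the head packet is not yet
 * due, and cancels it on reset/destroy.
 *
 *	struct example_sched_data {
 *		struct qdisc_watchdog watchdog;
 *	};
 *
 *	// ->init():    qdisc_watchdog_init(&q->watchdog, sch);
 *	// ->dequeue(): if (next_send_time > psched_get_time()) {
 *	//                      qdisc_watchdog_schedule(&q->watchdog, next_send_time);
 *	//                      return NULL;	// throttled; the timer reschedules us
 *	//              }
 *	// ->destroy(): qdisc_watchdog_cancel(&q->watchdog);
 */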

static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
{
	unsigned int size = n * sizeof(struct hlist_head), i;
	struct hlist_head *h;

	if (size <= PAGE_SIZE)
		h = kmalloc(size, GFP_KERNEL);
	else
		h = (struct hlist_head *)
			__get_free_pages(GFP_KERNEL, get_order(size));

	if (h != NULL) {
		for (i = 0; i < n; i++)
			INIT_HLIST_HEAD(&h[i]);
	}
	return h;
}

static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
{
	unsigned int size = n * sizeof(struct hlist_head);

	if (size <= PAGE_SIZE)
		kfree(h);
	else
		free_pages((unsigned long)h, get_order(size));
}

void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
{
	struct Qdisc_class_common *cl;
	struct hlist_node *n, *next;
	struct hlist_head *nhash, *ohash;
	unsigned int nsize, nmask, osize;
	unsigned int i, h;

	/* Rehash when load factor exceeds 0.75 */
	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
		return;
	nsize = clhash->hashsize * 2;
	nmask = nsize - 1;
	nhash = qdisc_class_hash_alloc(nsize);
	if (nhash == NULL)
		return;

	ohash = clhash->hash;
	osize = clhash->hashsize;

	sch_tree_lock(sch);
	for (i = 0; i < osize; i++) {
		hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) {
			h = qdisc_class_hash(cl->classid, nmask);
			hlist_add_head(&cl->hnode, &nhash[h]);
		}
	}
	clhash->hash     = nhash;
	clhash->hashsize = nsize;
	clhash->hashmask = nmask;
	sch_tree_unlock(sch);

	qdisc_class_hash_free(ohash, osize);
}
EXPORT_SYMBOL(qdisc_class_hash_grow);

int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
{
	unsigned int size = 4;

	clhash->hash = qdisc_class_hash_alloc(size);
	if (clhash->hash == NULL)
		return -ENOMEM;
	clhash->hashsize  = size;
	clhash->hashmask  = size - 1;
	clhash->hashelems = 0;
	return 0;
}
EXPORT_SYMBOL(qdisc_class_hash_init);

void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
	qdisc_class_hash_free(clhash->hash, clhash->hashsize);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);

void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	unsigned int h;

	INIT_HLIST_NODE(&cl->hnode);
	h = qdisc_class_hash(cl->classid, clhash->hashmask);
	hlist_add_head(&cl->hnode, &clhash->hash[h]);
	clhash->hashelems++;
}
EXPORT_SYMBOL(qdisc_class_hash_insert);

void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	hlist_del(&cl->hnode);
	clhash->hashelems--;
}
EXPORT_SYMBOL(qdisc_class_hash_remove);
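
/*
 * Usage sketch ("q" and "cl" are a hypothetical classful qdisc's private
 * data and class): each class embeds a struct Qdisc_class_common keyed by
 * classid, and the helpers above are called from the class management
 * paths.
 *
 *	// ->init():              qdisc_class_hash_init(&q->clhash);
 *	// class created:         qdisc_class_hash_insert(&q->clhash, &cl->common);
 *	//                        qdisc_class_hash_grow(sch, &q->clhash);
 *	// class deleted:         qdisc_class_hash_remove(&q->clhash, &cl->common);
 *	// lookup by classid:     qdisc_class_find(&q->clhash, classid);
 *	// ->destroy():           qdisc_class_hash_destroy(&q->clhash);
 */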

/* Allocate a unique handle from the space managed by the kernel */

static u32 qdisc_alloc_handle(struct net_device *dev)
{
	int i = 0x10000;
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
	} while (qdisc_lookup(dev, autohandle) && --i > 0);

	return i > 0 ? autohandle : 0;
}
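
/*
 * For reference: a handle is a 32-bit major:minor pair built with
 * TC_H_MAKE(); a qdisc proper always has minor 0. So the first handle
 * allocated above is 0x80010000, printed by tc as "8001:".
 */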

/* Attach top-level qdisc to a device queue. */

static struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
				     struct Qdisc *qdisc)
{
	struct Qdisc *oqdisc = dev_queue->qdisc_sleeping;
	spinlock_t *root_lock;

	root_lock = qdisc_lock(oqdisc);
	spin_lock_bh(root_lock);

	/* Prune the old scheduler */
	if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
		qdisc_reset(oqdisc);

	/* ... and graft the new one */
	if (qdisc == NULL)
		qdisc = &noop_qdisc;
	dev_queue->qdisc_sleeping = qdisc;
	rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc);

	spin_unlock_bh(root_lock);

	return oqdisc;
}

void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
{
	const struct Qdisc_class_ops *cops;
	unsigned long cl;
	u32 parentid;

	if (n == 0)
		return;
	while ((parentid = sch->parent)) {
		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
			return;

		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
		if (sch == NULL) {
			WARN_ON(parentid != TC_H_ROOT);
			return;
		}
		cops = sch->ops->cl_ops;
		if (cops->qlen_notify) {
			cl = cops->get(sch, parentid);
			cops->qlen_notify(sch, cl);
			cops->put(sch, cl);
		}
		sch->q.qlen -= n;
	}
}
EXPORT_SYMBOL(qdisc_tree_decrease_qlen);

static void notify_and_destroy(struct sk_buff *skb, struct nlmsghdr *n, u32 clid,
			       struct Qdisc *old, struct Qdisc *new)
{
	if (new || old)
		qdisc_notify(skb, n, clid, old, new);

	if (old)
		qdisc_destroy(old);
}

/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 * to device "dev".
 *
 * When appropriate, send a netlink notification using "skb"
 * and "n".
 *
 * On success, destroy the old qdisc.
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old)
{
	struct Qdisc *q = old;
	int err = 0;

	if (parent == NULL) {
		unsigned int i, num_q, ingress;

		ingress = 0;
		num_q = dev->num_tx_queues;
		if ((q && q->flags & TCQ_F_INGRESS) ||
		    (new && new->flags & TCQ_F_INGRESS)) {
			num_q = 1;
			ingress = 1;
		}

		if (dev->flags & IFF_UP)
			dev_deactivate(dev);

		for (i = 0; i < num_q; i++) {
			struct netdev_queue *dev_queue = &dev->rx_queue;

			if (!ingress)
				dev_queue = netdev_get_tx_queue(dev, i);

			old = dev_graft_qdisc(dev_queue, new);
			if (new && i > 0)
				atomic_inc(&new->refcnt);

			notify_and_destroy(skb, n, classid, old, new);
		}

		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;

		err = -EINVAL;

		if (cops) {
			unsigned long cl = cops->get(parent, classid);
			if (cl) {
				err = cops->graft(parent, cl, new, &old);
				cops->put(parent, cl);
			}
		}
		if (!err)
			notify_and_destroy(skb, n, classid, old, new);
	}
	return err;
}

/* lockdep annotation is needed for ingress; egress gets it only for name */
static struct lock_class_key qdisc_tx_lock;
static struct lock_class_key qdisc_rx_lock;

/*
   Allocate and initialize a new qdisc.

   Parameters are passed via opt.
 */

static struct Qdisc *
qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
	     u32 parent, u32 handle, struct nlattr **tca, int *errp)
{
	int err;
	struct nlattr *kind = tca[TCA_KIND];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;
	struct qdisc_size_table *stab;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_MODULES
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load.  So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request.  We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the meantime.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* The replay will call qdisc_lookup_ops
				 * again, so don't keep a reference here.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (ops == NULL)
		goto err_out;

	sch = qdisc_alloc(dev_queue, ops);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	sch->parent = parent;

	if (handle == TC_H_INGRESS) {
		sch->flags |= TCQ_F_INGRESS;
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
		lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
	} else {
		if (handle == 0) {
			handle = qdisc_alloc_handle(dev);
			err = -ENOMEM;
			if (handle == 0)
				goto err_out3;
		}
		lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
	}

	sch->handle = handle;

	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
		if (tca[TCA_STAB]) {
			stab = qdisc_get_stab(tca[TCA_STAB]);
			if (IS_ERR(stab)) {
				err = PTR_ERR(stab);
				goto err_out3;
			}
			sch->stab = stab;
		}
		if (tca[TCA_RATE]) {
			spinlock_t *root_lock;

			if ((sch->parent != TC_H_ROOT) &&
			    !(sch->flags & TCQ_F_INGRESS))
				root_lock = qdisc_root_sleeping_lock(sch);
			else
				root_lock = qdisc_lock(sch);

			err = gen_new_estimator(&sch->bstats, &sch->rate_est,
						root_lock, tca[TCA_RATE]);
			if (err) {
				/*
				 * Any broken qdiscs that would require
				 * an ops->reset() here? The qdisc was never
				 * in action, so it shouldn't be necessary.
				 */
				if (ops->destroy)
					ops->destroy(sch);
				goto err_out3;
			}
		}

		qdisc_list_add(sch);

		return sch;
	}
err_out3:
	qdisc_put_stab(sch->stab);
	dev_put(dev);
	kfree((char *) sch - sch->padded);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;
}

static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
{
	struct qdisc_size_table *stab = NULL;
	int err = 0;

	if (tca[TCA_OPTIONS]) {
		if (sch->ops->change == NULL)
			return -EINVAL;
		err = sch->ops->change(sch, tca[TCA_OPTIONS]);
		if (err)
			return err;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB]);
		if (IS_ERR(stab))
			return PTR_ERR(stab);
	}

	qdisc_put_stab(sch->stab);
	sch->stab = stab;

	if (tca[TCA_RATE])
		/* NB: ignores errors from replace_estimator
		   because the change cannot be undone. */
		gen_replace_estimator(&sch->bstats, &sch->rate_est,
					    qdisc_root_sleeping_lock(sch),
					    tca[TCA_RATE]);

	return 0;
}

struct check_loop_arg
{
	struct qdisc_walker	w;
	struct Qdisc		*p;
	int			depth;
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);

static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
	struct check_loop_arg	arg;

	if (q->ops->cl_ops == NULL)
		return 0;

	arg.w.stop = arg.w.skip = arg.w.count = 0;
	arg.w.fn = check_loop_fn;
	arg.depth = depth;
	arg.p = p;
	q->ops->cl_ops->walk(q, &arg.w);
	return arg.w.stop ? -ELOOP : 0;
}

static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct check_loop_arg *arg = (struct check_loop_arg *)w;

	leaf = cops->leaf(q, cl);
	if (leaf) {
		if (leaf == arg->p || arg->depth > 7)
			return -ELOOP;
		return check_loop(leaf, arg->p, arg->depth + 1);
	}
	return 0;
}
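
/*
 * Example of what this check rejects (tc sketch; handles invented): with
 * qdisc 1: at the root and a child qdisc 2: grafted under class 1:1,
 *
 *	tc qdisc replace dev eth0 parent 2:1 handle 1: ...
 *
 * would make 1: a descendant of itself. check_loop() walks the classes of
 * 1:, finds 2: (the requested parent) among its leaves, and the request
 * fails with -ELOOP. Recursion depth is bounded at 7.
 */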

/*
 * Delete/get qdisc.
 */

static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid = tcm->tcm_parent;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	if (net != &init_net)
		return -EINVAL;

	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->rx_queue.qdisc_sleeping;
			}
		} else {
			struct netdev_queue *dev_queue;
			dev_queue = netdev_get_tx_queue(dev, 0);
			q = dev_queue->qdisc_sleeping;
		}
		if (!q)
			return -ENOENT;

		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
			return -EINVAL;
	} else {
		if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
			return -ENOENT;
	}

	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid)
			return -EINVAL;
		if (q->handle == 0)
			return -ENOENT;
		if ((err = qdisc_graft(dev, p, skb, n, clid, NULL, q)) != 0)
			return err;
	} else {
		qdisc_notify(skb, n, clid, NULL, q);
	}
	return 0;
}

/*
   Create/change qdisc.
 */

static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm;
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

	if (net != &init_net)
		return -EINVAL;

replay:
	/* Reinit, just in case something touches this. */
	tcm = NLMSG_DATA(n);
	clid = tcm->tcm_parent;
	q = p = NULL;

	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->rx_queue.qdisc_sleeping;
			}
		} else {
			struct netdev_queue *dev_queue;
			dev_queue = netdev_get_tx_queue(dev, 0);
			q = dev_queue->qdisc_sleeping;
		}

		/* It may be the default qdisc; ignore it. */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
					return -EEXIST;
				if (TC_H_MIN(tcm->tcm_handle))
					return -EINVAL;
				if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
					goto create_n_graft;
				if (n->nlmsg_flags&NLM_F_EXCL)
					return -EEXIST;
				if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
					return -EINVAL;
				if (q == p ||
				    (p && check_loop(q, p, 0)))
					return -ELOOP;
				atomic_inc(&q->refcnt);
				goto graft;
			} else {
				if (q == NULL)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 *   We know that some child q is already
				 *   attached to this parent and we have a
				 *   choice: either to change it or to
				 *   create/graft a new one.
				 *
				 *   1. We are allowed to create/graft only
				 *   if both the CREATE and REPLACE flags are set.
				 *
				 *   2. If EXCL is set, the requester meant
				 *   that a qdisc with handle tcm_handle is not
				 *   expected to exist, so we choose create/graft too.
				 *
				 *   3. The last case is when no flags are set.
				 *   Alas, this is sort of a hole in the API; we
				 *   cannot decide what to do unambiguously.
				 *   For now we select create/graft if the
				 *   user gave a KIND that does not match the
				 *   existing one.
				 */
				if ((n->nlmsg_flags&NLM_F_CREATE) &&
				    (n->nlmsg_flags&NLM_F_REPLACE) &&
				    ((n->nlmsg_flags&NLM_F_EXCL) ||
				     (tca[TCA_KIND] &&
				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		if (!tcm->tcm_handle)
			return -EINVAL;
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (q == NULL)
		return -ENOENT;
	if (n->nlmsg_flags&NLM_F_EXCL)
		return -EEXIST;
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;
	err = qdisc_change(q, tca);
	if (err == 0)
		qdisc_notify(skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags&NLM_F_CREATE))
		return -ENOENT;
	if (clid == TC_H_INGRESS)
		q = qdisc_create(dev, &dev->rx_queue,
				 tcm->tcm_parent, tcm->tcm_parent,
				 tca, &err);
	else
		q = qdisc_create(dev, netdev_get_tx_queue(dev, 0),
				 tcm->tcm_parent, tcm->tcm_handle,
				 tca, &err);
	if (q == NULL) {
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
	if (err) {
		if (q)
			qdisc_destroy(q);
		return err;
	}

	return 0;
}

static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = atomic_read(&q->refcnt);
	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto nla_put_failure;
	q->qstats.qlen = q->q.qlen;

	if (q->stab && qdisc_dump_stab(skb, q->stab) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 qdisc_root_sleeping_lock(q), &d) < 0)
		goto nla_put_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
	    gnet_stats_copy_queue(&d, &q->qstats) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

nlmsg_failure:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			u32 clid, struct Qdisc *old, struct Qdisc *new)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && old->handle) {
		if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
			goto err_out;
	}
	if (new) {
		if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
			goto err_out;
	}

	if (skb->len)
		return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}

static bool tc_qdisc_dump_ignore(struct Qdisc *q)
{
	return (q->flags & TCQ_F_BUILTIN) ? true : false;
}

static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
			      struct netlink_callback *cb,
			      int *q_idx_p, int s_q_idx)
{
	int ret = 0, q_idx = *q_idx_p;
	struct Qdisc *q;

	if (!root)
		return 0;

	q = root;
	if (q_idx < s_q_idx) {
		q_idx++;
	} else {
		if (!tc_qdisc_dump_ignore(q) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}
	list_for_each_entry(q, &root->list, list) {
		if (q_idx < s_q_idx) {
			q_idx++;
			continue;
		}
		if (!tc_qdisc_dump_ignore(q) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

out:
	*q_idx_p = q_idx;
	return ret;
done:
	ret = -1;
	goto out;
}

static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;

	if (net != &init_net)
		return 0;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];
	read_lock(&dev_base_lock);
	idx = 0;
	for_each_netdev(&init_net, dev) {
		struct netdev_queue *dev_queue;

		if (idx < s_idx)
			goto cont;
		if (idx > s_idx)
			s_q_idx = 0;
		q_idx = 0;

		dev_queue = netdev_get_tx_queue(dev, 0);
		if (tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, &q_idx, s_q_idx) < 0)
			goto done;

		dev_queue = &dev->rx_queue;
		if (tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, &q_idx, s_q_idx) < 0)
			goto done;

cont:
		idx++;
	}

done:
	read_unlock(&dev_base_lock);

	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}



/************************************************
 *	Traffic classes manipulation.		*
 ************************************************/



static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 pid = tcm->tcm_parent;
	u32 clid = tcm->tcm_handle;
	u32 qid = TC_H_MAJ(clid);
	int err;

	if (net != &init_net)
		return -EINVAL;

	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is the root class.
	   parent == X:Y	 - parent is a node in the hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is the qdisc.

	   handle == 0:0	 - generate a handle from the kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is the qdisc.
	   handle == X:Y	 - explicit handle.
	   handle == X:0	 - root class.
	 */
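
	/*
	 * Concrete example (sketch): "tc class add dev eth0 parent 1:1
	 * classid 1:10 ..." arrives with parent == 1:1 and handle == 1:10,
	 * so qid resolves to 1:0; "parent 1:" with handle "0:10" resolves
	 * the same way via the 0:Y rule above.
	 */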

	/* Step 1. Determine qdisc handle X:0 */

	dev_queue = netdev_get_tx_queue(dev, 0);
	if (pid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(pid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev_queue->qdisc_sleeping->handle;

		/* Now qid is a genuine qdisc handle, consistent with
		   both parent and child.

		   TC_H_MAJ(pid) may still be unspecified; complete it now.
		 */
		if (pid)
			pid = TC_H_MAKE(qid, pid);
	} else {
		if (qid == 0)
			qid = dev_queue->qdisc_sleeping->handle;
	}

	/* OK. Locate the qdisc */
	if ((q = qdisc_lookup(dev, qid)) == NULL)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get the class */
	if (clid == 0) {
		if (pid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->get(q, clid);

	if (cl == 0) {
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags&NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = cops->delete(q, cl);
			if (err == 0)
				tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	new_cl = cl;
	err = cops->change(q, clid, pid, tca, &new_cl);
	if (err == 0)
		tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);

out:
	if (cl)
		cops->put(q, cl);

	return err;
}


static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 qdisc_root_sleeping_lock(q), &d) < 0)
		goto nla_put_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

nlmsg_failure:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			  struct Qdisc *q, unsigned long cl, int event)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
}

struct qdisc_dump_args
{
	struct qdisc_walker w;
	struct sk_buff *skb;
	struct netlink_callback *cb;
};

static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
{
	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;

	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
}

static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
				struct tcmsg *tcm, struct netlink_callback *cb,
				int *t_p, int s_t)
{
	struct qdisc_dump_args arg;

	if (tc_qdisc_dump_ignore(q) ||
	    *t_p < s_t || !q->ops->cl_ops ||
	    (tcm->tcm_parent &&
	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
		(*t_p)++;
		return 0;
	}
	if (*t_p > s_t)
		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
	arg.w.fn = qdisc_class_dump;
	arg.skb = skb;
	arg.cb = cb;
	arg.w.stop  = 0;
	arg.w.skip = cb->args[1];
	arg.w.count = 0;
	q->ops->cl_ops->walk(q, &arg.w);
	cb->args[1] = arg.w.count;
	if (arg.w.stop)
		return -1;
	(*t_p)++;
	return 0;
}

static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
			       struct tcmsg *tcm, struct netlink_callback *cb,
			       int *t_p, int s_t)
{
	struct Qdisc *q;

	if (!root)
		return 0;

	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
		return -1;

	list_for_each_entry(q, &root->list, list) {
		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
	}

	return 0;
}

static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct tcmsg *tcm = (struct tcmsg *)NLMSG_DATA(cb->nlh);
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct net_device *dev;
	int t, s_t;

	if (net != &init_net)
		return 0;

	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
		return 0;
	if ((dev = dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return 0;

	s_t = cb->args[0];
	t = 0;

	dev_queue = netdev_get_tx_queue(dev, 0);
	if (tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, &t, s_t) < 0)
		goto done;

	dev_queue = &dev->rx_queue;
	if (tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, &t, s_t) < 0)
		goto done;

done:
	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}

/* Main classifier routine: scans the classifier chain attached to this
   qdisc, (optionally) tests for protocol match, and asks the specific
   classifiers.
 */
int tc_classify_compat(struct sk_buff *skb, struct tcf_proto *tp,
		       struct tcf_result *res)
{
	__be16 protocol = skb->protocol;
	int err = 0;

	for (; tp; tp = tp->next) {
		if ((tp->protocol == protocol ||
		     tp->protocol == htons(ETH_P_ALL)) &&
		    (err = tp->classify(skb, tp, res)) >= 0) {
#ifdef CONFIG_NET_CLS_ACT
			if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
				skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
#endif
			return err;
		}
	}
	return -1;
}
EXPORT_SYMBOL(tc_classify_compat);
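
/*
 * Usage sketch ("q->filter_list" stands for the qdisc's private filter
 * chain; names hypothetical): a classful qdisc typically classifies in
 * ->enqueue() roughly like this:
 *
 *	struct tcf_result res;
 *	int result = tc_classify(skb, q->filter_list, &res);
 *
 *	// result >= 0: res.classid / res.class selects the target class
 *	// result < 0 : no filter matched; fall back to a default class
 */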

int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
		struct tcf_result *res)
{
	int err = 0;
	__be16 protocol;
#ifdef CONFIG_NET_CLS_ACT
	struct tcf_proto *otp = tp;
reclassify:
#endif
	protocol = skb->protocol;

	err = tc_classify_compat(skb, tp, res);
#ifdef CONFIG_NET_CLS_ACT
	if (err == TC_ACT_RECLASSIFY) {
		u32 verd = G_TC_VERD(skb->tc_verd);
		tp = otp;

		if (verd++ >= MAX_REC_LOOP) {
			printk("rule prio %u protocol %02x reclassify loop, "
			       "packet dropped\n",
			       tp->prio&0xffff, ntohs(tp->protocol));
			return TC_ACT_SHOT;
		}
		skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
		goto reclassify;
	}
#endif
	return err;
}
EXPORT_SYMBOL(tc_classify);

void tcf_destroy(struct tcf_proto *tp)
{
	tp->ops->destroy(tp);
	module_put(tp->ops->owner);
	kfree(tp);
}

void tcf_destroy_chain(struct tcf_proto **fl)
{
	struct tcf_proto *tp;

	while ((tp = *fl) != NULL) {
		*fl = tp->next;
		tcf_destroy(tp);
	}
}
EXPORT_SYMBOL(tcf_destroy_chain);

#ifdef CONFIG_PROC_FS
static int psched_show(struct seq_file *seq, void *v)
{
	struct timespec ts;

	hrtimer_get_res(CLOCK_MONOTONIC, &ts);
	seq_printf(seq, "%08x %08x %08x %08x\n",
		   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
		   1000000,
		   (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));

	return 0;
}

static int psched_open(struct inode *inode, struct file *file)
{
	return single_open(file, psched_show, PDE(inode)->data);
}

static const struct file_operations psched_fops = {
	.owner = THIS_MODULE,
	.open = psched_open,
	.read  = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};
#endif

static int __init pktsched_init(void)
{
	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	proc_net_fops_create(&init_net, "psched", 0, &psched_fops);

	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc);
	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass);

	return 0;
}

subsys_initcall(pktsched_init);