/*
 * net/sched/sch_api.c	Packet scheduler API.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *
 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/hrtimer.h>
#include <linux/lockdep.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>

static int qdisc_notify(struct net *net, struct sk_buff *oskb,
			struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new);
static int tclass_notify(struct net *net, struct sk_buff *oskb,
			 struct nlmsghdr *n, struct Qdisc *q,
			 unsigned long cl, int event);

/*

   Short review.
   -------------

   This file consists of two interrelated parts:

   1. queueing disciplines manager frontend.
   2. traffic classes manager frontend.

   Generally, a queueing discipline ("qdisc") is a black box,
   which is able to enqueue packets and to dequeue them (when
   the device is ready to send something) in an order and at times
   determined by the algorithm hidden inside it.

   qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from the outside.
   - "schedulers", which split all the packets into "traffic classes",
     using "packet classifiers" (see cls_api.c).

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them, etc., etc., etc.

   The goal of the routines in this file is to translate the
   information supplied by the user in the form of handles into a
   form more intelligible to the kernel, to perform some sanity
   checks and the part of the work that is common to all qdiscs,
   and to provide rtnetlink notifications.

   All the real intelligent work is done inside the qdisc modules.



   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns an skb to send. It is allowed to return NULL,
   but that does not mean the queue is empty; it just means that the
   discipline does not want to send anything this time.
   The queue is really empty only if q->q.qlen == 0.
   For complicated disciplines with multiple queues q->q is not the
   real packet queue, but q->q.qlen must nevertheless be valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If the packet (this one or another one) was dropped, it returns
   a non-zero error code.
   NET_XMIT_DROP	- this packet was dropped.
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN		- probably this packet was enqueued, but another one was dropped.
     Expected action: back off or ignore.
   NET_XMIT_POLICED	- dropped by the policer.
     Expected action: back off or report an error to real-time apps.

   Auxiliary routines:

   ---peek

   like dequeue, but without removing a packet from the queue.

   ---reset

   returns the qdisc to its initial state: purge all buffers, clear all
   timers, counters (except statistics), etc.

   ---init

   initializes a newly created qdisc.

   ---destroy

   destroys resources allocated by init and during the lifetime of the qdisc.

   ---change

   changes qdisc parameters.
 */
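
/* Illustrative sketch (not part of this file): a minimal, hypothetical
 * tail-drop queue showing how the contract above maps onto Qdisc_ops.
 * All "example_*" names are invented for illustration; the helpers used
 * (__qdisc_enqueue_tail, __qdisc_dequeue_head, qdisc_drop, qdisc_peek_head)
 * are the generic ones from sch_generic.h.
 *
 *	static int example_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 *	{
 *		if (likely(skb_queue_len(&sch->q) < qdisc_dev(sch)->tx_queue_len))
 *			return __qdisc_enqueue_tail(skb, sch, &sch->q);
 *		return qdisc_drop(skb, sch);
 *	}
 *
 *	static struct sk_buff *example_dequeue(struct Qdisc *sch)
 *	{
 *		return __qdisc_dequeue_head(sch, &sch->q);
 *	}
 *
 *	static struct Qdisc_ops example_qdisc_ops __read_mostly = {
 *		.id		= "example",
 *		.enqueue	= example_enqueue,
 *		.dequeue	= example_dequeue,
 *		.peek		= qdisc_peek_head,
 *		.owner		= THIS_MODULE,
 *	};
 */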

/* Protects the list of registered TC modules. It is a pure SMP lock. */
static DEFINE_RWLOCK(qdisc_mod_lock);


/************************************************
 *	Queueing disciplines manipulation.	*
 ************************************************/


/* The list of all installed queueing disciplines. */

static struct Qdisc_ops *qdisc_base;

/* Register/unregister queueing discipline */

int register_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int rc = -EEXIST;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (!strcmp(qops->id, q->id))
			goto out;

	if (qops->enqueue == NULL)
		qops->enqueue = noop_qdisc_ops.enqueue;
	if (qops->peek == NULL) {
		if (qops->dequeue == NULL)
			qops->peek = noop_qdisc_ops.peek;
		else
			goto out_einval;
	}
	if (qops->dequeue == NULL)
		qops->dequeue = noop_qdisc_ops.dequeue;

	if (qops->cl_ops) {
		const struct Qdisc_class_ops *cops = qops->cl_ops;

		if (!(cops->get && cops->put && cops->walk && cops->leaf))
			goto out_einval;

		if (cops->tcf_chain && !(cops->bind_tcf && cops->unbind_tcf))
			goto out_einval;
	}

	qops->next = NULL;
	*qp = qops;
	rc = 0;
out:
	write_unlock(&qdisc_mod_lock);
	return rc;

out_einval:
	rc = -EINVAL;
	goto out;
}
EXPORT_SYMBOL(register_qdisc);

int unregister_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int err = -ENOENT;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (q == qops)
			break;
	if (q) {
		*qp = q->next;
		q->next = NULL;
		err = 0;
	}
	write_unlock(&qdisc_mod_lock);
	return err;
}
EXPORT_SYMBOL(unregister_qdisc);
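
/* Illustrative sketch (not part of this file): the usual pairing of the
 * two calls above in a hypothetical out-of-tree scheduler module;
 * "example_qdisc_ops" is an assumed Qdisc_ops table like the one sketched
 * after the overview comment.
 *
 *	static int __init example_module_init(void)
 *	{
 *		return register_qdisc(&example_qdisc_ops);
 *	}
 *
 *	static void __exit example_module_exit(void)
 *	{
 *		unregister_qdisc(&example_qdisc_ops);
 *	}
 *
 *	module_init(example_module_init);
 *	module_exit(example_module_exit);
 */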

/* We know the handle. Find the qdisc among all qdiscs attached to the
   device (root qdisc, all its children, children of children, etc.)
 */

static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
{
	struct Qdisc *q;

	if (!(root->flags & TCQ_F_BUILTIN) &&
	    root->handle == handle)
		return root;

	list_for_each_entry(q, &root->list, list) {
		if (q->handle == handle)
			return q;
	}
	return NULL;
}

static void qdisc_list_add(struct Qdisc *q)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
		list_add_tail(&q->list, &qdisc_dev(q)->qdisc->list);
}

void qdisc_list_del(struct Qdisc *q)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
		list_del(&q->list);
}
EXPORT_SYMBOL(qdisc_list_del);

struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
	struct Qdisc *q;

	q = qdisc_match_from_root(dev->qdisc, handle);
	if (q)
		goto out;

	if (dev_ingress_queue(dev))
		q = qdisc_match_from_root(
			dev_ingress_queue(dev)->qdisc_sleeping,
			handle);
out:
	return q;
}

static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
	unsigned long cl;
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = p->ops->cl_ops;

	if (cops == NULL)
		return NULL;
	cl = cops->get(p, classid);

	if (cl == 0)
		return NULL;
	leaf = cops->leaf(p, cl);
	cops->put(p, cl);
	return leaf;
}

/* Find queueing discipline by name */

static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
{
	struct Qdisc_ops *q = NULL;

	if (kind) {
		read_lock(&qdisc_mod_lock);
		for (q = qdisc_base; q; q = q->next) {
			if (nla_strcmp(kind, q->id) == 0) {
				if (!try_module_get(q->owner))
					q = NULL;
				break;
			}
		}
		read_unlock(&qdisc_mod_lock);
	}
	return q;
}

static struct qdisc_rate_table *qdisc_rtab_list;

struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
{
	struct qdisc_rate_table *rtab;

	if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
	    nla_len(tab) != TC_RTAB_SIZE)
		return NULL;

	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
		if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
		    !memcmp(&rtab->data, nla_data(tab), 1024)) {
			rtab->refcnt++;
			return rtab;
		}
	}

	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
	if (rtab) {
		rtab->rate = *r;
		rtab->refcnt = 1;
		memcpy(rtab->data, nla_data(tab), 1024);
		rtab->next = qdisc_rtab_list;
		qdisc_rtab_list = rtab;
	}
	return rtab;
}
EXPORT_SYMBOL(qdisc_get_rtab);

void qdisc_put_rtab(struct qdisc_rate_table *tab)
{
	struct qdisc_rate_table *rtab, **rtabp;

	if (!tab || --tab->refcnt)
		return;

	for (rtabp = &qdisc_rtab_list;
	     (rtab = *rtabp) != NULL;
	     rtabp = &rtab->next) {
		if (rtab == tab) {
			*rtabp = rtab->next;
			kfree(rtab);
			return;
		}
	}
}
EXPORT_SYMBOL(qdisc_put_rtab);
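
/* Illustrative sketch (not part of this file): how a rate-based qdisc
 * typically pairs the two helpers above; TCA_FOO_RATE and TCA_FOO_RTAB
 * are hypothetical attribute names standing in for the qdisc's own ones.
 *
 *	rtab = qdisc_get_rtab(nla_data(tb[TCA_FOO_RATE]), tb[TCA_FOO_RTAB]);
 *	if (rtab == NULL)
 *		return -EINVAL;
 *	(use rtab->data[] for transmission-time lookups while attached)
 *	qdisc_put_rtab(rtab);	(on destroy or change)
 */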

static LIST_HEAD(qdisc_stab_list);
static DEFINE_SPINLOCK(qdisc_stab_lock);

static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
	[TCA_STAB_DATA] = { .type = NLA_BINARY },
};

static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
{
	struct nlattr *tb[TCA_STAB_MAX + 1];
	struct qdisc_size_table *stab;
	struct tc_sizespec *s;
	unsigned int tsize = 0;
	u16 *tab = NULL;
	int err;

	err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
	if (err < 0)
		return ERR_PTR(err);
	if (!tb[TCA_STAB_BASE])
		return ERR_PTR(-EINVAL);

	s = nla_data(tb[TCA_STAB_BASE]);

	if (s->tsize > 0) {
		if (!tb[TCA_STAB_DATA])
			return ERR_PTR(-EINVAL);
		tab = nla_data(tb[TCA_STAB_DATA]);
		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
	}

	if (tsize != s->tsize || (!tab && tsize > 0))
		return ERR_PTR(-EINVAL);

	spin_lock(&qdisc_stab_lock);

	list_for_each_entry(stab, &qdisc_stab_list, list) {
		if (memcmp(&stab->szopts, s, sizeof(*s)))
			continue;
		if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
			continue;
		stab->refcnt++;
		spin_unlock(&qdisc_stab_lock);
		return stab;
	}

	spin_unlock(&qdisc_stab_lock);

	stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
	if (!stab)
		return ERR_PTR(-ENOMEM);

	stab->refcnt = 1;
	stab->szopts = *s;
	if (tsize > 0)
		memcpy(stab->data, tab, tsize * sizeof(u16));

	spin_lock(&qdisc_stab_lock);
	list_add_tail(&stab->list, &qdisc_stab_list);
	spin_unlock(&qdisc_stab_lock);

	return stab;
}

static void stab_kfree_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct qdisc_size_table, rcu));
}

void qdisc_put_stab(struct qdisc_size_table *tab)
{
	if (!tab)
		return;

	spin_lock(&qdisc_stab_lock);

	if (--tab->refcnt == 0) {
		list_del(&tab->list);
		call_rcu_bh(&tab->rcu, stab_kfree_rcu);
	}

	spin_unlock(&qdisc_stab_lock);
}
EXPORT_SYMBOL(qdisc_put_stab);

static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
{
	struct nlattr *nest;

	nest = nla_nest_start(skb, TCA_STAB);
	if (nest == NULL)
		goto nla_put_failure;
	if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
		goto nla_put_failure;
	nla_nest_end(skb, nest);

	return skb->len;

nla_put_failure:
	return -1;
}

void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab)
{
	int pkt_len, slot;

	pkt_len = skb->len + stab->szopts.overhead;
	if (unlikely(!stab->szopts.tsize))
		goto out;

	slot = pkt_len + stab->szopts.cell_align;
	if (unlikely(slot < 0))
		slot = 0;

	slot >>= stab->szopts.cell_log;
	if (likely(slot < stab->szopts.tsize))
		pkt_len = stab->data[slot];
	else
		pkt_len = stab->data[stab->szopts.tsize - 1] *
				(slot / stab->szopts.tsize) +
				stab->data[slot % stab->szopts.tsize];

	pkt_len <<= stab->szopts.size_log;
out:
	if (unlikely(pkt_len < 1))
		pkt_len = 1;
	qdisc_skb_cb(skb)->pkt_len = pkt_len;
}
EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
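
/* Worked example (assumed numbers): with overhead = 0, cell_align = -1,
 * cell_log = 6 and a populated table, a 129-byte skb gives
 * slot = (129 - 1) >> 6 = 2, so the packet is billed as
 * stab->data[2] << size_log, i.e. the stored size for its 64-byte cell
 * bucket rather than its raw length.
 */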

void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc)
{
	if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
		pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
			txt, qdisc->ops->id, qdisc->handle >> 16);
		qdisc->flags |= TCQ_F_WARN_NONWC;
	}
}
EXPORT_SYMBOL(qdisc_warn_nonwc);

static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
{
	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
						 timer);

	qdisc_unthrottled(wd->qdisc);
	__netif_schedule(qdisc_root(wd->qdisc));

	return HRTIMER_NORESTART;
}

void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
{
	hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
	wd->timer.function = qdisc_watchdog;
	wd->qdisc = qdisc;
}
EXPORT_SYMBOL(qdisc_watchdog_init);

void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
{
	if (test_bit(__QDISC_STATE_DEACTIVATED,
		     &qdisc_root_sleeping(wd->qdisc)->state))
		return;

	qdisc_throttled(wd->qdisc);

	hrtimer_start(&wd->timer,
		      ns_to_ktime(expires),
		      HRTIMER_MODE_ABS);
}
EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);

void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
	hrtimer_cancel(&wd->timer);
	qdisc_unthrottled(wd->qdisc);
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
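
/* Illustrative sketch (not part of this file): the typical watchdog life
 * cycle in a hypothetical shaping qdisc ("example_*" names are assumed).
 *
 *	example_init:	 qdisc_watchdog_init(&q->watchdog, sch);
 *	example_dequeue: if the head packet is not yet eligible to be sent,
 *			 call qdisc_watchdog_schedule_ns(&q->watchdog, t_next)
 *			 and return NULL;
 *	example_reset:	 qdisc_watchdog_cancel(&q->watchdog);
 */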

static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
{
	unsigned int size = n * sizeof(struct hlist_head), i;
	struct hlist_head *h;

	if (size <= PAGE_SIZE)
		h = kmalloc(size, GFP_KERNEL);
	else
		h = (struct hlist_head *)
			__get_free_pages(GFP_KERNEL, get_order(size));

	if (h != NULL) {
		for (i = 0; i < n; i++)
			INIT_HLIST_HEAD(&h[i]);
	}
	return h;
}

static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
{
	unsigned int size = n * sizeof(struct hlist_head);

	if (size <= PAGE_SIZE)
		kfree(h);
	else
		free_pages((unsigned long)h, get_order(size));
}

void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
{
	struct Qdisc_class_common *cl;
	struct hlist_node *next;
	struct hlist_head *nhash, *ohash;
	unsigned int nsize, nmask, osize;
	unsigned int i, h;

	/* Rehash when load factor exceeds 0.75 */
	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
		return;
	nsize = clhash->hashsize * 2;
	nmask = nsize - 1;
	nhash = qdisc_class_hash_alloc(nsize);
	if (nhash == NULL)
		return;

	ohash = clhash->hash;
	osize = clhash->hashsize;

	sch_tree_lock(sch);
	for (i = 0; i < osize; i++) {
		hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
			h = qdisc_class_hash(cl->classid, nmask);
			hlist_add_head(&cl->hnode, &nhash[h]);
		}
	}
	clhash->hash     = nhash;
	clhash->hashsize = nsize;
	clhash->hashmask = nmask;
	sch_tree_unlock(sch);

	qdisc_class_hash_free(ohash, osize);
}
EXPORT_SYMBOL(qdisc_class_hash_grow);

int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
{
	unsigned int size = 4;

	clhash->hash = qdisc_class_hash_alloc(size);
	if (clhash->hash == NULL)
		return -ENOMEM;
	clhash->hashsize  = size;
	clhash->hashmask  = size - 1;
	clhash->hashelems = 0;
	return 0;
}
EXPORT_SYMBOL(qdisc_class_hash_init);

void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
	qdisc_class_hash_free(clhash->hash, clhash->hashsize);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);

void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	unsigned int h;

	INIT_HLIST_NODE(&cl->hnode);
	h = qdisc_class_hash(cl->classid, clhash->hashmask);
	hlist_add_head(&cl->hnode, &clhash->hash[h]);
	clhash->hashelems++;
}
EXPORT_SYMBOL(qdisc_class_hash_insert);

void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	hlist_del(&cl->hnode);
	clhash->hashelems--;
}
EXPORT_SYMBOL(qdisc_class_hash_remove);
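
/* Illustrative sketch (not part of this file): how a classful qdisc
 * typically uses the hash helpers above. "example_class" is a made-up
 * type that embeds the common part as its first member, so that
 * qdisc_class_find() can return it directly.
 *
 *	struct example_class {
 *		struct Qdisc_class_common common;
 *	};
 *
 *	err = qdisc_class_hash_init(&q->clhash);	(in ->init)
 *	cl->common.classid = classid;
 *	qdisc_class_hash_insert(&q->clhash, &cl->common);
 *	qdisc_class_hash_grow(sch, &q->clhash);		(after insert)
 *	qdisc_class_hash_remove(&q->clhash, &cl->common);
 *	qdisc_class_hash_destroy(&q->clhash);		(in ->destroy)
 */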

/* Allocate a unique handle from the space managed by the kernel.
 * Possible range is [8000-FFFF]:0000 (0x8000 values)
 */
static u32 qdisc_alloc_handle(struct net_device *dev)
{
	int i = 0x8000;
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
		if (!qdisc_lookup(dev, autohandle))
			return autohandle;
		cond_resched();
	} while (--i > 0);

	return 0;
}

void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
{
	const struct Qdisc_class_ops *cops;
	unsigned long cl;
	u32 parentid;

	if (n == 0)
		return;
	while ((parentid = sch->parent)) {
		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
			return;

		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
		if (sch == NULL) {
			WARN_ON(parentid != TC_H_ROOT);
			return;
		}
		cops = sch->ops->cl_ops;
		if (cops->qlen_notify) {
			cl = cops->get(sch, parentid);
			cops->qlen_notify(sch, cl);
			cops->put(sch, cl);
		}
		sch->q.qlen -= n;
	}
}
EXPORT_SYMBOL(qdisc_tree_decrease_qlen);

static void notify_and_destroy(struct net *net, struct sk_buff *skb,
			       struct nlmsghdr *n, u32 clid,
			       struct Qdisc *old, struct Qdisc *new)
{
	if (new || old)
		qdisc_notify(net, skb, n, clid, old, new);

	if (old)
		qdisc_destroy(old);
}

/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 * to device "dev".
 *
 * When appropriate, send a netlink notification using 'skb'
 * and 'n'.
 *
 * On success, destroy the old qdisc.
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old)
{
	struct Qdisc *q = old;
	struct net *net = dev_net(dev);
	int err = 0;

	if (parent == NULL) {
		unsigned int i, num_q, ingress;

		ingress = 0;
		num_q = dev->num_tx_queues;
		if ((q && q->flags & TCQ_F_INGRESS) ||
		    (new && new->flags & TCQ_F_INGRESS)) {
			num_q = 1;
			ingress = 1;
			if (!dev_ingress_queue(dev))
				return -ENOENT;
		}

		if (dev->flags & IFF_UP)
			dev_deactivate(dev);

		if (new && new->ops->attach) {
			new->ops->attach(new);
			num_q = 0;
		}

		for (i = 0; i < num_q; i++) {
			struct netdev_queue *dev_queue = dev_ingress_queue(dev);

			if (!ingress)
				dev_queue = netdev_get_tx_queue(dev, i);

			old = dev_graft_qdisc(dev_queue, new);
			if (new && i > 0)
				atomic_inc(&new->refcnt);

			if (!ingress)
				qdisc_destroy(old);
		}

		if (!ingress) {
			notify_and_destroy(net, skb, n, classid,
					   dev->qdisc, new);
			if (new && !new->ops->attach)
				atomic_inc(&new->refcnt);
			dev->qdisc = new ? : &noop_qdisc;
		} else {
			notify_and_destroy(net, skb, n, classid, old, new);
		}

		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;

		err = -EOPNOTSUPP;
		if (cops && cops->graft) {
			unsigned long cl = cops->get(parent, classid);
			if (cl) {
				err = cops->graft(parent, cl, new, &old);
				cops->put(parent, cl);
			} else
				err = -ENOENT;
		}
		if (!err)
			notify_and_destroy(net, skb, n, classid, old, new);
	}
	return err;
}

/* lockdep annotation is needed for ingress; egress gets it only for name */
static struct lock_class_key qdisc_tx_lock;
static struct lock_class_key qdisc_rx_lock;

/*
   Allocate and initialize a new qdisc.

   Parameters are passed via opt.
 */

static struct Qdisc *
qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
	     struct Qdisc *p, u32 parent, u32 handle,
	     struct nlattr **tca, int *errp)
{
	int err;
	struct nlattr *kind = tca[TCA_KIND];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;
	struct qdisc_size_table *stab;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_MODULES
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load.  So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request.  We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the meantime.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* We will try qdisc_lookup_ops again,
				 * so don't keep a reference.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (ops == NULL)
		goto err_out;

	sch = qdisc_alloc(dev_queue, ops);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	sch->parent = parent;

	if (handle == TC_H_INGRESS) {
		sch->flags |= TCQ_F_INGRESS;
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
		lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
	} else {
		if (handle == 0) {
			handle = qdisc_alloc_handle(dev);
			err = -ENOMEM;
			if (handle == 0)
				goto err_out3;
		}
		lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
		if (!netif_is_multiqueue(dev))
			sch->flags |= TCQ_F_ONETXQUEUE;
	}

	sch->handle = handle;

	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
		if (tca[TCA_STAB]) {
			stab = qdisc_get_stab(tca[TCA_STAB]);
			if (IS_ERR(stab)) {
				err = PTR_ERR(stab);
				goto err_out4;
			}
			rcu_assign_pointer(sch->stab, stab);
		}
		if (tca[TCA_RATE]) {
			spinlock_t *root_lock;

			err = -EOPNOTSUPP;
			if (sch->flags & TCQ_F_MQROOT)
				goto err_out4;

			if ((sch->parent != TC_H_ROOT) &&
			    !(sch->flags & TCQ_F_INGRESS) &&
			    (!p || !(p->flags & TCQ_F_MQROOT)))
				root_lock = qdisc_root_sleeping_lock(sch);
			else
				root_lock = qdisc_lock(sch);

			err = gen_new_estimator(&sch->bstats, &sch->rate_est,
						root_lock, tca[TCA_RATE]);
			if (err)
				goto err_out4;
		}

		qdisc_list_add(sch);

		return sch;
	}
err_out3:
	dev_put(dev);
	kfree((char *) sch - sch->padded);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;

err_out4:
	/*
	 * Any broken qdiscs that would require a ops->reset() here?
	 * The qdisc was never in action so it shouldn't be necessary.
	 */
	qdisc_put_stab(rtnl_dereference(sch->stab));
	if (ops->destroy)
		ops->destroy(sch);
	goto err_out3;
}

static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
{
	struct qdisc_size_table *ostab, *stab = NULL;
	int err = 0;

	if (tca[TCA_OPTIONS]) {
		if (sch->ops->change == NULL)
			return -EINVAL;
		err = sch->ops->change(sch, tca[TCA_OPTIONS]);
		if (err)
			return err;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB]);
		if (IS_ERR(stab))
			return PTR_ERR(stab);
	}

	ostab = rtnl_dereference(sch->stab);
	rcu_assign_pointer(sch->stab, stab);
	qdisc_put_stab(ostab);

	if (tca[TCA_RATE]) {
		/* NB: ignores errors from replace_estimator
		   because change can't be undone. */
		if (sch->flags & TCQ_F_MQROOT)
			goto out;
		gen_replace_estimator(&sch->bstats, &sch->rate_est,
					    qdisc_root_sleeping_lock(sch),
					    tca[TCA_RATE]);
	}
out:
	return 0;
}

struct check_loop_arg {
	struct qdisc_walker	w;
	struct Qdisc		*p;
	int			depth;
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);

static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
	struct check_loop_arg	arg;

	if (q->ops->cl_ops == NULL)
		return 0;

	arg.w.stop = arg.w.skip = arg.w.count = 0;
	arg.w.fn = check_loop_fn;
	arg.depth = depth;
	arg.p = p;
	q->ops->cl_ops->walk(q, &arg.w);
	return arg.w.stop ? -ELOOP : 0;
}

static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct check_loop_arg *arg = (struct check_loop_arg *)w;

	leaf = cops->leaf(q, cl);
	if (leaf) {
		if (leaf == arg->p || arg->depth > 7)
			return -ELOOP;
		return check_loop(leaf, arg->p, arg->depth + 1);
	}
	return 0;
}

/*
 * Delete/get qdisc.
 */

static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	if ((n->nlmsg_type != RTM_GETQDISC) && !capable(CAP_NET_ADMIN))
		return -EPERM;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	clid = tcm->tcm_parent;
	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}
		if (!q)
			return -ENOENT;

		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
			return -EINVAL;
	} else {
		q = qdisc_lookup(dev, tcm->tcm_handle);
		if (!q)
			return -ENOENT;
	}

	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid)
			return -EINVAL;
		if (q->handle == 0)
			return -ENOENT;
		err = qdisc_graft(dev, p, skb, n, clid, NULL, q);
		if (err != 0)
			return err;
	} else {
		qdisc_notify(net, skb, n, clid, NULL, q);
	}
	return 0;
}

/*
 * Create/change qdisc.
 */

static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm;
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

replay:
	/* Reinit, just in case something touches this. */
	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	tcm = nlmsg_data(n);
	clid = tcm->tcm_parent;
	q = p = NULL;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue_create(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}

		/* It may be the default qdisc; ignore it. */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags & NLM_F_REPLACE))
					return -EEXIST;
				if (TC_H_MIN(tcm->tcm_handle))
					return -EINVAL;
				q = qdisc_lookup(dev, tcm->tcm_handle);
				if (!q)
					goto create_n_graft;
				if (n->nlmsg_flags & NLM_F_EXCL)
					return -EEXIST;
				if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
					return -EINVAL;
				if (q == p ||
				    (p && check_loop(q, p, 0)))
					return -ELOOP;
				atomic_inc(&q->refcnt);
				goto graft;
			} else {
				if (!q)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 *   We know that some child q is already
				 *   attached to this parent and we have a choice:
				 *   either to change it or to create/graft a new one.
				 *
				 *   1. We are allowed to create/graft only
				 *   if CREATE and REPLACE flags are set.
				 *
				 *   2. If EXCL is set, the requestor wanted to say
				 *   that the qdisc tcm_handle is not expected
				 *   to exist, so we choose create/graft too.
				 *
				 *   3. The last case is when no flags are set.
				 *   Alas, it is sort of a hole in the API; we
				 *   cannot decide what to do unambiguously.
				 *   For now we select create/graft if the
				 *   user gave a KIND which does not match the existing one.
				 */
				if ((n->nlmsg_flags & NLM_F_CREATE) &&
				    (n->nlmsg_flags & NLM_F_REPLACE) &&
				    ((n->nlmsg_flags & NLM_F_EXCL) ||
				     (tca[TCA_KIND] &&
				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		if (!tcm->tcm_handle)
			return -EINVAL;
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (q == NULL)
		return -ENOENT;
	if (n->nlmsg_flags & NLM_F_EXCL)
		return -EEXIST;
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;
	err = qdisc_change(q, tca);
	if (err == 0)
		qdisc_notify(net, skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags & NLM_F_CREATE))
		return -ENOENT;
	if (clid == TC_H_INGRESS) {
		if (dev_ingress_queue(dev))
			q = qdisc_create(dev, dev_ingress_queue(dev), p,
					 tcm->tcm_parent, tcm->tcm_parent,
					 tca, &err);
		else
			err = -ENOENT;
	} else {
		struct netdev_queue *dev_queue;

		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
			dev_queue = p->ops->cl_ops->select_queue(p, tcm);
		else if (p)
			dev_queue = p->dev_queue;
		else
			dev_queue = netdev_get_tx_queue(dev, 0);

		q = qdisc_create(dev, dev_queue, p,
				 tcm->tcm_parent, tcm->tcm_handle,
				 tca, &err);
	}
	if (q == NULL) {
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
	if (err) {
		if (q)
			qdisc_destroy(q);
		return err;
	}

	return 0;
}

static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 portid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	struct qdisc_size_table *stab;

	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = atomic_read(&q->refcnt);
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto nla_put_failure;
	q->qstats.qlen = q->q.qlen;

	stab = rtnl_dereference(q->stab);
	if (stab && qdisc_dump_stab(skb, stab) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 qdisc_root_sleeping_lock(q), &d) < 0)
		goto nla_put_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
	    gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
	    gnet_stats_copy_queue(&d, &q->qstats) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static bool tc_qdisc_dump_ignore(struct Qdisc *q)
{
	return (q->flags & TCQ_F_BUILTIN) ? true : false;
}

static int qdisc_notify(struct net *net, struct sk_buff *oskb,
			struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new)
{
	struct sk_buff *skb;
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && !tc_qdisc_dump_ignore(old)) {
		if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
				  0, RTM_DELQDISC) < 0)
			goto err_out;
	}
	if (new && !tc_qdisc_dump_ignore(new)) {
		if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
				  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
			goto err_out;
	}

	if (skb->len)
		return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
				      n->nlmsg_flags & NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}

static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
			      struct netlink_callback *cb,
			      int *q_idx_p, int s_q_idx)
{
	int ret = 0, q_idx = *q_idx_p;
	struct Qdisc *q;

	if (!root)
		return 0;

	q = root;
	if (q_idx < s_q_idx) {
		q_idx++;
	} else {
		if (!tc_qdisc_dump_ignore(q) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}
	list_for_each_entry(q, &root->list, list) {
		if (q_idx < s_q_idx) {
			q_idx++;
			continue;
		}
		if (!tc_qdisc_dump_ignore(q) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

out:
	*q_idx_p = q_idx;
	return ret;
done:
	ret = -1;
	goto out;
}

static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];

	rcu_read_lock();
	idx = 0;
	for_each_netdev_rcu(net, dev) {
		struct netdev_queue *dev_queue;

		if (idx < s_idx)
			goto cont;
		if (idx > s_idx)
			s_q_idx = 0;
		q_idx = 0;

		if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
			goto done;

		dev_queue = dev_ingress_queue(dev);
		if (dev_queue &&
		    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
				       &q_idx, s_q_idx) < 0)
			goto done;

cont:
		idx++;
	}

done:
	rcu_read_unlock();

	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}



/************************************************
 *	Traffic classes manipulation.		*
 ************************************************/



static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 portid;
	u32 clid;
	u32 qid;
	int err;

	if ((n->nlmsg_type != RTM_GETTCLASS) && !capable(CAP_NET_ADMIN))
		return -EPERM;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is root class.
	   parent == X:Y	 - parent is a node in hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0	 - generate handle from kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
	   handle == X:Y	 - clear.
	   handle == X:0	 - root class.
	 */

	/* Step 1. Determine qdisc handle X:0 */

	portid = tcm->tcm_parent;
	clid = tcm->tcm_handle;
	qid = TC_H_MAJ(clid);

	if (portid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(portid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev->qdisc->handle;

		/* Now qid is a genuine qdisc handle consistent with
		 * both parent and child.
		 *
		 * TC_H_MAJ(portid) still may be unspecified, complete it now.
		 */
		if (portid)
			portid = TC_H_MAKE(qid, portid);
	} else {
		if (qid == 0)
			qid = dev->qdisc->handle;
	}

	/* OK. Locate qdisc */
	q = qdisc_lookup(dev, qid);
	if (!q)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (portid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->get(q, clid);

	if (cl == 0) {
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS ||
		    !(n->nlmsg_flags & NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags & NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = -EOPNOTSUPP;
			if (cops->delete)
				err = cops->delete(q, cl);
			if (err == 0)
				tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	new_cl = cl;
	err = -EOPNOTSUPP;
	if (cops->change)
		err = cops->change(q, clid, portid, tca, &new_cl);
	if (err == 0)
		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);

out:
	if (cl)
		cops->put(q, cl);

	return err;
}


static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 portid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 qdisc_root_sleeping_lock(q), &d) < 0)
		goto nla_put_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int tclass_notify(struct net *net, struct sk_buff *oskb,
			 struct nlmsghdr *n, struct Qdisc *q,
			 unsigned long cl, int event)
{
	struct sk_buff *skb;
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
			      n->nlmsg_flags & NLM_F_ECHO);
}

struct qdisc_dump_args {
	struct qdisc_walker	w;
	struct sk_buff		*skb;
	struct netlink_callback	*cb;
};

static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
{
	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;

	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
}

static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
				struct tcmsg *tcm, struct netlink_callback *cb,
				int *t_p, int s_t)
{
	struct qdisc_dump_args arg;

	if (tc_qdisc_dump_ignore(q) ||
	    *t_p < s_t || !q->ops->cl_ops ||
	    (tcm->tcm_parent &&
	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
		(*t_p)++;
		return 0;
	}
	if (*t_p > s_t)
		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
	arg.w.fn = qdisc_class_dump;
	arg.skb = skb;
	arg.cb = cb;
	arg.w.stop  = 0;
	arg.w.skip = cb->args[1];
	arg.w.count = 0;
	q->ops->cl_ops->walk(q, &arg.w);
	cb->args[1] = arg.w.count;
	if (arg.w.stop)
		return -1;
	(*t_p)++;
	return 0;
}

static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
			       struct tcmsg *tcm, struct netlink_callback *cb,
			       int *t_p, int s_t)
{
	struct Qdisc *q;

	if (!root)
		return 0;

	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
		return -1;

	list_for_each_entry(q, &root->list, list) {
		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
	}

	return 0;
}

static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct tcmsg *tcm = nlmsg_data(cb->nlh);
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct net_device *dev;
	int t, s_t;

	if (nlmsg_len(cb->nlh) < sizeof(*tcm))
		return 0;
	dev = dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return 0;

	s_t = cb->args[0];
	t = 0;

	if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
		goto done;

	dev_queue = dev_ingress_queue(dev);
	if (dev_queue &&
	    tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
				&t, s_t) < 0)
		goto done;

done:
	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}

/* Main classifier routine: scans the classifier chain attached
 * to this qdisc, (optionally) tests for the protocol, and asks
 * specific classifiers.
 */
int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp,
		       struct tcf_result *res)
{
	__be16 protocol = skb->protocol;
	int err;

	for (; tp; tp = tp->next) {
		if (tp->protocol != protocol &&
		    tp->protocol != htons(ETH_P_ALL))
			continue;
		err = tp->classify(skb, tp, res);

		if (err >= 0) {
#ifdef CONFIG_NET_CLS_ACT
			if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
				skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
#endif
			return err;
		}
	}
	return -1;
}
EXPORT_SYMBOL(tc_classify_compat);

int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
		struct tcf_result *res)
{
	int err = 0;
#ifdef CONFIG_NET_CLS_ACT
	const struct tcf_proto *otp = tp;
reclassify:
#endif

	err = tc_classify_compat(skb, tp, res);
#ifdef CONFIG_NET_CLS_ACT
	if (err == TC_ACT_RECLASSIFY) {
		u32 verd = G_TC_VERD(skb->tc_verd);
		tp = otp;

		if (verd++ >= MAX_REC_LOOP) {
			net_notice_ratelimited("%s: packet reclassify loop rule prio %u protocol %02x\n",
					       tp->q->ops->id,
					       tp->prio & 0xffff,
					       ntohs(tp->protocol));
			return TC_ACT_SHOT;
		}
		skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
		goto reclassify;
	}
#endif
	return err;
}
EXPORT_SYMBOL(tc_classify);
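
/* Illustrative sketch (not part of this file): a typical call site in a
 * classful qdisc's classification step. "q->filter_list" stands for the
 * qdisc's own tcf_proto chain and "example_class"/"default_class" are
 * assumed names; real qdiscs also handle TC_ACT_* verdicts here.
 *
 *	struct tcf_result res;
 *
 *	if (tc_classify(skb, q->filter_list, &res) >= 0)
 *		cl = (struct example_class *)res.class;
 *	else
 *		cl = q->default_class;
 */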

void tcf_destroy(struct tcf_proto *tp)
{
	tp->ops->destroy(tp);
	module_put(tp->ops->owner);
	kfree(tp);
}

void tcf_destroy_chain(struct tcf_proto **fl)
{
	struct tcf_proto *tp;

	while ((tp = *fl) != NULL) {
		*fl = tp->next;
		tcf_destroy(tp);
	}
}
EXPORT_SYMBOL(tcf_destroy_chain);

#ifdef CONFIG_PROC_FS
static int psched_show(struct seq_file *seq, void *v)
{
	struct timespec ts;

	hrtimer_get_res(CLOCK_MONOTONIC, &ts);
	seq_printf(seq, "%08x %08x %08x %08x\n",
		   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
		   1000000,
		   (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));

	return 0;
}

static int psched_open(struct inode *inode, struct file *file)
{
	return single_open(file, psched_show, NULL);
}

static const struct file_operations psched_fops = {
	.owner = THIS_MODULE,
	.open = psched_open,
	.read  = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

static int __net_init psched_net_init(struct net *net)
{
	struct proc_dir_entry *e;

	e = proc_create("psched", 0, net->proc_net, &psched_fops);
	if (e == NULL)
		return -ENOMEM;

	return 0;
}

static void __net_exit psched_net_exit(struct net *net)
{
	remove_proc_entry("psched", net->proc_net);
}
#else
static int __net_init psched_net_init(struct net *net)
{
	return 0;
}

static void __net_exit psched_net_exit(struct net *net)
{
}
#endif

static struct pernet_operations psched_net_ops = {
	.init = psched_net_init,
	.exit = psched_net_exit,
};

static int __init pktsched_init(void)
{
	int err;

	err = register_pernet_subsys(&psched_net_ops);
	if (err) {
		pr_err("pktsched_init: "
		       "cannot initialize per netns operations\n");
		return err;
	}

	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	register_qdisc(&pfifo_head_drop_qdisc_ops);
	register_qdisc(&mq_qdisc_ops);

	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, NULL);

	return 0;
}

subsys_initcall(pktsched_init);
