/*
 * net/sched/sch_cbq.c	Class-Based Queueing discipline.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 */

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>


/*	Class-Based Queueing (CBQ) algorithm.
	=======================================

	Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
		 Management Models for Packet Networks",
		 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995

		 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995

		 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
		 Parameters", 1996

		 [4] Sally Floyd and Michael Speer, "Experimental Results
		 for Class-Based Queueing", 1998, not published.

	-----------------------------------------------------------------------

	The algorithm skeleton was taken from the NS simulator cbq.cc.
	If someone wants to check this code against the LBL version,
	they should take into account that ONLY the skeleton was borrowed;
	the implementation is different. Particularly:

	--- The WRR algorithm is different. Our version looks more
	reasonable (I hope) and works when quanta are allowed to be
	less than MTU, which is always the case when real time classes
	have small rates. Note that the statement in [3] is
	incomplete: delay may actually be estimated even if the class
	per-round allotment is less than MTU. Namely, if the per-round
	allotment is W*r_i, and r_1+...+r_k = r < 1, then

	delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B

	In the worst case we have the IntServ estimate with D = W*r+k*MTU
	and C = MTU*r. The proof (if correct at all) is trivial.


	--- It seems that cbq-2.0 is not very accurate. At least, I cannot
	interpret some places, which look like wrong translations
	from NS. Anyone is advised to find these differences
	and explain to me why I am wrong 8).

	--- Linux has no EOI event, so we cannot estimate true class
	idle time. The workaround is to consider the next dequeue event
	as a sign that the previous packet has finished. This is wrong because of
	internal device queueing, but on a permanently loaded link it is true.
	Moreover, combined with the clock integrator, this scheme looks
	very close to an ideal solution.  */
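
/* A rough worked example of the delay bound above (the numbers are invented
 * for illustration and do not come from any real configuration): take
 * MTU = 1500 bytes, k = 4 classes with equal rates r_i = 0.05 (so r = 0.2),
 * and W chosen so that the per-round allotment W*r_i is 500 bytes, i.e.
 * W*r = 2000 bytes.  Then [MTU/(W*r_i)] = 3 and
 *
 *	delay_i <= (3*2000 + 2000 + 4*1500)/B = 14000 bytes / B,
 *
 * which on a 10 Mbit/s link is about 11 ms, even though the per-round
 * allotment is only a third of the MTU.
 */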

struct cbq_sched_data;


struct cbq_class {
	struct Qdisc_class_common common;
	struct cbq_class	*next_alive;	/* next class with backlog in this priority band */

/* Parameters */
	unsigned char		priority;	/* class priority */
	unsigned char		priority2;	/* priority to be used after overlimit */
	unsigned char		ewma_log;	/* time constant for idle time calculation */
	unsigned char		ovl_strategy;
#ifdef CONFIG_NET_CLS_ACT
	unsigned char		police;
#endif

	u32			defmap;

	/* Link-sharing scheduler parameters */
	long			maxidle;	/* Class parameters: see below. */
	long			offtime;
	long			minidle;
	u32			avpkt;
	struct qdisc_rate_table	*R_tab;

	/* Overlimit strategy parameters */
	void			(*overlimit)(struct cbq_class *cl);
	psched_tdiff_t		penalty;

	/* General scheduler (WRR) parameters */
	long			allot;
	long			quantum;	/* Allotment per WRR round */
	long			weight;		/* Relative allotment: see below */

	struct Qdisc		*qdisc;		/* Ptr to CBQ discipline */
	struct cbq_class	*split;		/* Ptr to split node */
	struct cbq_class	*share;		/* Ptr to LS parent in the class tree */
	struct cbq_class	*tparent;	/* Ptr to tree parent in the class tree */
	struct cbq_class	*borrow;	/* NULL if class is bandwidth limited;
						   parent otherwise */
	struct cbq_class	*sibling;	/* Sibling chain */
	struct cbq_class	*children;	/* Pointer to children chain */

	struct Qdisc		*q;		/* Elementary queueing discipline */


/* Variables */
	unsigned char		cpriority;	/* Effective priority */
	unsigned char		delayed;
	unsigned char		level;		/* level of the class in hierarchy:
						   0 for leaf classes, and maximal
						   level of children + 1 for nodes.
						 */

	psched_time_t		last;		/* Last end of service */
	psched_time_t		undertime;
	long			avgidle;
	long			deficit;	/* Saved deficit for WRR */
	psched_time_t		penalized;
	struct gnet_stats_basic_packed bstats;
	struct gnet_stats_queue qstats;
	struct gnet_stats_rate_est rate_est;
	struct tc_cbq_xstats	xstats;

	struct tcf_proto	*filter_list;

	int			refcnt;
	int			filters;

	struct cbq_class	*defaults[TC_PRIO_MAX + 1];
};

struct cbq_sched_data {
	struct Qdisc_class_hash	clhash;			/* Hash table of all classes */
	int			nclasses[TC_CBQ_MAXPRIO + 1];
	unsigned int		quanta[TC_CBQ_MAXPRIO + 1];

	struct cbq_class	link;

	unsigned int		activemask;
	struct cbq_class	*active[TC_CBQ_MAXPRIO + 1];	/* List of all classes
								   with backlog */

#ifdef CONFIG_NET_CLS_ACT
	struct cbq_class	*rx_class;
#endif
	struct cbq_class	*tx_class;
	struct cbq_class	*tx_borrowed;
	int			tx_len;
	psched_time_t		now;		/* Cached timestamp */
	psched_time_t		now_rt;		/* Cached real time */
	unsigned int		pmask;

	struct hrtimer		delay_timer;
	struct qdisc_watchdog	watchdog;	/* Watchdog timer,
						   started when CBQ has
						   backlog, but cannot
						   transmit just now */
	psched_tdiff_t		wd_expires;
	int			toplevel;
	u32			hgenerator;
};


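/* L2T ("length to time"): look up the transmission time, in psched ticks,
 * of a len-byte packet at the class' configured rate, using the rate table
 * supplied from user space.
 */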
#define L2T(cl, len)	qdisc_l2t((cl)->R_tab, len)

static inline struct cbq_class *
cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
{
	struct Qdisc_class_common *clc;

	clc = qdisc_class_find(&q->clhash, classid);
	if (clc == NULL)
		return NULL;
	return container_of(clc, struct cbq_class, common);
}

#ifdef CONFIG_NET_CLS_ACT

static struct cbq_class *
cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
{
	struct cbq_class *cl;

	for (cl = this->tparent; cl; cl = cl->tparent) {
		struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT];

		if (new != NULL && new != this)
			return new;
	}
	return NULL;
}

#endif

/* Classify packet. The procedure is pretty complicated, but
 * it allows us to combine link sharing and priority scheduling
 * transparently.
 *
 * Namely, you can put link sharing rules (e.g. route based) at the root of CBQ,
 * so that it resolves to split nodes. Then packets are classified
 * by logical priority, or a more specific classifier may be attached
 * to the split node.
 */

static struct cbq_class *
cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *head = &q->link;
	struct cbq_class **defmap;
	struct cbq_class *cl = NULL;
	u32 prio = skb->priority;
	struct tcf_result res;

	/*
	 *  Step 1. If skb->priority points to one of our classes, use it.
	 */
	if (TC_H_MAJ(prio ^ sch->handle) == 0 &&
	    (cl = cbq_class_lookup(q, prio)) != NULL)
		return cl;

	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	for (;;) {
		int result = 0;
		defmap = head->defaults;

		/*
		 * Step 2+n. Apply classifier.
		 */
		if (!head->filter_list ||
		    (result = tc_classify_compat(skb, head->filter_list, &res)) < 0)
			goto fallback;

		cl = (void *)res.class;
		if (!cl) {
			if (TC_H_MAJ(res.classid))
				cl = cbq_class_lookup(q, res.classid);
			else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)
				cl = defmap[TC_PRIO_BESTEFFORT];

			if (cl == NULL)
				goto fallback;
		}
		if (cl->level >= head->level)
			goto fallback;
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_QUEUED:
		case TC_ACT_STOLEN:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
		case TC_ACT_SHOT:
			return NULL;
		case TC_ACT_RECLASSIFY:
			return cbq_reclassify(skb, cl);
		}
#endif
		if (cl->level == 0)
			return cl;

		/*
		 * Step 3+n. If the classifier selected a link sharing class,
		 *	   apply the agency specific classifier.
		 *	   Repeat this procedure until we hit a leaf node.
		 */
		head = cl;
	}

fallback:
	cl = head;

	/*
	 * Step 4. No success...
	 */
	if (TC_H_MAJ(prio) == 0 &&
	    !(cl = head->defaults[prio & TC_PRIO_MAX]) &&
	    !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
		return head;

	return cl;
}
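
/* An illustrative walk through the steps above (the handles are made up):
 * suppose this qdisc is 1: and an application sets skb->priority to the
 * classid 1:10; step 1 then returns class 1:10 directly.  If instead a
 * filter on the root maps the packet to a non-leaf split class, the loop
 * repeats with that class as the new head, consulting its own filters and,
 * when they give no answer, its defmap indexed by the packet's logical
 * priority, until a leaf (level 0) class or the fallback is reached.
 */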

/*
 * A packet has just been enqueued into an empty class.
 * cbq_activate_class adds it to the tail of the active class list
 * of its priority band.
 */

static inline void cbq_activate_class(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	int prio = cl->cpriority;
	struct cbq_class *cl_tail;

	cl_tail = q->active[prio];
	q->active[prio] = cl;

	if (cl_tail != NULL) {
		cl->next_alive = cl_tail->next_alive;
		cl_tail->next_alive = cl;
	} else {
		cl->next_alive = cl;
		q->activemask |= (1<<prio);
	}
}

/*
 * Unlink class from the active chain.
 * Note that this same procedure is done directly in cbq_dequeue*
 * during the round-robin procedure.
 */

static void cbq_deactivate_class(struct cbq_class *this)
{
	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
	int prio = this->cpriority;
	struct cbq_class *cl;
	struct cbq_class *cl_prev = q->active[prio];

	do {
		cl = cl_prev->next_alive;
		if (cl == this) {
			cl_prev->next_alive = cl->next_alive;
			cl->next_alive = NULL;

			if (cl == q->active[prio]) {
				q->active[prio] = cl_prev;
				if (cl == q->active[prio]) {
					q->active[prio] = NULL;
					q->activemask &= ~(1<<prio);
					return;
				}
			}
			return;
		}
	} while ((cl_prev = cl) != q->active[prio]);
}

static void
cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
{
	int toplevel = q->toplevel;

	if (toplevel > cl->level && !(qdisc_is_throttled(cl->q))) {
		psched_time_t now;
		psched_tdiff_t incr;

		now = psched_get_time();
		incr = now - q->now_rt;
		now = q->now + incr;

		do {
			if (cl->undertime < now) {
				q->toplevel = cl->level;
				return;
			}
		} while ((cl = cl->borrow) != NULL && toplevel > cl->level);
	}
}

static int
cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	int uninitialized_var(ret);
	struct cbq_class *cl = cbq_classify(skb, sch, &ret);

#ifdef CONFIG_NET_CLS_ACT
	q->rx_class = cl;
#endif
	if (cl == NULL) {
		if (ret & __NET_XMIT_BYPASS)
			sch->qstats.drops++;
		kfree_skb(skb);
		return ret;
	}

#ifdef CONFIG_NET_CLS_ACT
	cl->q->__parent = sch;
#endif
	ret = qdisc_enqueue(skb, cl->q);
	if (ret == NET_XMIT_SUCCESS) {
		sch->q.qlen++;
		cbq_mark_toplevel(q, cl);
		if (!cl->next_alive)
			cbq_activate_class(cl);
		return ret;
	}

	if (net_xmit_drop_count(ret)) {
		sch->qstats.drops++;
		cbq_mark_toplevel(q, cl);
		cl->qstats.drops++;
	}
	return ret;
}

/* Overlimit actions */

/* TC_CBQ_OVL_CLASSIC: (default) penalize the leaf class by adding offtime */

static void cbq_ovl_classic(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	psched_tdiff_t delay = cl->undertime - q->now;

	if (!cl->delayed) {
		delay += cl->offtime;

		/*
		 * The class goes to sleep, so that it will have no
		 * chance to accumulate avgidle. Let's forgive it 8)
		 *
		 * BTW cbq-2.0 has a bug here: apparently they forgot
		 * to shift it by cl->ewma_log.
		 */
		if (cl->avgidle < 0)
			delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
		if (cl->avgidle < cl->minidle)
			cl->avgidle = cl->minidle;
		if (delay <= 0)
			delay = 1;
		cl->undertime = q->now + delay;

		cl->xstats.overactions++;
		cl->delayed = 1;
	}
	if (q->wd_expires == 0 || q->wd_expires > delay)
		q->wd_expires = delay;

	/* Dirty work! We must schedule wakeups based on
	 * the real available rate, rather than the leaf rate,
	 * which may be tiny (even zero).
	 */
	if (q->toplevel == TC_CBQ_MAXLEVEL) {
		struct cbq_class *b;
		psched_tdiff_t base_delay = q->wd_expires;

		for (b = cl->borrow; b; b = b->borrow) {
			delay = b->undertime - q->now;
			if (delay < base_delay) {
				if (delay <= 0)
					delay = 1;
				base_delay = delay;
			}
		}

		q->wd_expires = base_delay;
	}
}

/* TC_CBQ_OVL_RCLASSIC: penalize classes in the hierarchy by offtime when
 * they go overlimit
 */

static void cbq_ovl_rclassic(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	struct cbq_class *this = cl;

	do {
		if (cl->level > q->toplevel) {
			cl = NULL;
			break;
		}
	} while ((cl = cl->borrow) != NULL);

	if (cl == NULL)
		cl = this;
	cbq_ovl_classic(cl);
}

/* TC_CBQ_OVL_DELAY: delay until the class goes underlimit */

static void cbq_ovl_delay(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	psched_tdiff_t delay = cl->undertime - q->now;

	if (test_bit(__QDISC_STATE_DEACTIVATED,
		     &qdisc_root_sleeping(cl->qdisc)->state))
		return;

	if (!cl->delayed) {
		psched_time_t sched = q->now;
		ktime_t expires;

		delay += cl->offtime;
		if (cl->avgidle < 0)
			delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
		if (cl->avgidle < cl->minidle)
			cl->avgidle = cl->minidle;
		cl->undertime = q->now + delay;

		if (delay > 0) {
			sched += delay + cl->penalty;
			cl->penalized = sched;
			cl->cpriority = TC_CBQ_MAXPRIO;
			q->pmask |= (1<<TC_CBQ_MAXPRIO);

			expires = ns_to_ktime(PSCHED_TICKS2NS(sched));
			if (hrtimer_try_to_cancel(&q->delay_timer) &&
			    ktime_to_ns(ktime_sub(
					hrtimer_get_expires(&q->delay_timer),
					expires)) > 0)
				hrtimer_set_expires(&q->delay_timer, expires);
			hrtimer_restart(&q->delay_timer);
			cl->delayed = 1;
			cl->xstats.overactions++;
			return;
		}
		delay = 1;
	}
	if (q->wd_expires == 0 || q->wd_expires > delay)
		q->wd_expires = delay;
}

/* TC_CBQ_OVL_LOWPRIO: penalize class by lowering its priority band */

static void cbq_ovl_lowprio(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);

	cl->penalized = q->now + cl->penalty;

	if (cl->cpriority != cl->priority2) {
		cl->cpriority = cl->priority2;
		q->pmask |= (1<<cl->cpriority);
		cl->xstats.overactions++;
	}
	cbq_ovl_classic(cl);
}

/* TC_CBQ_OVL_DROP: penalize class by dropping */

static void cbq_ovl_drop(struct cbq_class *cl)
{
	if (cl->q->ops->drop)
		if (cl->q->ops->drop(cl->q))
			cl->qdisc->q.qlen--;
	cl->xstats.overactions++;
	cbq_ovl_classic(cl);
}

static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
				       psched_time_t now)
{
	struct cbq_class *cl;
	struct cbq_class *cl_prev = q->active[prio];
	psched_time_t sched = now;

	if (cl_prev == NULL)
		return 0;

	do {
		cl = cl_prev->next_alive;
		if (now - cl->penalized > 0) {
			cl_prev->next_alive = cl->next_alive;
			cl->next_alive = NULL;
			cl->cpriority = cl->priority;
			cl->delayed = 0;
			cbq_activate_class(cl);

			if (cl == q->active[prio]) {
				q->active[prio] = cl_prev;
				if (cl == q->active[prio]) {
					q->active[prio] = NULL;
					return 0;
				}
			}

			cl = cl_prev->next_alive;
		} else if (sched - cl->penalized > 0)
			sched = cl->penalized;
	} while ((cl_prev = cl) != q->active[prio]);

	return sched - now;
}

static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
{
	struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
						delay_timer);
	struct Qdisc *sch = q->watchdog.qdisc;
	psched_time_t now;
	psched_tdiff_t delay = 0;
	unsigned int pmask;

	now = psched_get_time();

	pmask = q->pmask;
	q->pmask = 0;

	while (pmask) {
		int prio = ffz(~pmask);
		psched_tdiff_t tmp;

		pmask &= ~(1<<prio);

		tmp = cbq_undelay_prio(q, prio, now);
		if (tmp > 0) {
			q->pmask |= 1<<prio;
			if (tmp < delay || delay == 0)
				delay = tmp;
		}
	}

	if (delay) {
		ktime_t time;

		time = ktime_set(0, 0);
		time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay));
		hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS);
	}

	qdisc_unthrottled(sch);
	__netif_schedule(qdisc_root(sch));
	return HRTIMER_NORESTART;
}

#ifdef CONFIG_NET_CLS_ACT
static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
{
	struct Qdisc *sch = child->__parent;
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl = q->rx_class;

	q->rx_class = NULL;

	if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) {
		int ret;

		cbq_mark_toplevel(q, cl);

		q->rx_class = cl;
		cl->q->__parent = sch;

		ret = qdisc_enqueue(skb, cl->q);
		if (ret == NET_XMIT_SUCCESS) {
			sch->q.qlen++;
			if (!cl->next_alive)
				cbq_activate_class(cl);
			return 0;
		}
		if (net_xmit_drop_count(ret))
			sch->qstats.drops++;
		return 0;
	}

	sch->qstats.drops++;
	return -1;
}
#endif

/*
 * This is a mission-critical procedure.
 *
 * We "regenerate" the toplevel cutoff if the transmitting class
 * has backlog and is not regulated. This is not part of the
 * original CBQ description, but looks more reasonable.
 * Probably it is wrong; this question needs further investigation.
 */

static inline void
cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
		    struct cbq_class *borrowed)
{
	if (cl && q->toplevel >= borrowed->level) {
		if (cl->q->q.qlen > 1) {
			do {
				if (borrowed->undertime == PSCHED_PASTPERFECT) {
					q->toplevel = borrowed->level;
					return;
				}
			} while ((borrowed = borrowed->borrow) != NULL);
		}
#if 0
	/* It is not necessary now. Uncommenting it
	   will save CPU cycles, but decrease fairness.
	 */
		q->toplevel = TC_CBQ_MAXLEVEL;
#endif
	}
}

static void
cbq_update(struct cbq_sched_data *q)
{
	struct cbq_class *this = q->tx_class;
	struct cbq_class *cl = this;
	int len = q->tx_len;

	q->tx_class = NULL;

	for ( ; cl; cl = cl->share) {
		long avgidle = cl->avgidle;
		long idle;

		cl->bstats.packets++;
		cl->bstats.bytes += len;

		/*
		 * (now - last) is the total time between packet right edges.
		 * (last_pktlen/rate) is the "virtual" busy time, so that
		 *
		 *	idle = (now - last) - last_pktlen/rate
		 */

		idle = q->now - cl->last;
		if ((unsigned long)idle > 128*1024*1024) {
			avgidle = cl->maxidle;
		} else {
			idle -= L2T(cl, len);

		/* true_avgidle := (1-W)*true_avgidle + W*idle,
		 * where W=2^{-ewma_log}. But cl->avgidle is scaled:
		 * cl->avgidle == true_avgidle/W,
		 * hence:
		 */
			avgidle += idle - (avgidle>>cl->ewma_log);
		}

		if (avgidle <= 0) {
			/* Overlimit or at-limit */

			if (avgidle < cl->minidle)
				avgidle = cl->minidle;

			cl->avgidle = avgidle;

			/* Calculate the expected time when this class
			 * will be allowed to send.
			 * It will occur when:
			 * (1-W)*true_avgidle + W*delay = 0, i.e.
			 * idle = (1/W - 1)*(-true_avgidle)
			 * or
			 * idle = (1 - W)*(-cl->avgidle);
			 */
			idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);

			/*
			 * That is not all.
			 * To maintain the rate allocated to the class,
			 * we add to undertime the virtual clock time
			 * necessary to complete the transmitted packet.
			 * (len/phys_bandwidth has already elapsed by
			 * the moment cbq_update is called.)
			 */

			idle -= L2T(&q->link, len);
			idle += L2T(cl, len);

			cl->undertime = q->now + idle;
		} else {
			/* Underlimit */

			cl->undertime = PSCHED_PASTPERFECT;
			if (avgidle > cl->maxidle)
				cl->avgidle = cl->maxidle;
			else
				cl->avgidle = avgidle;
		}
		cl->last = q->now;
	}

	cbq_update_toplevel(q, this, q->tx_borrowed);
}
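
/* A small numeric illustration of the scaled EWMA above (values made up):
 * with ewma_log = 5 (W = 1/32), a stored avgidle of -640 corresponds to a
 * true average idle of -640*W = -20 ticks.  One update with a measured
 * idle of 32 ticks does
 *
 *	avgidle += 32 - (-640 >> 5) = 32 + 20,
 *
 * giving -588, exactly the result of computing the unscaled EWMA
 * (1-W)*(-20) + W*32 = -18.375 and dividing it by W again.
 */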

static inline struct cbq_class *
cbq_under_limit(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	struct cbq_class *this_cl = cl;

	if (cl->tparent == NULL)
		return cl;

	if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
		cl->delayed = 0;
		return cl;
	}

	do {
		/* This is a very suspicious place. Currently the overlimit
		 * action is generated for non-bounded classes
		 * only if the link is completely congested.
		 * Though it agrees with the ancestor-only paradigm,
		 * it looks very stupid. In particular,
		 * it means that this chunk of code will either
		 * never be called or will result in strong amplification
		 * of burstiness. Dangerous, silly, and yet
		 * no other solution exists.
		 */
		cl = cl->borrow;
		if (!cl) {
			this_cl->qstats.overlimits++;
			this_cl->overlimit(this_cl);
			return NULL;
		}
		if (cl->level > q->toplevel)
			return NULL;
	} while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);

	cl->delayed = 0;
	return cl;
}

static inline struct sk_buff *
cbq_dequeue_prio(struct Qdisc *sch, int prio)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl_tail, *cl_prev, *cl;
	struct sk_buff *skb;
	int deficit;

	cl_tail = cl_prev = q->active[prio];
	cl = cl_prev->next_alive;

	do {
		deficit = 0;

		/* Start round */
		do {
			struct cbq_class *borrow = cl;

			if (cl->q->q.qlen &&
			    (borrow = cbq_under_limit(cl)) == NULL)
				goto skip_class;

			if (cl->deficit <= 0) {
				/* Class exhausted its allotment per
				 * this round. Switch to the next one.
				 */
				deficit = 1;
				cl->deficit += cl->quantum;
				goto next_class;
			}

			skb = cl->q->dequeue(cl->q);

			/* Class did not give us any skb :-(
			 * It could occur even if cl->q->q.qlen != 0,
			 * e.g. if cl->q == "tbf"
			 */
			if (skb == NULL)
				goto skip_class;

			cl->deficit -= qdisc_pkt_len(skb);
			q->tx_class = cl;
			q->tx_borrowed = borrow;
			if (borrow != cl) {
#ifndef CBQ_XSTATS_BORROWS_BYTES
				borrow->xstats.borrows++;
				cl->xstats.borrows++;
#else
				borrow->xstats.borrows += qdisc_pkt_len(skb);
				cl->xstats.borrows += qdisc_pkt_len(skb);
#endif
			}
			q->tx_len = qdisc_pkt_len(skb);

			if (cl->deficit <= 0) {
				q->active[prio] = cl;
				cl = cl->next_alive;
				cl->deficit += cl->quantum;
			}
			return skb;

skip_class:
			if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
				/* Class is empty or penalized.
				 * Unlink it from active chain.
				 */
				cl_prev->next_alive = cl->next_alive;
				cl->next_alive = NULL;

				/* Did cl_tail point to it? */
				if (cl == cl_tail) {
					/* Repair it! */
					cl_tail = cl_prev;

					/* Was it the last class in this band? */
					if (cl == cl_tail) {
						/* Kill the band! */
						q->active[prio] = NULL;
						q->activemask &= ~(1<<prio);
						if (cl->q->q.qlen)
							cbq_activate_class(cl);
						return NULL;
					}

					q->active[prio] = cl_tail;
				}
				if (cl->q->q.qlen)
					cbq_activate_class(cl);

				cl = cl_prev;
			}

next_class:
			cl_prev = cl;
			cl = cl->next_alive;
		} while (cl_prev != cl_tail);
	} while (deficit);

	q->active[prio] = cl_prev;

	return NULL;
}

static inline struct sk_buff *
cbq_dequeue_1(struct Qdisc *sch)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	unsigned int activemask;

	activemask = q->activemask & 0xFF;
	while (activemask) {
		int prio = ffz(~activemask);
		activemask &= ~(1<<prio);
		skb = cbq_dequeue_prio(sch, prio);
		if (skb)
			return skb;
	}
	return NULL;
}

static struct sk_buff *
cbq_dequeue(struct Qdisc *sch)
{
	struct sk_buff *skb;
	struct cbq_sched_data *q = qdisc_priv(sch);
	psched_time_t now;
	psched_tdiff_t incr;

	now = psched_get_time();
	incr = now - q->now_rt;

	if (q->tx_class) {
		psched_tdiff_t incr2;
		/* Time integrator. We calculate EOS time
		 * by adding the expected packet transmission time.
		 * If real time is greater, we warp the artificial clock,
		 * so that:
		 *
		 * cbq_time = max(real_time, work);
		 */
		incr2 = L2T(&q->link, q->tx_len);
		q->now += incr2;
		cbq_update(q);
		if ((incr -= incr2) < 0)
			incr = 0;
		q->now += incr;
	} else {
		if (now > q->now)
			q->now = now;
	}
	q->now_rt = now;

	for (;;) {
		q->wd_expires = 0;

		skb = cbq_dequeue_1(sch);
		if (skb) {
			qdisc_bstats_update(sch, skb);
			sch->q.qlen--;
			qdisc_unthrottled(sch);
			return skb;
		}

		/* All the classes are overlimit.
		 *
		 * This is possible if:
		 *
		 * 1. The scheduler is empty.
		 * 2. The toplevel cutoff inhibited borrowing.
		 * 3. The root class is overlimit.
		 *
		 * Reset the 2nd and 3rd conditions and retry.
		 *
		 * Note that NS and cbq-2.0 are buggy here; picking
		 * an arbitrary class is appropriate for ancestor-only
		 * sharing, but not for the toplevel algorithm.
		 *
		 * Our version is better but slower, because it requires
		 * two passes; that is unavoidable with top-level sharing.
		 */

		if (q->toplevel == TC_CBQ_MAXLEVEL &&
		    q->link.undertime == PSCHED_PASTPERFECT)
			break;

		q->toplevel = TC_CBQ_MAXLEVEL;
		q->link.undertime = PSCHED_PASTPERFECT;
	}

	/* No packets in the scheduler, or nobody wants to give them to us :-(
	 * Sigh... start the watchdog timer in the latter case.
	 */

	if (sch->q.qlen) {
		sch->qstats.overlimits++;
		if (q->wd_expires)
			qdisc_watchdog_schedule(&q->watchdog,
						now + q->wd_expires);
	}
	return NULL;
}

/* CBQ class maintenance routines */

static void cbq_adjust_levels(struct cbq_class *this)
{
	if (this == NULL)
		return;

	do {
		int level = 0;
		struct cbq_class *cl;

		cl = this->children;
		if (cl) {
			do {
				if (cl->level > level)
					level = cl->level;
			} while ((cl = cl->sibling) != this->children);
		}
		this->level = level + 1;
	} while ((this = this->tparent) != NULL);
}

static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
{
	struct cbq_class *cl;
	unsigned int h;

	if (q->quanta[prio] == 0)
		return;

	for (h = 0; h < q->clhash.hashsize; h++) {
		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
			/* BUGGGG... Beware! This expression suffers from
			 * arithmetic overflow!
			 */
			if (cl->priority == prio) {
				cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
					q->quanta[prio];
			}
			if (cl->quantum <= 0 ||
			    cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) {
				pr_warning("CBQ: class %08x has bad quantum==%ld, repaired.\n",
					   cl->common.classid, cl->quantum);
				cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
			}
		}
	}
}
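
/* Worked example for the normalization above (figures are illustrative
 * only): with three classes at this priority whose weights sum to
 * q->quanta[prio] = 30 and whose allot is 1514 bytes each, a class of
 * weight 10 gets
 *
 *	quantum = (10 * 1514 * 3) / 30 = 1514 bytes,
 *
 * i.e. exactly its allot, while a weight-20 class gets twice that.  The
 * clamp above then repairs anything that ends up non-positive or larger
 * than 32 device MTUs.
 */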

static void cbq_sync_defmap(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	struct cbq_class *split = cl->split;
	unsigned int h;
	int i;

	if (split == NULL)
		return;

	for (i = 0; i <= TC_PRIO_MAX; i++) {
		if (split->defaults[i] == cl && !(cl->defmap & (1<<i)))
			split->defaults[i] = NULL;
	}

	for (i = 0; i <= TC_PRIO_MAX; i++) {
		int level = split->level;

		if (split->defaults[i])
			continue;

		for (h = 0; h < q->clhash.hashsize; h++) {
			struct cbq_class *c;

			hlist_for_each_entry(c, &q->clhash.hash[h],
					     common.hnode) {
				if (c->split == split && c->level < level &&
				    c->defmap & (1<<i)) {
					split->defaults[i] = c;
					level = c->level;
				}
			}
		}
	}
}

static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
{
	struct cbq_class *split = NULL;

	if (splitid == 0) {
		split = cl->split;
		if (!split)
			return;
		splitid = split->common.classid;
	}

	if (split == NULL || split->common.classid != splitid) {
		for (split = cl->tparent; split; split = split->tparent)
			if (split->common.classid == splitid)
				break;
	}

	if (split == NULL)
		return;

	if (cl->split != split) {
		cl->defmap = 0;
		cbq_sync_defmap(cl);
		cl->split = split;
		cl->defmap = def & mask;
	} else
		cl->defmap = (cl->defmap & ~mask) | (def & mask);

	cbq_sync_defmap(cl);
}

static void cbq_unlink_class(struct cbq_class *this)
{
	struct cbq_class *cl, **clp;
	struct cbq_sched_data *q = qdisc_priv(this->qdisc);

	qdisc_class_hash_remove(&q->clhash, &this->common);

	if (this->tparent) {
		clp = &this->sibling;
		cl = *clp;
		do {
			if (cl == this) {
				*clp = cl->sibling;
				break;
			}
			clp = &cl->sibling;
		} while ((cl = *clp) != this->sibling);

		if (this->tparent->children == this) {
			this->tparent->children = this->sibling;
			if (this->sibling == this)
				this->tparent->children = NULL;
		}
	} else {
		WARN_ON(this->sibling != this);
	}
}

static void cbq_link_class(struct cbq_class *this)
{
	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
	struct cbq_class *parent = this->tparent;

	this->sibling = this;
	qdisc_class_hash_insert(&q->clhash, &this->common);

	if (parent == NULL)
		return;

	if (parent->children == NULL) {
		parent->children = this;
	} else {
		this->sibling = parent->children->sibling;
		parent->children->sibling = this;
	}
}

static unsigned int cbq_drop(struct Qdisc *sch)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl, *cl_head;
	int prio;
	unsigned int len;

	for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) {
		cl_head = q->active[prio];
		if (!cl_head)
			continue;

		cl = cl_head;
		do {
			if (cl->q->ops->drop && (len = cl->q->ops->drop(cl->q))) {
				sch->q.qlen--;
				if (!cl->q->q.qlen)
					cbq_deactivate_class(cl);
				return len;
			}
		} while ((cl = cl->next_alive) != cl_head);
	}
	return 0;
}

static void
cbq_reset(struct Qdisc *sch)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl;
	int prio;
	unsigned int h;

	q->activemask = 0;
	q->pmask = 0;
	q->tx_class = NULL;
	q->tx_borrowed = NULL;
	qdisc_watchdog_cancel(&q->watchdog);
	hrtimer_cancel(&q->delay_timer);
	q->toplevel = TC_CBQ_MAXLEVEL;
	q->now = psched_get_time();
	q->now_rt = q->now;

	for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
		q->active[prio] = NULL;

	for (h = 0; h < q->clhash.hashsize; h++) {
		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
			qdisc_reset(cl->q);

			cl->next_alive = NULL;
			cl->undertime = PSCHED_PASTPERFECT;
			cl->avgidle = cl->maxidle;
			cl->deficit = cl->quantum;
			cl->cpriority = cl->priority;
		}
	}
	sch->q.qlen = 0;
}


static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
{
	if (lss->change & TCF_CBQ_LSS_FLAGS) {
		cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
		cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
	}
	if (lss->change & TCF_CBQ_LSS_EWMA)
		cl->ewma_log = lss->ewma_log;
	if (lss->change & TCF_CBQ_LSS_AVPKT)
		cl->avpkt = lss->avpkt;
	if (lss->change & TCF_CBQ_LSS_MINIDLE)
		cl->minidle = -(long)lss->minidle;
	if (lss->change & TCF_CBQ_LSS_MAXIDLE) {
		cl->maxidle = lss->maxidle;
		cl->avgidle = lss->maxidle;
	}
	if (lss->change & TCF_CBQ_LSS_OFFTIME)
		cl->offtime = lss->offtime;
	return 0;
}

static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
{
	q->nclasses[cl->priority]--;
	q->quanta[cl->priority] -= cl->weight;
	cbq_normalize_quanta(q, cl->priority);
}

static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
{
	q->nclasses[cl->priority]++;
	q->quanta[cl->priority] += cl->weight;
	cbq_normalize_quanta(q, cl->priority);
}

static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);

	if (wrr->allot)
		cl->allot = wrr->allot;
	if (wrr->weight)
		cl->weight = wrr->weight;
	if (wrr->priority) {
		cl->priority = wrr->priority - 1;
		cl->cpriority = cl->priority;
		if (cl->priority >= cl->priority2)
			cl->priority2 = TC_CBQ_MAXPRIO - 1;
	}

	cbq_addprio(q, cl);
	return 0;
}

static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl)
{
	switch (ovl->strategy) {
	case TC_CBQ_OVL_CLASSIC:
		cl->overlimit = cbq_ovl_classic;
		break;
	case TC_CBQ_OVL_DELAY:
		cl->overlimit = cbq_ovl_delay;
		break;
	case TC_CBQ_OVL_LOWPRIO:
		if (ovl->priority2 - 1 >= TC_CBQ_MAXPRIO ||
		    ovl->priority2 - 1 <= cl->priority)
			return -EINVAL;
		cl->priority2 = ovl->priority2 - 1;
		cl->overlimit = cbq_ovl_lowprio;
		break;
	case TC_CBQ_OVL_DROP:
		cl->overlimit = cbq_ovl_drop;
		break;
	case TC_CBQ_OVL_RCLASSIC:
		cl->overlimit = cbq_ovl_rclassic;
		break;
	default:
		return -EINVAL;
	}
	cl->penalty = ovl->penalty;
	return 0;
}

#ifdef CONFIG_NET_CLS_ACT
static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p)
{
	cl->police = p->police;

	if (cl->q->handle) {
		if (p->police == TC_POLICE_RECLASSIFY)
			cl->q->reshape_fail = cbq_reshape_fail;
		else
			cl->q->reshape_fail = NULL;
	}
	return 0;
}
#endif

static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
{
	cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
	return 0;
}

static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
	[TCA_CBQ_LSSOPT]	= { .len = sizeof(struct tc_cbq_lssopt) },
	[TCA_CBQ_WRROPT]	= { .len = sizeof(struct tc_cbq_wrropt) },
	[TCA_CBQ_FOPT]		= { .len = sizeof(struct tc_cbq_fopt) },
	[TCA_CBQ_OVL_STRATEGY]	= { .len = sizeof(struct tc_cbq_ovl) },
	[TCA_CBQ_RATE]		= { .len = sizeof(struct tc_ratespec) },
	[TCA_CBQ_RTAB]		= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
	[TCA_CBQ_POLICE]	= { .len = sizeof(struct tc_cbq_police) },
};

static int cbq_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_CBQ_MAX + 1];
	struct tc_ratespec *r;
	int err;

	err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy);
	if (err < 0)
		return err;

	if (tb[TCA_CBQ_RTAB] == NULL || tb[TCA_CBQ_RATE] == NULL)
		return -EINVAL;

	r = nla_data(tb[TCA_CBQ_RATE]);

	if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB])) == NULL)
		return -EINVAL;

	err = qdisc_class_hash_init(&q->clhash);
	if (err < 0)
		goto put_rtab;

	q->link.refcnt = 1;
	q->link.sibling = &q->link;
	q->link.common.classid = sch->handle;
	q->link.qdisc = sch;
	q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
				      sch->handle);
	if (!q->link.q)
		q->link.q = &noop_qdisc;

	q->link.priority = TC_CBQ_MAXPRIO - 1;
	q->link.priority2 = TC_CBQ_MAXPRIO - 1;
	q->link.cpriority = TC_CBQ_MAXPRIO - 1;
	q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC;
	q->link.overlimit = cbq_ovl_classic;
	q->link.allot = psched_mtu(qdisc_dev(sch));
	q->link.quantum = q->link.allot;
	q->link.weight = q->link.R_tab->rate.rate;

	q->link.ewma_log = TC_CBQ_DEF_EWMA;
	q->link.avpkt = q->link.allot/2;
	q->link.minidle = -0x7FFFFFFF;

	qdisc_watchdog_init(&q->watchdog, sch);
	hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
	q->delay_timer.function = cbq_undelay;
	q->toplevel = TC_CBQ_MAXLEVEL;
	q->now = psched_get_time();
	q->now_rt = q->now;

	cbq_link_class(&q->link);

	if (tb[TCA_CBQ_LSSOPT])
		cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));

	cbq_addprio(q, &q->link);
	return 0;

put_rtab:
	qdisc_put_rtab(q->link.R_tab);
	return err;
}
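
/* For reference, a root CBQ qdisc of this kind is normally set up from user
 * space with iproute2's tc; the figures below are only an illustration, not
 * a recommendation:
 *
 *	tc qdisc add dev eth0 root handle 1: cbq bandwidth 100Mbit avpkt 1000
 *	tc class add dev eth0 parent 1: classid 1:1 cbq bandwidth 100Mbit \
 *		rate 10Mbit allot 1514 prio 5 avpkt 1000 bounded
 *
 * The qdisc line arrives here as TCA_CBQ_RATE/TCA_CBQ_RTAB (plus optional
 * LSS options), while the class line is handled by cbq_change_class() below.
 */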

static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
{
	unsigned char *b = skb_tail_pointer(skb);

	if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate))
		goto nla_put_failure;
	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
{
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_cbq_lssopt opt;

	opt.flags = 0;
	if (cl->borrow == NULL)
		opt.flags |= TCF_CBQ_LSS_BOUNDED;
	if (cl->share == NULL)
		opt.flags |= TCF_CBQ_LSS_ISOLATED;
	opt.ewma_log = cl->ewma_log;
	opt.level = cl->level;
	opt.avpkt = cl->avpkt;
	opt.maxidle = cl->maxidle;
	opt.minidle = (u32)(-cl->minidle);
	opt.offtime = cl->offtime;
	opt.change = ~0;
	if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt))
		goto nla_put_failure;
	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
{
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_cbq_wrropt opt;

	opt.flags = 0;
	opt.allot = cl->allot;
	opt.priority = cl->priority + 1;
	opt.cpriority = cl->cpriority + 1;
	opt.weight = cl->weight;
	if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt))
		goto nla_put_failure;
	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl)
{
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_cbq_ovl opt;

	opt.strategy = cl->ovl_strategy;
	opt.priority2 = cl->priority2 + 1;
	opt.pad = 0;
	opt.penalty = cl->penalty;
	if (nla_put(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt))
		goto nla_put_failure;
	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
{
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_cbq_fopt opt;

	if (cl->split || cl->defmap) {
		opt.split = cl->split ? cl->split->common.classid : 0;
		opt.defmap = cl->defmap;
		opt.defchange = ~0;
		if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt))
			goto nla_put_failure;
	}
	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

#ifdef CONFIG_NET_CLS_ACT
static int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)
{
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_cbq_police opt;

	if (cl->police) {
		opt.police = cl->police;
		opt.__res1 = 0;
		opt.__res2 = 0;
		if (nla_put(skb, TCA_CBQ_POLICE, sizeof(opt), &opt))
			goto nla_put_failure;
	}
	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}
#endif

static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
{
	if (cbq_dump_lss(skb, cl) < 0 ||
	    cbq_dump_rate(skb, cl) < 0 ||
	    cbq_dump_wrr(skb, cl) < 0 ||
	    cbq_dump_ovl(skb, cl) < 0 ||
#ifdef CONFIG_NET_CLS_ACT
	    cbq_dump_police(skb, cl) < 0 ||
#endif
	    cbq_dump_fopt(skb, cl) < 0)
		return -1;
	return 0;
}

static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct nlattr *nest;

	nest = nla_nest_start(skb, TCA_OPTIONS);
	if (nest == NULL)
		goto nla_put_failure;
	if (cbq_dump_attr(skb, &q->link) < 0)
		goto nla_put_failure;
	nla_nest_end(skb, nest);
	return skb->len;

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}

static int
cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
	struct cbq_sched_data *q = qdisc_priv(sch);

	q->link.xstats.avgidle = q->link.avgidle;
	return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
}

static int
cbq_dump_class(struct Qdisc *sch, unsigned long arg,
	       struct sk_buff *skb, struct tcmsg *tcm)
{
	struct cbq_class *cl = (struct cbq_class *)arg;
	struct nlattr *nest;

	if (cl->tparent)
		tcm->tcm_parent = cl->tparent->common.classid;
	else
		tcm->tcm_parent = TC_H_ROOT;
	tcm->tcm_handle = cl->common.classid;
	tcm->tcm_info = cl->q->handle;

	nest = nla_nest_start(skb, TCA_OPTIONS);
	if (nest == NULL)
		goto nla_put_failure;
	if (cbq_dump_attr(skb, cl) < 0)
		goto nla_put_failure;
	nla_nest_end(skb, nest);
	return skb->len;

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}

static int
cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
	struct gnet_dump *d)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl = (struct cbq_class *)arg;

	cl->qstats.qlen = cl->q->q.qlen;
	cl->xstats.avgidle = cl->avgidle;
	cl->xstats.undertime = 0;

	if (cl->undertime != PSCHED_PASTPERFECT)
		cl->xstats.undertime = cl->undertime - q->now;

	if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
	    gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
	    gnet_stats_copy_queue(d, &cl->qstats) < 0)
		return -1;

	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
}

static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		     struct Qdisc **old)
{
	struct cbq_class *cl = (struct cbq_class *)arg;

	if (new == NULL) {
		new = qdisc_create_dflt(sch->dev_queue,
					&pfifo_qdisc_ops, cl->common.classid);
		if (new == NULL)
			return -ENOBUFS;
	} else {
#ifdef CONFIG_NET_CLS_ACT
		if (cl->police == TC_POLICE_RECLASSIFY)
			new->reshape_fail = cbq_reshape_fail;
#endif
	}
	sch_tree_lock(sch);
	*old = cl->q;
	cl->q = new;
	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
	qdisc_reset(*old);
	sch_tree_unlock(sch);

	return 0;
}

static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct cbq_class *cl = (struct cbq_class *)arg;

	return cl->q;
}

static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
{
	struct cbq_class *cl = (struct cbq_class *)arg;

	if (cl->q->q.qlen == 0)
		cbq_deactivate_class(cl);
}

static unsigned long cbq_get(struct Qdisc *sch, u32 classid)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl = cbq_class_lookup(q, classid);

	if (cl) {
		cl->refcnt++;
		return (unsigned long)cl;
	}
	return 0;
}

static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(sch);

	WARN_ON(cl->filters);

	tcf_destroy_chain(&cl->filter_list);
	qdisc_destroy(cl->q);
	qdisc_put_rtab(cl->R_tab);
	gen_kill_estimator(&cl->bstats, &cl->rate_est);
	if (cl != &q->link)
		kfree(cl);
}

static void cbq_destroy(struct Qdisc *sch)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct hlist_node *next;
	struct cbq_class *cl;
	unsigned int h;

#ifdef CONFIG_NET_CLS_ACT
	q->rx_class = NULL;
#endif
	/*
	 * Filters must be destroyed first because we don't destroy the
	 * classes from root to leaves, which means that filters can still
	 * be bound to classes which have been destroyed already. --TGR '04
	 */
	for (h = 0; h < q->clhash.hashsize; h++) {
		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode)
			tcf_destroy_chain(&cl->filter_list);
	}
	for (h = 0; h < q->clhash.hashsize; h++) {
		hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h],
					  common.hnode)
			cbq_destroy_class(sch, cl);
	}
	qdisc_class_hash_destroy(&q->clhash);
}

static void cbq_put(struct Qdisc *sch, unsigned long arg)
{
	struct cbq_class *cl = (struct cbq_class *)arg;

	if (--cl->refcnt == 0) {
#ifdef CONFIG_NET_CLS_ACT
		spinlock_t *root_lock = qdisc_root_sleeping_lock(sch);
		struct cbq_sched_data *q = qdisc_priv(sch);

		spin_lock_bh(root_lock);
		if (q->rx_class == cl)
			q->rx_class = NULL;
		spin_unlock_bh(root_lock);
#endif

		cbq_destroy_class(sch, cl);
	}
}

static int
cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
		 unsigned long *arg)
{
	int err;
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl = (struct cbq_class *)*arg;
	struct nlattr *opt = tca[TCA_OPTIONS];
	struct nlattr *tb[TCA_CBQ_MAX + 1];
	struct cbq_class *parent;
	struct qdisc_rate_table *rtab = NULL;

	if (opt == NULL)
		return -EINVAL;

	err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy);
	if (err < 0)
		return err;

	if (cl) {
		/* Check parent */
		if (parentid) {
			if (cl->tparent &&
			    cl->tparent->common.classid != parentid)
				return -EINVAL;
			if (!cl->tparent && parentid != TC_H_ROOT)
				return -EINVAL;
		}

		if (tb[TCA_CBQ_RATE]) {
			rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]),
					      tb[TCA_CBQ_RTAB]);
			if (rtab == NULL)
				return -EINVAL;
		}

		if (tca[TCA_RATE]) {
			err = gen_replace_estimator(&cl->bstats, &cl->rate_est,
						    qdisc_root_sleeping_lock(sch),
						    tca[TCA_RATE]);
			if (err) {
				if (rtab)
					qdisc_put_rtab(rtab);
				return err;
			}
		}

		/* Change class parameters */
		sch_tree_lock(sch);

		if (cl->next_alive != NULL)
			cbq_deactivate_class(cl);

		if (rtab) {
			qdisc_put_rtab(cl->R_tab);
			cl->R_tab = rtab;
		}

		if (tb[TCA_CBQ_LSSOPT])
			cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));

		if (tb[TCA_CBQ_WRROPT]) {
			cbq_rmprio(q, cl);
			cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
		}

		if (tb[TCA_CBQ_OVL_STRATEGY])
			cbq_set_overlimit(cl, nla_data(tb[TCA_CBQ_OVL_STRATEGY]));

#ifdef CONFIG_NET_CLS_ACT
		if (tb[TCA_CBQ_POLICE])
			cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE]));
#endif

		if (tb[TCA_CBQ_FOPT])
			cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));

		if (cl->q->q.qlen)
			cbq_activate_class(cl);

		sch_tree_unlock(sch);

		return 0;
	}

	if (parentid == TC_H_ROOT)
		return -EINVAL;

	if (tb[TCA_CBQ_WRROPT] == NULL || tb[TCA_CBQ_RATE] == NULL ||
	    tb[TCA_CBQ_LSSOPT] == NULL)
		return -EINVAL;

	rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB]);
	if (rtab == NULL)
		return -EINVAL;

	if (classid) {
		err = -EINVAL;
		if (TC_H_MAJ(classid ^ sch->handle) ||
		    cbq_class_lookup(q, classid))
			goto failure;
	} else {
		int i;
		classid = TC_H_MAKE(sch->handle, 0x8000);

		for (i = 0; i < 0x8000; i++) {
			if (++q->hgenerator >= 0x8000)
				q->hgenerator = 1;
			if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
				break;
		}
		err = -ENOSR;
		if (i >= 0x8000)
			goto failure;
		classid = classid|q->hgenerator;
	}

	parent = &q->link;
	if (parentid) {
		parent = cbq_class_lookup(q, parentid);
		err = -EINVAL;
		if (parent == NULL)
			goto failure;
	}

	err = -ENOBUFS;
	cl = kzalloc(sizeof(*cl), GFP_KERNEL);
	if (cl == NULL)
		goto failure;

	if (tca[TCA_RATE]) {
		err = gen_new_estimator(&cl->bstats, &cl->rate_est,
					qdisc_root_sleeping_lock(sch),
					tca[TCA_RATE]);
		if (err) {
			kfree(cl);
			goto failure;
		}
	}

	cl->R_tab = rtab;
	rtab = NULL;
	cl->refcnt = 1;
	cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid);
	if (!cl->q)
		cl->q = &noop_qdisc;
	cl->common.classid = classid;
	cl->tparent = parent;
	cl->qdisc = sch;
	cl->allot = parent->allot;
	cl->quantum = cl->allot;
	cl->weight = cl->R_tab->rate.rate;

	sch_tree_lock(sch);
	cbq_link_class(cl);
	cl->borrow = cl->tparent;
	if (cl->tparent != &q->link)
		cl->share = cl->tparent;
	cbq_adjust_levels(parent);
	cl->minidle = -0x7FFFFFFF;
	cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
	cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
	if (cl->ewma_log == 0)
		cl->ewma_log = q->link.ewma_log;
	if (cl->maxidle == 0)
		cl->maxidle = q->link.maxidle;
	if (cl->avpkt == 0)
		cl->avpkt = q->link.avpkt;
	cl->overlimit = cbq_ovl_classic;
	if (tb[TCA_CBQ_OVL_STRATEGY])
		cbq_set_overlimit(cl, nla_data(tb[TCA_CBQ_OVL_STRATEGY]));
#ifdef CONFIG_NET_CLS_ACT
	if (tb[TCA_CBQ_POLICE])
		cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE]));
#endif
	if (tb[TCA_CBQ_FOPT])
		cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
	sch_tree_unlock(sch);

	qdisc_class_hash_grow(sch, &q->clhash);

	*arg = (unsigned long)cl;
	return 0;

failure:
	qdisc_put_rtab(rtab);
	return err;
}

static int cbq_delete(struct Qdisc *sch, unsigned long arg)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl = (struct cbq_class *)arg;
	unsigned int qlen;

	if (cl->filters || cl->children || cl == &q->link)
		return -EBUSY;

	sch_tree_lock(sch);

	qlen = cl->q->q.qlen;
	qdisc_reset(cl->q);
	qdisc_tree_decrease_qlen(cl->q, qlen);

	if (cl->next_alive)
		cbq_deactivate_class(cl);

	if (q->tx_borrowed == cl)
		q->tx_borrowed = q->tx_class;
	if (q->tx_class == cl) {
		q->tx_class = NULL;
		q->tx_borrowed = NULL;
	}
#ifdef CONFIG_NET_CLS_ACT
	if (q->rx_class == cl)
		q->rx_class = NULL;
#endif

	cbq_unlink_class(cl);
	cbq_adjust_levels(cl->tparent);
	cl->defmap = 0;
	cbq_sync_defmap(cl);

	cbq_rmprio(q, cl);
	sch_tree_unlock(sch);

	BUG_ON(--cl->refcnt == 0);
	/*
	 * This shouldn't happen: we "hold" one cops->get() when called
	 * from tc_ctl_tclass; the destroy method is done from cops->put().
	 */

	return 0;
}

static struct tcf_proto **cbq_find_tcf(struct Qdisc *sch, unsigned long arg)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl = (struct cbq_class *)arg;

	if (cl == NULL)
		cl = &q->link;

	return &cl->filter_list;
}

static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
				     u32 classid)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *p = (struct cbq_class *)parent;
	struct cbq_class *cl = cbq_class_lookup(q, classid);

	if (cl) {
		if (p && p->level <= cl->level)
			return 0;
		cl->filters++;
		return (unsigned long)cl;
	}
	return 0;
}

static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
{
	struct cbq_class *cl = (struct cbq_class *)arg;

	cl->filters--;
}

static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl;
	unsigned int h;

	if (arg->stop)
		return;

	for (h = 0; h < q->clhash.hashsize; h++) {
		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
			if (arg->count < arg->skip) {
				arg->count++;
				continue;
			}
			if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
				arg->stop = 1;
				return;
			}
			arg->count++;
		}
	}
}

static const struct Qdisc_class_ops cbq_class_ops = {
	.graft		=	cbq_graft,
	.leaf		=	cbq_leaf,
	.qlen_notify	=	cbq_qlen_notify,
	.get		=	cbq_get,
	.put		=	cbq_put,
	.change		=	cbq_change_class,
	.delete		=	cbq_delete,
	.walk		=	cbq_walk,
	.tcf_chain	=	cbq_find_tcf,
	.bind_tcf	=	cbq_bind_filter,
	.unbind_tcf	=	cbq_unbind_filter,
	.dump		=	cbq_dump_class,
	.dump_stats	=	cbq_dump_class_stats,
};

static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
	.next		=	NULL,
	.cl_ops		=	&cbq_class_ops,
	.id		=	"cbq",
	.priv_size	=	sizeof(struct cbq_sched_data),
	.enqueue	=	cbq_enqueue,
	.dequeue	=	cbq_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.drop		=	cbq_drop,
	.init		=	cbq_init,
	.reset		=	cbq_reset,
	.destroy	=	cbq_destroy,
	.change		=	NULL,
	.dump		=	cbq_dump,
	.dump_stats	=	cbq_dump_stats,
	.owner		=	THIS_MODULE,
};

static int __init cbq_module_init(void)
{
	return register_qdisc(&cbq_qdisc_ops);
}
static void __exit cbq_module_exit(void)
{
	unregister_qdisc(&cbq_qdisc_ops);
}
module_init(cbq_module_init)
module_exit(cbq_module_exit)
MODULE_LICENSE("GPL");