sch_gred.c revision f62d6b936df500247474c13360eb23e1b602bad0
/*
 * net/sched/sch_gred.c	Generic Random Early Detection queue.
 *
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * Authors:    J Hadi Salim (hadi@cyberus.ca) 1998-2002
 *
 *             991129: -  Bug fix with grio mode
 *		       - a better single AvgQ mode with grio (WRED)
 *		       - A finer grained VQ dequeue based on a suggestion
 *		         from Ren Liu
 *		       - More error checks
 *
 *
 *
 *  For all the glorious comments look at Alexey's sch_red.c
 */

#include <linux/config.h>
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/notifier.h>
#include <net/ip.h>
#include <net/route.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/pkt_sched.h>

#if 1 /* control */
#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args)
#else
#define DPRINTK(format,args...)
#endif

#if 0 /* data */
#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args)
#else
#define D2PRINTK(format,args...)
#endif

#define GRED_DEF_PRIO (MAX_DPs / 2)

struct gred_sched_data;
struct gred_sched;

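/*
 * Per-virtual-queue (VQ) state. Each drop precedence (DP), selected by the
 * low nibble of skb->tc_index, gets its own RED parameters, its own average
 * queue estimate and its own drop statistics.
 */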
struct gred_sched_data
{
/* Parameters */
	u32		limit;		/* HARD maximal queue length	*/
	u32		qth_min;	/* Min average length threshold: A scaled */
	u32		qth_max;	/* Max average length threshold: A scaled */
	u32		DP;		/* the drop precedence */
	char		Wlog;		/* log(W)		*/
	char		Plog;		/* random number bits	*/
	u32		Scell_max;
	u32		Rmask;
	u32		bytesin;	/* bytes seen on virtualQ so far*/
	u32		packetsin;	/* packets seen on virtualQ so far*/
	u32		backlog;	/* bytes on the virtualQ */
	u32		forced;	/* packets dropped for exceeding limits */
	u32		early;	/* packets dropped as a warning */
	u32		other;	/* packets dropped by invoking drop() */
	u32		pdrop;	/* packets dropped because we exceeded physical queue limits */
	char		Scell_log;
	u8		Stab[256];
	u8              prio;        /* the prio of this vq */

/* Variables */
	unsigned long	qave;		/* Average queue length: A scaled */
	int		qcount;		/* Packets since last random number generation */
	u32		qR;		/* Cached random number */

	psched_time_t	qidlestart;	/* Start of idle period	*/
};

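/*
 * Operating modes: in RIO mode each VQ carries its own priority and average
 * queue estimate. WRED mode is enabled when RIO mode is requested and at
 * least two VQs share the same priority; all VQs then share the single
 * average tracked in the default VQ.
 */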
enum {
	GRED_WRED_MODE = 1,
	GRED_RIO_MODE,
};

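/* Per-qdisc state: the table of VQs indexed by DP plus the global settings. */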
struct gred_sched
{
	struct gred_sched_data *tab[MAX_DPs];
	unsigned long	flags;
	u32 		DPs;
	u32 		def;
	u8 		initd;
};

static inline int gred_wred_mode(struct gred_sched *table)
{
	return test_bit(GRED_WRED_MODE, &table->flags);
}

static inline void gred_enable_wred_mode(struct gred_sched *table)
{
	__set_bit(GRED_WRED_MODE, &table->flags);
}

static inline void gred_disable_wred_mode(struct gred_sched *table)
{
	__clear_bit(GRED_WRED_MODE, &table->flags);
}

static inline int gred_rio_mode(struct gred_sched *table)
{
	return test_bit(GRED_RIO_MODE, &table->flags);
}

static inline void gred_enable_rio_mode(struct gred_sched *table)
{
	__set_bit(GRED_RIO_MODE, &table->flags);
}

static inline void gred_disable_rio_mode(struct gred_sched *table)
{
	__clear_bit(GRED_RIO_MODE, &table->flags);
}

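/*
 * Returns 1 if at least two configured VQs share the same priority, in
 * which case WRED mode (one shared average queue) should be used.
 */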
static inline int gred_wred_mode_check(struct Qdisc *sch)
{
	struct gred_sched *table = qdisc_priv(sch);
	int i;

	/* Really ugly O(n^2), but it shouldn't need to run frequently. */
	for (i = 0; i < table->DPs; i++) {
		struct gred_sched_data *q = table->tab[i];
		int n;

		if (q == NULL)
			continue;

		for (n = 0; n < table->DPs; n++)
			if (table->tab[n] && table->tab[n] != q &&
			    table->tab[n]->prio == q->prio)
				return 1;
	}

	return 0;
}

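/*
 * Enqueue path: pick the VQ from the low nibble of skb->tc_index (falling
 * back to the default DP), update the average queue estimate, then apply
 * the usual RED decision: enqueue below qth_min, force-drop at or above
 * qth_max, and drop probabilistically in between.
 */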
static int
gred_enqueue(struct sk_buff *skb, struct Qdisc* sch)
{
	psched_time_t now;
	struct gred_sched_data *q=NULL;
	struct gred_sched *t= qdisc_priv(sch);
	unsigned long	qave=0;
	int i=0;

	if (!t->initd && skb_queue_len(&sch->q) < (sch->dev->tx_queue_len ? : 1)) {
		D2PRINTK("No GRED queues set up yet! Enqueued anyway\n");
		goto do_enqueue;
	}


	if ( ((skb->tc_index&0xf) > (t->DPs -1)) || !(q=t->tab[skb->tc_index&0xf])) {
		printk("GRED: setting to default (%d)\n ",t->def);
		if (!(q=t->tab[t->def])) {
			DPRINTK("GRED: setting to default FAILED! dropping!! "
			    "(%d)\n ", t->def);
			goto drop;
		}
		/* fix tc_index? -- could be controversial, but needed for
		   requeueing */
		skb->tc_index=(skb->tc_index&0xfffffff0) | t->def;
	}

	D2PRINTK("gred_enqueue virtualQ 0x%x classid %x backlog %d "
	    "general backlog %d\n",skb->tc_index&0xf,sch->handle,q->backlog,
	    sch->qstats.backlog);
	/* sum up the qaves of all VQs with prio below ours to get the new qave */
	if (!gred_wred_mode(t) && gred_rio_mode(t)) {
		for (i=0;i<t->DPs;i++) {
			if ((!t->tab[i]) || (i==q->DP))
				continue;

			if ((t->tab[i]->prio < q->prio) && (PSCHED_IS_PASTPERFECT(t->tab[i]->qidlestart)))
				qave +=t->tab[i]->qave;
		}

	}

	q->packetsin++;
	q->bytesin+=skb->len;

	if (gred_wred_mode(t)) {
		qave=0;
		q->qave=t->tab[t->def]->qave;
		q->qidlestart=t->tab[t->def]->qidlestart;
	}

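	/*
	 * If the VQ has been idle, decay the average using the Stab lookup
	 * table scaled by the idle time; otherwise apply the usual EWMA
	 * update, using the shared backlog in WRED mode and the per-VQ
	 * backlog otherwise.
	 */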
	if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) {
		long us_idle;
		PSCHED_GET_TIME(now);
		us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max);
		PSCHED_SET_PASTPERFECT(q->qidlestart);

		q->qave >>= q->Stab[(us_idle>>q->Scell_log)&0xFF];
	} else {
		if (gred_wred_mode(t)) {
			q->qave += sch->qstats.backlog - (q->qave >> q->Wlog);
		} else {
			q->qave += q->backlog - (q->qave >> q->Wlog);
		}

	}


	if (gred_wred_mode(t))
		t->tab[t->def]->qave=q->qave;

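	/*
	 * Classic RED decision on the combined average: below qth_min the
	 * packet is enqueued (subject to the hard byte limit), at or above
	 * qth_max it is force-dropped, and in between it is dropped with a
	 * probability that grows with the average and the packet count.
	 */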
	if ((q->qave+qave) < q->qth_min) {
		q->qcount = -1;
enqueue:
		if (q->backlog + skb->len <= q->limit) {
			q->backlog += skb->len;
do_enqueue:
			__skb_queue_tail(&sch->q, skb);
			sch->qstats.backlog += skb->len;
			sch->bstats.bytes += skb->len;
			sch->bstats.packets++;
			return 0;
		} else {
			q->pdrop++;
		}

drop:
		kfree_skb(skb);
		sch->qstats.drops++;
		return NET_XMIT_DROP;
	}
	if ((q->qave+qave) >= q->qth_max) {
		q->qcount = -1;
		sch->qstats.overlimits++;
		q->forced++;
		goto drop;
	}
	if (++q->qcount) {
		if ((((qave+q->qave) - q->qth_min)>>q->Wlog)*q->qcount < q->qR)
			goto enqueue;
		q->qcount = 0;
		q->qR = net_random()&q->Rmask;
		sch->qstats.overlimits++;
		q->early++;
		goto drop;
	}
	q->qR = net_random()&q->Rmask;
	goto enqueue;
}

static int
gred_requeue(struct sk_buff *skb, struct Qdisc* sch)
{
	struct gred_sched_data *q;
	struct gred_sched *t= qdisc_priv(sch);
	q= t->tab[(skb->tc_index&0xf)];
/* error checking here -- probably unnecessary */
	PSCHED_SET_PASTPERFECT(q->qidlestart);

	__skb_queue_head(&sch->q, skb);
	sch->qstats.backlog += skb->len;
	sch->qstats.requeues++;
	q->backlog += skb->len;
	return 0;
}

static struct sk_buff *
gred_dequeue(struct Qdisc* sch)
{
	struct sk_buff *skb;
	struct gred_sched_data *q;
	struct gred_sched *t= qdisc_priv(sch);

	skb = __skb_dequeue(&sch->q);
	if (skb) {
		sch->qstats.backlog -= skb->len;
		q= t->tab[(skb->tc_index&0xf)];
		if (q) {
			q->backlog -= skb->len;
			if (!q->backlog && !gred_wred_mode(t))
				PSCHED_GET_TIME(q->qidlestart);
		} else {
			D2PRINTK("gred_dequeue: skb has bad tcindex %x\n",skb->tc_index&0xf);
		}
		return skb;
	}

	if (gred_wred_mode(t)) {
		q= t->tab[t->def];
		if (!q)
			D2PRINTK("no default VQ set: Results will be "
			       "screwed up\n");
		else
			PSCHED_GET_TIME(q->qidlestart);
	}

	return NULL;
}

static unsigned int gred_drop(struct Qdisc* sch)
{
	struct sk_buff *skb;

	struct gred_sched_data *q;
	struct gred_sched *t= qdisc_priv(sch);

	skb = __skb_dequeue_tail(&sch->q);
	if (skb) {
		unsigned int len = skb->len;
		sch->qstats.backlog -= len;
		sch->qstats.drops++;
		q= t->tab[(skb->tc_index&0xf)];
		if (q) {
			q->backlog -= len;
			q->other++;
			if (!q->backlog && !gred_wred_mode(t))
				PSCHED_GET_TIME(q->qidlestart);
		} else {
			D2PRINTK("gred_drop: skb has bad tcindex %x\n",skb->tc_index&0xf);
		}

		kfree_skb(skb);
		return len;
	}

	q=t->tab[t->def];
	if (!q) {
		D2PRINTK("no default VQ set: Results might be screwed up\n");
		return 0;
	}

	PSCHED_GET_TIME(q->qidlestart);
	return 0;
}

static void gred_reset(struct Qdisc* sch)
{
	int i;
	struct gred_sched_data *q;
	struct gred_sched *t= qdisc_priv(sch);

	__skb_queue_purge(&sch->q);

	sch->qstats.backlog = 0;

	for (i=0;i<t->DPs;i++) {
		q= t->tab[i];
		if (!q)
			continue;
		PSCHED_SET_PASTPERFECT(q->qidlestart);
		q->qave = 0;
		q->qcount = -1;
		q->backlog = 0;
		q->other=0;
		q->forced=0;
		q->pdrop=0;
		q->early=0;
	}
}

static inline void gred_destroy_vq(struct gred_sched_data *q)
{
	kfree(q);
}

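/*
 * Parse a tc_gred_sopt table request: set the number of DPs and the default
 * DP, select RIO/WRED mode, and destroy any VQs that now fall outside the
 * configured range.
 */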
static inline int gred_change_table_def(struct Qdisc *sch, struct rtattr *dps)
{
	struct gred_sched *table = qdisc_priv(sch);
	struct tc_gred_sopt *sopt;
	int i;

	if (dps == NULL || RTA_PAYLOAD(dps) < sizeof(*sopt))
		return -EINVAL;

	sopt = RTA_DATA(dps);

	if (sopt->DPs > MAX_DPs || sopt->DPs == 0 || sopt->def_DP >= sopt->DPs)
		return -EINVAL;

	sch_tree_lock(sch);
	table->DPs = sopt->DPs;
	table->def = sopt->def_DP;

	/*
	 * Every entry point to GRED is synchronized with the above code
	 * and the DP is checked against DPs, i.e. shadowed VQs can no
	 * longer be found so we can unlock right here.
	 */
	sch_tree_unlock(sch);

	if (sopt->grio) {
		gred_enable_rio_mode(table);
		gred_disable_wred_mode(table);
		if (gred_wred_mode_check(sch))
			gred_enable_wred_mode(table);
	} else {
		gred_disable_rio_mode(table);
		gred_disable_wred_mode(table);
	}

	for (i = table->DPs; i < MAX_DPs; i++) {
		if (table->tab[i]) {
			printk(KERN_WARNING "GRED: Warning: Destroying "
			       "shadowed VQ 0x%x\n", i);
			gred_destroy_vq(table->tab[i]);
			table->tab[i] = NULL;
		}
	}

	table->initd = 0;

	return 0;
}

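/*
 * (Re)configure a single VQ: allocate it if needed, copy the RED parameters
 * and Stab table from the tc_gred_qopt, and reset its state and statistics.
 */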
static inline int gred_change_vq(struct Qdisc *sch, int dp,
				 struct tc_gred_qopt *ctl, int prio, u8 *stab)
{
	struct gred_sched *table = qdisc_priv(sch);
	struct gred_sched_data *q;

	if (table->tab[dp] == NULL) {
		table->tab[dp] = kmalloc(sizeof(*q), GFP_KERNEL);
		if (table->tab[dp] == NULL)
			return -ENOMEM;
		memset(table->tab[dp], 0, sizeof(*q));
	}

	q = table->tab[dp];
	q->DP = dp;
	q->prio = prio;

	q->Wlog = ctl->Wlog;
	q->Plog = ctl->Plog;
	q->limit = ctl->limit;
	q->Scell_log = ctl->Scell_log;
	q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL;
	q->Scell_max = (255<<q->Scell_log);
	q->qth_min = ctl->qth_min<<ctl->Wlog;
	q->qth_max = ctl->qth_max<<ctl->Wlog;
	q->qave=0;
	q->backlog=0;
	q->qcount = -1;
	q->other=0;
	q->forced=0;
	q->pdrop=0;
	q->early=0;

	PSCHED_SET_PASTPERFECT(q->qidlestart);
	memcpy(q->Stab, stab, 256);

	return 0;
}

static int gred_change(struct Qdisc *sch, struct rtattr *opt)
{
	struct gred_sched *table = qdisc_priv(sch);
	struct tc_gred_qopt *ctl;
	struct rtattr *tb[TCA_GRED_MAX];
	int err = -EINVAL, prio = GRED_DEF_PRIO;
	u8 *stab;

	if (opt == NULL || rtattr_parse_nested(tb, TCA_GRED_MAX, opt))
		return -EINVAL;

	if (tb[TCA_GRED_PARMS-1] == NULL && tb[TCA_GRED_STAB-1] == NULL)
		return gred_change_table_def(sch, opt);

	if (tb[TCA_GRED_PARMS-1] == NULL ||
	    RTA_PAYLOAD(tb[TCA_GRED_PARMS-1]) < sizeof(*ctl) ||
	    tb[TCA_GRED_STAB-1] == NULL ||
	    RTA_PAYLOAD(tb[TCA_GRED_STAB-1]) < 256)
		return -EINVAL;

	ctl = RTA_DATA(tb[TCA_GRED_PARMS-1]);
	stab = RTA_DATA(tb[TCA_GRED_STAB-1]);

	if (ctl->DP >= table->DPs)
		goto errout;

	if (gred_rio_mode(table)) {
		if (ctl->prio == 0) {
			int def_prio = GRED_DEF_PRIO;

			if (table->tab[table->def])
				def_prio = table->tab[table->def]->prio;

			printk(KERN_DEBUG "GRED: DP %u does not have a prio, "
			       "setting default to %d\n", ctl->DP, def_prio);

			prio = def_prio;
		} else
			prio = ctl->prio;
	}

	sch_tree_lock(sch);

	err = gred_change_vq(sch, ctl->DP, ctl, prio, stab);
	if (err < 0)
		goto errout_locked;

	if (table->tab[table->def] == NULL) {
		if (gred_rio_mode(table))
			prio = table->tab[ctl->DP]->prio;

		err = gred_change_vq(sch, table->def, ctl, prio, stab);
		if (err < 0)
			goto errout_locked;
	}

	table->initd = 1;

	if (gred_rio_mode(table)) {
		gred_disable_wred_mode(table);
		if (gred_wred_mode_check(sch))
			gred_enable_wred_mode(table);
	}

	err = 0;

errout_locked:
	sch_tree_unlock(sch);
errout:
	return err;
}

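/*
 * Qdisc init: only the table defaults (TCA_GRED_DPS) are accepted here;
 * per-VQ parameters must be supplied later via gred_change().
 */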
static int gred_init(struct Qdisc *sch, struct rtattr *opt)
{
	struct rtattr *tb[TCA_GRED_MAX];

	if (opt == NULL || rtattr_parse_nested(tb, TCA_GRED_MAX, opt))
		return -EINVAL;

	if (tb[TCA_GRED_PARMS-1] || tb[TCA_GRED_STAB-1])
		return -EINVAL;

	return gred_change_table_def(sch, tb[TCA_GRED_DPS-1]);
}

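/*
 * Dump the table options plus one tc_gred_qopt per possible DP; slots
 * without a configured VQ are flagged by setting opt.DP to MAX_DPs + i.
 */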
static int gred_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct gred_sched *table = qdisc_priv(sch);
	struct rtattr *parms, *opts = NULL;
	int i;
	struct tc_gred_sopt sopt = {
		.DPs	= table->DPs,
		.def_DP	= table->def,
		.grio	= gred_rio_mode(table),
	};

	opts = RTA_NEST(skb, TCA_OPTIONS);
	RTA_PUT(skb, TCA_GRED_DPS, sizeof(sopt), &sopt);
	parms = RTA_NEST(skb, TCA_GRED_PARMS);

	for (i = 0; i < MAX_DPs; i++) {
		struct gred_sched_data *q = table->tab[i];
		struct tc_gred_qopt opt;

		memset(&opt, 0, sizeof(opt));

		if (!q) {
			/* hack -- fix at some point with a proper message.
			   This is how we indicate to tc that there is no VQ
			   at this DP. */

			opt.DP = MAX_DPs + i;
			goto append_opt;
		}

		opt.limit	= q->limit;
		opt.DP		= q->DP;
		opt.backlog	= q->backlog;
		opt.prio	= q->prio;
		opt.qth_min	= q->qth_min >> q->Wlog;
		opt.qth_max	= q->qth_max >> q->Wlog;
		opt.Wlog	= q->Wlog;
		opt.Plog	= q->Plog;
		opt.Scell_log	= q->Scell_log;
		opt.other	= q->other;
		opt.early	= q->early;
		opt.forced	= q->forced;
		opt.pdrop	= q->pdrop;
		opt.packets	= q->packetsin;
		opt.bytesin	= q->bytesin;

		if (q->qave) {
			if (gred_wred_mode(table)) {
				q->qidlestart=table->tab[table->def]->qidlestart;
				q->qave=table->tab[table->def]->qave;
			}
			if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) {
				long idle;
				unsigned long qave;
				psched_time_t now;
				PSCHED_GET_TIME(now);
				idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max);
				qave  = q->qave >> q->Stab[(idle>>q->Scell_log)&0xFF];
				opt.qave = qave >> q->Wlog;

			} else {
				opt.qave = q->qave >> q->Wlog;
			}
		}

append_opt:
		RTA_APPEND(skb, sizeof(opt), &opt);
	}

	RTA_NEST_END(skb, parms);

	return RTA_NEST_END(skb, opts);

rtattr_failure:
	return RTA_NEST_CANCEL(skb, opts);
}

static void gred_destroy(struct Qdisc *sch)
{
	struct gred_sched *table = qdisc_priv(sch);
	int i;

	for (i = 0; i < table->DPs; i++) {
		if (table->tab[i])
			gred_destroy_vq(table->tab[i]);
	}
}

static struct Qdisc_ops gred_qdisc_ops = {
	.next		=	NULL,
	.cl_ops		=	NULL,
	.id		=	"gred",
	.priv_size	=	sizeof(struct gred_sched),
	.enqueue	=	gred_enqueue,
	.dequeue	=	gred_dequeue,
	.requeue	=	gred_requeue,
	.drop		=	gred_drop,
	.init		=	gred_init,
	.reset		=	gred_reset,
	.destroy	=	gred_destroy,
	.change		=	gred_change,
	.dump		=	gred_dump,
	.owner		=	THIS_MODULE,
};

static int __init gred_module_init(void)
{
	return register_qdisc(&gred_qdisc_ops);
}

static void __exit gred_module_exit(void)
{
	unregister_qdisc(&gred_qdisc_ops);
}

module_init(gred_module_init)
module_exit(gred_module_exit)
MODULE_LICENSE("GPL");