sch_gred.c revision a8aaa9958eea2420e13d5a00c3fae934e0a3889e
/*
 * net/sched/sch_gred.c	Generic Random Early Detection queue.
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * Authors:    J Hadi Salim (hadi@cyberus.ca) 1998-2002
 *
 *             991129: -  Bug fix with grio mode
 *		       - a better single AvgQ mode with Grio(WRED)
 *		       - A finer grained VQ dequeue based on suggestion
 *		         from Ren Liu
 *		       - More error checks
 *
 *  For all the glorious comments look at Alexey's sch_red.c
 */
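
/*
 * Example configuration, using iproute2's tc (illustrative only; the
 * device name and parameter values below are hypothetical):
 *
 *   # create the table: 3 virtual queues (DPs), DP 2 is the default,
 *   # grio enables priority (RIO-like) buffer sharing
 *   tc qdisc add dev eth0 root gred setup DPs 3 default 2 grio
 *
 *   # then parameterize each virtual queue
 *   tc qdisc change dev eth0 root gred limit 60KB min 15KB max 45KB \
 *      burst 20 avpkt 1000 bandwidth 10Mbit DP 1 probability 0.02 prio 2
 */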

#include <linux/config.h>
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/notifier.h>
#include <net/ip.h>
#include <net/route.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/pkt_sched.h>

#if 1 /* control */
#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args)
#else
#define DPRINTK(format,args...)
#endif

#if 0 /* data */
#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args)
#else
#define D2PRINTK(format,args...)
#endif

struct gred_sched_data;
struct gred_sched;

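/* Per-virtual-queue (per-DP) RED parameters and state. */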
struct gred_sched_data
{
/* Parameters */
	u32		limit;		/* HARD maximal queue length	*/
	u32		qth_min;	/* Min average length threshold: A scaled */
	u32		qth_max;	/* Max average length threshold: A scaled */
	u32		DP;		/* drop precedence (index of this VQ) */
	char		Wlog;		/* log(W)		*/
	char		Plog;		/* random number bits	*/
	u32		Scell_max;
	u32		Rmask;
	u32		bytesin;	/* bytes seen on virtualQ so far */
	u32		packetsin;	/* packets seen on virtualQ so far */
	u32		backlog;	/* bytes on the virtualQ */
	u32		forced;		/* packets dropped for exceeding limits */
	u32		early;		/* packets dropped as a warning */
	u32		other;		/* packets dropped by invoking drop() */
	u32		pdrop;		/* packets dropped because we exceeded physical queue limits */
	char		Scell_log;
	u8		Stab[256];
	u8		prio;		/* the prio of this vq */

/* Variables */
	unsigned long	qave;		/* Average queue length: A scaled */
	int		qcount;		/* Packets since last random number generation */
	u32		qR;		/* Cached random number */

	psched_time_t	qidlestart;	/* Start of idle period	*/
};

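/* Operating modes, kept as bits in gred_sched.flags. */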
enum {
	GRED_WRED_MODE = 1,
	GRED_RIO_MODE,
};

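/* Per-qdisc state: a table of up to MAX_DPs virtual queues. */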
struct gred_sched
{
	struct gred_sched_data *tab[MAX_DPs];
	unsigned long	flags;
	u32		DPs;
	u32		def;
	u8		initd;
};

static inline int gred_wred_mode(struct gred_sched *table)
{
	return test_bit(GRED_WRED_MODE, &table->flags);
}

static inline void gred_enable_wred_mode(struct gred_sched *table)
{
	__set_bit(GRED_WRED_MODE, &table->flags);
}

static inline void gred_disable_wred_mode(struct gred_sched *table)
{
	__clear_bit(GRED_WRED_MODE, &table->flags);
}

static inline int gred_rio_mode(struct gred_sched *table)
{
	return test_bit(GRED_RIO_MODE, &table->flags);
}

static inline void gred_enable_rio_mode(struct gred_sched *table)
{
	__set_bit(GRED_RIO_MODE, &table->flags);
}

static inline void gred_disable_rio_mode(struct gred_sched *table)
{
	__clear_bit(GRED_RIO_MODE, &table->flags);
}

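/*
 * Returns 1 if two or more virtual queues share the same priority, in
 * which case the qdisc must run in WRED mode (one shared average queue).
 */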
static inline int gred_wred_mode_check(struct Qdisc *sch)
{
	struct gred_sched *table = qdisc_priv(sch);
	int i;

	/* Really ugly O(n^2), but it shouldn't be needed too frequently. */
	for (i = 0; i < table->DPs; i++) {
		struct gred_sched_data *q = table->tab[i];
		int n;

		if (q == NULL)
			continue;

		for (n = 0; n < table->DPs; n++)
			if (table->tab[n] && table->tab[n] != q &&
			    table->tab[n]->prio == q->prio)
				return 1;
	}

	return 0;
}

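/*
 * Enqueue on the virtual queue selected by the low bits of skb->tc_index.
 * Updates the (per-VQ or shared) average queue length and makes the RED
 * decision: queue the packet, drop it early, or drop it forcibly.
 */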
static int
gred_enqueue(struct sk_buff *skb, struct Qdisc* sch)
{
	psched_time_t now;
	struct gred_sched_data *q=NULL;
	struct gred_sched *t= qdisc_priv(sch);
	unsigned long	qave=0;
	int i=0;

	if (!t->initd && skb_queue_len(&sch->q) < (sch->dev->tx_queue_len ? : 1)) {
		D2PRINTK("No GRED queues set up yet! Enqueued anyway\n");
		goto do_enqueue;
	}

	if ( ((skb->tc_index&0xf) > (t->DPs -1)) || !(q=t->tab[skb->tc_index&0xf])) {
		printk("GRED: setting to default (%d)\n",t->def);
		if (!(q=t->tab[t->def])) {
			DPRINTK("GRED: setting to default FAILED! dropping!! "
			    "(%d)\n", t->def);
			goto drop;
		}
		/* fix tc_index? --could be controversial but needed for
		   requeueing */
		skb->tc_index=(skb->tc_index&0xfffffff0) | t->def;
	}

	D2PRINTK("gred_enqueue virtualQ 0x%x classid %x backlog %d "
	    "general backlog %d\n",skb->tc_index&0xf,sch->handle,q->backlog,
	    sch->qstats.backlog);
	/* sum up the qaves of all non-idle VQs with prio below ours
	   to get the new qave */
	if (!gred_wred_mode(t) && gred_rio_mode(t)) {
		for (i=0;i<t->DPs;i++) {
			if ((!t->tab[i]) || (i==q->DP))
				continue;

			if ((t->tab[i]->prio < q->prio) && (PSCHED_IS_PASTPERFECT(t->tab[i]->qidlestart)))
				qave +=t->tab[i]->qave;
		}
	}

	q->packetsin++;
	q->bytesin+=skb->len;

	if (gred_wred_mode(t)) {
		qave=0;
		q->qave=t->tab[t->def]->qave;
		q->qidlestart=t->tab[t->def]->qidlestart;
	}

	if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) {
		long us_idle;
		PSCHED_GET_TIME(now);
		us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max);
		PSCHED_SET_PASTPERFECT(q->qidlestart);

		q->qave >>= q->Stab[(us_idle>>q->Scell_log)&0xFF];
	} else {
		if (gred_wred_mode(t)) {
			q->qave += sch->qstats.backlog - (q->qave >> q->Wlog);
		} else {
			q->qave += q->backlog - (q->qave >> q->Wlog);
		}
	}

	if (gred_wred_mode(t))
		t->tab[t->def]->qave=q->qave;

	if ((q->qave+qave) < q->qth_min) {
		q->qcount = -1;
enqueue:
		if (q->backlog + skb->len <= q->limit) {
			q->backlog += skb->len;
do_enqueue:
			__skb_queue_tail(&sch->q, skb);
			sch->qstats.backlog += skb->len;
			sch->bstats.bytes += skb->len;
			sch->bstats.packets++;
			return 0;
		} else {
			q->pdrop++;
		}

drop:
		kfree_skb(skb);
		sch->qstats.drops++;
		return NET_XMIT_DROP;
	}
	if ((q->qave+qave) >= q->qth_max) {
		q->qcount = -1;
		sch->qstats.overlimits++;
		q->forced++;
		goto drop;
	}
	if (++q->qcount) {
		if ((((qave+q->qave) - q->qth_min)>>q->Wlog)*q->qcount < q->qR)
			goto enqueue;
		q->qcount = 0;
		q->qR = net_random()&q->Rmask;
		sch->qstats.overlimits++;
		q->early++;
		goto drop;
	}
	q->qR = net_random()&q->Rmask;
	goto enqueue;
}

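/*
 * Put a previously dequeued packet back at the head of the queue and
 * restore the backlog accounting of its virtual queue.
 */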
static int
gred_requeue(struct sk_buff *skb, struct Qdisc* sch)
{
	struct gred_sched_data *q;
	struct gred_sched *t= qdisc_priv(sch);
	q= t->tab[(skb->tc_index&0xf)];
/* error checking here -- probably unnecessary */
	PSCHED_SET_PASTPERFECT(q->qidlestart);

	__skb_queue_head(&sch->q, skb);
	sch->qstats.backlog += skb->len;
	sch->qstats.requeues++;
	q->backlog += skb->len;
	return 0;
}

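/*
 * Take a packet off the head of the queue and update the backlog of the
 * virtual queue it came from; start the idle period clock when a VQ (or,
 * in WRED mode, the whole qdisc) drains empty.
 */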
static struct sk_buff *
gred_dequeue(struct Qdisc* sch)
{
	struct sk_buff *skb;
	struct gred_sched_data *q;
	struct gred_sched *t= qdisc_priv(sch);

	skb = __skb_dequeue(&sch->q);
	if (skb) {
		sch->qstats.backlog -= skb->len;
		q= t->tab[(skb->tc_index&0xf)];
		if (q) {
			q->backlog -= skb->len;
			if (!q->backlog && !gred_wred_mode(t))
				PSCHED_GET_TIME(q->qidlestart);
		} else {
			D2PRINTK("gred_dequeue: skb has bad tcindex %x\n",skb->tc_index&0xf);
		}
		return skb;
	}

	if (gred_wred_mode(t)) {
		q= t->tab[t->def];
		if (!q)
			D2PRINTK("no default VQ set: Results will be "
			       "screwed up\n");
		else
			PSCHED_GET_TIME(q->qidlestart);
	}

	return NULL;
}

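/*
 * Drop a packet from the tail of the queue; used by the qdisc core to
 * reclaim space. Returns the length of the dropped packet, or 0 if the
 * queue was empty.
 */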
static unsigned int gred_drop(struct Qdisc* sch)
{
	struct sk_buff *skb;

	struct gred_sched_data *q;
	struct gred_sched *t= qdisc_priv(sch);

	skb = __skb_dequeue_tail(&sch->q);
	if (skb) {
		unsigned int len = skb->len;
		sch->qstats.backlog -= len;
		sch->qstats.drops++;
		q= t->tab[(skb->tc_index&0xf)];
		if (q) {
			q->backlog -= len;
			q->other++;
			if (!q->backlog && !gred_wred_mode(t))
				PSCHED_GET_TIME(q->qidlestart);
		} else {
			D2PRINTK("gred_drop: skb has bad tcindex %x\n",skb->tc_index&0xf);
		}

		kfree_skb(skb);
		return len;
	}

	q=t->tab[t->def];
	if (!q) {
		D2PRINTK("no default VQ set: Results might be screwed up\n");
		return 0;
	}

	PSCHED_GET_TIME(q->qidlestart);
	return 0;
}

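/* Purge all queued packets and reset every VQ's averages and counters. */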
static void gred_reset(struct Qdisc* sch)
{
	int i;
	struct gred_sched_data *q;
	struct gred_sched *t= qdisc_priv(sch);

	__skb_queue_purge(&sch->q);

	sch->qstats.backlog = 0;

	for (i=0;i<t->DPs;i++) {
		q= t->tab[i];
		if (!q)
			continue;
		PSCHED_SET_PASTPERFECT(q->qidlestart);
		q->qave = 0;
		q->qcount = -1;
		q->backlog = 0;
		q->other=0;
		q->forced=0;
		q->pdrop=0;
		q->early=0;
	}
}

static inline void gred_destroy_vq(struct gred_sched_data *q)
{
	kfree(q);
}

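/*
 * Apply a TCA_GRED_DPS table setup: set the number of DPs and the
 * default DP, pick the operating mode, and free any now-shadowed VQs.
 */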
static inline int gred_change_table_def(struct Qdisc *sch, struct rtattr *dps)
{
	struct gred_sched *table = qdisc_priv(sch);
	struct tc_gred_sopt *sopt;
	int i;

	if (dps == NULL || RTA_PAYLOAD(dps) < sizeof(*sopt))
		return -EINVAL;

	sopt = RTA_DATA(dps);

	if (sopt->DPs > MAX_DPs || sopt->DPs == 0 || sopt->def_DP >= sopt->DPs)
		return -EINVAL;

	sch_tree_lock(sch);
	table->DPs = sopt->DPs;
	table->def = sopt->def_DP;

	/*
	 * Every entry point to GRED is synchronized with the above code
	 * and the DP is checked against DPs, i.e. shadowed VQs can no
	 * longer be found, so we can unlock right here.
	 */
	sch_tree_unlock(sch);

	if (sopt->grio) {
		gred_enable_rio_mode(table);
		gred_disable_wred_mode(table);
		if (gred_wred_mode_check(sch))
			gred_enable_wred_mode(table);
	} else {
		gred_disable_rio_mode(table);
		gred_disable_wred_mode(table);
	}

	for (i = table->DPs; i < MAX_DPs; i++) {
		if (table->tab[i]) {
			printk(KERN_WARNING "GRED: Warning: Destroying "
			       "shadowed VQ 0x%x\n", i);
			gred_destroy_vq(table->tab[i]);
			table->tab[i] = NULL;
		}
	}

	table->initd = 0;

	return 0;
}

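/*
 * Configure a single virtual queue from TCA_GRED_PARMS/TCA_GRED_STAB,
 * allocating it on first use; with only TCA_GRED_DPS present this is a
 * table setup instead. The first configured VQ also seeds the default
 * VQ until that one is configured explicitly.
 */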
static int gred_change(struct Qdisc *sch, struct rtattr *opt)
{
	struct gred_sched *table = qdisc_priv(sch);
	struct gred_sched_data *q;
	struct tc_gred_qopt *ctl;
	struct rtattr *tb[TCA_GRED_MAX];

	if (opt == NULL || rtattr_parse_nested(tb, TCA_GRED_MAX, opt))
		return -EINVAL;

	if (tb[TCA_GRED_PARMS-1] == NULL && tb[TCA_GRED_STAB-1] == NULL)
		return gred_change_table_def(sch, tb[TCA_GRED_DPS-1]);

	if (!table->DPs || tb[TCA_GRED_PARMS-1] == NULL || tb[TCA_GRED_STAB-1] == NULL ||
		RTA_PAYLOAD(tb[TCA_GRED_PARMS-1]) < sizeof(*ctl) ||
		RTA_PAYLOAD(tb[TCA_GRED_STAB-1]) < 256)
			return -EINVAL;

	ctl = RTA_DATA(tb[TCA_GRED_PARMS-1]);

	if (ctl->DP >= table->DPs)
		return -EINVAL;

	if (table->tab[ctl->DP] == NULL) {
		table->tab[ctl->DP]=kmalloc(sizeof(struct gred_sched_data),
					    GFP_KERNEL);
		if (NULL == table->tab[ctl->DP])
			return -ENOMEM;
		memset(table->tab[ctl->DP], 0, (sizeof(struct gred_sched_data)));
	}
	q= table->tab[ctl->DP];

	if (gred_rio_mode(table)) {
		if (ctl->prio <=0) {
			if (table->def && table->tab[table->def]) {
				DPRINTK("\nGRED: DP %u does not have a prio, "
					"setting default to %d\n",ctl->DP,
					table->tab[table->def]->prio);
				q->prio=table->tab[table->def]->prio;
			} else {
				DPRINTK("\nGRED: DP %u does not have a prio,"
					" setting default to 8\n",ctl->DP);
				q->prio=8;
			}
		} else {
			q->prio=ctl->prio;
		}
	} else {
		q->prio=8;
	}

	q->DP=ctl->DP;
	q->Wlog = ctl->Wlog;
	q->Plog = ctl->Plog;
	q->limit = ctl->limit;
	q->Scell_log = ctl->Scell_log;
	q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL;
	q->Scell_max = (255<<q->Scell_log);
	q->qth_min = ctl->qth_min<<ctl->Wlog;
	q->qth_max = ctl->qth_max<<ctl->Wlog;
	q->qave=0;
	q->backlog=0;
	q->qcount = -1;
	q->other=0;
	q->forced=0;
	q->pdrop=0;
	q->early=0;

	PSCHED_SET_PASTPERFECT(q->qidlestart);
	memcpy(q->Stab, RTA_DATA(tb[TCA_GRED_STAB-1]), 256);

	if (gred_rio_mode(table)) {
		gred_disable_wred_mode(table);
		if (gred_wred_mode_check(sch))
			gred_enable_wred_mode(table);
	}

	if (!table->initd) {
		table->initd=1;
		/*
		 * The first entry also goes into the default VQ until
		 * it is overwritten.
		 */
		if (table->tab[table->def] == NULL) {
			table->tab[table->def]=
				kmalloc(sizeof(struct gred_sched_data), GFP_KERNEL);
			if (NULL == table->tab[table->def])
				return -ENOMEM;

			memset(table->tab[table->def], 0,
			       (sizeof(struct gred_sched_data)));
		}
		q= table->tab[table->def];
		q->DP=table->def;
		q->Wlog = ctl->Wlog;
		q->Plog = ctl->Plog;
		q->limit = ctl->limit;
		q->Scell_log = ctl->Scell_log;
		q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL;
		q->Scell_max = (255<<q->Scell_log);
		q->qth_min = ctl->qth_min<<ctl->Wlog;
		q->qth_max = ctl->qth_max<<ctl->Wlog;

		if (gred_rio_mode(table))
			q->prio=table->tab[ctl->DP]->prio;
		else
			q->prio=8;

		q->qcount = -1;
		PSCHED_SET_PASTPERFECT(q->qidlestart);
		memcpy(q->Stab, RTA_DATA(tb[TCA_GRED_STAB-1]), 256);
	}
	return 0;
}

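/*
 * Qdisc init: only a TCA_GRED_DPS table setup is valid here; per-VQ
 * parameters must come through a later change operation.
 */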
static int gred_init(struct Qdisc *sch, struct rtattr *opt)
{
	struct rtattr *tb[TCA_GRED_MAX];

	if (opt == NULL || rtattr_parse_nested(tb, TCA_GRED_MAX, opt))
		return -EINVAL;

	if (tb[TCA_GRED_PARMS-1] || tb[TCA_GRED_STAB-1])
		return -EINVAL;

	return gred_change_table_def(sch, tb[TCA_GRED_DPS-1]);
}

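/*
 * Dump the table setup plus one tc_gred_qopt per possible DP; empty
 * slots are flagged by reporting a DP value >= MAX_DPs.
 */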
static int gred_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct gred_sched *table = qdisc_priv(sch);
	struct rtattr *parms, *opts = NULL;
	int i;
	struct tc_gred_sopt sopt = {
		.DPs	= table->DPs,
		.def_DP	= table->def,
		.grio	= gred_rio_mode(table),
	};

	opts = RTA_NEST(skb, TCA_OPTIONS);
	RTA_PUT(skb, TCA_GRED_DPS, sizeof(sopt), &sopt);
	parms = RTA_NEST(skb, TCA_GRED_PARMS);

	for (i = 0; i < MAX_DPs; i++) {
		struct gred_sched_data *q = table->tab[i];
		struct tc_gred_qopt opt;

		memset(&opt, 0, sizeof(opt));

		if (!q) {
			/* hack -- fix at some point with a proper message.
			   This is how we indicate to tc that there is no VQ
			   at this DP. */
			opt.DP = MAX_DPs + i;
			goto append_opt;
		}

		opt.limit	= q->limit;
		opt.DP		= q->DP;
		opt.backlog	= q->backlog;
		opt.prio	= q->prio;
		opt.qth_min	= q->qth_min >> q->Wlog;
		opt.qth_max	= q->qth_max >> q->Wlog;
		opt.Wlog	= q->Wlog;
		opt.Plog	= q->Plog;
		opt.Scell_log	= q->Scell_log;
		opt.other	= q->other;
		opt.early	= q->early;
		opt.forced	= q->forced;
		opt.pdrop	= q->pdrop;
		opt.packets	= q->packetsin;
		opt.bytesin	= q->bytesin;

		if (q->qave) {
			if (gred_wred_mode(table)) {
				q->qidlestart=table->tab[table->def]->qidlestart;
				q->qave=table->tab[table->def]->qave;
			}
			if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) {
				long idle;
				unsigned long qave;
				psched_time_t now;
				PSCHED_GET_TIME(now);
				idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max);
				qave  = q->qave >> q->Stab[(idle>>q->Scell_log)&0xFF];
				opt.qave = qave >> q->Wlog;
			} else {
				opt.qave = q->qave >> q->Wlog;
			}
		}

append_opt:
		RTA_APPEND(skb, sizeof(opt), &opt);
	}

	RTA_NEST_END(skb, parms);

	return RTA_NEST_END(skb, opts);

rtattr_failure:
	return RTA_NEST_CANCEL(skb, opts);
}

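/* Free all virtual queues when the qdisc is torn down. */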
static void gred_destroy(struct Qdisc *sch)
{
	struct gred_sched *table = qdisc_priv(sch);
	int i;

	for (i = 0; i < table->DPs; i++) {
		if (table->tab[i])
			gred_destroy_vq(table->tab[i]);
	}
}

static struct Qdisc_ops gred_qdisc_ops = {
	.next		=	NULL,
	.cl_ops		=	NULL,
	.id		=	"gred",
	.priv_size	=	sizeof(struct gred_sched),
	.enqueue	=	gred_enqueue,
	.dequeue	=	gred_dequeue,
	.requeue	=	gred_requeue,
	.drop		=	gred_drop,
	.init		=	gred_init,
	.reset		=	gred_reset,
	.destroy	=	gred_destroy,
	.change		=	gred_change,
	.dump		=	gred_dump,
	.owner		=	THIS_MODULE,
};

static int __init gred_module_init(void)
{
	return register_qdisc(&gred_qdisc_ops);
}

static void __exit gred_module_exit(void)
{
	unregister_qdisc(&gred_qdisc_ops);
}

module_init(gred_module_init)
module_exit(gred_module_exit)

MODULE_LICENSE("GPL");