sch_gred.c revision 7051703b990ec40bdf192ec7c87ffafd7011c640
1/*
2 * net/sched/sch_gred.c	Generic Random Early Detection queue.
3 *
4 *
5 *              This program is free software; you can redistribute it and/or
6 *              modify it under the terms of the GNU General Public License
7 *              as published by the Free Software Foundation; either version
8 *              2 of the License, or (at your option) any later version.
9 *
10 * Authors:    J Hadi Salim (hadi@cyberus.ca) 1998-2002
11 *
12 *             991129: -  Bug fix with grio mode
13 *		       - a better sing. AvgQ mode with Grio(WRED)
14 *		       - A finer grained VQ dequeue based on sugestion
15 *		         from Ren Liu
16 *		       - More error checks
17 *
18 *
19 *
20 *  For all the glorious comments look at Alexey's sch_red.c
21 */
22
23#include <linux/config.h>
24#include <linux/module.h>
25#include <asm/uaccess.h>
26#include <asm/system.h>
27#include <linux/bitops.h>
28#include <linux/types.h>
29#include <linux/kernel.h>
30#include <linux/sched.h>
31#include <linux/string.h>
32#include <linux/mm.h>
33#include <linux/socket.h>
34#include <linux/sockios.h>
35#include <linux/in.h>
36#include <linux/errno.h>
37#include <linux/interrupt.h>
38#include <linux/if_ether.h>
39#include <linux/inet.h>
40#include <linux/netdevice.h>
41#include <linux/etherdevice.h>
42#include <linux/notifier.h>
43#include <net/ip.h>
44#include <net/route.h>
45#include <linux/skbuff.h>
46#include <net/sock.h>
47#include <net/pkt_sched.h>
48#include <net/red.h>
49
50#if 1 /* control */
51#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args)
52#else
53#define DPRINTK(format,args...)
54#endif
55
56#if 0 /* data */
57#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args)
58#else
59#define D2PRINTK(format,args...)
60#endif
61
62#define GRED_DEF_PRIO (MAX_DPs / 2)
63#define GRED_VQ_MASK (MAX_DPs - 1)
64
65struct gred_sched_data;
66struct gred_sched;
67
68struct gred_sched_data
69{
70	u32		limit;		/* HARD maximal queue length	*/
71	u32      	DP;		/* the drop pramaters */
72	u32		bytesin;	/* bytes seen on virtualQ so far*/
73	u32		packetsin;	/* packets seen on virtualQ so far*/
74	u32		backlog;	/* bytes on the virtualQ */
75	u8              prio;        /* the prio of this vq */
76
77	struct red_parms parms;
78	struct red_stats stats;
79};
80
81enum {
82	GRED_WRED_MODE = 1,
83	GRED_RIO_MODE,
84};
85
86struct gred_sched
87{
88	struct gred_sched_data *tab[MAX_DPs];
89	unsigned long	flags;
90	u32 		DPs;
91	u32 		def;
92	struct red_parms wred_set;
93};
94
95static inline int gred_wred_mode(struct gred_sched *table)
96{
97	return test_bit(GRED_WRED_MODE, &table->flags);
98}
99
100static inline void gred_enable_wred_mode(struct gred_sched *table)
101{
102	__set_bit(GRED_WRED_MODE, &table->flags);
103}
104
105static inline void gred_disable_wred_mode(struct gred_sched *table)
106{
107	__clear_bit(GRED_WRED_MODE, &table->flags);
108}
109
110static inline int gred_rio_mode(struct gred_sched *table)
111{
112	return test_bit(GRED_RIO_MODE, &table->flags);
113}
114
115static inline void gred_enable_rio_mode(struct gred_sched *table)
116{
117	__set_bit(GRED_RIO_MODE, &table->flags);
118}
119
120static inline void gred_disable_rio_mode(struct gred_sched *table)
121{
122	__clear_bit(GRED_RIO_MODE, &table->flags);
123}
124
125static inline int gred_wred_mode_check(struct Qdisc *sch)
126{
127	struct gred_sched *table = qdisc_priv(sch);
128	int i;
129
130	/* Really ugly O(n^2) but shouldn't be necessary too frequent. */
131	for (i = 0; i < table->DPs; i++) {
132		struct gred_sched_data *q = table->tab[i];
133		int n;
134
135		if (q == NULL)
136			continue;
137
138		for (n = 0; n < table->DPs; n++)
139			if (table->tab[n] && table->tab[n] != q &&
140			    table->tab[n]->prio == q->prio)
141				return 1;
142	}
143
144	return 0;
145}
146
147static inline unsigned int gred_backlog(struct gred_sched *table,
148					struct gred_sched_data *q,
149					struct Qdisc *sch)
150{
151	if (gred_wred_mode(table))
152		return sch->qstats.backlog;
153	else
154		return q->backlog;
155}
156
157static inline u16 tc_index_to_dp(struct sk_buff *skb)
158{
159	return skb->tc_index & GRED_VQ_MASK;
160}
161
162static inline void gred_load_wred_set(struct gred_sched *table,
163				      struct gred_sched_data *q)
164{
165	q->parms.qavg = table->wred_set.qavg;
166	q->parms.qidlestart = table->wred_set.qidlestart;
167}
168
169static inline void gred_store_wred_set(struct gred_sched *table,
170				       struct gred_sched_data *q)
171{
172	table->wred_set.qavg = q->parms.qavg;
173}
174
175static int
176gred_enqueue(struct sk_buff *skb, struct Qdisc* sch)
177{
178	struct gred_sched_data *q=NULL;
179	struct gred_sched *t= qdisc_priv(sch);
180	unsigned long qavg = 0;
181	int i=0;
182	u16 dp = tc_index_to_dp(skb);
183
184	if (dp >= t->DPs  || (q = t->tab[dp]) == NULL) {
185		dp = t->def;
186
187		if ((q = t->tab[dp]) == NULL) {
188			/* Pass through packets not assigned to a DP
189			 * if no default DP has been configured. This
190			 * allows for DP flows to be left untouched.
191			 */
192			if (skb_queue_len(&sch->q) < sch->dev->tx_queue_len)
193				return qdisc_enqueue_tail(skb, sch);
194			else
195				goto drop;
196		}
197
198		/* fix tc_index? --could be controvesial but needed for
199		   requeueing */
200		skb->tc_index = (skb->tc_index & ~GRED_VQ_MASK) | dp;
201	}
202
203	/* sum up all the qaves of prios <= to ours to get the new qave*/
204	if (!gred_wred_mode(t) && gred_rio_mode(t)) {
205		for (i=0;i<t->DPs;i++) {
206			if ((!t->tab[i]) || (i==q->DP))
207				continue;
208
209			if (t->tab[i]->prio < q->prio &&
210			    !red_is_idling(&t->tab[i]->parms))
211				qavg +=t->tab[i]->parms.qavg;
212		}
213
214	}
215
216	q->packetsin++;
217	q->bytesin+=skb->len;
218
219	if (gred_wred_mode(t)) {
220		qavg = 0;
221		gred_load_wred_set(t, q);
222	}
223
224	q->parms.qavg = red_calc_qavg(&q->parms, gred_backlog(t, q, sch));
225
226	if (red_is_idling(&q->parms))
227		red_end_of_idle_period(&q->parms);
228
229	if (gred_wred_mode(t))
230		gred_store_wred_set(t, q);
231
232	switch (red_action(&q->parms, q->parms.qavg + qavg)) {
233		case RED_DONT_MARK:
234			break;
235
236		case RED_PROB_MARK:
237			sch->qstats.overlimits++;
238			q->stats.prob_drop++;
239			goto congestion_drop;
240
241		case RED_HARD_MARK:
242			sch->qstats.overlimits++;
243			q->stats.forced_drop++;
244			goto congestion_drop;
245	}
246
247	if (q->backlog + skb->len <= q->limit) {
248		q->backlog += skb->len;
249		return qdisc_enqueue_tail(skb, sch);
250	}
251
252	q->stats.pdrop++;
253drop:
254	return qdisc_drop(skb, sch);
255
256congestion_drop:
257	qdisc_drop(skb, sch);
258	return NET_XMIT_CN;
259}
260
261static int
262gred_requeue(struct sk_buff *skb, struct Qdisc* sch)
263{
264	struct gred_sched *t = qdisc_priv(sch);
265	struct gred_sched_data *q;
266	u16 dp = tc_index_to_dp(skb);
267
268	if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
269		if (net_ratelimit())
270			printk(KERN_WARNING "GRED: Unable to relocate VQ 0x%x "
271			       "for requeue, screwing up backlog.\n",
272			       tc_index_to_dp(skb));
273	} else {
274		if (red_is_idling(&q->parms))
275			red_end_of_idle_period(&q->parms);
276		q->backlog += skb->len;
277	}
278
279	return qdisc_requeue(skb, sch);
280}
281
282static struct sk_buff *
283gred_dequeue(struct Qdisc* sch)
284{
285	struct sk_buff *skb;
286	struct gred_sched_data *q;
287	struct gred_sched *t= qdisc_priv(sch);
288
289	skb = qdisc_dequeue_head(sch);
290
291	if (skb) {
292		u16 dp = tc_index_to_dp(skb);
293
294		if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
295			if (net_ratelimit())
296				printk(KERN_WARNING "GRED: Unable to relocate "
297				       "VQ 0x%x after dequeue, screwing up "
298				       "backlog.\n", tc_index_to_dp(skb));
299		} else {
300			q->backlog -= skb->len;
301
302			if (!q->backlog && !gred_wred_mode(t))
303				red_start_of_idle_period(&q->parms);
304		}
305
306		return skb;
307	}
308
309	if (gred_wred_mode(t))
310		red_start_of_idle_period(&t->wred_set);
311
312	return NULL;
313}
314
315static unsigned int gred_drop(struct Qdisc* sch)
316{
317	struct sk_buff *skb;
318
319	struct gred_sched_data *q;
320	struct gred_sched *t= qdisc_priv(sch);
321
322	skb = qdisc_dequeue_tail(sch);
323	if (skb) {
324		unsigned int len = skb->len;
325		u16 dp = tc_index_to_dp(skb);
326
327		if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
328			if (net_ratelimit())
329				printk(KERN_WARNING "GRED: Unable to relocate "
330				       "VQ 0x%x while dropping, screwing up "
331				       "backlog.\n", tc_index_to_dp(skb));
332		} else {
333			q->backlog -= len;
334			q->stats.other++;
335
336			if (!q->backlog && !gred_wred_mode(t))
337				red_start_of_idle_period(&q->parms);
338		}
339
340		qdisc_drop(skb, sch);
341		return len;
342	}
343
344	if (gred_wred_mode(t))
345		red_start_of_idle_period(&t->wred_set);
346
347	return 0;
348
349}
350
351static void gred_reset(struct Qdisc* sch)
352{
353	int i;
354	struct gred_sched_data *q;
355	struct gred_sched *t= qdisc_priv(sch);
356
357	qdisc_reset_queue(sch);
358
359        for (i=0;i<t->DPs;i++) {
360	        q= t->tab[i];
361		if (!q)
362			continue;
363		red_restart(&q->parms);
364		q->backlog = 0;
365	}
366}
367
368static inline void gred_destroy_vq(struct gred_sched_data *q)
369{
370	kfree(q);
371}
372
373static inline int gred_change_table_def(struct Qdisc *sch, struct rtattr *dps)
374{
375	struct gred_sched *table = qdisc_priv(sch);
376	struct tc_gred_sopt *sopt;
377	int i;
378
379	if (dps == NULL || RTA_PAYLOAD(dps) < sizeof(*sopt))
380		return -EINVAL;
381
382	sopt = RTA_DATA(dps);
383
384	if (sopt->DPs > MAX_DPs || sopt->DPs == 0 || sopt->def_DP >= sopt->DPs)
385		return -EINVAL;
386
387	sch_tree_lock(sch);
388	table->DPs = sopt->DPs;
389	table->def = sopt->def_DP;
390
391	/*
392	 * Every entry point to GRED is synchronized with the above code
393	 * and the DP is checked against DPs, i.e. shadowed VQs can no
394	 * longer be found so we can unlock right here.
395	 */
396	sch_tree_unlock(sch);
397
398	if (sopt->grio) {
399		gred_enable_rio_mode(table);
400		gred_disable_wred_mode(table);
401		if (gred_wred_mode_check(sch))
402			gred_enable_wred_mode(table);
403	} else {
404		gred_disable_rio_mode(table);
405		gred_disable_wred_mode(table);
406	}
407
408	for (i = table->DPs; i < MAX_DPs; i++) {
409		if (table->tab[i]) {
410			printk(KERN_WARNING "GRED: Warning: Destroying "
411			       "shadowed VQ 0x%x\n", i);
412			gred_destroy_vq(table->tab[i]);
413			table->tab[i] = NULL;
414  		}
415	}
416
417	return 0;
418}
419
420static inline int gred_change_vq(struct Qdisc *sch, int dp,
421				 struct tc_gred_qopt *ctl, int prio, u8 *stab)
422{
423	struct gred_sched *table = qdisc_priv(sch);
424	struct gred_sched_data *q;
425
426	if (table->tab[dp] == NULL) {
427		table->tab[dp] = kmalloc(sizeof(*q), GFP_KERNEL);
428		if (table->tab[dp] == NULL)
429			return -ENOMEM;
430		memset(table->tab[dp], 0, sizeof(*q));
431	}
432
433	q = table->tab[dp];
434	q->DP = dp;
435	q->prio = prio;
436	q->limit = ctl->limit;
437
438	if (q->backlog == 0)
439		red_end_of_idle_period(&q->parms);
440
441	red_set_parms(&q->parms,
442		      ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Plog,
443		      ctl->Scell_log, stab);
444
445	return 0;
446}
447
448static int gred_change(struct Qdisc *sch, struct rtattr *opt)
449{
450	struct gred_sched *table = qdisc_priv(sch);
451	struct tc_gred_qopt *ctl;
452	struct rtattr *tb[TCA_GRED_MAX];
453	int err = -EINVAL, prio = GRED_DEF_PRIO;
454	u8 *stab;
455
456	if (opt == NULL || rtattr_parse_nested(tb, TCA_GRED_MAX, opt))
457		return -EINVAL;
458
459	if (tb[TCA_GRED_PARMS-1] == NULL && tb[TCA_GRED_STAB-1] == NULL)
460		return gred_change_table_def(sch, opt);
461
462	if (tb[TCA_GRED_PARMS-1] == NULL ||
463	    RTA_PAYLOAD(tb[TCA_GRED_PARMS-1]) < sizeof(*ctl) ||
464	    tb[TCA_GRED_STAB-1] == NULL ||
465	    RTA_PAYLOAD(tb[TCA_GRED_STAB-1]) < 256)
466		return -EINVAL;
467
468	ctl = RTA_DATA(tb[TCA_GRED_PARMS-1]);
469	stab = RTA_DATA(tb[TCA_GRED_STAB-1]);
470
471	if (ctl->DP >= table->DPs)
472		goto errout;
473
474	if (gred_rio_mode(table)) {
475		if (ctl->prio == 0) {
476			int def_prio = GRED_DEF_PRIO;
477
478			if (table->tab[table->def])
479				def_prio = table->tab[table->def]->prio;
480
481			printk(KERN_DEBUG "GRED: DP %u does not have a prio "
482			       "setting default to %d\n", ctl->DP, def_prio);
483
484			prio = def_prio;
485		} else
486			prio = ctl->prio;
487	}
488
489	sch_tree_lock(sch);
490
491	err = gred_change_vq(sch, ctl->DP, ctl, prio, stab);
492	if (err < 0)
493		goto errout_locked;
494
495	if (table->tab[table->def] == NULL) {
496		if (gred_rio_mode(table))
497			prio = table->tab[ctl->DP]->prio;
498
499		err = gred_change_vq(sch, table->def, ctl, prio, stab);
500		if (err < 0)
501			goto errout_locked;
502	}
503
504	if (gred_rio_mode(table)) {
505		gred_disable_wred_mode(table);
506		if (gred_wred_mode_check(sch))
507			gred_enable_wred_mode(table);
508	}
509
510	err = 0;
511
512errout_locked:
513	sch_tree_unlock(sch);
514errout:
515	return err;
516}
517
518static int gred_init(struct Qdisc *sch, struct rtattr *opt)
519{
520	struct rtattr *tb[TCA_GRED_MAX];
521
522	if (opt == NULL || rtattr_parse_nested(tb, TCA_GRED_MAX, opt))
523		return -EINVAL;
524
525	if (tb[TCA_GRED_PARMS-1] || tb[TCA_GRED_STAB-1])
526		return -EINVAL;
527
528	return gred_change_table_def(sch, tb[TCA_GRED_DPS-1]);
529}
530
531static int gred_dump(struct Qdisc *sch, struct sk_buff *skb)
532{
533	struct gred_sched *table = qdisc_priv(sch);
534	struct rtattr *parms, *opts = NULL;
535	int i;
536	struct tc_gred_sopt sopt = {
537		.DPs	= table->DPs,
538		.def_DP	= table->def,
539		.grio	= gred_rio_mode(table),
540	};
541
542	opts = RTA_NEST(skb, TCA_OPTIONS);
543	RTA_PUT(skb, TCA_GRED_DPS, sizeof(sopt), &sopt);
544	parms = RTA_NEST(skb, TCA_GRED_PARMS);
545
546	for (i = 0; i < MAX_DPs; i++) {
547		struct gred_sched_data *q = table->tab[i];
548		struct tc_gred_qopt opt;
549
550		memset(&opt, 0, sizeof(opt));
551
552		if (!q) {
553			/* hack -- fix at some point with proper message
554			   This is how we indicate to tc that there is no VQ
555			   at this DP */
556
557			opt.DP = MAX_DPs + i;
558			goto append_opt;
559		}
560
561		opt.limit	= q->limit;
562		opt.DP		= q->DP;
563		opt.backlog	= q->backlog;
564		opt.prio	= q->prio;
565		opt.qth_min	= q->parms.qth_min >> q->parms.Wlog;
566		opt.qth_max	= q->parms.qth_max >> q->parms.Wlog;
567		opt.Wlog	= q->parms.Wlog;
568		opt.Plog	= q->parms.Plog;
569		opt.Scell_log	= q->parms.Scell_log;
570		opt.other	= q->stats.other;
571		opt.early	= q->stats.prob_drop;
572		opt.forced	= q->stats.forced_drop;
573		opt.pdrop	= q->stats.pdrop;
574		opt.packets	= q->packetsin;
575		opt.bytesin	= q->bytesin;
576
577		if (gred_wred_mode(table)) {
578			q->parms.qidlestart =
579				table->tab[table->def]->parms.qidlestart;
580			q->parms.qavg = table->tab[table->def]->parms.qavg;
581		}
582
583		opt.qave = red_calc_qavg(&q->parms, q->parms.qavg);
584
585append_opt:
586		RTA_APPEND(skb, sizeof(opt), &opt);
587	}
588
589	RTA_NEST_END(skb, parms);
590
591	return RTA_NEST_END(skb, opts);
592
593rtattr_failure:
594	return RTA_NEST_CANCEL(skb, opts);
595}
596
597static void gred_destroy(struct Qdisc *sch)
598{
599	struct gred_sched *table = qdisc_priv(sch);
600	int i;
601
602	for (i = 0;i < table->DPs; i++) {
603		if (table->tab[i])
604			gred_destroy_vq(table->tab[i]);
605	}
606}
607
608static struct Qdisc_ops gred_qdisc_ops = {
609	.next		=	NULL,
610	.cl_ops		=	NULL,
611	.id		=	"gred",
612	.priv_size	=	sizeof(struct gred_sched),
613	.enqueue	=	gred_enqueue,
614	.dequeue	=	gred_dequeue,
615	.requeue	=	gred_requeue,
616	.drop		=	gred_drop,
617	.init		=	gred_init,
618	.reset		=	gred_reset,
619	.destroy	=	gred_destroy,
620	.change		=	gred_change,
621	.dump		=	gred_dump,
622	.owner		=	THIS_MODULE,
623};
624
625static int __init gred_module_init(void)
626{
627	return register_qdisc(&gred_qdisc_ops);
628}
629static void __exit gred_module_exit(void)
630{
631	unregister_qdisc(&gred_qdisc_ops);
632}
633module_init(gred_module_init)
634module_exit(gred_module_exit)
635MODULE_LICENSE("GPL");
636