sch_gred.c revision 7051703b990ec40bdf192ec7c87ffafd7011c640
/*
 * net/sched/sch_gred.c	Generic Random Early Detection queue.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Authors: J Hadi Salim (hadi@cyberus.ca) 1998-2002
 *
 * 991129: - Bug fix with grio mode
 *         - a better single AvgQ mode with Grio (WRED)
 *         - a finer grained VQ dequeue based on a suggestion
 *           from Ren Liu
 *         - more error checks
 *
 * For all the glorious comments look at Alexey's sch_red.c
 */

#include <linux/config.h>
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/notifier.h>
#include <net/ip.h>
#include <net/route.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
#include <net/red.h>

#if 1 /* control */
#define DPRINTK(format, args...) printk(KERN_DEBUG format, ##args)
#else
#define DPRINTK(format, args...)
#endif

#if 0 /* data */
#define D2PRINTK(format, args...) printk(KERN_DEBUG format, ##args)
#else
#define D2PRINTK(format, args...)
#endif

#define GRED_DEF_PRIO (MAX_DPs / 2)
#define GRED_VQ_MASK (MAX_DPs - 1)

struct gred_sched_data;
struct gred_sched;

struct gred_sched_data
{
    u32 limit;      /* HARD maximal queue length */
    u32 DP;         /* the drop parameters */
    u32 bytesin;    /* bytes seen on virtual queue so far */
    u32 packetsin;  /* packets seen on virtual queue so far */
    u32 backlog;    /* bytes on the virtual queue */
    u8 prio;        /* the prio of this vq */

    struct red_parms parms;
    struct red_stats stats;
};

enum {
    GRED_WRED_MODE = 1,
    GRED_RIO_MODE,
};

struct gred_sched
{
    struct gred_sched_data *tab[MAX_DPs];
    unsigned long flags;
    u32 DPs;
    u32 def;
    struct red_parms wred_set;
};

static inline int gred_wred_mode(struct gred_sched *table)
{
    return test_bit(GRED_WRED_MODE, &table->flags);
}

static inline void gred_enable_wred_mode(struct gred_sched *table)
{
    __set_bit(GRED_WRED_MODE, &table->flags);
}

static inline void gred_disable_wred_mode(struct gred_sched *table)
{
    __clear_bit(GRED_WRED_MODE, &table->flags);
}

static inline int gred_rio_mode(struct gred_sched *table)
{
    return test_bit(GRED_RIO_MODE, &table->flags);
}

static inline void gred_enable_rio_mode(struct gred_sched *table)
{
    __set_bit(GRED_RIO_MODE, &table->flags);
}

static inline void gred_disable_rio_mode(struct gred_sched *table)
{
    __clear_bit(GRED_RIO_MODE, &table->flags);
}

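/*
 * Editor's note on the mode flags above: GRED effectively runs in one
 * of three modes. Plain GRED keeps an independent RED average per
 * virtual queue. In RIO ("grio") mode, the averages of VQs with a
 * numerically lower prio are additionally folded into each drop
 * decision (see gred_enqueue() below). WRED mode is enabled on top of
 * grio whenever at least two VQs share the same prio, in which case a
 * single device-wide average queue length is maintained; the helper
 * below detects that condition.
 */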
static inline int gred_wred_mode_check(struct Qdisc *sch)
{
    struct gred_sched *table = qdisc_priv(sch);
    int i;

    /* Really ugly O(n^2), but it shouldn't be needed too frequently. */
    for (i = 0; i < table->DPs; i++) {
        struct gred_sched_data *q = table->tab[i];
        int n;

        if (q == NULL)
            continue;

        for (n = 0; n < table->DPs; n++)
            if (table->tab[n] && table->tab[n] != q &&
                table->tab[n]->prio == q->prio)
                return 1;
    }

    return 0;
}

static inline unsigned int gred_backlog(struct gred_sched *table,
                                        struct gred_sched_data *q,
                                        struct Qdisc *sch)
{
    if (gred_wred_mode(table))
        return sch->qstats.backlog;
    else
        return q->backlog;
}

static inline u16 tc_index_to_dp(struct sk_buff *skb)
{
    return skb->tc_index & GRED_VQ_MASK;
}

static inline void gred_load_wred_set(struct gred_sched *table,
                                      struct gred_sched_data *q)
{
    q->parms.qavg = table->wred_set.qavg;
    q->parms.qidlestart = table->wred_set.qidlestart;
}

static inline void gred_store_wred_set(struct gred_sched *table,
                                       struct gred_sched_data *q)
{
    table->wred_set.qavg = q->parms.qavg;
}

static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
    struct gred_sched_data *q = NULL;
    struct gred_sched *t = qdisc_priv(sch);
    unsigned long qavg = 0;
    int i = 0;
    u16 dp = tc_index_to_dp(skb);

    if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
        dp = t->def;

        if ((q = t->tab[dp]) == NULL) {
            /* Pass through packets not assigned to a DP
             * if no default DP has been configured. This
             * allows for DP flows to be left untouched.
             */
            if (skb_queue_len(&sch->q) < sch->dev->tx_queue_len)
                return qdisc_enqueue_tail(skb, sch);
            else
                goto drop;
        }

        /* Fix tc_index? Could be controversial, but needed for
           requeueing. */
        skb->tc_index = (skb->tc_index & ~GRED_VQ_MASK) | dp;
    }

    /* Sum up the qavg of all VQs with a lower prio; our own qavg is
     * added when red_action() is called, yielding the combined average
     * for prios <= ours. */
    if (!gred_wred_mode(t) && gred_rio_mode(t)) {
        for (i = 0; i < t->DPs; i++) {
            if (!t->tab[i] || i == q->DP)
                continue;

            if (t->tab[i]->prio < q->prio &&
                !red_is_idling(&t->tab[i]->parms))
                qavg += t->tab[i]->parms.qavg;
        }
    }

    q->packetsin++;
    q->bytesin += skb->len;

    if (gred_wred_mode(t)) {
        qavg = 0;
        gred_load_wred_set(t, q);
    }

    q->parms.qavg = red_calc_qavg(&q->parms, gred_backlog(t, q, sch));

    if (red_is_idling(&q->parms))
        red_end_of_idle_period(&q->parms);

    if (gred_wred_mode(t))
        gred_store_wred_set(t, q);

    switch (red_action(&q->parms, q->parms.qavg + qavg)) {
    case RED_DONT_MARK:
        break;

    case RED_PROB_MARK:
        sch->qstats.overlimits++;
        q->stats.prob_drop++;
        goto congestion_drop;

    case RED_HARD_MARK:
        sch->qstats.overlimits++;
        q->stats.forced_drop++;
        goto congestion_drop;
    }

    if (q->backlog + skb->len <= q->limit) {
        q->backlog += skb->len;
        return qdisc_enqueue_tail(skb, sch);
    }

    q->stats.pdrop++;
drop:
    return qdisc_drop(skb, sch);

congestion_drop:
    qdisc_drop(skb, sch);
    return NET_XMIT_CN;
}

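/*
 * Editor's note: requeue puts a previously dequeued packet back at the
 * head of the queue, so the per-VQ byte backlog that gred_dequeue()
 * subtracted has to be added back, and any idle period that may have
 * started in the meantime is ended.
 */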
static int gred_requeue(struct sk_buff *skb, struct Qdisc *sch)
{
    struct gred_sched *t = qdisc_priv(sch);
    struct gred_sched_data *q;
    u16 dp = tc_index_to_dp(skb);

    if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
        if (net_ratelimit())
            printk(KERN_WARNING "GRED: Unable to relocate VQ 0x%x "
                   "for requeue, screwing up backlog.\n",
                   tc_index_to_dp(skb));
    } else {
        if (red_is_idling(&q->parms))
            red_end_of_idle_period(&q->parms);

        q->backlog += skb->len;
    }

    return qdisc_requeue(skb, sch);
}

static struct sk_buff *gred_dequeue(struct Qdisc *sch)
{
    struct sk_buff *skb;
    struct gred_sched_data *q;
    struct gred_sched *t = qdisc_priv(sch);

    skb = qdisc_dequeue_head(sch);

    if (skb) {
        u16 dp = tc_index_to_dp(skb);

        if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
            if (net_ratelimit())
                printk(KERN_WARNING "GRED: Unable to relocate "
                       "VQ 0x%x after dequeue, screwing up "
                       "backlog.\n", tc_index_to_dp(skb));
        } else {
            q->backlog -= skb->len;

            if (!q->backlog && !gred_wred_mode(t))
                red_start_of_idle_period(&q->parms);
        }

        return skb;
    }

    if (gred_wred_mode(t))
        red_start_of_idle_period(&t->wred_set);

    return NULL;
}

static unsigned int gred_drop(struct Qdisc *sch)
{
    struct sk_buff *skb;
    struct gred_sched_data *q;
    struct gred_sched *t = qdisc_priv(sch);

    skb = qdisc_dequeue_tail(sch);
    if (skb) {
        unsigned int len = skb->len;
        u16 dp = tc_index_to_dp(skb);

        if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
            if (net_ratelimit())
                printk(KERN_WARNING "GRED: Unable to relocate "
                       "VQ 0x%x while dropping, screwing up "
                       "backlog.\n", tc_index_to_dp(skb));
        } else {
            q->backlog -= len;
            q->stats.other++;

            if (!q->backlog && !gred_wred_mode(t))
                red_start_of_idle_period(&q->parms);
        }

        qdisc_drop(skb, sch);
        return len;
    }

    if (gred_wred_mode(t))
        red_start_of_idle_period(&t->wred_set);

    return 0;
}

static void gred_reset(struct Qdisc *sch)
{
    int i;
    struct gred_sched_data *q;
    struct gred_sched *t = qdisc_priv(sch);

    qdisc_reset_queue(sch);

    for (i = 0; i < t->DPs; i++) {
        q = t->tab[i];
        if (!q)
            continue;
        red_restart(&q->parms);
        q->backlog = 0;
    }
}

static inline void gred_destroy_vq(struct gred_sched_data *q)
{
    kfree(q);
}

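/*
 * Editor's note: the function below handles table-level
 * (re)configuration via TCA_GRED_DPS — the number of virtual queues,
 * the default DP and the grio flag. With iproute2 this is typically
 * driven by something like the following (illustrative only; the exact
 * option spelling depends on the tc version in use):
 *
 *   tc qdisc add dev eth0 root gred setup DPs 4 default 1 grio
 */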
static inline int gred_change_table_def(struct Qdisc *sch, struct rtattr *dps)
{
    struct gred_sched *table = qdisc_priv(sch);
    struct tc_gred_sopt *sopt;
    int i;

    if (dps == NULL || RTA_PAYLOAD(dps) < sizeof(*sopt))
        return -EINVAL;

    sopt = RTA_DATA(dps);

    if (sopt->DPs > MAX_DPs || sopt->DPs == 0 || sopt->def_DP >= sopt->DPs)
        return -EINVAL;

    sch_tree_lock(sch);
    table->DPs = sopt->DPs;
    table->def = sopt->def_DP;

    /*
     * Every entry point to GRED is synchronized with the above code
     * and the DP is checked against DPs, i.e. shadowed VQs can no
     * longer be found, so we can unlock right here.
     */
    sch_tree_unlock(sch);

    if (sopt->grio) {
        gred_enable_rio_mode(table);
        gred_disable_wred_mode(table);
        if (gred_wred_mode_check(sch))
            gred_enable_wred_mode(table);
    } else {
        gred_disable_rio_mode(table);
        gred_disable_wred_mode(table);
    }

    for (i = table->DPs; i < MAX_DPs; i++) {
        if (table->tab[i]) {
            printk(KERN_WARNING "GRED: Warning: Destroying "
                   "shadowed VQ 0x%x\n", i);
            gred_destroy_vq(table->tab[i]);
            table->tab[i] = NULL;
        }
    }

    return 0;
}

static inline int gred_change_vq(struct Qdisc *sch, int dp,
                                 struct tc_gred_qopt *ctl, int prio, u8 *stab)
{
    struct gred_sched *table = qdisc_priv(sch);
    struct gred_sched_data *q;

    if (table->tab[dp] == NULL) {
        table->tab[dp] = kmalloc(sizeof(*q), GFP_KERNEL);
        if (table->tab[dp] == NULL)
            return -ENOMEM;
        memset(table->tab[dp], 0, sizeof(*q));
    }

    q = table->tab[dp];
    q->DP = dp;
    q->prio = prio;
    q->limit = ctl->limit;

    if (q->backlog == 0)
        red_end_of_idle_period(&q->parms);

    red_set_parms(&q->parms,
                  ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Plog,
                  ctl->Scell_log, stab);

    return 0;
}

static int gred_change(struct Qdisc *sch, struct rtattr *opt)
{
    struct gred_sched *table = qdisc_priv(sch);
    struct tc_gred_qopt *ctl;
    struct rtattr *tb[TCA_GRED_MAX];
    int err = -EINVAL, prio = GRED_DEF_PRIO;
    u8 *stab;

    if (opt == NULL || rtattr_parse_nested(tb, TCA_GRED_MAX, opt))
        return -EINVAL;

    if (tb[TCA_GRED_PARMS-1] == NULL && tb[TCA_GRED_STAB-1] == NULL)
        return gred_change_table_def(sch, opt);

    if (tb[TCA_GRED_PARMS-1] == NULL ||
        RTA_PAYLOAD(tb[TCA_GRED_PARMS-1]) < sizeof(*ctl) ||
        tb[TCA_GRED_STAB-1] == NULL ||
        RTA_PAYLOAD(tb[TCA_GRED_STAB-1]) < 256)
        return -EINVAL;

    ctl = RTA_DATA(tb[TCA_GRED_PARMS-1]);
    stab = RTA_DATA(tb[TCA_GRED_STAB-1]);

    if (ctl->DP >= table->DPs)
        goto errout;

    if (gred_rio_mode(table)) {
        if (ctl->prio == 0) {
            int def_prio = GRED_DEF_PRIO;

            if (table->tab[table->def])
                def_prio = table->tab[table->def]->prio;

            printk(KERN_DEBUG "GRED: DP %u does not have a prio, "
                   "setting default to %d\n", ctl->DP, def_prio);

            prio = def_prio;
        } else
            prio = ctl->prio;
    }

    sch_tree_lock(sch);

    err = gred_change_vq(sch, ctl->DP, ctl, prio, stab);
    if (err < 0)
        goto errout_locked;

    if (table->tab[table->def] == NULL) {
        if (gred_rio_mode(table))
            prio = table->tab[ctl->DP]->prio;

        err = gred_change_vq(sch, table->def, ctl, prio, stab);
        if (err < 0)
            goto errout_locked;
    }

    if (gred_rio_mode(table)) {
        gred_disable_wred_mode(table);
        if (gred_wred_mode_check(sch))
            gred_enable_wred_mode(table);
    }

    err = 0;

errout_locked:
    sch_tree_unlock(sch);
errout:
    return err;
}

static int gred_init(struct Qdisc *sch, struct rtattr *opt)
{
    struct rtattr *tb[TCA_GRED_MAX];

    if (opt == NULL || rtattr_parse_nested(tb, TCA_GRED_MAX, opt))
        return -EINVAL;

    if (tb[TCA_GRED_PARMS-1] || tb[TCA_GRED_STAB-1])
        return -EINVAL;

    return gred_change_table_def(sch, tb[TCA_GRED_DPS-1]);
}

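/*
 * Editor's note: the dump below always walks all MAX_DPs slots so that
 * tc can render holes in the table; an unconfigured slot is marked by
 * reporting a DP of MAX_DPs + i, a value no real VQ can carry.
 */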
static int gred_dump(struct Qdisc *sch, struct sk_buff *skb)
{
    struct gred_sched *table = qdisc_priv(sch);
    struct rtattr *parms, *opts = NULL;
    int i;
    struct tc_gred_sopt sopt = {
        .DPs    = table->DPs,
        .def_DP = table->def,
        .grio   = gred_rio_mode(table),
    };

    opts = RTA_NEST(skb, TCA_OPTIONS);
    RTA_PUT(skb, TCA_GRED_DPS, sizeof(sopt), &sopt);
    parms = RTA_NEST(skb, TCA_GRED_PARMS);

    for (i = 0; i < MAX_DPs; i++) {
        struct gred_sched_data *q = table->tab[i];
        struct tc_gred_qopt opt;

        memset(&opt, 0, sizeof(opt));

        if (!q) {
            /* hack -- fix at some point with a proper message:
             * this is how we indicate to tc that there is no VQ
             * at this DP.
             */
            opt.DP = MAX_DPs + i;
            goto append_opt;
        }

        opt.limit     = q->limit;
        opt.DP        = q->DP;
        opt.backlog   = q->backlog;
        opt.prio      = q->prio;
        opt.qth_min   = q->parms.qth_min >> q->parms.Wlog;
        opt.qth_max   = q->parms.qth_max >> q->parms.Wlog;
        opt.Wlog      = q->parms.Wlog;
        opt.Plog      = q->parms.Plog;
        opt.Scell_log = q->parms.Scell_log;
        opt.other     = q->stats.other;
        opt.early     = q->stats.prob_drop;
        opt.forced    = q->stats.forced_drop;
        opt.pdrop     = q->stats.pdrop;
        opt.packets   = q->packetsin;
        opt.bytesin   = q->bytesin;

        if (gred_wred_mode(table)) {
            q->parms.qidlestart =
                table->tab[table->def]->parms.qidlestart;
            q->parms.qavg = table->tab[table->def]->parms.qavg;
        }

        opt.qave = red_calc_qavg(&q->parms, q->parms.qavg);

append_opt:
        RTA_APPEND(skb, sizeof(opt), &opt);
    }

    RTA_NEST_END(skb, parms);

    return RTA_NEST_END(skb, opts);

rtattr_failure:
    return RTA_NEST_CANCEL(skb, opts);
}

static void gred_destroy(struct Qdisc *sch)
{
    struct gred_sched *table = qdisc_priv(sch);
    int i;

    for (i = 0; i < table->DPs; i++) {
        if (table->tab[i])
            gred_destroy_vq(table->tab[i]);
    }
}

static struct Qdisc_ops gred_qdisc_ops = {
    .next      = NULL,
    .cl_ops    = NULL,
    .id        = "gred",
    .priv_size = sizeof(struct gred_sched),
    .enqueue   = gred_enqueue,
    .dequeue   = gred_dequeue,
    .requeue   = gred_requeue,
    .drop      = gred_drop,
    .init      = gred_init,
    .reset     = gred_reset,
    .destroy   = gred_destroy,
    .change    = gred_change,
    .dump      = gred_dump,
    .owner     = THIS_MODULE,
};

static int __init gred_module_init(void)
{
    return register_qdisc(&gred_qdisc_ops);
}

static void __exit gred_module_exit(void)
{
    unregister_qdisc(&gred_qdisc_ops);
}

module_init(gred_module_init)
module_exit(gred_module_exit)

MODULE_LICENSE("GPL");