sch_gred.c revision f62d6b936df500247474c13360eb23e1b602bad0
1/* 2 * net/sched/sch_gred.c Generic Random Early Detection queue. 3 * 4 * 5 * This program is free software; you can redistribute it and/or 6 * modify it under the terms of the GNU General Public License 7 * as published by the Free Software Foundation; either version 8 * 2 of the License, or (at your option) any later version. 9 * 10 * Authors: J Hadi Salim (hadi@cyberus.ca) 1998-2002 11 * 12 * 991129: - Bug fix with grio mode 13 * - a better sing. AvgQ mode with Grio(WRED) 14 * - A finer grained VQ dequeue based on sugestion 15 * from Ren Liu 16 * - More error checks 17 * 18 * 19 * 20 * For all the glorious comments look at Alexey's sch_red.c 21 */ 22 23#include <linux/config.h> 24#include <linux/module.h> 25#include <asm/uaccess.h> 26#include <asm/system.h> 27#include <linux/bitops.h> 28#include <linux/types.h> 29#include <linux/kernel.h> 30#include <linux/sched.h> 31#include <linux/string.h> 32#include <linux/mm.h> 33#include <linux/socket.h> 34#include <linux/sockios.h> 35#include <linux/in.h> 36#include <linux/errno.h> 37#include <linux/interrupt.h> 38#include <linux/if_ether.h> 39#include <linux/inet.h> 40#include <linux/netdevice.h> 41#include <linux/etherdevice.h> 42#include <linux/notifier.h> 43#include <net/ip.h> 44#include <net/route.h> 45#include <linux/skbuff.h> 46#include <net/sock.h> 47#include <net/pkt_sched.h> 48 49#if 1 /* control */ 50#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) 51#else 52#define DPRINTK(format,args...) 53#endif 54 55#if 0 /* data */ 56#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args) 57#else 58#define D2PRINTK(format,args...) 59#endif 60 61#define GRED_DEF_PRIO (MAX_DPs / 2) 62 63struct gred_sched_data; 64struct gred_sched; 65 66struct gred_sched_data 67{ 68/* Parameters */ 69 u32 limit; /* HARD maximal queue length */ 70 u32 qth_min; /* Min average length threshold: A scaled */ 71 u32 qth_max; /* Max average length threshold: A scaled */ 72 u32 DP; /* the drop pramaters */ 73 char Wlog; /* log(W) */ 74 char Plog; /* random number bits */ 75 u32 Scell_max; 76 u32 Rmask; 77 u32 bytesin; /* bytes seen on virtualQ so far*/ 78 u32 packetsin; /* packets seen on virtualQ so far*/ 79 u32 backlog; /* bytes on the virtualQ */ 80 u32 forced; /* packets dropped for exceeding limits */ 81 u32 early; /* packets dropped as a warning */ 82 u32 other; /* packets dropped by invoking drop() */ 83 u32 pdrop; /* packets dropped because we exceeded physical queue limits */ 84 char Scell_log; 85 u8 Stab[256]; 86 u8 prio; /* the prio of this vq */ 87 88/* Variables */ 89 unsigned long qave; /* Average queue length: A scaled */ 90 int qcount; /* Packets since last random number generation */ 91 u32 qR; /* Cached random number */ 92 93 psched_time_t qidlestart; /* Start of idle period */ 94}; 95 96enum { 97 GRED_WRED_MODE = 1, 98 GRED_RIO_MODE, 99}; 100 101struct gred_sched 102{ 103 struct gred_sched_data *tab[MAX_DPs]; 104 unsigned long flags; 105 u32 DPs; 106 u32 def; 107 u8 initd; 108}; 109 110static inline int gred_wred_mode(struct gred_sched *table) 111{ 112 return test_bit(GRED_WRED_MODE, &table->flags); 113} 114 115static inline void gred_enable_wred_mode(struct gred_sched *table) 116{ 117 __set_bit(GRED_WRED_MODE, &table->flags); 118} 119 120static inline void gred_disable_wred_mode(struct gred_sched *table) 121{ 122 __clear_bit(GRED_WRED_MODE, &table->flags); 123} 124 125static inline int gred_rio_mode(struct gred_sched *table) 126{ 127 return test_bit(GRED_RIO_MODE, &table->flags); 128} 129 130static inline void gred_enable_rio_mode(struct gred_sched *table) 131{ 132 __set_bit(GRED_RIO_MODE, &table->flags); 133} 134 135static inline void gred_disable_rio_mode(struct gred_sched *table) 136{ 137 __clear_bit(GRED_RIO_MODE, &table->flags); 138} 139 140static inline int gred_wred_mode_check(struct Qdisc *sch) 141{ 142 struct gred_sched *table = qdisc_priv(sch); 143 int i; 144 145 /* Really ugly O(n^2) but shouldn't be necessary too frequent. */ 146 for (i = 0; i < table->DPs; i++) { 147 struct gred_sched_data *q = table->tab[i]; 148 int n; 149 150 if (q == NULL) 151 continue; 152 153 for (n = 0; n < table->DPs; n++) 154 if (table->tab[n] && table->tab[n] != q && 155 table->tab[n]->prio == q->prio) 156 return 1; 157 } 158 159 return 0; 160} 161 162static int 163gred_enqueue(struct sk_buff *skb, struct Qdisc* sch) 164{ 165 psched_time_t now; 166 struct gred_sched_data *q=NULL; 167 struct gred_sched *t= qdisc_priv(sch); 168 unsigned long qave=0; 169 int i=0; 170 171 if (!t->initd && skb_queue_len(&sch->q) < (sch->dev->tx_queue_len ? : 1)) { 172 D2PRINTK("NO GRED Queues setup yet! Enqueued anyway\n"); 173 goto do_enqueue; 174 } 175 176 177 if ( ((skb->tc_index&0xf) > (t->DPs -1)) || !(q=t->tab[skb->tc_index&0xf])) { 178 printk("GRED: setting to default (%d)\n ",t->def); 179 if (!(q=t->tab[t->def])) { 180 DPRINTK("GRED: setting to default FAILED! dropping!! " 181 "(%d)\n ", t->def); 182 goto drop; 183 } 184 /* fix tc_index? --could be controvesial but needed for 185 requeueing */ 186 skb->tc_index=(skb->tc_index&0xfffffff0) | t->def; 187 } 188 189 D2PRINTK("gred_enqueue virtualQ 0x%x classid %x backlog %d " 190 "general backlog %d\n",skb->tc_index&0xf,sch->handle,q->backlog, 191 sch->qstats.backlog); 192 /* sum up all the qaves of prios <= to ours to get the new qave*/ 193 if (!gred_wred_mode(t) && gred_rio_mode(t)) { 194 for (i=0;i<t->DPs;i++) { 195 if ((!t->tab[i]) || (i==q->DP)) 196 continue; 197 198 if ((t->tab[i]->prio < q->prio) && (PSCHED_IS_PASTPERFECT(t->tab[i]->qidlestart))) 199 qave +=t->tab[i]->qave; 200 } 201 202 } 203 204 q->packetsin++; 205 q->bytesin+=skb->len; 206 207 if (gred_wred_mode(t)) { 208 qave=0; 209 q->qave=t->tab[t->def]->qave; 210 q->qidlestart=t->tab[t->def]->qidlestart; 211 } 212 213 if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) { 214 long us_idle; 215 PSCHED_GET_TIME(now); 216 us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max); 217 PSCHED_SET_PASTPERFECT(q->qidlestart); 218 219 q->qave >>= q->Stab[(us_idle>>q->Scell_log)&0xFF]; 220 } else { 221 if (gred_wred_mode(t)) { 222 q->qave += sch->qstats.backlog - (q->qave >> q->Wlog); 223 } else { 224 q->qave += q->backlog - (q->qave >> q->Wlog); 225 } 226 227 } 228 229 230 if (gred_wred_mode(t)) 231 t->tab[t->def]->qave=q->qave; 232 233 if ((q->qave+qave) < q->qth_min) { 234 q->qcount = -1; 235enqueue: 236 if (q->backlog + skb->len <= q->limit) { 237 q->backlog += skb->len; 238do_enqueue: 239 __skb_queue_tail(&sch->q, skb); 240 sch->qstats.backlog += skb->len; 241 sch->bstats.bytes += skb->len; 242 sch->bstats.packets++; 243 return 0; 244 } else { 245 q->pdrop++; 246 } 247 248drop: 249 kfree_skb(skb); 250 sch->qstats.drops++; 251 return NET_XMIT_DROP; 252 } 253 if ((q->qave+qave) >= q->qth_max) { 254 q->qcount = -1; 255 sch->qstats.overlimits++; 256 q->forced++; 257 goto drop; 258 } 259 if (++q->qcount) { 260 if ((((qave+q->qave) - q->qth_min)>>q->Wlog)*q->qcount < q->qR) 261 goto enqueue; 262 q->qcount = 0; 263 q->qR = net_random()&q->Rmask; 264 sch->qstats.overlimits++; 265 q->early++; 266 goto drop; 267 } 268 q->qR = net_random()&q->Rmask; 269 goto enqueue; 270} 271 272static int 273gred_requeue(struct sk_buff *skb, struct Qdisc* sch) 274{ 275 struct gred_sched_data *q; 276 struct gred_sched *t= qdisc_priv(sch); 277 q= t->tab[(skb->tc_index&0xf)]; 278/* error checking here -- probably unnecessary */ 279 PSCHED_SET_PASTPERFECT(q->qidlestart); 280 281 __skb_queue_head(&sch->q, skb); 282 sch->qstats.backlog += skb->len; 283 sch->qstats.requeues++; 284 q->backlog += skb->len; 285 return 0; 286} 287 288static struct sk_buff * 289gred_dequeue(struct Qdisc* sch) 290{ 291 struct sk_buff *skb; 292 struct gred_sched_data *q; 293 struct gred_sched *t= qdisc_priv(sch); 294 295 skb = __skb_dequeue(&sch->q); 296 if (skb) { 297 sch->qstats.backlog -= skb->len; 298 q= t->tab[(skb->tc_index&0xf)]; 299 if (q) { 300 q->backlog -= skb->len; 301 if (!q->backlog && !gred_wred_mode(t)) 302 PSCHED_GET_TIME(q->qidlestart); 303 } else { 304 D2PRINTK("gred_dequeue: skb has bad tcindex %x\n",skb->tc_index&0xf); 305 } 306 return skb; 307 } 308 309 if (gred_wred_mode(t)) { 310 q= t->tab[t->def]; 311 if (!q) 312 D2PRINTK("no default VQ set: Results will be " 313 "screwed up\n"); 314 else 315 PSCHED_GET_TIME(q->qidlestart); 316 } 317 318 return NULL; 319} 320 321static unsigned int gred_drop(struct Qdisc* sch) 322{ 323 struct sk_buff *skb; 324 325 struct gred_sched_data *q; 326 struct gred_sched *t= qdisc_priv(sch); 327 328 skb = __skb_dequeue_tail(&sch->q); 329 if (skb) { 330 unsigned int len = skb->len; 331 sch->qstats.backlog -= len; 332 sch->qstats.drops++; 333 q= t->tab[(skb->tc_index&0xf)]; 334 if (q) { 335 q->backlog -= len; 336 q->other++; 337 if (!q->backlog && !gred_wred_mode(t)) 338 PSCHED_GET_TIME(q->qidlestart); 339 } else { 340 D2PRINTK("gred_dequeue: skb has bad tcindex %x\n",skb->tc_index&0xf); 341 } 342 343 kfree_skb(skb); 344 return len; 345 } 346 347 q=t->tab[t->def]; 348 if (!q) { 349 D2PRINTK("no default VQ set: Results might be screwed up\n"); 350 return 0; 351 } 352 353 PSCHED_GET_TIME(q->qidlestart); 354 return 0; 355 356} 357 358static void gred_reset(struct Qdisc* sch) 359{ 360 int i; 361 struct gred_sched_data *q; 362 struct gred_sched *t= qdisc_priv(sch); 363 364 __skb_queue_purge(&sch->q); 365 366 sch->qstats.backlog = 0; 367 368 for (i=0;i<t->DPs;i++) { 369 q= t->tab[i]; 370 if (!q) 371 continue; 372 PSCHED_SET_PASTPERFECT(q->qidlestart); 373 q->qave = 0; 374 q->qcount = -1; 375 q->backlog = 0; 376 q->other=0; 377 q->forced=0; 378 q->pdrop=0; 379 q->early=0; 380 } 381} 382 383static inline void gred_destroy_vq(struct gred_sched_data *q) 384{ 385 kfree(q); 386} 387 388static inline int gred_change_table_def(struct Qdisc *sch, struct rtattr *dps) 389{ 390 struct gred_sched *table = qdisc_priv(sch); 391 struct tc_gred_sopt *sopt; 392 int i; 393 394 if (dps == NULL || RTA_PAYLOAD(dps) < sizeof(*sopt)) 395 return -EINVAL; 396 397 sopt = RTA_DATA(dps); 398 399 if (sopt->DPs > MAX_DPs || sopt->DPs == 0 || sopt->def_DP >= sopt->DPs) 400 return -EINVAL; 401 402 sch_tree_lock(sch); 403 table->DPs = sopt->DPs; 404 table->def = sopt->def_DP; 405 406 /* 407 * Every entry point to GRED is synchronized with the above code 408 * and the DP is checked against DPs, i.e. shadowed VQs can no 409 * longer be found so we can unlock right here. 410 */ 411 sch_tree_unlock(sch); 412 413 if (sopt->grio) { 414 gred_enable_rio_mode(table); 415 gred_disable_wred_mode(table); 416 if (gred_wred_mode_check(sch)) 417 gred_enable_wred_mode(table); 418 } else { 419 gred_disable_rio_mode(table); 420 gred_disable_wred_mode(table); 421 } 422 423 for (i = table->DPs; i < MAX_DPs; i++) { 424 if (table->tab[i]) { 425 printk(KERN_WARNING "GRED: Warning: Destroying " 426 "shadowed VQ 0x%x\n", i); 427 gred_destroy_vq(table->tab[i]); 428 table->tab[i] = NULL; 429 } 430 } 431 432 table->initd = 0; 433 434 return 0; 435} 436 437static inline int gred_change_vq(struct Qdisc *sch, int dp, 438 struct tc_gred_qopt *ctl, int prio, u8 *stab) 439{ 440 struct gred_sched *table = qdisc_priv(sch); 441 struct gred_sched_data *q; 442 443 if (table->tab[dp] == NULL) { 444 table->tab[dp] = kmalloc(sizeof(*q), GFP_KERNEL); 445 if (table->tab[dp] == NULL) 446 return -ENOMEM; 447 memset(table->tab[dp], 0, sizeof(*q)); 448 } 449 450 q = table->tab[dp]; 451 q->DP = dp; 452 q->prio = prio; 453 454 q->Wlog = ctl->Wlog; 455 q->Plog = ctl->Plog; 456 q->limit = ctl->limit; 457 q->Scell_log = ctl->Scell_log; 458 q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL; 459 q->Scell_max = (255<<q->Scell_log); 460 q->qth_min = ctl->qth_min<<ctl->Wlog; 461 q->qth_max = ctl->qth_max<<ctl->Wlog; 462 q->qave=0; 463 q->backlog=0; 464 q->qcount = -1; 465 q->other=0; 466 q->forced=0; 467 q->pdrop=0; 468 q->early=0; 469 470 PSCHED_SET_PASTPERFECT(q->qidlestart); 471 memcpy(q->Stab, stab, 256); 472 473 return 0; 474} 475 476static int gred_change(struct Qdisc *sch, struct rtattr *opt) 477{ 478 struct gred_sched *table = qdisc_priv(sch); 479 struct tc_gred_qopt *ctl; 480 struct rtattr *tb[TCA_GRED_MAX]; 481 int err = -EINVAL, prio = GRED_DEF_PRIO; 482 u8 *stab; 483 484 if (opt == NULL || rtattr_parse_nested(tb, TCA_GRED_MAX, opt)) 485 return -EINVAL; 486 487 if (tb[TCA_GRED_PARMS-1] == NULL && tb[TCA_GRED_STAB-1] == NULL) 488 return gred_change_table_def(sch, opt); 489 490 if (tb[TCA_GRED_PARMS-1] == NULL || 491 RTA_PAYLOAD(tb[TCA_GRED_PARMS-1]) < sizeof(*ctl) || 492 tb[TCA_GRED_STAB-1] == NULL || 493 RTA_PAYLOAD(tb[TCA_GRED_STAB-1]) < 256) 494 return -EINVAL; 495 496 ctl = RTA_DATA(tb[TCA_GRED_PARMS-1]); 497 stab = RTA_DATA(tb[TCA_GRED_STAB-1]); 498 499 if (ctl->DP >= table->DPs) 500 goto errout; 501 502 if (gred_rio_mode(table)) { 503 if (ctl->prio == 0) { 504 int def_prio = GRED_DEF_PRIO; 505 506 if (table->tab[table->def]) 507 def_prio = table->tab[table->def]->prio; 508 509 printk(KERN_DEBUG "GRED: DP %u does not have a prio " 510 "setting default to %d\n", ctl->DP, def_prio); 511 512 prio = def_prio; 513 } else 514 prio = ctl->prio; 515 } 516 517 sch_tree_lock(sch); 518 519 err = gred_change_vq(sch, ctl->DP, ctl, prio, stab); 520 if (err < 0) 521 goto errout_locked; 522 523 if (table->tab[table->def] == NULL) { 524 if (gred_rio_mode(table)) 525 prio = table->tab[ctl->DP]->prio; 526 527 err = gred_change_vq(sch, table->def, ctl, prio, stab); 528 if (err < 0) 529 goto errout_locked; 530 } 531 532 table->initd = 1; 533 534 if (gred_rio_mode(table)) { 535 gred_disable_wred_mode(table); 536 if (gred_wred_mode_check(sch)) 537 gred_enable_wred_mode(table); 538 } 539 540 err = 0; 541 542errout_locked: 543 sch_tree_unlock(sch); 544errout: 545 return err; 546} 547 548static int gred_init(struct Qdisc *sch, struct rtattr *opt) 549{ 550 struct rtattr *tb[TCA_GRED_MAX]; 551 552 if (opt == NULL || rtattr_parse_nested(tb, TCA_GRED_MAX, opt)) 553 return -EINVAL; 554 555 if (tb[TCA_GRED_PARMS-1] || tb[TCA_GRED_STAB-1]) 556 return -EINVAL; 557 558 return gred_change_table_def(sch, tb[TCA_GRED_DPS-1]); 559} 560 561static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) 562{ 563 struct gred_sched *table = qdisc_priv(sch); 564 struct rtattr *parms, *opts = NULL; 565 int i; 566 struct tc_gred_sopt sopt = { 567 .DPs = table->DPs, 568 .def_DP = table->def, 569 .grio = gred_rio_mode(table), 570 }; 571 572 opts = RTA_NEST(skb, TCA_OPTIONS); 573 RTA_PUT(skb, TCA_GRED_DPS, sizeof(sopt), &sopt); 574 parms = RTA_NEST(skb, TCA_GRED_PARMS); 575 576 for (i = 0; i < MAX_DPs; i++) { 577 struct gred_sched_data *q = table->tab[i]; 578 struct tc_gred_qopt opt; 579 580 memset(&opt, 0, sizeof(opt)); 581 582 if (!q) { 583 /* hack -- fix at some point with proper message 584 This is how we indicate to tc that there is no VQ 585 at this DP */ 586 587 opt.DP = MAX_DPs + i; 588 goto append_opt; 589 } 590 591 opt.limit = q->limit; 592 opt.DP = q->DP; 593 opt.backlog = q->backlog; 594 opt.prio = q->prio; 595 opt.qth_min = q->qth_min >> q->Wlog; 596 opt.qth_max = q->qth_max >> q->Wlog; 597 opt.Wlog = q->Wlog; 598 opt.Plog = q->Plog; 599 opt.Scell_log = q->Scell_log; 600 opt.other = q->other; 601 opt.early = q->early; 602 opt.forced = q->forced; 603 opt.pdrop = q->pdrop; 604 opt.packets = q->packetsin; 605 opt.bytesin = q->bytesin; 606 607 if (q->qave) { 608 if (gred_wred_mode(table)) { 609 q->qidlestart=table->tab[table->def]->qidlestart; 610 q->qave=table->tab[table->def]->qave; 611 } 612 if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) { 613 long idle; 614 unsigned long qave; 615 psched_time_t now; 616 PSCHED_GET_TIME(now); 617 idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max); 618 qave = q->qave >> q->Stab[(idle>>q->Scell_log)&0xFF]; 619 opt.qave = qave >> q->Wlog; 620 621 } else { 622 opt.qave = q->qave >> q->Wlog; 623 } 624 } 625 626append_opt: 627 RTA_APPEND(skb, sizeof(opt), &opt); 628 } 629 630 RTA_NEST_END(skb, parms); 631 632 return RTA_NEST_END(skb, opts); 633 634rtattr_failure: 635 return RTA_NEST_CANCEL(skb, opts); 636} 637 638static void gred_destroy(struct Qdisc *sch) 639{ 640 struct gred_sched *table = qdisc_priv(sch); 641 int i; 642 643 for (i = 0;i < table->DPs; i++) { 644 if (table->tab[i]) 645 gred_destroy_vq(table->tab[i]); 646 } 647} 648 649static struct Qdisc_ops gred_qdisc_ops = { 650 .next = NULL, 651 .cl_ops = NULL, 652 .id = "gred", 653 .priv_size = sizeof(struct gred_sched), 654 .enqueue = gred_enqueue, 655 .dequeue = gred_dequeue, 656 .requeue = gred_requeue, 657 .drop = gred_drop, 658 .init = gred_init, 659 .reset = gred_reset, 660 .destroy = gred_destroy, 661 .change = gred_change, 662 .dump = gred_dump, 663 .owner = THIS_MODULE, 664}; 665 666static int __init gred_module_init(void) 667{ 668 return register_qdisc(&gred_qdisc_ops); 669} 670static void __exit gred_module_exit(void) 671{ 672 unregister_qdisc(&gred_qdisc_ops); 673} 674module_init(gred_module_init) 675module_exit(gred_module_exit) 676MODULE_LICENSE("GPL"); 677