sch_gred.c revision a8aaa9958eea2420e13d5a00c3fae934e0a3889e
1/* 2 * net/sched/sch_gred.c Generic Random Early Detection queue. 3 * 4 * 5 * This program is free software; you can redistribute it and/or 6 * modify it under the terms of the GNU General Public License 7 * as published by the Free Software Foundation; either version 8 * 2 of the License, or (at your option) any later version. 9 * 10 * Authors: J Hadi Salim (hadi@cyberus.ca) 1998-2002 11 * 12 * 991129: - Bug fix with grio mode 13 * - a better sing. AvgQ mode with Grio(WRED) 14 * - A finer grained VQ dequeue based on sugestion 15 * from Ren Liu 16 * - More error checks 17 * 18 * 19 * 20 * For all the glorious comments look at Alexey's sch_red.c 21 */ 22 23#include <linux/config.h> 24#include <linux/module.h> 25#include <asm/uaccess.h> 26#include <asm/system.h> 27#include <linux/bitops.h> 28#include <linux/types.h> 29#include <linux/kernel.h> 30#include <linux/sched.h> 31#include <linux/string.h> 32#include <linux/mm.h> 33#include <linux/socket.h> 34#include <linux/sockios.h> 35#include <linux/in.h> 36#include <linux/errno.h> 37#include <linux/interrupt.h> 38#include <linux/if_ether.h> 39#include <linux/inet.h> 40#include <linux/netdevice.h> 41#include <linux/etherdevice.h> 42#include <linux/notifier.h> 43#include <net/ip.h> 44#include <net/route.h> 45#include <linux/skbuff.h> 46#include <net/sock.h> 47#include <net/pkt_sched.h> 48 49#if 1 /* control */ 50#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) 51#else 52#define DPRINTK(format,args...) 53#endif 54 55#if 0 /* data */ 56#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args) 57#else 58#define D2PRINTK(format,args...) 59#endif 60 61struct gred_sched_data; 62struct gred_sched; 63 64struct gred_sched_data 65{ 66/* Parameters */ 67 u32 limit; /* HARD maximal queue length */ 68 u32 qth_min; /* Min average length threshold: A scaled */ 69 u32 qth_max; /* Max average length threshold: A scaled */ 70 u32 DP; /* the drop pramaters */ 71 char Wlog; /* log(W) */ 72 char Plog; /* random number bits */ 73 u32 Scell_max; 74 u32 Rmask; 75 u32 bytesin; /* bytes seen on virtualQ so far*/ 76 u32 packetsin; /* packets seen on virtualQ so far*/ 77 u32 backlog; /* bytes on the virtualQ */ 78 u32 forced; /* packets dropped for exceeding limits */ 79 u32 early; /* packets dropped as a warning */ 80 u32 other; /* packets dropped by invoking drop() */ 81 u32 pdrop; /* packets dropped because we exceeded physical queue limits */ 82 char Scell_log; 83 u8 Stab[256]; 84 u8 prio; /* the prio of this vq */ 85 86/* Variables */ 87 unsigned long qave; /* Average queue length: A scaled */ 88 int qcount; /* Packets since last random number generation */ 89 u32 qR; /* Cached random number */ 90 91 psched_time_t qidlestart; /* Start of idle period */ 92}; 93 94enum { 95 GRED_WRED_MODE = 1, 96 GRED_RIO_MODE, 97}; 98 99struct gred_sched 100{ 101 struct gred_sched_data *tab[MAX_DPs]; 102 unsigned long flags; 103 u32 DPs; 104 u32 def; 105 u8 initd; 106}; 107 108static inline int gred_wred_mode(struct gred_sched *table) 109{ 110 return test_bit(GRED_WRED_MODE, &table->flags); 111} 112 113static inline void gred_enable_wred_mode(struct gred_sched *table) 114{ 115 __set_bit(GRED_WRED_MODE, &table->flags); 116} 117 118static inline void gred_disable_wred_mode(struct gred_sched *table) 119{ 120 __clear_bit(GRED_WRED_MODE, &table->flags); 121} 122 123static inline int gred_rio_mode(struct gred_sched *table) 124{ 125 return test_bit(GRED_RIO_MODE, &table->flags); 126} 127 128static inline void gred_enable_rio_mode(struct gred_sched *table) 129{ 130 __set_bit(GRED_RIO_MODE, &table->flags); 131} 132 133static inline void gred_disable_rio_mode(struct gred_sched *table) 134{ 135 __clear_bit(GRED_RIO_MODE, &table->flags); 136} 137 138static inline int gred_wred_mode_check(struct Qdisc *sch) 139{ 140 struct gred_sched *table = qdisc_priv(sch); 141 int i; 142 143 /* Really ugly O(n^2) but shouldn't be necessary too frequent. */ 144 for (i = 0; i < table->DPs; i++) { 145 struct gred_sched_data *q = table->tab[i]; 146 int n; 147 148 if (q == NULL) 149 continue; 150 151 for (n = 0; n < table->DPs; n++) 152 if (table->tab[n] && table->tab[n] != q && 153 table->tab[n]->prio == q->prio) 154 return 1; 155 } 156 157 return 0; 158} 159 160static int 161gred_enqueue(struct sk_buff *skb, struct Qdisc* sch) 162{ 163 psched_time_t now; 164 struct gred_sched_data *q=NULL; 165 struct gred_sched *t= qdisc_priv(sch); 166 unsigned long qave=0; 167 int i=0; 168 169 if (!t->initd && skb_queue_len(&sch->q) < (sch->dev->tx_queue_len ? : 1)) { 170 D2PRINTK("NO GRED Queues setup yet! Enqueued anyway\n"); 171 goto do_enqueue; 172 } 173 174 175 if ( ((skb->tc_index&0xf) > (t->DPs -1)) || !(q=t->tab[skb->tc_index&0xf])) { 176 printk("GRED: setting to default (%d)\n ",t->def); 177 if (!(q=t->tab[t->def])) { 178 DPRINTK("GRED: setting to default FAILED! dropping!! " 179 "(%d)\n ", t->def); 180 goto drop; 181 } 182 /* fix tc_index? --could be controvesial but needed for 183 requeueing */ 184 skb->tc_index=(skb->tc_index&0xfffffff0) | t->def; 185 } 186 187 D2PRINTK("gred_enqueue virtualQ 0x%x classid %x backlog %d " 188 "general backlog %d\n",skb->tc_index&0xf,sch->handle,q->backlog, 189 sch->qstats.backlog); 190 /* sum up all the qaves of prios <= to ours to get the new qave*/ 191 if (!gred_wred_mode(t) && gred_rio_mode(t)) { 192 for (i=0;i<t->DPs;i++) { 193 if ((!t->tab[i]) || (i==q->DP)) 194 continue; 195 196 if ((t->tab[i]->prio < q->prio) && (PSCHED_IS_PASTPERFECT(t->tab[i]->qidlestart))) 197 qave +=t->tab[i]->qave; 198 } 199 200 } 201 202 q->packetsin++; 203 q->bytesin+=skb->len; 204 205 if (gred_wred_mode(t)) { 206 qave=0; 207 q->qave=t->tab[t->def]->qave; 208 q->qidlestart=t->tab[t->def]->qidlestart; 209 } 210 211 if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) { 212 long us_idle; 213 PSCHED_GET_TIME(now); 214 us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max); 215 PSCHED_SET_PASTPERFECT(q->qidlestart); 216 217 q->qave >>= q->Stab[(us_idle>>q->Scell_log)&0xFF]; 218 } else { 219 if (gred_wred_mode(t)) { 220 q->qave += sch->qstats.backlog - (q->qave >> q->Wlog); 221 } else { 222 q->qave += q->backlog - (q->qave >> q->Wlog); 223 } 224 225 } 226 227 228 if (gred_wred_mode(t)) 229 t->tab[t->def]->qave=q->qave; 230 231 if ((q->qave+qave) < q->qth_min) { 232 q->qcount = -1; 233enqueue: 234 if (q->backlog + skb->len <= q->limit) { 235 q->backlog += skb->len; 236do_enqueue: 237 __skb_queue_tail(&sch->q, skb); 238 sch->qstats.backlog += skb->len; 239 sch->bstats.bytes += skb->len; 240 sch->bstats.packets++; 241 return 0; 242 } else { 243 q->pdrop++; 244 } 245 246drop: 247 kfree_skb(skb); 248 sch->qstats.drops++; 249 return NET_XMIT_DROP; 250 } 251 if ((q->qave+qave) >= q->qth_max) { 252 q->qcount = -1; 253 sch->qstats.overlimits++; 254 q->forced++; 255 goto drop; 256 } 257 if (++q->qcount) { 258 if ((((qave+q->qave) - q->qth_min)>>q->Wlog)*q->qcount < q->qR) 259 goto enqueue; 260 q->qcount = 0; 261 q->qR = net_random()&q->Rmask; 262 sch->qstats.overlimits++; 263 q->early++; 264 goto drop; 265 } 266 q->qR = net_random()&q->Rmask; 267 goto enqueue; 268} 269 270static int 271gred_requeue(struct sk_buff *skb, struct Qdisc* sch) 272{ 273 struct gred_sched_data *q; 274 struct gred_sched *t= qdisc_priv(sch); 275 q= t->tab[(skb->tc_index&0xf)]; 276/* error checking here -- probably unnecessary */ 277 PSCHED_SET_PASTPERFECT(q->qidlestart); 278 279 __skb_queue_head(&sch->q, skb); 280 sch->qstats.backlog += skb->len; 281 sch->qstats.requeues++; 282 q->backlog += skb->len; 283 return 0; 284} 285 286static struct sk_buff * 287gred_dequeue(struct Qdisc* sch) 288{ 289 struct sk_buff *skb; 290 struct gred_sched_data *q; 291 struct gred_sched *t= qdisc_priv(sch); 292 293 skb = __skb_dequeue(&sch->q); 294 if (skb) { 295 sch->qstats.backlog -= skb->len; 296 q= t->tab[(skb->tc_index&0xf)]; 297 if (q) { 298 q->backlog -= skb->len; 299 if (!q->backlog && !gred_wred_mode(t)) 300 PSCHED_GET_TIME(q->qidlestart); 301 } else { 302 D2PRINTK("gred_dequeue: skb has bad tcindex %x\n",skb->tc_index&0xf); 303 } 304 return skb; 305 } 306 307 if (gred_wred_mode(t)) { 308 q= t->tab[t->def]; 309 if (!q) 310 D2PRINTK("no default VQ set: Results will be " 311 "screwed up\n"); 312 else 313 PSCHED_GET_TIME(q->qidlestart); 314 } 315 316 return NULL; 317} 318 319static unsigned int gred_drop(struct Qdisc* sch) 320{ 321 struct sk_buff *skb; 322 323 struct gred_sched_data *q; 324 struct gred_sched *t= qdisc_priv(sch); 325 326 skb = __skb_dequeue_tail(&sch->q); 327 if (skb) { 328 unsigned int len = skb->len; 329 sch->qstats.backlog -= len; 330 sch->qstats.drops++; 331 q= t->tab[(skb->tc_index&0xf)]; 332 if (q) { 333 q->backlog -= len; 334 q->other++; 335 if (!q->backlog && !gred_wred_mode(t)) 336 PSCHED_GET_TIME(q->qidlestart); 337 } else { 338 D2PRINTK("gred_dequeue: skb has bad tcindex %x\n",skb->tc_index&0xf); 339 } 340 341 kfree_skb(skb); 342 return len; 343 } 344 345 q=t->tab[t->def]; 346 if (!q) { 347 D2PRINTK("no default VQ set: Results might be screwed up\n"); 348 return 0; 349 } 350 351 PSCHED_GET_TIME(q->qidlestart); 352 return 0; 353 354} 355 356static void gred_reset(struct Qdisc* sch) 357{ 358 int i; 359 struct gred_sched_data *q; 360 struct gred_sched *t= qdisc_priv(sch); 361 362 __skb_queue_purge(&sch->q); 363 364 sch->qstats.backlog = 0; 365 366 for (i=0;i<t->DPs;i++) { 367 q= t->tab[i]; 368 if (!q) 369 continue; 370 PSCHED_SET_PASTPERFECT(q->qidlestart); 371 q->qave = 0; 372 q->qcount = -1; 373 q->backlog = 0; 374 q->other=0; 375 q->forced=0; 376 q->pdrop=0; 377 q->early=0; 378 } 379} 380 381static inline void gred_destroy_vq(struct gred_sched_data *q) 382{ 383 kfree(q); 384} 385 386static inline int gred_change_table_def(struct Qdisc *sch, struct rtattr *dps) 387{ 388 struct gred_sched *table = qdisc_priv(sch); 389 struct tc_gred_sopt *sopt; 390 int i; 391 392 if (dps == NULL || RTA_PAYLOAD(dps) < sizeof(*sopt)) 393 return -EINVAL; 394 395 sopt = RTA_DATA(dps); 396 397 if (sopt->DPs > MAX_DPs || sopt->DPs == 0 || sopt->def_DP >= sopt->DPs) 398 return -EINVAL; 399 400 sch_tree_lock(sch); 401 table->DPs = sopt->DPs; 402 table->def = sopt->def_DP; 403 404 /* 405 * Every entry point to GRED is synchronized with the above code 406 * and the DP is checked against DPs, i.e. shadowed VQs can no 407 * longer be found so we can unlock right here. 408 */ 409 sch_tree_unlock(sch); 410 411 if (sopt->grio) { 412 gred_enable_rio_mode(table); 413 gred_disable_wred_mode(table); 414 if (gred_wred_mode_check(sch)) 415 gred_enable_wred_mode(table); 416 } else { 417 gred_disable_rio_mode(table); 418 gred_disable_wred_mode(table); 419 } 420 421 for (i = table->DPs; i < MAX_DPs; i++) { 422 if (table->tab[i]) { 423 printk(KERN_WARNING "GRED: Warning: Destroying " 424 "shadowed VQ 0x%x\n", i); 425 gred_destroy_vq(table->tab[i]); 426 table->tab[i] = NULL; 427 } 428 } 429 430 table->initd = 0; 431 432 return 0; 433} 434 435static int gred_change(struct Qdisc *sch, struct rtattr *opt) 436{ 437 struct gred_sched *table = qdisc_priv(sch); 438 struct gred_sched_data *q; 439 struct tc_gred_qopt *ctl; 440 struct rtattr *tb[TCA_GRED_STAB]; 441 442 if (opt == NULL || rtattr_parse_nested(tb, TCA_GRED_STAB, opt)) 443 return -EINVAL; 444 445 if (tb[TCA_GRED_PARMS-1] == NULL && tb[TCA_GRED_STAB-1] == NULL) 446 return gred_change_table_def(sch, tb[TCA_GRED_DPS-1]); 447 448 if (!table->DPs || tb[TCA_GRED_PARMS-1] == 0 || tb[TCA_GRED_STAB-1] == 0 || 449 RTA_PAYLOAD(tb[TCA_GRED_PARMS-1]) < sizeof(*ctl) || 450 RTA_PAYLOAD(tb[TCA_GRED_STAB-1]) < 256) 451 return -EINVAL; 452 453 ctl = RTA_DATA(tb[TCA_GRED_PARMS-1]); 454 455 if (ctl->DP >= table->DPs) 456 return -EINVAL; 457 458 if (table->tab[ctl->DP] == NULL) { 459 table->tab[ctl->DP]=kmalloc(sizeof(struct gred_sched_data), 460 GFP_KERNEL); 461 if (NULL == table->tab[ctl->DP]) 462 return -ENOMEM; 463 memset(table->tab[ctl->DP], 0, (sizeof(struct gred_sched_data))); 464 } 465 q= table->tab[ctl->DP]; 466 467 if (gred_rio_mode(table)) { 468 if (ctl->prio <=0) { 469 if (table->def && table->tab[table->def]) { 470 DPRINTK("\nGRED: DP %u does not have a prio" 471 "setting default to %d\n",ctl->DP, 472 table->tab[table->def]->prio); 473 q->prio=table->tab[table->def]->prio; 474 } else { 475 DPRINTK("\nGRED: DP %u does not have a prio" 476 " setting default to 8\n",ctl->DP); 477 q->prio=8; 478 } 479 } else { 480 q->prio=ctl->prio; 481 } 482 } else { 483 q->prio=8; 484 } 485 486 487 q->DP=ctl->DP; 488 q->Wlog = ctl->Wlog; 489 q->Plog = ctl->Plog; 490 q->limit = ctl->limit; 491 q->Scell_log = ctl->Scell_log; 492 q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL; 493 q->Scell_max = (255<<q->Scell_log); 494 q->qth_min = ctl->qth_min<<ctl->Wlog; 495 q->qth_max = ctl->qth_max<<ctl->Wlog; 496 q->qave=0; 497 q->backlog=0; 498 q->qcount = -1; 499 q->other=0; 500 q->forced=0; 501 q->pdrop=0; 502 q->early=0; 503 504 PSCHED_SET_PASTPERFECT(q->qidlestart); 505 memcpy(q->Stab, RTA_DATA(tb[TCA_GRED_STAB-1]), 256); 506 507 if (gred_rio_mode(table)) { 508 gred_disable_wred_mode(table); 509 if (gred_wred_mode_check(sch)) 510 gred_enable_wred_mode(table); 511 } 512 513 if (!table->initd) { 514 table->initd=1; 515 /* 516 the first entry also goes into the default until 517 over-written 518 */ 519 520 if (table->tab[table->def] == NULL) { 521 table->tab[table->def]= 522 kmalloc(sizeof(struct gred_sched_data), GFP_KERNEL); 523 if (NULL == table->tab[table->def]) 524 return -ENOMEM; 525 526 memset(table->tab[table->def], 0, 527 (sizeof(struct gred_sched_data))); 528 } 529 q= table->tab[table->def]; 530 q->DP=table->def; 531 q->Wlog = ctl->Wlog; 532 q->Plog = ctl->Plog; 533 q->limit = ctl->limit; 534 q->Scell_log = ctl->Scell_log; 535 q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL; 536 q->Scell_max = (255<<q->Scell_log); 537 q->qth_min = ctl->qth_min<<ctl->Wlog; 538 q->qth_max = ctl->qth_max<<ctl->Wlog; 539 540 if (gred_rio_mode(table)) 541 q->prio=table->tab[ctl->DP]->prio; 542 else 543 q->prio=8; 544 545 q->qcount = -1; 546 PSCHED_SET_PASTPERFECT(q->qidlestart); 547 memcpy(q->Stab, RTA_DATA(tb[TCA_GRED_STAB-1]), 256); 548 } 549 return 0; 550 551} 552 553static int gred_init(struct Qdisc *sch, struct rtattr *opt) 554{ 555 struct rtattr *tb[TCA_GRED_MAX]; 556 557 if (opt == NULL || rtattr_parse_nested(tb, TCA_GRED_MAX, opt)) 558 return -EINVAL; 559 560 if (tb[TCA_GRED_PARMS-1] || tb[TCA_GRED_STAB-1]) 561 return -EINVAL; 562 563 return gred_change_table_def(sch, tb[TCA_GRED_DPS-1]); 564} 565 566static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) 567{ 568 struct gred_sched *table = qdisc_priv(sch); 569 struct rtattr *parms, *opts = NULL; 570 int i; 571 struct tc_gred_sopt sopt = { 572 .DPs = table->DPs, 573 .def_DP = table->def, 574 .grio = gred_rio_mode(table), 575 }; 576 577 opts = RTA_NEST(skb, TCA_OPTIONS); 578 RTA_PUT(skb, TCA_GRED_DPS, sizeof(sopt), &sopt); 579 parms = RTA_NEST(skb, TCA_GRED_PARMS); 580 581 for (i = 0; i < MAX_DPs; i++) { 582 struct gred_sched_data *q = table->tab[i]; 583 struct tc_gred_qopt opt; 584 585 memset(&opt, 0, sizeof(opt)); 586 587 if (!q) { 588 /* hack -- fix at some point with proper message 589 This is how we indicate to tc that there is no VQ 590 at this DP */ 591 592 opt.DP = MAX_DPs + i; 593 goto append_opt; 594 } 595 596 opt.limit = q->limit; 597 opt.DP = q->DP; 598 opt.backlog = q->backlog; 599 opt.prio = q->prio; 600 opt.qth_min = q->qth_min >> q->Wlog; 601 opt.qth_max = q->qth_max >> q->Wlog; 602 opt.Wlog = q->Wlog; 603 opt.Plog = q->Plog; 604 opt.Scell_log = q->Scell_log; 605 opt.other = q->other; 606 opt.early = q->early; 607 opt.forced = q->forced; 608 opt.pdrop = q->pdrop; 609 opt.packets = q->packetsin; 610 opt.bytesin = q->bytesin; 611 612 if (q->qave) { 613 if (gred_wred_mode(table)) { 614 q->qidlestart=table->tab[table->def]->qidlestart; 615 q->qave=table->tab[table->def]->qave; 616 } 617 if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) { 618 long idle; 619 unsigned long qave; 620 psched_time_t now; 621 PSCHED_GET_TIME(now); 622 idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max); 623 qave = q->qave >> q->Stab[(idle>>q->Scell_log)&0xFF]; 624 opt.qave = qave >> q->Wlog; 625 626 } else { 627 opt.qave = q->qave >> q->Wlog; 628 } 629 } 630 631append_opt: 632 RTA_APPEND(skb, sizeof(opt), &opt); 633 } 634 635 RTA_NEST_END(skb, parms); 636 637 return RTA_NEST_END(skb, opts); 638 639rtattr_failure: 640 return RTA_NEST_CANCEL(skb, opts); 641} 642 643static void gred_destroy(struct Qdisc *sch) 644{ 645 struct gred_sched *table = qdisc_priv(sch); 646 int i; 647 648 for (i = 0;i < table->DPs; i++) { 649 if (table->tab[i]) 650 gred_destroy_vq(table->tab[i]); 651 } 652} 653 654static struct Qdisc_ops gred_qdisc_ops = { 655 .next = NULL, 656 .cl_ops = NULL, 657 .id = "gred", 658 .priv_size = sizeof(struct gred_sched), 659 .enqueue = gred_enqueue, 660 .dequeue = gred_dequeue, 661 .requeue = gred_requeue, 662 .drop = gred_drop, 663 .init = gred_init, 664 .reset = gred_reset, 665 .destroy = gred_destroy, 666 .change = gred_change, 667 .dump = gred_dump, 668 .owner = THIS_MODULE, 669}; 670 671static int __init gred_module_init(void) 672{ 673 return register_qdisc(&gred_qdisc_ops); 674} 675static void __exit gred_module_exit(void) 676{ 677 unregister_qdisc(&gred_qdisc_ops); 678} 679module_init(gred_module_init) 680module_exit(gred_module_exit) 681MODULE_LICENSE("GPL"); 682