fib_trie.c revision 2f36895aa774cf4d1c3d68921e0209e796b66600
1/* 2 * This program is free software; you can redistribute it and/or 3 * modify it under the terms of the GNU General Public License 4 * as published by the Free Software Foundation; either version 5 * 2 of the License, or (at your option) any later version. 6 * 7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet 8 * & Swedish University of Agricultural Sciences. 9 * 10 * Jens Laas <jens.laas@data.slu.se> Swedish University of 11 * Agricultural Sciences. 12 * 13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet 14 * 15 * This work is based on the LPC-trie which is originally descibed in: 16 * 17 * An experimental study of compression methods for dynamic tries 18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002. 19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/ 20 * 21 * 22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson 23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999 24 * 25 * Version: $Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $ 26 * 27 * 28 * Code from fib_hash has been reused which includes the following header: 29 * 30 * 31 * INET An implementation of the TCP/IP protocol suite for the LINUX 32 * operating system. INET is implemented using the BSD Socket 33 * interface as the means of communication with the user level. 34 * 35 * IPv4 FIB: lookup engine and maintenance routines. 36 * 37 * 38 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> 39 * 40 * This program is free software; you can redistribute it and/or 41 * modify it under the terms of the GNU General Public License 42 * as published by the Free Software Foundation; either version 43 * 2 of the License, or (at your option) any later version. 44 */ 45 46#define VERSION "0.325" 47 48#include <linux/config.h> 49#include <asm/uaccess.h> 50#include <asm/system.h> 51#include <asm/bitops.h> 52#include <linux/types.h> 53#include <linux/kernel.h> 54#include <linux/sched.h> 55#include <linux/mm.h> 56#include <linux/string.h> 57#include <linux/socket.h> 58#include <linux/sockios.h> 59#include <linux/errno.h> 60#include <linux/in.h> 61#include <linux/inet.h> 62#include <linux/netdevice.h> 63#include <linux/if_arp.h> 64#include <linux/proc_fs.h> 65#include <linux/skbuff.h> 66#include <linux/netlink.h> 67#include <linux/init.h> 68#include <linux/list.h> 69#include <net/ip.h> 70#include <net/protocol.h> 71#include <net/route.h> 72#include <net/tcp.h> 73#include <net/sock.h> 74#include <net/ip_fib.h> 75#include "fib_lookup.h" 76 77#undef CONFIG_IP_FIB_TRIE_STATS 78#define MAX_CHILDS 16384 79 80#define EXTRACT(p, n, str) ((str)<<(p)>>(32-(n))) 81#define KEYLENGTH (8*sizeof(t_key)) 82#define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l)) 83#define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset)) 84 85static DEFINE_RWLOCK(fib_lock); 86 87typedef unsigned int t_key; 88 89#define T_TNODE 0 90#define T_LEAF 1 91#define NODE_TYPE_MASK 0x1UL 92#define NODE_PARENT(_node) \ 93((struct tnode *)((_node)->_parent & ~NODE_TYPE_MASK)) 94#define NODE_SET_PARENT(_node, _ptr) \ 95((_node)->_parent = (((unsigned long)(_ptr)) | \ 96 ((_node)->_parent & NODE_TYPE_MASK))) 97#define NODE_INIT_PARENT(_node, _type) \ 98((_node)->_parent = (_type)) 99#define NODE_TYPE(_node) \ 100((_node)->_parent & NODE_TYPE_MASK) 101 102#define IS_TNODE(n) (!(n->_parent & T_LEAF)) 103#define IS_LEAF(n) (n->_parent & T_LEAF) 104 105struct node { 106 t_key key; 107 unsigned long _parent; 108}; 109 110struct leaf { 111 t_key key; 112 unsigned long _parent; 113 struct hlist_head list; 114}; 115 116struct leaf_info { 117 struct hlist_node hlist; 118 int plen; 119 struct list_head falh; 120}; 121 122struct tnode { 123 t_key key; 124 unsigned long _parent; 125 unsigned short pos:5; /* 2log(KEYLENGTH) bits needed */ 126 unsigned short bits:5; /* 2log(KEYLENGTH) bits needed */ 127 unsigned short full_children; /* KEYLENGTH bits needed */ 128 unsigned short empty_children; /* KEYLENGTH bits needed */ 129 struct node *child[0]; 130}; 131 132#ifdef CONFIG_IP_FIB_TRIE_STATS 133struct trie_use_stats { 134 unsigned int gets; 135 unsigned int backtrack; 136 unsigned int semantic_match_passed; 137 unsigned int semantic_match_miss; 138 unsigned int null_node_hit; 139 unsigned int resize_node_skipped; 140}; 141#endif 142 143struct trie_stat { 144 unsigned int totdepth; 145 unsigned int maxdepth; 146 unsigned int tnodes; 147 unsigned int leaves; 148 unsigned int nullpointers; 149 unsigned int nodesizes[MAX_CHILDS]; 150}; 151 152struct trie { 153 struct node *trie; 154#ifdef CONFIG_IP_FIB_TRIE_STATS 155 struct trie_use_stats stats; 156#endif 157 int size; 158 unsigned int revision; 159}; 160 161static int trie_debug = 0; 162 163static int tnode_full(struct tnode *tn, struct node *n); 164static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n); 165static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull); 166static int tnode_child_length(struct tnode *tn); 167static struct node *resize(struct trie *t, struct tnode *tn); 168static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err); 169static struct tnode *halve(struct trie *t, struct tnode *tn, int *err); 170static void tnode_free(struct tnode *tn); 171static void trie_dump_seq(struct seq_file *seq, struct trie *t); 172extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio); 173extern int fib_detect_death(struct fib_info *fi, int order, 174 struct fib_info **last_resort, int *last_idx, int *dflt); 175 176extern void rtmsg_fib(int event, u32 key, struct fib_alias *fa, int z, int tb_id, 177 struct nlmsghdr *n, struct netlink_skb_parms *req); 178 179static kmem_cache_t *fn_alias_kmem; 180static struct trie *trie_local = NULL, *trie_main = NULL; 181 182static void trie_bug(char *err) 183{ 184 printk("Trie Bug: %s\n", err); 185 BUG(); 186} 187 188static inline struct node *tnode_get_child(struct tnode *tn, int i) 189{ 190 if (i >= 1<<tn->bits) 191 trie_bug("tnode_get_child"); 192 193 return tn->child[i]; 194} 195 196static inline int tnode_child_length(struct tnode *tn) 197{ 198 return 1<<tn->bits; 199} 200 201/* 202 _________________________________________________________________ 203 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C | 204 ---------------------------------------------------------------- 205 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 206 207 _________________________________________________________________ 208 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u | 209 ----------------------------------------------------------------- 210 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 211 212 tp->pos = 7 213 tp->bits = 3 214 n->pos = 15 215 n->bits=4 216 KEYLENGTH=32 217*/ 218 219static inline t_key tkey_extract_bits(t_key a, int offset, int bits) 220{ 221 if (offset < KEYLENGTH) 222 return ((t_key)(a << offset)) >> (KEYLENGTH - bits); 223 else 224 return 0; 225} 226 227static inline int tkey_equals(t_key a, t_key b) 228{ 229 return a == b; 230} 231 232static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b) 233{ 234 if (bits == 0 || offset >= KEYLENGTH) 235 return 1; 236 bits = bits > KEYLENGTH ? KEYLENGTH : bits; 237 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0; 238} 239 240static inline int tkey_mismatch(t_key a, int offset, t_key b) 241{ 242 t_key diff = a ^ b; 243 int i = offset; 244 245 if(!diff) 246 return 0; 247 while((diff << i) >> (KEYLENGTH-1) == 0) 248 i++; 249 return i; 250} 251 252/* Candiate for fib_semantics */ 253 254static void fn_free_alias(struct fib_alias *fa) 255{ 256 fib_release_info(fa->fa_info); 257 kmem_cache_free(fn_alias_kmem, fa); 258} 259 260/* 261 To understand this stuff, an understanding of keys and all their bits is 262 necessary. Every node in the trie has a key associated with it, but not 263 all of the bits in that key are significant. 264 265 Consider a node 'n' and its parent 'tp'. 266 267 If n is a leaf, every bit in its key is significant. Its presence is 268 necessitaded by path compression, since during a tree traversal (when 269 searching for a leaf - unless we are doing an insertion) we will completely 270 ignore all skipped bits we encounter. Thus we need to verify, at the end of 271 a potentially successful search, that we have indeed been walking the 272 correct key path. 273 274 Note that we can never "miss" the correct key in the tree if present by 275 following the wrong path. Path compression ensures that segments of the key 276 that are the same for all keys with a given prefix are skipped, but the 277 skipped part *is* identical for each node in the subtrie below the skipped 278 bit! trie_insert() in this implementation takes care of that - note the 279 call to tkey_sub_equals() in trie_insert(). 280 281 if n is an internal node - a 'tnode' here, the various parts of its key 282 have many different meanings. 283 284 Example: 285 _________________________________________________________________ 286 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C | 287 ----------------------------------------------------------------- 288 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 289 290 _________________________________________________________________ 291 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u | 292 ----------------------------------------------------------------- 293 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 294 295 tp->pos = 7 296 tp->bits = 3 297 n->pos = 15 298 n->bits=4 299 300 First, let's just ignore the bits that come before the parent tp, that is 301 the bits from 0 to (tp->pos-1). They are *known* but at this point we do 302 not use them for anything. 303 304 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the 305 index into the parent's child array. That is, they will be used to find 306 'n' among tp's children. 307 308 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits 309 for the node n. 310 311 All the bits we have seen so far are significant to the node n. The rest 312 of the bits are really not needed or indeed known in n->key. 313 314 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into 315 n's child array, and will of course be different for each child. 316 317 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown 318 at this point. 319 320*/ 321 322static void check_tnode(struct tnode *tn) 323{ 324 if(tn && tn->pos+tn->bits > 32) { 325 printk("TNODE ERROR tn=%p, pos=%d, bits=%d\n", tn, tn->pos, tn->bits); 326 } 327} 328 329static int halve_threshold = 25; 330static int inflate_threshold = 50; 331 332static struct leaf *leaf_new(void) 333{ 334 struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL); 335 if(l) { 336 NODE_INIT_PARENT(l, T_LEAF); 337 INIT_HLIST_HEAD(&l->list); 338 } 339 return l; 340} 341 342static struct leaf_info *leaf_info_new(int plen) 343{ 344 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL); 345 if(li) { 346 li->plen = plen; 347 INIT_LIST_HEAD(&li->falh); 348 } 349 return li; 350} 351 352static inline void free_leaf(struct leaf *l) 353{ 354 kfree(l); 355} 356 357static inline void free_leaf_info(struct leaf_info *li) 358{ 359 kfree(li); 360} 361 362static struct tnode *tnode_alloc(unsigned int size) 363{ 364 if (size <= PAGE_SIZE) { 365 return kmalloc(size, GFP_KERNEL); 366 } else { 367 return (struct tnode *) 368 __get_free_pages(GFP_KERNEL, get_order(size)); 369 } 370} 371 372static void __tnode_free(struct tnode *tn) 373{ 374 unsigned int size = sizeof(struct tnode) + 375 (1<<tn->bits) * sizeof(struct node *); 376 377 if (size <= PAGE_SIZE) 378 kfree(tn); 379 else 380 free_pages((unsigned long)tn, get_order(size)); 381} 382 383static struct tnode* tnode_new(t_key key, int pos, int bits) 384{ 385 int nchildren = 1<<bits; 386 int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *); 387 struct tnode *tn = tnode_alloc(sz); 388 389 if(tn) { 390 memset(tn, 0, sz); 391 NODE_INIT_PARENT(tn, T_TNODE); 392 tn->pos = pos; 393 tn->bits = bits; 394 tn->key = key; 395 tn->full_children = 0; 396 tn->empty_children = 1<<bits; 397 } 398 if(trie_debug > 0) 399 printk("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode), 400 (unsigned int) (sizeof(struct node) * 1<<bits)); 401 return tn; 402} 403 404static void tnode_free(struct tnode *tn) 405{ 406 if(!tn) { 407 trie_bug("tnode_free\n"); 408 } 409 if(IS_LEAF(tn)) { 410 free_leaf((struct leaf *)tn); 411 if(trie_debug > 0 ) 412 printk("FL %p \n", tn); 413 } 414 else if(IS_TNODE(tn)) { 415 __tnode_free(tn); 416 if(trie_debug > 0 ) 417 printk("FT %p \n", tn); 418 } 419 else { 420 trie_bug("tnode_free\n"); 421 } 422} 423 424/* 425 * Check whether a tnode 'n' is "full", i.e. it is an internal node 426 * and no bits are skipped. See discussion in dyntree paper p. 6 427 */ 428 429static inline int tnode_full(struct tnode *tn, struct node *n) 430{ 431 if(n == NULL || IS_LEAF(n)) 432 return 0; 433 434 return ((struct tnode *) n)->pos == tn->pos + tn->bits; 435} 436 437static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n) 438{ 439 tnode_put_child_reorg(tn, i, n, -1); 440} 441 442 /* 443 * Add a child at position i overwriting the old value. 444 * Update the value of full_children and empty_children. 445 */ 446 447static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull) 448{ 449 struct node *chi; 450 int isfull; 451 452 if(i >= 1<<tn->bits) { 453 printk("bits=%d, i=%d\n", tn->bits, i); 454 trie_bug("tnode_put_child_reorg bits"); 455 } 456 write_lock_bh(&fib_lock); 457 chi = tn->child[i]; 458 459 /* update emptyChildren */ 460 if (n == NULL && chi != NULL) 461 tn->empty_children++; 462 else if (n != NULL && chi == NULL) 463 tn->empty_children--; 464 465 /* update fullChildren */ 466 if (wasfull == -1) 467 wasfull = tnode_full(tn, chi); 468 469 isfull = tnode_full(tn, n); 470 if (wasfull && !isfull) 471 tn->full_children--; 472 473 else if (!wasfull && isfull) 474 tn->full_children++; 475 if(n) 476 NODE_SET_PARENT(n, tn); 477 478 tn->child[i] = n; 479 write_unlock_bh(&fib_lock); 480} 481 482static struct node *resize(struct trie *t, struct tnode *tn) 483{ 484 int i; 485 int err = 0; 486 487 if (!tn) 488 return NULL; 489 490 if(trie_debug) 491 printk("In tnode_resize %p inflate_threshold=%d threshold=%d\n", 492 tn, inflate_threshold, halve_threshold); 493 494 /* No children */ 495 if (tn->empty_children == tnode_child_length(tn)) { 496 tnode_free(tn); 497 return NULL; 498 } 499 /* One child */ 500 if (tn->empty_children == tnode_child_length(tn) - 1) 501 for (i = 0; i < tnode_child_length(tn); i++) { 502 503 write_lock_bh(&fib_lock); 504 if (tn->child[i] != NULL) { 505 506 /* compress one level */ 507 struct node *n = tn->child[i]; 508 if(n) 509 NODE_INIT_PARENT(n, NODE_TYPE(n)); 510 511 write_unlock_bh(&fib_lock); 512 tnode_free(tn); 513 return n; 514 } 515 write_unlock_bh(&fib_lock); 516 } 517 /* 518 * Double as long as the resulting node has a number of 519 * nonempty nodes that are above the threshold. 520 */ 521 522 /* 523 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of 524 * the Helsinki University of Technology and Matti Tikkanen of Nokia 525 * Telecommunications, page 6: 526 * "A node is doubled if the ratio of non-empty children to all 527 * children in the *doubled* node is at least 'high'." 528 * 529 * 'high' in this instance is the variable 'inflate_threshold'. It 530 * is expressed as a percentage, so we multiply it with 531 * tnode_child_length() and instead of multiplying by 2 (since the 532 * child array will be doubled by inflate()) and multiplying 533 * the left-hand side by 100 (to handle the percentage thing) we 534 * multiply the left-hand side by 50. 535 * 536 * The left-hand side may look a bit weird: tnode_child_length(tn) 537 * - tn->empty_children is of course the number of non-null children 538 * in the current node. tn->full_children is the number of "full" 539 * children, that is non-null tnodes with a skip value of 0. 540 * All of those will be doubled in the resulting inflated tnode, so 541 * we just count them one extra time here. 542 * 543 * A clearer way to write this would be: 544 * 545 * to_be_doubled = tn->full_children; 546 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children - 547 * tn->full_children; 548 * 549 * new_child_length = tnode_child_length(tn) * 2; 550 * 551 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) / 552 * new_child_length; 553 * if (new_fill_factor >= inflate_threshold) 554 * 555 * ...and so on, tho it would mess up the while() loop. 556 * 557 * anyway, 558 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >= 559 * inflate_threshold 560 * 561 * avoid a division: 562 * 100 * (not_to_be_doubled + 2*to_be_doubled) >= 563 * inflate_threshold * new_child_length 564 * 565 * expand not_to_be_doubled and to_be_doubled, and shorten: 566 * 100 * (tnode_child_length(tn) - tn->empty_children + 567 * tn->full_children ) >= inflate_threshold * new_child_length 568 * 569 * expand new_child_length: 570 * 100 * (tnode_child_length(tn) - tn->empty_children + 571 * tn->full_children ) >= 572 * inflate_threshold * tnode_child_length(tn) * 2 573 * 574 * shorten again: 575 * 50 * (tn->full_children + tnode_child_length(tn) - 576 * tn->empty_children ) >= inflate_threshold * 577 * tnode_child_length(tn) 578 * 579 */ 580 581 check_tnode(tn); 582 583 err = 0; 584 while ((tn->full_children > 0 && 585 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >= 586 inflate_threshold * tnode_child_length(tn))) { 587 588 tn = inflate(t, tn, &err); 589 590 if(err) { 591#ifdef CONFIG_IP_FIB_TRIE_STATS 592 t->stats.resize_node_skipped++; 593#endif 594 break; 595 } 596 } 597 598 check_tnode(tn); 599 600 /* 601 * Halve as long as the number of empty children in this 602 * node is above threshold. 603 */ 604 605 err = 0; 606 while (tn->bits > 1 && 607 100 * (tnode_child_length(tn) - tn->empty_children) < 608 halve_threshold * tnode_child_length(tn)) { 609 610 tn = halve(t, tn, &err); 611 612 if(err) { 613#ifdef CONFIG_IP_FIB_TRIE_STATS 614 t->stats.resize_node_skipped++; 615#endif 616 break; 617 } 618 } 619 620 621 /* Only one child remains */ 622 623 if (tn->empty_children == tnode_child_length(tn) - 1) 624 for (i = 0; i < tnode_child_length(tn); i++) { 625 626 write_lock_bh(&fib_lock); 627 if (tn->child[i] != NULL) { 628 /* compress one level */ 629 struct node *n = tn->child[i]; 630 631 if(n) 632 NODE_INIT_PARENT(n, NODE_TYPE(n)); 633 634 write_unlock_bh(&fib_lock); 635 tnode_free(tn); 636 return n; 637 } 638 write_unlock_bh(&fib_lock); 639 } 640 641 return (struct node *) tn; 642} 643 644static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err) 645{ 646 struct tnode *inode; 647 struct tnode *oldtnode = tn; 648 int olen = tnode_child_length(tn); 649 int i; 650 651 if(trie_debug) 652 printk("In inflate\n"); 653 654 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1); 655 656 if (!tn) { 657 *err = -ENOMEM; 658 return oldtnode; 659 } 660 661 /* 662 * Preallocate and store tnodes before the actual work so we 663 * don't get into an inconsistent state if memory allocation 664 * fails. In case of failure we return the oldnode and inflate 665 * of tnode is ignored. 666 */ 667 668 for(i = 0; i < olen; i++) { 669 struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i); 670 671 if (inode && 672 IS_TNODE(inode) && 673 inode->pos == oldtnode->pos + oldtnode->bits && 674 inode->bits > 1) { 675 struct tnode *left, *right; 676 677 t_key m = TKEY_GET_MASK(inode->pos, 1); 678 679 left = tnode_new(inode->key&(~m), inode->pos + 1, 680 inode->bits - 1); 681 682 if(!left) { 683 *err = -ENOMEM; 684 break; 685 } 686 687 right = tnode_new(inode->key|m, inode->pos + 1, 688 inode->bits - 1); 689 690 if(!right) { 691 *err = -ENOMEM; 692 break; 693 } 694 695 put_child(t, tn, 2*i, (struct node *) left); 696 put_child(t, tn, 2*i+1, (struct node *) right); 697 } 698 } 699 700 if(*err) { 701 int size = tnode_child_length(tn); 702 int j; 703 704 for(j = 0; j < size; j++) 705 if( tn->child[j]) 706 tnode_free((struct tnode *)tn->child[j]); 707 708 tnode_free(tn); 709 710 *err = -ENOMEM; 711 return oldtnode; 712 } 713 714 for(i = 0; i < olen; i++) { 715 struct node *node = tnode_get_child(oldtnode, i); 716 717 /* An empty child */ 718 if (node == NULL) 719 continue; 720 721 /* A leaf or an internal node with skipped bits */ 722 723 if(IS_LEAF(node) || ((struct tnode *) node)->pos > 724 tn->pos + tn->bits - 1) { 725 if(tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits, 726 1) == 0) 727 put_child(t, tn, 2*i, node); 728 else 729 put_child(t, tn, 2*i+1, node); 730 continue; 731 } 732 733 /* An internal node with two children */ 734 inode = (struct tnode *) node; 735 736 if (inode->bits == 1) { 737 put_child(t, tn, 2*i, inode->child[0]); 738 put_child(t, tn, 2*i+1, inode->child[1]); 739 740 tnode_free(inode); 741 } 742 743 /* An internal node with more than two children */ 744 else { 745 struct tnode *left, *right; 746 int size, j; 747 748 /* We will replace this node 'inode' with two new 749 * ones, 'left' and 'right', each with half of the 750 * original children. The two new nodes will have 751 * a position one bit further down the key and this 752 * means that the "significant" part of their keys 753 * (see the discussion near the top of this file) 754 * will differ by one bit, which will be "0" in 755 * left's key and "1" in right's key. Since we are 756 * moving the key position by one step, the bit that 757 * we are moving away from - the bit at position 758 * (inode->pos) - is the one that will differ between 759 * left and right. So... we synthesize that bit in the 760 * two new keys. 761 * The mask 'm' below will be a single "one" bit at 762 * the position (inode->pos) 763 */ 764 765 /* Use the old key, but set the new significant 766 * bit to zero. 767 */ 768 769 left = (struct tnode *) tnode_get_child(tn, 2*i); 770 put_child(t, tn, 2*i, NULL); 771 772 if(!left) 773 BUG(); 774 775 right = (struct tnode *) tnode_get_child(tn, 2*i+1); 776 put_child(t, tn, 2*i+1, NULL); 777 778 if(!right) 779 BUG(); 780 781 size = tnode_child_length(left); 782 for(j = 0; j < size; j++) { 783 put_child(t, left, j, inode->child[j]); 784 put_child(t, right, j, inode->child[j + size]); 785 } 786 put_child(t, tn, 2*i, resize(t, left)); 787 put_child(t, tn, 2*i+1, resize(t, right)); 788 789 tnode_free(inode); 790 } 791 } 792 tnode_free(oldtnode); 793 return tn; 794} 795 796static struct tnode *halve(struct trie *t, struct tnode *tn, int *err) 797{ 798 struct tnode *oldtnode = tn; 799 struct node *left, *right; 800 int i; 801 int olen = tnode_child_length(tn); 802 803 if(trie_debug) printk("In halve\n"); 804 805 tn=tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1); 806 807 if (!tn) { 808 *err = -ENOMEM; 809 return oldtnode; 810 } 811 812 /* 813 * Preallocate and store tnodes before the actual work so we 814 * don't get into an inconsistent state if memory allocation 815 * fails. In case of failure we return the oldnode and halve 816 * of tnode is ignored. 817 */ 818 819 for(i = 0; i < olen; i += 2) { 820 left = tnode_get_child(oldtnode, i); 821 right = tnode_get_child(oldtnode, i+1); 822 823 /* Two nonempty children */ 824 if( left && right) { 825 struct tnode *newBinNode = 826 tnode_new(left->key, tn->pos + tn->bits, 1); 827 828 if(!newBinNode) { 829 *err = -ENOMEM; 830 break; 831 } 832 put_child(t, tn, i/2, (struct node *)newBinNode); 833 } 834 } 835 836 if(*err) { 837 int size = tnode_child_length(tn); 838 int j; 839 840 for(j = 0; j < size; j++) 841 if( tn->child[j]) 842 tnode_free((struct tnode *)tn->child[j]); 843 844 tnode_free(tn); 845 846 *err = -ENOMEM; 847 return oldtnode; 848 } 849 850 for(i = 0; i < olen; i += 2) { 851 left = tnode_get_child(oldtnode, i); 852 right = tnode_get_child(oldtnode, i+1); 853 854 /* At least one of the children is empty */ 855 if (left == NULL) { 856 if (right == NULL) /* Both are empty */ 857 continue; 858 put_child(t, tn, i/2, right); 859 } else if (right == NULL) 860 put_child(t, tn, i/2, left); 861 862 /* Two nonempty children */ 863 else { 864 struct tnode *newBinNode = 865 (struct tnode *) tnode_get_child(tn, i/2); 866 put_child(t, tn, i/2, NULL); 867 868 if(!newBinNode) 869 BUG(); 870 871 put_child(t, newBinNode, 0, left); 872 put_child(t, newBinNode, 1, right); 873 put_child(t, tn, i/2, resize(t, newBinNode)); 874 } 875 } 876 tnode_free(oldtnode); 877 return tn; 878} 879 880static void *trie_init(struct trie *t) 881{ 882 if(t) { 883 t->size = 0; 884 t->trie = NULL; 885 t->revision = 0; 886#ifdef CONFIG_IP_FIB_TRIE_STATS 887 memset(&t->stats, 0, sizeof(struct trie_use_stats)); 888#endif 889 } 890 return t; 891} 892 893static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen) 894{ 895 struct hlist_node *node; 896 struct leaf_info *li; 897 898 hlist_for_each_entry(li, node, head, hlist) { 899 900 if ( li->plen == plen ) 901 return li; 902 } 903 return NULL; 904} 905 906static inline struct list_head * get_fa_head(struct leaf *l, int plen) 907{ 908 struct list_head *fa_head=NULL; 909 struct leaf_info *li = find_leaf_info(&l->list, plen); 910 911 if(li) 912 fa_head = &li->falh; 913 914 return fa_head; 915} 916 917static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new) 918{ 919 struct leaf_info *li=NULL, *last=NULL; 920 struct hlist_node *node, *tmp; 921 922 write_lock_bh(&fib_lock); 923 924 if(hlist_empty(head)) 925 hlist_add_head(&new->hlist, head); 926 else { 927 hlist_for_each_entry_safe(li, node, tmp, head, hlist) { 928 929 if (new->plen > li->plen) 930 break; 931 932 last = li; 933 } 934 if(last) 935 hlist_add_after(&last->hlist, &new->hlist); 936 else 937 hlist_add_before(&new->hlist, &li->hlist); 938 } 939 write_unlock_bh(&fib_lock); 940} 941 942static struct leaf * 943fib_find_node(struct trie *t, u32 key) 944{ 945 int pos; 946 struct tnode *tn; 947 struct node *n; 948 949 pos = 0; 950 n=t->trie; 951 952 while (n != NULL && NODE_TYPE(n) == T_TNODE) { 953 tn = (struct tnode *) n; 954 955 check_tnode(tn); 956 957 if(tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) { 958 pos=tn->pos + tn->bits; 959 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits)); 960 } 961 else 962 break; 963 } 964 /* Case we have found a leaf. Compare prefixes */ 965 966 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) { 967 struct leaf *l = (struct leaf *) n; 968 return l; 969 } 970 return NULL; 971} 972 973static struct node *trie_rebalance(struct trie *t, struct tnode *tn) 974{ 975 int i = 0; 976 int wasfull; 977 t_key cindex, key; 978 struct tnode *tp = NULL; 979 980 if(!tn) 981 BUG(); 982 983 key = tn->key; 984 i = 0; 985 986 while (tn != NULL && NODE_PARENT(tn) != NULL) { 987 988 if( i > 10 ) { 989 printk("Rebalance tn=%p \n", tn); 990 if(tn) printk("tn->parent=%p \n", NODE_PARENT(tn)); 991 992 printk("Rebalance tp=%p \n", tp); 993 if(tp) printk("tp->parent=%p \n", NODE_PARENT(tp)); 994 } 995 996 if( i > 12 ) BUG(); 997 i++; 998 999 tp = NODE_PARENT(tn); 1000 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 1001 wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); 1002 tn = (struct tnode *) resize (t, (struct tnode *)tn); 1003 tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull); 1004 1005 if(!NODE_PARENT(tn)) 1006 break; 1007 1008 tn = NODE_PARENT(tn); 1009 } 1010 /* Handle last (top) tnode */ 1011 if (IS_TNODE(tn)) 1012 tn = (struct tnode*) resize(t, (struct tnode *)tn); 1013 1014 return (struct node*) tn; 1015} 1016 1017static struct list_head * 1018fib_insert_node(struct trie *t, int *err, u32 key, int plen) 1019{ 1020 int pos, newpos; 1021 struct tnode *tp = NULL, *tn = NULL; 1022 struct node *n; 1023 struct leaf *l; 1024 int missbit; 1025 struct list_head *fa_head=NULL; 1026 struct leaf_info *li; 1027 t_key cindex; 1028 1029 pos = 0; 1030 n=t->trie; 1031 1032 /* If we point to NULL, stop. Either the tree is empty and we should 1033 * just put a new leaf in if, or we have reached an empty child slot, 1034 * and we should just put our new leaf in that. 1035 * If we point to a T_TNODE, check if it matches our key. Note that 1036 * a T_TNODE might be skipping any number of bits - its 'pos' need 1037 * not be the parent's 'pos'+'bits'! 1038 * 1039 * If it does match the current key, get pos/bits from it, extract 1040 * the index from our key, push the T_TNODE and walk the tree. 1041 * 1042 * If it doesn't, we have to replace it with a new T_TNODE. 1043 * 1044 * If we point to a T_LEAF, it might or might not have the same key 1045 * as we do. If it does, just change the value, update the T_LEAF's 1046 * value, and return it. 1047 * If it doesn't, we need to replace it with a T_TNODE. 1048 */ 1049 1050 while (n != NULL && NODE_TYPE(n) == T_TNODE) { 1051 tn = (struct tnode *) n; 1052 1053 check_tnode(tn); 1054 1055 if(tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) { 1056 tp = tn; 1057 pos=tn->pos + tn->bits; 1058 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits)); 1059 1060 if(n && NODE_PARENT(n) != tn) { 1061 printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n)); 1062 BUG(); 1063 } 1064 } 1065 else 1066 break; 1067 } 1068 1069 /* 1070 * n ----> NULL, LEAF or TNODE 1071 * 1072 * tp is n's (parent) ----> NULL or TNODE 1073 */ 1074 1075 if(tp && IS_LEAF(tp)) 1076 BUG(); 1077 1078 1079 /* Case 1: n is a leaf. Compare prefixes */ 1080 1081 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) { 1082 struct leaf *l = ( struct leaf *) n; 1083 1084 li = leaf_info_new(plen); 1085 1086 if(! li) { 1087 *err = -ENOMEM; 1088 goto err; 1089 } 1090 1091 fa_head = &li->falh; 1092 insert_leaf_info(&l->list, li); 1093 goto done; 1094 } 1095 t->size++; 1096 l = leaf_new(); 1097 1098 if(! l) { 1099 *err = -ENOMEM; 1100 goto err; 1101 } 1102 1103 l->key = key; 1104 li = leaf_info_new(plen); 1105 1106 if(! li) { 1107 tnode_free((struct tnode *) l); 1108 *err = -ENOMEM; 1109 goto err; 1110 } 1111 1112 fa_head = &li->falh; 1113 insert_leaf_info(&l->list, li); 1114 1115 /* Case 2: n is NULL, and will just insert a new leaf */ 1116 if (t->trie && n == NULL) { 1117 1118 NODE_SET_PARENT(l, tp); 1119 1120 if (!tp) 1121 BUG(); 1122 1123 else { 1124 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 1125 put_child(t, (struct tnode *)tp, cindex, (struct node *)l); 1126 } 1127 } 1128 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */ 1129 else { 1130 /* 1131 * Add a new tnode here 1132 * first tnode need some special handling 1133 */ 1134 1135 if (tp) 1136 pos=tp->pos+tp->bits; 1137 else 1138 pos=0; 1139 if(n) { 1140 newpos = tkey_mismatch(key, pos, n->key); 1141 tn = tnode_new(n->key, newpos, 1); 1142 } 1143 else { 1144 newpos = 0; 1145 tn = tnode_new(key, newpos, 1); /* First tnode */ 1146 } 1147 1148 if(!tn) { 1149 free_leaf_info(li); 1150 tnode_free((struct tnode *) l); 1151 *err = -ENOMEM; 1152 goto err; 1153 } 1154 1155 NODE_SET_PARENT(tn, tp); 1156 1157 missbit=tkey_extract_bits(key, newpos, 1); 1158 put_child(t, tn, missbit, (struct node *)l); 1159 put_child(t, tn, 1-missbit, n); 1160 1161 if(tp) { 1162 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 1163 put_child(t, (struct tnode *)tp, cindex, (struct node *)tn); 1164 } 1165 else { 1166 t->trie = (struct node*) tn; /* First tnode */ 1167 tp = tn; 1168 } 1169 } 1170 if(tp && tp->pos+tp->bits > 32) { 1171 printk("ERROR tp=%p pos=%d, bits=%d, key=%0x plen=%d\n", 1172 tp, tp->pos, tp->bits, key, plen); 1173 } 1174 /* Rebalance the trie */ 1175 t->trie = trie_rebalance(t, tp); 1176done: 1177 t->revision++; 1178err:; 1179 return fa_head; 1180} 1181 1182static int 1183fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, 1184 struct nlmsghdr *nlhdr, struct netlink_skb_parms *req) 1185{ 1186 struct trie *t = (struct trie *) tb->tb_data; 1187 struct fib_alias *fa, *new_fa; 1188 struct list_head *fa_head=NULL; 1189 struct fib_info *fi; 1190 int plen = r->rtm_dst_len; 1191 int type = r->rtm_type; 1192 u8 tos = r->rtm_tos; 1193 u32 key, mask; 1194 int err; 1195 struct leaf *l; 1196 1197 if (plen > 32) 1198 return -EINVAL; 1199 1200 key = 0; 1201 if (rta->rta_dst) 1202 memcpy(&key, rta->rta_dst, 4); 1203 1204 key = ntohl(key); 1205 1206 if(trie_debug) 1207 printk("Insert table=%d %08x/%d\n", tb->tb_id, key, plen); 1208 1209 mask = ntohl( inet_make_mask(plen) ); 1210 1211 if(key & ~mask) 1212 return -EINVAL; 1213 1214 key = key & mask; 1215 1216 if ((fi = fib_create_info(r, rta, nlhdr, &err)) == NULL) 1217 goto err; 1218 1219 l = fib_find_node(t, key); 1220 fa = NULL; 1221 1222 if(l) { 1223 fa_head = get_fa_head(l, plen); 1224 fa = fib_find_alias(fa_head, tos, fi->fib_priority); 1225 } 1226 1227 /* Now fa, if non-NULL, points to the first fib alias 1228 * with the same keys [prefix,tos,priority], if such key already 1229 * exists or to the node before which we will insert new one. 1230 * 1231 * If fa is NULL, we will need to allocate a new one and 1232 * insert to the head of f. 1233 * 1234 * If f is NULL, no fib node matched the destination key 1235 * and we need to allocate a new one of those as well. 1236 */ 1237 1238 if (fa && 1239 fa->fa_info->fib_priority == fi->fib_priority) { 1240 struct fib_alias *fa_orig; 1241 1242 err = -EEXIST; 1243 if (nlhdr->nlmsg_flags & NLM_F_EXCL) 1244 goto out; 1245 1246 if (nlhdr->nlmsg_flags & NLM_F_REPLACE) { 1247 struct fib_info *fi_drop; 1248 u8 state; 1249 1250 write_lock_bh(&fib_lock); 1251 1252 fi_drop = fa->fa_info; 1253 fa->fa_info = fi; 1254 fa->fa_type = type; 1255 fa->fa_scope = r->rtm_scope; 1256 state = fa->fa_state; 1257 fa->fa_state &= ~FA_S_ACCESSED; 1258 1259 write_unlock_bh(&fib_lock); 1260 1261 fib_release_info(fi_drop); 1262 if (state & FA_S_ACCESSED) 1263 rt_cache_flush(-1); 1264 1265 goto succeeded; 1266 } 1267 /* Error if we find a perfect match which 1268 * uses the same scope, type, and nexthop 1269 * information. 1270 */ 1271 fa_orig = fa; 1272 list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) { 1273 if (fa->fa_tos != tos) 1274 break; 1275 if (fa->fa_info->fib_priority != fi->fib_priority) 1276 break; 1277 if (fa->fa_type == type && 1278 fa->fa_scope == r->rtm_scope && 1279 fa->fa_info == fi) { 1280 goto out; 1281 } 1282 } 1283 if (!(nlhdr->nlmsg_flags & NLM_F_APPEND)) 1284 fa = fa_orig; 1285 } 1286 err = -ENOENT; 1287 if (!(nlhdr->nlmsg_flags&NLM_F_CREATE)) 1288 goto out; 1289 1290 err = -ENOBUFS; 1291 new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL); 1292 if (new_fa == NULL) 1293 goto out; 1294 1295 new_fa->fa_info = fi; 1296 new_fa->fa_tos = tos; 1297 new_fa->fa_type = type; 1298 new_fa->fa_scope = r->rtm_scope; 1299 new_fa->fa_state = 0; 1300#if 0 1301 new_fa->dst = NULL; 1302#endif 1303 /* 1304 * Insert new entry to the list. 1305 */ 1306 1307 if(!fa_head) { 1308 fa_head = fib_insert_node(t, &err, key, plen); 1309 err = 0; 1310 if(err) 1311 goto out_free_new_fa; 1312 } 1313 1314 write_lock_bh(&fib_lock); 1315 1316 list_add_tail(&new_fa->fa_list, 1317 (fa ? &fa->fa_list : fa_head)); 1318 1319 write_unlock_bh(&fib_lock); 1320 1321 rt_cache_flush(-1); 1322 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req); 1323succeeded: 1324 return 0; 1325 1326out_free_new_fa: 1327 kmem_cache_free(fn_alias_kmem, new_fa); 1328out: 1329 fib_release_info(fi); 1330err:; 1331 return err; 1332} 1333 1334static inline int check_leaf(struct trie *t, struct leaf *l, t_key key, int *plen, const struct flowi *flp, 1335 struct fib_result *res, int *err) 1336{ 1337 int i; 1338 t_key mask; 1339 struct leaf_info *li; 1340 struct hlist_head *hhead = &l->list; 1341 struct hlist_node *node; 1342 1343 hlist_for_each_entry(li, node, hhead, hlist) { 1344 1345 i = li->plen; 1346 mask = ntohl(inet_make_mask(i)); 1347 if (l->key != (key & mask)) 1348 continue; 1349 1350 if (((*err) = fib_semantic_match(&li->falh, flp, res, l->key, mask, i)) == 0) { 1351 *plen = i; 1352#ifdef CONFIG_IP_FIB_TRIE_STATS 1353 t->stats.semantic_match_passed++; 1354#endif 1355 return 1; 1356 } 1357#ifdef CONFIG_IP_FIB_TRIE_STATS 1358 t->stats.semantic_match_miss++; 1359#endif 1360 } 1361 return 0; 1362} 1363 1364static int 1365fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res) 1366{ 1367 struct trie *t = (struct trie *) tb->tb_data; 1368 int plen, ret = 0; 1369 struct node *n; 1370 struct tnode *pn; 1371 int pos, bits; 1372 t_key key=ntohl(flp->fl4_dst); 1373 int chopped_off; 1374 t_key cindex = 0; 1375 int current_prefix_length = KEYLENGTH; 1376 n = t->trie; 1377 1378 read_lock(&fib_lock); 1379 if(!n) 1380 goto failed; 1381 1382#ifdef CONFIG_IP_FIB_TRIE_STATS 1383 t->stats.gets++; 1384#endif 1385 1386 /* Just a leaf? */ 1387 if (IS_LEAF(n)) { 1388 if( check_leaf(t, (struct leaf *)n, key, &plen, flp, res, &ret) ) 1389 goto found; 1390 goto failed; 1391 } 1392 pn = (struct tnode *) n; 1393 chopped_off = 0; 1394 1395 while (pn) { 1396 1397 pos = pn->pos; 1398 bits = pn->bits; 1399 1400 if(!chopped_off) 1401 cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits); 1402 1403 n = tnode_get_child(pn, cindex); 1404 1405 if (n == NULL) { 1406#ifdef CONFIG_IP_FIB_TRIE_STATS 1407 t->stats.null_node_hit++; 1408#endif 1409 goto backtrace; 1410 } 1411 1412 if (IS_TNODE(n)) { 1413#define HL_OPTIMIZE 1414#ifdef HL_OPTIMIZE 1415 struct tnode *cn = (struct tnode *)n; 1416 t_key node_prefix, key_prefix, pref_mismatch; 1417 int mp; 1418 1419 /* 1420 * It's a tnode, and we can do some extra checks here if we 1421 * like, to avoid descending into a dead-end branch. 1422 * This tnode is in the parent's child array at index 1423 * key[p_pos..p_pos+p_bits] but potentially with some bits 1424 * chopped off, so in reality the index may be just a 1425 * subprefix, padded with zero at the end. 1426 * We can also take a look at any skipped bits in this 1427 * tnode - everything up to p_pos is supposed to be ok, 1428 * and the non-chopped bits of the index (se previous 1429 * paragraph) are also guaranteed ok, but the rest is 1430 * considered unknown. 1431 * 1432 * The skipped bits are key[pos+bits..cn->pos]. 1433 */ 1434 1435 /* If current_prefix_length < pos+bits, we are already doing 1436 * actual prefix matching, which means everything from 1437 * pos+(bits-chopped_off) onward must be zero along some 1438 * branch of this subtree - otherwise there is *no* valid 1439 * prefix present. Here we can only check the skipped 1440 * bits. Remember, since we have already indexed into the 1441 * parent's child array, we know that the bits we chopped of 1442 * *are* zero. 1443 */ 1444 1445 /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */ 1446 1447 if (current_prefix_length < pos+bits) { 1448 if (tkey_extract_bits(cn->key, current_prefix_length, 1449 cn->pos - current_prefix_length) != 0 || 1450 !(cn->child[0])) 1451 goto backtrace; 1452 } 1453 1454 /* 1455 * If chopped_off=0, the index is fully validated and we 1456 * only need to look at the skipped bits for this, the new, 1457 * tnode. What we actually want to do is to find out if 1458 * these skipped bits match our key perfectly, or if we will 1459 * have to count on finding a matching prefix further down, 1460 * because if we do, we would like to have some way of 1461 * verifying the existence of such a prefix at this point. 1462 */ 1463 1464 /* The only thing we can do at this point is to verify that 1465 * any such matching prefix can indeed be a prefix to our 1466 * key, and if the bits in the node we are inspecting that 1467 * do not match our key are not ZERO, this cannot be true. 1468 * Thus, find out where there is a mismatch (before cn->pos) 1469 * and verify that all the mismatching bits are zero in the 1470 * new tnode's key. 1471 */ 1472 1473 /* Note: We aren't very concerned about the piece of the key 1474 * that precede pn->pos+pn->bits, since these have already been 1475 * checked. The bits after cn->pos aren't checked since these are 1476 * by definition "unknown" at this point. Thus, what we want to 1477 * see is if we are about to enter the "prefix matching" state, 1478 * and in that case verify that the skipped bits that will prevail 1479 * throughout this subtree are zero, as they have to be if we are 1480 * to find a matching prefix. 1481 */ 1482 1483 node_prefix = MASK_PFX(cn->key, cn->pos); 1484 key_prefix = MASK_PFX(key, cn->pos); 1485 pref_mismatch = key_prefix^node_prefix; 1486 mp = 0; 1487 1488 /* In short: If skipped bits in this node do not match the search 1489 * key, enter the "prefix matching" state.directly. 1490 */ 1491 if (pref_mismatch) { 1492 while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) { 1493 mp++; 1494 pref_mismatch = pref_mismatch <<1; 1495 } 1496 key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp); 1497 1498 if (key_prefix != 0) 1499 goto backtrace; 1500 1501 if (current_prefix_length >= cn->pos) 1502 current_prefix_length=mp; 1503 } 1504#endif 1505 pn = (struct tnode *)n; /* Descend */ 1506 chopped_off = 0; 1507 continue; 1508 } 1509 if (IS_LEAF(n)) { 1510 if( check_leaf(t, (struct leaf *)n, key, &plen, flp, res, &ret)) 1511 goto found; 1512 } 1513backtrace: 1514 chopped_off++; 1515 1516 /* As zero don't change the child key (cindex) */ 1517 while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1)))) { 1518 chopped_off++; 1519 } 1520 1521 /* Decrease current_... with bits chopped off */ 1522 if (current_prefix_length > pn->pos + pn->bits - chopped_off) 1523 current_prefix_length = pn->pos + pn->bits - chopped_off; 1524 1525 /* 1526 * Either we do the actual chop off according or if we have 1527 * chopped off all bits in this tnode walk up to our parent. 1528 */ 1529 1530 if(chopped_off <= pn->bits) 1531 cindex &= ~(1 << (chopped_off-1)); 1532 else { 1533 if( NODE_PARENT(pn) == NULL) 1534 goto failed; 1535 1536 /* Get Child's index */ 1537 cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits); 1538 pn = NODE_PARENT(pn); 1539 chopped_off = 0; 1540 1541#ifdef CONFIG_IP_FIB_TRIE_STATS 1542 t->stats.backtrack++; 1543#endif 1544 goto backtrace; 1545 } 1546 } 1547failed: 1548 ret = 1; 1549found: 1550 read_unlock(&fib_lock); 1551 return ret; 1552} 1553 1554static int trie_leaf_remove(struct trie *t, t_key key) 1555{ 1556 t_key cindex; 1557 struct tnode *tp = NULL; 1558 struct node *n = t->trie; 1559 struct leaf *l; 1560 1561 if(trie_debug) 1562 printk("entering trie_leaf_remove(%p)\n", n); 1563 1564 /* Note that in the case skipped bits, those bits are *not* checked! 1565 * When we finish this, we will have NULL or a T_LEAF, and the 1566 * T_LEAF may or may not match our key. 1567 */ 1568 1569 while (n != NULL && IS_TNODE(n)) { 1570 struct tnode *tn = (struct tnode *) n; 1571 check_tnode(tn); 1572 n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits)); 1573 1574 if(n && NODE_PARENT(n) != tn) { 1575 printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n)); 1576 BUG(); 1577 } 1578 } 1579 l = (struct leaf *) n; 1580 1581 if(!n || !tkey_equals(l->key, key)) 1582 return 0; 1583 1584 /* 1585 * Key found. 1586 * Remove the leaf and rebalance the tree 1587 */ 1588 1589 t->revision++; 1590 t->size--; 1591 1592 tp = NODE_PARENT(n); 1593 tnode_free((struct tnode *) n); 1594 1595 if(tp) { 1596 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 1597 put_child(t, (struct tnode *)tp, cindex, NULL); 1598 t->trie = trie_rebalance(t, tp); 1599 } 1600 else 1601 t->trie = NULL; 1602 1603 return 1; 1604} 1605 1606static int 1607fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, 1608 struct nlmsghdr *nlhdr, struct netlink_skb_parms *req) 1609{ 1610 struct trie *t = (struct trie *) tb->tb_data; 1611 u32 key, mask; 1612 int plen = r->rtm_dst_len; 1613 u8 tos = r->rtm_tos; 1614 struct fib_alias *fa, *fa_to_delete; 1615 struct list_head *fa_head; 1616 struct leaf *l; 1617 1618 if (plen > 32) 1619 return -EINVAL; 1620 1621 key = 0; 1622 if (rta->rta_dst) 1623 memcpy(&key, rta->rta_dst, 4); 1624 1625 key = ntohl(key); 1626 mask = ntohl( inet_make_mask(plen) ); 1627 1628 if(key & ~mask) 1629 return -EINVAL; 1630 1631 key = key & mask; 1632 l = fib_find_node(t, key); 1633 1634 if(!l) 1635 return -ESRCH; 1636 1637 fa_head = get_fa_head(l, plen); 1638 fa = fib_find_alias(fa_head, tos, 0); 1639 1640 if (!fa) 1641 return -ESRCH; 1642 1643 if (trie_debug) 1644 printk("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t); 1645 1646 fa_to_delete = NULL; 1647 fa_head = fa->fa_list.prev; 1648 list_for_each_entry(fa, fa_head, fa_list) { 1649 struct fib_info *fi = fa->fa_info; 1650 1651 if (fa->fa_tos != tos) 1652 break; 1653 1654 if ((!r->rtm_type || 1655 fa->fa_type == r->rtm_type) && 1656 (r->rtm_scope == RT_SCOPE_NOWHERE || 1657 fa->fa_scope == r->rtm_scope) && 1658 (!r->rtm_protocol || 1659 fi->fib_protocol == r->rtm_protocol) && 1660 fib_nh_match(r, nlhdr, rta, fi) == 0) { 1661 fa_to_delete = fa; 1662 break; 1663 } 1664 } 1665 1666 if (fa_to_delete) { 1667 int kill_li = 0; 1668 struct leaf_info *li; 1669 1670 fa = fa_to_delete; 1671 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req); 1672 1673 l = fib_find_node(t, key); 1674 li = find_leaf_info(&l->list, plen); 1675 1676 write_lock_bh(&fib_lock); 1677 1678 list_del(&fa->fa_list); 1679 1680 if(list_empty(fa_head)) { 1681 hlist_del(&li->hlist); 1682 kill_li = 1; 1683 } 1684 write_unlock_bh(&fib_lock); 1685 1686 if(kill_li) 1687 free_leaf_info(li); 1688 1689 if(hlist_empty(&l->list)) 1690 trie_leaf_remove(t, key); 1691 1692 if (fa->fa_state & FA_S_ACCESSED) 1693 rt_cache_flush(-1); 1694 1695 fn_free_alias(fa); 1696 return 0; 1697 } 1698 return -ESRCH; 1699} 1700 1701static int trie_flush_list(struct trie *t, struct list_head *head) 1702{ 1703 struct fib_alias *fa, *fa_node; 1704 int found = 0; 1705 1706 list_for_each_entry_safe(fa, fa_node, head, fa_list) { 1707 struct fib_info *fi = fa->fa_info; 1708 1709 if (fi && (fi->fib_flags&RTNH_F_DEAD)) { 1710 1711 write_lock_bh(&fib_lock); 1712 list_del(&fa->fa_list); 1713 write_unlock_bh(&fib_lock); 1714 1715 fn_free_alias(fa); 1716 found++; 1717 } 1718 } 1719 return found; 1720} 1721 1722static int trie_flush_leaf(struct trie *t, struct leaf *l) 1723{ 1724 int found = 0; 1725 struct hlist_head *lih = &l->list; 1726 struct hlist_node *node, *tmp; 1727 struct leaf_info *li = NULL; 1728 1729 hlist_for_each_entry_safe(li, node, tmp, lih, hlist) { 1730 1731 found += trie_flush_list(t, &li->falh); 1732 1733 if (list_empty(&li->falh)) { 1734 1735 write_lock_bh(&fib_lock); 1736 hlist_del(&li->hlist); 1737 write_unlock_bh(&fib_lock); 1738 1739 free_leaf_info(li); 1740 } 1741 } 1742 return found; 1743} 1744 1745static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf) 1746{ 1747 struct node *c = (struct node *) thisleaf; 1748 struct tnode *p; 1749 int idx; 1750 1751 if(c == NULL) { 1752 if(t->trie == NULL) 1753 return NULL; 1754 1755 if (IS_LEAF(t->trie)) /* trie w. just a leaf */ 1756 return (struct leaf *) t->trie; 1757 1758 p = (struct tnode*) t->trie; /* Start */ 1759 } 1760 else 1761 p = (struct tnode *) NODE_PARENT(c); 1762 while (p) { 1763 int pos, last; 1764 1765 /* Find the next child of the parent */ 1766 if(c) 1767 pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits); 1768 else 1769 pos = 0; 1770 1771 last = 1 << p->bits; 1772 for(idx = pos; idx < last ; idx++) { 1773 if( p->child[idx]) { 1774 1775 /* Decend if tnode */ 1776 1777 while (IS_TNODE(p->child[idx])) { 1778 p = (struct tnode*) p->child[idx]; 1779 idx = 0; 1780 1781 /* Rightmost non-NULL branch */ 1782 if( p && IS_TNODE(p) ) 1783 while ( p->child[idx] == NULL && idx < (1 << p->bits) ) idx++; 1784 1785 /* Done with this tnode? */ 1786 if( idx >= (1 << p->bits) || p->child[idx] == NULL ) 1787 goto up; 1788 } 1789 return (struct leaf*) p->child[idx]; 1790 } 1791 } 1792up: 1793 /* No more children go up one step */ 1794 c = (struct node*) p; 1795 p = (struct tnode *) NODE_PARENT(p); 1796 } 1797 return NULL; /* Ready. Root of trie */ 1798} 1799 1800static int fn_trie_flush(struct fib_table *tb) 1801{ 1802 struct trie *t = (struct trie *) tb->tb_data; 1803 struct leaf *ll = NULL, *l = NULL; 1804 int found = 0, h; 1805 1806 t->revision++; 1807 1808 for (h=0; (l = nextleaf(t, l)) != NULL; h++) { 1809 found += trie_flush_leaf(t, l); 1810 1811 if (ll && hlist_empty(&ll->list)) 1812 trie_leaf_remove(t, ll->key); 1813 ll = l; 1814 } 1815 1816 if (ll && hlist_empty(&ll->list)) 1817 trie_leaf_remove(t, ll->key); 1818 1819 if(trie_debug) 1820 printk("trie_flush found=%d\n", found); 1821 return found; 1822} 1823 1824static int trie_last_dflt=-1; 1825 1826static void 1827fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res) 1828{ 1829 struct trie *t = (struct trie *) tb->tb_data; 1830 int order, last_idx; 1831 struct fib_info *fi = NULL; 1832 struct fib_info *last_resort; 1833 struct fib_alias *fa = NULL; 1834 struct list_head *fa_head; 1835 struct leaf *l; 1836 1837 last_idx = -1; 1838 last_resort = NULL; 1839 order = -1; 1840 1841 read_lock(&fib_lock); 1842 1843 l = fib_find_node(t, 0); 1844 if(!l) 1845 goto out; 1846 1847 fa_head = get_fa_head(l, 0); 1848 if(!fa_head) 1849 goto out; 1850 1851 if (list_empty(fa_head)) 1852 goto out; 1853 1854 list_for_each_entry(fa, fa_head, fa_list) { 1855 struct fib_info *next_fi = fa->fa_info; 1856 1857 if (fa->fa_scope != res->scope || 1858 fa->fa_type != RTN_UNICAST) 1859 continue; 1860 1861 if (next_fi->fib_priority > res->fi->fib_priority) 1862 break; 1863 if (!next_fi->fib_nh[0].nh_gw || 1864 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) 1865 continue; 1866 fa->fa_state |= FA_S_ACCESSED; 1867 1868 if (fi == NULL) { 1869 if (next_fi != res->fi) 1870 break; 1871 } else if (!fib_detect_death(fi, order, &last_resort, 1872 &last_idx, &trie_last_dflt)) { 1873 if (res->fi) 1874 fib_info_put(res->fi); 1875 res->fi = fi; 1876 atomic_inc(&fi->fib_clntref); 1877 trie_last_dflt = order; 1878 goto out; 1879 } 1880 fi = next_fi; 1881 order++; 1882 } 1883 if (order <= 0 || fi == NULL) { 1884 trie_last_dflt = -1; 1885 goto out; 1886 } 1887 1888 if (!fib_detect_death(fi, order, &last_resort, &last_idx, &trie_last_dflt)) { 1889 if (res->fi) 1890 fib_info_put(res->fi); 1891 res->fi = fi; 1892 atomic_inc(&fi->fib_clntref); 1893 trie_last_dflt = order; 1894 goto out; 1895 } 1896 if (last_idx >= 0) { 1897 if (res->fi) 1898 fib_info_put(res->fi); 1899 res->fi = last_resort; 1900 if (last_resort) 1901 atomic_inc(&last_resort->fib_clntref); 1902 } 1903 trie_last_dflt = last_idx; 1904 out:; 1905 read_unlock(&fib_lock); 1906} 1907 1908static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb, 1909 struct sk_buff *skb, struct netlink_callback *cb) 1910{ 1911 int i, s_i; 1912 struct fib_alias *fa; 1913 1914 u32 xkey=htonl(key); 1915 1916 s_i=cb->args[3]; 1917 i = 0; 1918 1919 list_for_each_entry(fa, fah, fa_list) { 1920 if (i < s_i) { 1921 i++; 1922 continue; 1923 } 1924 if (fa->fa_info->fib_nh == NULL) { 1925 printk("Trie error _fib_nh=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen); 1926 i++; 1927 continue; 1928 } 1929 if (fa->fa_info == NULL) { 1930 printk("Trie error fa_info=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen); 1931 i++; 1932 continue; 1933 } 1934 1935 if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid, 1936 cb->nlh->nlmsg_seq, 1937 RTM_NEWROUTE, 1938 tb->tb_id, 1939 fa->fa_type, 1940 fa->fa_scope, 1941 &xkey, 1942 plen, 1943 fa->fa_tos, 1944 fa->fa_info, 0) < 0) { 1945 cb->args[3] = i; 1946 return -1; 1947 } 1948 i++; 1949 } 1950 cb->args[3]=i; 1951 return skb->len; 1952} 1953 1954static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb, 1955 struct netlink_callback *cb) 1956{ 1957 int h, s_h; 1958 struct list_head *fa_head; 1959 struct leaf *l = NULL; 1960 s_h=cb->args[2]; 1961 1962 for (h=0; (l = nextleaf(t, l)) != NULL; h++) { 1963 1964 if (h < s_h) 1965 continue; 1966 if (h > s_h) 1967 memset(&cb->args[3], 0, 1968 sizeof(cb->args) - 3*sizeof(cb->args[0])); 1969 1970 fa_head = get_fa_head(l, plen); 1971 1972 if(!fa_head) 1973 continue; 1974 1975 if(list_empty(fa_head)) 1976 continue; 1977 1978 if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) { 1979 cb->args[2]=h; 1980 return -1; 1981 } 1982 } 1983 cb->args[2]=h; 1984 return skb->len; 1985} 1986 1987static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb) 1988{ 1989 int m, s_m; 1990 struct trie *t = (struct trie *) tb->tb_data; 1991 1992 s_m = cb->args[1]; 1993 1994 read_lock(&fib_lock); 1995 for (m=0; m<=32; m++) { 1996 1997 if (m < s_m) 1998 continue; 1999 if (m > s_m) 2000 memset(&cb->args[2], 0, 2001 sizeof(cb->args) - 2*sizeof(cb->args[0])); 2002 2003 if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) { 2004 cb->args[1] = m; 2005 goto out; 2006 } 2007 } 2008 read_unlock(&fib_lock); 2009 cb->args[1] = m; 2010 return skb->len; 2011 out: 2012 read_unlock(&fib_lock); 2013 return -1; 2014} 2015 2016/* Fix more generic FIB names for init later */ 2017 2018#ifdef CONFIG_IP_MULTIPLE_TABLES 2019struct fib_table * fib_hash_init(int id) 2020#else 2021struct fib_table * __init fib_hash_init(int id) 2022#endif 2023{ 2024 struct fib_table *tb; 2025 struct trie *t; 2026 2027 if (fn_alias_kmem == NULL) 2028 fn_alias_kmem = kmem_cache_create("ip_fib_alias", 2029 sizeof(struct fib_alias), 2030 0, SLAB_HWCACHE_ALIGN, 2031 NULL, NULL); 2032 2033 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie), 2034 GFP_KERNEL); 2035 if (tb == NULL) 2036 return NULL; 2037 2038 tb->tb_id = id; 2039 tb->tb_lookup = fn_trie_lookup; 2040 tb->tb_insert = fn_trie_insert; 2041 tb->tb_delete = fn_trie_delete; 2042 tb->tb_flush = fn_trie_flush; 2043 tb->tb_select_default = fn_trie_select_default; 2044 tb->tb_dump = fn_trie_dump; 2045 memset(tb->tb_data, 0, sizeof(struct trie)); 2046 2047 t = (struct trie *) tb->tb_data; 2048 2049 trie_init(t); 2050 2051 if (id == RT_TABLE_LOCAL) 2052 trie_local=t; 2053 else if (id == RT_TABLE_MAIN) 2054 trie_main=t; 2055 2056 if (id == RT_TABLE_LOCAL) 2057 printk("IPv4 FIB: Using LC-trie version %s\n", VERSION); 2058 2059 return tb; 2060} 2061 2062/* Trie dump functions */ 2063 2064static void putspace_seq(struct seq_file *seq, int n) 2065{ 2066 while (n--) seq_printf(seq, " "); 2067} 2068 2069static void printbin_seq(struct seq_file *seq, unsigned int v, int bits) 2070{ 2071 while (bits--) 2072 seq_printf(seq, "%s", (v & (1<<bits))?"1":"0"); 2073} 2074 2075static void printnode_seq(struct seq_file *seq, int indent, struct node *n, 2076 int pend, int cindex, int bits) 2077{ 2078 putspace_seq(seq, indent); 2079 if (IS_LEAF(n)) 2080 seq_printf(seq, "|"); 2081 else 2082 seq_printf(seq, "+"); 2083 if (bits) { 2084 seq_printf(seq, "%d/", cindex); 2085 printbin_seq(seq, cindex, bits); 2086 seq_printf(seq, ": "); 2087 } 2088 else 2089 seq_printf(seq, "<root>: "); 2090 seq_printf(seq, "%s:%p ", IS_LEAF(n)?"Leaf":"Internal node", n); 2091 2092 if (IS_LEAF(n)) 2093 seq_printf(seq, "key=%d.%d.%d.%d\n", 2094 n->key >> 24, (n->key >> 16) % 256, (n->key >> 8) % 256, n->key % 256); 2095 else { 2096 int plen=((struct tnode *)n)->pos; 2097 t_key prf=MASK_PFX(n->key, plen); 2098 seq_printf(seq, "key=%d.%d.%d.%d/%d\n", 2099 prf >> 24, (prf >> 16) % 256, (prf >> 8) % 256, prf % 256, plen); 2100 } 2101 if (IS_LEAF(n)) { 2102 struct leaf *l=(struct leaf *)n; 2103 struct fib_alias *fa; 2104 int i; 2105 for (i=32; i>=0; i--) 2106 if(find_leaf_info(&l->list, i)) { 2107 2108 struct list_head *fa_head = get_fa_head(l, i); 2109 2110 if(!fa_head) 2111 continue; 2112 2113 if(list_empty(fa_head)) 2114 continue; 2115 2116 putspace_seq(seq, indent+2); 2117 seq_printf(seq, "{/%d...dumping}\n", i); 2118 2119 2120 list_for_each_entry(fa, fa_head, fa_list) { 2121 putspace_seq(seq, indent+2); 2122 if (fa->fa_info->fib_nh == NULL) { 2123 seq_printf(seq, "Error _fib_nh=NULL\n"); 2124 continue; 2125 } 2126 if (fa->fa_info == NULL) { 2127 seq_printf(seq, "Error fa_info=NULL\n"); 2128 continue; 2129 } 2130 2131 seq_printf(seq, "{type=%d scope=%d TOS=%d}\n", 2132 fa->fa_type, 2133 fa->fa_scope, 2134 fa->fa_tos); 2135 } 2136 } 2137 } 2138 else if (IS_TNODE(n)) { 2139 struct tnode *tn=(struct tnode *)n; 2140 putspace_seq(seq, indent); seq_printf(seq, "| "); 2141 seq_printf(seq, "{key prefix=%08x/", tn->key&TKEY_GET_MASK(0, tn->pos)); 2142 printbin_seq(seq, tkey_extract_bits(tn->key, 0, tn->pos), tn->pos); 2143 seq_printf(seq, "}\n"); 2144 putspace_seq(seq, indent); seq_printf(seq, "| "); 2145 seq_printf(seq, "{pos=%d", tn->pos); 2146 seq_printf(seq, " (skip=%d bits)", tn->pos - pend); 2147 seq_printf(seq, " bits=%d (%u children)}\n", tn->bits, (1 << tn->bits)); 2148 putspace_seq(seq, indent); seq_printf(seq, "| "); 2149 seq_printf(seq, "{empty=%d full=%d}\n", tn->empty_children, tn->full_children); 2150 } 2151} 2152 2153static void trie_dump_seq(struct seq_file *seq, struct trie *t) 2154{ 2155 struct node *n=t->trie; 2156 int cindex=0; 2157 int indent=1; 2158 int pend=0; 2159 int depth = 0; 2160 2161 read_lock(&fib_lock); 2162 2163 seq_printf(seq, "------ trie_dump of t=%p ------\n", t); 2164 if (n) { 2165 printnode_seq(seq, indent, n, pend, cindex, 0); 2166 if (IS_TNODE(n)) { 2167 struct tnode *tn=(struct tnode *)n; 2168 pend = tn->pos+tn->bits; 2169 putspace_seq(seq, indent); seq_printf(seq, "\\--\n"); 2170 indent += 3; 2171 depth++; 2172 2173 while (tn && cindex < (1 << tn->bits)) { 2174 if (tn->child[cindex]) { 2175 2176 /* Got a child */ 2177 2178 printnode_seq(seq, indent, tn->child[cindex], pend, cindex, tn->bits); 2179 if (IS_LEAF(tn->child[cindex])) { 2180 cindex++; 2181 2182 } 2183 else { 2184 /* 2185 * New tnode. Decend one level 2186 */ 2187 2188 depth++; 2189 n=tn->child[cindex]; 2190 tn=(struct tnode *)n; 2191 pend=tn->pos+tn->bits; 2192 putspace_seq(seq, indent); seq_printf(seq, "\\--\n"); 2193 indent+=3; 2194 cindex=0; 2195 } 2196 } 2197 else 2198 cindex++; 2199 2200 /* 2201 * Test if we are done 2202 */ 2203 2204 while (cindex >= (1 << tn->bits)) { 2205 2206 /* 2207 * Move upwards and test for root 2208 * pop off all traversed nodes 2209 */ 2210 2211 if (NODE_PARENT(tn) == NULL) { 2212 tn = NULL; 2213 n = NULL; 2214 break; 2215 } 2216 else { 2217 cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits); 2218 tn = NODE_PARENT(tn); 2219 cindex++; 2220 n=(struct node *)tn; 2221 pend=tn->pos+tn->bits; 2222 indent-=3; 2223 depth--; 2224 } 2225 } 2226 } 2227 } 2228 else n = NULL; 2229 } 2230 else seq_printf(seq, "------ trie is empty\n"); 2231 2232 read_unlock(&fib_lock); 2233} 2234 2235static struct trie_stat *trie_stat_new(void) 2236{ 2237 struct trie_stat *s = kmalloc(sizeof(struct trie_stat), GFP_KERNEL); 2238 int i; 2239 2240 if(s) { 2241 s->totdepth = 0; 2242 s->maxdepth = 0; 2243 s->tnodes = 0; 2244 s->leaves = 0; 2245 s->nullpointers = 0; 2246 2247 for(i=0; i< MAX_CHILDS; i++) 2248 s->nodesizes[i] = 0; 2249 } 2250 return s; 2251} 2252 2253static struct trie_stat *trie_collect_stats(struct trie *t) 2254{ 2255 struct node *n=t->trie; 2256 struct trie_stat *s = trie_stat_new(); 2257 int cindex = 0; 2258 int indent = 1; 2259 int pend = 0; 2260 int depth = 0; 2261 2262 read_lock(&fib_lock); 2263 2264 if (s) { 2265 if (n) { 2266 if (IS_TNODE(n)) { 2267 struct tnode *tn = (struct tnode *)n; 2268 pend=tn->pos+tn->bits; 2269 indent += 3; 2270 s->nodesizes[tn->bits]++; 2271 depth++; 2272 2273 while (tn && cindex < (1 << tn->bits)) { 2274 if (tn->child[cindex]) { 2275 /* Got a child */ 2276 2277 if (IS_LEAF(tn->child[cindex])) { 2278 cindex++; 2279 2280 /* stats */ 2281 if (depth > s->maxdepth) 2282 s->maxdepth = depth; 2283 s->totdepth += depth; 2284 s->leaves++; 2285 } 2286 2287 else { 2288 /* 2289 * New tnode. Decend one level 2290 */ 2291 2292 s->tnodes++; 2293 s->nodesizes[tn->bits]++; 2294 depth++; 2295 2296 n = tn->child[cindex]; 2297 tn = (struct tnode *)n; 2298 pend = tn->pos+tn->bits; 2299 2300 indent += 3; 2301 cindex = 0; 2302 } 2303 } 2304 else { 2305 cindex++; 2306 s->nullpointers++; 2307 } 2308 2309 /* 2310 * Test if we are done 2311 */ 2312 2313 while (cindex >= (1 << tn->bits)) { 2314 2315 /* 2316 * Move upwards and test for root 2317 * pop off all traversed nodes 2318 */ 2319 2320 2321 if (NODE_PARENT(tn) == NULL) { 2322 tn = NULL; 2323 n = NULL; 2324 break; 2325 } 2326 else { 2327 cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits); 2328 tn = NODE_PARENT(tn); 2329 cindex++; 2330 n = (struct node *)tn; 2331 pend=tn->pos+tn->bits; 2332 indent -= 3; 2333 depth--; 2334 } 2335 } 2336 } 2337 } 2338 else n = NULL; 2339 } 2340 } 2341 2342 read_unlock(&fib_lock); 2343 return s; 2344} 2345 2346#ifdef CONFIG_PROC_FS 2347 2348static struct fib_alias *fib_triestat_get_first(struct seq_file *seq) 2349{ 2350 return NULL; 2351} 2352 2353static struct fib_alias *fib_triestat_get_next(struct seq_file *seq) 2354{ 2355 return NULL; 2356} 2357 2358static void *fib_triestat_seq_start(struct seq_file *seq, loff_t *pos) 2359{ 2360 void *v = NULL; 2361 2362 if (ip_fib_main_table) 2363 v = *pos ? fib_triestat_get_next(seq) : SEQ_START_TOKEN; 2364 return v; 2365} 2366 2367static void *fib_triestat_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2368{ 2369 ++*pos; 2370 return v == SEQ_START_TOKEN ? fib_triestat_get_first(seq) : fib_triestat_get_next(seq); 2371} 2372 2373static void fib_triestat_seq_stop(struct seq_file *seq, void *v) 2374{ 2375 2376} 2377 2378/* 2379 * This outputs /proc/net/fib_triestats 2380 * 2381 * It always works in backward compatibility mode. 2382 * The format of the file is not supposed to be changed. 2383 */ 2384 2385static void collect_and_show(struct trie *t, struct seq_file *seq) 2386{ 2387 int bytes = 0; /* How many bytes are used, a ref is 4 bytes */ 2388 int i, max, pointers; 2389 struct trie_stat *stat; 2390 int avdepth; 2391 2392 stat = trie_collect_stats(t); 2393 2394 bytes=0; 2395 seq_printf(seq, "trie=%p\n", t); 2396 2397 if (stat) { 2398 if (stat->leaves) 2399 avdepth=stat->totdepth*100 / stat->leaves; 2400 else 2401 avdepth=0; 2402 seq_printf(seq, "Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100 ); 2403 seq_printf(seq, "Max depth: %4d\n", stat->maxdepth); 2404 2405 seq_printf(seq, "Leaves: %d\n", stat->leaves); 2406 bytes += sizeof(struct leaf) * stat->leaves; 2407 seq_printf(seq, "Internal nodes: %d\n", stat->tnodes); 2408 bytes += sizeof(struct tnode) * stat->tnodes; 2409 2410 max = MAX_CHILDS-1; 2411 2412 while (max >= 0 && stat->nodesizes[max] == 0) 2413 max--; 2414 pointers = 0; 2415 2416 for (i = 1; i <= max; i++) 2417 if (stat->nodesizes[i] != 0) { 2418 seq_printf(seq, " %d: %d", i, stat->nodesizes[i]); 2419 pointers += (1<<i) * stat->nodesizes[i]; 2420 } 2421 seq_printf(seq, "\n"); 2422 seq_printf(seq, "Pointers: %d\n", pointers); 2423 bytes += sizeof(struct node *) * pointers; 2424 seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers); 2425 seq_printf(seq, "Total size: %d kB\n", bytes / 1024); 2426 2427 kfree(stat); 2428 } 2429 2430#ifdef CONFIG_IP_FIB_TRIE_STATS 2431 seq_printf(seq, "Counters:\n---------\n"); 2432 seq_printf(seq,"gets = %d\n", t->stats.gets); 2433 seq_printf(seq,"backtracks = %d\n", t->stats.backtrack); 2434 seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed); 2435 seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss); 2436 seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit); 2437 seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped); 2438#ifdef CLEAR_STATS 2439 memset(&(t->stats), 0, sizeof(t->stats)); 2440#endif 2441#endif /* CONFIG_IP_FIB_TRIE_STATS */ 2442} 2443 2444static int fib_triestat_seq_show(struct seq_file *seq, void *v) 2445{ 2446 char bf[128]; 2447 2448 if (v == SEQ_START_TOKEN) { 2449 seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n", 2450 sizeof(struct leaf), sizeof(struct tnode)); 2451 if (trie_local) 2452 collect_and_show(trie_local, seq); 2453 2454 if (trie_main) 2455 collect_and_show(trie_main, seq); 2456 } 2457 else { 2458 snprintf(bf, sizeof(bf), 2459 "*\t%08X\t%08X", 200, 400); 2460 2461 seq_printf(seq, "%-127s\n", bf); 2462 } 2463 return 0; 2464} 2465 2466static struct seq_operations fib_triestat_seq_ops = { 2467 .start = fib_triestat_seq_start, 2468 .next = fib_triestat_seq_next, 2469 .stop = fib_triestat_seq_stop, 2470 .show = fib_triestat_seq_show, 2471}; 2472 2473static int fib_triestat_seq_open(struct inode *inode, struct file *file) 2474{ 2475 struct seq_file *seq; 2476 int rc = -ENOMEM; 2477 2478 rc = seq_open(file, &fib_triestat_seq_ops); 2479 if (rc) 2480 goto out_kfree; 2481 2482 seq = file->private_data; 2483out: 2484 return rc; 2485out_kfree: 2486 goto out; 2487} 2488 2489static struct file_operations fib_triestat_seq_fops = { 2490 .owner = THIS_MODULE, 2491 .open = fib_triestat_seq_open, 2492 .read = seq_read, 2493 .llseek = seq_lseek, 2494 .release = seq_release_private, 2495}; 2496 2497int __init fib_stat_proc_init(void) 2498{ 2499 if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_seq_fops)) 2500 return -ENOMEM; 2501 return 0; 2502} 2503 2504void __init fib_stat_proc_exit(void) 2505{ 2506 proc_net_remove("fib_triestat"); 2507} 2508 2509static struct fib_alias *fib_trie_get_first(struct seq_file *seq) 2510{ 2511 return NULL; 2512} 2513 2514static struct fib_alias *fib_trie_get_next(struct seq_file *seq) 2515{ 2516 return NULL; 2517} 2518 2519static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos) 2520{ 2521 void *v = NULL; 2522 2523 if (ip_fib_main_table) 2524 v = *pos ? fib_trie_get_next(seq) : SEQ_START_TOKEN; 2525 return v; 2526} 2527 2528static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2529{ 2530 ++*pos; 2531 return v == SEQ_START_TOKEN ? fib_trie_get_first(seq) : fib_trie_get_next(seq); 2532} 2533 2534static void fib_trie_seq_stop(struct seq_file *seq, void *v) 2535{ 2536 2537} 2538 2539/* 2540 * This outputs /proc/net/fib_trie. 2541 * 2542 * It always works in backward compatibility mode. 2543 * The format of the file is not supposed to be changed. 2544 */ 2545 2546static int fib_trie_seq_show(struct seq_file *seq, void *v) 2547{ 2548 char bf[128]; 2549 2550 if (v == SEQ_START_TOKEN) { 2551 if (trie_local) 2552 trie_dump_seq(seq, trie_local); 2553 2554 if (trie_main) 2555 trie_dump_seq(seq, trie_main); 2556 } 2557 2558 else { 2559 snprintf(bf, sizeof(bf), 2560 "*\t%08X\t%08X", 200, 400); 2561 seq_printf(seq, "%-127s\n", bf); 2562 } 2563 2564 return 0; 2565} 2566 2567static struct seq_operations fib_trie_seq_ops = { 2568 .start = fib_trie_seq_start, 2569 .next = fib_trie_seq_next, 2570 .stop = fib_trie_seq_stop, 2571 .show = fib_trie_seq_show, 2572}; 2573 2574static int fib_trie_seq_open(struct inode *inode, struct file *file) 2575{ 2576 struct seq_file *seq; 2577 int rc = -ENOMEM; 2578 2579 rc = seq_open(file, &fib_trie_seq_ops); 2580 if (rc) 2581 goto out_kfree; 2582 2583 seq = file->private_data; 2584out: 2585 return rc; 2586out_kfree: 2587 goto out; 2588} 2589 2590static struct file_operations fib_trie_seq_fops = { 2591 .owner = THIS_MODULE, 2592 .open = fib_trie_seq_open, 2593 .read = seq_read, 2594 .llseek = seq_lseek, 2595 .release = seq_release_private, 2596}; 2597 2598int __init fib_proc_init(void) 2599{ 2600 if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_seq_fops)) 2601 return -ENOMEM; 2602 return 0; 2603} 2604 2605void __init fib_proc_exit(void) 2606{ 2607 proc_net_remove("fib_trie"); 2608} 2609 2610#endif /* CONFIG_PROC_FS */ 2611