fib_trie.c revision 4dde4610c4ab54e9d36a4afaa98c23b017f7f9e3
1/* 2 * This program is free software; you can redistribute it and/or 3 * modify it under the terms of the GNU General Public License 4 * as published by the Free Software Foundation; either version 5 * 2 of the License, or (at your option) any later version. 6 * 7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet 8 * & Swedish University of Agricultural Sciences. 9 * 10 * Jens Laas <jens.laas@data.slu.se> Swedish University of 11 * Agricultural Sciences. 12 * 13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet 14 * 15 * This work is based on the LPC-trie which is originally descibed in: 16 * 17 * An experimental study of compression methods for dynamic tries 18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002. 19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/ 20 * 21 * 22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson 23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999 24 * 25 * Version: $Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $ 26 * 27 * 28 * Code from fib_hash has been reused which includes the following header: 29 * 30 * 31 * INET An implementation of the TCP/IP protocol suite for the LINUX 32 * operating system. INET is implemented using the BSD Socket 33 * interface as the means of communication with the user level. 34 * 35 * IPv4 FIB: lookup engine and maintenance routines. 36 * 37 * 38 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> 39 * 40 * This program is free software; you can redistribute it and/or 41 * modify it under the terms of the GNU General Public License 42 * as published by the Free Software Foundation; either version 43 * 2 of the License, or (at your option) any later version. 44 * 45 * Substantial contributions to this work comes from: 46 * 47 * David S. Miller, <davem@davemloft.net> 48 * Stephen Hemminger <shemminger@osdl.org> 49 * Paul E. McKenney <paulmck@us.ibm.com> 50 * Patrick McHardy <kaber@trash.net> 51 */ 52 53#define VERSION "0.408" 54 55#include <asm/uaccess.h> 56#include <asm/system.h> 57#include <linux/bitops.h> 58#include <linux/types.h> 59#include <linux/kernel.h> 60#include <linux/mm.h> 61#include <linux/string.h> 62#include <linux/socket.h> 63#include <linux/sockios.h> 64#include <linux/errno.h> 65#include <linux/in.h> 66#include <linux/inet.h> 67#include <linux/inetdevice.h> 68#include <linux/netdevice.h> 69#include <linux/if_arp.h> 70#include <linux/proc_fs.h> 71#include <linux/rcupdate.h> 72#include <linux/skbuff.h> 73#include <linux/netlink.h> 74#include <linux/init.h> 75#include <linux/list.h> 76#include <net/net_namespace.h> 77#include <net/ip.h> 78#include <net/protocol.h> 79#include <net/route.h> 80#include <net/tcp.h> 81#include <net/sock.h> 82#include <net/ip_fib.h> 83#include "fib_lookup.h" 84 85#define MAX_STAT_DEPTH 32 86 87#define KEYLENGTH (8*sizeof(t_key)) 88 89typedef unsigned int t_key; 90 91#define T_TNODE 0 92#define T_LEAF 1 93#define NODE_TYPE_MASK 0x1UL 94#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK) 95 96#define IS_TNODE(n) (!(n->parent & T_LEAF)) 97#define IS_LEAF(n) (n->parent & T_LEAF) 98 99struct node { 100 t_key key; 101 unsigned long parent; 102}; 103 104struct leaf { 105 t_key key; 106 unsigned long parent; 107 struct hlist_head list; 108 struct rcu_head rcu; 109}; 110 111struct leaf_info { 112 struct hlist_node hlist; 113 struct rcu_head rcu; 114 int plen; 115 struct list_head falh; 116}; 117 118struct tnode { 119 t_key key; 120 unsigned long parent; 121 unsigned char pos; /* 2log(KEYLENGTH) bits needed */ 122 unsigned char bits; /* 2log(KEYLENGTH) bits needed */ 123 unsigned short full_children; /* KEYLENGTH bits needed */ 124 unsigned short empty_children; /* KEYLENGTH bits needed */ 125 struct rcu_head rcu; 126 struct node *child[0]; 127}; 128 129#ifdef CONFIG_IP_FIB_TRIE_STATS 130struct trie_use_stats { 131 unsigned int gets; 132 unsigned int backtrack; 133 unsigned int semantic_match_passed; 134 unsigned int semantic_match_miss; 135 unsigned int null_node_hit; 136 unsigned int resize_node_skipped; 137}; 138#endif 139 140struct trie_stat { 141 unsigned int totdepth; 142 unsigned int maxdepth; 143 unsigned int tnodes; 144 unsigned int leaves; 145 unsigned int nullpointers; 146 unsigned int nodesizes[MAX_STAT_DEPTH]; 147}; 148 149struct trie { 150 struct node *trie; 151#ifdef CONFIG_IP_FIB_TRIE_STATS 152 struct trie_use_stats stats; 153#endif 154 int size; 155}; 156 157static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n); 158static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull); 159static struct node *resize(struct trie *t, struct tnode *tn); 160static struct tnode *inflate(struct trie *t, struct tnode *tn); 161static struct tnode *halve(struct trie *t, struct tnode *tn); 162static void tnode_free(struct tnode *tn); 163 164static struct kmem_cache *fn_alias_kmem __read_mostly; 165 166static inline struct tnode *node_parent(struct node *node) 167{ 168 struct tnode *ret; 169 170 ret = (struct tnode *)(node->parent & ~NODE_TYPE_MASK); 171 return rcu_dereference(ret); 172} 173 174static inline void node_set_parent(struct node *node, struct tnode *ptr) 175{ 176 rcu_assign_pointer(node->parent, 177 (unsigned long)ptr | NODE_TYPE(node)); 178} 179 180/* rcu_read_lock needs to be hold by caller from readside */ 181 182static inline struct node *tnode_get_child(struct tnode *tn, int i) 183{ 184 BUG_ON(i >= 1 << tn->bits); 185 186 return rcu_dereference(tn->child[i]); 187} 188 189static inline int tnode_child_length(const struct tnode *tn) 190{ 191 return 1 << tn->bits; 192} 193 194static inline t_key mask_pfx(t_key k, unsigned short l) 195{ 196 return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l); 197} 198 199static inline t_key tkey_extract_bits(t_key a, int offset, int bits) 200{ 201 if (offset < KEYLENGTH) 202 return ((t_key)(a << offset)) >> (KEYLENGTH - bits); 203 else 204 return 0; 205} 206 207static inline int tkey_equals(t_key a, t_key b) 208{ 209 return a == b; 210} 211 212static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b) 213{ 214 if (bits == 0 || offset >= KEYLENGTH) 215 return 1; 216 bits = bits > KEYLENGTH ? KEYLENGTH : bits; 217 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0; 218} 219 220static inline int tkey_mismatch(t_key a, int offset, t_key b) 221{ 222 t_key diff = a ^ b; 223 int i = offset; 224 225 if (!diff) 226 return 0; 227 while ((diff << i) >> (KEYLENGTH-1) == 0) 228 i++; 229 return i; 230} 231 232/* 233 To understand this stuff, an understanding of keys and all their bits is 234 necessary. Every node in the trie has a key associated with it, but not 235 all of the bits in that key are significant. 236 237 Consider a node 'n' and its parent 'tp'. 238 239 If n is a leaf, every bit in its key is significant. Its presence is 240 necessitated by path compression, since during a tree traversal (when 241 searching for a leaf - unless we are doing an insertion) we will completely 242 ignore all skipped bits we encounter. Thus we need to verify, at the end of 243 a potentially successful search, that we have indeed been walking the 244 correct key path. 245 246 Note that we can never "miss" the correct key in the tree if present by 247 following the wrong path. Path compression ensures that segments of the key 248 that are the same for all keys with a given prefix are skipped, but the 249 skipped part *is* identical for each node in the subtrie below the skipped 250 bit! trie_insert() in this implementation takes care of that - note the 251 call to tkey_sub_equals() in trie_insert(). 252 253 if n is an internal node - a 'tnode' here, the various parts of its key 254 have many different meanings. 255 256 Example: 257 _________________________________________________________________ 258 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C | 259 ----------------------------------------------------------------- 260 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 261 262 _________________________________________________________________ 263 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u | 264 ----------------------------------------------------------------- 265 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 266 267 tp->pos = 7 268 tp->bits = 3 269 n->pos = 15 270 n->bits = 4 271 272 First, let's just ignore the bits that come before the parent tp, that is 273 the bits from 0 to (tp->pos-1). They are *known* but at this point we do 274 not use them for anything. 275 276 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the 277 index into the parent's child array. That is, they will be used to find 278 'n' among tp's children. 279 280 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits 281 for the node n. 282 283 All the bits we have seen so far are significant to the node n. The rest 284 of the bits are really not needed or indeed known in n->key. 285 286 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into 287 n's child array, and will of course be different for each child. 288 289 290 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown 291 at this point. 292 293*/ 294 295static inline void check_tnode(const struct tnode *tn) 296{ 297 WARN_ON(tn && tn->pos+tn->bits > 32); 298} 299 300static const int halve_threshold = 25; 301static const int inflate_threshold = 50; 302static const int halve_threshold_root = 8; 303static const int inflate_threshold_root = 15; 304 305 306static void __alias_free_mem(struct rcu_head *head) 307{ 308 struct fib_alias *fa = container_of(head, struct fib_alias, rcu); 309 kmem_cache_free(fn_alias_kmem, fa); 310} 311 312static inline void alias_free_mem_rcu(struct fib_alias *fa) 313{ 314 call_rcu(&fa->rcu, __alias_free_mem); 315} 316 317static void __leaf_free_rcu(struct rcu_head *head) 318{ 319 kfree(container_of(head, struct leaf, rcu)); 320} 321 322static void __leaf_info_free_rcu(struct rcu_head *head) 323{ 324 kfree(container_of(head, struct leaf_info, rcu)); 325} 326 327static inline void free_leaf_info(struct leaf_info *leaf) 328{ 329 call_rcu(&leaf->rcu, __leaf_info_free_rcu); 330} 331 332static struct tnode *tnode_alloc(unsigned int size) 333{ 334 struct page *pages; 335 336 if (size <= PAGE_SIZE) 337 return kcalloc(size, 1, GFP_KERNEL); 338 339 pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size)); 340 if (!pages) 341 return NULL; 342 343 return page_address(pages); 344} 345 346static void __tnode_free_rcu(struct rcu_head *head) 347{ 348 struct tnode *tn = container_of(head, struct tnode, rcu); 349 unsigned int size = sizeof(struct tnode) + 350 (1 << tn->bits) * sizeof(struct node *); 351 352 if (size <= PAGE_SIZE) 353 kfree(tn); 354 else 355 free_pages((unsigned long)tn, get_order(size)); 356} 357 358static inline void tnode_free(struct tnode *tn) 359{ 360 if (IS_LEAF(tn)) { 361 struct leaf *l = (struct leaf *) tn; 362 call_rcu_bh(&l->rcu, __leaf_free_rcu); 363 } else 364 call_rcu(&tn->rcu, __tnode_free_rcu); 365} 366 367static struct leaf *leaf_new(void) 368{ 369 struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL); 370 if (l) { 371 l->parent = T_LEAF; 372 INIT_HLIST_HEAD(&l->list); 373 } 374 return l; 375} 376 377static struct leaf_info *leaf_info_new(int plen) 378{ 379 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL); 380 if (li) { 381 li->plen = plen; 382 INIT_LIST_HEAD(&li->falh); 383 } 384 return li; 385} 386 387static struct tnode* tnode_new(t_key key, int pos, int bits) 388{ 389 int nchildren = 1<<bits; 390 int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *); 391 struct tnode *tn = tnode_alloc(sz); 392 393 if (tn) { 394 tn->parent = T_TNODE; 395 tn->pos = pos; 396 tn->bits = bits; 397 tn->key = key; 398 tn->full_children = 0; 399 tn->empty_children = 1<<bits; 400 } 401 402 pr_debug("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode), 403 (unsigned int) (sizeof(struct node) * 1<<bits)); 404 return tn; 405} 406 407/* 408 * Check whether a tnode 'n' is "full", i.e. it is an internal node 409 * and no bits are skipped. See discussion in dyntree paper p. 6 410 */ 411 412static inline int tnode_full(const struct tnode *tn, const struct node *n) 413{ 414 if (n == NULL || IS_LEAF(n)) 415 return 0; 416 417 return ((struct tnode *) n)->pos == tn->pos + tn->bits; 418} 419 420static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n) 421{ 422 tnode_put_child_reorg(tn, i, n, -1); 423} 424 425 /* 426 * Add a child at position i overwriting the old value. 427 * Update the value of full_children and empty_children. 428 */ 429 430static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull) 431{ 432 struct node *chi = tn->child[i]; 433 int isfull; 434 435 BUG_ON(i >= 1<<tn->bits); 436 437 438 /* update emptyChildren */ 439 if (n == NULL && chi != NULL) 440 tn->empty_children++; 441 else if (n != NULL && chi == NULL) 442 tn->empty_children--; 443 444 /* update fullChildren */ 445 if (wasfull == -1) 446 wasfull = tnode_full(tn, chi); 447 448 isfull = tnode_full(tn, n); 449 if (wasfull && !isfull) 450 tn->full_children--; 451 else if (!wasfull && isfull) 452 tn->full_children++; 453 454 if (n) 455 node_set_parent(n, tn); 456 457 rcu_assign_pointer(tn->child[i], n); 458} 459 460static struct node *resize(struct trie *t, struct tnode *tn) 461{ 462 int i; 463 int err = 0; 464 struct tnode *old_tn; 465 int inflate_threshold_use; 466 int halve_threshold_use; 467 int max_resize; 468 469 if (!tn) 470 return NULL; 471 472 pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n", 473 tn, inflate_threshold, halve_threshold); 474 475 /* No children */ 476 if (tn->empty_children == tnode_child_length(tn)) { 477 tnode_free(tn); 478 return NULL; 479 } 480 /* One child */ 481 if (tn->empty_children == tnode_child_length(tn) - 1) 482 for (i = 0; i < tnode_child_length(tn); i++) { 483 struct node *n; 484 485 n = tn->child[i]; 486 if (!n) 487 continue; 488 489 /* compress one level */ 490 node_set_parent(n, NULL); 491 tnode_free(tn); 492 return n; 493 } 494 /* 495 * Double as long as the resulting node has a number of 496 * nonempty nodes that are above the threshold. 497 */ 498 499 /* 500 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of 501 * the Helsinki University of Technology and Matti Tikkanen of Nokia 502 * Telecommunications, page 6: 503 * "A node is doubled if the ratio of non-empty children to all 504 * children in the *doubled* node is at least 'high'." 505 * 506 * 'high' in this instance is the variable 'inflate_threshold'. It 507 * is expressed as a percentage, so we multiply it with 508 * tnode_child_length() and instead of multiplying by 2 (since the 509 * child array will be doubled by inflate()) and multiplying 510 * the left-hand side by 100 (to handle the percentage thing) we 511 * multiply the left-hand side by 50. 512 * 513 * The left-hand side may look a bit weird: tnode_child_length(tn) 514 * - tn->empty_children is of course the number of non-null children 515 * in the current node. tn->full_children is the number of "full" 516 * children, that is non-null tnodes with a skip value of 0. 517 * All of those will be doubled in the resulting inflated tnode, so 518 * we just count them one extra time here. 519 * 520 * A clearer way to write this would be: 521 * 522 * to_be_doubled = tn->full_children; 523 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children - 524 * tn->full_children; 525 * 526 * new_child_length = tnode_child_length(tn) * 2; 527 * 528 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) / 529 * new_child_length; 530 * if (new_fill_factor >= inflate_threshold) 531 * 532 * ...and so on, tho it would mess up the while () loop. 533 * 534 * anyway, 535 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >= 536 * inflate_threshold 537 * 538 * avoid a division: 539 * 100 * (not_to_be_doubled + 2*to_be_doubled) >= 540 * inflate_threshold * new_child_length 541 * 542 * expand not_to_be_doubled and to_be_doubled, and shorten: 543 * 100 * (tnode_child_length(tn) - tn->empty_children + 544 * tn->full_children) >= inflate_threshold * new_child_length 545 * 546 * expand new_child_length: 547 * 100 * (tnode_child_length(tn) - tn->empty_children + 548 * tn->full_children) >= 549 * inflate_threshold * tnode_child_length(tn) * 2 550 * 551 * shorten again: 552 * 50 * (tn->full_children + tnode_child_length(tn) - 553 * tn->empty_children) >= inflate_threshold * 554 * tnode_child_length(tn) 555 * 556 */ 557 558 check_tnode(tn); 559 560 /* Keep root node larger */ 561 562 if (!tn->parent) 563 inflate_threshold_use = inflate_threshold_root; 564 else 565 inflate_threshold_use = inflate_threshold; 566 567 err = 0; 568 max_resize = 10; 569 while ((tn->full_children > 0 && max_resize-- && 570 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >= 571 inflate_threshold_use * tnode_child_length(tn))) { 572 573 old_tn = tn; 574 tn = inflate(t, tn); 575 if (IS_ERR(tn)) { 576 tn = old_tn; 577#ifdef CONFIG_IP_FIB_TRIE_STATS 578 t->stats.resize_node_skipped++; 579#endif 580 break; 581 } 582 } 583 584 if (max_resize < 0) { 585 if (!tn->parent) 586 printk(KERN_WARNING "Fix inflate_threshold_root. Now=%d size=%d bits\n", 587 inflate_threshold_root, tn->bits); 588 else 589 printk(KERN_WARNING "Fix inflate_threshold. Now=%d size=%d bits\n", 590 inflate_threshold, tn->bits); 591 } 592 593 check_tnode(tn); 594 595 /* 596 * Halve as long as the number of empty children in this 597 * node is above threshold. 598 */ 599 600 601 /* Keep root node larger */ 602 603 if (!tn->parent) 604 halve_threshold_use = halve_threshold_root; 605 else 606 halve_threshold_use = halve_threshold; 607 608 err = 0; 609 max_resize = 10; 610 while (tn->bits > 1 && max_resize-- && 611 100 * (tnode_child_length(tn) - tn->empty_children) < 612 halve_threshold_use * tnode_child_length(tn)) { 613 614 old_tn = tn; 615 tn = halve(t, tn); 616 if (IS_ERR(tn)) { 617 tn = old_tn; 618#ifdef CONFIG_IP_FIB_TRIE_STATS 619 t->stats.resize_node_skipped++; 620#endif 621 break; 622 } 623 } 624 625 if (max_resize < 0) { 626 if (!tn->parent) 627 printk(KERN_WARNING "Fix halve_threshold_root. Now=%d size=%d bits\n", 628 halve_threshold_root, tn->bits); 629 else 630 printk(KERN_WARNING "Fix halve_threshold. Now=%d size=%d bits\n", 631 halve_threshold, tn->bits); 632 } 633 634 /* Only one child remains */ 635 if (tn->empty_children == tnode_child_length(tn) - 1) 636 for (i = 0; i < tnode_child_length(tn); i++) { 637 struct node *n; 638 639 n = tn->child[i]; 640 if (!n) 641 continue; 642 643 /* compress one level */ 644 645 node_set_parent(n, NULL); 646 tnode_free(tn); 647 return n; 648 } 649 650 return (struct node *) tn; 651} 652 653static struct tnode *inflate(struct trie *t, struct tnode *tn) 654{ 655 struct tnode *oldtnode = tn; 656 int olen = tnode_child_length(tn); 657 int i; 658 659 pr_debug("In inflate\n"); 660 661 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1); 662 663 if (!tn) 664 return ERR_PTR(-ENOMEM); 665 666 /* 667 * Preallocate and store tnodes before the actual work so we 668 * don't get into an inconsistent state if memory allocation 669 * fails. In case of failure we return the oldnode and inflate 670 * of tnode is ignored. 671 */ 672 673 for (i = 0; i < olen; i++) { 674 struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i); 675 676 if (inode && 677 IS_TNODE(inode) && 678 inode->pos == oldtnode->pos + oldtnode->bits && 679 inode->bits > 1) { 680 struct tnode *left, *right; 681 t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos; 682 683 left = tnode_new(inode->key&(~m), inode->pos + 1, 684 inode->bits - 1); 685 if (!left) 686 goto nomem; 687 688 right = tnode_new(inode->key|m, inode->pos + 1, 689 inode->bits - 1); 690 691 if (!right) { 692 tnode_free(left); 693 goto nomem; 694 } 695 696 put_child(t, tn, 2*i, (struct node *) left); 697 put_child(t, tn, 2*i+1, (struct node *) right); 698 } 699 } 700 701 for (i = 0; i < olen; i++) { 702 struct tnode *inode; 703 struct node *node = tnode_get_child(oldtnode, i); 704 struct tnode *left, *right; 705 int size, j; 706 707 /* An empty child */ 708 if (node == NULL) 709 continue; 710 711 /* A leaf or an internal node with skipped bits */ 712 713 if (IS_LEAF(node) || ((struct tnode *) node)->pos > 714 tn->pos + tn->bits - 1) { 715 if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits, 716 1) == 0) 717 put_child(t, tn, 2*i, node); 718 else 719 put_child(t, tn, 2*i+1, node); 720 continue; 721 } 722 723 /* An internal node with two children */ 724 inode = (struct tnode *) node; 725 726 if (inode->bits == 1) { 727 put_child(t, tn, 2*i, inode->child[0]); 728 put_child(t, tn, 2*i+1, inode->child[1]); 729 730 tnode_free(inode); 731 continue; 732 } 733 734 /* An internal node with more than two children */ 735 736 /* We will replace this node 'inode' with two new 737 * ones, 'left' and 'right', each with half of the 738 * original children. The two new nodes will have 739 * a position one bit further down the key and this 740 * means that the "significant" part of their keys 741 * (see the discussion near the top of this file) 742 * will differ by one bit, which will be "0" in 743 * left's key and "1" in right's key. Since we are 744 * moving the key position by one step, the bit that 745 * we are moving away from - the bit at position 746 * (inode->pos) - is the one that will differ between 747 * left and right. So... we synthesize that bit in the 748 * two new keys. 749 * The mask 'm' below will be a single "one" bit at 750 * the position (inode->pos) 751 */ 752 753 /* Use the old key, but set the new significant 754 * bit to zero. 755 */ 756 757 left = (struct tnode *) tnode_get_child(tn, 2*i); 758 put_child(t, tn, 2*i, NULL); 759 760 BUG_ON(!left); 761 762 right = (struct tnode *) tnode_get_child(tn, 2*i+1); 763 put_child(t, tn, 2*i+1, NULL); 764 765 BUG_ON(!right); 766 767 size = tnode_child_length(left); 768 for (j = 0; j < size; j++) { 769 put_child(t, left, j, inode->child[j]); 770 put_child(t, right, j, inode->child[j + size]); 771 } 772 put_child(t, tn, 2*i, resize(t, left)); 773 put_child(t, tn, 2*i+1, resize(t, right)); 774 775 tnode_free(inode); 776 } 777 tnode_free(oldtnode); 778 return tn; 779nomem: 780 { 781 int size = tnode_child_length(tn); 782 int j; 783 784 for (j = 0; j < size; j++) 785 if (tn->child[j]) 786 tnode_free((struct tnode *)tn->child[j]); 787 788 tnode_free(tn); 789 790 return ERR_PTR(-ENOMEM); 791 } 792} 793 794static struct tnode *halve(struct trie *t, struct tnode *tn) 795{ 796 struct tnode *oldtnode = tn; 797 struct node *left, *right; 798 int i; 799 int olen = tnode_child_length(tn); 800 801 pr_debug("In halve\n"); 802 803 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1); 804 805 if (!tn) 806 return ERR_PTR(-ENOMEM); 807 808 /* 809 * Preallocate and store tnodes before the actual work so we 810 * don't get into an inconsistent state if memory allocation 811 * fails. In case of failure we return the oldnode and halve 812 * of tnode is ignored. 813 */ 814 815 for (i = 0; i < olen; i += 2) { 816 left = tnode_get_child(oldtnode, i); 817 right = tnode_get_child(oldtnode, i+1); 818 819 /* Two nonempty children */ 820 if (left && right) { 821 struct tnode *newn; 822 823 newn = tnode_new(left->key, tn->pos + tn->bits, 1); 824 825 if (!newn) 826 goto nomem; 827 828 put_child(t, tn, i/2, (struct node *)newn); 829 } 830 831 } 832 833 for (i = 0; i < olen; i += 2) { 834 struct tnode *newBinNode; 835 836 left = tnode_get_child(oldtnode, i); 837 right = tnode_get_child(oldtnode, i+1); 838 839 /* At least one of the children is empty */ 840 if (left == NULL) { 841 if (right == NULL) /* Both are empty */ 842 continue; 843 put_child(t, tn, i/2, right); 844 continue; 845 } 846 847 if (right == NULL) { 848 put_child(t, tn, i/2, left); 849 continue; 850 } 851 852 /* Two nonempty children */ 853 newBinNode = (struct tnode *) tnode_get_child(tn, i/2); 854 put_child(t, tn, i/2, NULL); 855 put_child(t, newBinNode, 0, left); 856 put_child(t, newBinNode, 1, right); 857 put_child(t, tn, i/2, resize(t, newBinNode)); 858 } 859 tnode_free(oldtnode); 860 return tn; 861nomem: 862 { 863 int size = tnode_child_length(tn); 864 int j; 865 866 for (j = 0; j < size; j++) 867 if (tn->child[j]) 868 tnode_free((struct tnode *)tn->child[j]); 869 870 tnode_free(tn); 871 872 return ERR_PTR(-ENOMEM); 873 } 874} 875 876/* readside must use rcu_read_lock currently dump routines 877 via get_fa_head and dump */ 878 879static struct leaf_info *find_leaf_info(struct leaf *l, int plen) 880{ 881 struct hlist_head *head = &l->list; 882 struct hlist_node *node; 883 struct leaf_info *li; 884 885 hlist_for_each_entry_rcu(li, node, head, hlist) 886 if (li->plen == plen) 887 return li; 888 889 return NULL; 890} 891 892static inline struct list_head * get_fa_head(struct leaf *l, int plen) 893{ 894 struct leaf_info *li = find_leaf_info(l, plen); 895 896 if (!li) 897 return NULL; 898 899 return &li->falh; 900} 901 902static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new) 903{ 904 struct leaf_info *li = NULL, *last = NULL; 905 struct hlist_node *node; 906 907 if (hlist_empty(head)) { 908 hlist_add_head_rcu(&new->hlist, head); 909 } else { 910 hlist_for_each_entry(li, node, head, hlist) { 911 if (new->plen > li->plen) 912 break; 913 914 last = li; 915 } 916 if (last) 917 hlist_add_after_rcu(&last->hlist, &new->hlist); 918 else 919 hlist_add_before_rcu(&new->hlist, &li->hlist); 920 } 921} 922 923/* rcu_read_lock needs to be hold by caller from readside */ 924 925static struct leaf * 926fib_find_node(struct trie *t, u32 key) 927{ 928 int pos; 929 struct tnode *tn; 930 struct node *n; 931 932 pos = 0; 933 n = rcu_dereference(t->trie); 934 935 while (n != NULL && NODE_TYPE(n) == T_TNODE) { 936 tn = (struct tnode *) n; 937 938 check_tnode(tn); 939 940 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) { 941 pos = tn->pos + tn->bits; 942 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits)); 943 } else 944 break; 945 } 946 /* Case we have found a leaf. Compare prefixes */ 947 948 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) 949 return (struct leaf *)n; 950 951 return NULL; 952} 953 954static struct node *trie_rebalance(struct trie *t, struct tnode *tn) 955{ 956 int wasfull; 957 t_key cindex, key = tn->key; 958 struct tnode *tp; 959 960 while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) { 961 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 962 wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); 963 tn = (struct tnode *) resize (t, (struct tnode *)tn); 964 tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull); 965 966 tp = node_parent((struct node *) tn); 967 if (!tp) 968 break; 969 tn = tp; 970 } 971 972 /* Handle last (top) tnode */ 973 if (IS_TNODE(tn)) 974 tn = (struct tnode*) resize(t, (struct tnode *)tn); 975 976 return (struct node*) tn; 977} 978 979/* only used from updater-side */ 980 981static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) 982{ 983 int pos, newpos; 984 struct tnode *tp = NULL, *tn = NULL; 985 struct node *n; 986 struct leaf *l; 987 int missbit; 988 struct list_head *fa_head = NULL; 989 struct leaf_info *li; 990 t_key cindex; 991 992 pos = 0; 993 n = t->trie; 994 995 /* If we point to NULL, stop. Either the tree is empty and we should 996 * just put a new leaf in if, or we have reached an empty child slot, 997 * and we should just put our new leaf in that. 998 * If we point to a T_TNODE, check if it matches our key. Note that 999 * a T_TNODE might be skipping any number of bits - its 'pos' need 1000 * not be the parent's 'pos'+'bits'! 1001 * 1002 * If it does match the current key, get pos/bits from it, extract 1003 * the index from our key, push the T_TNODE and walk the tree. 1004 * 1005 * If it doesn't, we have to replace it with a new T_TNODE. 1006 * 1007 * If we point to a T_LEAF, it might or might not have the same key 1008 * as we do. If it does, just change the value, update the T_LEAF's 1009 * value, and return it. 1010 * If it doesn't, we need to replace it with a T_TNODE. 1011 */ 1012 1013 while (n != NULL && NODE_TYPE(n) == T_TNODE) { 1014 tn = (struct tnode *) n; 1015 1016 check_tnode(tn); 1017 1018 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) { 1019 tp = tn; 1020 pos = tn->pos + tn->bits; 1021 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits)); 1022 1023 BUG_ON(n && node_parent(n) != tn); 1024 } else 1025 break; 1026 } 1027 1028 /* 1029 * n ----> NULL, LEAF or TNODE 1030 * 1031 * tp is n's (parent) ----> NULL or TNODE 1032 */ 1033 1034 BUG_ON(tp && IS_LEAF(tp)); 1035 1036 /* Case 1: n is a leaf. Compare prefixes */ 1037 1038 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) { 1039 l = (struct leaf *) n; 1040 li = leaf_info_new(plen); 1041 1042 if (!li) 1043 return NULL; 1044 1045 fa_head = &li->falh; 1046 insert_leaf_info(&l->list, li); 1047 goto done; 1048 } 1049 t->size++; 1050 l = leaf_new(); 1051 1052 if (!l) 1053 return NULL; 1054 1055 l->key = key; 1056 li = leaf_info_new(plen); 1057 1058 if (!li) { 1059 tnode_free((struct tnode *) l); 1060 return NULL; 1061 } 1062 1063 fa_head = &li->falh; 1064 insert_leaf_info(&l->list, li); 1065 1066 if (t->trie && n == NULL) { 1067 /* Case 2: n is NULL, and will just insert a new leaf */ 1068 1069 node_set_parent((struct node *)l, tp); 1070 1071 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 1072 put_child(t, (struct tnode *)tp, cindex, (struct node *)l); 1073 } else { 1074 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */ 1075 /* 1076 * Add a new tnode here 1077 * first tnode need some special handling 1078 */ 1079 1080 if (tp) 1081 pos = tp->pos+tp->bits; 1082 else 1083 pos = 0; 1084 1085 if (n) { 1086 newpos = tkey_mismatch(key, pos, n->key); 1087 tn = tnode_new(n->key, newpos, 1); 1088 } else { 1089 newpos = 0; 1090 tn = tnode_new(key, newpos, 1); /* First tnode */ 1091 } 1092 1093 if (!tn) { 1094 free_leaf_info(li); 1095 tnode_free((struct tnode *) l); 1096 return NULL; 1097 } 1098 1099 node_set_parent((struct node *)tn, tp); 1100 1101 missbit = tkey_extract_bits(key, newpos, 1); 1102 put_child(t, tn, missbit, (struct node *)l); 1103 put_child(t, tn, 1-missbit, n); 1104 1105 if (tp) { 1106 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 1107 put_child(t, (struct tnode *)tp, cindex, (struct node *)tn); 1108 } else { 1109 rcu_assign_pointer(t->trie, (struct node *)tn); /* First tnode */ 1110 tp = tn; 1111 } 1112 } 1113 1114 if (tp && tp->pos + tp->bits > 32) 1115 printk(KERN_WARNING "fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n", 1116 tp, tp->pos, tp->bits, key, plen); 1117 1118 /* Rebalance the trie */ 1119 1120 rcu_assign_pointer(t->trie, trie_rebalance(t, tp)); 1121done: 1122 return fa_head; 1123} 1124 1125/* 1126 * Caller must hold RTNL. 1127 */ 1128static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg) 1129{ 1130 struct trie *t = (struct trie *) tb->tb_data; 1131 struct fib_alias *fa, *new_fa; 1132 struct list_head *fa_head = NULL; 1133 struct fib_info *fi; 1134 int plen = cfg->fc_dst_len; 1135 u8 tos = cfg->fc_tos; 1136 u32 key, mask; 1137 int err; 1138 struct leaf *l; 1139 1140 if (plen > 32) 1141 return -EINVAL; 1142 1143 key = ntohl(cfg->fc_dst); 1144 1145 pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen); 1146 1147 mask = ntohl(inet_make_mask(plen)); 1148 1149 if (key & ~mask) 1150 return -EINVAL; 1151 1152 key = key & mask; 1153 1154 fi = fib_create_info(cfg); 1155 if (IS_ERR(fi)) { 1156 err = PTR_ERR(fi); 1157 goto err; 1158 } 1159 1160 l = fib_find_node(t, key); 1161 fa = NULL; 1162 1163 if (l) { 1164 fa_head = get_fa_head(l, plen); 1165 fa = fib_find_alias(fa_head, tos, fi->fib_priority); 1166 } 1167 1168 /* Now fa, if non-NULL, points to the first fib alias 1169 * with the same keys [prefix,tos,priority], if such key already 1170 * exists or to the node before which we will insert new one. 1171 * 1172 * If fa is NULL, we will need to allocate a new one and 1173 * insert to the head of f. 1174 * 1175 * If f is NULL, no fib node matched the destination key 1176 * and we need to allocate a new one of those as well. 1177 */ 1178 1179 if (fa && fa->fa_info->fib_priority == fi->fib_priority) { 1180 struct fib_alias *fa_orig; 1181 1182 err = -EEXIST; 1183 if (cfg->fc_nlflags & NLM_F_EXCL) 1184 goto out; 1185 1186 if (cfg->fc_nlflags & NLM_F_REPLACE) { 1187 struct fib_info *fi_drop; 1188 u8 state; 1189 1190 if (fi->fib_treeref > 1) 1191 goto out; 1192 1193 err = -ENOBUFS; 1194 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); 1195 if (new_fa == NULL) 1196 goto out; 1197 1198 fi_drop = fa->fa_info; 1199 new_fa->fa_tos = fa->fa_tos; 1200 new_fa->fa_info = fi; 1201 new_fa->fa_type = cfg->fc_type; 1202 new_fa->fa_scope = cfg->fc_scope; 1203 state = fa->fa_state; 1204 new_fa->fa_state &= ~FA_S_ACCESSED; 1205 1206 list_replace_rcu(&fa->fa_list, &new_fa->fa_list); 1207 alias_free_mem_rcu(fa); 1208 1209 fib_release_info(fi_drop); 1210 if (state & FA_S_ACCESSED) 1211 rt_cache_flush(-1); 1212 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, 1213 tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE); 1214 1215 goto succeeded; 1216 } 1217 /* Error if we find a perfect match which 1218 * uses the same scope, type, and nexthop 1219 * information. 1220 */ 1221 fa_orig = fa; 1222 list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) { 1223 if (fa->fa_tos != tos) 1224 break; 1225 if (fa->fa_info->fib_priority != fi->fib_priority) 1226 break; 1227 if (fa->fa_type == cfg->fc_type && 1228 fa->fa_scope == cfg->fc_scope && 1229 fa->fa_info == fi) { 1230 goto out; 1231 } 1232 } 1233 if (!(cfg->fc_nlflags & NLM_F_APPEND)) 1234 fa = fa_orig; 1235 } 1236 err = -ENOENT; 1237 if (!(cfg->fc_nlflags & NLM_F_CREATE)) 1238 goto out; 1239 1240 err = -ENOBUFS; 1241 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); 1242 if (new_fa == NULL) 1243 goto out; 1244 1245 new_fa->fa_info = fi; 1246 new_fa->fa_tos = tos; 1247 new_fa->fa_type = cfg->fc_type; 1248 new_fa->fa_scope = cfg->fc_scope; 1249 new_fa->fa_state = 0; 1250 /* 1251 * Insert new entry to the list. 1252 */ 1253 1254 if (!fa_head) { 1255 fa_head = fib_insert_node(t, key, plen); 1256 if (unlikely(!fa_head)) { 1257 err = -ENOMEM; 1258 goto out_free_new_fa; 1259 } 1260 } 1261 1262 list_add_tail_rcu(&new_fa->fa_list, 1263 (fa ? &fa->fa_list : fa_head)); 1264 1265 rt_cache_flush(-1); 1266 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, 1267 &cfg->fc_nlinfo, 0); 1268succeeded: 1269 return 0; 1270 1271out_free_new_fa: 1272 kmem_cache_free(fn_alias_kmem, new_fa); 1273out: 1274 fib_release_info(fi); 1275err: 1276 return err; 1277} 1278 1279 1280/* should be called with rcu_read_lock */ 1281static inline int check_leaf(struct trie *t, struct leaf *l, 1282 t_key key, int *plen, const struct flowi *flp, 1283 struct fib_result *res) 1284{ 1285 int err, i; 1286 __be32 mask; 1287 struct leaf_info *li; 1288 struct hlist_head *hhead = &l->list; 1289 struct hlist_node *node; 1290 1291 hlist_for_each_entry_rcu(li, node, hhead, hlist) { 1292 i = li->plen; 1293 mask = inet_make_mask(i); 1294 if (l->key != (key & ntohl(mask))) 1295 continue; 1296 1297 if ((err = fib_semantic_match(&li->falh, flp, res, htonl(l->key), mask, i)) <= 0) { 1298 *plen = i; 1299#ifdef CONFIG_IP_FIB_TRIE_STATS 1300 t->stats.semantic_match_passed++; 1301#endif 1302 return err; 1303 } 1304#ifdef CONFIG_IP_FIB_TRIE_STATS 1305 t->stats.semantic_match_miss++; 1306#endif 1307 } 1308 return 1; 1309} 1310 1311static int 1312fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res) 1313{ 1314 struct trie *t = (struct trie *) tb->tb_data; 1315 int plen, ret = 0; 1316 struct node *n; 1317 struct tnode *pn; 1318 int pos, bits; 1319 t_key key = ntohl(flp->fl4_dst); 1320 int chopped_off; 1321 t_key cindex = 0; 1322 int current_prefix_length = KEYLENGTH; 1323 struct tnode *cn; 1324 t_key node_prefix, key_prefix, pref_mismatch; 1325 int mp; 1326 1327 rcu_read_lock(); 1328 1329 n = rcu_dereference(t->trie); 1330 if (!n) 1331 goto failed; 1332 1333#ifdef CONFIG_IP_FIB_TRIE_STATS 1334 t->stats.gets++; 1335#endif 1336 1337 /* Just a leaf? */ 1338 if (IS_LEAF(n)) { 1339 if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0) 1340 goto found; 1341 goto failed; 1342 } 1343 pn = (struct tnode *) n; 1344 chopped_off = 0; 1345 1346 while (pn) { 1347 pos = pn->pos; 1348 bits = pn->bits; 1349 1350 if (!chopped_off) 1351 cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length), 1352 pos, bits); 1353 1354 n = tnode_get_child(pn, cindex); 1355 1356 if (n == NULL) { 1357#ifdef CONFIG_IP_FIB_TRIE_STATS 1358 t->stats.null_node_hit++; 1359#endif 1360 goto backtrace; 1361 } 1362 1363 if (IS_LEAF(n)) { 1364 if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0) 1365 goto found; 1366 else 1367 goto backtrace; 1368 } 1369 1370#define HL_OPTIMIZE 1371#ifdef HL_OPTIMIZE 1372 cn = (struct tnode *)n; 1373 1374 /* 1375 * It's a tnode, and we can do some extra checks here if we 1376 * like, to avoid descending into a dead-end branch. 1377 * This tnode is in the parent's child array at index 1378 * key[p_pos..p_pos+p_bits] but potentially with some bits 1379 * chopped off, so in reality the index may be just a 1380 * subprefix, padded with zero at the end. 1381 * We can also take a look at any skipped bits in this 1382 * tnode - everything up to p_pos is supposed to be ok, 1383 * and the non-chopped bits of the index (se previous 1384 * paragraph) are also guaranteed ok, but the rest is 1385 * considered unknown. 1386 * 1387 * The skipped bits are key[pos+bits..cn->pos]. 1388 */ 1389 1390 /* If current_prefix_length < pos+bits, we are already doing 1391 * actual prefix matching, which means everything from 1392 * pos+(bits-chopped_off) onward must be zero along some 1393 * branch of this subtree - otherwise there is *no* valid 1394 * prefix present. Here we can only check the skipped 1395 * bits. Remember, since we have already indexed into the 1396 * parent's child array, we know that the bits we chopped of 1397 * *are* zero. 1398 */ 1399 1400 /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */ 1401 1402 if (current_prefix_length < pos+bits) { 1403 if (tkey_extract_bits(cn->key, current_prefix_length, 1404 cn->pos - current_prefix_length) != 0 || 1405 !(cn->child[0])) 1406 goto backtrace; 1407 } 1408 1409 /* 1410 * If chopped_off=0, the index is fully validated and we 1411 * only need to look at the skipped bits for this, the new, 1412 * tnode. What we actually want to do is to find out if 1413 * these skipped bits match our key perfectly, or if we will 1414 * have to count on finding a matching prefix further down, 1415 * because if we do, we would like to have some way of 1416 * verifying the existence of such a prefix at this point. 1417 */ 1418 1419 /* The only thing we can do at this point is to verify that 1420 * any such matching prefix can indeed be a prefix to our 1421 * key, and if the bits in the node we are inspecting that 1422 * do not match our key are not ZERO, this cannot be true. 1423 * Thus, find out where there is a mismatch (before cn->pos) 1424 * and verify that all the mismatching bits are zero in the 1425 * new tnode's key. 1426 */ 1427 1428 /* Note: We aren't very concerned about the piece of the key 1429 * that precede pn->pos+pn->bits, since these have already been 1430 * checked. The bits after cn->pos aren't checked since these are 1431 * by definition "unknown" at this point. Thus, what we want to 1432 * see is if we are about to enter the "prefix matching" state, 1433 * and in that case verify that the skipped bits that will prevail 1434 * throughout this subtree are zero, as they have to be if we are 1435 * to find a matching prefix. 1436 */ 1437 1438 node_prefix = mask_pfx(cn->key, cn->pos); 1439 key_prefix = mask_pfx(key, cn->pos); 1440 pref_mismatch = key_prefix^node_prefix; 1441 mp = 0; 1442 1443 /* In short: If skipped bits in this node do not match the search 1444 * key, enter the "prefix matching" state.directly. 1445 */ 1446 if (pref_mismatch) { 1447 while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) { 1448 mp++; 1449 pref_mismatch = pref_mismatch <<1; 1450 } 1451 key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp); 1452 1453 if (key_prefix != 0) 1454 goto backtrace; 1455 1456 if (current_prefix_length >= cn->pos) 1457 current_prefix_length = mp; 1458 } 1459#endif 1460 pn = (struct tnode *)n; /* Descend */ 1461 chopped_off = 0; 1462 continue; 1463 1464backtrace: 1465 chopped_off++; 1466 1467 /* As zero don't change the child key (cindex) */ 1468 while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1)))) 1469 chopped_off++; 1470 1471 /* Decrease current_... with bits chopped off */ 1472 if (current_prefix_length > pn->pos + pn->bits - chopped_off) 1473 current_prefix_length = pn->pos + pn->bits - chopped_off; 1474 1475 /* 1476 * Either we do the actual chop off according or if we have 1477 * chopped off all bits in this tnode walk up to our parent. 1478 */ 1479 1480 if (chopped_off <= pn->bits) { 1481 cindex &= ~(1 << (chopped_off-1)); 1482 } else { 1483 struct tnode *parent = node_parent((struct node *) pn); 1484 if (!parent) 1485 goto failed; 1486 1487 /* Get Child's index */ 1488 cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits); 1489 pn = parent; 1490 chopped_off = 0; 1491 1492#ifdef CONFIG_IP_FIB_TRIE_STATS 1493 t->stats.backtrack++; 1494#endif 1495 goto backtrace; 1496 } 1497 } 1498failed: 1499 ret = 1; 1500found: 1501 rcu_read_unlock(); 1502 return ret; 1503} 1504 1505/* only called from updater side */ 1506static int trie_leaf_remove(struct trie *t, t_key key) 1507{ 1508 t_key cindex; 1509 struct tnode *tp = NULL; 1510 struct node *n = t->trie; 1511 struct leaf *l; 1512 1513 pr_debug("entering trie_leaf_remove(%p)\n", n); 1514 1515 /* Note that in the case skipped bits, those bits are *not* checked! 1516 * When we finish this, we will have NULL or a T_LEAF, and the 1517 * T_LEAF may or may not match our key. 1518 */ 1519 1520 while (n != NULL && IS_TNODE(n)) { 1521 struct tnode *tn = (struct tnode *) n; 1522 check_tnode(tn); 1523 n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits)); 1524 1525 BUG_ON(n && node_parent(n) != tn); 1526 } 1527 l = (struct leaf *) n; 1528 1529 if (!n || !tkey_equals(l->key, key)) 1530 return 0; 1531 1532 /* 1533 * Key found. 1534 * Remove the leaf and rebalance the tree 1535 */ 1536 1537 t->size--; 1538 1539 tp = node_parent(n); 1540 tnode_free((struct tnode *) n); 1541 1542 if (tp) { 1543 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 1544 put_child(t, (struct tnode *)tp, cindex, NULL); 1545 rcu_assign_pointer(t->trie, trie_rebalance(t, tp)); 1546 } else 1547 rcu_assign_pointer(t->trie, NULL); 1548 1549 return 1; 1550} 1551 1552/* 1553 * Caller must hold RTNL. 1554 */ 1555static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg) 1556{ 1557 struct trie *t = (struct trie *) tb->tb_data; 1558 u32 key, mask; 1559 int plen = cfg->fc_dst_len; 1560 u8 tos = cfg->fc_tos; 1561 struct fib_alias *fa, *fa_to_delete; 1562 struct list_head *fa_head; 1563 struct leaf *l; 1564 struct leaf_info *li; 1565 1566 if (plen > 32) 1567 return -EINVAL; 1568 1569 key = ntohl(cfg->fc_dst); 1570 mask = ntohl(inet_make_mask(plen)); 1571 1572 if (key & ~mask) 1573 return -EINVAL; 1574 1575 key = key & mask; 1576 l = fib_find_node(t, key); 1577 1578 if (!l) 1579 return -ESRCH; 1580 1581 fa_head = get_fa_head(l, plen); 1582 fa = fib_find_alias(fa_head, tos, 0); 1583 1584 if (!fa) 1585 return -ESRCH; 1586 1587 pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t); 1588 1589 fa_to_delete = NULL; 1590 fa_head = fa->fa_list.prev; 1591 1592 list_for_each_entry(fa, fa_head, fa_list) { 1593 struct fib_info *fi = fa->fa_info; 1594 1595 if (fa->fa_tos != tos) 1596 break; 1597 1598 if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) && 1599 (cfg->fc_scope == RT_SCOPE_NOWHERE || 1600 fa->fa_scope == cfg->fc_scope) && 1601 (!cfg->fc_protocol || 1602 fi->fib_protocol == cfg->fc_protocol) && 1603 fib_nh_match(cfg, fi) == 0) { 1604 fa_to_delete = fa; 1605 break; 1606 } 1607 } 1608 1609 if (!fa_to_delete) 1610 return -ESRCH; 1611 1612 fa = fa_to_delete; 1613 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, 1614 &cfg->fc_nlinfo, 0); 1615 1616 l = fib_find_node(t, key); 1617 li = find_leaf_info(l, plen); 1618 1619 list_del_rcu(&fa->fa_list); 1620 1621 if (list_empty(fa_head)) { 1622 hlist_del_rcu(&li->hlist); 1623 free_leaf_info(li); 1624 } 1625 1626 if (hlist_empty(&l->list)) 1627 trie_leaf_remove(t, key); 1628 1629 if (fa->fa_state & FA_S_ACCESSED) 1630 rt_cache_flush(-1); 1631 1632 fib_release_info(fa->fa_info); 1633 alias_free_mem_rcu(fa); 1634 return 0; 1635} 1636 1637static int trie_flush_list(struct trie *t, struct list_head *head) 1638{ 1639 struct fib_alias *fa, *fa_node; 1640 int found = 0; 1641 1642 list_for_each_entry_safe(fa, fa_node, head, fa_list) { 1643 struct fib_info *fi = fa->fa_info; 1644 1645 if (fi && (fi->fib_flags & RTNH_F_DEAD)) { 1646 list_del_rcu(&fa->fa_list); 1647 fib_release_info(fa->fa_info); 1648 alias_free_mem_rcu(fa); 1649 found++; 1650 } 1651 } 1652 return found; 1653} 1654 1655static int trie_flush_leaf(struct trie *t, struct leaf *l) 1656{ 1657 int found = 0; 1658 struct hlist_head *lih = &l->list; 1659 struct hlist_node *node, *tmp; 1660 struct leaf_info *li = NULL; 1661 1662 hlist_for_each_entry_safe(li, node, tmp, lih, hlist) { 1663 found += trie_flush_list(t, &li->falh); 1664 1665 if (list_empty(&li->falh)) { 1666 hlist_del_rcu(&li->hlist); 1667 free_leaf_info(li); 1668 } 1669 } 1670 return found; 1671} 1672 1673/* rcu_read_lock needs to be hold by caller from readside */ 1674 1675static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf) 1676{ 1677 struct node *c = (struct node *) thisleaf; 1678 struct tnode *p; 1679 int idx; 1680 struct node *trie = rcu_dereference(t->trie); 1681 1682 if (c == NULL) { 1683 if (trie == NULL) 1684 return NULL; 1685 1686 if (IS_LEAF(trie)) /* trie w. just a leaf */ 1687 return (struct leaf *) trie; 1688 1689 p = (struct tnode*) trie; /* Start */ 1690 } else 1691 p = node_parent(c); 1692 1693 while (p) { 1694 int pos, last; 1695 1696 /* Find the next child of the parent */ 1697 if (c) 1698 pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits); 1699 else 1700 pos = 0; 1701 1702 last = 1 << p->bits; 1703 for (idx = pos; idx < last ; idx++) { 1704 c = rcu_dereference(p->child[idx]); 1705 1706 if (!c) 1707 continue; 1708 1709 /* Decend if tnode */ 1710 while (IS_TNODE(c)) { 1711 p = (struct tnode *) c; 1712 idx = 0; 1713 1714 /* Rightmost non-NULL branch */ 1715 if (p && IS_TNODE(p)) 1716 while (!(c = rcu_dereference(p->child[idx])) 1717 && idx < (1<<p->bits)) idx++; 1718 1719 /* Done with this tnode? */ 1720 if (idx >= (1 << p->bits) || !c) 1721 goto up; 1722 } 1723 return (struct leaf *) c; 1724 } 1725up: 1726 /* No more children go up one step */ 1727 c = (struct node *) p; 1728 p = node_parent(c); 1729 } 1730 return NULL; /* Ready. Root of trie */ 1731} 1732 1733/* 1734 * Caller must hold RTNL. 1735 */ 1736static int fn_trie_flush(struct fib_table *tb) 1737{ 1738 struct trie *t = (struct trie *) tb->tb_data; 1739 struct leaf *ll = NULL, *l = NULL; 1740 int found = 0, h; 1741 1742 for (h = 0; (l = nextleaf(t, l)) != NULL; h++) { 1743 found += trie_flush_leaf(t, l); 1744 1745 if (ll && hlist_empty(&ll->list)) 1746 trie_leaf_remove(t, ll->key); 1747 ll = l; 1748 } 1749 1750 if (ll && hlist_empty(&ll->list)) 1751 trie_leaf_remove(t, ll->key); 1752 1753 pr_debug("trie_flush found=%d\n", found); 1754 return found; 1755} 1756 1757static void 1758fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res) 1759{ 1760 struct trie *t = (struct trie *) tb->tb_data; 1761 int order, last_idx; 1762 struct fib_info *fi = NULL; 1763 struct fib_info *last_resort; 1764 struct fib_alias *fa = NULL; 1765 struct list_head *fa_head; 1766 struct leaf *l; 1767 1768 last_idx = -1; 1769 last_resort = NULL; 1770 order = -1; 1771 1772 rcu_read_lock(); 1773 1774 l = fib_find_node(t, 0); 1775 if (!l) 1776 goto out; 1777 1778 fa_head = get_fa_head(l, 0); 1779 if (!fa_head) 1780 goto out; 1781 1782 if (list_empty(fa_head)) 1783 goto out; 1784 1785 list_for_each_entry_rcu(fa, fa_head, fa_list) { 1786 struct fib_info *next_fi = fa->fa_info; 1787 1788 if (fa->fa_scope != res->scope || 1789 fa->fa_type != RTN_UNICAST) 1790 continue; 1791 1792 if (next_fi->fib_priority > res->fi->fib_priority) 1793 break; 1794 if (!next_fi->fib_nh[0].nh_gw || 1795 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) 1796 continue; 1797 fa->fa_state |= FA_S_ACCESSED; 1798 1799 if (fi == NULL) { 1800 if (next_fi != res->fi) 1801 break; 1802 } else if (!fib_detect_death(fi, order, &last_resort, 1803 &last_idx, tb->tb_default)) { 1804 fib_result_assign(res, fi); 1805 tb->tb_default = order; 1806 goto out; 1807 } 1808 fi = next_fi; 1809 order++; 1810 } 1811 if (order <= 0 || fi == NULL) { 1812 tb->tb_default = -1; 1813 goto out; 1814 } 1815 1816 if (!fib_detect_death(fi, order, &last_resort, &last_idx, 1817 tb->tb_default)) { 1818 fib_result_assign(res, fi); 1819 tb->tb_default = order; 1820 goto out; 1821 } 1822 if (last_idx >= 0) 1823 fib_result_assign(res, last_resort); 1824 tb->tb_default = last_idx; 1825out: 1826 rcu_read_unlock(); 1827} 1828 1829static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb, 1830 struct sk_buff *skb, struct netlink_callback *cb) 1831{ 1832 int i, s_i; 1833 struct fib_alias *fa; 1834 1835 __be32 xkey = htonl(key); 1836 1837 s_i = cb->args[4]; 1838 i = 0; 1839 1840 /* rcu_read_lock is hold by caller */ 1841 1842 list_for_each_entry_rcu(fa, fah, fa_list) { 1843 if (i < s_i) { 1844 i++; 1845 continue; 1846 } 1847 BUG_ON(!fa->fa_info); 1848 1849 if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid, 1850 cb->nlh->nlmsg_seq, 1851 RTM_NEWROUTE, 1852 tb->tb_id, 1853 fa->fa_type, 1854 fa->fa_scope, 1855 xkey, 1856 plen, 1857 fa->fa_tos, 1858 fa->fa_info, 0) < 0) { 1859 cb->args[4] = i; 1860 return -1; 1861 } 1862 i++; 1863 } 1864 cb->args[4] = i; 1865 return skb->len; 1866} 1867 1868static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb, 1869 struct netlink_callback *cb) 1870{ 1871 int h, s_h; 1872 struct list_head *fa_head; 1873 struct leaf *l = NULL; 1874 1875 s_h = cb->args[3]; 1876 1877 for (h = 0; (l = nextleaf(t, l)) != NULL; h++) { 1878 if (h < s_h) 1879 continue; 1880 if (h > s_h) 1881 memset(&cb->args[4], 0, 1882 sizeof(cb->args) - 4*sizeof(cb->args[0])); 1883 1884 fa_head = get_fa_head(l, plen); 1885 1886 if (!fa_head) 1887 continue; 1888 1889 if (list_empty(fa_head)) 1890 continue; 1891 1892 if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) { 1893 cb->args[3] = h; 1894 return -1; 1895 } 1896 } 1897 cb->args[3] = h; 1898 return skb->len; 1899} 1900 1901static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb) 1902{ 1903 int m, s_m; 1904 struct trie *t = (struct trie *) tb->tb_data; 1905 1906 s_m = cb->args[2]; 1907 1908 rcu_read_lock(); 1909 for (m = 0; m <= 32; m++) { 1910 if (m < s_m) 1911 continue; 1912 if (m > s_m) 1913 memset(&cb->args[3], 0, 1914 sizeof(cb->args) - 3*sizeof(cb->args[0])); 1915 1916 if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) { 1917 cb->args[2] = m; 1918 goto out; 1919 } 1920 } 1921 rcu_read_unlock(); 1922 cb->args[2] = m; 1923 return skb->len; 1924out: 1925 rcu_read_unlock(); 1926 return -1; 1927} 1928 1929/* Fix more generic FIB names for init later */ 1930 1931struct fib_table *fib_hash_init(u32 id) 1932{ 1933 struct fib_table *tb; 1934 struct trie *t; 1935 1936 if (fn_alias_kmem == NULL) 1937 fn_alias_kmem = kmem_cache_create("ip_fib_alias", 1938 sizeof(struct fib_alias), 1939 0, SLAB_HWCACHE_ALIGN, 1940 NULL); 1941 1942 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie), 1943 GFP_KERNEL); 1944 if (tb == NULL) 1945 return NULL; 1946 1947 tb->tb_id = id; 1948 tb->tb_default = -1; 1949 tb->tb_lookup = fn_trie_lookup; 1950 tb->tb_insert = fn_trie_insert; 1951 tb->tb_delete = fn_trie_delete; 1952 tb->tb_flush = fn_trie_flush; 1953 tb->tb_select_default = fn_trie_select_default; 1954 tb->tb_dump = fn_trie_dump; 1955 1956 t = (struct trie *) tb->tb_data; 1957 memset(t, 0, sizeof(*t)); 1958 1959 if (id == RT_TABLE_LOCAL) 1960 printk(KERN_INFO "IPv4 FIB: Using LC-trie version %s\n", VERSION); 1961 1962 return tb; 1963} 1964 1965#ifdef CONFIG_PROC_FS 1966/* Depth first Trie walk iterator */ 1967struct fib_trie_iter { 1968 struct seq_net_private p; 1969 struct trie *trie_local, *trie_main; 1970 struct tnode *tnode; 1971 struct trie *trie; 1972 unsigned index; 1973 unsigned depth; 1974}; 1975 1976static struct node *fib_trie_get_next(struct fib_trie_iter *iter) 1977{ 1978 struct tnode *tn = iter->tnode; 1979 unsigned cindex = iter->index; 1980 struct tnode *p; 1981 1982 /* A single entry routing table */ 1983 if (!tn) 1984 return NULL; 1985 1986 pr_debug("get_next iter={node=%p index=%d depth=%d}\n", 1987 iter->tnode, iter->index, iter->depth); 1988rescan: 1989 while (cindex < (1<<tn->bits)) { 1990 struct node *n = tnode_get_child(tn, cindex); 1991 1992 if (n) { 1993 if (IS_LEAF(n)) { 1994 iter->tnode = tn; 1995 iter->index = cindex + 1; 1996 } else { 1997 /* push down one level */ 1998 iter->tnode = (struct tnode *) n; 1999 iter->index = 0; 2000 ++iter->depth; 2001 } 2002 return n; 2003 } 2004 2005 ++cindex; 2006 } 2007 2008 /* Current node exhausted, pop back up */ 2009 p = node_parent((struct node *)tn); 2010 if (p) { 2011 cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1; 2012 tn = p; 2013 --iter->depth; 2014 goto rescan; 2015 } 2016 2017 /* got root? */ 2018 return NULL; 2019} 2020 2021static struct node *fib_trie_get_first(struct fib_trie_iter *iter, 2022 struct trie *t) 2023{ 2024 struct node *n ; 2025 2026 if (!t) 2027 return NULL; 2028 2029 n = rcu_dereference(t->trie); 2030 2031 if (!iter) 2032 return NULL; 2033 2034 if (n) { 2035 if (IS_TNODE(n)) { 2036 iter->tnode = (struct tnode *) n; 2037 iter->trie = t; 2038 iter->index = 0; 2039 iter->depth = 1; 2040 } else { 2041 iter->tnode = NULL; 2042 iter->trie = t; 2043 iter->index = 0; 2044 iter->depth = 0; 2045 } 2046 return n; 2047 } 2048 return NULL; 2049} 2050 2051static void trie_collect_stats(struct trie *t, struct trie_stat *s) 2052{ 2053 struct node *n; 2054 struct fib_trie_iter iter; 2055 2056 memset(s, 0, sizeof(*s)); 2057 2058 rcu_read_lock(); 2059 for (n = fib_trie_get_first(&iter, t); n; 2060 n = fib_trie_get_next(&iter)) { 2061 if (IS_LEAF(n)) { 2062 s->leaves++; 2063 s->totdepth += iter.depth; 2064 if (iter.depth > s->maxdepth) 2065 s->maxdepth = iter.depth; 2066 } else { 2067 const struct tnode *tn = (const struct tnode *) n; 2068 int i; 2069 2070 s->tnodes++; 2071 if (tn->bits < MAX_STAT_DEPTH) 2072 s->nodesizes[tn->bits]++; 2073 2074 for (i = 0; i < (1<<tn->bits); i++) 2075 if (!tn->child[i]) 2076 s->nullpointers++; 2077 } 2078 } 2079 rcu_read_unlock(); 2080} 2081 2082/* 2083 * This outputs /proc/net/fib_triestats 2084 */ 2085static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat) 2086{ 2087 unsigned i, max, pointers, bytes, avdepth; 2088 2089 if (stat->leaves) 2090 avdepth = stat->totdepth*100 / stat->leaves; 2091 else 2092 avdepth = 0; 2093 2094 seq_printf(seq, "\tAver depth: %u.%02d\n", avdepth / 100, avdepth % 100 ); 2095 seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth); 2096 2097 seq_printf(seq, "\tLeaves: %u\n", stat->leaves); 2098 2099 bytes = sizeof(struct leaf) * stat->leaves; 2100 seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes); 2101 bytes += sizeof(struct tnode) * stat->tnodes; 2102 2103 max = MAX_STAT_DEPTH; 2104 while (max > 0 && stat->nodesizes[max-1] == 0) 2105 max--; 2106 2107 pointers = 0; 2108 for (i = 1; i <= max; i++) 2109 if (stat->nodesizes[i] != 0) { 2110 seq_printf(seq, " %u: %u", i, stat->nodesizes[i]); 2111 pointers += (1<<i) * stat->nodesizes[i]; 2112 } 2113 seq_putc(seq, '\n'); 2114 seq_printf(seq, "\tPointers: %u\n", pointers); 2115 2116 bytes += sizeof(struct node *) * pointers; 2117 seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers); 2118 seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024); 2119} 2120 2121#ifdef CONFIG_IP_FIB_TRIE_STATS 2122static void trie_show_usage(struct seq_file *seq, 2123 const struct trie_use_stats *stats) 2124{ 2125 seq_printf(seq, "\nCounters:\n---------\n"); 2126 seq_printf(seq,"gets = %u\n", stats->gets); 2127 seq_printf(seq,"backtracks = %u\n", stats->backtrack); 2128 seq_printf(seq,"semantic match passed = %u\n", stats->semantic_match_passed); 2129 seq_printf(seq,"semantic match miss = %u\n", stats->semantic_match_miss); 2130 seq_printf(seq,"null node hit= %u\n", stats->null_node_hit); 2131 seq_printf(seq,"skipped node resize = %u\n\n", stats->resize_node_skipped); 2132} 2133#endif /* CONFIG_IP_FIB_TRIE_STATS */ 2134 2135 2136static int fib_triestat_seq_show(struct seq_file *seq, void *v) 2137{ 2138 struct net *net = (struct net *)seq->private; 2139 struct trie *trie_local, *trie_main; 2140 struct trie_stat *stat; 2141 struct fib_table *tb; 2142 2143 trie_local = NULL; 2144 tb = fib_get_table(net, RT_TABLE_LOCAL); 2145 if (tb) 2146 trie_local = (struct trie *) tb->tb_data; 2147 2148 trie_main = NULL; 2149 tb = fib_get_table(net, RT_TABLE_MAIN); 2150 if (tb) 2151 trie_main = (struct trie *) tb->tb_data; 2152 2153 2154 stat = kmalloc(sizeof(*stat), GFP_KERNEL); 2155 if (!stat) 2156 return -ENOMEM; 2157 2158 seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n", 2159 sizeof(struct leaf), sizeof(struct tnode)); 2160 2161 if (trie_local) { 2162 seq_printf(seq, "Local:\n"); 2163 trie_collect_stats(trie_local, stat); 2164 trie_show_stats(seq, stat); 2165#ifdef CONFIG_IP_FIB_TRIE_STATS 2166 trie_show_usage(seq, &trie_local->stats); 2167#endif 2168 } 2169 2170 if (trie_main) { 2171 seq_printf(seq, "Main:\n"); 2172 trie_collect_stats(trie_main, stat); 2173 trie_show_stats(seq, stat); 2174#ifdef CONFIG_IP_FIB_TRIE_STATS 2175 trie_show_usage(seq, &trie_main->stats); 2176#endif 2177 } 2178 kfree(stat); 2179 2180 return 0; 2181} 2182 2183static int fib_triestat_seq_open(struct inode *inode, struct file *file) 2184{ 2185 int err; 2186 struct net *net; 2187 2188 net = get_proc_net(inode); 2189 if (net == NULL) 2190 return -ENXIO; 2191 err = single_open(file, fib_triestat_seq_show, net); 2192 if (err < 0) { 2193 put_net(net); 2194 return err; 2195 } 2196 return 0; 2197} 2198 2199static int fib_triestat_seq_release(struct inode *ino, struct file *f) 2200{ 2201 struct seq_file *seq = f->private_data; 2202 put_net(seq->private); 2203 return single_release(ino, f); 2204} 2205 2206static const struct file_operations fib_triestat_fops = { 2207 .owner = THIS_MODULE, 2208 .open = fib_triestat_seq_open, 2209 .read = seq_read, 2210 .llseek = seq_lseek, 2211 .release = fib_triestat_seq_release, 2212}; 2213 2214static struct node *fib_trie_get_idx(struct fib_trie_iter *iter, 2215 loff_t pos) 2216{ 2217 loff_t idx = 0; 2218 struct node *n; 2219 2220 for (n = fib_trie_get_first(iter, iter->trie_local); 2221 n; ++idx, n = fib_trie_get_next(iter)) { 2222 if (pos == idx) 2223 return n; 2224 } 2225 2226 for (n = fib_trie_get_first(iter, iter->trie_main); 2227 n; ++idx, n = fib_trie_get_next(iter)) { 2228 if (pos == idx) 2229 return n; 2230 } 2231 return NULL; 2232} 2233 2234static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos) 2235 __acquires(RCU) 2236{ 2237 struct fib_trie_iter *iter = seq->private; 2238 struct fib_table *tb; 2239 2240 if (!iter->trie_local) { 2241 tb = fib_get_table(iter->p.net, RT_TABLE_LOCAL); 2242 if (tb) 2243 iter->trie_local = (struct trie *) tb->tb_data; 2244 } 2245 if (!iter->trie_main) { 2246 tb = fib_get_table(iter->p.net, RT_TABLE_MAIN); 2247 if (tb) 2248 iter->trie_main = (struct trie *) tb->tb_data; 2249 } 2250 rcu_read_lock(); 2251 if (*pos == 0) 2252 return SEQ_START_TOKEN; 2253 return fib_trie_get_idx(iter, *pos - 1); 2254} 2255 2256static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2257{ 2258 struct fib_trie_iter *iter = seq->private; 2259 void *l = v; 2260 2261 ++*pos; 2262 if (v == SEQ_START_TOKEN) 2263 return fib_trie_get_idx(iter, 0); 2264 2265 v = fib_trie_get_next(iter); 2266 BUG_ON(v == l); 2267 if (v) 2268 return v; 2269 2270 /* continue scan in next trie */ 2271 if (iter->trie == iter->trie_local) 2272 return fib_trie_get_first(iter, iter->trie_main); 2273 2274 return NULL; 2275} 2276 2277static void fib_trie_seq_stop(struct seq_file *seq, void *v) 2278 __releases(RCU) 2279{ 2280 rcu_read_unlock(); 2281} 2282 2283static void seq_indent(struct seq_file *seq, int n) 2284{ 2285 while (n-- > 0) seq_puts(seq, " "); 2286} 2287 2288static inline const char *rtn_scope(enum rt_scope_t s) 2289{ 2290 static char buf[32]; 2291 2292 switch (s) { 2293 case RT_SCOPE_UNIVERSE: return "universe"; 2294 case RT_SCOPE_SITE: return "site"; 2295 case RT_SCOPE_LINK: return "link"; 2296 case RT_SCOPE_HOST: return "host"; 2297 case RT_SCOPE_NOWHERE: return "nowhere"; 2298 default: 2299 snprintf(buf, sizeof(buf), "scope=%d", s); 2300 return buf; 2301 } 2302} 2303 2304static const char *rtn_type_names[__RTN_MAX] = { 2305 [RTN_UNSPEC] = "UNSPEC", 2306 [RTN_UNICAST] = "UNICAST", 2307 [RTN_LOCAL] = "LOCAL", 2308 [RTN_BROADCAST] = "BROADCAST", 2309 [RTN_ANYCAST] = "ANYCAST", 2310 [RTN_MULTICAST] = "MULTICAST", 2311 [RTN_BLACKHOLE] = "BLACKHOLE", 2312 [RTN_UNREACHABLE] = "UNREACHABLE", 2313 [RTN_PROHIBIT] = "PROHIBIT", 2314 [RTN_THROW] = "THROW", 2315 [RTN_NAT] = "NAT", 2316 [RTN_XRESOLVE] = "XRESOLVE", 2317}; 2318 2319static inline const char *rtn_type(unsigned t) 2320{ 2321 static char buf[32]; 2322 2323 if (t < __RTN_MAX && rtn_type_names[t]) 2324 return rtn_type_names[t]; 2325 snprintf(buf, sizeof(buf), "type %u", t); 2326 return buf; 2327} 2328 2329/* Pretty print the trie */ 2330static int fib_trie_seq_show(struct seq_file *seq, void *v) 2331{ 2332 const struct fib_trie_iter *iter = seq->private; 2333 struct node *n = v; 2334 2335 if (v == SEQ_START_TOKEN) 2336 return 0; 2337 2338 if (!node_parent(n)) { 2339 if (iter->trie == iter->trie_local) 2340 seq_puts(seq, "<local>:\n"); 2341 else 2342 seq_puts(seq, "<main>:\n"); 2343 } 2344 2345 if (IS_TNODE(n)) { 2346 struct tnode *tn = (struct tnode *) n; 2347 __be32 prf = htonl(mask_pfx(tn->key, tn->pos)); 2348 2349 seq_indent(seq, iter->depth-1); 2350 seq_printf(seq, " +-- %d.%d.%d.%d/%d %d %d %d\n", 2351 NIPQUAD(prf), tn->pos, tn->bits, tn->full_children, 2352 tn->empty_children); 2353 2354 } else { 2355 struct leaf *l = (struct leaf *) n; 2356 int i; 2357 __be32 val = htonl(l->key); 2358 2359 seq_indent(seq, iter->depth); 2360 seq_printf(seq, " |-- %d.%d.%d.%d\n", NIPQUAD(val)); 2361 for (i = 32; i >= 0; i--) { 2362 struct leaf_info *li = find_leaf_info(l, i); 2363 if (li) { 2364 struct fib_alias *fa; 2365 list_for_each_entry_rcu(fa, &li->falh, fa_list) { 2366 seq_indent(seq, iter->depth+1); 2367 seq_printf(seq, " /%d %s %s", i, 2368 rtn_scope(fa->fa_scope), 2369 rtn_type(fa->fa_type)); 2370 if (fa->fa_tos) 2371 seq_printf(seq, "tos =%d\n", 2372 fa->fa_tos); 2373 seq_putc(seq, '\n'); 2374 } 2375 } 2376 } 2377 } 2378 2379 return 0; 2380} 2381 2382static const struct seq_operations fib_trie_seq_ops = { 2383 .start = fib_trie_seq_start, 2384 .next = fib_trie_seq_next, 2385 .stop = fib_trie_seq_stop, 2386 .show = fib_trie_seq_show, 2387}; 2388 2389static int fib_trie_seq_open(struct inode *inode, struct file *file) 2390{ 2391 return seq_open_net(inode, file, &fib_trie_seq_ops, 2392 sizeof(struct fib_trie_iter)); 2393} 2394 2395static const struct file_operations fib_trie_fops = { 2396 .owner = THIS_MODULE, 2397 .open = fib_trie_seq_open, 2398 .read = seq_read, 2399 .llseek = seq_lseek, 2400 .release = seq_release_net, 2401}; 2402 2403static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi) 2404{ 2405 static unsigned type2flags[RTN_MAX + 1] = { 2406 [7] = RTF_REJECT, [8] = RTF_REJECT, 2407 }; 2408 unsigned flags = type2flags[type]; 2409 2410 if (fi && fi->fib_nh->nh_gw) 2411 flags |= RTF_GATEWAY; 2412 if (mask == htonl(0xFFFFFFFF)) 2413 flags |= RTF_HOST; 2414 flags |= RTF_UP; 2415 return flags; 2416} 2417 2418/* 2419 * This outputs /proc/net/route. 2420 * The format of the file is not supposed to be changed 2421 * and needs to be same as fib_hash output to avoid breaking 2422 * legacy utilities 2423 */ 2424static int fib_route_seq_show(struct seq_file *seq, void *v) 2425{ 2426 const struct fib_trie_iter *iter = seq->private; 2427 struct leaf *l = v; 2428 int i; 2429 char bf[128]; 2430 2431 if (v == SEQ_START_TOKEN) { 2432 seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway " 2433 "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU" 2434 "\tWindow\tIRTT"); 2435 return 0; 2436 } 2437 2438 if (iter->trie == iter->trie_local) 2439 return 0; 2440 if (IS_TNODE(l)) 2441 return 0; 2442 2443 for (i=32; i>=0; i--) { 2444 struct leaf_info *li = find_leaf_info(l, i); 2445 struct fib_alias *fa; 2446 __be32 mask, prefix; 2447 2448 if (!li) 2449 continue; 2450 2451 mask = inet_make_mask(li->plen); 2452 prefix = htonl(l->key); 2453 2454 list_for_each_entry_rcu(fa, &li->falh, fa_list) { 2455 const struct fib_info *fi = fa->fa_info; 2456 unsigned flags = fib_flag_trans(fa->fa_type, mask, fi); 2457 2458 if (fa->fa_type == RTN_BROADCAST 2459 || fa->fa_type == RTN_MULTICAST) 2460 continue; 2461 2462 if (fi) 2463 snprintf(bf, sizeof(bf), 2464 "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u", 2465 fi->fib_dev ? fi->fib_dev->name : "*", 2466 prefix, 2467 fi->fib_nh->nh_gw, flags, 0, 0, 2468 fi->fib_priority, 2469 mask, 2470 (fi->fib_advmss ? fi->fib_advmss + 40 : 0), 2471 fi->fib_window, 2472 fi->fib_rtt >> 3); 2473 else 2474 snprintf(bf, sizeof(bf), 2475 "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u", 2476 prefix, 0, flags, 0, 0, 0, 2477 mask, 0, 0, 0); 2478 2479 seq_printf(seq, "%-127s\n", bf); 2480 } 2481 } 2482 2483 return 0; 2484} 2485 2486static const struct seq_operations fib_route_seq_ops = { 2487 .start = fib_trie_seq_start, 2488 .next = fib_trie_seq_next, 2489 .stop = fib_trie_seq_stop, 2490 .show = fib_route_seq_show, 2491}; 2492 2493static int fib_route_seq_open(struct inode *inode, struct file *file) 2494{ 2495 return seq_open_net(inode, file, &fib_route_seq_ops, 2496 sizeof(struct fib_trie_iter)); 2497} 2498 2499static const struct file_operations fib_route_fops = { 2500 .owner = THIS_MODULE, 2501 .open = fib_route_seq_open, 2502 .read = seq_read, 2503 .llseek = seq_lseek, 2504 .release = seq_release_net, 2505}; 2506 2507int __net_init fib_proc_init(struct net *net) 2508{ 2509 if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops)) 2510 goto out1; 2511 2512 if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO, 2513 &fib_triestat_fops)) 2514 goto out2; 2515 2516 if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops)) 2517 goto out3; 2518 2519 return 0; 2520 2521out3: 2522 proc_net_remove(net, "fib_triestat"); 2523out2: 2524 proc_net_remove(net, "fib_trie"); 2525out1: 2526 return -ENOMEM; 2527} 2528 2529void __net_exit fib_proc_exit(struct net *net) 2530{ 2531 proc_net_remove(net, "fib_trie"); 2532 proc_net_remove(net, "fib_triestat"); 2533 proc_net_remove(net, "route"); 2534} 2535 2536#endif /* CONFIG_PROC_FS */ 2537