fib_trie.c revision 2f36895aa774cf4d1c3d68921e0209e796b66600
1/*
2 *   This program is free software; you can redistribute it and/or
3 *   modify it under the terms of the GNU General Public License
4 *   as published by the Free Software Foundation; either version
5 *   2 of the License, or (at your option) any later version.
6 *
7 *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 *     & Swedish University of Agricultural Sciences.
9 *
10 *   Jens Laas <jens.laas@data.slu.se> Swedish University of
11 *     Agricultural Sciences.
12 *
13 *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
14 *
15 * This work is based on the LPC-trie which is originally descibed in:
16 *
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
25 * Version:	$Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
26 *
27 *
28 * Code from fib_hash has been reused which includes the following header:
29 *
30 *
31 * INET		An implementation of the TCP/IP protocol suite for the LINUX
32 *		operating system.  INET is implemented using the  BSD Socket
33 *		interface as the means of communication with the user level.
34 *
35 *		IPv4 FIB: lookup engine and maintenance routines.
36 *
37 *
38 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
39 *
40 *		This program is free software; you can redistribute it and/or
41 *		modify it under the terms of the GNU General Public License
42 *		as published by the Free Software Foundation; either version
43 *		2 of the License, or (at your option) any later version.
44 */
45
46#define VERSION "0.325"
47
48#include <linux/config.h>
49#include <asm/uaccess.h>
50#include <asm/system.h>
51#include <asm/bitops.h>
52#include <linux/types.h>
53#include <linux/kernel.h>
54#include <linux/sched.h>
55#include <linux/mm.h>
56#include <linux/string.h>
57#include <linux/socket.h>
58#include <linux/sockios.h>
59#include <linux/errno.h>
60#include <linux/in.h>
61#include <linux/inet.h>
62#include <linux/netdevice.h>
63#include <linux/if_arp.h>
64#include <linux/proc_fs.h>
65#include <linux/skbuff.h>
66#include <linux/netlink.h>
67#include <linux/init.h>
68#include <linux/list.h>
69#include <net/ip.h>
70#include <net/protocol.h>
71#include <net/route.h>
72#include <net/tcp.h>
73#include <net/sock.h>
74#include <net/ip_fib.h>
75#include "fib_lookup.h"
76
77#undef CONFIG_IP_FIB_TRIE_STATS
78#define MAX_CHILDS 16384
79
80#define EXTRACT(p, n, str) ((str)<<(p)>>(32-(n)))
81#define KEYLENGTH (8*sizeof(t_key))
82#define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l))
83#define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset))
84
85static DEFINE_RWLOCK(fib_lock);
86
87typedef unsigned int t_key;
88
89#define T_TNODE 0
90#define T_LEAF  1
91#define NODE_TYPE_MASK	0x1UL
92#define NODE_PARENT(_node) \
93((struct tnode *)((_node)->_parent & ~NODE_TYPE_MASK))
94#define NODE_SET_PARENT(_node, _ptr) \
95((_node)->_parent = (((unsigned long)(_ptr)) | \
96                     ((_node)->_parent & NODE_TYPE_MASK)))
97#define NODE_INIT_PARENT(_node, _type) \
98((_node)->_parent = (_type))
99#define NODE_TYPE(_node) \
100((_node)->_parent & NODE_TYPE_MASK)
101
102#define IS_TNODE(n) (!(n->_parent & T_LEAF))
103#define IS_LEAF(n) (n->_parent & T_LEAF)
104
105struct node {
106        t_key key;
107	unsigned long _parent;
108};
109
110struct leaf {
111        t_key key;
112	unsigned long _parent;
113	struct hlist_head list;
114};
115
116struct leaf_info {
117	struct hlist_node hlist;
118	int plen;
119	struct list_head falh;
120};
121
122struct tnode {
123        t_key key;
124	unsigned long _parent;
125        unsigned short pos:5;        /* 2log(KEYLENGTH) bits needed */
126        unsigned short bits:5;       /* 2log(KEYLENGTH) bits needed */
127        unsigned short full_children;  /* KEYLENGTH bits needed */
128        unsigned short empty_children; /* KEYLENGTH bits needed */
129        struct node *child[0];
130};
131
132#ifdef CONFIG_IP_FIB_TRIE_STATS
133struct trie_use_stats {
134	unsigned int gets;
135	unsigned int backtrack;
136	unsigned int semantic_match_passed;
137	unsigned int semantic_match_miss;
138	unsigned int null_node_hit;
139	unsigned int resize_node_skipped;
140};
141#endif
142
143struct trie_stat {
144	unsigned int totdepth;
145	unsigned int maxdepth;
146	unsigned int tnodes;
147	unsigned int leaves;
148	unsigned int nullpointers;
149	unsigned int nodesizes[MAX_CHILDS];
150};
151
152struct trie {
153        struct node *trie;
154#ifdef CONFIG_IP_FIB_TRIE_STATS
155	struct trie_use_stats stats;
156#endif
157        int size;
158	unsigned int revision;
159};
160
161static int trie_debug = 0;
162
163static int tnode_full(struct tnode *tn, struct node *n);
164static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
165static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
166static int tnode_child_length(struct tnode *tn);
167static struct node *resize(struct trie *t, struct tnode *tn);
168static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err);
169static struct tnode *halve(struct trie *t, struct tnode *tn, int *err);
170static void tnode_free(struct tnode *tn);
171static void trie_dump_seq(struct seq_file *seq, struct trie *t);
172extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio);
173extern int fib_detect_death(struct fib_info *fi, int order,
174                            struct fib_info **last_resort, int *last_idx, int *dflt);
175
176extern void rtmsg_fib(int event, u32 key, struct fib_alias *fa, int z, int tb_id,
177               struct nlmsghdr *n, struct netlink_skb_parms *req);
178
179static kmem_cache_t *fn_alias_kmem;
180static struct trie *trie_local = NULL, *trie_main = NULL;
181
182static void trie_bug(char *err)
183{
184	printk("Trie Bug: %s\n", err);
185	BUG();
186}
187
188static inline struct node *tnode_get_child(struct tnode *tn, int i)
189{
190        if (i >=  1<<tn->bits)
191                trie_bug("tnode_get_child");
192
193        return tn->child[i];
194}
195
196static inline int tnode_child_length(struct tnode *tn)
197{
198        return 1<<tn->bits;
199}
200
201/*
202  _________________________________________________________________
203  | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
204  ----------------------------------------------------------------
205    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
206
207  _________________________________________________________________
208  | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
209  -----------------------------------------------------------------
210   16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
211
212  tp->pos = 7
213  tp->bits = 3
214  n->pos = 15
215  n->bits=4
216  KEYLENGTH=32
217*/
218
219static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
220{
221        if (offset < KEYLENGTH)
222		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
223        else
224		return 0;
225}
226
227static inline int tkey_equals(t_key a, t_key b)
228{
229  return a == b;
230}
231
232static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
233{
234     if (bits == 0 || offset >= KEYLENGTH)
235            return 1;
236        bits = bits > KEYLENGTH ? KEYLENGTH : bits;
237        return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
238}
239
240static inline int tkey_mismatch(t_key a, int offset, t_key b)
241{
242	t_key diff = a ^ b;
243	int i = offset;
244
245	if(!diff)
246	  return 0;
247	while((diff << i) >> (KEYLENGTH-1) == 0)
248		i++;
249	return i;
250}
251
252/* Candiate for fib_semantics */
253
254static void fn_free_alias(struct fib_alias *fa)
255{
256	fib_release_info(fa->fa_info);
257	kmem_cache_free(fn_alias_kmem, fa);
258}
259
260/*
261  To understand this stuff, an understanding of keys and all their bits is
262  necessary. Every node in the trie has a key associated with it, but not
263  all of the bits in that key are significant.
264
265  Consider a node 'n' and its parent 'tp'.
266
267  If n is a leaf, every bit in its key is significant. Its presence is
268  necessitaded by path compression, since during a tree traversal (when
269  searching for a leaf - unless we are doing an insertion) we will completely
270  ignore all skipped bits we encounter. Thus we need to verify, at the end of
271  a potentially successful search, that we have indeed been walking the
272  correct key path.
273
274  Note that we can never "miss" the correct key in the tree if present by
275  following the wrong path. Path compression ensures that segments of the key
276  that are the same for all keys with a given prefix are skipped, but the
277  skipped part *is* identical for each node in the subtrie below the skipped
278  bit! trie_insert() in this implementation takes care of that - note the
279  call to tkey_sub_equals() in trie_insert().
280
281  if n is an internal node - a 'tnode' here, the various parts of its key
282  have many different meanings.
283
284  Example:
285  _________________________________________________________________
286  | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
287  -----------------------------------------------------------------
288    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
289
290  _________________________________________________________________
291  | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
292  -----------------------------------------------------------------
293   16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
294
295  tp->pos = 7
296  tp->bits = 3
297  n->pos = 15
298  n->bits=4
299
300  First, let's just ignore the bits that come before the parent tp, that is
301  the bits from 0 to (tp->pos-1). They are *known* but at this point we do
302  not use them for anything.
303
304  The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
305  index into the parent's child array. That is, they will be used to find
306  'n' among tp's children.
307
308  The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
309  for the node n.
310
311  All the bits we have seen so far are significant to the node n. The rest
312  of the bits are really not needed or indeed known in n->key.
313
314  The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
315  n's child array, and will of course be different for each child.
316
317  The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
318  at this point.
319
320*/
321
322static void check_tnode(struct tnode *tn)
323{
324	if(tn && tn->pos+tn->bits > 32) {
325		printk("TNODE ERROR tn=%p, pos=%d, bits=%d\n", tn, tn->pos, tn->bits);
326	}
327}
328
329static int halve_threshold = 25;
330static int inflate_threshold = 50;
331
332static struct leaf *leaf_new(void)
333{
334	struct leaf *l = kmalloc(sizeof(struct leaf),  GFP_KERNEL);
335	if(l) {
336		NODE_INIT_PARENT(l, T_LEAF);
337		INIT_HLIST_HEAD(&l->list);
338	}
339	return l;
340}
341
342static struct leaf_info *leaf_info_new(int plen)
343{
344	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
345	if(li) {
346		li->plen = plen;
347		INIT_LIST_HEAD(&li->falh);
348	}
349	return li;
350}
351
352static inline void free_leaf(struct leaf *l)
353{
354	kfree(l);
355}
356
357static inline void free_leaf_info(struct leaf_info *li)
358{
359	kfree(li);
360}
361
362static struct tnode *tnode_alloc(unsigned int size)
363{
364	if (size <= PAGE_SIZE) {
365		return kmalloc(size, GFP_KERNEL);
366	} else {
367		return (struct tnode *)
368		       __get_free_pages(GFP_KERNEL, get_order(size));
369	}
370}
371
372static void __tnode_free(struct tnode *tn)
373{
374	unsigned int size = sizeof(struct tnode) +
375	                    (1<<tn->bits) * sizeof(struct node *);
376
377	if (size <= PAGE_SIZE)
378		kfree(tn);
379	else
380		free_pages((unsigned long)tn, get_order(size));
381}
382
383static struct tnode* tnode_new(t_key key, int pos, int bits)
384{
385	int nchildren = 1<<bits;
386	int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
387	struct tnode *tn = tnode_alloc(sz);
388
389	if(tn)  {
390		memset(tn, 0, sz);
391		NODE_INIT_PARENT(tn, T_TNODE);
392		tn->pos = pos;
393		tn->bits = bits;
394		tn->key = key;
395		tn->full_children = 0;
396		tn->empty_children = 1<<bits;
397	}
398	if(trie_debug > 0)
399		printk("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
400		       (unsigned int) (sizeof(struct node) * 1<<bits));
401	return tn;
402}
403
404static void tnode_free(struct tnode *tn)
405{
406	if(!tn) {
407		trie_bug("tnode_free\n");
408	}
409	if(IS_LEAF(tn)) {
410		free_leaf((struct leaf *)tn);
411		if(trie_debug > 0 )
412			printk("FL %p \n", tn);
413	}
414	else if(IS_TNODE(tn)) {
415		__tnode_free(tn);
416		if(trie_debug > 0 )
417			printk("FT %p \n", tn);
418	}
419	else {
420		trie_bug("tnode_free\n");
421	}
422}
423
424/*
425 * Check whether a tnode 'n' is "full", i.e. it is an internal node
426 * and no bits are skipped. See discussion in dyntree paper p. 6
427 */
428
429static inline int tnode_full(struct tnode *tn, struct node *n)
430{
431	if(n == NULL || IS_LEAF(n))
432		return 0;
433
434	return ((struct tnode *) n)->pos == tn->pos + tn->bits;
435}
436
437static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
438{
439	tnode_put_child_reorg(tn, i, n, -1);
440}
441
442 /*
443  * Add a child at position i overwriting the old value.
444  * Update the value of full_children and empty_children.
445  */
446
447static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
448{
449	struct node *chi;
450	int isfull;
451
452	if(i >=  1<<tn->bits) {
453		printk("bits=%d, i=%d\n", tn->bits, i);
454		trie_bug("tnode_put_child_reorg bits");
455	}
456	write_lock_bh(&fib_lock);
457	chi = tn->child[i];
458
459	/* update emptyChildren */
460	if (n == NULL && chi != NULL)
461		tn->empty_children++;
462	else if (n != NULL && chi == NULL)
463		tn->empty_children--;
464
465	/* update fullChildren */
466        if (wasfull == -1)
467		wasfull = tnode_full(tn, chi);
468
469	isfull = tnode_full(tn, n);
470	if (wasfull && !isfull)
471		tn->full_children--;
472
473	else if (!wasfull && isfull)
474		tn->full_children++;
475	if(n)
476		NODE_SET_PARENT(n, tn);
477
478	tn->child[i] = n;
479	write_unlock_bh(&fib_lock);
480}
481
482static struct node *resize(struct trie *t, struct tnode *tn)
483{
484	int i;
485	int err = 0;
486
487 	if (!tn)
488		return NULL;
489
490	if(trie_debug)
491		printk("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
492		      tn, inflate_threshold, halve_threshold);
493
494	/* No children */
495	if (tn->empty_children == tnode_child_length(tn)) {
496		tnode_free(tn);
497		return NULL;
498	}
499	/* One child */
500	if (tn->empty_children == tnode_child_length(tn) - 1)
501		for (i = 0; i < tnode_child_length(tn); i++) {
502
503			write_lock_bh(&fib_lock);
504			if (tn->child[i] != NULL) {
505
506				/* compress one level */
507				struct node *n = tn->child[i];
508				if(n)
509					NODE_INIT_PARENT(n, NODE_TYPE(n));
510
511				write_unlock_bh(&fib_lock);
512				tnode_free(tn);
513				return n;
514			}
515			write_unlock_bh(&fib_lock);
516		}
517	/*
518	 * Double as long as the resulting node has a number of
519	 * nonempty nodes that are above the threshold.
520	 */
521
522	/*
523	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
524	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
525	 * Telecommunications, page 6:
526	 * "A node is doubled if the ratio of non-empty children to all
527	 * children in the *doubled* node is at least 'high'."
528	 *
529	 * 'high' in this instance is the variable 'inflate_threshold'. It
530	 * is expressed as a percentage, so we multiply it with
531	 * tnode_child_length() and instead of multiplying by 2 (since the
532	 * child array will be doubled by inflate()) and multiplying
533	 * the left-hand side by 100 (to handle the percentage thing) we
534	 * multiply the left-hand side by 50.
535	 *
536	 * The left-hand side may look a bit weird: tnode_child_length(tn)
537	 * - tn->empty_children is of course the number of non-null children
538	 * in the current node. tn->full_children is the number of "full"
539	 * children, that is non-null tnodes with a skip value of 0.
540	 * All of those will be doubled in the resulting inflated tnode, so
541	 * we just count them one extra time here.
542	 *
543	 * A clearer way to write this would be:
544	 *
545	 * to_be_doubled = tn->full_children;
546	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
547	 *     tn->full_children;
548	 *
549	 * new_child_length = tnode_child_length(tn) * 2;
550	 *
551	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
552	 *      new_child_length;
553	 * if (new_fill_factor >= inflate_threshold)
554	 *
555	 * ...and so on, tho it would mess up the while() loop.
556	 *
557	 * anyway,
558	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
559	 *      inflate_threshold
560	 *
561	 * avoid a division:
562	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
563	 *      inflate_threshold * new_child_length
564	 *
565	 * expand not_to_be_doubled and to_be_doubled, and shorten:
566	 * 100 * (tnode_child_length(tn) - tn->empty_children +
567	 *    tn->full_children ) >= inflate_threshold * new_child_length
568	 *
569	 * expand new_child_length:
570	 * 100 * (tnode_child_length(tn) - tn->empty_children +
571	 *    tn->full_children ) >=
572	 *      inflate_threshold * tnode_child_length(tn) * 2
573	 *
574	 * shorten again:
575	 * 50 * (tn->full_children + tnode_child_length(tn) -
576	 *    tn->empty_children ) >= inflate_threshold *
577	 *    tnode_child_length(tn)
578	 *
579	 */
580
581	check_tnode(tn);
582
583	err = 0;
584	while ((tn->full_children > 0 &&
585	       50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
586				inflate_threshold * tnode_child_length(tn))) {
587
588		tn = inflate(t, tn, &err);
589
590		if(err) {
591#ifdef CONFIG_IP_FIB_TRIE_STATS
592			t->stats.resize_node_skipped++;
593#endif
594			break;
595		}
596	}
597
598	check_tnode(tn);
599
600	/*
601	 * Halve as long as the number of empty children in this
602	 * node is above threshold.
603	 */
604
605	err = 0;
606	while (tn->bits > 1 &&
607	       100 * (tnode_child_length(tn) - tn->empty_children) <
608	       halve_threshold * tnode_child_length(tn)) {
609
610		tn = halve(t, tn, &err);
611
612		if(err) {
613#ifdef CONFIG_IP_FIB_TRIE_STATS
614			t->stats.resize_node_skipped++;
615#endif
616			break;
617		}
618	}
619
620
621	/* Only one child remains */
622
623	if (tn->empty_children == tnode_child_length(tn) - 1)
624		for (i = 0; i < tnode_child_length(tn); i++) {
625
626			write_lock_bh(&fib_lock);
627			if (tn->child[i] != NULL) {
628				/* compress one level */
629				struct node *n = tn->child[i];
630
631				if(n)
632					NODE_INIT_PARENT(n, NODE_TYPE(n));
633
634				write_unlock_bh(&fib_lock);
635				tnode_free(tn);
636				return n;
637			}
638			write_unlock_bh(&fib_lock);
639		}
640
641	return (struct node *) tn;
642}
643
644static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err)
645{
646	struct tnode *inode;
647	struct tnode *oldtnode = tn;
648	int olen = tnode_child_length(tn);
649	int i;
650
651  	if(trie_debug)
652		printk("In inflate\n");
653
654	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
655
656	if (!tn) {
657		*err = -ENOMEM;
658		return oldtnode;
659	}
660
661	/*
662	 * Preallocate and store tnodes before the actual work so we
663	 * don't get into an inconsistent state if memory allocation
664	 * fails. In case of failure we return the oldnode and  inflate
665	 * of tnode is ignored.
666	 */
667
668	for(i = 0; i < olen; i++) {
669		struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
670
671		if (inode &&
672		    IS_TNODE(inode) &&
673		    inode->pos == oldtnode->pos + oldtnode->bits &&
674		    inode->bits > 1) {
675			struct tnode *left, *right;
676
677			t_key m = TKEY_GET_MASK(inode->pos, 1);
678
679			left = tnode_new(inode->key&(~m), inode->pos + 1,
680					 inode->bits - 1);
681
682			if(!left) {
683				*err = -ENOMEM;
684				break;
685			}
686
687			right = tnode_new(inode->key|m, inode->pos + 1,
688					  inode->bits - 1);
689
690			if(!right) {
691				*err = -ENOMEM;
692				break;
693			}
694
695			put_child(t, tn, 2*i, (struct node *) left);
696			put_child(t, tn, 2*i+1, (struct node *) right);
697		}
698	}
699
700	if(*err) {
701		int size = tnode_child_length(tn);
702		int j;
703
704		for(j = 0; j < size; j++)
705			if( tn->child[j])
706				tnode_free((struct tnode *)tn->child[j]);
707
708		tnode_free(tn);
709
710		*err = -ENOMEM;
711		return oldtnode;
712	}
713
714	for(i = 0; i < olen; i++) {
715		struct node *node = tnode_get_child(oldtnode, i);
716
717		/* An empty child */
718		if (node == NULL)
719			continue;
720
721		/* A leaf or an internal node with skipped bits */
722
723		if(IS_LEAF(node) || ((struct tnode *) node)->pos >
724		   tn->pos + tn->bits - 1) {
725			if(tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
726					     1) == 0)
727				put_child(t, tn, 2*i, node);
728			else
729				put_child(t, tn, 2*i+1, node);
730			continue;
731		}
732
733		/* An internal node with two children */
734		inode = (struct tnode *) node;
735
736		if (inode->bits == 1) {
737			put_child(t, tn, 2*i, inode->child[0]);
738			put_child(t, tn, 2*i+1, inode->child[1]);
739
740			tnode_free(inode);
741		}
742
743			/* An internal node with more than two children */
744		else {
745			struct tnode *left, *right;
746			int size, j;
747
748			/* We will replace this node 'inode' with two new
749			 * ones, 'left' and 'right', each with half of the
750			 * original children. The two new nodes will have
751			 * a position one bit further down the key and this
752			 * means that the "significant" part of their keys
753			 * (see the discussion near the top of this file)
754			 * will differ by one bit, which will be "0" in
755			 * left's key and "1" in right's key. Since we are
756			 * moving the key position by one step, the bit that
757			 * we are moving away from - the bit at position
758			 * (inode->pos) - is the one that will differ between
759			 * left and right. So... we synthesize that bit in the
760			 * two  new keys.
761			 * The mask 'm' below will be a single "one" bit at
762			 * the position (inode->pos)
763			 */
764
765			/* Use the old key, but set the new significant
766			 *   bit to zero.
767			 */
768
769			left = (struct tnode *) tnode_get_child(tn, 2*i);
770			put_child(t, tn, 2*i, NULL);
771
772			if(!left)
773				BUG();
774
775			right = (struct tnode *) tnode_get_child(tn, 2*i+1);
776			put_child(t, tn, 2*i+1, NULL);
777
778			if(!right)
779				BUG();
780
781			size = tnode_child_length(left);
782			for(j = 0; j < size; j++) {
783				put_child(t, left, j, inode->child[j]);
784				put_child(t, right, j, inode->child[j + size]);
785			}
786			put_child(t, tn, 2*i, resize(t, left));
787			put_child(t, tn, 2*i+1, resize(t, right));
788
789			tnode_free(inode);
790		}
791	}
792	tnode_free(oldtnode);
793	return tn;
794}
795
796static struct tnode *halve(struct trie *t, struct tnode *tn, int *err)
797{
798	struct tnode *oldtnode = tn;
799	struct node *left, *right;
800	int i;
801	int olen = tnode_child_length(tn);
802
803	if(trie_debug) printk("In halve\n");
804
805	tn=tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
806
807	if (!tn) {
808		*err = -ENOMEM;
809		return oldtnode;
810	}
811
812	/*
813	 * Preallocate and store tnodes before the actual work so we
814	 * don't get into an inconsistent state if memory allocation
815	 * fails. In case of failure we return the oldnode and halve
816	 * of tnode is ignored.
817	 */
818
819	for(i = 0; i < olen; i += 2) {
820		left = tnode_get_child(oldtnode, i);
821		right = tnode_get_child(oldtnode, i+1);
822
823		/* Two nonempty children */
824		if( left && right)  {
825			struct tnode *newBinNode =
826				tnode_new(left->key, tn->pos + tn->bits, 1);
827
828			if(!newBinNode) {
829				*err = -ENOMEM;
830				break;
831			}
832			put_child(t, tn, i/2, (struct node *)newBinNode);
833		}
834	}
835
836	if(*err) {
837		int size = tnode_child_length(tn);
838		int j;
839
840		for(j = 0; j < size; j++)
841			if( tn->child[j])
842				tnode_free((struct tnode *)tn->child[j]);
843
844		tnode_free(tn);
845
846		*err = -ENOMEM;
847		return oldtnode;
848	}
849
850	for(i = 0; i < olen; i += 2) {
851		left = tnode_get_child(oldtnode, i);
852		right = tnode_get_child(oldtnode, i+1);
853
854		/* At least one of the children is empty */
855		if (left == NULL) {
856			if (right == NULL)    /* Both are empty */
857				continue;
858			put_child(t, tn, i/2, right);
859		} else if (right == NULL)
860			put_child(t, tn, i/2, left);
861
862		/* Two nonempty children */
863		else {
864			struct tnode *newBinNode =
865				(struct tnode *) tnode_get_child(tn, i/2);
866			put_child(t, tn, i/2, NULL);
867
868			if(!newBinNode)
869				BUG();
870
871			put_child(t, newBinNode, 0, left);
872			put_child(t, newBinNode, 1, right);
873			put_child(t, tn, i/2, resize(t, newBinNode));
874		}
875	}
876	tnode_free(oldtnode);
877	return tn;
878}
879
880static void *trie_init(struct trie *t)
881{
882	if(t) {
883		t->size = 0;
884		t->trie = NULL;
885		t->revision = 0;
886#ifdef CONFIG_IP_FIB_TRIE_STATS
887       		memset(&t->stats, 0, sizeof(struct trie_use_stats));
888#endif
889	}
890	return t;
891}
892
893static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen)
894{
895	struct hlist_node *node;
896	struct leaf_info *li;
897
898	hlist_for_each_entry(li, node, head, hlist) {
899
900		if ( li->plen == plen )
901			return li;
902	}
903	return NULL;
904}
905
906static inline struct list_head * get_fa_head(struct leaf *l, int plen)
907{
908	struct list_head *fa_head=NULL;
909	struct leaf_info *li = find_leaf_info(&l->list, plen);
910
911	if(li)
912		fa_head = &li->falh;
913
914	return fa_head;
915}
916
917static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
918{
919	struct leaf_info *li=NULL, *last=NULL;
920	struct hlist_node *node, *tmp;
921
922	write_lock_bh(&fib_lock);
923
924	if(hlist_empty(head))
925		hlist_add_head(&new->hlist, head);
926	else {
927		hlist_for_each_entry_safe(li, node, tmp, head, hlist) {
928
929			if (new->plen > li->plen)
930				break;
931
932			last = li;
933		}
934		if(last)
935			hlist_add_after(&last->hlist, &new->hlist);
936		else
937			hlist_add_before(&new->hlist, &li->hlist);
938	}
939	write_unlock_bh(&fib_lock);
940}
941
942static struct leaf *
943fib_find_node(struct trie *t, u32 key)
944{
945	int pos;
946	struct tnode *tn;
947	struct node *n;
948
949	pos = 0;
950	n=t->trie;
951
952	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
953		tn = (struct tnode *) n;
954
955		check_tnode(tn);
956
957		if(tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
958			pos=tn->pos + tn->bits;
959			n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
960		}
961		else
962			break;
963	}
964	/* Case we have found a leaf. Compare prefixes */
965
966	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
967		struct leaf *l = (struct leaf *) n;
968		return l;
969	}
970	return NULL;
971}
972
973static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
974{
975	int i = 0;
976	int wasfull;
977	t_key cindex, key;
978	struct tnode *tp = NULL;
979
980	if(!tn)
981		BUG();
982
983	key = tn->key;
984	i = 0;
985
986	while (tn != NULL && NODE_PARENT(tn) != NULL) {
987
988		if( i > 10 ) {
989			printk("Rebalance tn=%p \n", tn);
990			if(tn) 		printk("tn->parent=%p \n", NODE_PARENT(tn));
991
992			printk("Rebalance tp=%p \n", tp);
993			if(tp) 		printk("tp->parent=%p \n", NODE_PARENT(tp));
994		}
995
996		if( i > 12 ) BUG();
997		i++;
998
999		tp = NODE_PARENT(tn);
1000		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1001		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
1002		tn = (struct tnode *) resize (t, (struct tnode *)tn);
1003		tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
1004
1005		if(!NODE_PARENT(tn))
1006			break;
1007
1008		tn = NODE_PARENT(tn);
1009	}
1010	/* Handle last (top) tnode */
1011	if (IS_TNODE(tn))
1012		tn = (struct tnode*) resize(t, (struct tnode *)tn);
1013
1014	return (struct node*) tn;
1015}
1016
1017static  struct list_head *
1018fib_insert_node(struct trie *t, int *err, u32 key, int plen)
1019{
1020	int pos, newpos;
1021	struct tnode *tp = NULL, *tn = NULL;
1022	struct node *n;
1023	struct leaf *l;
1024	int missbit;
1025	struct list_head *fa_head=NULL;
1026	struct leaf_info *li;
1027	t_key cindex;
1028
1029	pos = 0;
1030	n=t->trie;
1031
1032	/* If we point to NULL, stop. Either the tree is empty and we should
1033	 * just put a new leaf in if, or we have reached an empty child slot,
1034	 * and we should just put our new leaf in that.
1035	 * If we point to a T_TNODE, check if it matches our key. Note that
1036	 * a T_TNODE might be skipping any number of bits - its 'pos' need
1037	 * not be the parent's 'pos'+'bits'!
1038	 *
1039	 * If it does match the current key, get pos/bits from it, extract
1040	 * the index from our key, push the T_TNODE and walk the tree.
1041	 *
1042	 * If it doesn't, we have to replace it with a new T_TNODE.
1043	 *
1044	 * If we point to a T_LEAF, it might or might not have the same key
1045	 * as we do. If it does, just change the value, update the T_LEAF's
1046	 * value, and return it.
1047	 * If it doesn't, we need to replace it with a T_TNODE.
1048	 */
1049
1050	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
1051		tn = (struct tnode *) n;
1052
1053		check_tnode(tn);
1054
1055		if(tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1056			tp = tn;
1057			pos=tn->pos + tn->bits;
1058			n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
1059
1060			if(n && NODE_PARENT(n) != tn) {
1061				printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
1062				BUG();
1063			}
1064		}
1065		else
1066			break;
1067	}
1068
1069	/*
1070	 * n  ----> NULL, LEAF or TNODE
1071	 *
1072	 * tp is n's (parent) ----> NULL or TNODE
1073	 */
1074
1075	if(tp && IS_LEAF(tp))
1076		BUG();
1077
1078
1079	/* Case 1: n is a leaf. Compare prefixes */
1080
1081	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1082		struct leaf *l = ( struct leaf *)  n;
1083
1084		li = leaf_info_new(plen);
1085
1086		if(! li) {
1087			*err = -ENOMEM;
1088			goto err;
1089		}
1090
1091		fa_head = &li->falh;
1092		insert_leaf_info(&l->list, li);
1093		goto done;
1094	}
1095	t->size++;
1096	l = leaf_new();
1097
1098	if(! l) {
1099		*err = -ENOMEM;
1100		goto err;
1101	}
1102
1103	l->key = key;
1104	li = leaf_info_new(plen);
1105
1106	if(! li) {
1107		tnode_free((struct tnode *) l);
1108		*err = -ENOMEM;
1109		goto err;
1110	}
1111
1112	fa_head = &li->falh;
1113	insert_leaf_info(&l->list, li);
1114
1115	/* Case 2: n is NULL, and will just insert a new leaf */
1116	if (t->trie && n == NULL) {
1117
1118		NODE_SET_PARENT(l, tp);
1119
1120		if (!tp)
1121			BUG();
1122
1123		else {
1124			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1125			put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1126		}
1127	}
1128	/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1129	else {
1130		/*
1131		 *  Add a new tnode here
1132		 *  first tnode need some special handling
1133		 */
1134
1135		if (tp)
1136			pos=tp->pos+tp->bits;
1137		else
1138			pos=0;
1139		if(n) {
1140			newpos = tkey_mismatch(key, pos, n->key);
1141			tn = tnode_new(n->key, newpos, 1);
1142		}
1143		else {
1144			newpos = 0;
1145			tn = tnode_new(key, newpos, 1); /* First tnode */
1146		}
1147
1148		if(!tn) {
1149			free_leaf_info(li);
1150			tnode_free((struct tnode *) l);
1151			*err = -ENOMEM;
1152			goto err;
1153		}
1154
1155		NODE_SET_PARENT(tn, tp);
1156
1157		missbit=tkey_extract_bits(key, newpos, 1);
1158		put_child(t, tn, missbit, (struct node *)l);
1159		put_child(t, tn, 1-missbit, n);
1160
1161		if(tp) {
1162			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1163			put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
1164		}
1165		else {
1166			t->trie = (struct node*) tn; /* First tnode */
1167			tp = tn;
1168		}
1169	}
1170	if(tp && tp->pos+tp->bits > 32) {
1171		printk("ERROR tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1172		       tp, tp->pos, tp->bits, key, plen);
1173	}
1174	/* Rebalance the trie */
1175	t->trie = trie_rebalance(t, tp);
1176done:
1177	t->revision++;
1178err:;
1179	return fa_head;
1180}
1181
1182static int
1183fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1184	       struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1185{
1186	struct trie *t = (struct trie *) tb->tb_data;
1187	struct fib_alias *fa, *new_fa;
1188	struct list_head *fa_head=NULL;
1189	struct fib_info *fi;
1190	int plen = r->rtm_dst_len;
1191	int type = r->rtm_type;
1192	u8 tos = r->rtm_tos;
1193	u32 key, mask;
1194	int err;
1195	struct leaf *l;
1196
1197	if (plen > 32)
1198		return -EINVAL;
1199
1200	key = 0;
1201	if (rta->rta_dst)
1202		memcpy(&key, rta->rta_dst, 4);
1203
1204	key = ntohl(key);
1205
1206	if(trie_debug)
1207		printk("Insert table=%d %08x/%d\n", tb->tb_id, key, plen);
1208
1209	mask =  ntohl( inet_make_mask(plen) );
1210
1211	if(key & ~mask)
1212		return -EINVAL;
1213
1214	key = key & mask;
1215
1216	if  ((fi = fib_create_info(r, rta, nlhdr, &err)) == NULL)
1217		goto err;
1218
1219	l = fib_find_node(t, key);
1220	fa = NULL;
1221
1222	if(l) {
1223		fa_head = get_fa_head(l, plen);
1224		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1225	}
1226
1227	/* Now fa, if non-NULL, points to the first fib alias
1228	 * with the same keys [prefix,tos,priority], if such key already
1229	 * exists or to the node before which we will insert new one.
1230	 *
1231	 * If fa is NULL, we will need to allocate a new one and
1232	 * insert to the head of f.
1233	 *
1234	 * If f is NULL, no fib node matched the destination key
1235	 * and we need to allocate a new one of those as well.
1236	 */
1237
1238	if (fa &&
1239	    fa->fa_info->fib_priority == fi->fib_priority) {
1240		struct fib_alias *fa_orig;
1241
1242		err = -EEXIST;
1243		if (nlhdr->nlmsg_flags & NLM_F_EXCL)
1244			goto out;
1245
1246		if (nlhdr->nlmsg_flags & NLM_F_REPLACE) {
1247			struct fib_info *fi_drop;
1248			u8 state;
1249
1250			write_lock_bh(&fib_lock);
1251
1252			fi_drop = fa->fa_info;
1253			fa->fa_info = fi;
1254			fa->fa_type = type;
1255			fa->fa_scope = r->rtm_scope;
1256			state = fa->fa_state;
1257			fa->fa_state &= ~FA_S_ACCESSED;
1258
1259			write_unlock_bh(&fib_lock);
1260
1261			fib_release_info(fi_drop);
1262			if (state & FA_S_ACCESSED)
1263			  rt_cache_flush(-1);
1264
1265			    goto succeeded;
1266		}
1267		/* Error if we find a perfect match which
1268		 * uses the same scope, type, and nexthop
1269		 * information.
1270		 */
1271		fa_orig = fa;
1272		list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
1273			if (fa->fa_tos != tos)
1274				break;
1275			if (fa->fa_info->fib_priority != fi->fib_priority)
1276				break;
1277			if (fa->fa_type == type &&
1278			    fa->fa_scope == r->rtm_scope &&
1279			    fa->fa_info == fi) {
1280				goto out;
1281			}
1282		}
1283		if (!(nlhdr->nlmsg_flags & NLM_F_APPEND))
1284			fa = fa_orig;
1285	}
1286	err = -ENOENT;
1287	if (!(nlhdr->nlmsg_flags&NLM_F_CREATE))
1288		goto out;
1289
1290	err = -ENOBUFS;
1291	new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
1292	if (new_fa == NULL)
1293		goto out;
1294
1295	new_fa->fa_info = fi;
1296	new_fa->fa_tos = tos;
1297	new_fa->fa_type = type;
1298	new_fa->fa_scope = r->rtm_scope;
1299	new_fa->fa_state = 0;
1300#if 0
1301	new_fa->dst  = NULL;
1302#endif
1303	/*
1304	 * Insert new entry to the list.
1305	 */
1306
1307	if(!fa_head) {
1308		fa_head = fib_insert_node(t, &err, key, plen);
1309		err = 0;
1310		if(err)
1311			goto out_free_new_fa;
1312	}
1313
1314	write_lock_bh(&fib_lock);
1315
1316	list_add_tail(&new_fa->fa_list,
1317		 (fa ? &fa->fa_list : fa_head));
1318
1319	write_unlock_bh(&fib_lock);
1320
1321	rt_cache_flush(-1);
1322	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req);
1323succeeded:
1324	return 0;
1325
1326out_free_new_fa:
1327	kmem_cache_free(fn_alias_kmem, new_fa);
1328out:
1329	fib_release_info(fi);
1330err:;
1331	return err;
1332}
1333
1334static inline int check_leaf(struct trie *t, struct leaf *l,  t_key key, int *plen, const struct flowi *flp,
1335			     struct fib_result *res, int *err)
1336{
1337	int i;
1338	t_key mask;
1339	struct leaf_info *li;
1340	struct hlist_head *hhead = &l->list;
1341	struct hlist_node *node;
1342
1343	hlist_for_each_entry(li, node, hhead, hlist) {
1344
1345		i = li->plen;
1346		mask = ntohl(inet_make_mask(i));
1347		if (l->key != (key & mask))
1348			continue;
1349
1350		if (((*err) = fib_semantic_match(&li->falh, flp, res, l->key, mask, i)) == 0) {
1351			*plen = i;
1352#ifdef CONFIG_IP_FIB_TRIE_STATS
1353			t->stats.semantic_match_passed++;
1354#endif
1355			return 1;
1356		}
1357#ifdef CONFIG_IP_FIB_TRIE_STATS
1358		t->stats.semantic_match_miss++;
1359#endif
1360	}
1361	return 0;
1362}
1363
1364static int
1365fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1366{
1367	struct trie *t = (struct trie *) tb->tb_data;
1368	int plen, ret = 0;
1369	struct node *n;
1370	struct tnode *pn;
1371	int pos, bits;
1372	t_key key=ntohl(flp->fl4_dst);
1373	int chopped_off;
1374	t_key cindex = 0;
1375	int current_prefix_length = KEYLENGTH;
1376	n = t->trie;
1377
1378	read_lock(&fib_lock);
1379	if(!n)
1380		goto failed;
1381
1382#ifdef CONFIG_IP_FIB_TRIE_STATS
1383	t->stats.gets++;
1384#endif
1385
1386	/* Just a leaf? */
1387	if (IS_LEAF(n)) {
1388		if( check_leaf(t, (struct leaf *)n, key, &plen, flp, res, &ret) )
1389			goto found;
1390		goto failed;
1391	}
1392	pn = (struct tnode *) n;
1393	chopped_off = 0;
1394
1395        while (pn) {
1396
1397		pos = pn->pos;
1398		bits = pn->bits;
1399
1400		if(!chopped_off)
1401			cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits);
1402
1403		n = tnode_get_child(pn, cindex);
1404
1405		if (n == NULL) {
1406#ifdef CONFIG_IP_FIB_TRIE_STATS
1407			t->stats.null_node_hit++;
1408#endif
1409			goto backtrace;
1410		}
1411
1412		if (IS_TNODE(n)) {
1413#define HL_OPTIMIZE
1414#ifdef HL_OPTIMIZE
1415			struct tnode *cn = (struct tnode *)n;
1416			t_key node_prefix, key_prefix, pref_mismatch;
1417			int mp;
1418
1419			/*
1420			 * It's a tnode, and we can do some extra checks here if we
1421			 * like, to avoid descending into a dead-end branch.
1422			 * This tnode is in the parent's child array at index
1423			 * key[p_pos..p_pos+p_bits] but potentially with some bits
1424			 * chopped off, so in reality the index may be just a
1425			 * subprefix, padded with zero at the end.
1426			 * We can also take a look at any skipped bits in this
1427			 * tnode - everything up to p_pos is supposed to be ok,
1428			 * and the non-chopped bits of the index (se previous
1429			 * paragraph) are also guaranteed ok, but the rest is
1430			 * considered unknown.
1431			 *
1432			 * The skipped bits are key[pos+bits..cn->pos].
1433			 */
1434
1435			/* If current_prefix_length < pos+bits, we are already doing
1436			 * actual prefix  matching, which means everything from
1437			 * pos+(bits-chopped_off) onward must be zero along some
1438			 * branch of this subtree - otherwise there is *no* valid
1439			 * prefix present. Here we can only check the skipped
1440			 * bits. Remember, since we have already indexed into the
1441			 * parent's child array, we know that the bits we chopped of
1442			 * *are* zero.
1443			 */
1444
1445			/* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
1446
1447			if (current_prefix_length < pos+bits) {
1448				if (tkey_extract_bits(cn->key, current_prefix_length,
1449						      cn->pos - current_prefix_length) != 0 ||
1450				    !(cn->child[0]))
1451					goto backtrace;
1452			}
1453
1454			/*
1455			 * If chopped_off=0, the index is fully validated and we
1456			 * only need to look at the skipped bits for this, the new,
1457			 * tnode. What we actually want to do is to find out if
1458			 * these skipped bits match our key perfectly, or if we will
1459			 * have to count on finding a matching prefix further down,
1460			 * because if we do, we would like to have some way of
1461			 * verifying the existence of such a prefix at this point.
1462			 */
1463
1464			/* The only thing we can do at this point is to verify that
1465			 * any such matching prefix can indeed be a prefix to our
1466			 * key, and if the bits in the node we are inspecting that
1467			 * do not match our key are not ZERO, this cannot be true.
1468			 * Thus, find out where there is a mismatch (before cn->pos)
1469			 * and verify that all the mismatching bits are zero in the
1470			 * new tnode's key.
1471			 */
1472
1473			/* Note: We aren't very concerned about the piece of the key
1474			 * that precede pn->pos+pn->bits, since these have already been
1475			 * checked. The bits after cn->pos aren't checked since these are
1476			 * by definition "unknown" at this point. Thus, what we want to
1477			 * see is if we are about to enter the "prefix matching" state,
1478			 * and in that case verify that the skipped bits that will prevail
1479			 * throughout this subtree are zero, as they have to be if we are
1480			 * to find a matching prefix.
1481			 */
1482
1483			node_prefix = MASK_PFX(cn->key, cn->pos);
1484			key_prefix =  MASK_PFX(key, cn->pos);
1485			pref_mismatch = key_prefix^node_prefix;
1486			mp = 0;
1487
1488			/* In short: If skipped bits in this node do not match the search
1489			 * key, enter the "prefix matching" state.directly.
1490			 */
1491			if (pref_mismatch) {
1492				while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1493					mp++;
1494					pref_mismatch = pref_mismatch <<1;
1495				}
1496				key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1497
1498				if (key_prefix != 0)
1499					goto backtrace;
1500
1501				if (current_prefix_length >= cn->pos)
1502					current_prefix_length=mp;
1503		       }
1504#endif
1505		       pn = (struct tnode *)n; /* Descend */
1506		       chopped_off = 0;
1507		       continue;
1508		}
1509		if (IS_LEAF(n)) {
1510			if( check_leaf(t, (struct leaf *)n, key, &plen, flp, res, &ret))
1511				goto found;
1512	       }
1513backtrace:
1514		chopped_off++;
1515
1516		/* As zero don't change the child key (cindex) */
1517		while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1)))) {
1518			chopped_off++;
1519		}
1520
1521		/* Decrease current_... with bits chopped off */
1522		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1523			current_prefix_length = pn->pos + pn->bits - chopped_off;
1524
1525		/*
1526		 * Either we do the actual chop off according or if we have
1527		 * chopped off all bits in this tnode walk up to our parent.
1528		 */
1529
1530		if(chopped_off <= pn->bits)
1531			cindex &= ~(1 << (chopped_off-1));
1532		else {
1533			if( NODE_PARENT(pn) == NULL)
1534				goto failed;
1535
1536			/* Get Child's index */
1537			cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits);
1538			pn = NODE_PARENT(pn);
1539			chopped_off = 0;
1540
1541#ifdef CONFIG_IP_FIB_TRIE_STATS
1542			t->stats.backtrack++;
1543#endif
1544			goto backtrace;
1545		}
1546	}
1547failed:
1548	ret =  1;
1549found:
1550	read_unlock(&fib_lock);
1551	return ret;
1552}
1553
1554static int trie_leaf_remove(struct trie *t, t_key key)
1555{
1556	t_key cindex;
1557	struct tnode *tp = NULL;
1558	struct node *n = t->trie;
1559	struct leaf *l;
1560
1561	if(trie_debug)
1562		printk("entering trie_leaf_remove(%p)\n", n);
1563
1564	/* Note that in the case skipped bits, those bits are *not* checked!
1565	 * When we finish this, we will have NULL or a T_LEAF, and the
1566	 * T_LEAF may or may not match our key.
1567	 */
1568
1569        while (n != NULL && IS_TNODE(n)) {
1570		struct tnode *tn = (struct tnode *) n;
1571		check_tnode(tn);
1572		n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));
1573
1574			if(n && NODE_PARENT(n) != tn) {
1575				printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
1576				BUG();
1577			}
1578        }
1579	l = (struct leaf *) n;
1580
1581	if(!n || !tkey_equals(l->key, key))
1582		return 0;
1583
1584	/*
1585	 * Key found.
1586	 * Remove the leaf and rebalance the tree
1587	 */
1588
1589	t->revision++;
1590	t->size--;
1591
1592	tp = NODE_PARENT(n);
1593	tnode_free((struct tnode *) n);
1594
1595	if(tp) {
1596		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1597		put_child(t, (struct tnode *)tp, cindex, NULL);
1598		t->trie = trie_rebalance(t, tp);
1599	}
1600	else
1601		t->trie = NULL;
1602
1603	return 1;
1604}
1605
1606static int
1607fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1608	       struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1609{
1610	struct trie *t = (struct trie *) tb->tb_data;
1611	u32 key, mask;
1612	int plen = r->rtm_dst_len;
1613	u8 tos = r->rtm_tos;
1614	struct fib_alias *fa, *fa_to_delete;
1615	struct list_head *fa_head;
1616	struct leaf *l;
1617
1618	if (plen > 32)
1619		return -EINVAL;
1620
1621	key = 0;
1622	if (rta->rta_dst)
1623		memcpy(&key, rta->rta_dst, 4);
1624
1625	key = ntohl(key);
1626	mask =  ntohl( inet_make_mask(plen) );
1627
1628	if(key & ~mask)
1629		return -EINVAL;
1630
1631	key = key & mask;
1632	l = fib_find_node(t, key);
1633
1634	if(!l)
1635		return -ESRCH;
1636
1637	fa_head = get_fa_head(l, plen);
1638	fa = fib_find_alias(fa_head, tos, 0);
1639
1640	if (!fa)
1641		return -ESRCH;
1642
1643	if (trie_debug)
1644		printk("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1645
1646	fa_to_delete = NULL;
1647	fa_head = fa->fa_list.prev;
1648	list_for_each_entry(fa, fa_head, fa_list) {
1649		struct fib_info *fi = fa->fa_info;
1650
1651		if (fa->fa_tos != tos)
1652			break;
1653
1654		if ((!r->rtm_type ||
1655		     fa->fa_type == r->rtm_type) &&
1656		    (r->rtm_scope == RT_SCOPE_NOWHERE ||
1657		     fa->fa_scope == r->rtm_scope) &&
1658		    (!r->rtm_protocol ||
1659		     fi->fib_protocol == r->rtm_protocol) &&
1660		    fib_nh_match(r, nlhdr, rta, fi) == 0) {
1661			fa_to_delete = fa;
1662			break;
1663		}
1664	}
1665
1666	if (fa_to_delete) {
1667		int kill_li = 0;
1668		struct leaf_info *li;
1669
1670		fa = fa_to_delete;
1671		rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req);
1672
1673		l = fib_find_node(t, key);
1674		li = find_leaf_info(&l->list, plen);
1675
1676		write_lock_bh(&fib_lock);
1677
1678		list_del(&fa->fa_list);
1679
1680		if(list_empty(fa_head)) {
1681			hlist_del(&li->hlist);
1682			kill_li = 1;
1683		}
1684		write_unlock_bh(&fib_lock);
1685
1686		if(kill_li)
1687			free_leaf_info(li);
1688
1689		if(hlist_empty(&l->list))
1690			trie_leaf_remove(t, key);
1691
1692		if (fa->fa_state & FA_S_ACCESSED)
1693			rt_cache_flush(-1);
1694
1695		fn_free_alias(fa);
1696		return 0;
1697	}
1698	return -ESRCH;
1699}
1700
1701static int trie_flush_list(struct trie *t, struct list_head *head)
1702{
1703	struct fib_alias *fa, *fa_node;
1704	int found = 0;
1705
1706	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1707		struct fib_info *fi = fa->fa_info;
1708
1709		if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
1710
1711 			write_lock_bh(&fib_lock);
1712			list_del(&fa->fa_list);
1713			write_unlock_bh(&fib_lock);
1714
1715			fn_free_alias(fa);
1716			found++;
1717		}
1718	}
1719	return found;
1720}
1721
1722static int trie_flush_leaf(struct trie *t, struct leaf *l)
1723{
1724	int found = 0;
1725	struct hlist_head *lih = &l->list;
1726	struct hlist_node *node, *tmp;
1727	struct leaf_info *li = NULL;
1728
1729	hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1730
1731		found += trie_flush_list(t, &li->falh);
1732
1733		if (list_empty(&li->falh)) {
1734
1735 			write_lock_bh(&fib_lock);
1736			hlist_del(&li->hlist);
1737			write_unlock_bh(&fib_lock);
1738
1739			free_leaf_info(li);
1740		}
1741	}
1742	return found;
1743}
1744
1745static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1746{
1747	struct node *c = (struct node *) thisleaf;
1748	struct tnode *p;
1749	int idx;
1750
1751	if(c == NULL) {
1752		if(t->trie == NULL)
1753			return NULL;
1754
1755		if (IS_LEAF(t->trie))          /* trie w. just a leaf */
1756			return (struct leaf *) t->trie;
1757
1758		p = (struct tnode*) t->trie;  /* Start */
1759	}
1760	else
1761		p = (struct tnode *) NODE_PARENT(c);
1762	while (p) {
1763		int pos, last;
1764
1765		/*  Find the next child of the parent */
1766		if(c)
1767			pos  = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
1768		else
1769			pos = 0;
1770
1771		last = 1 << p->bits;
1772		for(idx = pos; idx < last ; idx++) {
1773			if( p->child[idx]) {
1774
1775				/* Decend if tnode */
1776
1777				while (IS_TNODE(p->child[idx])) {
1778					p = (struct tnode*) p->child[idx];
1779					idx = 0;
1780
1781					/* Rightmost non-NULL branch */
1782					if( p && IS_TNODE(p) )
1783						while ( p->child[idx] == NULL && idx < (1 << p->bits) ) idx++;
1784
1785					/* Done with this tnode? */
1786					if( idx >= (1 << p->bits) || p->child[idx] == NULL )
1787						goto up;
1788				}
1789				return (struct leaf*) p->child[idx];
1790			}
1791		}
1792up:
1793		/* No more children go up one step  */
1794		c = (struct node*) p;
1795		p = (struct tnode *) NODE_PARENT(p);
1796	}
1797	return NULL; /* Ready. Root of trie */
1798}
1799
1800static int fn_trie_flush(struct fib_table *tb)
1801{
1802	struct trie *t = (struct trie *) tb->tb_data;
1803	struct leaf *ll = NULL, *l = NULL;
1804	int found = 0, h;
1805
1806	t->revision++;
1807
1808	for (h=0; (l = nextleaf(t, l)) != NULL; h++) {
1809		found += trie_flush_leaf(t, l);
1810
1811		if (ll && hlist_empty(&ll->list))
1812			trie_leaf_remove(t, ll->key);
1813		ll = l;
1814	}
1815
1816	if (ll && hlist_empty(&ll->list))
1817		trie_leaf_remove(t, ll->key);
1818
1819	if(trie_debug)
1820		printk("trie_flush found=%d\n", found);
1821	return found;
1822}
1823
1824static int trie_last_dflt=-1;
1825
1826static void
1827fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1828{
1829	struct trie *t = (struct trie *) tb->tb_data;
1830	int order, last_idx;
1831	struct fib_info *fi = NULL;
1832	struct fib_info *last_resort;
1833	struct fib_alias *fa = NULL;
1834	struct list_head *fa_head;
1835	struct leaf *l;
1836
1837	last_idx = -1;
1838	last_resort = NULL;
1839	order = -1;
1840
1841	read_lock(&fib_lock);
1842
1843	l = fib_find_node(t, 0);
1844	if(!l)
1845		goto out;
1846
1847	fa_head = get_fa_head(l, 0);
1848	if(!fa_head)
1849		goto out;
1850
1851	if (list_empty(fa_head))
1852		goto out;
1853
1854	list_for_each_entry(fa, fa_head, fa_list) {
1855		struct fib_info *next_fi = fa->fa_info;
1856
1857		if (fa->fa_scope != res->scope ||
1858		    fa->fa_type != RTN_UNICAST)
1859			continue;
1860
1861		if (next_fi->fib_priority > res->fi->fib_priority)
1862			break;
1863		if (!next_fi->fib_nh[0].nh_gw ||
1864		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1865			continue;
1866		fa->fa_state |= FA_S_ACCESSED;
1867
1868		if (fi == NULL) {
1869			if (next_fi != res->fi)
1870				break;
1871		} else if (!fib_detect_death(fi, order, &last_resort,
1872					     &last_idx, &trie_last_dflt)) {
1873			if (res->fi)
1874				fib_info_put(res->fi);
1875			res->fi = fi;
1876			atomic_inc(&fi->fib_clntref);
1877			trie_last_dflt = order;
1878			goto out;
1879		}
1880		fi = next_fi;
1881		order++;
1882	}
1883	if (order <= 0 || fi == NULL) {
1884		trie_last_dflt = -1;
1885		goto out;
1886	}
1887
1888	if (!fib_detect_death(fi, order, &last_resort, &last_idx, &trie_last_dflt)) {
1889		if (res->fi)
1890			fib_info_put(res->fi);
1891		res->fi = fi;
1892		atomic_inc(&fi->fib_clntref);
1893		trie_last_dflt = order;
1894		goto out;
1895	}
1896	if (last_idx >= 0) {
1897		if (res->fi)
1898			fib_info_put(res->fi);
1899		res->fi = last_resort;
1900		if (last_resort)
1901			atomic_inc(&last_resort->fib_clntref);
1902	}
1903	trie_last_dflt = last_idx;
1904 out:;
1905	read_unlock(&fib_lock);
1906}
1907
1908static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
1909			   struct sk_buff *skb, struct netlink_callback *cb)
1910{
1911	int i, s_i;
1912	struct fib_alias *fa;
1913
1914	u32 xkey=htonl(key);
1915
1916	s_i=cb->args[3];
1917	i = 0;
1918
1919	list_for_each_entry(fa, fah, fa_list) {
1920		if (i < s_i) {
1921			i++;
1922			continue;
1923		}
1924		if (fa->fa_info->fib_nh == NULL) {
1925			printk("Trie error _fib_nh=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
1926			i++;
1927			continue;
1928		}
1929		if (fa->fa_info == NULL) {
1930			printk("Trie error fa_info=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
1931			i++;
1932			continue;
1933		}
1934
1935		if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1936				  cb->nlh->nlmsg_seq,
1937				  RTM_NEWROUTE,
1938				  tb->tb_id,
1939				  fa->fa_type,
1940				  fa->fa_scope,
1941				  &xkey,
1942				  plen,
1943				  fa->fa_tos,
1944				  fa->fa_info, 0) < 0) {
1945			cb->args[3] = i;
1946			return -1;
1947			}
1948		i++;
1949	}
1950	cb->args[3]=i;
1951	return skb->len;
1952}
1953
1954static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
1955			     struct netlink_callback *cb)
1956{
1957	int h, s_h;
1958	struct list_head *fa_head;
1959	struct leaf *l = NULL;
1960	s_h=cb->args[2];
1961
1962	for (h=0; (l = nextleaf(t, l)) != NULL; h++) {
1963
1964		if (h < s_h)
1965			continue;
1966		if (h > s_h)
1967			memset(&cb->args[3], 0,
1968			       sizeof(cb->args) - 3*sizeof(cb->args[0]));
1969
1970		fa_head = get_fa_head(l, plen);
1971
1972		if(!fa_head)
1973			continue;
1974
1975		if(list_empty(fa_head))
1976			continue;
1977
1978		if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
1979			cb->args[2]=h;
1980			return -1;
1981		}
1982	}
1983	cb->args[2]=h;
1984	return skb->len;
1985}
1986
1987static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
1988{
1989	int m, s_m;
1990	struct trie *t = (struct trie *) tb->tb_data;
1991
1992	s_m = cb->args[1];
1993
1994	read_lock(&fib_lock);
1995	for (m=0; m<=32; m++) {
1996
1997		if (m < s_m)
1998			continue;
1999		if (m > s_m)
2000			memset(&cb->args[2], 0,
2001			       sizeof(cb->args) - 2*sizeof(cb->args[0]));
2002
2003		if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
2004			cb->args[1] = m;
2005			goto out;
2006		}
2007	}
2008	read_unlock(&fib_lock);
2009	cb->args[1] = m;
2010	return skb->len;
2011 out:
2012	read_unlock(&fib_lock);
2013	return -1;
2014}
2015
2016/* Fix more generic FIB names for init later */
2017
2018#ifdef CONFIG_IP_MULTIPLE_TABLES
2019struct fib_table * fib_hash_init(int id)
2020#else
2021struct fib_table * __init fib_hash_init(int id)
2022#endif
2023{
2024	struct fib_table *tb;
2025	struct trie *t;
2026
2027	if (fn_alias_kmem == NULL)
2028		fn_alias_kmem = kmem_cache_create("ip_fib_alias",
2029						  sizeof(struct fib_alias),
2030						  0, SLAB_HWCACHE_ALIGN,
2031						  NULL, NULL);
2032
2033	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
2034		     GFP_KERNEL);
2035	if (tb == NULL)
2036		return NULL;
2037
2038	tb->tb_id = id;
2039	tb->tb_lookup = fn_trie_lookup;
2040	tb->tb_insert = fn_trie_insert;
2041	tb->tb_delete = fn_trie_delete;
2042	tb->tb_flush = fn_trie_flush;
2043	tb->tb_select_default = fn_trie_select_default;
2044	tb->tb_dump = fn_trie_dump;
2045	memset(tb->tb_data, 0, sizeof(struct trie));
2046
2047	t = (struct trie *) tb->tb_data;
2048
2049	trie_init(t);
2050
2051	if (id == RT_TABLE_LOCAL)
2052                trie_local=t;
2053	  else if (id == RT_TABLE_MAIN)
2054                trie_main=t;
2055
2056	if (id == RT_TABLE_LOCAL)
2057		printk("IPv4 FIB: Using LC-trie version %s\n", VERSION);
2058
2059	return tb;
2060}
2061
2062/* Trie dump functions */
2063
2064static void putspace_seq(struct seq_file *seq, int n)
2065{
2066	while (n--) seq_printf(seq, " ");
2067}
2068
2069static void printbin_seq(struct seq_file *seq, unsigned int v, int bits)
2070{
2071	while (bits--)
2072		seq_printf(seq, "%s", (v & (1<<bits))?"1":"0");
2073}
2074
2075static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
2076		   int pend, int cindex, int bits)
2077{
2078	putspace_seq(seq, indent);
2079	if (IS_LEAF(n))
2080		seq_printf(seq, "|");
2081	else
2082		seq_printf(seq, "+");
2083	if (bits) {
2084		seq_printf(seq, "%d/", cindex);
2085		printbin_seq(seq, cindex, bits);
2086		seq_printf(seq, ": ");
2087	}
2088	else
2089		seq_printf(seq, "<root>: ");
2090	seq_printf(seq, "%s:%p ", IS_LEAF(n)?"Leaf":"Internal node", n);
2091
2092	if (IS_LEAF(n))
2093		seq_printf(seq, "key=%d.%d.%d.%d\n",
2094			   n->key >> 24, (n->key >> 16) % 256, (n->key >> 8) % 256, n->key % 256);
2095	else {
2096		int plen=((struct tnode *)n)->pos;
2097		t_key prf=MASK_PFX(n->key, plen);
2098		seq_printf(seq, "key=%d.%d.%d.%d/%d\n",
2099			   prf >> 24, (prf >> 16) % 256, (prf >> 8) % 256, prf % 256, plen);
2100	}
2101	if (IS_LEAF(n)) {
2102		struct leaf *l=(struct leaf *)n;
2103		struct fib_alias *fa;
2104		int i;
2105		for (i=32; i>=0; i--)
2106		  if(find_leaf_info(&l->list, i)) {
2107
2108				struct list_head *fa_head = get_fa_head(l, i);
2109
2110				if(!fa_head)
2111					continue;
2112
2113				if(list_empty(fa_head))
2114					continue;
2115
2116				putspace_seq(seq, indent+2);
2117				seq_printf(seq, "{/%d...dumping}\n", i);
2118
2119
2120				list_for_each_entry(fa, fa_head, fa_list) {
2121					putspace_seq(seq, indent+2);
2122					if (fa->fa_info->fib_nh == NULL) {
2123						seq_printf(seq, "Error _fib_nh=NULL\n");
2124						continue;
2125					}
2126					if (fa->fa_info == NULL) {
2127						seq_printf(seq, "Error fa_info=NULL\n");
2128						continue;
2129					}
2130
2131					seq_printf(seq, "{type=%d scope=%d TOS=%d}\n",
2132					      fa->fa_type,
2133					      fa->fa_scope,
2134					      fa->fa_tos);
2135				}
2136			}
2137	}
2138	else if (IS_TNODE(n)) {
2139		struct tnode *tn=(struct tnode *)n;
2140		putspace_seq(seq, indent); seq_printf(seq, "|    ");
2141		seq_printf(seq, "{key prefix=%08x/", tn->key&TKEY_GET_MASK(0, tn->pos));
2142		printbin_seq(seq, tkey_extract_bits(tn->key, 0, tn->pos), tn->pos);
2143		seq_printf(seq, "}\n");
2144		putspace_seq(seq, indent); seq_printf(seq, "|    ");
2145		seq_printf(seq, "{pos=%d", tn->pos);
2146		seq_printf(seq, " (skip=%d bits)", tn->pos - pend);
2147		seq_printf(seq, " bits=%d (%u children)}\n", tn->bits, (1 << tn->bits));
2148		putspace_seq(seq, indent); seq_printf(seq, "|    ");
2149		seq_printf(seq, "{empty=%d full=%d}\n", tn->empty_children, tn->full_children);
2150	}
2151}
2152
2153static void trie_dump_seq(struct seq_file *seq, struct trie *t)
2154{
2155	struct node *n=t->trie;
2156	int cindex=0;
2157	int indent=1;
2158	int pend=0;
2159	int depth = 0;
2160
2161  	read_lock(&fib_lock);
2162
2163	seq_printf(seq, "------ trie_dump of t=%p ------\n", t);
2164	if (n) {
2165		printnode_seq(seq, indent, n, pend, cindex, 0);
2166		if (IS_TNODE(n)) {
2167			struct tnode *tn=(struct tnode *)n;
2168			pend = tn->pos+tn->bits;
2169			putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
2170			indent += 3;
2171			depth++;
2172
2173			while (tn && cindex < (1 << tn->bits)) {
2174				if (tn->child[cindex]) {
2175
2176					/* Got a child */
2177
2178					printnode_seq(seq, indent, tn->child[cindex], pend, cindex, tn->bits);
2179					if (IS_LEAF(tn->child[cindex])) {
2180						cindex++;
2181
2182					}
2183					else {
2184						/*
2185						 * New tnode. Decend one level
2186						 */
2187
2188						depth++;
2189						n=tn->child[cindex];
2190						tn=(struct tnode *)n;
2191						pend=tn->pos+tn->bits;
2192						putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
2193						indent+=3;
2194						cindex=0;
2195					}
2196				}
2197				else
2198					cindex++;
2199
2200				/*
2201				 * Test if we are done
2202				 */
2203
2204				while (cindex >= (1 << tn->bits)) {
2205
2206					/*
2207					 * Move upwards and test for root
2208					 * pop off all traversed  nodes
2209					 */
2210
2211					if (NODE_PARENT(tn) == NULL) {
2212						tn = NULL;
2213						n = NULL;
2214						break;
2215					}
2216					else {
2217						cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2218						tn = NODE_PARENT(tn);
2219						cindex++;
2220						n=(struct node *)tn;
2221						pend=tn->pos+tn->bits;
2222						indent-=3;
2223						depth--;
2224					}
2225				}
2226			}
2227		}
2228		else n = NULL;
2229	}
2230	else seq_printf(seq, "------ trie is empty\n");
2231
2232  	read_unlock(&fib_lock);
2233}
2234
2235static struct trie_stat *trie_stat_new(void)
2236{
2237	struct trie_stat *s = kmalloc(sizeof(struct trie_stat), GFP_KERNEL);
2238	int i;
2239
2240	if(s) {
2241		s->totdepth = 0;
2242		s->maxdepth = 0;
2243		s->tnodes = 0;
2244		s->leaves = 0;
2245		s->nullpointers = 0;
2246
2247		for(i=0; i< MAX_CHILDS; i++)
2248			s->nodesizes[i] = 0;
2249	}
2250	return s;
2251}
2252
2253static struct trie_stat *trie_collect_stats(struct trie *t)
2254{
2255	struct node *n=t->trie;
2256	struct trie_stat *s = trie_stat_new();
2257	int cindex = 0;
2258	int indent = 1;
2259	int pend = 0;
2260	int depth = 0;
2261
2262	read_lock(&fib_lock);
2263
2264	if (s) {
2265		if (n) {
2266			if (IS_TNODE(n)) {
2267				struct tnode *tn = (struct tnode *)n;
2268				pend=tn->pos+tn->bits;
2269				indent += 3;
2270				s->nodesizes[tn->bits]++;
2271				depth++;
2272
2273				while (tn && cindex < (1 << tn->bits)) {
2274					if (tn->child[cindex]) {
2275						/* Got a child */
2276
2277						if (IS_LEAF(tn->child[cindex])) {
2278							cindex++;
2279
2280							/* stats */
2281							if (depth > s->maxdepth)
2282								s->maxdepth = depth;
2283							s->totdepth += depth;
2284							s->leaves++;
2285						}
2286
2287						else {
2288							/*
2289							 * New tnode. Decend one level
2290							 */
2291
2292							s->tnodes++;
2293							s->nodesizes[tn->bits]++;
2294							depth++;
2295
2296							n = tn->child[cindex];
2297							tn = (struct tnode *)n;
2298							pend = tn->pos+tn->bits;
2299
2300							indent += 3;
2301							cindex = 0;
2302						}
2303					}
2304					else {
2305						cindex++;
2306						s->nullpointers++;
2307					}
2308
2309					/*
2310					 * Test if we are done
2311					 */
2312
2313					while (cindex >= (1 << tn->bits)) {
2314
2315						/*
2316						 * Move upwards and test for root
2317						 * pop off all traversed  nodes
2318						 */
2319
2320
2321						if (NODE_PARENT(tn) == NULL) {
2322							tn = NULL;
2323							n = NULL;
2324							break;
2325						}
2326						else {
2327							cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2328							tn = NODE_PARENT(tn);
2329							cindex++;
2330							n = (struct node *)tn;
2331							pend=tn->pos+tn->bits;
2332							indent -= 3;
2333							depth--;
2334						}
2335 					}
2336				}
2337			}
2338			else n = NULL;
2339		}
2340	}
2341
2342	read_unlock(&fib_lock);
2343	return s;
2344}
2345
2346#ifdef CONFIG_PROC_FS
2347
2348static struct fib_alias *fib_triestat_get_first(struct seq_file *seq)
2349{
2350	return NULL;
2351}
2352
2353static struct fib_alias *fib_triestat_get_next(struct seq_file *seq)
2354{
2355	return NULL;
2356}
2357
2358static void *fib_triestat_seq_start(struct seq_file *seq, loff_t *pos)
2359{
2360	void *v = NULL;
2361
2362	if (ip_fib_main_table)
2363		v = *pos ? fib_triestat_get_next(seq) : SEQ_START_TOKEN;
2364	return v;
2365}
2366
2367static void *fib_triestat_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2368{
2369	++*pos;
2370	return v == SEQ_START_TOKEN ? fib_triestat_get_first(seq) : fib_triestat_get_next(seq);
2371}
2372
2373static void fib_triestat_seq_stop(struct seq_file *seq, void *v)
2374{
2375
2376}
2377
2378/*
2379 *	This outputs /proc/net/fib_triestats
2380 *
2381 *	It always works in backward compatibility mode.
2382 *	The format of the file is not supposed to be changed.
2383 */
2384
2385static void collect_and_show(struct trie *t, struct seq_file *seq)
2386{
2387	int bytes = 0; /* How many bytes are used, a ref is 4 bytes */
2388	int i, max, pointers;
2389        struct trie_stat *stat;
2390	int avdepth;
2391
2392	stat = trie_collect_stats(t);
2393
2394	bytes=0;
2395	seq_printf(seq, "trie=%p\n", t);
2396
2397	if (stat) {
2398		if (stat->leaves)
2399			avdepth=stat->totdepth*100 / stat->leaves;
2400		else
2401			avdepth=0;
2402		seq_printf(seq, "Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100 );
2403		seq_printf(seq, "Max depth: %4d\n", stat->maxdepth);
2404
2405		seq_printf(seq, "Leaves: %d\n", stat->leaves);
2406		bytes += sizeof(struct leaf) * stat->leaves;
2407		seq_printf(seq, "Internal nodes: %d\n", stat->tnodes);
2408		bytes += sizeof(struct tnode) * stat->tnodes;
2409
2410		max = MAX_CHILDS-1;
2411
2412		while (max >= 0 && stat->nodesizes[max] == 0)
2413			max--;
2414		pointers = 0;
2415
2416		for (i = 1; i <= max; i++)
2417			if (stat->nodesizes[i] != 0) {
2418				seq_printf(seq, "  %d: %d",  i, stat->nodesizes[i]);
2419				pointers += (1<<i) * stat->nodesizes[i];
2420			}
2421		seq_printf(seq, "\n");
2422		seq_printf(seq, "Pointers: %d\n", pointers);
2423		bytes += sizeof(struct node *) * pointers;
2424		seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers);
2425		seq_printf(seq, "Total size: %d  kB\n", bytes / 1024);
2426
2427		kfree(stat);
2428	}
2429
2430#ifdef CONFIG_IP_FIB_TRIE_STATS
2431	seq_printf(seq, "Counters:\n---------\n");
2432	seq_printf(seq,"gets = %d\n", t->stats.gets);
2433	seq_printf(seq,"backtracks = %d\n", t->stats.backtrack);
2434	seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed);
2435	seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss);
2436	seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit);
2437	seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped);
2438#ifdef CLEAR_STATS
2439	memset(&(t->stats), 0, sizeof(t->stats));
2440#endif
2441#endif /*  CONFIG_IP_FIB_TRIE_STATS */
2442}
2443
2444static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2445{
2446	char bf[128];
2447
2448	if (v == SEQ_START_TOKEN) {
2449		seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
2450			   sizeof(struct leaf), sizeof(struct tnode));
2451		if (trie_local)
2452			collect_and_show(trie_local, seq);
2453
2454		if (trie_main)
2455			collect_and_show(trie_main, seq);
2456	}
2457	else {
2458		snprintf(bf, sizeof(bf),
2459			 "*\t%08X\t%08X", 200, 400);
2460
2461		seq_printf(seq, "%-127s\n", bf);
2462	}
2463	return 0;
2464}
2465
2466static struct seq_operations fib_triestat_seq_ops = {
2467	.start  = fib_triestat_seq_start,
2468	.next   = fib_triestat_seq_next,
2469	.stop   = fib_triestat_seq_stop,
2470	.show   = fib_triestat_seq_show,
2471};
2472
2473static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2474{
2475	struct seq_file *seq;
2476	int rc = -ENOMEM;
2477
2478	rc = seq_open(file, &fib_triestat_seq_ops);
2479	if (rc)
2480		goto out_kfree;
2481
2482	seq	     = file->private_data;
2483out:
2484	return rc;
2485out_kfree:
2486	goto out;
2487}
2488
2489static struct file_operations fib_triestat_seq_fops = {
2490	.owner		= THIS_MODULE,
2491	.open           = fib_triestat_seq_open,
2492	.read           = seq_read,
2493	.llseek         = seq_lseek,
2494	.release	= seq_release_private,
2495};
2496
2497int __init fib_stat_proc_init(void)
2498{
2499	if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_seq_fops))
2500		return -ENOMEM;
2501	return 0;
2502}
2503
2504void __init fib_stat_proc_exit(void)
2505{
2506	proc_net_remove("fib_triestat");
2507}
2508
2509static struct fib_alias *fib_trie_get_first(struct seq_file *seq)
2510{
2511	return NULL;
2512}
2513
2514static struct fib_alias *fib_trie_get_next(struct seq_file *seq)
2515{
2516	return NULL;
2517}
2518
2519static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2520{
2521	void *v = NULL;
2522
2523	if (ip_fib_main_table)
2524		v = *pos ? fib_trie_get_next(seq) : SEQ_START_TOKEN;
2525	return v;
2526}
2527
2528static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2529{
2530	++*pos;
2531	return v == SEQ_START_TOKEN ? fib_trie_get_first(seq) : fib_trie_get_next(seq);
2532}
2533
2534static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2535{
2536
2537}
2538
2539/*
2540 *	This outputs /proc/net/fib_trie.
2541 *
2542 *	It always works in backward compatibility mode.
2543 *	The format of the file is not supposed to be changed.
2544 */
2545
2546static int fib_trie_seq_show(struct seq_file *seq, void *v)
2547{
2548	char bf[128];
2549
2550	if (v == SEQ_START_TOKEN) {
2551		if (trie_local)
2552			trie_dump_seq(seq, trie_local);
2553
2554		if (trie_main)
2555			trie_dump_seq(seq, trie_main);
2556	}
2557
2558	else {
2559		snprintf(bf, sizeof(bf),
2560			 "*\t%08X\t%08X", 200, 400);
2561		seq_printf(seq, "%-127s\n", bf);
2562	}
2563
2564	return 0;
2565}
2566
2567static struct seq_operations fib_trie_seq_ops = {
2568	.start  = fib_trie_seq_start,
2569	.next   = fib_trie_seq_next,
2570	.stop   = fib_trie_seq_stop,
2571	.show   = fib_trie_seq_show,
2572};
2573
2574static int fib_trie_seq_open(struct inode *inode, struct file *file)
2575{
2576	struct seq_file *seq;
2577	int rc = -ENOMEM;
2578
2579	rc = seq_open(file, &fib_trie_seq_ops);
2580	if (rc)
2581		goto out_kfree;
2582
2583	seq	     = file->private_data;
2584out:
2585	return rc;
2586out_kfree:
2587	goto out;
2588}
2589
2590static struct file_operations fib_trie_seq_fops = {
2591	.owner		= THIS_MODULE,
2592	.open           = fib_trie_seq_open,
2593	.read           = seq_read,
2594	.llseek         = seq_lseek,
2595	.release	= seq_release_private,
2596};
2597
2598int __init fib_proc_init(void)
2599{
2600	if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_seq_fops))
2601		return -ENOMEM;
2602	return 0;
2603}
2604
2605void __init fib_proc_exit(void)
2606{
2607	proc_net_remove("fib_trie");
2608}
2609
2610#endif /* CONFIG_PROC_FS */
2611