fib_trie.c revision f0e36f8cee8101604378085171c980d9cc71d779
1/*
2 *   This program is free software; you can redistribute it and/or
3 *   modify it under the terms of the GNU General Public License
4 *   as published by the Free Software Foundation; either version
5 *   2 of the License, or (at your option) any later version.
6 *
7 *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 *     & Swedish University of Agricultural Sciences.
9 *
10 *   Jens Laas <jens.laas@data.slu.se> Swedish University of
11 *     Agricultural Sciences.
12 *
13 *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
14 *
15 * This work is based on the LPC-trie which is originally descibed in:
16 *
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
25 * Version:	$Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
26 *
27 *
28 * Code from fib_hash has been reused which includes the following header:
29 *
30 *
31 * INET		An implementation of the TCP/IP protocol suite for the LINUX
32 *		operating system.  INET is implemented using the  BSD Socket
33 *		interface as the means of communication with the user level.
34 *
35 *		IPv4 FIB: lookup engine and maintenance routines.
36 *
37 *
38 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
39 *
40 *		This program is free software; you can redistribute it and/or
41 *		modify it under the terms of the GNU General Public License
42 *		as published by the Free Software Foundation; either version
43 *		2 of the License, or (at your option) any later version.
44 */
45
46#define VERSION "0.324"
47
48#include <linux/config.h>
49#include <asm/uaccess.h>
50#include <asm/system.h>
51#include <asm/bitops.h>
52#include <linux/types.h>
53#include <linux/kernel.h>
54#include <linux/sched.h>
55#include <linux/mm.h>
56#include <linux/string.h>
57#include <linux/socket.h>
58#include <linux/sockios.h>
59#include <linux/errno.h>
60#include <linux/in.h>
61#include <linux/inet.h>
62#include <linux/netdevice.h>
63#include <linux/if_arp.h>
64#include <linux/proc_fs.h>
65#include <linux/skbuff.h>
66#include <linux/netlink.h>
67#include <linux/init.h>
68#include <linux/list.h>
69#include <net/ip.h>
70#include <net/protocol.h>
71#include <net/route.h>
72#include <net/tcp.h>
73#include <net/sock.h>
74#include <net/ip_fib.h>
75#include "fib_lookup.h"
76
77#undef CONFIG_IP_FIB_TRIE_STATS
78#define MAX_CHILDS 16384
79
80#define EXTRACT(p, n, str) ((str)<<(p)>>(32-(n)))
81#define KEYLENGTH (8*sizeof(t_key))
82#define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l))
83#define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset))
84
85static DEFINE_RWLOCK(fib_lock);
86
87typedef unsigned int t_key;
88
89#define T_TNODE 0
90#define T_LEAF  1
91#define NODE_TYPE_MASK	0x1UL
92#define NODE_PARENT(_node) \
93((struct tnode *)((_node)->_parent & ~NODE_TYPE_MASK))
94#define NODE_SET_PARENT(_node, _ptr) \
95((_node)->_parent = (((unsigned long)(_ptr)) | \
96                     ((_node)->_parent & NODE_TYPE_MASK)))
97#define NODE_INIT_PARENT(_node, _type) \
98((_node)->_parent = (_type))
99#define NODE_TYPE(_node) \
100((_node)->_parent & NODE_TYPE_MASK)
101
102#define IS_TNODE(n) (!(n->_parent & T_LEAF))
103#define IS_LEAF(n) (n->_parent & T_LEAF)
104
105struct node {
106        t_key key;
107	unsigned long _parent;
108};
109
110struct leaf {
111        t_key key;
112	unsigned long _parent;
113	struct hlist_head list;
114};
115
116struct leaf_info {
117	struct hlist_node hlist;
118	int plen;
119	struct list_head falh;
120};
121
122struct tnode {
123        t_key key;
124	unsigned long _parent;
125        unsigned short pos:5;        /* 2log(KEYLENGTH) bits needed */
126        unsigned short bits:5;       /* 2log(KEYLENGTH) bits needed */
127        unsigned short full_children;  /* KEYLENGTH bits needed */
128        unsigned short empty_children; /* KEYLENGTH bits needed */
129        struct node *child[0];
130};
131
132#ifdef CONFIG_IP_FIB_TRIE_STATS
133struct trie_use_stats {
134	unsigned int gets;
135	unsigned int backtrack;
136	unsigned int semantic_match_passed;
137	unsigned int semantic_match_miss;
138	unsigned int null_node_hit;
139};
140#endif
141
142struct trie_stat {
143	unsigned int totdepth;
144	unsigned int maxdepth;
145	unsigned int tnodes;
146	unsigned int leaves;
147	unsigned int nullpointers;
148	unsigned int nodesizes[MAX_CHILDS];
149};
150
151struct trie {
152        struct node *trie;
153#ifdef CONFIG_IP_FIB_TRIE_STATS
154	struct trie_use_stats stats;
155#endif
156        int size;
157	unsigned int revision;
158};
159
160static int trie_debug = 0;
161
162static int tnode_full(struct tnode *tn, struct node *n);
163static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
164static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
165static int tnode_child_length(struct tnode *tn);
166static struct node *resize(struct trie *t, struct tnode *tn);
167static struct tnode *inflate(struct trie *t, struct tnode *tn);
168static struct tnode *halve(struct trie *t, struct tnode *tn);
169static void tnode_free(struct tnode *tn);
170static void trie_dump_seq(struct seq_file *seq, struct trie *t);
171extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio);
172extern int fib_detect_death(struct fib_info *fi, int order,
173                            struct fib_info **last_resort, int *last_idx, int *dflt);
174
175extern void rtmsg_fib(int event, u32 key, struct fib_alias *fa, int z, int tb_id,
176               struct nlmsghdr *n, struct netlink_skb_parms *req);
177
178static kmem_cache_t *fn_alias_kmem;
179static struct trie *trie_local = NULL, *trie_main = NULL;
180
181static void trie_bug(char *err)
182{
183	printk("Trie Bug: %s\n", err);
184	BUG();
185}
186
187static inline struct node *tnode_get_child(struct tnode *tn, int i)
188{
189        if (i >=  1<<tn->bits)
190                trie_bug("tnode_get_child");
191
192        return tn->child[i];
193}
194
195static inline int tnode_child_length(struct tnode *tn)
196{
197        return 1<<tn->bits;
198}
199
200/*
201  _________________________________________________________________
202  | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
203  ----------------------------------------------------------------
204    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
205
206  _________________________________________________________________
207  | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
208  -----------------------------------------------------------------
209   16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
210
211  tp->pos = 7
212  tp->bits = 3
213  n->pos = 15
214  n->bits=4
215  KEYLENGTH=32
216*/
217
218static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
219{
220        if (offset < KEYLENGTH)
221		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
222        else
223		return 0;
224}
225
226static inline int tkey_equals(t_key a, t_key b)
227{
228  return a == b;
229}
230
231static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
232{
233     if (bits == 0 || offset >= KEYLENGTH)
234            return 1;
235        bits = bits > KEYLENGTH ? KEYLENGTH : bits;
236        return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
237}
238
239static inline int tkey_mismatch(t_key a, int offset, t_key b)
240{
241	t_key diff = a ^ b;
242	int i = offset;
243
244	if(!diff)
245	  return 0;
246	while((diff << i) >> (KEYLENGTH-1) == 0)
247		i++;
248	return i;
249}
250
251/* Candiate for fib_semantics */
252
253static void fn_free_alias(struct fib_alias *fa)
254{
255	fib_release_info(fa->fa_info);
256	kmem_cache_free(fn_alias_kmem, fa);
257}
258
259/*
260  To understand this stuff, an understanding of keys and all their bits is
261  necessary. Every node in the trie has a key associated with it, but not
262  all of the bits in that key are significant.
263
264  Consider a node 'n' and its parent 'tp'.
265
266  If n is a leaf, every bit in its key is significant. Its presence is
267  necessitaded by path compression, since during a tree traversal (when
268  searching for a leaf - unless we are doing an insertion) we will completely
269  ignore all skipped bits we encounter. Thus we need to verify, at the end of
270  a potentially successful search, that we have indeed been walking the
271  correct key path.
272
273  Note that we can never "miss" the correct key in the tree if present by
274  following the wrong path. Path compression ensures that segments of the key
275  that are the same for all keys with a given prefix are skipped, but the
276  skipped part *is* identical for each node in the subtrie below the skipped
277  bit! trie_insert() in this implementation takes care of that - note the
278  call to tkey_sub_equals() in trie_insert().
279
280  if n is an internal node - a 'tnode' here, the various parts of its key
281  have many different meanings.
282
283  Example:
284  _________________________________________________________________
285  | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
286  -----------------------------------------------------------------
287    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
288
289  _________________________________________________________________
290  | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
291  -----------------------------------------------------------------
292   16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
293
294  tp->pos = 7
295  tp->bits = 3
296  n->pos = 15
297  n->bits=4
298
299  First, let's just ignore the bits that come before the parent tp, that is
300  the bits from 0 to (tp->pos-1). They are *known* but at this point we do
301  not use them for anything.
302
303  The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
304  index into the parent's child array. That is, they will be used to find
305  'n' among tp's children.
306
307  The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
308  for the node n.
309
310  All the bits we have seen so far are significant to the node n. The rest
311  of the bits are really not needed or indeed known in n->key.
312
313  The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
314  n's child array, and will of course be different for each child.
315
316  The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
317  at this point.
318
319*/
320
321static void check_tnode(struct tnode *tn)
322{
323	if(tn && tn->pos+tn->bits > 32) {
324		printk("TNODE ERROR tn=%p, pos=%d, bits=%d\n", tn, tn->pos, tn->bits);
325	}
326}
327
328static int halve_threshold = 25;
329static int inflate_threshold = 50;
330
331static struct leaf *leaf_new(void)
332{
333	struct leaf *l = kmalloc(sizeof(struct leaf),  GFP_KERNEL);
334	if(l) {
335		NODE_INIT_PARENT(l, T_LEAF);
336		INIT_HLIST_HEAD(&l->list);
337	}
338	return l;
339}
340
341static struct leaf_info *leaf_info_new(int plen)
342{
343	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
344	if(li) {
345		li->plen = plen;
346		INIT_LIST_HEAD(&li->falh);
347	}
348	return li;
349}
350
351static inline void free_leaf(struct leaf *l)
352{
353	kfree(l);
354}
355
356static inline void free_leaf_info(struct leaf_info *li)
357{
358	kfree(li);
359}
360
361static struct tnode *tnode_alloc(unsigned int size)
362{
363	if (size <= PAGE_SIZE) {
364		return kmalloc(size, GFP_KERNEL);
365	} else {
366		return (struct tnode *)
367		       __get_free_pages(GFP_KERNEL, get_order(size));
368	}
369}
370
371static void __tnode_free(struct tnode *tn)
372{
373	unsigned int size = sizeof(struct tnode) +
374	                    (1<<tn->bits) * sizeof(struct node *);
375
376	if (size <= PAGE_SIZE)
377		kfree(tn);
378	else
379		free_pages((unsigned long)tn, get_order(size));
380}
381
382static struct tnode* tnode_new(t_key key, int pos, int bits)
383{
384	int nchildren = 1<<bits;
385	int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
386	struct tnode *tn = tnode_alloc(sz);
387
388	if(tn)  {
389		memset(tn, 0, sz);
390		NODE_INIT_PARENT(tn, T_TNODE);
391		tn->pos = pos;
392		tn->bits = bits;
393		tn->key = key;
394		tn->full_children = 0;
395		tn->empty_children = 1<<bits;
396	}
397	if(trie_debug > 0)
398		printk("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
399		       (unsigned int) (sizeof(struct node) * 1<<bits));
400	return tn;
401}
402
403static void tnode_free(struct tnode *tn)
404{
405	if(!tn) {
406		trie_bug("tnode_free\n");
407	}
408	if(IS_LEAF(tn)) {
409		free_leaf((struct leaf *)tn);
410		if(trie_debug > 0 )
411			printk("FL %p \n", tn);
412	}
413	else if(IS_TNODE(tn)) {
414		__tnode_free(tn);
415		if(trie_debug > 0 )
416			printk("FT %p \n", tn);
417	}
418	else {
419		trie_bug("tnode_free\n");
420	}
421}
422
423/*
424 * Check whether a tnode 'n' is "full", i.e. it is an internal node
425 * and no bits are skipped. See discussion in dyntree paper p. 6
426 */
427
428static inline int tnode_full(struct tnode *tn, struct node *n)
429{
430	if(n == NULL || IS_LEAF(n))
431		return 0;
432
433	return ((struct tnode *) n)->pos == tn->pos + tn->bits;
434}
435
436static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
437{
438	tnode_put_child_reorg(tn, i, n, -1);
439}
440
441 /*
442  * Add a child at position i overwriting the old value.
443  * Update the value of full_children and empty_children.
444  */
445
446static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
447{
448	struct node *chi;
449	int isfull;
450
451	if(i >=  1<<tn->bits) {
452		printk("bits=%d, i=%d\n", tn->bits, i);
453		trie_bug("tnode_put_child_reorg bits");
454	}
455	write_lock_bh(&fib_lock);
456	chi = tn->child[i];
457
458	/* update emptyChildren */
459	if (n == NULL && chi != NULL)
460		tn->empty_children++;
461	else if (n != NULL && chi == NULL)
462		tn->empty_children--;
463
464	/* update fullChildren */
465        if (wasfull == -1)
466		wasfull = tnode_full(tn, chi);
467
468	isfull = tnode_full(tn, n);
469	if (wasfull && !isfull)
470		tn->full_children--;
471
472	else if (!wasfull && isfull)
473		tn->full_children++;
474	if(n)
475		NODE_SET_PARENT(n, tn);
476
477	tn->child[i] = n;
478	write_unlock_bh(&fib_lock);
479}
480
481static struct node *resize(struct trie *t, struct tnode *tn)
482{
483	int i;
484
485 	if (!tn)
486		return NULL;
487
488	if(trie_debug)
489		printk("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
490		      tn, inflate_threshold, halve_threshold);
491
492	/* No children */
493	if (tn->empty_children == tnode_child_length(tn)) {
494		tnode_free(tn);
495		return NULL;
496	}
497	/* One child */
498	if (tn->empty_children == tnode_child_length(tn) - 1)
499		for (i = 0; i < tnode_child_length(tn); i++) {
500
501			write_lock_bh(&fib_lock);
502			if (tn->child[i] != NULL) {
503
504				/* compress one level */
505				struct node *n = tn->child[i];
506				if(n)
507					NODE_INIT_PARENT(n, NODE_TYPE(n));
508
509				write_unlock_bh(&fib_lock);
510				tnode_free(tn);
511				return n;
512			}
513			write_unlock_bh(&fib_lock);
514		}
515	/*
516	 * Double as long as the resulting node has a number of
517	 * nonempty nodes that are above the threshold.
518	 */
519
520	/*
521	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
522	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
523	 * Telecommunications, page 6:
524	 * "A node is doubled if the ratio of non-empty children to all
525	 * children in the *doubled* node is at least 'high'."
526	 *
527	 * 'high' in this instance is the variable 'inflate_threshold'. It
528	 * is expressed as a percentage, so we multiply it with
529	 * tnode_child_length() and instead of multiplying by 2 (since the
530	 * child array will be doubled by inflate()) and multiplying
531	 * the left-hand side by 100 (to handle the percentage thing) we
532	 * multiply the left-hand side by 50.
533	 *
534	 * The left-hand side may look a bit weird: tnode_child_length(tn)
535	 * - tn->empty_children is of course the number of non-null children
536	 * in the current node. tn->full_children is the number of "full"
537	 * children, that is non-null tnodes with a skip value of 0.
538	 * All of those will be doubled in the resulting inflated tnode, so
539	 * we just count them one extra time here.
540	 *
541	 * A clearer way to write this would be:
542	 *
543	 * to_be_doubled = tn->full_children;
544	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
545	 *     tn->full_children;
546	 *
547	 * new_child_length = tnode_child_length(tn) * 2;
548	 *
549	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
550	 *      new_child_length;
551	 * if (new_fill_factor >= inflate_threshold)
552	 *
553	 * ...and so on, tho it would mess up the while() loop.
554	 *
555	 * anyway,
556	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
557	 *      inflate_threshold
558	 *
559	 * avoid a division:
560	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
561	 *      inflate_threshold * new_child_length
562	 *
563	 * expand not_to_be_doubled and to_be_doubled, and shorten:
564	 * 100 * (tnode_child_length(tn) - tn->empty_children +
565	 *    tn->full_children ) >= inflate_threshold * new_child_length
566	 *
567	 * expand new_child_length:
568	 * 100 * (tnode_child_length(tn) - tn->empty_children +
569	 *    tn->full_children ) >=
570	 *      inflate_threshold * tnode_child_length(tn) * 2
571	 *
572	 * shorten again:
573	 * 50 * (tn->full_children + tnode_child_length(tn) -
574	 *    tn->empty_children ) >= inflate_threshold *
575	 *    tnode_child_length(tn)
576	 *
577	 */
578
579	check_tnode(tn);
580
581	while ((tn->full_children > 0 &&
582	       50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
583				inflate_threshold * tnode_child_length(tn))) {
584
585		tn = inflate(t, tn);
586	}
587
588	check_tnode(tn);
589
590	/*
591	 * Halve as long as the number of empty children in this
592	 * node is above threshold.
593	 */
594	while (tn->bits > 1 &&
595	       100 * (tnode_child_length(tn) - tn->empty_children) <
596	       halve_threshold * tnode_child_length(tn))
597
598		tn = halve(t, tn);
599
600	/* Only one child remains */
601
602	if (tn->empty_children == tnode_child_length(tn) - 1)
603		for (i = 0; i < tnode_child_length(tn); i++) {
604
605			write_lock_bh(&fib_lock);
606			if (tn->child[i] != NULL) {
607				/* compress one level */
608				struct node *n = tn->child[i];
609
610				if(n)
611					NODE_INIT_PARENT(n, NODE_TYPE(n));
612
613				write_unlock_bh(&fib_lock);
614				tnode_free(tn);
615				return n;
616			}
617			write_unlock_bh(&fib_lock);
618		}
619
620	return (struct node *) tn;
621}
622
623static struct tnode *inflate(struct trie *t, struct tnode *tn)
624{
625	struct tnode *inode;
626	struct tnode *oldtnode = tn;
627	int olen = tnode_child_length(tn);
628	int i;
629
630  	if(trie_debug)
631		printk("In inflate\n");
632
633	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
634
635	if (!tn)
636		trie_bug("tnode_new failed");
637
638	for(i = 0; i < olen; i++) {
639		struct node *node = tnode_get_child(oldtnode, i);
640
641		/* An empty child */
642		if (node == NULL)
643			continue;
644
645		/* A leaf or an internal node with skipped bits */
646
647		if(IS_LEAF(node) || ((struct tnode *) node)->pos >
648		   tn->pos + tn->bits - 1) {
649			if(tkey_extract_bits(node->key, tn->pos + tn->bits - 1,
650					     1) == 0)
651				put_child(t, tn, 2*i, node);
652			else
653				put_child(t, tn, 2*i+1, node);
654			continue;
655		}
656
657		/* An internal node with two children */
658		inode = (struct tnode *) node;
659
660		if (inode->bits == 1) {
661			put_child(t, tn, 2*i, inode->child[0]);
662			put_child(t, tn, 2*i+1, inode->child[1]);
663
664			tnode_free(inode);
665		}
666
667			/* An internal node with more than two children */
668		else {
669			struct tnode *left, *right;
670			int size, j;
671
672			/* We will replace this node 'inode' with two new
673			 * ones, 'left' and 'right', each with half of the
674			 * original children. The two new nodes will have
675			 * a position one bit further down the key and this
676			 * means that the "significant" part of their keys
677			 * (see the discussion near the top of this file)
678			 * will differ by one bit, which will be "0" in
679			 * left's key and "1" in right's key. Since we are
680			 * moving the key position by one step, the bit that
681			 * we are moving away from - the bit at position
682			 * (inode->pos) - is the one that will differ between
683			 * left and right. So... we synthesize that bit in the
684			 * two  new keys.
685			 * The mask 'm' below will be a single "one" bit at
686			 * the position (inode->pos)
687			 */
688
689			t_key m = TKEY_GET_MASK(inode->pos, 1);
690
691			/* Use the old key, but set the new significant
692			 *   bit to zero.
693			 */
694			left = tnode_new(inode->key&(~m), inode->pos + 1,
695					 inode->bits - 1);
696
697			if(!left)
698				trie_bug("tnode_new failed");
699
700
701			/* Use the old key, but set the new significant
702			 * bit to one.
703			 */
704			right = tnode_new(inode->key|m, inode->pos + 1,
705					  inode->bits - 1);
706
707			if(!right)
708				trie_bug("tnode_new failed");
709
710			size = tnode_child_length(left);
711			for(j = 0; j < size; j++) {
712				put_child(t, left, j, inode->child[j]);
713				put_child(t, right, j, inode->child[j + size]);
714			}
715			put_child(t, tn, 2*i, resize(t, left));
716			put_child(t, tn, 2*i+1, resize(t, right));
717
718			tnode_free(inode);
719		}
720	}
721	tnode_free(oldtnode);
722	return tn;
723}
724
725static struct tnode *halve(struct trie *t, struct tnode *tn)
726{
727	struct tnode *oldtnode = tn;
728	struct node *left, *right;
729	int i;
730	int olen = tnode_child_length(tn);
731
732	if(trie_debug) printk("In halve\n");
733
734	tn=tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
735
736	if(!tn)
737		trie_bug("tnode_new failed");
738
739	for(i = 0; i < olen; i += 2) {
740		left = tnode_get_child(oldtnode, i);
741		right = tnode_get_child(oldtnode, i+1);
742
743		/* At least one of the children is empty */
744		if (left == NULL) {
745			if (right == NULL)    /* Both are empty */
746				continue;
747			put_child(t, tn, i/2, right);
748		} else if (right == NULL)
749			put_child(t, tn, i/2, left);
750
751		/* Two nonempty children */
752		else {
753			struct tnode *newBinNode =
754				tnode_new(left->key, tn->pos + tn->bits, 1);
755
756			if(!newBinNode)
757				trie_bug("tnode_new failed");
758
759			put_child(t, newBinNode, 0, left);
760			put_child(t, newBinNode, 1, right);
761			put_child(t, tn, i/2, resize(t, newBinNode));
762		}
763	}
764	tnode_free(oldtnode);
765	return tn;
766}
767
768static void *trie_init(struct trie *t)
769{
770	if(t) {
771		t->size = 0;
772		t->trie = NULL;
773		t->revision = 0;
774#ifdef CONFIG_IP_FIB_TRIE_STATS
775       		memset(&t->stats, 0, sizeof(struct trie_use_stats));
776#endif
777	}
778	return t;
779}
780
781static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen)
782{
783	struct hlist_node *node;
784	struct leaf_info *li;
785
786	hlist_for_each_entry(li, node, head, hlist) {
787
788		if ( li->plen == plen )
789			return li;
790	}
791	return NULL;
792}
793
794static inline struct list_head * get_fa_head(struct leaf *l, int plen)
795{
796	struct list_head *fa_head=NULL;
797	struct leaf_info *li = find_leaf_info(&l->list, plen);
798
799	if(li)
800		fa_head = &li->falh;
801
802	return fa_head;
803}
804
805static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
806{
807	struct leaf_info *li=NULL, *last=NULL;
808	struct hlist_node *node, *tmp;
809
810	write_lock_bh(&fib_lock);
811
812	if(hlist_empty(head))
813		hlist_add_head(&new->hlist, head);
814	else {
815		hlist_for_each_entry_safe(li, node, tmp, head, hlist) {
816
817			if (new->plen > li->plen)
818				break;
819
820			last = li;
821		}
822		if(last)
823			hlist_add_after(&last->hlist, &new->hlist);
824		else
825			hlist_add_before(&new->hlist, &li->hlist);
826	}
827	write_unlock_bh(&fib_lock);
828}
829
830static struct leaf *
831fib_find_node(struct trie *t, u32 key)
832{
833	int pos;
834	struct tnode *tn;
835	struct node *n;
836
837	pos = 0;
838	n=t->trie;
839
840	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
841		tn = (struct tnode *) n;
842
843		check_tnode(tn);
844
845		if(tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
846			pos=tn->pos + tn->bits;
847			n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
848		}
849		else
850			break;
851	}
852	/* Case we have found a leaf. Compare prefixes */
853
854	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
855		struct leaf *l = (struct leaf *) n;
856		return l;
857	}
858	return NULL;
859}
860
861static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
862{
863	int i = 0;
864	int wasfull;
865	t_key cindex, key;
866	struct tnode *tp = NULL;
867
868	if(!tn)
869		BUG();
870
871	key = tn->key;
872	i = 0;
873
874	while (tn != NULL && NODE_PARENT(tn) != NULL) {
875
876		if( i > 10 ) {
877			printk("Rebalance tn=%p \n", tn);
878			if(tn) 		printk("tn->parent=%p \n", NODE_PARENT(tn));
879
880			printk("Rebalance tp=%p \n", tp);
881			if(tp) 		printk("tp->parent=%p \n", NODE_PARENT(tp));
882		}
883
884		if( i > 12 ) BUG();
885		i++;
886
887		tp = NODE_PARENT(tn);
888		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
889		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
890		tn = (struct tnode *) resize (t, (struct tnode *)tn);
891		tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
892
893		if(!NODE_PARENT(tn))
894			break;
895
896		tn = NODE_PARENT(tn);
897	}
898	/* Handle last (top) tnode */
899	if (IS_TNODE(tn))
900		tn = (struct tnode*) resize(t, (struct tnode *)tn);
901
902	return (struct node*) tn;
903}
904
905static  struct list_head *
906fib_insert_node(struct trie *t, int *err, u32 key, int plen)
907{
908	int pos, newpos;
909	struct tnode *tp = NULL, *tn = NULL;
910	struct node *n;
911	struct leaf *l;
912	int missbit;
913	struct list_head *fa_head=NULL;
914	struct leaf_info *li;
915	t_key cindex;
916
917	pos = 0;
918	n=t->trie;
919
920	/* If we point to NULL, stop. Either the tree is empty and we should
921	 * just put a new leaf in if, or we have reached an empty child slot,
922	 * and we should just put our new leaf in that.
923	 * If we point to a T_TNODE, check if it matches our key. Note that
924	 * a T_TNODE might be skipping any number of bits - its 'pos' need
925	 * not be the parent's 'pos'+'bits'!
926	 *
927	 * If it does match the current key, get pos/bits from it, extract
928	 * the index from our key, push the T_TNODE and walk the tree.
929	 *
930	 * If it doesn't, we have to replace it with a new T_TNODE.
931	 *
932	 * If we point to a T_LEAF, it might or might not have the same key
933	 * as we do. If it does, just change the value, update the T_LEAF's
934	 * value, and return it.
935	 * If it doesn't, we need to replace it with a T_TNODE.
936	 */
937
938	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
939		tn = (struct tnode *) n;
940
941		check_tnode(tn);
942
943		if(tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
944			tp = tn;
945			pos=tn->pos + tn->bits;
946			n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
947
948			if(n && NODE_PARENT(n) != tn) {
949				printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
950				BUG();
951			}
952		}
953		else
954			break;
955	}
956
957	/*
958	 * n  ----> NULL, LEAF or TNODE
959	 *
960	 * tp is n's (parent) ----> NULL or TNODE
961	 */
962
963	if(tp && IS_LEAF(tp))
964		BUG();
965
966
967	/* Case 1: n is a leaf. Compare prefixes */
968
969	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
970		struct leaf *l = ( struct leaf *)  n;
971
972		li = leaf_info_new(plen);
973
974		if(! li) {
975			*err = -ENOMEM;
976			goto err;
977		}
978
979		fa_head = &li->falh;
980		insert_leaf_info(&l->list, li);
981		goto done;
982	}
983	t->size++;
984	l = leaf_new();
985
986	if(! l) {
987		*err = -ENOMEM;
988		goto err;
989	}
990
991	l->key = key;
992	li = leaf_info_new(plen);
993
994	if(! li) {
995		tnode_free((struct tnode *) l);
996		*err = -ENOMEM;
997		goto err;
998	}
999
1000	fa_head = &li->falh;
1001	insert_leaf_info(&l->list, li);
1002
1003	/* Case 2: n is NULL, and will just insert a new leaf */
1004	if (t->trie && n == NULL) {
1005
1006		NODE_SET_PARENT(l, tp);
1007
1008		if (!tp)
1009			BUG();
1010
1011		else {
1012			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1013			put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1014		}
1015	}
1016	/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1017	else {
1018		/*
1019		 *  Add a new tnode here
1020		 *  first tnode need some special handling
1021		 */
1022
1023		if (tp)
1024			pos=tp->pos+tp->bits;
1025		else
1026			pos=0;
1027		if(n) {
1028			newpos = tkey_mismatch(key, pos, n->key);
1029			tn = tnode_new(n->key, newpos, 1);
1030		}
1031		else {
1032			newpos = 0;
1033			tn = tnode_new(key, newpos, 1); /* First tnode */
1034		}
1035
1036		if(!tn) {
1037			free_leaf_info(li);
1038			tnode_free((struct tnode *) l);
1039			*err = -ENOMEM;
1040			goto err;
1041		}
1042
1043		NODE_SET_PARENT(tn, tp);
1044
1045		missbit=tkey_extract_bits(key, newpos, 1);
1046		put_child(t, tn, missbit, (struct node *)l);
1047		put_child(t, tn, 1-missbit, n);
1048
1049		if(tp) {
1050			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1051			put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
1052		}
1053		else {
1054			t->trie = (struct node*) tn; /* First tnode */
1055			tp = tn;
1056		}
1057	}
1058	if(tp && tp->pos+tp->bits > 32) {
1059		printk("ERROR tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1060		       tp, tp->pos, tp->bits, key, plen);
1061	}
1062	/* Rebalance the trie */
1063	t->trie = trie_rebalance(t, tp);
1064done:
1065	t->revision++;
1066err:;
1067	return fa_head;
1068}
1069
1070static int
1071fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1072	       struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1073{
1074	struct trie *t = (struct trie *) tb->tb_data;
1075	struct fib_alias *fa, *new_fa;
1076	struct list_head *fa_head=NULL;
1077	struct fib_info *fi;
1078	int plen = r->rtm_dst_len;
1079	int type = r->rtm_type;
1080	u8 tos = r->rtm_tos;
1081	u32 key, mask;
1082	int err;
1083	struct leaf *l;
1084
1085	if (plen > 32)
1086		return -EINVAL;
1087
1088	key = 0;
1089	if (rta->rta_dst)
1090		memcpy(&key, rta->rta_dst, 4);
1091
1092	key = ntohl(key);
1093
1094	if(trie_debug)
1095		printk("Insert table=%d %08x/%d\n", tb->tb_id, key, plen);
1096
1097	mask =  ntohl( inet_make_mask(plen) );
1098
1099	if(key & ~mask)
1100		return -EINVAL;
1101
1102	key = key & mask;
1103
1104	if  ((fi = fib_create_info(r, rta, nlhdr, &err)) == NULL)
1105		goto err;
1106
1107	l = fib_find_node(t, key);
1108	fa = NULL;
1109
1110	if(l) {
1111		fa_head = get_fa_head(l, plen);
1112		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1113	}
1114
1115	/* Now fa, if non-NULL, points to the first fib alias
1116	 * with the same keys [prefix,tos,priority], if such key already
1117	 * exists or to the node before which we will insert new one.
1118	 *
1119	 * If fa is NULL, we will need to allocate a new one and
1120	 * insert to the head of f.
1121	 *
1122	 * If f is NULL, no fib node matched the destination key
1123	 * and we need to allocate a new one of those as well.
1124	 */
1125
1126	if (fa &&
1127	    fa->fa_info->fib_priority == fi->fib_priority) {
1128		struct fib_alias *fa_orig;
1129
1130		err = -EEXIST;
1131		if (nlhdr->nlmsg_flags & NLM_F_EXCL)
1132			goto out;
1133
1134		if (nlhdr->nlmsg_flags & NLM_F_REPLACE) {
1135			struct fib_info *fi_drop;
1136			u8 state;
1137
1138			write_lock_bh(&fib_lock);
1139
1140			fi_drop = fa->fa_info;
1141			fa->fa_info = fi;
1142			fa->fa_type = type;
1143			fa->fa_scope = r->rtm_scope;
1144			state = fa->fa_state;
1145			fa->fa_state &= ~FA_S_ACCESSED;
1146
1147			write_unlock_bh(&fib_lock);
1148
1149			fib_release_info(fi_drop);
1150			if (state & FA_S_ACCESSED)
1151			  rt_cache_flush(-1);
1152
1153			    goto succeeded;
1154		}
1155		/* Error if we find a perfect match which
1156		 * uses the same scope, type, and nexthop
1157		 * information.
1158		 */
1159		fa_orig = fa;
1160		list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
1161			if (fa->fa_tos != tos)
1162				break;
1163			if (fa->fa_info->fib_priority != fi->fib_priority)
1164				break;
1165			if (fa->fa_type == type &&
1166			    fa->fa_scope == r->rtm_scope &&
1167			    fa->fa_info == fi) {
1168				goto out;
1169			}
1170		}
1171		if (!(nlhdr->nlmsg_flags & NLM_F_APPEND))
1172			fa = fa_orig;
1173	}
1174	err = -ENOENT;
1175	if (!(nlhdr->nlmsg_flags&NLM_F_CREATE))
1176		goto out;
1177
1178	err = -ENOBUFS;
1179	new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
1180	if (new_fa == NULL)
1181		goto out;
1182
1183	new_fa->fa_info = fi;
1184	new_fa->fa_tos = tos;
1185	new_fa->fa_type = type;
1186	new_fa->fa_scope = r->rtm_scope;
1187	new_fa->fa_state = 0;
1188#if 0
1189	new_fa->dst  = NULL;
1190#endif
1191	/*
1192	 * Insert new entry to the list.
1193	 */
1194
1195	if(!fa_head) {
1196		fa_head = fib_insert_node(t, &err, key, plen);
1197		err = 0;
1198		if(err)
1199			goto out_free_new_fa;
1200	}
1201
1202	write_lock_bh(&fib_lock);
1203
1204	list_add_tail(&new_fa->fa_list,
1205		 (fa ? &fa->fa_list : fa_head));
1206
1207	write_unlock_bh(&fib_lock);
1208
1209	rt_cache_flush(-1);
1210	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req);
1211succeeded:
1212	return 0;
1213
1214out_free_new_fa:
1215	kmem_cache_free(fn_alias_kmem, new_fa);
1216out:
1217	fib_release_info(fi);
1218err:;
1219	return err;
1220}
1221
1222static inline int check_leaf(struct trie *t, struct leaf *l,  t_key key, int *plen, const struct flowi *flp,
1223			     struct fib_result *res, int *err)
1224{
1225	int i;
1226	t_key mask;
1227	struct leaf_info *li;
1228	struct hlist_head *hhead = &l->list;
1229	struct hlist_node *node;
1230
1231	hlist_for_each_entry(li, node, hhead, hlist) {
1232
1233		i = li->plen;
1234		mask = ntohl(inet_make_mask(i));
1235		if (l->key != (key & mask))
1236			continue;
1237
1238		if (((*err) = fib_semantic_match(&li->falh, flp, res, l->key, mask, i)) == 0) {
1239			*plen = i;
1240#ifdef CONFIG_IP_FIB_TRIE_STATS
1241			t->stats.semantic_match_passed++;
1242#endif
1243			return 1;
1244		}
1245#ifdef CONFIG_IP_FIB_TRIE_STATS
1246		t->stats.semantic_match_miss++;
1247#endif
1248	}
1249	return 0;
1250}
1251
1252static int
1253fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1254{
1255	struct trie *t = (struct trie *) tb->tb_data;
1256	int plen, ret = 0;
1257	struct node *n;
1258	struct tnode *pn;
1259	int pos, bits;
1260	t_key key=ntohl(flp->fl4_dst);
1261	int chopped_off;
1262	t_key cindex = 0;
1263	int current_prefix_length = KEYLENGTH;
1264	n = t->trie;
1265
1266	read_lock(&fib_lock);
1267	if(!n)
1268		goto failed;
1269
1270#ifdef CONFIG_IP_FIB_TRIE_STATS
1271	t->stats.gets++;
1272#endif
1273
1274	/* Just a leaf? */
1275	if (IS_LEAF(n)) {
1276		if( check_leaf(t, (struct leaf *)n, key, &plen, flp, res, &ret) )
1277			goto found;
1278		goto failed;
1279	}
1280	pn = (struct tnode *) n;
1281	chopped_off = 0;
1282
1283        while (pn) {
1284
1285		pos = pn->pos;
1286		bits = pn->bits;
1287
1288		if(!chopped_off)
1289			cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits);
1290
1291		n = tnode_get_child(pn, cindex);
1292
1293		if (n == NULL) {
1294#ifdef CONFIG_IP_FIB_TRIE_STATS
1295			t->stats.null_node_hit++;
1296#endif
1297			goto backtrace;
1298		}
1299
1300		if (IS_TNODE(n)) {
1301#define HL_OPTIMIZE
1302#ifdef HL_OPTIMIZE
1303			struct tnode *cn = (struct tnode *)n;
1304			t_key node_prefix, key_prefix, pref_mismatch;
1305			int mp;
1306
1307			/*
1308			 * It's a tnode, and we can do some extra checks here if we
1309			 * like, to avoid descending into a dead-end branch.
1310			 * This tnode is in the parent's child array at index
1311			 * key[p_pos..p_pos+p_bits] but potentially with some bits
1312			 * chopped off, so in reality the index may be just a
1313			 * subprefix, padded with zero at the end.
1314			 * We can also take a look at any skipped bits in this
1315			 * tnode - everything up to p_pos is supposed to be ok,
1316			 * and the non-chopped bits of the index (se previous
1317			 * paragraph) are also guaranteed ok, but the rest is
1318			 * considered unknown.
1319			 *
1320			 * The skipped bits are key[pos+bits..cn->pos].
1321			 */
1322
1323			/* If current_prefix_length < pos+bits, we are already doing
1324			 * actual prefix  matching, which means everything from
1325			 * pos+(bits-chopped_off) onward must be zero along some
1326			 * branch of this subtree - otherwise there is *no* valid
1327			 * prefix present. Here we can only check the skipped
1328			 * bits. Remember, since we have already indexed into the
1329			 * parent's child array, we know that the bits we chopped of
1330			 * *are* zero.
1331			 */
1332
1333			/* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
1334
1335			if (current_prefix_length < pos+bits) {
1336				if (tkey_extract_bits(cn->key, current_prefix_length,
1337						      cn->pos - current_prefix_length) != 0 ||
1338				    !(cn->child[0]))
1339					goto backtrace;
1340			}
1341
1342			/*
1343			 * If chopped_off=0, the index is fully validated and we
1344			 * only need to look at the skipped bits for this, the new,
1345			 * tnode. What we actually want to do is to find out if
1346			 * these skipped bits match our key perfectly, or if we will
1347			 * have to count on finding a matching prefix further down,
1348			 * because if we do, we would like to have some way of
1349			 * verifying the existence of such a prefix at this point.
1350			 */
1351
1352			/* The only thing we can do at this point is to verify that
1353			 * any such matching prefix can indeed be a prefix to our
1354			 * key, and if the bits in the node we are inspecting that
1355			 * do not match our key are not ZERO, this cannot be true.
1356			 * Thus, find out where there is a mismatch (before cn->pos)
1357			 * and verify that all the mismatching bits are zero in the
1358			 * new tnode's key.
1359			 */
1360
1361			/* Note: We aren't very concerned about the piece of the key
1362			 * that precede pn->pos+pn->bits, since these have already been
1363			 * checked. The bits after cn->pos aren't checked since these are
1364			 * by definition "unknown" at this point. Thus, what we want to
1365			 * see is if we are about to enter the "prefix matching" state,
1366			 * and in that case verify that the skipped bits that will prevail
1367			 * throughout this subtree are zero, as they have to be if we are
1368			 * to find a matching prefix.
1369			 */
1370
1371			node_prefix = MASK_PFX(cn->key, cn->pos);
1372			key_prefix =  MASK_PFX(key, cn->pos);
1373			pref_mismatch = key_prefix^node_prefix;
1374			mp = 0;
1375
1376			/* In short: If skipped bits in this node do not match the search
1377			 * key, enter the "prefix matching" state.directly.
1378			 */
1379			if (pref_mismatch) {
1380				while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1381					mp++;
1382					pref_mismatch = pref_mismatch <<1;
1383				}
1384				key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1385
1386				if (key_prefix != 0)
1387					goto backtrace;
1388
1389				if (current_prefix_length >= cn->pos)
1390					current_prefix_length=mp;
1391		       }
1392#endif
1393		       pn = (struct tnode *)n; /* Descend */
1394		       chopped_off = 0;
1395		       continue;
1396		}
1397		if (IS_LEAF(n)) {
1398			if( check_leaf(t, (struct leaf *)n, key, &plen, flp, res, &ret))
1399				goto found;
1400	       }
1401backtrace:
1402		chopped_off++;
1403
1404		/* As zero don't change the child key (cindex) */
1405		while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1)))) {
1406			chopped_off++;
1407		}
1408
1409		/* Decrease current_... with bits chopped off */
1410		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1411			current_prefix_length = pn->pos + pn->bits - chopped_off;
1412
1413		/*
1414		 * Either we do the actual chop off according or if we have
1415		 * chopped off all bits in this tnode walk up to our parent.
1416		 */
1417
1418		if(chopped_off <= pn->bits)
1419			cindex &= ~(1 << (chopped_off-1));
1420		else {
1421			if( NODE_PARENT(pn) == NULL)
1422				goto failed;
1423
1424			/* Get Child's index */
1425			cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits);
1426			pn = NODE_PARENT(pn);
1427			chopped_off = 0;
1428
1429#ifdef CONFIG_IP_FIB_TRIE_STATS
1430			t->stats.backtrack++;
1431#endif
1432			goto backtrace;
1433		}
1434	}
1435failed:
1436	ret =  1;
1437found:
1438	read_unlock(&fib_lock);
1439	return ret;
1440}
1441
1442static int trie_leaf_remove(struct trie *t, t_key key)
1443{
1444	t_key cindex;
1445	struct tnode *tp = NULL;
1446	struct node *n = t->trie;
1447	struct leaf *l;
1448
1449	if(trie_debug)
1450		printk("entering trie_leaf_remove(%p)\n", n);
1451
1452	/* Note that in the case skipped bits, those bits are *not* checked!
1453	 * When we finish this, we will have NULL or a T_LEAF, and the
1454	 * T_LEAF may or may not match our key.
1455	 */
1456
1457        while (n != NULL && IS_TNODE(n)) {
1458		struct tnode *tn = (struct tnode *) n;
1459		check_tnode(tn);
1460		n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));
1461
1462			if(n && NODE_PARENT(n) != tn) {
1463				printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
1464				BUG();
1465			}
1466        }
1467	l = (struct leaf *) n;
1468
1469	if(!n || !tkey_equals(l->key, key))
1470		return 0;
1471
1472	/*
1473	 * Key found.
1474	 * Remove the leaf and rebalance the tree
1475	 */
1476
1477	t->revision++;
1478	t->size--;
1479
1480	tp = NODE_PARENT(n);
1481	tnode_free((struct tnode *) n);
1482
1483	if(tp) {
1484		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1485		put_child(t, (struct tnode *)tp, cindex, NULL);
1486		t->trie = trie_rebalance(t, tp);
1487	}
1488	else
1489		t->trie = NULL;
1490
1491	return 1;
1492}
1493
1494static int
1495fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1496	       struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1497{
1498	struct trie *t = (struct trie *) tb->tb_data;
1499	u32 key, mask;
1500	int plen = r->rtm_dst_len;
1501	u8 tos = r->rtm_tos;
1502	struct fib_alias *fa, *fa_to_delete;
1503	struct list_head *fa_head;
1504	struct leaf *l;
1505
1506	if (plen > 32)
1507		return -EINVAL;
1508
1509	key = 0;
1510	if (rta->rta_dst)
1511		memcpy(&key, rta->rta_dst, 4);
1512
1513	key = ntohl(key);
1514	mask =  ntohl( inet_make_mask(plen) );
1515
1516	if(key & ~mask)
1517		return -EINVAL;
1518
1519	key = key & mask;
1520	l = fib_find_node(t, key);
1521
1522	if(!l)
1523		return -ESRCH;
1524
1525	fa_head = get_fa_head(l, plen);
1526	fa = fib_find_alias(fa_head, tos, 0);
1527
1528	if (!fa)
1529		return -ESRCH;
1530
1531	if (trie_debug)
1532		printk("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1533
1534	fa_to_delete = NULL;
1535	fa_head = fa->fa_list.prev;
1536	list_for_each_entry(fa, fa_head, fa_list) {
1537		struct fib_info *fi = fa->fa_info;
1538
1539		if (fa->fa_tos != tos)
1540			break;
1541
1542		if ((!r->rtm_type ||
1543		     fa->fa_type == r->rtm_type) &&
1544		    (r->rtm_scope == RT_SCOPE_NOWHERE ||
1545		     fa->fa_scope == r->rtm_scope) &&
1546		    (!r->rtm_protocol ||
1547		     fi->fib_protocol == r->rtm_protocol) &&
1548		    fib_nh_match(r, nlhdr, rta, fi) == 0) {
1549			fa_to_delete = fa;
1550			break;
1551		}
1552	}
1553
1554	if (fa_to_delete) {
1555		int kill_li = 0;
1556		struct leaf_info *li;
1557
1558		fa = fa_to_delete;
1559		rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req);
1560
1561		l = fib_find_node(t, key);
1562		li = find_leaf_info(&l->list, plen);
1563
1564		write_lock_bh(&fib_lock);
1565
1566		list_del(&fa->fa_list);
1567
1568		if(list_empty(fa_head)) {
1569			hlist_del(&li->hlist);
1570			kill_li = 1;
1571		}
1572		write_unlock_bh(&fib_lock);
1573
1574		if(kill_li)
1575			free_leaf_info(li);
1576
1577		if(hlist_empty(&l->list))
1578			trie_leaf_remove(t, key);
1579
1580		if (fa->fa_state & FA_S_ACCESSED)
1581			rt_cache_flush(-1);
1582
1583		fn_free_alias(fa);
1584		return 0;
1585	}
1586	return -ESRCH;
1587}
1588
1589static int trie_flush_list(struct trie *t, struct list_head *head)
1590{
1591	struct fib_alias *fa, *fa_node;
1592	int found = 0;
1593
1594	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1595		struct fib_info *fi = fa->fa_info;
1596
1597		if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
1598
1599 			write_lock_bh(&fib_lock);
1600			list_del(&fa->fa_list);
1601			write_unlock_bh(&fib_lock);
1602
1603			fn_free_alias(fa);
1604			found++;
1605		}
1606	}
1607	return found;
1608}
1609
1610static int trie_flush_leaf(struct trie *t, struct leaf *l)
1611{
1612	int found = 0;
1613	struct hlist_head *lih = &l->list;
1614	struct hlist_node *node, *tmp;
1615	struct leaf_info *li = NULL;
1616
1617	hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1618
1619		found += trie_flush_list(t, &li->falh);
1620
1621		if (list_empty(&li->falh)) {
1622
1623 			write_lock_bh(&fib_lock);
1624			hlist_del(&li->hlist);
1625			write_unlock_bh(&fib_lock);
1626
1627			free_leaf_info(li);
1628		}
1629	}
1630	return found;
1631}
1632
1633static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1634{
1635	struct node *c = (struct node *) thisleaf;
1636	struct tnode *p;
1637	int idx;
1638
1639	if(c == NULL) {
1640		if(t->trie == NULL)
1641			return NULL;
1642
1643		if (IS_LEAF(t->trie))          /* trie w. just a leaf */
1644			return (struct leaf *) t->trie;
1645
1646		p = (struct tnode*) t->trie;  /* Start */
1647	}
1648	else
1649		p = (struct tnode *) NODE_PARENT(c);
1650	while (p) {
1651		int pos, last;
1652
1653		/*  Find the next child of the parent */
1654		if(c)
1655			pos  = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
1656		else
1657			pos = 0;
1658
1659		last = 1 << p->bits;
1660		for(idx = pos; idx < last ; idx++) {
1661			if( p->child[idx]) {
1662
1663				/* Decend if tnode */
1664
1665				while (IS_TNODE(p->child[idx])) {
1666					p = (struct tnode*) p->child[idx];
1667					idx = 0;
1668
1669					/* Rightmost non-NULL branch */
1670					if( p && IS_TNODE(p) )
1671						while ( p->child[idx] == NULL && idx < (1 << p->bits) ) idx++;
1672
1673					/* Done with this tnode? */
1674					if( idx >= (1 << p->bits) || p->child[idx] == NULL )
1675						goto up;
1676				}
1677				return (struct leaf*) p->child[idx];
1678			}
1679		}
1680up:
1681		/* No more children go up one step  */
1682		c = (struct node*) p;
1683		p = (struct tnode *) NODE_PARENT(p);
1684	}
1685	return NULL; /* Ready. Root of trie */
1686}
1687
1688static int fn_trie_flush(struct fib_table *tb)
1689{
1690	struct trie *t = (struct trie *) tb->tb_data;
1691	struct leaf *ll = NULL, *l = NULL;
1692	int found = 0, h;
1693
1694	t->revision++;
1695
1696	for (h=0; (l = nextleaf(t, l)) != NULL; h++) {
1697		found += trie_flush_leaf(t, l);
1698
1699		if (ll && hlist_empty(&ll->list))
1700			trie_leaf_remove(t, ll->key);
1701		ll = l;
1702	}
1703
1704	if (ll && hlist_empty(&ll->list))
1705		trie_leaf_remove(t, ll->key);
1706
1707	if(trie_debug)
1708		printk("trie_flush found=%d\n", found);
1709	return found;
1710}
1711
1712static int trie_last_dflt=-1;
1713
1714static void
1715fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1716{
1717	struct trie *t = (struct trie *) tb->tb_data;
1718	int order, last_idx;
1719	struct fib_info *fi = NULL;
1720	struct fib_info *last_resort;
1721	struct fib_alias *fa = NULL;
1722	struct list_head *fa_head;
1723	struct leaf *l;
1724
1725	last_idx = -1;
1726	last_resort = NULL;
1727	order = -1;
1728
1729	read_lock(&fib_lock);
1730
1731	l = fib_find_node(t, 0);
1732	if(!l)
1733		goto out;
1734
1735	fa_head = get_fa_head(l, 0);
1736	if(!fa_head)
1737		goto out;
1738
1739	if (list_empty(fa_head))
1740		goto out;
1741
1742	list_for_each_entry(fa, fa_head, fa_list) {
1743		struct fib_info *next_fi = fa->fa_info;
1744
1745		if (fa->fa_scope != res->scope ||
1746		    fa->fa_type != RTN_UNICAST)
1747			continue;
1748
1749		if (next_fi->fib_priority > res->fi->fib_priority)
1750			break;
1751		if (!next_fi->fib_nh[0].nh_gw ||
1752		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1753			continue;
1754		fa->fa_state |= FA_S_ACCESSED;
1755
1756		if (fi == NULL) {
1757			if (next_fi != res->fi)
1758				break;
1759		} else if (!fib_detect_death(fi, order, &last_resort,
1760					     &last_idx, &trie_last_dflt)) {
1761			if (res->fi)
1762				fib_info_put(res->fi);
1763			res->fi = fi;
1764			atomic_inc(&fi->fib_clntref);
1765			trie_last_dflt = order;
1766			goto out;
1767		}
1768		fi = next_fi;
1769		order++;
1770	}
1771	if (order <= 0 || fi == NULL) {
1772		trie_last_dflt = -1;
1773		goto out;
1774	}
1775
1776	if (!fib_detect_death(fi, order, &last_resort, &last_idx, &trie_last_dflt)) {
1777		if (res->fi)
1778			fib_info_put(res->fi);
1779		res->fi = fi;
1780		atomic_inc(&fi->fib_clntref);
1781		trie_last_dflt = order;
1782		goto out;
1783	}
1784	if (last_idx >= 0) {
1785		if (res->fi)
1786			fib_info_put(res->fi);
1787		res->fi = last_resort;
1788		if (last_resort)
1789			atomic_inc(&last_resort->fib_clntref);
1790	}
1791	trie_last_dflt = last_idx;
1792 out:;
1793	read_unlock(&fib_lock);
1794}
1795
1796static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
1797			   struct sk_buff *skb, struct netlink_callback *cb)
1798{
1799	int i, s_i;
1800	struct fib_alias *fa;
1801
1802	u32 xkey=htonl(key);
1803
1804	s_i=cb->args[3];
1805	i = 0;
1806
1807	list_for_each_entry(fa, fah, fa_list) {
1808		if (i < s_i) {
1809			i++;
1810			continue;
1811		}
1812		if (fa->fa_info->fib_nh == NULL) {
1813			printk("Trie error _fib_nh=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
1814			i++;
1815			continue;
1816		}
1817		if (fa->fa_info == NULL) {
1818			printk("Trie error fa_info=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
1819			i++;
1820			continue;
1821		}
1822
1823		if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1824				  cb->nlh->nlmsg_seq,
1825				  RTM_NEWROUTE,
1826				  tb->tb_id,
1827				  fa->fa_type,
1828				  fa->fa_scope,
1829				  &xkey,
1830				  plen,
1831				  fa->fa_tos,
1832				  fa->fa_info, 0) < 0) {
1833			cb->args[3] = i;
1834			return -1;
1835			}
1836		i++;
1837	}
1838	cb->args[3]=i;
1839	return skb->len;
1840}
1841
1842static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
1843			     struct netlink_callback *cb)
1844{
1845	int h, s_h;
1846	struct list_head *fa_head;
1847	struct leaf *l = NULL;
1848	s_h=cb->args[2];
1849
1850	for (h=0; (l = nextleaf(t, l)) != NULL; h++) {
1851
1852		if (h < s_h)
1853			continue;
1854		if (h > s_h)
1855			memset(&cb->args[3], 0,
1856			       sizeof(cb->args) - 3*sizeof(cb->args[0]));
1857
1858		fa_head = get_fa_head(l, plen);
1859
1860		if(!fa_head)
1861			continue;
1862
1863		if(list_empty(fa_head))
1864			continue;
1865
1866		if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
1867			cb->args[2]=h;
1868			return -1;
1869		}
1870	}
1871	cb->args[2]=h;
1872	return skb->len;
1873}
1874
1875static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
1876{
1877	int m, s_m;
1878	struct trie *t = (struct trie *) tb->tb_data;
1879
1880	s_m = cb->args[1];
1881
1882	read_lock(&fib_lock);
1883	for (m=0; m<=32; m++) {
1884
1885		if (m < s_m)
1886			continue;
1887		if (m > s_m)
1888			memset(&cb->args[2], 0,
1889			       sizeof(cb->args) - 2*sizeof(cb->args[0]));
1890
1891		if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
1892			cb->args[1] = m;
1893			goto out;
1894		}
1895	}
1896	read_unlock(&fib_lock);
1897	cb->args[1] = m;
1898	return skb->len;
1899 out:
1900	read_unlock(&fib_lock);
1901	return -1;
1902}
1903
1904/* Fix more generic FIB names for init later */
1905
1906#ifdef CONFIG_IP_MULTIPLE_TABLES
1907struct fib_table * fib_hash_init(int id)
1908#else
1909struct fib_table * __init fib_hash_init(int id)
1910#endif
1911{
1912	struct fib_table *tb;
1913	struct trie *t;
1914
1915	if (fn_alias_kmem == NULL)
1916		fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1917						  sizeof(struct fib_alias),
1918						  0, SLAB_HWCACHE_ALIGN,
1919						  NULL, NULL);
1920
1921	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1922		     GFP_KERNEL);
1923	if (tb == NULL)
1924		return NULL;
1925
1926	tb->tb_id = id;
1927	tb->tb_lookup = fn_trie_lookup;
1928	tb->tb_insert = fn_trie_insert;
1929	tb->tb_delete = fn_trie_delete;
1930	tb->tb_flush = fn_trie_flush;
1931	tb->tb_select_default = fn_trie_select_default;
1932	tb->tb_dump = fn_trie_dump;
1933	memset(tb->tb_data, 0, sizeof(struct trie));
1934
1935	t = (struct trie *) tb->tb_data;
1936
1937	trie_init(t);
1938
1939	if (id == RT_TABLE_LOCAL)
1940                trie_local=t;
1941	  else if (id == RT_TABLE_MAIN)
1942                trie_main=t;
1943
1944	if (id == RT_TABLE_LOCAL)
1945		printk("IPv4 FIB: Using LC-trie version %s\n", VERSION);
1946
1947	return tb;
1948}
1949
1950/* Trie dump functions */
1951
1952static void putspace_seq(struct seq_file *seq, int n)
1953{
1954	while (n--) seq_printf(seq, " ");
1955}
1956
1957static void printbin_seq(struct seq_file *seq, unsigned int v, int bits)
1958{
1959	while (bits--)
1960		seq_printf(seq, "%s", (v & (1<<bits))?"1":"0");
1961}
1962
1963static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
1964		   int pend, int cindex, int bits)
1965{
1966	putspace_seq(seq, indent);
1967	if (IS_LEAF(n))
1968		seq_printf(seq, "|");
1969	else
1970		seq_printf(seq, "+");
1971	if (bits) {
1972		seq_printf(seq, "%d/", cindex);
1973		printbin_seq(seq, cindex, bits);
1974		seq_printf(seq, ": ");
1975	}
1976	else
1977		seq_printf(seq, "<root>: ");
1978	seq_printf(seq, "%s:%p ", IS_LEAF(n)?"Leaf":"Internal node", n);
1979
1980	if (IS_LEAF(n))
1981		seq_printf(seq, "key=%d.%d.%d.%d\n",
1982			   n->key >> 24, (n->key >> 16) % 256, (n->key >> 8) % 256, n->key % 256);
1983	else {
1984		int plen=((struct tnode *)n)->pos;
1985		t_key prf=MASK_PFX(n->key, plen);
1986		seq_printf(seq, "key=%d.%d.%d.%d/%d\n",
1987			   prf >> 24, (prf >> 16) % 256, (prf >> 8) % 256, prf % 256, plen);
1988	}
1989	if (IS_LEAF(n)) {
1990		struct leaf *l=(struct leaf *)n;
1991		struct fib_alias *fa;
1992		int i;
1993		for (i=32; i>=0; i--)
1994		  if(find_leaf_info(&l->list, i)) {
1995
1996				struct list_head *fa_head = get_fa_head(l, i);
1997
1998				if(!fa_head)
1999					continue;
2000
2001				if(list_empty(fa_head))
2002					continue;
2003
2004				putspace_seq(seq, indent+2);
2005				seq_printf(seq, "{/%d...dumping}\n", i);
2006
2007
2008				list_for_each_entry(fa, fa_head, fa_list) {
2009					putspace_seq(seq, indent+2);
2010					if (fa->fa_info->fib_nh == NULL) {
2011						seq_printf(seq, "Error _fib_nh=NULL\n");
2012						continue;
2013					}
2014					if (fa->fa_info == NULL) {
2015						seq_printf(seq, "Error fa_info=NULL\n");
2016						continue;
2017					}
2018
2019					seq_printf(seq, "{type=%d scope=%d TOS=%d}\n",
2020					      fa->fa_type,
2021					      fa->fa_scope,
2022					      fa->fa_tos);
2023				}
2024			}
2025	}
2026	else if (IS_TNODE(n)) {
2027		struct tnode *tn=(struct tnode *)n;
2028		putspace_seq(seq, indent); seq_printf(seq, "|    ");
2029		seq_printf(seq, "{key prefix=%08x/", tn->key&TKEY_GET_MASK(0, tn->pos));
2030		printbin_seq(seq, tkey_extract_bits(tn->key, 0, tn->pos), tn->pos);
2031		seq_printf(seq, "}\n");
2032		putspace_seq(seq, indent); seq_printf(seq, "|    ");
2033		seq_printf(seq, "{pos=%d", tn->pos);
2034		seq_printf(seq, " (skip=%d bits)", tn->pos - pend);
2035		seq_printf(seq, " bits=%d (%u children)}\n", tn->bits, (1 << tn->bits));
2036		putspace_seq(seq, indent); seq_printf(seq, "|    ");
2037		seq_printf(seq, "{empty=%d full=%d}\n", tn->empty_children, tn->full_children);
2038	}
2039}
2040
2041static void trie_dump_seq(struct seq_file *seq, struct trie *t)
2042{
2043	struct node *n=t->trie;
2044	int cindex=0;
2045	int indent=1;
2046	int pend=0;
2047	int depth = 0;
2048
2049  	read_lock(&fib_lock);
2050
2051	seq_printf(seq, "------ trie_dump of t=%p ------\n", t);
2052	if (n) {
2053		printnode_seq(seq, indent, n, pend, cindex, 0);
2054		if (IS_TNODE(n)) {
2055			struct tnode *tn=(struct tnode *)n;
2056			pend = tn->pos+tn->bits;
2057			putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
2058			indent += 3;
2059			depth++;
2060
2061			while (tn && cindex < (1 << tn->bits)) {
2062				if (tn->child[cindex]) {
2063
2064					/* Got a child */
2065
2066					printnode_seq(seq, indent, tn->child[cindex], pend, cindex, tn->bits);
2067					if (IS_LEAF(tn->child[cindex])) {
2068						cindex++;
2069
2070					}
2071					else {
2072						/*
2073						 * New tnode. Decend one level
2074						 */
2075
2076						depth++;
2077						n=tn->child[cindex];
2078						tn=(struct tnode *)n;
2079						pend=tn->pos+tn->bits;
2080						putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
2081						indent+=3;
2082						cindex=0;
2083					}
2084				}
2085				else
2086					cindex++;
2087
2088				/*
2089				 * Test if we are done
2090				 */
2091
2092				while (cindex >= (1 << tn->bits)) {
2093
2094					/*
2095					 * Move upwards and test for root
2096					 * pop off all traversed  nodes
2097					 */
2098
2099					if (NODE_PARENT(tn) == NULL) {
2100						tn = NULL;
2101						n = NULL;
2102						break;
2103					}
2104					else {
2105						cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2106						tn = NODE_PARENT(tn);
2107						cindex++;
2108						n=(struct node *)tn;
2109						pend=tn->pos+tn->bits;
2110						indent-=3;
2111						depth--;
2112					}
2113				}
2114			}
2115		}
2116		else n = NULL;
2117	}
2118	else seq_printf(seq, "------ trie is empty\n");
2119
2120  	read_unlock(&fib_lock);
2121}
2122
2123static struct trie_stat *trie_stat_new(void)
2124{
2125	struct trie_stat *s = kmalloc(sizeof(struct trie_stat), GFP_KERNEL);
2126	int i;
2127
2128	if(s) {
2129		s->totdepth = 0;
2130		s->maxdepth = 0;
2131		s->tnodes = 0;
2132		s->leaves = 0;
2133		s->nullpointers = 0;
2134
2135		for(i=0; i< MAX_CHILDS; i++)
2136			s->nodesizes[i] = 0;
2137	}
2138	return s;
2139}
2140
2141static struct trie_stat *trie_collect_stats(struct trie *t)
2142{
2143	struct node *n=t->trie;
2144	struct trie_stat *s = trie_stat_new();
2145	int cindex = 0;
2146	int indent = 1;
2147	int pend = 0;
2148	int depth = 0;
2149
2150	read_lock(&fib_lock);
2151
2152	if (s) {
2153		if (n) {
2154			if (IS_TNODE(n)) {
2155				struct tnode *tn = (struct tnode *)n;
2156				pend=tn->pos+tn->bits;
2157				indent += 3;
2158				s->nodesizes[tn->bits]++;
2159				depth++;
2160
2161				while (tn && cindex < (1 << tn->bits)) {
2162					if (tn->child[cindex]) {
2163						/* Got a child */
2164
2165						if (IS_LEAF(tn->child[cindex])) {
2166							cindex++;
2167
2168							/* stats */
2169							if (depth > s->maxdepth)
2170								s->maxdepth = depth;
2171							s->totdepth += depth;
2172							s->leaves++;
2173						}
2174
2175						else {
2176							/*
2177							 * New tnode. Decend one level
2178							 */
2179
2180							s->tnodes++;
2181							s->nodesizes[tn->bits]++;
2182							depth++;
2183
2184							n = tn->child[cindex];
2185							tn = (struct tnode *)n;
2186							pend = tn->pos+tn->bits;
2187
2188							indent += 3;
2189							cindex = 0;
2190						}
2191					}
2192					else {
2193						cindex++;
2194						s->nullpointers++;
2195					}
2196
2197					/*
2198					 * Test if we are done
2199					 */
2200
2201					while (cindex >= (1 << tn->bits)) {
2202
2203						/*
2204						 * Move upwards and test for root
2205						 * pop off all traversed  nodes
2206						 */
2207
2208
2209						if (NODE_PARENT(tn) == NULL) {
2210							tn = NULL;
2211							n = NULL;
2212							break;
2213						}
2214						else {
2215							cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2216							tn = NODE_PARENT(tn);
2217							cindex++;
2218							n = (struct node *)tn;
2219							pend=tn->pos+tn->bits;
2220							indent -= 3;
2221							depth--;
2222						}
2223 					}
2224				}
2225			}
2226			else n = NULL;
2227		}
2228	}
2229
2230	read_unlock(&fib_lock);
2231	return s;
2232}
2233
2234#ifdef CONFIG_PROC_FS
2235
2236static struct fib_alias *fib_triestat_get_first(struct seq_file *seq)
2237{
2238	return NULL;
2239}
2240
2241static struct fib_alias *fib_triestat_get_next(struct seq_file *seq)
2242{
2243	return NULL;
2244}
2245
2246static void *fib_triestat_seq_start(struct seq_file *seq, loff_t *pos)
2247{
2248	void *v = NULL;
2249
2250	if (ip_fib_main_table)
2251		v = *pos ? fib_triestat_get_next(seq) : SEQ_START_TOKEN;
2252	return v;
2253}
2254
2255static void *fib_triestat_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2256{
2257	++*pos;
2258	return v == SEQ_START_TOKEN ? fib_triestat_get_first(seq) : fib_triestat_get_next(seq);
2259}
2260
2261static void fib_triestat_seq_stop(struct seq_file *seq, void *v)
2262{
2263
2264}
2265
2266/*
2267 *	This outputs /proc/net/fib_triestats
2268 *
2269 *	It always works in backward compatibility mode.
2270 *	The format of the file is not supposed to be changed.
2271 */
2272
2273static void collect_and_show(struct trie *t, struct seq_file *seq)
2274{
2275	int bytes = 0; /* How many bytes are used, a ref is 4 bytes */
2276	int i, max, pointers;
2277        struct trie_stat *stat;
2278	int avdepth;
2279
2280	stat = trie_collect_stats(t);
2281
2282	bytes=0;
2283	seq_printf(seq, "trie=%p\n", t);
2284
2285	if (stat) {
2286		if (stat->leaves)
2287			avdepth=stat->totdepth*100 / stat->leaves;
2288		else
2289			avdepth=0;
2290		seq_printf(seq, "Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100 );
2291		seq_printf(seq, "Max depth: %4d\n", stat->maxdepth);
2292
2293		seq_printf(seq, "Leaves: %d\n", stat->leaves);
2294		bytes += sizeof(struct leaf) * stat->leaves;
2295		seq_printf(seq, "Internal nodes: %d\n", stat->tnodes);
2296		bytes += sizeof(struct tnode) * stat->tnodes;
2297
2298		max = MAX_CHILDS-1;
2299
2300		while (max >= 0 && stat->nodesizes[max] == 0)
2301			max--;
2302		pointers = 0;
2303
2304		for (i = 1; i <= max; i++)
2305			if (stat->nodesizes[i] != 0) {
2306				seq_printf(seq, "  %d: %d",  i, stat->nodesizes[i]);
2307				pointers += (1<<i) * stat->nodesizes[i];
2308			}
2309		seq_printf(seq, "\n");
2310		seq_printf(seq, "Pointers: %d\n", pointers);
2311		bytes += sizeof(struct node *) * pointers;
2312		seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers);
2313		seq_printf(seq, "Total size: %d  kB\n", bytes / 1024);
2314
2315		kfree(stat);
2316	}
2317
2318#ifdef CONFIG_IP_FIB_TRIE_STATS
2319	seq_printf(seq, "Counters:\n---------\n");
2320	seq_printf(seq,"gets = %d\n", t->stats.gets);
2321	seq_printf(seq,"backtracks = %d\n", t->stats.backtrack);
2322	seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed);
2323	seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss);
2324	seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit);
2325#ifdef CLEAR_STATS
2326	memset(&(t->stats), 0, sizeof(t->stats));
2327#endif
2328#endif /*  CONFIG_IP_FIB_TRIE_STATS */
2329}
2330
2331static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2332{
2333	char bf[128];
2334
2335	if (v == SEQ_START_TOKEN) {
2336		seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
2337			   sizeof(struct leaf), sizeof(struct tnode));
2338		if (trie_local)
2339			collect_and_show(trie_local, seq);
2340
2341		if (trie_main)
2342			collect_and_show(trie_main, seq);
2343	}
2344	else {
2345		snprintf(bf, sizeof(bf),
2346			 "*\t%08X\t%08X", 200, 400);
2347
2348		seq_printf(seq, "%-127s\n", bf);
2349	}
2350	return 0;
2351}
2352
2353static struct seq_operations fib_triestat_seq_ops = {
2354	.start  = fib_triestat_seq_start,
2355	.next   = fib_triestat_seq_next,
2356	.stop   = fib_triestat_seq_stop,
2357	.show   = fib_triestat_seq_show,
2358};
2359
2360static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2361{
2362	struct seq_file *seq;
2363	int rc = -ENOMEM;
2364
2365	rc = seq_open(file, &fib_triestat_seq_ops);
2366	if (rc)
2367		goto out_kfree;
2368
2369	seq	     = file->private_data;
2370out:
2371	return rc;
2372out_kfree:
2373	goto out;
2374}
2375
2376static struct file_operations fib_triestat_seq_fops = {
2377	.owner		= THIS_MODULE,
2378	.open           = fib_triestat_seq_open,
2379	.read           = seq_read,
2380	.llseek         = seq_lseek,
2381	.release	= seq_release_private,
2382};
2383
2384int __init fib_stat_proc_init(void)
2385{
2386	if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_seq_fops))
2387		return -ENOMEM;
2388	return 0;
2389}
2390
2391void __init fib_stat_proc_exit(void)
2392{
2393	proc_net_remove("fib_triestat");
2394}
2395
2396static struct fib_alias *fib_trie_get_first(struct seq_file *seq)
2397{
2398	return NULL;
2399}
2400
2401static struct fib_alias *fib_trie_get_next(struct seq_file *seq)
2402{
2403	return NULL;
2404}
2405
2406static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2407{
2408	void *v = NULL;
2409
2410	if (ip_fib_main_table)
2411		v = *pos ? fib_trie_get_next(seq) : SEQ_START_TOKEN;
2412	return v;
2413}
2414
2415static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2416{
2417	++*pos;
2418	return v == SEQ_START_TOKEN ? fib_trie_get_first(seq) : fib_trie_get_next(seq);
2419}
2420
2421static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2422{
2423
2424}
2425
2426/*
2427 *	This outputs /proc/net/fib_trie.
2428 *
2429 *	It always works in backward compatibility mode.
2430 *	The format of the file is not supposed to be changed.
2431 */
2432
2433static int fib_trie_seq_show(struct seq_file *seq, void *v)
2434{
2435	char bf[128];
2436
2437	if (v == SEQ_START_TOKEN) {
2438		if (trie_local)
2439			trie_dump_seq(seq, trie_local);
2440
2441		if (trie_main)
2442			trie_dump_seq(seq, trie_main);
2443	}
2444
2445	else {
2446		snprintf(bf, sizeof(bf),
2447			 "*\t%08X\t%08X", 200, 400);
2448		seq_printf(seq, "%-127s\n", bf);
2449	}
2450
2451	return 0;
2452}
2453
2454static struct seq_operations fib_trie_seq_ops = {
2455	.start  = fib_trie_seq_start,
2456	.next   = fib_trie_seq_next,
2457	.stop   = fib_trie_seq_stop,
2458	.show   = fib_trie_seq_show,
2459};
2460
2461static int fib_trie_seq_open(struct inode *inode, struct file *file)
2462{
2463	struct seq_file *seq;
2464	int rc = -ENOMEM;
2465
2466	rc = seq_open(file, &fib_trie_seq_ops);
2467	if (rc)
2468		goto out_kfree;
2469
2470	seq	     = file->private_data;
2471out:
2472	return rc;
2473out_kfree:
2474	goto out;
2475}
2476
2477static struct file_operations fib_trie_seq_fops = {
2478	.owner		= THIS_MODULE,
2479	.open           = fib_trie_seq_open,
2480	.read           = seq_read,
2481	.llseek         = seq_lseek,
2482	.release	= seq_release_private,
2483};
2484
2485int __init fib_proc_init(void)
2486{
2487	if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_seq_fops))
2488		return -ENOMEM;
2489	return 0;
2490}
2491
2492void __init fib_proc_exit(void)
2493{
2494	proc_net_remove("fib_trie");
2495}
2496
2497#endif /* CONFIG_PROC_FS */
2498