fib_trie.c revision bb435b8d816582064ee0ddb1e2a6fbca67f34108
1/*
2 *   This program is free software; you can redistribute it and/or
3 *   modify it under the terms of the GNU General Public License
4 *   as published by the Free Software Foundation; either version
5 *   2 of the License, or (at your option) any later version.
6 *
7 *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 *     & Swedish University of Agricultural Sciences.
9 *
10 *   Jens Laas <jens.laas@data.slu.se> Swedish University of
11 *     Agricultural Sciences.
12 *
13 *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
14 *
15 * This work is based on the LPC-trie which is originally descibed in:
16 *
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
25 * Version:	$Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
26 *
27 *
28 * Code from fib_hash has been reused which includes the following header:
29 *
30 *
31 * INET		An implementation of the TCP/IP protocol suite for the LINUX
32 *		operating system.  INET is implemented using the  BSD Socket
33 *		interface as the means of communication with the user level.
34 *
35 *		IPv4 FIB: lookup engine and maintenance routines.
36 *
37 *
38 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
39 *
40 *		This program is free software; you can redistribute it and/or
41 *		modify it under the terms of the GNU General Public License
42 *		as published by the Free Software Foundation; either version
43 *		2 of the License, or (at your option) any later version.
44 */
45
46#define VERSION "0.325"
47
48#include <linux/config.h>
49#include <asm/uaccess.h>
50#include <asm/system.h>
51#include <asm/bitops.h>
52#include <linux/types.h>
53#include <linux/kernel.h>
54#include <linux/sched.h>
55#include <linux/mm.h>
56#include <linux/string.h>
57#include <linux/socket.h>
58#include <linux/sockios.h>
59#include <linux/errno.h>
60#include <linux/in.h>
61#include <linux/inet.h>
62#include <linux/netdevice.h>
63#include <linux/if_arp.h>
64#include <linux/proc_fs.h>
65#include <linux/skbuff.h>
66#include <linux/netlink.h>
67#include <linux/init.h>
68#include <linux/list.h>
69#include <net/ip.h>
70#include <net/protocol.h>
71#include <net/route.h>
72#include <net/tcp.h>
73#include <net/sock.h>
74#include <net/ip_fib.h>
75#include "fib_lookup.h"
76
77#undef CONFIG_IP_FIB_TRIE_STATS
78#define MAX_CHILDS 16384
79
80#define KEYLENGTH (8*sizeof(t_key))
81#define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l))
82#define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset))
83
84static DEFINE_RWLOCK(fib_lock);
85
86typedef unsigned int t_key;
87
88#define T_TNODE 0
89#define T_LEAF  1
90#define NODE_TYPE_MASK	0x1UL
91#define NODE_PARENT(node) \
92	((struct tnode *)((node)->parent & ~NODE_TYPE_MASK))
93#define NODE_SET_PARENT(node, ptr) \
94	((node)->parent = (((unsigned long)(ptr)) | \
95                     ((node)->parent & NODE_TYPE_MASK)))
96#define NODE_INIT_PARENT(node, type) \
97	((node)->parent = (type))
98#define NODE_TYPE(node) \
99	((node)->parent & NODE_TYPE_MASK)
100
101#define IS_TNODE(n) (!(n->parent & T_LEAF))
102#define IS_LEAF(n) (n->parent & T_LEAF)
103
104struct node {
105	t_key key;
106	unsigned long parent;
107};
108
109struct leaf {
110	t_key key;
111	unsigned long parent;
112	struct hlist_head list;
113};
114
115struct leaf_info {
116	struct hlist_node hlist;
117	int plen;
118	struct list_head falh;
119};
120
121struct tnode {
122	t_key key;
123	unsigned long parent;
124	unsigned short pos:5;		/* 2log(KEYLENGTH) bits needed */
125	unsigned short bits:5;		/* 2log(KEYLENGTH) bits needed */
126	unsigned short full_children;	/* KEYLENGTH bits needed */
127	unsigned short empty_children;	/* KEYLENGTH bits needed */
128	struct node *child[0];
129};
130
131#ifdef CONFIG_IP_FIB_TRIE_STATS
132struct trie_use_stats {
133	unsigned int gets;
134	unsigned int backtrack;
135	unsigned int semantic_match_passed;
136	unsigned int semantic_match_miss;
137	unsigned int null_node_hit;
138	unsigned int resize_node_skipped;
139};
140#endif
141
142struct trie_stat {
143	unsigned int totdepth;
144	unsigned int maxdepth;
145	unsigned int tnodes;
146	unsigned int leaves;
147	unsigned int nullpointers;
148	unsigned int nodesizes[MAX_CHILDS];
149};
150
151struct trie {
152	struct node *trie;
153#ifdef CONFIG_IP_FIB_TRIE_STATS
154	struct trie_use_stats stats;
155#endif
156	int size;
157	unsigned int revision;
158};
159
160static int trie_debug = 0;
161
162#define DBG(x...) do { if (trie_debug) printk(x); } while (0)
163
164static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
165static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
166static struct node *resize(struct trie *t, struct tnode *tn);
167static struct tnode *inflate(struct trie *t, struct tnode *tn);
168static struct tnode *halve(struct trie *t, struct tnode *tn);
169static void tnode_free(struct tnode *tn);
170static void trie_dump_seq(struct seq_file *seq, struct trie *t);
171extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio);
172extern int fib_detect_death(struct fib_info *fi, int order,
173			    struct fib_info **last_resort, int *last_idx, int *dflt);
174
175extern void rtmsg_fib(int event, u32 key, struct fib_alias *fa, int z, int tb_id,
176		      struct nlmsghdr *n, struct netlink_skb_parms *req);
177
178static kmem_cache_t *fn_alias_kmem;
179static struct trie *trie_local = NULL, *trie_main = NULL;
180
181static inline struct node *tnode_get_child(struct tnode *tn, int i)
182{
183	BUG_ON(i >= 1 << tn->bits);
184
185	return tn->child[i];
186}
187
188static inline int tnode_child_length(const struct tnode *tn)
189{
190	return 1 << tn->bits;
191}
192
193static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
194{
195	if (offset < KEYLENGTH)
196		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
197	else
198		return 0;
199}
200
201static inline int tkey_equals(t_key a, t_key b)
202{
203	return a == b;
204}
205
206static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
207{
208	if (bits == 0 || offset >= KEYLENGTH)
209		return 1;
210	bits = bits > KEYLENGTH ? KEYLENGTH : bits;
211	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
212}
213
214static inline int tkey_mismatch(t_key a, int offset, t_key b)
215{
216	t_key diff = a ^ b;
217	int i = offset;
218
219	if (!diff)
220		return 0;
221	while ((diff << i) >> (KEYLENGTH-1) == 0)
222		i++;
223	return i;
224}
225
226/* Candidate for fib_semantics */
227
228static void fn_free_alias(struct fib_alias *fa)
229{
230	fib_release_info(fa->fa_info);
231	kmem_cache_free(fn_alias_kmem, fa);
232}
233
234/*
235  To understand this stuff, an understanding of keys and all their bits is
236  necessary. Every node in the trie has a key associated with it, but not
237  all of the bits in that key are significant.
238
239  Consider a node 'n' and its parent 'tp'.
240
241  If n is a leaf, every bit in its key is significant. Its presence is
242  necessitaded by path compression, since during a tree traversal (when
243  searching for a leaf - unless we are doing an insertion) we will completely
244  ignore all skipped bits we encounter. Thus we need to verify, at the end of
245  a potentially successful search, that we have indeed been walking the
246  correct key path.
247
248  Note that we can never "miss" the correct key in the tree if present by
249  following the wrong path. Path compression ensures that segments of the key
250  that are the same for all keys with a given prefix are skipped, but the
251  skipped part *is* identical for each node in the subtrie below the skipped
252  bit! trie_insert() in this implementation takes care of that - note the
253  call to tkey_sub_equals() in trie_insert().
254
255  if n is an internal node - a 'tnode' here, the various parts of its key
256  have many different meanings.
257
258  Example:
259  _________________________________________________________________
260  | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
261  -----------------------------------------------------------------
262    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
263
264  _________________________________________________________________
265  | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
266  -----------------------------------------------------------------
267   16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
268
269  tp->pos = 7
270  tp->bits = 3
271  n->pos = 15
272  n->bits = 4
273
274  First, let's just ignore the bits that come before the parent tp, that is
275  the bits from 0 to (tp->pos-1). They are *known* but at this point we do
276  not use them for anything.
277
278  The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
279  index into the parent's child array. That is, they will be used to find
280  'n' among tp's children.
281
282  The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
283  for the node n.
284
285  All the bits we have seen so far are significant to the node n. The rest
286  of the bits are really not needed or indeed known in n->key.
287
288  The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
289  n's child array, and will of course be different for each child.
290
291
292  The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
293  at this point.
294
295*/
296
297static void check_tnode(struct tnode *tn)
298{
299	if (tn && tn->pos+tn->bits > 32) {
300		printk("TNODE ERROR tn=%p, pos=%d, bits=%d\n", tn, tn->pos, tn->bits);
301	}
302}
303
304static int halve_threshold = 25;
305static int inflate_threshold = 50;
306
307static struct leaf *leaf_new(void)
308{
309	struct leaf *l = kmalloc(sizeof(struct leaf),  GFP_KERNEL);
310	if (l) {
311		NODE_INIT_PARENT(l, T_LEAF);
312		INIT_HLIST_HEAD(&l->list);
313	}
314	return l;
315}
316
317static struct leaf_info *leaf_info_new(int plen)
318{
319	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
320
321	if (!li)
322		return NULL;
323
324	li->plen = plen;
325	INIT_LIST_HEAD(&li->falh);
326
327	return li;
328}
329
330static inline void free_leaf(struct leaf *l)
331{
332	kfree(l);
333}
334
335static inline void free_leaf_info(struct leaf_info *li)
336{
337	kfree(li);
338}
339
340static struct tnode *tnode_alloc(unsigned int size)
341{
342	if (size <= PAGE_SIZE) {
343		return kmalloc(size, GFP_KERNEL);
344	} else {
345		return (struct tnode *)
346			__get_free_pages(GFP_KERNEL, get_order(size));
347	}
348}
349
350static void __tnode_free(struct tnode *tn)
351{
352	unsigned int size = sizeof(struct tnode) +
353				(1 << tn->bits) * sizeof(struct node *);
354
355	if (size <= PAGE_SIZE)
356		kfree(tn);
357	else
358		free_pages((unsigned long)tn, get_order(size));
359}
360
361static struct tnode* tnode_new(t_key key, int pos, int bits)
362{
363	int nchildren = 1<<bits;
364	int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
365	struct tnode *tn = tnode_alloc(sz);
366
367	if (tn) {
368		memset(tn, 0, sz);
369		NODE_INIT_PARENT(tn, T_TNODE);
370		tn->pos = pos;
371		tn->bits = bits;
372		tn->key = key;
373		tn->full_children = 0;
374		tn->empty_children = 1<<bits;
375	}
376
377	DBG("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
378	       (unsigned int) (sizeof(struct node) * 1<<bits));
379	return tn;
380}
381
382static void tnode_free(struct tnode *tn)
383{
384	BUG_ON(!tn);
385
386	if (IS_LEAF(tn)) {
387		free_leaf((struct leaf *)tn);
388		DBG("FL %p \n", tn);
389	} else {
390		__tnode_free(tn);
391		DBG("FT %p \n", tn);
392	}
393}
394
395/*
396 * Check whether a tnode 'n' is "full", i.e. it is an internal node
397 * and no bits are skipped. See discussion in dyntree paper p. 6
398 */
399
400static inline int tnode_full(const struct tnode *tn, const struct node *n)
401{
402	if (n == NULL || IS_LEAF(n))
403		return 0;
404
405	return ((struct tnode *) n)->pos == tn->pos + tn->bits;
406}
407
408static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
409{
410	tnode_put_child_reorg(tn, i, n, -1);
411}
412
413 /*
414  * Add a child at position i overwriting the old value.
415  * Update the value of full_children and empty_children.
416  */
417
418static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
419{
420	struct node *chi;
421	int isfull;
422
423	if (i >= 1<<tn->bits) {
424		printk("bits=%d, i=%d\n", tn->bits, i);
425		BUG();
426	}
427	write_lock_bh(&fib_lock);
428	chi = tn->child[i];
429
430	/* update emptyChildren */
431	if (n == NULL && chi != NULL)
432		tn->empty_children++;
433	else if (n != NULL && chi == NULL)
434		tn->empty_children--;
435
436	/* update fullChildren */
437	if (wasfull == -1)
438		wasfull = tnode_full(tn, chi);
439
440	isfull = tnode_full(tn, n);
441	if (wasfull && !isfull)
442		tn->full_children--;
443	else if (!wasfull && isfull)
444		tn->full_children++;
445
446	if (n)
447		NODE_SET_PARENT(n, tn);
448
449	tn->child[i] = n;
450	write_unlock_bh(&fib_lock);
451}
452
453static struct node *resize(struct trie *t, struct tnode *tn)
454{
455	int i;
456	int err = 0;
457	struct tnode *old_tn;
458
459 	if (!tn)
460		return NULL;
461
462	DBG("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
463	      tn, inflate_threshold, halve_threshold);
464
465	/* No children */
466	if (tn->empty_children == tnode_child_length(tn)) {
467		tnode_free(tn);
468		return NULL;
469	}
470	/* One child */
471	if (tn->empty_children == tnode_child_length(tn) - 1)
472		for (i = 0; i < tnode_child_length(tn); i++) {
473			struct node *n;
474
475			write_lock_bh(&fib_lock);
476			n = tn->child[i];
477			if (!n) {
478				write_unlock_bh(&fib_lock);
479				continue;
480			}
481
482			/* compress one level */
483			NODE_INIT_PARENT(n, NODE_TYPE(n));
484
485			write_unlock_bh(&fib_lock);
486			tnode_free(tn);
487			return n;
488		}
489	/*
490	 * Double as long as the resulting node has a number of
491	 * nonempty nodes that are above the threshold.
492	 */
493
494	/*
495	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
496	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
497	 * Telecommunications, page 6:
498	 * "A node is doubled if the ratio of non-empty children to all
499	 * children in the *doubled* node is at least 'high'."
500	 *
501	 * 'high' in this instance is the variable 'inflate_threshold'. It
502	 * is expressed as a percentage, so we multiply it with
503	 * tnode_child_length() and instead of multiplying by 2 (since the
504	 * child array will be doubled by inflate()) and multiplying
505	 * the left-hand side by 100 (to handle the percentage thing) we
506	 * multiply the left-hand side by 50.
507	 *
508	 * The left-hand side may look a bit weird: tnode_child_length(tn)
509	 * - tn->empty_children is of course the number of non-null children
510	 * in the current node. tn->full_children is the number of "full"
511	 * children, that is non-null tnodes with a skip value of 0.
512	 * All of those will be doubled in the resulting inflated tnode, so
513	 * we just count them one extra time here.
514	 *
515	 * A clearer way to write this would be:
516	 *
517	 * to_be_doubled = tn->full_children;
518	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
519	 *     tn->full_children;
520	 *
521	 * new_child_length = tnode_child_length(tn) * 2;
522	 *
523	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
524	 *      new_child_length;
525	 * if (new_fill_factor >= inflate_threshold)
526	 *
527	 * ...and so on, tho it would mess up the while () loop.
528	 *
529	 * anyway,
530	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
531	 *      inflate_threshold
532	 *
533	 * avoid a division:
534	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
535	 *      inflate_threshold * new_child_length
536	 *
537	 * expand not_to_be_doubled and to_be_doubled, and shorten:
538	 * 100 * (tnode_child_length(tn) - tn->empty_children +
539	 *    tn->full_children) >= inflate_threshold * new_child_length
540	 *
541	 * expand new_child_length:
542	 * 100 * (tnode_child_length(tn) - tn->empty_children +
543	 *    tn->full_children) >=
544	 *      inflate_threshold * tnode_child_length(tn) * 2
545	 *
546	 * shorten again:
547	 * 50 * (tn->full_children + tnode_child_length(tn) -
548	 *    tn->empty_children) >= inflate_threshold *
549	 *    tnode_child_length(tn)
550	 *
551	 */
552
553	check_tnode(tn);
554
555	err = 0;
556	while ((tn->full_children > 0 &&
557	       50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
558				inflate_threshold * tnode_child_length(tn))) {
559
560		old_tn = tn;
561		tn = inflate(t, tn);
562		if (IS_ERR(tn)) {
563			tn = old_tn;
564#ifdef CONFIG_IP_FIB_TRIE_STATS
565			t->stats.resize_node_skipped++;
566#endif
567			break;
568		}
569	}
570
571	check_tnode(tn);
572
573	/*
574	 * Halve as long as the number of empty children in this
575	 * node is above threshold.
576	 */
577
578	err = 0;
579	while (tn->bits > 1 &&
580	       100 * (tnode_child_length(tn) - tn->empty_children) <
581	       halve_threshold * tnode_child_length(tn)) {
582
583		old_tn = tn;
584		tn = halve(t, tn);
585		if (IS_ERR(tn)) {
586			tn = old_tn;
587#ifdef CONFIG_IP_FIB_TRIE_STATS
588			t->stats.resize_node_skipped++;
589#endif
590			break;
591		}
592	}
593
594
595	/* Only one child remains */
596
597	if (tn->empty_children == tnode_child_length(tn) - 1)
598		for (i = 0; i < tnode_child_length(tn); i++) {
599			struct node *n;
600
601			write_lock_bh(&fib_lock);
602
603			n = tn->child[i];
604			if (!n) {
605				write_unlock_bh(&fib_lock);
606				continue;
607			}
608
609			/* compress one level */
610
611			NODE_INIT_PARENT(n, NODE_TYPE(n));
612
613			write_unlock_bh(&fib_lock);
614			tnode_free(tn);
615			return n;
616		}
617
618	return (struct node *) tn;
619}
620
621static struct tnode *inflate(struct trie *t, struct tnode *tn)
622{
623	struct tnode *inode;
624	struct tnode *oldtnode = tn;
625	int olen = tnode_child_length(tn);
626	int i;
627
628	DBG("In inflate\n");
629
630	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
631
632	if (!tn)
633		return ERR_PTR(-ENOMEM);
634
635	/*
636	 * Preallocate and store tnodes before the actual work so we
637	 * don't get into an inconsistent state if memory allocation
638	 * fails. In case of failure we return the oldnode and  inflate
639	 * of tnode is ignored.
640	 */
641
642	for (i = 0; i < olen; i++) {
643		struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
644
645		if (inode &&
646		    IS_TNODE(inode) &&
647		    inode->pos == oldtnode->pos + oldtnode->bits &&
648		    inode->bits > 1) {
649			struct tnode *left, *right;
650			t_key m = TKEY_GET_MASK(inode->pos, 1);
651
652			left = tnode_new(inode->key&(~m), inode->pos + 1,
653					 inode->bits - 1);
654			if (!left)
655				goto nomem;
656
657			right = tnode_new(inode->key|m, inode->pos + 1,
658					  inode->bits - 1);
659
660                        if (!right) {
661				tnode_free(left);
662				goto nomem;
663                        }
664
665			put_child(t, tn, 2*i, (struct node *) left);
666			put_child(t, tn, 2*i+1, (struct node *) right);
667		}
668	}
669
670	for (i = 0; i < olen; i++) {
671		struct node *node = tnode_get_child(oldtnode, i);
672		struct tnode *left, *right;
673		int size, j;
674
675		/* An empty child */
676		if (node == NULL)
677			continue;
678
679		/* A leaf or an internal node with skipped bits */
680
681		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
682		   tn->pos + tn->bits - 1) {
683			if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
684					     1) == 0)
685				put_child(t, tn, 2*i, node);
686			else
687				put_child(t, tn, 2*i+1, node);
688			continue;
689		}
690
691		/* An internal node with two children */
692		inode = (struct tnode *) node;
693
694		if (inode->bits == 1) {
695			put_child(t, tn, 2*i, inode->child[0]);
696			put_child(t, tn, 2*i+1, inode->child[1]);
697
698			tnode_free(inode);
699			continue;
700		}
701
702		/* An internal node with more than two children */
703
704		/* We will replace this node 'inode' with two new
705		 * ones, 'left' and 'right', each with half of the
706		 * original children. The two new nodes will have
707		 * a position one bit further down the key and this
708		 * means that the "significant" part of their keys
709		 * (see the discussion near the top of this file)
710		 * will differ by one bit, which will be "0" in
711		 * left's key and "1" in right's key. Since we are
712		 * moving the key position by one step, the bit that
713		 * we are moving away from - the bit at position
714		 * (inode->pos) - is the one that will differ between
715		 * left and right. So... we synthesize that bit in the
716		 * two  new keys.
717		 * The mask 'm' below will be a single "one" bit at
718		 * the position (inode->pos)
719		 */
720
721		/* Use the old key, but set the new significant
722		 *   bit to zero.
723		 */
724
725		left = (struct tnode *) tnode_get_child(tn, 2*i);
726		put_child(t, tn, 2*i, NULL);
727
728		BUG_ON(!left);
729
730		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
731		put_child(t, tn, 2*i+1, NULL);
732
733		BUG_ON(!right);
734
735		size = tnode_child_length(left);
736		for (j = 0; j < size; j++) {
737			put_child(t, left, j, inode->child[j]);
738			put_child(t, right, j, inode->child[j + size]);
739		}
740		put_child(t, tn, 2*i, resize(t, left));
741		put_child(t, tn, 2*i+1, resize(t, right));
742
743		tnode_free(inode);
744	}
745	tnode_free(oldtnode);
746	return tn;
747nomem:
748	{
749		int size = tnode_child_length(tn);
750		int j;
751
752		for(j = 0; j < size; j++)
753			if (tn->child[j])
754				tnode_free((struct tnode *)tn->child[j]);
755
756		tnode_free(tn);
757
758		return ERR_PTR(-ENOMEM);
759	}
760}
761
762static struct tnode *halve(struct trie *t, struct tnode *tn)
763{
764	struct tnode *oldtnode = tn;
765	struct node *left, *right;
766	int i;
767	int olen = tnode_child_length(tn);
768
769	DBG("In halve\n");
770
771	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
772
773	if (!tn)
774		return ERR_PTR(-ENOMEM);
775
776	/*
777	 * Preallocate and store tnodes before the actual work so we
778	 * don't get into an inconsistent state if memory allocation
779	 * fails. In case of failure we return the oldnode and halve
780	 * of tnode is ignored.
781	 */
782
783	for (i = 0; i < olen; i += 2) {
784		left = tnode_get_child(oldtnode, i);
785		right = tnode_get_child(oldtnode, i+1);
786
787		/* Two nonempty children */
788		if (left && right)  {
789			struct tnode *newn;
790
791			newn = tnode_new(left->key, tn->pos + tn->bits, 1);
792
793			if (!newn)
794				goto nomem;
795
796			put_child(t, tn, i/2, (struct node *)newn);
797		}
798
799	}
800
801	for (i = 0; i < olen; i += 2) {
802		struct tnode *newBinNode;
803
804		left = tnode_get_child(oldtnode, i);
805		right = tnode_get_child(oldtnode, i+1);
806
807		/* At least one of the children is empty */
808		if (left == NULL) {
809			if (right == NULL)    /* Both are empty */
810				continue;
811			put_child(t, tn, i/2, right);
812			continue;
813		}
814
815		if (right == NULL) {
816			put_child(t, tn, i/2, left);
817			continue;
818		}
819
820		/* Two nonempty children */
821		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
822		put_child(t, tn, i/2, NULL);
823
824		BUG_ON(!newBinNode);
825
826		put_child(t, newBinNode, 0, left);
827		put_child(t, newBinNode, 1, right);
828		put_child(t, tn, i/2, resize(t, newBinNode));
829	}
830	tnode_free(oldtnode);
831	return tn;
832nomem:
833	{
834		int size = tnode_child_length(tn);
835		int j;
836
837		for(j = 0; j < size; j++)
838			if (tn->child[j])
839				tnode_free((struct tnode *)tn->child[j]);
840
841		tnode_free(tn);
842
843		return ERR_PTR(-ENOMEM);
844	}
845}
846
847static void trie_init(struct trie *t)
848{
849	if (!t)
850		return;
851
852	t->size = 0;
853	t->trie = NULL;
854	t->revision = 0;
855#ifdef CONFIG_IP_FIB_TRIE_STATS
856	memset(&t->stats, 0, sizeof(struct trie_use_stats));
857#endif
858}
859
860static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen)
861{
862	struct hlist_node *node;
863	struct leaf_info *li;
864
865	hlist_for_each_entry(li, node, head, hlist)
866		if (li->plen == plen)
867			return li;
868
869	return NULL;
870}
871
872static inline struct list_head * get_fa_head(struct leaf *l, int plen)
873{
874	struct leaf_info *li = find_leaf_info(&l->list, plen);
875
876	if (!li)
877		return NULL;
878
879	return &li->falh;
880}
881
882static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
883{
884	struct leaf_info *li = NULL, *last = NULL;
885	struct hlist_node *node;
886
887	write_lock_bh(&fib_lock);
888
889	if (hlist_empty(head)) {
890		hlist_add_head(&new->hlist, head);
891	} else {
892		hlist_for_each_entry(li, node, head, hlist) {
893			if (new->plen > li->plen)
894				break;
895
896			last = li;
897		}
898		if (last)
899			hlist_add_after(&last->hlist, &new->hlist);
900		else
901			hlist_add_before(&new->hlist, &li->hlist);
902	}
903	write_unlock_bh(&fib_lock);
904}
905
906static struct leaf *
907fib_find_node(struct trie *t, u32 key)
908{
909	int pos;
910	struct tnode *tn;
911	struct node *n;
912
913	pos = 0;
914	n = t->trie;
915
916	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
917		tn = (struct tnode *) n;
918
919		check_tnode(tn);
920
921		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
922			pos = tn->pos + tn->bits;
923			n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
924		} else
925			break;
926	}
927	/* Case we have found a leaf. Compare prefixes */
928
929	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
930		return (struct leaf *)n;
931
932	return NULL;
933}
934
935static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
936{
937	int i;
938	int wasfull;
939	t_key cindex, key;
940	struct tnode *tp = NULL;
941
942	BUG_ON(!tn);
943
944	key = tn->key;
945	i = 0;
946
947	while (tn != NULL && NODE_PARENT(tn) != NULL) {
948		if (i > 10) {
949			printk("Rebalance tn=%p \n", tn);
950			if (tn)
951				printk("tn->parent=%p \n", NODE_PARENT(tn));
952
953			printk("Rebalance tp=%p \n", tp);
954			if (tp)
955				printk("tp->parent=%p \n", NODE_PARENT(tp));
956		}
957
958		BUG_ON(i > 12); /* Why is this a bug? -ojn */
959		i++;
960
961		tp = NODE_PARENT(tn);
962		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
963		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
964		tn = (struct tnode *) resize (t, (struct tnode *)tn);
965		tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
966
967		if (!NODE_PARENT(tn))
968			break;
969
970		tn = NODE_PARENT(tn);
971	}
972	/* Handle last (top) tnode */
973	if (IS_TNODE(tn))
974		tn = (struct tnode*) resize(t, (struct tnode *)tn);
975
976	return (struct node*) tn;
977}
978
979static  struct list_head *
980fib_insert_node(struct trie *t, int *err, u32 key, int plen)
981{
982	int pos, newpos;
983	struct tnode *tp = NULL, *tn = NULL;
984	struct node *n;
985	struct leaf *l;
986	int missbit;
987	struct list_head *fa_head = NULL;
988	struct leaf_info *li;
989	t_key cindex;
990
991	pos = 0;
992	n = t->trie;
993
994	/* If we point to NULL, stop. Either the tree is empty and we should
995	 * just put a new leaf in if, or we have reached an empty child slot,
996	 * and we should just put our new leaf in that.
997	 * If we point to a T_TNODE, check if it matches our key. Note that
998	 * a T_TNODE might be skipping any number of bits - its 'pos' need
999	 * not be the parent's 'pos'+'bits'!
1000	 *
1001	 * If it does match the current key, get pos/bits from it, extract
1002	 * the index from our key, push the T_TNODE and walk the tree.
1003	 *
1004	 * If it doesn't, we have to replace it with a new T_TNODE.
1005	 *
1006	 * If we point to a T_LEAF, it might or might not have the same key
1007	 * as we do. If it does, just change the value, update the T_LEAF's
1008	 * value, and return it.
1009	 * If it doesn't, we need to replace it with a T_TNODE.
1010	 */
1011
1012	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
1013		tn = (struct tnode *) n;
1014
1015		check_tnode(tn);
1016
1017		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1018			tp = tn;
1019			pos = tn->pos + tn->bits;
1020			n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
1021
1022			if (n && NODE_PARENT(n) != tn) {
1023				printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
1024				BUG();
1025			}
1026		} else
1027			break;
1028	}
1029
1030	/*
1031	 * n  ----> NULL, LEAF or TNODE
1032	 *
1033	 * tp is n's (parent) ----> NULL or TNODE
1034	 */
1035
1036	BUG_ON(tp && IS_LEAF(tp));
1037
1038	/* Case 1: n is a leaf. Compare prefixes */
1039
1040	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1041		struct leaf *l = (struct leaf *) n;
1042
1043		li = leaf_info_new(plen);
1044
1045		if (!li) {
1046			*err = -ENOMEM;
1047			goto err;
1048		}
1049
1050		fa_head = &li->falh;
1051		insert_leaf_info(&l->list, li);
1052		goto done;
1053	}
1054	t->size++;
1055	l = leaf_new();
1056
1057	if (!l) {
1058		*err = -ENOMEM;
1059		goto err;
1060	}
1061
1062	l->key = key;
1063	li = leaf_info_new(plen);
1064
1065	if (!li) {
1066		tnode_free((struct tnode *) l);
1067		*err = -ENOMEM;
1068		goto err;
1069	}
1070
1071	fa_head = &li->falh;
1072	insert_leaf_info(&l->list, li);
1073
1074	if (t->trie && n == NULL) {
1075		/* Case 2: n is NULL, and will just insert a new leaf */
1076
1077		NODE_SET_PARENT(l, tp);
1078
1079		BUG_ON(!tp);
1080
1081		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1082		put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1083	} else {
1084		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1085		/*
1086		 *  Add a new tnode here
1087		 *  first tnode need some special handling
1088		 */
1089
1090		if (tp)
1091			pos = tp->pos+tp->bits;
1092		else
1093			pos = 0;
1094
1095		if (n) {
1096			newpos = tkey_mismatch(key, pos, n->key);
1097			tn = tnode_new(n->key, newpos, 1);
1098		} else {
1099			newpos = 0;
1100			tn = tnode_new(key, newpos, 1); /* First tnode */
1101		}
1102
1103		if (!tn) {
1104			free_leaf_info(li);
1105			tnode_free((struct tnode *) l);
1106			*err = -ENOMEM;
1107			goto err;
1108		}
1109
1110		NODE_SET_PARENT(tn, tp);
1111
1112		missbit = tkey_extract_bits(key, newpos, 1);
1113		put_child(t, tn, missbit, (struct node *)l);
1114		put_child(t, tn, 1-missbit, n);
1115
1116		if (tp) {
1117			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1118			put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
1119		} else {
1120			t->trie = (struct node*) tn; /* First tnode */
1121			tp = tn;
1122		}
1123	}
1124
1125	if (tp && tp->pos + tp->bits > 32)
1126		printk("ERROR tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1127		       tp, tp->pos, tp->bits, key, plen);
1128
1129	/* Rebalance the trie */
1130	t->trie = trie_rebalance(t, tp);
1131done:
1132	t->revision++;
1133err:
1134	return fa_head;
1135}
1136
1137static int
1138fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1139	       struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1140{
1141	struct trie *t = (struct trie *) tb->tb_data;
1142	struct fib_alias *fa, *new_fa;
1143	struct list_head *fa_head = NULL;
1144	struct fib_info *fi;
1145	int plen = r->rtm_dst_len;
1146	int type = r->rtm_type;
1147	u8 tos = r->rtm_tos;
1148	u32 key, mask;
1149	int err;
1150	struct leaf *l;
1151
1152	if (plen > 32)
1153		return -EINVAL;
1154
1155	key = 0;
1156	if (rta->rta_dst)
1157		memcpy(&key, rta->rta_dst, 4);
1158
1159	key = ntohl(key);
1160
1161	DBG("Insert table=%d %08x/%d\n", tb->tb_id, key, plen);
1162
1163	mask = ntohl(inet_make_mask(plen));
1164
1165	if (key & ~mask)
1166		return -EINVAL;
1167
1168	key = key & mask;
1169
1170	fi = fib_create_info(r, rta, nlhdr, &err);
1171
1172	if (!fi)
1173		goto err;
1174
1175	l = fib_find_node(t, key);
1176	fa = NULL;
1177
1178	if (l) {
1179		fa_head = get_fa_head(l, plen);
1180		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1181	}
1182
1183	/* Now fa, if non-NULL, points to the first fib alias
1184	 * with the same keys [prefix,tos,priority], if such key already
1185	 * exists or to the node before which we will insert new one.
1186	 *
1187	 * If fa is NULL, we will need to allocate a new one and
1188	 * insert to the head of f.
1189	 *
1190	 * If f is NULL, no fib node matched the destination key
1191	 * and we need to allocate a new one of those as well.
1192	 */
1193
1194	if (fa && fa->fa_info->fib_priority == fi->fib_priority) {
1195		struct fib_alias *fa_orig;
1196
1197		err = -EEXIST;
1198		if (nlhdr->nlmsg_flags & NLM_F_EXCL)
1199			goto out;
1200
1201		if (nlhdr->nlmsg_flags & NLM_F_REPLACE) {
1202			struct fib_info *fi_drop;
1203			u8 state;
1204
1205			write_lock_bh(&fib_lock);
1206
1207			fi_drop = fa->fa_info;
1208			fa->fa_info = fi;
1209			fa->fa_type = type;
1210			fa->fa_scope = r->rtm_scope;
1211			state = fa->fa_state;
1212			fa->fa_state &= ~FA_S_ACCESSED;
1213
1214			write_unlock_bh(&fib_lock);
1215
1216			fib_release_info(fi_drop);
1217			if (state & FA_S_ACCESSED)
1218				rt_cache_flush(-1);
1219
1220			goto succeeded;
1221		}
1222		/* Error if we find a perfect match which
1223		 * uses the same scope, type, and nexthop
1224		 * information.
1225		 */
1226		fa_orig = fa;
1227		list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
1228			if (fa->fa_tos != tos)
1229				break;
1230			if (fa->fa_info->fib_priority != fi->fib_priority)
1231				break;
1232			if (fa->fa_type == type &&
1233			    fa->fa_scope == r->rtm_scope &&
1234			    fa->fa_info == fi) {
1235				goto out;
1236			}
1237		}
1238		if (!(nlhdr->nlmsg_flags & NLM_F_APPEND))
1239			fa = fa_orig;
1240	}
1241	err = -ENOENT;
1242	if (!(nlhdr->nlmsg_flags & NLM_F_CREATE))
1243		goto out;
1244
1245	err = -ENOBUFS;
1246	new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
1247	if (new_fa == NULL)
1248		goto out;
1249
1250	new_fa->fa_info = fi;
1251	new_fa->fa_tos = tos;
1252	new_fa->fa_type = type;
1253	new_fa->fa_scope = r->rtm_scope;
1254	new_fa->fa_state = 0;
1255	/*
1256	 * Insert new entry to the list.
1257	 */
1258
1259	if (!fa_head) {
1260		fa_head = fib_insert_node(t, &err, key, plen);
1261		err = 0;
1262		if (err)
1263			goto out_free_new_fa;
1264	}
1265
1266	write_lock_bh(&fib_lock);
1267
1268	list_add_tail(&new_fa->fa_list, (fa ? &fa->fa_list : fa_head));
1269
1270	write_unlock_bh(&fib_lock);
1271
1272	rt_cache_flush(-1);
1273	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req);
1274succeeded:
1275	return 0;
1276
1277out_free_new_fa:
1278	kmem_cache_free(fn_alias_kmem, new_fa);
1279out:
1280	fib_release_info(fi);
1281err:
1282	return err;
1283}
1284
1285static inline int check_leaf(struct trie *t, struct leaf *l,  t_key key, int *plen, const struct flowi *flp,
1286			     struct fib_result *res)
1287{
1288	int err, i;
1289	t_key mask;
1290	struct leaf_info *li;
1291	struct hlist_head *hhead = &l->list;
1292	struct hlist_node *node;
1293
1294	hlist_for_each_entry(li, node, hhead, hlist) {
1295		i = li->plen;
1296		mask = ntohl(inet_make_mask(i));
1297		if (l->key != (key & mask))
1298			continue;
1299
1300		if ((err = fib_semantic_match(&li->falh, flp, res, l->key, mask, i)) <= 0) {
1301			*plen = i;
1302#ifdef CONFIG_IP_FIB_TRIE_STATS
1303			t->stats.semantic_match_passed++;
1304#endif
1305			return err;
1306		}
1307#ifdef CONFIG_IP_FIB_TRIE_STATS
1308		t->stats.semantic_match_miss++;
1309#endif
1310	}
1311	return 1;
1312}
1313
1314static int
1315fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1316{
1317	struct trie *t = (struct trie *) tb->tb_data;
1318	int plen, ret = 0;
1319	struct node *n;
1320	struct tnode *pn;
1321	int pos, bits;
1322	t_key key = ntohl(flp->fl4_dst);
1323	int chopped_off;
1324	t_key cindex = 0;
1325	int current_prefix_length = KEYLENGTH;
1326	struct tnode *cn;
1327	t_key node_prefix, key_prefix, pref_mismatch;
1328	int mp;
1329
1330	n = t->trie;
1331
1332	read_lock(&fib_lock);
1333
1334	if (!n)
1335		goto failed;
1336
1337#ifdef CONFIG_IP_FIB_TRIE_STATS
1338	t->stats.gets++;
1339#endif
1340
1341	/* Just a leaf? */
1342	if (IS_LEAF(n)) {
1343		if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1344			goto found;
1345		goto failed;
1346	}
1347	pn = (struct tnode *) n;
1348	chopped_off = 0;
1349
1350	while (pn) {
1351		pos = pn->pos;
1352		bits = pn->bits;
1353
1354		if (!chopped_off)
1355			cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits);
1356
1357		n = tnode_get_child(pn, cindex);
1358
1359		if (n == NULL) {
1360#ifdef CONFIG_IP_FIB_TRIE_STATS
1361			t->stats.null_node_hit++;
1362#endif
1363			goto backtrace;
1364		}
1365
1366		if (IS_LEAF(n)) {
1367			if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1368				goto found;
1369			else
1370				goto backtrace;
1371		}
1372
1373#define HL_OPTIMIZE
1374#ifdef HL_OPTIMIZE
1375		cn = (struct tnode *)n;
1376
1377		/*
1378		 * It's a tnode, and we can do some extra checks here if we
1379		 * like, to avoid descending into a dead-end branch.
1380		 * This tnode is in the parent's child array at index
1381		 * key[p_pos..p_pos+p_bits] but potentially with some bits
1382		 * chopped off, so in reality the index may be just a
1383		 * subprefix, padded with zero at the end.
1384		 * We can also take a look at any skipped bits in this
1385		 * tnode - everything up to p_pos is supposed to be ok,
1386		 * and the non-chopped bits of the index (se previous
1387		 * paragraph) are also guaranteed ok, but the rest is
1388		 * considered unknown.
1389		 *
1390		 * The skipped bits are key[pos+bits..cn->pos].
1391		 */
1392
1393		/* If current_prefix_length < pos+bits, we are already doing
1394		 * actual prefix  matching, which means everything from
1395		 * pos+(bits-chopped_off) onward must be zero along some
1396		 * branch of this subtree - otherwise there is *no* valid
1397		 * prefix present. Here we can only check the skipped
1398		 * bits. Remember, since we have already indexed into the
1399		 * parent's child array, we know that the bits we chopped of
1400		 * *are* zero.
1401		 */
1402
1403		/* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
1404
1405		if (current_prefix_length < pos+bits) {
1406			if (tkey_extract_bits(cn->key, current_prefix_length,
1407						cn->pos - current_prefix_length) != 0 ||
1408			    !(cn->child[0]))
1409				goto backtrace;
1410		}
1411
1412		/*
1413		 * If chopped_off=0, the index is fully validated and we
1414		 * only need to look at the skipped bits for this, the new,
1415		 * tnode. What we actually want to do is to find out if
1416		 * these skipped bits match our key perfectly, or if we will
1417		 * have to count on finding a matching prefix further down,
1418		 * because if we do, we would like to have some way of
1419		 * verifying the existence of such a prefix at this point.
1420		 */
1421
1422		/* The only thing we can do at this point is to verify that
1423		 * any such matching prefix can indeed be a prefix to our
1424		 * key, and if the bits in the node we are inspecting that
1425		 * do not match our key are not ZERO, this cannot be true.
1426		 * Thus, find out where there is a mismatch (before cn->pos)
1427		 * and verify that all the mismatching bits are zero in the
1428		 * new tnode's key.
1429		 */
1430
1431		/* Note: We aren't very concerned about the piece of the key
1432		 * that precede pn->pos+pn->bits, since these have already been
1433		 * checked. The bits after cn->pos aren't checked since these are
1434		 * by definition "unknown" at this point. Thus, what we want to
1435		 * see is if we are about to enter the "prefix matching" state,
1436		 * and in that case verify that the skipped bits that will prevail
1437		 * throughout this subtree are zero, as they have to be if we are
1438		 * to find a matching prefix.
1439		 */
1440
1441		node_prefix = MASK_PFX(cn->key, cn->pos);
1442		key_prefix = MASK_PFX(key, cn->pos);
1443		pref_mismatch = key_prefix^node_prefix;
1444		mp = 0;
1445
1446		/* In short: If skipped bits in this node do not match the search
1447		 * key, enter the "prefix matching" state.directly.
1448		 */
1449		if (pref_mismatch) {
1450			while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1451				mp++;
1452				pref_mismatch = pref_mismatch <<1;
1453			}
1454			key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1455
1456			if (key_prefix != 0)
1457				goto backtrace;
1458
1459			if (current_prefix_length >= cn->pos)
1460				current_prefix_length = mp;
1461		}
1462#endif
1463		pn = (struct tnode *)n; /* Descend */
1464		chopped_off = 0;
1465		continue;
1466
1467backtrace:
1468		chopped_off++;
1469
1470		/* As zero don't change the child key (cindex) */
1471		while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1))))
1472			chopped_off++;
1473
1474		/* Decrease current_... with bits chopped off */
1475		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1476			current_prefix_length = pn->pos + pn->bits - chopped_off;
1477
1478		/*
1479		 * Either we do the actual chop off according or if we have
1480		 * chopped off all bits in this tnode walk up to our parent.
1481		 */
1482
1483		if (chopped_off <= pn->bits) {
1484			cindex &= ~(1 << (chopped_off-1));
1485		} else {
1486			if (NODE_PARENT(pn) == NULL)
1487				goto failed;
1488
1489			/* Get Child's index */
1490			cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits);
1491			pn = NODE_PARENT(pn);
1492			chopped_off = 0;
1493
1494#ifdef CONFIG_IP_FIB_TRIE_STATS
1495			t->stats.backtrack++;
1496#endif
1497			goto backtrace;
1498		}
1499	}
1500failed:
1501	ret = 1;
1502found:
1503	read_unlock(&fib_lock);
1504	return ret;
1505}
1506
1507static int trie_leaf_remove(struct trie *t, t_key key)
1508{
1509	t_key cindex;
1510	struct tnode *tp = NULL;
1511	struct node *n = t->trie;
1512	struct leaf *l;
1513
1514	DBG("entering trie_leaf_remove(%p)\n", n);
1515
1516	/* Note that in the case skipped bits, those bits are *not* checked!
1517	 * When we finish this, we will have NULL or a T_LEAF, and the
1518	 * T_LEAF may or may not match our key.
1519	 */
1520
1521	while (n != NULL && IS_TNODE(n)) {
1522		struct tnode *tn = (struct tnode *) n;
1523		check_tnode(tn);
1524		n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));
1525
1526		if (n && NODE_PARENT(n) != tn) {
1527			printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
1528			BUG();
1529		}
1530	}
1531	l = (struct leaf *) n;
1532
1533	if (!n || !tkey_equals(l->key, key))
1534		return 0;
1535
1536	/*
1537	 * Key found.
1538	 * Remove the leaf and rebalance the tree
1539	 */
1540
1541	t->revision++;
1542	t->size--;
1543
1544	tp = NODE_PARENT(n);
1545	tnode_free((struct tnode *) n);
1546
1547	if (tp) {
1548		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1549		put_child(t, (struct tnode *)tp, cindex, NULL);
1550		t->trie = trie_rebalance(t, tp);
1551	} else
1552		t->trie = NULL;
1553
1554	return 1;
1555}
1556
1557static int
1558fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1559		struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1560{
1561	struct trie *t = (struct trie *) tb->tb_data;
1562	u32 key, mask;
1563	int plen = r->rtm_dst_len;
1564	u8 tos = r->rtm_tos;
1565	struct fib_alias *fa, *fa_to_delete;
1566	struct list_head *fa_head;
1567	struct leaf *l;
1568	int kill_li = 0;
1569	struct leaf_info *li;
1570
1571
1572	if (plen > 32)
1573		return -EINVAL;
1574
1575	key = 0;
1576	if (rta->rta_dst)
1577		memcpy(&key, rta->rta_dst, 4);
1578
1579	key = ntohl(key);
1580	mask = ntohl(inet_make_mask(plen));
1581
1582	if (key & ~mask)
1583		return -EINVAL;
1584
1585	key = key & mask;
1586	l = fib_find_node(t, key);
1587
1588	if (!l)
1589		return -ESRCH;
1590
1591	fa_head = get_fa_head(l, plen);
1592	fa = fib_find_alias(fa_head, tos, 0);
1593
1594	if (!fa)
1595		return -ESRCH;
1596
1597	DBG("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1598
1599	fa_to_delete = NULL;
1600	fa_head = fa->fa_list.prev;
1601	list_for_each_entry(fa, fa_head, fa_list) {
1602		struct fib_info *fi = fa->fa_info;
1603
1604		if (fa->fa_tos != tos)
1605			break;
1606
1607		if ((!r->rtm_type ||
1608		     fa->fa_type == r->rtm_type) &&
1609		    (r->rtm_scope == RT_SCOPE_NOWHERE ||
1610		     fa->fa_scope == r->rtm_scope) &&
1611		    (!r->rtm_protocol ||
1612		     fi->fib_protocol == r->rtm_protocol) &&
1613		    fib_nh_match(r, nlhdr, rta, fi) == 0) {
1614			fa_to_delete = fa;
1615			break;
1616		}
1617	}
1618
1619	if (!fa_to_delete)
1620		return -ESRCH;
1621
1622	fa = fa_to_delete;
1623	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req);
1624
1625	l = fib_find_node(t, key);
1626	li = find_leaf_info(&l->list, plen);
1627
1628	write_lock_bh(&fib_lock);
1629
1630	list_del(&fa->fa_list);
1631
1632	if (list_empty(fa_head)) {
1633		hlist_del(&li->hlist);
1634		kill_li = 1;
1635	}
1636	write_unlock_bh(&fib_lock);
1637
1638	if (kill_li)
1639		free_leaf_info(li);
1640
1641	if (hlist_empty(&l->list))
1642		trie_leaf_remove(t, key);
1643
1644	if (fa->fa_state & FA_S_ACCESSED)
1645		rt_cache_flush(-1);
1646
1647	fn_free_alias(fa);
1648	return 0;
1649}
1650
1651static int trie_flush_list(struct trie *t, struct list_head *head)
1652{
1653	struct fib_alias *fa, *fa_node;
1654	int found = 0;
1655
1656	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1657		struct fib_info *fi = fa->fa_info;
1658
1659		if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
1660 			write_lock_bh(&fib_lock);
1661			list_del(&fa->fa_list);
1662			write_unlock_bh(&fib_lock);
1663
1664			fn_free_alias(fa);
1665			found++;
1666		}
1667	}
1668	return found;
1669}
1670
1671static int trie_flush_leaf(struct trie *t, struct leaf *l)
1672{
1673	int found = 0;
1674	struct hlist_head *lih = &l->list;
1675	struct hlist_node *node, *tmp;
1676	struct leaf_info *li = NULL;
1677
1678	hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1679		found += trie_flush_list(t, &li->falh);
1680
1681		if (list_empty(&li->falh)) {
1682 			write_lock_bh(&fib_lock);
1683			hlist_del(&li->hlist);
1684			write_unlock_bh(&fib_lock);
1685
1686			free_leaf_info(li);
1687		}
1688	}
1689	return found;
1690}
1691
1692static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1693{
1694	struct node *c = (struct node *) thisleaf;
1695	struct tnode *p;
1696	int idx;
1697
1698	if (c == NULL) {
1699		if (t->trie == NULL)
1700			return NULL;
1701
1702		if (IS_LEAF(t->trie))          /* trie w. just a leaf */
1703			return (struct leaf *) t->trie;
1704
1705		p = (struct tnode*) t->trie;  /* Start */
1706	} else
1707		p = (struct tnode *) NODE_PARENT(c);
1708
1709	while (p) {
1710		int pos, last;
1711
1712		/*  Find the next child of the parent */
1713		if (c)
1714			pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
1715		else
1716			pos = 0;
1717
1718		last = 1 << p->bits;
1719		for (idx = pos; idx < last ; idx++) {
1720			if (!p->child[idx])
1721				continue;
1722
1723			/* Decend if tnode */
1724			while (IS_TNODE(p->child[idx])) {
1725				p = (struct tnode*) p->child[idx];
1726				idx = 0;
1727
1728				/* Rightmost non-NULL branch */
1729				if (p && IS_TNODE(p))
1730					while (p->child[idx] == NULL && idx < (1 << p->bits)) idx++;
1731
1732				/* Done with this tnode? */
1733				if (idx >= (1 << p->bits) || p->child[idx] == NULL)
1734					goto up;
1735			}
1736			return (struct leaf*) p->child[idx];
1737		}
1738up:
1739		/* No more children go up one step  */
1740		c = (struct node *) p;
1741		p = (struct tnode *) NODE_PARENT(p);
1742	}
1743	return NULL; /* Ready. Root of trie */
1744}
1745
1746static int fn_trie_flush(struct fib_table *tb)
1747{
1748	struct trie *t = (struct trie *) tb->tb_data;
1749	struct leaf *ll = NULL, *l = NULL;
1750	int found = 0, h;
1751
1752	t->revision++;
1753
1754	for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1755		found += trie_flush_leaf(t, l);
1756
1757		if (ll && hlist_empty(&ll->list))
1758			trie_leaf_remove(t, ll->key);
1759		ll = l;
1760	}
1761
1762	if (ll && hlist_empty(&ll->list))
1763		trie_leaf_remove(t, ll->key);
1764
1765	DBG("trie_flush found=%d\n", found);
1766	return found;
1767}
1768
1769static int trie_last_dflt = -1;
1770
1771static void
1772fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1773{
1774	struct trie *t = (struct trie *) tb->tb_data;
1775	int order, last_idx;
1776	struct fib_info *fi = NULL;
1777	struct fib_info *last_resort;
1778	struct fib_alias *fa = NULL;
1779	struct list_head *fa_head;
1780	struct leaf *l;
1781
1782	last_idx = -1;
1783	last_resort = NULL;
1784	order = -1;
1785
1786	read_lock(&fib_lock);
1787
1788	l = fib_find_node(t, 0);
1789	if (!l)
1790		goto out;
1791
1792	fa_head = get_fa_head(l, 0);
1793	if (!fa_head)
1794		goto out;
1795
1796	if (list_empty(fa_head))
1797		goto out;
1798
1799	list_for_each_entry(fa, fa_head, fa_list) {
1800		struct fib_info *next_fi = fa->fa_info;
1801
1802		if (fa->fa_scope != res->scope ||
1803		    fa->fa_type != RTN_UNICAST)
1804			continue;
1805
1806		if (next_fi->fib_priority > res->fi->fib_priority)
1807			break;
1808		if (!next_fi->fib_nh[0].nh_gw ||
1809		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1810			continue;
1811		fa->fa_state |= FA_S_ACCESSED;
1812
1813		if (fi == NULL) {
1814			if (next_fi != res->fi)
1815				break;
1816		} else if (!fib_detect_death(fi, order, &last_resort,
1817					     &last_idx, &trie_last_dflt)) {
1818			if (res->fi)
1819				fib_info_put(res->fi);
1820			res->fi = fi;
1821			atomic_inc(&fi->fib_clntref);
1822			trie_last_dflt = order;
1823			goto out;
1824		}
1825		fi = next_fi;
1826		order++;
1827	}
1828	if (order <= 0 || fi == NULL) {
1829		trie_last_dflt = -1;
1830		goto out;
1831	}
1832
1833	if (!fib_detect_death(fi, order, &last_resort, &last_idx, &trie_last_dflt)) {
1834		if (res->fi)
1835			fib_info_put(res->fi);
1836		res->fi = fi;
1837		atomic_inc(&fi->fib_clntref);
1838		trie_last_dflt = order;
1839		goto out;
1840	}
1841	if (last_idx >= 0) {
1842		if (res->fi)
1843			fib_info_put(res->fi);
1844		res->fi = last_resort;
1845		if (last_resort)
1846			atomic_inc(&last_resort->fib_clntref);
1847	}
1848	trie_last_dflt = last_idx;
1849 out:;
1850	read_unlock(&fib_lock);
1851}
1852
1853static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
1854			   struct sk_buff *skb, struct netlink_callback *cb)
1855{
1856	int i, s_i;
1857	struct fib_alias *fa;
1858
1859	u32 xkey = htonl(key);
1860
1861	s_i = cb->args[3];
1862	i = 0;
1863
1864	list_for_each_entry(fa, fah, fa_list) {
1865		if (i < s_i) {
1866			i++;
1867			continue;
1868		}
1869		if (fa->fa_info->fib_nh == NULL) {
1870			printk("Trie error _fib_nh=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
1871			i++;
1872			continue;
1873		}
1874		if (fa->fa_info == NULL) {
1875			printk("Trie error fa_info=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
1876			i++;
1877			continue;
1878		}
1879
1880		if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1881				  cb->nlh->nlmsg_seq,
1882				  RTM_NEWROUTE,
1883				  tb->tb_id,
1884				  fa->fa_type,
1885				  fa->fa_scope,
1886				  &xkey,
1887				  plen,
1888				  fa->fa_tos,
1889				  fa->fa_info, 0) < 0) {
1890			cb->args[3] = i;
1891			return -1;
1892		}
1893		i++;
1894	}
1895	cb->args[3] = i;
1896	return skb->len;
1897}
1898
1899static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
1900			     struct netlink_callback *cb)
1901{
1902	int h, s_h;
1903	struct list_head *fa_head;
1904	struct leaf *l = NULL;
1905
1906	s_h = cb->args[2];
1907
1908	for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1909		if (h < s_h)
1910			continue;
1911		if (h > s_h)
1912			memset(&cb->args[3], 0,
1913			       sizeof(cb->args) - 3*sizeof(cb->args[0]));
1914
1915		fa_head = get_fa_head(l, plen);
1916
1917		if (!fa_head)
1918			continue;
1919
1920		if (list_empty(fa_head))
1921			continue;
1922
1923		if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
1924			cb->args[2] = h;
1925			return -1;
1926		}
1927	}
1928	cb->args[2] = h;
1929	return skb->len;
1930}
1931
1932static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
1933{
1934	int m, s_m;
1935	struct trie *t = (struct trie *) tb->tb_data;
1936
1937	s_m = cb->args[1];
1938
1939	read_lock(&fib_lock);
1940	for (m = 0; m <= 32; m++) {
1941		if (m < s_m)
1942			continue;
1943		if (m > s_m)
1944			memset(&cb->args[2], 0,
1945				sizeof(cb->args) - 2*sizeof(cb->args[0]));
1946
1947		if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
1948			cb->args[1] = m;
1949			goto out;
1950		}
1951	}
1952	read_unlock(&fib_lock);
1953	cb->args[1] = m;
1954	return skb->len;
1955out:
1956	read_unlock(&fib_lock);
1957	return -1;
1958}
1959
1960/* Fix more generic FIB names for init later */
1961
1962#ifdef CONFIG_IP_MULTIPLE_TABLES
1963struct fib_table * fib_hash_init(int id)
1964#else
1965struct fib_table * __init fib_hash_init(int id)
1966#endif
1967{
1968	struct fib_table *tb;
1969	struct trie *t;
1970
1971	if (fn_alias_kmem == NULL)
1972		fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1973						  sizeof(struct fib_alias),
1974						  0, SLAB_HWCACHE_ALIGN,
1975						  NULL, NULL);
1976
1977	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1978		     GFP_KERNEL);
1979	if (tb == NULL)
1980		return NULL;
1981
1982	tb->tb_id = id;
1983	tb->tb_lookup = fn_trie_lookup;
1984	tb->tb_insert = fn_trie_insert;
1985	tb->tb_delete = fn_trie_delete;
1986	tb->tb_flush = fn_trie_flush;
1987	tb->tb_select_default = fn_trie_select_default;
1988	tb->tb_dump = fn_trie_dump;
1989	memset(tb->tb_data, 0, sizeof(struct trie));
1990
1991	t = (struct trie *) tb->tb_data;
1992
1993	trie_init(t);
1994
1995	if (id == RT_TABLE_LOCAL)
1996		trie_local = t;
1997	else if (id == RT_TABLE_MAIN)
1998		trie_main = t;
1999
2000	if (id == RT_TABLE_LOCAL)
2001		printk("IPv4 FIB: Using LC-trie version %s\n", VERSION);
2002
2003	return tb;
2004}
2005
2006/* Trie dump functions */
2007
2008static void putspace_seq(struct seq_file *seq, int n)
2009{
2010	while (n--)
2011		seq_printf(seq, " ");
2012}
2013
2014static void printbin_seq(struct seq_file *seq, unsigned int v, int bits)
2015{
2016	while (bits--)
2017		seq_printf(seq, "%s", (v & (1<<bits))?"1":"0");
2018}
2019
2020static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
2021		   int pend, int cindex, int bits)
2022{
2023	putspace_seq(seq, indent);
2024	if (IS_LEAF(n))
2025		seq_printf(seq, "|");
2026	else
2027		seq_printf(seq, "+");
2028	if (bits) {
2029		seq_printf(seq, "%d/", cindex);
2030		printbin_seq(seq, cindex, bits);
2031		seq_printf(seq, ": ");
2032	} else
2033		seq_printf(seq, "<root>: ");
2034	seq_printf(seq, "%s:%p ", IS_LEAF(n)?"Leaf":"Internal node", n);
2035
2036	if (IS_LEAF(n)) {
2037		struct leaf *l = (struct leaf *)n;
2038		struct fib_alias *fa;
2039		int i;
2040
2041		seq_printf(seq, "key=%d.%d.%d.%d\n",
2042			   n->key >> 24, (n->key >> 16) % 256, (n->key >> 8) % 256, n->key % 256);
2043
2044		for (i = 32; i >= 0; i--)
2045			if (find_leaf_info(&l->list, i)) {
2046				struct list_head *fa_head = get_fa_head(l, i);
2047
2048				if (!fa_head)
2049					continue;
2050
2051				if (list_empty(fa_head))
2052					continue;
2053
2054				putspace_seq(seq, indent+2);
2055				seq_printf(seq, "{/%d...dumping}\n", i);
2056
2057				list_for_each_entry(fa, fa_head, fa_list) {
2058					putspace_seq(seq, indent+2);
2059					if (fa->fa_info == NULL) {
2060						seq_printf(seq, "Error fa_info=NULL\n");
2061						continue;
2062					}
2063					if (fa->fa_info->fib_nh == NULL) {
2064						seq_printf(seq, "Error _fib_nh=NULL\n");
2065						continue;
2066					}
2067
2068					seq_printf(seq, "{type=%d scope=%d TOS=%d}\n",
2069					      fa->fa_type,
2070					      fa->fa_scope,
2071					      fa->fa_tos);
2072				}
2073			}
2074	} else {
2075		struct tnode *tn = (struct tnode *)n;
2076		int plen = ((struct tnode *)n)->pos;
2077		t_key prf = MASK_PFX(n->key, plen);
2078
2079		seq_printf(seq, "key=%d.%d.%d.%d/%d\n",
2080			   prf >> 24, (prf >> 16) % 256, (prf >> 8) % 256, prf % 256, plen);
2081
2082		putspace_seq(seq, indent); seq_printf(seq, "|    ");
2083		seq_printf(seq, "{key prefix=%08x/", tn->key & TKEY_GET_MASK(0, tn->pos));
2084		printbin_seq(seq, tkey_extract_bits(tn->key, 0, tn->pos), tn->pos);
2085		seq_printf(seq, "}\n");
2086		putspace_seq(seq, indent); seq_printf(seq, "|    ");
2087		seq_printf(seq, "{pos=%d", tn->pos);
2088		seq_printf(seq, " (skip=%d bits)", tn->pos - pend);
2089		seq_printf(seq, " bits=%d (%u children)}\n", tn->bits, (1 << tn->bits));
2090		putspace_seq(seq, indent); seq_printf(seq, "|    ");
2091		seq_printf(seq, "{empty=%d full=%d}\n", tn->empty_children, tn->full_children);
2092	}
2093}
2094
2095static void trie_dump_seq(struct seq_file *seq, struct trie *t)
2096{
2097	struct node *n = t->trie;
2098	int cindex = 0;
2099	int indent = 1;
2100	int pend = 0;
2101	int depth = 0;
2102	struct tnode *tn;
2103
2104  	read_lock(&fib_lock);
2105
2106	seq_printf(seq, "------ trie_dump of t=%p ------\n", t);
2107
2108	if (!n) {
2109		seq_printf(seq, "------ trie is empty\n");
2110
2111	  	read_unlock(&fib_lock);
2112		return;
2113	}
2114
2115	printnode_seq(seq, indent, n, pend, cindex, 0);
2116
2117	if (!IS_TNODE(n)) {
2118	  	read_unlock(&fib_lock);
2119		return;
2120	}
2121
2122	tn = (struct tnode *)n;
2123	pend = tn->pos+tn->bits;
2124	putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
2125	indent += 3;
2126	depth++;
2127
2128	while (tn && cindex < (1 << tn->bits)) {
2129		if (tn->child[cindex]) {
2130			/* Got a child */
2131
2132			printnode_seq(seq, indent, tn->child[cindex], pend, cindex, tn->bits);
2133			if (IS_LEAF(tn->child[cindex])) {
2134				cindex++;
2135			} else {
2136				/*
2137				 * New tnode. Decend one level
2138				 */
2139
2140				depth++;
2141				tn = (struct tnode *)tn->child[cindex];
2142				pend = tn->pos + tn->bits;
2143				putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
2144				indent += 3;
2145				cindex = 0;
2146			}
2147		} else
2148			cindex++;
2149
2150		/*
2151		 * Test if we are done
2152		 */
2153
2154		while (cindex >= (1 << tn->bits)) {
2155			/*
2156			 * Move upwards and test for root
2157			 * pop off all traversed  nodes
2158			 */
2159
2160			if (NODE_PARENT(tn) == NULL) {
2161				tn = NULL;
2162				break;
2163			}
2164
2165			cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2166			cindex++;
2167			tn = NODE_PARENT(tn);
2168			pend = tn->pos + tn->bits;
2169			indent -= 3;
2170			depth--;
2171		}
2172	}
2173
2174  	read_unlock(&fib_lock);
2175}
2176
2177static struct trie_stat *trie_stat_new(void)
2178{
2179	struct trie_stat *s;
2180	int i;
2181
2182	s = kmalloc(sizeof(struct trie_stat), GFP_KERNEL);
2183	if (!s)
2184		return NULL;
2185
2186	s->totdepth = 0;
2187	s->maxdepth = 0;
2188	s->tnodes = 0;
2189	s->leaves = 0;
2190	s->nullpointers = 0;
2191
2192	for (i = 0; i < MAX_CHILDS; i++)
2193		s->nodesizes[i] = 0;
2194
2195	return s;
2196}
2197
2198static struct trie_stat *trie_collect_stats(struct trie *t)
2199{
2200	struct node *n = t->trie;
2201	struct trie_stat *s = trie_stat_new();
2202	int cindex = 0;
2203	int pend = 0;
2204	int depth = 0;
2205
2206	if (!s)
2207		return NULL;
2208	if (!n)
2209		return s;
2210
2211	read_lock(&fib_lock);
2212
2213	if (IS_TNODE(n)) {
2214		struct tnode *tn = (struct tnode *)n;
2215		pend = tn->pos+tn->bits;
2216		s->nodesizes[tn->bits]++;
2217		depth++;
2218
2219		while (tn && cindex < (1 << tn->bits)) {
2220			if (tn->child[cindex]) {
2221				/* Got a child */
2222
2223				if (IS_LEAF(tn->child[cindex])) {
2224					cindex++;
2225
2226					/* stats */
2227					if (depth > s->maxdepth)
2228						s->maxdepth = depth;
2229					s->totdepth += depth;
2230					s->leaves++;
2231				} else {
2232					/*
2233					 * New tnode. Decend one level
2234					 */
2235
2236					s->tnodes++;
2237					s->nodesizes[tn->bits]++;
2238					depth++;
2239
2240					n = tn->child[cindex];
2241					tn = (struct tnode *)n;
2242					pend = tn->pos+tn->bits;
2243
2244					cindex = 0;
2245				}
2246			} else {
2247				cindex++;
2248				s->nullpointers++;
2249			}
2250
2251			/*
2252			 * Test if we are done
2253			 */
2254
2255			while (cindex >= (1 << tn->bits)) {
2256				/*
2257				 * Move upwards and test for root
2258				 * pop off all traversed  nodes
2259				 */
2260
2261				if (NODE_PARENT(tn) == NULL) {
2262					tn = NULL;
2263					n = NULL;
2264					break;
2265				}
2266
2267				cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2268				tn = NODE_PARENT(tn);
2269				cindex++;
2270				n = (struct node *)tn;
2271				pend = tn->pos+tn->bits;
2272				depth--;
2273 			}
2274		}
2275	}
2276
2277	read_unlock(&fib_lock);
2278	return s;
2279}
2280
2281#ifdef CONFIG_PROC_FS
2282
2283static struct fib_alias *fib_triestat_get_first(struct seq_file *seq)
2284{
2285	return NULL;
2286}
2287
2288static struct fib_alias *fib_triestat_get_next(struct seq_file *seq)
2289{
2290	return NULL;
2291}
2292
2293static void *fib_triestat_seq_start(struct seq_file *seq, loff_t *pos)
2294{
2295	if (!ip_fib_main_table)
2296		return NULL;
2297
2298	if (*pos)
2299		return fib_triestat_get_next(seq);
2300	else
2301		return SEQ_START_TOKEN;
2302}
2303
2304static void *fib_triestat_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2305{
2306	++*pos;
2307	if (v == SEQ_START_TOKEN)
2308		return fib_triestat_get_first(seq);
2309	else
2310		return fib_triestat_get_next(seq);
2311}
2312
2313static void fib_triestat_seq_stop(struct seq_file *seq, void *v)
2314{
2315
2316}
2317
2318/*
2319 *	This outputs /proc/net/fib_triestats
2320 *
2321 *	It always works in backward compatibility mode.
2322 *	The format of the file is not supposed to be changed.
2323 */
2324
2325static void collect_and_show(struct trie *t, struct seq_file *seq)
2326{
2327	int bytes = 0; /* How many bytes are used, a ref is 4 bytes */
2328	int i, max, pointers;
2329	struct trie_stat *stat;
2330	int avdepth;
2331
2332	stat = trie_collect_stats(t);
2333
2334	bytes = 0;
2335	seq_printf(seq, "trie=%p\n", t);
2336
2337	if (stat) {
2338		if (stat->leaves)
2339			avdepth = stat->totdepth*100 / stat->leaves;
2340		else
2341			avdepth = 0;
2342		seq_printf(seq, "Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100);
2343		seq_printf(seq, "Max depth: %4d\n", stat->maxdepth);
2344
2345		seq_printf(seq, "Leaves: %d\n", stat->leaves);
2346		bytes += sizeof(struct leaf) * stat->leaves;
2347		seq_printf(seq, "Internal nodes: %d\n", stat->tnodes);
2348		bytes += sizeof(struct tnode) * stat->tnodes;
2349
2350		max = MAX_CHILDS-1;
2351
2352		while (max >= 0 && stat->nodesizes[max] == 0)
2353			max--;
2354		pointers = 0;
2355
2356		for (i = 1; i <= max; i++)
2357			if (stat->nodesizes[i] != 0) {
2358				seq_printf(seq, "  %d: %d",  i, stat->nodesizes[i]);
2359				pointers += (1<<i) * stat->nodesizes[i];
2360			}
2361		seq_printf(seq, "\n");
2362		seq_printf(seq, "Pointers: %d\n", pointers);
2363		bytes += sizeof(struct node *) * pointers;
2364		seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers);
2365		seq_printf(seq, "Total size: %d  kB\n", bytes / 1024);
2366
2367		kfree(stat);
2368	}
2369
2370#ifdef CONFIG_IP_FIB_TRIE_STATS
2371	seq_printf(seq, "Counters:\n---------\n");
2372	seq_printf(seq,"gets = %d\n", t->stats.gets);
2373	seq_printf(seq,"backtracks = %d\n", t->stats.backtrack);
2374	seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed);
2375	seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss);
2376	seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit);
2377	seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped);
2378#ifdef CLEAR_STATS
2379	memset(&(t->stats), 0, sizeof(t->stats));
2380#endif
2381#endif /*  CONFIG_IP_FIB_TRIE_STATS */
2382}
2383
2384static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2385{
2386	char bf[128];
2387
2388	if (v == SEQ_START_TOKEN) {
2389		seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
2390			   sizeof(struct leaf), sizeof(struct tnode));
2391		if (trie_local)
2392			collect_and_show(trie_local, seq);
2393
2394		if (trie_main)
2395			collect_and_show(trie_main, seq);
2396	} else {
2397		snprintf(bf, sizeof(bf), "*\t%08X\t%08X", 200, 400);
2398
2399		seq_printf(seq, "%-127s\n", bf);
2400	}
2401	return 0;
2402}
2403
2404static struct seq_operations fib_triestat_seq_ops = {
2405	.start = fib_triestat_seq_start,
2406	.next  = fib_triestat_seq_next,
2407	.stop  = fib_triestat_seq_stop,
2408	.show  = fib_triestat_seq_show,
2409};
2410
2411static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2412{
2413	struct seq_file *seq;
2414	int rc = -ENOMEM;
2415
2416	rc = seq_open(file, &fib_triestat_seq_ops);
2417	if (rc)
2418		goto out_kfree;
2419
2420	seq = file->private_data;
2421out:
2422	return rc;
2423out_kfree:
2424	goto out;
2425}
2426
2427static struct file_operations fib_triestat_seq_fops = {
2428	.owner	= THIS_MODULE,
2429	.open	= fib_triestat_seq_open,
2430	.read	= seq_read,
2431	.llseek	= seq_lseek,
2432	.release = seq_release_private,
2433};
2434
2435int __init fib_stat_proc_init(void)
2436{
2437	if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_seq_fops))
2438		return -ENOMEM;
2439	return 0;
2440}
2441
2442void __init fib_stat_proc_exit(void)
2443{
2444	proc_net_remove("fib_triestat");
2445}
2446
2447static struct fib_alias *fib_trie_get_first(struct seq_file *seq)
2448{
2449	return NULL;
2450}
2451
2452static struct fib_alias *fib_trie_get_next(struct seq_file *seq)
2453{
2454	return NULL;
2455}
2456
2457static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2458{
2459	if (!ip_fib_main_table)
2460		return NULL;
2461
2462	if (*pos)
2463		return fib_trie_get_next(seq);
2464	else
2465		return SEQ_START_TOKEN;
2466}
2467
2468static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2469{
2470	++*pos;
2471	if (v == SEQ_START_TOKEN)
2472		return fib_trie_get_first(seq);
2473	else
2474		return fib_trie_get_next(seq);
2475
2476}
2477
2478static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2479{
2480}
2481
2482/*
2483 *	This outputs /proc/net/fib_trie.
2484 *
2485 *	It always works in backward compatibility mode.
2486 *	The format of the file is not supposed to be changed.
2487 */
2488
2489static int fib_trie_seq_show(struct seq_file *seq, void *v)
2490{
2491	char bf[128];
2492
2493	if (v == SEQ_START_TOKEN) {
2494		if (trie_local)
2495			trie_dump_seq(seq, trie_local);
2496
2497		if (trie_main)
2498			trie_dump_seq(seq, trie_main);
2499	} else {
2500		snprintf(bf, sizeof(bf),
2501			 "*\t%08X\t%08X", 200, 400);
2502		seq_printf(seq, "%-127s\n", bf);
2503	}
2504
2505	return 0;
2506}
2507
2508static struct seq_operations fib_trie_seq_ops = {
2509	.start = fib_trie_seq_start,
2510	.next  = fib_trie_seq_next,
2511	.stop  = fib_trie_seq_stop,
2512	.show  = fib_trie_seq_show,
2513};
2514
2515static int fib_trie_seq_open(struct inode *inode, struct file *file)
2516{
2517	struct seq_file *seq;
2518	int rc = -ENOMEM;
2519
2520	rc = seq_open(file, &fib_trie_seq_ops);
2521	if (rc)
2522		goto out_kfree;
2523
2524	seq = file->private_data;
2525out:
2526	return rc;
2527out_kfree:
2528	goto out;
2529}
2530
2531static struct file_operations fib_trie_seq_fops = {
2532	.owner	= THIS_MODULE,
2533	.open	= fib_trie_seq_open,
2534	.read	= seq_read,
2535	.llseek	= seq_lseek,
2536	.release= seq_release_private,
2537};
2538
2539int __init fib_proc_init(void)
2540{
2541	if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_seq_fops))
2542		return -ENOMEM;
2543	return 0;
2544}
2545
2546void __init fib_proc_exit(void)
2547{
2548	proc_net_remove("fib_trie");
2549}
2550
2551#endif /* CONFIG_PROC_FS */
2552