fib_trie.c revision 132adf54639cf7dd9315e8df89c2faa59f6e46d9
1/*
2 *   This program is free software; you can redistribute it and/or
3 *   modify it under the terms of the GNU General Public License
4 *   as published by the Free Software Foundation; either version
5 *   2 of the License, or (at your option) any later version.
6 *
7 *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 *     & Swedish University of Agricultural Sciences.
9 *
10 *   Jens Laas <jens.laas@data.slu.se> Swedish University of
11 *     Agricultural Sciences.
12 *
13 *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
14 *
15 * This work is based on the LPC-trie which is originally described in:
16 *
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
25 * Version:	$Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
26 *
27 *
28 * Code from fib_hash has been reused which includes the following header:
29 *
30 *
31 * INET		An implementation of the TCP/IP protocol suite for the LINUX
32 *		operating system.  INET is implemented using the  BSD Socket
33 *		interface as the means of communication with the user level.
34 *
35 *		IPv4 FIB: lookup engine and maintenance routines.
36 *
37 *
38 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
39 *
40 *		This program is free software; you can redistribute it and/or
41 *		modify it under the terms of the GNU General Public License
42 *		as published by the Free Software Foundation; either version
43 *		2 of the License, or (at your option) any later version.
44 *
45 * Substantial contributions to this work come from:
46 *
47 *		David S. Miller, <davem@davemloft.net>
48 *		Stephen Hemminger <shemminger@osdl.org>
49 *		Paul E. McKenney <paulmck@us.ibm.com>
50 *		Patrick McHardy <kaber@trash.net>
51 */
52
53#define VERSION "0.407"
54
55#include <asm/uaccess.h>
56#include <asm/system.h>
57#include <asm/bitops.h>
58#include <linux/types.h>
59#include <linux/kernel.h>
60#include <linux/mm.h>
61#include <linux/string.h>
62#include <linux/socket.h>
63#include <linux/sockios.h>
64#include <linux/errno.h>
65#include <linux/in.h>
66#include <linux/inet.h>
67#include <linux/inetdevice.h>
68#include <linux/netdevice.h>
69#include <linux/if_arp.h>
70#include <linux/proc_fs.h>
71#include <linux/rcupdate.h>
72#include <linux/skbuff.h>
73#include <linux/netlink.h>
74#include <linux/init.h>
75#include <linux/list.h>
76#include <net/ip.h>
77#include <net/protocol.h>
78#include <net/route.h>
79#include <net/tcp.h>
80#include <net/sock.h>
81#include <net/ip_fib.h>
82#include "fib_lookup.h"
83
84#undef CONFIG_IP_FIB_TRIE_STATS
85#define MAX_STAT_DEPTH 32
86
87#define KEYLENGTH (8*sizeof(t_key))
88#define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l))
89#define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset))
90
91typedef unsigned int t_key;
92
93#define T_TNODE 0
94#define T_LEAF  1
95#define NODE_TYPE_MASK	0x1UL
96#define NODE_PARENT(node) \
97	((struct tnode *)rcu_dereference(((node)->parent & ~NODE_TYPE_MASK)))
98
99#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
100
101#define NODE_SET_PARENT(node, ptr)		\
102	rcu_assign_pointer((node)->parent,	\
103			   ((unsigned long)(ptr)) | NODE_TYPE(node))
104
105#define IS_TNODE(n) (!(n->parent & T_LEAF))
106#define IS_LEAF(n) (n->parent & T_LEAF)
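/*
 * Illustration (informal sketch): the low bit of 'parent' doubles as the
 * node type tag, which is why a leaf and a tnode can share the generic
 * 'struct node' head.  Roughly:
 *
 *	struct leaf *l = leaf_new();	-- l->parent == T_LEAF (0x1)
 *	NODE_SET_PARENT(l, tn);		-- parent = (unsigned long)tn | 0x1
 *	IS_LEAF(l)			-- still true, tag bit kept
 *	NODE_PARENT(l)			-- tn, with the tag bit masked off
 *
 * This works because nodes are at least word aligned, so the low bit of a
 * real parent pointer is always zero.
 */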
107
108struct node {
109	t_key key;
110	unsigned long parent;
111};
112
113struct leaf {
114	t_key key;
115	unsigned long parent;
116	struct hlist_head list;
117	struct rcu_head rcu;
118};
119
120struct leaf_info {
121	struct hlist_node hlist;
122	struct rcu_head rcu;
123	int plen;
124	struct list_head falh;
125};
126
127struct tnode {
128	t_key key;
129	unsigned long parent;
130	unsigned short pos:5;		/* 2log(KEYLENGTH) bits needed */
131	unsigned short bits:5;		/* 2log(KEYLENGTH) bits needed */
132	unsigned short full_children;	/* KEYLENGTH bits needed */
133	unsigned short empty_children;	/* KEYLENGTH bits needed */
134	struct rcu_head rcu;
135	struct node *child[0];
136};
137
138#ifdef CONFIG_IP_FIB_TRIE_STATS
139struct trie_use_stats {
140	unsigned int gets;
141	unsigned int backtrack;
142	unsigned int semantic_match_passed;
143	unsigned int semantic_match_miss;
144	unsigned int null_node_hit;
145	unsigned int resize_node_skipped;
146};
147#endif
148
149struct trie_stat {
150	unsigned int totdepth;
151	unsigned int maxdepth;
152	unsigned int tnodes;
153	unsigned int leaves;
154	unsigned int nullpointers;
155	unsigned int nodesizes[MAX_STAT_DEPTH];
156};
157
158struct trie {
159	struct node *trie;
160#ifdef CONFIG_IP_FIB_TRIE_STATS
161	struct trie_use_stats stats;
162#endif
163	int size;
164	unsigned int revision;
165};
166
167static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
168static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
169static struct node *resize(struct trie *t, struct tnode *tn);
170static struct tnode *inflate(struct trie *t, struct tnode *tn);
171static struct tnode *halve(struct trie *t, struct tnode *tn);
172static void tnode_free(struct tnode *tn);
173
174static struct kmem_cache *fn_alias_kmem __read_mostly;
175static struct trie *trie_local = NULL, *trie_main = NULL;
176
177
178/* rcu_read_lock needs to be held by the caller on the read side */
179
180static inline struct node *tnode_get_child(struct tnode *tn, int i)
181{
182	BUG_ON(i >= 1 << tn->bits);
183
184	return rcu_dereference(tn->child[i]);
185}
186
187static inline int tnode_child_length(const struct tnode *tn)
188{
189	return 1 << tn->bits;
190}
191
192static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
193{
194	if (offset < KEYLENGTH)
195		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
196	else
197		return 0;
198}
199
200static inline int tkey_equals(t_key a, t_key b)
201{
202	return a == b;
203}
204
205static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
206{
207	if (bits == 0 || offset >= KEYLENGTH)
208		return 1;
209	bits = bits > KEYLENGTH ? KEYLENGTH : bits;
210	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
211}
212
213static inline int tkey_mismatch(t_key a, int offset, t_key b)
214{
215	t_key diff = a ^ b;
216	int i = offset;
217
218	if (!diff)
219		return 0;
220	while ((diff << i) >> (KEYLENGTH-1) == 0)
221		i++;
222	return i;
223}
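/*
 * Worked example of the helpers above (illustration only, 32-bit t_key
 * assumed): with a = 0xC0A80100 (192.168.1.0 in host order),
 *
 *	tkey_extract_bits(a, 0, 8)		== 0xC0
 *	tkey_extract_bits(a, 8, 8)		== 0xA8
 *	tkey_sub_equals(a, 0, 24, 0xC0A801FF)	== 1	(first 24 bits agree)
 *	tkey_mismatch(a, 0, 0xC0A80180)		== 24	(first differing bit)
 *
 * and MASK_PFX(a, 24) keeps just the 24-bit prefix, 0xC0A80100.
 */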
224
225/*
226  To understand this stuff, an understanding of keys and all their bits is
227  necessary. Every node in the trie has a key associated with it, but not
228  all of the bits in that key are significant.
229
230  Consider a node 'n' and its parent 'tp'.
231
232  If n is a leaf, every bit in its key is significant. Its presence is
233  necessitated by path compression, since during a tree traversal (when
234  searching for a leaf - unless we are doing an insertion) we will completely
235  ignore all skipped bits we encounter. Thus we need to verify, at the end of
236  a potentially successful search, that we have indeed been walking the
237  correct key path.
238
239  Note that we can never "miss" the correct key in the tree if present by
240  following the wrong path. Path compression ensures that segments of the key
241  that are the same for all keys with a given prefix are skipped, but the
242  skipped part *is* identical for each node in the subtrie below the skipped
243  bit! trie_insert() in this implementation takes care of that - note the
244  call to tkey_sub_equals() in trie_insert().
245
246  If n is an internal node - a 'tnode' here - the various parts of its key
247  have many different meanings.
248
249  Example:
250  _________________________________________________________________
251  | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
252  -----------------------------------------------------------------
253    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
254
255  _________________________________________________________________
256  | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
257  -----------------------------------------------------------------
258   16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
259
260  tp->pos = 7
261  tp->bits = 3
262  n->pos = 15
263  n->bits = 4
264
265  First, let's just ignore the bits that come before the parent tp, that is
266  the bits from 0 to (tp->pos-1). They are *known* but at this point we do
267  not use them for anything.
268
269  The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
270  index into the parent's child array. That is, they will be used to find
271  'n' among tp's children.
272
273  The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
274  for the node n.
275
276  All the bits we have seen so far are significant to the node n. The rest
277  of the bits are really not needed or indeed known in n->key.
278
279  The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
280  n's child array, and will of course be different for each child.
281
282
283  The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
284  at this point.
285
286*/
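/*
  Tying the layout above to the helpers (illustration only): with the
  example values tp->pos = 7, tp->bits = 3, the parent locates n via

	cindex = tkey_extract_bits(n->key, tp->pos, tp->bits);
	n == tnode_get_child(tp, cindex);

  and n in turn indexes its own children with
  tkey_extract_bits(key, n->pos, n->bits), i.e. the "C" bits 15..18.
  The skipped "S" bits (10..14) are not examined while descending;
  insertion re-checks them with tkey_sub_equals() and a lookup verifies
  the full key once it reaches a leaf.
*/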
287
288static inline void check_tnode(const struct tnode *tn)
289{
290	WARN_ON(tn && tn->pos+tn->bits > 32);
291}
292
293static int halve_threshold = 25;
294static int inflate_threshold = 50;
295static int halve_threshold_root = 15;
296static int inflate_threshold_root = 25;
297
298
299static void __alias_free_mem(struct rcu_head *head)
300{
301	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
302	kmem_cache_free(fn_alias_kmem, fa);
303}
304
305static inline void alias_free_mem_rcu(struct fib_alias *fa)
306{
307	call_rcu(&fa->rcu, __alias_free_mem);
308}
309
310static void __leaf_free_rcu(struct rcu_head *head)
311{
312	kfree(container_of(head, struct leaf, rcu));
313}
314
315static void __leaf_info_free_rcu(struct rcu_head *head)
316{
317	kfree(container_of(head, struct leaf_info, rcu));
318}
319
320static inline void free_leaf_info(struct leaf_info *leaf)
321{
322	call_rcu(&leaf->rcu, __leaf_info_free_rcu);
323}
324
325static struct tnode *tnode_alloc(unsigned int size)
326{
327	struct page *pages;
328
329	if (size <= PAGE_SIZE)
330		return kcalloc(size, 1, GFP_KERNEL);
331
332	pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size));
333	if (!pages)
334		return NULL;
335
336	return page_address(pages);
337}
338
339static void __tnode_free_rcu(struct rcu_head *head)
340{
341	struct tnode *tn = container_of(head, struct tnode, rcu);
342	unsigned int size = sizeof(struct tnode) +
343		(1 << tn->bits) * sizeof(struct node *);
344
345	if (size <= PAGE_SIZE)
346		kfree(tn);
347	else
348		free_pages((unsigned long)tn, get_order(size));
349}
350
351static inline void tnode_free(struct tnode *tn)
352{
353	if (IS_LEAF(tn)) {
354		struct leaf *l = (struct leaf *) tn;
355		call_rcu_bh(&l->rcu, __leaf_free_rcu);
356	} else
357		call_rcu(&tn->rcu, __tnode_free_rcu);
358}
359
360static struct leaf *leaf_new(void)
361{
362	struct leaf *l = kmalloc(sizeof(struct leaf),  GFP_KERNEL);
363	if (l) {
364		l->parent = T_LEAF;
365		INIT_HLIST_HEAD(&l->list);
366	}
367	return l;
368}
369
370static struct leaf_info *leaf_info_new(int plen)
371{
372	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
373	if (li) {
374		li->plen = plen;
375		INIT_LIST_HEAD(&li->falh);
376	}
377	return li;
378}
379
380static struct tnode* tnode_new(t_key key, int pos, int bits)
381{
382	int nchildren = 1<<bits;
383	int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
384	struct tnode *tn = tnode_alloc(sz);
385
386	if (tn) {
387		memset(tn, 0, sz);
388		tn->parent = T_TNODE;
389		tn->pos = pos;
390		tn->bits = bits;
391		tn->key = key;
392		tn->full_children = 0;
393		tn->empty_children = 1<<bits;
394	}
395
396	pr_debug("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
397		 (unsigned int) (sizeof(struct node) * 1<<bits));
398	return tn;
399}
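/*
 * Allocation sketch (illustration only): a tnode with 'bits' index bits
 * carries 1 << bits child pointers, so tnode_new() asks tnode_alloc() for
 *
 *	sizeof(struct tnode) + (1 << bits) * sizeof(struct node *)
 *
 * bytes, e.g. 2 child slots for bits = 1 and 32 slots for bits = 5.
 * tnode_alloc() uses kcalloc() while this fits within PAGE_SIZE and falls
 * back to alloc_pages() for the larger arrays that inflate() can produce.
 */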
400
401/*
402 * Check whether a tnode 'n' is "full", i.e. it is an internal node
403 * and no bits are skipped. See discussion in dyntree paper p. 6
404 */
405
406static inline int tnode_full(const struct tnode *tn, const struct node *n)
407{
408	if (n == NULL || IS_LEAF(n))
409		return 0;
410
411	return ((struct tnode *) n)->pos == tn->pos + tn->bits;
412}
413
414static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
415{
416	tnode_put_child_reorg(tn, i, n, -1);
417}
418
419 /*
420  * Add a child at position i overwriting the old value.
421  * Update the value of full_children and empty_children.
422  */
423
424static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
425{
426	struct node *chi = tn->child[i];
427	int isfull;
428
429	BUG_ON(i >= 1<<tn->bits);
430
431
432	/* update emptyChildren */
433	if (n == NULL && chi != NULL)
434		tn->empty_children++;
435	else if (n != NULL && chi == NULL)
436		tn->empty_children--;
437
438	/* update fullChildren */
439	if (wasfull == -1)
440		wasfull = tnode_full(tn, chi);
441
442	isfull = tnode_full(tn, n);
443	if (wasfull && !isfull)
444		tn->full_children--;
445	else if (!wasfull && isfull)
446		tn->full_children++;
447
448	if (n)
449		NODE_SET_PARENT(n, tn);
450
451	rcu_assign_pointer(tn->child[i], n);
452}
453
454static struct node *resize(struct trie *t, struct tnode *tn)
455{
456	int i;
457	int err = 0;
458	struct tnode *old_tn;
459	int inflate_threshold_use;
460	int halve_threshold_use;
461
462	if (!tn)
463		return NULL;
464
465	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
466		 tn, inflate_threshold, halve_threshold);
467
468	/* No children */
469	if (tn->empty_children == tnode_child_length(tn)) {
470		tnode_free(tn);
471		return NULL;
472	}
473	/* One child */
474	if (tn->empty_children == tnode_child_length(tn) - 1)
475		for (i = 0; i < tnode_child_length(tn); i++) {
476			struct node *n;
477
478			n = tn->child[i];
479			if (!n)
480				continue;
481
482			/* compress one level */
483			NODE_SET_PARENT(n, NULL);
484			tnode_free(tn);
485			return n;
486		}
487	/*
488	 * Double as long as the resulting node has a number of
489	 * nonempty nodes that are above the threshold.
490	 */
491
492	/*
493	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
494	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
495	 * Telecommunications, page 6:
496	 * "A node is doubled if the ratio of non-empty children to all
497	 * children in the *doubled* node is at least 'high'."
498	 *
499	 * 'high' in this instance is the variable 'inflate_threshold'. It
500	 * is expressed as a percentage, so we multiply it with
501	 * tnode_child_length() and instead of multiplying by 2 (since the
502	 * child array will be doubled by inflate()) and multiplying
503	 * the left-hand side by 100 (to handle the percentage thing) we
504	 * multiply the left-hand side by 50.
505	 *
506	 * The left-hand side may look a bit weird: tnode_child_length(tn)
507	 * - tn->empty_children is of course the number of non-null children
508	 * in the current node. tn->full_children is the number of "full"
509	 * children, that is non-null tnodes with a skip value of 0.
510	 * All of those will be doubled in the resulting inflated tnode, so
511	 * we just count them one extra time here.
512	 *
513	 * A clearer way to write this would be:
514	 *
515	 * to_be_doubled = tn->full_children;
516	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
517	 *     tn->full_children;
518	 *
519	 * new_child_length = tnode_child_length(tn) * 2;
520	 *
521	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
522	 *      new_child_length;
523	 * if (new_fill_factor >= inflate_threshold)
524	 *
525	 * ...and so on, though it would mess up the while () loop.
526	 *
527	 * anyway,
528	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
529	 *      inflate_threshold
530	 *
531	 * avoid a division:
532	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
533	 *      inflate_threshold * new_child_length
534	 *
535	 * expand not_to_be_doubled and to_be_doubled, and shorten:
536	 * 100 * (tnode_child_length(tn) - tn->empty_children +
537	 *    tn->full_children) >= inflate_threshold * new_child_length
538	 *
539	 * expand new_child_length:
540	 * 100 * (tnode_child_length(tn) - tn->empty_children +
541	 *    tn->full_children) >=
542	 *      inflate_threshold * tnode_child_length(tn) * 2
543	 *
544	 * shorten again:
545	 * 50 * (tn->full_children + tnode_child_length(tn) -
546	 *    tn->empty_children) >= inflate_threshold *
547	 *    tnode_child_length(tn)
548	 *
549	 */
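	/*
	 * Numeric example of the final inequality (illustration only):
	 * a tnode with bits = 4 (16 child slots), 12 non-empty children
	 * of which 6 are full (so the full_children > 0 test also
	 * passes), and the default inflate_threshold of 50:
	 *
	 *	50 * (6 + 16 - 4) = 900  >=  50 * 16 = 800
	 *
	 * so the node is inflated.  After doubling, the expected fill of
	 * the 32-slot node is (6 + 2*6) / 32, about 56%, which is indeed
	 * at least the 50% threshold the rearranged formula encodes.
	 */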
550
551	check_tnode(tn);
552
553	/* Keep root node larger  */
554
555	if (!tn->parent)
556		inflate_threshold_use = inflate_threshold_root;
557	else
558		inflate_threshold_use = inflate_threshold;
559
560	err = 0;
561	while ((tn->full_children > 0 &&
562	       50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
563				inflate_threshold_use * tnode_child_length(tn))) {
564
565		old_tn = tn;
566		tn = inflate(t, tn);
567		if (IS_ERR(tn)) {
568			tn = old_tn;
569#ifdef CONFIG_IP_FIB_TRIE_STATS
570			t->stats.resize_node_skipped++;
571#endif
572			break;
573		}
574	}
575
576	check_tnode(tn);
577
578	/*
579	 * Halve as long as the number of empty children in this
580	 * node is above threshold.
581	 */
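	/*
	 * e.g. (illustration only) with the default halve_threshold of
	 * 25, a 16-slot tnode keeping only 3 non-empty children gives
	 *
	 *	100 * (16 - 13) = 300  <  25 * 16 = 400
	 *
	 * so the node is halved, repeatedly while the condition holds
	 * and bits > 1.
	 */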
582
583
584	/* Keep root node larger  */
585
586	if (!tn->parent)
587		halve_threshold_use = halve_threshold_root;
588	else
589		halve_threshold_use = halve_threshold;
590
591	err = 0;
592	while (tn->bits > 1 &&
593	       100 * (tnode_child_length(tn) - tn->empty_children) <
594	       halve_threshold_use * tnode_child_length(tn)) {
595
596		old_tn = tn;
597		tn = halve(t, tn);
598		if (IS_ERR(tn)) {
599			tn = old_tn;
600#ifdef CONFIG_IP_FIB_TRIE_STATS
601			t->stats.resize_node_skipped++;
602#endif
603			break;
604		}
605	}
606
607
608	/* Only one child remains */
609	if (tn->empty_children == tnode_child_length(tn) - 1)
610		for (i = 0; i < tnode_child_length(tn); i++) {
611			struct node *n;
612
613			n = tn->child[i];
614			if (!n)
615				continue;
616
617			/* compress one level */
618
619			NODE_SET_PARENT(n, NULL);
620			tnode_free(tn);
621			return n;
622		}
623
624	return (struct node *) tn;
625}
626
627static struct tnode *inflate(struct trie *t, struct tnode *tn)
628{
629	struct tnode *inode;
630	struct tnode *oldtnode = tn;
631	int olen = tnode_child_length(tn);
632	int i;
633
634	pr_debug("In inflate\n");
635
636	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
637
638	if (!tn)
639		return ERR_PTR(-ENOMEM);
640
641	/*
642	 * Preallocate and store tnodes before the actual work so we
643	 * don't get into an inconsistent state if memory allocation
644	 * fails. In case of failure we return the old node and the
645	 * inflate of the tnode is ignored.
646	 */
647
648	for (i = 0; i < olen; i++) {
649		struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
650
651		if (inode &&
652		    IS_TNODE(inode) &&
653		    inode->pos == oldtnode->pos + oldtnode->bits &&
654		    inode->bits > 1) {
655			struct tnode *left, *right;
656			t_key m = TKEY_GET_MASK(inode->pos, 1);
657
658			left = tnode_new(inode->key&(~m), inode->pos + 1,
659					 inode->bits - 1);
660			if (!left)
661				goto nomem;
662
663			right = tnode_new(inode->key|m, inode->pos + 1,
664					  inode->bits - 1);
665
666			if (!right) {
667				tnode_free(left);
668				goto nomem;
669			}
670
671			put_child(t, tn, 2*i, (struct node *) left);
672			put_child(t, tn, 2*i+1, (struct node *) right);
673		}
674	}
675
676	for (i = 0; i < olen; i++) {
677		struct node *node = tnode_get_child(oldtnode, i);
678		struct tnode *left, *right;
679		int size, j;
680
681		/* An empty child */
682		if (node == NULL)
683			continue;
684
685		/* A leaf or an internal node with skipped bits */
686
687		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
688		   tn->pos + tn->bits - 1) {
689			if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
690					     1) == 0)
691				put_child(t, tn, 2*i, node);
692			else
693				put_child(t, tn, 2*i+1, node);
694			continue;
695		}
696
697		/* An internal node with two children */
698		inode = (struct tnode *) node;
699
700		if (inode->bits == 1) {
701			put_child(t, tn, 2*i, inode->child[0]);
702			put_child(t, tn, 2*i+1, inode->child[1]);
703
704			tnode_free(inode);
705			continue;
706		}
707
708		/* An internal node with more than two children */
709
710		/* We will replace this node 'inode' with two new
711		 * ones, 'left' and 'right', each with half of the
712		 * original children. The two new nodes will have
713		 * a position one bit further down the key and this
714		 * means that the "significant" part of their keys
715		 * (see the discussion near the top of this file)
716		 * will differ by one bit, which will be "0" in
717		 * left's key and "1" in right's key. Since we are
718		 * moving the key position by one step, the bit that
719		 * we are moving away from - the bit at position
720		 * (inode->pos) - is the one that will differ between
721		 * left and right. So... we synthesize that bit in the
722		 * two  new keys.
723		 * The mask 'm' below will be a single "one" bit at
724		 * the position (inode->pos)
725		 */
726
727		/* Use the old key, but set the new significant
728		 *   bit to zero.
729		 */
730
731		left = (struct tnode *) tnode_get_child(tn, 2*i);
732		put_child(t, tn, 2*i, NULL);
733
734		BUG_ON(!left);
735
736		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
737		put_child(t, tn, 2*i+1, NULL);
738
739		BUG_ON(!right);
740
741		size = tnode_child_length(left);
742		for (j = 0; j < size; j++) {
743			put_child(t, left, j, inode->child[j]);
744			put_child(t, right, j, inode->child[j + size]);
745		}
746		put_child(t, tn, 2*i, resize(t, left));
747		put_child(t, tn, 2*i+1, resize(t, right));
748
749		tnode_free(inode);
750	}
751	tnode_free(oldtnode);
752	return tn;
753nomem:
754	{
755		int size = tnode_child_length(tn);
756		int j;
757
758		for (j = 0; j < size; j++)
759			if (tn->child[j])
760				tnode_free((struct tnode *)tn->child[j]);
761
762		tnode_free(tn);
763
764		return ERR_PTR(-ENOMEM);
765	}
766}
767
768static struct tnode *halve(struct trie *t, struct tnode *tn)
769{
770	struct tnode *oldtnode = tn;
771	struct node *left, *right;
772	int i;
773	int olen = tnode_child_length(tn);
774
775	pr_debug("In halve\n");
776
777	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
778
779	if (!tn)
780		return ERR_PTR(-ENOMEM);
781
782	/*
783	 * Preallocate and store tnodes before the actual work so we
784	 * don't get into an inconsistent state if memory allocation
785	 * fails. In case of failure we return the old node and the
786	 * halve of the tnode is ignored.
787	 */
788
789	for (i = 0; i < olen; i += 2) {
790		left = tnode_get_child(oldtnode, i);
791		right = tnode_get_child(oldtnode, i+1);
792
793		/* Two nonempty children */
794		if (left && right) {
795			struct tnode *newn;
796
797			newn = tnode_new(left->key, tn->pos + tn->bits, 1);
798
799			if (!newn)
800				goto nomem;
801
802			put_child(t, tn, i/2, (struct node *)newn);
803		}
804
805	}
806
807	for (i = 0; i < olen; i += 2) {
808		struct tnode *newBinNode;
809
810		left = tnode_get_child(oldtnode, i);
811		right = tnode_get_child(oldtnode, i+1);
812
813		/* At least one of the children is empty */
814		if (left == NULL) {
815			if (right == NULL)    /* Both are empty */
816				continue;
817			put_child(t, tn, i/2, right);
818			continue;
819		}
820
821		if (right == NULL) {
822			put_child(t, tn, i/2, left);
823			continue;
824		}
825
826		/* Two nonempty children */
827		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
828		put_child(t, tn, i/2, NULL);
829		put_child(t, newBinNode, 0, left);
830		put_child(t, newBinNode, 1, right);
831		put_child(t, tn, i/2, resize(t, newBinNode));
832	}
833	tnode_free(oldtnode);
834	return tn;
835nomem:
836	{
837		int size = tnode_child_length(tn);
838		int j;
839
840		for (j = 0; j < size; j++)
841			if (tn->child[j])
842				tnode_free((struct tnode *)tn->child[j]);
843
844		tnode_free(tn);
845
846		return ERR_PTR(-ENOMEM);
847	}
848}
849
850static void trie_init(struct trie *t)
851{
852	if (!t)
853		return;
854
855	t->size = 0;
856	rcu_assign_pointer(t->trie, NULL);
857	t->revision = 0;
858#ifdef CONFIG_IP_FIB_TRIE_STATS
859	memset(&t->stats, 0, sizeof(struct trie_use_stats));
860#endif
861}
862
863/* The read side must use rcu_read_lock; currently the readers are the
864 * dump routines, via get_fa_head and dump */
865
866static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
867{
868	struct hlist_head *head = &l->list;
869	struct hlist_node *node;
870	struct leaf_info *li;
871
872	hlist_for_each_entry_rcu(li, node, head, hlist)
873		if (li->plen == plen)
874			return li;
875
876	return NULL;
877}
878
879static inline struct list_head * get_fa_head(struct leaf *l, int plen)
880{
881	struct leaf_info *li = find_leaf_info(l, plen);
882
883	if (!li)
884		return NULL;
885
886	return &li->falh;
887}
888
889static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
890{
891	struct leaf_info *li = NULL, *last = NULL;
892	struct hlist_node *node;
893
894	if (hlist_empty(head)) {
895		hlist_add_head_rcu(&new->hlist, head);
896	} else {
897		hlist_for_each_entry(li, node, head, hlist) {
898			if (new->plen > li->plen)
899				break;
900
901			last = li;
902		}
903		if (last)
904			hlist_add_after_rcu(&last->hlist, &new->hlist);
905		else
906			hlist_add_before_rcu(&new->hlist, &li->hlist);
907	}
908}
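/*
 * Ordering sketch (illustration only): the leaf_info list is kept sorted
 * by decreasing prefix length, so a leaf that carries /32, /24 and /16
 * routes for the same key chains them as
 *
 *	plen 32 -> plen 24 -> plen 16
 *
 * which lets check_leaf() try the longest prefixes first.
 */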
909
910/* rcu_read_lock needs to be held by the caller on the read side */
911
912static struct leaf *
913fib_find_node(struct trie *t, u32 key)
914{
915	int pos;
916	struct tnode *tn;
917	struct node *n;
918
919	pos = 0;
920	n = rcu_dereference(t->trie);
921
922	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
923		tn = (struct tnode *) n;
924
925		check_tnode(tn);
926
927		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
928			pos = tn->pos + tn->bits;
929			n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
930		} else
931			break;
932	}
933	/* Case we have found a leaf. Compare prefixes */
934
935	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
936		return (struct leaf *)n;
937
938	return NULL;
939}
940
941static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
942{
943	int wasfull;
944	t_key cindex, key;
945	struct tnode *tp = NULL;
946
947	key = tn->key;
948
949	while (tn != NULL && NODE_PARENT(tn) != NULL) {
950
951		tp = NODE_PARENT(tn);
952		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
953		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
954		tn = (struct tnode *) resize(t, (struct tnode *)tn);
955		tnode_put_child_reorg((struct tnode *)tp, cindex, (struct node *)tn, wasfull);
956
957		if (!NODE_PARENT(tn))
958			break;
959
960		tn = NODE_PARENT(tn);
961	}
962	/* Handle last (top) tnode */
963	if (IS_TNODE(tn))
964		tn = (struct tnode*) resize(t, (struct tnode *)tn);
965
966	return (struct node*) tn;
967}
968
969/* only used from updater-side */
970
971static  struct list_head *
972fib_insert_node(struct trie *t, int *err, u32 key, int plen)
973{
974	int pos, newpos;
975	struct tnode *tp = NULL, *tn = NULL;
976	struct node *n;
977	struct leaf *l;
978	int missbit;
979	struct list_head *fa_head = NULL;
980	struct leaf_info *li;
981	t_key cindex;
982
983	pos = 0;
984	n = t->trie;
985
986	/* If we point to NULL, stop. Either the tree is empty and we should
987	 * just put a new leaf in it, or we have reached an empty child slot,
988	 * and we should just put our new leaf in that.
989	 * If we point to a T_TNODE, check if it matches our key. Note that
990	 * a T_TNODE might be skipping any number of bits - its 'pos' need
991	 * not be the parent's 'pos'+'bits'!
992	 *
993	 * If it does match the current key, get pos/bits from it, extract
994	 * the index from our key, push the T_TNODE and walk the tree.
995	 *
996	 * If it doesn't, we have to replace it with a new T_TNODE.
997	 *
998	 * If we point to a T_LEAF, it might or might not have the same key
999	 * as we do. If it does, just change the value, update the T_LEAF's
1000	 * value, and return it.
1001	 * If it doesn't, we need to replace it with a T_TNODE.
1002	 */
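	/*
	 * Descent sketch (illustration only): inserting 10.0.0.0/8 when
	 * the root tnode indexes bits 0..1 follows child
	 * tkey_extract_bits(0x0A000000, 0, 2) == 0, and the walk below
	 * then ends in an empty slot (case 2), a leaf with the same key
	 * (case 1), or a key mismatch that needs a new 1-bit tnode
	 * (case 3).
	 */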
1003
1004	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
1005		tn = (struct tnode *) n;
1006
1007		check_tnode(tn);
1008
1009		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1010			tp = tn;
1011			pos = tn->pos + tn->bits;
1012			n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
1013
1014			BUG_ON(n && NODE_PARENT(n) != tn);
1015		} else
1016			break;
1017	}
1018
1019	/*
1020	 * n  ----> NULL, LEAF or TNODE
1021	 *
1022	 * tp is n's (parent) ----> NULL or TNODE
1023	 */
1024
1025	BUG_ON(tp && IS_LEAF(tp));
1026
1027	/* Case 1: n is a leaf. Compare prefixes */
1028
1029	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1030		struct leaf *l = (struct leaf *) n;
1031
1032		li = leaf_info_new(plen);
1033
1034		if (!li) {
1035			*err = -ENOMEM;
1036			goto err;
1037		}
1038
1039		fa_head = &li->falh;
1040		insert_leaf_info(&l->list, li);
1041		goto done;
1042	}
1043	t->size++;
1044	l = leaf_new();
1045
1046	if (!l) {
1047		*err = -ENOMEM;
1048		goto err;
1049	}
1050
1051	l->key = key;
1052	li = leaf_info_new(plen);
1053
1054	if (!li) {
1055		tnode_free((struct tnode *) l);
1056		*err = -ENOMEM;
1057		goto err;
1058	}
1059
1060	fa_head = &li->falh;
1061	insert_leaf_info(&l->list, li);
1062
1063	if (t->trie && n == NULL) {
1064		/* Case 2: n is NULL, and will just insert a new leaf */
1065
1066		NODE_SET_PARENT(l, tp);
1067
1068		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1069		put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1070	} else {
1071		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1072		/*
1073		 *  Add a new tnode here
1074		 *  the first tnode needs some special handling
1075		 */
1076
1077		if (tp)
1078			pos = tp->pos+tp->bits;
1079		else
1080			pos = 0;
1081
1082		if (n) {
1083			newpos = tkey_mismatch(key, pos, n->key);
1084			tn = tnode_new(n->key, newpos, 1);
1085		} else {
1086			newpos = 0;
1087			tn = tnode_new(key, newpos, 1); /* First tnode */
1088		}
1089
1090		if (!tn) {
1091			free_leaf_info(li);
1092			tnode_free((struct tnode *) l);
1093			*err = -ENOMEM;
1094			goto err;
1095		}
1096
1097		NODE_SET_PARENT(tn, tp);
1098
1099		missbit = tkey_extract_bits(key, newpos, 1);
1100		put_child(t, tn, missbit, (struct node *)l);
1101		put_child(t, tn, 1-missbit, n);
1102
1103		if (tp) {
1104			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1105			put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
1106		} else {
1107			rcu_assign_pointer(t->trie, (struct node *)tn); /* First tnode */
1108			tp = tn;
1109		}
1110	}
1111
1112	if (tp && tp->pos + tp->bits > 32)
1113		printk(KERN_WARNING "fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1114		       tp, tp->pos, tp->bits, key, plen);
1115
1116	/* Rebalance the trie */
1117
1118	rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1119done:
1120	t->revision++;
1121err:
1122	return fa_head;
1123}
1124
1125/*
1126 * Caller must hold RTNL.
1127 */
1128static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
1129{
1130	struct trie *t = (struct trie *) tb->tb_data;
1131	struct fib_alias *fa, *new_fa;
1132	struct list_head *fa_head = NULL;
1133	struct fib_info *fi;
1134	int plen = cfg->fc_dst_len;
1135	u8 tos = cfg->fc_tos;
1136	u32 key, mask;
1137	int err;
1138	struct leaf *l;
1139
1140	if (plen > 32)
1141		return -EINVAL;
1142
1143	key = ntohl(cfg->fc_dst);
1144
1145	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1146
1147	mask = ntohl(inet_make_mask(plen));
1148
1149	if (key & ~mask)
1150		return -EINVAL;
1151
1152	key = key & mask;
1153
1154	fi = fib_create_info(cfg);
1155	if (IS_ERR(fi)) {
1156		err = PTR_ERR(fi);
1157		goto err;
1158	}
1159
1160	l = fib_find_node(t, key);
1161	fa = NULL;
1162
1163	if (l) {
1164		fa_head = get_fa_head(l, plen);
1165		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1166	}
1167
1168	/* Now fa, if non-NULL, points to the first fib alias
1169	 * with the same keys [prefix,tos,priority], if such key already
1170	 * exists or to the node before which we will insert new one.
1171	 *
1172	 * If fa is NULL, we will need to allocate a new one and
1173	 * insert to the head of f.
1174	 *
1175	 * If f is NULL, no fib node matched the destination key
1176	 * and we need to allocate a new one of those as well.
1177	 */
1178
1179	if (fa && fa->fa_info->fib_priority == fi->fib_priority) {
1180		struct fib_alias *fa_orig;
1181
1182		err = -EEXIST;
1183		if (cfg->fc_nlflags & NLM_F_EXCL)
1184			goto out;
1185
1186		if (cfg->fc_nlflags & NLM_F_REPLACE) {
1187			struct fib_info *fi_drop;
1188			u8 state;
1189
1190			err = -ENOBUFS;
1191			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1192			if (new_fa == NULL)
1193				goto out;
1194
1195			fi_drop = fa->fa_info;
1196			new_fa->fa_tos = fa->fa_tos;
1197			new_fa->fa_info = fi;
1198			new_fa->fa_type = cfg->fc_type;
1199			new_fa->fa_scope = cfg->fc_scope;
1200			state = fa->fa_state;
1201			new_fa->fa_state = state & ~FA_S_ACCESSED;
1202
1203			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1204			alias_free_mem_rcu(fa);
1205
1206			fib_release_info(fi_drop);
1207			if (state & FA_S_ACCESSED)
1208				rt_cache_flush(-1);
1209
1210			goto succeeded;
1211		}
1212		/* Error if we find a perfect match which
1213		 * uses the same scope, type, and nexthop
1214		 * information.
1215		 */
1216		fa_orig = fa;
1217		list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
1218			if (fa->fa_tos != tos)
1219				break;
1220			if (fa->fa_info->fib_priority != fi->fib_priority)
1221				break;
1222			if (fa->fa_type == cfg->fc_type &&
1223			    fa->fa_scope == cfg->fc_scope &&
1224			    fa->fa_info == fi) {
1225				goto out;
1226			}
1227		}
1228		if (!(cfg->fc_nlflags & NLM_F_APPEND))
1229			fa = fa_orig;
1230	}
1231	err = -ENOENT;
1232	if (!(cfg->fc_nlflags & NLM_F_CREATE))
1233		goto out;
1234
1235	err = -ENOBUFS;
1236	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1237	if (new_fa == NULL)
1238		goto out;
1239
1240	new_fa->fa_info = fi;
1241	new_fa->fa_tos = tos;
1242	new_fa->fa_type = cfg->fc_type;
1243	new_fa->fa_scope = cfg->fc_scope;
1244	new_fa->fa_state = 0;
1245	/*
1246	 * Insert new entry to the list.
1247	 */
1248
1249	if (!fa_head) {
1250		err = 0;
1251		fa_head = fib_insert_node(t, &err, key, plen);
1252		if (err)
1253			goto out_free_new_fa;
1254	}
1255
1256	list_add_tail_rcu(&new_fa->fa_list,
1257			  (fa ? &fa->fa_list : fa_head));
1258
1259	rt_cache_flush(-1);
1260	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1261		  &cfg->fc_nlinfo);
1262succeeded:
1263	return 0;
1264
1265out_free_new_fa:
1266	kmem_cache_free(fn_alias_kmem, new_fa);
1267out:
1268	fib_release_info(fi);
1269err:
1270	return err;
1271}
1272
1273
1274/* should be called with rcu_read_lock */
1275static inline int check_leaf(struct trie *t, struct leaf *l,
1276			     t_key key, int *plen, const struct flowi *flp,
1277			     struct fib_result *res)
1278{
1279	int err, i;
1280	__be32 mask;
1281	struct leaf_info *li;
1282	struct hlist_head *hhead = &l->list;
1283	struct hlist_node *node;
1284
1285	hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1286		i = li->plen;
1287		mask = inet_make_mask(i);
1288		if (l->key != (key & ntohl(mask)))
1289			continue;
1290
1291		if ((err = fib_semantic_match(&li->falh, flp, res, htonl(l->key), mask, i)) <= 0) {
1292			*plen = i;
1293#ifdef CONFIG_IP_FIB_TRIE_STATS
1294			t->stats.semantic_match_passed++;
1295#endif
1296			return err;
1297		}
1298#ifdef CONFIG_IP_FIB_TRIE_STATS
1299		t->stats.semantic_match_miss++;
1300#endif
1301	}
1302	return 1;
1303}
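/*
 * Mask check example (illustration only): for a leaf_info with plen = 24
 * and a lookup key of 0xC0A80142 (192.168.1.66),
 *
 *	ntohl(inet_make_mask(24))	== 0xFFFFFF00
 *	key & ntohl(mask)		== 0xC0A80100
 *
 * so only a leaf whose key is 0xC0A80100 can satisfy that prefix; the
 * remaining per-route attributes are then left to fib_semantic_match().
 */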
1304
1305static int
1306fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1307{
1308	struct trie *t = (struct trie *) tb->tb_data;
1309	int plen, ret = 0;
1310	struct node *n;
1311	struct tnode *pn;
1312	int pos, bits;
1313	t_key key = ntohl(flp->fl4_dst);
1314	int chopped_off;
1315	t_key cindex = 0;
1316	int current_prefix_length = KEYLENGTH;
1317	struct tnode *cn;
1318	t_key node_prefix, key_prefix, pref_mismatch;
1319	int mp;
1320
1321	rcu_read_lock();
1322
1323	n = rcu_dereference(t->trie);
1324	if (!n)
1325		goto failed;
1326
1327#ifdef CONFIG_IP_FIB_TRIE_STATS
1328	t->stats.gets++;
1329#endif
1330
1331	/* Just a leaf? */
1332	if (IS_LEAF(n)) {
1333		if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1334			goto found;
1335		goto failed;
1336	}
1337	pn = (struct tnode *) n;
1338	chopped_off = 0;
1339
1340	while (pn) {
1341		pos = pn->pos;
1342		bits = pn->bits;
1343
1344		if (!chopped_off)
1345			cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits);
1346
1347		n = tnode_get_child(pn, cindex);
1348
1349		if (n == NULL) {
1350#ifdef CONFIG_IP_FIB_TRIE_STATS
1351			t->stats.null_node_hit++;
1352#endif
1353			goto backtrace;
1354		}
1355
1356		if (IS_LEAF(n)) {
1357			if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1358				goto found;
1359			else
1360				goto backtrace;
1361		}
1362
1363#define HL_OPTIMIZE
1364#ifdef HL_OPTIMIZE
1365		cn = (struct tnode *)n;
1366
1367		/*
1368		 * It's a tnode, and we can do some extra checks here if we
1369		 * like, to avoid descending into a dead-end branch.
1370		 * This tnode is in the parent's child array at index
1371		 * key[p_pos..p_pos+p_bits] but potentially with some bits
1372		 * chopped off, so in reality the index may be just a
1373		 * subprefix, padded with zero at the end.
1374		 * We can also take a look at any skipped bits in this
1375		 * tnode - everything up to p_pos is supposed to be ok,
1376		 * and the non-chopped bits of the index (see previous
1377		 * paragraph) are also guaranteed ok, but the rest is
1378		 * considered unknown.
1379		 *
1380		 * The skipped bits are key[pos+bits..cn->pos].
1381		 */
1382
1383		/* If current_prefix_length < pos+bits, we are already doing
1384		 * actual prefix  matching, which means everything from
1385		 * pos+(bits-chopped_off) onward must be zero along some
1386		 * branch of this subtree - otherwise there is *no* valid
1387		 * prefix present. Here we can only check the skipped
1388		 * bits. Remember, since we have already indexed into the
1389		 * parent's child array, we know that the bits we chopped off
1390		 * *are* zero.
1391		 */
1392
1393		/* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
1394
1395		if (current_prefix_length < pos+bits) {
1396			if (tkey_extract_bits(cn->key, current_prefix_length,
1397						cn->pos - current_prefix_length) != 0 ||
1398			    !(cn->child[0]))
1399				goto backtrace;
1400		}
1401
1402		/*
1403		 * If chopped_off=0, the index is fully validated and we
1404		 * only need to look at the skipped bits for this, the new,
1405		 * tnode. What we actually want to do is to find out if
1406		 * these skipped bits match our key perfectly, or if we will
1407		 * have to count on finding a matching prefix further down,
1408		 * because if we do, we would like to have some way of
1409		 * verifying the existence of such a prefix at this point.
1410		 */
1411
1412		/* The only thing we can do at this point is to verify that
1413		 * any such matching prefix can indeed be a prefix to our
1414		 * key, and if the bits in the node we are inspecting that
1415		 * do not match our key are not ZERO, this cannot be true.
1416		 * Thus, find out where there is a mismatch (before cn->pos)
1417		 * and verify that all the mismatching bits are zero in the
1418		 * new tnode's key.
1419		 */
1420
1421		/* Note: We aren't very concerned about the piece of the key
1422		 * that precedes pn->pos+pn->bits, since these have already been
1423		 * checked. The bits after cn->pos aren't checked since these are
1424		 * by definition "unknown" at this point. Thus, what we want to
1425		 * see is if we are about to enter the "prefix matching" state,
1426		 * and in that case verify that the skipped bits that will prevail
1427		 * throughout this subtree are zero, as they have to be if we are
1428		 * to find a matching prefix.
1429		 */
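		/*
		 * Example (illustration only): suppose cn->pos is 24, the
		 * first 24 bits of cn->key are 0xC0A801.. and those of the
		 * search key are 0xC0A800.. .  Then
		 *
		 *	pref_mismatch = MASK_PFX(key, 24) ^ MASK_PFX(cn->key, 24)
		 *	              = 0x00000100
		 *
		 * so mp becomes 23 and tkey_extract_bits(cn->key, 23, 1) == 1:
		 * cn carries a '1' where our key has a '0', no prefix of our
		 * key can live below cn, and we take the backtrace path.
		 */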
1430
1431		node_prefix = MASK_PFX(cn->key, cn->pos);
1432		key_prefix = MASK_PFX(key, cn->pos);
1433		pref_mismatch = key_prefix^node_prefix;
1434		mp = 0;
1435
1436		/* In short: If skipped bits in this node do not match the search
1437		 * key, enter the "prefix matching" state directly.
1438		 */
1439		if (pref_mismatch) {
1440			while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1441				mp++;
1442				pref_mismatch = pref_mismatch <<1;
1443			}
1444			key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1445
1446			if (key_prefix != 0)
1447				goto backtrace;
1448
1449			if (current_prefix_length >= cn->pos)
1450				current_prefix_length = mp;
1451		}
1452#endif
1453		pn = (struct tnode *)n; /* Descend */
1454		chopped_off = 0;
1455		continue;
1456
1457backtrace:
1458		chopped_off++;
1459
1460		/* Chopped-off bits that are zero don't change the child key (cindex) */
1461		while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1))))
1462			chopped_off++;
1463
1464		/* Decrease current_... with bits chopped off */
1465		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1466			current_prefix_length = pn->pos + pn->bits - chopped_off;
1467
1468		/*
1469		 * Either we do the actual chop-off here, or, if we have
1470		 * chopped off all the bits in this tnode, we walk up to our parent.
1471		 */
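		/*
		 * e.g. (illustration only) with cindex = binary 0110 in a
		 * 4-bit tnode: the loop above skips the trailing zero bit,
		 * bit 1 is cleared below to give 0100, a later backtrack
		 * clears bit 2 to give 0000, and only once all 4 bits have
		 * been chopped off do we climb to the parent.
		 */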
1472
1473		if (chopped_off <= pn->bits) {
1474			cindex &= ~(1 << (chopped_off-1));
1475		} else {
1476			if (NODE_PARENT(pn) == NULL)
1477				goto failed;
1478
1479			/* Get Child's index */
1480			cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits);
1481			pn = NODE_PARENT(pn);
1482			chopped_off = 0;
1483
1484#ifdef CONFIG_IP_FIB_TRIE_STATS
1485			t->stats.backtrack++;
1486#endif
1487			goto backtrace;
1488		}
1489	}
1490failed:
1491	ret = 1;
1492found:
1493	rcu_read_unlock();
1494	return ret;
1495}
1496
1497/* only called from updater side */
1498static int trie_leaf_remove(struct trie *t, t_key key)
1499{
1500	t_key cindex;
1501	struct tnode *tp = NULL;
1502	struct node *n = t->trie;
1503	struct leaf *l;
1504
1505	pr_debug("entering trie_leaf_remove(%p)\n", n);
1506
1507	/* Note that in the case of skipped bits, those bits are *not* checked!
1508	 * When we finish this, we will have NULL or a T_LEAF, and the
1509	 * T_LEAF may or may not match our key.
1510	 */
1511
1512	while (n != NULL && IS_TNODE(n)) {
1513		struct tnode *tn = (struct tnode *) n;
1514		check_tnode(tn);
1515		n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
1516
1517		BUG_ON(n && NODE_PARENT(n) != tn);
1518	}
1519	l = (struct leaf *) n;
1520
1521	if (!n || !tkey_equals(l->key, key))
1522		return 0;
1523
1524	/*
1525	 * Key found.
1526	 * Remove the leaf and rebalance the tree
1527	 */
1528
1529	t->revision++;
1530	t->size--;
1531
1532	tp = NODE_PARENT(n);
1533	tnode_free((struct tnode *) n);
1534
1535	if (tp) {
1536		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1537		put_child(t, (struct tnode *)tp, cindex, NULL);
1538		rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1539	} else
1540		rcu_assign_pointer(t->trie, NULL);
1541
1542	return 1;
1543}
1544
1545/*
1546 * Caller must hold RTNL.
1547 */
1548static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg)
1549{
1550	struct trie *t = (struct trie *) tb->tb_data;
1551	u32 key, mask;
1552	int plen = cfg->fc_dst_len;
1553	u8 tos = cfg->fc_tos;
1554	struct fib_alias *fa, *fa_to_delete;
1555	struct list_head *fa_head;
1556	struct leaf *l;
1557	struct leaf_info *li;
1558
1559	if (plen > 32)
1560		return -EINVAL;
1561
1562	key = ntohl(cfg->fc_dst);
1563	mask = ntohl(inet_make_mask(plen));
1564
1565	if (key & ~mask)
1566		return -EINVAL;
1567
1568	key = key & mask;
1569	l = fib_find_node(t, key);
1570
1571	if (!l)
1572		return -ESRCH;
1573
1574	fa_head = get_fa_head(l, plen);
1575	fa = fib_find_alias(fa_head, tos, 0);
1576
1577	if (!fa)
1578		return -ESRCH;
1579
1580	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1581
1582	fa_to_delete = NULL;
1583	fa_head = fa->fa_list.prev;
1584
1585	list_for_each_entry(fa, fa_head, fa_list) {
1586		struct fib_info *fi = fa->fa_info;
1587
1588		if (fa->fa_tos != tos)
1589			break;
1590
1591		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1592		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1593		     fa->fa_scope == cfg->fc_scope) &&
1594		    (!cfg->fc_protocol ||
1595		     fi->fib_protocol == cfg->fc_protocol) &&
1596		    fib_nh_match(cfg, fi) == 0) {
1597			fa_to_delete = fa;
1598			break;
1599		}
1600	}
1601
1602	if (!fa_to_delete)
1603		return -ESRCH;
1604
1605	fa = fa_to_delete;
1606	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1607		  &cfg->fc_nlinfo);
1608
1609	l = fib_find_node(t, key);
1610	li = find_leaf_info(l, plen);
1611
1612	list_del_rcu(&fa->fa_list);
1613
1614	if (list_empty(fa_head)) {
1615		hlist_del_rcu(&li->hlist);
1616		free_leaf_info(li);
1617	}
1618
1619	if (hlist_empty(&l->list))
1620		trie_leaf_remove(t, key);
1621
1622	if (fa->fa_state & FA_S_ACCESSED)
1623		rt_cache_flush(-1);
1624
1625	fib_release_info(fa->fa_info);
1626	alias_free_mem_rcu(fa);
1627	return 0;
1628}
1629
1630static int trie_flush_list(struct trie *t, struct list_head *head)
1631{
1632	struct fib_alias *fa, *fa_node;
1633	int found = 0;
1634
1635	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1636		struct fib_info *fi = fa->fa_info;
1637
1638		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1639			list_del_rcu(&fa->fa_list);
1640			fib_release_info(fa->fa_info);
1641			alias_free_mem_rcu(fa);
1642			found++;
1643		}
1644	}
1645	return found;
1646}
1647
1648static int trie_flush_leaf(struct trie *t, struct leaf *l)
1649{
1650	int found = 0;
1651	struct hlist_head *lih = &l->list;
1652	struct hlist_node *node, *tmp;
1653	struct leaf_info *li = NULL;
1654
1655	hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1656		found += trie_flush_list(t, &li->falh);
1657
1658		if (list_empty(&li->falh)) {
1659			hlist_del_rcu(&li->hlist);
1660			free_leaf_info(li);
1661		}
1662	}
1663	return found;
1664}
1665
1666/* rcu_read_lock needs to be hold by caller from readside */
1667
1668static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1669{
1670	struct node *c = (struct node *) thisleaf;
1671	struct tnode *p;
1672	int idx;
1673	struct node *trie = rcu_dereference(t->trie);
1674
1675	if (c == NULL) {
1676		if (trie == NULL)
1677			return NULL;
1678
1679		if (IS_LEAF(trie))          /* trie w. just a leaf */
1680			return (struct leaf *) trie;
1681
1682		p = (struct tnode*) trie;  /* Start */
1683	} else
1684		p = (struct tnode *) NODE_PARENT(c);
1685
1686	while (p) {
1687		int pos, last;
1688
1689		/*  Find the next child of the parent */
1690		if (c)
1691			pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
1692		else
1693			pos = 0;
1694
1695		last = 1 << p->bits;
1696		for (idx = pos; idx < last ; idx++) {
1697			c = rcu_dereference(p->child[idx]);
1698
1699			if (!c)
1700				continue;
1701
1702			/* Descend if tnode */
1703			while (IS_TNODE(c)) {
1704				p = (struct tnode *) c;
1705				idx = 0;
1706
1707				/* Leftmost non-NULL branch */
1708				if (p && IS_TNODE(p))
1709					while (idx < (1<<p->bits) &&
1710					       !(c = rcu_dereference(p->child[idx]))) idx++;
1711
1712				/* Done with this tnode? */
1713				if (idx >= (1 << p->bits) || !c)
1714					goto up;
1715			}
1716			return (struct leaf *) c;
1717		}
1718up:
1719		/* No more children, go up one step */
1720		c = (struct node *) p;
1721		p = (struct tnode *) NODE_PARENT(p);
1722	}
1723	return NULL; /* Ready. Root of trie */
1724}
1725
1726/*
1727 * Caller must hold RTNL.
1728 */
1729static int fn_trie_flush(struct fib_table *tb)
1730{
1731	struct trie *t = (struct trie *) tb->tb_data;
1732	struct leaf *ll = NULL, *l = NULL;
1733	int found = 0, h;
1734
1735	t->revision++;
1736
1737	for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1738		found += trie_flush_leaf(t, l);
1739
1740		if (ll && hlist_empty(&ll->list))
1741			trie_leaf_remove(t, ll->key);
1742		ll = l;
1743	}
1744
1745	if (ll && hlist_empty(&ll->list))
1746		trie_leaf_remove(t, ll->key);
1747
1748	pr_debug("trie_flush found=%d\n", found);
1749	return found;
1750}
1751
1752static int trie_last_dflt = -1;
1753
1754static void
1755fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1756{
1757	struct trie *t = (struct trie *) tb->tb_data;
1758	int order, last_idx;
1759	struct fib_info *fi = NULL;
1760	struct fib_info *last_resort;
1761	struct fib_alias *fa = NULL;
1762	struct list_head *fa_head;
1763	struct leaf *l;
1764
1765	last_idx = -1;
1766	last_resort = NULL;
1767	order = -1;
1768
1769	rcu_read_lock();
1770
1771	l = fib_find_node(t, 0);
1772	if (!l)
1773		goto out;
1774
1775	fa_head = get_fa_head(l, 0);
1776	if (!fa_head)
1777		goto out;
1778
1779	if (list_empty(fa_head))
1780		goto out;
1781
1782	list_for_each_entry_rcu(fa, fa_head, fa_list) {
1783		struct fib_info *next_fi = fa->fa_info;
1784
1785		if (fa->fa_scope != res->scope ||
1786		    fa->fa_type != RTN_UNICAST)
1787			continue;
1788
1789		if (next_fi->fib_priority > res->fi->fib_priority)
1790			break;
1791		if (!next_fi->fib_nh[0].nh_gw ||
1792		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1793			continue;
1794		fa->fa_state |= FA_S_ACCESSED;
1795
1796		if (fi == NULL) {
1797			if (next_fi != res->fi)
1798				break;
1799		} else if (!fib_detect_death(fi, order, &last_resort,
1800					     &last_idx, &trie_last_dflt)) {
1801			if (res->fi)
1802				fib_info_put(res->fi);
1803			res->fi = fi;
1804			atomic_inc(&fi->fib_clntref);
1805			trie_last_dflt = order;
1806			goto out;
1807		}
1808		fi = next_fi;
1809		order++;
1810	}
1811	if (order <= 0 || fi == NULL) {
1812		trie_last_dflt = -1;
1813		goto out;
1814	}
1815
1816	if (!fib_detect_death(fi, order, &last_resort, &last_idx, &trie_last_dflt)) {
1817		if (res->fi)
1818			fib_info_put(res->fi);
1819		res->fi = fi;
1820		atomic_inc(&fi->fib_clntref);
1821		trie_last_dflt = order;
1822		goto out;
1823	}
1824	if (last_idx >= 0) {
1825		if (res->fi)
1826			fib_info_put(res->fi);
1827		res->fi = last_resort;
1828		if (last_resort)
1829			atomic_inc(&last_resort->fib_clntref);
1830	}
1831	trie_last_dflt = last_idx;
1832 out:;
1833	rcu_read_unlock();
1834}
1835
1836static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
1837			   struct sk_buff *skb, struct netlink_callback *cb)
1838{
1839	int i, s_i;
1840	struct fib_alias *fa;
1841
1842	__be32 xkey = htonl(key);
1843
1844	s_i = cb->args[4];
1845	i = 0;
1846
1847	/* rcu_read_lock is held by the caller */
1848
1849	list_for_each_entry_rcu(fa, fah, fa_list) {
1850		if (i < s_i) {
1851			i++;
1852			continue;
1853		}
1854		BUG_ON(!fa->fa_info);
1855
1856		if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1857				  cb->nlh->nlmsg_seq,
1858				  RTM_NEWROUTE,
1859				  tb->tb_id,
1860				  fa->fa_type,
1861				  fa->fa_scope,
1862				  xkey,
1863				  plen,
1864				  fa->fa_tos,
1865				  fa->fa_info, 0) < 0) {
1866			cb->args[4] = i;
1867			return -1;
1868		}
1869		i++;
1870	}
1871	cb->args[4] = i;
1872	return skb->len;
1873}
1874
1875static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
1876			     struct netlink_callback *cb)
1877{
1878	int h, s_h;
1879	struct list_head *fa_head;
1880	struct leaf *l = NULL;
1881
1882	s_h = cb->args[3];
1883
1884	for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1885		if (h < s_h)
1886			continue;
1887		if (h > s_h)
1888			memset(&cb->args[4], 0,
1889			       sizeof(cb->args) - 4*sizeof(cb->args[0]));
1890
1891		fa_head = get_fa_head(l, plen);
1892
1893		if (!fa_head)
1894			continue;
1895
1896		if (list_empty(fa_head))
1897			continue;
1898
1899		if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
1900			cb->args[3] = h;
1901			return -1;
1902		}
1903	}
1904	cb->args[3] = h;
1905	return skb->len;
1906}
1907
1908static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
1909{
1910	int m, s_m;
1911	struct trie *t = (struct trie *) tb->tb_data;
1912
1913	s_m = cb->args[2];
1914
1915	rcu_read_lock();
1916	for (m = 0; m <= 32; m++) {
1917		if (m < s_m)
1918			continue;
1919		if (m > s_m)
1920			memset(&cb->args[3], 0,
1921				sizeof(cb->args) - 3*sizeof(cb->args[0]));
1922
1923		if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
1924			cb->args[2] = m;
1925			goto out;
1926		}
1927	}
1928	rcu_read_unlock();
1929	cb->args[2] = m;
1930	return skb->len;
1931out:
1932	rcu_read_unlock();
1933	return -1;
1934}
1935
1936/* Fix more generic FIB names for init later */
1937
1938#ifdef CONFIG_IP_MULTIPLE_TABLES
1939struct fib_table * fib_hash_init(u32 id)
1940#else
1941struct fib_table * __init fib_hash_init(u32 id)
1942#endif
1943{
1944	struct fib_table *tb;
1945	struct trie *t;
1946
1947	if (fn_alias_kmem == NULL)
1948		fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1949						  sizeof(struct fib_alias),
1950						  0, SLAB_HWCACHE_ALIGN,
1951						  NULL, NULL);
1952
1953	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1954		     GFP_KERNEL);
1955	if (tb == NULL)
1956		return NULL;
1957
1958	tb->tb_id = id;
1959	tb->tb_lookup = fn_trie_lookup;
1960	tb->tb_insert = fn_trie_insert;
1961	tb->tb_delete = fn_trie_delete;
1962	tb->tb_flush = fn_trie_flush;
1963	tb->tb_select_default = fn_trie_select_default;
1964	tb->tb_dump = fn_trie_dump;
1965	memset(tb->tb_data, 0, sizeof(struct trie));
1966
1967	t = (struct trie *) tb->tb_data;
1968
1969	trie_init(t);
1970
1971	if (id == RT_TABLE_LOCAL)
1972		trie_local = t;
1973	else if (id == RT_TABLE_MAIN)
1974		trie_main = t;
1975
1976	if (id == RT_TABLE_LOCAL)
1977		printk(KERN_INFO "IPv4 FIB: Using LC-trie version %s\n", VERSION);
1978
1979	return tb;
1980}
1981
1982#ifdef CONFIG_PROC_FS
1983/* Depth first Trie walk iterator */
1984struct fib_trie_iter {
1985	struct tnode *tnode;
1986	struct trie *trie;
1987	unsigned index;
1988	unsigned depth;
1989};
1990
1991static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
1992{
1993	struct tnode *tn = iter->tnode;
1994	unsigned cindex = iter->index;
1995	struct tnode *p;
1996
1997	/* A single entry routing table */
1998	if (!tn)
1999		return NULL;
2000
2001	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2002		 iter->tnode, iter->index, iter->depth);
2003rescan:
2004	while (cindex < (1<<tn->bits)) {
2005		struct node *n = tnode_get_child(tn, cindex);
2006
2007		if (n) {
2008			if (IS_LEAF(n)) {
2009				iter->tnode = tn;
2010				iter->index = cindex + 1;
2011			} else {
2012				/* push down one level */
2013				iter->tnode = (struct tnode *) n;
2014				iter->index = 0;
2015				++iter->depth;
2016			}
2017			return n;
2018		}
2019
2020		++cindex;
2021	}
2022
2023	/* Current node exhausted, pop back up */
2024	p = NODE_PARENT(tn);
2025	if (p) {
2026		cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2027		tn = p;
2028		--iter->depth;
2029		goto rescan;
2030	}
2031
2032	/* got root? */
2033	return NULL;
2034}
2035
2036static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2037				       struct trie *t)
2038{
2039	struct node *n;
2040
2041	if (!t)
2042		return NULL;
2043
2044	n = rcu_dereference(t->trie);
2045
2046	if (!iter)
2047		return NULL;
2048
2049	if (n) {
2050		if (IS_TNODE(n)) {
2051			iter->tnode = (struct tnode *) n;
2052			iter->trie = t;
2053			iter->index = 0;
2054			iter->depth = 1;
2055		} else {
2056			iter->tnode = NULL;
2057			iter->trie  = t;
2058			iter->index = 0;
2059			iter->depth = 0;
2060		}
2061		return n;
2062	}
2063	return NULL;
2064}
2065
2066static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2067{
2068	struct node *n;
2069	struct fib_trie_iter iter;
2070
2071	memset(s, 0, sizeof(*s));
2072
2073	rcu_read_lock();
2074	for (n = fib_trie_get_first(&iter, t); n;
2075	     n = fib_trie_get_next(&iter)) {
2076		if (IS_LEAF(n)) {
2077			s->leaves++;
2078			s->totdepth += iter.depth;
2079			if (iter.depth > s->maxdepth)
2080				s->maxdepth = iter.depth;
2081		} else {
2082			const struct tnode *tn = (const struct tnode *) n;
2083			int i;
2084
2085			s->tnodes++;
2086			if (tn->bits < MAX_STAT_DEPTH)
2087				s->nodesizes[tn->bits]++;
2088
2089			for (i = 0; i < (1<<tn->bits); i++)
2090				if (!tn->child[i])
2091					s->nullpointers++;
2092		}
2093	}
2094	rcu_read_unlock();
2095}
2096
2097/*
2098 *	This outputs /proc/net/fib_triestats
2099 */
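/*
 * Each trie section comes out roughly as below (numbers illustrative):
 *
 *	Aver depth:     2.15
 *	Max depth:      5
 *	Leaves:         13
 *	Internal nodes: 7
 *	  1: 5  2: 1  4: 1
 *	Pointers: 30
 * Null ptrs: 11
 * Total size: 1  kB
 */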
2100static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2101{
2102	unsigned i, max, pointers, bytes, avdepth;
2103
2104	if (stat->leaves)
2105		avdepth = stat->totdepth*100 / stat->leaves;
2106	else
2107		avdepth = 0;
2108
2109	seq_printf(seq, "\tAver depth:     %d.%02d\n", avdepth / 100, avdepth % 100);
2110	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
2111
2112	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
2113
2114	bytes = sizeof(struct leaf) * stat->leaves;
2115	seq_printf(seq, "\tInternal nodes: %d\n\t", stat->tnodes);
2116	bytes += sizeof(struct tnode) * stat->tnodes;
2117
2118	max = MAX_STAT_DEPTH;
2119	while (max > 0 && stat->nodesizes[max-1] == 0)
2120		max--;
2121
2122	pointers = 0;
	/*
	 * Entries at index >= max are zero by construction above; stopping at
	 * max (rather than <= max) also avoids reading one slot past the end
	 * of nodesizes[] when max == MAX_STAT_DEPTH.
	 */
2123	for (i = 1; i < max; i++)
2124		if (stat->nodesizes[i] != 0) {
2125			seq_printf(seq, "  %d: %d",  i, stat->nodesizes[i]);
2126			pointers += (1<<i) * stat->nodesizes[i];
2127		}
2128	seq_putc(seq, '\n');
2129	seq_printf(seq, "\tPointers: %d\n", pointers);
2130
2131	bytes += sizeof(struct node *) * pointers;
2132	seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers);
2133	seq_printf(seq, "Total size: %d  kB\n", (bytes + 1023) / 1024);
2134
2135#ifdef CONFIG_IP_FIB_TRIE_STATS
	/*
	 * NOTE: this block reads a trie 't' that is never passed to this
	 * function, so it will not compile if CONFIG_IP_FIB_TRIE_STATS is
	 * re-enabled without also plumbing the per-trie stats through.
	 */
2136	seq_printf(seq, "Counters:\n---------\n");
2137	seq_printf(seq, "gets = %d\n", t->stats.gets);
2138	seq_printf(seq, "backtracks = %d\n", t->stats.backtrack);
2139	seq_printf(seq, "semantic match passed = %d\n", t->stats.semantic_match_passed);
2140	seq_printf(seq, "semantic match miss = %d\n", t->stats.semantic_match_miss);
2141	seq_printf(seq, "null node hit = %d\n", t->stats.null_node_hit);
2142	seq_printf(seq, "skipped node resize = %d\n", t->stats.resize_node_skipped);
2143#ifdef CLEAR_STATS
2144	memset(&(t->stats), 0, sizeof(t->stats));
2145#endif
2146#endif /* CONFIG_IP_FIB_TRIE_STATS */
2147}
2148
2149static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2150{
2151	struct trie_stat *stat;
2152
2153	stat = kmalloc(sizeof(*stat), GFP_KERNEL);
2154	if (!stat)
2155		return -ENOMEM;
2156
2157	seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
2158		   sizeof(struct leaf), sizeof(struct tnode));
2159
2160	if (trie_local) {
2161		seq_printf(seq, "Local:\n");
2162		trie_collect_stats(trie_local, stat);
2163		trie_show_stats(seq, stat);
2164	}
2165
2166	if (trie_main) {
2167		seq_printf(seq, "Main:\n");
2168		trie_collect_stats(trie_main, stat);
2169		trie_show_stats(seq, stat);
2170	}
2171	kfree(stat);
2172
2173	return 0;
2174}
2175
2176static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2177{
2178	return single_open(file, fib_triestat_seq_show, NULL);
2179}
2180
2181static const struct file_operations fib_triestat_fops = {
2182	.owner	= THIS_MODULE,
2183	.open	= fib_triestat_seq_open,
2184	.read	= seq_read,
2185	.llseek	= seq_lseek,
2186	.release = single_release,
2187};
2188
2189static struct node *fib_trie_get_idx(struct fib_trie_iter *iter,
2190				      loff_t pos)
2191{
2192	loff_t idx = 0;
2193	struct node *n;
2194
2195	for (n = fib_trie_get_first(iter, trie_local);
2196	     n; ++idx, n = fib_trie_get_next(iter)) {
2197		if (pos == idx)
2198			return n;
2199	}
2200
2201	for (n = fib_trie_get_first(iter, trie_main);
2202	     n; ++idx, n = fib_trie_get_next(iter)) {
2203		if (pos == idx)
2204			return n;
2205	}
2206	return NULL;
2207}
2208
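/*
 * seq_file position 0 is the SEQ_START_TOKEN header line; trie nodes are
 * numbered from position 1, hence the *pos - 1 below.  The whole walk runs
 * under rcu_read_lock(), dropped again in fib_trie_seq_stop().
 */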
2209static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2210{
2211	rcu_read_lock();
2212	if (*pos == 0)
2213		return SEQ_START_TOKEN;
2214	return fib_trie_get_idx(seq->private, *pos - 1);
2215}
2216
2217static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2218{
2219	struct fib_trie_iter *iter = seq->private;
2220	void *l = v;
2221
2222	++*pos;
2223	if (v == SEQ_START_TOKEN)
2224		return fib_trie_get_idx(iter, 0);
2225
2226	v = fib_trie_get_next(iter);
2227	BUG_ON(v == l);
2228	if (v)
2229		return v;
2230
2231	/* continue scan in next trie */
2232	if (iter->trie == trie_local)
2233		return fib_trie_get_first(iter, trie_main);
2234
2235	return NULL;
2236}
2237
2238static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2239{
2240	rcu_read_unlock();
2241}
2242
2243static void seq_indent(struct seq_file *seq, int n)
2244{
2245	while (n-- > 0)
		seq_puts(seq, "   ");
2246}
2247
2248static inline const char *rtn_scope(enum rt_scope_t s)
2249{
2250	static char buf[32];
2251
2252	switch (s) {
2253	case RT_SCOPE_UNIVERSE: return "universe";
2254	case RT_SCOPE_SITE:	return "site";
2255	case RT_SCOPE_LINK:	return "link";
2256	case RT_SCOPE_HOST:	return "host";
2257	case RT_SCOPE_NOWHERE:	return "nowhere";
2258	default:
2259		snprintf(buf, sizeof(buf), "scope=%d", s);
2260		return buf;
2261	}
2262}
2263
2264static const char *rtn_type_names[__RTN_MAX] = {
2265	[RTN_UNSPEC] = "UNSPEC",
2266	[RTN_UNICAST] = "UNICAST",
2267	[RTN_LOCAL] = "LOCAL",
2268	[RTN_BROADCAST] = "BROADCAST",
2269	[RTN_ANYCAST] = "ANYCAST",
2270	[RTN_MULTICAST] = "MULTICAST",
2271	[RTN_BLACKHOLE] = "BLACKHOLE",
2272	[RTN_UNREACHABLE] = "UNREACHABLE",
2273	[RTN_PROHIBIT] = "PROHIBIT",
2274	[RTN_THROW] = "THROW",
2275	[RTN_NAT] = "NAT",
2276	[RTN_XRESOLVE] = "XRESOLVE",
2277};
2278
2279static inline const char *rtn_type(unsigned t)
2280{
2281	static char buf[32];
2282
2283	if (t < __RTN_MAX && rtn_type_names[t])
2284		return rtn_type_names[t];
2285	snprintf(buf, sizeof(buf), "type %d", t);
2286	return buf;
2287}
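
/*
 * For values outside the known sets, rtn_scope() and rtn_type() above fall
 * back to a shared static buffer; concurrent readers could in principle
 * clobber each other's text there, but the common named cases never use it.
 */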
2288
2289/* Pretty print the trie */
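/*
 * Internal nodes come out as "+--" lines carrying the masked prefix, pos,
 * bits, full_children and empty_children; leaves come out as "|--" lines
 * followed by one "/plen scope TYPE" line per alias.  An illustrative
 * excerpt (addresses made up):
 *
 *	<main>:
 *	  +-- 0.0.0.0/0 2 0 2
 *	     |-- 10.0.0.0
 *	        /8 universe UNICAST
 *	     |-- 192.168.1.0
 *	        /24 link UNICAST
 */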
2290static int fib_trie_seq_show(struct seq_file *seq, void *v)
2291{
2292	const struct fib_trie_iter *iter = seq->private;
2293	struct node *n = v;
2294
2295	if (v == SEQ_START_TOKEN)
2296		return 0;
2297
2298	if (!NODE_PARENT(n)) {
2299		if (iter->trie == trie_local)
2300			seq_puts(seq, "<local>:\n");
2301		else
2302			seq_puts(seq, "<main>:\n");
2303	}
2304
2305	if (IS_TNODE(n)) {
2306		struct tnode *tn = (struct tnode *) n;
2307		__be32 prf = htonl(MASK_PFX(tn->key, tn->pos));
2308
2309		seq_indent(seq, iter->depth-1);
2310		seq_printf(seq, "  +-- %d.%d.%d.%d/%d %d %d %d\n",
2311			   NIPQUAD(prf), tn->pos, tn->bits, tn->full_children,
2312			   tn->empty_children);
2313
2314	} else {
2315		struct leaf *l = (struct leaf *) n;
2316		int i;
2317		__be32 val = htonl(l->key);
2318
2319		seq_indent(seq, iter->depth);
2320		seq_printf(seq, "  |-- %d.%d.%d.%d\n", NIPQUAD(val));
2321		for (i = 32; i >= 0; i--) {
2322			struct leaf_info *li = find_leaf_info(l, i);
2323			if (li) {
2324				struct fib_alias *fa;
2325				list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2326					seq_indent(seq, iter->depth+1);
2327					seq_printf(seq, "  /%d %s %s", i,
2328						   rtn_scope(fa->fa_scope),
2329						   rtn_type(fa->fa_type));
2330					if (fa->fa_tos)
2331					seq_printf(seq, " tos=%d",
2332							   fa->fa_tos);
2333					seq_putc(seq, '\n');
2334				}
2335			}
2336		}
2337	}
2338
2339	return 0;
2340}
2341
2342static struct seq_operations fib_trie_seq_ops = {
2343	.start  = fib_trie_seq_start,
2344	.next   = fib_trie_seq_next,
2345	.stop   = fib_trie_seq_stop,
2346	.show   = fib_trie_seq_show,
2347};
2348
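/*
 * Every open() gets its own iterator in seq->private; the matching kfree()
 * is done by seq_release_private() in the file_operations below.
 */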
2349static int fib_trie_seq_open(struct inode *inode, struct file *file)
2350{
2351	struct seq_file *seq;
2352	int rc = -ENOMEM;
2353	struct fib_trie_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
2354
2355	if (!s)
2356		goto out;
2357
2358	rc = seq_open(file, &fib_trie_seq_ops);
2359	if (rc)
2360		goto out_kfree;
2361
2362	seq	     = file->private_data;
2363	seq->private = s;
2364	memset(s, 0, sizeof(*s));
2365out:
2366	return rc;
2367out_kfree:
2368	kfree(s);
2369	goto out;
2370}
2371
2372static const struct file_operations fib_trie_fops = {
2373	.owner  = THIS_MODULE,
2374	.open   = fib_trie_seq_open,
2375	.read   = seq_read,
2376	.llseek = seq_lseek,
2377	.release = seq_release_private,
2378};
2379
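/*
 * Translate a route into the classic RTF_* bits shown in the Flags column
 * of /proc/net/route: RTF_UP always, RTF_GATEWAY when there is a nexthop
 * gateway, RTF_HOST for /32 entries and RTF_REJECT for the unreachable and
 * prohibit route types.
 */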
2380static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2381{
2382	static unsigned type2flags[RTN_MAX + 1] = {
2383		[RTN_UNREACHABLE] = RTF_REJECT, [RTN_PROHIBIT] = RTF_REJECT,
2384	};
2385	unsigned flags = type2flags[type];
2386
2387	if (fi && fi->fib_nh->nh_gw)
2388		flags |= RTF_GATEWAY;
2389	if (mask == htonl(0xFFFFFFFF))
2390		flags |= RTF_HOST;
2391	flags |= RTF_UP;
2392	return flags;
2393}
2394
2395/*
2396 *	This outputs /proc/net/route.
2397 *	The format of the file is not supposed to be changed
2398 * 	and needs to be same as fib_hash output to avoid breaking
2399 *	legacy utilities
2400 */
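/*
 * A typical line (illustrative values; a default route via 192.168.1.1 on
 * eth0) looks like:
 *
 *	eth0	00000000	0101A8C0	0003	0	0	0	00000000	0	0	0
 *
 * Destination, gateway and mask are printed as raw hex dumps of the stored
 * __be32 values, so they read byte-swapped on little-endian machines; that
 * quirk is part of the legacy format being preserved.
 */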
2401static int fib_route_seq_show(struct seq_file *seq, void *v)
2402{
2403	const struct fib_trie_iter *iter = seq->private;
2404	struct leaf *l = v;
2405	int i;
2406	char bf[128];
2407
2408	if (v == SEQ_START_TOKEN) {
2409		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2410			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2411			   "\tWindow\tIRTT");
2412		return 0;
2413	}
2414
2415	if (iter->trie == trie_local)
2416		return 0;
2417	if (IS_TNODE(l))
2418		return 0;
2419
2420	for (i = 32; i >= 0; i--) {
2421		struct leaf_info *li = find_leaf_info(l, i);
2422		struct fib_alias *fa;
2423		__be32 mask, prefix;
2424
2425		if (!li)
2426			continue;
2427
2428		mask = inet_make_mask(li->plen);
2429		prefix = htonl(l->key);
2430
2431		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2432			const struct fib_info *fi = fa->fa_info;
2433			unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
2434
2435			if (fa->fa_type == RTN_BROADCAST
2436			    || fa->fa_type == RTN_MULTICAST)
2437				continue;
2438
2439			if (fi)
2440				snprintf(bf, sizeof(bf),
2441					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2442					 fi->fib_dev ? fi->fib_dev->name : "*",
2443					 prefix,
2444					 fi->fib_nh->nh_gw, flags, 0, 0,
2445					 fi->fib_priority,
2446					 mask,
2447					 (fi->fib_advmss ? fi->fib_advmss + 40 : 0),
2448					 fi->fib_window,
2449					 fi->fib_rtt >> 3);
2450			else
2451				snprintf(bf, sizeof(bf),
2452					 "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2453					 prefix, 0, flags, 0, 0, 0,
2454					 mask, 0, 0, 0);
2455
2456			seq_printf(seq, "%-127s\n", bf);
2457		}
2458	}
2459
2460	return 0;
2461}
2462
2463static struct seq_operations fib_route_seq_ops = {
2464	.start  = fib_trie_seq_start,
2465	.next   = fib_trie_seq_next,
2466	.stop   = fib_trie_seq_stop,
2467	.show   = fib_route_seq_show,
2468};
2469
2470static int fib_route_seq_open(struct inode *inode, struct file *file)
2471{
2472	struct seq_file *seq;
2473	int rc = -ENOMEM;
2474	struct fib_trie_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
2475
2476	if (!s)
2477		goto out;
2478
2479	rc = seq_open(file, &fib_route_seq_ops);
2480	if (rc)
2481		goto out_kfree;
2482
2483	seq	     = file->private_data;
2484	seq->private = s;
2485	memset(s, 0, sizeof(*s));
2486out:
2487	return rc;
2488out_kfree:
2489	kfree(s);
2490	goto out;
2491}
2492
2493static const struct file_operations fib_route_fops = {
2494	.owner  = THIS_MODULE,
2495	.open   = fib_route_seq_open,
2496	.read   = seq_read,
2497	.llseek = seq_lseek,
2498	.release = seq_release_private,
2499};
2500
2501int __init fib_proc_init(void)
2502{
2503	if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_fops))
2504		goto out1;
2505
2506	if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_fops))
2507		goto out2;
2508
2509	if (!proc_net_fops_create("route", S_IRUGO, &fib_route_fops))
2510		goto out3;
2511
2512	return 0;
2513
2514out3:
2515	proc_net_remove("fib_triestat");
2516out2:
2517	proc_net_remove("fib_trie");
2518out1:
2519	return -ENOMEM;
2520}
2521
2522void __init fib_proc_exit(void)
2523{
2524	proc_net_remove("fib_trie");
2525	proc_net_remove("fib_triestat");
2526	proc_net_remove("route");
2527}
2528
2529#endif /* CONFIG_PROC_FS */
2530