fib_trie.c revision cf7732e4cc14b56d593ff53352673e1fd5e3ba52
1/*
2 *   This program is free software; you can redistribute it and/or
3 *   modify it under the terms of the GNU General Public License
4 *   as published by the Free Software Foundation; either version
5 *   2 of the License, or (at your option) any later version.
6 *
7 *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 *     & Swedish University of Agricultural Sciences.
9 *
10 *   Jens Laas <jens.laas@data.slu.se> Swedish University of
11 *     Agricultural Sciences.
12 *
13 *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
14 *
15 * This work is based on the LPC-trie which is originally descibed in:
16 *
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
25 * Version:	$Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
26 *
27 *
28 * Code from fib_hash has been reused which includes the following header:
29 *
30 *
31 * INET		An implementation of the TCP/IP protocol suite for the LINUX
32 *		operating system.  INET is implemented using the  BSD Socket
33 *		interface as the means of communication with the user level.
34 *
35 *		IPv4 FIB: lookup engine and maintenance routines.
36 *
37 *
38 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
39 *
40 *		This program is free software; you can redistribute it and/or
41 *		modify it under the terms of the GNU General Public License
42 *		as published by the Free Software Foundation; either version
43 *		2 of the License, or (at your option) any later version.
44 *
45 * Substantial contributions to this work comes from:
46 *
47 *		David S. Miller, <davem@davemloft.net>
48 *		Stephen Hemminger <shemminger@osdl.org>
49 *		Paul E. McKenney <paulmck@us.ibm.com>
50 *		Patrick McHardy <kaber@trash.net>
51 */
52
53#define VERSION "0.408"
54
55#include <asm/uaccess.h>
56#include <asm/system.h>
57#include <asm/bitops.h>
58#include <linux/types.h>
59#include <linux/kernel.h>
60#include <linux/mm.h>
61#include <linux/string.h>
62#include <linux/socket.h>
63#include <linux/sockios.h>
64#include <linux/errno.h>
65#include <linux/in.h>
66#include <linux/inet.h>
67#include <linux/inetdevice.h>
68#include <linux/netdevice.h>
69#include <linux/if_arp.h>
70#include <linux/proc_fs.h>
71#include <linux/rcupdate.h>
72#include <linux/skbuff.h>
73#include <linux/netlink.h>
74#include <linux/init.h>
75#include <linux/list.h>
76#include <net/net_namespace.h>
77#include <net/ip.h>
78#include <net/protocol.h>
79#include <net/route.h>
80#include <net/tcp.h>
81#include <net/sock.h>
82#include <net/ip_fib.h>
83#include "fib_lookup.h"
84
85#undef CONFIG_IP_FIB_TRIE_STATS
86#define MAX_STAT_DEPTH 32
87
88#define KEYLENGTH (8*sizeof(t_key))
89
90typedef unsigned int t_key;
91
92#define T_TNODE 0
93#define T_LEAF  1
94#define NODE_TYPE_MASK	0x1UL
95#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
96
97#define IS_TNODE(n) (!(n->parent & T_LEAF))
98#define IS_LEAF(n) (n->parent & T_LEAF)
99
100struct node {
101	t_key key;
102	unsigned long parent;
103};
104
105struct leaf {
106	t_key key;
107	unsigned long parent;
108	struct hlist_head list;
109	struct rcu_head rcu;
110};
111
112struct leaf_info {
113	struct hlist_node hlist;
114	struct rcu_head rcu;
115	int plen;
116	struct list_head falh;
117};
118
119struct tnode {
120	t_key key;
121	unsigned long parent;
122	unsigned short pos:5;		/* 2log(KEYLENGTH) bits needed */
123	unsigned short bits:5;		/* 2log(KEYLENGTH) bits needed */
124	unsigned short full_children;	/* KEYLENGTH bits needed */
125	unsigned short empty_children;	/* KEYLENGTH bits needed */
126	struct rcu_head rcu;
127	struct node *child[0];
128};
129
130#ifdef CONFIG_IP_FIB_TRIE_STATS
131struct trie_use_stats {
132	unsigned int gets;
133	unsigned int backtrack;
134	unsigned int semantic_match_passed;
135	unsigned int semantic_match_miss;
136	unsigned int null_node_hit;
137	unsigned int resize_node_skipped;
138};
139#endif
140
141struct trie_stat {
142	unsigned int totdepth;
143	unsigned int maxdepth;
144	unsigned int tnodes;
145	unsigned int leaves;
146	unsigned int nullpointers;
147	unsigned int nodesizes[MAX_STAT_DEPTH];
148};
149
150struct trie {
151	struct node *trie;
152#ifdef CONFIG_IP_FIB_TRIE_STATS
153	struct trie_use_stats stats;
154#endif
155	int size;
156	unsigned int revision;
157};
158
159static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
160static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
161static struct node *resize(struct trie *t, struct tnode *tn);
162static struct tnode *inflate(struct trie *t, struct tnode *tn);
163static struct tnode *halve(struct trie *t, struct tnode *tn);
164static void tnode_free(struct tnode *tn);
165
166static struct kmem_cache *fn_alias_kmem __read_mostly;
167static struct trie *trie_local = NULL, *trie_main = NULL;
168
169static inline struct tnode *node_parent(struct node *node)
170{
171	struct tnode *ret;
172
173	ret = (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
174	return rcu_dereference(ret);
175}
176
177static inline void node_set_parent(struct node *node, struct tnode *ptr)
178{
179	rcu_assign_pointer(node->parent,
180			   (unsigned long)ptr | NODE_TYPE(node));
181}
182
183/* rcu_read_lock needs to be hold by caller from readside */
184
185static inline struct node *tnode_get_child(struct tnode *tn, int i)
186{
187	BUG_ON(i >= 1 << tn->bits);
188
189	return rcu_dereference(tn->child[i]);
190}
191
192static inline int tnode_child_length(const struct tnode *tn)
193{
194	return 1 << tn->bits;
195}
196
197static inline t_key mask_pfx(t_key k, unsigned short l)
198{
199	return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
200}
201
202static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
203{
204	if (offset < KEYLENGTH)
205		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
206	else
207		return 0;
208}
209
210static inline int tkey_equals(t_key a, t_key b)
211{
212	return a == b;
213}
214
215static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
216{
217	if (bits == 0 || offset >= KEYLENGTH)
218		return 1;
219	bits = bits > KEYLENGTH ? KEYLENGTH : bits;
220	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
221}
222
223static inline int tkey_mismatch(t_key a, int offset, t_key b)
224{
225	t_key diff = a ^ b;
226	int i = offset;
227
228	if (!diff)
229		return 0;
230	while ((diff << i) >> (KEYLENGTH-1) == 0)
231		i++;
232	return i;
233}
234
235/*
236  To understand this stuff, an understanding of keys and all their bits is
237  necessary. Every node in the trie has a key associated with it, but not
238  all of the bits in that key are significant.
239
240  Consider a node 'n' and its parent 'tp'.
241
242  If n is a leaf, every bit in its key is significant. Its presence is
243  necessitated by path compression, since during a tree traversal (when
244  searching for a leaf - unless we are doing an insertion) we will completely
245  ignore all skipped bits we encounter. Thus we need to verify, at the end of
246  a potentially successful search, that we have indeed been walking the
247  correct key path.
248
249  Note that we can never "miss" the correct key in the tree if present by
250  following the wrong path. Path compression ensures that segments of the key
251  that are the same for all keys with a given prefix are skipped, but the
252  skipped part *is* identical for each node in the subtrie below the skipped
253  bit! trie_insert() in this implementation takes care of that - note the
254  call to tkey_sub_equals() in trie_insert().
255
256  if n is an internal node - a 'tnode' here, the various parts of its key
257  have many different meanings.
258
259  Example:
260  _________________________________________________________________
261  | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
262  -----------------------------------------------------------------
263    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
264
265  _________________________________________________________________
266  | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
267  -----------------------------------------------------------------
268   16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
269
270  tp->pos = 7
271  tp->bits = 3
272  n->pos = 15
273  n->bits = 4
274
275  First, let's just ignore the bits that come before the parent tp, that is
276  the bits from 0 to (tp->pos-1). They are *known* but at this point we do
277  not use them for anything.
278
279  The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
280  index into the parent's child array. That is, they will be used to find
281  'n' among tp's children.
282
283  The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
284  for the node n.
285
286  All the bits we have seen so far are significant to the node n. The rest
287  of the bits are really not needed or indeed known in n->key.
288
289  The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
290  n's child array, and will of course be different for each child.
291
292
293  The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
294  at this point.
295
296*/
297
298static inline void check_tnode(const struct tnode *tn)
299{
300	WARN_ON(tn && tn->pos+tn->bits > 32);
301}
302
303static int halve_threshold = 25;
304static int inflate_threshold = 50;
305static int halve_threshold_root = 8;
306static int inflate_threshold_root = 15;
307
308
309static void __alias_free_mem(struct rcu_head *head)
310{
311	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
312	kmem_cache_free(fn_alias_kmem, fa);
313}
314
315static inline void alias_free_mem_rcu(struct fib_alias *fa)
316{
317	call_rcu(&fa->rcu, __alias_free_mem);
318}
319
320static void __leaf_free_rcu(struct rcu_head *head)
321{
322	kfree(container_of(head, struct leaf, rcu));
323}
324
325static void __leaf_info_free_rcu(struct rcu_head *head)
326{
327	kfree(container_of(head, struct leaf_info, rcu));
328}
329
330static inline void free_leaf_info(struct leaf_info *leaf)
331{
332	call_rcu(&leaf->rcu, __leaf_info_free_rcu);
333}
334
335static struct tnode *tnode_alloc(unsigned int size)
336{
337	struct page *pages;
338
339	if (size <= PAGE_SIZE)
340		return kcalloc(size, 1, GFP_KERNEL);
341
342	pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size));
343	if (!pages)
344		return NULL;
345
346	return page_address(pages);
347}
348
349static void __tnode_free_rcu(struct rcu_head *head)
350{
351	struct tnode *tn = container_of(head, struct tnode, rcu);
352	unsigned int size = sizeof(struct tnode) +
353		(1 << tn->bits) * sizeof(struct node *);
354
355	if (size <= PAGE_SIZE)
356		kfree(tn);
357	else
358		free_pages((unsigned long)tn, get_order(size));
359}
360
361static inline void tnode_free(struct tnode *tn)
362{
363	if (IS_LEAF(tn)) {
364		struct leaf *l = (struct leaf *) tn;
365		call_rcu_bh(&l->rcu, __leaf_free_rcu);
366	} else
367		call_rcu(&tn->rcu, __tnode_free_rcu);
368}
369
370static struct leaf *leaf_new(void)
371{
372	struct leaf *l = kmalloc(sizeof(struct leaf),  GFP_KERNEL);
373	if (l) {
374		l->parent = T_LEAF;
375		INIT_HLIST_HEAD(&l->list);
376	}
377	return l;
378}
379
380static struct leaf_info *leaf_info_new(int plen)
381{
382	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
383	if (li) {
384		li->plen = plen;
385		INIT_LIST_HEAD(&li->falh);
386	}
387	return li;
388}
389
390static struct tnode* tnode_new(t_key key, int pos, int bits)
391{
392	int nchildren = 1<<bits;
393	int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
394	struct tnode *tn = tnode_alloc(sz);
395
396	if (tn) {
397		memset(tn, 0, sz);
398		tn->parent = T_TNODE;
399		tn->pos = pos;
400		tn->bits = bits;
401		tn->key = key;
402		tn->full_children = 0;
403		tn->empty_children = 1<<bits;
404	}
405
406	pr_debug("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
407		 (unsigned int) (sizeof(struct node) * 1<<bits));
408	return tn;
409}
410
411/*
412 * Check whether a tnode 'n' is "full", i.e. it is an internal node
413 * and no bits are skipped. See discussion in dyntree paper p. 6
414 */
415
416static inline int tnode_full(const struct tnode *tn, const struct node *n)
417{
418	if (n == NULL || IS_LEAF(n))
419		return 0;
420
421	return ((struct tnode *) n)->pos == tn->pos + tn->bits;
422}
423
424static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
425{
426	tnode_put_child_reorg(tn, i, n, -1);
427}
428
429 /*
430  * Add a child at position i overwriting the old value.
431  * Update the value of full_children and empty_children.
432  */
433
434static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
435{
436	struct node *chi = tn->child[i];
437	int isfull;
438
439	BUG_ON(i >= 1<<tn->bits);
440
441
442	/* update emptyChildren */
443	if (n == NULL && chi != NULL)
444		tn->empty_children++;
445	else if (n != NULL && chi == NULL)
446		tn->empty_children--;
447
448	/* update fullChildren */
449	if (wasfull == -1)
450		wasfull = tnode_full(tn, chi);
451
452	isfull = tnode_full(tn, n);
453	if (wasfull && !isfull)
454		tn->full_children--;
455	else if (!wasfull && isfull)
456		tn->full_children++;
457
458	if (n)
459		node_set_parent(n, tn);
460
461	rcu_assign_pointer(tn->child[i], n);
462}
463
464static struct node *resize(struct trie *t, struct tnode *tn)
465{
466	int i;
467	int err = 0;
468	struct tnode *old_tn;
469	int inflate_threshold_use;
470	int halve_threshold_use;
471	int max_resize;
472
473	if (!tn)
474		return NULL;
475
476	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
477		 tn, inflate_threshold, halve_threshold);
478
479	/* No children */
480	if (tn->empty_children == tnode_child_length(tn)) {
481		tnode_free(tn);
482		return NULL;
483	}
484	/* One child */
485	if (tn->empty_children == tnode_child_length(tn) - 1)
486		for (i = 0; i < tnode_child_length(tn); i++) {
487			struct node *n;
488
489			n = tn->child[i];
490			if (!n)
491				continue;
492
493			/* compress one level */
494			node_set_parent(n, NULL);
495			tnode_free(tn);
496			return n;
497		}
498	/*
499	 * Double as long as the resulting node has a number of
500	 * nonempty nodes that are above the threshold.
501	 */
502
503	/*
504	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
505	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
506	 * Telecommunications, page 6:
507	 * "A node is doubled if the ratio of non-empty children to all
508	 * children in the *doubled* node is at least 'high'."
509	 *
510	 * 'high' in this instance is the variable 'inflate_threshold'. It
511	 * is expressed as a percentage, so we multiply it with
512	 * tnode_child_length() and instead of multiplying by 2 (since the
513	 * child array will be doubled by inflate()) and multiplying
514	 * the left-hand side by 100 (to handle the percentage thing) we
515	 * multiply the left-hand side by 50.
516	 *
517	 * The left-hand side may look a bit weird: tnode_child_length(tn)
518	 * - tn->empty_children is of course the number of non-null children
519	 * in the current node. tn->full_children is the number of "full"
520	 * children, that is non-null tnodes with a skip value of 0.
521	 * All of those will be doubled in the resulting inflated tnode, so
522	 * we just count them one extra time here.
523	 *
524	 * A clearer way to write this would be:
525	 *
526	 * to_be_doubled = tn->full_children;
527	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
528	 *     tn->full_children;
529	 *
530	 * new_child_length = tnode_child_length(tn) * 2;
531	 *
532	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
533	 *      new_child_length;
534	 * if (new_fill_factor >= inflate_threshold)
535	 *
536	 * ...and so on, tho it would mess up the while () loop.
537	 *
538	 * anyway,
539	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
540	 *      inflate_threshold
541	 *
542	 * avoid a division:
543	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
544	 *      inflate_threshold * new_child_length
545	 *
546	 * expand not_to_be_doubled and to_be_doubled, and shorten:
547	 * 100 * (tnode_child_length(tn) - tn->empty_children +
548	 *    tn->full_children) >= inflate_threshold * new_child_length
549	 *
550	 * expand new_child_length:
551	 * 100 * (tnode_child_length(tn) - tn->empty_children +
552	 *    tn->full_children) >=
553	 *      inflate_threshold * tnode_child_length(tn) * 2
554	 *
555	 * shorten again:
556	 * 50 * (tn->full_children + tnode_child_length(tn) -
557	 *    tn->empty_children) >= inflate_threshold *
558	 *    tnode_child_length(tn)
559	 *
560	 */
561
562	check_tnode(tn);
563
564	/* Keep root node larger  */
565
566	if (!tn->parent)
567		inflate_threshold_use = inflate_threshold_root;
568	else
569		inflate_threshold_use = inflate_threshold;
570
571	err = 0;
572	max_resize = 10;
573	while ((tn->full_children > 0 &&  max_resize-- &&
574	       50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
575				inflate_threshold_use * tnode_child_length(tn))) {
576
577		old_tn = tn;
578		tn = inflate(t, tn);
579		if (IS_ERR(tn)) {
580			tn = old_tn;
581#ifdef CONFIG_IP_FIB_TRIE_STATS
582			t->stats.resize_node_skipped++;
583#endif
584			break;
585		}
586	}
587
588	if (max_resize < 0) {
589		if (!tn->parent)
590			printk(KERN_WARNING "Fix inflate_threshold_root. Now=%d size=%d bits\n",
591			       inflate_threshold_root, tn->bits);
592		else
593			printk(KERN_WARNING "Fix inflate_threshold. Now=%d size=%d bits\n",
594			       inflate_threshold, tn->bits);
595	}
596
597	check_tnode(tn);
598
599	/*
600	 * Halve as long as the number of empty children in this
601	 * node is above threshold.
602	 */
603
604
605	/* Keep root node larger  */
606
607	if (!tn->parent)
608		halve_threshold_use = halve_threshold_root;
609	else
610		halve_threshold_use = halve_threshold;
611
612	err = 0;
613	max_resize = 10;
614	while (tn->bits > 1 &&  max_resize-- &&
615	       100 * (tnode_child_length(tn) - tn->empty_children) <
616	       halve_threshold_use * tnode_child_length(tn)) {
617
618		old_tn = tn;
619		tn = halve(t, tn);
620		if (IS_ERR(tn)) {
621			tn = old_tn;
622#ifdef CONFIG_IP_FIB_TRIE_STATS
623			t->stats.resize_node_skipped++;
624#endif
625			break;
626		}
627	}
628
629	if (max_resize < 0) {
630		if (!tn->parent)
631			printk(KERN_WARNING "Fix halve_threshold_root. Now=%d size=%d bits\n",
632			       halve_threshold_root, tn->bits);
633		else
634			printk(KERN_WARNING "Fix halve_threshold. Now=%d size=%d bits\n",
635			       halve_threshold, tn->bits);
636	}
637
638	/* Only one child remains */
639	if (tn->empty_children == tnode_child_length(tn) - 1)
640		for (i = 0; i < tnode_child_length(tn); i++) {
641			struct node *n;
642
643			n = tn->child[i];
644			if (!n)
645				continue;
646
647			/* compress one level */
648
649			node_set_parent(n, NULL);
650			tnode_free(tn);
651			return n;
652		}
653
654	return (struct node *) tn;
655}
656
657static struct tnode *inflate(struct trie *t, struct tnode *tn)
658{
659	struct tnode *inode;
660	struct tnode *oldtnode = tn;
661	int olen = tnode_child_length(tn);
662	int i;
663
664	pr_debug("In inflate\n");
665
666	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
667
668	if (!tn)
669		return ERR_PTR(-ENOMEM);
670
671	/*
672	 * Preallocate and store tnodes before the actual work so we
673	 * don't get into an inconsistent state if memory allocation
674	 * fails. In case of failure we return the oldnode and  inflate
675	 * of tnode is ignored.
676	 */
677
678	for (i = 0; i < olen; i++) {
679		struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
680
681		if (inode &&
682		    IS_TNODE(inode) &&
683		    inode->pos == oldtnode->pos + oldtnode->bits &&
684		    inode->bits > 1) {
685			struct tnode *left, *right;
686			t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
687
688			left = tnode_new(inode->key&(~m), inode->pos + 1,
689					 inode->bits - 1);
690			if (!left)
691				goto nomem;
692
693			right = tnode_new(inode->key|m, inode->pos + 1,
694					  inode->bits - 1);
695
696			if (!right) {
697				tnode_free(left);
698				goto nomem;
699			}
700
701			put_child(t, tn, 2*i, (struct node *) left);
702			put_child(t, tn, 2*i+1, (struct node *) right);
703		}
704	}
705
706	for (i = 0; i < olen; i++) {
707		struct node *node = tnode_get_child(oldtnode, i);
708		struct tnode *left, *right;
709		int size, j;
710
711		/* An empty child */
712		if (node == NULL)
713			continue;
714
715		/* A leaf or an internal node with skipped bits */
716
717		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
718		   tn->pos + tn->bits - 1) {
719			if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
720					     1) == 0)
721				put_child(t, tn, 2*i, node);
722			else
723				put_child(t, tn, 2*i+1, node);
724			continue;
725		}
726
727		/* An internal node with two children */
728		inode = (struct tnode *) node;
729
730		if (inode->bits == 1) {
731			put_child(t, tn, 2*i, inode->child[0]);
732			put_child(t, tn, 2*i+1, inode->child[1]);
733
734			tnode_free(inode);
735			continue;
736		}
737
738		/* An internal node with more than two children */
739
740		/* We will replace this node 'inode' with two new
741		 * ones, 'left' and 'right', each with half of the
742		 * original children. The two new nodes will have
743		 * a position one bit further down the key and this
744		 * means that the "significant" part of their keys
745		 * (see the discussion near the top of this file)
746		 * will differ by one bit, which will be "0" in
747		 * left's key and "1" in right's key. Since we are
748		 * moving the key position by one step, the bit that
749		 * we are moving away from - the bit at position
750		 * (inode->pos) - is the one that will differ between
751		 * left and right. So... we synthesize that bit in the
752		 * two  new keys.
753		 * The mask 'm' below will be a single "one" bit at
754		 * the position (inode->pos)
755		 */
756
757		/* Use the old key, but set the new significant
758		 *   bit to zero.
759		 */
760
761		left = (struct tnode *) tnode_get_child(tn, 2*i);
762		put_child(t, tn, 2*i, NULL);
763
764		BUG_ON(!left);
765
766		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
767		put_child(t, tn, 2*i+1, NULL);
768
769		BUG_ON(!right);
770
771		size = tnode_child_length(left);
772		for (j = 0; j < size; j++) {
773			put_child(t, left, j, inode->child[j]);
774			put_child(t, right, j, inode->child[j + size]);
775		}
776		put_child(t, tn, 2*i, resize(t, left));
777		put_child(t, tn, 2*i+1, resize(t, right));
778
779		tnode_free(inode);
780	}
781	tnode_free(oldtnode);
782	return tn;
783nomem:
784	{
785		int size = tnode_child_length(tn);
786		int j;
787
788		for (j = 0; j < size; j++)
789			if (tn->child[j])
790				tnode_free((struct tnode *)tn->child[j]);
791
792		tnode_free(tn);
793
794		return ERR_PTR(-ENOMEM);
795	}
796}
797
798static struct tnode *halve(struct trie *t, struct tnode *tn)
799{
800	struct tnode *oldtnode = tn;
801	struct node *left, *right;
802	int i;
803	int olen = tnode_child_length(tn);
804
805	pr_debug("In halve\n");
806
807	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
808
809	if (!tn)
810		return ERR_PTR(-ENOMEM);
811
812	/*
813	 * Preallocate and store tnodes before the actual work so we
814	 * don't get into an inconsistent state if memory allocation
815	 * fails. In case of failure we return the oldnode and halve
816	 * of tnode is ignored.
817	 */
818
819	for (i = 0; i < olen; i += 2) {
820		left = tnode_get_child(oldtnode, i);
821		right = tnode_get_child(oldtnode, i+1);
822
823		/* Two nonempty children */
824		if (left && right) {
825			struct tnode *newn;
826
827			newn = tnode_new(left->key, tn->pos + tn->bits, 1);
828
829			if (!newn)
830				goto nomem;
831
832			put_child(t, tn, i/2, (struct node *)newn);
833		}
834
835	}
836
837	for (i = 0; i < olen; i += 2) {
838		struct tnode *newBinNode;
839
840		left = tnode_get_child(oldtnode, i);
841		right = tnode_get_child(oldtnode, i+1);
842
843		/* At least one of the children is empty */
844		if (left == NULL) {
845			if (right == NULL)    /* Both are empty */
846				continue;
847			put_child(t, tn, i/2, right);
848			continue;
849		}
850
851		if (right == NULL) {
852			put_child(t, tn, i/2, left);
853			continue;
854		}
855
856		/* Two nonempty children */
857		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
858		put_child(t, tn, i/2, NULL);
859		put_child(t, newBinNode, 0, left);
860		put_child(t, newBinNode, 1, right);
861		put_child(t, tn, i/2, resize(t, newBinNode));
862	}
863	tnode_free(oldtnode);
864	return tn;
865nomem:
866	{
867		int size = tnode_child_length(tn);
868		int j;
869
870		for (j = 0; j < size; j++)
871			if (tn->child[j])
872				tnode_free((struct tnode *)tn->child[j]);
873
874		tnode_free(tn);
875
876		return ERR_PTR(-ENOMEM);
877	}
878}
879
880static void trie_init(struct trie *t)
881{
882	if (!t)
883		return;
884
885	t->size = 0;
886	rcu_assign_pointer(t->trie, NULL);
887	t->revision = 0;
888#ifdef CONFIG_IP_FIB_TRIE_STATS
889	memset(&t->stats, 0, sizeof(struct trie_use_stats));
890#endif
891}
892
893/* readside must use rcu_read_lock currently dump routines
894 via get_fa_head and dump */
895
896static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
897{
898	struct hlist_head *head = &l->list;
899	struct hlist_node *node;
900	struct leaf_info *li;
901
902	hlist_for_each_entry_rcu(li, node, head, hlist)
903		if (li->plen == plen)
904			return li;
905
906	return NULL;
907}
908
909static inline struct list_head * get_fa_head(struct leaf *l, int plen)
910{
911	struct leaf_info *li = find_leaf_info(l, plen);
912
913	if (!li)
914		return NULL;
915
916	return &li->falh;
917}
918
919static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
920{
921	struct leaf_info *li = NULL, *last = NULL;
922	struct hlist_node *node;
923
924	if (hlist_empty(head)) {
925		hlist_add_head_rcu(&new->hlist, head);
926	} else {
927		hlist_for_each_entry(li, node, head, hlist) {
928			if (new->plen > li->plen)
929				break;
930
931			last = li;
932		}
933		if (last)
934			hlist_add_after_rcu(&last->hlist, &new->hlist);
935		else
936			hlist_add_before_rcu(&new->hlist, &li->hlist);
937	}
938}
939
940/* rcu_read_lock needs to be hold by caller from readside */
941
942static struct leaf *
943fib_find_node(struct trie *t, u32 key)
944{
945	int pos;
946	struct tnode *tn;
947	struct node *n;
948
949	pos = 0;
950	n = rcu_dereference(t->trie);
951
952	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
953		tn = (struct tnode *) n;
954
955		check_tnode(tn);
956
957		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
958			pos = tn->pos + tn->bits;
959			n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
960		} else
961			break;
962	}
963	/* Case we have found a leaf. Compare prefixes */
964
965	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
966		return (struct leaf *)n;
967
968	return NULL;
969}
970
971static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
972{
973	int wasfull;
974	t_key cindex, key = tn->key;
975	struct tnode *tp;
976
977	while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
978		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
979		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
980		tn = (struct tnode *) resize (t, (struct tnode *)tn);
981		tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
982
983		tp = node_parent((struct node *) tn);
984		if (!tp)
985			break;
986		tn = tp;
987	}
988
989	/* Handle last (top) tnode */
990	if (IS_TNODE(tn))
991		tn = (struct tnode*) resize(t, (struct tnode *)tn);
992
993	return (struct node*) tn;
994}
995
996/* only used from updater-side */
997
998static  struct list_head *
999fib_insert_node(struct trie *t, int *err, u32 key, int plen)
1000{
1001	int pos, newpos;
1002	struct tnode *tp = NULL, *tn = NULL;
1003	struct node *n;
1004	struct leaf *l;
1005	int missbit;
1006	struct list_head *fa_head = NULL;
1007	struct leaf_info *li;
1008	t_key cindex;
1009
1010	pos = 0;
1011	n = t->trie;
1012
1013	/* If we point to NULL, stop. Either the tree is empty and we should
1014	 * just put a new leaf in if, or we have reached an empty child slot,
1015	 * and we should just put our new leaf in that.
1016	 * If we point to a T_TNODE, check if it matches our key. Note that
1017	 * a T_TNODE might be skipping any number of bits - its 'pos' need
1018	 * not be the parent's 'pos'+'bits'!
1019	 *
1020	 * If it does match the current key, get pos/bits from it, extract
1021	 * the index from our key, push the T_TNODE and walk the tree.
1022	 *
1023	 * If it doesn't, we have to replace it with a new T_TNODE.
1024	 *
1025	 * If we point to a T_LEAF, it might or might not have the same key
1026	 * as we do. If it does, just change the value, update the T_LEAF's
1027	 * value, and return it.
1028	 * If it doesn't, we need to replace it with a T_TNODE.
1029	 */
1030
1031	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
1032		tn = (struct tnode *) n;
1033
1034		check_tnode(tn);
1035
1036		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1037			tp = tn;
1038			pos = tn->pos + tn->bits;
1039			n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
1040
1041			BUG_ON(n && node_parent(n) != tn);
1042		} else
1043			break;
1044	}
1045
1046	/*
1047	 * n  ----> NULL, LEAF or TNODE
1048	 *
1049	 * tp is n's (parent) ----> NULL or TNODE
1050	 */
1051
1052	BUG_ON(tp && IS_LEAF(tp));
1053
1054	/* Case 1: n is a leaf. Compare prefixes */
1055
1056	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1057		struct leaf *l = (struct leaf *) n;
1058
1059		li = leaf_info_new(plen);
1060
1061		if (!li) {
1062			*err = -ENOMEM;
1063			goto err;
1064		}
1065
1066		fa_head = &li->falh;
1067		insert_leaf_info(&l->list, li);
1068		goto done;
1069	}
1070	t->size++;
1071	l = leaf_new();
1072
1073	if (!l) {
1074		*err = -ENOMEM;
1075		goto err;
1076	}
1077
1078	l->key = key;
1079	li = leaf_info_new(plen);
1080
1081	if (!li) {
1082		tnode_free((struct tnode *) l);
1083		*err = -ENOMEM;
1084		goto err;
1085	}
1086
1087	fa_head = &li->falh;
1088	insert_leaf_info(&l->list, li);
1089
1090	if (t->trie && n == NULL) {
1091		/* Case 2: n is NULL, and will just insert a new leaf */
1092
1093		node_set_parent((struct node *)l, tp);
1094
1095		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1096		put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1097	} else {
1098		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1099		/*
1100		 *  Add a new tnode here
1101		 *  first tnode need some special handling
1102		 */
1103
1104		if (tp)
1105			pos = tp->pos+tp->bits;
1106		else
1107			pos = 0;
1108
1109		if (n) {
1110			newpos = tkey_mismatch(key, pos, n->key);
1111			tn = tnode_new(n->key, newpos, 1);
1112		} else {
1113			newpos = 0;
1114			tn = tnode_new(key, newpos, 1); /* First tnode */
1115		}
1116
1117		if (!tn) {
1118			free_leaf_info(li);
1119			tnode_free((struct tnode *) l);
1120			*err = -ENOMEM;
1121			goto err;
1122		}
1123
1124		node_set_parent((struct node *)tn, tp);
1125
1126		missbit = tkey_extract_bits(key, newpos, 1);
1127		put_child(t, tn, missbit, (struct node *)l);
1128		put_child(t, tn, 1-missbit, n);
1129
1130		if (tp) {
1131			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1132			put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
1133		} else {
1134			rcu_assign_pointer(t->trie, (struct node *)tn); /* First tnode */
1135			tp = tn;
1136		}
1137	}
1138
1139	if (tp && tp->pos + tp->bits > 32)
1140		printk(KERN_WARNING "fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1141		       tp, tp->pos, tp->bits, key, plen);
1142
1143	/* Rebalance the trie */
1144
1145	rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1146done:
1147	t->revision++;
1148err:
1149	return fa_head;
1150}
1151
1152/*
1153 * Caller must hold RTNL.
1154 */
1155static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
1156{
1157	struct trie *t = (struct trie *) tb->tb_data;
1158	struct fib_alias *fa, *new_fa;
1159	struct list_head *fa_head = NULL;
1160	struct fib_info *fi;
1161	int plen = cfg->fc_dst_len;
1162	u8 tos = cfg->fc_tos;
1163	u32 key, mask;
1164	int err;
1165	struct leaf *l;
1166
1167	if (plen > 32)
1168		return -EINVAL;
1169
1170	key = ntohl(cfg->fc_dst);
1171
1172	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1173
1174	mask = ntohl(inet_make_mask(plen));
1175
1176	if (key & ~mask)
1177		return -EINVAL;
1178
1179	key = key & mask;
1180
1181	fi = fib_create_info(cfg);
1182	if (IS_ERR(fi)) {
1183		err = PTR_ERR(fi);
1184		goto err;
1185	}
1186
1187	l = fib_find_node(t, key);
1188	fa = NULL;
1189
1190	if (l) {
1191		fa_head = get_fa_head(l, plen);
1192		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1193	}
1194
1195	/* Now fa, if non-NULL, points to the first fib alias
1196	 * with the same keys [prefix,tos,priority], if such key already
1197	 * exists or to the node before which we will insert new one.
1198	 *
1199	 * If fa is NULL, we will need to allocate a new one and
1200	 * insert to the head of f.
1201	 *
1202	 * If f is NULL, no fib node matched the destination key
1203	 * and we need to allocate a new one of those as well.
1204	 */
1205
1206	if (fa && fa->fa_info->fib_priority == fi->fib_priority) {
1207		struct fib_alias *fa_orig;
1208
1209		err = -EEXIST;
1210		if (cfg->fc_nlflags & NLM_F_EXCL)
1211			goto out;
1212
1213		if (cfg->fc_nlflags & NLM_F_REPLACE) {
1214			struct fib_info *fi_drop;
1215			u8 state;
1216
1217			err = -ENOBUFS;
1218			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1219			if (new_fa == NULL)
1220				goto out;
1221
1222			fi_drop = fa->fa_info;
1223			new_fa->fa_tos = fa->fa_tos;
1224			new_fa->fa_info = fi;
1225			new_fa->fa_type = cfg->fc_type;
1226			new_fa->fa_scope = cfg->fc_scope;
1227			state = fa->fa_state;
1228			new_fa->fa_state &= ~FA_S_ACCESSED;
1229
1230			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1231			alias_free_mem_rcu(fa);
1232
1233			fib_release_info(fi_drop);
1234			if (state & FA_S_ACCESSED)
1235				rt_cache_flush(-1);
1236			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1237				tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1238
1239			goto succeeded;
1240		}
1241		/* Error if we find a perfect match which
1242		 * uses the same scope, type, and nexthop
1243		 * information.
1244		 */
1245		fa_orig = fa;
1246		list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
1247			if (fa->fa_tos != tos)
1248				break;
1249			if (fa->fa_info->fib_priority != fi->fib_priority)
1250				break;
1251			if (fa->fa_type == cfg->fc_type &&
1252			    fa->fa_scope == cfg->fc_scope &&
1253			    fa->fa_info == fi) {
1254				goto out;
1255			}
1256		}
1257		if (!(cfg->fc_nlflags & NLM_F_APPEND))
1258			fa = fa_orig;
1259	}
1260	err = -ENOENT;
1261	if (!(cfg->fc_nlflags & NLM_F_CREATE))
1262		goto out;
1263
1264	err = -ENOBUFS;
1265	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1266	if (new_fa == NULL)
1267		goto out;
1268
1269	new_fa->fa_info = fi;
1270	new_fa->fa_tos = tos;
1271	new_fa->fa_type = cfg->fc_type;
1272	new_fa->fa_scope = cfg->fc_scope;
1273	new_fa->fa_state = 0;
1274	/*
1275	 * Insert new entry to the list.
1276	 */
1277
1278	if (!fa_head) {
1279		err = 0;
1280		fa_head = fib_insert_node(t, &err, key, plen);
1281		if (err)
1282			goto out_free_new_fa;
1283	}
1284
1285	list_add_tail_rcu(&new_fa->fa_list,
1286			  (fa ? &fa->fa_list : fa_head));
1287
1288	rt_cache_flush(-1);
1289	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1290		  &cfg->fc_nlinfo, 0);
1291succeeded:
1292	return 0;
1293
1294out_free_new_fa:
1295	kmem_cache_free(fn_alias_kmem, new_fa);
1296out:
1297	fib_release_info(fi);
1298err:
1299	return err;
1300}
1301
1302
1303/* should be called with rcu_read_lock */
1304static inline int check_leaf(struct trie *t, struct leaf *l,
1305			     t_key key, int *plen, const struct flowi *flp,
1306			     struct fib_result *res)
1307{
1308	int err, i;
1309	__be32 mask;
1310	struct leaf_info *li;
1311	struct hlist_head *hhead = &l->list;
1312	struct hlist_node *node;
1313
1314	hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1315		i = li->plen;
1316		mask = inet_make_mask(i);
1317		if (l->key != (key & ntohl(mask)))
1318			continue;
1319
1320		if ((err = fib_semantic_match(&li->falh, flp, res, htonl(l->key), mask, i)) <= 0) {
1321			*plen = i;
1322#ifdef CONFIG_IP_FIB_TRIE_STATS
1323			t->stats.semantic_match_passed++;
1324#endif
1325			return err;
1326		}
1327#ifdef CONFIG_IP_FIB_TRIE_STATS
1328		t->stats.semantic_match_miss++;
1329#endif
1330	}
1331	return 1;
1332}
1333
1334static int
1335fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1336{
1337	struct trie *t = (struct trie *) tb->tb_data;
1338	int plen, ret = 0;
1339	struct node *n;
1340	struct tnode *pn;
1341	int pos, bits;
1342	t_key key = ntohl(flp->fl4_dst);
1343	int chopped_off;
1344	t_key cindex = 0;
1345	int current_prefix_length = KEYLENGTH;
1346	struct tnode *cn;
1347	t_key node_prefix, key_prefix, pref_mismatch;
1348	int mp;
1349
1350	rcu_read_lock();
1351
1352	n = rcu_dereference(t->trie);
1353	if (!n)
1354		goto failed;
1355
1356#ifdef CONFIG_IP_FIB_TRIE_STATS
1357	t->stats.gets++;
1358#endif
1359
1360	/* Just a leaf? */
1361	if (IS_LEAF(n)) {
1362		if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1363			goto found;
1364		goto failed;
1365	}
1366	pn = (struct tnode *) n;
1367	chopped_off = 0;
1368
1369	while (pn) {
1370		pos = pn->pos;
1371		bits = pn->bits;
1372
1373		if (!chopped_off)
1374			cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
1375						   pos, bits);
1376
1377		n = tnode_get_child(pn, cindex);
1378
1379		if (n == NULL) {
1380#ifdef CONFIG_IP_FIB_TRIE_STATS
1381			t->stats.null_node_hit++;
1382#endif
1383			goto backtrace;
1384		}
1385
1386		if (IS_LEAF(n)) {
1387			if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1388				goto found;
1389			else
1390				goto backtrace;
1391		}
1392
1393#define HL_OPTIMIZE
1394#ifdef HL_OPTIMIZE
1395		cn = (struct tnode *)n;
1396
1397		/*
1398		 * It's a tnode, and we can do some extra checks here if we
1399		 * like, to avoid descending into a dead-end branch.
1400		 * This tnode is in the parent's child array at index
1401		 * key[p_pos..p_pos+p_bits] but potentially with some bits
1402		 * chopped off, so in reality the index may be just a
1403		 * subprefix, padded with zero at the end.
1404		 * We can also take a look at any skipped bits in this
1405		 * tnode - everything up to p_pos is supposed to be ok,
1406		 * and the non-chopped bits of the index (se previous
1407		 * paragraph) are also guaranteed ok, but the rest is
1408		 * considered unknown.
1409		 *
1410		 * The skipped bits are key[pos+bits..cn->pos].
1411		 */
1412
1413		/* If current_prefix_length < pos+bits, we are already doing
1414		 * actual prefix  matching, which means everything from
1415		 * pos+(bits-chopped_off) onward must be zero along some
1416		 * branch of this subtree - otherwise there is *no* valid
1417		 * prefix present. Here we can only check the skipped
1418		 * bits. Remember, since we have already indexed into the
1419		 * parent's child array, we know that the bits we chopped of
1420		 * *are* zero.
1421		 */
1422
1423		/* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
1424
1425		if (current_prefix_length < pos+bits) {
1426			if (tkey_extract_bits(cn->key, current_prefix_length,
1427						cn->pos - current_prefix_length) != 0 ||
1428			    !(cn->child[0]))
1429				goto backtrace;
1430		}
1431
1432		/*
1433		 * If chopped_off=0, the index is fully validated and we
1434		 * only need to look at the skipped bits for this, the new,
1435		 * tnode. What we actually want to do is to find out if
1436		 * these skipped bits match our key perfectly, or if we will
1437		 * have to count on finding a matching prefix further down,
1438		 * because if we do, we would like to have some way of
1439		 * verifying the existence of such a prefix at this point.
1440		 */
1441
1442		/* The only thing we can do at this point is to verify that
1443		 * any such matching prefix can indeed be a prefix to our
1444		 * key, and if the bits in the node we are inspecting that
1445		 * do not match our key are not ZERO, this cannot be true.
1446		 * Thus, find out where there is a mismatch (before cn->pos)
1447		 * and verify that all the mismatching bits are zero in the
1448		 * new tnode's key.
1449		 */
1450
1451		/* Note: We aren't very concerned about the piece of the key
1452		 * that precede pn->pos+pn->bits, since these have already been
1453		 * checked. The bits after cn->pos aren't checked since these are
1454		 * by definition "unknown" at this point. Thus, what we want to
1455		 * see is if we are about to enter the "prefix matching" state,
1456		 * and in that case verify that the skipped bits that will prevail
1457		 * throughout this subtree are zero, as they have to be if we are
1458		 * to find a matching prefix.
1459		 */
1460
1461		node_prefix = mask_pfx(cn->key, cn->pos);
1462		key_prefix = mask_pfx(key, cn->pos);
1463		pref_mismatch = key_prefix^node_prefix;
1464		mp = 0;
1465
1466		/* In short: If skipped bits in this node do not match the search
1467		 * key, enter the "prefix matching" state.directly.
1468		 */
1469		if (pref_mismatch) {
1470			while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1471				mp++;
1472				pref_mismatch = pref_mismatch <<1;
1473			}
1474			key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1475
1476			if (key_prefix != 0)
1477				goto backtrace;
1478
1479			if (current_prefix_length >= cn->pos)
1480				current_prefix_length = mp;
1481		}
1482#endif
1483		pn = (struct tnode *)n; /* Descend */
1484		chopped_off = 0;
1485		continue;
1486
1487backtrace:
1488		chopped_off++;
1489
1490		/* As zero don't change the child key (cindex) */
1491		while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1))))
1492			chopped_off++;
1493
1494		/* Decrease current_... with bits chopped off */
1495		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1496			current_prefix_length = pn->pos + pn->bits - chopped_off;
1497
1498		/*
1499		 * Either we do the actual chop off according or if we have
1500		 * chopped off all bits in this tnode walk up to our parent.
1501		 */
1502
1503		if (chopped_off <= pn->bits) {
1504			cindex &= ~(1 << (chopped_off-1));
1505		} else {
1506			struct tnode *parent = node_parent((struct node *) pn);
1507			if (!parent)
1508				goto failed;
1509
1510			/* Get Child's index */
1511			cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
1512			pn = parent;
1513			chopped_off = 0;
1514
1515#ifdef CONFIG_IP_FIB_TRIE_STATS
1516			t->stats.backtrack++;
1517#endif
1518			goto backtrace;
1519		}
1520	}
1521failed:
1522	ret = 1;
1523found:
1524	rcu_read_unlock();
1525	return ret;
1526}
1527
1528/* only called from updater side */
1529static int trie_leaf_remove(struct trie *t, t_key key)
1530{
1531	t_key cindex;
1532	struct tnode *tp = NULL;
1533	struct node *n = t->trie;
1534	struct leaf *l;
1535
1536	pr_debug("entering trie_leaf_remove(%p)\n", n);
1537
1538	/* Note that in the case skipped bits, those bits are *not* checked!
1539	 * When we finish this, we will have NULL or a T_LEAF, and the
1540	 * T_LEAF may or may not match our key.
1541	 */
1542
1543	while (n != NULL && IS_TNODE(n)) {
1544		struct tnode *tn = (struct tnode *) n;
1545		check_tnode(tn);
1546		n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));
1547
1548		BUG_ON(n && node_parent(n) != tn);
1549	}
1550	l = (struct leaf *) n;
1551
1552	if (!n || !tkey_equals(l->key, key))
1553		return 0;
1554
1555	/*
1556	 * Key found.
1557	 * Remove the leaf and rebalance the tree
1558	 */
1559
1560	t->revision++;
1561	t->size--;
1562
1563	tp = node_parent(n);
1564	tnode_free((struct tnode *) n);
1565
1566	if (tp) {
1567		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1568		put_child(t, (struct tnode *)tp, cindex, NULL);
1569		rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1570	} else
1571		rcu_assign_pointer(t->trie, NULL);
1572
1573	return 1;
1574}
1575
1576/*
1577 * Caller must hold RTNL.
1578 */
1579static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg)
1580{
1581	struct trie *t = (struct trie *) tb->tb_data;
1582	u32 key, mask;
1583	int plen = cfg->fc_dst_len;
1584	u8 tos = cfg->fc_tos;
1585	struct fib_alias *fa, *fa_to_delete;
1586	struct list_head *fa_head;
1587	struct leaf *l;
1588	struct leaf_info *li;
1589
1590	if (plen > 32)
1591		return -EINVAL;
1592
1593	key = ntohl(cfg->fc_dst);
1594	mask = ntohl(inet_make_mask(plen));
1595
1596	if (key & ~mask)
1597		return -EINVAL;
1598
1599	key = key & mask;
1600	l = fib_find_node(t, key);
1601
1602	if (!l)
1603		return -ESRCH;
1604
1605	fa_head = get_fa_head(l, plen);
1606	fa = fib_find_alias(fa_head, tos, 0);
1607
1608	if (!fa)
1609		return -ESRCH;
1610
1611	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1612
1613	fa_to_delete = NULL;
1614	fa_head = fa->fa_list.prev;
1615
1616	list_for_each_entry(fa, fa_head, fa_list) {
1617		struct fib_info *fi = fa->fa_info;
1618
1619		if (fa->fa_tos != tos)
1620			break;
1621
1622		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1623		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1624		     fa->fa_scope == cfg->fc_scope) &&
1625		    (!cfg->fc_protocol ||
1626		     fi->fib_protocol == cfg->fc_protocol) &&
1627		    fib_nh_match(cfg, fi) == 0) {
1628			fa_to_delete = fa;
1629			break;
1630		}
1631	}
1632
1633	if (!fa_to_delete)
1634		return -ESRCH;
1635
1636	fa = fa_to_delete;
1637	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1638		  &cfg->fc_nlinfo, 0);
1639
1640	l = fib_find_node(t, key);
1641	li = find_leaf_info(l, plen);
1642
1643	list_del_rcu(&fa->fa_list);
1644
1645	if (list_empty(fa_head)) {
1646		hlist_del_rcu(&li->hlist);
1647		free_leaf_info(li);
1648	}
1649
1650	if (hlist_empty(&l->list))
1651		trie_leaf_remove(t, key);
1652
1653	if (fa->fa_state & FA_S_ACCESSED)
1654		rt_cache_flush(-1);
1655
1656	fib_release_info(fa->fa_info);
1657	alias_free_mem_rcu(fa);
1658	return 0;
1659}
1660
1661static int trie_flush_list(struct trie *t, struct list_head *head)
1662{
1663	struct fib_alias *fa, *fa_node;
1664	int found = 0;
1665
1666	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1667		struct fib_info *fi = fa->fa_info;
1668
1669		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1670			list_del_rcu(&fa->fa_list);
1671			fib_release_info(fa->fa_info);
1672			alias_free_mem_rcu(fa);
1673			found++;
1674		}
1675	}
1676	return found;
1677}
1678
1679static int trie_flush_leaf(struct trie *t, struct leaf *l)
1680{
1681	int found = 0;
1682	struct hlist_head *lih = &l->list;
1683	struct hlist_node *node, *tmp;
1684	struct leaf_info *li = NULL;
1685
1686	hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1687		found += trie_flush_list(t, &li->falh);
1688
1689		if (list_empty(&li->falh)) {
1690			hlist_del_rcu(&li->hlist);
1691			free_leaf_info(li);
1692		}
1693	}
1694	return found;
1695}
1696
1697/* rcu_read_lock needs to be hold by caller from readside */
1698
1699static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1700{
1701	struct node *c = (struct node *) thisleaf;
1702	struct tnode *p;
1703	int idx;
1704	struct node *trie = rcu_dereference(t->trie);
1705
1706	if (c == NULL) {
1707		if (trie == NULL)
1708			return NULL;
1709
1710		if (IS_LEAF(trie))          /* trie w. just a leaf */
1711			return (struct leaf *) trie;
1712
1713		p = (struct tnode*) trie;  /* Start */
1714	} else
1715		p = node_parent(c);
1716
1717	while (p) {
1718		int pos, last;
1719
1720		/*  Find the next child of the parent */
1721		if (c)
1722			pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
1723		else
1724			pos = 0;
1725
1726		last = 1 << p->bits;
1727		for (idx = pos; idx < last ; idx++) {
1728			c = rcu_dereference(p->child[idx]);
1729
1730			if (!c)
1731				continue;
1732
1733			/* Decend if tnode */
1734			while (IS_TNODE(c)) {
1735				p = (struct tnode *) c;
1736				idx = 0;
1737
1738				/* Rightmost non-NULL branch */
1739				if (p && IS_TNODE(p))
1740					while (!(c = rcu_dereference(p->child[idx]))
1741					       && idx < (1<<p->bits)) idx++;
1742
1743				/* Done with this tnode? */
1744				if (idx >= (1 << p->bits) || !c)
1745					goto up;
1746			}
1747			return (struct leaf *) c;
1748		}
1749up:
1750		/* No more children go up one step  */
1751		c = (struct node *) p;
1752		p = node_parent(c);
1753	}
1754	return NULL; /* Ready. Root of trie */
1755}
1756
1757/*
1758 * Caller must hold RTNL.
1759 */
1760static int fn_trie_flush(struct fib_table *tb)
1761{
1762	struct trie *t = (struct trie *) tb->tb_data;
1763	struct leaf *ll = NULL, *l = NULL;
1764	int found = 0, h;
1765
1766	t->revision++;
1767
1768	for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1769		found += trie_flush_leaf(t, l);
1770
1771		if (ll && hlist_empty(&ll->list))
1772			trie_leaf_remove(t, ll->key);
1773		ll = l;
1774	}
1775
1776	if (ll && hlist_empty(&ll->list))
1777		trie_leaf_remove(t, ll->key);
1778
1779	pr_debug("trie_flush found=%d\n", found);
1780	return found;
1781}
1782
1783static int trie_last_dflt = -1;
1784
1785static void
1786fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1787{
1788	struct trie *t = (struct trie *) tb->tb_data;
1789	int order, last_idx;
1790	struct fib_info *fi = NULL;
1791	struct fib_info *last_resort;
1792	struct fib_alias *fa = NULL;
1793	struct list_head *fa_head;
1794	struct leaf *l;
1795
1796	last_idx = -1;
1797	last_resort = NULL;
1798	order = -1;
1799
1800	rcu_read_lock();
1801
1802	l = fib_find_node(t, 0);
1803	if (!l)
1804		goto out;
1805
1806	fa_head = get_fa_head(l, 0);
1807	if (!fa_head)
1808		goto out;
1809
1810	if (list_empty(fa_head))
1811		goto out;
1812
1813	list_for_each_entry_rcu(fa, fa_head, fa_list) {
1814		struct fib_info *next_fi = fa->fa_info;
1815
1816		if (fa->fa_scope != res->scope ||
1817		    fa->fa_type != RTN_UNICAST)
1818			continue;
1819
1820		if (next_fi->fib_priority > res->fi->fib_priority)
1821			break;
1822		if (!next_fi->fib_nh[0].nh_gw ||
1823		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1824			continue;
1825		fa->fa_state |= FA_S_ACCESSED;
1826
1827		if (fi == NULL) {
1828			if (next_fi != res->fi)
1829				break;
1830		} else if (!fib_detect_death(fi, order, &last_resort,
1831					     &last_idx, &trie_last_dflt)) {
1832			if (res->fi)
1833				fib_info_put(res->fi);
1834			res->fi = fi;
1835			atomic_inc(&fi->fib_clntref);
1836			trie_last_dflt = order;
1837			goto out;
1838		}
1839		fi = next_fi;
1840		order++;
1841	}
1842	if (order <= 0 || fi == NULL) {
1843		trie_last_dflt = -1;
1844		goto out;
1845	}
1846
1847	if (!fib_detect_death(fi, order, &last_resort, &last_idx, &trie_last_dflt)) {
1848		if (res->fi)
1849			fib_info_put(res->fi);
1850		res->fi = fi;
1851		atomic_inc(&fi->fib_clntref);
1852		trie_last_dflt = order;
1853		goto out;
1854	}
1855	if (last_idx >= 0) {
1856		if (res->fi)
1857			fib_info_put(res->fi);
1858		res->fi = last_resort;
1859		if (last_resort)
1860			atomic_inc(&last_resort->fib_clntref);
1861	}
1862	trie_last_dflt = last_idx;
1863 out:;
1864	rcu_read_unlock();
1865}
1866
1867static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
1868			   struct sk_buff *skb, struct netlink_callback *cb)
1869{
1870	int i, s_i;
1871	struct fib_alias *fa;
1872
1873	__be32 xkey = htonl(key);
1874
1875	s_i = cb->args[4];
1876	i = 0;
1877
1878	/* rcu_read_lock is hold by caller */
1879
1880	list_for_each_entry_rcu(fa, fah, fa_list) {
1881		if (i < s_i) {
1882			i++;
1883			continue;
1884		}
1885		BUG_ON(!fa->fa_info);
1886
1887		if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1888				  cb->nlh->nlmsg_seq,
1889				  RTM_NEWROUTE,
1890				  tb->tb_id,
1891				  fa->fa_type,
1892				  fa->fa_scope,
1893				  xkey,
1894				  plen,
1895				  fa->fa_tos,
1896				  fa->fa_info, 0) < 0) {
1897			cb->args[4] = i;
1898			return -1;
1899		}
1900		i++;
1901	}
1902	cb->args[4] = i;
1903	return skb->len;
1904}
1905
1906static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
1907			     struct netlink_callback *cb)
1908{
1909	int h, s_h;
1910	struct list_head *fa_head;
1911	struct leaf *l = NULL;
1912
1913	s_h = cb->args[3];
1914
1915	for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1916		if (h < s_h)
1917			continue;
1918		if (h > s_h)
1919			memset(&cb->args[4], 0,
1920			       sizeof(cb->args) - 4*sizeof(cb->args[0]));
1921
1922		fa_head = get_fa_head(l, plen);
1923
1924		if (!fa_head)
1925			continue;
1926
1927		if (list_empty(fa_head))
1928			continue;
1929
1930		if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
1931			cb->args[3] = h;
1932			return -1;
1933		}
1934	}
1935	cb->args[3] = h;
1936	return skb->len;
1937}
1938
1939static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
1940{
1941	int m, s_m;
1942	struct trie *t = (struct trie *) tb->tb_data;
1943
1944	s_m = cb->args[2];
1945
1946	rcu_read_lock();
1947	for (m = 0; m <= 32; m++) {
1948		if (m < s_m)
1949			continue;
1950		if (m > s_m)
1951			memset(&cb->args[3], 0,
1952				sizeof(cb->args) - 3*sizeof(cb->args[0]));
1953
1954		if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
1955			cb->args[2] = m;
1956			goto out;
1957		}
1958	}
1959	rcu_read_unlock();
1960	cb->args[2] = m;
1961	return skb->len;
1962out:
1963	rcu_read_unlock();
1964	return -1;
1965}
1966
1967/* Fix more generic FIB names for init later */
1968
1969#ifdef CONFIG_IP_MULTIPLE_TABLES
1970struct fib_table * fib_hash_init(u32 id)
1971#else
1972struct fib_table * __init fib_hash_init(u32 id)
1973#endif
1974{
1975	struct fib_table *tb;
1976	struct trie *t;
1977
1978	if (fn_alias_kmem == NULL)
1979		fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1980						  sizeof(struct fib_alias),
1981						  0, SLAB_HWCACHE_ALIGN,
1982						  NULL);
1983
1984	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1985		     GFP_KERNEL);
1986	if (tb == NULL)
1987		return NULL;
1988
1989	tb->tb_id = id;
1990	tb->tb_lookup = fn_trie_lookup;
1991	tb->tb_insert = fn_trie_insert;
1992	tb->tb_delete = fn_trie_delete;
1993	tb->tb_flush = fn_trie_flush;
1994	tb->tb_select_default = fn_trie_select_default;
1995	tb->tb_dump = fn_trie_dump;
1996	memset(tb->tb_data, 0, sizeof(struct trie));
1997
1998	t = (struct trie *) tb->tb_data;
1999
2000	trie_init(t);
2001
2002	if (id == RT_TABLE_LOCAL)
2003		trie_local = t;
2004	else if (id == RT_TABLE_MAIN)
2005		trie_main = t;
2006
2007	if (id == RT_TABLE_LOCAL)
2008		printk(KERN_INFO "IPv4 FIB: Using LC-trie version %s\n", VERSION);
2009
2010	return tb;
2011}
2012
2013#ifdef CONFIG_PROC_FS
2014/* Depth first Trie walk iterator */
2015struct fib_trie_iter {
2016	struct tnode *tnode;
2017	struct trie *trie;
2018	unsigned index;
2019	unsigned depth;
2020};
2021
2022static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
2023{
2024	struct tnode *tn = iter->tnode;
2025	unsigned cindex = iter->index;
2026	struct tnode *p;
2027
2028	/* A single entry routing table */
2029	if (!tn)
2030		return NULL;
2031
2032	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2033		 iter->tnode, iter->index, iter->depth);
2034rescan:
2035	while (cindex < (1<<tn->bits)) {
2036		struct node *n = tnode_get_child(tn, cindex);
2037
2038		if (n) {
2039			if (IS_LEAF(n)) {
2040				iter->tnode = tn;
2041				iter->index = cindex + 1;
2042			} else {
2043				/* push down one level */
2044				iter->tnode = (struct tnode *) n;
2045				iter->index = 0;
2046				++iter->depth;
2047			}
2048			return n;
2049		}
2050
2051		++cindex;
2052	}
2053
2054	/* Current node exhausted, pop back up */
2055	p = node_parent((struct node *)tn);
2056	if (p) {
2057		cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2058		tn = p;
2059		--iter->depth;
2060		goto rescan;
2061	}
2062
2063	/* got root? */
2064	return NULL;
2065}
2066
2067static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2068				       struct trie *t)
2069{
2070	struct node *n ;
2071
2072	if (!t)
2073		return NULL;
2074
2075	n = rcu_dereference(t->trie);
2076
2077	if (!iter)
2078		return NULL;
2079
2080	if (n) {
2081		if (IS_TNODE(n)) {
2082			iter->tnode = (struct tnode *) n;
2083			iter->trie = t;
2084			iter->index = 0;
2085			iter->depth = 1;
2086		} else {
2087			iter->tnode = NULL;
2088			iter->trie  = t;
2089			iter->index = 0;
2090			iter->depth = 0;
2091		}
2092		return n;
2093	}
2094	return NULL;
2095}
2096
2097static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2098{
2099	struct node *n;
2100	struct fib_trie_iter iter;
2101
2102	memset(s, 0, sizeof(*s));
2103
2104	rcu_read_lock();
2105	for (n = fib_trie_get_first(&iter, t); n;
2106	     n = fib_trie_get_next(&iter)) {
2107		if (IS_LEAF(n)) {
2108			s->leaves++;
2109			s->totdepth += iter.depth;
2110			if (iter.depth > s->maxdepth)
2111				s->maxdepth = iter.depth;
2112		} else {
2113			const struct tnode *tn = (const struct tnode *) n;
2114			int i;
2115
2116			s->tnodes++;
2117			if (tn->bits < MAX_STAT_DEPTH)
2118				s->nodesizes[tn->bits]++;
2119
2120			for (i = 0; i < (1<<tn->bits); i++)
2121				if (!tn->child[i])
2122					s->nullpointers++;
2123		}
2124	}
2125	rcu_read_unlock();
2126}
2127
2128/*
2129 *	This outputs /proc/net/fib_triestats
2130 */
2131static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2132{
2133	unsigned i, max, pointers, bytes, avdepth;
2134
2135	if (stat->leaves)
2136		avdepth = stat->totdepth*100 / stat->leaves;
2137	else
2138		avdepth = 0;
2139
2140	seq_printf(seq, "\tAver depth:     %d.%02d\n", avdepth / 100, avdepth % 100 );
2141	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
2142
2143	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
2144
2145	bytes = sizeof(struct leaf) * stat->leaves;
2146	seq_printf(seq, "\tInternal nodes: %d\n\t", stat->tnodes);
2147	bytes += sizeof(struct tnode) * stat->tnodes;
2148
2149	max = MAX_STAT_DEPTH;
2150	while (max > 0 && stat->nodesizes[max-1] == 0)
2151		max--;
2152
2153	pointers = 0;
2154	for (i = 1; i <= max; i++)
2155		if (stat->nodesizes[i] != 0) {
2156			seq_printf(seq, "  %d: %d",  i, stat->nodesizes[i]);
2157			pointers += (1<<i) * stat->nodesizes[i];
2158		}
2159	seq_putc(seq, '\n');
2160	seq_printf(seq, "\tPointers: %d\n", pointers);
2161
2162	bytes += sizeof(struct node *) * pointers;
2163	seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers);
2164	seq_printf(seq, "Total size: %d  kB\n", (bytes + 1023) / 1024);
2165
2166#ifdef CONFIG_IP_FIB_TRIE_STATS
2167	seq_printf(seq, "Counters:\n---------\n");
2168	seq_printf(seq,"gets = %d\n", t->stats.gets);
2169	seq_printf(seq,"backtracks = %d\n", t->stats.backtrack);
2170	seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed);
2171	seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss);
2172	seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit);
2173	seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped);
2174#ifdef CLEAR_STATS
2175	memset(&(t->stats), 0, sizeof(t->stats));
2176#endif
2177#endif /*  CONFIG_IP_FIB_TRIE_STATS */
2178}
2179
2180static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2181{
2182	struct trie_stat *stat;
2183
2184	stat = kmalloc(sizeof(*stat), GFP_KERNEL);
2185	if (!stat)
2186		return -ENOMEM;
2187
2188	seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
2189		   sizeof(struct leaf), sizeof(struct tnode));
2190
2191	if (trie_local) {
2192		seq_printf(seq, "Local:\n");
2193		trie_collect_stats(trie_local, stat);
2194		trie_show_stats(seq, stat);
2195	}
2196
2197	if (trie_main) {
2198		seq_printf(seq, "Main:\n");
2199		trie_collect_stats(trie_main, stat);
2200		trie_show_stats(seq, stat);
2201	}
2202	kfree(stat);
2203
2204	return 0;
2205}
2206
2207static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2208{
2209	return single_open(file, fib_triestat_seq_show, NULL);
2210}
2211
2212static const struct file_operations fib_triestat_fops = {
2213	.owner	= THIS_MODULE,
2214	.open	= fib_triestat_seq_open,
2215	.read	= seq_read,
2216	.llseek	= seq_lseek,
2217	.release = single_release,
2218};
2219
2220static struct node *fib_trie_get_idx(struct fib_trie_iter *iter,
2221				      loff_t pos)
2222{
2223	loff_t idx = 0;
2224	struct node *n;
2225
2226	for (n = fib_trie_get_first(iter, trie_local);
2227	     n; ++idx, n = fib_trie_get_next(iter)) {
2228		if (pos == idx)
2229			return n;
2230	}
2231
2232	for (n = fib_trie_get_first(iter, trie_main);
2233	     n; ++idx, n = fib_trie_get_next(iter)) {
2234		if (pos == idx)
2235			return n;
2236	}
2237	return NULL;
2238}
2239
2240static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2241{
2242	rcu_read_lock();
2243	if (*pos == 0)
2244		return SEQ_START_TOKEN;
2245	return fib_trie_get_idx(seq->private, *pos - 1);
2246}
2247
2248static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2249{
2250	struct fib_trie_iter *iter = seq->private;
2251	void *l = v;
2252
2253	++*pos;
2254	if (v == SEQ_START_TOKEN)
2255		return fib_trie_get_idx(iter, 0);
2256
2257	v = fib_trie_get_next(iter);
2258	BUG_ON(v == l);
2259	if (v)
2260		return v;
2261
2262	/* continue scan in next trie */
2263	if (iter->trie == trie_local)
2264		return fib_trie_get_first(iter, trie_main);
2265
2266	return NULL;
2267}
2268
2269static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2270{
2271	rcu_read_unlock();
2272}
2273
2274static void seq_indent(struct seq_file *seq, int n)
2275{
2276	while (n-- > 0) seq_puts(seq, "   ");
2277}
2278
2279static inline const char *rtn_scope(enum rt_scope_t s)
2280{
2281	static char buf[32];
2282
2283	switch (s) {
2284	case RT_SCOPE_UNIVERSE: return "universe";
2285	case RT_SCOPE_SITE:	return "site";
2286	case RT_SCOPE_LINK:	return "link";
2287	case RT_SCOPE_HOST:	return "host";
2288	case RT_SCOPE_NOWHERE:	return "nowhere";
2289	default:
2290		snprintf(buf, sizeof(buf), "scope=%d", s);
2291		return buf;
2292	}
2293}
2294
2295static const char *rtn_type_names[__RTN_MAX] = {
2296	[RTN_UNSPEC] = "UNSPEC",
2297	[RTN_UNICAST] = "UNICAST",
2298	[RTN_LOCAL] = "LOCAL",
2299	[RTN_BROADCAST] = "BROADCAST",
2300	[RTN_ANYCAST] = "ANYCAST",
2301	[RTN_MULTICAST] = "MULTICAST",
2302	[RTN_BLACKHOLE] = "BLACKHOLE",
2303	[RTN_UNREACHABLE] = "UNREACHABLE",
2304	[RTN_PROHIBIT] = "PROHIBIT",
2305	[RTN_THROW] = "THROW",
2306	[RTN_NAT] = "NAT",
2307	[RTN_XRESOLVE] = "XRESOLVE",
2308};
2309
2310static inline const char *rtn_type(unsigned t)
2311{
2312	static char buf[32];
2313
2314	if (t < __RTN_MAX && rtn_type_names[t])
2315		return rtn_type_names[t];
2316	snprintf(buf, sizeof(buf), "type %d", t);
2317	return buf;
2318}
2319
2320/* Pretty print the trie */
2321static int fib_trie_seq_show(struct seq_file *seq, void *v)
2322{
2323	const struct fib_trie_iter *iter = seq->private;
2324	struct node *n = v;
2325
2326	if (v == SEQ_START_TOKEN)
2327		return 0;
2328
2329	if (!node_parent(n)) {
2330		if (iter->trie == trie_local)
2331			seq_puts(seq, "<local>:\n");
2332		else
2333			seq_puts(seq, "<main>:\n");
2334	}
2335
2336	if (IS_TNODE(n)) {
2337		struct tnode *tn = (struct tnode *) n;
2338		__be32 prf = htonl(mask_pfx(tn->key, tn->pos));
2339
2340		seq_indent(seq, iter->depth-1);
2341		seq_printf(seq, "  +-- %d.%d.%d.%d/%d %d %d %d\n",
2342			   NIPQUAD(prf), tn->pos, tn->bits, tn->full_children,
2343			   tn->empty_children);
2344
2345	} else {
2346		struct leaf *l = (struct leaf *) n;
2347		int i;
2348		__be32 val = htonl(l->key);
2349
2350		seq_indent(seq, iter->depth);
2351		seq_printf(seq, "  |-- %d.%d.%d.%d\n", NIPQUAD(val));
2352		for (i = 32; i >= 0; i--) {
2353			struct leaf_info *li = find_leaf_info(l, i);
2354			if (li) {
2355				struct fib_alias *fa;
2356				list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2357					seq_indent(seq, iter->depth+1);
2358					seq_printf(seq, "  /%d %s %s", i,
2359						   rtn_scope(fa->fa_scope),
2360						   rtn_type(fa->fa_type));
2361					if (fa->fa_tos)
2362						seq_printf(seq, "tos =%d\n",
2363							   fa->fa_tos);
2364					seq_putc(seq, '\n');
2365				}
2366			}
2367		}
2368	}
2369
2370	return 0;
2371}
2372
2373static const struct seq_operations fib_trie_seq_ops = {
2374	.start  = fib_trie_seq_start,
2375	.next   = fib_trie_seq_next,
2376	.stop   = fib_trie_seq_stop,
2377	.show   = fib_trie_seq_show,
2378};
2379
2380static int fib_trie_seq_open(struct inode *inode, struct file *file)
2381{
2382	return seq_open_private(file, &fib_trie_seq_ops,
2383			sizeof(struct fib_trie_iter));
2384}
2385
2386static const struct file_operations fib_trie_fops = {
2387	.owner  = THIS_MODULE,
2388	.open   = fib_trie_seq_open,
2389	.read   = seq_read,
2390	.llseek = seq_lseek,
2391	.release = seq_release_private,
2392};
2393
2394static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2395{
2396	static unsigned type2flags[RTN_MAX + 1] = {
2397		[7] = RTF_REJECT, [8] = RTF_REJECT,
2398	};
2399	unsigned flags = type2flags[type];
2400
2401	if (fi && fi->fib_nh->nh_gw)
2402		flags |= RTF_GATEWAY;
2403	if (mask == htonl(0xFFFFFFFF))
2404		flags |= RTF_HOST;
2405	flags |= RTF_UP;
2406	return flags;
2407}
2408
2409/*
2410 *	This outputs /proc/net/route.
2411 *	The format of the file is not supposed to be changed
2412 * 	and needs to be same as fib_hash output to avoid breaking
2413 *	legacy utilities
2414 */
2415static int fib_route_seq_show(struct seq_file *seq, void *v)
2416{
2417	const struct fib_trie_iter *iter = seq->private;
2418	struct leaf *l = v;
2419	int i;
2420	char bf[128];
2421
2422	if (v == SEQ_START_TOKEN) {
2423		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2424			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2425			   "\tWindow\tIRTT");
2426		return 0;
2427	}
2428
2429	if (iter->trie == trie_local)
2430		return 0;
2431	if (IS_TNODE(l))
2432		return 0;
2433
2434	for (i=32; i>=0; i--) {
2435		struct leaf_info *li = find_leaf_info(l, i);
2436		struct fib_alias *fa;
2437		__be32 mask, prefix;
2438
2439		if (!li)
2440			continue;
2441
2442		mask = inet_make_mask(li->plen);
2443		prefix = htonl(l->key);
2444
2445		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2446			const struct fib_info *fi = fa->fa_info;
2447			unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
2448
2449			if (fa->fa_type == RTN_BROADCAST
2450			    || fa->fa_type == RTN_MULTICAST)
2451				continue;
2452
2453			if (fi)
2454				snprintf(bf, sizeof(bf),
2455					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2456					 fi->fib_dev ? fi->fib_dev->name : "*",
2457					 prefix,
2458					 fi->fib_nh->nh_gw, flags, 0, 0,
2459					 fi->fib_priority,
2460					 mask,
2461					 (fi->fib_advmss ? fi->fib_advmss + 40 : 0),
2462					 fi->fib_window,
2463					 fi->fib_rtt >> 3);
2464			else
2465				snprintf(bf, sizeof(bf),
2466					 "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2467					 prefix, 0, flags, 0, 0, 0,
2468					 mask, 0, 0, 0);
2469
2470			seq_printf(seq, "%-127s\n", bf);
2471		}
2472	}
2473
2474	return 0;
2475}
2476
2477static const struct seq_operations fib_route_seq_ops = {
2478	.start  = fib_trie_seq_start,
2479	.next   = fib_trie_seq_next,
2480	.stop   = fib_trie_seq_stop,
2481	.show   = fib_route_seq_show,
2482};
2483
2484static int fib_route_seq_open(struct inode *inode, struct file *file)
2485{
2486	return seq_open_private(file, &fib_route_seq_ops,
2487			sizeof(struct fib_trie_iter));
2488}
2489
2490static const struct file_operations fib_route_fops = {
2491	.owner  = THIS_MODULE,
2492	.open   = fib_route_seq_open,
2493	.read   = seq_read,
2494	.llseek = seq_lseek,
2495	.release = seq_release_private,
2496};
2497
2498int __init fib_proc_init(void)
2499{
2500	if (!proc_net_fops_create(&init_net, "fib_trie", S_IRUGO, &fib_trie_fops))
2501		goto out1;
2502
2503	if (!proc_net_fops_create(&init_net, "fib_triestat", S_IRUGO, &fib_triestat_fops))
2504		goto out2;
2505
2506	if (!proc_net_fops_create(&init_net, "route", S_IRUGO, &fib_route_fops))
2507		goto out3;
2508
2509	return 0;
2510
2511out3:
2512	proc_net_remove(&init_net, "fib_triestat");
2513out2:
2514	proc_net_remove(&init_net, "fib_trie");
2515out1:
2516	return -ENOMEM;
2517}
2518
2519void __init fib_proc_exit(void)
2520{
2521	proc_net_remove(&init_net, "fib_trie");
2522	proc_net_remove(&init_net, "fib_triestat");
2523	proc_net_remove(&init_net, "route");
2524}
2525
2526#endif /* CONFIG_PROC_FS */
2527