fib_trie.c revision 187b5188a78694fa6608fa1252d5197a7b3ab076
1/*
2 *   This program is free software; you can redistribute it and/or
3 *   modify it under the terms of the GNU General Public License
4 *   as published by the Free Software Foundation; either version
5 *   2 of the License, or (at your option) any later version.
6 *
7 *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 *     & Swedish University of Agricultural Sciences.
9 *
10 *   Jens Laas <jens.laas@data.slu.se> Swedish University of
11 *     Agricultural Sciences.
12 *
13 *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
14 *
15 * This work is based on the LPC-trie which is originally described in:
16 *
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
25 * Version:	$Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
26 *
27 *
28 * Code from fib_hash has been reused which includes the following header:
29 *
30 *
31 * INET		An implementation of the TCP/IP protocol suite for the LINUX
32 *		operating system.  INET is implemented using the  BSD Socket
33 *		interface as the means of communication with the user level.
34 *
35 *		IPv4 FIB: lookup engine and maintenance routines.
36 *
37 *
38 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
39 *
40 *		This program is free software; you can redistribute it and/or
41 *		modify it under the terms of the GNU General Public License
42 *		as published by the Free Software Foundation; either version
43 *		2 of the License, or (at your option) any later version.
44 *
45 * Substantial contributions to this work come from:
46 *
47 *		David S. Miller, <davem@davemloft.net>
48 *		Stephen Hemminger <shemminger@osdl.org>
49 *		Paul E. McKenney <paulmck@us.ibm.com>
50 *		Patrick McHardy <kaber@trash.net>
51 */
52
53#define VERSION "0.408"
54
55#include <asm/uaccess.h>
56#include <asm/system.h>
57#include <linux/bitops.h>
58#include <linux/types.h>
59#include <linux/kernel.h>
60#include <linux/mm.h>
61#include <linux/string.h>
62#include <linux/socket.h>
63#include <linux/sockios.h>
64#include <linux/errno.h>
65#include <linux/in.h>
66#include <linux/inet.h>
67#include <linux/inetdevice.h>
68#include <linux/netdevice.h>
69#include <linux/if_arp.h>
70#include <linux/proc_fs.h>
71#include <linux/rcupdate.h>
72#include <linux/skbuff.h>
73#include <linux/netlink.h>
74#include <linux/init.h>
75#include <linux/list.h>
76#include <net/net_namespace.h>
77#include <net/ip.h>
78#include <net/protocol.h>
79#include <net/route.h>
80#include <net/tcp.h>
81#include <net/sock.h>
82#include <net/ip_fib.h>
83#include "fib_lookup.h"
84
85#undef CONFIG_IP_FIB_TRIE_STATS
86#define MAX_STAT_DEPTH 32
87
88#define KEYLENGTH (8*sizeof(t_key))
89
90typedef unsigned int t_key;
91
92#define T_TNODE 0
93#define T_LEAF  1
94#define NODE_TYPE_MASK	0x1UL
95#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
96
97#define IS_TNODE(n) (!(n->parent & T_LEAF))
98#define IS_LEAF(n) (n->parent & T_LEAF)
99
100struct node {
101	t_key key;
102	unsigned long parent;
103};
104
105struct leaf {
106	t_key key;
107	unsigned long parent;
108	struct hlist_head list;
109	struct rcu_head rcu;
110};
111
112struct leaf_info {
113	struct hlist_node hlist;
114	struct rcu_head rcu;
115	int plen;
116	struct list_head falh;
117};
118
119struct tnode {
120	t_key key;
121	unsigned long parent;
122	unsigned short pos:5;		/* 2log(KEYLENGTH) bits needed */
123	unsigned short bits:5;		/* 2log(KEYLENGTH) bits needed */
124	unsigned short full_children;	/* KEYLENGTH bits needed */
125	unsigned short empty_children;	/* KEYLENGTH bits needed */
126	struct rcu_head rcu;
127	struct node *child[0];
128};
129
130#ifdef CONFIG_IP_FIB_TRIE_STATS
131struct trie_use_stats {
132	unsigned int gets;
133	unsigned int backtrack;
134	unsigned int semantic_match_passed;
135	unsigned int semantic_match_miss;
136	unsigned int null_node_hit;
137	unsigned int resize_node_skipped;
138};
139#endif
140
141struct trie_stat {
142	unsigned int totdepth;
143	unsigned int maxdepth;
144	unsigned int tnodes;
145	unsigned int leaves;
146	unsigned int nullpointers;
147	unsigned int nodesizes[MAX_STAT_DEPTH];
148};
149
150struct trie {
151	struct node *trie;
152#ifdef CONFIG_IP_FIB_TRIE_STATS
153	struct trie_use_stats stats;
154#endif
155	int size;
156};
157
158static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
159static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
160static struct node *resize(struct trie *t, struct tnode *tn);
161static struct tnode *inflate(struct trie *t, struct tnode *tn);
162static struct tnode *halve(struct trie *t, struct tnode *tn);
163static void tnode_free(struct tnode *tn);
164
165static struct kmem_cache *fn_alias_kmem __read_mostly;
166
167static inline struct tnode *node_parent(struct node *node)
168{
169	struct tnode *ret;
170
171	ret = (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
172	return rcu_dereference(ret);
173}
174
175static inline void node_set_parent(struct node *node, struct tnode *ptr)
176{
177	rcu_assign_pointer(node->parent,
178			   (unsigned long)ptr | NODE_TYPE(node));
179}
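
/*
 * Illustration (the address below is made up): tnodes and leaves are
 * allocated with at least word alignment, so bit 0 of their addresses
 * is always zero and can carry the node type instead.  A leaf whose
 * parent tnode sits at 0xffff810012345670 stores
 *
 *	parent = 0xffff810012345670 | T_LEAF
 *
 * node_parent() masks the type bit back out, while NODE_TYPE() and
 * IS_LEAF()/IS_TNODE() read it.
 */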
180
181/* rcu_read_lock needs to be held by the caller on the read side */
182
183static inline struct node *tnode_get_child(struct tnode *tn, int i)
184{
185	BUG_ON(i >= 1 << tn->bits);
186
187	return rcu_dereference(tn->child[i]);
188}
189
190static inline int tnode_child_length(const struct tnode *tn)
191{
192	return 1 << tn->bits;
193}
194
195static inline t_key mask_pfx(t_key k, unsigned short l)
196{
197	return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
198}
199
200static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
201{
202	if (offset < KEYLENGTH)
203		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
204	else
205		return 0;
206}
207
208static inline int tkey_equals(t_key a, t_key b)
209{
210	return a == b;
211}
212
213static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
214{
215	if (bits == 0 || offset >= KEYLENGTH)
216		return 1;
217	bits = bits > KEYLENGTH ? KEYLENGTH : bits;
218	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
219}
220
221static inline int tkey_mismatch(t_key a, int offset, t_key b)
222{
223	t_key diff = a ^ b;
224	int i = offset;
225
226	if (!diff)
227		return 0;
228	while ((diff << i) >> (KEYLENGTH-1) == 0)
229		i++;
230	return i;
231}
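
/*
 * Worked example of the helpers above (illustrative values only), with
 * KEYLENGTH == 32 and keys in host byte order:
 *
 *	a = 0xC0A80100 (192.168.1.0), b = 0xC0A80180 (192.168.1.128)
 *
 *	mask_pfx(a, 16)            == 0xC0A80000  (keep the top 16 bits)
 *	tkey_extract_bits(a, 0, 8) == 0xC0        (first octet)
 *	tkey_extract_bits(a, 8, 8) == 0xA8        (second octet)
 *	tkey_mismatch(a, 0, b)     == 24          (first differing bit,
 *	                                           counted from the MSB)
 */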
232
233/*
234  To understand this stuff, an understanding of keys and all their bits is
235  necessary. Every node in the trie has a key associated with it, but not
236  all of the bits in that key are significant.
237
238  Consider a node 'n' and its parent 'tp'.
239
240  If n is a leaf, every bit in its key is significant. Its presence is
241  necessitated by path compression, since during a tree traversal (when
242  searching for a leaf - unless we are doing an insertion) we will completely
243  ignore all skipped bits we encounter. Thus we need to verify, at the end of
244  a potentially successful search, that we have indeed been walking the
245  correct key path.
246
247  Note that we can never "miss" the correct key in the tree if present by
248  following the wrong path. Path compression ensures that segments of the key
249  that are the same for all keys with a given prefix are skipped, but the
250  skipped part *is* identical for each node in the subtrie below the skipped
251  bit! fib_insert_node() in this implementation takes care of that - note the
252  call to tkey_sub_equals() in fib_insert_node().
253
254  If n is an internal node - a 'tnode' here - the various parts of its key
255  have many different meanings.
256
257  Example:
258  _________________________________________________________________
259  | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
260  -----------------------------------------------------------------
261    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
262
263  _________________________________________________________________
264  | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
265  -----------------------------------------------------------------
266   16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
267
268  tp->pos = 7
269  tp->bits = 3
270  n->pos = 15
271  n->bits = 4
272
273  First, let's just ignore the bits that come before the parent tp, that is
274  the bits from 0 to (tp->pos-1). They are *known* but at this point we do
275  not use them for anything.
276
277  The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
278  index into the parent's child array. That is, they will be used to find
279  'n' among tp's children.
280
281  The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
282  for the node n.
283
284  All the bits we have seen so far are significant to the node n. The rest
285  of the bits are really not needed or indeed known in n->key.
286
287  The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
288  n's child array, and will of course be different for each child.
289
290
291  The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
292  at this point.
293
294*/
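
/*
 * To make the example above concrete: with tp->pos = 7, tp->bits = 3,
 * n->pos = 15 and n->bits = 4, stepping from tp to n uses
 *
 *	tkey_extract_bits(key, 7, 3)	- the three "N" bits -
 *
 * as the index into tp's child array, and stepping from n to one of its
 * children uses
 *
 *	tkey_extract_bits(key, 15, 4)	- the four "C" bits.
 *
 * The "S" bits (10..14) are skipped during the walk and are only
 * re-checked against the key once a leaf is reached.
 */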
295
296static inline void check_tnode(const struct tnode *tn)
297{
298	WARN_ON(tn && tn->pos+tn->bits > 32);
299}
300
301static const int halve_threshold = 25;
302static const int inflate_threshold = 50;
303static const int halve_threshold_root = 8;
304static const int inflate_threshold_root = 15;
305
306
307static void __alias_free_mem(struct rcu_head *head)
308{
309	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
310	kmem_cache_free(fn_alias_kmem, fa);
311}
312
313static inline void alias_free_mem_rcu(struct fib_alias *fa)
314{
315	call_rcu(&fa->rcu, __alias_free_mem);
316}
317
318static void __leaf_free_rcu(struct rcu_head *head)
319{
320	kfree(container_of(head, struct leaf, rcu));
321}
322
323static void __leaf_info_free_rcu(struct rcu_head *head)
324{
325	kfree(container_of(head, struct leaf_info, rcu));
326}
327
328static inline void free_leaf_info(struct leaf_info *leaf)
329{
330	call_rcu(&leaf->rcu, __leaf_info_free_rcu);
331}
332
333static struct tnode *tnode_alloc(unsigned int size)
334{
335	struct page *pages;
336
337	if (size <= PAGE_SIZE)
338		return kcalloc(size, 1, GFP_KERNEL);
339
340	pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size));
341	if (!pages)
342		return NULL;
343
344	return page_address(pages);
345}
346
347static void __tnode_free_rcu(struct rcu_head *head)
348{
349	struct tnode *tn = container_of(head, struct tnode, rcu);
350	unsigned int size = sizeof(struct tnode) +
351		(1 << tn->bits) * sizeof(struct node *);
352
353	if (size <= PAGE_SIZE)
354		kfree(tn);
355	else
356		free_pages((unsigned long)tn, get_order(size));
357}
358
359static inline void tnode_free(struct tnode *tn)
360{
361	if (IS_LEAF(tn)) {
362		struct leaf *l = (struct leaf *) tn;
363		call_rcu_bh(&l->rcu, __leaf_free_rcu);
364	} else
365		call_rcu(&tn->rcu, __tnode_free_rcu);
366}
367
368static struct leaf *leaf_new(void)
369{
370	struct leaf *l = kmalloc(sizeof(struct leaf),  GFP_KERNEL);
371	if (l) {
372		l->parent = T_LEAF;
373		INIT_HLIST_HEAD(&l->list);
374	}
375	return l;
376}
377
378static struct leaf_info *leaf_info_new(int plen)
379{
380	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
381	if (li) {
382		li->plen = plen;
383		INIT_LIST_HEAD(&li->falh);
384	}
385	return li;
386}
387
388static struct tnode* tnode_new(t_key key, int pos, int bits)
389{
390	int nchildren = 1<<bits;
391	int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
392	struct tnode *tn = tnode_alloc(sz);
393
394	if (tn) {
395		memset(tn, 0, sz);
396		tn->parent = T_TNODE;
397		tn->pos = pos;
398		tn->bits = bits;
399		tn->key = key;
400		tn->full_children = 0;
401		tn->empty_children = 1<<bits;
402	}
403
404	pr_debug("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
405		 (unsigned int) (sizeof(struct node) * 1<<bits));
406	return tn;
407}
408
409/*
410 * Check whether a tnode 'n' is "full", i.e. it is an internal node
411 * and no bits are skipped. See discussion in dyntree paper p. 6
412 */
413
414static inline int tnode_full(const struct tnode *tn, const struct node *n)
415{
416	if (n == NULL || IS_LEAF(n))
417		return 0;
418
419	return ((struct tnode *) n)->pos == tn->pos + tn->bits;
420}
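
/*
 * For example, if tn->pos == 7 and tn->bits == 3, a child tnode with
 * pos == 10 is "full" (its own bits start right where tn's index bits
 * end, so nothing is skipped), while a child tnode with pos == 12
 * skips bits 10..11 and is not full.
 */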
421
422static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
423{
424	tnode_put_child_reorg(tn, i, n, -1);
425}
426
427 /*
428  * Add a child at position i overwriting the old value.
429  * Update the value of full_children and empty_children.
430  */
431
432static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
433{
434	struct node *chi = tn->child[i];
435	int isfull;
436
437	BUG_ON(i >= 1<<tn->bits);
438
439
440	/* update emptyChildren */
441	if (n == NULL && chi != NULL)
442		tn->empty_children++;
443	else if (n != NULL && chi == NULL)
444		tn->empty_children--;
445
446	/* update fullChildren */
447	if (wasfull == -1)
448		wasfull = tnode_full(tn, chi);
449
450	isfull = tnode_full(tn, n);
451	if (wasfull && !isfull)
452		tn->full_children--;
453	else if (!wasfull && isfull)
454		tn->full_children++;
455
456	if (n)
457		node_set_parent(n, tn);
458
459	rcu_assign_pointer(tn->child[i], n);
460}
461
462static struct node *resize(struct trie *t, struct tnode *tn)
463{
464	int i;
465	int err = 0;
466	struct tnode *old_tn;
467	int inflate_threshold_use;
468	int halve_threshold_use;
469	int max_resize;
470
471	if (!tn)
472		return NULL;
473
474	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
475		 tn, inflate_threshold, halve_threshold);
476
477	/* No children */
478	if (tn->empty_children == tnode_child_length(tn)) {
479		tnode_free(tn);
480		return NULL;
481	}
482	/* One child */
483	if (tn->empty_children == tnode_child_length(tn) - 1)
484		for (i = 0; i < tnode_child_length(tn); i++) {
485			struct node *n;
486
487			n = tn->child[i];
488			if (!n)
489				continue;
490
491			/* compress one level */
492			node_set_parent(n, NULL);
493			tnode_free(tn);
494			return n;
495		}
496	/*
497	 * Double as long as the resulting node has a fraction of
498	 * non-empty children that is above the threshold.
499	 */
500
501	/*
502	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
503	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
504	 * Telecommunications, page 6:
505	 * "A node is doubled if the ratio of non-empty children to all
506	 * children in the *doubled* node is at least 'high'."
507	 *
508	 * 'high' in this instance is the variable 'inflate_threshold'. It
509	 * is expressed as a percentage, so we multiply it with
510	 * tnode_child_length() and instead of multiplying by 2 (since the
511	 * child array will be doubled by inflate()) and multiplying
512	 * the left-hand side by 100 (to handle the percentage thing) we
513	 * multiply the left-hand side by 50.
514	 *
515	 * The left-hand side may look a bit weird: tnode_child_length(tn)
516	 * - tn->empty_children is of course the number of non-null children
517	 * in the current node. tn->full_children is the number of "full"
518	 * children, that is non-null tnodes with a skip value of 0.
519	 * All of those will be doubled in the resulting inflated tnode, so
520	 * we just count them one extra time here.
521	 *
522	 * A clearer way to write this would be:
523	 *
524	 * to_be_doubled = tn->full_children;
525	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
526	 *     tn->full_children;
527	 *
528	 * new_child_length = tnode_child_length(tn) * 2;
529	 *
530	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
531	 *      new_child_length;
532	 * if (new_fill_factor >= inflate_threshold)
533	 *
534	 * ...and so on, though it would mess up the while () loop.
535	 *
536	 * anyway,
537	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
538	 *      inflate_threshold
539	 *
540	 * avoid a division:
541	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
542	 *      inflate_threshold * new_child_length
543	 *
544	 * expand not_to_be_doubled and to_be_doubled, and shorten:
545	 * 100 * (tnode_child_length(tn) - tn->empty_children +
546	 *    tn->full_children) >= inflate_threshold * new_child_length
547	 *
548	 * expand new_child_length:
549	 * 100 * (tnode_child_length(tn) - tn->empty_children +
550	 *    tn->full_children) >=
551	 *      inflate_threshold * tnode_child_length(tn) * 2
552	 *
553	 * shorten again:
554	 * 50 * (tn->full_children + tnode_child_length(tn) -
555	 *    tn->empty_children) >= inflate_threshold *
556	 *    tnode_child_length(tn)
557	 *
558	 */
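
	/*
	 * Numeric example (made-up counts): for a tnode with bits = 4
	 * (16 child slots), empty_children = 4, full_children = 6 and
	 * inflate_threshold = 50 the test reads
	 *
	 *	50 * (6 + 16 - 4) = 900  >=  50 * 16 = 800
	 *
	 * so the node is inflated; the doubled node would have a fill
	 * factor of 100 * (6 + 2*6) / 32, roughly 56%, which is >= 50%.
	 */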
559
560	check_tnode(tn);
561
562	/* Keep root node larger  */
563
564	if (!tn->parent)
565		inflate_threshold_use = inflate_threshold_root;
566	else
567		inflate_threshold_use = inflate_threshold;
568
569	err = 0;
570	max_resize = 10;
571	while ((tn->full_children > 0 &&  max_resize-- &&
572	       50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
573				inflate_threshold_use * tnode_child_length(tn))) {
574
575		old_tn = tn;
576		tn = inflate(t, tn);
577		if (IS_ERR(tn)) {
578			tn = old_tn;
579#ifdef CONFIG_IP_FIB_TRIE_STATS
580			t->stats.resize_node_skipped++;
581#endif
582			break;
583		}
584	}
585
586	if (max_resize < 0) {
587		if (!tn->parent)
588			printk(KERN_WARNING "Fix inflate_threshold_root. Now=%d size=%d bits\n",
589			       inflate_threshold_root, tn->bits);
590		else
591			printk(KERN_WARNING "Fix inflate_threshold. Now=%d size=%d bits\n",
592			       inflate_threshold, tn->bits);
593	}
594
595	check_tnode(tn);
596
597	/*
598	 * Halve as long as the number of empty children in this
599	 * node is above threshold.
600	 */
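
	/*
	 * Numeric example (made-up counts): with bits = 4 (16 slots),
	 * only 3 non-empty children and halve_threshold = 25 the test is
	 *
	 *	100 * 3 = 300  <  25 * 16 = 400
	 *
	 * so the node is halved (for the root, halve_threshold_root = 8
	 * would give 300 < 128, which is false, and the root is kept).
	 */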
601
602
603	/* Keep root node larger  */
604
605	if (!tn->parent)
606		halve_threshold_use = halve_threshold_root;
607	else
608		halve_threshold_use = halve_threshold;
609
610	err = 0;
611	max_resize = 10;
612	while (tn->bits > 1 &&  max_resize-- &&
613	       100 * (tnode_child_length(tn) - tn->empty_children) <
614	       halve_threshold_use * tnode_child_length(tn)) {
615
616		old_tn = tn;
617		tn = halve(t, tn);
618		if (IS_ERR(tn)) {
619			tn = old_tn;
620#ifdef CONFIG_IP_FIB_TRIE_STATS
621			t->stats.resize_node_skipped++;
622#endif
623			break;
624		}
625	}
626
627	if (max_resize < 0) {
628		if (!tn->parent)
629			printk(KERN_WARNING "Fix halve_threshold_root. Now=%d size=%d bits\n",
630			       halve_threshold_root, tn->bits);
631		else
632			printk(KERN_WARNING "Fix halve_threshold. Now=%d size=%d bits\n",
633			       halve_threshold, tn->bits);
634	}
635
636	/* Only one child remains */
637	if (tn->empty_children == tnode_child_length(tn) - 1)
638		for (i = 0; i < tnode_child_length(tn); i++) {
639			struct node *n;
640
641			n = tn->child[i];
642			if (!n)
643				continue;
644
645			/* compress one level */
646
647			node_set_parent(n, NULL);
648			tnode_free(tn);
649			return n;
650		}
651
652	return (struct node *) tn;
653}
654
655static struct tnode *inflate(struct trie *t, struct tnode *tn)
656{
657	struct tnode *inode;
658	struct tnode *oldtnode = tn;
659	int olen = tnode_child_length(tn);
660	int i;
661
662	pr_debug("In inflate\n");
663
664	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
665
666	if (!tn)
667		return ERR_PTR(-ENOMEM);
668
669	/*
670	 * Preallocate and store tnodes before the actual work so we
671	 * don't get into an inconsistent state if memory allocation
672	 * fails. In case of failure we return the old node and the inflate
673	 * of the tnode is ignored.
674	 */
675
676	for (i = 0; i < olen; i++) {
677		struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
678
679		if (inode &&
680		    IS_TNODE(inode) &&
681		    inode->pos == oldtnode->pos + oldtnode->bits &&
682		    inode->bits > 1) {
683			struct tnode *left, *right;
684			t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
685
686			left = tnode_new(inode->key&(~m), inode->pos + 1,
687					 inode->bits - 1);
688			if (!left)
689				goto nomem;
690
691			right = tnode_new(inode->key|m, inode->pos + 1,
692					  inode->bits - 1);
693
694			if (!right) {
695				tnode_free(left);
696				goto nomem;
697			}
698
699			put_child(t, tn, 2*i, (struct node *) left);
700			put_child(t, tn, 2*i+1, (struct node *) right);
701		}
702	}
703
704	for (i = 0; i < olen; i++) {
705		struct node *node = tnode_get_child(oldtnode, i);
706		struct tnode *left, *right;
707		int size, j;
708
709		/* An empty child */
710		if (node == NULL)
711			continue;
712
713		/* A leaf or an internal node with skipped bits */
714
715		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
716		   tn->pos + tn->bits - 1) {
717			if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
718					     1) == 0)
719				put_child(t, tn, 2*i, node);
720			else
721				put_child(t, tn, 2*i+1, node);
722			continue;
723		}
724
725		/* An internal node with two children */
726		inode = (struct tnode *) node;
727
728		if (inode->bits == 1) {
729			put_child(t, tn, 2*i, inode->child[0]);
730			put_child(t, tn, 2*i+1, inode->child[1]);
731
732			tnode_free(inode);
733			continue;
734		}
735
736		/* An internal node with more than two children */
737
738		/* We will replace this node 'inode' with two new
739		 * ones, 'left' and 'right', each with half of the
740		 * original children. The two new nodes will have
741		 * a position one bit further down the key and this
742		 * means that the "significant" part of their keys
743		 * (see the discussion near the top of this file)
744		 * will differ by one bit, which will be "0" in
745		 * left's key and "1" in right's key. Since we are
746		 * moving the key position by one step, the bit that
747		 * we are moving away from - the bit at position
748		 * (inode->pos) - is the one that will differ between
749		 * left and right. So... we synthesize that bit in the
750		 * two  new keys.
751		 * The mask 'm' below will be a single "one" bit at
752		 * the position (inode->pos)
753		 */
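
		/*
		 * Concretely (values for illustration only): if
		 * inode->pos == 4, the mask computed in the first loop is
		 *
		 *	m = (~0U << 31) >> 4 == 0x08000000
		 *
		 * i.e. a single 1 at bit 4, so for inode->key == 0xC0A80000
		 * 'left' was created with key 0xC0A80000 (bit cleared) and
		 * 'right' with key 0xC8A80000 (bit set).
		 */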
754
755		/* Use the old key, but set the new significant
756		 *   bit to zero.
757		 */
758
759		left = (struct tnode *) tnode_get_child(tn, 2*i);
760		put_child(t, tn, 2*i, NULL);
761
762		BUG_ON(!left);
763
764		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
765		put_child(t, tn, 2*i+1, NULL);
766
767		BUG_ON(!right);
768
769		size = tnode_child_length(left);
770		for (j = 0; j < size; j++) {
771			put_child(t, left, j, inode->child[j]);
772			put_child(t, right, j, inode->child[j + size]);
773		}
774		put_child(t, tn, 2*i, resize(t, left));
775		put_child(t, tn, 2*i+1, resize(t, right));
776
777		tnode_free(inode);
778	}
779	tnode_free(oldtnode);
780	return tn;
781nomem:
782	{
783		int size = tnode_child_length(tn);
784		int j;
785
786		for (j = 0; j < size; j++)
787			if (tn->child[j])
788				tnode_free((struct tnode *)tn->child[j]);
789
790		tnode_free(tn);
791
792		return ERR_PTR(-ENOMEM);
793	}
794}
795
796static struct tnode *halve(struct trie *t, struct tnode *tn)
797{
798	struct tnode *oldtnode = tn;
799	struct node *left, *right;
800	int i;
801	int olen = tnode_child_length(tn);
802
803	pr_debug("In halve\n");
804
805	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
806
807	if (!tn)
808		return ERR_PTR(-ENOMEM);
809
810	/*
811	 * Preallocate and store tnodes before the actual work so we
812	 * don't get into an inconsistent state if memory allocation
813	 * fails. In case of failure we return the old node and the halve
814	 * of the tnode is ignored.
815	 */
816
817	for (i = 0; i < olen; i += 2) {
818		left = tnode_get_child(oldtnode, i);
819		right = tnode_get_child(oldtnode, i+1);
820
821		/* Two nonempty children */
822		if (left && right) {
823			struct tnode *newn;
824
825			newn = tnode_new(left->key, tn->pos + tn->bits, 1);
826
827			if (!newn)
828				goto nomem;
829
830			put_child(t, tn, i/2, (struct node *)newn);
831		}
832
833	}
834
835	for (i = 0; i < olen; i += 2) {
836		struct tnode *newBinNode;
837
838		left = tnode_get_child(oldtnode, i);
839		right = tnode_get_child(oldtnode, i+1);
840
841		/* At least one of the children is empty */
842		if (left == NULL) {
843			if (right == NULL)    /* Both are empty */
844				continue;
845			put_child(t, tn, i/2, right);
846			continue;
847		}
848
849		if (right == NULL) {
850			put_child(t, tn, i/2, left);
851			continue;
852		}
853
854		/* Two nonempty children */
855		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
856		put_child(t, tn, i/2, NULL);
857		put_child(t, newBinNode, 0, left);
858		put_child(t, newBinNode, 1, right);
859		put_child(t, tn, i/2, resize(t, newBinNode));
860	}
861	tnode_free(oldtnode);
862	return tn;
863nomem:
864	{
865		int size = tnode_child_length(tn);
866		int j;
867
868		for (j = 0; j < size; j++)
869			if (tn->child[j])
870				tnode_free((struct tnode *)tn->child[j]);
871
872		tnode_free(tn);
873
874		return ERR_PTR(-ENOMEM);
875	}
876}
877
878/* The read side must use rcu_read_lock; currently only the dump routines
879 * do so, via get_fa_head and dump */
880
881static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
882{
883	struct hlist_head *head = &l->list;
884	struct hlist_node *node;
885	struct leaf_info *li;
886
887	hlist_for_each_entry_rcu(li, node, head, hlist)
888		if (li->plen == plen)
889			return li;
890
891	return NULL;
892}
893
894static inline struct list_head * get_fa_head(struct leaf *l, int plen)
895{
896	struct leaf_info *li = find_leaf_info(l, plen);
897
898	if (!li)
899		return NULL;
900
901	return &li->falh;
902}
903
904static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
905{
906	struct leaf_info *li = NULL, *last = NULL;
907	struct hlist_node *node;
908
909	if (hlist_empty(head)) {
910		hlist_add_head_rcu(&new->hlist, head);
911	} else {
912		hlist_for_each_entry(li, node, head, hlist) {
913			if (new->plen > li->plen)
914				break;
915
916			last = li;
917		}
918		if (last)
919			hlist_add_after_rcu(&last->hlist, &new->hlist);
920		else
921			hlist_add_before_rcu(&new->hlist, &li->hlist);
922	}
923}
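
/*
 * Example (hypothetical prefix lengths): inserting leaf_infos with
 * plen 24, then 16, then 32 into the same leaf leaves the hlist in the
 * order 32, 24, 16 - longest prefix first - so a reader that scans the
 * list front to back sees longer prefixes before shorter ones.
 */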
924
925/* rcu_read_lock needs to be held by the caller on the read side */
926
927static struct leaf *
928fib_find_node(struct trie *t, u32 key)
929{
930	int pos;
931	struct tnode *tn;
932	struct node *n;
933
934	pos = 0;
935	n = rcu_dereference(t->trie);
936
937	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
938		tn = (struct tnode *) n;
939
940		check_tnode(tn);
941
942		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
943			pos = tn->pos + tn->bits;
944			n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
945		} else
946			break;
947	}
948	/* If we have found a leaf, compare prefixes */
949
950	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
951		return (struct leaf *)n;
952
953	return NULL;
954}
955
956static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
957{
958	int wasfull;
959	t_key cindex, key = tn->key;
960	struct tnode *tp;
961
962	while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
963		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
964		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
965		tn = (struct tnode *) resize(t, (struct tnode *)tn);
966		tnode_put_child_reorg((struct tnode *)tp, cindex, (struct node *)tn, wasfull);
967
968		tp = node_parent((struct node *) tn);
969		if (!tp)
970			break;
971		tn = tp;
972	}
973
974	/* Handle last (top) tnode */
975	if (IS_TNODE(tn))
976		tn = (struct tnode*) resize(t, (struct tnode *)tn);
977
978	return (struct node*) tn;
979}
980
981/* only used from updater-side */
982
983static  struct list_head *
984fib_insert_node(struct trie *t, int *err, u32 key, int plen)
985{
986	int pos, newpos;
987	struct tnode *tp = NULL, *tn = NULL;
988	struct node *n;
989	struct leaf *l;
990	int missbit;
991	struct list_head *fa_head = NULL;
992	struct leaf_info *li;
993	t_key cindex;
994
995	pos = 0;
996	n = t->trie;
997
998	/* If we point to NULL, stop. Either the tree is empty and we should
999	 * just put a new leaf in it, or we have reached an empty child slot,
1000	 * and we should just put our new leaf in that.
1001	 * If we point to a T_TNODE, check if it matches our key. Note that
1002	 * a T_TNODE might be skipping any number of bits - its 'pos' need
1003	 * not be the parent's 'pos'+'bits'!
1004	 *
1005	 * If it does match the current key, get pos/bits from it, extract
1006	 * the index from our key, push the T_TNODE and walk the tree.
1007	 *
1008	 * If it doesn't, we have to replace it with a new T_TNODE.
1009	 *
1010	 * If we point to a T_LEAF, it might or might not have the same key
1011	 * as we do. If it does, just update the T_LEAF's value and
1012	 * return it.
1013	 * If it doesn't, we need to replace it with a T_TNODE.
1014	 */
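
	/*
	 * Illustrative sequence (made-up routes): inserting 10.0.0.0/8
	 * into an empty trie just creates a leaf.  Inserting
	 * 192.168.0.0/16 afterwards hits case 3 below: the existing leaf
	 * does not match, tkey_mismatch() finds the first differing bit
	 * (bit 0 here, since 0x0A... and 0xC0... already differ in the
	 * MSB), and a 1-bit tnode is created there with the old leaf in
	 * one child slot and the new leaf in the other.
	 */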
1015
1016	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
1017		tn = (struct tnode *) n;
1018
1019		check_tnode(tn);
1020
1021		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1022			tp = tn;
1023			pos = tn->pos + tn->bits;
1024			n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
1025
1026			BUG_ON(n && node_parent(n) != tn);
1027		} else
1028			break;
1029	}
1030
1031	/*
1032	 * n  ----> NULL, LEAF or TNODE
1033	 *
1034	 * tp is n's (parent) ----> NULL or TNODE
1035	 */
1036
1037	BUG_ON(tp && IS_LEAF(tp));
1038
1039	/* Case 1: n is a leaf. Compare prefixes */
1040
1041	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1042		struct leaf *l = (struct leaf *) n;
1043
1044		li = leaf_info_new(plen);
1045
1046		if (!li) {
1047			*err = -ENOMEM;
1048			goto done;
1049		}
1050
1051		fa_head = &li->falh;
1052		insert_leaf_info(&l->list, li);
1053		goto done;
1054	}
1055	t->size++;
1056	l = leaf_new();
1057
1058	if (!l) {
1059		*err = -ENOMEM;
1060		goto done;
1061	}
1062
1063	l->key = key;
1064	li = leaf_info_new(plen);
1065
1066	if (!li) {
1067		tnode_free((struct tnode *) l);
1068		*err = -ENOMEM;
1069		goto done;
1070	}
1071
1072	fa_head = &li->falh;
1073	insert_leaf_info(&l->list, li);
1074
1075	if (t->trie && n == NULL) {
1076		/* Case 2: n is NULL, and will just insert a new leaf */
1077
1078		node_set_parent((struct node *)l, tp);
1079
1080		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1081		put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1082	} else {
1083		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1084		/*
1085		 *  Add a new tnode here
1086		 *  the first tnode needs some special handling
1087		 */
1088
1089		if (tp)
1090			pos = tp->pos+tp->bits;
1091		else
1092			pos = 0;
1093
1094		if (n) {
1095			newpos = tkey_mismatch(key, pos, n->key);
1096			tn = tnode_new(n->key, newpos, 1);
1097		} else {
1098			newpos = 0;
1099			tn = tnode_new(key, newpos, 1); /* First tnode */
1100		}
1101
1102		if (!tn) {
1103			free_leaf_info(li);
1104			tnode_free((struct tnode *) l);
1105			*err = -ENOMEM;
1106			goto done;
1107		}
1108
1109		node_set_parent((struct node *)tn, tp);
1110
1111		missbit = tkey_extract_bits(key, newpos, 1);
1112		put_child(t, tn, missbit, (struct node *)l);
1113		put_child(t, tn, 1-missbit, n);
1114
1115		if (tp) {
1116			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1117			put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
1118		} else {
1119			rcu_assign_pointer(t->trie, (struct node *)tn); /* First tnode */
1120			tp = tn;
1121		}
1122	}
1123
1124	if (tp && tp->pos + tp->bits > 32)
1125		printk(KERN_WARNING "fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1126		       tp, tp->pos, tp->bits, key, plen);
1127
1128	/* Rebalance the trie */
1129
1130	rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1131done:
1132	return fa_head;
1133}
1134
1135/*
1136 * Caller must hold RTNL.
1137 */
1138static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
1139{
1140	struct trie *t = (struct trie *) tb->tb_data;
1141	struct fib_alias *fa, *new_fa;
1142	struct list_head *fa_head = NULL;
1143	struct fib_info *fi;
1144	int plen = cfg->fc_dst_len;
1145	u8 tos = cfg->fc_tos;
1146	u32 key, mask;
1147	int err;
1148	struct leaf *l;
1149
1150	if (plen > 32)
1151		return -EINVAL;
1152
1153	key = ntohl(cfg->fc_dst);
1154
1155	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1156
1157	mask = ntohl(inet_make_mask(plen));
1158
1159	if (key & ~mask)
1160		return -EINVAL;
1161
1162	key = key & mask;
1163
1164	fi = fib_create_info(cfg);
1165	if (IS_ERR(fi)) {
1166		err = PTR_ERR(fi);
1167		goto err;
1168	}
1169
1170	l = fib_find_node(t, key);
1171	fa = NULL;
1172
1173	if (l) {
1174		fa_head = get_fa_head(l, plen);
1175		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1176	}
1177
1178	/* Now fa, if non-NULL, points to the first fib alias
1179	 * with the same keys [prefix,tos,priority], if such a key already
1180	 * exists, or to the node before which we will insert the new one.
1181	 *
1182	 * If fa is NULL, we will need to allocate a new one and
1183	 * insert it at the head of the alias list.
1184	 *
1185	 * If fa_head is NULL, no fib node matched the destination key
1186	 * and we need to allocate a new one of those as well.
1187	 */
1188
1189	if (fa && fa->fa_info->fib_priority == fi->fib_priority) {
1190		struct fib_alias *fa_orig;
1191
1192		err = -EEXIST;
1193		if (cfg->fc_nlflags & NLM_F_EXCL)
1194			goto out;
1195
1196		if (cfg->fc_nlflags & NLM_F_REPLACE) {
1197			struct fib_info *fi_drop;
1198			u8 state;
1199
1200			if (fi->fib_treeref > 1)
1201				goto out;
1202
1203			err = -ENOBUFS;
1204			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1205			if (new_fa == NULL)
1206				goto out;
1207
1208			fi_drop = fa->fa_info;
1209			new_fa->fa_tos = fa->fa_tos;
1210			new_fa->fa_info = fi;
1211			new_fa->fa_type = cfg->fc_type;
1212			new_fa->fa_scope = cfg->fc_scope;
1213			state = fa->fa_state;
1214			new_fa->fa_state &= ~FA_S_ACCESSED;
1215
1216			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1217			alias_free_mem_rcu(fa);
1218
1219			fib_release_info(fi_drop);
1220			if (state & FA_S_ACCESSED)
1221				rt_cache_flush(-1);
1222			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1223				tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1224
1225			goto succeeded;
1226		}
1227		/* Error if we find a perfect match which
1228		 * uses the same scope, type, and nexthop
1229		 * information.
1230		 */
1231		fa_orig = fa;
1232		list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
1233			if (fa->fa_tos != tos)
1234				break;
1235			if (fa->fa_info->fib_priority != fi->fib_priority)
1236				break;
1237			if (fa->fa_type == cfg->fc_type &&
1238			    fa->fa_scope == cfg->fc_scope &&
1239			    fa->fa_info == fi) {
1240				goto out;
1241			}
1242		}
1243		if (!(cfg->fc_nlflags & NLM_F_APPEND))
1244			fa = fa_orig;
1245	}
1246	err = -ENOENT;
1247	if (!(cfg->fc_nlflags & NLM_F_CREATE))
1248		goto out;
1249
1250	err = -ENOBUFS;
1251	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1252	if (new_fa == NULL)
1253		goto out;
1254
1255	new_fa->fa_info = fi;
1256	new_fa->fa_tos = tos;
1257	new_fa->fa_type = cfg->fc_type;
1258	new_fa->fa_scope = cfg->fc_scope;
1259	new_fa->fa_state = 0;
1260	/*
1261	 * Insert new entry to the list.
1262	 */
1263
1264	if (!fa_head) {
1265		err = 0;
1266		fa_head = fib_insert_node(t, &err, key, plen);
1267		if (err)
1268			goto out_free_new_fa;
1269	}
1270
1271	list_add_tail_rcu(&new_fa->fa_list,
1272			  (fa ? &fa->fa_list : fa_head));
1273
1274	rt_cache_flush(-1);
1275	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1276		  &cfg->fc_nlinfo, 0);
1277succeeded:
1278	return 0;
1279
1280out_free_new_fa:
1281	kmem_cache_free(fn_alias_kmem, new_fa);
1282out:
1283	fib_release_info(fi);
1284err:
1285	return err;
1286}
1287
1288
1289/* should be called with rcu_read_lock */
1290static inline int check_leaf(struct trie *t, struct leaf *l,
1291			     t_key key, int *plen, const struct flowi *flp,
1292			     struct fib_result *res)
1293{
1294	int err, i;
1295	__be32 mask;
1296	struct leaf_info *li;
1297	struct hlist_head *hhead = &l->list;
1298	struct hlist_node *node;
1299
1300	hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1301		i = li->plen;
1302		mask = inet_make_mask(i);
1303		if (l->key != (key & ntohl(mask)))
1304			continue;
1305
1306		if ((err = fib_semantic_match(&li->falh, flp, res, htonl(l->key), mask, i)) <= 0) {
1307			*plen = i;
1308#ifdef CONFIG_IP_FIB_TRIE_STATS
1309			t->stats.semantic_match_passed++;
1310#endif
1311			return err;
1312		}
1313#ifdef CONFIG_IP_FIB_TRIE_STATS
1314		t->stats.semantic_match_miss++;
1315#endif
1316	}
1317	return 1;
1318}
1319
1320static int
1321fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1322{
1323	struct trie *t = (struct trie *) tb->tb_data;
1324	int plen, ret = 0;
1325	struct node *n;
1326	struct tnode *pn;
1327	int pos, bits;
1328	t_key key = ntohl(flp->fl4_dst);
1329	int chopped_off;
1330	t_key cindex = 0;
1331	int current_prefix_length = KEYLENGTH;
1332	struct tnode *cn;
1333	t_key node_prefix, key_prefix, pref_mismatch;
1334	int mp;
1335
1336	rcu_read_lock();
1337
1338	n = rcu_dereference(t->trie);
1339	if (!n)
1340		goto failed;
1341
1342#ifdef CONFIG_IP_FIB_TRIE_STATS
1343	t->stats.gets++;
1344#endif
1345
1346	/* Just a leaf? */
1347	if (IS_LEAF(n)) {
1348		if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1349			goto found;
1350		goto failed;
1351	}
1352	pn = (struct tnode *) n;
1353	chopped_off = 0;
1354
1355	while (pn) {
1356		pos = pn->pos;
1357		bits = pn->bits;
1358
1359		if (!chopped_off)
1360			cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
1361						   pos, bits);
1362
1363		n = tnode_get_child(pn, cindex);
1364
1365		if (n == NULL) {
1366#ifdef CONFIG_IP_FIB_TRIE_STATS
1367			t->stats.null_node_hit++;
1368#endif
1369			goto backtrace;
1370		}
1371
1372		if (IS_LEAF(n)) {
1373			if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1374				goto found;
1375			else
1376				goto backtrace;
1377		}
1378
1379#define HL_OPTIMIZE
1380#ifdef HL_OPTIMIZE
1381		cn = (struct tnode *)n;
1382
1383		/*
1384		 * It's a tnode, and we can do some extra checks here if we
1385		 * like, to avoid descending into a dead-end branch.
1386		 * This tnode is in the parent's child array at index
1387		 * key[p_pos..p_pos+p_bits] but potentially with some bits
1388		 * chopped off, so in reality the index may be just a
1389		 * subprefix, padded with zero at the end.
1390		 * We can also take a look at any skipped bits in this
1391		 * tnode - everything up to p_pos is supposed to be ok,
1392		 * and the non-chopped bits of the index (see previous
1393		 * paragraph) are also guaranteed ok, but the rest is
1394		 * considered unknown.
1395		 *
1396		 * The skipped bits are key[pos+bits..cn->pos].
1397		 */
1398
1399		/* If current_prefix_length < pos+bits, we are already doing
1400		 * actual prefix  matching, which means everything from
1401		 * pos+(bits-chopped_off) onward must be zero along some
1402		 * branch of this subtree - otherwise there is *no* valid
1403		 * prefix present. Here we can only check the skipped
1404		 * bits. Remember, since we have already indexed into the
1405		 * parent's child array, we know that the bits we chopped off
1406		 * *are* zero.
1407		 */
1408
1409		/* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
1410
1411		if (current_prefix_length < pos+bits) {
1412			if (tkey_extract_bits(cn->key, current_prefix_length,
1413						cn->pos - current_prefix_length) != 0 ||
1414			    !(cn->child[0]))
1415				goto backtrace;
1416		}
1417
1418		/*
1419		 * If chopped_off=0, the index is fully validated and we
1420		 * only need to look at the skipped bits for this, the new,
1421		 * tnode. What we actually want to do is to find out if
1422		 * these skipped bits match our key perfectly, or if we will
1423		 * have to count on finding a matching prefix further down,
1424		 * because if we do, we would like to have some way of
1425		 * verifying the existence of such a prefix at this point.
1426		 */
1427
1428		/* The only thing we can do at this point is to verify that
1429		 * any such matching prefix can indeed be a prefix to our
1430		 * key, and if the bits in the node we are inspecting that
1431		 * do not match our key are not ZERO, this cannot be true.
1432		 * Thus, find out where there is a mismatch (before cn->pos)
1433		 * and verify that all the mismatching bits are zero in the
1434		 * new tnode's key.
1435		 */
1436
1437		/* Note: We aren't very concerned about the bits of the key
1438		 * that precede pn->pos+pn->bits, since these have already been
1439		 * checked. The bits after cn->pos aren't checked since these are
1440		 * by definition "unknown" at this point. Thus, what we want to
1441		 * see is if we are about to enter the "prefix matching" state,
1442		 * and in that case verify that the skipped bits that will prevail
1443		 * throughout this subtree are zero, as they have to be if we are
1444		 * to find a matching prefix.
1445		 */
1446
1447		node_prefix = mask_pfx(cn->key, cn->pos);
1448		key_prefix = mask_pfx(key, cn->pos);
1449		pref_mismatch = key_prefix^node_prefix;
1450		mp = 0;
1451
1452		/* In short: If skipped bits in this node do not match the search
1453		 * key, enter the "prefix matching" state directly.
1454		 */
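
		/*
		 * Worked example (illustrative values): searching for
		 * key 0xC0A80000 and reaching a tnode with
		 * cn->key == 0xC0A88000 and cn->pos == 20 gives
		 *
		 *	pref_mismatch = 0x00008000, mp = 16
		 *
		 * and tkey_extract_bits(cn->key, 16, 4) == 0x8 != 0, so no
		 * prefix of our key can live in this subtree and we
		 * backtrack.
		 */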
1455		if (pref_mismatch) {
1456			while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1457				mp++;
1458				pref_mismatch = pref_mismatch <<1;
1459			}
1460			key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1461
1462			if (key_prefix != 0)
1463				goto backtrace;
1464
1465			if (current_prefix_length >= cn->pos)
1466				current_prefix_length = mp;
1467		}
1468#endif
1469		pn = (struct tnode *)n; /* Descend */
1470		chopped_off = 0;
1471		continue;
1472
1473backtrace:
1474		chopped_off++;
1475
1476		/* As zero bits don't change the child key (cindex), chop them off too */
1477		while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1))))
1478			chopped_off++;
1479
1480		/* Decrease current_... with bits chopped off */
1481		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1482			current_prefix_length = pn->pos + pn->bits - chopped_off;
1483
1484		/*
1485		 * Either we do the actual chop off, or, if we have chopped
1486		 * off all bits in this tnode, we walk up to our parent.
1487		 */
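
		/*
		 * Illustration: with pn->bits == 3 and cindex == 5 (101b),
		 * successive backtracks retry cindex 4 (100b), then 0
		 * (000b), and only then climb to pn's parent - i.e. we
		 * retry the child slots corresponding to progressively
		 * shorter prefixes of the key.
		 */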
1488
1489		if (chopped_off <= pn->bits) {
1490			cindex &= ~(1 << (chopped_off-1));
1491		} else {
1492			struct tnode *parent = node_parent((struct node *) pn);
1493			if (!parent)
1494				goto failed;
1495
1496			/* Get Child's index */
1497			cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
1498			pn = parent;
1499			chopped_off = 0;
1500
1501#ifdef CONFIG_IP_FIB_TRIE_STATS
1502			t->stats.backtrack++;
1503#endif
1504			goto backtrace;
1505		}
1506	}
1507failed:
1508	ret = 1;
1509found:
1510	rcu_read_unlock();
1511	return ret;
1512}
1513
1514/* only called from updater side */
1515static int trie_leaf_remove(struct trie *t, t_key key)
1516{
1517	t_key cindex;
1518	struct tnode *tp = NULL;
1519	struct node *n = t->trie;
1520	struct leaf *l;
1521
1522	pr_debug("entering trie_leaf_remove(%p)\n", n);
1523
1524	/* Note that in the case of skipped bits, those bits are *not* checked!
1525	 * When we finish this, we will have NULL or a T_LEAF, and the
1526	 * T_LEAF may or may not match our key.
1527	 */
1528
1529	while (n != NULL && IS_TNODE(n)) {
1530		struct tnode *tn = (struct tnode *) n;
1531		check_tnode(tn);
1532		n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));
1533		n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
1534		BUG_ON(n && node_parent(n) != tn);
1535	}
1536	l = (struct leaf *) n;
1537
1538	if (!n || !tkey_equals(l->key, key))
1539		return 0;
1540
1541	/*
1542	 * Key found.
1543	 * Remove the leaf and rebalance the tree
1544	 */
1545
1546	t->size--;
1547
1548	tp = node_parent(n);
1549	tnode_free((struct tnode *) n);
1550
1551	if (tp) {
1552		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1553		put_child(t, (struct tnode *)tp, cindex, NULL);
1554		rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1555	} else
1556		rcu_assign_pointer(t->trie, NULL);
1557
1558	return 1;
1559}
1560
1561/*
1562 * Caller must hold RTNL.
1563 */
1564static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg)
1565{
1566	struct trie *t = (struct trie *) tb->tb_data;
1567	u32 key, mask;
1568	int plen = cfg->fc_dst_len;
1569	u8 tos = cfg->fc_tos;
1570	struct fib_alias *fa, *fa_to_delete;
1571	struct list_head *fa_head;
1572	struct leaf *l;
1573	struct leaf_info *li;
1574
1575	if (plen > 32)
1576		return -EINVAL;
1577
1578	key = ntohl(cfg->fc_dst);
1579	mask = ntohl(inet_make_mask(plen));
1580
1581	if (key & ~mask)
1582		return -EINVAL;
1583
1584	key = key & mask;
1585	l = fib_find_node(t, key);
1586
1587	if (!l)
1588		return -ESRCH;
1589
1590	fa_head = get_fa_head(l, plen);
1591	fa = fib_find_alias(fa_head, tos, 0);
1592
1593	if (!fa)
1594		return -ESRCH;
1595
1596	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1597
1598	fa_to_delete = NULL;
1599	fa_head = fa->fa_list.prev;
1600
1601	list_for_each_entry(fa, fa_head, fa_list) {
1602		struct fib_info *fi = fa->fa_info;
1603
1604		if (fa->fa_tos != tos)
1605			break;
1606
1607		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1608		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1609		     fa->fa_scope == cfg->fc_scope) &&
1610		    (!cfg->fc_protocol ||
1611		     fi->fib_protocol == cfg->fc_protocol) &&
1612		    fib_nh_match(cfg, fi) == 0) {
1613			fa_to_delete = fa;
1614			break;
1615		}
1616	}
1617
1618	if (!fa_to_delete)
1619		return -ESRCH;
1620
1621	fa = fa_to_delete;
1622	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1623		  &cfg->fc_nlinfo, 0);
1624
1625	l = fib_find_node(t, key);
1626	li = find_leaf_info(l, plen);
1627
1628	list_del_rcu(&fa->fa_list);
1629
1630	if (list_empty(fa_head)) {
1631		hlist_del_rcu(&li->hlist);
1632		free_leaf_info(li);
1633	}
1634
1635	if (hlist_empty(&l->list))
1636		trie_leaf_remove(t, key);
1637
1638	if (fa->fa_state & FA_S_ACCESSED)
1639		rt_cache_flush(-1);
1640
1641	fib_release_info(fa->fa_info);
1642	alias_free_mem_rcu(fa);
1643	return 0;
1644}
1645
1646static int trie_flush_list(struct trie *t, struct list_head *head)
1647{
1648	struct fib_alias *fa, *fa_node;
1649	int found = 0;
1650
1651	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1652		struct fib_info *fi = fa->fa_info;
1653
1654		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1655			list_del_rcu(&fa->fa_list);
1656			fib_release_info(fa->fa_info);
1657			alias_free_mem_rcu(fa);
1658			found++;
1659		}
1660	}
1661	return found;
1662}
1663
1664static int trie_flush_leaf(struct trie *t, struct leaf *l)
1665{
1666	int found = 0;
1667	struct hlist_head *lih = &l->list;
1668	struct hlist_node *node, *tmp;
1669	struct leaf_info *li = NULL;
1670
1671	hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1672		found += trie_flush_list(t, &li->falh);
1673
1674		if (list_empty(&li->falh)) {
1675			hlist_del_rcu(&li->hlist);
1676			free_leaf_info(li);
1677		}
1678	}
1679	return found;
1680}
1681
1682/* rcu_read_lock needs to be held by the caller on the read side */
1683
1684static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1685{
1686	struct node *c = (struct node *) thisleaf;
1687	struct tnode *p;
1688	int idx;
1689	struct node *trie = rcu_dereference(t->trie);
1690
1691	if (c == NULL) {
1692		if (trie == NULL)
1693			return NULL;
1694
1695		if (IS_LEAF(trie))          /* trie w. just a leaf */
1696			return (struct leaf *) trie;
1697
1698		p = (struct tnode*) trie;  /* Start */
1699	} else
1700		p = node_parent(c);
1701
1702	while (p) {
1703		int pos, last;
1704
1705		/*  Find the next child of the parent */
1706		if (c)
1707			pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
1708		else
1709			pos = 0;
1710
1711		last = 1 << p->bits;
1712		for (idx = pos; idx < last ; idx++) {
1713			c = rcu_dereference(p->child[idx]);
1714
1715			if (!c)
1716				continue;
1717
1718			/* Descend if tnode */
1719			while (IS_TNODE(c)) {
1720				p = (struct tnode *) c;
1721				idx = 0;
1722
1723				/* Leftmost non-NULL branch */
1724				if (p && IS_TNODE(p))
1725					while (!(c = rcu_dereference(p->child[idx]))
1726					       && idx < (1<<p->bits)) idx++;
1727
1728				/* Done with this tnode? */
1729				if (idx >= (1 << p->bits) || !c)
1730					goto up;
1731			}
1732			return (struct leaf *) c;
1733		}
1734up:
1735		/* No more children, go up one step */
1736		c = (struct node *) p;
1737		p = node_parent(c);
1738	}
1739	return NULL; /* Ready. Root of trie */
1740}
1741
1742/*
1743 * Caller must hold RTNL.
1744 */
1745static int fn_trie_flush(struct fib_table *tb)
1746{
1747	struct trie *t = (struct trie *) tb->tb_data;
1748	struct leaf *ll = NULL, *l = NULL;
1749	int found = 0, h;
1750
1751	for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1752		found += trie_flush_leaf(t, l);
1753
1754		if (ll && hlist_empty(&ll->list))
1755			trie_leaf_remove(t, ll->key);
1756		ll = l;
1757	}
1758
1759	if (ll && hlist_empty(&ll->list))
1760		trie_leaf_remove(t, ll->key);
1761
1762	pr_debug("trie_flush found=%d\n", found);
1763	return found;
1764}
1765
1766static void
1767fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1768{
1769	struct trie *t = (struct trie *) tb->tb_data;
1770	int order, last_idx;
1771	struct fib_info *fi = NULL;
1772	struct fib_info *last_resort;
1773	struct fib_alias *fa = NULL;
1774	struct list_head *fa_head;
1775	struct leaf *l;
1776
1777	last_idx = -1;
1778	last_resort = NULL;
1779	order = -1;
1780
1781	rcu_read_lock();
1782
1783	l = fib_find_node(t, 0);
1784	if (!l)
1785		goto out;
1786
1787	fa_head = get_fa_head(l, 0);
1788	if (!fa_head)
1789		goto out;
1790
1791	if (list_empty(fa_head))
1792		goto out;
1793
1794	list_for_each_entry_rcu(fa, fa_head, fa_list) {
1795		struct fib_info *next_fi = fa->fa_info;
1796
1797		if (fa->fa_scope != res->scope ||
1798		    fa->fa_type != RTN_UNICAST)
1799			continue;
1800
1801		if (next_fi->fib_priority > res->fi->fib_priority)
1802			break;
1803		if (!next_fi->fib_nh[0].nh_gw ||
1804		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1805			continue;
1806		fa->fa_state |= FA_S_ACCESSED;
1807
1808		if (fi == NULL) {
1809			if (next_fi != res->fi)
1810				break;
1811		} else if (!fib_detect_death(fi, order, &last_resort,
1812					     &last_idx, tb->tb_default)) {
1813			fib_result_assign(res, fi);
1814			tb->tb_default = order;
1815			goto out;
1816		}
1817		fi = next_fi;
1818		order++;
1819	}
1820	if (order <= 0 || fi == NULL) {
1821		tb->tb_default = -1;
1822		goto out;
1823	}
1824
1825	if (!fib_detect_death(fi, order, &last_resort, &last_idx,
1826				tb->tb_default)) {
1827		fib_result_assign(res, fi);
1828		tb->tb_default = order;
1829		goto out;
1830	}
1831	if (last_idx >= 0)
1832		fib_result_assign(res, last_resort);
1833	tb->tb_default = last_idx;
1834out:
1835	rcu_read_unlock();
1836}
1837
1838static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
1839			   struct sk_buff *skb, struct netlink_callback *cb)
1840{
1841	int i, s_i;
1842	struct fib_alias *fa;
1843
1844	__be32 xkey = htonl(key);
1845
1846	s_i = cb->args[4];
1847	i = 0;
1848
1849	/* rcu_read_lock is held by the caller */
1850
1851	list_for_each_entry_rcu(fa, fah, fa_list) {
1852		if (i < s_i) {
1853			i++;
1854			continue;
1855		}
1856		BUG_ON(!fa->fa_info);
1857
1858		if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1859				  cb->nlh->nlmsg_seq,
1860				  RTM_NEWROUTE,
1861				  tb->tb_id,
1862				  fa->fa_type,
1863				  fa->fa_scope,
1864				  xkey,
1865				  plen,
1866				  fa->fa_tos,
1867				  fa->fa_info, 0) < 0) {
1868			cb->args[4] = i;
1869			return -1;
1870		}
1871		i++;
1872	}
1873	cb->args[4] = i;
1874	return skb->len;
1875}
1876
1877static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
1878			     struct netlink_callback *cb)
1879{
1880	int h, s_h;
1881	struct list_head *fa_head;
1882	struct leaf *l = NULL;
1883
1884	s_h = cb->args[3];
1885
1886	for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1887		if (h < s_h)
1888			continue;
1889		if (h > s_h)
1890			memset(&cb->args[4], 0,
1891			       sizeof(cb->args) - 4*sizeof(cb->args[0]));
1892
1893		fa_head = get_fa_head(l, plen);
1894
1895		if (!fa_head)
1896			continue;
1897
1898		if (list_empty(fa_head))
1899			continue;
1900
1901		if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
1902			cb->args[3] = h;
1903			return -1;
1904		}
1905	}
1906	cb->args[3] = h;
1907	return skb->len;
1908}
1909
1910static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
1911{
1912	int m, s_m;
1913	struct trie *t = (struct trie *) tb->tb_data;
1914
1915	s_m = cb->args[2];
1916
1917	rcu_read_lock();
1918	for (m = 0; m <= 32; m++) {
1919		if (m < s_m)
1920			continue;
1921		if (m > s_m)
1922			memset(&cb->args[3], 0,
1923				sizeof(cb->args) - 3*sizeof(cb->args[0]));
1924
1925		if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
1926			cb->args[2] = m;
1927			goto out;
1928		}
1929	}
1930	rcu_read_unlock();
1931	cb->args[2] = m;
1932	return skb->len;
1933out:
1934	rcu_read_unlock();
1935	return -1;
1936}
1937
1938/* Fix more generic FIB names for init later */
1939
1940struct fib_table *fib_hash_init(u32 id)
1941{
1942	struct fib_table *tb;
1943	struct trie *t;
1944
1945	if (fn_alias_kmem == NULL)
1946		fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1947						  sizeof(struct fib_alias),
1948						  0, SLAB_HWCACHE_ALIGN,
1949						  NULL);
1950
1951	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1952		     GFP_KERNEL);
1953	if (tb == NULL)
1954		return NULL;
1955
1956	tb->tb_id = id;
1957	tb->tb_default = -1;
1958	tb->tb_lookup = fn_trie_lookup;
1959	tb->tb_insert = fn_trie_insert;
1960	tb->tb_delete = fn_trie_delete;
1961	tb->tb_flush = fn_trie_flush;
1962	tb->tb_select_default = fn_trie_select_default;
1963	tb->tb_dump = fn_trie_dump;
1964
1965	t = (struct trie *) tb->tb_data;
1966	memset(t, 0, sizeof(*t));
1967
1968	if (id == RT_TABLE_LOCAL)
1969		printk(KERN_INFO "IPv4 FIB: Using LC-trie version %s\n", VERSION);
1970
1971	return tb;
1972}
1973
1974#ifdef CONFIG_PROC_FS
1975/* Depth first Trie walk iterator */
1976struct fib_trie_iter {
1977	struct seq_net_private p;
1978	struct trie *trie_local, *trie_main;
1979	struct tnode *tnode;
1980	struct trie *trie;
1981	unsigned index;
1982	unsigned depth;
1983};
1984
1985static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
1986{
1987	struct tnode *tn = iter->tnode;
1988	unsigned cindex = iter->index;
1989	struct tnode *p;
1990
1991	/* A single entry routing table */
1992	if (!tn)
1993		return NULL;
1994
1995	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
1996		 iter->tnode, iter->index, iter->depth);
1997rescan:
1998	while (cindex < (1<<tn->bits)) {
1999		struct node *n = tnode_get_child(tn, cindex);
2000
2001		if (n) {
2002			if (IS_LEAF(n)) {
2003				iter->tnode = tn;
2004				iter->index = cindex + 1;
2005			} else {
2006				/* push down one level */
2007				iter->tnode = (struct tnode *) n;
2008				iter->index = 0;
2009				++iter->depth;
2010			}
2011			return n;
2012		}
2013
2014		++cindex;
2015	}
2016
2017	/* Current node exhausted, pop back up */
2018	p = node_parent((struct node *)tn);
2019	if (p) {
2020		cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2021		tn = p;
2022		--iter->depth;
2023		goto rescan;
2024	}
2025
2026	/* got root? */
2027	return NULL;
2028}
2029
2030static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2031				       struct trie *t)
2032{
2033	struct node *n ;
2034
2035	if (!t)
2036		return NULL;
2037
2038	n = rcu_dereference(t->trie);
2039
2040	if (!iter)
2041		return NULL;
2042
2043	if (n) {
2044		if (IS_TNODE(n)) {
2045			iter->tnode = (struct tnode *) n;
2046			iter->trie = t;
2047			iter->index = 0;
2048			iter->depth = 1;
2049		} else {
2050			iter->tnode = NULL;
2051			iter->trie  = t;
2052			iter->index = 0;
2053			iter->depth = 0;
2054		}
2055		return n;
2056	}
2057	return NULL;
2058}
2059
2060static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2061{
2062	struct node *n;
2063	struct fib_trie_iter iter;
2064
2065	memset(s, 0, sizeof(*s));
2066
2067	rcu_read_lock();
2068	for (n = fib_trie_get_first(&iter, t); n;
2069	     n = fib_trie_get_next(&iter)) {
2070		if (IS_LEAF(n)) {
2071			s->leaves++;
2072			s->totdepth += iter.depth;
2073			if (iter.depth > s->maxdepth)
2074				s->maxdepth = iter.depth;
2075		} else {
2076			const struct tnode *tn = (const struct tnode *) n;
2077			int i;
2078
2079			s->tnodes++;
2080			if (tn->bits < MAX_STAT_DEPTH)
2081				s->nodesizes[tn->bits]++;
2082
2083			for (i = 0; i < (1<<tn->bits); i++)
2084				if (!tn->child[i])
2085					s->nullpointers++;
2086		}
2087	}
2088	rcu_read_unlock();
2089}
2090
2091/*
2092 *	This outputs /proc/net/fib_triestats
2093 */
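/*
 * Field-by-field sketch of what trie_show_stats() emits, one block per trie
 * (see the seq_printf calls below for the exact format):
 *
 *	Aver depth:     <average leaf depth, two decimals>
 *	Max depth:      <deepest leaf>
 *	Leaves:         <number of leaves>
 *	Internal nodes: <number of tnodes>
 *	  <bits>: <count of tnodes with that many index bits> ...
 *	Pointers: <total child slots across all tnodes>
 *	Null ptrs: <empty child slots>
 *	Total size: <approximate memory used>  kB
 */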
2094static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2095{
2096	unsigned i, max, pointers, bytes, avdepth;
2097
2098	if (stat->leaves)
2099		avdepth = stat->totdepth*100 / stat->leaves;
2100	else
2101		avdepth = 0;
2102
2103	seq_printf(seq, "\tAver depth:     %u.%02u\n", avdepth / 100, avdepth % 100);
2104	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
2105
2106	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
2107
2108	bytes = sizeof(struct leaf) * stat->leaves;
2109	seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2110	bytes += sizeof(struct tnode) * stat->tnodes;
2111
2112	max = MAX_STAT_DEPTH;
2113	while (max > 0 && stat->nodesizes[max-1] == 0)
2114		max--;
2115
2116	pointers = 0;
2117	for (i = 1; i < max; i++)	/* nodesizes[max-1] is the last non-empty slot */
2118		if (stat->nodesizes[i] != 0) {
2119			seq_printf(seq, "  %u: %u",  i, stat->nodesizes[i]);
2120			pointers += (1<<i) * stat->nodesizes[i];
2121		}
2122	seq_putc(seq, '\n');
2123	seq_printf(seq, "\tPointers: %u\n", pointers);
2124
2125	bytes += sizeof(struct node *) * pointers;
2126	seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2127	seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
2128
2129#ifdef CONFIG_IP_FIB_TRIE_STATS
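	/*
	 * FIXME: 't' (the owning struct trie) is not in scope in this
	 * function, so this block does not build if CONFIG_IP_FIB_TRIE_STATS
	 * is enabled without also reworking the callers to pass the trie in.
	 */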
2130	seq_printf(seq, "Counters:\n---------\n");
2131	seq_printf(seq, "gets = %d\n", t->stats.gets);
2132	seq_printf(seq, "backtracks = %d\n", t->stats.backtrack);
2133	seq_printf(seq, "semantic match passed = %d\n", t->stats.semantic_match_passed);
2134	seq_printf(seq, "semantic match miss = %d\n", t->stats.semantic_match_miss);
2135	seq_printf(seq, "null node hit = %d\n", t->stats.null_node_hit);
2136	seq_printf(seq, "skipped node resize = %d\n", t->stats.resize_node_skipped);
2137#ifdef CLEAR_STATS
2138	memset(&(t->stats), 0, sizeof(t->stats));
2139#endif
2140#endif /*  CONFIG_IP_FIB_TRIE_STATS */
2141}
2142
2143static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2144{
2145	struct net *net = (struct net *)seq->private;
2146	struct trie *trie_local, *trie_main;
2147	struct trie_stat *stat;
2148	struct fib_table *tb;
2149
2150	trie_local = NULL;
2151	tb = fib_get_table(net, RT_TABLE_LOCAL);
2152	if (tb)
2153		trie_local = (struct trie *) tb->tb_data;
2154
2155	trie_main = NULL;
2156	tb = fib_get_table(net, RT_TABLE_MAIN);
2157	if (tb)
2158		trie_main = (struct trie *) tb->tb_data;
2159
2160
2161	stat = kmalloc(sizeof(*stat), GFP_KERNEL);
2162	if (!stat)
2163		return -ENOMEM;
2164
2165	seq_printf(seq, "Basic info: size of leaf: %zu bytes, size of tnode: %zu bytes.\n",
2166		   sizeof(struct leaf), sizeof(struct tnode));
2167
2168	if (trie_local) {
2169		seq_printf(seq, "Local:\n");
2170		trie_collect_stats(trie_local, stat);
2171		trie_show_stats(seq, stat);
2172	}
2173
2174	if (trie_main) {
2175		seq_printf(seq, "Main:\n");
2176		trie_collect_stats(trie_main, stat);
2177		trie_show_stats(seq, stat);
2178	}
2179	kfree(stat);
2180
2181	return 0;
2182}
2183
2184static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2185{
2186	int err;
2187	struct net *net;
2188
2189	net = get_proc_net(inode);
2190	if (net == NULL)
2191		return -ENXIO;
2192	err = single_open(file, fib_triestat_seq_show, net);
2193	if (err < 0) {
2194		put_net(net);
2195		return err;
2196	}
2197	return 0;
2198}
2199
2200static int fib_triestat_seq_release(struct inode *ino, struct file *f)
2201{
2202	struct seq_file *seq = f->private_data;
2203	put_net(seq->private);
2204	return single_release(ino, f);
2205}
2206
2207static const struct file_operations fib_triestat_fops = {
2208	.owner	= THIS_MODULE,
2209	.open	= fib_triestat_seq_open,
2210	.read	= seq_read,
2211	.llseek	= seq_lseek,
2212	.release = fib_triestat_seq_release,
2213};
2214
2215static struct node *fib_trie_get_idx(struct fib_trie_iter *iter,
2216				      loff_t pos)
2217{
2218	loff_t idx = 0;
2219	struct node *n;
2220
2221	for (n = fib_trie_get_first(iter, iter->trie_local);
2222	     n; ++idx, n = fib_trie_get_next(iter)) {
2223		if (pos == idx)
2224			return n;
2225	}
2226
2227	for (n = fib_trie_get_first(iter, iter->trie_main);
2228	     n; ++idx, n = fib_trie_get_next(iter)) {
2229		if (pos == idx)
2230			return n;
2231	}
2232	return NULL;
2233}
2234
2235static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2236{
2237	struct fib_trie_iter *iter = seq->private;
2238	struct fib_table *tb;
2239
2240	if (!iter->trie_local) {
2241		tb = fib_get_table(iter->p.net, RT_TABLE_LOCAL);
2242		if (tb)
2243			iter->trie_local = (struct trie *) tb->tb_data;
2244	}
2245	if (!iter->trie_main) {
2246		tb = fib_get_table(iter->p.net, RT_TABLE_MAIN);
2247		if (tb)
2248			iter->trie_main = (struct trie *) tb->tb_data;
2249	}
2250	rcu_read_lock();
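	/*
	 * Position 0 is the header token (SEQ_START_TOKEN); real trie nodes
	 * start at *pos == 1, hence the "*pos - 1" below.
	 */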
2251	if (*pos == 0)
2252		return SEQ_START_TOKEN;
2253	return fib_trie_get_idx(iter, *pos - 1);
2254}
2255
2256static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2257{
2258	struct fib_trie_iter *iter = seq->private;
2259	void *l = v;
2260
2261	++*pos;
2262	if (v == SEQ_START_TOKEN)
2263		return fib_trie_get_idx(iter, 0);
2264
2265	v = fib_trie_get_next(iter);
2266	BUG_ON(v == l);
2267	if (v)
2268		return v;
2269
2270	/* continue scan in next trie */
2271	if (iter->trie == iter->trie_local)
2272		return fib_trie_get_first(iter, iter->trie_main);
2273
2274	return NULL;
2275}
2276
2277static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2278{
2279	rcu_read_unlock();
2280}
2281
2282static void seq_indent(struct seq_file *seq, int n)
2283{
2284	while (n-- > 0)
		seq_puts(seq, "   ");
2285}
2286
2287static inline const char *rtn_scope(enum rt_scope_t s)
2288{
2289	static char buf[32];
2290
2291	switch (s) {
2292	case RT_SCOPE_UNIVERSE: return "universe";
2293	case RT_SCOPE_SITE:	return "site";
2294	case RT_SCOPE_LINK:	return "link";
2295	case RT_SCOPE_HOST:	return "host";
2296	case RT_SCOPE_NOWHERE:	return "nowhere";
2297	default:
2298		snprintf(buf, sizeof(buf), "scope=%d", s);
2299		return buf;
2300	}
2301}
2302
2303static const char *rtn_type_names[__RTN_MAX] = {
2304	[RTN_UNSPEC] = "UNSPEC",
2305	[RTN_UNICAST] = "UNICAST",
2306	[RTN_LOCAL] = "LOCAL",
2307	[RTN_BROADCAST] = "BROADCAST",
2308	[RTN_ANYCAST] = "ANYCAST",
2309	[RTN_MULTICAST] = "MULTICAST",
2310	[RTN_BLACKHOLE] = "BLACKHOLE",
2311	[RTN_UNREACHABLE] = "UNREACHABLE",
2312	[RTN_PROHIBIT] = "PROHIBIT",
2313	[RTN_THROW] = "THROW",
2314	[RTN_NAT] = "NAT",
2315	[RTN_XRESOLVE] = "XRESOLVE",
2316};
2317
2318static inline const char *rtn_type(unsigned t)
2319{
2320	static char buf[32];
2321
2322	if (t < __RTN_MAX && rtn_type_names[t])
2323		return rtn_type_names[t];
2324	snprintf(buf, sizeof(buf), "type %u", t);
2325	return buf;
2326}
2327
2328/* Pretty print the trie */
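/*
 * Shape of the output, per node (see the seq_printf formats below;
 * indentation grows with trie depth):
 *
 *	  +-- <prefix>/<pos> <bits> <full_children> <empty_children>
 *	     |-- <leaf address>
 *	        /<plen> <scope> <type> [tos=<tos>]
 */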
2329static int fib_trie_seq_show(struct seq_file *seq, void *v)
2330{
2331	const struct fib_trie_iter *iter = seq->private;
2332	struct node *n = v;
2333
2334	if (v == SEQ_START_TOKEN)
2335		return 0;
2336
2337	if (!node_parent(n)) {
2338		if (iter->trie == iter->trie_local)
2339			seq_puts(seq, "<local>:\n");
2340		else
2341			seq_puts(seq, "<main>:\n");
2342	}
2343
2344	if (IS_TNODE(n)) {
2345		struct tnode *tn = (struct tnode *) n;
2346		__be32 prf = htonl(mask_pfx(tn->key, tn->pos));
2347
2348		seq_indent(seq, iter->depth-1);
2349		seq_printf(seq, "  +-- %d.%d.%d.%d/%d %d %d %d\n",
2350			   NIPQUAD(prf), tn->pos, tn->bits, tn->full_children,
2351			   tn->empty_children);
2352
2353	} else {
2354		struct leaf *l = (struct leaf *) n;
2355		int i;
2356		__be32 val = htonl(l->key);
2357
2358		seq_indent(seq, iter->depth);
2359		seq_printf(seq, "  |-- %d.%d.%d.%d\n", NIPQUAD(val));
2360		for (i = 32; i >= 0; i--) {
2361			struct leaf_info *li = find_leaf_info(l, i);
2362			if (li) {
2363				struct fib_alias *fa;
2364				list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2365					seq_indent(seq, iter->depth+1);
2366					seq_printf(seq, "  /%d %s %s", i,
2367						   rtn_scope(fa->fa_scope),
2368						   rtn_type(fa->fa_type));
2369					if (fa->fa_tos)
2370						seq_printf(seq, " tos=%d",
2371							   fa->fa_tos);
2372					seq_putc(seq, '\n');
2373				}
2374			}
2375		}
2376	}
2377
2378	return 0;
2379}
2380
2381static const struct seq_operations fib_trie_seq_ops = {
2382	.start  = fib_trie_seq_start,
2383	.next   = fib_trie_seq_next,
2384	.stop   = fib_trie_seq_stop,
2385	.show   = fib_trie_seq_show,
2386};
2387
2388static int fib_trie_seq_open(struct inode *inode, struct file *file)
2389{
2390	return seq_open_net(inode, file, &fib_trie_seq_ops,
2391			    sizeof(struct fib_trie_iter));
2392}
2393
2394static const struct file_operations fib_trie_fops = {
2395	.owner  = THIS_MODULE,
2396	.open   = fib_trie_seq_open,
2397	.read   = seq_read,
2398	.llseek = seq_lseek,
2399	.release = seq_release_net,
2400};
2401
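/*
 * Translate a route's type, mask and fib_info into the RTF_* flag bits
 * that are shown in /proc/net/route.
 */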
2402static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2403{
2404	static const unsigned type2flags[RTN_MAX + 1] = {
2405		[RTN_UNREACHABLE] = RTF_REJECT, [RTN_PROHIBIT] = RTF_REJECT,
2406	};
2407	unsigned flags = type2flags[type];
2408
2409	if (fi && fi->fib_nh->nh_gw)
2410		flags |= RTF_GATEWAY;
2411	if (mask == htonl(0xFFFFFFFF))
2412		flags |= RTF_HOST;
2413	flags |= RTF_UP;
2414	return flags;
2415}
2416
2417/*
2418 *	This outputs /proc/net/route.
2419 *	The format of the file is not supposed to be changed;
2420 *	it needs to stay the same as the fib_hash output to avoid
2421 *	breaking legacy utilities.
2422 */
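/*
 * Each data row follows the header columns printed below; addresses and
 * masks are raw __be32 values formatted with %08X:
 *
 *	Iface	Destination	Gateway	Flags	RefCnt	Use	Metric	Mask	MTU	Window	IRTT
 *	<dev>	<prefix>	<gw>	<RTF_*>	0	0	<prio>	<mask>	<mtu>	<win>	<irtt>
 */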
2423static int fib_route_seq_show(struct seq_file *seq, void *v)
2424{
2425	const struct fib_trie_iter *iter = seq->private;
2426	struct leaf *l = v;
2427	int i;
2428	char bf[128];
2429
2430	if (v == SEQ_START_TOKEN) {
2431		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2432			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2433			   "\tWindow\tIRTT");
2434		return 0;
2435	}
2436
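	/*
	 * Only leaves from the main trie are dumped; the local table and
	 * internal tnodes are skipped.
	 */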
2437	if (iter->trie == iter->trie_local)
2438		return 0;
2439	if (IS_TNODE(l))
2440		return 0;
2441
2442	for (i = 32; i >= 0; i--) {
2443		struct leaf_info *li = find_leaf_info(l, i);
2444		struct fib_alias *fa;
2445		__be32 mask, prefix;
2446
2447		if (!li)
2448			continue;
2449
2450		mask = inet_make_mask(li->plen);
2451		prefix = htonl(l->key);
2452
2453		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2454			const struct fib_info *fi = fa->fa_info;
2455			unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
2456
2457			if (fa->fa_type == RTN_BROADCAST
2458			    || fa->fa_type == RTN_MULTICAST)
2459				continue;
2460
2461			if (fi)
2462				snprintf(bf, sizeof(bf),
2463					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2464					 fi->fib_dev ? fi->fib_dev->name : "*",
2465					 prefix,
2466					 fi->fib_nh->nh_gw, flags, 0, 0,
2467					 fi->fib_priority,
2468					 mask,
2469					 (fi->fib_advmss ? fi->fib_advmss + 40 : 0),
2470					 fi->fib_window,
2471					 fi->fib_rtt >> 3);
2472			else
2473				snprintf(bf, sizeof(bf),
2474					 "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2475					 prefix, 0, flags, 0, 0, 0,
2476					 mask, 0, 0, 0);
2477
2478			seq_printf(seq, "%-127s\n", bf);
2479		}
2480	}
2481
2482	return 0;
2483}
2484
2485static const struct seq_operations fib_route_seq_ops = {
2486	.start  = fib_trie_seq_start,
2487	.next   = fib_trie_seq_next,
2488	.stop   = fib_trie_seq_stop,
2489	.show   = fib_route_seq_show,
2490};
2491
2492static int fib_route_seq_open(struct inode *inode, struct file *file)
2493{
2494	return seq_open_net(inode, file, &fib_route_seq_ops,
2495			    sizeof(struct fib_trie_iter));
2496}
2497
2498static const struct file_operations fib_route_fops = {
2499	.owner  = THIS_MODULE,
2500	.open   = fib_route_seq_open,
2501	.read   = seq_read,
2502	.llseek = seq_lseek,
2503	.release = seq_release_net,
2504};
2505
2506int __net_init fib_proc_init(struct net *net)
2507{
2508	if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
2509		goto out1;
2510
2511	if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
2512				  &fib_triestat_fops))
2513		goto out2;
2514
2515	if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
2516		goto out3;
2517
2518	return 0;
2519
2520out3:
2521	proc_net_remove(net, "fib_triestat");
2522out2:
2523	proc_net_remove(net, "fib_trie");
2524out1:
2525	return -ENOMEM;
2526}
2527
2528void __net_exit fib_proc_exit(struct net *net)
2529{
2530	proc_net_remove(net, "fib_trie");
2531	proc_net_remove(net, "fib_triestat");
2532	proc_net_remove(net, "route");
2533}
2534
2535#endif /* CONFIG_PROC_FS */
2536