fib_trie.c revision 9195bef7fb0ba0a91d5ffa566bcf8e007e3c7172
1/*
2 *   This program is free software; you can redistribute it and/or
3 *   modify it under the terms of the GNU General Public License
4 *   as published by the Free Software Foundation; either version
5 *   2 of the License, or (at your option) any later version.
6 *
7 *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 *     & Swedish University of Agricultural Sciences.
9 *
10 *   Jens Laas <jens.laas@data.slu.se> Swedish University of
11 *     Agricultural Sciences.
12 *
13 *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
14 *
15 * This work is based on the LPC-trie which is originally descibed in:
16 *
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
25 * Version:	$Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
26 *
27 *
28 * Code from fib_hash has been reused which includes the following header:
29 *
30 *
31 * INET		An implementation of the TCP/IP protocol suite for the LINUX
32 *		operating system.  INET is implemented using the  BSD Socket
33 *		interface as the means of communication with the user level.
34 *
35 *		IPv4 FIB: lookup engine and maintenance routines.
36 *
37 *
38 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
39 *
40 *		This program is free software; you can redistribute it and/or
41 *		modify it under the terms of the GNU General Public License
42 *		as published by the Free Software Foundation; either version
43 *		2 of the License, or (at your option) any later version.
44 *
45 * Substantial contributions to this work comes from:
46 *
47 *		David S. Miller, <davem@davemloft.net>
48 *		Stephen Hemminger <shemminger@osdl.org>
49 *		Paul E. McKenney <paulmck@us.ibm.com>
50 *		Patrick McHardy <kaber@trash.net>
51 */
52
53#define VERSION "0.408"
54
55#include <asm/uaccess.h>
56#include <asm/system.h>
57#include <linux/bitops.h>
58#include <linux/types.h>
59#include <linux/kernel.h>
60#include <linux/mm.h>
61#include <linux/string.h>
62#include <linux/socket.h>
63#include <linux/sockios.h>
64#include <linux/errno.h>
65#include <linux/in.h>
66#include <linux/inet.h>
67#include <linux/inetdevice.h>
68#include <linux/netdevice.h>
69#include <linux/if_arp.h>
70#include <linux/proc_fs.h>
71#include <linux/rcupdate.h>
72#include <linux/skbuff.h>
73#include <linux/netlink.h>
74#include <linux/init.h>
75#include <linux/list.h>
76#include <net/net_namespace.h>
77#include <net/ip.h>
78#include <net/protocol.h>
79#include <net/route.h>
80#include <net/tcp.h>
81#include <net/sock.h>
82#include <net/ip_fib.h>
83#include "fib_lookup.h"
84
85#define MAX_STAT_DEPTH 32
86
87#define KEYLENGTH (8*sizeof(t_key))
88
89typedef unsigned int t_key;
90
91#define T_TNODE 0
92#define T_LEAF  1
93#define NODE_TYPE_MASK	0x1UL
94#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
95
96#define IS_TNODE(n) (!(n->parent & T_LEAF))
97#define IS_LEAF(n) (n->parent & T_LEAF)
98
99struct node {
100	unsigned long parent;
101	t_key key;
102};
103
104struct leaf {
105	unsigned long parent;
106	t_key key;
107	struct hlist_head list;
108	struct rcu_head rcu;
109};
110
111struct leaf_info {
112	struct hlist_node hlist;
113	struct rcu_head rcu;
114	int plen;
115	struct list_head falh;
116};
117
118struct tnode {
119	unsigned long parent;
120	t_key key;
121	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
122	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
123	unsigned int full_children;	/* KEYLENGTH bits needed */
124	unsigned int empty_children;	/* KEYLENGTH bits needed */
125	struct rcu_head rcu;
126	struct node *child[0];
127};
128
129#ifdef CONFIG_IP_FIB_TRIE_STATS
130struct trie_use_stats {
131	unsigned int gets;
132	unsigned int backtrack;
133	unsigned int semantic_match_passed;
134	unsigned int semantic_match_miss;
135	unsigned int null_node_hit;
136	unsigned int resize_node_skipped;
137};
138#endif
139
140struct trie_stat {
141	unsigned int totdepth;
142	unsigned int maxdepth;
143	unsigned int tnodes;
144	unsigned int leaves;
145	unsigned int nullpointers;
146	unsigned int prefixes;
147	unsigned int nodesizes[MAX_STAT_DEPTH];
148};
149
150struct trie {
151	struct node *trie;
152#ifdef CONFIG_IP_FIB_TRIE_STATS
153	struct trie_use_stats stats;
154#endif
155};
156
157static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
158static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
159				  int wasfull);
160static struct node *resize(struct trie *t, struct tnode *tn);
161static struct tnode *inflate(struct trie *t, struct tnode *tn);
162static struct tnode *halve(struct trie *t, struct tnode *tn);
163static void tnode_free(struct tnode *tn);
164
165static struct kmem_cache *fn_alias_kmem __read_mostly;
166static struct kmem_cache *trie_leaf_kmem __read_mostly;
167
168static inline struct tnode *node_parent(struct node *node)
169{
170	return (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
171}
172
173static inline struct tnode *node_parent_rcu(struct node *node)
174{
175	struct tnode *ret = node_parent(node);
176
177	return rcu_dereference(ret);
178}
179
180static inline void node_set_parent(struct node *node, struct tnode *ptr)
181{
182	rcu_assign_pointer(node->parent,
183			   (unsigned long)ptr | NODE_TYPE(node));
184}
185
186static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i)
187{
188	BUG_ON(i >= 1U << tn->bits);
189
190	return tn->child[i];
191}
192
193static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
194{
195	struct node *ret = tnode_get_child(tn, i);
196
197	return rcu_dereference(ret);
198}
199
200static inline int tnode_child_length(const struct tnode *tn)
201{
202	return 1 << tn->bits;
203}
204
205static inline t_key mask_pfx(t_key k, unsigned short l)
206{
207	return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
208}
209
210static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
211{
212	if (offset < KEYLENGTH)
213		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
214	else
215		return 0;
216}
217
218static inline int tkey_equals(t_key a, t_key b)
219{
220	return a == b;
221}
222
223static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
224{
225	if (bits == 0 || offset >= KEYLENGTH)
226		return 1;
227	bits = bits > KEYLENGTH ? KEYLENGTH : bits;
228	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
229}
230
231static inline int tkey_mismatch(t_key a, int offset, t_key b)
232{
233	t_key diff = a ^ b;
234	int i = offset;
235
236	if (!diff)
237		return 0;
238	while ((diff << i) >> (KEYLENGTH-1) == 0)
239		i++;
240	return i;
241}
242
243/*
244  To understand this stuff, an understanding of keys and all their bits is
245  necessary. Every node in the trie has a key associated with it, but not
246  all of the bits in that key are significant.
247
248  Consider a node 'n' and its parent 'tp'.
249
250  If n is a leaf, every bit in its key is significant. Its presence is
251  necessitated by path compression, since during a tree traversal (when
252  searching for a leaf - unless we are doing an insertion) we will completely
253  ignore all skipped bits we encounter. Thus we need to verify, at the end of
254  a potentially successful search, that we have indeed been walking the
255  correct key path.
256
257  Note that we can never "miss" the correct key in the tree if present by
258  following the wrong path. Path compression ensures that segments of the key
259  that are the same for all keys with a given prefix are skipped, but the
260  skipped part *is* identical for each node in the subtrie below the skipped
261  bit! trie_insert() in this implementation takes care of that - note the
262  call to tkey_sub_equals() in trie_insert().
263
264  if n is an internal node - a 'tnode' here, the various parts of its key
265  have many different meanings.
266
267  Example:
268  _________________________________________________________________
269  | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
270  -----------------------------------------------------------------
271    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
272
273  _________________________________________________________________
274  | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
275  -----------------------------------------------------------------
276   16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
277
278  tp->pos = 7
279  tp->bits = 3
280  n->pos = 15
281  n->bits = 4
282
283  First, let's just ignore the bits that come before the parent tp, that is
284  the bits from 0 to (tp->pos-1). They are *known* but at this point we do
285  not use them for anything.
286
287  The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
288  index into the parent's child array. That is, they will be used to find
289  'n' among tp's children.
290
291  The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
292  for the node n.
293
294  All the bits we have seen so far are significant to the node n. The rest
295  of the bits are really not needed or indeed known in n->key.
296
297  The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
298  n's child array, and will of course be different for each child.
299
300
301  The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
302  at this point.
303
304*/
305
306static inline void check_tnode(const struct tnode *tn)
307{
308	WARN_ON(tn && tn->pos+tn->bits > 32);
309}
310
311static const int halve_threshold = 25;
312static const int inflate_threshold = 50;
313static const int halve_threshold_root = 8;
314static const int inflate_threshold_root = 15;
315
316
317static void __alias_free_mem(struct rcu_head *head)
318{
319	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
320	kmem_cache_free(fn_alias_kmem, fa);
321}
322
323static inline void alias_free_mem_rcu(struct fib_alias *fa)
324{
325	call_rcu(&fa->rcu, __alias_free_mem);
326}
327
328static void __leaf_free_rcu(struct rcu_head *head)
329{
330	struct leaf *l = container_of(head, struct leaf, rcu);
331	kmem_cache_free(trie_leaf_kmem, l);
332}
333
334static void __leaf_info_free_rcu(struct rcu_head *head)
335{
336	kfree(container_of(head, struct leaf_info, rcu));
337}
338
339static inline void free_leaf_info(struct leaf_info *leaf)
340{
341	call_rcu(&leaf->rcu, __leaf_info_free_rcu);
342}
343
344static struct tnode *tnode_alloc(size_t size)
345{
346	struct page *pages;
347
348	if (size <= PAGE_SIZE)
349		return kzalloc(size, GFP_KERNEL);
350
351	pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size));
352	if (!pages)
353		return NULL;
354
355	return page_address(pages);
356}
357
358static void __tnode_free_rcu(struct rcu_head *head)
359{
360	struct tnode *tn = container_of(head, struct tnode, rcu);
361	size_t size = sizeof(struct tnode) +
362		      (sizeof(struct node *) << tn->bits);
363
364	if (size <= PAGE_SIZE)
365		kfree(tn);
366	else
367		free_pages((unsigned long)tn, get_order(size));
368}
369
370static inline void tnode_free(struct tnode *tn)
371{
372	if (IS_LEAF(tn)) {
373		struct leaf *l = (struct leaf *) tn;
374		call_rcu_bh(&l->rcu, __leaf_free_rcu);
375	} else
376		call_rcu(&tn->rcu, __tnode_free_rcu);
377}
378
379static struct leaf *leaf_new(void)
380{
381	struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
382	if (l) {
383		l->parent = T_LEAF;
384		INIT_HLIST_HEAD(&l->list);
385	}
386	return l;
387}
388
389static struct leaf_info *leaf_info_new(int plen)
390{
391	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
392	if (li) {
393		li->plen = plen;
394		INIT_LIST_HEAD(&li->falh);
395	}
396	return li;
397}
398
399static struct tnode *tnode_new(t_key key, int pos, int bits)
400{
401	size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits);
402	struct tnode *tn = tnode_alloc(sz);
403
404	if (tn) {
405		tn->parent = T_TNODE;
406		tn->pos = pos;
407		tn->bits = bits;
408		tn->key = key;
409		tn->full_children = 0;
410		tn->empty_children = 1<<bits;
411	}
412
413	pr_debug("AT %p s=%u %lu\n", tn, (unsigned int) sizeof(struct tnode),
414		 (unsigned long) (sizeof(struct node) << bits));
415	return tn;
416}
417
418/*
419 * Check whether a tnode 'n' is "full", i.e. it is an internal node
420 * and no bits are skipped. See discussion in dyntree paper p. 6
421 */
422
423static inline int tnode_full(const struct tnode *tn, const struct node *n)
424{
425	if (n == NULL || IS_LEAF(n))
426		return 0;
427
428	return ((struct tnode *) n)->pos == tn->pos + tn->bits;
429}
430
431static inline void put_child(struct trie *t, struct tnode *tn, int i,
432			     struct node *n)
433{
434	tnode_put_child_reorg(tn, i, n, -1);
435}
436
437 /*
438  * Add a child at position i overwriting the old value.
439  * Update the value of full_children and empty_children.
440  */
441
442static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
443				  int wasfull)
444{
445	struct node *chi = tn->child[i];
446	int isfull;
447
448	BUG_ON(i >= 1<<tn->bits);
449
450
451	/* update emptyChildren */
452	if (n == NULL && chi != NULL)
453		tn->empty_children++;
454	else if (n != NULL && chi == NULL)
455		tn->empty_children--;
456
457	/* update fullChildren */
458	if (wasfull == -1)
459		wasfull = tnode_full(tn, chi);
460
461	isfull = tnode_full(tn, n);
462	if (wasfull && !isfull)
463		tn->full_children--;
464	else if (!wasfull && isfull)
465		tn->full_children++;
466
467	if (n)
468		node_set_parent(n, tn);
469
470	rcu_assign_pointer(tn->child[i], n);
471}
472
473static struct node *resize(struct trie *t, struct tnode *tn)
474{
475	int i;
476	int err = 0;
477	struct tnode *old_tn;
478	int inflate_threshold_use;
479	int halve_threshold_use;
480	int max_resize;
481
482	if (!tn)
483		return NULL;
484
485	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
486		 tn, inflate_threshold, halve_threshold);
487
488	/* No children */
489	if (tn->empty_children == tnode_child_length(tn)) {
490		tnode_free(tn);
491		return NULL;
492	}
493	/* One child */
494	if (tn->empty_children == tnode_child_length(tn) - 1)
495		for (i = 0; i < tnode_child_length(tn); i++) {
496			struct node *n;
497
498			n = tn->child[i];
499			if (!n)
500				continue;
501
502			/* compress one level */
503			node_set_parent(n, NULL);
504			tnode_free(tn);
505			return n;
506		}
507	/*
508	 * Double as long as the resulting node has a number of
509	 * nonempty nodes that are above the threshold.
510	 */
511
512	/*
513	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
514	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
515	 * Telecommunications, page 6:
516	 * "A node is doubled if the ratio of non-empty children to all
517	 * children in the *doubled* node is at least 'high'."
518	 *
519	 * 'high' in this instance is the variable 'inflate_threshold'. It
520	 * is expressed as a percentage, so we multiply it with
521	 * tnode_child_length() and instead of multiplying by 2 (since the
522	 * child array will be doubled by inflate()) and multiplying
523	 * the left-hand side by 100 (to handle the percentage thing) we
524	 * multiply the left-hand side by 50.
525	 *
526	 * The left-hand side may look a bit weird: tnode_child_length(tn)
527	 * - tn->empty_children is of course the number of non-null children
528	 * in the current node. tn->full_children is the number of "full"
529	 * children, that is non-null tnodes with a skip value of 0.
530	 * All of those will be doubled in the resulting inflated tnode, so
531	 * we just count them one extra time here.
532	 *
533	 * A clearer way to write this would be:
534	 *
535	 * to_be_doubled = tn->full_children;
536	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
537	 *     tn->full_children;
538	 *
539	 * new_child_length = tnode_child_length(tn) * 2;
540	 *
541	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
542	 *      new_child_length;
543	 * if (new_fill_factor >= inflate_threshold)
544	 *
545	 * ...and so on, tho it would mess up the while () loop.
546	 *
547	 * anyway,
548	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
549	 *      inflate_threshold
550	 *
551	 * avoid a division:
552	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
553	 *      inflate_threshold * new_child_length
554	 *
555	 * expand not_to_be_doubled and to_be_doubled, and shorten:
556	 * 100 * (tnode_child_length(tn) - tn->empty_children +
557	 *    tn->full_children) >= inflate_threshold * new_child_length
558	 *
559	 * expand new_child_length:
560	 * 100 * (tnode_child_length(tn) - tn->empty_children +
561	 *    tn->full_children) >=
562	 *      inflate_threshold * tnode_child_length(tn) * 2
563	 *
564	 * shorten again:
565	 * 50 * (tn->full_children + tnode_child_length(tn) -
566	 *    tn->empty_children) >= inflate_threshold *
567	 *    tnode_child_length(tn)
568	 *
569	 */
570
571	check_tnode(tn);
572
573	/* Keep root node larger  */
574
575	if (!tn->parent)
576		inflate_threshold_use = inflate_threshold_root;
577	else
578		inflate_threshold_use = inflate_threshold;
579
580	err = 0;
581	max_resize = 10;
582	while ((tn->full_children > 0 &&  max_resize-- &&
583		50 * (tn->full_children + tnode_child_length(tn)
584		      - tn->empty_children)
585		>= inflate_threshold_use * tnode_child_length(tn))) {
586
587		old_tn = tn;
588		tn = inflate(t, tn);
589
590		if (IS_ERR(tn)) {
591			tn = old_tn;
592#ifdef CONFIG_IP_FIB_TRIE_STATS
593			t->stats.resize_node_skipped++;
594#endif
595			break;
596		}
597	}
598
599	if (max_resize < 0) {
600		if (!tn->parent)
601			pr_warning("Fix inflate_threshold_root."
602				   " Now=%d size=%d bits\n",
603				   inflate_threshold_root, tn->bits);
604		else
605			pr_warning("Fix inflate_threshold."
606				   " Now=%d size=%d bits\n",
607				   inflate_threshold, tn->bits);
608	}
609
610	check_tnode(tn);
611
612	/*
613	 * Halve as long as the number of empty children in this
614	 * node is above threshold.
615	 */
616
617
618	/* Keep root node larger  */
619
620	if (!tn->parent)
621		halve_threshold_use = halve_threshold_root;
622	else
623		halve_threshold_use = halve_threshold;
624
625	err = 0;
626	max_resize = 10;
627	while (tn->bits > 1 &&  max_resize-- &&
628	       100 * (tnode_child_length(tn) - tn->empty_children) <
629	       halve_threshold_use * tnode_child_length(tn)) {
630
631		old_tn = tn;
632		tn = halve(t, tn);
633		if (IS_ERR(tn)) {
634			tn = old_tn;
635#ifdef CONFIG_IP_FIB_TRIE_STATS
636			t->stats.resize_node_skipped++;
637#endif
638			break;
639		}
640	}
641
642	if (max_resize < 0) {
643		if (!tn->parent)
644			pr_warning("Fix halve_threshold_root."
645				   " Now=%d size=%d bits\n",
646				   halve_threshold_root, tn->bits);
647		else
648			pr_warning("Fix halve_threshold."
649				   " Now=%d size=%d bits\n",
650				   halve_threshold, tn->bits);
651	}
652
653	/* Only one child remains */
654	if (tn->empty_children == tnode_child_length(tn) - 1)
655		for (i = 0; i < tnode_child_length(tn); i++) {
656			struct node *n;
657
658			n = tn->child[i];
659			if (!n)
660				continue;
661
662			/* compress one level */
663
664			node_set_parent(n, NULL);
665			tnode_free(tn);
666			return n;
667		}
668
669	return (struct node *) tn;
670}
671
672static struct tnode *inflate(struct trie *t, struct tnode *tn)
673{
674	struct tnode *oldtnode = tn;
675	int olen = tnode_child_length(tn);
676	int i;
677
678	pr_debug("In inflate\n");
679
680	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
681
682	if (!tn)
683		return ERR_PTR(-ENOMEM);
684
685	/*
686	 * Preallocate and store tnodes before the actual work so we
687	 * don't get into an inconsistent state if memory allocation
688	 * fails. In case of failure we return the oldnode and  inflate
689	 * of tnode is ignored.
690	 */
691
692	for (i = 0; i < olen; i++) {
693		struct tnode *inode;
694
695		inode = (struct tnode *) tnode_get_child(oldtnode, i);
696		if (inode &&
697		    IS_TNODE(inode) &&
698		    inode->pos == oldtnode->pos + oldtnode->bits &&
699		    inode->bits > 1) {
700			struct tnode *left, *right;
701			t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
702
703			left = tnode_new(inode->key&(~m), inode->pos + 1,
704					 inode->bits - 1);
705			if (!left)
706				goto nomem;
707
708			right = tnode_new(inode->key|m, inode->pos + 1,
709					  inode->bits - 1);
710
711			if (!right) {
712				tnode_free(left);
713				goto nomem;
714			}
715
716			put_child(t, tn, 2*i, (struct node *) left);
717			put_child(t, tn, 2*i+1, (struct node *) right);
718		}
719	}
720
721	for (i = 0; i < olen; i++) {
722		struct tnode *inode;
723		struct node *node = tnode_get_child(oldtnode, i);
724		struct tnode *left, *right;
725		int size, j;
726
727		/* An empty child */
728		if (node == NULL)
729			continue;
730
731		/* A leaf or an internal node with skipped bits */
732
733		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
734		   tn->pos + tn->bits - 1) {
735			if (tkey_extract_bits(node->key,
736					      oldtnode->pos + oldtnode->bits,
737					      1) == 0)
738				put_child(t, tn, 2*i, node);
739			else
740				put_child(t, tn, 2*i+1, node);
741			continue;
742		}
743
744		/* An internal node with two children */
745		inode = (struct tnode *) node;
746
747		if (inode->bits == 1) {
748			put_child(t, tn, 2*i, inode->child[0]);
749			put_child(t, tn, 2*i+1, inode->child[1]);
750
751			tnode_free(inode);
752			continue;
753		}
754
755		/* An internal node with more than two children */
756
757		/* We will replace this node 'inode' with two new
758		 * ones, 'left' and 'right', each with half of the
759		 * original children. The two new nodes will have
760		 * a position one bit further down the key and this
761		 * means that the "significant" part of their keys
762		 * (see the discussion near the top of this file)
763		 * will differ by one bit, which will be "0" in
764		 * left's key and "1" in right's key. Since we are
765		 * moving the key position by one step, the bit that
766		 * we are moving away from - the bit at position
767		 * (inode->pos) - is the one that will differ between
768		 * left and right. So... we synthesize that bit in the
769		 * two  new keys.
770		 * The mask 'm' below will be a single "one" bit at
771		 * the position (inode->pos)
772		 */
773
774		/* Use the old key, but set the new significant
775		 *   bit to zero.
776		 */
777
778		left = (struct tnode *) tnode_get_child(tn, 2*i);
779		put_child(t, tn, 2*i, NULL);
780
781		BUG_ON(!left);
782
783		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
784		put_child(t, tn, 2*i+1, NULL);
785
786		BUG_ON(!right);
787
788		size = tnode_child_length(left);
789		for (j = 0; j < size; j++) {
790			put_child(t, left, j, inode->child[j]);
791			put_child(t, right, j, inode->child[j + size]);
792		}
793		put_child(t, tn, 2*i, resize(t, left));
794		put_child(t, tn, 2*i+1, resize(t, right));
795
796		tnode_free(inode);
797	}
798	tnode_free(oldtnode);
799	return tn;
800nomem:
801	{
802		int size = tnode_child_length(tn);
803		int j;
804
805		for (j = 0; j < size; j++)
806			if (tn->child[j])
807				tnode_free((struct tnode *)tn->child[j]);
808
809		tnode_free(tn);
810
811		return ERR_PTR(-ENOMEM);
812	}
813}
814
815static struct tnode *halve(struct trie *t, struct tnode *tn)
816{
817	struct tnode *oldtnode = tn;
818	struct node *left, *right;
819	int i;
820	int olen = tnode_child_length(tn);
821
822	pr_debug("In halve\n");
823
824	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
825
826	if (!tn)
827		return ERR_PTR(-ENOMEM);
828
829	/*
830	 * Preallocate and store tnodes before the actual work so we
831	 * don't get into an inconsistent state if memory allocation
832	 * fails. In case of failure we return the oldnode and halve
833	 * of tnode is ignored.
834	 */
835
836	for (i = 0; i < olen; i += 2) {
837		left = tnode_get_child(oldtnode, i);
838		right = tnode_get_child(oldtnode, i+1);
839
840		/* Two nonempty children */
841		if (left && right) {
842			struct tnode *newn;
843
844			newn = tnode_new(left->key, tn->pos + tn->bits, 1);
845
846			if (!newn)
847				goto nomem;
848
849			put_child(t, tn, i/2, (struct node *)newn);
850		}
851
852	}
853
854	for (i = 0; i < olen; i += 2) {
855		struct tnode *newBinNode;
856
857		left = tnode_get_child(oldtnode, i);
858		right = tnode_get_child(oldtnode, i+1);
859
860		/* At least one of the children is empty */
861		if (left == NULL) {
862			if (right == NULL)    /* Both are empty */
863				continue;
864			put_child(t, tn, i/2, right);
865			continue;
866		}
867
868		if (right == NULL) {
869			put_child(t, tn, i/2, left);
870			continue;
871		}
872
873		/* Two nonempty children */
874		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
875		put_child(t, tn, i/2, NULL);
876		put_child(t, newBinNode, 0, left);
877		put_child(t, newBinNode, 1, right);
878		put_child(t, tn, i/2, resize(t, newBinNode));
879	}
880	tnode_free(oldtnode);
881	return tn;
882nomem:
883	{
884		int size = tnode_child_length(tn);
885		int j;
886
887		for (j = 0; j < size; j++)
888			if (tn->child[j])
889				tnode_free((struct tnode *)tn->child[j]);
890
891		tnode_free(tn);
892
893		return ERR_PTR(-ENOMEM);
894	}
895}
896
897/* readside must use rcu_read_lock currently dump routines
898 via get_fa_head and dump */
899
900static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
901{
902	struct hlist_head *head = &l->list;
903	struct hlist_node *node;
904	struct leaf_info *li;
905
906	hlist_for_each_entry_rcu(li, node, head, hlist)
907		if (li->plen == plen)
908			return li;
909
910	return NULL;
911}
912
913static inline struct list_head *get_fa_head(struct leaf *l, int plen)
914{
915	struct leaf_info *li = find_leaf_info(l, plen);
916
917	if (!li)
918		return NULL;
919
920	return &li->falh;
921}
922
923static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
924{
925	struct leaf_info *li = NULL, *last = NULL;
926	struct hlist_node *node;
927
928	if (hlist_empty(head)) {
929		hlist_add_head_rcu(&new->hlist, head);
930	} else {
931		hlist_for_each_entry(li, node, head, hlist) {
932			if (new->plen > li->plen)
933				break;
934
935			last = li;
936		}
937		if (last)
938			hlist_add_after_rcu(&last->hlist, &new->hlist);
939		else
940			hlist_add_before_rcu(&new->hlist, &li->hlist);
941	}
942}
943
944/* rcu_read_lock needs to be hold by caller from readside */
945
946static struct leaf *
947fib_find_node(struct trie *t, u32 key)
948{
949	int pos;
950	struct tnode *tn;
951	struct node *n;
952
953	pos = 0;
954	n = rcu_dereference(t->trie);
955
956	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
957		tn = (struct tnode *) n;
958
959		check_tnode(tn);
960
961		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
962			pos = tn->pos + tn->bits;
963			n = tnode_get_child_rcu(tn,
964						tkey_extract_bits(key,
965								  tn->pos,
966								  tn->bits));
967		} else
968			break;
969	}
970	/* Case we have found a leaf. Compare prefixes */
971
972	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
973		return (struct leaf *)n;
974
975	return NULL;
976}
977
978static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
979{
980	int wasfull;
981	t_key cindex, key = tn->key;
982	struct tnode *tp;
983
984	while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
985		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
986		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
987		tn = (struct tnode *) resize(t, (struct tnode *)tn);
988
989		tnode_put_child_reorg((struct tnode *)tp, cindex,
990				      (struct node *)tn, wasfull);
991
992		tp = node_parent((struct node *) tn);
993		if (!tp)
994			break;
995		tn = tp;
996	}
997
998	/* Handle last (top) tnode */
999	if (IS_TNODE(tn))
1000		tn = (struct tnode *)resize(t, (struct tnode *)tn);
1001
1002	return (struct node *)tn;
1003}
1004
1005/* only used from updater-side */
1006
1007static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1008{
1009	int pos, newpos;
1010	struct tnode *tp = NULL, *tn = NULL;
1011	struct node *n;
1012	struct leaf *l;
1013	int missbit;
1014	struct list_head *fa_head = NULL;
1015	struct leaf_info *li;
1016	t_key cindex;
1017
1018	pos = 0;
1019	n = t->trie;
1020
1021	/* If we point to NULL, stop. Either the tree is empty and we should
1022	 * just put a new leaf in if, or we have reached an empty child slot,
1023	 * and we should just put our new leaf in that.
1024	 * If we point to a T_TNODE, check if it matches our key. Note that
1025	 * a T_TNODE might be skipping any number of bits - its 'pos' need
1026	 * not be the parent's 'pos'+'bits'!
1027	 *
1028	 * If it does match the current key, get pos/bits from it, extract
1029	 * the index from our key, push the T_TNODE and walk the tree.
1030	 *
1031	 * If it doesn't, we have to replace it with a new T_TNODE.
1032	 *
1033	 * If we point to a T_LEAF, it might or might not have the same key
1034	 * as we do. If it does, just change the value, update the T_LEAF's
1035	 * value, and return it.
1036	 * If it doesn't, we need to replace it with a T_TNODE.
1037	 */
1038
1039	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
1040		tn = (struct tnode *) n;
1041
1042		check_tnode(tn);
1043
1044		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1045			tp = tn;
1046			pos = tn->pos + tn->bits;
1047			n = tnode_get_child(tn,
1048					    tkey_extract_bits(key,
1049							      tn->pos,
1050							      tn->bits));
1051
1052			BUG_ON(n && node_parent(n) != tn);
1053		} else
1054			break;
1055	}
1056
1057	/*
1058	 * n  ----> NULL, LEAF or TNODE
1059	 *
1060	 * tp is n's (parent) ----> NULL or TNODE
1061	 */
1062
1063	BUG_ON(tp && IS_LEAF(tp));
1064
1065	/* Case 1: n is a leaf. Compare prefixes */
1066
1067	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1068		l = (struct leaf *) n;
1069		li = leaf_info_new(plen);
1070
1071		if (!li)
1072			return NULL;
1073
1074		fa_head = &li->falh;
1075		insert_leaf_info(&l->list, li);
1076		goto done;
1077	}
1078	l = leaf_new();
1079
1080	if (!l)
1081		return NULL;
1082
1083	l->key = key;
1084	li = leaf_info_new(plen);
1085
1086	if (!li) {
1087		tnode_free((struct tnode *) l);
1088		return NULL;
1089	}
1090
1091	fa_head = &li->falh;
1092	insert_leaf_info(&l->list, li);
1093
1094	if (t->trie && n == NULL) {
1095		/* Case 2: n is NULL, and will just insert a new leaf */
1096
1097		node_set_parent((struct node *)l, tp);
1098
1099		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1100		put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1101	} else {
1102		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1103		/*
1104		 *  Add a new tnode here
1105		 *  first tnode need some special handling
1106		 */
1107
1108		if (tp)
1109			pos = tp->pos+tp->bits;
1110		else
1111			pos = 0;
1112
1113		if (n) {
1114			newpos = tkey_mismatch(key, pos, n->key);
1115			tn = tnode_new(n->key, newpos, 1);
1116		} else {
1117			newpos = 0;
1118			tn = tnode_new(key, newpos, 1); /* First tnode */
1119		}
1120
1121		if (!tn) {
1122			free_leaf_info(li);
1123			tnode_free((struct tnode *) l);
1124			return NULL;
1125		}
1126
1127		node_set_parent((struct node *)tn, tp);
1128
1129		missbit = tkey_extract_bits(key, newpos, 1);
1130		put_child(t, tn, missbit, (struct node *)l);
1131		put_child(t, tn, 1-missbit, n);
1132
1133		if (tp) {
1134			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1135			put_child(t, (struct tnode *)tp, cindex,
1136				  (struct node *)tn);
1137		} else {
1138			rcu_assign_pointer(t->trie, (struct node *)tn);
1139			tp = tn;
1140		}
1141	}
1142
1143	if (tp && tp->pos + tp->bits > 32)
1144		pr_warning("fib_trie"
1145			   " tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1146			   tp, tp->pos, tp->bits, key, plen);
1147
1148	/* Rebalance the trie */
1149
1150	rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1151done:
1152	return fa_head;
1153}
1154
1155/*
1156 * Caller must hold RTNL.
1157 */
1158static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
1159{
1160	struct trie *t = (struct trie *) tb->tb_data;
1161	struct fib_alias *fa, *new_fa;
1162	struct list_head *fa_head = NULL;
1163	struct fib_info *fi;
1164	int plen = cfg->fc_dst_len;
1165	u8 tos = cfg->fc_tos;
1166	u32 key, mask;
1167	int err;
1168	struct leaf *l;
1169
1170	if (plen > 32)
1171		return -EINVAL;
1172
1173	key = ntohl(cfg->fc_dst);
1174
1175	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1176
1177	mask = ntohl(inet_make_mask(plen));
1178
1179	if (key & ~mask)
1180		return -EINVAL;
1181
1182	key = key & mask;
1183
1184	fi = fib_create_info(cfg);
1185	if (IS_ERR(fi)) {
1186		err = PTR_ERR(fi);
1187		goto err;
1188	}
1189
1190	l = fib_find_node(t, key);
1191	fa = NULL;
1192
1193	if (l) {
1194		fa_head = get_fa_head(l, plen);
1195		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1196	}
1197
1198	/* Now fa, if non-NULL, points to the first fib alias
1199	 * with the same keys [prefix,tos,priority], if such key already
1200	 * exists or to the node before which we will insert new one.
1201	 *
1202	 * If fa is NULL, we will need to allocate a new one and
1203	 * insert to the head of f.
1204	 *
1205	 * If f is NULL, no fib node matched the destination key
1206	 * and we need to allocate a new one of those as well.
1207	 */
1208
1209	if (fa && fa->fa_info->fib_priority == fi->fib_priority) {
1210		struct fib_alias *fa_orig;
1211
1212		err = -EEXIST;
1213		if (cfg->fc_nlflags & NLM_F_EXCL)
1214			goto out;
1215
1216		if (cfg->fc_nlflags & NLM_F_REPLACE) {
1217			struct fib_info *fi_drop;
1218			u8 state;
1219
1220			if (fi->fib_treeref > 1)
1221				goto out;
1222
1223			err = -ENOBUFS;
1224			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1225			if (new_fa == NULL)
1226				goto out;
1227
1228			fi_drop = fa->fa_info;
1229			new_fa->fa_tos = fa->fa_tos;
1230			new_fa->fa_info = fi;
1231			new_fa->fa_type = cfg->fc_type;
1232			new_fa->fa_scope = cfg->fc_scope;
1233			state = fa->fa_state;
1234			new_fa->fa_state &= ~FA_S_ACCESSED;
1235
1236			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1237			alias_free_mem_rcu(fa);
1238
1239			fib_release_info(fi_drop);
1240			if (state & FA_S_ACCESSED)
1241				rt_cache_flush(-1);
1242			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1243				tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1244
1245			goto succeeded;
1246		}
1247		/* Error if we find a perfect match which
1248		 * uses the same scope, type, and nexthop
1249		 * information.
1250		 */
1251		fa_orig = fa;
1252		list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
1253			if (fa->fa_tos != tos)
1254				break;
1255			if (fa->fa_info->fib_priority != fi->fib_priority)
1256				break;
1257			if (fa->fa_type == cfg->fc_type &&
1258			    fa->fa_scope == cfg->fc_scope &&
1259			    fa->fa_info == fi)
1260				goto out;
1261		}
1262
1263		if (!(cfg->fc_nlflags & NLM_F_APPEND))
1264			fa = fa_orig;
1265	}
1266	err = -ENOENT;
1267	if (!(cfg->fc_nlflags & NLM_F_CREATE))
1268		goto out;
1269
1270	err = -ENOBUFS;
1271	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1272	if (new_fa == NULL)
1273		goto out;
1274
1275	new_fa->fa_info = fi;
1276	new_fa->fa_tos = tos;
1277	new_fa->fa_type = cfg->fc_type;
1278	new_fa->fa_scope = cfg->fc_scope;
1279	new_fa->fa_state = 0;
1280	/*
1281	 * Insert new entry to the list.
1282	 */
1283
1284	if (!fa_head) {
1285		fa_head = fib_insert_node(t, key, plen);
1286		if (unlikely(!fa_head)) {
1287			err = -ENOMEM;
1288			goto out_free_new_fa;
1289		}
1290	}
1291
1292	list_add_tail_rcu(&new_fa->fa_list,
1293			  (fa ? &fa->fa_list : fa_head));
1294
1295	rt_cache_flush(-1);
1296	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1297		  &cfg->fc_nlinfo, 0);
1298succeeded:
1299	return 0;
1300
1301out_free_new_fa:
1302	kmem_cache_free(fn_alias_kmem, new_fa);
1303out:
1304	fib_release_info(fi);
1305err:
1306	return err;
1307}
1308
1309
1310/* should be called with rcu_read_lock */
1311static int check_leaf(struct trie *t, struct leaf *l,
1312		      t_key key,  const struct flowi *flp,
1313		      struct fib_result *res)
1314{
1315	struct leaf_info *li;
1316	struct hlist_head *hhead = &l->list;
1317	struct hlist_node *node;
1318
1319	hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1320		int err;
1321		int plen = li->plen;
1322		__be32 mask = inet_make_mask(plen);
1323
1324		if (l->key != (key & ntohl(mask)))
1325			continue;
1326
1327		err = fib_semantic_match(&li->falh, flp, res,
1328					 htonl(l->key), mask, plen);
1329
1330#ifdef CONFIG_IP_FIB_TRIE_STATS
1331		if (err <= 0)
1332			t->stats.semantic_match_passed++;
1333		else
1334			t->stats.semantic_match_miss++;
1335#endif
1336		if (err <= 0)
1337			return plen;
1338	}
1339
1340	return -1;
1341}
1342
1343static int fn_trie_lookup(struct fib_table *tb, const struct flowi *flp,
1344			  struct fib_result *res)
1345{
1346	struct trie *t = (struct trie *) tb->tb_data;
1347	int plen, ret = 0;
1348	struct node *n;
1349	struct tnode *pn;
1350	int pos, bits;
1351	t_key key = ntohl(flp->fl4_dst);
1352	int chopped_off;
1353	t_key cindex = 0;
1354	int current_prefix_length = KEYLENGTH;
1355	struct tnode *cn;
1356	t_key node_prefix, key_prefix, pref_mismatch;
1357	int mp;
1358
1359	rcu_read_lock();
1360
1361	n = rcu_dereference(t->trie);
1362	if (!n)
1363		goto failed;
1364
1365#ifdef CONFIG_IP_FIB_TRIE_STATS
1366	t->stats.gets++;
1367#endif
1368
1369	/* Just a leaf? */
1370	if (IS_LEAF(n)) {
1371		plen = check_leaf(t, (struct leaf *)n, key, flp, res);
1372		if (plen < 0)
1373			goto failed;
1374		ret = 0;
1375		goto found;
1376	}
1377
1378	pn = (struct tnode *) n;
1379	chopped_off = 0;
1380
1381	while (pn) {
1382		pos = pn->pos;
1383		bits = pn->bits;
1384
1385		if (!chopped_off)
1386			cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
1387						   pos, bits);
1388
1389		n = tnode_get_child(pn, cindex);
1390
1391		if (n == NULL) {
1392#ifdef CONFIG_IP_FIB_TRIE_STATS
1393			t->stats.null_node_hit++;
1394#endif
1395			goto backtrace;
1396		}
1397
1398		if (IS_LEAF(n)) {
1399			plen = check_leaf(t, (struct leaf *)n, key, flp, res);
1400			if (plen < 0)
1401				goto backtrace;
1402
1403			ret = 0;
1404			goto found;
1405		}
1406
1407		cn = (struct tnode *)n;
1408
1409		/*
1410		 * It's a tnode, and we can do some extra checks here if we
1411		 * like, to avoid descending into a dead-end branch.
1412		 * This tnode is in the parent's child array at index
1413		 * key[p_pos..p_pos+p_bits] but potentially with some bits
1414		 * chopped off, so in reality the index may be just a
1415		 * subprefix, padded with zero at the end.
1416		 * We can also take a look at any skipped bits in this
1417		 * tnode - everything up to p_pos is supposed to be ok,
1418		 * and the non-chopped bits of the index (se previous
1419		 * paragraph) are also guaranteed ok, but the rest is
1420		 * considered unknown.
1421		 *
1422		 * The skipped bits are key[pos+bits..cn->pos].
1423		 */
1424
1425		/* If current_prefix_length < pos+bits, we are already doing
1426		 * actual prefix  matching, which means everything from
1427		 * pos+(bits-chopped_off) onward must be zero along some
1428		 * branch of this subtree - otherwise there is *no* valid
1429		 * prefix present. Here we can only check the skipped
1430		 * bits. Remember, since we have already indexed into the
1431		 * parent's child array, we know that the bits we chopped of
1432		 * *are* zero.
1433		 */
1434
1435		/* NOTA BENE: Checking only skipped bits
1436		   for the new node here */
1437
1438		if (current_prefix_length < pos+bits) {
1439			if (tkey_extract_bits(cn->key, current_prefix_length,
1440						cn->pos - current_prefix_length)
1441			    || !(cn->child[0]))
1442				goto backtrace;
1443		}
1444
1445		/*
1446		 * If chopped_off=0, the index is fully validated and we
1447		 * only need to look at the skipped bits for this, the new,
1448		 * tnode. What we actually want to do is to find out if
1449		 * these skipped bits match our key perfectly, or if we will
1450		 * have to count on finding a matching prefix further down,
1451		 * because if we do, we would like to have some way of
1452		 * verifying the existence of such a prefix at this point.
1453		 */
1454
1455		/* The only thing we can do at this point is to verify that
1456		 * any such matching prefix can indeed be a prefix to our
1457		 * key, and if the bits in the node we are inspecting that
1458		 * do not match our key are not ZERO, this cannot be true.
1459		 * Thus, find out where there is a mismatch (before cn->pos)
1460		 * and verify that all the mismatching bits are zero in the
1461		 * new tnode's key.
1462		 */
1463
1464		/*
1465		 * Note: We aren't very concerned about the piece of
1466		 * the key that precede pn->pos+pn->bits, since these
1467		 * have already been checked. The bits after cn->pos
1468		 * aren't checked since these are by definition
1469		 * "unknown" at this point. Thus, what we want to see
1470		 * is if we are about to enter the "prefix matching"
1471		 * state, and in that case verify that the skipped
1472		 * bits that will prevail throughout this subtree are
1473		 * zero, as they have to be if we are to find a
1474		 * matching prefix.
1475		 */
1476
1477		node_prefix = mask_pfx(cn->key, cn->pos);
1478		key_prefix = mask_pfx(key, cn->pos);
1479		pref_mismatch = key_prefix^node_prefix;
1480		mp = 0;
1481
1482		/*
1483		 * In short: If skipped bits in this node do not match
1484		 * the search key, enter the "prefix matching"
1485		 * state.directly.
1486		 */
1487		if (pref_mismatch) {
1488			while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1489				mp++;
1490				pref_mismatch = pref_mismatch << 1;
1491			}
1492			key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1493
1494			if (key_prefix != 0)
1495				goto backtrace;
1496
1497			if (current_prefix_length >= cn->pos)
1498				current_prefix_length = mp;
1499		}
1500
1501		pn = (struct tnode *)n; /* Descend */
1502		chopped_off = 0;
1503		continue;
1504
1505backtrace:
1506		chopped_off++;
1507
1508		/* As zero don't change the child key (cindex) */
1509		while ((chopped_off <= pn->bits)
1510		       && !(cindex & (1<<(chopped_off-1))))
1511			chopped_off++;
1512
1513		/* Decrease current_... with bits chopped off */
1514		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1515			current_prefix_length = pn->pos + pn->bits
1516				- chopped_off;
1517
1518		/*
1519		 * Either we do the actual chop off according or if we have
1520		 * chopped off all bits in this tnode walk up to our parent.
1521		 */
1522
1523		if (chopped_off <= pn->bits) {
1524			cindex &= ~(1 << (chopped_off-1));
1525		} else {
1526			struct tnode *parent = node_parent((struct node *) pn);
1527			if (!parent)
1528				goto failed;
1529
1530			/* Get Child's index */
1531			cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
1532			pn = parent;
1533			chopped_off = 0;
1534
1535#ifdef CONFIG_IP_FIB_TRIE_STATS
1536			t->stats.backtrack++;
1537#endif
1538			goto backtrace;
1539		}
1540	}
1541failed:
1542	ret = 1;
1543found:
1544	rcu_read_unlock();
1545	return ret;
1546}
1547
1548/*
1549 * Remove the leaf and return parent.
1550 */
1551static void trie_leaf_remove(struct trie *t, struct leaf *l)
1552{
1553	struct tnode *tp = node_parent((struct node *) l);
1554
1555	pr_debug("entering trie_leaf_remove(%p)\n", l);
1556
1557	if (tp) {
1558		t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
1559		put_child(t, (struct tnode *)tp, cindex, NULL);
1560		rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1561	} else
1562		rcu_assign_pointer(t->trie, NULL);
1563
1564	tnode_free((struct tnode *) l);
1565}
1566
1567/*
1568 * Caller must hold RTNL.
1569 */
1570static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg)
1571{
1572	struct trie *t = (struct trie *) tb->tb_data;
1573	u32 key, mask;
1574	int plen = cfg->fc_dst_len;
1575	u8 tos = cfg->fc_tos;
1576	struct fib_alias *fa, *fa_to_delete;
1577	struct list_head *fa_head;
1578	struct leaf *l;
1579	struct leaf_info *li;
1580
1581	if (plen > 32)
1582		return -EINVAL;
1583
1584	key = ntohl(cfg->fc_dst);
1585	mask = ntohl(inet_make_mask(plen));
1586
1587	if (key & ~mask)
1588		return -EINVAL;
1589
1590	key = key & mask;
1591	l = fib_find_node(t, key);
1592
1593	if (!l)
1594		return -ESRCH;
1595
1596	fa_head = get_fa_head(l, plen);
1597	fa = fib_find_alias(fa_head, tos, 0);
1598
1599	if (!fa)
1600		return -ESRCH;
1601
1602	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1603
1604	fa_to_delete = NULL;
1605	fa_head = fa->fa_list.prev;
1606
1607	list_for_each_entry(fa, fa_head, fa_list) {
1608		struct fib_info *fi = fa->fa_info;
1609
1610		if (fa->fa_tos != tos)
1611			break;
1612
1613		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1614		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1615		     fa->fa_scope == cfg->fc_scope) &&
1616		    (!cfg->fc_protocol ||
1617		     fi->fib_protocol == cfg->fc_protocol) &&
1618		    fib_nh_match(cfg, fi) == 0) {
1619			fa_to_delete = fa;
1620			break;
1621		}
1622	}
1623
1624	if (!fa_to_delete)
1625		return -ESRCH;
1626
1627	fa = fa_to_delete;
1628	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1629		  &cfg->fc_nlinfo, 0);
1630
1631	l = fib_find_node(t, key);
1632	li = find_leaf_info(l, plen);
1633
1634	list_del_rcu(&fa->fa_list);
1635
1636	if (list_empty(fa_head)) {
1637		hlist_del_rcu(&li->hlist);
1638		free_leaf_info(li);
1639	}
1640
1641	if (hlist_empty(&l->list))
1642		trie_leaf_remove(t, l);
1643
1644	if (fa->fa_state & FA_S_ACCESSED)
1645		rt_cache_flush(-1);
1646
1647	fib_release_info(fa->fa_info);
1648	alias_free_mem_rcu(fa);
1649	return 0;
1650}
1651
1652static int trie_flush_list(struct trie *t, struct list_head *head)
1653{
1654	struct fib_alias *fa, *fa_node;
1655	int found = 0;
1656
1657	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1658		struct fib_info *fi = fa->fa_info;
1659
1660		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1661			list_del_rcu(&fa->fa_list);
1662			fib_release_info(fa->fa_info);
1663			alias_free_mem_rcu(fa);
1664			found++;
1665		}
1666	}
1667	return found;
1668}
1669
1670static int trie_flush_leaf(struct trie *t, struct leaf *l)
1671{
1672	int found = 0;
1673	struct hlist_head *lih = &l->list;
1674	struct hlist_node *node, *tmp;
1675	struct leaf_info *li = NULL;
1676
1677	hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1678		found += trie_flush_list(t, &li->falh);
1679
1680		if (list_empty(&li->falh)) {
1681			hlist_del_rcu(&li->hlist);
1682			free_leaf_info(li);
1683		}
1684	}
1685	return found;
1686}
1687
1688/*
1689 * Scan for the next right leaf starting at node p->child[idx]
1690 * Since we have back pointer, no recursion necessary.
1691 */
1692static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
1693{
1694	do {
1695		t_key idx;
1696
1697		if (c)
1698			idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
1699		else
1700			idx = 0;
1701
1702		while (idx < 1u << p->bits) {
1703			c = tnode_get_child_rcu(p, idx++);
1704			if (!c)
1705				continue;
1706
1707			if (IS_LEAF(c)) {
1708				prefetch(p->child[idx]);
1709				return (struct leaf *) c;
1710			}
1711
1712			/* Rescan start scanning in new node */
1713			p = (struct tnode *) c;
1714			idx = 0;
1715		}
1716
1717		/* Node empty, walk back up to parent */
1718		c = (struct node *) p;
1719	} while ( (p = node_parent_rcu(c)) != NULL);
1720
1721	return NULL; /* Root of trie */
1722}
1723
1724
1725static struct leaf *trie_firstleaf(struct trie *t)
1726{
1727	struct tnode *n = (struct tnode *) rcu_dereference(t->trie);
1728
1729	if (!n)
1730		return NULL;
1731
1732	if (IS_LEAF(n))          /* trie is just a leaf */
1733		return (struct leaf *) n;
1734
1735	return leaf_walk_rcu(n, NULL);
1736}
1737
1738static struct leaf *trie_nextleaf(struct leaf *l)
1739{
1740	struct node *c = (struct node *) l;
1741	struct tnode *p = node_parent(c);
1742
1743	if (!p)
1744		return NULL;	/* trie with just one leaf */
1745
1746	return leaf_walk_rcu(p, c);
1747}
1748
1749/*
1750 * Caller must hold RTNL.
1751 */
1752static int fn_trie_flush(struct fib_table *tb)
1753{
1754	struct trie *t = (struct trie *) tb->tb_data;
1755	struct leaf *l, *ll = NULL;
1756	int found = 0;
1757
1758	for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
1759		found += trie_flush_leaf(t, l);
1760
1761		if (ll && hlist_empty(&ll->list))
1762			trie_leaf_remove(t, ll);
1763		ll = l;
1764	}
1765
1766	if (ll && hlist_empty(&ll->list))
1767		trie_leaf_remove(t, ll);
1768
1769	pr_debug("trie_flush found=%d\n", found);
1770	return found;
1771}
1772
1773static void fn_trie_select_default(struct fib_table *tb,
1774				   const struct flowi *flp,
1775				   struct fib_result *res)
1776{
1777	struct trie *t = (struct trie *) tb->tb_data;
1778	int order, last_idx;
1779	struct fib_info *fi = NULL;
1780	struct fib_info *last_resort;
1781	struct fib_alias *fa = NULL;
1782	struct list_head *fa_head;
1783	struct leaf *l;
1784
1785	last_idx = -1;
1786	last_resort = NULL;
1787	order = -1;
1788
1789	rcu_read_lock();
1790
1791	l = fib_find_node(t, 0);
1792	if (!l)
1793		goto out;
1794
1795	fa_head = get_fa_head(l, 0);
1796	if (!fa_head)
1797		goto out;
1798
1799	if (list_empty(fa_head))
1800		goto out;
1801
1802	list_for_each_entry_rcu(fa, fa_head, fa_list) {
1803		struct fib_info *next_fi = fa->fa_info;
1804
1805		if (fa->fa_scope != res->scope ||
1806		    fa->fa_type != RTN_UNICAST)
1807			continue;
1808
1809		if (next_fi->fib_priority > res->fi->fib_priority)
1810			break;
1811		if (!next_fi->fib_nh[0].nh_gw ||
1812		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1813			continue;
1814		fa->fa_state |= FA_S_ACCESSED;
1815
1816		if (fi == NULL) {
1817			if (next_fi != res->fi)
1818				break;
1819		} else if (!fib_detect_death(fi, order, &last_resort,
1820					     &last_idx, tb->tb_default)) {
1821			fib_result_assign(res, fi);
1822			tb->tb_default = order;
1823			goto out;
1824		}
1825		fi = next_fi;
1826		order++;
1827	}
1828	if (order <= 0 || fi == NULL) {
1829		tb->tb_default = -1;
1830		goto out;
1831	}
1832
1833	if (!fib_detect_death(fi, order, &last_resort, &last_idx,
1834				tb->tb_default)) {
1835		fib_result_assign(res, fi);
1836		tb->tb_default = order;
1837		goto out;
1838	}
1839	if (last_idx >= 0)
1840		fib_result_assign(res, last_resort);
1841	tb->tb_default = last_idx;
1842out:
1843	rcu_read_unlock();
1844}
1845
1846static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1847			   struct fib_table *tb,
1848			   struct sk_buff *skb, struct netlink_callback *cb)
1849{
1850	int i, s_i;
1851	struct fib_alias *fa;
1852
1853	__be32 xkey = htonl(key);
1854
1855	s_i = cb->args[4];
1856	i = 0;
1857
1858	/* rcu_read_lock is hold by caller */
1859
1860	list_for_each_entry_rcu(fa, fah, fa_list) {
1861		if (i < s_i) {
1862			i++;
1863			continue;
1864		}
1865
1866		if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1867				  cb->nlh->nlmsg_seq,
1868				  RTM_NEWROUTE,
1869				  tb->tb_id,
1870				  fa->fa_type,
1871				  fa->fa_scope,
1872				  xkey,
1873				  plen,
1874				  fa->fa_tos,
1875				  fa->fa_info, NLM_F_MULTI) < 0) {
1876			cb->args[4] = i;
1877			return -1;
1878		}
1879		i++;
1880	}
1881	cb->args[4] = i;
1882	return skb->len;
1883}
1884
1885
1886static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
1887			struct sk_buff *skb, struct netlink_callback *cb)
1888{
1889	struct leaf_info *li;
1890	struct hlist_node *node;
1891	int i, s_i;
1892
1893	s_i = cb->args[3];
1894	i = 0;
1895
1896	/* rcu_read_lock is hold by caller */
1897	hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
1898		if (i < s_i) {
1899			i++;
1900			continue;
1901		}
1902
1903		if (i > s_i)
1904			cb->args[4] = 0;
1905
1906		if (list_empty(&li->falh))
1907			continue;
1908
1909		if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
1910			cb->args[3] = i;
1911			return -1;
1912		}
1913		i++;
1914	}
1915
1916	cb->args[3] = i;
1917	return skb->len;
1918}
1919
1920
1921
1922static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb,
1923			struct netlink_callback *cb)
1924{
1925	struct leaf *l;
1926	struct trie *t = (struct trie *) tb->tb_data;
1927	int h = 0;
1928	int s_h = cb->args[2];
1929
1930	rcu_read_lock();
1931	for (h = 0, l = trie_firstleaf(t); l != NULL; h++, l = trie_nextleaf(l)) {
1932		if (h < s_h)
1933			continue;
1934
1935		if (h > s_h) {
1936			cb->args[3] = 0;
1937			cb->args[4] = 0;
1938		}
1939
1940		if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
1941			rcu_read_unlock();
1942			cb->args[2] = h;
1943			return -1;
1944		}
1945	}
1946	rcu_read_unlock();
1947
1948	cb->args[2] = h;
1949	return skb->len;
1950}
1951
1952void __init fib_hash_init(void)
1953{
1954	fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1955					  sizeof(struct fib_alias),
1956					  0, SLAB_PANIC, NULL);
1957
1958	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
1959					   max(sizeof(struct leaf),
1960					       sizeof(struct leaf_info)),
1961					   0, SLAB_PANIC, NULL);
1962}
1963
1964
1965/* Fix more generic FIB names for init later */
1966struct fib_table *fib_hash_table(u32 id)
1967{
1968	struct fib_table *tb;
1969	struct trie *t;
1970
1971	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1972		     GFP_KERNEL);
1973	if (tb == NULL)
1974		return NULL;
1975
1976	tb->tb_id = id;
1977	tb->tb_default = -1;
1978	tb->tb_lookup = fn_trie_lookup;
1979	tb->tb_insert = fn_trie_insert;
1980	tb->tb_delete = fn_trie_delete;
1981	tb->tb_flush = fn_trie_flush;
1982	tb->tb_select_default = fn_trie_select_default;
1983	tb->tb_dump = fn_trie_dump;
1984
1985	t = (struct trie *) tb->tb_data;
1986	memset(t, 0, sizeof(*t));
1987
1988	if (id == RT_TABLE_LOCAL)
1989		pr_info("IPv4 FIB: Using LC-trie version %s\n", VERSION);
1990
1991	return tb;
1992}
1993
1994#ifdef CONFIG_PROC_FS
1995/* Depth first Trie walk iterator */
1996struct fib_trie_iter {
1997	struct seq_net_private p;
1998	struct trie *trie_local, *trie_main;
1999	struct tnode *tnode;
2000	struct trie *trie;
2001	unsigned index;
2002	unsigned depth;
2003};
2004
2005static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
2006{
2007	struct tnode *tn = iter->tnode;
2008	unsigned cindex = iter->index;
2009	struct tnode *p;
2010
2011	/* A single entry routing table */
2012	if (!tn)
2013		return NULL;
2014
2015	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2016		 iter->tnode, iter->index, iter->depth);
2017rescan:
2018	while (cindex < (1<<tn->bits)) {
2019		struct node *n = tnode_get_child_rcu(tn, cindex);
2020
2021		if (n) {
2022			if (IS_LEAF(n)) {
2023				iter->tnode = tn;
2024				iter->index = cindex + 1;
2025			} else {
2026				/* push down one level */
2027				iter->tnode = (struct tnode *) n;
2028				iter->index = 0;
2029				++iter->depth;
2030			}
2031			return n;
2032		}
2033
2034		++cindex;
2035	}
2036
2037	/* Current node exhausted, pop back up */
2038	p = node_parent_rcu((struct node *)tn);
2039	if (p) {
2040		cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2041		tn = p;
2042		--iter->depth;
2043		goto rescan;
2044	}
2045
2046	/* got root? */
2047	return NULL;
2048}
2049
2050static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2051				       struct trie *t)
2052{
2053	struct node *n ;
2054
2055	if (!t)
2056		return NULL;
2057
2058	n = rcu_dereference(t->trie);
2059
2060	if (!iter)
2061		return NULL;
2062
2063	if (n) {
2064		if (IS_TNODE(n)) {
2065			iter->tnode = (struct tnode *) n;
2066			iter->trie = t;
2067			iter->index = 0;
2068			iter->depth = 1;
2069		} else {
2070			iter->tnode = NULL;
2071			iter->trie  = t;
2072			iter->index = 0;
2073			iter->depth = 0;
2074		}
2075		return n;
2076	}
2077	return NULL;
2078}
2079
2080static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2081{
2082	struct node *n;
2083	struct fib_trie_iter iter;
2084
2085	memset(s, 0, sizeof(*s));
2086
2087	rcu_read_lock();
2088	for (n = fib_trie_get_first(&iter, t); n;
2089	     n = fib_trie_get_next(&iter)) {
2090		if (IS_LEAF(n)) {
2091			struct leaf *l = (struct leaf *)n;
2092			struct leaf_info *li;
2093			struct hlist_node *tmp;
2094
2095			s->leaves++;
2096			s->totdepth += iter.depth;
2097			if (iter.depth > s->maxdepth)
2098				s->maxdepth = iter.depth;
2099
2100			hlist_for_each_entry_rcu(li, tmp, &l->list, hlist)
2101				++s->prefixes;
2102		} else {
2103			const struct tnode *tn = (const struct tnode *) n;
2104			int i;
2105
2106			s->tnodes++;
2107			if (tn->bits < MAX_STAT_DEPTH)
2108				s->nodesizes[tn->bits]++;
2109
2110			for (i = 0; i < (1<<tn->bits); i++)
2111				if (!tn->child[i])
2112					s->nullpointers++;
2113		}
2114	}
2115	rcu_read_unlock();
2116}
2117
2118/*
2119 *	This outputs /proc/net/fib_triestats
2120 */
2121static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2122{
2123	unsigned i, max, pointers, bytes, avdepth;
2124
2125	if (stat->leaves)
2126		avdepth = stat->totdepth*100 / stat->leaves;
2127	else
2128		avdepth = 0;
2129
2130	seq_printf(seq, "\tAver depth:     %u.%02d\n",
2131		   avdepth / 100, avdepth % 100);
2132	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
2133
2134	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
2135	bytes = sizeof(struct leaf) * stat->leaves;
2136
2137	seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
2138	bytes += sizeof(struct leaf_info) * stat->prefixes;
2139
2140	seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2141	bytes += sizeof(struct tnode) * stat->tnodes;
2142
2143	max = MAX_STAT_DEPTH;
2144	while (max > 0 && stat->nodesizes[max-1] == 0)
2145		max--;
2146
2147	pointers = 0;
2148	for (i = 1; i <= max; i++)
2149		if (stat->nodesizes[i] != 0) {
2150			seq_printf(seq, "  %u: %u",  i, stat->nodesizes[i]);
2151			pointers += (1<<i) * stat->nodesizes[i];
2152		}
2153	seq_putc(seq, '\n');
2154	seq_printf(seq, "\tPointers: %u\n", pointers);
2155
2156	bytes += sizeof(struct node *) * pointers;
2157	seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2158	seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
2159}
2160
2161#ifdef CONFIG_IP_FIB_TRIE_STATS
2162static void trie_show_usage(struct seq_file *seq,
2163			    const struct trie_use_stats *stats)
2164{
2165	seq_printf(seq, "\nCounters:\n---------\n");
2166	seq_printf(seq, "gets = %u\n", stats->gets);
2167	seq_printf(seq, "backtracks = %u\n", stats->backtrack);
2168	seq_printf(seq, "semantic match passed = %u\n",
2169		   stats->semantic_match_passed);
2170	seq_printf(seq, "semantic match miss = %u\n",
2171		   stats->semantic_match_miss);
2172	seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
2173	seq_printf(seq, "skipped node resize = %u\n\n",
2174		   stats->resize_node_skipped);
2175}
2176#endif /*  CONFIG_IP_FIB_TRIE_STATS */
2177
2178static void fib_trie_show(struct seq_file *seq, const char *name,
2179			  struct trie *trie)
2180{
2181	struct trie_stat stat;
2182
2183	trie_collect_stats(trie, &stat);
2184	seq_printf(seq, "%s:\n", name);
2185	trie_show_stats(seq, &stat);
2186#ifdef CONFIG_IP_FIB_TRIE_STATS
2187	trie_show_usage(seq, &trie->stats);
2188#endif
2189}
2190
2191static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2192{
2193	struct net *net = (struct net *)seq->private;
2194	struct fib_table *tb;
2195
2196	seq_printf(seq,
2197		   "Basic info: size of leaf:"
2198		   " %Zd bytes, size of tnode: %Zd bytes.\n",
2199		   sizeof(struct leaf), sizeof(struct tnode));
2200
2201	tb = fib_get_table(net, RT_TABLE_LOCAL);
2202	if (tb)
2203		fib_trie_show(seq, "Local", (struct trie *) tb->tb_data);
2204
2205	tb = fib_get_table(net, RT_TABLE_MAIN);
2206	if (tb)
2207		fib_trie_show(seq, "Main", (struct trie *) tb->tb_data);
2208
2209	return 0;
2210}
2211
2212static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2213{
2214	int err;
2215	struct net *net;
2216
2217	net = get_proc_net(inode);
2218	if (net == NULL)
2219		return -ENXIO;
2220	err = single_open(file, fib_triestat_seq_show, net);
2221	if (err < 0) {
2222		put_net(net);
2223		return err;
2224	}
2225	return 0;
2226}
2227
2228static int fib_triestat_seq_release(struct inode *ino, struct file *f)
2229{
2230	struct seq_file *seq = f->private_data;
2231	put_net(seq->private);
2232	return single_release(ino, f);
2233}
2234
2235static const struct file_operations fib_triestat_fops = {
2236	.owner	= THIS_MODULE,
2237	.open	= fib_triestat_seq_open,
2238	.read	= seq_read,
2239	.llseek	= seq_lseek,
2240	.release = fib_triestat_seq_release,
2241};
2242
2243static struct node *fib_trie_get_idx(struct fib_trie_iter *iter,
2244				      loff_t pos)
2245{
2246	loff_t idx = 0;
2247	struct node *n;
2248
2249	for (n = fib_trie_get_first(iter, iter->trie_local);
2250	     n; ++idx, n = fib_trie_get_next(iter)) {
2251		if (pos == idx)
2252			return n;
2253	}
2254
2255	for (n = fib_trie_get_first(iter, iter->trie_main);
2256	     n; ++idx, n = fib_trie_get_next(iter)) {
2257		if (pos == idx)
2258			return n;
2259	}
2260	return NULL;
2261}
2262
2263static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2264	__acquires(RCU)
2265{
2266	struct fib_trie_iter *iter = seq->private;
2267	struct fib_table *tb;
2268
2269	if (!iter->trie_local) {
2270		tb = fib_get_table(iter->p.net, RT_TABLE_LOCAL);
2271		if (tb)
2272			iter->trie_local = (struct trie *) tb->tb_data;
2273	}
2274	if (!iter->trie_main) {
2275		tb = fib_get_table(iter->p.net, RT_TABLE_MAIN);
2276		if (tb)
2277			iter->trie_main = (struct trie *) tb->tb_data;
2278	}
2279	rcu_read_lock();
2280	if (*pos == 0)
2281		return SEQ_START_TOKEN;
2282	return fib_trie_get_idx(iter, *pos - 1);
2283}
2284
2285static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2286{
2287	struct fib_trie_iter *iter = seq->private;
2288	void *l = v;
2289
2290	++*pos;
2291	if (v == SEQ_START_TOKEN)
2292		return fib_trie_get_idx(iter, 0);
2293
2294	v = fib_trie_get_next(iter);
2295	BUG_ON(v == l);
2296	if (v)
2297		return v;
2298
2299	/* continue scan in next trie */
2300	if (iter->trie == iter->trie_local)
2301		return fib_trie_get_first(iter, iter->trie_main);
2302
2303	return NULL;
2304}
2305
2306static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2307	__releases(RCU)
2308{
2309	rcu_read_unlock();
2310}
2311
2312static void seq_indent(struct seq_file *seq, int n)
2313{
2314	while (n-- > 0) seq_puts(seq, "   ");
2315}
2316
2317static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2318{
2319	switch (s) {
2320	case RT_SCOPE_UNIVERSE: return "universe";
2321	case RT_SCOPE_SITE:	return "site";
2322	case RT_SCOPE_LINK:	return "link";
2323	case RT_SCOPE_HOST:	return "host";
2324	case RT_SCOPE_NOWHERE:	return "nowhere";
2325	default:
2326		snprintf(buf, len, "scope=%d", s);
2327		return buf;
2328	}
2329}
2330
2331static const char *rtn_type_names[__RTN_MAX] = {
2332	[RTN_UNSPEC] = "UNSPEC",
2333	[RTN_UNICAST] = "UNICAST",
2334	[RTN_LOCAL] = "LOCAL",
2335	[RTN_BROADCAST] = "BROADCAST",
2336	[RTN_ANYCAST] = "ANYCAST",
2337	[RTN_MULTICAST] = "MULTICAST",
2338	[RTN_BLACKHOLE] = "BLACKHOLE",
2339	[RTN_UNREACHABLE] = "UNREACHABLE",
2340	[RTN_PROHIBIT] = "PROHIBIT",
2341	[RTN_THROW] = "THROW",
2342	[RTN_NAT] = "NAT",
2343	[RTN_XRESOLVE] = "XRESOLVE",
2344};
2345
2346static inline const char *rtn_type(char *buf, size_t len, unsigned t)
2347{
2348	if (t < __RTN_MAX && rtn_type_names[t])
2349		return rtn_type_names[t];
2350	snprintf(buf, len, "type %u", t);
2351	return buf;
2352}
2353
2354/* Pretty print the trie */
2355static int fib_trie_seq_show(struct seq_file *seq, void *v)
2356{
2357	const struct fib_trie_iter *iter = seq->private;
2358	struct node *n = v;
2359
2360	if (v == SEQ_START_TOKEN)
2361		return 0;
2362
2363	if (!node_parent_rcu(n)) {
2364		if (iter->trie == iter->trie_local)
2365			seq_puts(seq, "<local>:\n");
2366		else
2367			seq_puts(seq, "<main>:\n");
2368	}
2369
2370	if (IS_TNODE(n)) {
2371		struct tnode *tn = (struct tnode *) n;
2372		__be32 prf = htonl(mask_pfx(tn->key, tn->pos));
2373
2374		seq_indent(seq, iter->depth-1);
2375		seq_printf(seq, "  +-- %d.%d.%d.%d/%d %d %d %d\n",
2376			   NIPQUAD(prf), tn->pos, tn->bits, tn->full_children,
2377			   tn->empty_children);
2378
2379	} else {
2380		struct leaf *l = (struct leaf *) n;
2381		struct leaf_info *li;
2382		struct hlist_node *node;
2383
2384		__be32 val = htonl(l->key);
2385
2386		seq_indent(seq, iter->depth);
2387		seq_printf(seq, "  |-- %d.%d.%d.%d\n", NIPQUAD(val));
2388
2389		hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2390			struct fib_alias *fa;
2391
2392			list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2393				char buf1[32], buf2[32];
2394
2395				seq_indent(seq, iter->depth+1);
2396				seq_printf(seq, "  /%d %s %s", li->plen,
2397					   rtn_scope(buf1, sizeof(buf1),
2398						     fa->fa_scope),
2399					   rtn_type(buf2, sizeof(buf2),
2400						    fa->fa_type));
2401				if (fa->fa_tos)
2402					seq_printf(seq, "tos =%d\n",
2403						   fa->fa_tos);
2404				seq_putc(seq, '\n');
2405			}
2406		}
2407	}
2408
2409	return 0;
2410}
2411
2412static const struct seq_operations fib_trie_seq_ops = {
2413	.start  = fib_trie_seq_start,
2414	.next   = fib_trie_seq_next,
2415	.stop   = fib_trie_seq_stop,
2416	.show   = fib_trie_seq_show,
2417};
2418
2419static int fib_trie_seq_open(struct inode *inode, struct file *file)
2420{
2421	return seq_open_net(inode, file, &fib_trie_seq_ops,
2422			    sizeof(struct fib_trie_iter));
2423}
2424
2425static const struct file_operations fib_trie_fops = {
2426	.owner  = THIS_MODULE,
2427	.open   = fib_trie_seq_open,
2428	.read   = seq_read,
2429	.llseek = seq_lseek,
2430	.release = seq_release_net,
2431};
2432
2433static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2434{
2435	static unsigned type2flags[RTN_MAX + 1] = {
2436		[7] = RTF_REJECT, [8] = RTF_REJECT,
2437	};
2438	unsigned flags = type2flags[type];
2439
2440	if (fi && fi->fib_nh->nh_gw)
2441		flags |= RTF_GATEWAY;
2442	if (mask == htonl(0xFFFFFFFF))
2443		flags |= RTF_HOST;
2444	flags |= RTF_UP;
2445	return flags;
2446}
2447
2448/*
2449 *	This outputs /proc/net/route.
2450 *	The format of the file is not supposed to be changed
2451 * 	and needs to be same as fib_hash output to avoid breaking
2452 *	legacy utilities
2453 */
2454static int fib_route_seq_show(struct seq_file *seq, void *v)
2455{
2456	const struct fib_trie_iter *iter = seq->private;
2457	struct leaf *l = v;
2458	struct leaf_info *li;
2459	struct hlist_node *node;
2460
2461	if (v == SEQ_START_TOKEN) {
2462		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2463			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2464			   "\tWindow\tIRTT");
2465		return 0;
2466	}
2467
2468	if (iter->trie == iter->trie_local)
2469		return 0;
2470
2471	if (IS_TNODE(l))
2472		return 0;
2473
2474	hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2475		struct fib_alias *fa;
2476		__be32 mask, prefix;
2477
2478		if (!li)
2479			continue;
2480
2481		mask = inet_make_mask(li->plen);
2482		prefix = htonl(l->key);
2483
2484		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2485			const struct fib_info *fi = fa->fa_info;
2486			unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
2487			char bf[128];
2488
2489			if (fa->fa_type == RTN_BROADCAST
2490			    || fa->fa_type == RTN_MULTICAST)
2491				continue;
2492
2493			if (fi)
2494				snprintf(bf, sizeof(bf),
2495					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2496					 fi->fib_dev ? fi->fib_dev->name : "*",
2497					 prefix,
2498					 fi->fib_nh->nh_gw, flags, 0, 0,
2499					 fi->fib_priority,
2500					 mask,
2501					 (fi->fib_advmss ?
2502					  fi->fib_advmss + 40 : 0),
2503					 fi->fib_window,
2504					 fi->fib_rtt >> 3);
2505			else
2506				snprintf(bf, sizeof(bf),
2507					 "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2508					 prefix, 0, flags, 0, 0, 0,
2509					 mask, 0, 0, 0);
2510
2511			seq_printf(seq, "%-127s\n", bf);
2512		}
2513	}
2514
2515	return 0;
2516}
2517
2518static const struct seq_operations fib_route_seq_ops = {
2519	.start  = fib_trie_seq_start,
2520	.next   = fib_trie_seq_next,
2521	.stop   = fib_trie_seq_stop,
2522	.show   = fib_route_seq_show,
2523};
2524
2525static int fib_route_seq_open(struct inode *inode, struct file *file)
2526{
2527	return seq_open_net(inode, file, &fib_route_seq_ops,
2528			    sizeof(struct fib_trie_iter));
2529}
2530
2531static const struct file_operations fib_route_fops = {
2532	.owner  = THIS_MODULE,
2533	.open   = fib_route_seq_open,
2534	.read   = seq_read,
2535	.llseek = seq_lseek,
2536	.release = seq_release_net,
2537};
2538
2539int __net_init fib_proc_init(struct net *net)
2540{
2541	if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
2542		goto out1;
2543
2544	if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
2545				  &fib_triestat_fops))
2546		goto out2;
2547
2548	if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
2549		goto out3;
2550
2551	return 0;
2552
2553out3:
2554	proc_net_remove(net, "fib_triestat");
2555out2:
2556	proc_net_remove(net, "fib_trie");
2557out1:
2558	return -ENOMEM;
2559}
2560
2561void __net_exit fib_proc_exit(struct net *net)
2562{
2563	proc_net_remove(net, "fib_trie");
2564	proc_net_remove(net, "fib_triestat");
2565	proc_net_remove(net, "route");
2566}
2567
2568#endif /* CONFIG_PROC_FS */
2569