fib_trie.c revision ac97f75faae2a18648145bc6bbcdd326bac6a1c2
1/*
2 *   This program is free software; you can redistribute it and/or
3 *   modify it under the terms of the GNU General Public License
4 *   as published by the Free Software Foundation; either version
5 *   2 of the License, or (at your option) any later version.
6 *
7 *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 *     & Swedish University of Agricultural Sciences.
9 *
10 *   Jens Laas <jens.laas@data.slu.se> Swedish University of
11 *     Agricultural Sciences.
12 *
13 *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
14 *
15 * This work is based on the LPC-trie which is originally described in:
16 *
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
25 * Version:	$Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
26 *
27 *
28 * Code from fib_hash has been reused which includes the following header:
29 *
30 *
31 * INET		An implementation of the TCP/IP protocol suite for the LINUX
32 *		operating system.  INET is implemented using the  BSD Socket
33 *		interface as the means of communication with the user level.
34 *
35 *		IPv4 FIB: lookup engine and maintenance routines.
36 *
37 *
38 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
39 *
40 *		This program is free software; you can redistribute it and/or
41 *		modify it under the terms of the GNU General Public License
42 *		as published by the Free Software Foundation; either version
43 *		2 of the License, or (at your option) any later version.
44 *
45 * Substantial contributions to this work come from:
46 *
47 *		David S. Miller, <davem@davemloft.net>
48 *		Stephen Hemminger <shemminger@osdl.org>
49 *		Paul E. McKenney <paulmck@us.ibm.com>
50 *		Patrick McHardy <kaber@trash.net>
51 */
52
53#define VERSION "0.408"
54
55#include <asm/uaccess.h>
56#include <asm/system.h>
57#include <linux/bitops.h>
58#include <linux/types.h>
59#include <linux/kernel.h>
60#include <linux/mm.h>
61#include <linux/string.h>
62#include <linux/socket.h>
63#include <linux/sockios.h>
64#include <linux/errno.h>
65#include <linux/in.h>
66#include <linux/inet.h>
67#include <linux/inetdevice.h>
68#include <linux/netdevice.h>
69#include <linux/if_arp.h>
70#include <linux/proc_fs.h>
71#include <linux/rcupdate.h>
72#include <linux/skbuff.h>
73#include <linux/netlink.h>
74#include <linux/init.h>
75#include <linux/list.h>
76#include <net/net_namespace.h>
77#include <net/ip.h>
78#include <net/protocol.h>
79#include <net/route.h>
80#include <net/tcp.h>
81#include <net/sock.h>
82#include <net/ip_fib.h>
83#include "fib_lookup.h"
84
85#define MAX_STAT_DEPTH 32
86
87#define KEYLENGTH (8*sizeof(t_key))
88
89typedef unsigned int t_key;
90
91#define T_TNODE 0
92#define T_LEAF  1
93#define NODE_TYPE_MASK	0x1UL
94#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
95
96#define IS_TNODE(n) (!(n->parent & T_LEAF))
97#define IS_LEAF(n) (n->parent & T_LEAF)
98
99struct node {
100	unsigned long parent;
101	t_key key;
102};
103
104struct leaf {
105	unsigned long parent;
106	t_key key;
107	struct hlist_head list;
108	struct rcu_head rcu;
109};
110
111struct leaf_info {
112	struct hlist_node hlist;
113	struct rcu_head rcu;
114	int plen;
115	struct list_head falh;
116};
117
118struct tnode {
119	unsigned long parent;
120	t_key key;
121	unsigned char pos;		/* log2(KEYLENGTH) bits needed */
122	unsigned char bits;		/* log2(KEYLENGTH) bits needed */
123	unsigned int full_children;	/* KEYLENGTH bits needed */
124	unsigned int empty_children;	/* KEYLENGTH bits needed */
125	struct rcu_head rcu;
126	struct node *child[0];
127};
128
129#ifdef CONFIG_IP_FIB_TRIE_STATS
130struct trie_use_stats {
131	unsigned int gets;
132	unsigned int backtrack;
133	unsigned int semantic_match_passed;
134	unsigned int semantic_match_miss;
135	unsigned int null_node_hit;
136	unsigned int resize_node_skipped;
137};
138#endif
139
140struct trie_stat {
141	unsigned int totdepth;
142	unsigned int maxdepth;
143	unsigned int tnodes;
144	unsigned int leaves;
145	unsigned int nullpointers;
146	unsigned int prefixes;
147	unsigned int nodesizes[MAX_STAT_DEPTH];
148};
149
150struct trie {
151	struct node *trie;
152#ifdef CONFIG_IP_FIB_TRIE_STATS
153	struct trie_use_stats stats;
154#endif
155};
156
157static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
158static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
159				  int wasfull);
160static struct node *resize(struct trie *t, struct tnode *tn);
161static struct tnode *inflate(struct trie *t, struct tnode *tn);
162static struct tnode *halve(struct trie *t, struct tnode *tn);
163static void tnode_free(struct tnode *tn);
164
165static struct kmem_cache *fn_alias_kmem __read_mostly;
166static struct kmem_cache *trie_leaf_kmem __read_mostly;
167
168static inline struct tnode *node_parent(struct node *node)
169{
170	return (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
171}
172
173static inline struct tnode *node_parent_rcu(struct node *node)
174{
175	struct tnode *ret = node_parent(node);
176
177	return rcu_dereference(ret);
178}
179
180static inline void node_set_parent(struct node *node, struct tnode *ptr)
181{
182	rcu_assign_pointer(node->parent,
183			   (unsigned long)ptr | NODE_TYPE(node));
184}
185
186static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i)
187{
188	BUG_ON(i >= 1U << tn->bits);
189
190	return tn->child[i];
191}
192
193static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
194{
195	struct node *ret = tnode_get_child(tn, i);
196
197	return rcu_dereference(ret);
198}
199
200static inline int tnode_child_length(const struct tnode *tn)
201{
202	return 1 << tn->bits;
203}
204
205static inline t_key mask_pfx(t_key k, unsigned short l)
206{
207	return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
208}
209
210static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
211{
212	if (offset < KEYLENGTH)
213		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
214	else
215		return 0;
216}
217
218static inline int tkey_equals(t_key a, t_key b)
219{
220	return a == b;
221}
222
223static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
224{
225	if (bits == 0 || offset >= KEYLENGTH)
226		return 1;
227	bits = bits > KEYLENGTH ? KEYLENGTH : bits;
228	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
229}
230
231static inline int tkey_mismatch(t_key a, int offset, t_key b)
232{
233	t_key diff = a ^ b;
234	int i = offset;
235
236	if (!diff)
237		return 0;
238	while ((diff << i) >> (KEYLENGTH-1) == 0)
239		i++;
240	return i;
241}
242
243/*
244  To understand this stuff, an understanding of keys and all their bits is
245  necessary. Every node in the trie has a key associated with it, but not
246  all of the bits in that key are significant.
247
248  Consider a node 'n' and its parent 'tp'.
249
250  If n is a leaf, every bit in its key is significant. Its presence is
251  necessitated by path compression, since during a tree traversal (when
252  searching for a leaf - unless we are doing an insertion) we will completely
253  ignore all skipped bits we encounter. Thus we need to verify, at the end of
254  a potentially successful search, that we have indeed been walking the
255  correct key path.
256
257  Note that we can never "miss" the correct key in the tree if present by
258  following the wrong path. Path compression ensures that segments of the key
259  that are the same for all keys with a given prefix are skipped, but the
260  skipped part *is* identical for each node in the subtrie below the skipped
261  bit! fib_insert_node() in this implementation takes care of that - note the
262  call to tkey_sub_equals() in fib_insert_node().
263
264  If n is an internal node - a 'tnode' here - the various parts of its key
265  have many different meanings.
266
267  Example:
268  _________________________________________________________________
269  | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
270  -----------------------------------------------------------------
271    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
272
273  _________________________________________________________________
274  | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
275  -----------------------------------------------------------------
276   16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
277
278  tp->pos = 7
279  tp->bits = 3
280  n->pos = 15
281  n->bits = 4
282
283  First, let's just ignore the bits that come before the parent tp, that is
284  the bits from 0 to (tp->pos-1). They are *known* but at this point we do
285  not use them for anything.
286
287  The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
288  index into the parent's child array. That is, they will be used to find
289  'n' among tp's children.
290
291  The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
292  for the node n.
293
294  All the bits we have seen so far are significant to the node n. The rest
295  of the bits are really not needed or indeed known in n->key.
296
297  The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
298  n's child array, and will of course be different for each child.
299
300
301  The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
302  at this point.
303
304*/
305
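/*
 * For illustration only - a minimal sketch that is not used by the code
 * below: one step of the descent described above.  The skipped bits
 * between the current position and tn->pos are checked with
 * tkey_sub_equals(), and the index bits are turned into a child slot with
 * tkey_extract_bits().  This is the same pattern used by fib_find_node()
 * and fib_insert_node() further down.
 */
static inline struct node *example_descend_once(struct tnode *tn, t_key key,
						int *pos)
{
	/* The skipped bits must match the search key, otherwise this
	 * subtrie cannot contain the key at all. */
	if (!tkey_sub_equals(tn->key, *pos, tn->pos - *pos, key))
		return NULL;

	/* Advance past this node's index bits and pick the child. */
	*pos = tn->pos + tn->bits;
	return tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
}
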
306static inline void check_tnode(const struct tnode *tn)
307{
308	WARN_ON(tn && tn->pos+tn->bits > 32);
309}
310
311static const int halve_threshold = 25;
312static const int inflate_threshold = 50;
313static const int halve_threshold_root = 8;
314static const int inflate_threshold_root = 15;
315
316
317static void __alias_free_mem(struct rcu_head *head)
318{
319	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
320	kmem_cache_free(fn_alias_kmem, fa);
321}
322
323static inline void alias_free_mem_rcu(struct fib_alias *fa)
324{
325	call_rcu(&fa->rcu, __alias_free_mem);
326}
327
328static void __leaf_free_rcu(struct rcu_head *head)
329{
330	struct leaf *l = container_of(head, struct leaf, rcu);
331	kmem_cache_free(trie_leaf_kmem, l);
332}
333
334static void __leaf_info_free_rcu(struct rcu_head *head)
335{
336	kfree(container_of(head, struct leaf_info, rcu));
337}
338
339static inline void free_leaf_info(struct leaf_info *leaf)
340{
341	call_rcu(&leaf->rcu, __leaf_info_free_rcu);
342}
343
344static struct tnode *tnode_alloc(size_t size)
345{
346	struct page *pages;
347
348	if (size <= PAGE_SIZE)
349		return kzalloc(size, GFP_KERNEL);
350
351	pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size));
352	if (!pages)
353		return NULL;
354
355	return page_address(pages);
356}
357
358static void __tnode_free_rcu(struct rcu_head *head)
359{
360	struct tnode *tn = container_of(head, struct tnode, rcu);
361	size_t size = sizeof(struct tnode) +
362		      (sizeof(struct node *) << tn->bits);
363
364	if (size <= PAGE_SIZE)
365		kfree(tn);
366	else
367		free_pages((unsigned long)tn, get_order(size));
368}
369
370static inline void tnode_free(struct tnode *tn)
371{
372	if (IS_LEAF(tn)) {
373		struct leaf *l = (struct leaf *) tn;
374		call_rcu_bh(&l->rcu, __leaf_free_rcu);
375	} else
376		call_rcu(&tn->rcu, __tnode_free_rcu);
377}
378
379static struct leaf *leaf_new(void)
380{
381	struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
382	if (l) {
383		l->parent = T_LEAF;
384		INIT_HLIST_HEAD(&l->list);
385	}
386	return l;
387}
388
389static struct leaf_info *leaf_info_new(int plen)
390{
391	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
392	if (li) {
393		li->plen = plen;
394		INIT_LIST_HEAD(&li->falh);
395	}
396	return li;
397}
398
399static struct tnode *tnode_new(t_key key, int pos, int bits)
400{
401	size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits);
402	struct tnode *tn = tnode_alloc(sz);
403
404	if (tn) {
405		tn->parent = T_TNODE;
406		tn->pos = pos;
407		tn->bits = bits;
408		tn->key = key;
409		tn->full_children = 0;
410		tn->empty_children = 1<<bits;
411	}
412
413	pr_debug("AT %p s=%u %lu\n", tn, (unsigned int) sizeof(struct tnode),
414		 (unsigned long) (sizeof(struct node) << bits));
415	return tn;
416}
417
418/*
419 * Check whether a tnode 'n' is "full", i.e. it is an internal node
420 * and no bits are skipped. See discussion in dyntree paper p. 6
421 */
422
423static inline int tnode_full(const struct tnode *tn, const struct node *n)
424{
425	if (n == NULL || IS_LEAF(n))
426		return 0;
427
428	return ((struct tnode *) n)->pos == tn->pos + tn->bits;
429}
430
431static inline void put_child(struct trie *t, struct tnode *tn, int i,
432			     struct node *n)
433{
434	tnode_put_child_reorg(tn, i, n, -1);
435}
436
437 /*
438  * Add a child at position i overwriting the old value.
439  * Update the value of full_children and empty_children.
440  */
441
442static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
443				  int wasfull)
444{
445	struct node *chi = tn->child[i];
446	int isfull;
447
448	BUG_ON(i >= 1<<tn->bits);
449
450	/* update emptyChildren */
451	if (n == NULL && chi != NULL)
452		tn->empty_children++;
453	else if (n != NULL && chi == NULL)
454		tn->empty_children--;
455
456	/* update fullChildren */
457	if (wasfull == -1)
458		wasfull = tnode_full(tn, chi);
459
460	isfull = tnode_full(tn, n);
461	if (wasfull && !isfull)
462		tn->full_children--;
463	else if (!wasfull && isfull)
464		tn->full_children++;
465
466	if (n)
467		node_set_parent(n, tn);
468
469	rcu_assign_pointer(tn->child[i], n);
470}
471
472static struct node *resize(struct trie *t, struct tnode *tn)
473{
474	int i;
475	int err = 0;
476	struct tnode *old_tn;
477	int inflate_threshold_use;
478	int halve_threshold_use;
479	int max_resize;
480
481	if (!tn)
482		return NULL;
483
484	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
485		 tn, inflate_threshold, halve_threshold);
486
487	/* No children */
488	if (tn->empty_children == tnode_child_length(tn)) {
489		tnode_free(tn);
490		return NULL;
491	}
492	/* One child */
493	if (tn->empty_children == tnode_child_length(tn) - 1)
494		for (i = 0; i < tnode_child_length(tn); i++) {
495			struct node *n;
496
497			n = tn->child[i];
498			if (!n)
499				continue;
500
501			/* compress one level */
502			node_set_parent(n, NULL);
503			tnode_free(tn);
504			return n;
505		}
506	/*
507	 * Double as long as the resulting node's ratio of non-empty
508	 * children stays above the inflate threshold.
509	 */
510
511	/*
512	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
513	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
514	 * Telecommunications, page 6:
515	 * "A node is doubled if the ratio of non-empty children to all
516	 * children in the *doubled* node is at least 'high'."
517	 *
518	 * 'high' in this instance is the variable 'inflate_threshold'. It
519	 * is expressed as a percentage, so we multiply it with
520	 * tnode_child_length() and instead of multiplying by 2 (since the
521	 * child array will be doubled by inflate()) and multiplying
522	 * the left-hand side by 100 (to handle the percentage thing) we
523	 * multiply the left-hand side by 50.
524	 *
525	 * The left-hand side may look a bit weird: tnode_child_length(tn)
526	 * - tn->empty_children is of course the number of non-null children
527	 * in the current node. tn->full_children is the number of "full"
528	 * children, that is non-null tnodes with a skip value of 0.
529	 * All of those will be doubled in the resulting inflated tnode, so
530	 * we just count them one extra time here.
531	 *
532	 * A clearer way to write this would be:
533	 *
534	 * to_be_doubled = tn->full_children;
535	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
536	 *     tn->full_children;
537	 *
538	 * new_child_length = tnode_child_length(tn) * 2;
539	 *
540	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
541	 *      new_child_length;
542	 * if (new_fill_factor >= inflate_threshold)
543	 *
544	 * ...and so on, though it would mess up the while () loop.
545	 *
546	 * anyway,
547	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
548	 *      inflate_threshold
549	 *
550	 * avoid a division:
551	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
552	 *      inflate_threshold * new_child_length
553	 *
554	 * expand not_to_be_doubled and to_be_doubled, and shorten:
555	 * 100 * (tnode_child_length(tn) - tn->empty_children +
556	 *    tn->full_children) >= inflate_threshold * new_child_length
557	 *
558	 * expand new_child_length:
559	 * 100 * (tnode_child_length(tn) - tn->empty_children +
560	 *    tn->full_children) >=
561	 *      inflate_threshold * tnode_child_length(tn) * 2
562	 *
563	 * shorten again:
564	 * 50 * (tn->full_children + tnode_child_length(tn) -
565	 *    tn->empty_children) >= inflate_threshold *
566	 *    tnode_child_length(tn)
567	 *
568	 */
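	/*
	 * Worked example of the check below: with inflate_threshold = 50 and
	 * a (hypothetical) tnode of bits = 4 (16 slots) that has
	 * empty_children = 6 and full_children = 3, the left-hand side is
	 * 50 * (3 + 16 - 6) = 650 and the right-hand side is 50 * 16 = 800,
	 * so the node is not inflated; equivalently, the doubled node would
	 * only be (7 + 2*3)/32, i.e. about 41%, full - below the 50%
	 * threshold.
	 */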
569
570	check_tnode(tn);
571
572	/* Keep root node larger  */
573
574	if (!tn->parent)
575		inflate_threshold_use = inflate_threshold_root;
576	else
577		inflate_threshold_use = inflate_threshold;
578
579	err = 0;
580	max_resize = 10;
581	while ((tn->full_children > 0 &&  max_resize-- &&
582		50 * (tn->full_children + tnode_child_length(tn)
583		      - tn->empty_children)
584		>= inflate_threshold_use * tnode_child_length(tn))) {
585
586		old_tn = tn;
587		tn = inflate(t, tn);
588
589		if (IS_ERR(tn)) {
590			tn = old_tn;
591#ifdef CONFIG_IP_FIB_TRIE_STATS
592			t->stats.resize_node_skipped++;
593#endif
594			break;
595		}
596	}
597
598	if (max_resize < 0) {
599		if (!tn->parent)
600			pr_warning("Fix inflate_threshold_root."
601				   " Now=%d size=%d bits\n",
602				   inflate_threshold_root, tn->bits);
603		else
604			pr_warning("Fix inflate_threshold."
605				   " Now=%d size=%d bits\n",
606				   inflate_threshold, tn->bits);
607	}
608
609	check_tnode(tn);
610
611	/*
612	 * Halve as long as the ratio of non-empty children in this
613	 * node is below the halve threshold.
614	 */
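	/*
	 * Worked example of the check below: with halve_threshold = 25 and a
	 * (hypothetical) tnode of bits = 4 (16 slots), the condition
	 * 100 * (16 - empty_children) < 25 * 16 = 400 holds once fewer than
	 * 4 of the 16 children are non-empty, i.e. empty_children > 12.
	 */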
615
616
617	/* Keep root node larger  */
618
619	if (!tn->parent)
620		halve_threshold_use = halve_threshold_root;
621	else
622		halve_threshold_use = halve_threshold;
623
624	err = 0;
625	max_resize = 10;
626	while (tn->bits > 1 &&  max_resize-- &&
627	       100 * (tnode_child_length(tn) - tn->empty_children) <
628	       halve_threshold_use * tnode_child_length(tn)) {
629
630		old_tn = tn;
631		tn = halve(t, tn);
632		if (IS_ERR(tn)) {
633			tn = old_tn;
634#ifdef CONFIG_IP_FIB_TRIE_STATS
635			t->stats.resize_node_skipped++;
636#endif
637			break;
638		}
639	}
640
641	if (max_resize < 0) {
642		if (!tn->parent)
643			pr_warning("Fix halve_threshold_root."
644				   " Now=%d size=%d bits\n",
645				   halve_threshold_root, tn->bits);
646		else
647			pr_warning("Fix halve_threshold."
648				   " Now=%d size=%d bits\n",
649				   halve_threshold, tn->bits);
650	}
651
652	/* Only one child remains */
653	if (tn->empty_children == tnode_child_length(tn) - 1)
654		for (i = 0; i < tnode_child_length(tn); i++) {
655			struct node *n;
656
657			n = tn->child[i];
658			if (!n)
659				continue;
660
661			/* compress one level */
662
663			node_set_parent(n, NULL);
664			tnode_free(tn);
665			return n;
666		}
667
668	return (struct node *) tn;
669}
670
671static struct tnode *inflate(struct trie *t, struct tnode *tn)
672{
673	struct tnode *oldtnode = tn;
674	int olen = tnode_child_length(tn);
675	int i;
676
677	pr_debug("In inflate\n");
678
679	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
680
681	if (!tn)
682		return ERR_PTR(-ENOMEM);
683
684	/*
685	 * Preallocate and store tnodes before the actual work so we
686	 * don't get into an inconsistent state if memory allocation
687	 * fails. In case of failure we return the old node and the
688	 * inflate operation is abandoned.
689	 */
690
691	for (i = 0; i < olen; i++) {
692		struct tnode *inode;
693
694		inode = (struct tnode *) tnode_get_child(oldtnode, i);
695		if (inode &&
696		    IS_TNODE(inode) &&
697		    inode->pos == oldtnode->pos + oldtnode->bits &&
698		    inode->bits > 1) {
699			struct tnode *left, *right;
700			t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
701
702			left = tnode_new(inode->key&(~m), inode->pos + 1,
703					 inode->bits - 1);
704			if (!left)
705				goto nomem;
706
707			right = tnode_new(inode->key|m, inode->pos + 1,
708					  inode->bits - 1);
709
710			if (!right) {
711				tnode_free(left);
712				goto nomem;
713			}
714
715			put_child(t, tn, 2*i, (struct node *) left);
716			put_child(t, tn, 2*i+1, (struct node *) right);
717		}
718	}
719
720	for (i = 0; i < olen; i++) {
721		struct tnode *inode;
722		struct node *node = tnode_get_child(oldtnode, i);
723		struct tnode *left, *right;
724		int size, j;
725
726		/* An empty child */
727		if (node == NULL)
728			continue;
729
730		/* A leaf or an internal node with skipped bits */
731
732		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
733		   tn->pos + tn->bits - 1) {
734			if (tkey_extract_bits(node->key,
735					      oldtnode->pos + oldtnode->bits,
736					      1) == 0)
737				put_child(t, tn, 2*i, node);
738			else
739				put_child(t, tn, 2*i+1, node);
740			continue;
741		}
742
743		/* An internal node with two children */
744		inode = (struct tnode *) node;
745
746		if (inode->bits == 1) {
747			put_child(t, tn, 2*i, inode->child[0]);
748			put_child(t, tn, 2*i+1, inode->child[1]);
749
750			tnode_free(inode);
751			continue;
752		}
753
754		/* An internal node with more than two children */
755
756		/* We will replace this node 'inode' with two new
757		 * ones, 'left' and 'right', each with half of the
758		 * original children. The two new nodes will have
759		 * a position one bit further down the key and this
760		 * means that the "significant" part of their keys
761		 * (see the discussion near the top of this file)
762		 * will differ by one bit, which will be "0" in
763		 * left's key and "1" in right's key. Since we are
764		 * moving the key position by one step, the bit that
765		 * we are moving away from - the bit at position
766		 * (inode->pos) - is the one that will differ between
767		 * left and right. So... we synthesize that bit in the
768		 * two  new keys.
769		 * The mask 'm' below will be a single "one" bit at
770		 * the position (inode->pos)
771		 */
772
773		/* Use the old key, but set the new significant
774		 *   bit to zero.
775		 */
776
777		left = (struct tnode *) tnode_get_child(tn, 2*i);
778		put_child(t, tn, 2*i, NULL);
779
780		BUG_ON(!left);
781
782		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
783		put_child(t, tn, 2*i+1, NULL);
784
785		BUG_ON(!right);
786
787		size = tnode_child_length(left);
788		for (j = 0; j < size; j++) {
789			put_child(t, left, j, inode->child[j]);
790			put_child(t, right, j, inode->child[j + size]);
791		}
792		put_child(t, tn, 2*i, resize(t, left));
793		put_child(t, tn, 2*i+1, resize(t, right));
794
795		tnode_free(inode);
796	}
797	tnode_free(oldtnode);
798	return tn;
799nomem:
800	{
801		int size = tnode_child_length(tn);
802		int j;
803
804		for (j = 0; j < size; j++)
805			if (tn->child[j])
806				tnode_free((struct tnode *)tn->child[j]);
807
808		tnode_free(tn);
809
810		return ERR_PTR(-ENOMEM);
811	}
812}
813
814static struct tnode *halve(struct trie *t, struct tnode *tn)
815{
816	struct tnode *oldtnode = tn;
817	struct node *left, *right;
818	int i;
819	int olen = tnode_child_length(tn);
820
821	pr_debug("In halve\n");
822
823	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
824
825	if (!tn)
826		return ERR_PTR(-ENOMEM);
827
828	/*
829	 * Preallocate and store tnodes before the actual work so we
830	 * don't get into an inconsistent state if memory allocation
831	 * fails. In case of failure we return the old node and the
832	 * halve operation is abandoned.
833	 */
834
835	for (i = 0; i < olen; i += 2) {
836		left = tnode_get_child(oldtnode, i);
837		right = tnode_get_child(oldtnode, i+1);
838
839		/* Two nonempty children */
840		if (left && right) {
841			struct tnode *newn;
842
843			newn = tnode_new(left->key, tn->pos + tn->bits, 1);
844
845			if (!newn)
846				goto nomem;
847
848			put_child(t, tn, i/2, (struct node *)newn);
849		}
850
851	}
852
853	for (i = 0; i < olen; i += 2) {
854		struct tnode *newBinNode;
855
856		left = tnode_get_child(oldtnode, i);
857		right = tnode_get_child(oldtnode, i+1);
858
859		/* At least one of the children is empty */
860		if (left == NULL) {
861			if (right == NULL)    /* Both are empty */
862				continue;
863			put_child(t, tn, i/2, right);
864			continue;
865		}
866
867		if (right == NULL) {
868			put_child(t, tn, i/2, left);
869			continue;
870		}
871
872		/* Two nonempty children */
873		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
874		put_child(t, tn, i/2, NULL);
875		put_child(t, newBinNode, 0, left);
876		put_child(t, newBinNode, 1, right);
877		put_child(t, tn, i/2, resize(t, newBinNode));
878	}
879	tnode_free(oldtnode);
880	return tn;
881nomem:
882	{
883		int size = tnode_child_length(tn);
884		int j;
885
886		for (j = 0; j < size; j++)
887			if (tn->child[j])
888				tnode_free((struct tnode *)tn->child[j]);
889
890		tnode_free(tn);
891
892		return ERR_PTR(-ENOMEM);
893	}
894}
895
896/* The read side must hold rcu_read_lock(); currently this applies to
897 the dump routines via get_fa_head() and dump */
898
899static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
900{
901	struct hlist_head *head = &l->list;
902	struct hlist_node *node;
903	struct leaf_info *li;
904
905	hlist_for_each_entry_rcu(li, node, head, hlist)
906		if (li->plen == plen)
907			return li;
908
909	return NULL;
910}
911
912static inline struct list_head *get_fa_head(struct leaf *l, int plen)
913{
914	struct leaf_info *li = find_leaf_info(l, plen);
915
916	if (!li)
917		return NULL;
918
919	return &li->falh;
920}
921
922static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
923{
924	struct leaf_info *li = NULL, *last = NULL;
925	struct hlist_node *node;
926
927	if (hlist_empty(head)) {
928		hlist_add_head_rcu(&new->hlist, head);
929	} else {
930		hlist_for_each_entry(li, node, head, hlist) {
931			if (new->plen > li->plen)
932				break;
933
934			last = li;
935		}
936		if (last)
937			hlist_add_after_rcu(&last->hlist, &new->hlist);
938		else
939			hlist_add_before_rcu(&new->hlist, &li->hlist);
940	}
941}
942
943/* rcu_read_lock needs to be held by the caller on the read side */
944
945static struct leaf *
946fib_find_node(struct trie *t, u32 key)
947{
948	int pos;
949	struct tnode *tn;
950	struct node *n;
951
952	pos = 0;
953	n = rcu_dereference(t->trie);
954
955	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
956		tn = (struct tnode *) n;
957
958		check_tnode(tn);
959
960		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
961			pos = tn->pos + tn->bits;
962			n = tnode_get_child_rcu(tn,
963						tkey_extract_bits(key,
964								  tn->pos,
965								  tn->bits));
966		} else
967			break;
968	}
969	/* If we have found a leaf, compare prefixes */
970
971	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
972		return (struct leaf *)n;
973
974	return NULL;
975}
976
977static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
978{
979	int wasfull;
980	t_key cindex, key = tn->key;
981	struct tnode *tp;
982
983	while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
984		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
985		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
986		tn = (struct tnode *) resize(t, (struct tnode *)tn);
987
988		tnode_put_child_reorg((struct tnode *)tp, cindex,
989				      (struct node *)tn, wasfull);
990
991		tp = node_parent((struct node *) tn);
992		if (!tp)
993			break;
994		tn = tp;
995	}
996
997	/* Handle last (top) tnode */
998	if (IS_TNODE(tn))
999		tn = (struct tnode *)resize(t, (struct tnode *)tn);
1000
1001	return (struct node *)tn;
1002}
1003
1004/* only used from updater-side */
1005
1006static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1007{
1008	int pos, newpos;
1009	struct tnode *tp = NULL, *tn = NULL;
1010	struct node *n;
1011	struct leaf *l;
1012	int missbit;
1013	struct list_head *fa_head = NULL;
1014	struct leaf_info *li;
1015	t_key cindex;
1016
1017	pos = 0;
1018	n = t->trie;
1019
1020	/* If we point to NULL, stop. Either the tree is empty and we should
1021	 * just put a new leaf in it, or we have reached an empty child slot,
1022	 * and we should just put our new leaf in that.
1023	 * If we point to a T_TNODE, check if it matches our key. Note that
1024	 * a T_TNODE might be skipping any number of bits - its 'pos' need
1025	 * not be the parent's 'pos'+'bits'!
1026	 *
1027	 * If it does match the current key, get pos/bits from it, extract
1028	 * the index from our key, push the T_TNODE and walk the tree.
1029	 *
1030	 * If it doesn't, we have to replace it with a new T_TNODE.
1031	 *
1032	 * If we point to a T_LEAF, it might or might not have the same key
1033	 * as we do. If it does, just change the value, update the T_LEAF's
1034	 * value, and return it.
1035	 * If it doesn't, we need to replace it with a T_TNODE.
1036	 */
1037
1038	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
1039		tn = (struct tnode *) n;
1040
1041		check_tnode(tn);
1042
1043		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1044			tp = tn;
1045			pos = tn->pos + tn->bits;
1046			n = tnode_get_child(tn,
1047					    tkey_extract_bits(key,
1048							      tn->pos,
1049							      tn->bits));
1050
1051			BUG_ON(n && node_parent(n) != tn);
1052		} else
1053			break;
1054	}
1055
1056	/*
1057	 * n  ----> NULL, LEAF or TNODE
1058	 *
1059	 * tp is n's (parent) ----> NULL or TNODE
1060	 */
1061
1062	BUG_ON(tp && IS_LEAF(tp));
1063
1064	/* Case 1: n is a leaf. Compare prefixes */
1065
1066	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1067		l = (struct leaf *) n;
1068		li = leaf_info_new(plen);
1069
1070		if (!li)
1071			return NULL;
1072
1073		fa_head = &li->falh;
1074		insert_leaf_info(&l->list, li);
1075		goto done;
1076	}
1077	l = leaf_new();
1078
1079	if (!l)
1080		return NULL;
1081
1082	l->key = key;
1083	li = leaf_info_new(plen);
1084
1085	if (!li) {
1086		tnode_free((struct tnode *) l);
1087		return NULL;
1088	}
1089
1090	fa_head = &li->falh;
1091	insert_leaf_info(&l->list, li);
1092
1093	if (t->trie && n == NULL) {
1094		/* Case 2: n is NULL, and we will just insert a new leaf */
1095
1096		node_set_parent((struct node *)l, tp);
1097
1098		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1099		put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1100	} else {
1101		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1102		/*
1103		 *  Add a new tnode here
1104		 *  the first tnode needs some special handling
1105		 */
1106
1107		if (tp)
1108			pos = tp->pos+tp->bits;
1109		else
1110			pos = 0;
1111
1112		if (n) {
1113			newpos = tkey_mismatch(key, pos, n->key);
1114			tn = tnode_new(n->key, newpos, 1);
1115		} else {
1116			newpos = 0;
1117			tn = tnode_new(key, newpos, 1); /* First tnode */
1118		}
1119
1120		if (!tn) {
1121			free_leaf_info(li);
1122			tnode_free((struct tnode *) l);
1123			return NULL;
1124		}
1125
1126		node_set_parent((struct node *)tn, tp);
1127
1128		missbit = tkey_extract_bits(key, newpos, 1);
1129		put_child(t, tn, missbit, (struct node *)l);
1130		put_child(t, tn, 1-missbit, n);
1131
1132		if (tp) {
1133			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1134			put_child(t, (struct tnode *)tp, cindex,
1135				  (struct node *)tn);
1136		} else {
1137			rcu_assign_pointer(t->trie, (struct node *)tn);
1138			tp = tn;
1139		}
1140	}
1141
1142	if (tp && tp->pos + tp->bits > 32)
1143		pr_warning("fib_trie"
1144			   " tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1145			   tp, tp->pos, tp->bits, key, plen);
1146
1147	/* Rebalance the trie */
1148
1149	rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1150done:
1151	return fa_head;
1152}
1153
1154/*
1155 * Caller must hold RTNL.
1156 */
1157static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
1158{
1159	struct trie *t = (struct trie *) tb->tb_data;
1160	struct fib_alias *fa, *new_fa;
1161	struct list_head *fa_head = NULL;
1162	struct fib_info *fi;
1163	int plen = cfg->fc_dst_len;
1164	u8 tos = cfg->fc_tos;
1165	u32 key, mask;
1166	int err;
1167	struct leaf *l;
1168
1169	if (plen > 32)
1170		return -EINVAL;
1171
1172	key = ntohl(cfg->fc_dst);
1173
1174	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1175
1176	mask = ntohl(inet_make_mask(plen));
1177
1178	if (key & ~mask)
1179		return -EINVAL;
1180
1181	key = key & mask;
1182
1183	fi = fib_create_info(cfg);
1184	if (IS_ERR(fi)) {
1185		err = PTR_ERR(fi);
1186		goto err;
1187	}
1188
1189	l = fib_find_node(t, key);
1190	fa = NULL;
1191
1192	if (l) {
1193		fa_head = get_fa_head(l, plen);
1194		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1195	}
1196
1197	/* Now fa, if non-NULL, points to the first fib alias
1198	 * with the same keys [prefix,tos,priority], if such key already
1199	 * exists or to the node before which we will insert new one.
1200	 *
1201	 * If fa is NULL, we will need to allocate a new one and
1202	 * insert to the head of f.
1203	 *
1204	 * If f is NULL, no fib node matched the destination key
1205	 * and we need to allocate a new one of those as well.
1206	 */
1207
1208	if (fa && fa->fa_info->fib_priority == fi->fib_priority) {
1209		struct fib_alias *fa_orig;
1210
1211		err = -EEXIST;
1212		if (cfg->fc_nlflags & NLM_F_EXCL)
1213			goto out;
1214
1215		if (cfg->fc_nlflags & NLM_F_REPLACE) {
1216			struct fib_info *fi_drop;
1217			u8 state;
1218
1219			if (fi->fib_treeref > 1)
1220				goto out;
1221
1222			err = -ENOBUFS;
1223			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1224			if (new_fa == NULL)
1225				goto out;
1226
1227			fi_drop = fa->fa_info;
1228			new_fa->fa_tos = fa->fa_tos;
1229			new_fa->fa_info = fi;
1230			new_fa->fa_type = cfg->fc_type;
1231			new_fa->fa_scope = cfg->fc_scope;
1232			state = fa->fa_state;
1233			new_fa->fa_state &= ~FA_S_ACCESSED;
1234
1235			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1236			alias_free_mem_rcu(fa);
1237
1238			fib_release_info(fi_drop);
1239			if (state & FA_S_ACCESSED)
1240				rt_cache_flush(-1);
1241			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1242				tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1243
1244			goto succeeded;
1245		}
1246		/* Error if we find a perfect match which
1247		 * uses the same scope, type, and nexthop
1248		 * information.
1249		 */
1250		fa_orig = fa;
1251		list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
1252			if (fa->fa_tos != tos)
1253				break;
1254			if (fa->fa_info->fib_priority != fi->fib_priority)
1255				break;
1256			if (fa->fa_type == cfg->fc_type &&
1257			    fa->fa_scope == cfg->fc_scope &&
1258			    fa->fa_info == fi)
1259				goto out;
1260		}
1261
1262		if (!(cfg->fc_nlflags & NLM_F_APPEND))
1263			fa = fa_orig;
1264	}
1265	err = -ENOENT;
1266	if (!(cfg->fc_nlflags & NLM_F_CREATE))
1267		goto out;
1268
1269	err = -ENOBUFS;
1270	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1271	if (new_fa == NULL)
1272		goto out;
1273
1274	new_fa->fa_info = fi;
1275	new_fa->fa_tos = tos;
1276	new_fa->fa_type = cfg->fc_type;
1277	new_fa->fa_scope = cfg->fc_scope;
1278	new_fa->fa_state = 0;
1279	/*
1280	 * Insert new entry to the list.
1281	 */
1282
1283	if (!fa_head) {
1284		fa_head = fib_insert_node(t, key, plen);
1285		if (unlikely(!fa_head)) {
1286			err = -ENOMEM;
1287			goto out_free_new_fa;
1288		}
1289	}
1290
1291	list_add_tail_rcu(&new_fa->fa_list,
1292			  (fa ? &fa->fa_list : fa_head));
1293
1294	rt_cache_flush(-1);
1295	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1296		  &cfg->fc_nlinfo, 0);
1297succeeded:
1298	return 0;
1299
1300out_free_new_fa:
1301	kmem_cache_free(fn_alias_kmem, new_fa);
1302out:
1303	fib_release_info(fi);
1304err:
1305	return err;
1306}
1307
1308/* should be called with rcu_read_lock */
1309static int check_leaf(struct trie *t, struct leaf *l,
1310		      t_key key,  const struct flowi *flp,
1311		      struct fib_result *res)
1312{
1313	struct leaf_info *li;
1314	struct hlist_head *hhead = &l->list;
1315	struct hlist_node *node;
1316
1317	hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1318		int err;
1319		int plen = li->plen;
1320		__be32 mask = inet_make_mask(plen);
1321
1322		if (l->key != (key & ntohl(mask)))
1323			continue;
1324
1325		err = fib_semantic_match(&li->falh, flp, res,
1326					 htonl(l->key), mask, plen);
1327
1328#ifdef CONFIG_IP_FIB_TRIE_STATS
1329		if (err <= 0)
1330			t->stats.semantic_match_passed++;
1331		else
1332			t->stats.semantic_match_miss++;
1333#endif
1334		if (err <= 0)
1335			return plen;
1336	}
1337
1338	return -1;
1339}
1340
1341static int fn_trie_lookup(struct fib_table *tb, const struct flowi *flp,
1342			  struct fib_result *res)
1343{
1344	struct trie *t = (struct trie *) tb->tb_data;
1345	int plen, ret = 0;
1346	struct node *n;
1347	struct tnode *pn;
1348	int pos, bits;
1349	t_key key = ntohl(flp->fl4_dst);
1350	int chopped_off;
1351	t_key cindex = 0;
1352	int current_prefix_length = KEYLENGTH;
1353	struct tnode *cn;
1354	t_key node_prefix, key_prefix, pref_mismatch;
1355	int mp;
1356
1357	rcu_read_lock();
1358
1359	n = rcu_dereference(t->trie);
1360	if (!n)
1361		goto failed;
1362
1363#ifdef CONFIG_IP_FIB_TRIE_STATS
1364	t->stats.gets++;
1365#endif
1366
1367	/* Just a leaf? */
1368	if (IS_LEAF(n)) {
1369		plen = check_leaf(t, (struct leaf *)n, key, flp, res);
1370		if (plen < 0)
1371			goto failed;
1372		ret = 0;
1373		goto found;
1374	}
1375
1376	pn = (struct tnode *) n;
1377	chopped_off = 0;
1378
1379	while (pn) {
1380		pos = pn->pos;
1381		bits = pn->bits;
1382
1383		if (!chopped_off)
1384			cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
1385						   pos, bits);
1386
1387		n = tnode_get_child(pn, cindex);
1388
1389		if (n == NULL) {
1390#ifdef CONFIG_IP_FIB_TRIE_STATS
1391			t->stats.null_node_hit++;
1392#endif
1393			goto backtrace;
1394		}
1395
1396		if (IS_LEAF(n)) {
1397			plen = check_leaf(t, (struct leaf *)n, key, flp, res);
1398			if (plen < 0)
1399				goto backtrace;
1400
1401			ret = 0;
1402			goto found;
1403		}
1404
1405		cn = (struct tnode *)n;
1406
1407		/*
1408		 * It's a tnode, and we can do some extra checks here if we
1409		 * like, to avoid descending into a dead-end branch.
1410		 * This tnode is in the parent's child array at index
1411		 * key[p_pos..p_pos+p_bits] but potentially with some bits
1412		 * chopped off, so in reality the index may be just a
1413		 * subprefix, padded with zero at the end.
1414		 * We can also take a look at any skipped bits in this
1415		 * tnode - everything up to p_pos is supposed to be ok,
1416		 * and the non-chopped bits of the index (see the previous
1417		 * paragraph) are also guaranteed ok, but the rest is
1418		 * considered unknown.
1419		 *
1420		 * The skipped bits are key[pos+bits..cn->pos].
1421		 */
1422
1423		/* If current_prefix_length < pos+bits, we are already doing
1424		 * actual prefix  matching, which means everything from
1425		 * pos+(bits-chopped_off) onward must be zero along some
1426		 * branch of this subtree - otherwise there is *no* valid
1427		 * prefix present. Here we can only check the skipped
1428		 * bits. Remember, since we have already indexed into the
1429		 * parent's child array, we know that the bits we chopped off
1430		 * *are* zero.
1431		 */
1432
1433		/* NOTA BENE: Checking only skipped bits
1434		   for the new node here */
1435
1436		if (current_prefix_length < pos+bits) {
1437			if (tkey_extract_bits(cn->key, current_prefix_length,
1438						cn->pos - current_prefix_length)
1439			    || !(cn->child[0]))
1440				goto backtrace;
1441		}
1442
1443		/*
1444		 * If chopped_off=0, the index is fully validated and we
1445		 * only need to look at the skipped bits for this, the new,
1446		 * tnode. What we actually want to do is to find out if
1447		 * these skipped bits match our key perfectly, or if we will
1448		 * have to count on finding a matching prefix further down,
1449		 * because if we do, we would like to have some way of
1450		 * verifying the existence of such a prefix at this point.
1451		 */
1452
1453		/* The only thing we can do at this point is to verify that
1454		 * any such matching prefix can indeed be a prefix to our
1455		 * key, and if the bits in the node we are inspecting that
1456		 * do not match our key are not ZERO, this cannot be true.
1457		 * Thus, find out where there is a mismatch (before cn->pos)
1458		 * and verify that all the mismatching bits are zero in the
1459		 * new tnode's key.
1460		 */
1461
1462		/*
1463		 * Note: We aren't very concerned about the part of the key
1464		 * that precedes pn->pos+pn->bits, since those bits have
1465		 * already been checked. The bits after cn->pos
1466		 * aren't checked since these are by definition
1467		 * "unknown" at this point. Thus, what we want to see
1468		 * is if we are about to enter the "prefix matching"
1469		 * state, and in that case verify that the skipped
1470		 * bits that will prevail throughout this subtree are
1471		 * zero, as they have to be if we are to find a
1472		 * matching prefix.
1473		 */
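		/*
		 * For example, if cn->pos is 12 and the first bit where the
		 * key and cn->key differ is bit 9, then mp becomes 9 below
		 * and a usable prefix in this subtree can be at most 9 bits
		 * long.  That is only possible if bits 9..11 of cn->key are
		 * all zero; otherwise we backtrack.
		 */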
1474
1475		node_prefix = mask_pfx(cn->key, cn->pos);
1476		key_prefix = mask_pfx(key, cn->pos);
1477		pref_mismatch = key_prefix^node_prefix;
1478		mp = 0;
1479
1480		/*
1481		 * In short: If skipped bits in this node do not match
1482		 * the search key, enter the "prefix matching"
1483		 * state directly.
1484		 */
1485		if (pref_mismatch) {
1486			while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1487				mp++;
1488				pref_mismatch = pref_mismatch << 1;
1489			}
1490			key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1491
1492			if (key_prefix != 0)
1493				goto backtrace;
1494
1495			if (current_prefix_length >= cn->pos)
1496				current_prefix_length = mp;
1497		}
1498
1499		pn = (struct tnode *)n; /* Descend */
1500		chopped_off = 0;
1501		continue;
1502
1503backtrace:
1504		chopped_off++;
1505
1506		/* Skip bits that are already zero - clearing them won't change cindex */
1507		while ((chopped_off <= pn->bits)
1508		       && !(cindex & (1<<(chopped_off-1))))
1509			chopped_off++;
1510
1511		/* Decrease current_... with bits chopped off */
1512		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1513			current_prefix_length = pn->pos + pn->bits
1514				- chopped_off;
1515
1516		/*
1517		 * Either we do the actual chop off accordingly, or, if we have
1518		 * chopped off all the bits in this tnode, we walk up to our parent.
1519		 */
1520
1521		if (chopped_off <= pn->bits) {
1522			cindex &= ~(1 << (chopped_off-1));
1523		} else {
1524			struct tnode *parent = node_parent((struct node *) pn);
1525			if (!parent)
1526				goto failed;
1527
1528			/* Get Child's index */
1529			cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
1530			pn = parent;
1531			chopped_off = 0;
1532
1533#ifdef CONFIG_IP_FIB_TRIE_STATS
1534			t->stats.backtrack++;
1535#endif
1536			goto backtrace;
1537		}
1538	}
1539failed:
1540	ret = 1;
1541found:
1542	rcu_read_unlock();
1543	return ret;
1544}
1545
1546/*
1547 * Remove the leaf and rebalance the trie from its parent.
1548 */
1549static void trie_leaf_remove(struct trie *t, struct leaf *l)
1550{
1551	struct tnode *tp = node_parent((struct node *) l);
1552
1553	pr_debug("entering trie_leaf_remove(%p)\n", l);
1554
1555	if (tp) {
1556		t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
1557		put_child(t, (struct tnode *)tp, cindex, NULL);
1558		rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1559	} else
1560		rcu_assign_pointer(t->trie, NULL);
1561
1562	tnode_free((struct tnode *) l);
1563}
1564
1565/*
1566 * Caller must hold RTNL.
1567 */
1568static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg)
1569{
1570	struct trie *t = (struct trie *) tb->tb_data;
1571	u32 key, mask;
1572	int plen = cfg->fc_dst_len;
1573	u8 tos = cfg->fc_tos;
1574	struct fib_alias *fa, *fa_to_delete;
1575	struct list_head *fa_head;
1576	struct leaf *l;
1577	struct leaf_info *li;
1578
1579	if (plen > 32)
1580		return -EINVAL;
1581
1582	key = ntohl(cfg->fc_dst);
1583	mask = ntohl(inet_make_mask(plen));
1584
1585	if (key & ~mask)
1586		return -EINVAL;
1587
1588	key = key & mask;
1589	l = fib_find_node(t, key);
1590
1591	if (!l)
1592		return -ESRCH;
1593
1594	fa_head = get_fa_head(l, plen);
1595	fa = fib_find_alias(fa_head, tos, 0);
1596
1597	if (!fa)
1598		return -ESRCH;
1599
1600	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1601
1602	fa_to_delete = NULL;
1603	fa_head = fa->fa_list.prev;
1604
1605	list_for_each_entry(fa, fa_head, fa_list) {
1606		struct fib_info *fi = fa->fa_info;
1607
1608		if (fa->fa_tos != tos)
1609			break;
1610
1611		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1612		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1613		     fa->fa_scope == cfg->fc_scope) &&
1614		    (!cfg->fc_protocol ||
1615		     fi->fib_protocol == cfg->fc_protocol) &&
1616		    fib_nh_match(cfg, fi) == 0) {
1617			fa_to_delete = fa;
1618			break;
1619		}
1620	}
1621
1622	if (!fa_to_delete)
1623		return -ESRCH;
1624
1625	fa = fa_to_delete;
1626	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1627		  &cfg->fc_nlinfo, 0);
1628
1629	l = fib_find_node(t, key);
1630	li = find_leaf_info(l, plen);
1631
1632	list_del_rcu(&fa->fa_list);
1633
1634	if (list_empty(fa_head)) {
1635		hlist_del_rcu(&li->hlist);
1636		free_leaf_info(li);
1637	}
1638
1639	if (hlist_empty(&l->list))
1640		trie_leaf_remove(t, l);
1641
1642	if (fa->fa_state & FA_S_ACCESSED)
1643		rt_cache_flush(-1);
1644
1645	fib_release_info(fa->fa_info);
1646	alias_free_mem_rcu(fa);
1647	return 0;
1648}
1649
1650static int trie_flush_list(struct trie *t, struct list_head *head)
1651{
1652	struct fib_alias *fa, *fa_node;
1653	int found = 0;
1654
1655	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1656		struct fib_info *fi = fa->fa_info;
1657
1658		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1659			list_del_rcu(&fa->fa_list);
1660			fib_release_info(fa->fa_info);
1661			alias_free_mem_rcu(fa);
1662			found++;
1663		}
1664	}
1665	return found;
1666}
1667
1668static int trie_flush_leaf(struct trie *t, struct leaf *l)
1669{
1670	int found = 0;
1671	struct hlist_head *lih = &l->list;
1672	struct hlist_node *node, *tmp;
1673	struct leaf_info *li = NULL;
1674
1675	hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1676		found += trie_flush_list(t, &li->falh);
1677
1678		if (list_empty(&li->falh)) {
1679			hlist_del_rcu(&li->hlist);
1680			free_leaf_info(li);
1681		}
1682	}
1683	return found;
1684}
1685
1686/*
1687 * Scan for the next leaf to the right, starting at node p->child[idx].
1688 * Since we have back pointers, no recursion is necessary.
1689 */
1690static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
1691{
1692	do {
1693		t_key idx;
1694
1695		if (c)
1696			idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
1697		else
1698			idx = 0;
1699
1700		while (idx < 1u << p->bits) {
1701			c = tnode_get_child_rcu(p, idx++);
1702			if (!c)
1703				continue;
1704
1705			if (IS_LEAF(c)) {
1706				prefetch(p->child[idx]);
1707				return (struct leaf *) c;
1708			}
1709
1710			/* Descend and restart scanning in the new node */
1711			p = (struct tnode *) c;
1712			idx = 0;
1713		}
1714
1715		/* Node empty, walk back up to parent */
1716		c = (struct node *) p;
1717	} while ((p = node_parent_rcu(c)) != NULL);
1718
1719	return NULL; /* Root of trie */
1720}
1721
1722static struct leaf *trie_firstleaf(struct trie *t)
1723{
1724	struct tnode *n = (struct tnode *) rcu_dereference(t->trie);
1725
1726	if (!n)
1727		return NULL;
1728
1729	if (IS_LEAF(n))          /* trie is just a leaf */
1730		return (struct leaf *) n;
1731
1732	return leaf_walk_rcu(n, NULL);
1733}
1734
1735static struct leaf *trie_nextleaf(struct leaf *l)
1736{
1737	struct node *c = (struct node *) l;
1738	struct tnode *p = node_parent(c);
1739
1740	if (!p)
1741		return NULL;	/* trie with just one leaf */
1742
1743	return leaf_walk_rcu(p, c);
1744}
1745
1746/*
1747 * Caller must hold RTNL.
1748 */
1749static int fn_trie_flush(struct fib_table *tb)
1750{
1751	struct trie *t = (struct trie *) tb->tb_data;
1752	struct leaf *l, *ll = NULL;
1753	int found = 0;
1754
1755	for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
1756		found += trie_flush_leaf(t, l);
1757
1758		if (ll && hlist_empty(&ll->list))
1759			trie_leaf_remove(t, ll);
1760		ll = l;
1761	}
1762
1763	if (ll && hlist_empty(&ll->list))
1764		trie_leaf_remove(t, ll);
1765
1766	pr_debug("trie_flush found=%d\n", found);
1767	return found;
1768}
1769
1770static void fn_trie_select_default(struct fib_table *tb,
1771				   const struct flowi *flp,
1772				   struct fib_result *res)
1773{
1774	struct trie *t = (struct trie *) tb->tb_data;
1775	int order, last_idx;
1776	struct fib_info *fi = NULL;
1777	struct fib_info *last_resort;
1778	struct fib_alias *fa = NULL;
1779	struct list_head *fa_head;
1780	struct leaf *l;
1781
1782	last_idx = -1;
1783	last_resort = NULL;
1784	order = -1;
1785
1786	rcu_read_lock();
1787
1788	l = fib_find_node(t, 0);
1789	if (!l)
1790		goto out;
1791
1792	fa_head = get_fa_head(l, 0);
1793	if (!fa_head)
1794		goto out;
1795
1796	if (list_empty(fa_head))
1797		goto out;
1798
1799	list_for_each_entry_rcu(fa, fa_head, fa_list) {
1800		struct fib_info *next_fi = fa->fa_info;
1801
1802		if (fa->fa_scope != res->scope ||
1803		    fa->fa_type != RTN_UNICAST)
1804			continue;
1805
1806		if (next_fi->fib_priority > res->fi->fib_priority)
1807			break;
1808		if (!next_fi->fib_nh[0].nh_gw ||
1809		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1810			continue;
1811		fa->fa_state |= FA_S_ACCESSED;
1812
1813		if (fi == NULL) {
1814			if (next_fi != res->fi)
1815				break;
1816		} else if (!fib_detect_death(fi, order, &last_resort,
1817					     &last_idx, tb->tb_default)) {
1818			fib_result_assign(res, fi);
1819			tb->tb_default = order;
1820			goto out;
1821		}
1822		fi = next_fi;
1823		order++;
1824	}
1825	if (order <= 0 || fi == NULL) {
1826		tb->tb_default = -1;
1827		goto out;
1828	}
1829
1830	if (!fib_detect_death(fi, order, &last_resort, &last_idx,
1831				tb->tb_default)) {
1832		fib_result_assign(res, fi);
1833		tb->tb_default = order;
1834		goto out;
1835	}
1836	if (last_idx >= 0)
1837		fib_result_assign(res, last_resort);
1838	tb->tb_default = last_idx;
1839out:
1840	rcu_read_unlock();
1841}
1842
1843static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1844			   struct fib_table *tb,
1845			   struct sk_buff *skb, struct netlink_callback *cb)
1846{
1847	int i, s_i;
1848	struct fib_alias *fa;
1849	__be32 xkey = htonl(key);
1850
1851	s_i = cb->args[4];
1852	i = 0;
1853
1854	/* rcu_read_lock is held by the caller */
1855
1856	list_for_each_entry_rcu(fa, fah, fa_list) {
1857		if (i < s_i) {
1858			i++;
1859			continue;
1860		}
1861
1862		if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1863				  cb->nlh->nlmsg_seq,
1864				  RTM_NEWROUTE,
1865				  tb->tb_id,
1866				  fa->fa_type,
1867				  fa->fa_scope,
1868				  xkey,
1869				  plen,
1870				  fa->fa_tos,
1871				  fa->fa_info, NLM_F_MULTI) < 0) {
1872			cb->args[4] = i;
1873			return -1;
1874		}
1875		i++;
1876	}
1877	cb->args[4] = i;
1878	return skb->len;
1879}
1880
1881static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
1882			struct sk_buff *skb, struct netlink_callback *cb)
1883{
1884	struct leaf_info *li;
1885	struct hlist_node *node;
1886	int i, s_i;
1887
1888	s_i = cb->args[3];
1889	i = 0;
1890
1891	/* rcu_read_lock is held by the caller */
1892	hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
1893		if (i < s_i) {
1894			i++;
1895			continue;
1896		}
1897
1898		if (i > s_i)
1899			cb->args[4] = 0;
1900
1901		if (list_empty(&li->falh))
1902			continue;
1903
1904		if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
1905			cb->args[3] = i;
1906			return -1;
1907		}
1908		i++;
1909	}
1910
1911	cb->args[3] = i;
1912	return skb->len;
1913}
1914
1915static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb,
1916			struct netlink_callback *cb)
1917{
1918	struct leaf *l;
1919	struct trie *t = (struct trie *) tb->tb_data;
1920	t_key key = cb->args[2];
1921
1922	rcu_read_lock();
1923	/* Dump starting at last key.
1924	 * Note: 0.0.0.0/0 (ie default) is first key.
1925	 * Note: 0.0.0.0/0 (i.e. the default route) is the first key.
1926	if (!key)
1927		l = trie_firstleaf(t);
1928	else {
1929		l = fib_find_node(t, key);
1930		if (!l) {
1931			/* The table changed during the dump; rather than
1932			 * giving partial data, just make the application retry.
1933			 */
1934			rcu_read_unlock();
1935			return -EBUSY;
1936		}
1937	}
1938
1939	while (l) {
1940		cb->args[2] = l->key;
1941		if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
1942			rcu_read_unlock();
1943			return -1;
1944		}
1945
1946		l = trie_nextleaf(l);
1947		memset(&cb->args[3], 0,
1948		       sizeof(cb->args) - 3*sizeof(cb->args[0]));
1949	}
1950	rcu_read_unlock();
1951
1952	return skb->len;
1953}
1954
1955void __init fib_hash_init(void)
1956{
1957	fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1958					  sizeof(struct fib_alias),
1959					  0, SLAB_PANIC, NULL);
1960
1961	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
1962					   max(sizeof(struct leaf),
1963					       sizeof(struct leaf_info)),
1964					   0, SLAB_PANIC, NULL);
1965}
1966
1967
1968/* Fix more generic FIB names for init later */
1969struct fib_table *fib_hash_table(u32 id)
1970{
1971	struct fib_table *tb;
1972	struct trie *t;
1973
1974	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1975		     GFP_KERNEL);
1976	if (tb == NULL)
1977		return NULL;
1978
1979	tb->tb_id = id;
1980	tb->tb_default = -1;
1981	tb->tb_lookup = fn_trie_lookup;
1982	tb->tb_insert = fn_trie_insert;
1983	tb->tb_delete = fn_trie_delete;
1984	tb->tb_flush = fn_trie_flush;
1985	tb->tb_select_default = fn_trie_select_default;
1986	tb->tb_dump = fn_trie_dump;
1987
1988	t = (struct trie *) tb->tb_data;
1989	memset(t, 0, sizeof(*t));
1990
1991	if (id == RT_TABLE_LOCAL)
1992		pr_info("IPv4 FIB: Using LC-trie version %s\n", VERSION);
1993
1994	return tb;
1995}
1996
1997#ifdef CONFIG_PROC_FS
1998/* Depth first Trie walk iterator */
1999struct fib_trie_iter {
2000	struct seq_net_private p;
2001	struct trie *trie_local, *trie_main;
2002	struct tnode *tnode;
2003	struct trie *trie;
2004	unsigned index;
2005	unsigned depth;
2006};
2007
2008static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
2009{
2010	struct tnode *tn = iter->tnode;
2011	unsigned cindex = iter->index;
2012	struct tnode *p;
2013
2014	/* A single entry routing table */
2015	if (!tn)
2016		return NULL;
2017
2018	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2019		 iter->tnode, iter->index, iter->depth);
2020rescan:
2021	while (cindex < (1<<tn->bits)) {
2022		struct node *n = tnode_get_child_rcu(tn, cindex);
2023
2024		if (n) {
2025			if (IS_LEAF(n)) {
2026				iter->tnode = tn;
2027				iter->index = cindex + 1;
2028			} else {
2029				/* push down one level */
2030				iter->tnode = (struct tnode *) n;
2031				iter->index = 0;
2032				++iter->depth;
2033			}
2034			return n;
2035		}
2036
2037		++cindex;
2038	}
2039
2040	/* Current node exhausted, pop back up */
2041	p = node_parent_rcu((struct node *)tn);
2042	if (p) {
2043		cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2044		tn = p;
2045		--iter->depth;
2046		goto rescan;
2047	}
2048
2049	/* got root? */
2050	return NULL;
2051}
2052
2053static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2054				       struct trie *t)
2055{
2056	struct node *n;
2057
2058	if (!t)
2059		return NULL;
2060
2061	n = rcu_dereference(t->trie);
2062
2063	if (!iter)
2064		return NULL;
2065
2066	if (n) {
2067		if (IS_TNODE(n)) {
2068			iter->tnode = (struct tnode *) n;
2069			iter->trie = t;
2070			iter->index = 0;
2071			iter->depth = 1;
2072		} else {
2073			iter->tnode = NULL;
2074			iter->trie  = t;
2075			iter->index = 0;
2076			iter->depth = 0;
2077		}
2078		return n;
2079	}
2080	return NULL;
2081}
2082
2083static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2084{
2085	struct node *n;
2086	struct fib_trie_iter iter;
2087
2088	memset(s, 0, sizeof(*s));
2089
2090	rcu_read_lock();
2091	for (n = fib_trie_get_first(&iter, t); n;
2092	     n = fib_trie_get_next(&iter)) {
2093		if (IS_LEAF(n)) {
2094			struct leaf *l = (struct leaf *)n;
2095			struct leaf_info *li;
2096			struct hlist_node *tmp;
2097
2098			s->leaves++;
2099			s->totdepth += iter.depth;
2100			if (iter.depth > s->maxdepth)
2101				s->maxdepth = iter.depth;
2102
2103			hlist_for_each_entry_rcu(li, tmp, &l->list, hlist)
2104				++s->prefixes;
2105		} else {
2106			const struct tnode *tn = (const struct tnode *) n;
2107			int i;
2108
2109			s->tnodes++;
2110			if (tn->bits < MAX_STAT_DEPTH)
2111				s->nodesizes[tn->bits]++;
2112
2113			for (i = 0; i < (1<<tn->bits); i++)
2114				if (!tn->child[i])
2115					s->nullpointers++;
2116		}
2117	}
2118	rcu_read_unlock();
2119}
2120
2121/*
2122 *	This outputs /proc/net/fib_triestats
2123 */
2124static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2125{
2126	unsigned i, max, pointers, bytes, avdepth;
2127
2128	if (stat->leaves)
2129		avdepth = stat->totdepth*100 / stat->leaves;
2130	else
2131		avdepth = 0;
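	/*
	 * avdepth holds the average depth scaled by 100 so it can be
	 * printed with two decimals below; e.g. (hypothetical numbers)
	 * totdepth 250 over 100 leaves gives avdepth 250, shown as "2.50".
	 */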
2132
2133	seq_printf(seq, "\tAver depth:     %u.%02d\n",
2134		   avdepth / 100, avdepth % 100);
2135	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
2136
2137	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
2138	bytes = sizeof(struct leaf) * stat->leaves;
2139
2140	seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
2141	bytes += sizeof(struct leaf_info) * stat->prefixes;
2142
2143	seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2144	bytes += sizeof(struct tnode) * stat->tnodes;
2145
2146	max = MAX_STAT_DEPTH;
2147	while (max > 0 && stat->nodesizes[max-1] == 0)
2148		max--;
2149
2150	pointers = 0;
2151	for (i = 1; i < max; i++)
2152		if (stat->nodesizes[i] != 0) {
2153			seq_printf(seq, "  %u: %u",  i, stat->nodesizes[i]);
2154			pointers += (1<<i) * stat->nodesizes[i];
2155		}
2156	seq_putc(seq, '\n');
2157	seq_printf(seq, "\tPointers: %u\n", pointers);
2158
2159	bytes += sizeof(struct node *) * pointers;
2160	seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2161	seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
2162}
2163
2164#ifdef CONFIG_IP_FIB_TRIE_STATS
2165static void trie_show_usage(struct seq_file *seq,
2166			    const struct trie_use_stats *stats)
2167{
2168	seq_printf(seq, "\nCounters:\n---------\n");
2169	seq_printf(seq, "gets = %u\n", stats->gets);
2170	seq_printf(seq, "backtracks = %u\n", stats->backtrack);
2171	seq_printf(seq, "semantic match passed = %u\n",
2172		   stats->semantic_match_passed);
2173	seq_printf(seq, "semantic match miss = %u\n",
2174		   stats->semantic_match_miss);
2175	seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
2176	seq_printf(seq, "skipped node resize = %u\n\n",
2177		   stats->resize_node_skipped);
2178}
2179#endif /*  CONFIG_IP_FIB_TRIE_STATS */
2180
2181static void fib_trie_show(struct seq_file *seq, const char *name,
2182			  struct trie *trie)
2183{
2184	struct trie_stat stat;
2185
2186	trie_collect_stats(trie, &stat);
2187	seq_printf(seq, "%s:\n", name);
2188	trie_show_stats(seq, &stat);
2189#ifdef CONFIG_IP_FIB_TRIE_STATS
2190	trie_show_usage(seq, &trie->stats);
2191#endif
2192}
2193
2194static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2195{
2196	struct net *net = (struct net *)seq->private;
2197	struct fib_table *tb;
2198
2199	seq_printf(seq,
2200		   "Basic info: size of leaf:"
2201		   " %Zd bytes, size of tnode: %Zd bytes.\n",
2202		   sizeof(struct leaf), sizeof(struct tnode));
2203
2204	tb = fib_get_table(net, RT_TABLE_LOCAL);
2205	if (tb)
2206		fib_trie_show(seq, "Local", (struct trie *) tb->tb_data);
2207
2208	tb = fib_get_table(net, RT_TABLE_MAIN);
2209	if (tb)
2210		fib_trie_show(seq, "Main", (struct trie *) tb->tb_data);
2211
2212	return 0;
2213}
2214
2215static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2216{
2217	int err;
2218	struct net *net;
2219
2220	net = get_proc_net(inode);
2221	if (net == NULL)
2222		return -ENXIO;
2223	err = single_open(file, fib_triestat_seq_show, net);
2224	if (err < 0) {
2225		put_net(net);
2226		return err;
2227	}
2228	return 0;
2229}
2230
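/*
 * Drop the namespace reference that get_proc_net() took in
 * fib_triestat_seq_open() before doing the normal single_release().
 */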
2231static int fib_triestat_seq_release(struct inode *ino, struct file *f)
2232{
2233	struct seq_file *seq = f->private_data;
2234	put_net(seq->private);
2235	return single_release(ino, f);
2236}
2237
2238static const struct file_operations fib_triestat_fops = {
2239	.owner	= THIS_MODULE,
2240	.open	= fib_triestat_seq_open,
2241	.read	= seq_read,
2242	.llseek	= seq_lseek,
2243	.release = fib_triestat_seq_release,
2244};
2245
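/*
 * Treat the local trie followed by the main trie as one flat,
 * zero-based sequence of nodes and return the node at position pos.
 */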
2246static struct node *fib_trie_get_idx(struct fib_trie_iter *iter,
2247				      loff_t pos)
2248{
2249	loff_t idx = 0;
2250	struct node *n;
2251
2252	for (n = fib_trie_get_first(iter, iter->trie_local);
2253	     n; ++idx, n = fib_trie_get_next(iter)) {
2254		if (pos == idx)
2255			return n;
2256	}
2257
2258	for (n = fib_trie_get_first(iter, iter->trie_main);
2259	     n; ++idx, n = fib_trie_get_next(iter)) {
2260		if (pos == idx)
2261			return n;
2262	}
2263	return NULL;
2264}
2265
2266static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2267	__acquires(RCU)
2268{
2269	struct fib_trie_iter *iter = seq->private;
2270	struct fib_table *tb;
2271
2272	if (!iter->trie_local) {
2273		tb = fib_get_table(iter->p.net, RT_TABLE_LOCAL);
2274		if (tb)
2275			iter->trie_local = (struct trie *) tb->tb_data;
2276	}
2277	if (!iter->trie_main) {
2278		tb = fib_get_table(iter->p.net, RT_TABLE_MAIN);
2279		if (tb)
2280			iter->trie_main = (struct trie *) tb->tb_data;
2281	}
2282	rcu_read_lock();
2283	if (*pos == 0)
2284		return SEQ_START_TOKEN;
2285	return fib_trie_get_idx(iter, *pos - 1);
2286}
2287
2288static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2289{
2290	struct fib_trie_iter *iter = seq->private;
2291	void *l = v;
2292
2293	++*pos;
2294	if (v == SEQ_START_TOKEN)
2295		return fib_trie_get_idx(iter, 0);
2296
2297	v = fib_trie_get_next(iter);
2298	BUG_ON(v == l);
2299	if (v)
2300		return v;
2301
2302	/* continue scan in next trie */
2303	if (iter->trie == iter->trie_local)
2304		return fib_trie_get_first(iter, iter->trie_main);
2305
2306	return NULL;
2307}
2308
2309static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2310	__releases(RCU)
2311{
2312	rcu_read_unlock();
2313}
2314
2315static void seq_indent(struct seq_file *seq, int n)
2316{
2317	while (n-- > 0) seq_puts(seq, "   ");
2318}
2319
2320static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2321{
2322	switch (s) {
2323	case RT_SCOPE_UNIVERSE: return "universe";
2324	case RT_SCOPE_SITE:	return "site";
2325	case RT_SCOPE_LINK:	return "link";
2326	case RT_SCOPE_HOST:	return "host";
2327	case RT_SCOPE_NOWHERE:	return "nowhere";
2328	default:
2329		snprintf(buf, len, "scope=%d", s);
2330		return buf;
2331	}
2332}
2333
2334static const char *rtn_type_names[__RTN_MAX] = {
2335	[RTN_UNSPEC] = "UNSPEC",
2336	[RTN_UNICAST] = "UNICAST",
2337	[RTN_LOCAL] = "LOCAL",
2338	[RTN_BROADCAST] = "BROADCAST",
2339	[RTN_ANYCAST] = "ANYCAST",
2340	[RTN_MULTICAST] = "MULTICAST",
2341	[RTN_BLACKHOLE] = "BLACKHOLE",
2342	[RTN_UNREACHABLE] = "UNREACHABLE",
2343	[RTN_PROHIBIT] = "PROHIBIT",
2344	[RTN_THROW] = "THROW",
2345	[RTN_NAT] = "NAT",
2346	[RTN_XRESOLVE] = "XRESOLVE",
2347};
2348
2349static inline const char *rtn_type(char *buf, size_t len, unsigned t)
2350{
2351	if (t < __RTN_MAX && rtn_type_names[t])
2352		return rtn_type_names[t];
2353	snprintf(buf, len, "type %u", t);
2354	return buf;
2355}
2356
2357/* Pretty print the trie */
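/*
 * Each internal node is printed as "+-- <prefix>/<pos> <bits>
 * <full_children> <empty_children>", each leaf as "|-- <address>",
 * and every alias hanging off a leaf as "/<plen> <scope> <type>",
 * indented three spaces per trie level.
 */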
2358static int fib_trie_seq_show(struct seq_file *seq, void *v)
2359{
2360	const struct fib_trie_iter *iter = seq->private;
2361	struct node *n = v;
2362
2363	if (v == SEQ_START_TOKEN)
2364		return 0;
2365
2366	if (!node_parent_rcu(n)) {
2367		if (iter->trie == iter->trie_local)
2368			seq_puts(seq, "<local>:\n");
2369		else
2370			seq_puts(seq, "<main>:\n");
2371	}
2372
2373	if (IS_TNODE(n)) {
2374		struct tnode *tn = (struct tnode *) n;
2375		__be32 prf = htonl(mask_pfx(tn->key, tn->pos));
2376
2377		seq_indent(seq, iter->depth-1);
2378		seq_printf(seq, "  +-- %d.%d.%d.%d/%d %d %d %d\n",
2379			   NIPQUAD(prf), tn->pos, tn->bits, tn->full_children,
2380			   tn->empty_children);
2381
2382	} else {
2383		struct leaf *l = (struct leaf *) n;
2384		struct leaf_info *li;
2385		struct hlist_node *node;
2386		__be32 val = htonl(l->key);
2387
2388		seq_indent(seq, iter->depth);
2389		seq_printf(seq, "  |-- %d.%d.%d.%d\n", NIPQUAD(val));
2390
2391		hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2392			struct fib_alias *fa;
2393
2394			list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2395				char buf1[32], buf2[32];
2396
2397				seq_indent(seq, iter->depth+1);
2398				seq_printf(seq, "  /%d %s %s", li->plen,
2399					   rtn_scope(buf1, sizeof(buf1),
2400						     fa->fa_scope),
2401					   rtn_type(buf2, sizeof(buf2),
2402						    fa->fa_type));
2403				if (fa->fa_tos)
2404					seq_printf(seq, " tos=%d",
2405						   fa->fa_tos);
2406				seq_putc(seq, '\n');
2407			}
2408		}
2409	}
2410
2411	return 0;
2412}
2413
2414static const struct seq_operations fib_trie_seq_ops = {
2415	.start  = fib_trie_seq_start,
2416	.next   = fib_trie_seq_next,
2417	.stop   = fib_trie_seq_stop,
2418	.show   = fib_trie_seq_show,
2419};
2420
2421static int fib_trie_seq_open(struct inode *inode, struct file *file)
2422{
2423	return seq_open_net(inode, file, &fib_trie_seq_ops,
2424			    sizeof(struct fib_trie_iter));
2425}
2426
2427static const struct file_operations fib_trie_fops = {
2428	.owner  = THIS_MODULE,
2429	.open   = fib_trie_seq_open,
2430	.read   = seq_read,
2431	.llseek = seq_lseek,
2432	.release = seq_release_net,
2433};
2434
2435static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2436{
2437	static unsigned type2flags[RTN_MAX + 1] = {
2438		[RTN_UNREACHABLE] = RTF_REJECT, [RTN_PROHIBIT] = RTF_REJECT,
2439	};
2440	unsigned flags = type2flags[type];
2441
2442	if (fi && fi->fib_nh->nh_gw)
2443		flags |= RTF_GATEWAY;
2444	if (mask == htonl(0xFFFFFFFF))
2445		flags |= RTF_HOST;
2446	flags |= RTF_UP;
2447	return flags;
2448}
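
/*
 * For example, a gatewayed unicast route comes out as
 * RTF_UP|RTF_GATEWAY (0x0003), and an RTN_UNREACHABLE host route with
 * no gateway as RTF_UP|RTF_HOST|RTF_REJECT (0x0205); this is what
 * appears in the Flags column of /proc/net/route below.
 */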
2449
2450/*
2451 *	This outputs /proc/net/route.
2452 *	The format of the file is not supposed to be changed
2453 *	and needs to be the same as the fib_hash output to avoid breaking
2454 *	legacy utilities.
2455 */
2456static int fib_route_seq_show(struct seq_file *seq, void *v)
2457{
2458	const struct fib_trie_iter *iter = seq->private;
2459	struct leaf *l = v;
2460	struct leaf_info *li;
2461	struct hlist_node *node;
2462
2463	if (v == SEQ_START_TOKEN) {
2464		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2465			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2466			   "\tWindow\tIRTT");
2467		return 0;
2468	}
2469
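	/* Only routes from the main trie are dumped; skip the local table */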
2470	if (iter->trie == iter->trie_local)
2471		return 0;
2472
2473	if (IS_TNODE(l))
2474		return 0;
2475
2476	hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2477		struct fib_alias *fa;
2478		__be32 mask, prefix;
2479
2480		mask = inet_make_mask(li->plen);
2481		prefix = htonl(l->key);
2482
2483		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2484			const struct fib_info *fi = fa->fa_info;
2485			unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
2486			char bf[128];
2487
2488			if (fa->fa_type == RTN_BROADCAST
2489			    || fa->fa_type == RTN_MULTICAST)
2490				continue;
2491
2492			if (fi)
2493				snprintf(bf, sizeof(bf),
2494					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2495					 fi->fib_dev ? fi->fib_dev->name : "*",
2496					 prefix,
2497					 fi->fib_nh->nh_gw, flags, 0, 0,
2498					 fi->fib_priority,
2499					 mask,
2500					 (fi->fib_advmss ?
2501					  fi->fib_advmss + 40 : 0),
2502					 fi->fib_window,
2503					 fi->fib_rtt >> 3);
2504			else
2505				snprintf(bf, sizeof(bf),
2506					 "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2507					 prefix, 0, flags, 0, 0, 0,
2508					 mask, 0, 0, 0);
2509
2510			seq_printf(seq, "%-127s\n", bf);
2511		}
2512	}
2513
2514	return 0;
2515}
2516
2517static const struct seq_operations fib_route_seq_ops = {
2518	.start  = fib_trie_seq_start,
2519	.next   = fib_trie_seq_next,
2520	.stop   = fib_trie_seq_stop,
2521	.show   = fib_route_seq_show,
2522};
2523
2524static int fib_route_seq_open(struct inode *inode, struct file *file)
2525{
2526	return seq_open_net(inode, file, &fib_route_seq_ops,
2527			    sizeof(struct fib_trie_iter));
2528}
2529
2530static const struct file_operations fib_route_fops = {
2531	.owner  = THIS_MODULE,
2532	.open   = fib_route_seq_open,
2533	.read   = seq_read,
2534	.llseek = seq_lseek,
2535	.release = seq_release_net,
2536};
2537
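/*
 * Create the per-namespace proc entries; on failure tear down the
 * ones already registered, in reverse order, and return -ENOMEM.
 */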
2538int __net_init fib_proc_init(struct net *net)
2539{
2540	if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
2541		goto out1;
2542
2543	if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
2544				  &fib_triestat_fops))
2545		goto out2;
2546
2547	if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
2548		goto out3;
2549
2550	return 0;
2551
2552out3:
2553	proc_net_remove(net, "fib_triestat");
2554out2:
2555	proc_net_remove(net, "fib_trie");
2556out1:
2557	return -ENOMEM;
2558}
2559
2560void __net_exit fib_proc_exit(struct net *net)
2561{
2562	proc_net_remove(net, "fib_trie");
2563	proc_net_remove(net, "fib_triestat");
2564	proc_net_remove(net, "route");
2565}
2566
2567#endif /* CONFIG_PROC_FS */
2568