fib_trie.c revision 15be75cdb5db442d0e33d37b20832b88f3ccd383
1/*
2 *   This program is free software; you can redistribute it and/or
3 *   modify it under the terms of the GNU General Public License
4 *   as published by the Free Software Foundation; either version
5 *   2 of the License, or (at your option) any later version.
6 *
7 *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 *     & Swedish University of Agricultural Sciences.
9 *
10 *   Jens Laas <jens.laas@data.slu.se> Swedish University of
11 *     Agricultural Sciences.
12 *
13 *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
14 *
15 * This work is based on the LPC-trie which is originally descibed in:
16 *
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
25 * Version:	$Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
26 *
27 *
28 * Code from fib_hash has been reused which includes the following header:
29 *
30 *
31 * INET		An implementation of the TCP/IP protocol suite for the LINUX
32 *		operating system.  INET is implemented using the  BSD Socket
33 *		interface as the means of communication with the user level.
34 *
35 *		IPv4 FIB: lookup engine and maintenance routines.
36 *
37 *
38 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
39 *
40 *		This program is free software; you can redistribute it and/or
41 *		modify it under the terms of the GNU General Public License
42 *		as published by the Free Software Foundation; either version
43 *		2 of the License, or (at your option) any later version.
44 *
45 * Substantial contributions to this work comes from:
46 *
47 *		David S. Miller, <davem@davemloft.net>
48 *		Stephen Hemminger <shemminger@osdl.org>
49 *		Paul E. McKenney <paulmck@us.ibm.com>
50 *		Patrick McHardy <kaber@trash.net>
51 */
52
53#define VERSION "0.408"
54
55#include <asm/uaccess.h>
56#include <asm/system.h>
57#include <linux/bitops.h>
58#include <linux/types.h>
59#include <linux/kernel.h>
60#include <linux/mm.h>
61#include <linux/string.h>
62#include <linux/socket.h>
63#include <linux/sockios.h>
64#include <linux/errno.h>
65#include <linux/in.h>
66#include <linux/inet.h>
67#include <linux/inetdevice.h>
68#include <linux/netdevice.h>
69#include <linux/if_arp.h>
70#include <linux/proc_fs.h>
71#include <linux/rcupdate.h>
72#include <linux/skbuff.h>
73#include <linux/netlink.h>
74#include <linux/init.h>
75#include <linux/list.h>
76#include <net/net_namespace.h>
77#include <net/ip.h>
78#include <net/protocol.h>
79#include <net/route.h>
80#include <net/tcp.h>
81#include <net/sock.h>
82#include <net/ip_fib.h>
83#include "fib_lookup.h"
84
85#define MAX_STAT_DEPTH 32
86
87#define KEYLENGTH (8*sizeof(t_key))
88
89typedef unsigned int t_key;
90
91#define T_TNODE 0
92#define T_LEAF  1
93#define NODE_TYPE_MASK	0x1UL
94#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
95
96#define IS_TNODE(n) (!(n->parent & T_LEAF))
97#define IS_LEAF(n) (n->parent & T_LEAF)
98
99struct node {
100	unsigned long parent;
101	t_key key;
102};
103
104struct leaf {
105	unsigned long parent;
106	t_key key;
107	struct hlist_head list;
108	struct rcu_head rcu;
109};
110
111struct leaf_info {
112	struct hlist_node hlist;
113	struct rcu_head rcu;
114	int plen;
115	struct list_head falh;
116};
117
118struct tnode {
119	unsigned long parent;
120	t_key key;
121	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
122	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
123	unsigned int full_children;	/* KEYLENGTH bits needed */
124	unsigned int empty_children;	/* KEYLENGTH bits needed */
125	union {
126		struct rcu_head rcu;
127		struct work_struct work;
128	};
129	struct node *child[0];
130};
131
132#ifdef CONFIG_IP_FIB_TRIE_STATS
133struct trie_use_stats {
134	unsigned int gets;
135	unsigned int backtrack;
136	unsigned int semantic_match_passed;
137	unsigned int semantic_match_miss;
138	unsigned int null_node_hit;
139	unsigned int resize_node_skipped;
140};
141#endif
142
143struct trie_stat {
144	unsigned int totdepth;
145	unsigned int maxdepth;
146	unsigned int tnodes;
147	unsigned int leaves;
148	unsigned int nullpointers;
149	unsigned int prefixes;
150	unsigned int nodesizes[MAX_STAT_DEPTH];
151};
152
153struct trie {
154	struct node *trie;
155#ifdef CONFIG_IP_FIB_TRIE_STATS
156	struct trie_use_stats stats;
157#endif
158};
159
160static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
161static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
162				  int wasfull);
163static struct node *resize(struct trie *t, struct tnode *tn);
164static struct tnode *inflate(struct trie *t, struct tnode *tn);
165static struct tnode *halve(struct trie *t, struct tnode *tn);
166static void tnode_free(struct tnode *tn);
167
168static struct kmem_cache *fn_alias_kmem __read_mostly;
169static struct kmem_cache *trie_leaf_kmem __read_mostly;
170
171static inline struct tnode *node_parent(struct node *node)
172{
173	return (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
174}
175
176static inline struct tnode *node_parent_rcu(struct node *node)
177{
178	struct tnode *ret = node_parent(node);
179
180	return rcu_dereference(ret);
181}
182
183/* Same as rcu_assign_pointer
184 * but that macro() assumes that value is a pointer.
185 */
186static inline void node_set_parent(struct node *node, struct tnode *ptr)
187{
188	smp_wmb();
189	node->parent = (unsigned long)ptr | NODE_TYPE(node);
190}
191
192static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i)
193{
194	BUG_ON(i >= 1U << tn->bits);
195
196	return tn->child[i];
197}
198
199static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
200{
201	struct node *ret = tnode_get_child(tn, i);
202
203	return rcu_dereference(ret);
204}
205
206static inline int tnode_child_length(const struct tnode *tn)
207{
208	return 1 << tn->bits;
209}
210
211static inline t_key mask_pfx(t_key k, unsigned short l)
212{
213	return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
214}
215
216static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
217{
218	if (offset < KEYLENGTH)
219		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
220	else
221		return 0;
222}
223
224static inline int tkey_equals(t_key a, t_key b)
225{
226	return a == b;
227}
228
229static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
230{
231	if (bits == 0 || offset >= KEYLENGTH)
232		return 1;
233	bits = bits > KEYLENGTH ? KEYLENGTH : bits;
234	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
235}
236
237static inline int tkey_mismatch(t_key a, int offset, t_key b)
238{
239	t_key diff = a ^ b;
240	int i = offset;
241
242	if (!diff)
243		return 0;
244	while ((diff << i) >> (KEYLENGTH-1) == 0)
245		i++;
246	return i;
247}
248
249/*
250  To understand this stuff, an understanding of keys and all their bits is
251  necessary. Every node in the trie has a key associated with it, but not
252  all of the bits in that key are significant.
253
254  Consider a node 'n' and its parent 'tp'.
255
256  If n is a leaf, every bit in its key is significant. Its presence is
257  necessitated by path compression, since during a tree traversal (when
258  searching for a leaf - unless we are doing an insertion) we will completely
259  ignore all skipped bits we encounter. Thus we need to verify, at the end of
260  a potentially successful search, that we have indeed been walking the
261  correct key path.
262
263  Note that we can never "miss" the correct key in the tree if present by
264  following the wrong path. Path compression ensures that segments of the key
265  that are the same for all keys with a given prefix are skipped, but the
266  skipped part *is* identical for each node in the subtrie below the skipped
267  bit! trie_insert() in this implementation takes care of that - note the
268  call to tkey_sub_equals() in trie_insert().
269
270  if n is an internal node - a 'tnode' here, the various parts of its key
271  have many different meanings.
272
273  Example:
274  _________________________________________________________________
275  | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
276  -----------------------------------------------------------------
277    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
278
279  _________________________________________________________________
280  | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
281  -----------------------------------------------------------------
282   16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
283
284  tp->pos = 7
285  tp->bits = 3
286  n->pos = 15
287  n->bits = 4
288
289  First, let's just ignore the bits that come before the parent tp, that is
290  the bits from 0 to (tp->pos-1). They are *known* but at this point we do
291  not use them for anything.
292
293  The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
294  index into the parent's child array. That is, they will be used to find
295  'n' among tp's children.
296
297  The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
298  for the node n.
299
300  All the bits we have seen so far are significant to the node n. The rest
301  of the bits are really not needed or indeed known in n->key.
302
303  The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
304  n's child array, and will of course be different for each child.
305
306
307  The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
308  at this point.
309
310*/
311
312static inline void check_tnode(const struct tnode *tn)
313{
314	WARN_ON(tn && tn->pos+tn->bits > 32);
315}
316
317static const int halve_threshold = 25;
318static const int inflate_threshold = 50;
319static const int halve_threshold_root = 8;
320static const int inflate_threshold_root = 15;
321
322
323static void __alias_free_mem(struct rcu_head *head)
324{
325	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
326	kmem_cache_free(fn_alias_kmem, fa);
327}
328
329static inline void alias_free_mem_rcu(struct fib_alias *fa)
330{
331	call_rcu(&fa->rcu, __alias_free_mem);
332}
333
334static void __leaf_free_rcu(struct rcu_head *head)
335{
336	struct leaf *l = container_of(head, struct leaf, rcu);
337	kmem_cache_free(trie_leaf_kmem, l);
338}
339
340static void __leaf_info_free_rcu(struct rcu_head *head)
341{
342	kfree(container_of(head, struct leaf_info, rcu));
343}
344
345static inline void free_leaf_info(struct leaf_info *leaf)
346{
347	call_rcu(&leaf->rcu, __leaf_info_free_rcu);
348}
349
350static struct tnode *tnode_alloc(size_t size)
351{
352	if (size <= PAGE_SIZE)
353		return kzalloc(size, GFP_KERNEL);
354	else
355		return __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
356}
357
358static void __tnode_vfree(struct work_struct *arg)
359{
360	struct tnode *tn = container_of(arg, struct tnode, work);
361	vfree(tn);
362}
363
364static void __tnode_free_rcu(struct rcu_head *head)
365{
366	struct tnode *tn = container_of(head, struct tnode, rcu);
367	size_t size = sizeof(struct tnode) +
368		      (sizeof(struct node *) << tn->bits);
369
370	if (size <= PAGE_SIZE)
371		kfree(tn);
372	else {
373		INIT_WORK(&tn->work, __tnode_vfree);
374		schedule_work(&tn->work);
375	}
376}
377
378static inline void tnode_free(struct tnode *tn)
379{
380	if (IS_LEAF(tn)) {
381		struct leaf *l = (struct leaf *) tn;
382		call_rcu_bh(&l->rcu, __leaf_free_rcu);
383	} else
384		call_rcu(&tn->rcu, __tnode_free_rcu);
385}
386
387static struct leaf *leaf_new(void)
388{
389	struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
390	if (l) {
391		l->parent = T_LEAF;
392		INIT_HLIST_HEAD(&l->list);
393	}
394	return l;
395}
396
397static struct leaf_info *leaf_info_new(int plen)
398{
399	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
400	if (li) {
401		li->plen = plen;
402		INIT_LIST_HEAD(&li->falh);
403	}
404	return li;
405}
406
407static struct tnode *tnode_new(t_key key, int pos, int bits)
408{
409	size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits);
410	struct tnode *tn = tnode_alloc(sz);
411
412	if (tn) {
413		tn->parent = T_TNODE;
414		tn->pos = pos;
415		tn->bits = bits;
416		tn->key = key;
417		tn->full_children = 0;
418		tn->empty_children = 1<<bits;
419	}
420
421	pr_debug("AT %p s=%u %lu\n", tn, (unsigned int) sizeof(struct tnode),
422		 (unsigned long) (sizeof(struct node) << bits));
423	return tn;
424}
425
426/*
427 * Check whether a tnode 'n' is "full", i.e. it is an internal node
428 * and no bits are skipped. See discussion in dyntree paper p. 6
429 */
430
431static inline int tnode_full(const struct tnode *tn, const struct node *n)
432{
433	if (n == NULL || IS_LEAF(n))
434		return 0;
435
436	return ((struct tnode *) n)->pos == tn->pos + tn->bits;
437}
438
439static inline void put_child(struct trie *t, struct tnode *tn, int i,
440			     struct node *n)
441{
442	tnode_put_child_reorg(tn, i, n, -1);
443}
444
445 /*
446  * Add a child at position i overwriting the old value.
447  * Update the value of full_children and empty_children.
448  */
449
450static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
451				  int wasfull)
452{
453	struct node *chi = tn->child[i];
454	int isfull;
455
456	BUG_ON(i >= 1<<tn->bits);
457
458	/* update emptyChildren */
459	if (n == NULL && chi != NULL)
460		tn->empty_children++;
461	else if (n != NULL && chi == NULL)
462		tn->empty_children--;
463
464	/* update fullChildren */
465	if (wasfull == -1)
466		wasfull = tnode_full(tn, chi);
467
468	isfull = tnode_full(tn, n);
469	if (wasfull && !isfull)
470		tn->full_children--;
471	else if (!wasfull && isfull)
472		tn->full_children++;
473
474	if (n)
475		node_set_parent(n, tn);
476
477	rcu_assign_pointer(tn->child[i], n);
478}
479
480static struct node *resize(struct trie *t, struct tnode *tn)
481{
482	int i;
483	int err = 0;
484	struct tnode *old_tn;
485	int inflate_threshold_use;
486	int halve_threshold_use;
487	int max_resize;
488
489	if (!tn)
490		return NULL;
491
492	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
493		 tn, inflate_threshold, halve_threshold);
494
495	/* No children */
496	if (tn->empty_children == tnode_child_length(tn)) {
497		tnode_free(tn);
498		return NULL;
499	}
500	/* One child */
501	if (tn->empty_children == tnode_child_length(tn) - 1)
502		for (i = 0; i < tnode_child_length(tn); i++) {
503			struct node *n;
504
505			n = tn->child[i];
506			if (!n)
507				continue;
508
509			/* compress one level */
510			node_set_parent(n, NULL);
511			tnode_free(tn);
512			return n;
513		}
514	/*
515	 * Double as long as the resulting node has a number of
516	 * nonempty nodes that are above the threshold.
517	 */
518
519	/*
520	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
521	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
522	 * Telecommunications, page 6:
523	 * "A node is doubled if the ratio of non-empty children to all
524	 * children in the *doubled* node is at least 'high'."
525	 *
526	 * 'high' in this instance is the variable 'inflate_threshold'. It
527	 * is expressed as a percentage, so we multiply it with
528	 * tnode_child_length() and instead of multiplying by 2 (since the
529	 * child array will be doubled by inflate()) and multiplying
530	 * the left-hand side by 100 (to handle the percentage thing) we
531	 * multiply the left-hand side by 50.
532	 *
533	 * The left-hand side may look a bit weird: tnode_child_length(tn)
534	 * - tn->empty_children is of course the number of non-null children
535	 * in the current node. tn->full_children is the number of "full"
536	 * children, that is non-null tnodes with a skip value of 0.
537	 * All of those will be doubled in the resulting inflated tnode, so
538	 * we just count them one extra time here.
539	 *
540	 * A clearer way to write this would be:
541	 *
542	 * to_be_doubled = tn->full_children;
543	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
544	 *     tn->full_children;
545	 *
546	 * new_child_length = tnode_child_length(tn) * 2;
547	 *
548	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
549	 *      new_child_length;
550	 * if (new_fill_factor >= inflate_threshold)
551	 *
552	 * ...and so on, tho it would mess up the while () loop.
553	 *
554	 * anyway,
555	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
556	 *      inflate_threshold
557	 *
558	 * avoid a division:
559	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
560	 *      inflate_threshold * new_child_length
561	 *
562	 * expand not_to_be_doubled and to_be_doubled, and shorten:
563	 * 100 * (tnode_child_length(tn) - tn->empty_children +
564	 *    tn->full_children) >= inflate_threshold * new_child_length
565	 *
566	 * expand new_child_length:
567	 * 100 * (tnode_child_length(tn) - tn->empty_children +
568	 *    tn->full_children) >=
569	 *      inflate_threshold * tnode_child_length(tn) * 2
570	 *
571	 * shorten again:
572	 * 50 * (tn->full_children + tnode_child_length(tn) -
573	 *    tn->empty_children) >= inflate_threshold *
574	 *    tnode_child_length(tn)
575	 *
576	 */
577
578	check_tnode(tn);
579
580	/* Keep root node larger  */
581
582	if (!tn->parent)
583		inflate_threshold_use = inflate_threshold_root;
584	else
585		inflate_threshold_use = inflate_threshold;
586
587	err = 0;
588	max_resize = 10;
589	while ((tn->full_children > 0 &&  max_resize-- &&
590		50 * (tn->full_children + tnode_child_length(tn)
591		      - tn->empty_children)
592		>= inflate_threshold_use * tnode_child_length(tn))) {
593
594		old_tn = tn;
595		tn = inflate(t, tn);
596
597		if (IS_ERR(tn)) {
598			tn = old_tn;
599#ifdef CONFIG_IP_FIB_TRIE_STATS
600			t->stats.resize_node_skipped++;
601#endif
602			break;
603		}
604	}
605
606	if (max_resize < 0) {
607		if (!tn->parent)
608			pr_warning("Fix inflate_threshold_root."
609				   " Now=%d size=%d bits\n",
610				   inflate_threshold_root, tn->bits);
611		else
612			pr_warning("Fix inflate_threshold."
613				   " Now=%d size=%d bits\n",
614				   inflate_threshold, tn->bits);
615	}
616
617	check_tnode(tn);
618
619	/*
620	 * Halve as long as the number of empty children in this
621	 * node is above threshold.
622	 */
623
624
625	/* Keep root node larger  */
626
627	if (!tn->parent)
628		halve_threshold_use = halve_threshold_root;
629	else
630		halve_threshold_use = halve_threshold;
631
632	err = 0;
633	max_resize = 10;
634	while (tn->bits > 1 &&  max_resize-- &&
635	       100 * (tnode_child_length(tn) - tn->empty_children) <
636	       halve_threshold_use * tnode_child_length(tn)) {
637
638		old_tn = tn;
639		tn = halve(t, tn);
640		if (IS_ERR(tn)) {
641			tn = old_tn;
642#ifdef CONFIG_IP_FIB_TRIE_STATS
643			t->stats.resize_node_skipped++;
644#endif
645			break;
646		}
647	}
648
649	if (max_resize < 0) {
650		if (!tn->parent)
651			pr_warning("Fix halve_threshold_root."
652				   " Now=%d size=%d bits\n",
653				   halve_threshold_root, tn->bits);
654		else
655			pr_warning("Fix halve_threshold."
656				   " Now=%d size=%d bits\n",
657				   halve_threshold, tn->bits);
658	}
659
660	/* Only one child remains */
661	if (tn->empty_children == tnode_child_length(tn) - 1)
662		for (i = 0; i < tnode_child_length(tn); i++) {
663			struct node *n;
664
665			n = tn->child[i];
666			if (!n)
667				continue;
668
669			/* compress one level */
670
671			node_set_parent(n, NULL);
672			tnode_free(tn);
673			return n;
674		}
675
676	return (struct node *) tn;
677}
678
679static struct tnode *inflate(struct trie *t, struct tnode *tn)
680{
681	struct tnode *oldtnode = tn;
682	int olen = tnode_child_length(tn);
683	int i;
684
685	pr_debug("In inflate\n");
686
687	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
688
689	if (!tn)
690		return ERR_PTR(-ENOMEM);
691
692	/*
693	 * Preallocate and store tnodes before the actual work so we
694	 * don't get into an inconsistent state if memory allocation
695	 * fails. In case of failure we return the oldnode and  inflate
696	 * of tnode is ignored.
697	 */
698
699	for (i = 0; i < olen; i++) {
700		struct tnode *inode;
701
702		inode = (struct tnode *) tnode_get_child(oldtnode, i);
703		if (inode &&
704		    IS_TNODE(inode) &&
705		    inode->pos == oldtnode->pos + oldtnode->bits &&
706		    inode->bits > 1) {
707			struct tnode *left, *right;
708			t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
709
710			left = tnode_new(inode->key&(~m), inode->pos + 1,
711					 inode->bits - 1);
712			if (!left)
713				goto nomem;
714
715			right = tnode_new(inode->key|m, inode->pos + 1,
716					  inode->bits - 1);
717
718			if (!right) {
719				tnode_free(left);
720				goto nomem;
721			}
722
723			put_child(t, tn, 2*i, (struct node *) left);
724			put_child(t, tn, 2*i+1, (struct node *) right);
725		}
726	}
727
728	for (i = 0; i < olen; i++) {
729		struct tnode *inode;
730		struct node *node = tnode_get_child(oldtnode, i);
731		struct tnode *left, *right;
732		int size, j;
733
734		/* An empty child */
735		if (node == NULL)
736			continue;
737
738		/* A leaf or an internal node with skipped bits */
739
740		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
741		   tn->pos + tn->bits - 1) {
742			if (tkey_extract_bits(node->key,
743					      oldtnode->pos + oldtnode->bits,
744					      1) == 0)
745				put_child(t, tn, 2*i, node);
746			else
747				put_child(t, tn, 2*i+1, node);
748			continue;
749		}
750
751		/* An internal node with two children */
752		inode = (struct tnode *) node;
753
754		if (inode->bits == 1) {
755			put_child(t, tn, 2*i, inode->child[0]);
756			put_child(t, tn, 2*i+1, inode->child[1]);
757
758			tnode_free(inode);
759			continue;
760		}
761
762		/* An internal node with more than two children */
763
764		/* We will replace this node 'inode' with two new
765		 * ones, 'left' and 'right', each with half of the
766		 * original children. The two new nodes will have
767		 * a position one bit further down the key and this
768		 * means that the "significant" part of their keys
769		 * (see the discussion near the top of this file)
770		 * will differ by one bit, which will be "0" in
771		 * left's key and "1" in right's key. Since we are
772		 * moving the key position by one step, the bit that
773		 * we are moving away from - the bit at position
774		 * (inode->pos) - is the one that will differ between
775		 * left and right. So... we synthesize that bit in the
776		 * two  new keys.
777		 * The mask 'm' below will be a single "one" bit at
778		 * the position (inode->pos)
779		 */
780
781		/* Use the old key, but set the new significant
782		 *   bit to zero.
783		 */
784
785		left = (struct tnode *) tnode_get_child(tn, 2*i);
786		put_child(t, tn, 2*i, NULL);
787
788		BUG_ON(!left);
789
790		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
791		put_child(t, tn, 2*i+1, NULL);
792
793		BUG_ON(!right);
794
795		size = tnode_child_length(left);
796		for (j = 0; j < size; j++) {
797			put_child(t, left, j, inode->child[j]);
798			put_child(t, right, j, inode->child[j + size]);
799		}
800		put_child(t, tn, 2*i, resize(t, left));
801		put_child(t, tn, 2*i+1, resize(t, right));
802
803		tnode_free(inode);
804	}
805	tnode_free(oldtnode);
806	return tn;
807nomem:
808	{
809		int size = tnode_child_length(tn);
810		int j;
811
812		for (j = 0; j < size; j++)
813			if (tn->child[j])
814				tnode_free((struct tnode *)tn->child[j]);
815
816		tnode_free(tn);
817
818		return ERR_PTR(-ENOMEM);
819	}
820}
821
822static struct tnode *halve(struct trie *t, struct tnode *tn)
823{
824	struct tnode *oldtnode = tn;
825	struct node *left, *right;
826	int i;
827	int olen = tnode_child_length(tn);
828
829	pr_debug("In halve\n");
830
831	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
832
833	if (!tn)
834		return ERR_PTR(-ENOMEM);
835
836	/*
837	 * Preallocate and store tnodes before the actual work so we
838	 * don't get into an inconsistent state if memory allocation
839	 * fails. In case of failure we return the oldnode and halve
840	 * of tnode is ignored.
841	 */
842
843	for (i = 0; i < olen; i += 2) {
844		left = tnode_get_child(oldtnode, i);
845		right = tnode_get_child(oldtnode, i+1);
846
847		/* Two nonempty children */
848		if (left && right) {
849			struct tnode *newn;
850
851			newn = tnode_new(left->key, tn->pos + tn->bits, 1);
852
853			if (!newn)
854				goto nomem;
855
856			put_child(t, tn, i/2, (struct node *)newn);
857		}
858
859	}
860
861	for (i = 0; i < olen; i += 2) {
862		struct tnode *newBinNode;
863
864		left = tnode_get_child(oldtnode, i);
865		right = tnode_get_child(oldtnode, i+1);
866
867		/* At least one of the children is empty */
868		if (left == NULL) {
869			if (right == NULL)    /* Both are empty */
870				continue;
871			put_child(t, tn, i/2, right);
872			continue;
873		}
874
875		if (right == NULL) {
876			put_child(t, tn, i/2, left);
877			continue;
878		}
879
880		/* Two nonempty children */
881		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
882		put_child(t, tn, i/2, NULL);
883		put_child(t, newBinNode, 0, left);
884		put_child(t, newBinNode, 1, right);
885		put_child(t, tn, i/2, resize(t, newBinNode));
886	}
887	tnode_free(oldtnode);
888	return tn;
889nomem:
890	{
891		int size = tnode_child_length(tn);
892		int j;
893
894		for (j = 0; j < size; j++)
895			if (tn->child[j])
896				tnode_free((struct tnode *)tn->child[j]);
897
898		tnode_free(tn);
899
900		return ERR_PTR(-ENOMEM);
901	}
902}
903
904/* readside must use rcu_read_lock currently dump routines
905 via get_fa_head and dump */
906
907static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
908{
909	struct hlist_head *head = &l->list;
910	struct hlist_node *node;
911	struct leaf_info *li;
912
913	hlist_for_each_entry_rcu(li, node, head, hlist)
914		if (li->plen == plen)
915			return li;
916
917	return NULL;
918}
919
920static inline struct list_head *get_fa_head(struct leaf *l, int plen)
921{
922	struct leaf_info *li = find_leaf_info(l, plen);
923
924	if (!li)
925		return NULL;
926
927	return &li->falh;
928}
929
930static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
931{
932	struct leaf_info *li = NULL, *last = NULL;
933	struct hlist_node *node;
934
935	if (hlist_empty(head)) {
936		hlist_add_head_rcu(&new->hlist, head);
937	} else {
938		hlist_for_each_entry(li, node, head, hlist) {
939			if (new->plen > li->plen)
940				break;
941
942			last = li;
943		}
944		if (last)
945			hlist_add_after_rcu(&last->hlist, &new->hlist);
946		else
947			hlist_add_before_rcu(&new->hlist, &li->hlist);
948	}
949}
950
951/* rcu_read_lock needs to be hold by caller from readside */
952
953static struct leaf *
954fib_find_node(struct trie *t, u32 key)
955{
956	int pos;
957	struct tnode *tn;
958	struct node *n;
959
960	pos = 0;
961	n = rcu_dereference(t->trie);
962
963	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
964		tn = (struct tnode *) n;
965
966		check_tnode(tn);
967
968		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
969			pos = tn->pos + tn->bits;
970			n = tnode_get_child_rcu(tn,
971						tkey_extract_bits(key,
972								  tn->pos,
973								  tn->bits));
974		} else
975			break;
976	}
977	/* Case we have found a leaf. Compare prefixes */
978
979	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
980		return (struct leaf *)n;
981
982	return NULL;
983}
984
985static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
986{
987	int wasfull;
988	t_key cindex, key = tn->key;
989	struct tnode *tp;
990
991	while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
992		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
993		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
994		tn = (struct tnode *) resize(t, (struct tnode *)tn);
995
996		tnode_put_child_reorg((struct tnode *)tp, cindex,
997				      (struct node *)tn, wasfull);
998
999		tp = node_parent((struct node *) tn);
1000		if (!tp)
1001			break;
1002		tn = tp;
1003	}
1004
1005	/* Handle last (top) tnode */
1006	if (IS_TNODE(tn))
1007		tn = (struct tnode *)resize(t, (struct tnode *)tn);
1008
1009	return (struct node *)tn;
1010}
1011
1012/* only used from updater-side */
1013
1014static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1015{
1016	int pos, newpos;
1017	struct tnode *tp = NULL, *tn = NULL;
1018	struct node *n;
1019	struct leaf *l;
1020	int missbit;
1021	struct list_head *fa_head = NULL;
1022	struct leaf_info *li;
1023	t_key cindex;
1024
1025	pos = 0;
1026	n = t->trie;
1027
1028	/* If we point to NULL, stop. Either the tree is empty and we should
1029	 * just put a new leaf in if, or we have reached an empty child slot,
1030	 * and we should just put our new leaf in that.
1031	 * If we point to a T_TNODE, check if it matches our key. Note that
1032	 * a T_TNODE might be skipping any number of bits - its 'pos' need
1033	 * not be the parent's 'pos'+'bits'!
1034	 *
1035	 * If it does match the current key, get pos/bits from it, extract
1036	 * the index from our key, push the T_TNODE and walk the tree.
1037	 *
1038	 * If it doesn't, we have to replace it with a new T_TNODE.
1039	 *
1040	 * If we point to a T_LEAF, it might or might not have the same key
1041	 * as we do. If it does, just change the value, update the T_LEAF's
1042	 * value, and return it.
1043	 * If it doesn't, we need to replace it with a T_TNODE.
1044	 */
1045
1046	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
1047		tn = (struct tnode *) n;
1048
1049		check_tnode(tn);
1050
1051		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1052			tp = tn;
1053			pos = tn->pos + tn->bits;
1054			n = tnode_get_child(tn,
1055					    tkey_extract_bits(key,
1056							      tn->pos,
1057							      tn->bits));
1058
1059			BUG_ON(n && node_parent(n) != tn);
1060		} else
1061			break;
1062	}
1063
1064	/*
1065	 * n  ----> NULL, LEAF or TNODE
1066	 *
1067	 * tp is n's (parent) ----> NULL or TNODE
1068	 */
1069
1070	BUG_ON(tp && IS_LEAF(tp));
1071
1072	/* Case 1: n is a leaf. Compare prefixes */
1073
1074	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1075		l = (struct leaf *) n;
1076		li = leaf_info_new(plen);
1077
1078		if (!li)
1079			return NULL;
1080
1081		fa_head = &li->falh;
1082		insert_leaf_info(&l->list, li);
1083		goto done;
1084	}
1085	l = leaf_new();
1086
1087	if (!l)
1088		return NULL;
1089
1090	l->key = key;
1091	li = leaf_info_new(plen);
1092
1093	if (!li) {
1094		tnode_free((struct tnode *) l);
1095		return NULL;
1096	}
1097
1098	fa_head = &li->falh;
1099	insert_leaf_info(&l->list, li);
1100
1101	if (t->trie && n == NULL) {
1102		/* Case 2: n is NULL, and will just insert a new leaf */
1103
1104		node_set_parent((struct node *)l, tp);
1105
1106		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1107		put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1108	} else {
1109		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1110		/*
1111		 *  Add a new tnode here
1112		 *  first tnode need some special handling
1113		 */
1114
1115		if (tp)
1116			pos = tp->pos+tp->bits;
1117		else
1118			pos = 0;
1119
1120		if (n) {
1121			newpos = tkey_mismatch(key, pos, n->key);
1122			tn = tnode_new(n->key, newpos, 1);
1123		} else {
1124			newpos = 0;
1125			tn = tnode_new(key, newpos, 1); /* First tnode */
1126		}
1127
1128		if (!tn) {
1129			free_leaf_info(li);
1130			tnode_free((struct tnode *) l);
1131			return NULL;
1132		}
1133
1134		node_set_parent((struct node *)tn, tp);
1135
1136		missbit = tkey_extract_bits(key, newpos, 1);
1137		put_child(t, tn, missbit, (struct node *)l);
1138		put_child(t, tn, 1-missbit, n);
1139
1140		if (tp) {
1141			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1142			put_child(t, (struct tnode *)tp, cindex,
1143				  (struct node *)tn);
1144		} else {
1145			rcu_assign_pointer(t->trie, (struct node *)tn);
1146			tp = tn;
1147		}
1148	}
1149
1150	if (tp && tp->pos + tp->bits > 32)
1151		pr_warning("fib_trie"
1152			   " tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1153			   tp, tp->pos, tp->bits, key, plen);
1154
1155	/* Rebalance the trie */
1156
1157	rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1158done:
1159	return fa_head;
1160}
1161
1162/*
1163 * Caller must hold RTNL.
1164 */
1165static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
1166{
1167	struct trie *t = (struct trie *) tb->tb_data;
1168	struct fib_alias *fa, *new_fa;
1169	struct list_head *fa_head = NULL;
1170	struct fib_info *fi;
1171	int plen = cfg->fc_dst_len;
1172	u8 tos = cfg->fc_tos;
1173	u32 key, mask;
1174	int err;
1175	struct leaf *l;
1176
1177	if (plen > 32)
1178		return -EINVAL;
1179
1180	key = ntohl(cfg->fc_dst);
1181
1182	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1183
1184	mask = ntohl(inet_make_mask(plen));
1185
1186	if (key & ~mask)
1187		return -EINVAL;
1188
1189	key = key & mask;
1190
1191	fi = fib_create_info(cfg);
1192	if (IS_ERR(fi)) {
1193		err = PTR_ERR(fi);
1194		goto err;
1195	}
1196
1197	l = fib_find_node(t, key);
1198	fa = NULL;
1199
1200	if (l) {
1201		fa_head = get_fa_head(l, plen);
1202		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1203	}
1204
1205	/* Now fa, if non-NULL, points to the first fib alias
1206	 * with the same keys [prefix,tos,priority], if such key already
1207	 * exists or to the node before which we will insert new one.
1208	 *
1209	 * If fa is NULL, we will need to allocate a new one and
1210	 * insert to the head of f.
1211	 *
1212	 * If f is NULL, no fib node matched the destination key
1213	 * and we need to allocate a new one of those as well.
1214	 */
1215
1216	if (fa && fa->fa_tos == tos &&
1217	    fa->fa_info->fib_priority == fi->fib_priority) {
1218		struct fib_alias *fa_first, *fa_match;
1219
1220		err = -EEXIST;
1221		if (cfg->fc_nlflags & NLM_F_EXCL)
1222			goto out;
1223
1224		/* We have 2 goals:
1225		 * 1. Find exact match for type, scope, fib_info to avoid
1226		 * duplicate routes
1227		 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
1228		 */
1229		fa_match = NULL;
1230		fa_first = fa;
1231		fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1232		list_for_each_entry_continue(fa, fa_head, fa_list) {
1233			if (fa->fa_tos != tos)
1234				break;
1235			if (fa->fa_info->fib_priority != fi->fib_priority)
1236				break;
1237			if (fa->fa_type == cfg->fc_type &&
1238			    fa->fa_scope == cfg->fc_scope &&
1239			    fa->fa_info == fi) {
1240				fa_match = fa;
1241				break;
1242			}
1243		}
1244
1245		if (cfg->fc_nlflags & NLM_F_REPLACE) {
1246			struct fib_info *fi_drop;
1247			u8 state;
1248
1249			fa = fa_first;
1250			if (fa_match) {
1251				if (fa == fa_match)
1252					err = 0;
1253				goto out;
1254			}
1255			err = -ENOBUFS;
1256			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1257			if (new_fa == NULL)
1258				goto out;
1259
1260			fi_drop = fa->fa_info;
1261			new_fa->fa_tos = fa->fa_tos;
1262			new_fa->fa_info = fi;
1263			new_fa->fa_type = cfg->fc_type;
1264			new_fa->fa_scope = cfg->fc_scope;
1265			state = fa->fa_state;
1266			new_fa->fa_state = state & ~FA_S_ACCESSED;
1267
1268			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1269			alias_free_mem_rcu(fa);
1270
1271			fib_release_info(fi_drop);
1272			if (state & FA_S_ACCESSED)
1273				rt_cache_flush(-1);
1274			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1275				tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1276
1277			goto succeeded;
1278		}
1279		/* Error if we find a perfect match which
1280		 * uses the same scope, type, and nexthop
1281		 * information.
1282		 */
1283		if (fa_match)
1284			goto out;
1285
1286		if (!(cfg->fc_nlflags & NLM_F_APPEND))
1287			fa = fa_first;
1288	}
1289	err = -ENOENT;
1290	if (!(cfg->fc_nlflags & NLM_F_CREATE))
1291		goto out;
1292
1293	err = -ENOBUFS;
1294	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1295	if (new_fa == NULL)
1296		goto out;
1297
1298	new_fa->fa_info = fi;
1299	new_fa->fa_tos = tos;
1300	new_fa->fa_type = cfg->fc_type;
1301	new_fa->fa_scope = cfg->fc_scope;
1302	new_fa->fa_state = 0;
1303	/*
1304	 * Insert new entry to the list.
1305	 */
1306
1307	if (!fa_head) {
1308		fa_head = fib_insert_node(t, key, plen);
1309		if (unlikely(!fa_head)) {
1310			err = -ENOMEM;
1311			goto out_free_new_fa;
1312		}
1313	}
1314
1315	list_add_tail_rcu(&new_fa->fa_list,
1316			  (fa ? &fa->fa_list : fa_head));
1317
1318	rt_cache_flush(-1);
1319	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1320		  &cfg->fc_nlinfo, 0);
1321succeeded:
1322	return 0;
1323
1324out_free_new_fa:
1325	kmem_cache_free(fn_alias_kmem, new_fa);
1326out:
1327	fib_release_info(fi);
1328err:
1329	return err;
1330}
1331
1332/* should be called with rcu_read_lock */
1333static int check_leaf(struct trie *t, struct leaf *l,
1334		      t_key key,  const struct flowi *flp,
1335		      struct fib_result *res)
1336{
1337	struct leaf_info *li;
1338	struct hlist_head *hhead = &l->list;
1339	struct hlist_node *node;
1340
1341	hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1342		int err;
1343		int plen = li->plen;
1344		__be32 mask = inet_make_mask(plen);
1345
1346		if (l->key != (key & ntohl(mask)))
1347			continue;
1348
1349		err = fib_semantic_match(&li->falh, flp, res,
1350					 htonl(l->key), mask, plen);
1351
1352#ifdef CONFIG_IP_FIB_TRIE_STATS
1353		if (err <= 0)
1354			t->stats.semantic_match_passed++;
1355		else
1356			t->stats.semantic_match_miss++;
1357#endif
1358		if (err <= 0)
1359			return plen;
1360	}
1361
1362	return -1;
1363}
1364
1365static int fn_trie_lookup(struct fib_table *tb, const struct flowi *flp,
1366			  struct fib_result *res)
1367{
1368	struct trie *t = (struct trie *) tb->tb_data;
1369	int plen, ret = 0;
1370	struct node *n;
1371	struct tnode *pn;
1372	int pos, bits;
1373	t_key key = ntohl(flp->fl4_dst);
1374	int chopped_off;
1375	t_key cindex = 0;
1376	int current_prefix_length = KEYLENGTH;
1377	struct tnode *cn;
1378	t_key node_prefix, key_prefix, pref_mismatch;
1379	int mp;
1380
1381	rcu_read_lock();
1382
1383	n = rcu_dereference(t->trie);
1384	if (!n)
1385		goto failed;
1386
1387#ifdef CONFIG_IP_FIB_TRIE_STATS
1388	t->stats.gets++;
1389#endif
1390
1391	/* Just a leaf? */
1392	if (IS_LEAF(n)) {
1393		plen = check_leaf(t, (struct leaf *)n, key, flp, res);
1394		if (plen < 0)
1395			goto failed;
1396		ret = 0;
1397		goto found;
1398	}
1399
1400	pn = (struct tnode *) n;
1401	chopped_off = 0;
1402
1403	while (pn) {
1404		pos = pn->pos;
1405		bits = pn->bits;
1406
1407		if (!chopped_off)
1408			cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
1409						   pos, bits);
1410
1411		n = tnode_get_child(pn, cindex);
1412
1413		if (n == NULL) {
1414#ifdef CONFIG_IP_FIB_TRIE_STATS
1415			t->stats.null_node_hit++;
1416#endif
1417			goto backtrace;
1418		}
1419
1420		if (IS_LEAF(n)) {
1421			plen = check_leaf(t, (struct leaf *)n, key, flp, res);
1422			if (plen < 0)
1423				goto backtrace;
1424
1425			ret = 0;
1426			goto found;
1427		}
1428
1429		cn = (struct tnode *)n;
1430
1431		/*
1432		 * It's a tnode, and we can do some extra checks here if we
1433		 * like, to avoid descending into a dead-end branch.
1434		 * This tnode is in the parent's child array at index
1435		 * key[p_pos..p_pos+p_bits] but potentially with some bits
1436		 * chopped off, so in reality the index may be just a
1437		 * subprefix, padded with zero at the end.
1438		 * We can also take a look at any skipped bits in this
1439		 * tnode - everything up to p_pos is supposed to be ok,
1440		 * and the non-chopped bits of the index (se previous
1441		 * paragraph) are also guaranteed ok, but the rest is
1442		 * considered unknown.
1443		 *
1444		 * The skipped bits are key[pos+bits..cn->pos].
1445		 */
1446
1447		/* If current_prefix_length < pos+bits, we are already doing
1448		 * actual prefix  matching, which means everything from
1449		 * pos+(bits-chopped_off) onward must be zero along some
1450		 * branch of this subtree - otherwise there is *no* valid
1451		 * prefix present. Here we can only check the skipped
1452		 * bits. Remember, since we have already indexed into the
1453		 * parent's child array, we know that the bits we chopped of
1454		 * *are* zero.
1455		 */
1456
1457		/* NOTA BENE: Checking only skipped bits
1458		   for the new node here */
1459
1460		if (current_prefix_length < pos+bits) {
1461			if (tkey_extract_bits(cn->key, current_prefix_length,
1462						cn->pos - current_prefix_length)
1463			    || !(cn->child[0]))
1464				goto backtrace;
1465		}
1466
1467		/*
1468		 * If chopped_off=0, the index is fully validated and we
1469		 * only need to look at the skipped bits for this, the new,
1470		 * tnode. What we actually want to do is to find out if
1471		 * these skipped bits match our key perfectly, or if we will
1472		 * have to count on finding a matching prefix further down,
1473		 * because if we do, we would like to have some way of
1474		 * verifying the existence of such a prefix at this point.
1475		 */
1476
1477		/* The only thing we can do at this point is to verify that
1478		 * any such matching prefix can indeed be a prefix to our
1479		 * key, and if the bits in the node we are inspecting that
1480		 * do not match our key are not ZERO, this cannot be true.
1481		 * Thus, find out where there is a mismatch (before cn->pos)
1482		 * and verify that all the mismatching bits are zero in the
1483		 * new tnode's key.
1484		 */
1485
1486		/*
1487		 * Note: We aren't very concerned about the piece of
1488		 * the key that precede pn->pos+pn->bits, since these
1489		 * have already been checked. The bits after cn->pos
1490		 * aren't checked since these are by definition
1491		 * "unknown" at this point. Thus, what we want to see
1492		 * is if we are about to enter the "prefix matching"
1493		 * state, and in that case verify that the skipped
1494		 * bits that will prevail throughout this subtree are
1495		 * zero, as they have to be if we are to find a
1496		 * matching prefix.
1497		 */
1498
1499		node_prefix = mask_pfx(cn->key, cn->pos);
1500		key_prefix = mask_pfx(key, cn->pos);
1501		pref_mismatch = key_prefix^node_prefix;
1502		mp = 0;
1503
1504		/*
1505		 * In short: If skipped bits in this node do not match
1506		 * the search key, enter the "prefix matching"
1507		 * state.directly.
1508		 */
1509		if (pref_mismatch) {
1510			while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1511				mp++;
1512				pref_mismatch = pref_mismatch << 1;
1513			}
1514			key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1515
1516			if (key_prefix != 0)
1517				goto backtrace;
1518
1519			if (current_prefix_length >= cn->pos)
1520				current_prefix_length = mp;
1521		}
1522
1523		pn = (struct tnode *)n; /* Descend */
1524		chopped_off = 0;
1525		continue;
1526
1527backtrace:
1528		chopped_off++;
1529
1530		/* As zero don't change the child key (cindex) */
1531		while ((chopped_off <= pn->bits)
1532		       && !(cindex & (1<<(chopped_off-1))))
1533			chopped_off++;
1534
1535		/* Decrease current_... with bits chopped off */
1536		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1537			current_prefix_length = pn->pos + pn->bits
1538				- chopped_off;
1539
1540		/*
1541		 * Either we do the actual chop off according or if we have
1542		 * chopped off all bits in this tnode walk up to our parent.
1543		 */
1544
1545		if (chopped_off <= pn->bits) {
1546			cindex &= ~(1 << (chopped_off-1));
1547		} else {
1548			struct tnode *parent = node_parent((struct node *) pn);
1549			if (!parent)
1550				goto failed;
1551
1552			/* Get Child's index */
1553			cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
1554			pn = parent;
1555			chopped_off = 0;
1556
1557#ifdef CONFIG_IP_FIB_TRIE_STATS
1558			t->stats.backtrack++;
1559#endif
1560			goto backtrace;
1561		}
1562	}
1563failed:
1564	ret = 1;
1565found:
1566	rcu_read_unlock();
1567	return ret;
1568}
1569
1570/*
1571 * Remove the leaf and return parent.
1572 */
1573static void trie_leaf_remove(struct trie *t, struct leaf *l)
1574{
1575	struct tnode *tp = node_parent((struct node *) l);
1576
1577	pr_debug("entering trie_leaf_remove(%p)\n", l);
1578
1579	if (tp) {
1580		t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
1581		put_child(t, (struct tnode *)tp, cindex, NULL);
1582		rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1583	} else
1584		rcu_assign_pointer(t->trie, NULL);
1585
1586	tnode_free((struct tnode *) l);
1587}
1588
1589/*
1590 * Caller must hold RTNL.
1591 */
1592static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg)
1593{
1594	struct trie *t = (struct trie *) tb->tb_data;
1595	u32 key, mask;
1596	int plen = cfg->fc_dst_len;
1597	u8 tos = cfg->fc_tos;
1598	struct fib_alias *fa, *fa_to_delete;
1599	struct list_head *fa_head;
1600	struct leaf *l;
1601	struct leaf_info *li;
1602
1603	if (plen > 32)
1604		return -EINVAL;
1605
1606	key = ntohl(cfg->fc_dst);
1607	mask = ntohl(inet_make_mask(plen));
1608
1609	if (key & ~mask)
1610		return -EINVAL;
1611
1612	key = key & mask;
1613	l = fib_find_node(t, key);
1614
1615	if (!l)
1616		return -ESRCH;
1617
1618	fa_head = get_fa_head(l, plen);
1619	fa = fib_find_alias(fa_head, tos, 0);
1620
1621	if (!fa)
1622		return -ESRCH;
1623
1624	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1625
1626	fa_to_delete = NULL;
1627	fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1628	list_for_each_entry_continue(fa, fa_head, fa_list) {
1629		struct fib_info *fi = fa->fa_info;
1630
1631		if (fa->fa_tos != tos)
1632			break;
1633
1634		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1635		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1636		     fa->fa_scope == cfg->fc_scope) &&
1637		    (!cfg->fc_protocol ||
1638		     fi->fib_protocol == cfg->fc_protocol) &&
1639		    fib_nh_match(cfg, fi) == 0) {
1640			fa_to_delete = fa;
1641			break;
1642		}
1643	}
1644
1645	if (!fa_to_delete)
1646		return -ESRCH;
1647
1648	fa = fa_to_delete;
1649	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1650		  &cfg->fc_nlinfo, 0);
1651
1652	l = fib_find_node(t, key);
1653	li = find_leaf_info(l, plen);
1654
1655	list_del_rcu(&fa->fa_list);
1656
1657	if (list_empty(fa_head)) {
1658		hlist_del_rcu(&li->hlist);
1659		free_leaf_info(li);
1660	}
1661
1662	if (hlist_empty(&l->list))
1663		trie_leaf_remove(t, l);
1664
1665	if (fa->fa_state & FA_S_ACCESSED)
1666		rt_cache_flush(-1);
1667
1668	fib_release_info(fa->fa_info);
1669	alias_free_mem_rcu(fa);
1670	return 0;
1671}
1672
1673static int trie_flush_list(struct trie *t, struct list_head *head)
1674{
1675	struct fib_alias *fa, *fa_node;
1676	int found = 0;
1677
1678	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1679		struct fib_info *fi = fa->fa_info;
1680
1681		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1682			list_del_rcu(&fa->fa_list);
1683			fib_release_info(fa->fa_info);
1684			alias_free_mem_rcu(fa);
1685			found++;
1686		}
1687	}
1688	return found;
1689}
1690
1691static int trie_flush_leaf(struct trie *t, struct leaf *l)
1692{
1693	int found = 0;
1694	struct hlist_head *lih = &l->list;
1695	struct hlist_node *node, *tmp;
1696	struct leaf_info *li = NULL;
1697
1698	hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1699		found += trie_flush_list(t, &li->falh);
1700
1701		if (list_empty(&li->falh)) {
1702			hlist_del_rcu(&li->hlist);
1703			free_leaf_info(li);
1704		}
1705	}
1706	return found;
1707}
1708
1709/*
1710 * Scan for the next right leaf starting at node p->child[idx]
1711 * Since we have back pointer, no recursion necessary.
1712 */
1713static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
1714{
1715	do {
1716		t_key idx;
1717
1718		if (c)
1719			idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
1720		else
1721			idx = 0;
1722
1723		while (idx < 1u << p->bits) {
1724			c = tnode_get_child_rcu(p, idx++);
1725			if (!c)
1726				continue;
1727
1728			if (IS_LEAF(c)) {
1729				prefetch(p->child[idx]);
1730				return (struct leaf *) c;
1731			}
1732
1733			/* Rescan start scanning in new node */
1734			p = (struct tnode *) c;
1735			idx = 0;
1736		}
1737
1738		/* Node empty, walk back up to parent */
1739		c = (struct node *) p;
1740	} while ( (p = node_parent_rcu(c)) != NULL);
1741
1742	return NULL; /* Root of trie */
1743}
1744
1745static struct leaf *trie_firstleaf(struct trie *t)
1746{
1747	struct tnode *n = (struct tnode *) rcu_dereference(t->trie);
1748
1749	if (!n)
1750		return NULL;
1751
1752	if (IS_LEAF(n))          /* trie is just a leaf */
1753		return (struct leaf *) n;
1754
1755	return leaf_walk_rcu(n, NULL);
1756}
1757
1758static struct leaf *trie_nextleaf(struct leaf *l)
1759{
1760	struct node *c = (struct node *) l;
1761	struct tnode *p = node_parent(c);
1762
1763	if (!p)
1764		return NULL;	/* trie with just one leaf */
1765
1766	return leaf_walk_rcu(p, c);
1767}
1768
1769static struct leaf *trie_leafindex(struct trie *t, int index)
1770{
1771	struct leaf *l = trie_firstleaf(t);
1772
1773	while (l && index-- > 0)
1774		l = trie_nextleaf(l);
1775
1776	return l;
1777}
1778
1779
1780/*
1781 * Caller must hold RTNL.
1782 */
1783static int fn_trie_flush(struct fib_table *tb)
1784{
1785	struct trie *t = (struct trie *) tb->tb_data;
1786	struct leaf *l, *ll = NULL;
1787	int found = 0;
1788
1789	for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
1790		found += trie_flush_leaf(t, l);
1791
1792		if (ll && hlist_empty(&ll->list))
1793			trie_leaf_remove(t, ll);
1794		ll = l;
1795	}
1796
1797	if (ll && hlist_empty(&ll->list))
1798		trie_leaf_remove(t, ll);
1799
1800	pr_debug("trie_flush found=%d\n", found);
1801	return found;
1802}
1803
1804static void fn_trie_select_default(struct fib_table *tb,
1805				   const struct flowi *flp,
1806				   struct fib_result *res)
1807{
1808	struct trie *t = (struct trie *) tb->tb_data;
1809	int order, last_idx;
1810	struct fib_info *fi = NULL;
1811	struct fib_info *last_resort;
1812	struct fib_alias *fa = NULL;
1813	struct list_head *fa_head;
1814	struct leaf *l;
1815
1816	last_idx = -1;
1817	last_resort = NULL;
1818	order = -1;
1819
1820	rcu_read_lock();
1821
1822	l = fib_find_node(t, 0);
1823	if (!l)
1824		goto out;
1825
1826	fa_head = get_fa_head(l, 0);
1827	if (!fa_head)
1828		goto out;
1829
1830	if (list_empty(fa_head))
1831		goto out;
1832
1833	list_for_each_entry_rcu(fa, fa_head, fa_list) {
1834		struct fib_info *next_fi = fa->fa_info;
1835
1836		if (fa->fa_scope != res->scope ||
1837		    fa->fa_type != RTN_UNICAST)
1838			continue;
1839
1840		if (next_fi->fib_priority > res->fi->fib_priority)
1841			break;
1842		if (!next_fi->fib_nh[0].nh_gw ||
1843		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1844			continue;
1845		fa->fa_state |= FA_S_ACCESSED;
1846
1847		if (fi == NULL) {
1848			if (next_fi != res->fi)
1849				break;
1850		} else if (!fib_detect_death(fi, order, &last_resort,
1851					     &last_idx, tb->tb_default)) {
1852			fib_result_assign(res, fi);
1853			tb->tb_default = order;
1854			goto out;
1855		}
1856		fi = next_fi;
1857		order++;
1858	}
1859	if (order <= 0 || fi == NULL) {
1860		tb->tb_default = -1;
1861		goto out;
1862	}
1863
1864	if (!fib_detect_death(fi, order, &last_resort, &last_idx,
1865				tb->tb_default)) {
1866		fib_result_assign(res, fi);
1867		tb->tb_default = order;
1868		goto out;
1869	}
1870	if (last_idx >= 0)
1871		fib_result_assign(res, last_resort);
1872	tb->tb_default = last_idx;
1873out:
1874	rcu_read_unlock();
1875}
1876
1877static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1878			   struct fib_table *tb,
1879			   struct sk_buff *skb, struct netlink_callback *cb)
1880{
1881	int i, s_i;
1882	struct fib_alias *fa;
1883	__be32 xkey = htonl(key);
1884
1885	s_i = cb->args[5];
1886	i = 0;
1887
1888	/* rcu_read_lock is hold by caller */
1889
1890	list_for_each_entry_rcu(fa, fah, fa_list) {
1891		if (i < s_i) {
1892			i++;
1893			continue;
1894		}
1895
1896		if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1897				  cb->nlh->nlmsg_seq,
1898				  RTM_NEWROUTE,
1899				  tb->tb_id,
1900				  fa->fa_type,
1901				  fa->fa_scope,
1902				  xkey,
1903				  plen,
1904				  fa->fa_tos,
1905				  fa->fa_info, NLM_F_MULTI) < 0) {
1906			cb->args[5] = i;
1907			return -1;
1908		}
1909		i++;
1910	}
1911	cb->args[5] = i;
1912	return skb->len;
1913}
1914
1915static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
1916			struct sk_buff *skb, struct netlink_callback *cb)
1917{
1918	struct leaf_info *li;
1919	struct hlist_node *node;
1920	int i, s_i;
1921
1922	s_i = cb->args[4];
1923	i = 0;
1924
1925	/* rcu_read_lock is hold by caller */
1926	hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
1927		if (i < s_i) {
1928			i++;
1929			continue;
1930		}
1931
1932		if (i > s_i)
1933			cb->args[5] = 0;
1934
1935		if (list_empty(&li->falh))
1936			continue;
1937
1938		if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
1939			cb->args[4] = i;
1940			return -1;
1941		}
1942		i++;
1943	}
1944
1945	cb->args[4] = i;
1946	return skb->len;
1947}
1948
1949static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb,
1950			struct netlink_callback *cb)
1951{
1952	struct leaf *l;
1953	struct trie *t = (struct trie *) tb->tb_data;
1954	t_key key = cb->args[2];
1955	int count = cb->args[3];
1956
1957	rcu_read_lock();
1958	/* Dump starting at last key.
1959	 * Note: 0.0.0.0/0 (ie default) is first key.
1960	 */
1961	if (count == 0)
1962		l = trie_firstleaf(t);
1963	else {
1964		/* Normally, continue from last key, but if that is missing
1965		 * fallback to using slow rescan
1966		 */
1967		l = fib_find_node(t, key);
1968		if (!l)
1969			l = trie_leafindex(t, count);
1970	}
1971
1972	while (l) {
1973		cb->args[2] = l->key;
1974		if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
1975			cb->args[3] = count;
1976			rcu_read_unlock();
1977			return -1;
1978		}
1979
1980		++count;
1981		l = trie_nextleaf(l);
1982		memset(&cb->args[4], 0,
1983		       sizeof(cb->args) - 4*sizeof(cb->args[0]));
1984	}
1985	cb->args[3] = count;
1986	rcu_read_unlock();
1987
1988	return skb->len;
1989}
1990
1991void __init fib_hash_init(void)
1992{
1993	fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1994					  sizeof(struct fib_alias),
1995					  0, SLAB_PANIC, NULL);
1996
1997	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
1998					   max(sizeof(struct leaf),
1999					       sizeof(struct leaf_info)),
2000					   0, SLAB_PANIC, NULL);
2001}
2002
2003
2004/* Fix more generic FIB names for init later */
2005struct fib_table *fib_hash_table(u32 id)
2006{
2007	struct fib_table *tb;
2008	struct trie *t;
2009
2010	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
2011		     GFP_KERNEL);
2012	if (tb == NULL)
2013		return NULL;
2014
2015	tb->tb_id = id;
2016	tb->tb_default = -1;
2017	tb->tb_lookup = fn_trie_lookup;
2018	tb->tb_insert = fn_trie_insert;
2019	tb->tb_delete = fn_trie_delete;
2020	tb->tb_flush = fn_trie_flush;
2021	tb->tb_select_default = fn_trie_select_default;
2022	tb->tb_dump = fn_trie_dump;
2023
2024	t = (struct trie *) tb->tb_data;
2025	memset(t, 0, sizeof(*t));
2026
2027	if (id == RT_TABLE_LOCAL)
2028		pr_info("IPv4 FIB: Using LC-trie version %s\n", VERSION);
2029
2030	return tb;
2031}
2032
2033#ifdef CONFIG_PROC_FS
2034/* Depth first Trie walk iterator */
2035struct fib_trie_iter {
2036	struct seq_net_private p;
2037	struct fib_table *tb;
2038	struct tnode *tnode;
2039	unsigned index;
2040	unsigned depth;
2041};
2042
2043static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
2044{
2045	struct tnode *tn = iter->tnode;
2046	unsigned cindex = iter->index;
2047	struct tnode *p;
2048
2049	/* A single entry routing table */
2050	if (!tn)
2051		return NULL;
2052
2053	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2054		 iter->tnode, iter->index, iter->depth);
2055rescan:
2056	while (cindex < (1<<tn->bits)) {
2057		struct node *n = tnode_get_child_rcu(tn, cindex);
2058
2059		if (n) {
2060			if (IS_LEAF(n)) {
2061				iter->tnode = tn;
2062				iter->index = cindex + 1;
2063			} else {
2064				/* push down one level */
2065				iter->tnode = (struct tnode *) n;
2066				iter->index = 0;
2067				++iter->depth;
2068			}
2069			return n;
2070		}
2071
2072		++cindex;
2073	}
2074
2075	/* Current node exhausted, pop back up */
2076	p = node_parent_rcu((struct node *)tn);
2077	if (p) {
2078		cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2079		tn = p;
2080		--iter->depth;
2081		goto rescan;
2082	}
2083
2084	/* got root? */
2085	return NULL;
2086}
2087
2088static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2089				       struct trie *t)
2090{
2091	struct node *n;
2092
2093	if (!t)
2094		return NULL;
2095
2096	n = rcu_dereference(t->trie);
2097	if (!n)
2098		return NULL;
2099
2100	if (IS_TNODE(n)) {
2101		iter->tnode = (struct tnode *) n;
2102		iter->index = 0;
2103		iter->depth = 1;
2104	} else {
2105		iter->tnode = NULL;
2106		iter->index = 0;
2107		iter->depth = 0;
2108	}
2109
2110	return n;
2111}
2112
2113static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2114{
2115	struct node *n;
2116	struct fib_trie_iter iter;
2117
2118	memset(s, 0, sizeof(*s));
2119
2120	rcu_read_lock();
2121	for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
2122		if (IS_LEAF(n)) {
2123			struct leaf *l = (struct leaf *)n;
2124			struct leaf_info *li;
2125			struct hlist_node *tmp;
2126
2127			s->leaves++;
2128			s->totdepth += iter.depth;
2129			if (iter.depth > s->maxdepth)
2130				s->maxdepth = iter.depth;
2131
2132			hlist_for_each_entry_rcu(li, tmp, &l->list, hlist)
2133				++s->prefixes;
2134		} else {
2135			const struct tnode *tn = (const struct tnode *) n;
2136			int i;
2137
2138			s->tnodes++;
2139			if (tn->bits < MAX_STAT_DEPTH)
2140				s->nodesizes[tn->bits]++;
2141
2142			for (i = 0; i < (1<<tn->bits); i++)
2143				if (!tn->child[i])
2144					s->nullpointers++;
2145		}
2146	}
2147	rcu_read_unlock();
2148}
2149
2150/*
2151 *	This outputs /proc/net/fib_triestats
2152 */
2153static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2154{
2155	unsigned i, max, pointers, bytes, avdepth;
2156
2157	if (stat->leaves)
2158		avdepth = stat->totdepth*100 / stat->leaves;
2159	else
2160		avdepth = 0;
2161
2162	seq_printf(seq, "\tAver depth:     %u.%02d\n",
2163		   avdepth / 100, avdepth % 100);
2164	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
2165
2166	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
2167	bytes = sizeof(struct leaf) * stat->leaves;
2168
2169	seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
2170	bytes += sizeof(struct leaf_info) * stat->prefixes;
2171
2172	seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2173	bytes += sizeof(struct tnode) * stat->tnodes;
2174
2175	max = MAX_STAT_DEPTH;
2176	while (max > 0 && stat->nodesizes[max-1] == 0)
2177		max--;
2178
2179	pointers = 0;
2180	for (i = 1; i <= max; i++)
2181		if (stat->nodesizes[i] != 0) {
2182			seq_printf(seq, "  %u: %u",  i, stat->nodesizes[i]);
2183			pointers += (1<<i) * stat->nodesizes[i];
2184		}
2185	seq_putc(seq, '\n');
2186	seq_printf(seq, "\tPointers: %u\n", pointers);
2187
2188	bytes += sizeof(struct node *) * pointers;
2189	seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2190	seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
2191}
2192
2193#ifdef CONFIG_IP_FIB_TRIE_STATS
2194static void trie_show_usage(struct seq_file *seq,
2195			    const struct trie_use_stats *stats)
2196{
2197	seq_printf(seq, "\nCounters:\n---------\n");
2198	seq_printf(seq, "gets = %u\n", stats->gets);
2199	seq_printf(seq, "backtracks = %u\n", stats->backtrack);
2200	seq_printf(seq, "semantic match passed = %u\n",
2201		   stats->semantic_match_passed);
2202	seq_printf(seq, "semantic match miss = %u\n",
2203		   stats->semantic_match_miss);
2204	seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
2205	seq_printf(seq, "skipped node resize = %u\n\n",
2206		   stats->resize_node_skipped);
2207}
2208#endif /*  CONFIG_IP_FIB_TRIE_STATS */
2209
2210static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
2211{
2212	if (tb->tb_id == RT_TABLE_LOCAL)
2213		seq_puts(seq, "Local:\n");
2214	else if (tb->tb_id == RT_TABLE_MAIN)
2215		seq_puts(seq, "Main:\n");
2216	else
2217		seq_printf(seq, "Id %d:\n", tb->tb_id);
2218}
2219
2220
2221static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2222{
2223	struct net *net = (struct net *)seq->private;
2224	unsigned int h;
2225
2226	seq_printf(seq,
2227		   "Basic info: size of leaf:"
2228		   " %Zd bytes, size of tnode: %Zd bytes.\n",
2229		   sizeof(struct leaf), sizeof(struct tnode));
2230
2231	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2232		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2233		struct hlist_node *node;
2234		struct fib_table *tb;
2235
2236		hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2237			struct trie *t = (struct trie *) tb->tb_data;
2238			struct trie_stat stat;
2239
2240			if (!t)
2241				continue;
2242
2243			fib_table_print(seq, tb);
2244
2245			trie_collect_stats(t, &stat);
2246			trie_show_stats(seq, &stat);
2247#ifdef CONFIG_IP_FIB_TRIE_STATS
2248			trie_show_usage(seq, &t->stats);
2249#endif
2250		}
2251	}
2252
2253	return 0;
2254}
2255
2256static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2257{
2258	int err;
2259	struct net *net;
2260
2261	net = get_proc_net(inode);
2262	if (net == NULL)
2263		return -ENXIO;
2264	err = single_open(file, fib_triestat_seq_show, net);
2265	if (err < 0) {
2266		put_net(net);
2267		return err;
2268	}
2269	return 0;
2270}
2271
2272static int fib_triestat_seq_release(struct inode *ino, struct file *f)
2273{
2274	struct seq_file *seq = f->private_data;
2275	put_net(seq->private);
2276	return single_release(ino, f);
2277}
2278
2279static const struct file_operations fib_triestat_fops = {
2280	.owner	= THIS_MODULE,
2281	.open	= fib_triestat_seq_open,
2282	.read	= seq_read,
2283	.llseek	= seq_lseek,
2284	.release = fib_triestat_seq_release,
2285};
2286
2287static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2288{
2289	struct fib_trie_iter *iter = seq->private;
2290	struct net *net = seq_file_net(seq);
2291	loff_t idx = 0;
2292	unsigned int h;
2293
2294	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2295		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2296		struct hlist_node *node;
2297		struct fib_table *tb;
2298
2299		hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2300			struct node *n;
2301
2302			for (n = fib_trie_get_first(iter,
2303						    (struct trie *) tb->tb_data);
2304			     n; n = fib_trie_get_next(iter))
2305				if (pos == idx++) {
2306					iter->tb = tb;
2307					return n;
2308				}
2309		}
2310	}
2311
2312	return NULL;
2313}
2314
2315static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2316	__acquires(RCU)
2317{
2318	rcu_read_lock();
2319	return fib_trie_get_idx(seq, *pos);
2320}
2321
2322static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2323{
2324	struct fib_trie_iter *iter = seq->private;
2325	struct net *net = seq_file_net(seq);
2326	struct fib_table *tb = iter->tb;
2327	struct hlist_node *tb_node;
2328	unsigned int h;
2329	struct node *n;
2330
2331	++*pos;
2332	/* next node in same table */
2333	n = fib_trie_get_next(iter);
2334	if (n)
2335		return n;
2336
2337	/* walk rest of this hash chain */
2338	h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
2339	while ( (tb_node = rcu_dereference(tb->tb_hlist.next)) ) {
2340		tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
2341		n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2342		if (n)
2343			goto found;
2344	}
2345
2346	/* new hash chain */
2347	while (++h < FIB_TABLE_HASHSZ) {
2348		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2349		hlist_for_each_entry_rcu(tb, tb_node, head, tb_hlist) {
2350			n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2351			if (n)
2352				goto found;
2353		}
2354	}
2355	return NULL;
2356
2357found:
2358	iter->tb = tb;
2359	return n;
2360}
2361
2362static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2363	__releases(RCU)
2364{
2365	rcu_read_unlock();
2366}
2367
2368static void seq_indent(struct seq_file *seq, int n)
2369{
2370	while (n-- > 0) seq_puts(seq, "   ");
2371}
2372
2373static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2374{
2375	switch (s) {
2376	case RT_SCOPE_UNIVERSE: return "universe";
2377	case RT_SCOPE_SITE:	return "site";
2378	case RT_SCOPE_LINK:	return "link";
2379	case RT_SCOPE_HOST:	return "host";
2380	case RT_SCOPE_NOWHERE:	return "nowhere";
2381	default:
2382		snprintf(buf, len, "scope=%d", s);
2383		return buf;
2384	}
2385}
2386
2387static const char *rtn_type_names[__RTN_MAX] = {
2388	[RTN_UNSPEC] = "UNSPEC",
2389	[RTN_UNICAST] = "UNICAST",
2390	[RTN_LOCAL] = "LOCAL",
2391	[RTN_BROADCAST] = "BROADCAST",
2392	[RTN_ANYCAST] = "ANYCAST",
2393	[RTN_MULTICAST] = "MULTICAST",
2394	[RTN_BLACKHOLE] = "BLACKHOLE",
2395	[RTN_UNREACHABLE] = "UNREACHABLE",
2396	[RTN_PROHIBIT] = "PROHIBIT",
2397	[RTN_THROW] = "THROW",
2398	[RTN_NAT] = "NAT",
2399	[RTN_XRESOLVE] = "XRESOLVE",
2400};
2401
2402static inline const char *rtn_type(char *buf, size_t len, unsigned t)
2403{
2404	if (t < __RTN_MAX && rtn_type_names[t])
2405		return rtn_type_names[t];
2406	snprintf(buf, len, "type %u", t);
2407	return buf;
2408}
2409
2410/* Pretty print the trie */
2411static int fib_trie_seq_show(struct seq_file *seq, void *v)
2412{
2413	const struct fib_trie_iter *iter = seq->private;
2414	struct node *n = v;
2415
2416	if (!node_parent_rcu(n))
2417		fib_table_print(seq, iter->tb);
2418
2419	if (IS_TNODE(n)) {
2420		struct tnode *tn = (struct tnode *) n;
2421		__be32 prf = htonl(mask_pfx(tn->key, tn->pos));
2422
2423		seq_indent(seq, iter->depth-1);
2424		seq_printf(seq, "  +-- %d.%d.%d.%d/%d %d %d %d\n",
2425			   NIPQUAD(prf), tn->pos, tn->bits, tn->full_children,
2426			   tn->empty_children);
2427
2428	} else {
2429		struct leaf *l = (struct leaf *) n;
2430		struct leaf_info *li;
2431		struct hlist_node *node;
2432		__be32 val = htonl(l->key);
2433
2434		seq_indent(seq, iter->depth);
2435		seq_printf(seq, "  |-- %d.%d.%d.%d\n", NIPQUAD(val));
2436
2437		hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2438			struct fib_alias *fa;
2439
2440			list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2441				char buf1[32], buf2[32];
2442
2443				seq_indent(seq, iter->depth+1);
2444				seq_printf(seq, "  /%d %s %s", li->plen,
2445					   rtn_scope(buf1, sizeof(buf1),
2446						     fa->fa_scope),
2447					   rtn_type(buf2, sizeof(buf2),
2448						    fa->fa_type));
2449				if (fa->fa_tos)
2450					seq_printf(seq, " tos=%d", fa->fa_tos);
2451				seq_putc(seq, '\n');
2452			}
2453		}
2454	}
2455
2456	return 0;
2457}
2458
2459static const struct seq_operations fib_trie_seq_ops = {
2460	.start  = fib_trie_seq_start,
2461	.next   = fib_trie_seq_next,
2462	.stop   = fib_trie_seq_stop,
2463	.show   = fib_trie_seq_show,
2464};
2465
2466static int fib_trie_seq_open(struct inode *inode, struct file *file)
2467{
2468	return seq_open_net(inode, file, &fib_trie_seq_ops,
2469			    sizeof(struct fib_trie_iter));
2470}
2471
2472static const struct file_operations fib_trie_fops = {
2473	.owner  = THIS_MODULE,
2474	.open   = fib_trie_seq_open,
2475	.read   = seq_read,
2476	.llseek = seq_lseek,
2477	.release = seq_release_net,
2478};
2479
2480struct fib_route_iter {
2481	struct seq_net_private p;
2482	struct trie *main_trie;
2483	loff_t	pos;
2484	t_key	key;
2485};
2486
2487static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
2488{
2489	struct leaf *l = NULL;
2490	struct trie *t = iter->main_trie;
2491
2492	/* use cache location of last found key */
2493	if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
2494		pos -= iter->pos;
2495	else {
2496		iter->pos = 0;
2497		l = trie_firstleaf(t);
2498	}
2499
2500	while (l && pos-- > 0) {
2501		iter->pos++;
2502		l = trie_nextleaf(l);
2503	}
2504
2505	if (l)
2506		iter->key = pos;	/* remember it */
2507	else
2508		iter->pos = 0;		/* forget it */
2509
2510	return l;
2511}
2512
2513static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2514	__acquires(RCU)
2515{
2516	struct fib_route_iter *iter = seq->private;
2517	struct fib_table *tb;
2518
2519	rcu_read_lock();
2520	tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
2521	if (!tb)
2522		return NULL;
2523
2524	iter->main_trie = (struct trie *) tb->tb_data;
2525	if (*pos == 0)
2526		return SEQ_START_TOKEN;
2527	else
2528		return fib_route_get_idx(iter, *pos - 1);
2529}
2530
2531static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2532{
2533	struct fib_route_iter *iter = seq->private;
2534	struct leaf *l = v;
2535
2536	++*pos;
2537	if (v == SEQ_START_TOKEN) {
2538		iter->pos = 0;
2539		l = trie_firstleaf(iter->main_trie);
2540	} else {
2541		iter->pos++;
2542		l = trie_nextleaf(l);
2543	}
2544
2545	if (l)
2546		iter->key = l->key;
2547	else
2548		iter->pos = 0;
2549	return l;
2550}
2551
2552static void fib_route_seq_stop(struct seq_file *seq, void *v)
2553	__releases(RCU)
2554{
2555	rcu_read_unlock();
2556}
2557
2558static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2559{
2560	static unsigned type2flags[RTN_MAX + 1] = {
2561		[7] = RTF_REJECT, [8] = RTF_REJECT,
2562	};
2563	unsigned flags = type2flags[type];
2564
2565	if (fi && fi->fib_nh->nh_gw)
2566		flags |= RTF_GATEWAY;
2567	if (mask == htonl(0xFFFFFFFF))
2568		flags |= RTF_HOST;
2569	flags |= RTF_UP;
2570	return flags;
2571}
2572
2573/*
2574 *	This outputs /proc/net/route.
2575 *	The format of the file is not supposed to be changed
2576 * 	and needs to be same as fib_hash output to avoid breaking
2577 *	legacy utilities
2578 */
2579static int fib_route_seq_show(struct seq_file *seq, void *v)
2580{
2581	struct leaf *l = v;
2582	struct leaf_info *li;
2583	struct hlist_node *node;
2584
2585	if (v == SEQ_START_TOKEN) {
2586		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2587			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2588			   "\tWindow\tIRTT");
2589		return 0;
2590	}
2591
2592	hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2593		struct fib_alias *fa;
2594		__be32 mask, prefix;
2595
2596		mask = inet_make_mask(li->plen);
2597		prefix = htonl(l->key);
2598
2599		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2600			const struct fib_info *fi = fa->fa_info;
2601			unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
2602			char bf[128];
2603
2604			if (fa->fa_type == RTN_BROADCAST
2605			    || fa->fa_type == RTN_MULTICAST)
2606				continue;
2607
2608			if (fi)
2609				snprintf(bf, sizeof(bf),
2610					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2611					 fi->fib_dev ? fi->fib_dev->name : "*",
2612					 prefix,
2613					 fi->fib_nh->nh_gw, flags, 0, 0,
2614					 fi->fib_priority,
2615					 mask,
2616					 (fi->fib_advmss ?
2617					  fi->fib_advmss + 40 : 0),
2618					 fi->fib_window,
2619					 fi->fib_rtt >> 3);
2620			else
2621				snprintf(bf, sizeof(bf),
2622					 "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2623					 prefix, 0, flags, 0, 0, 0,
2624					 mask, 0, 0, 0);
2625
2626			seq_printf(seq, "%-127s\n", bf);
2627		}
2628	}
2629
2630	return 0;
2631}
2632
2633static const struct seq_operations fib_route_seq_ops = {
2634	.start  = fib_route_seq_start,
2635	.next   = fib_route_seq_next,
2636	.stop   = fib_route_seq_stop,
2637	.show   = fib_route_seq_show,
2638};
2639
2640static int fib_route_seq_open(struct inode *inode, struct file *file)
2641{
2642	return seq_open_net(inode, file, &fib_route_seq_ops,
2643			    sizeof(struct fib_route_iter));
2644}
2645
2646static const struct file_operations fib_route_fops = {
2647	.owner  = THIS_MODULE,
2648	.open   = fib_route_seq_open,
2649	.read   = seq_read,
2650	.llseek = seq_lseek,
2651	.release = seq_release_net,
2652};
2653
2654int __net_init fib_proc_init(struct net *net)
2655{
2656	if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
2657		goto out1;
2658
2659	if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
2660				  &fib_triestat_fops))
2661		goto out2;
2662
2663	if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
2664		goto out3;
2665
2666	return 0;
2667
2668out3:
2669	proc_net_remove(net, "fib_triestat");
2670out2:
2671	proc_net_remove(net, "fib_trie");
2672out1:
2673	return -ENOMEM;
2674}
2675
2676void __net_exit fib_proc_exit(struct net *net)
2677{
2678	proc_net_remove(net, "fib_trie");
2679	proc_net_remove(net, "fib_triestat");
2680	proc_net_remove(net, "route");
2681}
2682
2683#endif /* CONFIG_PROC_FS */
2684