fib_trie.c revision 4dde4610c4ab54e9d36a4afaa98c23b017f7f9e3
1/*
2 *   This program is free software; you can redistribute it and/or
3 *   modify it under the terms of the GNU General Public License
4 *   as published by the Free Software Foundation; either version
5 *   2 of the License, or (at your option) any later version.
6 *
7 *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 *     & Swedish University of Agricultural Sciences.
9 *
10 *   Jens Laas <jens.laas@data.slu.se> Swedish University of
11 *     Agricultural Sciences.
12 *
13 *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
14 *
15 * This work is based on the LPC-trie which is originally described in:
16 *
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
25 * Version:	$Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
26 *
27 *
28 * Code from fib_hash has been reused which includes the following header:
29 *
30 *
31 * INET		An implementation of the TCP/IP protocol suite for the LINUX
32 *		operating system.  INET is implemented using the  BSD Socket
33 *		interface as the means of communication with the user level.
34 *
35 *		IPv4 FIB: lookup engine and maintenance routines.
36 *
37 *
38 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
39 *
40 *		This program is free software; you can redistribute it and/or
41 *		modify it under the terms of the GNU General Public License
42 *		as published by the Free Software Foundation; either version
43 *		2 of the License, or (at your option) any later version.
44 *
45 * Substantial contributions to this work come from:
46 *
47 *		David S. Miller, <davem@davemloft.net>
48 *		Stephen Hemminger <shemminger@osdl.org>
49 *		Paul E. McKenney <paulmck@us.ibm.com>
50 *		Patrick McHardy <kaber@trash.net>
51 */
52
53#define VERSION "0.408"
54
55#include <asm/uaccess.h>
56#include <asm/system.h>
57#include <linux/bitops.h>
58#include <linux/types.h>
59#include <linux/kernel.h>
60#include <linux/mm.h>
61#include <linux/string.h>
62#include <linux/socket.h>
63#include <linux/sockios.h>
64#include <linux/errno.h>
65#include <linux/in.h>
66#include <linux/inet.h>
67#include <linux/inetdevice.h>
68#include <linux/netdevice.h>
69#include <linux/if_arp.h>
70#include <linux/proc_fs.h>
71#include <linux/rcupdate.h>
72#include <linux/skbuff.h>
73#include <linux/netlink.h>
74#include <linux/init.h>
75#include <linux/list.h>
76#include <net/net_namespace.h>
77#include <net/ip.h>
78#include <net/protocol.h>
79#include <net/route.h>
80#include <net/tcp.h>
81#include <net/sock.h>
82#include <net/ip_fib.h>
83#include "fib_lookup.h"
84
85#define MAX_STAT_DEPTH 32
86
87#define KEYLENGTH (8*sizeof(t_key))
88
89typedef unsigned int t_key;
90
91#define T_TNODE 0
92#define T_LEAF  1
93#define NODE_TYPE_MASK	0x1UL
94#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
95
96#define IS_TNODE(n) (!(n->parent & T_LEAF))
97#define IS_LEAF(n) (n->parent & T_LEAF)
98
99struct node {
100	t_key key;
101	unsigned long parent;
102};
103
104struct leaf {
105	t_key key;
106	unsigned long parent;
107	struct hlist_head list;
108	struct rcu_head rcu;
109};
110
111struct leaf_info {
112	struct hlist_node hlist;
113	struct rcu_head rcu;
114	int plen;
115	struct list_head falh;
116};
117
118struct tnode {
119	t_key key;
120	unsigned long parent;
121	unsigned char pos;		/* log2(KEYLENGTH) bits needed */
122	unsigned char bits;		/* log2(KEYLENGTH) bits needed */
123	unsigned short full_children;	/* KEYLENGTH bits needed */
124	unsigned short empty_children;	/* KEYLENGTH bits needed */
125	struct rcu_head rcu;
126	struct node *child[0];
127};
128
129#ifdef CONFIG_IP_FIB_TRIE_STATS
130struct trie_use_stats {
131	unsigned int gets;
132	unsigned int backtrack;
133	unsigned int semantic_match_passed;
134	unsigned int semantic_match_miss;
135	unsigned int null_node_hit;
136	unsigned int resize_node_skipped;
137};
138#endif
139
140struct trie_stat {
141	unsigned int totdepth;
142	unsigned int maxdepth;
143	unsigned int tnodes;
144	unsigned int leaves;
145	unsigned int nullpointers;
146	unsigned int nodesizes[MAX_STAT_DEPTH];
147};
148
149struct trie {
150	struct node *trie;
151#ifdef CONFIG_IP_FIB_TRIE_STATS
152	struct trie_use_stats stats;
153#endif
154	int size;
155};
156
157static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
158static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
159static struct node *resize(struct trie *t, struct tnode *tn);
160static struct tnode *inflate(struct trie *t, struct tnode *tn);
161static struct tnode *halve(struct trie *t, struct tnode *tn);
162static void tnode_free(struct tnode *tn);
163
164static struct kmem_cache *fn_alias_kmem __read_mostly;
165
166static inline struct tnode *node_parent(struct node *node)
167{
168	struct tnode *ret;
169
170	ret = (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
171	return rcu_dereference(ret);
172}
173
174static inline void node_set_parent(struct node *node, struct tnode *ptr)
175{
176	rcu_assign_pointer(node->parent,
177			   (unsigned long)ptr | NODE_TYPE(node));
178}
179
180/* rcu_read_lock must be held by the caller on the read side */
181
182static inline struct node *tnode_get_child(struct tnode *tn, int i)
183{
184	BUG_ON(i >= 1 << tn->bits);
185
186	return rcu_dereference(tn->child[i]);
187}
188
189static inline int tnode_child_length(const struct tnode *tn)
190{
191	return 1 << tn->bits;
192}
193
194static inline t_key mask_pfx(t_key k, unsigned short l)
195{
196	return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
197}
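
/*
 * A worked example (illustration only, assuming 32-bit keys): mask_pfx()
 * keeps only the l most significant bits of the key, e.g.
 *
 *	mask_pfx(0xC0A80101, 24) == 0xC0A80100
 *
 * i.e. 192.168.1.1 masked down to its /24 prefix 192.168.1.0.
 */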
198
199static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
200{
201	if (offset < KEYLENGTH)
202		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
203	else
204		return 0;
205}
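
/*
 * A worked example (illustration only, assuming 32-bit keys): a root tnode
 * with pos=0 and bits=3 indexes its child array with the top three bits of
 * the key, so
 *
 *	tkey_extract_bits(0xC0A80100, 0, 3)
 *		== (0xC0A80100 << 0) >> (32 - 3) == 6
 *
 * because the key starts with the bits 110.
 */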
206
207static inline int tkey_equals(t_key a, t_key b)
208{
209	return a == b;
210}
211
212static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
213{
214	if (bits == 0 || offset >= KEYLENGTH)
215		return 1;
216	bits = bits > KEYLENGTH ? KEYLENGTH : bits;
217	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
218}
219
220static inline int tkey_mismatch(t_key a, int offset, t_key b)
221{
222	t_key diff = a ^ b;
223	int i = offset;
224
225	if (!diff)
226		return 0;
227	while ((diff << i) >> (KEYLENGTH-1) == 0)
228		i++;
229	return i;
230}
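
/*
 * A worked example (illustration only, assuming 32-bit keys): tkey_mismatch()
 * returns the position, counted from the most significant bit, of the first
 * bit in which the two keys differ, scanning from 'offset'.  For
 * a = 0xC0000000 and b = 0x80000000 the keys differ only in bit 1, so
 *
 *	tkey_mismatch(0xC0000000, 0, 0x80000000) == 1
 */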
231
232/*
233  To understand this stuff, an understanding of keys and all their bits is
234  necessary. Every node in the trie has a key associated with it, but not
235  all of the bits in that key are significant.
236
237  Consider a node 'n' and its parent 'tp'.
238
239  If n is a leaf, every bit in its key is significant. Its presence is
240  necessitated by path compression, since during a tree traversal (when
241  searching for a leaf - unless we are doing an insertion) we will completely
242  ignore all skipped bits we encounter. Thus we need to verify, at the end of
243  a potentially successful search, that we have indeed been walking the
244  correct key path.
245
246  Note that we can never "miss" the correct key in the tree if present by
247  following the wrong path. Path compression ensures that segments of the key
248  that are the same for all keys with a given prefix are skipped, but the
249  skipped part *is* identical for each node in the subtrie below the skipped
250  bit! trie_insert() in this implementation takes care of that - note the
251  call to tkey_sub_equals() in trie_insert().
252
253  If n is an internal node - a 'tnode' here - the various parts of its key
254  have many different meanings.
255
256  Example:
257  _________________________________________________________________
258  | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
259  -----------------------------------------------------------------
260    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
261
262  _________________________________________________________________
263  | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
264  -----------------------------------------------------------------
265   16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
266
267  tp->pos = 7
268  tp->bits = 3
269  n->pos = 15
270  n->bits = 4
271
272  First, let's just ignore the bits that come before the parent tp, that is
273  the bits from 0 to (tp->pos-1). They are *known* but at this point we do
274  not use them for anything.
275
276  The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
277  index into the parent's child array. That is, they will be used to find
278  'n' among tp's children.
279
280  The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
281  for the node n.
282
283  All the bits we have seen so far are significant to the node n. The rest
284  of the bits are really not needed or indeed known in n->key.
285
286  The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
287  n's child array, and will of course be different for each child.
288
289
290  The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
291  at this point.
292
293*/
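
/*
 * To make the example above concrete: with tp->pos = 7 and tp->bits = 3,
 * the parent locates 'n' in its child array via
 *
 *	tkey_extract_bits(n->key, 7, 3)
 *
 * i.e. the three "N" bits, while bits 10..14 ("S") are skipped during the
 * walk and are only verified against the key at the end of a lookup.
 */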
294
295static inline void check_tnode(const struct tnode *tn)
296{
297	WARN_ON(tn && tn->pos+tn->bits > 32);
298}
299
300static const int halve_threshold = 25;
301static const int inflate_threshold = 50;
302static const int halve_threshold_root = 8;
303static const int inflate_threshold_root = 15;
304
305
306static void __alias_free_mem(struct rcu_head *head)
307{
308	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
309	kmem_cache_free(fn_alias_kmem, fa);
310}
311
312static inline void alias_free_mem_rcu(struct fib_alias *fa)
313{
314	call_rcu(&fa->rcu, __alias_free_mem);
315}
316
317static void __leaf_free_rcu(struct rcu_head *head)
318{
319	kfree(container_of(head, struct leaf, rcu));
320}
321
322static void __leaf_info_free_rcu(struct rcu_head *head)
323{
324	kfree(container_of(head, struct leaf_info, rcu));
325}
326
327static inline void free_leaf_info(struct leaf_info *leaf)
328{
329	call_rcu(&leaf->rcu, __leaf_info_free_rcu);
330}
331
332static struct tnode *tnode_alloc(unsigned int size)
333{
334	struct page *pages;
335
336	if (size <= PAGE_SIZE)
337		return kcalloc(size, 1, GFP_KERNEL);
338
339	pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size));
340	if (!pages)
341		return NULL;
342
343	return page_address(pages);
344}
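
/*
 * A rough sizing note (assuming 4 KiB pages and 4-byte pointers): a tnode
 * with 'bits' index bits needs sizeof(struct tnode) + (1 << bits) * 4 bytes,
 * so child arrays up to roughly 2^9 entries still fit in a single page and
 * are served by kcalloc(); larger tnodes fall back to alloc_pages().
 */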
345
346static void __tnode_free_rcu(struct rcu_head *head)
347{
348	struct tnode *tn = container_of(head, struct tnode, rcu);
349	unsigned int size = sizeof(struct tnode) +
350		(1 << tn->bits) * sizeof(struct node *);
351
352	if (size <= PAGE_SIZE)
353		kfree(tn);
354	else
355		free_pages((unsigned long)tn, get_order(size));
356}
357
358static inline void tnode_free(struct tnode *tn)
359{
360	if (IS_LEAF(tn)) {
361		struct leaf *l = (struct leaf *) tn;
362		call_rcu_bh(&l->rcu, __leaf_free_rcu);
363	} else
364		call_rcu(&tn->rcu, __tnode_free_rcu);
365}
366
367static struct leaf *leaf_new(void)
368{
369	struct leaf *l = kmalloc(sizeof(struct leaf),  GFP_KERNEL);
370	if (l) {
371		l->parent = T_LEAF;
372		INIT_HLIST_HEAD(&l->list);
373	}
374	return l;
375}
376
377static struct leaf_info *leaf_info_new(int plen)
378{
379	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
380	if (li) {
381		li->plen = plen;
382		INIT_LIST_HEAD(&li->falh);
383	}
384	return li;
385}
386
387static struct tnode* tnode_new(t_key key, int pos, int bits)
388{
389	int nchildren = 1<<bits;
390	int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
391	struct tnode *tn = tnode_alloc(sz);
392
393	if (tn) {
394		tn->parent = T_TNODE;
395		tn->pos = pos;
396		tn->bits = bits;
397		tn->key = key;
398		tn->full_children = 0;
399		tn->empty_children = 1<<bits;
400	}
401
402	pr_debug("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
403		 (unsigned int) (sizeof(struct node) * 1<<bits));
404	return tn;
405}
406
407/*
408 * Check whether a tnode 'n' is "full", i.e. it is an internal node
409 * and no bits are skipped. See discussion in dyntree paper p. 6
410 */
411
412static inline int tnode_full(const struct tnode *tn, const struct node *n)
413{
414	if (n == NULL || IS_LEAF(n))
415		return 0;
416
417	return ((struct tnode *) n)->pos == tn->pos + tn->bits;
418}
419
420static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
421{
422	tnode_put_child_reorg(tn, i, n, -1);
423}
424
425 /*
426  * Add a child at position i overwriting the old value.
427  * Update the value of full_children and empty_children.
428  */
429
430static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
431{
432	struct node *chi = tn->child[i];
433	int isfull;
434
435	BUG_ON(i >= 1<<tn->bits);
436
437
438	/* update emptyChildren */
439	if (n == NULL && chi != NULL)
440		tn->empty_children++;
441	else if (n != NULL && chi == NULL)
442		tn->empty_children--;
443
444	/* update fullChildren */
445	if (wasfull == -1)
446		wasfull = tnode_full(tn, chi);
447
448	isfull = tnode_full(tn, n);
449	if (wasfull && !isfull)
450		tn->full_children--;
451	else if (!wasfull && isfull)
452		tn->full_children++;
453
454	if (n)
455		node_set_parent(n, tn);
456
457	rcu_assign_pointer(tn->child[i], n);
458}
459
460static struct node *resize(struct trie *t, struct tnode *tn)
461{
462	int i;
463	int err = 0;
464	struct tnode *old_tn;
465	int inflate_threshold_use;
466	int halve_threshold_use;
467	int max_resize;
468
469	if (!tn)
470		return NULL;
471
472	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
473		 tn, inflate_threshold, halve_threshold);
474
475	/* No children */
476	if (tn->empty_children == tnode_child_length(tn)) {
477		tnode_free(tn);
478		return NULL;
479	}
480	/* One child */
481	if (tn->empty_children == tnode_child_length(tn) - 1)
482		for (i = 0; i < tnode_child_length(tn); i++) {
483			struct node *n;
484
485			n = tn->child[i];
486			if (!n)
487				continue;
488
489			/* compress one level */
490			node_set_parent(n, NULL);
491			tnode_free(tn);
492			return n;
493		}
494	/*
495	 * Double as long as the resulting node has a number of
496	 * nonempty nodes that are above the threshold.
497	 */
498
499	/*
500	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
501	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
502	 * Telecommunications, page 6:
503	 * "A node is doubled if the ratio of non-empty children to all
504	 * children in the *doubled* node is at least 'high'."
505	 *
506	 * 'high' in this instance is the variable 'inflate_threshold'. It
507	 * is expressed as a percentage, so we multiply it with
508	 * tnode_child_length() and instead of multiplying by 2 (since the
509	 * child array will be doubled by inflate()) and multiplying
510	 * the left-hand side by 100 (to handle the percentage thing) we
511	 * multiply the left-hand side by 50.
512	 *
513	 * The left-hand side may look a bit weird: tnode_child_length(tn)
514	 * - tn->empty_children is of course the number of non-null children
515	 * in the current node. tn->full_children is the number of "full"
516	 * children, that is non-null tnodes with a skip value of 0.
517	 * All of those will be doubled in the resulting inflated tnode, so
518	 * we just count them one extra time here.
519	 *
520	 * A clearer way to write this would be:
521	 *
522	 * to_be_doubled = tn->full_children;
523	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
524	 *     tn->full_children;
525	 *
526	 * new_child_length = tnode_child_length(tn) * 2;
527	 *
528	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
529	 *      new_child_length;
530	 * if (new_fill_factor >= inflate_threshold)
531	 *
532	 * ...and so on, though it would mess up the while () loop.
533	 *
534	 * Anyway,
535	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
536	 *      inflate_threshold
537	 *
538	 * avoid a division:
539	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
540	 *      inflate_threshold * new_child_length
541	 *
542	 * expand not_to_be_doubled and to_be_doubled, and shorten:
543	 * 100 * (tnode_child_length(tn) - tn->empty_children +
544	 *    tn->full_children) >= inflate_threshold * new_child_length
545	 *
546	 * expand new_child_length:
547	 * 100 * (tnode_child_length(tn) - tn->empty_children +
548	 *    tn->full_children) >=
549	 *      inflate_threshold * tnode_child_length(tn) * 2
550	 *
551	 * shorten again:
552	 * 50 * (tn->full_children + tnode_child_length(tn) -
553	 *    tn->empty_children) >= inflate_threshold *
554	 *    tnode_child_length(tn)
555	 *
556	 */
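
	/*
	 * A quick numeric sanity check of the final inequality, assuming
	 * inflate_threshold = 50: for a tnode with bits = 4 (16 children),
	 * empty_children = 2 and full_children = 5 we get
	 *
	 *	50 * (5 + 16 - 2) = 950 >= 50 * 16 = 800
	 *
	 * so the node would be inflated, whereas with full_children = 2 and
	 * empty_children = 6 the left-hand side is only 600 and it would not.
	 */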
557
558	check_tnode(tn);
559
560	/* Keep root node larger  */
561
562	if (!tn->parent)
563		inflate_threshold_use = inflate_threshold_root;
564	else
565		inflate_threshold_use = inflate_threshold;
566
567	err = 0;
568	max_resize = 10;
569	while ((tn->full_children > 0 &&  max_resize-- &&
570	       50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
571				inflate_threshold_use * tnode_child_length(tn))) {
572
573		old_tn = tn;
574		tn = inflate(t, tn);
575		if (IS_ERR(tn)) {
576			tn = old_tn;
577#ifdef CONFIG_IP_FIB_TRIE_STATS
578			t->stats.resize_node_skipped++;
579#endif
580			break;
581		}
582	}
583
584	if (max_resize < 0) {
585		if (!tn->parent)
586			printk(KERN_WARNING "Fix inflate_threshold_root. Now=%d size=%d bits\n",
587			       inflate_threshold_root, tn->bits);
588		else
589			printk(KERN_WARNING "Fix inflate_threshold. Now=%d size=%d bits\n",
590			       inflate_threshold, tn->bits);
591	}
592
593	check_tnode(tn);
594
595	/*
596	 * Halve as long as the number of empty children in this
597	 * node is above threshold.
598	 */
599
600
601	/* Keep root node larger  */
602
603	if (!tn->parent)
604		halve_threshold_use = halve_threshold_root;
605	else
606		halve_threshold_use = halve_threshold;
607
608	err = 0;
609	max_resize = 10;
610	while (tn->bits > 1 &&  max_resize-- &&
611	       100 * (tnode_child_length(tn) - tn->empty_children) <
612	       halve_threshold_use * tnode_child_length(tn)) {
613
614		old_tn = tn;
615		tn = halve(t, tn);
616		if (IS_ERR(tn)) {
617			tn = old_tn;
618#ifdef CONFIG_IP_FIB_TRIE_STATS
619			t->stats.resize_node_skipped++;
620#endif
621			break;
622		}
623	}
624
625	if (max_resize < 0) {
626		if (!tn->parent)
627			printk(KERN_WARNING "Fix halve_threshold_root. Now=%d size=%d bits\n",
628			       halve_threshold_root, tn->bits);
629		else
630			printk(KERN_WARNING "Fix halve_threshold. Now=%d size=%d bits\n",
631			       halve_threshold, tn->bits);
632	}
633
634	/* Only one child remains */
635	if (tn->empty_children == tnode_child_length(tn) - 1)
636		for (i = 0; i < tnode_child_length(tn); i++) {
637			struct node *n;
638
639			n = tn->child[i];
640			if (!n)
641				continue;
642
643			/* compress one level */
644
645			node_set_parent(n, NULL);
646			tnode_free(tn);
647			return n;
648		}
649
650	return (struct node *) tn;
651}
652
653static struct tnode *inflate(struct trie *t, struct tnode *tn)
654{
655	struct tnode *oldtnode = tn;
656	int olen = tnode_child_length(tn);
657	int i;
658
659	pr_debug("In inflate\n");
660
661	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
662
663	if (!tn)
664		return ERR_PTR(-ENOMEM);
665
666	/*
667	 * Preallocate and store tnodes before the actual work so we
668	 * don't get into an inconsistent state if memory allocation
669	 * fails. In case of failure we return the old node and the
670	 * inflate of the tnode is not performed.
671	 */
672
673	for (i = 0; i < olen; i++) {
674		struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
675
676		if (inode &&
677		    IS_TNODE(inode) &&
678		    inode->pos == oldtnode->pos + oldtnode->bits &&
679		    inode->bits > 1) {
680			struct tnode *left, *right;
681			t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
682
683			left = tnode_new(inode->key&(~m), inode->pos + 1,
684					 inode->bits - 1);
685			if (!left)
686				goto nomem;
687
688			right = tnode_new(inode->key|m, inode->pos + 1,
689					  inode->bits - 1);
690
691			if (!right) {
692				tnode_free(left);
693				goto nomem;
694			}
695
696			put_child(t, tn, 2*i, (struct node *) left);
697			put_child(t, tn, 2*i+1, (struct node *) right);
698		}
699	}
700
701	for (i = 0; i < olen; i++) {
702		struct tnode *inode;
703		struct node *node = tnode_get_child(oldtnode, i);
704		struct tnode *left, *right;
705		int size, j;
706
707		/* An empty child */
708		if (node == NULL)
709			continue;
710
711		/* A leaf or an internal node with skipped bits */
712
713		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
714		   tn->pos + tn->bits - 1) {
715			if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
716					     1) == 0)
717				put_child(t, tn, 2*i, node);
718			else
719				put_child(t, tn, 2*i+1, node);
720			continue;
721		}
722
723		/* An internal node with two children */
724		inode = (struct tnode *) node;
725
726		if (inode->bits == 1) {
727			put_child(t, tn, 2*i, inode->child[0]);
728			put_child(t, tn, 2*i+1, inode->child[1]);
729
730			tnode_free(inode);
731			continue;
732		}
733
734		/* An internal node with more than two children */
735
736		/* We will replace this node 'inode' with two new
737		 * ones, 'left' and 'right', each with half of the
738		 * original children. The two new nodes will have
739		 * a position one bit further down the key and this
740		 * means that the "significant" part of their keys
741		 * (see the discussion near the top of this file)
742		 * will differ by one bit, which will be "0" in
743		 * left's key and "1" in right's key. Since we are
744		 * moving the key position by one step, the bit that
745		 * we are moving away from - the bit at position
746		 * (inode->pos) - is the one that will differ between
747		 * left and right. So... we synthesize that bit in the
748		 * two  new keys.
749		 * The mask 'm' below will be a single "one" bit at
750		 * the position (inode->pos)
751		 */
752
753		/* Use the old key, but set the new significant
754		 *   bit to zero.
755		 */
756
757		left = (struct tnode *) tnode_get_child(tn, 2*i);
758		put_child(t, tn, 2*i, NULL);
759
760		BUG_ON(!left);
761
762		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
763		put_child(t, tn, 2*i+1, NULL);
764
765		BUG_ON(!right);
766
767		size = tnode_child_length(left);
768		for (j = 0; j < size; j++) {
769			put_child(t, left, j, inode->child[j]);
770			put_child(t, right, j, inode->child[j + size]);
771		}
772		put_child(t, tn, 2*i, resize(t, left));
773		put_child(t, tn, 2*i+1, resize(t, right));
774
775		tnode_free(inode);
776	}
777	tnode_free(oldtnode);
778	return tn;
779nomem:
780	{
781		int size = tnode_child_length(tn);
782		int j;
783
784		for (j = 0; j < size; j++)
785			if (tn->child[j])
786				tnode_free((struct tnode *)tn->child[j]);
787
788		tnode_free(tn);
789
790		return ERR_PTR(-ENOMEM);
791	}
792}
793
794static struct tnode *halve(struct trie *t, struct tnode *tn)
795{
796	struct tnode *oldtnode = tn;
797	struct node *left, *right;
798	int i;
799	int olen = tnode_child_length(tn);
800
801	pr_debug("In halve\n");
802
803	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
804
805	if (!tn)
806		return ERR_PTR(-ENOMEM);
807
808	/*
809	 * Preallocate and store tnodes before the actual work so we
810	 * don't get into an inconsistent state if memory allocation
811	 * fails. In case of failure we return the old node and the
812	 * halve of the tnode is not performed.
813	 */
814
815	for (i = 0; i < olen; i += 2) {
816		left = tnode_get_child(oldtnode, i);
817		right = tnode_get_child(oldtnode, i+1);
818
819		/* Two nonempty children */
820		if (left && right) {
821			struct tnode *newn;
822
823			newn = tnode_new(left->key, tn->pos + tn->bits, 1);
824
825			if (!newn)
826				goto nomem;
827
828			put_child(t, tn, i/2, (struct node *)newn);
829		}
830
831	}
832
833	for (i = 0; i < olen; i += 2) {
834		struct tnode *newBinNode;
835
836		left = tnode_get_child(oldtnode, i);
837		right = tnode_get_child(oldtnode, i+1);
838
839		/* At least one of the children is empty */
840		if (left == NULL) {
841			if (right == NULL)    /* Both are empty */
842				continue;
843			put_child(t, tn, i/2, right);
844			continue;
845		}
846
847		if (right == NULL) {
848			put_child(t, tn, i/2, left);
849			continue;
850		}
851
852		/* Two nonempty children */
853		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
854		put_child(t, tn, i/2, NULL);
855		put_child(t, newBinNode, 0, left);
856		put_child(t, newBinNode, 1, right);
857		put_child(t, tn, i/2, resize(t, newBinNode));
858	}
859	tnode_free(oldtnode);
860	return tn;
861nomem:
862	{
863		int size = tnode_child_length(tn);
864		int j;
865
866		for (j = 0; j < size; j++)
867			if (tn->child[j])
868				tnode_free((struct tnode *)tn->child[j]);
869
870		tnode_free(tn);
871
872		return ERR_PTR(-ENOMEM);
873	}
874}
875
876/* The read side must hold rcu_read_lock; currently this applies to the
877 * dump routines via get_fa_head and dump */
878
879static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
880{
881	struct hlist_head *head = &l->list;
882	struct hlist_node *node;
883	struct leaf_info *li;
884
885	hlist_for_each_entry_rcu(li, node, head, hlist)
886		if (li->plen == plen)
887			return li;
888
889	return NULL;
890}
891
892static inline struct list_head * get_fa_head(struct leaf *l, int plen)
893{
894	struct leaf_info *li = find_leaf_info(l, plen);
895
896	if (!li)
897		return NULL;
898
899	return &li->falh;
900}
901
902static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
903{
904	struct leaf_info *li = NULL, *last = NULL;
905	struct hlist_node *node;
906
907	if (hlist_empty(head)) {
908		hlist_add_head_rcu(&new->hlist, head);
909	} else {
910		hlist_for_each_entry(li, node, head, hlist) {
911			if (new->plen > li->plen)
912				break;
913
914			last = li;
915		}
916		if (last)
917			hlist_add_after_rcu(&last->hlist, &new->hlist);
918		else
919			hlist_add_before_rcu(&new->hlist, &li->hlist);
920	}
921}
922
923/* rcu_read_lock must be held by the caller on the read side */
924
925static struct leaf *
926fib_find_node(struct trie *t, u32 key)
927{
928	int pos;
929	struct tnode *tn;
930	struct node *n;
931
932	pos = 0;
933	n = rcu_dereference(t->trie);
934
935	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
936		tn = (struct tnode *) n;
937
938		check_tnode(tn);
939
940		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
941			pos = tn->pos + tn->bits;
942			n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
943		} else
944			break;
945	}
946	/* If we have found a leaf, compare prefixes */
947
948	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
949		return (struct leaf *)n;
950
951	return NULL;
952}
953
954static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
955{
956	int wasfull;
957	t_key cindex, key = tn->key;
958	struct tnode *tp;
959
960	while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
961		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
962		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
963		tn = (struct tnode *) resize (t, (struct tnode *)tn);
964		tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
965
966		tp = node_parent((struct node *) tn);
967		if (!tp)
968			break;
969		tn = tp;
970	}
971
972	/* Handle last (top) tnode */
973	if (IS_TNODE(tn))
974		tn = (struct tnode*) resize(t, (struct tnode *)tn);
975
976	return (struct node*) tn;
977}
978
979/* only used from updater-side */
980
981static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
982{
983	int pos, newpos;
984	struct tnode *tp = NULL, *tn = NULL;
985	struct node *n;
986	struct leaf *l;
987	int missbit;
988	struct list_head *fa_head = NULL;
989	struct leaf_info *li;
990	t_key cindex;
991
992	pos = 0;
993	n = t->trie;
994
995	/* If we point to NULL, stop. Either the tree is empty and we should
996	 * just put a new leaf in it, or we have reached an empty child slot,
997	 * and we should just put our new leaf in that.
998	 * If we point to a T_TNODE, check if it matches our key. Note that
999	 * a T_TNODE might be skipping any number of bits - its 'pos' need
1000	 * not be the parent's 'pos'+'bits'!
1001	 *
1002	 * If it does match the current key, get pos/bits from it, extract
1003	 * the index from our key, push the T_TNODE and walk the tree.
1004	 *
1005	 * If it doesn't, we have to replace it with a new T_TNODE.
1006	 *
1007	 * If we point to a T_LEAF, it might or might not have the same key
1008	 * as we do. If it does, we just update the T_LEAF's value and
1009	 * return it.
1010	 * If it doesn't, we need to replace it with a T_TNODE.
1011	 */
1012
1013	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
1014		tn = (struct tnode *) n;
1015
1016		check_tnode(tn);
1017
1018		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1019			tp = tn;
1020			pos = tn->pos + tn->bits;
1021			n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
1022
1023			BUG_ON(n && node_parent(n) != tn);
1024		} else
1025			break;
1026	}
1027
1028	/*
1029	 * n  ----> NULL, LEAF or TNODE
1030	 *
1031	 * tp is n's parent ----> NULL or TNODE
1032	 */
1033
1034	BUG_ON(tp && IS_LEAF(tp));
1035
1036	/* Case 1: n is a leaf. Compare prefixes */
1037
1038	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1039		l = (struct leaf *) n;
1040		li = leaf_info_new(plen);
1041
1042		if (!li)
1043			return NULL;
1044
1045		fa_head = &li->falh;
1046		insert_leaf_info(&l->list, li);
1047		goto done;
1048	}
1049	t->size++;
1050	l = leaf_new();
1051
1052	if (!l)
1053		return NULL;
1054
1055	l->key = key;
1056	li = leaf_info_new(plen);
1057
1058	if (!li) {
1059		tnode_free((struct tnode *) l);
1060		return NULL;
1061	}
1062
1063	fa_head = &li->falh;
1064	insert_leaf_info(&l->list, li);
1065
1066	if (t->trie && n == NULL) {
1067		/* Case 2: n is NULL, and we will just insert a new leaf */
1068
1069		node_set_parent((struct node *)l, tp);
1070
1071		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1072		put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1073	} else {
1074		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1075		/*
1076		 *  Add a new tnode here
1077		 *  the first tnode needs some special handling
1078		 */
1079
1080		if (tp)
1081			pos = tp->pos+tp->bits;
1082		else
1083			pos = 0;
1084
1085		if (n) {
1086			newpos = tkey_mismatch(key, pos, n->key);
1087			tn = tnode_new(n->key, newpos, 1);
1088		} else {
1089			newpos = 0;
1090			tn = tnode_new(key, newpos, 1); /* First tnode */
1091		}
1092
1093		if (!tn) {
1094			free_leaf_info(li);
1095			tnode_free((struct tnode *) l);
1096			return NULL;
1097		}
1098
1099		node_set_parent((struct node *)tn, tp);
1100
1101		missbit = tkey_extract_bits(key, newpos, 1);
1102		put_child(t, tn, missbit, (struct node *)l);
1103		put_child(t, tn, 1-missbit, n);
1104
1105		if (tp) {
1106			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1107			put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
1108		} else {
1109			rcu_assign_pointer(t->trie, (struct node *)tn); /* First tnode */
1110			tp = tn;
1111		}
1112	}
1113
1114	if (tp && tp->pos + tp->bits > 32)
1115		printk(KERN_WARNING "fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1116		       tp, tp->pos, tp->bits, key, plen);
1117
1118	/* Rebalance the trie */
1119
1120	rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1121done:
1122	return fa_head;
1123}
1124
1125/*
1126 * Caller must hold RTNL.
1127 */
1128static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
1129{
1130	struct trie *t = (struct trie *) tb->tb_data;
1131	struct fib_alias *fa, *new_fa;
1132	struct list_head *fa_head = NULL;
1133	struct fib_info *fi;
1134	int plen = cfg->fc_dst_len;
1135	u8 tos = cfg->fc_tos;
1136	u32 key, mask;
1137	int err;
1138	struct leaf *l;
1139
1140	if (plen > 32)
1141		return -EINVAL;
1142
1143	key = ntohl(cfg->fc_dst);
1144
1145	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1146
1147	mask = ntohl(inet_make_mask(plen));
1148
1149	if (key & ~mask)
1150		return -EINVAL;
1151
1152	key = key & mask;
1153
1154	fi = fib_create_info(cfg);
1155	if (IS_ERR(fi)) {
1156		err = PTR_ERR(fi);
1157		goto err;
1158	}
1159
1160	l = fib_find_node(t, key);
1161	fa = NULL;
1162
1163	if (l) {
1164		fa_head = get_fa_head(l, plen);
1165		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1166	}
1167
1168	/* Now fa, if non-NULL, points to the first fib alias
1169	 * with the same keys [prefix,tos,priority], if such key already
1170	 * exists, or to the node before which we will insert the new one.
1171	 *
1172	 * If fa is NULL, we will need to allocate a new one and
1173	 * insert it at the head of fa_head.
1174	 *
1175	 * If fa_head is NULL, no fib node matched the destination key
1176	 * and we need to allocate a new one of those as well.
1177	 */
1178
1179	if (fa && fa->fa_info->fib_priority == fi->fib_priority) {
1180		struct fib_alias *fa_orig;
1181
1182		err = -EEXIST;
1183		if (cfg->fc_nlflags & NLM_F_EXCL)
1184			goto out;
1185
1186		if (cfg->fc_nlflags & NLM_F_REPLACE) {
1187			struct fib_info *fi_drop;
1188			u8 state;
1189
1190			if (fi->fib_treeref > 1)
1191				goto out;
1192
1193			err = -ENOBUFS;
1194			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1195			if (new_fa == NULL)
1196				goto out;
1197
1198			fi_drop = fa->fa_info;
1199			new_fa->fa_tos = fa->fa_tos;
1200			new_fa->fa_info = fi;
1201			new_fa->fa_type = cfg->fc_type;
1202			new_fa->fa_scope = cfg->fc_scope;
1203			state = fa->fa_state;
1204			new_fa->fa_state &= ~FA_S_ACCESSED;
1205
1206			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1207			alias_free_mem_rcu(fa);
1208
1209			fib_release_info(fi_drop);
1210			if (state & FA_S_ACCESSED)
1211				rt_cache_flush(-1);
1212			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1213				tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1214
1215			goto succeeded;
1216		}
1217		/* Error if we find a perfect match which
1218		 * uses the same scope, type, and nexthop
1219		 * information.
1220		 */
1221		fa_orig = fa;
1222		list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
1223			if (fa->fa_tos != tos)
1224				break;
1225			if (fa->fa_info->fib_priority != fi->fib_priority)
1226				break;
1227			if (fa->fa_type == cfg->fc_type &&
1228			    fa->fa_scope == cfg->fc_scope &&
1229			    fa->fa_info == fi) {
1230				goto out;
1231			}
1232		}
1233		if (!(cfg->fc_nlflags & NLM_F_APPEND))
1234			fa = fa_orig;
1235	}
1236	err = -ENOENT;
1237	if (!(cfg->fc_nlflags & NLM_F_CREATE))
1238		goto out;
1239
1240	err = -ENOBUFS;
1241	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1242	if (new_fa == NULL)
1243		goto out;
1244
1245	new_fa->fa_info = fi;
1246	new_fa->fa_tos = tos;
1247	new_fa->fa_type = cfg->fc_type;
1248	new_fa->fa_scope = cfg->fc_scope;
1249	new_fa->fa_state = 0;
1250	/*
1251	 * Insert new entry to the list.
1252	 */
1253
1254	if (!fa_head) {
1255		fa_head = fib_insert_node(t, key, plen);
1256		if (unlikely(!fa_head)) {
1257			err = -ENOMEM;
1258			goto out_free_new_fa;
1259		}
1260	}
1261
1262	list_add_tail_rcu(&new_fa->fa_list,
1263			  (fa ? &fa->fa_list : fa_head));
1264
1265	rt_cache_flush(-1);
1266	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1267		  &cfg->fc_nlinfo, 0);
1268succeeded:
1269	return 0;
1270
1271out_free_new_fa:
1272	kmem_cache_free(fn_alias_kmem, new_fa);
1273out:
1274	fib_release_info(fi);
1275err:
1276	return err;
1277}
1278
1279
1280/* should be called with rcu_read_lock */
1281static inline int check_leaf(struct trie *t, struct leaf *l,
1282			     t_key key, int *plen, const struct flowi *flp,
1283			     struct fib_result *res)
1284{
1285	int err, i;
1286	__be32 mask;
1287	struct leaf_info *li;
1288	struct hlist_head *hhead = &l->list;
1289	struct hlist_node *node;
1290
1291	hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1292		i = li->plen;
1293		mask = inet_make_mask(i);
1294		if (l->key != (key & ntohl(mask)))
1295			continue;
1296
1297		if ((err = fib_semantic_match(&li->falh, flp, res, htonl(l->key), mask, i)) <= 0) {
1298			*plen = i;
1299#ifdef CONFIG_IP_FIB_TRIE_STATS
1300			t->stats.semantic_match_passed++;
1301#endif
1302			return err;
1303		}
1304#ifdef CONFIG_IP_FIB_TRIE_STATS
1305		t->stats.semantic_match_miss++;
1306#endif
1307	}
1308	return 1;
1309}
1310
1311static int
1312fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1313{
1314	struct trie *t = (struct trie *) tb->tb_data;
1315	int plen, ret = 0;
1316	struct node *n;
1317	struct tnode *pn;
1318	int pos, bits;
1319	t_key key = ntohl(flp->fl4_dst);
1320	int chopped_off;
1321	t_key cindex = 0;
1322	int current_prefix_length = KEYLENGTH;
1323	struct tnode *cn;
1324	t_key node_prefix, key_prefix, pref_mismatch;
1325	int mp;
1326
1327	rcu_read_lock();
1328
1329	n = rcu_dereference(t->trie);
1330	if (!n)
1331		goto failed;
1332
1333#ifdef CONFIG_IP_FIB_TRIE_STATS
1334	t->stats.gets++;
1335#endif
1336
1337	/* Just a leaf? */
1338	if (IS_LEAF(n)) {
1339		if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1340			goto found;
1341		goto failed;
1342	}
1343	pn = (struct tnode *) n;
1344	chopped_off = 0;
1345
1346	while (pn) {
1347		pos = pn->pos;
1348		bits = pn->bits;
1349
1350		if (!chopped_off)
1351			cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
1352						   pos, bits);
1353
1354		n = tnode_get_child(pn, cindex);
1355
1356		if (n == NULL) {
1357#ifdef CONFIG_IP_FIB_TRIE_STATS
1358			t->stats.null_node_hit++;
1359#endif
1360			goto backtrace;
1361		}
1362
1363		if (IS_LEAF(n)) {
1364			if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1365				goto found;
1366			else
1367				goto backtrace;
1368		}
1369
1370#define HL_OPTIMIZE
1371#ifdef HL_OPTIMIZE
1372		cn = (struct tnode *)n;
1373
1374		/*
1375		 * It's a tnode, and we can do some extra checks here if we
1376		 * like, to avoid descending into a dead-end branch.
1377		 * This tnode is in the parent's child array at index
1378		 * key[p_pos..p_pos+p_bits] but potentially with some bits
1379		 * chopped off, so in reality the index may be just a
1380		 * subprefix, padded with zero at the end.
1381		 * We can also take a look at any skipped bits in this
1382		 * tnode - everything up to p_pos is supposed to be ok,
1383		 * and the non-chopped bits of the index (see the previous
1384		 * paragraph) are also guaranteed ok, but the rest is
1385		 * considered unknown.
1386		 *
1387		 * The skipped bits are key[pos+bits..cn->pos].
1388		 */
1389
1390		/* If current_prefix_length < pos+bits, we are already doing
1391		 * actual prefix matching, which means everything from
1392		 * pos+(bits-chopped_off) onward must be zero along some
1393		 * branch of this subtree - otherwise there is *no* valid
1394		 * prefix present. Here we can only check the skipped
1395		 * bits. Remember, since we have already indexed into the
1396		 * parent's child array, we know that the bits we chopped off
1397		 * *are* zero.
1398		 */
1399
1400		/* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
1401
1402		if (current_prefix_length < pos+bits) {
1403			if (tkey_extract_bits(cn->key, current_prefix_length,
1404						cn->pos - current_prefix_length) != 0 ||
1405			    !(cn->child[0]))
1406				goto backtrace;
1407		}
1408
1409		/*
1410		 * If chopped_off=0, the index is fully validated and we
1411		 * only need to look at the skipped bits for this, the new,
1412		 * tnode. What we actually want to do is to find out if
1413		 * these skipped bits match our key perfectly, or if we will
1414		 * have to count on finding a matching prefix further down,
1415		 * because if we do, we would like to have some way of
1416		 * verifying the existence of such a prefix at this point.
1417		 */
1418
1419		/* The only thing we can do at this point is to verify that
1420		 * any such matching prefix can indeed be a prefix to our
1421		 * key, and if the bits in the node we are inspecting that
1422		 * do not match our key are not ZERO, this cannot be true.
1423		 * Thus, find out where there is a mismatch (before cn->pos)
1424		 * and verify that all the mismatching bits are zero in the
1425		 * new tnode's key.
1426		 */
1427
1428		/* Note: We aren't very concerned about the part of the key
1429		 * that precedes pn->pos+pn->bits, since those bits have already been
1430		 * checked. The bits after cn->pos aren't checked since these are
1431		 * by definition "unknown" at this point. Thus, what we want to
1432		 * see is if we are about to enter the "prefix matching" state,
1433		 * and in that case verify that the skipped bits that will prevail
1434		 * throughout this subtree are zero, as they have to be if we are
1435		 * to find a matching prefix.
1436		 */
1437
1438		node_prefix = mask_pfx(cn->key, cn->pos);
1439		key_prefix = mask_pfx(key, cn->pos);
1440		pref_mismatch = key_prefix^node_prefix;
1441		mp = 0;
1442
1443		/* In short: If skipped bits in this node do not match the search
1444		 * key, enter the "prefix matching" state directly.
1445		 */
1446		if (pref_mismatch) {
1447			while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1448				mp++;
1449				pref_mismatch = pref_mismatch <<1;
1450			}
1451			key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1452
1453			if (key_prefix != 0)
1454				goto backtrace;
1455
1456			if (current_prefix_length >= cn->pos)
1457				current_prefix_length = mp;
1458		}
1459#endif
1460		pn = (struct tnode *)n; /* Descend */
1461		chopped_off = 0;
1462		continue;
1463
1464backtrace:
1465		chopped_off++;
1466
1467		/* Chopping off a zero bit does not change the child key (cindex), so keep going */
1468		while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1))))
1469			chopped_off++;
1470
1471		/* Decrease current_... with bits chopped off */
1472		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1473			current_prefix_length = pn->pos + pn->bits - chopped_off;
1474
1475		/*
1476		 * Either we do the actual chop off, or, if we have chopped
1477		 * off all bits in this tnode, we walk up to our parent.
1478		 */
1479
1480		if (chopped_off <= pn->bits) {
1481			cindex &= ~(1 << (chopped_off-1));
1482		} else {
1483			struct tnode *parent = node_parent((struct node *) pn);
1484			if (!parent)
1485				goto failed;
1486
1487			/* Get Child's index */
1488			cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
1489			pn = parent;
1490			chopped_off = 0;
1491
1492#ifdef CONFIG_IP_FIB_TRIE_STATS
1493			t->stats.backtrack++;
1494#endif
1495			goto backtrace;
1496		}
1497	}
1498failed:
1499	ret = 1;
1500found:
1501	rcu_read_unlock();
1502	return ret;
1503}
1504
1505/* only called from updater side */
1506static int trie_leaf_remove(struct trie *t, t_key key)
1507{
1508	t_key cindex;
1509	struct tnode *tp = NULL;
1510	struct node *n = t->trie;
1511	struct leaf *l;
1512
1513	pr_debug("entering trie_leaf_remove(%p)\n", n);
1514
1515	/* Note that in the case of skipped bits, those bits are *not* checked!
1516	 * When we finish this, we will have NULL or a T_LEAF, and the
1517	 * T_LEAF may or may not match our key.
1518	 */
1519
1520	while (n != NULL && IS_TNODE(n)) {
1521		struct tnode *tn = (struct tnode *) n;
1522		check_tnode(tn);
1523		n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
1524
1525		BUG_ON(n && node_parent(n) != tn);
1526	}
1527	l = (struct leaf *) n;
1528
1529	if (!n || !tkey_equals(l->key, key))
1530		return 0;
1531
1532	/*
1533	 * Key found.
1534	 * Remove the leaf and rebalance the tree
1535	 */
1536
1537	t->size--;
1538
1539	tp = node_parent(n);
1540	tnode_free((struct tnode *) n);
1541
1542	if (tp) {
1543		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1544		put_child(t, (struct tnode *)tp, cindex, NULL);
1545		rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1546	} else
1547		rcu_assign_pointer(t->trie, NULL);
1548
1549	return 1;
1550}
1551
1552/*
1553 * Caller must hold RTNL.
1554 */
1555static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg)
1556{
1557	struct trie *t = (struct trie *) tb->tb_data;
1558	u32 key, mask;
1559	int plen = cfg->fc_dst_len;
1560	u8 tos = cfg->fc_tos;
1561	struct fib_alias *fa, *fa_to_delete;
1562	struct list_head *fa_head;
1563	struct leaf *l;
1564	struct leaf_info *li;
1565
1566	if (plen > 32)
1567		return -EINVAL;
1568
1569	key = ntohl(cfg->fc_dst);
1570	mask = ntohl(inet_make_mask(plen));
1571
1572	if (key & ~mask)
1573		return -EINVAL;
1574
1575	key = key & mask;
1576	l = fib_find_node(t, key);
1577
1578	if (!l)
1579		return -ESRCH;
1580
1581	fa_head = get_fa_head(l, plen);
1582	fa = fib_find_alias(fa_head, tos, 0);
1583
1584	if (!fa)
1585		return -ESRCH;
1586
1587	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1588
1589	fa_to_delete = NULL;
1590	fa_head = fa->fa_list.prev;
1591
1592	list_for_each_entry(fa, fa_head, fa_list) {
1593		struct fib_info *fi = fa->fa_info;
1594
1595		if (fa->fa_tos != tos)
1596			break;
1597
1598		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1599		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1600		     fa->fa_scope == cfg->fc_scope) &&
1601		    (!cfg->fc_protocol ||
1602		     fi->fib_protocol == cfg->fc_protocol) &&
1603		    fib_nh_match(cfg, fi) == 0) {
1604			fa_to_delete = fa;
1605			break;
1606		}
1607	}
1608
1609	if (!fa_to_delete)
1610		return -ESRCH;
1611
1612	fa = fa_to_delete;
1613	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1614		  &cfg->fc_nlinfo, 0);
1615
1616	l = fib_find_node(t, key);
1617	li = find_leaf_info(l, plen);
1618
1619	list_del_rcu(&fa->fa_list);
1620
1621	if (list_empty(fa_head)) {
1622		hlist_del_rcu(&li->hlist);
1623		free_leaf_info(li);
1624	}
1625
1626	if (hlist_empty(&l->list))
1627		trie_leaf_remove(t, key);
1628
1629	if (fa->fa_state & FA_S_ACCESSED)
1630		rt_cache_flush(-1);
1631
1632	fib_release_info(fa->fa_info);
1633	alias_free_mem_rcu(fa);
1634	return 0;
1635}
1636
1637static int trie_flush_list(struct trie *t, struct list_head *head)
1638{
1639	struct fib_alias *fa, *fa_node;
1640	int found = 0;
1641
1642	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1643		struct fib_info *fi = fa->fa_info;
1644
1645		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1646			list_del_rcu(&fa->fa_list);
1647			fib_release_info(fa->fa_info);
1648			alias_free_mem_rcu(fa);
1649			found++;
1650		}
1651	}
1652	return found;
1653}
1654
1655static int trie_flush_leaf(struct trie *t, struct leaf *l)
1656{
1657	int found = 0;
1658	struct hlist_head *lih = &l->list;
1659	struct hlist_node *node, *tmp;
1660	struct leaf_info *li = NULL;
1661
1662	hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1663		found += trie_flush_list(t, &li->falh);
1664
1665		if (list_empty(&li->falh)) {
1666			hlist_del_rcu(&li->hlist);
1667			free_leaf_info(li);
1668		}
1669	}
1670	return found;
1671}
1672
1673/* rcu_read_lock must be held by the caller on the read side */
1674
1675static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1676{
1677	struct node *c = (struct node *) thisleaf;
1678	struct tnode *p;
1679	int idx;
1680	struct node *trie = rcu_dereference(t->trie);
1681
1682	if (c == NULL) {
1683		if (trie == NULL)
1684			return NULL;
1685
1686		if (IS_LEAF(trie))          /* trie w. just a leaf */
1687			return (struct leaf *) trie;
1688
1689		p = (struct tnode*) trie;  /* Start */
1690	} else
1691		p = node_parent(c);
1692
1693	while (p) {
1694		int pos, last;
1695
1696		/*  Find the next child of the parent */
1697		if (c)
1698			pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
1699		else
1700			pos = 0;
1701
1702		last = 1 << p->bits;
1703		for (idx = pos; idx < last ; idx++) {
1704			c = rcu_dereference(p->child[idx]);
1705
1706			if (!c)
1707				continue;
1708
1709			/* Descend if tnode */
1710			while (IS_TNODE(c)) {
1711				p = (struct tnode *) c;
1712				idx = 0;
1713
1714				/* First (leftmost) non-NULL branch */
1715				if (p && IS_TNODE(p))
1716					while (idx < (1<<p->bits) &&
1717					       !(c = rcu_dereference(p->child[idx]))) idx++;
1718
1719				/* Done with this tnode? */
1720				if (idx >= (1 << p->bits) || !c)
1721					goto up;
1722			}
1723			return (struct leaf *) c;
1724		}
1725up:
1726		/* No more children, go up one step */
1727		c = (struct node *) p;
1728		p = node_parent(c);
1729	}
1730	return NULL; /* Ready. Root of trie */
1731}
1732
1733/*
1734 * Caller must hold RTNL.
1735 */
1736static int fn_trie_flush(struct fib_table *tb)
1737{
1738	struct trie *t = (struct trie *) tb->tb_data;
1739	struct leaf *ll = NULL, *l = NULL;
1740	int found = 0, h;
1741
1742	for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1743		found += trie_flush_leaf(t, l);
1744
1745		if (ll && hlist_empty(&ll->list))
1746			trie_leaf_remove(t, ll->key);
1747		ll = l;
1748	}
1749
1750	if (ll && hlist_empty(&ll->list))
1751		trie_leaf_remove(t, ll->key);
1752
1753	pr_debug("trie_flush found=%d\n", found);
1754	return found;
1755}
1756
1757static void
1758fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1759{
1760	struct trie *t = (struct trie *) tb->tb_data;
1761	int order, last_idx;
1762	struct fib_info *fi = NULL;
1763	struct fib_info *last_resort;
1764	struct fib_alias *fa = NULL;
1765	struct list_head *fa_head;
1766	struct leaf *l;
1767
1768	last_idx = -1;
1769	last_resort = NULL;
1770	order = -1;
1771
1772	rcu_read_lock();
1773
1774	l = fib_find_node(t, 0);
1775	if (!l)
1776		goto out;
1777
1778	fa_head = get_fa_head(l, 0);
1779	if (!fa_head)
1780		goto out;
1781
1782	if (list_empty(fa_head))
1783		goto out;
1784
1785	list_for_each_entry_rcu(fa, fa_head, fa_list) {
1786		struct fib_info *next_fi = fa->fa_info;
1787
1788		if (fa->fa_scope != res->scope ||
1789		    fa->fa_type != RTN_UNICAST)
1790			continue;
1791
1792		if (next_fi->fib_priority > res->fi->fib_priority)
1793			break;
1794		if (!next_fi->fib_nh[0].nh_gw ||
1795		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1796			continue;
1797		fa->fa_state |= FA_S_ACCESSED;
1798
1799		if (fi == NULL) {
1800			if (next_fi != res->fi)
1801				break;
1802		} else if (!fib_detect_death(fi, order, &last_resort,
1803					     &last_idx, tb->tb_default)) {
1804			fib_result_assign(res, fi);
1805			tb->tb_default = order;
1806			goto out;
1807		}
1808		fi = next_fi;
1809		order++;
1810	}
1811	if (order <= 0 || fi == NULL) {
1812		tb->tb_default = -1;
1813		goto out;
1814	}
1815
1816	if (!fib_detect_death(fi, order, &last_resort, &last_idx,
1817				tb->tb_default)) {
1818		fib_result_assign(res, fi);
1819		tb->tb_default = order;
1820		goto out;
1821	}
1822	if (last_idx >= 0)
1823		fib_result_assign(res, last_resort);
1824	tb->tb_default = last_idx;
1825out:
1826	rcu_read_unlock();
1827}
1828
1829static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
1830			   struct sk_buff *skb, struct netlink_callback *cb)
1831{
1832	int i, s_i;
1833	struct fib_alias *fa;
1834
1835	__be32 xkey = htonl(key);
1836
1837	s_i = cb->args[4];
1838	i = 0;
1839
1840	/* rcu_read_lock is held by the caller */
1841
1842	list_for_each_entry_rcu(fa, fah, fa_list) {
1843		if (i < s_i) {
1844			i++;
1845			continue;
1846		}
1847		BUG_ON(!fa->fa_info);
1848
1849		if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1850				  cb->nlh->nlmsg_seq,
1851				  RTM_NEWROUTE,
1852				  tb->tb_id,
1853				  fa->fa_type,
1854				  fa->fa_scope,
1855				  xkey,
1856				  plen,
1857				  fa->fa_tos,
1858				  fa->fa_info, 0) < 0) {
1859			cb->args[4] = i;
1860			return -1;
1861		}
1862		i++;
1863	}
1864	cb->args[4] = i;
1865	return skb->len;
1866}
1867
1868static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
1869			     struct netlink_callback *cb)
1870{
1871	int h, s_h;
1872	struct list_head *fa_head;
1873	struct leaf *l = NULL;
1874
1875	s_h = cb->args[3];
1876
1877	for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1878		if (h < s_h)
1879			continue;
1880		if (h > s_h)
1881			memset(&cb->args[4], 0,
1882			       sizeof(cb->args) - 4*sizeof(cb->args[0]));
1883
1884		fa_head = get_fa_head(l, plen);
1885
1886		if (!fa_head)
1887			continue;
1888
1889		if (list_empty(fa_head))
1890			continue;
1891
1892		if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
1893			cb->args[3] = h;
1894			return -1;
1895		}
1896	}
1897	cb->args[3] = h;
1898	return skb->len;
1899}
1900
1901static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
1902{
1903	int m, s_m;
1904	struct trie *t = (struct trie *) tb->tb_data;
1905
1906	s_m = cb->args[2];
1907
1908	rcu_read_lock();
1909	for (m = 0; m <= 32; m++) {
1910		if (m < s_m)
1911			continue;
1912		if (m > s_m)
1913			memset(&cb->args[3], 0,
1914				sizeof(cb->args) - 3*sizeof(cb->args[0]));
1915
1916		if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
1917			cb->args[2] = m;
1918			goto out;
1919		}
1920	}
1921	rcu_read_unlock();
1922	cb->args[2] = m;
1923	return skb->len;
1924out:
1925	rcu_read_unlock();
1926	return -1;
1927}
1928
1929/* Fix more generic FIB names for init later */
1930
1931struct fib_table *fib_hash_init(u32 id)
1932{
1933	struct fib_table *tb;
1934	struct trie *t;
1935
1936	if (fn_alias_kmem == NULL)
1937		fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1938						  sizeof(struct fib_alias),
1939						  0, SLAB_HWCACHE_ALIGN,
1940						  NULL);
1941
1942	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1943		     GFP_KERNEL);
1944	if (tb == NULL)
1945		return NULL;
1946
1947	tb->tb_id = id;
1948	tb->tb_default = -1;
1949	tb->tb_lookup = fn_trie_lookup;
1950	tb->tb_insert = fn_trie_insert;
1951	tb->tb_delete = fn_trie_delete;
1952	tb->tb_flush = fn_trie_flush;
1953	tb->tb_select_default = fn_trie_select_default;
1954	tb->tb_dump = fn_trie_dump;
1955
1956	t = (struct trie *) tb->tb_data;
1957	memset(t, 0, sizeof(*t));
1958
1959	if (id == RT_TABLE_LOCAL)
1960		printk(KERN_INFO "IPv4 FIB: Using LC-trie version %s\n", VERSION);
1961
1962	return tb;
1963}
1964
1965#ifdef CONFIG_PROC_FS
1966/* Depth first Trie walk iterator */
1967struct fib_trie_iter {
1968	struct seq_net_private p;
1969	struct trie *trie_local, *trie_main;
1970	struct tnode *tnode;
1971	struct trie *trie;
1972	unsigned index;
1973	unsigned depth;
1974};
1975
1976static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
1977{
1978	struct tnode *tn = iter->tnode;
1979	unsigned cindex = iter->index;
1980	struct tnode *p;
1981
1982	/* A single entry routing table */
1983	if (!tn)
1984		return NULL;
1985
1986	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
1987		 iter->tnode, iter->index, iter->depth);
1988rescan:
1989	while (cindex < (1<<tn->bits)) {
1990		struct node *n = tnode_get_child(tn, cindex);
1991
1992		if (n) {
1993			if (IS_LEAF(n)) {
1994				iter->tnode = tn;
1995				iter->index = cindex + 1;
1996			} else {
1997				/* push down one level */
1998				iter->tnode = (struct tnode *) n;
1999				iter->index = 0;
2000				++iter->depth;
2001			}
2002			return n;
2003		}
2004
2005		++cindex;
2006	}
2007
2008	/* Current node exhausted, pop back up */
2009	p = node_parent((struct node *)tn);
2010	if (p) {
2011		cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2012		tn = p;
2013		--iter->depth;
2014		goto rescan;
2015	}
2016
2017	/* got root? */
2018	return NULL;
2019}
2020
2021static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2022				       struct trie *t)
2023{
2024	struct node *n;
2025
2026	if (!t)
2027		return NULL;
2028
2029	n = rcu_dereference(t->trie);
2030
2031	if (!iter)
2032		return NULL;
2033
2034	if (n) {
2035		if (IS_TNODE(n)) {
2036			iter->tnode = (struct tnode *) n;
2037			iter->trie = t;
2038			iter->index = 0;
2039			iter->depth = 1;
2040		} else {
2041			iter->tnode = NULL;
2042			iter->trie  = t;
2043			iter->index = 0;
2044			iter->depth = 0;
2045		}
2046		return n;
2047	}
2048	return NULL;
2049}
2050
2051static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2052{
2053	struct node *n;
2054	struct fib_trie_iter iter;
2055
2056	memset(s, 0, sizeof(*s));
2057
2058	rcu_read_lock();
2059	for (n = fib_trie_get_first(&iter, t); n;
2060	     n = fib_trie_get_next(&iter)) {
2061		if (IS_LEAF(n)) {
2062			s->leaves++;
2063			s->totdepth += iter.depth;
2064			if (iter.depth > s->maxdepth)
2065				s->maxdepth = iter.depth;
2066		} else {
2067			const struct tnode *tn = (const struct tnode *) n;
2068			int i;
2069
2070			s->tnodes++;
2071			if (tn->bits < MAX_STAT_DEPTH)
2072				s->nodesizes[tn->bits]++;
2073
2074			for (i = 0; i < (1<<tn->bits); i++)
2075				if (!tn->child[i])
2076					s->nullpointers++;
2077		}
2078	}
2079	rcu_read_unlock();
2080}
2081
2082/*
2083 *	This outputs /proc/net/fib_triestats
2084 */
2085static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2086{
2087	unsigned i, max, pointers, bytes, avdepth;
2088
2089	if (stat->leaves)
2090		avdepth = stat->totdepth*100 / stat->leaves;
2091	else
2092		avdepth = 0;
2093
2094	seq_printf(seq, "\tAver depth:     %u.%02u\n", avdepth / 100, avdepth % 100);
2095	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
2096
2097	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
2098
2099	bytes = sizeof(struct leaf) * stat->leaves;
2100	seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2101	bytes += sizeof(struct tnode) * stat->tnodes;
2102
2103	max = MAX_STAT_DEPTH;
2104	while (max > 0 && stat->nodesizes[max-1] == 0)
2105		max--;
2106
2107	pointers = 0;
2108	for (i = 1; i < max; i++)
2109		if (stat->nodesizes[i] != 0) {
2110			seq_printf(seq, "  %u: %u", i, stat->nodesizes[i]);
2111			pointers += (1<<i) * stat->nodesizes[i];
2112		}
2113	seq_putc(seq, '\n');
2114	seq_printf(seq, "\tPointers: %u\n", pointers);
2115
2116	bytes += sizeof(struct node *) * pointers;
2117	seq_printf(seq, "\tNull ptrs: %u\n", stat->nullpointers);
2118	seq_printf(seq, "\tTotal size: %u  kB\n", (bytes + 1023) / 1024);
2119}
2120
2121#ifdef CONFIG_IP_FIB_TRIE_STATS
2122static void trie_show_usage(struct seq_file *seq,
2123			    const struct trie_use_stats *stats)
2124{
2125	seq_printf(seq, "\nCounters:\n---------\n");
2126	seq_printf(seq, "gets = %u\n", stats->gets);
2127	seq_printf(seq, "backtracks = %u\n", stats->backtrack);
2128	seq_printf(seq, "semantic match passed = %u\n", stats->semantic_match_passed);
2129	seq_printf(seq, "semantic match miss = %u\n", stats->semantic_match_miss);
2130	seq_printf(seq, "null node hit = %u\n", stats->null_node_hit);
2131	seq_printf(seq, "skipped node resize = %u\n\n", stats->resize_node_skipped);
2132}
2133#endif /*  CONFIG_IP_FIB_TRIE_STATS */
2134
2135
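/*
 * Single-shot show handler for /proc/net/fib_triestats: collect and print
 * statistics for the local and main tables of the namespace.
 */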
2136static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2137{
2138	struct net *net = (struct net *)seq->private;
2139	struct trie *trie_local, *trie_main;
2140	struct trie_stat *stat;
2141	struct fib_table *tb;
2142
2143	trie_local = NULL;
2144	tb = fib_get_table(net, RT_TABLE_LOCAL);
2145	if (tb)
2146		trie_local = (struct trie *) tb->tb_data;
2147
2148	trie_main = NULL;
2149	tb = fib_get_table(net, RT_TABLE_MAIN);
2150	if (tb)
2151		trie_main = (struct trie *) tb->tb_data;
2152
2153
2154	stat = kmalloc(sizeof(*stat), GFP_KERNEL);
2155	if (!stat)
2156		return -ENOMEM;
2157
2158	seq_printf(seq, "Basic info: size of leaf: %zd bytes, size of tnode: %zd bytes.\n",
2159		   sizeof(struct leaf), sizeof(struct tnode));
2160
2161	if (trie_local) {
2162		seq_printf(seq, "Local:\n");
2163		trie_collect_stats(trie_local, stat);
2164		trie_show_stats(seq, stat);
2165#ifdef CONFIG_IP_FIB_TRIE_STATS
2166		trie_show_usage(seq, &trie_local->stats);
2167#endif
2168	}
2169
2170	if (trie_main) {
2171		seq_printf(seq, "Main:\n");
2172		trie_collect_stats(trie_main, stat);
2173		trie_show_stats(seq, stat);
2174#ifdef CONFIG_IP_FIB_TRIE_STATS
2175		trie_show_usage(seq, &trie_main->stats);
2176#endif
2177	}
2178	kfree(stat);
2179
2180	return 0;
2181}
2182
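/*
 * The open handler pins the network namespace for the lifetime of the
 * file; the matching release below drops the reference again.
 */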
2183static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2184{
2185	int err;
2186	struct net *net;
2187
2188	net = get_proc_net(inode);
2189	if (net == NULL)
2190		return -ENXIO;
2191	err = single_open(file, fib_triestat_seq_show, net);
2192	if (err < 0) {
2193		put_net(net);
2194		return err;
2195	}
2196	return 0;
2197}
2198
2199static int fib_triestat_seq_release(struct inode *ino, struct file *f)
2200{
2201	struct seq_file *seq = f->private_data;
2202	put_net(seq->private);
2203	return single_release(ino, f);
2204}
2205
2206static const struct file_operations fib_triestat_fops = {
2207	.owner	= THIS_MODULE,
2208	.open	= fib_triestat_seq_open,
2209	.read	= seq_read,
2210	.llseek	= seq_lseek,
2211	.release = fib_triestat_seq_release,
2212};
2213
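/*
 * Return the node at position @pos, counting depth-first across the local
 * trie followed by the main trie, or NULL if @pos is past the end.
 */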
2214static struct node *fib_trie_get_idx(struct fib_trie_iter *iter,
2215				      loff_t pos)
2216{
2217	loff_t idx = 0;
2218	struct node *n;
2219
2220	for (n = fib_trie_get_first(iter, iter->trie_local);
2221	     n; ++idx, n = fib_trie_get_next(iter)) {
2222		if (pos == idx)
2223			return n;
2224	}
2225
2226	for (n = fib_trie_get_first(iter, iter->trie_main);
2227	     n; ++idx, n = fib_trie_get_next(iter)) {
2228		if (pos == idx)
2229			return n;
2230	}
2231	return NULL;
2232}
2233
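/*
 * seq_file start handler: look up the local and main tables of the
 * namespace on first use, take the RCU read lock for the duration of the
 * dump and return SEQ_START_TOKEN for position 0.
 */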
2234static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2235	__acquires(RCU)
2236{
2237	struct fib_trie_iter *iter = seq->private;
2238	struct fib_table *tb;
2239
2240	if (!iter->trie_local) {
2241		tb = fib_get_table(iter->p.net, RT_TABLE_LOCAL);
2242		if (tb)
2243			iter->trie_local = (struct trie *) tb->tb_data;
2244	}
2245	if (!iter->trie_main) {
2246		tb = fib_get_table(iter->p.net, RT_TABLE_MAIN);
2247		if (tb)
2248			iter->trie_main = (struct trie *) tb->tb_data;
2249	}
2250	rcu_read_lock();
2251	if (*pos == 0)
2252		return SEQ_START_TOKEN;
2253	return fib_trie_get_idx(iter, *pos - 1);
2254}
2255
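/*
 * seq_file next handler: advance the depth-first walk, switching from the
 * local trie to the main trie once the former is exhausted.
 */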
2256static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2257{
2258	struct fib_trie_iter *iter = seq->private;
2259	void *l = v;
2260
2261	++*pos;
2262	if (v == SEQ_START_TOKEN)
2263		return fib_trie_get_idx(iter, 0);
2264
2265	v = fib_trie_get_next(iter);
2266	BUG_ON(v == l);
2267	if (v)
2268		return v;
2269
2270	/* continue scan in next trie */
2271	if (iter->trie == iter->trie_local)
2272		return fib_trie_get_first(iter, iter->trie_main);
2273
2274	return NULL;
2275}
2276
2277static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2278	__releases(RCU)
2279{
2280	rcu_read_unlock();
2281}
2282
2283static void seq_indent(struct seq_file *seq, int n)
2284{
2285	while (n-- > 0) seq_puts(seq, "   ");
2286}
2287
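/*
 * rtn_scope() and rtn_type() below map scope and route type values to
 * short names for the dump.  Unknown values are formatted into a static
 * buffer, so the returned string is only valid until the next call.
 */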
2288static inline const char *rtn_scope(enum rt_scope_t s)
2289{
2290	static char buf[32];
2291
2292	switch (s) {
2293	case RT_SCOPE_UNIVERSE: return "universe";
2294	case RT_SCOPE_SITE:	return "site";
2295	case RT_SCOPE_LINK:	return "link";
2296	case RT_SCOPE_HOST:	return "host";
2297	case RT_SCOPE_NOWHERE:	return "nowhere";
2298	default:
2299		snprintf(buf, sizeof(buf), "scope=%d", s);
2300		return buf;
2301	}
2302}
2303
2304static const char *rtn_type_names[__RTN_MAX] = {
2305	[RTN_UNSPEC] = "UNSPEC",
2306	[RTN_UNICAST] = "UNICAST",
2307	[RTN_LOCAL] = "LOCAL",
2308	[RTN_BROADCAST] = "BROADCAST",
2309	[RTN_ANYCAST] = "ANYCAST",
2310	[RTN_MULTICAST] = "MULTICAST",
2311	[RTN_BLACKHOLE] = "BLACKHOLE",
2312	[RTN_UNREACHABLE] = "UNREACHABLE",
2313	[RTN_PROHIBIT] = "PROHIBIT",
2314	[RTN_THROW] = "THROW",
2315	[RTN_NAT] = "NAT",
2316	[RTN_XRESOLVE] = "XRESOLVE",
2317};
2318
2319static inline const char *rtn_type(unsigned t)
2320{
2321	static char buf[32];
2322
2323	if (t < __RTN_MAX && rtn_type_names[t])
2324		return rtn_type_names[t];
2325	snprintf(buf, sizeof(buf), "type %u", t);
2326	return buf;
2327}
2328
2329/* Pretty print the trie */
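/*
 * Internal nodes are printed as "+-- <prefix>/<pos> <bits> <full> <empty>"
 * and leaves as "|-- <address>", followed by one "/<plen> <scope> <type>"
 * line per alias, indented by trie depth.
 */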
2330static int fib_trie_seq_show(struct seq_file *seq, void *v)
2331{
2332	const struct fib_trie_iter *iter = seq->private;
2333	struct node *n = v;
2334
2335	if (v == SEQ_START_TOKEN)
2336		return 0;
2337
2338	if (!node_parent(n)) {
2339		if (iter->trie == iter->trie_local)
2340			seq_puts(seq, "<local>:\n");
2341		else
2342			seq_puts(seq, "<main>:\n");
2343	}
2344
2345	if (IS_TNODE(n)) {
2346		struct tnode *tn = (struct tnode *) n;
2347		__be32 prf = htonl(mask_pfx(tn->key, tn->pos));
2348
2349		seq_indent(seq, iter->depth-1);
2350		seq_printf(seq, "  +-- %d.%d.%d.%d/%d %d %d %d\n",
2351			   NIPQUAD(prf), tn->pos, tn->bits, tn->full_children,
2352			   tn->empty_children);
2353
2354	} else {
2355		struct leaf *l = (struct leaf *) n;
2356		int i;
2357		__be32 val = htonl(l->key);
2358
2359		seq_indent(seq, iter->depth);
2360		seq_printf(seq, "  |-- %d.%d.%d.%d\n", NIPQUAD(val));
2361		for (i = 32; i >= 0; i--) {
2362			struct leaf_info *li = find_leaf_info(l, i);
2363			if (li) {
2364				struct fib_alias *fa;
2365				list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2366					seq_indent(seq, iter->depth+1);
2367					seq_printf(seq, "  /%d %s %s", i,
2368						   rtn_scope(fa->fa_scope),
2369						   rtn_type(fa->fa_type));
2370					if (fa->fa_tos)
2371					seq_printf(seq, " tos=%d",
2372						   fa->fa_tos);
2373					seq_putc(seq, '\n');
2374				}
2375			}
2376		}
2377	}
2378
2379	return 0;
2380}
2381
2382static const struct seq_operations fib_trie_seq_ops = {
2383	.start  = fib_trie_seq_start,
2384	.next   = fib_trie_seq_next,
2385	.stop   = fib_trie_seq_stop,
2386	.show   = fib_trie_seq_show,
2387};
2388
2389static int fib_trie_seq_open(struct inode *inode, struct file *file)
2390{
2391	return seq_open_net(inode, file, &fib_trie_seq_ops,
2392			    sizeof(struct fib_trie_iter));
2393}
2394
2395static const struct file_operations fib_trie_fops = {
2396	.owner  = THIS_MODULE,
2397	.open   = fib_trie_seq_open,
2398	.read   = seq_read,
2399	.llseek = seq_lseek,
2400	.release = seq_release_net,
2401};
2402
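/*
 * Translate a route type and netmask into the legacy RTF_* flag bits
 * reported in /proc/net/route: reject routes get RTF_REJECT, routes with
 * a gateway RTF_GATEWAY, host routes RTF_HOST, and all entries RTF_UP.
 */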
2403static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2404{
2405	static unsigned type2flags[RTN_MAX + 1] = {
2406		[RTN_UNREACHABLE] = RTF_REJECT, [RTN_PROHIBIT] = RTF_REJECT,
2407	};
2408	unsigned flags = type2flags[type];
2409
2410	if (fi && fi->fib_nh->nh_gw)
2411		flags |= RTF_GATEWAY;
2412	if (mask == htonl(0xFFFFFFFF))
2413		flags |= RTF_HOST;
2414	flags |= RTF_UP;
2415	return flags;
2416}
2417
2418/*
2419 *	This outputs /proc/net/route.
2420 *	The format of the file is not supposed to be changed
2421 *	and needs to be the same as the fib_hash output to avoid
2422 *	breaking legacy utilities.
2423 */
2424static int fib_route_seq_show(struct seq_file *seq, void *v)
2425{
2426	const struct fib_trie_iter *iter = seq->private;
2427	struct leaf *l = v;
2428	int i;
2429	char bf[128];
2430
2431	if (v == SEQ_START_TOKEN) {
2432		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2433			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2434			   "\tWindow\tIRTT");
2435		return 0;
2436	}
2437
2438	if (iter->trie == iter->trie_local)
2439		return 0;
2440	if (IS_TNODE(l))
2441		return 0;
2442
2443	for (i = 32; i >= 0; i--) {
2444		struct leaf_info *li = find_leaf_info(l, i);
2445		struct fib_alias *fa;
2446		__be32 mask, prefix;
2447
2448		if (!li)
2449			continue;
2450
2451		mask = inet_make_mask(li->plen);
2452		prefix = htonl(l->key);
2453
2454		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2455			const struct fib_info *fi = fa->fa_info;
2456			unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
2457
2458			if (fa->fa_type == RTN_BROADCAST
2459			    || fa->fa_type == RTN_MULTICAST)
2460				continue;
2461
2462			if (fi)
2463				snprintf(bf, sizeof(bf),
2464					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2465					 fi->fib_dev ? fi->fib_dev->name : "*",
2466					 prefix,
2467					 fi->fib_nh->nh_gw, flags, 0, 0,
2468					 fi->fib_priority,
2469					 mask,
2470					 (fi->fib_advmss ? fi->fib_advmss + 40 : 0),
2471					 fi->fib_window,
2472					 fi->fib_rtt >> 3);
2473			else
2474				snprintf(bf, sizeof(bf),
2475					 "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2476					 prefix, 0, flags, 0, 0, 0,
2477					 mask, 0, 0, 0);
2478
2479			seq_printf(seq, "%-127s\n", bf);
2480		}
2481	}
2482
2483	return 0;
2484}
2485
2486static const struct seq_operations fib_route_seq_ops = {
2487	.start  = fib_trie_seq_start,
2488	.next   = fib_trie_seq_next,
2489	.stop   = fib_trie_seq_stop,
2490	.show   = fib_route_seq_show,
2491};
2492
2493static int fib_route_seq_open(struct inode *inode, struct file *file)
2494{
2495	return seq_open_net(inode, file, &fib_route_seq_ops,
2496			    sizeof(struct fib_trie_iter));
2497}
2498
2499static const struct file_operations fib_route_fops = {
2500	.owner  = THIS_MODULE,
2501	.open   = fib_route_seq_open,
2502	.read   = seq_read,
2503	.llseek = seq_lseek,
2504	.release = seq_release_net,
2505};
2506
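/*
 * Register the per-namespace proc entries (fib_trie, fib_triestat and
 * route), unwinding any already created if a later registration fails.
 */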
2507int __net_init fib_proc_init(struct net *net)
2508{
2509	if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
2510		goto out1;
2511
2512	if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
2513				  &fib_triestat_fops))
2514		goto out2;
2515
2516	if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
2517		goto out3;
2518
2519	return 0;
2520
2521out3:
2522	proc_net_remove(net, "fib_triestat");
2523out2:
2524	proc_net_remove(net, "fib_trie");
2525out1:
2526	return -ENOMEM;
2527}
2528
2529void __net_exit fib_proc_exit(struct net *net)
2530{
2531	proc_net_remove(net, "fib_trie");
2532	proc_net_remove(net, "fib_triestat");
2533	proc_net_remove(net, "route");
2534}
2535
2536#endif /* CONFIG_PROC_FS */
2537