fib_trie.c revision 550e29bc96e6f1ced2bca82dace197b009434367
1/*
2 *   This program is free software; you can redistribute it and/or
3 *   modify it under the terms of the GNU General Public License
4 *   as published by the Free Software Foundation; either version
5 *   2 of the License, or (at your option) any later version.
6 *
7 *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 *     & Swedish University of Agricultural Sciences.
9 *
10 *   Jens Laas <jens.laas@data.slu.se> Swedish University of
11 *     Agricultural Sciences.
12 *
13 *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
14 *
15 * This work is based on the LPC-trie which is originally described in:
16 *
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
25 * Version:	$Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
26 *
27 *
28 * Code from fib_hash has been reused which includes the following header:
29 *
30 *
31 * INET		An implementation of the TCP/IP protocol suite for the LINUX
32 *		operating system.  INET is implemented using the  BSD Socket
33 *		interface as the means of communication with the user level.
34 *
35 *		IPv4 FIB: lookup engine and maintenance routines.
36 *
37 *
38 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
39 *
40 *		This program is free software; you can redistribute it and/or
41 *		modify it under the terms of the GNU General Public License
42 *		as published by the Free Software Foundation; either version
43 *		2 of the License, or (at your option) any later version.
44 *
45 * Substantial contributions to this work come from:
46 *
47 *		David S. Miller, <davem@davemloft.net>
48 *		Stephen Hemminger <shemminger@osdl.org>
49 *		Paul E. McKenney <paulmck@us.ibm.com>
50 *		Patrick McHardy <kaber@trash.net>
51 */
52
53#define VERSION "0.407"
54
55#include <linux/config.h>
56#include <asm/uaccess.h>
57#include <asm/system.h>
58#include <asm/bitops.h>
59#include <linux/types.h>
60#include <linux/kernel.h>
61#include <linux/sched.h>
62#include <linux/mm.h>
63#include <linux/string.h>
64#include <linux/socket.h>
65#include <linux/sockios.h>
66#include <linux/errno.h>
67#include <linux/in.h>
68#include <linux/inet.h>
69#include <linux/inetdevice.h>
70#include <linux/netdevice.h>
71#include <linux/if_arp.h>
72#include <linux/proc_fs.h>
73#include <linux/rcupdate.h>
74#include <linux/skbuff.h>
75#include <linux/netlink.h>
76#include <linux/init.h>
77#include <linux/list.h>
78#include <net/ip.h>
79#include <net/protocol.h>
80#include <net/route.h>
81#include <net/tcp.h>
82#include <net/sock.h>
83#include <net/ip_fib.h>
84#include "fib_lookup.h"
85
86#undef CONFIG_IP_FIB_TRIE_STATS
87#define MAX_STAT_DEPTH 32
88
89#define KEYLENGTH (8*sizeof(t_key))
90#define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l))
91#define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset))
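/*
 * Keys are handled with the network prefix in the most significant bits
 * (addresses are converted with ntohl() before use).  MASK_PFX(k, l) keeps
 * the l most significant bits of k and clears the rest, and
 * TKEY_GET_MASK(offset, bits) builds a mask of 'bits' consecutive one-bits
 * starting 'offset' bits below the MSB; for 32-bit keys,
 * TKEY_GET_MASK(7, 3) == 0x01c00000, for example.
 */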
92
93typedef unsigned int t_key;
94
95#define T_TNODE 0
96#define T_LEAF  1
97#define NODE_TYPE_MASK	0x1UL
98#define NODE_PARENT(node) \
99	((struct tnode *)rcu_dereference(((node)->parent & ~NODE_TYPE_MASK)))
100
101#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
102
103#define NODE_SET_PARENT(node, ptr)		\
104	rcu_assign_pointer((node)->parent,	\
105			   ((unsigned long)(ptr)) | NODE_TYPE(node))
106
107#define IS_TNODE(n) (!(n->parent & T_LEAF))
108#define IS_LEAF(n) (n->parent & T_LEAF)
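/*
 * The low bit of every node's 'parent' word encodes the node type (T_LEAF
 * or T_TNODE).  NODE_PARENT() masks the tag off before dereferencing, and
 * NODE_SET_PARENT() preserves it when the parent pointer changes, which is
 * what lets IS_TNODE()/IS_LEAF() work on a plain node pointer.
 */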
109
110struct node {
111	t_key key;
112	unsigned long parent;
113};
114
115struct leaf {
116	t_key key;
117	unsigned long parent;
118	struct hlist_head list;
119	struct rcu_head rcu;
120};
121
122struct leaf_info {
123	struct hlist_node hlist;
124	struct rcu_head rcu;
125	int plen;
126	struct list_head falh;
127};
128
129struct tnode {
130	t_key key;
131	unsigned long parent;
132	unsigned short pos:5;		/* 2log(KEYLENGTH) bits needed */
133	unsigned short bits:5;		/* 2log(KEYLENGTH) bits needed */
134	unsigned short full_children;	/* KEYLENGTH bits needed */
135	unsigned short empty_children;	/* KEYLENGTH bits needed */
136	struct rcu_head rcu;
137	struct node *child[0];
138};
139
140#ifdef CONFIG_IP_FIB_TRIE_STATS
141struct trie_use_stats {
142	unsigned int gets;
143	unsigned int backtrack;
144	unsigned int semantic_match_passed;
145	unsigned int semantic_match_miss;
146	unsigned int null_node_hit;
147	unsigned int resize_node_skipped;
148};
149#endif
150
151struct trie_stat {
152	unsigned int totdepth;
153	unsigned int maxdepth;
154	unsigned int tnodes;
155	unsigned int leaves;
156	unsigned int nullpointers;
157	unsigned int nodesizes[MAX_STAT_DEPTH];
158};
159
160struct trie {
161	struct node *trie;
162#ifdef CONFIG_IP_FIB_TRIE_STATS
163	struct trie_use_stats stats;
164#endif
165	int size;
166	unsigned int revision;
167};
168
169static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
170static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
171static struct node *resize(struct trie *t, struct tnode *tn);
172static struct tnode *inflate(struct trie *t, struct tnode *tn);
173static struct tnode *halve(struct trie *t, struct tnode *tn);
174static void tnode_free(struct tnode *tn);
175
176static kmem_cache_t *fn_alias_kmem __read_mostly;
177static struct trie *trie_local = NULL, *trie_main = NULL;
178
179
180/* rcu_read_lock needs to be held by the caller on the read side */
181
182static inline struct node *tnode_get_child(struct tnode *tn, int i)
183{
184	BUG_ON(i >= 1 << tn->bits);
185
186	return rcu_dereference(tn->child[i]);
187}
188
189static inline int tnode_child_length(const struct tnode *tn)
190{
191	return 1 << tn->bits;
192}
193
194static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
195{
196	if (offset < KEYLENGTH)
197		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
198	else
199		return 0;
200}
201
202static inline int tkey_equals(t_key a, t_key b)
203{
204	return a == b;
205}
206
207static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
208{
209	if (bits == 0 || offset >= KEYLENGTH)
210		return 1;
211	bits = bits > KEYLENGTH ? KEYLENGTH : bits;
212	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
213}
214
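/*
 * Return the position, counted from the most significant bit, of the first
 * bit at or after 'offset' where a and b differ, or 0 if the keys are equal.
 * Callers are expected to ensure that a and b agree on all bits before
 * 'offset'.
 */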
215static inline int tkey_mismatch(t_key a, int offset, t_key b)
216{
217	t_key diff = a ^ b;
218	int i = offset;
219
220	if (!diff)
221		return 0;
222	while ((diff << i) >> (KEYLENGTH-1) == 0)
223		i++;
224	return i;
225}
226
227/*
228  To understand this stuff, an understanding of keys and all their bits is
229  necessary. Every node in the trie has a key associated with it, but not
230  all of the bits in that key are significant.
231
232  Consider a node 'n' and its parent 'tp'.
233
234  If n is a leaf, every bit in its key is significant. Its presence is
235  necessitated by path compression, since during a tree traversal (when
236  searching for a leaf - unless we are doing an insertion) we will completely
237  ignore all skipped bits we encounter. Thus we need to verify, at the end of
238  a potentially successful search, that we have indeed been walking the
239  correct key path.
240
241  Note that we can never "miss" the correct key in the tree if present by
242  following the wrong path. Path compression ensures that segments of the key
243  that are the same for all keys with a given prefix are skipped, but the
244  skipped part *is* identical for each node in the subtrie below the skipped
245  bit! fib_insert_node() in this implementation takes care of that - note the
246  call to tkey_sub_equals() in fib_insert_node().
247
248  if n is an internal node - a 'tnode' here, the various parts of its key
249  have many different meanings.
250
251  Example:
252  _________________________________________________________________
253  | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
254  -----------------------------------------------------------------
255    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
256
257  _________________________________________________________________
258  | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
259  -----------------------------------------------------------------
260   16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
261
262  tp->pos = 7
263  tp->bits = 3
264  n->pos = 15
265  n->bits = 4
266
267  First, let's just ignore the bits that come before the parent tp, that is
268  the bits from 0 to (tp->pos-1). They are *known* but at this point we do
269  not use them for anything.
270
271  The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
272  index into the parent's child array. That is, they will be used to find
273  'n' among tp's children.
274
275  The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
276  for the node n.
277
278  All the bits we have seen so far are significant to the node n. The rest
279  of the bits are really not needed or indeed known in n->key.
280
281  The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
282  n's child array, and will of course be different for each child.
283
284
285  The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
286  at this point.
287
288*/
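/*
  A worked example of the figure above (illustration only): n sits in tp's
  child array at index tkey_extract_bits(n->key, 7, 3) - the three "N" bits -
  and a lookup continues from n at index tkey_extract_bits(key, 15, 4) - the
  four "C" bits.  The skipped "S" bits 10..14 are not examined while
  descending; they are verified separately, e.g. by
  tkey_sub_equals(n->key, 10, 5, key) before indexing into n's child array
  (see fib_find_node()).
*/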
289
290static inline void check_tnode(const struct tnode *tn)
291{
292	WARN_ON(tn && tn->pos+tn->bits > 32);
293}
294
295static int halve_threshold = 25;
296static int inflate_threshold = 50;
297static int halve_threshold_root = 15;
298static int inflate_threshold_root = 25;
299
300
301static void __alias_free_mem(struct rcu_head *head)
302{
303	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
304	kmem_cache_free(fn_alias_kmem, fa);
305}
306
307static inline void alias_free_mem_rcu(struct fib_alias *fa)
308{
309	call_rcu(&fa->rcu, __alias_free_mem);
310}
311
312static void __leaf_free_rcu(struct rcu_head *head)
313{
314	kfree(container_of(head, struct leaf, rcu));
315}
316
317static void __leaf_info_free_rcu(struct rcu_head *head)
318{
319	kfree(container_of(head, struct leaf_info, rcu));
320}
321
322static inline void free_leaf_info(struct leaf_info *leaf)
323{
324	call_rcu(&leaf->rcu, __leaf_info_free_rcu);
325}
326
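/*
 * A tnode's child array holds 2^bits pointers and can grow well past a
 * page, so small tnodes come from the slab while larger ones fall back to
 * the page allocator; __tnode_free_rcu() below frees them the same way.
 */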
327static struct tnode *tnode_alloc(unsigned int size)
328{
329	struct page *pages;
330
331	if (size <= PAGE_SIZE)
332		return kcalloc(size, 1, GFP_KERNEL);
333
334	pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size));
335	if (!pages)
336		return NULL;
337
338	return page_address(pages);
339}
340
341static void __tnode_free_rcu(struct rcu_head *head)
342{
343	struct tnode *tn = container_of(head, struct tnode, rcu);
344	unsigned int size = sizeof(struct tnode) +
345		(1 << tn->bits) * sizeof(struct node *);
346
347	if (size <= PAGE_SIZE)
348		kfree(tn);
349	else
350		free_pages((unsigned long)tn, get_order(size));
351}
352
353static inline void tnode_free(struct tnode *tn)
354{
355	if (IS_LEAF(tn)) {
356		struct leaf *l = (struct leaf *) tn;
357		call_rcu_bh(&l->rcu, __leaf_free_rcu);
358	}
359	else
360		call_rcu(&tn->rcu, __tnode_free_rcu);
361}
362
363static struct leaf *leaf_new(void)
364{
365	struct leaf *l = kmalloc(sizeof(struct leaf),  GFP_KERNEL);
366	if (l) {
367		l->parent = T_LEAF;
368		INIT_HLIST_HEAD(&l->list);
369	}
370	return l;
371}
372
373static struct leaf_info *leaf_info_new(int plen)
374{
375	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
376	if (li) {
377		li->plen = plen;
378		INIT_LIST_HEAD(&li->falh);
379	}
380	return li;
381}
382
383static struct tnode* tnode_new(t_key key, int pos, int bits)
384{
385	int nchildren = 1<<bits;
386	int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
387	struct tnode *tn = tnode_alloc(sz);
388
389	if (tn) {
390		memset(tn, 0, sz);
391		tn->parent = T_TNODE;
392		tn->pos = pos;
393		tn->bits = bits;
394		tn->key = key;
395		tn->full_children = 0;
396		tn->empty_children = 1<<bits;
397	}
398
399	pr_debug("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
400		 (unsigned int) (sizeof(struct node) * 1<<bits));
401	return tn;
402}
403
404/*
405 * Check whether a tnode 'n' is "full", i.e. it is an internal node
406 * and no bits are skipped. See discussion in dyntree paper p. 6
407 */
408
409static inline int tnode_full(const struct tnode *tn, const struct node *n)
410{
411	if (n == NULL || IS_LEAF(n))
412		return 0;
413
414	return ((struct tnode *) n)->pos == tn->pos + tn->bits;
415}
416
417static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
418{
419	tnode_put_child_reorg(tn, i, n, -1);
420}
421
422 /*
423  * Add a child at position i overwriting the old value.
424  * Update the value of full_children and empty_children.
425  */
426
427static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
428{
429	struct node *chi = tn->child[i];
430	int isfull;
431
432	BUG_ON(i >= 1<<tn->bits);
433
434
435	/* update emptyChildren */
436	if (n == NULL && chi != NULL)
437		tn->empty_children++;
438	else if (n != NULL && chi == NULL)
439		tn->empty_children--;
440
441	/* update fullChildren */
442	if (wasfull == -1)
443		wasfull = tnode_full(tn, chi);
444
445	isfull = tnode_full(tn, n);
446	if (wasfull && !isfull)
447		tn->full_children--;
448	else if (!wasfull && isfull)
449		tn->full_children++;
450
451	if (n)
452		NODE_SET_PARENT(n, tn);
453
454	rcu_assign_pointer(tn->child[i], n);
455}
456
457static struct node *resize(struct trie *t, struct tnode *tn)
458{
459	int i;
460	int err = 0;
461	struct tnode *old_tn;
462	int inflate_threshold_use;
463	int halve_threshold_use;
464
465 	if (!tn)
466		return NULL;
467
468	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
469		 tn, inflate_threshold, halve_threshold);
470
471	/* No children */
472	if (tn->empty_children == tnode_child_length(tn)) {
473		tnode_free(tn);
474		return NULL;
475	}
476	/* One child */
477	if (tn->empty_children == tnode_child_length(tn) - 1)
478		for (i = 0; i < tnode_child_length(tn); i++) {
479			struct node *n;
480
481			n = tn->child[i];
482			if (!n)
483				continue;
484
485			/* compress one level */
486			NODE_SET_PARENT(n, NULL);
487			tnode_free(tn);
488			return n;
489		}
490	/*
491	 * Double as long as the resulting node has a number of
492	 * nonempty nodes that are above the threshold.
493	 */
494
495	/*
496	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
497	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
498	 * Telecommunications, page 6:
499	 * "A node is doubled if the ratio of non-empty children to all
500	 * children in the *doubled* node is at least 'high'."
501	 *
502	 * 'high' in this instance is the variable 'inflate_threshold'. It
503	 * is expressed as a percentage, so we multiply it with
504	 * tnode_child_length() and instead of multiplying by 2 (since the
505	 * child array will be doubled by inflate()) and multiplying
506	 * the left-hand side by 100 (to handle the percentage thing) we
507	 * multiply the left-hand side by 50.
508	 *
509	 * The left-hand side may look a bit weird: tnode_child_length(tn)
510	 * - tn->empty_children is of course the number of non-null children
511	 * in the current node. tn->full_children is the number of "full"
512	 * children, that is non-null tnodes with a skip value of 0.
513	 * All of those will be doubled in the resulting inflated tnode, so
514	 * we just count them one extra time here.
515	 *
516	 * A clearer way to write this would be:
517	 *
518	 * to_be_doubled = tn->full_children;
519	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
520	 *     tn->full_children;
521	 *
522	 * new_child_length = tnode_child_length(tn) * 2;
523	 *
524	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
525	 *      new_child_length;
526	 * if (new_fill_factor >= inflate_threshold)
527	 *
528	 * ...and so on, though it would mess up the while () loop.
529	 *
530	 * anyway,
531	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
532	 *      inflate_threshold
533	 *
534	 * avoid a division:
535	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
536	 *      inflate_threshold * new_child_length
537	 *
538	 * expand not_to_be_doubled and to_be_doubled, and shorten:
539	 * 100 * (tnode_child_length(tn) - tn->empty_children +
540	 *    tn->full_children) >= inflate_threshold * new_child_length
541	 *
542	 * expand new_child_length:
543	 * 100 * (tnode_child_length(tn) - tn->empty_children +
544	 *    tn->full_children) >=
545	 *      inflate_threshold * tnode_child_length(tn) * 2
546	 *
547	 * shorten again:
548	 * 50 * (tn->full_children + tnode_child_length(tn) -
549	 *    tn->empty_children) >= inflate_threshold *
550	 *    tnode_child_length(tn)
551	 *
552	 */
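	/*
	 * Worked example (illustration only): a tnode with 16 child slots,
	 * 6 of them empty and 4 of them full, gives
	 *   50 * (4 + 16 - 6) = 700  versus  inflate_threshold * 16.
	 * With the default inflate_threshold of 50 the right-hand side is
	 * 800, so this node is not doubled - the doubled node would only be
	 * (6 + 2*4) / 32, i.e. about 44%, filled.
	 */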
553
554	check_tnode(tn);
555
556	/* Keep root node larger  */
557
558	if (!tn->parent)
559		inflate_threshold_use = inflate_threshold_root;
560	else
561		inflate_threshold_use = inflate_threshold;
562
563	err = 0;
564	while ((tn->full_children > 0 &&
565	       50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
566				inflate_threshold_use * tnode_child_length(tn))) {
567
568		old_tn = tn;
569		tn = inflate(t, tn);
570		if (IS_ERR(tn)) {
571			tn = old_tn;
572#ifdef CONFIG_IP_FIB_TRIE_STATS
573			t->stats.resize_node_skipped++;
574#endif
575			break;
576		}
577	}
578
579	check_tnode(tn);
580
581	/*
582	 * Halve as long as the number of empty children in this
583	 * node is above threshold.
584	 */
585
586
587	/* Keep root node larger  */
588
589	if (!tn->parent)
590		halve_threshold_use = halve_threshold_root;
591	else
592		halve_threshold_use = halve_threshold;
593
594	err = 0;
595	while (tn->bits > 1 &&
596	       100 * (tnode_child_length(tn) - tn->empty_children) <
597	       halve_threshold_use * tnode_child_length(tn)) {
598
599		old_tn = tn;
600		tn = halve(t, tn);
601		if (IS_ERR(tn)) {
602			tn = old_tn;
603#ifdef CONFIG_IP_FIB_TRIE_STATS
604			t->stats.resize_node_skipped++;
605#endif
606			break;
607		}
608	}
609
610
611	/* Only one child remains */
612	if (tn->empty_children == tnode_child_length(tn) - 1)
613		for (i = 0; i < tnode_child_length(tn); i++) {
614			struct node *n;
615
616			n = tn->child[i];
617			if (!n)
618				continue;
619
620			/* compress one level */
621
622			NODE_SET_PARENT(n, NULL);
623			tnode_free(tn);
624			return n;
625		}
626
627	return (struct node *) tn;
628}
629
630static struct tnode *inflate(struct trie *t, struct tnode *tn)
631{
632	struct tnode *inode;
633	struct tnode *oldtnode = tn;
634	int olen = tnode_child_length(tn);
635	int i;
636
637	pr_debug("In inflate\n");
638
639	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
640
641	if (!tn)
642		return ERR_PTR(-ENOMEM);
643
644	/*
645	 * Preallocate and store tnodes before the actual work so we
646	 * don't get into an inconsistent state if memory allocation
647	 * fails. In case of failure we return the oldnode and the inflate
648	 * of the tnode is ignored.
649	 */
650
651	for (i = 0; i < olen; i++) {
652		struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
653
654		if (inode &&
655		    IS_TNODE(inode) &&
656		    inode->pos == oldtnode->pos + oldtnode->bits &&
657		    inode->bits > 1) {
658			struct tnode *left, *right;
659			t_key m = TKEY_GET_MASK(inode->pos, 1);
660
661			left = tnode_new(inode->key&(~m), inode->pos + 1,
662					 inode->bits - 1);
663			if (!left)
664				goto nomem;
665
666			right = tnode_new(inode->key|m, inode->pos + 1,
667					  inode->bits - 1);
668
669			if (!right) {
670				tnode_free(left);
671				goto nomem;
672			}
673
674			put_child(t, tn, 2*i, (struct node *) left);
675			put_child(t, tn, 2*i+1, (struct node *) right);
676		}
677	}
678
679	for (i = 0; i < olen; i++) {
680		struct node *node = tnode_get_child(oldtnode, i);
681		struct tnode *left, *right;
682		int size, j;
683
684		/* An empty child */
685		if (node == NULL)
686			continue;
687
688		/* A leaf or an internal node with skipped bits */
689
690		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
691		   tn->pos + tn->bits - 1) {
692			if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
693					     1) == 0)
694				put_child(t, tn, 2*i, node);
695			else
696				put_child(t, tn, 2*i+1, node);
697			continue;
698		}
699
700		/* An internal node with two children */
701		inode = (struct tnode *) node;
702
703		if (inode->bits == 1) {
704			put_child(t, tn, 2*i, inode->child[0]);
705			put_child(t, tn, 2*i+1, inode->child[1]);
706
707			tnode_free(inode);
708			continue;
709		}
710
711		/* An internal node with more than two children */
712
713		/* We will replace this node 'inode' with two new
714		 * ones, 'left' and 'right', each with half of the
715		 * original children. The two new nodes will have
716		 * a position one bit further down the key and this
717		 * means that the "significant" part of their keys
718		 * (see the discussion near the top of this file)
719		 * will differ by one bit, which will be "0" in
720		 * left's key and "1" in right's key. Since we are
721		 * moving the key position by one step, the bit that
722		 * we are moving away from - the bit at position
723		 * (inode->pos) - is the one that will differ between
724		 * left and right. So... we synthesize that bit in the
725		 * two  new keys.
726		 * The mask 'm' below will be a single "one" bit at
727		 * the position (inode->pos)
728		 */
729
730		/* Use the old key, but set the new significant
731		 *   bit to zero.
732		 */
733
734		left = (struct tnode *) tnode_get_child(tn, 2*i);
735		put_child(t, tn, 2*i, NULL);
736
737		BUG_ON(!left);
738
739		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
740		put_child(t, tn, 2*i+1, NULL);
741
742		BUG_ON(!right);
743
744		size = tnode_child_length(left);
745		for (j = 0; j < size; j++) {
746			put_child(t, left, j, inode->child[j]);
747			put_child(t, right, j, inode->child[j + size]);
748		}
749		put_child(t, tn, 2*i, resize(t, left));
750		put_child(t, tn, 2*i+1, resize(t, right));
751
752		tnode_free(inode);
753	}
754	tnode_free(oldtnode);
755	return tn;
756nomem:
757	{
758		int size = tnode_child_length(tn);
759		int j;
760
761		for (j = 0; j < size; j++)
762			if (tn->child[j])
763				tnode_free((struct tnode *)tn->child[j]);
764
765		tnode_free(tn);
766
767		return ERR_PTR(-ENOMEM);
768	}
769}
770
771static struct tnode *halve(struct trie *t, struct tnode *tn)
772{
773	struct tnode *oldtnode = tn;
774	struct node *left, *right;
775	int i;
776	int olen = tnode_child_length(tn);
777
778	pr_debug("In halve\n");
779
780	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
781
782	if (!tn)
783		return ERR_PTR(-ENOMEM);
784
785	/*
786	 * Preallocate and store tnodes before the actual work so we
787	 * don't get into an inconsistent state if memory allocation
788	 * fails. In case of failure we return the oldnode and the halve
789	 * of the tnode is ignored.
790	 */
791
792	for (i = 0; i < olen; i += 2) {
793		left = tnode_get_child(oldtnode, i);
794		right = tnode_get_child(oldtnode, i+1);
795
796		/* Two nonempty children */
797		if (left && right) {
798			struct tnode *newn;
799
800			newn = tnode_new(left->key, tn->pos + tn->bits, 1);
801
802			if (!newn)
803				goto nomem;
804
805			put_child(t, tn, i/2, (struct node *)newn);
806		}
807
808	}
809
810	for (i = 0; i < olen; i += 2) {
811		struct tnode *newBinNode;
812
813		left = tnode_get_child(oldtnode, i);
814		right = tnode_get_child(oldtnode, i+1);
815
816		/* At least one of the children is empty */
817		if (left == NULL) {
818			if (right == NULL)    /* Both are empty */
819				continue;
820			put_child(t, tn, i/2, right);
821			continue;
822		}
823
824		if (right == NULL) {
825			put_child(t, tn, i/2, left);
826			continue;
827		}
828
829		/* Two nonempty children */
830		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
831		put_child(t, tn, i/2, NULL);
832		put_child(t, newBinNode, 0, left);
833		put_child(t, newBinNode, 1, right);
834		put_child(t, tn, i/2, resize(t, newBinNode));
835	}
836	tnode_free(oldtnode);
837	return tn;
838nomem:
839	{
840		int size = tnode_child_length(tn);
841		int j;
842
843		for (j = 0; j < size; j++)
844			if (tn->child[j])
845				tnode_free((struct tnode *)tn->child[j]);
846
847		tnode_free(tn);
848
849		return ERR_PTR(-ENOMEM);
850	}
851}
852
853static void trie_init(struct trie *t)
854{
855	if (!t)
856		return;
857
858	t->size = 0;
859	rcu_assign_pointer(t->trie, NULL);
860	t->revision = 0;
861#ifdef CONFIG_IP_FIB_TRIE_STATS
862	memset(&t->stats, 0, sizeof(struct trie_use_stats));
863#endif
864}
865
866/* readside must use rcu_read_lock; currently the dump routines do this
867 * via get_fa_head and dump */
868
869static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
870{
871	struct hlist_head *head = &l->list;
872	struct hlist_node *node;
873	struct leaf_info *li;
874
875	hlist_for_each_entry_rcu(li, node, head, hlist)
876		if (li->plen == plen)
877			return li;
878
879	return NULL;
880}
881
882static inline struct list_head * get_fa_head(struct leaf *l, int plen)
883{
884	struct leaf_info *li = find_leaf_info(l, plen);
885
886	if (!li)
887		return NULL;
888
889	return &li->falh;
890}
891
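/*
 * Insert the new leaf_info so that the leaf's hlist stays ordered by
 * non-increasing prefix length.
 */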
892static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
893{
894	struct leaf_info *li = NULL, *last = NULL;
895	struct hlist_node *node;
896
897	if (hlist_empty(head)) {
898		hlist_add_head_rcu(&new->hlist, head);
899	} else {
900		hlist_for_each_entry(li, node, head, hlist) {
901			if (new->plen > li->plen)
902				break;
903
904			last = li;
905		}
906		if (last)
907			hlist_add_after_rcu(&last->hlist, &new->hlist);
908		else
909			hlist_add_before_rcu(&new->hlist, &li->hlist);
910	}
911}
912
913/* rcu_read_lock needs to be held by the caller on the read side */
914
915static struct leaf *
916fib_find_node(struct trie *t, u32 key)
917{
918	int pos;
919	struct tnode *tn;
920	struct node *n;
921
922	pos = 0;
923	n = rcu_dereference(t->trie);
924
925	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
926		tn = (struct tnode *) n;
927
928		check_tnode(tn);
929
930		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
931			pos = tn->pos + tn->bits;
932			n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
933		} else
934			break;
935	}
936	/* Case we have found a leaf. Compare prefixes */
937
938	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
939		return (struct leaf *)n;
940
941	return NULL;
942}
943
944static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
945{
946	int wasfull;
947	t_key cindex, key;
948	struct tnode *tp = NULL;
949
950	key = tn->key;
951
952	while (tn != NULL && NODE_PARENT(tn) != NULL) {
953
954		tp = NODE_PARENT(tn);
955		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
956		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
957		tn = (struct tnode *) resize(t, (struct tnode *)tn);
958		tnode_put_child_reorg((struct tnode *)tp, cindex, (struct node *)tn, wasfull);
959
960		if (!NODE_PARENT(tn))
961			break;
962
963		tn = NODE_PARENT(tn);
964	}
965	/* Handle last (top) tnode */
966	if (IS_TNODE(tn))
967		tn = (struct tnode*) resize(t, (struct tnode *)tn);
968
969	return (struct node*) tn;
970}
971
972/* only used from updater-side */
973
974static  struct list_head *
975fib_insert_node(struct trie *t, int *err, u32 key, int plen)
976{
977	int pos, newpos;
978	struct tnode *tp = NULL, *tn = NULL;
979	struct node *n;
980	struct leaf *l;
981	int missbit;
982	struct list_head *fa_head = NULL;
983	struct leaf_info *li;
984	t_key cindex;
985
986	pos = 0;
987	n = t->trie;
988
989	/* If we point to NULL, stop. Either the tree is empty and we should
990	 * just put a new leaf in it, or we have reached an empty child slot,
991	 * and we should just put our new leaf in that.
992	 * If we point to a T_TNODE, check if it matches our key. Note that
993	 * a T_TNODE might be skipping any number of bits - its 'pos' need
994	 * not be the parent's 'pos'+'bits'!
995	 *
996	 * If it does match the current key, get pos/bits from it, extract
997	 * the index from our key, push the T_TNODE and walk the tree.
998	 *
999	 * If it doesn't, we have to replace it with a new T_TNODE.
1000	 *
1001	 * If we point to a T_LEAF, it might or might not have the same key
1002	 * as we do. If it does, just change the value, update the T_LEAF's
1003	 * value, and return it.
1004	 * If it doesn't, we need to replace it with a T_TNODE.
1005	 */
1006
1007	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
1008		tn = (struct tnode *) n;
1009
1010		check_tnode(tn);
1011
1012		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1013			tp = tn;
1014			pos = tn->pos + tn->bits;
1015			n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
1016
1017			BUG_ON(n && NODE_PARENT(n) != tn);
1018		} else
1019			break;
1020	}
1021
1022	/*
1023	 * n  ----> NULL, LEAF or TNODE
1024	 *
1025	 * tp is n's (parent) ----> NULL or TNODE
1026	 */
1027
1028	BUG_ON(tp && IS_LEAF(tp));
1029
1030	/* Case 1: n is a leaf. Compare prefixes */
1031
1032	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1033		struct leaf *l = (struct leaf *) n;
1034
1035		li = leaf_info_new(plen);
1036
1037		if (!li) {
1038			*err = -ENOMEM;
1039			goto err;
1040		}
1041
1042		fa_head = &li->falh;
1043		insert_leaf_info(&l->list, li);
1044		goto done;
1045	}
1046	t->size++;
1047	l = leaf_new();
1048
1049	if (!l) {
1050		*err = -ENOMEM;
1051		goto err;
1052	}
1053
1054	l->key = key;
1055	li = leaf_info_new(plen);
1056
1057	if (!li) {
1058		tnode_free((struct tnode *) l);
1059		*err = -ENOMEM;
1060		goto err;
1061	}
1062
1063	fa_head = &li->falh;
1064	insert_leaf_info(&l->list, li);
1065
1066	if (t->trie && n == NULL) {
1067		/* Case 2: n is NULL, and we will just insert a new leaf */
1068
1069		NODE_SET_PARENT(l, tp);
1070
1071		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1072		put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1073	} else {
1074		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1075		/*
1076		 *  Add a new tnode here
1077		 *  the first tnode needs some special handling
1078		 */
1079
1080		if (tp)
1081			pos = tp->pos+tp->bits;
1082		else
1083			pos = 0;
1084
1085		if (n) {
1086			newpos = tkey_mismatch(key, pos, n->key);
1087			tn = tnode_new(n->key, newpos, 1);
1088		} else {
1089			newpos = 0;
1090			tn = tnode_new(key, newpos, 1); /* First tnode */
1091		}
1092
1093		if (!tn) {
1094			free_leaf_info(li);
1095			tnode_free((struct tnode *) l);
1096			*err = -ENOMEM;
1097			goto err;
1098		}
1099
1100		NODE_SET_PARENT(tn, tp);
1101
1102		missbit = tkey_extract_bits(key, newpos, 1);
1103		put_child(t, tn, missbit, (struct node *)l);
1104		put_child(t, tn, 1-missbit, n);
1105
1106		if (tp) {
1107			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1108			put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
1109		} else {
1110			rcu_assign_pointer(t->trie, (struct node *)tn); /* First tnode */
1111			tp = tn;
1112		}
1113	}
1114
1115	if (tp && tp->pos + tp->bits > 32)
1116		printk(KERN_WARNING "fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1117		       tp, tp->pos, tp->bits, key, plen);
1118
1119	/* Rebalance the trie */
1120
1121	rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1122done:
1123	t->revision++;
1124err:
1125	return fa_head;
1126}
1127
1128static int
1129fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1130	       struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1131{
1132	struct trie *t = (struct trie *) tb->tb_data;
1133	struct fib_alias *fa, *new_fa;
1134	struct list_head *fa_head = NULL;
1135	struct fib_info *fi;
1136	int plen = r->rtm_dst_len;
1137	int type = r->rtm_type;
1138	u8 tos = r->rtm_tos;
1139	u32 key, mask;
1140	int err;
1141	struct leaf *l;
1142
1143	if (plen > 32)
1144		return -EINVAL;
1145
1146	key = 0;
1147	if (rta->rta_dst)
1148		memcpy(&key, rta->rta_dst, 4);
1149
1150	key = ntohl(key);
1151
1152	pr_debug("Insert table=%d %08x/%d\n", tb->tb_id, key, plen);
1153
1154	mask = ntohl(inet_make_mask(plen));
1155
1156	if (key & ~mask)
1157		return -EINVAL;
1158
1159	key = key & mask;
1160
1161	fi = fib_create_info(r, rta, nlhdr, &err);
1162
1163	if (!fi)
1164		goto err;
1165
1166	l = fib_find_node(t, key);
1167	fa = NULL;
1168
1169	if (l) {
1170		fa_head = get_fa_head(l, plen);
1171		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1172	}
1173
1174	/* Now fa, if non-NULL, points to the first fib alias
1175	 * with the same keys [prefix,tos,priority], if such key already
1176	 * exists or to the node before which we will insert new one.
1177	 *
1178	 * If fa is NULL, we will need to allocate a new one and
1179	 * insert to the head of f.
1180	 *
1181	 * If f is NULL, no fib node matched the destination key
1182	 * and we need to allocate a new one of those as well.
1183	 */
1184
1185	if (fa && fa->fa_info->fib_priority == fi->fib_priority) {
1186		struct fib_alias *fa_orig;
1187
1188		err = -EEXIST;
1189		if (nlhdr->nlmsg_flags & NLM_F_EXCL)
1190			goto out;
1191
1192		if (nlhdr->nlmsg_flags & NLM_F_REPLACE) {
1193			struct fib_info *fi_drop;
1194			u8 state;
1195
1196			err = -ENOBUFS;
1197			new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
1198			if (new_fa == NULL)
1199				goto out;
1200
1201			fi_drop = fa->fa_info;
1202			new_fa->fa_tos = fa->fa_tos;
1203			new_fa->fa_info = fi;
1204			new_fa->fa_type = type;
1205			new_fa->fa_scope = r->rtm_scope;
1206			state = fa->fa_state;
1207			new_fa->fa_state = state & ~FA_S_ACCESSED;
1208
1209			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1210			alias_free_mem_rcu(fa);
1211
1212			fib_release_info(fi_drop);
1213			if (state & FA_S_ACCESSED)
1214				rt_cache_flush(-1);
1215
1216			goto succeeded;
1217		}
1218		/* Error if we find a perfect match which
1219		 * uses the same scope, type, and nexthop
1220		 * information.
1221		 */
1222		fa_orig = fa;
1223		list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
1224			if (fa->fa_tos != tos)
1225				break;
1226			if (fa->fa_info->fib_priority != fi->fib_priority)
1227				break;
1228			if (fa->fa_type == type &&
1229			    fa->fa_scope == r->rtm_scope &&
1230			    fa->fa_info == fi) {
1231				goto out;
1232			}
1233		}
1234		if (!(nlhdr->nlmsg_flags & NLM_F_APPEND))
1235			fa = fa_orig;
1236	}
1237	err = -ENOENT;
1238	if (!(nlhdr->nlmsg_flags & NLM_F_CREATE))
1239		goto out;
1240
1241	err = -ENOBUFS;
1242	new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
1243	if (new_fa == NULL)
1244		goto out;
1245
1246	new_fa->fa_info = fi;
1247	new_fa->fa_tos = tos;
1248	new_fa->fa_type = type;
1249	new_fa->fa_scope = r->rtm_scope;
1250	new_fa->fa_state = 0;
1251	/*
1252	 * Insert new entry to the list.
1253	 */
1254
1255	if (!fa_head) {
1256		err = 0;
1257		fa_head = fib_insert_node(t, &err, key, plen);
1258		if (err)
1259			goto out_free_new_fa;
1260	}
1261
1262	list_add_tail_rcu(&new_fa->fa_list,
1263			  (fa ? &fa->fa_list : fa_head));
1264
1265	rt_cache_flush(-1);
1266	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req);
1267succeeded:
1268	return 0;
1269
1270out_free_new_fa:
1271	kmem_cache_free(fn_alias_kmem, new_fa);
1272out:
1273	fib_release_info(fi);
1274err:
1275	return err;
1276}
1277
1278
1279/* should be called with rcu_read_lock */
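/* Returns the fib_semantic_match() result (<= 0) when some prefix stored in
 * this leaf matches the key, or 1 if no prefix in the leaf matches. */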
1280static inline int check_leaf(struct trie *t, struct leaf *l,
1281			     t_key key, int *plen, const struct flowi *flp,
1282			     struct fib_result *res)
1283{
1284	int err, i;
1285	t_key mask;
1286	struct leaf_info *li;
1287	struct hlist_head *hhead = &l->list;
1288	struct hlist_node *node;
1289
1290	hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1291		i = li->plen;
1292		mask = ntohl(inet_make_mask(i));
1293		if (l->key != (key & mask))
1294			continue;
1295
1296		if ((err = fib_semantic_match(&li->falh, flp, res, l->key, mask, i)) <= 0) {
1297			*plen = i;
1298#ifdef CONFIG_IP_FIB_TRIE_STATS
1299			t->stats.semantic_match_passed++;
1300#endif
1301			return err;
1302		}
1303#ifdef CONFIG_IP_FIB_TRIE_STATS
1304		t->stats.semantic_match_miss++;
1305#endif
1306	}
1307	return 1;
1308}
1309
1310static int
1311fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1312{
1313	struct trie *t = (struct trie *) tb->tb_data;
1314	int plen, ret = 0;
1315	struct node *n;
1316	struct tnode *pn;
1317	int pos, bits;
1318	t_key key = ntohl(flp->fl4_dst);
1319	int chopped_off;
1320	t_key cindex = 0;
1321	int current_prefix_length = KEYLENGTH;
1322	struct tnode *cn;
1323	t_key node_prefix, key_prefix, pref_mismatch;
1324	int mp;
1325
1326	rcu_read_lock();
1327
1328	n = rcu_dereference(t->trie);
1329	if (!n)
1330		goto failed;
1331
1332#ifdef CONFIG_IP_FIB_TRIE_STATS
1333	t->stats.gets++;
1334#endif
1335
1336	/* Just a leaf? */
1337	if (IS_LEAF(n)) {
1338		if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1339			goto found;
1340		goto failed;
1341	}
1342	pn = (struct tnode *) n;
1343	chopped_off = 0;
1344
1345	while (pn) {
1346		pos = pn->pos;
1347		bits = pn->bits;
1348
1349		if (!chopped_off)
1350			cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits);
1351
1352		n = tnode_get_child(pn, cindex);
1353
1354		if (n == NULL) {
1355#ifdef CONFIG_IP_FIB_TRIE_STATS
1356			t->stats.null_node_hit++;
1357#endif
1358			goto backtrace;
1359		}
1360
1361		if (IS_LEAF(n)) {
1362			if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1363				goto found;
1364			else
1365				goto backtrace;
1366		}
1367
1368#define HL_OPTIMIZE
1369#ifdef HL_OPTIMIZE
1370		cn = (struct tnode *)n;
1371
1372		/*
1373		 * It's a tnode, and we can do some extra checks here if we
1374		 * like, to avoid descending into a dead-end branch.
1375		 * This tnode is in the parent's child array at index
1376		 * key[p_pos..p_pos+p_bits] but potentially with some bits
1377		 * chopped off, so in reality the index may be just a
1378		 * subprefix, padded with zero at the end.
1379		 * We can also take a look at any skipped bits in this
1380		 * tnode - everything up to p_pos is supposed to be ok,
1381		 * and the non-chopped bits of the index (see previous
1382		 * paragraph) are also guaranteed ok, but the rest is
1383		 * considered unknown.
1384		 *
1385		 * The skipped bits are key[pos+bits..cn->pos].
1386		 */
1387
1388		/* If current_prefix_length < pos+bits, we are already doing
1389		 * actual prefix  matching, which means everything from
1390		 * pos+(bits-chopped_off) onward must be zero along some
1391		 * branch of this subtree - otherwise there is *no* valid
1392		 * prefix present. Here we can only check the skipped
1393		 * bits. Remember, since we have already indexed into the
1394		 * parent's child array, we know that the bits we chopped off
1395		 * *are* zero.
1396		 */
1397
1398		/* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
1399
1400		if (current_prefix_length < pos+bits) {
1401			if (tkey_extract_bits(cn->key, current_prefix_length,
1402						cn->pos - current_prefix_length) != 0 ||
1403			    !(cn->child[0]))
1404				goto backtrace;
1405		}
1406
1407		/*
1408		 * If chopped_off=0, the index is fully validated and we
1409		 * only need to look at the skipped bits for this, the new,
1410		 * tnode. What we actually want to do is to find out if
1411		 * these skipped bits match our key perfectly, or if we will
1412		 * have to count on finding a matching prefix further down,
1413		 * because if we do, we would like to have some way of
1414		 * verifying the existence of such a prefix at this point.
1415		 */
1416
1417		/* The only thing we can do at this point is to verify that
1418		 * any such matching prefix can indeed be a prefix to our
1419		 * key, and if the bits in the node we are inspecting that
1420		 * do not match our key are not ZERO, this cannot be true.
1421		 * Thus, find out where there is a mismatch (before cn->pos)
1422		 * and verify that all the mismatching bits are zero in the
1423		 * new tnode's key.
1424		 */
1425
1426		/* Note: We aren't very concerned about the piece of the key
1427		 * that precedes pn->pos+pn->bits, since those bits have already been
1428		 * checked. The bits after cn->pos aren't checked since these are
1429		 * by definition "unknown" at this point. Thus, what we want to
1430		 * see is if we are about to enter the "prefix matching" state,
1431		 * and in that case verify that the skipped bits that will prevail
1432		 * throughout this subtree are zero, as they have to be if we are
1433		 * to find a matching prefix.
1434		 */
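		/* Example (illustration only): if cn->pos == 10 and the key
		 * first differs from cn->key at bit 4, mp becomes 4, and we
		 * can only continue if bits 4..9 of cn->key are all zero;
		 * otherwise no prefix of our key can exist below cn and we
		 * backtrack.
		 */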
1435
1436		node_prefix = MASK_PFX(cn->key, cn->pos);
1437		key_prefix = MASK_PFX(key, cn->pos);
1438		pref_mismatch = key_prefix^node_prefix;
1439		mp = 0;
1440
1441		/* In short: If skipped bits in this node do not match the search
1442		 * key, enter the "prefix matching" state directly.
1443		 */
1444		if (pref_mismatch) {
1445			while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1446				mp++;
1447				pref_mismatch = pref_mismatch <<1;
1448			}
1449			key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1450
1451			if (key_prefix != 0)
1452				goto backtrace;
1453
1454			if (current_prefix_length >= cn->pos)
1455				current_prefix_length = mp;
1456		}
1457#endif
1458		pn = (struct tnode *)n; /* Descend */
1459		chopped_off = 0;
1460		continue;
1461
1462backtrace:
1463		chopped_off++;
1464
1465		/* Chopping off bits that are already zero does not change the child key (cindex), so skip past them */
1466		while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1))))
1467			chopped_off++;
1468
1469		/* Decrease current_... with bits chopped off */
1470		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1471			current_prefix_length = pn->pos + pn->bits - chopped_off;
1472
1473		/*
1474		 * Either we do the actual chop-off here, or, if we have
1475		 * chopped off all bits in this tnode, walk up to our parent.
1476		 */
1477
1478		if (chopped_off <= pn->bits) {
1479			cindex &= ~(1 << (chopped_off-1));
1480		} else {
1481			if (NODE_PARENT(pn) == NULL)
1482				goto failed;
1483
1484			/* Get Child's index */
1485			cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits);
1486			pn = NODE_PARENT(pn);
1487			chopped_off = 0;
1488
1489#ifdef CONFIG_IP_FIB_TRIE_STATS
1490			t->stats.backtrack++;
1491#endif
1492			goto backtrace;
1493		}
1494	}
1495failed:
1496	ret = 1;
1497found:
1498	rcu_read_unlock();
1499	return ret;
1500}
1501
1502/* only called from updater side */
1503static int trie_leaf_remove(struct trie *t, t_key key)
1504{
1505	t_key cindex;
1506	struct tnode *tp = NULL;
1507	struct node *n = t->trie;
1508	struct leaf *l;
1509
1510	pr_debug("entering trie_leaf_remove(%p)\n", n);
1511
1512	/* Note that in the case of skipped bits, those bits are *not* checked!
1513	 * When we finish this, we will have NULL or a T_LEAF, and the
1514	 * T_LEAF may or may not match our key.
1515	 */
1516
1517	while (n != NULL && IS_TNODE(n)) {
1518		struct tnode *tn = (struct tnode *) n;
1519		check_tnode(tn);
1520		n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
1521
1522		BUG_ON(n && NODE_PARENT(n) != tn);
1523	}
1524	l = (struct leaf *) n;
1525
1526	if (!n || !tkey_equals(l->key, key))
1527		return 0;
1528
1529	/*
1530	 * Key found.
1531	 * Remove the leaf and rebalance the tree
1532	 */
1533
1534	t->revision++;
1535	t->size--;
1536
1537	preempt_disable();
1538	tp = NODE_PARENT(n);
1539	tnode_free((struct tnode *) n);
1540
1541	if (tp) {
1542		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1543		put_child(t, (struct tnode *)tp, cindex, NULL);
1544		rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1545	} else
1546		rcu_assign_pointer(t->trie, NULL);
1547	preempt_enable();
1548
1549	return 1;
1550}
1551
1552static int
1553fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1554		struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1555{
1556	struct trie *t = (struct trie *) tb->tb_data;
1557	u32 key, mask;
1558	int plen = r->rtm_dst_len;
1559	u8 tos = r->rtm_tos;
1560	struct fib_alias *fa, *fa_to_delete;
1561	struct list_head *fa_head;
1562	struct leaf *l;
1563	struct leaf_info *li;
1564
1565
1566	if (plen > 32)
1567		return -EINVAL;
1568
1569	key = 0;
1570	if (rta->rta_dst)
1571		memcpy(&key, rta->rta_dst, 4);
1572
1573	key = ntohl(key);
1574	mask = ntohl(inet_make_mask(plen));
1575
1576	if (key & ~mask)
1577		return -EINVAL;
1578
1579	key = key & mask;
1580	l = fib_find_node(t, key);
1581
1582	if (!l)
1583		return -ESRCH;
1584
1585	fa_head = get_fa_head(l, plen);
1586	fa = fib_find_alias(fa_head, tos, 0);
1587
1588	if (!fa)
1589		return -ESRCH;
1590
1591	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1592
1593	fa_to_delete = NULL;
1594	fa_head = fa->fa_list.prev;
1595
1596	list_for_each_entry(fa, fa_head, fa_list) {
1597		struct fib_info *fi = fa->fa_info;
1598
1599		if (fa->fa_tos != tos)
1600			break;
1601
1602		if ((!r->rtm_type ||
1603		     fa->fa_type == r->rtm_type) &&
1604		    (r->rtm_scope == RT_SCOPE_NOWHERE ||
1605		     fa->fa_scope == r->rtm_scope) &&
1606		    (!r->rtm_protocol ||
1607		     fi->fib_protocol == r->rtm_protocol) &&
1608		    fib_nh_match(r, nlhdr, rta, fi) == 0) {
1609			fa_to_delete = fa;
1610			break;
1611		}
1612	}
1613
1614	if (!fa_to_delete)
1615		return -ESRCH;
1616
1617	fa = fa_to_delete;
1618	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req);
1619
1620	l = fib_find_node(t, key);
1621	li = find_leaf_info(l, plen);
1622
1623	list_del_rcu(&fa->fa_list);
1624
1625	if (list_empty(fa_head)) {
1626		hlist_del_rcu(&li->hlist);
1627		free_leaf_info(li);
1628	}
1629
1630	if (hlist_empty(&l->list))
1631		trie_leaf_remove(t, key);
1632
1633	if (fa->fa_state & FA_S_ACCESSED)
1634		rt_cache_flush(-1);
1635
1636	fib_release_info(fa->fa_info);
1637	alias_free_mem_rcu(fa);
1638	return 0;
1639}
1640
1641static int trie_flush_list(struct trie *t, struct list_head *head)
1642{
1643	struct fib_alias *fa, *fa_node;
1644	int found = 0;
1645
1646	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1647		struct fib_info *fi = fa->fa_info;
1648
1649		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1650			list_del_rcu(&fa->fa_list);
1651			fib_release_info(fa->fa_info);
1652			alias_free_mem_rcu(fa);
1653			found++;
1654		}
1655	}
1656	return found;
1657}
1658
1659static int trie_flush_leaf(struct trie *t, struct leaf *l)
1660{
1661	int found = 0;
1662	struct hlist_head *lih = &l->list;
1663	struct hlist_node *node, *tmp;
1664	struct leaf_info *li = NULL;
1665
1666	hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1667		found += trie_flush_list(t, &li->falh);
1668
1669		if (list_empty(&li->falh)) {
1670			hlist_del_rcu(&li->hlist);
1671			free_leaf_info(li);
1672		}
1673	}
1674	return found;
1675}
1676
1677/* rcu_read_lock needs to be held by the caller on the read side */
1678
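/*
 * Walk the trie in order and return the leaf following 'thisleaf', or the
 * first leaf when 'thisleaf' is NULL; a NULL return means the walk is done.
 */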
1679static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1680{
1681	struct node *c = (struct node *) thisleaf;
1682	struct tnode *p;
1683	int idx;
1684	struct node *trie = rcu_dereference(t->trie);
1685
1686	if (c == NULL) {
1687		if (trie == NULL)
1688			return NULL;
1689
1690		if (IS_LEAF(trie))          /* trie w. just a leaf */
1691			return (struct leaf *) trie;
1692
1693		p = (struct tnode*) trie;  /* Start */
1694	} else
1695		p = (struct tnode *) NODE_PARENT(c);
1696
1697	while (p) {
1698		int pos, last;
1699
1700		/*  Find the next child of the parent */
1701		if (c)
1702			pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
1703		else
1704			pos = 0;
1705
1706		last = 1 << p->bits;
1707		for (idx = pos; idx < last ; idx++) {
1708			c = rcu_dereference(p->child[idx]);
1709
1710			if (!c)
1711				continue;
1712
1713			/* Descend if tnode */
1714			while (IS_TNODE(c)) {
1715				p = (struct tnode *) c;
1716  				idx = 0;
1717
1718				/* Rightmost non-NULL branch */
1719				if (p && IS_TNODE(p))
1720					while (idx < (1<<p->bits) &&
1721					       !(c = rcu_dereference(p->child[idx]))) idx++;
1722
1723				/* Done with this tnode? */
1724				if (idx >= (1 << p->bits) || !c)
1725					goto up;
1726			}
1727			return (struct leaf *) c;
1728		}
1729up:
1730		/* No more children, go up one step */
1731		c = (struct node *) p;
1732		p = (struct tnode *) NODE_PARENT(p);
1733	}
1734	return NULL; /* Ready. Root of trie */
1735}
1736
1737static int fn_trie_flush(struct fib_table *tb)
1738{
1739	struct trie *t = (struct trie *) tb->tb_data;
1740	struct leaf *ll = NULL, *l = NULL;
1741	int found = 0, h;
1742
1743	t->revision++;
1744
1745	for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1746		found += trie_flush_leaf(t, l);
1747
1748		if (ll && hlist_empty(&ll->list))
1749			trie_leaf_remove(t, ll->key);
1750		ll = l;
1751	}
1752
1753	if (ll && hlist_empty(&ll->list))
1754		trie_leaf_remove(t, ll->key);
1755
1756	pr_debug("trie_flush found=%d\n", found);
1757	return found;
1758}
1759
1760static int trie_last_dflt = -1;
1761
1762static void
1763fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1764{
1765	struct trie *t = (struct trie *) tb->tb_data;
1766	int order, last_idx;
1767	struct fib_info *fi = NULL;
1768	struct fib_info *last_resort;
1769	struct fib_alias *fa = NULL;
1770	struct list_head *fa_head;
1771	struct leaf *l;
1772
1773	last_idx = -1;
1774	last_resort = NULL;
1775	order = -1;
1776
1777	rcu_read_lock();
1778
1779	l = fib_find_node(t, 0);
1780	if (!l)
1781		goto out;
1782
1783	fa_head = get_fa_head(l, 0);
1784	if (!fa_head)
1785		goto out;
1786
1787	if (list_empty(fa_head))
1788		goto out;
1789
1790	list_for_each_entry_rcu(fa, fa_head, fa_list) {
1791		struct fib_info *next_fi = fa->fa_info;
1792
1793		if (fa->fa_scope != res->scope ||
1794		    fa->fa_type != RTN_UNICAST)
1795			continue;
1796
1797		if (next_fi->fib_priority > res->fi->fib_priority)
1798			break;
1799		if (!next_fi->fib_nh[0].nh_gw ||
1800		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1801			continue;
1802		fa->fa_state |= FA_S_ACCESSED;
1803
1804		if (fi == NULL) {
1805			if (next_fi != res->fi)
1806				break;
1807		} else if (!fib_detect_death(fi, order, &last_resort,
1808					     &last_idx, &trie_last_dflt)) {
1809			if (res->fi)
1810				fib_info_put(res->fi);
1811			res->fi = fi;
1812			atomic_inc(&fi->fib_clntref);
1813			trie_last_dflt = order;
1814			goto out;
1815		}
1816		fi = next_fi;
1817		order++;
1818	}
1819	if (order <= 0 || fi == NULL) {
1820		trie_last_dflt = -1;
1821		goto out;
1822	}
1823
1824	if (!fib_detect_death(fi, order, &last_resort, &last_idx, &trie_last_dflt)) {
1825		if (res->fi)
1826			fib_info_put(res->fi);
1827		res->fi = fi;
1828		atomic_inc(&fi->fib_clntref);
1829		trie_last_dflt = order;
1830		goto out;
1831	}
1832	if (last_idx >= 0) {
1833		if (res->fi)
1834			fib_info_put(res->fi);
1835		res->fi = last_resort;
1836		if (last_resort)
1837			atomic_inc(&last_resort->fib_clntref);
1838	}
1839	trie_last_dflt = last_idx;
1840 out:;
1841	rcu_read_unlock();
1842}
1843
1844static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
1845			   struct sk_buff *skb, struct netlink_callback *cb)
1846{
1847	int i, s_i;
1848	struct fib_alias *fa;
1849
1850	u32 xkey = htonl(key);
1851
1852	s_i = cb->args[3];
1853	i = 0;
1854
1855	/* rcu_read_lock is held by the caller */
1856
1857	list_for_each_entry_rcu(fa, fah, fa_list) {
1858		if (i < s_i) {
1859			i++;
1860			continue;
1861		}
1862		BUG_ON(!fa->fa_info);
1863
1864		if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1865				  cb->nlh->nlmsg_seq,
1866				  RTM_NEWROUTE,
1867				  tb->tb_id,
1868				  fa->fa_type,
1869				  fa->fa_scope,
1870				  &xkey,
1871				  plen,
1872				  fa->fa_tos,
1873				  fa->fa_info, 0) < 0) {
1874			cb->args[3] = i;
1875			return -1;
1876		}
1877		i++;
1878	}
1879	cb->args[3] = i;
1880	return skb->len;
1881}
1882
1883static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
1884			     struct netlink_callback *cb)
1885{
1886	int h, s_h;
1887	struct list_head *fa_head;
1888	struct leaf *l = NULL;
1889
1890	s_h = cb->args[2];
1891
1892	for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1893		if (h < s_h)
1894			continue;
1895		if (h > s_h)
1896			memset(&cb->args[3], 0,
1897			       sizeof(cb->args) - 3*sizeof(cb->args[0]));
1898
1899		fa_head = get_fa_head(l, plen);
1900
1901		if (!fa_head)
1902			continue;
1903
1904		if (list_empty(fa_head))
1905			continue;
1906
1907		if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
1908			cb->args[2] = h;
1909			return -1;
1910		}
1911	}
1912	cb->args[2] = h;
1913	return skb->len;
1914}
1915
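/*
 * Netlink dump state lives in cb->args: args[1] is the prefix-length index
 * (m, so plen == 32 - m), args[2] the leaf index within that prefix length,
 * and args[3] the alias index within that leaf, letting an interrupted dump
 * resume where it stopped.
 */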
1916static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
1917{
1918	int m, s_m;
1919	struct trie *t = (struct trie *) tb->tb_data;
1920
1921	s_m = cb->args[1];
1922
1923	rcu_read_lock();
1924	for (m = 0; m <= 32; m++) {
1925		if (m < s_m)
1926			continue;
1927		if (m > s_m)
1928			memset(&cb->args[2], 0,
1929				sizeof(cb->args) - 2*sizeof(cb->args[0]));
1930
1931		if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
1932			cb->args[1] = m;
1933			goto out;
1934		}
1935	}
1936	rcu_read_unlock();
1937	cb->args[1] = m;
1938	return skb->len;
1939out:
1940	rcu_read_unlock();
1941	return -1;
1942}
1943
1944/* Fix more generic FIB names for init later */
1945
1946#ifdef CONFIG_IP_MULTIPLE_TABLES
1947struct fib_table * fib_hash_init(int id)
1948#else
1949struct fib_table * __init fib_hash_init(int id)
1950#endif
1951{
1952	struct fib_table *tb;
1953	struct trie *t;
1954
1955	if (fn_alias_kmem == NULL)
1956		fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1957						  sizeof(struct fib_alias),
1958						  0, SLAB_HWCACHE_ALIGN,
1959						  NULL, NULL);
1960
1961	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1962		     GFP_KERNEL);
1963	if (tb == NULL)
1964		return NULL;
1965
1966	tb->tb_id = id;
1967	tb->tb_lookup = fn_trie_lookup;
1968	tb->tb_insert = fn_trie_insert;
1969	tb->tb_delete = fn_trie_delete;
1970	tb->tb_flush = fn_trie_flush;
1971	tb->tb_select_default = fn_trie_select_default;
1972	tb->tb_dump = fn_trie_dump;
1973	memset(tb->tb_data, 0, sizeof(struct trie));
1974
1975	t = (struct trie *) tb->tb_data;
1976
1977	trie_init(t);
1978
1979	if (id == RT_TABLE_LOCAL)
1980		trie_local = t;
1981	else if (id == RT_TABLE_MAIN)
1982		trie_main = t;
1983
1984	if (id == RT_TABLE_LOCAL)
1985		printk(KERN_INFO "IPv4 FIB: Using LC-trie version %s\n", VERSION);
1986
1987	return tb;
1988}
1989
1990#ifdef CONFIG_PROC_FS
1991/* Depth first Trie walk iterator */
1992struct fib_trie_iter {
1993	struct tnode *tnode;
1994	struct trie *trie;
1995	unsigned index;
1996	unsigned depth;
1997};
1998
1999static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
2000{
2001	struct tnode *tn = iter->tnode;
2002	unsigned cindex = iter->index;
2003	struct tnode *p;
2004
2005	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2006		 iter->tnode, iter->index, iter->depth);
2007rescan:
2008	while (cindex < (1<<tn->bits)) {
2009		struct node *n = tnode_get_child(tn, cindex);
2010
2011		if (n) {
2012			if (IS_LEAF(n)) {
2013				iter->tnode = tn;
2014				iter->index = cindex + 1;
2015			} else {
2016				/* push down one level */
2017				iter->tnode = (struct tnode *) n;
2018				iter->index = 0;
2019				++iter->depth;
2020			}
2021			return n;
2022		}
2023
2024		++cindex;
2025	}
2026
2027	/* Current node exhausted, pop back up */
2028	p = NODE_PARENT(tn);
2029	if (p) {
2030		cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2031		tn = p;
2032		--iter->depth;
2033		goto rescan;
2034	}
2035
2036	/* got root? */
2037	return NULL;
2038}
2039
2040static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2041				       struct trie *t)
2042{
2043	struct node *n;
2044
2045	if (!t)
2046		return NULL;
2047
2048	n = rcu_dereference(t->trie);
2049
2050	if (!iter)
2051		return NULL;
2052
2053	if (n && IS_TNODE(n)) {
2054		iter->tnode = (struct tnode *) n;
2055		iter->trie = t;
2056		iter->index = 0;
2057		iter->depth = 1;
2058		return n;
2059	}
2060	return NULL;
2061}
2062
2063static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2064{
2065	struct node *n;
2066	struct fib_trie_iter iter;
2067
2068	memset(s, 0, sizeof(*s));
2069
2070	rcu_read_lock();
2071	for (n = fib_trie_get_first(&iter, t); n;
2072	     n = fib_trie_get_next(&iter)) {
2073		if (IS_LEAF(n)) {
2074			s->leaves++;
2075			s->totdepth += iter.depth;
2076			if (iter.depth > s->maxdepth)
2077				s->maxdepth = iter.depth;
2078		} else {
2079			const struct tnode *tn = (const struct tnode *) n;
2080			int i;
2081
2082			s->tnodes++;
2083			if (tn->bits < MAX_STAT_DEPTH)
2084				s->nodesizes[tn->bits]++;
2085
2086			for (i = 0; i < (1<<tn->bits); i++)
2087				if (!tn->child[i])
2088					s->nullpointers++;
2089		}
2090	}
2091	rcu_read_unlock();
2092}
2093
2094/*
2095 *	This outputs /proc/net/fib_triestats
2096 */
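/*
 * For each of the local and the main trie it prints the average leaf
 * depth (two decimals), the maximum depth, the leaf and internal node
 * counts, a histogram of tnode sizes ("bits: count"), the total number
 * of child pointers and how many of them are NULL, and a rough memory
 * footprint in kB.
 */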
2097static void trie_show_stats(struct seq_file *seq, struct trie *t, struct trie_stat *stat)
2098{
2099	unsigned i, max, pointers, bytes, avdepth;
2100
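	/* the average depth is kept scaled by 100 so two decimal places
	 * can be printed below without using floating point */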
2101	if (stat->leaves)
2102		avdepth = stat->totdepth*100 / stat->leaves;
2103	else
2104		avdepth = 0;
2105
2106	seq_printf(seq, "\tAver depth:     %d.%02d\n", avdepth / 100, avdepth % 100);
2107	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
2108
2109	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
2110
2111	bytes = sizeof(struct leaf) * stat->leaves;
2112	seq_printf(seq, "\tInternal nodes: %d\n\t", stat->tnodes);
2113	bytes += sizeof(struct tnode) * stat->tnodes;
2114
2115	max = MAX_STAT_DEPTH;
2116	while (max > 0 && stat->nodesizes[max-1] == 0)
2117		max--;
2118
2119	pointers = 0;
2120	for (i = 1; i < max; i++)
2121		if (stat->nodesizes[i] != 0) {
2122			seq_printf(seq, "  %d: %d",  i, stat->nodesizes[i]);
2123			pointers += (1<<i) * stat->nodesizes[i];
2124		}
2125	seq_putc(seq, '\n');
2126	seq_printf(seq, "\tPointers: %d\n", pointers);
2127
2128	bytes += sizeof(struct node *) * pointers;
2129	seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers);
2130	seq_printf(seq, "Total size: %d  kB\n", (bytes + 1023) / 1024);
2131
2132#ifdef CONFIG_IP_FIB_TRIE_STATS
2133	seq_printf(seq, "Counters:\n---------\n");
2134	seq_printf(seq, "gets = %d\n", t->stats.gets);
2135	seq_printf(seq, "backtracks = %d\n", t->stats.backtrack);
2136	seq_printf(seq, "semantic match passed = %d\n", t->stats.semantic_match_passed);
2137	seq_printf(seq, "semantic match miss = %d\n", t->stats.semantic_match_miss);
2138	seq_printf(seq, "null node hit= %d\n", t->stats.null_node_hit);
2139	seq_printf(seq, "skipped node resize = %d\n", t->stats.resize_node_skipped);
2140#ifdef CLEAR_STATS
2141	memset(&(t->stats), 0, sizeof(t->stats));
2142#endif
2143#endif /*  CONFIG_IP_FIB_TRIE_STATS */
2144}
2145
2146static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2147{
2148	struct trie_stat *stat;
2149
2150	stat = kmalloc(sizeof(*stat), GFP_KERNEL);
2151	if (!stat)
2152		return -ENOMEM;
2153
2154	seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
2155		   sizeof(struct leaf), sizeof(struct tnode));
2156
2157	if (trie_local) {
2158		seq_printf(seq, "Local:\n");
2159		trie_collect_stats(trie_local, stat);
2160		trie_show_stats(seq, trie_local, stat);
2161	}
2162
2163	if (trie_main) {
2164		seq_printf(seq, "Main:\n");
2165		trie_collect_stats(trie_main, stat);
2166		trie_show_stats(seq, trie_main, stat);
2167	}
2168	kfree(stat);
2169
2170	return 0;
2171}
2172
2173static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2174{
2175	return single_open(file, fib_triestat_seq_show, NULL);
2176}
2177
2178static struct file_operations fib_triestat_fops = {
2179	.owner	= THIS_MODULE,
2180	.open	= fib_triestat_seq_open,
2181	.read	= seq_read,
2182	.llseek	= seq_lseek,
2183	.release = single_release,
2184};
2185
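/*
 * Map a linear seq_file position onto a node: walk the local trie first
 * and then the main trie, counting visited nodes until pos is reached.
 */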
2186static struct node *fib_trie_get_idx(struct fib_trie_iter *iter,
2187				      loff_t pos)
2188{
2189	loff_t idx = 0;
2190	struct node *n;
2191
2192	for (n = fib_trie_get_first(iter, trie_local);
2193	     n; ++idx, n = fib_trie_get_next(iter)) {
2194		if (pos == idx)
2195			return n;
2196	}
2197
2198	for (n = fib_trie_get_first(iter, trie_main);
2199	     n; ++idx, n = fib_trie_get_next(iter)) {
2200		if (pos == idx)
2201			return n;
2202	}
2203	return NULL;
2204}
2205
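/*
 * seq_file glue.  fib_trie_seq_start() takes rcu_read_lock() and it is
 * held until fib_trie_seq_stop() runs, so ->show() may follow node
 * pointers safely.  Position 0 is reported as SEQ_START_TOKEN, which
 * lets the /proc/net/route dumper below emit its header line first.
 */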
2206static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2207{
2208	rcu_read_lock();
2209	if (*pos == 0)
2210		return SEQ_START_TOKEN;
2211	return fib_trie_get_idx(seq->private, *pos - 1);
2212}
2213
2214static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2215{
2216	struct fib_trie_iter *iter = seq->private;
2217	void *l = v;
2218
2219	++*pos;
2220	if (v == SEQ_START_TOKEN)
2221		return fib_trie_get_idx(iter, 0);
2222
2223	v = fib_trie_get_next(iter);
2224	BUG_ON(v == l);
2225	if (v)
2226		return v;
2227
2228	/* continue scan in next trie */
2229	if (iter->trie == trie_local)
2230		return fib_trie_get_first(iter, trie_main);
2231
2232	return NULL;
2233}
2234
2235static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2236{
2237	rcu_read_unlock();
2238}
2239
2240static void seq_indent(struct seq_file *seq, int n)
2241{
2242	while (n-- > 0) seq_puts(seq, "   ");
2243}
2244
2245static inline const char *rtn_scope(enum rt_scope_t s)
2246{
2247	static char buf[32];
2248
2249	switch (s) {
2250	case RT_SCOPE_UNIVERSE: return "universe";
2251	case RT_SCOPE_SITE:	return "site";
2252	case RT_SCOPE_LINK:	return "link";
2253	case RT_SCOPE_HOST:	return "host";
2254	case RT_SCOPE_NOWHERE:	return "nowhere";
2255	default:
2256		snprintf(buf, sizeof(buf), "scope=%d", s);
2257		return buf;
2258	}
2259}
2260
2261static const char *rtn_type_names[__RTN_MAX] = {
2262	[RTN_UNSPEC] = "UNSPEC",
2263	[RTN_UNICAST] = "UNICAST",
2264	[RTN_LOCAL] = "LOCAL",
2265	[RTN_BROADCAST] = "BROADCAST",
2266	[RTN_ANYCAST] = "ANYCAST",
2267	[RTN_MULTICAST] = "MULTICAST",
2268	[RTN_BLACKHOLE] = "BLACKHOLE",
2269	[RTN_UNREACHABLE] = "UNREACHABLE",
2270	[RTN_PROHIBIT] = "PROHIBIT",
2271	[RTN_THROW] = "THROW",
2272	[RTN_NAT] = "NAT",
2273	[RTN_XRESOLVE] = "XRESOLVE",
2274};
2275
2276static inline const char *rtn_type(unsigned t)
2277{
2278	static char buf[32];
2279
2280	if (t < __RTN_MAX && rtn_type_names[t])
2281		return rtn_type_names[t];
2282	snprintf(buf, sizeof(buf), "type %d", t);
2283	return buf;
2284}
2285
2286/* Pretty print the trie */
2287static int fib_trie_seq_show(struct seq_file *seq, void *v)
2288{
2289	const struct fib_trie_iter *iter = seq->private;
2290	struct node *n = v;
2291
2292	if (v == SEQ_START_TOKEN)
2293		return 0;
2294
2295	if (IS_TNODE(n)) {
2296		struct tnode *tn = (struct tnode *) n;
2297		t_key prf = ntohl(MASK_PFX(tn->key, tn->pos));
2298
2299		if (!NODE_PARENT(n)) {
2300			if (iter->trie == trie_local)
2301				seq_puts(seq, "<local>:\n");
2302			else
2303				seq_puts(seq, "<main>:\n");
2304		}
2305		seq_indent(seq, iter->depth-1);
2306		seq_printf(seq, "  +-- %d.%d.%d.%d/%d %d %d %d\n",
2307			   NIPQUAD(prf), tn->pos, tn->bits, tn->full_children,
2308			   tn->empty_children);
2309
2310	} else {
2311		struct leaf *l = (struct leaf *) n;
2312		int i;
2313		u32 val = ntohl(l->key);
2314
2315		seq_indent(seq, iter->depth);
2316		seq_printf(seq, "  |-- %d.%d.%d.%d\n", NIPQUAD(val));
2317		for (i = 32; i >= 0; i--) {
2318			struct leaf_info *li = find_leaf_info(l, i);
2319			if (li) {
2320				struct fib_alias *fa;
2321				list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2322					seq_indent(seq, iter->depth+1);
2323					seq_printf(seq, "  /%d %s %s", i,
2324						   rtn_scope(fa->fa_scope),
2325						   rtn_type(fa->fa_type));
2326					if (fa->fa_tos)
2327						seq_printf(seq, " tos=%d",
2328							   fa->fa_tos);
2329					seq_putc(seq, '\n');
2330				}
2331			}
2332		}
2333	}
2334
2335	return 0;
2336}
2337
2338static struct seq_operations fib_trie_seq_ops = {
2339	.start  = fib_trie_seq_start,
2340	.next   = fib_trie_seq_next,
2341	.stop   = fib_trie_seq_stop,
2342	.show   = fib_trie_seq_show,
2343};
2344
2345static int fib_trie_seq_open(struct inode *inode, struct file *file)
2346{
2347	struct seq_file *seq;
2348	int rc = -ENOMEM;
2349	struct fib_trie_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
2350
2351	if (!s)
2352		goto out;
2353
2354	rc = seq_open(file, &fib_trie_seq_ops);
2355	if (rc)
2356		goto out_kfree;
2357
2358	seq	     = file->private_data;
2359	seq->private = s;
2360	memset(s, 0, sizeof(*s));
2361out:
2362	return rc;
2363out_kfree:
2364	kfree(s);
2365	goto out;
2366}
2367
2368static struct file_operations fib_trie_fops = {
2369	.owner  = THIS_MODULE,
2370	.open   = fib_trie_seq_open,
2371	.read   = seq_read,
2372	.llseek = seq_lseek,
2373	.release = seq_release_private,
2374};
2375
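/*
 * Translate a route into the legacy RTF_* flag word shown in
 * /proc/net/route: reject routes (unreachable/prohibit) get RTF_REJECT,
 * a gateway nexthop adds RTF_GATEWAY, a full /32 mask adds RTF_HOST,
 * and RTF_UP is always set.
 */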
2376static unsigned fib_flag_trans(int type, u32 mask, const struct fib_info *fi)
2377{
2378	static unsigned type2flags[RTN_MAX + 1] = {
2379		[RTN_UNREACHABLE] = RTF_REJECT, [RTN_PROHIBIT] = RTF_REJECT,
2380	};
2381	unsigned flags = type2flags[type];
2382
2383	if (fi && fi->fib_nh->nh_gw)
2384		flags |= RTF_GATEWAY;
2385	if (mask == 0xFFFFFFFF)
2386		flags |= RTF_HOST;
2387	flags |= RTF_UP;
2388	return flags;
2389}
2390
2391/*
2392 *	This outputs /proc/net/route.
2393 *	The format of the file is not supposed to be changed
2394 *	and needs to stay the same as the fib_hash output to avoid
2395 *	breaking legacy utilities.
2396 */
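/*
 * Column order is: interface, destination, gateway, flags, refcnt, use,
 * metric, mask, MTU, window, IRTT.  Addresses and the mask are printed
 * as zero-padded hex in network byte order, refcnt and use are always 0
 * here, metric is fib_priority, MTU is advmss + 40 when advmss is set,
 * and IRTT is rtt >> 3.
 */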
2397static int fib_route_seq_show(struct seq_file *seq, void *v)
2398{
2399	const struct fib_trie_iter *iter = seq->private;
2400	struct leaf *l = v;
2401	int i;
2402	char bf[128];
2403
2404	if (v == SEQ_START_TOKEN) {
2405		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2406			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2407			   "\tWindow\tIRTT");
2408		return 0;
2409	}
2410
2411	if (iter->trie == trie_local)
2412		return 0;
2413	if (IS_TNODE(l))
2414		return 0;
2415
2416	for (i = 32; i >= 0; i--) {
2417		struct leaf_info *li = find_leaf_info(l, i);
2418		struct fib_alias *fa;
2419		u32 mask, prefix;
2420
2421		if (!li)
2422			continue;
2423
2424		mask = inet_make_mask(li->plen);
2425		prefix = htonl(l->key);
2426
2427		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2428			const struct fib_info *fi = fa->fa_info;
2429			unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
2430
2431			if (fa->fa_type == RTN_BROADCAST
2432			    || fa->fa_type == RTN_MULTICAST)
2433				continue;
2434
2435			if (fi)
2436				snprintf(bf, sizeof(bf),
2437					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2438					 fi->fib_dev ? fi->fib_dev->name : "*",
2439					 prefix,
2440					 fi->fib_nh->nh_gw, flags, 0, 0,
2441					 fi->fib_priority,
2442					 mask,
2443					 (fi->fib_advmss ? fi->fib_advmss + 40 : 0),
2444					 fi->fib_window,
2445					 fi->fib_rtt >> 3);
2446			else
2447				snprintf(bf, sizeof(bf),
2448					 "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2449					 prefix, 0, flags, 0, 0, 0,
2450					 mask, 0, 0, 0);
2451
2452			seq_printf(seq, "%-127s\n", bf);
2453		}
2454	}
2455
2456	return 0;
2457}
2458
2459static struct seq_operations fib_route_seq_ops = {
2460	.start  = fib_trie_seq_start,
2461	.next   = fib_trie_seq_next,
2462	.stop   = fib_trie_seq_stop,
2463	.show   = fib_route_seq_show,
2464};
2465
2466static int fib_route_seq_open(struct inode *inode, struct file *file)
2467{
2468	struct seq_file *seq;
2469	int rc = -ENOMEM;
2470	struct fib_trie_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
2471
2472	if (!s)
2473		goto out;
2474
2475	rc = seq_open(file, &fib_route_seq_ops);
2476	if (rc)
2477		goto out_kfree;
2478
2479	seq	     = file->private_data;
2480	seq->private = s;
2481	memset(s, 0, sizeof(*s));
2482out:
2483	return rc;
2484out_kfree:
2485	kfree(s);
2486	goto out;
2487}
2488
2489static struct file_operations fib_route_fops = {
2490	.owner  = THIS_MODULE,
2491	.open   = fib_route_seq_open,
2492	.read   = seq_read,
2493	.llseek = seq_lseek,
2494	.release = seq_release_private,
2495};
2496
2497int __init fib_proc_init(void)
2498{
2499	if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_fops))
2500		goto out1;
2501
2502	if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_fops))
2503		goto out2;
2504
2505	if (!proc_net_fops_create("route", S_IRUGO, &fib_route_fops))
2506		goto out3;
2507
2508	return 0;
2509
2510out3:
2511	proc_net_remove("fib_triestat");
2512out2:
2513	proc_net_remove("fib_trie");
2514out1:
2515	return -ENOMEM;
2516}
2517
2518void __init fib_proc_exit(void)
2519{
2520	proc_net_remove("fib_trie");
2521	proc_net_remove("fib_triestat");
2522	proc_net_remove("route");
2523}
2524
2525#endif /* CONFIG_PROC_FS */
2526