fib_trie.c revision 631dd1a885b6d7e9f6f51b4e5b311c2bb04c323c
1/*
2 *   This program is free software; you can redistribute it and/or
3 *   modify it under the terms of the GNU General Public License
4 *   as published by the Free Software Foundation; either version
5 *   2 of the License, or (at your option) any later version.
6 *
7 *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 *     & Swedish University of Agricultural Sciences.
9 *
10 *   Jens Laas <jens.laas@data.slu.se> Swedish University of
11 *     Agricultural Sciences.
12 *
13 *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
14 *
15 * This work is based on the LPC-trie which is originally descibed in:
16 *
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.csc.kth.se/~snilsson/software/dyntrie2/
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
25 *
26 * Code from fib_hash has been reused which includes the following header:
27 *
28 *
29 * INET		An implementation of the TCP/IP protocol suite for the LINUX
30 *		operating system.  INET is implemented using the  BSD Socket
31 *		interface as the means of communication with the user level.
32 *
33 *		IPv4 FIB: lookup engine and maintenance routines.
34 *
35 *
36 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
37 *
38 *		This program is free software; you can redistribute it and/or
39 *		modify it under the terms of the GNU General Public License
40 *		as published by the Free Software Foundation; either version
41 *		2 of the License, or (at your option) any later version.
42 *
43 * Substantial contributions to this work comes from:
44 *
45 *		David S. Miller, <davem@davemloft.net>
46 *		Stephen Hemminger <shemminger@osdl.org>
47 *		Paul E. McKenney <paulmck@us.ibm.com>
48 *		Patrick McHardy <kaber@trash.net>
49 */
50
51#define VERSION "0.409"
52
53#include <asm/uaccess.h>
54#include <asm/system.h>
55#include <linux/bitops.h>
56#include <linux/types.h>
57#include <linux/kernel.h>
58#include <linux/mm.h>
59#include <linux/string.h>
60#include <linux/socket.h>
61#include <linux/sockios.h>
62#include <linux/errno.h>
63#include <linux/in.h>
64#include <linux/inet.h>
65#include <linux/inetdevice.h>
66#include <linux/netdevice.h>
67#include <linux/if_arp.h>
68#include <linux/proc_fs.h>
69#include <linux/rcupdate.h>
70#include <linux/skbuff.h>
71#include <linux/netlink.h>
72#include <linux/init.h>
73#include <linux/list.h>
74#include <linux/slab.h>
75#include <net/net_namespace.h>
76#include <net/ip.h>
77#include <net/protocol.h>
78#include <net/route.h>
79#include <net/tcp.h>
80#include <net/sock.h>
81#include <net/ip_fib.h>
82#include "fib_lookup.h"
83
84#define MAX_STAT_DEPTH 32
85
86#define KEYLENGTH (8*sizeof(t_key))
87
88typedef unsigned int t_key;
89
90#define T_TNODE 0
91#define T_LEAF  1
92#define NODE_TYPE_MASK	0x1UL
93#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
94
95#define IS_TNODE(n) (!(n->parent & T_LEAF))
96#define IS_LEAF(n) (n->parent & T_LEAF)
97
98struct node {
99	unsigned long parent;
100	t_key key;
101};
102
103struct leaf {
104	unsigned long parent;
105	t_key key;
106	struct hlist_head list;
107	struct rcu_head rcu;
108};
109
110struct leaf_info {
111	struct hlist_node hlist;
112	struct rcu_head rcu;
113	int plen;
114	struct list_head falh;
115};
116
117struct tnode {
118	unsigned long parent;
119	t_key key;
120	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
121	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
122	unsigned int full_children;	/* KEYLENGTH bits needed */
123	unsigned int empty_children;	/* KEYLENGTH bits needed */
124	union {
125		struct rcu_head rcu;
126		struct work_struct work;
127		struct tnode *tnode_free;
128	};
129	struct node *child[0];
130};
131
132#ifdef CONFIG_IP_FIB_TRIE_STATS
133struct trie_use_stats {
134	unsigned int gets;
135	unsigned int backtrack;
136	unsigned int semantic_match_passed;
137	unsigned int semantic_match_miss;
138	unsigned int null_node_hit;
139	unsigned int resize_node_skipped;
140};
141#endif
142
143struct trie_stat {
144	unsigned int totdepth;
145	unsigned int maxdepth;
146	unsigned int tnodes;
147	unsigned int leaves;
148	unsigned int nullpointers;
149	unsigned int prefixes;
150	unsigned int nodesizes[MAX_STAT_DEPTH];
151};
152
153struct trie {
154	struct node *trie;
155#ifdef CONFIG_IP_FIB_TRIE_STATS
156	struct trie_use_stats stats;
157#endif
158};
159
160static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
161static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
162				  int wasfull);
163static struct node *resize(struct trie *t, struct tnode *tn);
164static struct tnode *inflate(struct trie *t, struct tnode *tn);
165static struct tnode *halve(struct trie *t, struct tnode *tn);
166/* tnodes to free after resize(); protected by RTNL */
167static struct tnode *tnode_free_head;
168static size_t tnode_free_size;
169
170/*
171 * synchronize_rcu after call_rcu for that many pages; it should be especially
172 * useful before resizing the root node with PREEMPT_NONE configs; the value was
173 * obtained experimentally, aiming to avoid visible slowdown.
174 */
175static const int sync_pages = 128;
176
177static struct kmem_cache *fn_alias_kmem __read_mostly;
178static struct kmem_cache *trie_leaf_kmem __read_mostly;
179
180static inline struct tnode *node_parent(struct node *node)
181{
182	return (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
183}
184
185static inline struct tnode *node_parent_rcu(struct node *node)
186{
187	struct tnode *ret = node_parent(node);
188
189	return rcu_dereference(ret);
190}
191
192/* Same as rcu_assign_pointer
193 * but that macro() assumes that value is a pointer.
194 */
195static inline void node_set_parent(struct node *node, struct tnode *ptr)
196{
197	smp_wmb();
198	node->parent = (unsigned long)ptr | NODE_TYPE(node);
199}
200
201static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i)
202{
203	BUG_ON(i >= 1U << tn->bits);
204
205	return tn->child[i];
206}
207
208static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
209{
210	struct node *ret = tnode_get_child(tn, i);
211
212	return rcu_dereference_check(ret,
213				     rcu_read_lock_held() ||
214				     lockdep_rtnl_is_held());
215}
216
217static inline int tnode_child_length(const struct tnode *tn)
218{
219	return 1 << tn->bits;
220}
221
222static inline t_key mask_pfx(t_key k, unsigned short l)
223{
224	return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
225}
226
227static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
228{
229	if (offset < KEYLENGTH)
230		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
231	else
232		return 0;
233}
234
235static inline int tkey_equals(t_key a, t_key b)
236{
237	return a == b;
238}
239
240static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
241{
242	if (bits == 0 || offset >= KEYLENGTH)
243		return 1;
244	bits = bits > KEYLENGTH ? KEYLENGTH : bits;
245	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
246}
247
248static inline int tkey_mismatch(t_key a, int offset, t_key b)
249{
250	t_key diff = a ^ b;
251	int i = offset;
252
253	if (!diff)
254		return 0;
255	while ((diff << i) >> (KEYLENGTH-1) == 0)
256		i++;
257	return i;
258}
259
260/*
261  To understand this stuff, an understanding of keys and all their bits is
262  necessary. Every node in the trie has a key associated with it, but not
263  all of the bits in that key are significant.
264
265  Consider a node 'n' and its parent 'tp'.
266
267  If n is a leaf, every bit in its key is significant. Its presence is
268  necessitated by path compression, since during a tree traversal (when
269  searching for a leaf - unless we are doing an insertion) we will completely
270  ignore all skipped bits we encounter. Thus we need to verify, at the end of
271  a potentially successful search, that we have indeed been walking the
272  correct key path.
273
274  Note that we can never "miss" the correct key in the tree if present by
275  following the wrong path. Path compression ensures that segments of the key
276  that are the same for all keys with a given prefix are skipped, but the
277  skipped part *is* identical for each node in the subtrie below the skipped
278  bit! trie_insert() in this implementation takes care of that - note the
279  call to tkey_sub_equals() in trie_insert().
280
281  if n is an internal node - a 'tnode' here, the various parts of its key
282  have many different meanings.
283
284  Example:
285  _________________________________________________________________
286  | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
287  -----------------------------------------------------------------
288    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
289
290  _________________________________________________________________
291  | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
292  -----------------------------------------------------------------
293   16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
294
295  tp->pos = 7
296  tp->bits = 3
297  n->pos = 15
298  n->bits = 4
299
300  First, let's just ignore the bits that come before the parent tp, that is
301  the bits from 0 to (tp->pos-1). They are *known* but at this point we do
302  not use them for anything.
303
304  The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
305  index into the parent's child array. That is, they will be used to find
306  'n' among tp's children.
307
308  The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
309  for the node n.
310
311  All the bits we have seen so far are significant to the node n. The rest
312  of the bits are really not needed or indeed known in n->key.
313
314  The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
315  n's child array, and will of course be different for each child.
316
317
318  The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
319  at this point.
320
321*/
322
323static inline void check_tnode(const struct tnode *tn)
324{
325	WARN_ON(tn && tn->pos+tn->bits > 32);
326}
327
328static const int halve_threshold = 25;
329static const int inflate_threshold = 50;
330static const int halve_threshold_root = 15;
331static const int inflate_threshold_root = 30;
332
333static void __alias_free_mem(struct rcu_head *head)
334{
335	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
336	kmem_cache_free(fn_alias_kmem, fa);
337}
338
339static inline void alias_free_mem_rcu(struct fib_alias *fa)
340{
341	call_rcu(&fa->rcu, __alias_free_mem);
342}
343
344static void __leaf_free_rcu(struct rcu_head *head)
345{
346	struct leaf *l = container_of(head, struct leaf, rcu);
347	kmem_cache_free(trie_leaf_kmem, l);
348}
349
350static inline void free_leaf(struct leaf *l)
351{
352	call_rcu_bh(&l->rcu, __leaf_free_rcu);
353}
354
355static void __leaf_info_free_rcu(struct rcu_head *head)
356{
357	kfree(container_of(head, struct leaf_info, rcu));
358}
359
360static inline void free_leaf_info(struct leaf_info *leaf)
361{
362	call_rcu(&leaf->rcu, __leaf_info_free_rcu);
363}
364
365static struct tnode *tnode_alloc(size_t size)
366{
367	if (size <= PAGE_SIZE)
368		return kzalloc(size, GFP_KERNEL);
369	else
370		return __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
371}
372
373static void __tnode_vfree(struct work_struct *arg)
374{
375	struct tnode *tn = container_of(arg, struct tnode, work);
376	vfree(tn);
377}
378
379static void __tnode_free_rcu(struct rcu_head *head)
380{
381	struct tnode *tn = container_of(head, struct tnode, rcu);
382	size_t size = sizeof(struct tnode) +
383		      (sizeof(struct node *) << tn->bits);
384
385	if (size <= PAGE_SIZE)
386		kfree(tn);
387	else {
388		INIT_WORK(&tn->work, __tnode_vfree);
389		schedule_work(&tn->work);
390	}
391}
392
393static inline void tnode_free(struct tnode *tn)
394{
395	if (IS_LEAF(tn))
396		free_leaf((struct leaf *) tn);
397	else
398		call_rcu(&tn->rcu, __tnode_free_rcu);
399}
400
401static void tnode_free_safe(struct tnode *tn)
402{
403	BUG_ON(IS_LEAF(tn));
404	tn->tnode_free = tnode_free_head;
405	tnode_free_head = tn;
406	tnode_free_size += sizeof(struct tnode) +
407			   (sizeof(struct node *) << tn->bits);
408}
409
410static void tnode_free_flush(void)
411{
412	struct tnode *tn;
413
414	while ((tn = tnode_free_head)) {
415		tnode_free_head = tn->tnode_free;
416		tn->tnode_free = NULL;
417		tnode_free(tn);
418	}
419
420	if (tnode_free_size >= PAGE_SIZE * sync_pages) {
421		tnode_free_size = 0;
422		synchronize_rcu();
423	}
424}
425
426static struct leaf *leaf_new(void)
427{
428	struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
429	if (l) {
430		l->parent = T_LEAF;
431		INIT_HLIST_HEAD(&l->list);
432	}
433	return l;
434}
435
436static struct leaf_info *leaf_info_new(int plen)
437{
438	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
439	if (li) {
440		li->plen = plen;
441		INIT_LIST_HEAD(&li->falh);
442	}
443	return li;
444}
445
446static struct tnode *tnode_new(t_key key, int pos, int bits)
447{
448	size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits);
449	struct tnode *tn = tnode_alloc(sz);
450
451	if (tn) {
452		tn->parent = T_TNODE;
453		tn->pos = pos;
454		tn->bits = bits;
455		tn->key = key;
456		tn->full_children = 0;
457		tn->empty_children = 1<<bits;
458	}
459
460	pr_debug("AT %p s=%u %lu\n", tn, (unsigned int) sizeof(struct tnode),
461		 (unsigned long) (sizeof(struct node) << bits));
462	return tn;
463}
464
465/*
466 * Check whether a tnode 'n' is "full", i.e. it is an internal node
467 * and no bits are skipped. See discussion in dyntree paper p. 6
468 */
469
470static inline int tnode_full(const struct tnode *tn, const struct node *n)
471{
472	if (n == NULL || IS_LEAF(n))
473		return 0;
474
475	return ((struct tnode *) n)->pos == tn->pos + tn->bits;
476}
477
478static inline void put_child(struct trie *t, struct tnode *tn, int i,
479			     struct node *n)
480{
481	tnode_put_child_reorg(tn, i, n, -1);
482}
483
484 /*
485  * Add a child at position i overwriting the old value.
486  * Update the value of full_children and empty_children.
487  */
488
489static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
490				  int wasfull)
491{
492	struct node *chi = tn->child[i];
493	int isfull;
494
495	BUG_ON(i >= 1<<tn->bits);
496
497	/* update emptyChildren */
498	if (n == NULL && chi != NULL)
499		tn->empty_children++;
500	else if (n != NULL && chi == NULL)
501		tn->empty_children--;
502
503	/* update fullChildren */
504	if (wasfull == -1)
505		wasfull = tnode_full(tn, chi);
506
507	isfull = tnode_full(tn, n);
508	if (wasfull && !isfull)
509		tn->full_children--;
510	else if (!wasfull && isfull)
511		tn->full_children++;
512
513	if (n)
514		node_set_parent(n, tn);
515
516	rcu_assign_pointer(tn->child[i], n);
517}
518
519#define MAX_WORK 10
520static struct node *resize(struct trie *t, struct tnode *tn)
521{
522	int i;
523	struct tnode *old_tn;
524	int inflate_threshold_use;
525	int halve_threshold_use;
526	int max_work;
527
528	if (!tn)
529		return NULL;
530
531	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
532		 tn, inflate_threshold, halve_threshold);
533
534	/* No children */
535	if (tn->empty_children == tnode_child_length(tn)) {
536		tnode_free_safe(tn);
537		return NULL;
538	}
539	/* One child */
540	if (tn->empty_children == tnode_child_length(tn) - 1)
541		goto one_child;
542	/*
543	 * Double as long as the resulting node has a number of
544	 * nonempty nodes that are above the threshold.
545	 */
546
547	/*
548	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
549	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
550	 * Telecommunications, page 6:
551	 * "A node is doubled if the ratio of non-empty children to all
552	 * children in the *doubled* node is at least 'high'."
553	 *
554	 * 'high' in this instance is the variable 'inflate_threshold'. It
555	 * is expressed as a percentage, so we multiply it with
556	 * tnode_child_length() and instead of multiplying by 2 (since the
557	 * child array will be doubled by inflate()) and multiplying
558	 * the left-hand side by 100 (to handle the percentage thing) we
559	 * multiply the left-hand side by 50.
560	 *
561	 * The left-hand side may look a bit weird: tnode_child_length(tn)
562	 * - tn->empty_children is of course the number of non-null children
563	 * in the current node. tn->full_children is the number of "full"
564	 * children, that is non-null tnodes with a skip value of 0.
565	 * All of those will be doubled in the resulting inflated tnode, so
566	 * we just count them one extra time here.
567	 *
568	 * A clearer way to write this would be:
569	 *
570	 * to_be_doubled = tn->full_children;
571	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
572	 *     tn->full_children;
573	 *
574	 * new_child_length = tnode_child_length(tn) * 2;
575	 *
576	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
577	 *      new_child_length;
578	 * if (new_fill_factor >= inflate_threshold)
579	 *
580	 * ...and so on, tho it would mess up the while () loop.
581	 *
582	 * anyway,
583	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
584	 *      inflate_threshold
585	 *
586	 * avoid a division:
587	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
588	 *      inflate_threshold * new_child_length
589	 *
590	 * expand not_to_be_doubled and to_be_doubled, and shorten:
591	 * 100 * (tnode_child_length(tn) - tn->empty_children +
592	 *    tn->full_children) >= inflate_threshold * new_child_length
593	 *
594	 * expand new_child_length:
595	 * 100 * (tnode_child_length(tn) - tn->empty_children +
596	 *    tn->full_children) >=
597	 *      inflate_threshold * tnode_child_length(tn) * 2
598	 *
599	 * shorten again:
600	 * 50 * (tn->full_children + tnode_child_length(tn) -
601	 *    tn->empty_children) >= inflate_threshold *
602	 *    tnode_child_length(tn)
603	 *
604	 */
605
606	check_tnode(tn);
607
608	/* Keep root node larger  */
609
610	if (!node_parent((struct node*) tn)) {
611		inflate_threshold_use = inflate_threshold_root;
612		halve_threshold_use = halve_threshold_root;
613	}
614	else {
615		inflate_threshold_use = inflate_threshold;
616		halve_threshold_use = halve_threshold;
617	}
618
619	max_work = MAX_WORK;
620	while ((tn->full_children > 0 &&  max_work-- &&
621		50 * (tn->full_children + tnode_child_length(tn)
622		      - tn->empty_children)
623		>= inflate_threshold_use * tnode_child_length(tn))) {
624
625		old_tn = tn;
626		tn = inflate(t, tn);
627
628		if (IS_ERR(tn)) {
629			tn = old_tn;
630#ifdef CONFIG_IP_FIB_TRIE_STATS
631			t->stats.resize_node_skipped++;
632#endif
633			break;
634		}
635	}
636
637	check_tnode(tn);
638
639	/* Return if at least one inflate is run */
640	if( max_work != MAX_WORK)
641		return (struct node *) tn;
642
643	/*
644	 * Halve as long as the number of empty children in this
645	 * node is above threshold.
646	 */
647
648	max_work = MAX_WORK;
649	while (tn->bits > 1 &&  max_work-- &&
650	       100 * (tnode_child_length(tn) - tn->empty_children) <
651	       halve_threshold_use * tnode_child_length(tn)) {
652
653		old_tn = tn;
654		tn = halve(t, tn);
655		if (IS_ERR(tn)) {
656			tn = old_tn;
657#ifdef CONFIG_IP_FIB_TRIE_STATS
658			t->stats.resize_node_skipped++;
659#endif
660			break;
661		}
662	}
663
664
665	/* Only one child remains */
666	if (tn->empty_children == tnode_child_length(tn) - 1) {
667one_child:
668		for (i = 0; i < tnode_child_length(tn); i++) {
669			struct node *n;
670
671			n = tn->child[i];
672			if (!n)
673				continue;
674
675			/* compress one level */
676
677			node_set_parent(n, NULL);
678			tnode_free_safe(tn);
679			return n;
680		}
681	}
682	return (struct node *) tn;
683}
684
685static struct tnode *inflate(struct trie *t, struct tnode *tn)
686{
687	struct tnode *oldtnode = tn;
688	int olen = tnode_child_length(tn);
689	int i;
690
691	pr_debug("In inflate\n");
692
693	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
694
695	if (!tn)
696		return ERR_PTR(-ENOMEM);
697
698	/*
699	 * Preallocate and store tnodes before the actual work so we
700	 * don't get into an inconsistent state if memory allocation
701	 * fails. In case of failure we return the oldnode and  inflate
702	 * of tnode is ignored.
703	 */
704
705	for (i = 0; i < olen; i++) {
706		struct tnode *inode;
707
708		inode = (struct tnode *) tnode_get_child(oldtnode, i);
709		if (inode &&
710		    IS_TNODE(inode) &&
711		    inode->pos == oldtnode->pos + oldtnode->bits &&
712		    inode->bits > 1) {
713			struct tnode *left, *right;
714			t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
715
716			left = tnode_new(inode->key&(~m), inode->pos + 1,
717					 inode->bits - 1);
718			if (!left)
719				goto nomem;
720
721			right = tnode_new(inode->key|m, inode->pos + 1,
722					  inode->bits - 1);
723
724			if (!right) {
725				tnode_free(left);
726				goto nomem;
727			}
728
729			put_child(t, tn, 2*i, (struct node *) left);
730			put_child(t, tn, 2*i+1, (struct node *) right);
731		}
732	}
733
734	for (i = 0; i < olen; i++) {
735		struct tnode *inode;
736		struct node *node = tnode_get_child(oldtnode, i);
737		struct tnode *left, *right;
738		int size, j;
739
740		/* An empty child */
741		if (node == NULL)
742			continue;
743
744		/* A leaf or an internal node with skipped bits */
745
746		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
747		   tn->pos + tn->bits - 1) {
748			if (tkey_extract_bits(node->key,
749					      oldtnode->pos + oldtnode->bits,
750					      1) == 0)
751				put_child(t, tn, 2*i, node);
752			else
753				put_child(t, tn, 2*i+1, node);
754			continue;
755		}
756
757		/* An internal node with two children */
758		inode = (struct tnode *) node;
759
760		if (inode->bits == 1) {
761			put_child(t, tn, 2*i, inode->child[0]);
762			put_child(t, tn, 2*i+1, inode->child[1]);
763
764			tnode_free_safe(inode);
765			continue;
766		}
767
768		/* An internal node with more than two children */
769
770		/* We will replace this node 'inode' with two new
771		 * ones, 'left' and 'right', each with half of the
772		 * original children. The two new nodes will have
773		 * a position one bit further down the key and this
774		 * means that the "significant" part of their keys
775		 * (see the discussion near the top of this file)
776		 * will differ by one bit, which will be "0" in
777		 * left's key and "1" in right's key. Since we are
778		 * moving the key position by one step, the bit that
779		 * we are moving away from - the bit at position
780		 * (inode->pos) - is the one that will differ between
781		 * left and right. So... we synthesize that bit in the
782		 * two  new keys.
783		 * The mask 'm' below will be a single "one" bit at
784		 * the position (inode->pos)
785		 */
786
787		/* Use the old key, but set the new significant
788		 *   bit to zero.
789		 */
790
791		left = (struct tnode *) tnode_get_child(tn, 2*i);
792		put_child(t, tn, 2*i, NULL);
793
794		BUG_ON(!left);
795
796		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
797		put_child(t, tn, 2*i+1, NULL);
798
799		BUG_ON(!right);
800
801		size = tnode_child_length(left);
802		for (j = 0; j < size; j++) {
803			put_child(t, left, j, inode->child[j]);
804			put_child(t, right, j, inode->child[j + size]);
805		}
806		put_child(t, tn, 2*i, resize(t, left));
807		put_child(t, tn, 2*i+1, resize(t, right));
808
809		tnode_free_safe(inode);
810	}
811	tnode_free_safe(oldtnode);
812	return tn;
813nomem:
814	{
815		int size = tnode_child_length(tn);
816		int j;
817
818		for (j = 0; j < size; j++)
819			if (tn->child[j])
820				tnode_free((struct tnode *)tn->child[j]);
821
822		tnode_free(tn);
823
824		return ERR_PTR(-ENOMEM);
825	}
826}
827
828static struct tnode *halve(struct trie *t, struct tnode *tn)
829{
830	struct tnode *oldtnode = tn;
831	struct node *left, *right;
832	int i;
833	int olen = tnode_child_length(tn);
834
835	pr_debug("In halve\n");
836
837	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
838
839	if (!tn)
840		return ERR_PTR(-ENOMEM);
841
842	/*
843	 * Preallocate and store tnodes before the actual work so we
844	 * don't get into an inconsistent state if memory allocation
845	 * fails. In case of failure we return the oldnode and halve
846	 * of tnode is ignored.
847	 */
848
849	for (i = 0; i < olen; i += 2) {
850		left = tnode_get_child(oldtnode, i);
851		right = tnode_get_child(oldtnode, i+1);
852
853		/* Two nonempty children */
854		if (left && right) {
855			struct tnode *newn;
856
857			newn = tnode_new(left->key, tn->pos + tn->bits, 1);
858
859			if (!newn)
860				goto nomem;
861
862			put_child(t, tn, i/2, (struct node *)newn);
863		}
864
865	}
866
867	for (i = 0; i < olen; i += 2) {
868		struct tnode *newBinNode;
869
870		left = tnode_get_child(oldtnode, i);
871		right = tnode_get_child(oldtnode, i+1);
872
873		/* At least one of the children is empty */
874		if (left == NULL) {
875			if (right == NULL)    /* Both are empty */
876				continue;
877			put_child(t, tn, i/2, right);
878			continue;
879		}
880
881		if (right == NULL) {
882			put_child(t, tn, i/2, left);
883			continue;
884		}
885
886		/* Two nonempty children */
887		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
888		put_child(t, tn, i/2, NULL);
889		put_child(t, newBinNode, 0, left);
890		put_child(t, newBinNode, 1, right);
891		put_child(t, tn, i/2, resize(t, newBinNode));
892	}
893	tnode_free_safe(oldtnode);
894	return tn;
895nomem:
896	{
897		int size = tnode_child_length(tn);
898		int j;
899
900		for (j = 0; j < size; j++)
901			if (tn->child[j])
902				tnode_free((struct tnode *)tn->child[j]);
903
904		tnode_free(tn);
905
906		return ERR_PTR(-ENOMEM);
907	}
908}
909
910/* readside must use rcu_read_lock currently dump routines
911 via get_fa_head and dump */
912
913static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
914{
915	struct hlist_head *head = &l->list;
916	struct hlist_node *node;
917	struct leaf_info *li;
918
919	hlist_for_each_entry_rcu(li, node, head, hlist)
920		if (li->plen == plen)
921			return li;
922
923	return NULL;
924}
925
926static inline struct list_head *get_fa_head(struct leaf *l, int plen)
927{
928	struct leaf_info *li = find_leaf_info(l, plen);
929
930	if (!li)
931		return NULL;
932
933	return &li->falh;
934}
935
936static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
937{
938	struct leaf_info *li = NULL, *last = NULL;
939	struct hlist_node *node;
940
941	if (hlist_empty(head)) {
942		hlist_add_head_rcu(&new->hlist, head);
943	} else {
944		hlist_for_each_entry(li, node, head, hlist) {
945			if (new->plen > li->plen)
946				break;
947
948			last = li;
949		}
950		if (last)
951			hlist_add_after_rcu(&last->hlist, &new->hlist);
952		else
953			hlist_add_before_rcu(&new->hlist, &li->hlist);
954	}
955}
956
957/* rcu_read_lock needs to be hold by caller from readside */
958
959static struct leaf *
960fib_find_node(struct trie *t, u32 key)
961{
962	int pos;
963	struct tnode *tn;
964	struct node *n;
965
966	pos = 0;
967	n = rcu_dereference_check(t->trie,
968				  rcu_read_lock_held() ||
969				  lockdep_rtnl_is_held());
970
971	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
972		tn = (struct tnode *) n;
973
974		check_tnode(tn);
975
976		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
977			pos = tn->pos + tn->bits;
978			n = tnode_get_child_rcu(tn,
979						tkey_extract_bits(key,
980								  tn->pos,
981								  tn->bits));
982		} else
983			break;
984	}
985	/* Case we have found a leaf. Compare prefixes */
986
987	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
988		return (struct leaf *)n;
989
990	return NULL;
991}
992
993static void trie_rebalance(struct trie *t, struct tnode *tn)
994{
995	int wasfull;
996	t_key cindex, key;
997	struct tnode *tp;
998
999	key = tn->key;
1000
1001	while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
1002		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1003		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
1004		tn = (struct tnode *) resize(t, (struct tnode *)tn);
1005
1006		tnode_put_child_reorg((struct tnode *)tp, cindex,
1007				      (struct node *)tn, wasfull);
1008
1009		tp = node_parent((struct node *) tn);
1010		if (!tp)
1011			rcu_assign_pointer(t->trie, (struct node *)tn);
1012
1013		tnode_free_flush();
1014		if (!tp)
1015			break;
1016		tn = tp;
1017	}
1018
1019	/* Handle last (top) tnode */
1020	if (IS_TNODE(tn))
1021		tn = (struct tnode *)resize(t, (struct tnode *)tn);
1022
1023	rcu_assign_pointer(t->trie, (struct node *)tn);
1024	tnode_free_flush();
1025}
1026
1027/* only used from updater-side */
1028
1029static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1030{
1031	int pos, newpos;
1032	struct tnode *tp = NULL, *tn = NULL;
1033	struct node *n;
1034	struct leaf *l;
1035	int missbit;
1036	struct list_head *fa_head = NULL;
1037	struct leaf_info *li;
1038	t_key cindex;
1039
1040	pos = 0;
1041	n = t->trie;
1042
1043	/* If we point to NULL, stop. Either the tree is empty and we should
1044	 * just put a new leaf in if, or we have reached an empty child slot,
1045	 * and we should just put our new leaf in that.
1046	 * If we point to a T_TNODE, check if it matches our key. Note that
1047	 * a T_TNODE might be skipping any number of bits - its 'pos' need
1048	 * not be the parent's 'pos'+'bits'!
1049	 *
1050	 * If it does match the current key, get pos/bits from it, extract
1051	 * the index from our key, push the T_TNODE and walk the tree.
1052	 *
1053	 * If it doesn't, we have to replace it with a new T_TNODE.
1054	 *
1055	 * If we point to a T_LEAF, it might or might not have the same key
1056	 * as we do. If it does, just change the value, update the T_LEAF's
1057	 * value, and return it.
1058	 * If it doesn't, we need to replace it with a T_TNODE.
1059	 */
1060
1061	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
1062		tn = (struct tnode *) n;
1063
1064		check_tnode(tn);
1065
1066		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1067			tp = tn;
1068			pos = tn->pos + tn->bits;
1069			n = tnode_get_child(tn,
1070					    tkey_extract_bits(key,
1071							      tn->pos,
1072							      tn->bits));
1073
1074			BUG_ON(n && node_parent(n) != tn);
1075		} else
1076			break;
1077	}
1078
1079	/*
1080	 * n  ----> NULL, LEAF or TNODE
1081	 *
1082	 * tp is n's (parent) ----> NULL or TNODE
1083	 */
1084
1085	BUG_ON(tp && IS_LEAF(tp));
1086
1087	/* Case 1: n is a leaf. Compare prefixes */
1088
1089	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1090		l = (struct leaf *) n;
1091		li = leaf_info_new(plen);
1092
1093		if (!li)
1094			return NULL;
1095
1096		fa_head = &li->falh;
1097		insert_leaf_info(&l->list, li);
1098		goto done;
1099	}
1100	l = leaf_new();
1101
1102	if (!l)
1103		return NULL;
1104
1105	l->key = key;
1106	li = leaf_info_new(plen);
1107
1108	if (!li) {
1109		free_leaf(l);
1110		return NULL;
1111	}
1112
1113	fa_head = &li->falh;
1114	insert_leaf_info(&l->list, li);
1115
1116	if (t->trie && n == NULL) {
1117		/* Case 2: n is NULL, and will just insert a new leaf */
1118
1119		node_set_parent((struct node *)l, tp);
1120
1121		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1122		put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1123	} else {
1124		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1125		/*
1126		 *  Add a new tnode here
1127		 *  first tnode need some special handling
1128		 */
1129
1130		if (tp)
1131			pos = tp->pos+tp->bits;
1132		else
1133			pos = 0;
1134
1135		if (n) {
1136			newpos = tkey_mismatch(key, pos, n->key);
1137			tn = tnode_new(n->key, newpos, 1);
1138		} else {
1139			newpos = 0;
1140			tn = tnode_new(key, newpos, 1); /* First tnode */
1141		}
1142
1143		if (!tn) {
1144			free_leaf_info(li);
1145			free_leaf(l);
1146			return NULL;
1147		}
1148
1149		node_set_parent((struct node *)tn, tp);
1150
1151		missbit = tkey_extract_bits(key, newpos, 1);
1152		put_child(t, tn, missbit, (struct node *)l);
1153		put_child(t, tn, 1-missbit, n);
1154
1155		if (tp) {
1156			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1157			put_child(t, (struct tnode *)tp, cindex,
1158				  (struct node *)tn);
1159		} else {
1160			rcu_assign_pointer(t->trie, (struct node *)tn);
1161			tp = tn;
1162		}
1163	}
1164
1165	if (tp && tp->pos + tp->bits > 32)
1166		pr_warning("fib_trie"
1167			   " tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1168			   tp, tp->pos, tp->bits, key, plen);
1169
1170	/* Rebalance the trie */
1171
1172	trie_rebalance(t, tp);
1173done:
1174	return fa_head;
1175}
1176
1177/*
1178 * Caller must hold RTNL.
1179 */
1180int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1181{
1182	struct trie *t = (struct trie *) tb->tb_data;
1183	struct fib_alias *fa, *new_fa;
1184	struct list_head *fa_head = NULL;
1185	struct fib_info *fi;
1186	int plen = cfg->fc_dst_len;
1187	u8 tos = cfg->fc_tos;
1188	u32 key, mask;
1189	int err;
1190	struct leaf *l;
1191
1192	if (plen > 32)
1193		return -EINVAL;
1194
1195	key = ntohl(cfg->fc_dst);
1196
1197	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1198
1199	mask = ntohl(inet_make_mask(plen));
1200
1201	if (key & ~mask)
1202		return -EINVAL;
1203
1204	key = key & mask;
1205
1206	fi = fib_create_info(cfg);
1207	if (IS_ERR(fi)) {
1208		err = PTR_ERR(fi);
1209		goto err;
1210	}
1211
1212	l = fib_find_node(t, key);
1213	fa = NULL;
1214
1215	if (l) {
1216		fa_head = get_fa_head(l, plen);
1217		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1218	}
1219
1220	/* Now fa, if non-NULL, points to the first fib alias
1221	 * with the same keys [prefix,tos,priority], if such key already
1222	 * exists or to the node before which we will insert new one.
1223	 *
1224	 * If fa is NULL, we will need to allocate a new one and
1225	 * insert to the head of f.
1226	 *
1227	 * If f is NULL, no fib node matched the destination key
1228	 * and we need to allocate a new one of those as well.
1229	 */
1230
1231	if (fa && fa->fa_tos == tos &&
1232	    fa->fa_info->fib_priority == fi->fib_priority) {
1233		struct fib_alias *fa_first, *fa_match;
1234
1235		err = -EEXIST;
1236		if (cfg->fc_nlflags & NLM_F_EXCL)
1237			goto out;
1238
1239		/* We have 2 goals:
1240		 * 1. Find exact match for type, scope, fib_info to avoid
1241		 * duplicate routes
1242		 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
1243		 */
1244		fa_match = NULL;
1245		fa_first = fa;
1246		fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1247		list_for_each_entry_continue(fa, fa_head, fa_list) {
1248			if (fa->fa_tos != tos)
1249				break;
1250			if (fa->fa_info->fib_priority != fi->fib_priority)
1251				break;
1252			if (fa->fa_type == cfg->fc_type &&
1253			    fa->fa_scope == cfg->fc_scope &&
1254			    fa->fa_info == fi) {
1255				fa_match = fa;
1256				break;
1257			}
1258		}
1259
1260		if (cfg->fc_nlflags & NLM_F_REPLACE) {
1261			struct fib_info *fi_drop;
1262			u8 state;
1263
1264			fa = fa_first;
1265			if (fa_match) {
1266				if (fa == fa_match)
1267					err = 0;
1268				goto out;
1269			}
1270			err = -ENOBUFS;
1271			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1272			if (new_fa == NULL)
1273				goto out;
1274
1275			fi_drop = fa->fa_info;
1276			new_fa->fa_tos = fa->fa_tos;
1277			new_fa->fa_info = fi;
1278			new_fa->fa_type = cfg->fc_type;
1279			new_fa->fa_scope = cfg->fc_scope;
1280			state = fa->fa_state;
1281			new_fa->fa_state = state & ~FA_S_ACCESSED;
1282
1283			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1284			alias_free_mem_rcu(fa);
1285
1286			fib_release_info(fi_drop);
1287			if (state & FA_S_ACCESSED)
1288				rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1289			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1290				tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1291
1292			goto succeeded;
1293		}
1294		/* Error if we find a perfect match which
1295		 * uses the same scope, type, and nexthop
1296		 * information.
1297		 */
1298		if (fa_match)
1299			goto out;
1300
1301		if (!(cfg->fc_nlflags & NLM_F_APPEND))
1302			fa = fa_first;
1303	}
1304	err = -ENOENT;
1305	if (!(cfg->fc_nlflags & NLM_F_CREATE))
1306		goto out;
1307
1308	err = -ENOBUFS;
1309	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1310	if (new_fa == NULL)
1311		goto out;
1312
1313	new_fa->fa_info = fi;
1314	new_fa->fa_tos = tos;
1315	new_fa->fa_type = cfg->fc_type;
1316	new_fa->fa_scope = cfg->fc_scope;
1317	new_fa->fa_state = 0;
1318	/*
1319	 * Insert new entry to the list.
1320	 */
1321
1322	if (!fa_head) {
1323		fa_head = fib_insert_node(t, key, plen);
1324		if (unlikely(!fa_head)) {
1325			err = -ENOMEM;
1326			goto out_free_new_fa;
1327		}
1328	}
1329
1330	list_add_tail_rcu(&new_fa->fa_list,
1331			  (fa ? &fa->fa_list : fa_head));
1332
1333	rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1334	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1335		  &cfg->fc_nlinfo, 0);
1336succeeded:
1337	return 0;
1338
1339out_free_new_fa:
1340	kmem_cache_free(fn_alias_kmem, new_fa);
1341out:
1342	fib_release_info(fi);
1343err:
1344	return err;
1345}
1346
1347/* should be called with rcu_read_lock */
1348static int check_leaf(struct trie *t, struct leaf *l,
1349		      t_key key,  const struct flowi *flp,
1350		      struct fib_result *res)
1351{
1352	struct leaf_info *li;
1353	struct hlist_head *hhead = &l->list;
1354	struct hlist_node *node;
1355
1356	hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1357		int err;
1358		int plen = li->plen;
1359		__be32 mask = inet_make_mask(plen);
1360
1361		if (l->key != (key & ntohl(mask)))
1362			continue;
1363
1364		err = fib_semantic_match(&li->falh, flp, res, plen);
1365
1366#ifdef CONFIG_IP_FIB_TRIE_STATS
1367		if (err <= 0)
1368			t->stats.semantic_match_passed++;
1369		else
1370			t->stats.semantic_match_miss++;
1371#endif
1372		if (err <= 0)
1373			return err;
1374	}
1375
1376	return 1;
1377}
1378
1379int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
1380		     struct fib_result *res)
1381{
1382	struct trie *t = (struct trie *) tb->tb_data;
1383	int ret;
1384	struct node *n;
1385	struct tnode *pn;
1386	int pos, bits;
1387	t_key key = ntohl(flp->fl4_dst);
1388	int chopped_off;
1389	t_key cindex = 0;
1390	int current_prefix_length = KEYLENGTH;
1391	struct tnode *cn;
1392	t_key node_prefix, key_prefix, pref_mismatch;
1393	int mp;
1394
1395	rcu_read_lock();
1396
1397	n = rcu_dereference(t->trie);
1398	if (!n)
1399		goto failed;
1400
1401#ifdef CONFIG_IP_FIB_TRIE_STATS
1402	t->stats.gets++;
1403#endif
1404
1405	/* Just a leaf? */
1406	if (IS_LEAF(n)) {
1407		ret = check_leaf(t, (struct leaf *)n, key, flp, res);
1408		goto found;
1409	}
1410
1411	pn = (struct tnode *) n;
1412	chopped_off = 0;
1413
1414	while (pn) {
1415		pos = pn->pos;
1416		bits = pn->bits;
1417
1418		if (!chopped_off)
1419			cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
1420						   pos, bits);
1421
1422		n = tnode_get_child_rcu(pn, cindex);
1423
1424		if (n == NULL) {
1425#ifdef CONFIG_IP_FIB_TRIE_STATS
1426			t->stats.null_node_hit++;
1427#endif
1428			goto backtrace;
1429		}
1430
1431		if (IS_LEAF(n)) {
1432			ret = check_leaf(t, (struct leaf *)n, key, flp, res);
1433			if (ret > 0)
1434				goto backtrace;
1435			goto found;
1436		}
1437
1438		cn = (struct tnode *)n;
1439
1440		/*
1441		 * It's a tnode, and we can do some extra checks here if we
1442		 * like, to avoid descending into a dead-end branch.
1443		 * This tnode is in the parent's child array at index
1444		 * key[p_pos..p_pos+p_bits] but potentially with some bits
1445		 * chopped off, so in reality the index may be just a
1446		 * subprefix, padded with zero at the end.
1447		 * We can also take a look at any skipped bits in this
1448		 * tnode - everything up to p_pos is supposed to be ok,
1449		 * and the non-chopped bits of the index (se previous
1450		 * paragraph) are also guaranteed ok, but the rest is
1451		 * considered unknown.
1452		 *
1453		 * The skipped bits are key[pos+bits..cn->pos].
1454		 */
1455
1456		/* If current_prefix_length < pos+bits, we are already doing
1457		 * actual prefix  matching, which means everything from
1458		 * pos+(bits-chopped_off) onward must be zero along some
1459		 * branch of this subtree - otherwise there is *no* valid
1460		 * prefix present. Here we can only check the skipped
1461		 * bits. Remember, since we have already indexed into the
1462		 * parent's child array, we know that the bits we chopped of
1463		 * *are* zero.
1464		 */
1465
1466		/* NOTA BENE: Checking only skipped bits
1467		   for the new node here */
1468
1469		if (current_prefix_length < pos+bits) {
1470			if (tkey_extract_bits(cn->key, current_prefix_length,
1471						cn->pos - current_prefix_length)
1472			    || !(cn->child[0]))
1473				goto backtrace;
1474		}
1475
1476		/*
1477		 * If chopped_off=0, the index is fully validated and we
1478		 * only need to look at the skipped bits for this, the new,
1479		 * tnode. What we actually want to do is to find out if
1480		 * these skipped bits match our key perfectly, or if we will
1481		 * have to count on finding a matching prefix further down,
1482		 * because if we do, we would like to have some way of
1483		 * verifying the existence of such a prefix at this point.
1484		 */
1485
1486		/* The only thing we can do at this point is to verify that
1487		 * any such matching prefix can indeed be a prefix to our
1488		 * key, and if the bits in the node we are inspecting that
1489		 * do not match our key are not ZERO, this cannot be true.
1490		 * Thus, find out where there is a mismatch (before cn->pos)
1491		 * and verify that all the mismatching bits are zero in the
1492		 * new tnode's key.
1493		 */
1494
1495		/*
1496		 * Note: We aren't very concerned about the piece of
1497		 * the key that precede pn->pos+pn->bits, since these
1498		 * have already been checked. The bits after cn->pos
1499		 * aren't checked since these are by definition
1500		 * "unknown" at this point. Thus, what we want to see
1501		 * is if we are about to enter the "prefix matching"
1502		 * state, and in that case verify that the skipped
1503		 * bits that will prevail throughout this subtree are
1504		 * zero, as they have to be if we are to find a
1505		 * matching prefix.
1506		 */
1507
1508		node_prefix = mask_pfx(cn->key, cn->pos);
1509		key_prefix = mask_pfx(key, cn->pos);
1510		pref_mismatch = key_prefix^node_prefix;
1511		mp = 0;
1512
1513		/*
1514		 * In short: If skipped bits in this node do not match
1515		 * the search key, enter the "prefix matching"
1516		 * state.directly.
1517		 */
1518		if (pref_mismatch) {
1519			while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1520				mp++;
1521				pref_mismatch = pref_mismatch << 1;
1522			}
1523			key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1524
1525			if (key_prefix != 0)
1526				goto backtrace;
1527
1528			if (current_prefix_length >= cn->pos)
1529				current_prefix_length = mp;
1530		}
1531
1532		pn = (struct tnode *)n; /* Descend */
1533		chopped_off = 0;
1534		continue;
1535
1536backtrace:
1537		chopped_off++;
1538
1539		/* As zero don't change the child key (cindex) */
1540		while ((chopped_off <= pn->bits)
1541		       && !(cindex & (1<<(chopped_off-1))))
1542			chopped_off++;
1543
1544		/* Decrease current_... with bits chopped off */
1545		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1546			current_prefix_length = pn->pos + pn->bits
1547				- chopped_off;
1548
1549		/*
1550		 * Either we do the actual chop off according or if we have
1551		 * chopped off all bits in this tnode walk up to our parent.
1552		 */
1553
1554		if (chopped_off <= pn->bits) {
1555			cindex &= ~(1 << (chopped_off-1));
1556		} else {
1557			struct tnode *parent = node_parent_rcu((struct node *) pn);
1558			if (!parent)
1559				goto failed;
1560
1561			/* Get Child's index */
1562			cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
1563			pn = parent;
1564			chopped_off = 0;
1565
1566#ifdef CONFIG_IP_FIB_TRIE_STATS
1567			t->stats.backtrack++;
1568#endif
1569			goto backtrace;
1570		}
1571	}
1572failed:
1573	ret = 1;
1574found:
1575	rcu_read_unlock();
1576	return ret;
1577}
1578
1579/*
1580 * Remove the leaf and return parent.
1581 */
1582static void trie_leaf_remove(struct trie *t, struct leaf *l)
1583{
1584	struct tnode *tp = node_parent((struct node *) l);
1585
1586	pr_debug("entering trie_leaf_remove(%p)\n", l);
1587
1588	if (tp) {
1589		t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
1590		put_child(t, (struct tnode *)tp, cindex, NULL);
1591		trie_rebalance(t, tp);
1592	} else
1593		rcu_assign_pointer(t->trie, NULL);
1594
1595	free_leaf(l);
1596}
1597
1598/*
1599 * Caller must hold RTNL.
1600 */
1601int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
1602{
1603	struct trie *t = (struct trie *) tb->tb_data;
1604	u32 key, mask;
1605	int plen = cfg->fc_dst_len;
1606	u8 tos = cfg->fc_tos;
1607	struct fib_alias *fa, *fa_to_delete;
1608	struct list_head *fa_head;
1609	struct leaf *l;
1610	struct leaf_info *li;
1611
1612	if (plen > 32)
1613		return -EINVAL;
1614
1615	key = ntohl(cfg->fc_dst);
1616	mask = ntohl(inet_make_mask(plen));
1617
1618	if (key & ~mask)
1619		return -EINVAL;
1620
1621	key = key & mask;
1622	l = fib_find_node(t, key);
1623
1624	if (!l)
1625		return -ESRCH;
1626
1627	fa_head = get_fa_head(l, plen);
1628	fa = fib_find_alias(fa_head, tos, 0);
1629
1630	if (!fa)
1631		return -ESRCH;
1632
1633	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1634
1635	fa_to_delete = NULL;
1636	fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1637	list_for_each_entry_continue(fa, fa_head, fa_list) {
1638		struct fib_info *fi = fa->fa_info;
1639
1640		if (fa->fa_tos != tos)
1641			break;
1642
1643		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1644		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1645		     fa->fa_scope == cfg->fc_scope) &&
1646		    (!cfg->fc_protocol ||
1647		     fi->fib_protocol == cfg->fc_protocol) &&
1648		    fib_nh_match(cfg, fi) == 0) {
1649			fa_to_delete = fa;
1650			break;
1651		}
1652	}
1653
1654	if (!fa_to_delete)
1655		return -ESRCH;
1656
1657	fa = fa_to_delete;
1658	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1659		  &cfg->fc_nlinfo, 0);
1660
1661	l = fib_find_node(t, key);
1662	li = find_leaf_info(l, plen);
1663
1664	list_del_rcu(&fa->fa_list);
1665
1666	if (list_empty(fa_head)) {
1667		hlist_del_rcu(&li->hlist);
1668		free_leaf_info(li);
1669	}
1670
1671	if (hlist_empty(&l->list))
1672		trie_leaf_remove(t, l);
1673
1674	if (fa->fa_state & FA_S_ACCESSED)
1675		rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1676
1677	fib_release_info(fa->fa_info);
1678	alias_free_mem_rcu(fa);
1679	return 0;
1680}
1681
1682static int trie_flush_list(struct list_head *head)
1683{
1684	struct fib_alias *fa, *fa_node;
1685	int found = 0;
1686
1687	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1688		struct fib_info *fi = fa->fa_info;
1689
1690		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1691			list_del_rcu(&fa->fa_list);
1692			fib_release_info(fa->fa_info);
1693			alias_free_mem_rcu(fa);
1694			found++;
1695		}
1696	}
1697	return found;
1698}
1699
1700static int trie_flush_leaf(struct leaf *l)
1701{
1702	int found = 0;
1703	struct hlist_head *lih = &l->list;
1704	struct hlist_node *node, *tmp;
1705	struct leaf_info *li = NULL;
1706
1707	hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1708		found += trie_flush_list(&li->falh);
1709
1710		if (list_empty(&li->falh)) {
1711			hlist_del_rcu(&li->hlist);
1712			free_leaf_info(li);
1713		}
1714	}
1715	return found;
1716}
1717
1718/*
1719 * Scan for the next right leaf starting at node p->child[idx]
1720 * Since we have back pointer, no recursion necessary.
1721 */
1722static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
1723{
1724	do {
1725		t_key idx;
1726
1727		if (c)
1728			idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
1729		else
1730			idx = 0;
1731
1732		while (idx < 1u << p->bits) {
1733			c = tnode_get_child_rcu(p, idx++);
1734			if (!c)
1735				continue;
1736
1737			if (IS_LEAF(c)) {
1738				prefetch(p->child[idx]);
1739				return (struct leaf *) c;
1740			}
1741
1742			/* Rescan start scanning in new node */
1743			p = (struct tnode *) c;
1744			idx = 0;
1745		}
1746
1747		/* Node empty, walk back up to parent */
1748		c = (struct node *) p;
1749	} while ( (p = node_parent_rcu(c)) != NULL);
1750
1751	return NULL; /* Root of trie */
1752}
1753
1754static struct leaf *trie_firstleaf(struct trie *t)
1755{
1756	struct tnode *n = (struct tnode *) rcu_dereference(t->trie);
1757
1758	if (!n)
1759		return NULL;
1760
1761	if (IS_LEAF(n))          /* trie is just a leaf */
1762		return (struct leaf *) n;
1763
1764	return leaf_walk_rcu(n, NULL);
1765}
1766
1767static struct leaf *trie_nextleaf(struct leaf *l)
1768{
1769	struct node *c = (struct node *) l;
1770	struct tnode *p = node_parent_rcu(c);
1771
1772	if (!p)
1773		return NULL;	/* trie with just one leaf */
1774
1775	return leaf_walk_rcu(p, c);
1776}
1777
1778static struct leaf *trie_leafindex(struct trie *t, int index)
1779{
1780	struct leaf *l = trie_firstleaf(t);
1781
1782	while (l && index-- > 0)
1783		l = trie_nextleaf(l);
1784
1785	return l;
1786}
1787
1788
1789/*
1790 * Caller must hold RTNL.
1791 */
1792int fib_table_flush(struct fib_table *tb)
1793{
1794	struct trie *t = (struct trie *) tb->tb_data;
1795	struct leaf *l, *ll = NULL;
1796	int found = 0;
1797
1798	for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
1799		found += trie_flush_leaf(l);
1800
1801		if (ll && hlist_empty(&ll->list))
1802			trie_leaf_remove(t, ll);
1803		ll = l;
1804	}
1805
1806	if (ll && hlist_empty(&ll->list))
1807		trie_leaf_remove(t, ll);
1808
1809	pr_debug("trie_flush found=%d\n", found);
1810	return found;
1811}
1812
1813void fib_table_select_default(struct fib_table *tb,
1814			      const struct flowi *flp,
1815			      struct fib_result *res)
1816{
1817	struct trie *t = (struct trie *) tb->tb_data;
1818	int order, last_idx;
1819	struct fib_info *fi = NULL;
1820	struct fib_info *last_resort;
1821	struct fib_alias *fa = NULL;
1822	struct list_head *fa_head;
1823	struct leaf *l;
1824
1825	last_idx = -1;
1826	last_resort = NULL;
1827	order = -1;
1828
1829	rcu_read_lock();
1830
1831	l = fib_find_node(t, 0);
1832	if (!l)
1833		goto out;
1834
1835	fa_head = get_fa_head(l, 0);
1836	if (!fa_head)
1837		goto out;
1838
1839	if (list_empty(fa_head))
1840		goto out;
1841
1842	list_for_each_entry_rcu(fa, fa_head, fa_list) {
1843		struct fib_info *next_fi = fa->fa_info;
1844
1845		if (fa->fa_scope != res->scope ||
1846		    fa->fa_type != RTN_UNICAST)
1847			continue;
1848
1849		if (next_fi->fib_priority > res->fi->fib_priority)
1850			break;
1851		if (!next_fi->fib_nh[0].nh_gw ||
1852		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1853			continue;
1854		fa->fa_state |= FA_S_ACCESSED;
1855
1856		if (fi == NULL) {
1857			if (next_fi != res->fi)
1858				break;
1859		} else if (!fib_detect_death(fi, order, &last_resort,
1860					     &last_idx, tb->tb_default)) {
1861			fib_result_assign(res, fi);
1862			tb->tb_default = order;
1863			goto out;
1864		}
1865		fi = next_fi;
1866		order++;
1867	}
1868	if (order <= 0 || fi == NULL) {
1869		tb->tb_default = -1;
1870		goto out;
1871	}
1872
1873	if (!fib_detect_death(fi, order, &last_resort, &last_idx,
1874				tb->tb_default)) {
1875		fib_result_assign(res, fi);
1876		tb->tb_default = order;
1877		goto out;
1878	}
1879	if (last_idx >= 0)
1880		fib_result_assign(res, last_resort);
1881	tb->tb_default = last_idx;
1882out:
1883	rcu_read_unlock();
1884}
1885
1886static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1887			   struct fib_table *tb,
1888			   struct sk_buff *skb, struct netlink_callback *cb)
1889{
1890	int i, s_i;
1891	struct fib_alias *fa;
1892	__be32 xkey = htonl(key);
1893
1894	s_i = cb->args[5];
1895	i = 0;
1896
1897	/* rcu_read_lock is hold by caller */
1898
1899	list_for_each_entry_rcu(fa, fah, fa_list) {
1900		if (i < s_i) {
1901			i++;
1902			continue;
1903		}
1904
1905		if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1906				  cb->nlh->nlmsg_seq,
1907				  RTM_NEWROUTE,
1908				  tb->tb_id,
1909				  fa->fa_type,
1910				  fa->fa_scope,
1911				  xkey,
1912				  plen,
1913				  fa->fa_tos,
1914				  fa->fa_info, NLM_F_MULTI) < 0) {
1915			cb->args[5] = i;
1916			return -1;
1917		}
1918		i++;
1919	}
1920	cb->args[5] = i;
1921	return skb->len;
1922}
1923
1924static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
1925			struct sk_buff *skb, struct netlink_callback *cb)
1926{
1927	struct leaf_info *li;
1928	struct hlist_node *node;
1929	int i, s_i;
1930
1931	s_i = cb->args[4];
1932	i = 0;
1933
1934	/* rcu_read_lock is hold by caller */
1935	hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
1936		if (i < s_i) {
1937			i++;
1938			continue;
1939		}
1940
1941		if (i > s_i)
1942			cb->args[5] = 0;
1943
1944		if (list_empty(&li->falh))
1945			continue;
1946
1947		if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
1948			cb->args[4] = i;
1949			return -1;
1950		}
1951		i++;
1952	}
1953
1954	cb->args[4] = i;
1955	return skb->len;
1956}
1957
1958int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
1959		   struct netlink_callback *cb)
1960{
1961	struct leaf *l;
1962	struct trie *t = (struct trie *) tb->tb_data;
1963	t_key key = cb->args[2];
1964	int count = cb->args[3];
1965
1966	rcu_read_lock();
1967	/* Dump starting at last key.
1968	 * Note: 0.0.0.0/0 (ie default) is first key.
1969	 */
1970	if (count == 0)
1971		l = trie_firstleaf(t);
1972	else {
1973		/* Normally, continue from last key, but if that is missing
1974		 * fallback to using slow rescan
1975		 */
1976		l = fib_find_node(t, key);
1977		if (!l)
1978			l = trie_leafindex(t, count);
1979	}
1980
1981	while (l) {
1982		cb->args[2] = l->key;
1983		if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
1984			cb->args[3] = count;
1985			rcu_read_unlock();
1986			return -1;
1987		}
1988
1989		++count;
1990		l = trie_nextleaf(l);
1991		memset(&cb->args[4], 0,
1992		       sizeof(cb->args) - 4*sizeof(cb->args[0]));
1993	}
1994	cb->args[3] = count;
1995	rcu_read_unlock();
1996
1997	return skb->len;
1998}
1999
2000void __init fib_hash_init(void)
2001{
2002	fn_alias_kmem = kmem_cache_create("ip_fib_alias",
2003					  sizeof(struct fib_alias),
2004					  0, SLAB_PANIC, NULL);
2005
2006	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
2007					   max(sizeof(struct leaf),
2008					       sizeof(struct leaf_info)),
2009					   0, SLAB_PANIC, NULL);
2010}
2011
2012
2013/* Fix more generic FIB names for init later */
2014struct fib_table *fib_hash_table(u32 id)
2015{
2016	struct fib_table *tb;
2017	struct trie *t;
2018
2019	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
2020		     GFP_KERNEL);
2021	if (tb == NULL)
2022		return NULL;
2023
2024	tb->tb_id = id;
2025	tb->tb_default = -1;
2026
2027	t = (struct trie *) tb->tb_data;
2028	memset(t, 0, sizeof(*t));
2029
2030	if (id == RT_TABLE_LOCAL)
2031		pr_info("IPv4 FIB: Using LC-trie version %s\n", VERSION);
2032
2033	return tb;
2034}
2035
2036#ifdef CONFIG_PROC_FS
2037/* Depth first Trie walk iterator */
2038struct fib_trie_iter {
2039	struct seq_net_private p;
2040	struct fib_table *tb;
2041	struct tnode *tnode;
2042	unsigned index;
2043	unsigned depth;
2044};
2045
2046static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
2047{
2048	struct tnode *tn = iter->tnode;
2049	unsigned cindex = iter->index;
2050	struct tnode *p;
2051
2052	/* A single entry routing table */
2053	if (!tn)
2054		return NULL;
2055
2056	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2057		 iter->tnode, iter->index, iter->depth);
2058rescan:
2059	while (cindex < (1<<tn->bits)) {
2060		struct node *n = tnode_get_child_rcu(tn, cindex);
2061
2062		if (n) {
2063			if (IS_LEAF(n)) {
2064				iter->tnode = tn;
2065				iter->index = cindex + 1;
2066			} else {
2067				/* push down one level */
2068				iter->tnode = (struct tnode *) n;
2069				iter->index = 0;
2070				++iter->depth;
2071			}
2072			return n;
2073		}
2074
2075		++cindex;
2076	}
2077
2078	/* Current node exhausted, pop back up */
2079	p = node_parent_rcu((struct node *)tn);
2080	if (p) {
2081		cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2082		tn = p;
2083		--iter->depth;
2084		goto rescan;
2085	}
2086
2087	/* got root? */
2088	return NULL;
2089}
2090
2091static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2092				       struct trie *t)
2093{
2094	struct node *n;
2095
2096	if (!t)
2097		return NULL;
2098
2099	n = rcu_dereference(t->trie);
2100	if (!n)
2101		return NULL;
2102
2103	if (IS_TNODE(n)) {
2104		iter->tnode = (struct tnode *) n;
2105		iter->index = 0;
2106		iter->depth = 1;
2107	} else {
2108		iter->tnode = NULL;
2109		iter->index = 0;
2110		iter->depth = 0;
2111	}
2112
2113	return n;
2114}
2115
2116static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2117{
2118	struct node *n;
2119	struct fib_trie_iter iter;
2120
2121	memset(s, 0, sizeof(*s));
2122
2123	rcu_read_lock();
2124	for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
2125		if (IS_LEAF(n)) {
2126			struct leaf *l = (struct leaf *)n;
2127			struct leaf_info *li;
2128			struct hlist_node *tmp;
2129
2130			s->leaves++;
2131			s->totdepth += iter.depth;
2132			if (iter.depth > s->maxdepth)
2133				s->maxdepth = iter.depth;
2134
2135			hlist_for_each_entry_rcu(li, tmp, &l->list, hlist)
2136				++s->prefixes;
2137		} else {
2138			const struct tnode *tn = (const struct tnode *) n;
2139			int i;
2140
2141			s->tnodes++;
2142			if (tn->bits < MAX_STAT_DEPTH)
2143				s->nodesizes[tn->bits]++;
2144
2145			for (i = 0; i < (1<<tn->bits); i++)
2146				if (!tn->child[i])
2147					s->nullpointers++;
2148		}
2149	}
2150	rcu_read_unlock();
2151}
2152
2153/*
2154 *	This outputs /proc/net/fib_triestats
2155 */
2156static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2157{
2158	unsigned i, max, pointers, bytes, avdepth;
2159
2160	if (stat->leaves)
2161		avdepth = stat->totdepth*100 / stat->leaves;
2162	else
2163		avdepth = 0;
2164
2165	seq_printf(seq, "\tAver depth:     %u.%02d\n",
2166		   avdepth / 100, avdepth % 100);
2167	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
2168
2169	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
2170	bytes = sizeof(struct leaf) * stat->leaves;
2171
2172	seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
2173	bytes += sizeof(struct leaf_info) * stat->prefixes;
2174
2175	seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2176	bytes += sizeof(struct tnode) * stat->tnodes;
2177
2178	max = MAX_STAT_DEPTH;
2179	while (max > 0 && stat->nodesizes[max-1] == 0)
2180		max--;
2181
2182	pointers = 0;
2183	for (i = 1; i <= max; i++)
2184		if (stat->nodesizes[i] != 0) {
2185			seq_printf(seq, "  %u: %u",  i, stat->nodesizes[i]);
2186			pointers += (1<<i) * stat->nodesizes[i];
2187		}
2188	seq_putc(seq, '\n');
2189	seq_printf(seq, "\tPointers: %u\n", pointers);
2190
2191	bytes += sizeof(struct node *) * pointers;
2192	seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2193	seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
2194}
2195
2196#ifdef CONFIG_IP_FIB_TRIE_STATS
2197static void trie_show_usage(struct seq_file *seq,
2198			    const struct trie_use_stats *stats)
2199{
2200	seq_printf(seq, "\nCounters:\n---------\n");
2201	seq_printf(seq, "gets = %u\n", stats->gets);
2202	seq_printf(seq, "backtracks = %u\n", stats->backtrack);
2203	seq_printf(seq, "semantic match passed = %u\n",
2204		   stats->semantic_match_passed);
2205	seq_printf(seq, "semantic match miss = %u\n",
2206		   stats->semantic_match_miss);
2207	seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
2208	seq_printf(seq, "skipped node resize = %u\n\n",
2209		   stats->resize_node_skipped);
2210}
2211#endif /*  CONFIG_IP_FIB_TRIE_STATS */
2212
2213static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
2214{
2215	if (tb->tb_id == RT_TABLE_LOCAL)
2216		seq_puts(seq, "Local:\n");
2217	else if (tb->tb_id == RT_TABLE_MAIN)
2218		seq_puts(seq, "Main:\n");
2219	else
2220		seq_printf(seq, "Id %d:\n", tb->tb_id);
2221}
2222
2223
2224static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2225{
2226	struct net *net = (struct net *)seq->private;
2227	unsigned int h;
2228
2229	seq_printf(seq,
2230		   "Basic info: size of leaf:"
2231		   " %Zd bytes, size of tnode: %Zd bytes.\n",
2232		   sizeof(struct leaf), sizeof(struct tnode));
2233
2234	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2235		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2236		struct hlist_node *node;
2237		struct fib_table *tb;
2238
2239		hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2240			struct trie *t = (struct trie *) tb->tb_data;
2241			struct trie_stat stat;
2242
2243			if (!t)
2244				continue;
2245
2246			fib_table_print(seq, tb);
2247
2248			trie_collect_stats(t, &stat);
2249			trie_show_stats(seq, &stat);
2250#ifdef CONFIG_IP_FIB_TRIE_STATS
2251			trie_show_usage(seq, &t->stats);
2252#endif
2253		}
2254	}
2255
2256	return 0;
2257}
2258
2259static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2260{
2261	return single_open_net(inode, file, fib_triestat_seq_show);
2262}
2263
2264static const struct file_operations fib_triestat_fops = {
2265	.owner	= THIS_MODULE,
2266	.open	= fib_triestat_seq_open,
2267	.read	= seq_read,
2268	.llseek	= seq_lseek,
2269	.release = single_release_net,
2270};
2271
2272static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2273{
2274	struct fib_trie_iter *iter = seq->private;
2275	struct net *net = seq_file_net(seq);
2276	loff_t idx = 0;
2277	unsigned int h;
2278
2279	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2280		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2281		struct hlist_node *node;
2282		struct fib_table *tb;
2283
2284		hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2285			struct node *n;
2286
2287			for (n = fib_trie_get_first(iter,
2288						    (struct trie *) tb->tb_data);
2289			     n; n = fib_trie_get_next(iter))
2290				if (pos == idx++) {
2291					iter->tb = tb;
2292					return n;
2293				}
2294		}
2295	}
2296
2297	return NULL;
2298}
2299
2300static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2301	__acquires(RCU)
2302{
2303	rcu_read_lock();
2304	return fib_trie_get_idx(seq, *pos);
2305}
2306
2307static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2308{
2309	struct fib_trie_iter *iter = seq->private;
2310	struct net *net = seq_file_net(seq);
2311	struct fib_table *tb = iter->tb;
2312	struct hlist_node *tb_node;
2313	unsigned int h;
2314	struct node *n;
2315
2316	++*pos;
2317	/* next node in same table */
2318	n = fib_trie_get_next(iter);
2319	if (n)
2320		return n;
2321
2322	/* walk rest of this hash chain */
2323	h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
2324	while ( (tb_node = rcu_dereference(tb->tb_hlist.next)) ) {
2325		tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
2326		n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2327		if (n)
2328			goto found;
2329	}
2330
2331	/* new hash chain */
2332	while (++h < FIB_TABLE_HASHSZ) {
2333		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2334		hlist_for_each_entry_rcu(tb, tb_node, head, tb_hlist) {
2335			n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2336			if (n)
2337				goto found;
2338		}
2339	}
2340	return NULL;
2341
2342found:
2343	iter->tb = tb;
2344	return n;
2345}
2346
2347static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2348	__releases(RCU)
2349{
2350	rcu_read_unlock();
2351}
2352
2353static void seq_indent(struct seq_file *seq, int n)
2354{
2355	while (n-- > 0) seq_puts(seq, "   ");
2356}
2357
2358static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2359{
2360	switch (s) {
2361	case RT_SCOPE_UNIVERSE: return "universe";
2362	case RT_SCOPE_SITE:	return "site";
2363	case RT_SCOPE_LINK:	return "link";
2364	case RT_SCOPE_HOST:	return "host";
2365	case RT_SCOPE_NOWHERE:	return "nowhere";
2366	default:
2367		snprintf(buf, len, "scope=%d", s);
2368		return buf;
2369	}
2370}
2371
2372static const char *const rtn_type_names[__RTN_MAX] = {
2373	[RTN_UNSPEC] = "UNSPEC",
2374	[RTN_UNICAST] = "UNICAST",
2375	[RTN_LOCAL] = "LOCAL",
2376	[RTN_BROADCAST] = "BROADCAST",
2377	[RTN_ANYCAST] = "ANYCAST",
2378	[RTN_MULTICAST] = "MULTICAST",
2379	[RTN_BLACKHOLE] = "BLACKHOLE",
2380	[RTN_UNREACHABLE] = "UNREACHABLE",
2381	[RTN_PROHIBIT] = "PROHIBIT",
2382	[RTN_THROW] = "THROW",
2383	[RTN_NAT] = "NAT",
2384	[RTN_XRESOLVE] = "XRESOLVE",
2385};
2386
2387static inline const char *rtn_type(char *buf, size_t len, unsigned t)
2388{
2389	if (t < __RTN_MAX && rtn_type_names[t])
2390		return rtn_type_names[t];
2391	snprintf(buf, len, "type %u", t);
2392	return buf;
2393}
2394
2395/* Pretty print the trie */
2396static int fib_trie_seq_show(struct seq_file *seq, void *v)
2397{
2398	const struct fib_trie_iter *iter = seq->private;
2399	struct node *n = v;
2400
2401	if (!node_parent_rcu(n))
2402		fib_table_print(seq, iter->tb);
2403
2404	if (IS_TNODE(n)) {
2405		struct tnode *tn = (struct tnode *) n;
2406		__be32 prf = htonl(mask_pfx(tn->key, tn->pos));
2407
2408		seq_indent(seq, iter->depth-1);
2409		seq_printf(seq, "  +-- %pI4/%d %d %d %d\n",
2410			   &prf, tn->pos, tn->bits, tn->full_children,
2411			   tn->empty_children);
2412
2413	} else {
2414		struct leaf *l = (struct leaf *) n;
2415		struct leaf_info *li;
2416		struct hlist_node *node;
2417		__be32 val = htonl(l->key);
2418
2419		seq_indent(seq, iter->depth);
2420		seq_printf(seq, "  |-- %pI4\n", &val);
2421
2422		hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2423			struct fib_alias *fa;
2424
2425			list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2426				char buf1[32], buf2[32];
2427
2428				seq_indent(seq, iter->depth+1);
2429				seq_printf(seq, "  /%d %s %s", li->plen,
2430					   rtn_scope(buf1, sizeof(buf1),
2431						     fa->fa_scope),
2432					   rtn_type(buf2, sizeof(buf2),
2433						    fa->fa_type));
2434				if (fa->fa_tos)
2435					seq_printf(seq, " tos=%d", fa->fa_tos);
2436				seq_putc(seq, '\n');
2437			}
2438		}
2439	}
2440
2441	return 0;
2442}
2443
2444static const struct seq_operations fib_trie_seq_ops = {
2445	.start  = fib_trie_seq_start,
2446	.next   = fib_trie_seq_next,
2447	.stop   = fib_trie_seq_stop,
2448	.show   = fib_trie_seq_show,
2449};
2450
2451static int fib_trie_seq_open(struct inode *inode, struct file *file)
2452{
2453	return seq_open_net(inode, file, &fib_trie_seq_ops,
2454			    sizeof(struct fib_trie_iter));
2455}
2456
2457static const struct file_operations fib_trie_fops = {
2458	.owner  = THIS_MODULE,
2459	.open   = fib_trie_seq_open,
2460	.read   = seq_read,
2461	.llseek = seq_lseek,
2462	.release = seq_release_net,
2463};
2464
2465struct fib_route_iter {
2466	struct seq_net_private p;
2467	struct trie *main_trie;
2468	loff_t	pos;
2469	t_key	key;
2470};
2471
2472static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
2473{
2474	struct leaf *l = NULL;
2475	struct trie *t = iter->main_trie;
2476
2477	/* use cache location of last found key */
2478	if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
2479		pos -= iter->pos;
2480	else {
2481		iter->pos = 0;
2482		l = trie_firstleaf(t);
2483	}
2484
2485	while (l && pos-- > 0) {
2486		iter->pos++;
2487		l = trie_nextleaf(l);
2488	}
2489
2490	if (l)
2491		iter->key = pos;	/* remember it */
2492	else
2493		iter->pos = 0;		/* forget it */
2494
2495	return l;
2496}
2497
2498static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2499	__acquires(RCU)
2500{
2501	struct fib_route_iter *iter = seq->private;
2502	struct fib_table *tb;
2503
2504	rcu_read_lock();
2505	tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
2506	if (!tb)
2507		return NULL;
2508
2509	iter->main_trie = (struct trie *) tb->tb_data;
2510	if (*pos == 0)
2511		return SEQ_START_TOKEN;
2512	else
2513		return fib_route_get_idx(iter, *pos - 1);
2514}
2515
2516static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2517{
2518	struct fib_route_iter *iter = seq->private;
2519	struct leaf *l = v;
2520
2521	++*pos;
2522	if (v == SEQ_START_TOKEN) {
2523		iter->pos = 0;
2524		l = trie_firstleaf(iter->main_trie);
2525	} else {
2526		iter->pos++;
2527		l = trie_nextleaf(l);
2528	}
2529
2530	if (l)
2531		iter->key = l->key;
2532	else
2533		iter->pos = 0;
2534	return l;
2535}
2536
2537static void fib_route_seq_stop(struct seq_file *seq, void *v)
2538	__releases(RCU)
2539{
2540	rcu_read_unlock();
2541}
2542
2543static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2544{
2545	static unsigned type2flags[RTN_MAX + 1] = {
2546		[7] = RTF_REJECT, [8] = RTF_REJECT,
2547	};
2548	unsigned flags = type2flags[type];
2549
2550	if (fi && fi->fib_nh->nh_gw)
2551		flags |= RTF_GATEWAY;
2552	if (mask == htonl(0xFFFFFFFF))
2553		flags |= RTF_HOST;
2554	flags |= RTF_UP;
2555	return flags;
2556}
2557
2558/*
2559 *	This outputs /proc/net/route.
2560 *	The format of the file is not supposed to be changed
2561 * 	and needs to be same as fib_hash output to avoid breaking
2562 *	legacy utilities
2563 */
2564static int fib_route_seq_show(struct seq_file *seq, void *v)
2565{
2566	struct leaf *l = v;
2567	struct leaf_info *li;
2568	struct hlist_node *node;
2569
2570	if (v == SEQ_START_TOKEN) {
2571		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2572			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2573			   "\tWindow\tIRTT");
2574		return 0;
2575	}
2576
2577	hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2578		struct fib_alias *fa;
2579		__be32 mask, prefix;
2580
2581		mask = inet_make_mask(li->plen);
2582		prefix = htonl(l->key);
2583
2584		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2585			const struct fib_info *fi = fa->fa_info;
2586			unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
2587			int len;
2588
2589			if (fa->fa_type == RTN_BROADCAST
2590			    || fa->fa_type == RTN_MULTICAST)
2591				continue;
2592
2593			if (fi)
2594				seq_printf(seq,
2595					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
2596					 "%d\t%08X\t%d\t%u\t%u%n",
2597					 fi->fib_dev ? fi->fib_dev->name : "*",
2598					 prefix,
2599					 fi->fib_nh->nh_gw, flags, 0, 0,
2600					 fi->fib_priority,
2601					 mask,
2602					 (fi->fib_advmss ?
2603					  fi->fib_advmss + 40 : 0),
2604					 fi->fib_window,
2605					 fi->fib_rtt >> 3, &len);
2606			else
2607				seq_printf(seq,
2608					 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
2609					 "%d\t%08X\t%d\t%u\t%u%n",
2610					 prefix, 0, flags, 0, 0, 0,
2611					 mask, 0, 0, 0, &len);
2612
2613			seq_printf(seq, "%*s\n", 127 - len, "");
2614		}
2615	}
2616
2617	return 0;
2618}
2619
2620static const struct seq_operations fib_route_seq_ops = {
2621	.start  = fib_route_seq_start,
2622	.next   = fib_route_seq_next,
2623	.stop   = fib_route_seq_stop,
2624	.show   = fib_route_seq_show,
2625};
2626
2627static int fib_route_seq_open(struct inode *inode, struct file *file)
2628{
2629	return seq_open_net(inode, file, &fib_route_seq_ops,
2630			    sizeof(struct fib_route_iter));
2631}
2632
2633static const struct file_operations fib_route_fops = {
2634	.owner  = THIS_MODULE,
2635	.open   = fib_route_seq_open,
2636	.read   = seq_read,
2637	.llseek = seq_lseek,
2638	.release = seq_release_net,
2639};
2640
2641int __net_init fib_proc_init(struct net *net)
2642{
2643	if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
2644		goto out1;
2645
2646	if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
2647				  &fib_triestat_fops))
2648		goto out2;
2649
2650	if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
2651		goto out3;
2652
2653	return 0;
2654
2655out3:
2656	proc_net_remove(net, "fib_triestat");
2657out2:
2658	proc_net_remove(net, "fib_trie");
2659out1:
2660	return -ENOMEM;
2661}
2662
2663void __net_exit fib_proc_exit(struct net *net)
2664{
2665	proc_net_remove(net, "fib_trie");
2666	proc_net_remove(net, "fib_triestat");
2667	proc_net_remove(net, "route");
2668}
2669
2670#endif /* CONFIG_PROC_FS */
2671