fib_trie.c revision b67bfe0d42cac56c512dd5da4b1b347a23f4b70a
/*
 *   This program is free software; you can redistribute it and/or
 *   modify it under the terms of the GNU General Public License
 *   as published by the Free Software Foundation; either version
 *   2 of the License, or (at your option) any later version.
 *
 *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
 *     & Swedish University of Agricultural Sciences.
 *
 *   Jens Laas <jens.laas@data.slu.se> Swedish University of
 *     Agricultural Sciences.
 *
 *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
 *
 * This work is based on the LPC-trie which is originally described in:
 *
 * An experimental study of compression methods for dynamic tries
 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
 * http://www.csc.kth.se/~snilsson/software/dyntrie2/
 *
 *
 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
 *
 *
 * Code from fib_hash has been reused which includes the following header:
 *
 *
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IPv4 FIB: lookup engine and maintenance routines.
 *
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Substantial contributions to this work come from:
 *
 *		David S. Miller, <davem@davemloft.net>
 *		Stephen Hemminger <shemminger@osdl.org>
 *		Paul E. McKenney <paulmck@us.ibm.com>
 *		Patrick McHardy <kaber@trash.net>
 */

#define VERSION "0.409"

#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/skbuff.h>
#include <linux/netlink.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/prefetch.h>
#include <linux/export.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include "fib_lookup.h"

#define MAX_STAT_DEPTH 32

#define KEYLENGTH (8*sizeof(t_key))

typedef unsigned int t_key;
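/* A t_key holds the full 32-bit routing key - the IPv4 destination,
 * converted to host byte order (ntohl) before insertion and lookup -
 * so KEYLENGTH evaluates to 32.
 */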

#define T_TNODE 0
#define T_LEAF  1
#define NODE_TYPE_MASK	0x1UL
#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)

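/* The node type is encoded in bit 0 of the parent pointer: leaves and
 * tnodes come from the slab/page allocators with at least word
 * alignment, so that bit is otherwise always zero.
 */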
#define IS_TNODE(n) (!(n->parent & T_LEAF))
#define IS_LEAF(n) (n->parent & T_LEAF)

struct rt_trie_node {
	unsigned long parent;
	t_key key;
};

struct leaf {
	unsigned long parent;
	t_key key;
	struct hlist_head list;
	struct rcu_head rcu;
};

struct leaf_info {
	struct hlist_node hlist;
	int plen;
	u32 mask_plen; /* ntohl(inet_make_mask(plen)) */
	struct list_head falh;
	struct rcu_head rcu;
};

struct tnode {
	unsigned long parent;
	t_key key;
	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
	unsigned int full_children;	/* KEYLENGTH bits needed */
	unsigned int empty_children;	/* KEYLENGTH bits needed */
	union {
		struct rcu_head rcu;
		struct work_struct work;
		struct tnode *tnode_free;
	};
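	/* child array: 1 << bits slots, allocated inline with the tnode
	 * itself; see tnode_new() for the sizing. */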
	struct rt_trie_node __rcu *child[0];
};

#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats {
	unsigned int gets;
	unsigned int backtrack;
	unsigned int semantic_match_passed;
	unsigned int semantic_match_miss;
	unsigned int null_node_hit;
	unsigned int resize_node_skipped;
};
#endif

struct trie_stat {
	unsigned int totdepth;
	unsigned int maxdepth;
	unsigned int tnodes;
	unsigned int leaves;
	unsigned int nullpointers;
	unsigned int prefixes;
	unsigned int nodesizes[MAX_STAT_DEPTH];
};

struct trie {
	struct rt_trie_node __rcu *trie;
#ifdef CONFIG_IP_FIB_TRIE_STATS
	struct trie_use_stats stats;
#endif
};

static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
				  int wasfull);
static struct rt_trie_node *resize(struct trie *t, struct tnode *tn);
static struct tnode *inflate(struct trie *t, struct tnode *tn);
static struct tnode *halve(struct trie *t, struct tnode *tn);
/* tnodes to free after resize(); protected by RTNL */
static struct tnode *tnode_free_head;
static size_t tnode_free_size;

/*
 * synchronize_rcu after call_rcu for that many pages; it should be especially
 * useful before resizing the root node with PREEMPT_NONE configs; the value was
 * obtained experimentally, aiming to avoid visible slowdown.
 */
static const int sync_pages = 128;

static struct kmem_cache *fn_alias_kmem __read_mostly;
static struct kmem_cache *trie_leaf_kmem __read_mostly;

/*
 * caller must hold RTNL
 */
static inline struct tnode *node_parent(const struct rt_trie_node *node)
{
	unsigned long parent;

	parent = rcu_dereference_index_check(node->parent, lockdep_rtnl_is_held());

	return (struct tnode *)(parent & ~NODE_TYPE_MASK);
}

/*
 * caller must hold RCU read lock or RTNL
 */
static inline struct tnode *node_parent_rcu(const struct rt_trie_node *node)
{
	unsigned long parent;

	parent = rcu_dereference_index_check(node->parent, rcu_read_lock_held() ||
							   lockdep_rtnl_is_held());

	return (struct tnode *)(parent & ~NODE_TYPE_MASK);
}

/* Same as rcu_assign_pointer(),
 * but that macro assumes the value is a pointer.
 */
static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr)
{
	smp_wmb();
	node->parent = (unsigned long)ptr | NODE_TYPE(node);
}

/*
 * caller must hold RTNL
 */
static inline struct rt_trie_node *tnode_get_child(const struct tnode *tn, unsigned int i)
{
	BUG_ON(i >= 1U << tn->bits);

	return rtnl_dereference(tn->child[i]);
}

/*
 * caller must hold RCU read lock or RTNL
 */
static inline struct rt_trie_node *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)
{
	BUG_ON(i >= 1U << tn->bits);

	return rcu_dereference_rtnl(tn->child[i]);
}

static inline int tnode_child_length(const struct tnode *tn)
{
	return 1 << tn->bits;
}

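/* mask_pfx(k, l) keeps the topmost l bits of k and clears the rest,
 * e.g. mask_pfx(0xC0A80101, 16) == 0xC0A80000
 * (192.168.1.1 masked to 192.168.0.0).
 */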
static inline t_key mask_pfx(t_key k, unsigned int l)
{
	return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
}

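/* tkey_extract_bits() returns the 'bits'-wide field starting 'offset'
 * bits from the MSB of the key. Worked example: for a = 0xC0A80101
 * (192.168.1.1), tkey_extract_bits(a, 8, 8) == 0xA8 (168), the second
 * address octet.
 */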
static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)
{
	if (offset < KEYLENGTH)
		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
	else
		return 0;
}

static inline int tkey_equals(t_key a, t_key b)
{
	return a == b;
}

static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
{
	if (bits == 0 || offset >= KEYLENGTH)
		return 1;
	bits = bits > KEYLENGTH ? KEYLENGTH : bits;
	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
}

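/* Returns the position, counted from the MSB, of the first bit at or
 * after 'offset' where a and b differ, or 0 if the keys are equal.
 * Callers are expected to guarantee that the keys already agree on the
 * bits before 'offset'; otherwise the scan below would run past the key.
 */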
static inline int tkey_mismatch(t_key a, int offset, t_key b)
{
	t_key diff = a ^ b;
	int i = offset;

	if (!diff)
		return 0;
	while ((diff << i) >> (KEYLENGTH-1) == 0)
		i++;
	return i;
}

/*
  To understand this stuff, an understanding of keys and all their bits is
  necessary. Every node in the trie has a key associated with it, but not
  all of the bits in that key are significant.

  Consider a node 'n' and its parent 'tp'.

  If n is a leaf, every bit in its key is significant. Its presence is
  necessitated by path compression, since during a tree traversal (when
  searching for a leaf - unless we are doing an insertion) we will completely
  ignore all skipped bits we encounter. Thus we need to verify, at the end of
  a potentially successful search, that we have indeed been walking the
  correct key path.

  Note that we can never "miss" the correct key in the tree if present by
  following the wrong path. Path compression ensures that segments of the key
  that are the same for all keys with a given prefix are skipped, but the
  skipped part *is* identical for each node in the subtrie below the skipped
  bit! trie_insert() in this implementation takes care of that - note the
  call to tkey_sub_equals() in trie_insert().
  If n is an internal node - a 'tnode' here - the various parts of its key
  have many different meanings.

  Example:
  _________________________________________________________________
  | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
  -----------------------------------------------------------------
    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15

  _________________________________________________________________
  | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
  -----------------------------------------------------------------
   16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31

  tp->pos = 7
  tp->bits = 3
  n->pos = 15
  n->bits = 4

  First, let's just ignore the bits that come before the parent tp, that is
  the bits from 0 to (tp->pos-1). They are *known* but at this point we do
  not use them for anything.

  The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
  index into the parent's child array. That is, they will be used to find
  'n' among tp's children.

  The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
  for the node n.

  All the bits we have seen so far are significant to the node n. The rest
  of the bits are really not needed or indeed known in n->key.

  The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
  n's child array, and will of course be different for each child.


  The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
  at this point.

*/

static inline void check_tnode(const struct tnode *tn)
{
	WARN_ON(tn && tn->pos+tn->bits > 32);
}

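/* Resize thresholds, in percent. A tnode is inflated when the doubled
 * node would be at least inflate_threshold percent full, and halved
 * when fewer than halve_threshold percent of its slots are non-empty
 * (see the derivation in resize()). The root gets a lower inflate
 * threshold and a lower halve threshold to keep it larger.
 */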
static const int halve_threshold = 25;
static const int inflate_threshold = 50;
static const int halve_threshold_root = 15;
static const int inflate_threshold_root = 30;

static void __alias_free_mem(struct rcu_head *head)
{
	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
	kmem_cache_free(fn_alias_kmem, fa);
}

static inline void alias_free_mem_rcu(struct fib_alias *fa)
{
	call_rcu(&fa->rcu, __alias_free_mem);
}

static void __leaf_free_rcu(struct rcu_head *head)
{
	struct leaf *l = container_of(head, struct leaf, rcu);
	kmem_cache_free(trie_leaf_kmem, l);
}

static inline void free_leaf(struct leaf *l)
{
	call_rcu(&l->rcu, __leaf_free_rcu);
}

static inline void free_leaf_info(struct leaf_info *leaf)
{
	kfree_rcu(leaf, rcu);
}

static struct tnode *tnode_alloc(size_t size)
{
	if (size <= PAGE_SIZE)
		return kzalloc(size, GFP_KERNEL);
	else
		return vzalloc(size);
}

static void __tnode_vfree(struct work_struct *arg)
{
	struct tnode *tn = container_of(arg, struct tnode, work);
	vfree(tn);
}

static void __tnode_free_rcu(struct rcu_head *head)
{
	struct tnode *tn = container_of(head, struct tnode, rcu);
	size_t size = sizeof(struct tnode) +
		      (sizeof(struct rt_trie_node *) << tn->bits);

	if (size <= PAGE_SIZE)
		kfree(tn);
	else {
		INIT_WORK(&tn->work, __tnode_vfree);
		schedule_work(&tn->work);
	}
}

static inline void tnode_free(struct tnode *tn)
{
	if (IS_LEAF(tn))
		free_leaf((struct leaf *) tn);
	else
		call_rcu(&tn->rcu, __tnode_free_rcu);
}

static void tnode_free_safe(struct tnode *tn)
{
	BUG_ON(IS_LEAF(tn));
	tn->tnode_free = tnode_free_head;
	tnode_free_head = tn;
	tnode_free_size += sizeof(struct tnode) +
			   (sizeof(struct rt_trie_node *) << tn->bits);
}

static void tnode_free_flush(void)
{
	struct tnode *tn;

	while ((tn = tnode_free_head)) {
		tnode_free_head = tn->tnode_free;
		tn->tnode_free = NULL;
		tnode_free(tn);
	}

	if (tnode_free_size >= PAGE_SIZE * sync_pages) {
		tnode_free_size = 0;
		synchronize_rcu();
	}
}

static struct leaf *leaf_new(void)
{
	struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
	if (l) {
		l->parent = T_LEAF;
		INIT_HLIST_HEAD(&l->list);
	}
	return l;
}

static struct leaf_info *leaf_info_new(int plen)
{
	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
	if (li) {
		li->plen = plen;
		li->mask_plen = ntohl(inet_make_mask(plen));
		INIT_LIST_HEAD(&li->falh);
	}
	return li;
}

static struct tnode *tnode_new(t_key key, int pos, int bits)
{
	size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits);
	struct tnode *tn = tnode_alloc(sz);

	if (tn) {
		tn->parent = T_TNODE;
		tn->pos = pos;
		tn->bits = bits;
		tn->key = key;
		tn->full_children = 0;
		tn->empty_children = 1<<bits;
	}

	pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
		 sizeof(struct rt_trie_node *) << bits);
	return tn;
}

/*
 * Check whether a tnode 'n' is "full", i.e. it is an internal node
 * and no bits are skipped. See discussion in dyntree paper p. 6
 */

static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *n)
{
	if (n == NULL || IS_LEAF(n))
		return 0;

	return ((struct tnode *) n)->pos == tn->pos + tn->bits;
}

static inline void put_child(struct tnode *tn, int i,
			     struct rt_trie_node *n)
{
	tnode_put_child_reorg(tn, i, n, -1);
}

 /*
  * Add a child at position i overwriting the old value.
  * Update the value of full_children and empty_children.
  */

static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
				  int wasfull)
{
	struct rt_trie_node *chi = rtnl_dereference(tn->child[i]);
	int isfull;

	BUG_ON(i >= 1<<tn->bits);

	/* update emptyChildren */
	if (n == NULL && chi != NULL)
		tn->empty_children++;
	else if (n != NULL && chi == NULL)
		tn->empty_children--;

	/* update fullChildren */
	if (wasfull == -1)
		wasfull = tnode_full(tn, chi);

	isfull = tnode_full(tn, n);
	if (wasfull && !isfull)
		tn->full_children--;
	else if (!wasfull && isfull)
		tn->full_children++;

	if (n)
		node_set_parent(n, tn);

	rcu_assign_pointer(tn->child[i], n);
}

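/* Upper bound on the number of inflate()/halve() passes attempted in a
 * single resize() call. */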
#define MAX_WORK 10
static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
{
	int i;
	struct tnode *old_tn;
	int inflate_threshold_use;
	int halve_threshold_use;
	int max_work;

	if (!tn)
		return NULL;

	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
		 tn, inflate_threshold, halve_threshold);

	/* No children */
	if (tn->empty_children == tnode_child_length(tn)) {
		tnode_free_safe(tn);
		return NULL;
	}
	/* One child */
	if (tn->empty_children == tnode_child_length(tn) - 1)
		goto one_child;
	/*
	 * Double as long as the resulting node has a number of
	 * nonempty nodes that are above the threshold.
	 */

	/*
	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
	 * Telecommunications, page 6:
	 * "A node is doubled if the ratio of non-empty children to all
	 * children in the *doubled* node is at least 'high'."
	 *
	 * 'high' in this instance is the variable 'inflate_threshold'. It
	 * is expressed as a percentage, so we multiply it by
	 * tnode_child_length() and instead of multiplying by 2 (since the
	 * child array will be doubled by inflate()) and multiplying
	 * the left-hand side by 100 (to handle the percentage thing) we
	 * multiply the left-hand side by 50.
	 *
	 * The left-hand side may look a bit weird: tnode_child_length(tn)
	 * - tn->empty_children is of course the number of non-null children
	 * in the current node. tn->full_children is the number of "full"
	 * children, that is non-null tnodes with a skip value of 0.
	 * All of those will be doubled in the resulting inflated tnode, so
	 * we just count them one extra time here.
	 *
	 * A clearer way to write this would be:
	 *
	 * to_be_doubled = tn->full_children;
	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
	 *     tn->full_children;
	 *
	 * new_child_length = tnode_child_length(tn) * 2;
	 *
	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
	 *      new_child_length;
	 * if (new_fill_factor >= inflate_threshold)
	 *
	 * ...and so on, though it would mess up the while () loop.
	 *
	 * anyway,
	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
	 *      inflate_threshold
	 *
	 * avoid a division:
	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
	 *      inflate_threshold * new_child_length
	 *
	 * expand not_to_be_doubled and to_be_doubled, and shorten:
	 * 100 * (tnode_child_length(tn) - tn->empty_children +
	 *    tn->full_children) >= inflate_threshold * new_child_length
	 *
	 * expand new_child_length:
	 * 100 * (tnode_child_length(tn) - tn->empty_children +
	 *    tn->full_children) >=
	 *      inflate_threshold * tnode_child_length(tn) * 2
	 *
	 * shorten again:
	 * 50 * (tn->full_children + tnode_child_length(tn) -
	 *    tn->empty_children) >= inflate_threshold *
	 *    tnode_child_length(tn)
	 *
	 */

	check_tnode(tn);

	/* Keep root node larger  */

	if (!node_parent((struct rt_trie_node *)tn)) {
		inflate_threshold_use = inflate_threshold_root;
		halve_threshold_use = halve_threshold_root;
	} else {
		inflate_threshold_use = inflate_threshold;
		halve_threshold_use = halve_threshold;
	}

	max_work = MAX_WORK;
	while ((tn->full_children > 0 &&  max_work-- &&
		50 * (tn->full_children + tnode_child_length(tn)
		      - tn->empty_children)
		>= inflate_threshold_use * tnode_child_length(tn))) {

		old_tn = tn;
		tn = inflate(t, tn);

		if (IS_ERR(tn)) {
			tn = old_tn;
#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.resize_node_skipped++;
#endif
			break;
		}
	}

	check_tnode(tn);

	/* Return if at least one inflate was run */
	if (max_work != MAX_WORK)
		return (struct rt_trie_node *) tn;

	/*
	 * Halve as long as the number of empty children in this
	 * node is above threshold.
	 */

	max_work = MAX_WORK;
	while (tn->bits > 1 &&  max_work-- &&
	       100 * (tnode_child_length(tn) - tn->empty_children) <
	       halve_threshold_use * tnode_child_length(tn)) {

		old_tn = tn;
		tn = halve(t, tn);
		if (IS_ERR(tn)) {
			tn = old_tn;
#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.resize_node_skipped++;
#endif
			break;
		}
	}


	/* Only one child remains */
	if (tn->empty_children == tnode_child_length(tn) - 1) {
one_child:
		for (i = 0; i < tnode_child_length(tn); i++) {
			struct rt_trie_node *n;

			n = rtnl_dereference(tn->child[i]);
			if (!n)
				continue;

			/* compress one level */

			node_set_parent(n, NULL);
			tnode_free_safe(tn);
			return n;
		}
	}
	return (struct rt_trie_node *) tn;
}


static void tnode_clean_free(struct tnode *tn)
{
	int i;
	struct tnode *tofree;

	for (i = 0; i < tnode_child_length(tn); i++) {
		tofree = (struct tnode *)rtnl_dereference(tn->child[i]);
		if (tofree)
			tnode_free(tofree);
	}
	tnode_free(tn);
}

static struct tnode *inflate(struct trie *t, struct tnode *tn)
{
	struct tnode *oldtnode = tn;
	int olen = tnode_child_length(tn);
	int i;

	pr_debug("In inflate\n");

	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);

	if (!tn)
		return ERR_PTR(-ENOMEM);

	/*
	 * Preallocate and store tnodes before the actual work so we
	 * don't get into an inconsistent state if memory allocation
	 * fails. In case of failure we return the old node and the
	 * inflate of the tnode is ignored.
	 */

	for (i = 0; i < olen; i++) {
		struct tnode *inode;

		inode = (struct tnode *) tnode_get_child(oldtnode, i);
		if (inode &&
		    IS_TNODE(inode) &&
		    inode->pos == oldtnode->pos + oldtnode->bits &&
		    inode->bits > 1) {
			struct tnode *left, *right;
			t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;

			left = tnode_new(inode->key&(~m), inode->pos + 1,
					 inode->bits - 1);
			if (!left)
				goto nomem;

			right = tnode_new(inode->key|m, inode->pos + 1,
					  inode->bits - 1);

			if (!right) {
				tnode_free(left);
				goto nomem;
			}

			put_child(tn, 2*i, (struct rt_trie_node *) left);
			put_child(tn, 2*i+1, (struct rt_trie_node *) right);
		}
	}

	for (i = 0; i < olen; i++) {
		struct tnode *inode;
		struct rt_trie_node *node = tnode_get_child(oldtnode, i);
		struct tnode *left, *right;
		int size, j;

		/* An empty child */
		if (node == NULL)
			continue;

		/* A leaf or an internal node with skipped bits */

		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
		   tn->pos + tn->bits - 1) {
			if (tkey_extract_bits(node->key,
					      oldtnode->pos + oldtnode->bits,
					      1) == 0)
				put_child(tn, 2*i, node);
			else
				put_child(tn, 2*i+1, node);
			continue;
		}

		/* An internal node with two children */
		inode = (struct tnode *) node;

		if (inode->bits == 1) {
			put_child(tn, 2*i, rtnl_dereference(inode->child[0]));
			put_child(tn, 2*i+1, rtnl_dereference(inode->child[1]));

			tnode_free_safe(inode);
			continue;
		}

		/* An internal node with more than two children */

		/* We will replace this node 'inode' with two new
		 * ones, 'left' and 'right', each with half of the
		 * original children. The two new nodes will have
		 * a position one bit further down the key and this
		 * means that the "significant" part of their keys
		 * (see the discussion near the top of this file)
		 * will differ by one bit, which will be "0" in
		 * left's key and "1" in right's key. Since we are
		 * moving the key position by one step, the bit that
		 * we are moving away from - the bit at position
		 * (inode->pos) - is the one that will differ between
		 * left and right. So... we synthesize that bit in the
		 * two new keys.
		 * The mask 'm' below will be a single "one" bit at
		 * the position (inode->pos)
		 */

		/* Use the old key, but set the new significant
		 *   bit to zero.
		 */

		left = (struct tnode *) tnode_get_child(tn, 2*i);
		put_child(tn, 2*i, NULL);

		BUG_ON(!left);

		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
		put_child(tn, 2*i+1, NULL);

		BUG_ON(!right);

		size = tnode_child_length(left);
		for (j = 0; j < size; j++) {
			put_child(left, j, rtnl_dereference(inode->child[j]));
			put_child(right, j, rtnl_dereference(inode->child[j + size]));
		}
		put_child(tn, 2*i, resize(t, left));
		put_child(tn, 2*i+1, resize(t, right));

		tnode_free_safe(inode);
	}
	tnode_free_safe(oldtnode);
	return tn;
nomem:
	tnode_clean_free(tn);
	return ERR_PTR(-ENOMEM);
}

static struct tnode *halve(struct trie *t, struct tnode *tn)
{
	struct tnode *oldtnode = tn;
	struct rt_trie_node *left, *right;
	int i;
	int olen = tnode_child_length(tn);

	pr_debug("In halve\n");

	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);

	if (!tn)
		return ERR_PTR(-ENOMEM);

	/*
	 * Preallocate and store tnodes before the actual work so we
	 * don't get into an inconsistent state if memory allocation
	 * fails. In case of failure we return the old node and the
	 * halve of the tnode is ignored.
	 */

	for (i = 0; i < olen; i += 2) {
		left = tnode_get_child(oldtnode, i);
		right = tnode_get_child(oldtnode, i+1);

		/* Two nonempty children */
		if (left && right) {
			struct tnode *newn;

			newn = tnode_new(left->key, tn->pos + tn->bits, 1);

			if (!newn)
				goto nomem;

			put_child(tn, i/2, (struct rt_trie_node *)newn);
		}

	}

	for (i = 0; i < olen; i += 2) {
		struct tnode *newBinNode;

		left = tnode_get_child(oldtnode, i);
		right = tnode_get_child(oldtnode, i+1);

		/* At least one of the children is empty */
		if (left == NULL) {
			if (right == NULL)    /* Both are empty */
				continue;
			put_child(tn, i/2, right);
			continue;
		}

		if (right == NULL) {
			put_child(tn, i/2, left);
			continue;
		}

		/* Two nonempty children */
		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
		put_child(tn, i/2, NULL);
		put_child(newBinNode, 0, left);
		put_child(newBinNode, 1, right);
		put_child(tn, i/2, resize(t, newBinNode));
	}
	tnode_free_safe(oldtnode);
	return tn;
nomem:
	tnode_clean_free(tn);
	return ERR_PTR(-ENOMEM);
}

/* The read side must use rcu_read_lock(); currently that is the dump
 * routines, via get_fa_head and dump */

static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
{
	struct hlist_head *head = &l->list;
	struct leaf_info *li;

	hlist_for_each_entry_rcu(li, head, hlist)
		if (li->plen == plen)
			return li;

	return NULL;
}

static inline struct list_head *get_fa_head(struct leaf *l, int plen)
{
	struct leaf_info *li = find_leaf_info(l, plen);

	if (!li)
		return NULL;

	return &li->falh;
}

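/* leaf_info entries are kept sorted by descending prefix length, so the
 * read side in check_leaf() tries the longest prefixes first.
 */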
static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
{
	struct leaf_info *li = NULL, *last = NULL;

	if (hlist_empty(head)) {
		hlist_add_head_rcu(&new->hlist, head);
	} else {
		hlist_for_each_entry(li, head, hlist) {
			if (new->plen > li->plen)
				break;

			last = li;
		}
		if (last)
			hlist_add_after_rcu(&last->hlist, &new->hlist);
		else
			hlist_add_before_rcu(&new->hlist, &li->hlist);
	}
}

/* rcu_read_lock must be held by the caller on the read side */

static struct leaf *
fib_find_node(struct trie *t, u32 key)
{
	int pos;
	struct tnode *tn;
	struct rt_trie_node *n;

	pos = 0;
	n = rcu_dereference_rtnl(t->trie);

	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
		tn = (struct tnode *) n;

		check_tnode(tn);

		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
			pos = tn->pos + tn->bits;
			n = tnode_get_child_rcu(tn,
						tkey_extract_bits(key,
								  tn->pos,
								  tn->bits));
		} else
			break;
	}
	/* Case we have found a leaf. Compare prefixes */

	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
		return (struct leaf *)n;

	return NULL;
}

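/* Walk from a freshly changed tnode back towards the root, resizing
 * each node on the way and re-linking the result into its parent (or
 * into t->trie at the top), flushing the batched tnode frees as we go.
 */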
static void trie_rebalance(struct trie *t, struct tnode *tn)
{
	int wasfull;
	t_key cindex, key;
	struct tnode *tp;

	key = tn->key;

	while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) {
		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
		tn = (struct tnode *)resize(t, tn);

		tnode_put_child_reorg(tp, cindex,
				      (struct rt_trie_node *)tn, wasfull);

		tp = node_parent((struct rt_trie_node *) tn);
		if (!tp)
			rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);

		tnode_free_flush();
		if (!tp)
			break;
		tn = tp;
	}

	/* Handle last (top) tnode */
	if (IS_TNODE(tn))
		tn = (struct tnode *)resize(t, tn);

	rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
	tnode_free_flush();
}

/* only used from updater-side */

static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
{
	int pos, newpos;
	struct tnode *tp = NULL, *tn = NULL;
	struct rt_trie_node *n;
	struct leaf *l;
	int missbit;
	struct list_head *fa_head = NULL;
	struct leaf_info *li;
	t_key cindex;

	pos = 0;
	n = rtnl_dereference(t->trie);

	/* If we point to NULL, stop. Either the tree is empty and we should
	 * just put a new leaf in it, or we have reached an empty child slot,
	 * and we should just put our new leaf in that.
	 * If we point to a T_TNODE, check if it matches our key. Note that
	 * a T_TNODE might be skipping any number of bits - its 'pos' need
	 * not be the parent's 'pos'+'bits'!
	 *
	 * If it does match the current key, get pos/bits from it, extract
	 * the index from our key, push the T_TNODE and walk the tree.
	 *
	 * If it doesn't, we have to replace it with a new T_TNODE.
	 *
	 * If we point to a T_LEAF, it might or might not have the same key
	 * as we do. If it does, just update the T_LEAF's value and return
	 * it. If it doesn't, we need to replace it with a T_TNODE.
	 */

	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
		tn = (struct tnode *) n;

		check_tnode(tn);

		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
			tp = tn;
			pos = tn->pos + tn->bits;
			n = tnode_get_child(tn,
					    tkey_extract_bits(key,
							      tn->pos,
							      tn->bits));

			BUG_ON(n && node_parent(n) != tn);
		} else
			break;
	}

	/*
	 * n  ----> NULL, LEAF or TNODE
	 *
	 * tp is n's (parent) ----> NULL or TNODE
	 */

	BUG_ON(tp && IS_LEAF(tp));

	/* Case 1: n is a leaf. Compare prefixes */

	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
		l = (struct leaf *) n;
		li = leaf_info_new(plen);

		if (!li)
			return NULL;

		fa_head = &li->falh;
		insert_leaf_info(&l->list, li);
		goto done;
	}
	l = leaf_new();

	if (!l)
		return NULL;

	l->key = key;
	li = leaf_info_new(plen);

	if (!li) {
		free_leaf(l);
		return NULL;
	}

	fa_head = &li->falh;
	insert_leaf_info(&l->list, li);

	if (t->trie && n == NULL) {
		/* Case 2: n is NULL, and will just insert a new leaf */

		node_set_parent((struct rt_trie_node *)l, tp);

		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
		put_child(tp, cindex, (struct rt_trie_node *)l);
	} else {
		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
		/*
		 *  Add a new tnode here
		 *  the first tnode needs some special handling
		 */

		if (tp)
			pos = tp->pos+tp->bits;
		else
			pos = 0;

		if (n) {
			newpos = tkey_mismatch(key, pos, n->key);
			tn = tnode_new(n->key, newpos, 1);
		} else {
			newpos = 0;
			tn = tnode_new(key, newpos, 1); /* First tnode */
		}

		if (!tn) {
			free_leaf_info(li);
			free_leaf(l);
			return NULL;
		}

		node_set_parent((struct rt_trie_node *)tn, tp);

		missbit = tkey_extract_bits(key, newpos, 1);
		put_child(tn, missbit, (struct rt_trie_node *)l);
		put_child(tn, 1-missbit, n);

		if (tp) {
			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
			put_child(tp, cindex, (struct rt_trie_node *)tn);
		} else {
			rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
			tp = tn;
		}
	}

	if (tp && tp->pos + tp->bits > 32)
		pr_warn("fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
			tp, tp->pos, tp->bits, key, plen);

	/* Rebalance the trie */

	trie_rebalance(t, tp);
done:
	return fa_head;
}

/*
 * Caller must hold RTNL.
 */
int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
{
	struct trie *t = (struct trie *) tb->tb_data;
	struct fib_alias *fa, *new_fa;
	struct list_head *fa_head = NULL;
	struct fib_info *fi;
	int plen = cfg->fc_dst_len;
	u8 tos = cfg->fc_tos;
	u32 key, mask;
	int err;
	struct leaf *l;

	if (plen > 32)
		return -EINVAL;

	key = ntohl(cfg->fc_dst);

	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);

	mask = ntohl(inet_make_mask(plen));

	if (key & ~mask)
		return -EINVAL;

	key = key & mask;

	fi = fib_create_info(cfg);
	if (IS_ERR(fi)) {
		err = PTR_ERR(fi);
		goto err;
	}

	l = fib_find_node(t, key);
	fa = NULL;

	if (l) {
		fa_head = get_fa_head(l, plen);
		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
	}

	/* Now fa, if non-NULL, points to the first fib alias
	 * with the same keys [prefix,tos,priority], if such a key already
	 * exists, or to the node before which we will insert the new one.
	 *
	 * If fa is NULL, we will need to allocate a new one and
	 * insert it at the head of f.
	 *
	 * If f is NULL, no fib node matched the destination key
	 * and we need to allocate a new one of those as well.
	 */

	if (fa && fa->fa_tos == tos &&
	    fa->fa_info->fib_priority == fi->fib_priority) {
		struct fib_alias *fa_first, *fa_match;

		err = -EEXIST;
		if (cfg->fc_nlflags & NLM_F_EXCL)
			goto out;

		/* We have 2 goals:
		 * 1. Find exact match for type, scope, fib_info to avoid
		 * duplicate routes
		 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
		 */
		fa_match = NULL;
		fa_first = fa;
		fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
		list_for_each_entry_continue(fa, fa_head, fa_list) {
			if (fa->fa_tos != tos)
				break;
			if (fa->fa_info->fib_priority != fi->fib_priority)
				break;
			if (fa->fa_type == cfg->fc_type &&
			    fa->fa_info == fi) {
				fa_match = fa;
				break;
			}
		}

		if (cfg->fc_nlflags & NLM_F_REPLACE) {
			struct fib_info *fi_drop;
			u8 state;

			fa = fa_first;
			if (fa_match) {
				if (fa == fa_match)
					err = 0;
				goto out;
			}
			err = -ENOBUFS;
			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
			if (new_fa == NULL)
				goto out;

			fi_drop = fa->fa_info;
			new_fa->fa_tos = fa->fa_tos;
			new_fa->fa_info = fi;
			new_fa->fa_type = cfg->fc_type;
			state = fa->fa_state;
			new_fa->fa_state = state & ~FA_S_ACCESSED;

			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
			alias_free_mem_rcu(fa);

			fib_release_info(fi_drop);
			if (state & FA_S_ACCESSED)
				rt_cache_flush(cfg->fc_nlinfo.nl_net);
			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
				tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);

			goto succeeded;
		}
		/* Error if we find a perfect match which
		 * uses the same scope, type, and nexthop
		 * information.
		 */
		if (fa_match)
			goto out;

		if (!(cfg->fc_nlflags & NLM_F_APPEND))
			fa = fa_first;
	}
	err = -ENOENT;
	if (!(cfg->fc_nlflags & NLM_F_CREATE))
		goto out;

	err = -ENOBUFS;
	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
	if (new_fa == NULL)
		goto out;

	new_fa->fa_info = fi;
	new_fa->fa_tos = tos;
	new_fa->fa_type = cfg->fc_type;
	new_fa->fa_state = 0;
	/*
	 * Insert new entry to the list.
	 */

	if (!fa_head) {
		fa_head = fib_insert_node(t, key, plen);
		if (unlikely(!fa_head)) {
			err = -ENOMEM;
			goto out_free_new_fa;
		}
	}

	if (!plen)
		tb->tb_num_default++;

	list_add_tail_rcu(&new_fa->fa_list,
			  (fa ? &fa->fa_list : fa_head));

	rt_cache_flush(cfg->fc_nlinfo.nl_net);
	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
		  &cfg->fc_nlinfo, 0);
succeeded:
	return 0;

out_free_new_fa:
	kmem_cache_free(fn_alias_kmem, new_fa);
out:
	fib_release_info(fi);
err:
	return err;
}

/* should be called with rcu_read_lock */
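/* Returns 0 on a semantic match (res is filled in), a fib_props error
 * for route types that carry one (e.g. unreachable/prohibit), or 1 when
 * nothing in this leaf matches and the caller should keep backtracking.
 */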
static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
		      t_key key,  const struct flowi4 *flp,
		      struct fib_result *res, int fib_flags)
{
	struct leaf_info *li;
	struct hlist_head *hhead = &l->list;

	hlist_for_each_entry_rcu(li, hhead, hlist) {
		struct fib_alias *fa;

		if (l->key != (key & li->mask_plen))
			continue;

		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
			struct fib_info *fi = fa->fa_info;
			int nhsel, err;

			if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
				continue;
			if (fi->fib_dead)
				continue;
			if (fa->fa_info->fib_scope < flp->flowi4_scope)
				continue;
			fib_alias_accessed(fa);
			err = fib_props[fa->fa_type].error;
			if (err) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
				t->stats.semantic_match_passed++;
#endif
				return err;
			}
			if (fi->fib_flags & RTNH_F_DEAD)
				continue;
			for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
				const struct fib_nh *nh = &fi->fib_nh[nhsel];

				if (nh->nh_flags & RTNH_F_DEAD)
					continue;
				if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
					continue;

#ifdef CONFIG_IP_FIB_TRIE_STATS
				t->stats.semantic_match_passed++;
#endif
				res->prefixlen = li->plen;
				res->nh_sel = nhsel;
				res->type = fa->fa_type;
				res->scope = fa->fa_info->fib_scope;
				res->fi = fi;
				res->table = tb;
				res->fa_head = &li->falh;
				if (!(fib_flags & FIB_LOOKUP_NOREF))
					atomic_inc(&fi->fib_clntref);
				return 0;
			}
		}

#ifdef CONFIG_IP_FIB_TRIE_STATS
		t->stats.semantic_match_miss++;
#endif
	}

	return 1;
}

int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
		     struct fib_result *res, int fib_flags)
{
	struct trie *t = (struct trie *) tb->tb_data;
	int ret;
	struct rt_trie_node *n;
	struct tnode *pn;
	unsigned int pos, bits;
	t_key key = ntohl(flp->daddr);
	unsigned int chopped_off;
	t_key cindex = 0;
	unsigned int current_prefix_length = KEYLENGTH;
	struct tnode *cn;
	t_key pref_mismatch;

	rcu_read_lock();

	n = rcu_dereference(t->trie);
	if (!n)
		goto failed;

#ifdef CONFIG_IP_FIB_TRIE_STATS
	t->stats.gets++;
#endif

	/* Just a leaf? */
	if (IS_LEAF(n)) {
		ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
		goto found;
	}

	pn = (struct tnode *) n;
	chopped_off = 0;

	while (pn) {
		pos = pn->pos;
		bits = pn->bits;

		if (!chopped_off)
			cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
						   pos, bits);

		n = tnode_get_child_rcu(pn, cindex);

		if (n == NULL) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.null_node_hit++;
#endif
			goto backtrace;
		}

		if (IS_LEAF(n)) {
			ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
			if (ret > 0)
				goto backtrace;
			goto found;
		}

		cn = (struct tnode *)n;

		/*
		 * It's a tnode, and we can do some extra checks here if we
		 * like, to avoid descending into a dead-end branch.
		 * This tnode is in the parent's child array at index
		 * key[p_pos..p_pos+p_bits] but potentially with some bits
		 * chopped off, so in reality the index may be just a
		 * subprefix, padded with zero at the end.
		 * We can also take a look at any skipped bits in this
		 * tnode - everything up to p_pos is supposed to be ok,
		 * and the non-chopped bits of the index (see previous
		 * paragraph) are also guaranteed ok, but the rest is
		 * considered unknown.
		 *
		 * The skipped bits are key[pos+bits..cn->pos].
		 */

		/* If current_prefix_length < pos+bits, we are already doing
		 * actual prefix  matching, which means everything from
		 * pos+(bits-chopped_off) onward must be zero along some
		 * branch of this subtree - otherwise there is *no* valid
		 * prefix present. Here we can only check the skipped
		 * bits. Remember, since we have already indexed into the
		 * parent's child array, we know that the bits we chopped off
		 * *are* zero.
		 */

		/* NOTA BENE: Checking only skipped bits
		   for the new node here */

		if (current_prefix_length < pos+bits) {
			if (tkey_extract_bits(cn->key, current_prefix_length,
						cn->pos - current_prefix_length)
			    || !(cn->child[0]))
				goto backtrace;
		}

		/*
		 * If chopped_off=0, the index is fully validated and we
		 * only need to look at the skipped bits for this, the new,
		 * tnode. What we actually want to do is to find out if
		 * these skipped bits match our key perfectly, or if we will
		 * have to count on finding a matching prefix further down,
		 * because if we do, we would like to have some way of
		 * verifying the existence of such a prefix at this point.
		 */

		/* The only thing we can do at this point is to verify that
		 * any such matching prefix can indeed be a prefix to our
		 * key, and if the bits in the node we are inspecting that
		 * do not match our key are not ZERO, this cannot be true.
		 * Thus, find out where there is a mismatch (before cn->pos)
		 * and verify that all the mismatching bits are zero in the
		 * new tnode's key.
		 */

		/*
		 * Note: We aren't very concerned about the piece of
		 * the key that precedes pn->pos+pn->bits, since these
		 * have already been checked. The bits after cn->pos
		 * aren't checked since these are by definition
		 * "unknown" at this point. Thus, what we want to see
		 * is if we are about to enter the "prefix matching"
		 * state, and in that case verify that the skipped
		 * bits that will prevail throughout this subtree are
		 * zero, as they have to be if we are to find a
		 * matching prefix.
		 */

		pref_mismatch = mask_pfx(cn->key ^ key, cn->pos);

		/*
		 * In short: If skipped bits in this node do not match
		 * the search key, enter the "prefix matching" state directly.
		 */
		if (pref_mismatch) {
			/* fls(x) = __fls(x) + 1 */
			int mp = KEYLENGTH - __fls(pref_mismatch) - 1;

			if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)
				goto backtrace;

			if (current_prefix_length >= cn->pos)
				current_prefix_length = mp;
		}

		pn = (struct tnode *)n; /* Descend */
		chopped_off = 0;
		continue;

backtrace:
		chopped_off++;

		/* Zeros don't change the child key (cindex), so skip them */
		while ((chopped_off <= pn->bits)
		       && !(cindex & (1<<(chopped_off-1))))
			chopped_off++;

		/* Decrease current_... with bits chopped off */
		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
			current_prefix_length = pn->pos + pn->bits
				- chopped_off;

		/*
		 * Either we do the actual chop-off, or, if we have chopped
		 * off all bits in this tnode, we walk up to our parent.
		 */

		if (chopped_off <= pn->bits) {
			cindex &= ~(1 << (chopped_off-1));
		} else {
			struct tnode *parent = node_parent_rcu((struct rt_trie_node *) pn);
			if (!parent)
				goto failed;

			/* Get Child's index */
			cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
			pn = parent;
			chopped_off = 0;

#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.backtrack++;
#endif
			goto backtrace;
		}
	}
failed:
	ret = 1;
found:
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL_GPL(fib_table_lookup);

/*
 * Remove the leaf and return parent.
 */
static void trie_leaf_remove(struct trie *t, struct leaf *l)
{
	struct tnode *tp = node_parent((struct rt_trie_node *) l);

	pr_debug("entering trie_leaf_remove(%p)\n", l);

	if (tp) {
		t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
		put_child(tp, cindex, NULL);
		trie_rebalance(t, tp);
	} else
		RCU_INIT_POINTER(t->trie, NULL);

	free_leaf(l);
}

/*
 * Caller must hold RTNL.
 */
int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
{
	struct trie *t = (struct trie *) tb->tb_data;
	u32 key, mask;
	int plen = cfg->fc_dst_len;
	u8 tos = cfg->fc_tos;
	struct fib_alias *fa, *fa_to_delete;
	struct list_head *fa_head;
	struct leaf *l;
	struct leaf_info *li;

	if (plen > 32)
		return -EINVAL;

	key = ntohl(cfg->fc_dst);
	mask = ntohl(inet_make_mask(plen));

	if (key & ~mask)
		return -EINVAL;

	key = key & mask;
	l = fib_find_node(t, key);

	if (!l)
		return -ESRCH;

	li = find_leaf_info(l, plen);

	if (!li)
		return -ESRCH;

	fa_head = &li->falh;
	fa = fib_find_alias(fa_head, tos, 0);

	if (!fa)
		return -ESRCH;

	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);

	fa_to_delete = NULL;
	fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
	list_for_each_entry_continue(fa, fa_head, fa_list) {
		struct fib_info *fi = fa->fa_info;

		if (fa->fa_tos != tos)
			break;

		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
		     fa->fa_info->fib_scope == cfg->fc_scope) &&
		    (!cfg->fc_prefsrc ||
		     fi->fib_prefsrc == cfg->fc_prefsrc) &&
		    (!cfg->fc_protocol ||
		     fi->fib_protocol == cfg->fc_protocol) &&
		    fib_nh_match(cfg, fi) == 0) {
			fa_to_delete = fa;
			break;
		}
	}

	if (!fa_to_delete)
		return -ESRCH;

	fa = fa_to_delete;
	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
		  &cfg->fc_nlinfo, 0);

	list_del_rcu(&fa->fa_list);

	if (!plen)
		tb->tb_num_default--;

	if (list_empty(fa_head)) {
		hlist_del_rcu(&li->hlist);
		free_leaf_info(li);
	}

	if (hlist_empty(&l->list))
		trie_leaf_remove(t, l);

	if (fa->fa_state & FA_S_ACCESSED)
		rt_cache_flush(cfg->fc_nlinfo.nl_net);

	fib_release_info(fa->fa_info);
	alias_free_mem_rcu(fa);
	return 0;
}

static int trie_flush_list(struct list_head *head)
{
	struct fib_alias *fa, *fa_node;
	int found = 0;

	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
		struct fib_info *fi = fa->fa_info;

		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
			list_del_rcu(&fa->fa_list);
			fib_release_info(fa->fa_info);
			alias_free_mem_rcu(fa);
			found++;
		}
	}
	return found;
}

static int trie_flush_leaf(struct leaf *l)
{
	int found = 0;
	struct hlist_head *lih = &l->list;
	struct hlist_node *tmp;
	struct leaf_info *li = NULL;

	hlist_for_each_entry_safe(li, tmp, lih, hlist) {
		found += trie_flush_list(&li->falh);

		if (list_empty(&li->falh)) {
			hlist_del_rcu(&li->hlist);
			free_leaf_info(li);
		}
	}
	return found;
}

/*
 * Scan for the next rightward leaf, starting at node p->child[idx].
 * Since we have back pointers, no recursion is necessary.
 */
static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c)
{
	do {
		t_key idx;

		if (c)
			idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
		else
			idx = 0;

		while (idx < 1u << p->bits) {
			c = tnode_get_child_rcu(p, idx++);
			if (!c)
				continue;

			if (IS_LEAF(c)) {
				prefetch(rcu_dereference_rtnl(p->child[idx]));
				return (struct leaf *) c;
			}

			/* Restart scanning in the new node */
			p = (struct tnode *) c;
			idx = 0;
		}

		/* Node empty, walk back up to parent */
		c = (struct rt_trie_node *) p;
	} while ((p = node_parent_rcu(c)) != NULL);

	return NULL; /* Root of trie */
}

static struct leaf *trie_firstleaf(struct trie *t)
{
	struct tnode *n = (struct tnode *)rcu_dereference_rtnl(t->trie);

	if (!n)
		return NULL;

	if (IS_LEAF(n))          /* trie is just a leaf */
		return (struct leaf *) n;

	return leaf_walk_rcu(n, NULL);
}

static struct leaf *trie_nextleaf(struct leaf *l)
{
	struct rt_trie_node *c = (struct rt_trie_node *) l;
	struct tnode *p = node_parent_rcu(c);

	if (!p)
		return NULL;	/* trie with just one leaf */

	return leaf_walk_rcu(p, c);
}

static struct leaf *trie_leafindex(struct trie *t, int index)
{
	struct leaf *l = trie_firstleaf(t);

	while (l && index-- > 0)
		l = trie_nextleaf(l);

	return l;
}


/*
 * Caller must hold RTNL.
 */
int fib_table_flush(struct fib_table *tb)
{
	struct trie *t = (struct trie *) tb->tb_data;
	struct leaf *l, *ll = NULL;
	int found = 0;

	for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
		found += trie_flush_leaf(l);

		if (ll && hlist_empty(&ll->list))
			trie_leaf_remove(t, ll);
		ll = l;
	}

	if (ll && hlist_empty(&ll->list))
		trie_leaf_remove(t, ll);

	pr_debug("trie_flush found=%d\n", found);
	return found;
}

void fib_free_table(struct fib_table *tb)
{
	kfree(tb);
}

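/* Dump state kept in the netlink callback: cb->args[2] holds the last
 * dumped key, args[3] the leaf count, args[4] the index into the leaf's
 * leaf_info list and args[5] the index into a fib_alias list.
 */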
static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
			   struct fib_table *tb,
			   struct sk_buff *skb, struct netlink_callback *cb)
{
	int i, s_i;
	struct fib_alias *fa;
	__be32 xkey = htonl(key);

	s_i = cb->args[5];
	i = 0;

	/* rcu_read_lock is held by the caller */

	list_for_each_entry_rcu(fa, fah, fa_list) {
		if (i < s_i) {
			i++;
			continue;
		}

		if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq,
				  RTM_NEWROUTE,
				  tb->tb_id,
				  fa->fa_type,
				  xkey,
				  plen,
				  fa->fa_tos,
				  fa->fa_info, NLM_F_MULTI) < 0) {
			cb->args[5] = i;
			return -1;
		}
		i++;
	}
	cb->args[5] = i;
	return skb->len;
}

static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
			struct sk_buff *skb, struct netlink_callback *cb)
{
	struct leaf_info *li;
	int i, s_i;

	s_i = cb->args[4];
	i = 0;

	/* rcu_read_lock is held by the caller */
	hlist_for_each_entry_rcu(li, &l->list, hlist) {
		if (i < s_i) {
			i++;
			continue;
		}

		if (i > s_i)
			cb->args[5] = 0;

		if (list_empty(&li->falh))
			continue;

		if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
			cb->args[4] = i;
			return -1;
		}
		i++;
	}

	cb->args[4] = i;
	return skb->len;
}

int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
		   struct netlink_callback *cb)
{
	struct leaf *l;
	struct trie *t = (struct trie *) tb->tb_data;
	t_key key = cb->args[2];
	int count = cb->args[3];

	rcu_read_lock();
	/* Dump starting at last key.
	 * Note: 0.0.0.0/0 (ie default) is first key.
	 */
	if (count == 0)
		l = trie_firstleaf(t);
	else {
		/* Normally, continue from last key, but if that is missing
		 * fall back to using a slow rescan
		 */
		l = fib_find_node(t, key);
		if (!l)
			l = trie_leafindex(t, count);
	}

	while (l) {
		cb->args[2] = l->key;
		if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
			cb->args[3] = count;
			rcu_read_unlock();
			return -1;
		}

		++count;
		l = trie_nextleaf(l);
		memset(&cb->args[4], 0,
		       sizeof(cb->args) - 4*sizeof(cb->args[0]));
	}
	cb->args[3] = count;
	rcu_read_unlock();

	return skb->len;
}

void __init fib_trie_init(void)
{
	fn_alias_kmem = kmem_cache_create("ip_fib_alias",
					  sizeof(struct fib_alias),
					  0, SLAB_PANIC, NULL);

	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
					   max(sizeof(struct leaf),
					       sizeof(struct leaf_info)),
					   0, SLAB_PANIC, NULL);
}


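/* The struct trie lives in the flexible tb_data[] area allocated right
 * behind the struct fib_table itself.
 */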
1979struct fib_table *fib_trie_table(u32 id)
1980{
1981	struct fib_table *tb;
1982	struct trie *t;
1983
1984	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1985		     GFP_KERNEL);
1986	if (tb == NULL)
1987		return NULL;
1988
1989	tb->tb_id = id;
1990	tb->tb_default = -1;
1991	tb->tb_num_default = 0;
1992
1993	t = (struct trie *) tb->tb_data;
1994	memset(t, 0, sizeof(*t));
1995
1996	return tb;
1997}
1998
1999#ifdef CONFIG_PROC_FS
2000/* Depth first Trie walk iterator */
2001struct fib_trie_iter {
2002	struct seq_net_private p;
2003	struct fib_table *tb;
2004	struct tnode *tnode;
2005	unsigned int index;
2006	unsigned int depth;
2007};
2008
2009static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter)
2010{
2011	struct tnode *tn = iter->tnode;
2012	unsigned int cindex = iter->index;
2013	struct tnode *p;
2014
2015	/* A single entry routing table */
2016	if (!tn)
2017		return NULL;
2018
2019	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2020		 iter->tnode, iter->index, iter->depth);
2021rescan:
2022	while (cindex < (1<<tn->bits)) {
2023		struct rt_trie_node *n = tnode_get_child_rcu(tn, cindex);
2024
2025		if (n) {
2026			if (IS_LEAF(n)) {
2027				iter->tnode = tn;
2028				iter->index = cindex + 1;
2029			} else {
2030				/* push down one level */
2031				iter->tnode = (struct tnode *) n;
2032				iter->index = 0;
2033				++iter->depth;
2034			}
2035			return n;
2036		}
2037
2038		++cindex;
2039	}
2040
2041	/* Current node exhausted, pop back up */
2042	p = node_parent_rcu((struct rt_trie_node *)tn);
2043	if (p) {
2044		cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2045		tn = p;
2046		--iter->depth;
2047		goto rescan;
2048	}
2049
2050	/* got root? */
2051	return NULL;
2052}
2053
2054static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter,
2055				       struct trie *t)
2056{
2057	struct rt_trie_node *n;
2058
2059	if (!t)
2060		return NULL;
2061
2062	n = rcu_dereference(t->trie);
2063	if (!n)
2064		return NULL;
2065
2066	if (IS_TNODE(n)) {
2067		iter->tnode = (struct tnode *) n;
2068		iter->index = 0;
2069		iter->depth = 1;
2070	} else {
2071		iter->tnode = NULL;
2072		iter->index = 0;
2073		iter->depth = 0;
2074	}
2075
2076	return n;
2077}
2078
2079static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2080{
2081	struct rt_trie_node *n;
2082	struct fib_trie_iter iter;
2083
2084	memset(s, 0, sizeof(*s));
2085
2086	rcu_read_lock();
2087	for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
2088		if (IS_LEAF(n)) {
2089			struct leaf *l = (struct leaf *)n;
2090			struct leaf_info *li;
2091
2092			s->leaves++;
2093			s->totdepth += iter.depth;
2094			if (iter.depth > s->maxdepth)
2095				s->maxdepth = iter.depth;
2096
2097			hlist_for_each_entry_rcu(li, &l->list, hlist)
2098				++s->prefixes;
2099		} else {
2100			const struct tnode *tn = (const struct tnode *) n;
2101			int i;
2102
2103			s->tnodes++;
2104			if (tn->bits < MAX_STAT_DEPTH)
2105				s->nodesizes[tn->bits]++;
2106
2107			for (i = 0; i < (1<<tn->bits); i++)
2108				if (!tn->child[i])
2109					s->nullpointers++;
2110		}
2111	}
2112	rcu_read_unlock();
2113}
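
/* Sanity invariant over the counters gathered above: in a non-empty trie
 * every node except the root occupies exactly one child slot, so
 * (provided no tnode exceeds MAX_STAT_DEPTH bits and thus escapes
 * nodesizes[]):
 *
 *	pointers == leaves + tnodes - 1 + nullpointers
 *
 * where pointers is the child-slot total derived in trie_show_stats().
 */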
2114
2115/*
2116 *	This outputs /proc/net/fib_triestats
2117 */
2118static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2119{
2120	unsigned int i, max, pointers, bytes, avdepth;
2121
2122	if (stat->leaves)
2123		avdepth = stat->totdepth*100 / stat->leaves;
2124	else
2125		avdepth = 0;
2126
2127	seq_printf(seq, "\tAver depth:     %u.%02d\n",
2128		   avdepth / 100, avdepth % 100);
2129	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
2130
2131	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
2132	bytes = sizeof(struct leaf) * stat->leaves;
2133
2134	seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
2135	bytes += sizeof(struct leaf_info) * stat->prefixes;
2136
2137	seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2138	bytes += sizeof(struct tnode) * stat->tnodes;
2139
2140	max = MAX_STAT_DEPTH;
2141	while (max > 0 && stat->nodesizes[max-1] == 0)
2142		max--;
2143
2144	pointers = 0;
2145	for (i = 1; i < max; i++)	/* nodesizes[max] would be out of range */
2146		if (stat->nodesizes[i] != 0) {
2147			seq_printf(seq, "  %u: %u",  i, stat->nodesizes[i]);
2148			pointers += (1<<i) * stat->nodesizes[i];
2149		}
2150	seq_putc(seq, '\n');
2151	seq_printf(seq, "\tPointers: %u\n", pointers);
2152
2153	bytes += sizeof(struct rt_trie_node *) * pointers;
2154	seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2155	seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
2156}
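
/* Example fragment of the resulting /proc/net/fib_triestats (figures are
 * illustrative but consistent: 8 == 4 + 3 - 1 + 2 per the invariant
 * above):
 *
 *	Main:
 *		Aver depth:     1.50
 *		Max depth:      2
 *		Leaves:         4
 *		Prefixes:       5
 *		Internal nodes: 3
 *		  1: 2  2: 1
 *		Pointers: 8
 *	Null ptrs: 2
 *	Total size: 1  kB
 */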
2157
2158#ifdef CONFIG_IP_FIB_TRIE_STATS
2159static void trie_show_usage(struct seq_file *seq,
2160			    const struct trie_use_stats *stats)
2161{
2162	seq_printf(seq, "\nCounters:\n---------\n");
2163	seq_printf(seq, "gets = %u\n", stats->gets);
2164	seq_printf(seq, "backtracks = %u\n", stats->backtrack);
2165	seq_printf(seq, "semantic match passed = %u\n",
2166		   stats->semantic_match_passed);
2167	seq_printf(seq, "semantic match miss = %u\n",
2168		   stats->semantic_match_miss);
2169	seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
2170	seq_printf(seq, "skipped node resize = %u\n\n",
2171		   stats->resize_node_skipped);
2172}
2173#endif /*  CONFIG_IP_FIB_TRIE_STATS */
2174
2175static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
2176{
2177	if (tb->tb_id == RT_TABLE_LOCAL)
2178		seq_puts(seq, "Local:\n");
2179	else if (tb->tb_id == RT_TABLE_MAIN)
2180		seq_puts(seq, "Main:\n");
2181	else
2182		seq_printf(seq, "Id %d:\n", tb->tb_id);
2183}
2184
2185
2186static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2187{
2188	struct net *net = (struct net *)seq->private;
2189	unsigned int h;
2190
2191	seq_printf(seq,
2192		   "Basic info: size of leaf:"
2193		   " %zd bytes, size of tnode: %zd bytes.\n",
2194		   sizeof(struct leaf), sizeof(struct tnode));
2195
2196	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2197		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2198		struct fib_table *tb;
2199
2200		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2201			struct trie *t = (struct trie *) tb->tb_data;
2202			struct trie_stat stat;
2203
2204			if (!t)
2205				continue;
2206
2207			fib_table_print(seq, tb);
2208
2209			trie_collect_stats(t, &stat);
2210			trie_show_stats(seq, &stat);
2211#ifdef CONFIG_IP_FIB_TRIE_STATS
2212			trie_show_usage(seq, &t->stats);
2213#endif
2214		}
2215	}
2216
2217	return 0;
2218}
2219
2220static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2221{
2222	return single_open_net(inode, file, fib_triestat_seq_show);
2223}
2224
2225static const struct file_operations fib_triestat_fops = {
2226	.owner	= THIS_MODULE,
2227	.open	= fib_triestat_seq_open,
2228	.read	= seq_read,
2229	.llseek	= seq_lseek,
2230	.release = single_release_net,
2231};
2232
2233static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2234{
2235	struct fib_trie_iter *iter = seq->private;
2236	struct net *net = seq_file_net(seq);
2237	loff_t idx = 0;
2238	unsigned int h;
2239
2240	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2241		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2242		struct fib_table *tb;
2243
2244		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2245			struct rt_trie_node *n;
2246
2247			for (n = fib_trie_get_first(iter,
2248						    (struct trie *) tb->tb_data);
2249			     n; n = fib_trie_get_next(iter))
2250				if (pos == idx++) {
2251					iter->tb = tb;
2252					return n;
2253				}
2254		}
2255	}
2256
2257	return NULL;
2258}
2259
2260static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2261	__acquires(RCU)
2262{
2263	rcu_read_lock();
2264	return fib_trie_get_idx(seq, *pos);
2265}
2266
2267static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2268{
2269	struct fib_trie_iter *iter = seq->private;
2270	struct net *net = seq_file_net(seq);
2271	struct fib_table *tb = iter->tb;
2272	struct hlist_node *tb_node;
2273	unsigned int h;
2274	struct rt_trie_node *n;
2275
2276	++*pos;
2277	/* next node in same table */
2278	n = fib_trie_get_next(iter);
2279	if (n)
2280		return n;
2281
2282	/* walk rest of this hash chain */
2283	h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
2284	while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
2285		tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
2286		n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2287		if (n)
2288			goto found;
2289	}
2290
2291	/* new hash chain */
2292	while (++h < FIB_TABLE_HASHSZ) {
2293		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2294		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2295			n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2296			if (n)
2297				goto found;
2298		}
2299	}
2300	return NULL;
2301
2302found:
2303	iter->tb = tb;
2304	return n;
2305}
2306
2307static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2308	__releases(RCU)
2309{
2310	rcu_read_unlock();
2311}
2312
2313static void seq_indent(struct seq_file *seq, int n)
2314{
2315	while (n-- > 0)
2316		seq_puts(seq, "   ");
2317}
2318
2319static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2320{
2321	switch (s) {
2322	case RT_SCOPE_UNIVERSE: return "universe";
2323	case RT_SCOPE_SITE:	return "site";
2324	case RT_SCOPE_LINK:	return "link";
2325	case RT_SCOPE_HOST:	return "host";
2326	case RT_SCOPE_NOWHERE:	return "nowhere";
2327	default:
2328		snprintf(buf, len, "scope=%d", s);
2329		return buf;
2330	}
2331}
2332
2333static const char *const rtn_type_names[__RTN_MAX] = {
2334	[RTN_UNSPEC] = "UNSPEC",
2335	[RTN_UNICAST] = "UNICAST",
2336	[RTN_LOCAL] = "LOCAL",
2337	[RTN_BROADCAST] = "BROADCAST",
2338	[RTN_ANYCAST] = "ANYCAST",
2339	[RTN_MULTICAST] = "MULTICAST",
2340	[RTN_BLACKHOLE] = "BLACKHOLE",
2341	[RTN_UNREACHABLE] = "UNREACHABLE",
2342	[RTN_PROHIBIT] = "PROHIBIT",
2343	[RTN_THROW] = "THROW",
2344	[RTN_NAT] = "NAT",
2345	[RTN_XRESOLVE] = "XRESOLVE",
2346};
2347
2348static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
2349{
2350	if (t < __RTN_MAX && rtn_type_names[t])
2351		return rtn_type_names[t];
2352	snprintf(buf, len, "type %u", t);
2353	return buf;
2354}
2355
2356/* Pretty print the trie */
2357static int fib_trie_seq_show(struct seq_file *seq, void *v)
2358{
2359	const struct fib_trie_iter *iter = seq->private;
2360	struct rt_trie_node *n = v;
2361
2362	if (!node_parent_rcu(n))
2363		fib_table_print(seq, iter->tb);
2364
2365	if (IS_TNODE(n)) {
2366		struct tnode *tn = (struct tnode *) n;
2367		__be32 prf = htonl(mask_pfx(tn->key, tn->pos));
2368
2369		seq_indent(seq, iter->depth-1);
2370		seq_printf(seq, "  +-- %pI4/%d %d %d %d\n",
2371			   &prf, tn->pos, tn->bits, tn->full_children,
2372			   tn->empty_children);
2373
2374	} else {
2375		struct leaf *l = (struct leaf *) n;
2376		struct leaf_info *li;
2377		__be32 val = htonl(l->key);
2378
2379		seq_indent(seq, iter->depth);
2380		seq_printf(seq, "  |-- %pI4\n", &val);
2381
2382		hlist_for_each_entry_rcu(li, &l->list, hlist) {
2383			struct fib_alias *fa;
2384
2385			list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2386				char buf1[32], buf2[32];
2387
2388				seq_indent(seq, iter->depth+1);
2389				seq_printf(seq, "  /%d %s %s", li->plen,
2390					   rtn_scope(buf1, sizeof(buf1),
2391						     fa->fa_info->fib_scope),
2392					   rtn_type(buf2, sizeof(buf2),
2393						    fa->fa_type));
2394				if (fa->fa_tos)
2395					seq_printf(seq, " tos=%d", fa->fa_tos);
2396				seq_putc(seq, '\n');
2397			}
2398		}
2399	}
2400
2401	return 0;
2402}
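
/* Example fragment of the resulting /proc/net/fib_trie (illustrative
 * addresses).  Tnode lines show prefix/pos then bits, full_children and
 * empty_children; leaf lines show the key, with one line per alias:
 *
 *	Main:
 *	  +-- 0.0.0.0/0 3 0 5
 *	     |-- 0.0.0.0
 *	        /0 universe UNICAST
 *	     +-- 192.168.1.0/24 2 0 2
 *	        |-- 192.168.1.0
 *	           /24 link UNICAST
 */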
2403
2404static const struct seq_operations fib_trie_seq_ops = {
2405	.start  = fib_trie_seq_start,
2406	.next   = fib_trie_seq_next,
2407	.stop   = fib_trie_seq_stop,
2408	.show   = fib_trie_seq_show,
2409};
2410
2411static int fib_trie_seq_open(struct inode *inode, struct file *file)
2412{
2413	return seq_open_net(inode, file, &fib_trie_seq_ops,
2414			    sizeof(struct fib_trie_iter));
2415}
2416
2417static const struct file_operations fib_trie_fops = {
2418	.owner  = THIS_MODULE,
2419	.open   = fib_trie_seq_open,
2420	.read   = seq_read,
2421	.llseek = seq_lseek,
2422	.release = seq_release_net,
2423};
2424
2425struct fib_route_iter {
2426	struct seq_net_private p;
2427	struct trie *main_trie;
2428	loff_t	pos;
2429	t_key	key;
2430};
2431
2432static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
2433{
2434	struct leaf *l = NULL;
2435	struct trie *t = iter->main_trie;
2436
2437	/* reuse the cached location of the last found key */
2438	if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
2439		pos -= iter->pos;
2440	else {
2441		iter->pos = 0;
2442		l = trie_firstleaf(t);
2443	}
2444
2445	while (l && pos-- > 0) {
2446		iter->pos++;
2447		l = trie_nextleaf(l);
2448	}
2449
2450	if (l)
2451		iter->key = l->key;	/* remember it */
2452	else
2453		iter->pos = 0;		/* forget it */
2454
2455	return l;
2456}
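
/* Why the cached (key, pos) pair matters: seq_file re-enters ->start()
 * with the saved position on every read() chunk, so without the cache a
 * table of N routes read through a small buffer would be rescanned from
 * the first leaf each time, O(N^2) overall.  Hypothetical reader:
 *
 *	char buf[1024];
 *	int fd = open("/proc/net/route", O_RDONLY);
 *	while (read(fd, buf, sizeof(buf)) > 0)
 *		;	(each chunk re-enters fib_route_seq_start)
 */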
2457
2458static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2459	__acquires(RCU)
2460{
2461	struct fib_route_iter *iter = seq->private;
2462	struct fib_table *tb;
2463
2464	rcu_read_lock();
2465	tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
2466	if (!tb)
2467		return NULL;
2468
2469	iter->main_trie = (struct trie *) tb->tb_data;
2470	if (*pos == 0)
2471		return SEQ_START_TOKEN;
2472	else
2473		return fib_route_get_idx(iter, *pos - 1);
2474}
2475
2476static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2477{
2478	struct fib_route_iter *iter = seq->private;
2479	struct leaf *l = v;
2480
2481	++*pos;
2482	if (v == SEQ_START_TOKEN) {
2483		iter->pos = 0;
2484		l = trie_firstleaf(iter->main_trie);
2485	} else {
2486		iter->pos++;
2487		l = trie_nextleaf(l);
2488	}
2489
2490	if (l)
2491		iter->key = l->key;
2492	else
2493		iter->pos = 0;
2494	return l;
2495}
2496
2497static void fib_route_seq_stop(struct seq_file *seq, void *v)
2498	__releases(RCU)
2499{
2500	rcu_read_unlock();
2501}
2502
2503static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2504{
2505	unsigned int flags = 0;
2506
2507	if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
2508		flags = RTF_REJECT;
2509	if (fi && fi->fib_nh->nh_gw)
2510		flags |= RTF_GATEWAY;
2511	if (mask == htonl(0xFFFFFFFF))
2512		flags |= RTF_HOST;
2513	flags |= RTF_UP;
2514	return flags;
2515}
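
/* Worked example with the RTF_* values from <linux/route.h>: a default
 * route via a gateway yields RTF_UP | RTF_GATEWAY = 0x0001 | 0x0002 =
 * 0x0003, the familiar "0003" flags column in /proc/net/route, while a
 * gatewayless /32 (mask 255.255.255.255) yields RTF_UP | RTF_HOST =
 * 0x0001 | 0x0004 = 0x0005.
 */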
2516
2517/*
2518 *	This outputs /proc/net/route.
2519 *	The format of the file is not supposed to change, and it
2520 *	needs to match the old fib_hash output to avoid breaking
2521 *	legacy utilities.
2522 */
2523static int fib_route_seq_show(struct seq_file *seq, void *v)
2524{
2525	struct leaf *l = v;
2526	struct leaf_info *li;
2527
2528	if (v == SEQ_START_TOKEN) {
2529		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2530			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2531			   "\tWindow\tIRTT");
2532		return 0;
2533	}
2534
2535	hlist_for_each_entry_rcu(li, &l->list, hlist) {
2536		struct fib_alias *fa;
2537		__be32 mask, prefix;
2538
2539		mask = inet_make_mask(li->plen);
2540		prefix = htonl(l->key);
2541
2542		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2543			const struct fib_info *fi = fa->fa_info;
2544			unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
2545			int len;
2546
2547			if (fa->fa_type == RTN_BROADCAST
2548			    || fa->fa_type == RTN_MULTICAST)
2549				continue;
2550
2551			if (fi)
2552				seq_printf(seq,
2553					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
2554					 "%d\t%08X\t%d\t%u\t%u%n",
2555					 fi->fib_dev ? fi->fib_dev->name : "*",
2556					 prefix,
2557					 fi->fib_nh->nh_gw, flags, 0, 0,
2558					 fi->fib_priority,
2559					 mask,
2560					 (fi->fib_advmss ?
2561					  fi->fib_advmss + 40 : 0),
2562					 fi->fib_window,
2563					 fi->fib_rtt >> 3, &len);
2564			else
2565				seq_printf(seq,
2566					 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
2567					 "%d\t%08X\t%d\t%u\t%u%n",
2568					 prefix, 0, flags, 0, 0, 0,
2569					 mask, 0, 0, 0, &len);
2570
2571			seq_printf(seq, "%*s\n", 127 - len, "");
2572		}
2573	}
2574
2575	return 0;
2576}
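
/* Example line of the resulting /proc/net/route (illustrative).  The hex
 * fields are raw big-endian address bytes printed as a host-order
 * integer, so on little-endian hardware 192.168.1.0/24 on eth0 appears
 * as:
 *
 *	Iface	Destination	Gateway 	Flags	RefCnt	Use	Metric	Mask		MTU	Window	IRTT
 *	eth0	0001A8C0	00000000	0001	0	0	0	00FFFFFF	0	0	0
 */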
2577
2578static const struct seq_operations fib_route_seq_ops = {
2579	.start  = fib_route_seq_start,
2580	.next   = fib_route_seq_next,
2581	.stop   = fib_route_seq_stop,
2582	.show   = fib_route_seq_show,
2583};
2584
2585static int fib_route_seq_open(struct inode *inode, struct file *file)
2586{
2587	return seq_open_net(inode, file, &fib_route_seq_ops,
2588			    sizeof(struct fib_route_iter));
2589}
2590
2591static const struct file_operations fib_route_fops = {
2592	.owner  = THIS_MODULE,
2593	.open   = fib_route_seq_open,
2594	.read   = seq_read,
2595	.llseek = seq_lseek,
2596	.release = seq_release_net,
2597};
2598
2599int __net_init fib_proc_init(struct net *net)
2600{
2601	if (!proc_create("fib_trie", S_IRUGO, net->proc_net, &fib_trie_fops))
2602		goto out1;
2603
2604	if (!proc_create("fib_triestat", S_IRUGO, net->proc_net,
2605			 &fib_triestat_fops))
2606		goto out2;
2607
2608	if (!proc_create("route", S_IRUGO, net->proc_net, &fib_route_fops))
2609		goto out3;
2610
2611	return 0;
2612
2613out3:
2614	remove_proc_entry("fib_triestat", net->proc_net);
2615out2:
2616	remove_proc_entry("fib_trie", net->proc_net);
2617out1:
2618	return -ENOMEM;
2619}
2620
2621void __net_exit fib_proc_exit(struct net *net)
2622{
2623	remove_proc_entry("fib_trie", net->proc_net);
2624	remove_proc_entry("fib_triestat", net->proc_net);
2625	remove_proc_entry("route", net->proc_net);
2626}
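
/* These two hooks are wired up per network namespace from fib_net_init()
 * and fib_net_exit() in fib_frontend.c, roughly (sketch):
 *
 *	static struct pernet_operations fib_net_ops = {
 *		.init = fib_net_init,	(calls fib_proc_init)
 *		.exit = fib_net_exit,	(calls fib_proc_exit)
 *	};
 *	register_pernet_subsys(&fib_net_ops);
 */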
2627
2628#endif /* CONFIG_PROC_FS */
2629