fib_trie.c revision b6fcbdb4f283f7ba67cec3cda6be23da8e959031
1/*
2 *   This program is free software; you can redistribute it and/or
3 *   modify it under the terms of the GNU General Public License
4 *   as published by the Free Software Foundation; either version
5 *   2 of the License, or (at your option) any later version.
6 *
7 *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 *     & Swedish University of Agricultural Sciences.
9 *
10 *   Jens Laas <jens.laas@data.slu.se> Swedish University of
11 *     Agricultural Sciences.
12 *
13 *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
14 *
15 * This work is based on the LPC-trie which is originally described in:
16 *
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
25 *
26 * Code from fib_hash has been reused which includes the following header:
27 *
28 *
29 * INET		An implementation of the TCP/IP protocol suite for the LINUX
30 *		operating system.  INET is implemented using the  BSD Socket
31 *		interface as the means of communication with the user level.
32 *
33 *		IPv4 FIB: lookup engine and maintenance routines.
34 *
35 *
36 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
37 *
38 *		This program is free software; you can redistribute it and/or
39 *		modify it under the terms of the GNU General Public License
40 *		as published by the Free Software Foundation; either version
41 *		2 of the License, or (at your option) any later version.
42 *
43 * Substantial contributions to this work come from:
44 *
45 *		David S. Miller, <davem@davemloft.net>
46 *		Stephen Hemminger <shemminger@osdl.org>
47 *		Paul E. McKenney <paulmck@us.ibm.com>
48 *		Patrick McHardy <kaber@trash.net>
49 */
50
51#define VERSION "0.408"
52
53#include <asm/uaccess.h>
54#include <asm/system.h>
55#include <linux/bitops.h>
56#include <linux/types.h>
57#include <linux/kernel.h>
58#include <linux/mm.h>
59#include <linux/string.h>
60#include <linux/socket.h>
61#include <linux/sockios.h>
62#include <linux/errno.h>
63#include <linux/in.h>
64#include <linux/inet.h>
65#include <linux/inetdevice.h>
66#include <linux/netdevice.h>
67#include <linux/if_arp.h>
68#include <linux/proc_fs.h>
69#include <linux/rcupdate.h>
70#include <linux/skbuff.h>
71#include <linux/netlink.h>
72#include <linux/init.h>
73#include <linux/list.h>
74#include <net/net_namespace.h>
75#include <net/ip.h>
76#include <net/protocol.h>
77#include <net/route.h>
78#include <net/tcp.h>
79#include <net/sock.h>
80#include <net/ip_fib.h>
81#include "fib_lookup.h"
82
83#define MAX_STAT_DEPTH 32
84
85#define KEYLENGTH (8*sizeof(t_key))
86
87typedef unsigned int t_key;
88
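/*
 * The low bit of node->parent encodes the node type (T_TNODE = 0,
 * T_LEAF = 1); the remaining bits hold the parent pointer, which is
 * recovered by node_parent() below.
 */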
89#define T_TNODE 0
90#define T_LEAF  1
91#define NODE_TYPE_MASK	0x1UL
92#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
93
94#define IS_TNODE(n) (!(n->parent & T_LEAF))
95#define IS_LEAF(n) (n->parent & T_LEAF)
96
97struct node {
98	unsigned long parent;
99	t_key key;
100};
101
102struct leaf {
103	unsigned long parent;
104	t_key key;
105	struct hlist_head list;
106	struct rcu_head rcu;
107};
108
109struct leaf_info {
110	struct hlist_node hlist;
111	struct rcu_head rcu;
112	int plen;
113	struct list_head falh;
114};
115
116struct tnode {
117	unsigned long parent;
118	t_key key;
119	unsigned char pos;		/* log2(KEYLENGTH) bits needed */
120	unsigned char bits;		/* log2(KEYLENGTH) bits needed */
121	unsigned int full_children;	/* KEYLENGTH bits needed */
122	unsigned int empty_children;	/* KEYLENGTH bits needed */
123	union {
124		struct rcu_head rcu;
125		struct work_struct work;
126	};
127	struct node *child[0];
128};
129
130#ifdef CONFIG_IP_FIB_TRIE_STATS
131struct trie_use_stats {
132	unsigned int gets;
133	unsigned int backtrack;
134	unsigned int semantic_match_passed;
135	unsigned int semantic_match_miss;
136	unsigned int null_node_hit;
137	unsigned int resize_node_skipped;
138};
139#endif
140
141struct trie_stat {
142	unsigned int totdepth;
143	unsigned int maxdepth;
144	unsigned int tnodes;
145	unsigned int leaves;
146	unsigned int nullpointers;
147	unsigned int prefixes;
148	unsigned int nodesizes[MAX_STAT_DEPTH];
149};
150
151struct trie {
152	struct node *trie;
153#ifdef CONFIG_IP_FIB_TRIE_STATS
154	struct trie_use_stats stats;
155#endif
156};
157
158static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
159static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
160				  int wasfull);
161static struct node *resize(struct trie *t, struct tnode *tn);
162static struct tnode *inflate(struct trie *t, struct tnode *tn);
163static struct tnode *halve(struct trie *t, struct tnode *tn);
164
165static struct kmem_cache *fn_alias_kmem __read_mostly;
166static struct kmem_cache *trie_leaf_kmem __read_mostly;
167
168static inline struct tnode *node_parent(struct node *node)
169{
170	return (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
171}
172
173static inline struct tnode *node_parent_rcu(struct node *node)
174{
175	struct tnode *ret = node_parent(node);
176
177	return rcu_dereference(ret);
178}
179
180/* Same as rcu_assign_pointer
181 * but that macro assumes that the value is a pointer.
182 */
183static inline void node_set_parent(struct node *node, struct tnode *ptr)
184{
185	smp_wmb();
186	node->parent = (unsigned long)ptr | NODE_TYPE(node);
187}
188
189static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i)
190{
191	BUG_ON(i >= 1U << tn->bits);
192
193	return tn->child[i];
194}
195
196static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
197{
198	struct node *ret = tnode_get_child(tn, i);
199
200	return rcu_dereference(ret);
201}
202
203static inline int tnode_child_length(const struct tnode *tn)
204{
205	return 1 << tn->bits;
206}
207
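/* Keep the top 'l' bits of key 'k' and clear the rest (prefix mask). */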
208static inline t_key mask_pfx(t_key k, unsigned short l)
209{
210	return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
211}
212
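/*
 * Extract 'bits' bits from key 'a', starting 'offset' bits from the
 * most significant end of the key.
 */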
213static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
214{
215	if (offset < KEYLENGTH)
216		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
217	else
218		return 0;
219}
220
221static inline int tkey_equals(t_key a, t_key b)
222{
223	return a == b;
224}
225
226static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
227{
228	if (bits == 0 || offset >= KEYLENGTH)
229		return 1;
230	bits = bits > KEYLENGTH ? KEYLENGTH : bits;
231	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
232}
233
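/*
 * Find the position of the first bit in which 'a' and 'b' differ,
 * scanning from bit 'offset' towards the least significant bit;
 * returns 0 if the keys are equal.
 */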
234static inline int tkey_mismatch(t_key a, int offset, t_key b)
235{
236	t_key diff = a ^ b;
237	int i = offset;
238
239	if (!diff)
240		return 0;
241	while ((diff << i) >> (KEYLENGTH-1) == 0)
242		i++;
243	return i;
244}
245
246/*
247  To understand this stuff, an understanding of keys and all their bits is
248  necessary. Every node in the trie has a key associated with it, but not
249  all of the bits in that key are significant.
250
251  Consider a node 'n' and its parent 'tp'.
252
253  If n is a leaf, every bit in its key is significant. Its presence is
254  necessitated by path compression, since during a tree traversal (when
255  searching for a leaf - unless we are doing an insertion) we will completely
256  ignore all skipped bits we encounter. Thus we need to verify, at the end of
257  a potentially successful search, that we have indeed been walking the
258  correct key path.
259
260  Note that, if the correct key is present in the tree, we can never "miss" it by
261  following the wrong path. Path compression ensures that segments of the key
262  that are the same for all keys with a given prefix are skipped, but the
263  skipped part *is* identical for each node in the subtrie below the skipped
264  bit! fib_insert_node() in this implementation takes care of that - note the
265  call to tkey_sub_equals() in fib_insert_node().
266
267  If n is an internal node - a 'tnode' here - the various parts of its key
268  have many different meanings.
269
270  Example:
271  _________________________________________________________________
272  | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
273  -----------------------------------------------------------------
274    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
275
276  _________________________________________________________________
277  | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
278  -----------------------------------------------------------------
279   16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
280
281  tp->pos = 7
282  tp->bits = 3
283  n->pos = 15
284  n->bits = 4
285
286  First, let's just ignore the bits that come before the parent tp, that is
287  the bits from 0 to (tp->pos-1). They are *known* but at this point we do
288  not use them for anything.
289
290  The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
291  index into the parent's child array. That is, they will be used to find
292  'n' among tp's children.
293
294  The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
295  for the node n.
296
297  All the bits we have seen so far are significant to the node n. The rest
298  of the bits are really not needed or indeed known in n->key.
299
300  The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
301  n's child array, and will of course be different for each child.
302
303
304  The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
305  at this point.
306
307*/
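/*
 * For illustration, with the example above:
 *     tkey_extract_bits(n->key, tp->pos, tp->bits) yields the "N" bits
 *     (7..9), i.e. n's index in tp's child array, and
 *     tkey_extract_bits(key, n->pos, n->bits) yields the "C" bits
 *     (15..18) used to pick a child of n.
 */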
308
309static inline void check_tnode(const struct tnode *tn)
310{
311	WARN_ON(tn && tn->pos+tn->bits > 32);
312}
313
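/*
 * Resize thresholds, expressed as percentages (see the discussion in
 * resize() below). The root node uses the separate *_root values so
 * that it is kept larger than other nodes.
 */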
314static const int halve_threshold = 25;
315static const int inflate_threshold = 50;
316static const int halve_threshold_root = 8;
317static const int inflate_threshold_root = 15;
318
319
320static void __alias_free_mem(struct rcu_head *head)
321{
322	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
323	kmem_cache_free(fn_alias_kmem, fa);
324}
325
326static inline void alias_free_mem_rcu(struct fib_alias *fa)
327{
328	call_rcu(&fa->rcu, __alias_free_mem);
329}
330
331static void __leaf_free_rcu(struct rcu_head *head)
332{
333	struct leaf *l = container_of(head, struct leaf, rcu);
334	kmem_cache_free(trie_leaf_kmem, l);
335}
336
337static inline void free_leaf(struct leaf *l)
338{
339	call_rcu_bh(&l->rcu, __leaf_free_rcu);
340}
341
342static void __leaf_info_free_rcu(struct rcu_head *head)
343{
344	kfree(container_of(head, struct leaf_info, rcu));
345}
346
347static inline void free_leaf_info(struct leaf_info *leaf)
348{
349	call_rcu(&leaf->rcu, __leaf_info_free_rcu);
350}
351
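/*
 * A tnode's child array holds 1<<bits pointers, so fall back to
 * vmalloc when the allocation would exceed a page; such tnodes are
 * freed via a workqueue (see __tnode_vfree/__tnode_free_rcu below).
 */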
352static struct tnode *tnode_alloc(size_t size)
353{
354	if (size <= PAGE_SIZE)
355		return kzalloc(size, GFP_KERNEL);
356	else
357		return __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
358}
359
360static void __tnode_vfree(struct work_struct *arg)
361{
362	struct tnode *tn = container_of(arg, struct tnode, work);
363	vfree(tn);
364}
365
366static void __tnode_free_rcu(struct rcu_head *head)
367{
368	struct tnode *tn = container_of(head, struct tnode, rcu);
369	size_t size = sizeof(struct tnode) +
370		      (sizeof(struct node *) << tn->bits);
371
372	if (size <= PAGE_SIZE)
373		kfree(tn);
374	else {
375		INIT_WORK(&tn->work, __tnode_vfree);
376		schedule_work(&tn->work);
377	}
378}
379
380static inline void tnode_free(struct tnode *tn)
381{
382	if (IS_LEAF(tn))
383		free_leaf((struct leaf *) tn);
384	else
385		call_rcu(&tn->rcu, __tnode_free_rcu);
386}
387
388static struct leaf *leaf_new(void)
389{
390	struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
391	if (l) {
392		l->parent = T_LEAF;
393		INIT_HLIST_HEAD(&l->list);
394	}
395	return l;
396}
397
398static struct leaf_info *leaf_info_new(int plen)
399{
400	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
401	if (li) {
402		li->plen = plen;
403		INIT_LIST_HEAD(&li->falh);
404	}
405	return li;
406}
407
408static struct tnode *tnode_new(t_key key, int pos, int bits)
409{
410	size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits);
411	struct tnode *tn = tnode_alloc(sz);
412
413	if (tn) {
414		tn->parent = T_TNODE;
415		tn->pos = pos;
416		tn->bits = bits;
417		tn->key = key;
418		tn->full_children = 0;
419		tn->empty_children = 1<<bits;
420	}
421
422	pr_debug("AT %p s=%u %lu\n", tn, (unsigned int) sizeof(struct tnode),
423		 (unsigned long) (sizeof(struct node) << bits));
424	return tn;
425}
426
427/*
428 * Check whether a tnode 'n' is "full", i.e. it is an internal node
429 * and no bits are skipped. See discussion in dyntree paper p. 6
430 */
431
432static inline int tnode_full(const struct tnode *tn, const struct node *n)
433{
434	if (n == NULL || IS_LEAF(n))
435		return 0;
436
437	return ((struct tnode *) n)->pos == tn->pos + tn->bits;
438}
439
440static inline void put_child(struct trie *t, struct tnode *tn, int i,
441			     struct node *n)
442{
443	tnode_put_child_reorg(tn, i, n, -1);
444}
445
446 /*
447  * Add a child at position i overwriting the old value.
448  * Update the value of full_children and empty_children.
449  */
450
451static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
452				  int wasfull)
453{
454	struct node *chi = tn->child[i];
455	int isfull;
456
457	BUG_ON(i >= 1<<tn->bits);
458
459	/* update empty_children */
460	if (n == NULL && chi != NULL)
461		tn->empty_children++;
462	else if (n != NULL && chi == NULL)
463		tn->empty_children--;
464
465	/* update full_children */
466	if (wasfull == -1)
467		wasfull = tnode_full(tn, chi);
468
469	isfull = tnode_full(tn, n);
470	if (wasfull && !isfull)
471		tn->full_children--;
472	else if (!wasfull && isfull)
473		tn->full_children++;
474
475	if (n)
476		node_set_parent(n, tn);
477
478	rcu_assign_pointer(tn->child[i], n);
479}
480
481static struct node *resize(struct trie *t, struct tnode *tn)
482{
483	int i;
484	int err = 0;
485	struct tnode *old_tn;
486	int inflate_threshold_use;
487	int halve_threshold_use;
488	int max_resize;
489
490	if (!tn)
491		return NULL;
492
493	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
494		 tn, inflate_threshold, halve_threshold);
495
496	/* No children */
497	if (tn->empty_children == tnode_child_length(tn)) {
498		tnode_free(tn);
499		return NULL;
500	}
501	/* One child */
502	if (tn->empty_children == tnode_child_length(tn) - 1)
503		for (i = 0; i < tnode_child_length(tn); i++) {
504			struct node *n;
505
506			n = tn->child[i];
507			if (!n)
508				continue;
509
510			/* compress one level */
511			node_set_parent(n, NULL);
512			tnode_free(tn);
513			return n;
514		}
515	/*
516	 * Double as long as the resulting node's ratio of non-empty
517	 * children is above the threshold.
518	 */
519
520	/*
521	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
522	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
523	 * Telecommunications, page 6:
524	 * "A node is doubled if the ratio of non-empty children to all
525	 * children in the *doubled* node is at least 'high'."
526	 *
527	 * 'high' in this instance is the variable 'inflate_threshold'. It
528	 * is expressed as a percentage, so we multiply it with
529	 * tnode_child_length() and instead of multiplying by 2 (since the
530	 * child array will be doubled by inflate()) and multiplying
531	 * the left-hand side by 100 (to handle the percentage thing) we
532	 * multiply the left-hand side by 50.
533	 *
534	 * The left-hand side may look a bit weird: tnode_child_length(tn)
535	 * - tn->empty_children is of course the number of non-null children
536	 * in the current node. tn->full_children is the number of "full"
537	 * children, that is non-null tnodes with a skip value of 0.
538	 * All of those will be doubled in the resulting inflated tnode, so
539	 * we just count them one extra time here.
540	 *
541	 * A clearer way to write this would be:
542	 *
543	 * to_be_doubled = tn->full_children;
544	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
545	 *     tn->full_children;
546	 *
547	 * new_child_length = tnode_child_length(tn) * 2;
548	 *
549	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
550	 *      new_child_length;
551	 * if (new_fill_factor >= inflate_threshold)
552	 *
553	 * ...and so on, though it would mess up the while () loop.
554	 *
555	 * anyway,
556	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
557	 *      inflate_threshold
558	 *
559	 * avoid a division:
560	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
561	 *      inflate_threshold * new_child_length
562	 *
563	 * expand not_to_be_doubled and to_be_doubled, and shorten:
564	 * 100 * (tnode_child_length(tn) - tn->empty_children +
565	 *    tn->full_children) >= inflate_threshold * new_child_length
566	 *
567	 * expand new_child_length:
568	 * 100 * (tnode_child_length(tn) - tn->empty_children +
569	 *    tn->full_children) >=
570	 *      inflate_threshold * tnode_child_length(tn) * 2
571	 *
572	 * shorten again:
573	 * 50 * (tn->full_children + tnode_child_length(tn) -
574	 *    tn->empty_children) >= inflate_threshold *
575	 *    tnode_child_length(tn)
576	 *
577	 */
578
579	check_tnode(tn);
580
581	/* Keep root node larger  */
582
583	if (!tn->parent)
584		inflate_threshold_use = inflate_threshold_root;
585	else
586		inflate_threshold_use = inflate_threshold;
587
588	err = 0;
589	max_resize = 10;
590	while ((tn->full_children > 0 &&  max_resize-- &&
591		50 * (tn->full_children + tnode_child_length(tn)
592		      - tn->empty_children)
593		>= inflate_threshold_use * tnode_child_length(tn))) {
594
595		old_tn = tn;
596		tn = inflate(t, tn);
597
598		if (IS_ERR(tn)) {
599			tn = old_tn;
600#ifdef CONFIG_IP_FIB_TRIE_STATS
601			t->stats.resize_node_skipped++;
602#endif
603			break;
604		}
605	}
606
607	if (max_resize < 0) {
608		if (!tn->parent)
609			pr_warning("Fix inflate_threshold_root."
610				   " Now=%d size=%d bits\n",
611				   inflate_threshold_root, tn->bits);
612		else
613			pr_warning("Fix inflate_threshold."
614				   " Now=%d size=%d bits\n",
615				   inflate_threshold, tn->bits);
616	}
617
618	check_tnode(tn);
619
620	/*
621	 * Halve as long as the number of empty children in this
622	 * node is above threshold.
623	 */
624
625
626	/* Keep root node larger  */
627
628	if (!tn->parent)
629		halve_threshold_use = halve_threshold_root;
630	else
631		halve_threshold_use = halve_threshold;
632
633	err = 0;
634	max_resize = 10;
635	while (tn->bits > 1 &&  max_resize-- &&
636	       100 * (tnode_child_length(tn) - tn->empty_children) <
637	       halve_threshold_use * tnode_child_length(tn)) {
638
639		old_tn = tn;
640		tn = halve(t, tn);
641		if (IS_ERR(tn)) {
642			tn = old_tn;
643#ifdef CONFIG_IP_FIB_TRIE_STATS
644			t->stats.resize_node_skipped++;
645#endif
646			break;
647		}
648	}
649
650	if (max_resize < 0) {
651		if (!tn->parent)
652			pr_warning("Fix halve_threshold_root."
653				   " Now=%d size=%d bits\n",
654				   halve_threshold_root, tn->bits);
655		else
656			pr_warning("Fix halve_threshold."
657				   " Now=%d size=%d bits\n",
658				   halve_threshold, tn->bits);
659	}
660
661	/* Only one child remains */
662	if (tn->empty_children == tnode_child_length(tn) - 1)
663		for (i = 0; i < tnode_child_length(tn); i++) {
664			struct node *n;
665
666			n = tn->child[i];
667			if (!n)
668				continue;
669
670			/* compress one level */
671
672			node_set_parent(n, NULL);
673			tnode_free(tn);
674			return n;
675		}
676
677	return (struct node *) tn;
678}
679
680static struct tnode *inflate(struct trie *t, struct tnode *tn)
681{
682	struct tnode *oldtnode = tn;
683	int olen = tnode_child_length(tn);
684	int i;
685
686	pr_debug("In inflate\n");
687
688	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
689
690	if (!tn)
691		return ERR_PTR(-ENOMEM);
692
693	/*
694	 * Preallocate and store tnodes before the actual work so we
695	 * don't get into an inconsistent state if memory allocation
696	 * fails. In case of failure we return the old node and the
697	 * inflate of the tnode is abandoned.
698	 */
699
700	for (i = 0; i < olen; i++) {
701		struct tnode *inode;
702
703		inode = (struct tnode *) tnode_get_child(oldtnode, i);
704		if (inode &&
705		    IS_TNODE(inode) &&
706		    inode->pos == oldtnode->pos + oldtnode->bits &&
707		    inode->bits > 1) {
708			struct tnode *left, *right;
709			t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
710
711			left = tnode_new(inode->key&(~m), inode->pos + 1,
712					 inode->bits - 1);
713			if (!left)
714				goto nomem;
715
716			right = tnode_new(inode->key|m, inode->pos + 1,
717					  inode->bits - 1);
718
719			if (!right) {
720				tnode_free(left);
721				goto nomem;
722			}
723
724			put_child(t, tn, 2*i, (struct node *) left);
725			put_child(t, tn, 2*i+1, (struct node *) right);
726		}
727	}
728
729	for (i = 0; i < olen; i++) {
730		struct tnode *inode;
731		struct node *node = tnode_get_child(oldtnode, i);
732		struct tnode *left, *right;
733		int size, j;
734
735		/* An empty child */
736		if (node == NULL)
737			continue;
738
739		/* A leaf or an internal node with skipped bits */
740
741		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
742		   tn->pos + tn->bits - 1) {
743			if (tkey_extract_bits(node->key,
744					      oldtnode->pos + oldtnode->bits,
745					      1) == 0)
746				put_child(t, tn, 2*i, node);
747			else
748				put_child(t, tn, 2*i+1, node);
749			continue;
750		}
751
752		/* An internal node with two children */
753		inode = (struct tnode *) node;
754
755		if (inode->bits == 1) {
756			put_child(t, tn, 2*i, inode->child[0]);
757			put_child(t, tn, 2*i+1, inode->child[1]);
758
759			tnode_free(inode);
760			continue;
761		}
762
763		/* An internal node with more than two children */
764
765		/* We will replace this node 'inode' with two new
766		 * ones, 'left' and 'right', each with half of the
767		 * original children. The two new nodes will have
768		 * a position one bit further down the key and this
769		 * means that the "significant" part of their keys
770		 * (see the discussion near the top of this file)
771		 * will differ by one bit, which will be "0" in
772		 * left's key and "1" in right's key. Since we are
773		 * moving the key position by one step, the bit that
774		 * we are moving away from - the bit at position
775		 * (inode->pos) - is the one that will differ between
776		 * left and right. So... we synthesize that bit in the
777		 * two  new keys.
778		 * The mask 'm' below will be a single "one" bit at
779		 * the position (inode->pos)
780		 */
781
782		/* Use the old key, but set the new significant
783		 *   bit to zero.
784		 */
785
786		left = (struct tnode *) tnode_get_child(tn, 2*i);
787		put_child(t, tn, 2*i, NULL);
788
789		BUG_ON(!left);
790
791		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
792		put_child(t, tn, 2*i+1, NULL);
793
794		BUG_ON(!right);
795
796		size = tnode_child_length(left);
797		for (j = 0; j < size; j++) {
798			put_child(t, left, j, inode->child[j]);
799			put_child(t, right, j, inode->child[j + size]);
800		}
801		put_child(t, tn, 2*i, resize(t, left));
802		put_child(t, tn, 2*i+1, resize(t, right));
803
804		tnode_free(inode);
805	}
806	tnode_free(oldtnode);
807	return tn;
808nomem:
809	{
810		int size = tnode_child_length(tn);
811		int j;
812
813		for (j = 0; j < size; j++)
814			if (tn->child[j])
815				tnode_free((struct tnode *)tn->child[j]);
816
817		tnode_free(tn);
818
819		return ERR_PTR(-ENOMEM);
820	}
821}
822
823static struct tnode *halve(struct trie *t, struct tnode *tn)
824{
825	struct tnode *oldtnode = tn;
826	struct node *left, *right;
827	int i;
828	int olen = tnode_child_length(tn);
829
830	pr_debug("In halve\n");
831
832	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
833
834	if (!tn)
835		return ERR_PTR(-ENOMEM);
836
837	/*
838	 * Preallocate and store tnodes before the actual work so we
839	 * don't get into an inconsistent state if memory allocation
840	 * fails. In case of failure we return the old node and the
841	 * halve of the tnode is abandoned.
842	 */
843
844	for (i = 0; i < olen; i += 2) {
845		left = tnode_get_child(oldtnode, i);
846		right = tnode_get_child(oldtnode, i+1);
847
848		/* Two nonempty children */
849		if (left && right) {
850			struct tnode *newn;
851
852			newn = tnode_new(left->key, tn->pos + tn->bits, 1);
853
854			if (!newn)
855				goto nomem;
856
857			put_child(t, tn, i/2, (struct node *)newn);
858		}
859
860	}
861
862	for (i = 0; i < olen; i += 2) {
863		struct tnode *newBinNode;
864
865		left = tnode_get_child(oldtnode, i);
866		right = tnode_get_child(oldtnode, i+1);
867
868		/* At least one of the children is empty */
869		if (left == NULL) {
870			if (right == NULL)    /* Both are empty */
871				continue;
872			put_child(t, tn, i/2, right);
873			continue;
874		}
875
876		if (right == NULL) {
877			put_child(t, tn, i/2, left);
878			continue;
879		}
880
881		/* Two nonempty children */
882		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
883		put_child(t, tn, i/2, NULL);
884		put_child(t, newBinNode, 0, left);
885		put_child(t, newBinNode, 1, right);
886		put_child(t, tn, i/2, resize(t, newBinNode));
887	}
888	tnode_free(oldtnode);
889	return tn;
890nomem:
891	{
892		int size = tnode_child_length(tn);
893		int j;
894
895		for (j = 0; j < size; j++)
896			if (tn->child[j])
897				tnode_free((struct tnode *)tn->child[j]);
898
899		tnode_free(tn);
900
901		return ERR_PTR(-ENOMEM);
902	}
903}
904
905/* The read side must hold rcu_read_lock; currently this is done by the
906 * dump routines, via get_fa_head and dump. */
907
908static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
909{
910	struct hlist_head *head = &l->list;
911	struct hlist_node *node;
912	struct leaf_info *li;
913
914	hlist_for_each_entry_rcu(li, node, head, hlist)
915		if (li->plen == plen)
916			return li;
917
918	return NULL;
919}
920
921static inline struct list_head *get_fa_head(struct leaf *l, int plen)
922{
923	struct leaf_info *li = find_leaf_info(l, plen);
924
925	if (!li)
926		return NULL;
927
928	return &li->falh;
929}
930
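/* Insert 'new' so that the leaf_info list stays sorted by decreasing prefix length. */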
931static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
932{
933	struct leaf_info *li = NULL, *last = NULL;
934	struct hlist_node *node;
935
936	if (hlist_empty(head)) {
937		hlist_add_head_rcu(&new->hlist, head);
938	} else {
939		hlist_for_each_entry(li, node, head, hlist) {
940			if (new->plen > li->plen)
941				break;
942
943			last = li;
944		}
945		if (last)
946			hlist_add_after_rcu(&last->hlist, &new->hlist);
947		else
948			hlist_add_before_rcu(&new->hlist, &li->hlist);
949	}
950}
951
952/* rcu_read_lock needs to be held by the caller on the read side */
953
954static struct leaf *
955fib_find_node(struct trie *t, u32 key)
956{
957	int pos;
958	struct tnode *tn;
959	struct node *n;
960
961	pos = 0;
962	n = rcu_dereference(t->trie);
963
964	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
965		tn = (struct tnode *) n;
966
967		check_tnode(tn);
968
969		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
970			pos = tn->pos + tn->bits;
971			n = tnode_get_child_rcu(tn,
972						tkey_extract_bits(key,
973								  tn->pos,
974								  tn->bits));
975		} else
976			break;
977	}
978	/* If we have found a leaf, compare prefixes */
979
980	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
981		return (struct leaf *)n;
982
983	return NULL;
984}
985
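/*
 * Walk from 'tn' back up towards the root, resizing each tnode and
 * re-linking it into its parent, then resize the topmost tnode and
 * return the (possibly new) root node.
 */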
986static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
987{
988	int wasfull;
989	t_key cindex, key = tn->key;
990	struct tnode *tp;
991
992	while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
993		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
994		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
995		tn = (struct tnode *) resize(t, (struct tnode *)tn);
996
997		tnode_put_child_reorg((struct tnode *)tp, cindex,
998				      (struct node *)tn, wasfull);
999
1000		tp = node_parent((struct node *) tn);
1001		if (!tp)
1002			break;
1003		tn = tp;
1004	}
1005
1006	/* Handle last (top) tnode */
1007	if (IS_TNODE(tn))
1008		tn = (struct tnode *)resize(t, (struct tnode *)tn);
1009
1010	return (struct node *)tn;
1011}
1012
1013/* only used from updater-side */
1014
1015static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1016{
1017	int pos, newpos;
1018	struct tnode *tp = NULL, *tn = NULL;
1019	struct node *n;
1020	struct leaf *l;
1021	int missbit;
1022	struct list_head *fa_head = NULL;
1023	struct leaf_info *li;
1024	t_key cindex;
1025
1026	pos = 0;
1027	n = t->trie;
1028
1029	/* If we point to NULL, stop. Either the tree is empty and we should
1030	 * just put a new leaf in it, or we have reached an empty child slot,
1031	 * and we should just put our new leaf in that.
1032	 * If we point to a T_TNODE, check if it matches our key. Note that
1033	 * a T_TNODE might be skipping any number of bits - its 'pos' need
1034	 * not be the parent's 'pos'+'bits'!
1035	 *
1036	 * If it does match the current key, get pos/bits from it, extract
1037	 * the index from our key, push the T_TNODE and walk the tree.
1038	 *
1039	 * If it doesn't, we have to replace it with a new T_TNODE.
1040	 *
1041	 * If we point to a T_LEAF, it might or might not have the same key
1042	 * as we do. If it does, just update the T_LEAF's value and
1043	 * return it.
1044	 * If it doesn't, we need to replace it with a T_TNODE.
1045	 */
1046
1047	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
1048		tn = (struct tnode *) n;
1049
1050		check_tnode(tn);
1051
1052		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1053			tp = tn;
1054			pos = tn->pos + tn->bits;
1055			n = tnode_get_child(tn,
1056					    tkey_extract_bits(key,
1057							      tn->pos,
1058							      tn->bits));
1059
1060			BUG_ON(n && node_parent(n) != tn);
1061		} else
1062			break;
1063	}
1064
1065	/*
1066	 * n  ----> NULL, LEAF or TNODE
1067	 *
1068	 * tp is n's parent ----> NULL or TNODE
1069	 */
1070
1071	BUG_ON(tp && IS_LEAF(tp));
1072
1073	/* Case 1: n is a leaf. Compare prefixes */
1074
1075	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1076		l = (struct leaf *) n;
1077		li = leaf_info_new(plen);
1078
1079		if (!li)
1080			return NULL;
1081
1082		fa_head = &li->falh;
1083		insert_leaf_info(&l->list, li);
1084		goto done;
1085	}
1086	l = leaf_new();
1087
1088	if (!l)
1089		return NULL;
1090
1091	l->key = key;
1092	li = leaf_info_new(plen);
1093
1094	if (!li) {
1095		free_leaf(l);
1096		return NULL;
1097	}
1098
1099	fa_head = &li->falh;
1100	insert_leaf_info(&l->list, li);
1101
1102	if (t->trie && n == NULL) {
1103		/* Case 2: n is NULL, and we will just insert a new leaf */
1104
1105		node_set_parent((struct node *)l, tp);
1106
1107		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1108		put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1109	} else {
1110		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1111		/*
1112		 *  Add a new tnode here
1113		 *  the first tnode needs some special handling
1114		 */
1115
1116		if (tp)
1117			pos = tp->pos+tp->bits;
1118		else
1119			pos = 0;
1120
1121		if (n) {
1122			newpos = tkey_mismatch(key, pos, n->key);
1123			tn = tnode_new(n->key, newpos, 1);
1124		} else {
1125			newpos = 0;
1126			tn = tnode_new(key, newpos, 1); /* First tnode */
1127		}
1128
1129		if (!tn) {
1130			free_leaf_info(li);
1131			free_leaf(l);
1132			return NULL;
1133		}
1134
1135		node_set_parent((struct node *)tn, tp);
1136
1137		missbit = tkey_extract_bits(key, newpos, 1);
1138		put_child(t, tn, missbit, (struct node *)l);
1139		put_child(t, tn, 1-missbit, n);
1140
1141		if (tp) {
1142			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1143			put_child(t, (struct tnode *)tp, cindex,
1144				  (struct node *)tn);
1145		} else {
1146			rcu_assign_pointer(t->trie, (struct node *)tn);
1147			tp = tn;
1148		}
1149	}
1150
1151	if (tp && tp->pos + tp->bits > 32)
1152		pr_warning("fib_trie"
1153			   " tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1154			   tp, tp->pos, tp->bits, key, plen);
1155
1156	/* Rebalance the trie */
1157
1158	rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1159done:
1160	return fa_head;
1161}
1162
1163/*
1164 * Caller must hold RTNL.
1165 */
1166static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
1167{
1168	struct trie *t = (struct trie *) tb->tb_data;
1169	struct fib_alias *fa, *new_fa;
1170	struct list_head *fa_head = NULL;
1171	struct fib_info *fi;
1172	int plen = cfg->fc_dst_len;
1173	u8 tos = cfg->fc_tos;
1174	u32 key, mask;
1175	int err;
1176	struct leaf *l;
1177
1178	if (plen > 32)
1179		return -EINVAL;
1180
1181	key = ntohl(cfg->fc_dst);
1182
1183	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1184
1185	mask = ntohl(inet_make_mask(plen));
1186
1187	if (key & ~mask)
1188		return -EINVAL;
1189
1190	key = key & mask;
1191
1192	fi = fib_create_info(cfg);
1193	if (IS_ERR(fi)) {
1194		err = PTR_ERR(fi);
1195		goto err;
1196	}
1197
1198	l = fib_find_node(t, key);
1199	fa = NULL;
1200
1201	if (l) {
1202		fa_head = get_fa_head(l, plen);
1203		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1204	}
1205
1206	/* Now fa, if non-NULL, points to the first fib alias
1207	 * with the same keys [prefix,tos,priority], if such a key already
1208	 * exists, or to the node before which we will insert the new one.
1209	 *
1210	 * If fa is NULL, we will need to allocate a new one and
1211	 * insert it to the head of fa_head.
1212	 *
1213	 * If fa_head is NULL, no fib node matched the destination key
1214	 * and we need to allocate a new one of those as well.
1215	 */
1216
1217	if (fa && fa->fa_tos == tos &&
1218	    fa->fa_info->fib_priority == fi->fib_priority) {
1219		struct fib_alias *fa_first, *fa_match;
1220
1221		err = -EEXIST;
1222		if (cfg->fc_nlflags & NLM_F_EXCL)
1223			goto out;
1224
1225		/* We have 2 goals:
1226		 * 1. Find exact match for type, scope, fib_info to avoid
1227		 * duplicate routes
1228		 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
1229		 */
1230		fa_match = NULL;
1231		fa_first = fa;
1232		fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1233		list_for_each_entry_continue(fa, fa_head, fa_list) {
1234			if (fa->fa_tos != tos)
1235				break;
1236			if (fa->fa_info->fib_priority != fi->fib_priority)
1237				break;
1238			if (fa->fa_type == cfg->fc_type &&
1239			    fa->fa_scope == cfg->fc_scope &&
1240			    fa->fa_info == fi) {
1241				fa_match = fa;
1242				break;
1243			}
1244		}
1245
1246		if (cfg->fc_nlflags & NLM_F_REPLACE) {
1247			struct fib_info *fi_drop;
1248			u8 state;
1249
1250			fa = fa_first;
1251			if (fa_match) {
1252				if (fa == fa_match)
1253					err = 0;
1254				goto out;
1255			}
1256			err = -ENOBUFS;
1257			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1258			if (new_fa == NULL)
1259				goto out;
1260
1261			fi_drop = fa->fa_info;
1262			new_fa->fa_tos = fa->fa_tos;
1263			new_fa->fa_info = fi;
1264			new_fa->fa_type = cfg->fc_type;
1265			new_fa->fa_scope = cfg->fc_scope;
1266			state = fa->fa_state;
1267			new_fa->fa_state = state & ~FA_S_ACCESSED;
1268
1269			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1270			alias_free_mem_rcu(fa);
1271
1272			fib_release_info(fi_drop);
1273			if (state & FA_S_ACCESSED)
1274				rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1275			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1276				tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1277
1278			goto succeeded;
1279		}
1280		/* Error if we find a perfect match which
1281		 * uses the same scope, type, and nexthop
1282		 * information.
1283		 */
1284		if (fa_match)
1285			goto out;
1286
1287		if (!(cfg->fc_nlflags & NLM_F_APPEND))
1288			fa = fa_first;
1289	}
1290	err = -ENOENT;
1291	if (!(cfg->fc_nlflags & NLM_F_CREATE))
1292		goto out;
1293
1294	err = -ENOBUFS;
1295	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1296	if (new_fa == NULL)
1297		goto out;
1298
1299	new_fa->fa_info = fi;
1300	new_fa->fa_tos = tos;
1301	new_fa->fa_type = cfg->fc_type;
1302	new_fa->fa_scope = cfg->fc_scope;
1303	new_fa->fa_state = 0;
1304	/*
1305	 * Insert new entry to the list.
1306	 */
1307
1308	if (!fa_head) {
1309		fa_head = fib_insert_node(t, key, plen);
1310		if (unlikely(!fa_head)) {
1311			err = -ENOMEM;
1312			goto out_free_new_fa;
1313		}
1314	}
1315
1316	list_add_tail_rcu(&new_fa->fa_list,
1317			  (fa ? &fa->fa_list : fa_head));
1318
1319	rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1320	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1321		  &cfg->fc_nlinfo, 0);
1322succeeded:
1323	return 0;
1324
1325out_free_new_fa:
1326	kmem_cache_free(fn_alias_kmem, new_fa);
1327out:
1328	fib_release_info(fi);
1329err:
1330	return err;
1331}
1332
1333/* should be called with rcu_read_lock */
1334static int check_leaf(struct trie *t, struct leaf *l,
1335		      t_key key,  const struct flowi *flp,
1336		      struct fib_result *res)
1337{
1338	struct leaf_info *li;
1339	struct hlist_head *hhead = &l->list;
1340	struct hlist_node *node;
1341
1342	hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1343		int err;
1344		int plen = li->plen;
1345		__be32 mask = inet_make_mask(plen);
1346
1347		if (l->key != (key & ntohl(mask)))
1348			continue;
1349
1350		err = fib_semantic_match(&li->falh, flp, res,
1351					 htonl(l->key), mask, plen);
1352
1353#ifdef CONFIG_IP_FIB_TRIE_STATS
1354		if (err <= 0)
1355			t->stats.semantic_match_passed++;
1356		else
1357			t->stats.semantic_match_miss++;
1358#endif
1359		if (err <= 0)
1360			return err;
1361	}
1362
1363	return 1;
1364}
1365
1366static int fn_trie_lookup(struct fib_table *tb, const struct flowi *flp,
1367			  struct fib_result *res)
1368{
1369	struct trie *t = (struct trie *) tb->tb_data;
1370	int ret;
1371	struct node *n;
1372	struct tnode *pn;
1373	int pos, bits;
1374	t_key key = ntohl(flp->fl4_dst);
1375	int chopped_off;
1376	t_key cindex = 0;
1377	int current_prefix_length = KEYLENGTH;
1378	struct tnode *cn;
1379	t_key node_prefix, key_prefix, pref_mismatch;
1380	int mp;
1381
1382	rcu_read_lock();
1383
1384	n = rcu_dereference(t->trie);
1385	if (!n)
1386		goto failed;
1387
1388#ifdef CONFIG_IP_FIB_TRIE_STATS
1389	t->stats.gets++;
1390#endif
1391
1392	/* Just a leaf? */
1393	if (IS_LEAF(n)) {
1394		ret = check_leaf(t, (struct leaf *)n, key, flp, res);
1395		goto found;
1396	}
1397
1398	pn = (struct tnode *) n;
1399	chopped_off = 0;
1400
1401	while (pn) {
1402		pos = pn->pos;
1403		bits = pn->bits;
1404
1405		if (!chopped_off)
1406			cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
1407						   pos, bits);
1408
1409		n = tnode_get_child(pn, cindex);
1410
1411		if (n == NULL) {
1412#ifdef CONFIG_IP_FIB_TRIE_STATS
1413			t->stats.null_node_hit++;
1414#endif
1415			goto backtrace;
1416		}
1417
1418		if (IS_LEAF(n)) {
1419			ret = check_leaf(t, (struct leaf *)n, key, flp, res);
1420			if (ret > 0)
1421				goto backtrace;
1422			goto found;
1423		}
1424
1425		cn = (struct tnode *)n;
1426
1427		/*
1428		 * It's a tnode, and we can do some extra checks here if we
1429		 * like, to avoid descending into a dead-end branch.
1430		 * This tnode is in the parent's child array at index
1431		 * key[p_pos..p_pos+p_bits] but potentially with some bits
1432		 * chopped off, so in reality the index may be just a
1433		 * subprefix, padded with zero at the end.
1434		 * We can also take a look at any skipped bits in this
1435		 * tnode - everything up to p_pos is supposed to be ok,
1436		 * and the non-chopped bits of the index (see previous
1437		 * paragraph) are also guaranteed ok, but the rest is
1438		 * considered unknown.
1439		 *
1440		 * The skipped bits are key[pos+bits..cn->pos].
1441		 */
1442
1443		/* If current_prefix_length < pos+bits, we are already doing
1444		 * actual prefix  matching, which means everything from
1445		 * pos+(bits-chopped_off) onward must be zero along some
1446		 * branch of this subtree - otherwise there is *no* valid
1447		 * prefix present. Here we can only check the skipped
1448		 * bits. Remember, since we have already indexed into the
1449		 * parent's child array, we know that the bits we chopped off
1450		 * *are* zero.
1451		 */
1452
1453		/* NOTA BENE: Checking only skipped bits
1454		   for the new node here */
1455
1456		if (current_prefix_length < pos+bits) {
1457			if (tkey_extract_bits(cn->key, current_prefix_length,
1458						cn->pos - current_prefix_length)
1459			    || !(cn->child[0]))
1460				goto backtrace;
1461		}
1462
1463		/*
1464		 * If chopped_off=0, the index is fully validated and we
1465		 * only need to look at the skipped bits for this, the new,
1466		 * tnode. What we actually want to do is to find out if
1467		 * these skipped bits match our key perfectly, or if we will
1468		 * have to count on finding a matching prefix further down,
1469		 * because if we do, we would like to have some way of
1470		 * verifying the existence of such a prefix at this point.
1471		 */
1472
1473		/* The only thing we can do at this point is to verify that
1474		 * any such matching prefix can indeed be a prefix to our
1475		 * key, and if the bits in the node we are inspecting that
1476		 * do not match our key are not ZERO, this cannot be true.
1477		 * Thus, find out where there is a mismatch (before cn->pos)
1478		 * and verify that all the mismatching bits are zero in the
1479		 * new tnode's key.
1480		 */
1481
1482		/*
1483		 * Note: We aren't very concerned about the piece of
1484		 * the key that precedes pn->pos+pn->bits, since those bits
1485		 * have already been checked. The bits after cn->pos
1486		 * aren't checked since these are by definition
1487		 * "unknown" at this point. Thus, what we want to see
1488		 * is if we are about to enter the "prefix matching"
1489		 * state, and in that case verify that the skipped
1490		 * bits that will prevail throughout this subtree are
1491		 * zero, as they have to be if we are to find a
1492		 * matching prefix.
1493		 */
1494
1495		node_prefix = mask_pfx(cn->key, cn->pos);
1496		key_prefix = mask_pfx(key, cn->pos);
1497		pref_mismatch = key_prefix^node_prefix;
1498		mp = 0;
1499
1500		/*
1501		 * In short: If skipped bits in this node do not match
1502		 * the search key, enter the "prefix matching"
1503		 * state directly.
1504		 */
1505		if (pref_mismatch) {
1506			while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1507				mp++;
1508				pref_mismatch = pref_mismatch << 1;
1509			}
1510			key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1511
1512			if (key_prefix != 0)
1513				goto backtrace;
1514
1515			if (current_prefix_length >= cn->pos)
1516				current_prefix_length = mp;
1517		}
1518
1519		pn = (struct tnode *)n; /* Descend */
1520		chopped_off = 0;
1521		continue;
1522
1523backtrace:
1524		chopped_off++;
1525
1526		/* Zero bits don't change the child index (cindex), so skip over them */
1527		while ((chopped_off <= pn->bits)
1528		       && !(cindex & (1<<(chopped_off-1))))
1529			chopped_off++;
1530
1531		/* Decrease current_... with bits chopped off */
1532		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1533			current_prefix_length = pn->pos + pn->bits
1534				- chopped_off;
1535
1536		/*
1537		 * Either we do the actual chop-off, or, if we have chopped
1538		 * off all bits in this tnode, we walk up to our parent.
1539		 */
1540
1541		if (chopped_off <= pn->bits) {
1542			cindex &= ~(1 << (chopped_off-1));
1543		} else {
1544			struct tnode *parent = node_parent((struct node *) pn);
1545			if (!parent)
1546				goto failed;
1547
1548			/* Get Child's index */
1549			cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
1550			pn = parent;
1551			chopped_off = 0;
1552
1553#ifdef CONFIG_IP_FIB_TRIE_STATS
1554			t->stats.backtrack++;
1555#endif
1556			goto backtrace;
1557		}
1558	}
1559failed:
1560	ret = 1;
1561found:
1562	rcu_read_unlock();
1563	return ret;
1564}
1565
1566/*
1567 * Remove the leaf and rebalance the trie from its parent.
1568 */
1569static void trie_leaf_remove(struct trie *t, struct leaf *l)
1570{
1571	struct tnode *tp = node_parent((struct node *) l);
1572
1573	pr_debug("entering trie_leaf_remove(%p)\n", l);
1574
1575	if (tp) {
1576		t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
1577		put_child(t, (struct tnode *)tp, cindex, NULL);
1578		rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1579	} else
1580		rcu_assign_pointer(t->trie, NULL);
1581
1582	free_leaf(l);
1583}
1584
1585/*
1586 * Caller must hold RTNL.
1587 */
1588static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg)
1589{
1590	struct trie *t = (struct trie *) tb->tb_data;
1591	u32 key, mask;
1592	int plen = cfg->fc_dst_len;
1593	u8 tos = cfg->fc_tos;
1594	struct fib_alias *fa, *fa_to_delete;
1595	struct list_head *fa_head;
1596	struct leaf *l;
1597	struct leaf_info *li;
1598
1599	if (plen > 32)
1600		return -EINVAL;
1601
1602	key = ntohl(cfg->fc_dst);
1603	mask = ntohl(inet_make_mask(plen));
1604
1605	if (key & ~mask)
1606		return -EINVAL;
1607
1608	key = key & mask;
1609	l = fib_find_node(t, key);
1610
1611	if (!l)
1612		return -ESRCH;
1613
1614	fa_head = get_fa_head(l, plen);
1615	fa = fib_find_alias(fa_head, tos, 0);
1616
1617	if (!fa)
1618		return -ESRCH;
1619
1620	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1621
1622	fa_to_delete = NULL;
1623	fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1624	list_for_each_entry_continue(fa, fa_head, fa_list) {
1625		struct fib_info *fi = fa->fa_info;
1626
1627		if (fa->fa_tos != tos)
1628			break;
1629
1630		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1631		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1632		     fa->fa_scope == cfg->fc_scope) &&
1633		    (!cfg->fc_protocol ||
1634		     fi->fib_protocol == cfg->fc_protocol) &&
1635		    fib_nh_match(cfg, fi) == 0) {
1636			fa_to_delete = fa;
1637			break;
1638		}
1639	}
1640
1641	if (!fa_to_delete)
1642		return -ESRCH;
1643
1644	fa = fa_to_delete;
1645	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1646		  &cfg->fc_nlinfo, 0);
1647
1648	l = fib_find_node(t, key);
1649	li = find_leaf_info(l, plen);
1650
1651	list_del_rcu(&fa->fa_list);
1652
1653	if (list_empty(fa_head)) {
1654		hlist_del_rcu(&li->hlist);
1655		free_leaf_info(li);
1656	}
1657
1658	if (hlist_empty(&l->list))
1659		trie_leaf_remove(t, l);
1660
1661	if (fa->fa_state & FA_S_ACCESSED)
1662		rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1663
1664	fib_release_info(fa->fa_info);
1665	alias_free_mem_rcu(fa);
1666	return 0;
1667}
1668
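/*
 * Remove aliases whose fib_info is marked RTNH_F_DEAD; returns the
 * number of entries removed.
 */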
1669static int trie_flush_list(struct list_head *head)
1670{
1671	struct fib_alias *fa, *fa_node;
1672	int found = 0;
1673
1674	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1675		struct fib_info *fi = fa->fa_info;
1676
1677		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1678			list_del_rcu(&fa->fa_list);
1679			fib_release_info(fa->fa_info);
1680			alias_free_mem_rcu(fa);
1681			found++;
1682		}
1683	}
1684	return found;
1685}
1686
1687static int trie_flush_leaf(struct leaf *l)
1688{
1689	int found = 0;
1690	struct hlist_head *lih = &l->list;
1691	struct hlist_node *node, *tmp;
1692	struct leaf_info *li = NULL;
1693
1694	hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1695		found += trie_flush_list(&li->falh);
1696
1697		if (list_empty(&li->falh)) {
1698			hlist_del_rcu(&li->hlist);
1699			free_leaf_info(li);
1700		}
1701	}
1702	return found;
1703}
1704
1705/*
1706 * Scan for the next right leaf starting at node p->child[idx]
1707 * Since we have a back pointer, no recursion is necessary.
1708 */
1709static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
1710{
1711	do {
1712		t_key idx;
1713
1714		if (c)
1715			idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
1716		else
1717			idx = 0;
1718
1719		while (idx < 1u << p->bits) {
1720			c = tnode_get_child_rcu(p, idx++);
1721			if (!c)
1722				continue;
1723
1724			if (IS_LEAF(c)) {
1725				prefetch(p->child[idx]);
1726				return (struct leaf *) c;
1727			}
1728
1729			/* Descend and restart scanning in the new node */
1730			p = (struct tnode *) c;
1731			idx = 0;
1732		}
1733
1734		/* Node empty, walk back up to parent */
1735		c = (struct node *) p;
1736	} while ( (p = node_parent_rcu(c)) != NULL);
1737
1738	return NULL; /* Root of trie */
1739}
1740
1741static struct leaf *trie_firstleaf(struct trie *t)
1742{
1743	struct tnode *n = (struct tnode *) rcu_dereference(t->trie);
1744
1745	if (!n)
1746		return NULL;
1747
1748	if (IS_LEAF(n))          /* trie is just a leaf */
1749		return (struct leaf *) n;
1750
1751	return leaf_walk_rcu(n, NULL);
1752}
1753
1754static struct leaf *trie_nextleaf(struct leaf *l)
1755{
1756	struct node *c = (struct node *) l;
1757	struct tnode *p = node_parent(c);
1758
1759	if (!p)
1760		return NULL;	/* trie with just one leaf */
1761
1762	return leaf_walk_rcu(p, c);
1763}
1764
1765static struct leaf *trie_leafindex(struct trie *t, int index)
1766{
1767	struct leaf *l = trie_firstleaf(t);
1768
1769	while (l && index-- > 0)
1770		l = trie_nextleaf(l);
1771
1772	return l;
1773}
1774
1775
1776/*
1777 * Caller must hold RTNL.
1778 */
1779static int fn_trie_flush(struct fib_table *tb)
1780{
1781	struct trie *t = (struct trie *) tb->tb_data;
1782	struct leaf *l, *ll = NULL;
1783	int found = 0;
1784
1785	for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
1786		found += trie_flush_leaf(l);
1787
1788		if (ll && hlist_empty(&ll->list))
1789			trie_leaf_remove(t, ll);
1790		ll = l;
1791	}
1792
1793	if (ll && hlist_empty(&ll->list))
1794		trie_leaf_remove(t, ll);
1795
1796	pr_debug("trie_flush found=%d\n", found);
1797	return found;
1798}
1799
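/*
 * Select a default route from the aliases attached to the 0.0.0.0/0
 * leaf, using fib_detect_death() to avoid gateways that appear dead.
 */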
1800static void fn_trie_select_default(struct fib_table *tb,
1801				   const struct flowi *flp,
1802				   struct fib_result *res)
1803{
1804	struct trie *t = (struct trie *) tb->tb_data;
1805	int order, last_idx;
1806	struct fib_info *fi = NULL;
1807	struct fib_info *last_resort;
1808	struct fib_alias *fa = NULL;
1809	struct list_head *fa_head;
1810	struct leaf *l;
1811
1812	last_idx = -1;
1813	last_resort = NULL;
1814	order = -1;
1815
1816	rcu_read_lock();
1817
1818	l = fib_find_node(t, 0);
1819	if (!l)
1820		goto out;
1821
1822	fa_head = get_fa_head(l, 0);
1823	if (!fa_head)
1824		goto out;
1825
1826	if (list_empty(fa_head))
1827		goto out;
1828
1829	list_for_each_entry_rcu(fa, fa_head, fa_list) {
1830		struct fib_info *next_fi = fa->fa_info;
1831
1832		if (fa->fa_scope != res->scope ||
1833		    fa->fa_type != RTN_UNICAST)
1834			continue;
1835
1836		if (next_fi->fib_priority > res->fi->fib_priority)
1837			break;
1838		if (!next_fi->fib_nh[0].nh_gw ||
1839		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1840			continue;
1841		fa->fa_state |= FA_S_ACCESSED;
1842
1843		if (fi == NULL) {
1844			if (next_fi != res->fi)
1845				break;
1846		} else if (!fib_detect_death(fi, order, &last_resort,
1847					     &last_idx, tb->tb_default)) {
1848			fib_result_assign(res, fi);
1849			tb->tb_default = order;
1850			goto out;
1851		}
1852		fi = next_fi;
1853		order++;
1854	}
1855	if (order <= 0 || fi == NULL) {
1856		tb->tb_default = -1;
1857		goto out;
1858	}
1859
1860	if (!fib_detect_death(fi, order, &last_resort, &last_idx,
1861				tb->tb_default)) {
1862		fib_result_assign(res, fi);
1863		tb->tb_default = order;
1864		goto out;
1865	}
1866	if (last_idx >= 0)
1867		fib_result_assign(res, last_resort);
1868	tb->tb_default = last_idx;
1869out:
1870	rcu_read_unlock();
1871}
1872
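/*
 * Dump one prefix's alias list. cb->args[5] records how many aliases
 * have already been dumped so that a partial dump can be resumed.
 */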
1873static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1874			   struct fib_table *tb,
1875			   struct sk_buff *skb, struct netlink_callback *cb)
1876{
1877	int i, s_i;
1878	struct fib_alias *fa;
1879	__be32 xkey = htonl(key);
1880
1881	s_i = cb->args[5];
1882	i = 0;
1883
1884	/* rcu_read_lock is held by the caller */
1885
1886	list_for_each_entry_rcu(fa, fah, fa_list) {
1887		if (i < s_i) {
1888			i++;
1889			continue;
1890		}
1891
1892		if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1893				  cb->nlh->nlmsg_seq,
1894				  RTM_NEWROUTE,
1895				  tb->tb_id,
1896				  fa->fa_type,
1897				  fa->fa_scope,
1898				  xkey,
1899				  plen,
1900				  fa->fa_tos,
1901				  fa->fa_info, NLM_F_MULTI) < 0) {
1902			cb->args[5] = i;
1903			return -1;
1904		}
1905		i++;
1906	}
1907	cb->args[5] = i;
1908	return skb->len;
1909}
1910
1911static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
1912			struct sk_buff *skb, struct netlink_callback *cb)
1913{
1914	struct leaf_info *li;
1915	struct hlist_node *node;
1916	int i, s_i;
1917
1918	s_i = cb->args[4];
1919	i = 0;
1920
1921	/* rcu_read_lock is held by the caller */
1922	hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
1923		if (i < s_i) {
1924			i++;
1925			continue;
1926		}
1927
1928		if (i > s_i)
1929			cb->args[5] = 0;
1930
1931		if (list_empty(&li->falh))
1932			continue;
1933
1934		if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
1935			cb->args[4] = i;
1936			return -1;
1937		}
1938		i++;
1939	}
1940
1941	cb->args[4] = i;
1942	return skb->len;
1943}
1944
1945static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb,
1946			struct netlink_callback *cb)
1947{
1948	struct leaf *l;
1949	struct trie *t = (struct trie *) tb->tb_data;
1950	t_key key = cb->args[2];
1951	int count = cb->args[3];
1952
1953	rcu_read_lock();
1954	/* Dump starting at last key.
1955	 * Note: 0.0.0.0/0 (i.e. the default route) is the first key.
1956	 */
1957	if (count == 0)
1958		l = trie_firstleaf(t);
1959	else {
1960		/* Normally, continue from last key, but if that is missing
1961		 * fall back to a slow rescan
1962		 */
1963		l = fib_find_node(t, key);
1964		if (!l)
1965			l = trie_leafindex(t, count);
1966	}
1967
1968	while (l) {
1969		cb->args[2] = l->key;
1970		if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
1971			cb->args[3] = count;
1972			rcu_read_unlock();
1973			return -1;
1974		}
1975
1976		++count;
1977		l = trie_nextleaf(l);
1978		memset(&cb->args[4], 0,
1979		       sizeof(cb->args) - 4*sizeof(cb->args[0]));
1980	}
1981	cb->args[3] = count;
1982	rcu_read_unlock();
1983
1984	return skb->len;
1985}
1986
1987void __init fib_hash_init(void)
1988{
1989	fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1990					  sizeof(struct fib_alias),
1991					  0, SLAB_PANIC, NULL);
1992
1993	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
1994					   max(sizeof(struct leaf),
1995					       sizeof(struct leaf_info)),
1996					   0, SLAB_PANIC, NULL);
1997}
1998
1999
2000/* Fix more generic FIB names for init later */
2001struct fib_table *fib_hash_table(u32 id)
2002{
2003	struct fib_table *tb;
2004	struct trie *t;
2005
2006	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
2007		     GFP_KERNEL);
2008	if (tb == NULL)
2009		return NULL;
2010
2011	tb->tb_id = id;
2012	tb->tb_default = -1;
2013	tb->tb_lookup = fn_trie_lookup;
2014	tb->tb_insert = fn_trie_insert;
2015	tb->tb_delete = fn_trie_delete;
2016	tb->tb_flush = fn_trie_flush;
2017	tb->tb_select_default = fn_trie_select_default;
2018	tb->tb_dump = fn_trie_dump;
2019
2020	t = (struct trie *) tb->tb_data;
2021	memset(t, 0, sizeof(*t));
2022
2023	if (id == RT_TABLE_LOCAL)
2024		pr_info("IPv4 FIB: Using LC-trie version %s\n", VERSION);
2025
2026	return tb;
2027}
2028
2029#ifdef CONFIG_PROC_FS
2030/* Depth first Trie walk iterator */
2031struct fib_trie_iter {
2032	struct seq_net_private p;
2033	struct fib_table *tb;
2034	struct tnode *tnode;
2035	unsigned index;
2036	unsigned depth;
2037};
2038
2039static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
2040{
2041	struct tnode *tn = iter->tnode;
2042	unsigned cindex = iter->index;
2043	struct tnode *p;
2044
2045	/* A single entry routing table */
2046	if (!tn)
2047		return NULL;
2048
2049	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2050		 iter->tnode, iter->index, iter->depth);
2051rescan:
2052	while (cindex < (1<<tn->bits)) {
2053		struct node *n = tnode_get_child_rcu(tn, cindex);
2054
2055		if (n) {
2056			if (IS_LEAF(n)) {
2057				iter->tnode = tn;
2058				iter->index = cindex + 1;
2059			} else {
2060				/* push down one level */
2061				iter->tnode = (struct tnode *) n;
2062				iter->index = 0;
2063				++iter->depth;
2064			}
2065			return n;
2066		}
2067
2068		++cindex;
2069	}
2070
2071	/* Current node exhausted, pop back up */
2072	p = node_parent_rcu((struct node *)tn);
2073	if (p) {
2074		cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2075		tn = p;
2076		--iter->depth;
2077		goto rescan;
2078	}
2079
2080	/* got root? */
2081	return NULL;
2082}
2083
2084static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2085				       struct trie *t)
2086{
2087	struct node *n;
2088
2089	if (!t)
2090		return NULL;
2091
2092	n = rcu_dereference(t->trie);
2093	if (!n)
2094		return NULL;
2095
2096	if (IS_TNODE(n)) {
2097		iter->tnode = (struct tnode *) n;
2098		iter->index = 0;
2099		iter->depth = 1;
2100	} else {
2101		iter->tnode = NULL;
2102		iter->index = 0;
2103		iter->depth = 0;
2104	}
2105
2106	return n;
2107}
2108
2109static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2110{
2111	struct node *n;
2112	struct fib_trie_iter iter;
2113
2114	memset(s, 0, sizeof(*s));
2115
2116	rcu_read_lock();
2117	for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
2118		if (IS_LEAF(n)) {
2119			struct leaf *l = (struct leaf *)n;
2120			struct leaf_info *li;
2121			struct hlist_node *tmp;
2122
2123			s->leaves++;
2124			s->totdepth += iter.depth;
2125			if (iter.depth > s->maxdepth)
2126				s->maxdepth = iter.depth;
2127
2128			hlist_for_each_entry_rcu(li, tmp, &l->list, hlist)
2129				++s->prefixes;
2130		} else {
2131			const struct tnode *tn = (const struct tnode *) n;
2132			int i;
2133
2134			s->tnodes++;
2135			if (tn->bits < MAX_STAT_DEPTH)
2136				s->nodesizes[tn->bits]++;
2137
2138			for (i = 0; i < (1<<tn->bits); i++)
2139				if (!tn->child[i])
2140					s->nullpointers++;
2141		}
2142	}
2143	rcu_read_unlock();
2144}
2145
2146/*
2147 *	This outputs /proc/net/fib_triestats
2148 */
2149static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2150{
2151	unsigned i, max, pointers, bytes, avdepth;
2152
2153	if (stat->leaves)
2154		avdepth = stat->totdepth*100 / stat->leaves;
2155	else
2156		avdepth = 0;
2157
2158	seq_printf(seq, "\tAver depth:     %u.%02d\n",
2159		   avdepth / 100, avdepth % 100);
2160	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
2161
2162	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
2163	bytes = sizeof(struct leaf) * stat->leaves;
2164
2165	seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
2166	bytes += sizeof(struct leaf_info) * stat->prefixes;
2167
2168	seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2169	bytes += sizeof(struct tnode) * stat->tnodes;
2170
2171	max = MAX_STAT_DEPTH;
2172	while (max > 0 && stat->nodesizes[max-1] == 0)
2173		max--;
2174
2175	pointers = 0;
2176	for (i = 1; i < max; i++)	/* nodesizes[max-1] is the last used slot */
2177		if (stat->nodesizes[i] != 0) {
2178			seq_printf(seq, "  %u: %u",  i, stat->nodesizes[i]);
2179			pointers += (1<<i) * stat->nodesizes[i];
2180		}
2181	seq_putc(seq, '\n');
2182	seq_printf(seq, "\tPointers: %u\n", pointers);
2183
2184	bytes += sizeof(struct node *) * pointers;
2185	seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2186	seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
2187}
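/*
 * Worked example (illustrative): the average depth is kept as a fixed-point
 * value scaled by 100.  With totdepth = 250 and leaves = 100 the code above
 * computes avdepth = 250 * 100 / 100 = 250 and prints "Aver depth: 2.50".
 */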
2188
2189#ifdef CONFIG_IP_FIB_TRIE_STATS
2190static void trie_show_usage(struct seq_file *seq,
2191			    const struct trie_use_stats *stats)
2192{
2193	seq_printf(seq, "\nCounters:\n---------\n");
2194	seq_printf(seq, "gets = %u\n", stats->gets);
2195	seq_printf(seq, "backtracks = %u\n", stats->backtrack);
2196	seq_printf(seq, "semantic match passed = %u\n",
2197		   stats->semantic_match_passed);
2198	seq_printf(seq, "semantic match miss = %u\n",
2199		   stats->semantic_match_miss);
2200	seq_printf(seq, "null node hit = %u\n", stats->null_node_hit);
2201	seq_printf(seq, "skipped node resize = %u\n\n",
2202		   stats->resize_node_skipped);
2203}
2204#endif /*  CONFIG_IP_FIB_TRIE_STATS */
2205
2206static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
2207{
2208	if (tb->tb_id == RT_TABLE_LOCAL)
2209		seq_puts(seq, "Local:\n");
2210	else if (tb->tb_id == RT_TABLE_MAIN)
2211		seq_puts(seq, "Main:\n");
2212	else
2213		seq_printf(seq, "Id %d:\n", tb->tb_id);
2214}
2215
2216
2217static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2218{
2219	struct net *net = (struct net *)seq->private;
2220	unsigned int h;
2221
2222	seq_printf(seq,
2223		   "Basic info: size of leaf:"
2224		   " %Zd bytes, size of tnode: %Zd bytes.\n",
2225		   sizeof(struct leaf), sizeof(struct tnode));
2226
2227	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2228		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2229		struct hlist_node *node;
2230		struct fib_table *tb;
2231
2232		hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2233			struct trie *t = (struct trie *) tb->tb_data;
2234			struct trie_stat stat;
2235
2236			if (!t)
2237				continue;
2238
2239			fib_table_print(seq, tb);
2240
2241			trie_collect_stats(t, &stat);
2242			trie_show_stats(seq, &stat);
2243#ifdef CONFIG_IP_FIB_TRIE_STATS
2244			trie_show_usage(seq, &t->stats);
2245#endif
2246		}
2247	}
2248
2249	return 0;
2250}
2251
2252static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2253{
2254	return single_open_net(inode, file, fib_triestat_seq_show);
2255}
2256
2257static const struct file_operations fib_triestat_fops = {
2258	.owner	= THIS_MODULE,
2259	.open	= fib_triestat_seq_open,
2260	.read	= seq_read,
2261	.llseek	= seq_lseek,
2262	.release = single_release_net,
2263};
2264
2265static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2266{
2267	struct fib_trie_iter *iter = seq->private;
2268	struct net *net = seq_file_net(seq);
2269	loff_t idx = 0;
2270	unsigned int h;
2271
2272	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2273		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2274		struct hlist_node *node;
2275		struct fib_table *tb;
2276
2277		hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2278			struct node *n;
2279
2280			for (n = fib_trie_get_first(iter,
2281						    (struct trie *) tb->tb_data);
2282			     n; n = fib_trie_get_next(iter))
2283				if (pos == idx++) {
2284					iter->tb = tb;
2285					return n;
2286				}
2287		}
2288	}
2289
2290	return NULL;
2291}
2292
2293static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2294	__acquires(RCU)
2295{
2296	rcu_read_lock();
2297	return fib_trie_get_idx(seq, *pos);
2298}
2299
2300static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2301{
2302	struct fib_trie_iter *iter = seq->private;
2303	struct net *net = seq_file_net(seq);
2304	struct fib_table *tb = iter->tb;
2305	struct hlist_node *tb_node;
2306	unsigned int h;
2307	struct node *n;
2308
2309	++*pos;
2310	/* next node in same table */
2311	n = fib_trie_get_next(iter);
2312	if (n)
2313		return n;
2314
2315	/* walk rest of this hash chain */
2316	h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
2317	while ((tb_node = rcu_dereference(tb->tb_hlist.next)) != NULL) {
2318		tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
2319		n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2320		if (n)
2321			goto found;
2322	}
2323
2324	/* new hash chain */
2325	while (++h < FIB_TABLE_HASHSZ) {
2326		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2327		hlist_for_each_entry_rcu(tb, tb_node, head, tb_hlist) {
2328			n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2329			if (n)
2330				goto found;
2331		}
2332	}
2333	return NULL;
2334
2335found:
2336	iter->tb = tb;
2337	return n;
2338}
2339
2340static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2341	__releases(RCU)
2342{
2343	rcu_read_unlock();
2344}
2345
2346static void seq_indent(struct seq_file *seq, int n)
2347{
2348	while (n-- > 0) seq_puts(seq, "   ");
2349}
2350
2351static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2352{
2353	switch (s) {
2354	case RT_SCOPE_UNIVERSE: return "universe";
2355	case RT_SCOPE_SITE:	return "site";
2356	case RT_SCOPE_LINK:	return "link";
2357	case RT_SCOPE_HOST:	return "host";
2358	case RT_SCOPE_NOWHERE:	return "nowhere";
2359	default:
2360		snprintf(buf, len, "scope=%d", s);
2361		return buf;
2362	}
2363}
2364
2365static const char *rtn_type_names[__RTN_MAX] = {
2366	[RTN_UNSPEC] = "UNSPEC",
2367	[RTN_UNICAST] = "UNICAST",
2368	[RTN_LOCAL] = "LOCAL",
2369	[RTN_BROADCAST] = "BROADCAST",
2370	[RTN_ANYCAST] = "ANYCAST",
2371	[RTN_MULTICAST] = "MULTICAST",
2372	[RTN_BLACKHOLE] = "BLACKHOLE",
2373	[RTN_UNREACHABLE] = "UNREACHABLE",
2374	[RTN_PROHIBIT] = "PROHIBIT",
2375	[RTN_THROW] = "THROW",
2376	[RTN_NAT] = "NAT",
2377	[RTN_XRESOLVE] = "XRESOLVE",
2378};
2379
2380static inline const char *rtn_type(char *buf, size_t len, unsigned t)
2381{
2382	if (t < __RTN_MAX && rtn_type_names[t])
2383		return rtn_type_names[t];
2384	snprintf(buf, len, "type %u", t);
2385	return buf;
2386}
2387
2388/* Pretty print the trie */
2389static int fib_trie_seq_show(struct seq_file *seq, void *v)
2390{
2391	const struct fib_trie_iter *iter = seq->private;
2392	struct node *n = v;
2393
2394	if (!node_parent_rcu(n))
2395		fib_table_print(seq, iter->tb);
2396
2397	if (IS_TNODE(n)) {
2398		struct tnode *tn = (struct tnode *) n;
2399		__be32 prf = htonl(mask_pfx(tn->key, tn->pos));
2400
2401		seq_indent(seq, iter->depth-1);
2402		seq_printf(seq, "  +-- " NIPQUAD_FMT "/%d %d %d %d\n",
2403			   NIPQUAD(prf), tn->pos, tn->bits, tn->full_children,
2404			   tn->empty_children);
2405
2406	} else {
2407		struct leaf *l = (struct leaf *) n;
2408		struct leaf_info *li;
2409		struct hlist_node *node;
2410		__be32 val = htonl(l->key);
2411
2412		seq_indent(seq, iter->depth);
2413		seq_printf(seq, "  |-- " NIPQUAD_FMT "\n", NIPQUAD(val));
2414
2415		hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2416			struct fib_alias *fa;
2417
2418			list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2419				char buf1[32], buf2[32];
2420
2421				seq_indent(seq, iter->depth+1);
2422				seq_printf(seq, "  /%d %s %s", li->plen,
2423					   rtn_scope(buf1, sizeof(buf1),
2424						     fa->fa_scope),
2425					   rtn_type(buf2, sizeof(buf2),
2426						    fa->fa_type));
2427				if (fa->fa_tos)
2428					seq_printf(seq, " tos=%d", fa->fa_tos);
2429				seq_putc(seq, '\n');
2430			}
2431		}
2432	}
2433
2434	return 0;
2435}
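/*
 * Illustrative sample of the resulting /proc/net/fib_trie output (addresses
 * and counters are made up; the layout follows the seq_printf() calls above,
 * shown for a small trie whose root tnode has pos 0 and bits 2):
 *
 *	Local:
 *	  +-- 0.0.0.0/0 2 0 2
 *	     |-- 127.0.0.1
 *	        /32 host LOCAL
 *
 * Internal nodes print prefix/pos followed by bits, full_children and
 * empty_children; each leaf lists one "/plen scope type" line per alias.
 */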
2436
2437static const struct seq_operations fib_trie_seq_ops = {
2438	.start  = fib_trie_seq_start,
2439	.next   = fib_trie_seq_next,
2440	.stop   = fib_trie_seq_stop,
2441	.show   = fib_trie_seq_show,
2442};
2443
2444static int fib_trie_seq_open(struct inode *inode, struct file *file)
2445{
2446	return seq_open_net(inode, file, &fib_trie_seq_ops,
2447			    sizeof(struct fib_trie_iter));
2448}
2449
2450static const struct file_operations fib_trie_fops = {
2451	.owner  = THIS_MODULE,
2452	.open   = fib_trie_seq_open,
2453	.read   = seq_read,
2454	.llseek = seq_lseek,
2455	.release = seq_release_net,
2456};
2457
2458struct fib_route_iter {
2459	struct seq_net_private p;
2460	struct trie *main_trie;
2461	loff_t	pos;		/* leaf index corresponding to key */
2462	t_key	key;		/* key of that leaf, used to resume quickly */
2463};
2464
2465static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
2466{
2467	struct leaf *l = NULL;
2468	struct trie *t = iter->main_trie;
2469
2470	/* use cache location of last found key */
2471	if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
2472		pos -= iter->pos;
2473	else {
2474		iter->pos = 0;
2475		l = trie_firstleaf(t);
2476	}
2477
2478	while (l && pos-- > 0) {
2479		iter->pos++;
2480		l = trie_nextleaf(l);
2481	}
2482
2483	if (l)
2484		iter->key = l->key;	/* remember it */
2485	else
2486		iter->pos = 0;		/* forget it */
2487
2488	return l;
2489}
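/*
 * Illustrative note (not part of the original file): the pos/key pair acts
 * as a one-entry cache.  If the previous read stopped at leaf index 10
 * (iter->pos == 10, iter->key == that leaf's key), a request for index 12
 * re-finds the cached leaf with fib_find_node() and walks only two more
 * leaves instead of rescanning twelve from the start of the trie.
 */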
2490
2491static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2492	__acquires(RCU)
2493{
2494	struct fib_route_iter *iter = seq->private;
2495	struct fib_table *tb;
2496
2497	rcu_read_lock();
2498	tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
2499	if (!tb)
2500		return NULL;
2501
2502	iter->main_trie = (struct trie *) tb->tb_data;
2503	if (*pos == 0)
2504		return SEQ_START_TOKEN;
2505	else
2506		return fib_route_get_idx(iter, *pos - 1);
2507}
2508
2509static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2510{
2511	struct fib_route_iter *iter = seq->private;
2512	struct leaf *l = v;
2513
2514	++*pos;
2515	if (v == SEQ_START_TOKEN) {
2516		iter->pos = 0;
2517		l = trie_firstleaf(iter->main_trie);
2518	} else {
2519		iter->pos++;
2520		l = trie_nextleaf(l);
2521	}
2522
2523	if (l)
2524		iter->key = l->key;
2525	else
2526		iter->pos = 0;
2527	return l;
2528}
2529
2530static void fib_route_seq_stop(struct seq_file *seq, void *v)
2531	__releases(RCU)
2532{
2533	rcu_read_unlock();
2534}
2535
2536static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2537{
2538	static unsigned type2flags[RTN_MAX + 1] = {
2539		[RTN_UNREACHABLE] = RTF_REJECT, [RTN_PROHIBIT] = RTF_REJECT,
2540	};
2541	unsigned flags = type2flags[type];
2542
2543	if (fi && fi->fib_nh->nh_gw)
2544		flags |= RTF_GATEWAY;
2545	if (mask == htonl(0xFFFFFFFF))
2546		flags |= RTF_HOST;
2547	flags |= RTF_UP;
2548	return flags;
2549}
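/*
 * Illustrative examples (not part of the original file), using the classic
 * RTF_* values from <linux/route.h> (RTF_UP 0x0001, RTF_GATEWAY 0x0002,
 * RTF_HOST 0x0004, RTF_REJECT 0x0200):
 *
 *	default route via a gateway:	RTF_UP | RTF_GATEWAY = 0x0003
 *	host (/32) route:		RTF_UP | RTF_HOST    = 0x0005
 *	"unreachable" route:		RTF_UP | RTF_REJECT  = 0x0201
 *
 * These are the hex values shown in the Flags column of /proc/net/route
 * below.
 */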
2550
2551/*
2552 *	This outputs /proc/net/route.
2553 *	The format of the file is not supposed to be changed
2554 *	and needs to be the same as the fib_hash output to avoid breaking
2555 *	legacy utilities.
2556 */
2557static int fib_route_seq_show(struct seq_file *seq, void *v)
2558{
2559	struct leaf *l = v;
2560	struct leaf_info *li;
2561	struct hlist_node *node;
2562
2563	if (v == SEQ_START_TOKEN) {
2564		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2565			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2566			   "\tWindow\tIRTT");
2567		return 0;
2568	}
2569
2570	hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2571		struct fib_alias *fa;
2572		__be32 mask, prefix;
2573
2574		mask = inet_make_mask(li->plen);
2575		prefix = htonl(l->key);
2576
2577		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2578			const struct fib_info *fi = fa->fa_info;
2579			unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
2580			int len;
2581
2582			if (fa->fa_type == RTN_BROADCAST
2583			    || fa->fa_type == RTN_MULTICAST)
2584				continue;
2585
2586			if (fi)
2587				seq_printf(seq,
2588					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
2589					 "%d\t%08X\t%d\t%u\t%u%n",
2590					 fi->fib_dev ? fi->fib_dev->name : "*",
2591					 prefix,
2592					 fi->fib_nh->nh_gw, flags, 0, 0,
2593					 fi->fib_priority,
2594					 mask,
2595					 (fi->fib_advmss ?
2596					  fi->fib_advmss + 40 : 0),
2597					 fi->fib_window,
2598					 fi->fib_rtt >> 3, &len);
2599			else
2600				seq_printf(seq,
2601					 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
2602					 "%d\t%08X\t%d\t%u\t%u%n",
2603					 prefix, 0, flags, 0, 0, 0,
2604					 mask, 0, 0, 0, &len);
2605
2606			seq_printf(seq, "%*s\n", 127 - len, "");
2607		}
2608	}
2609
2610	return 0;
2611}
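/*
 * Illustrative sample of /proc/net/route output on a little-endian machine
 * (addresses made up; each %08X field is the raw __be32 value, so the bytes
 * read "backwards"):
 *
 * Iface	Destination	Gateway 	Flags	RefCnt	Use	Metric	Mask		MTU	Window	IRTT
 * eth0	00000000	0101A8C0	0003	0	0	0	00000000	0	0	0
 *
 * i.e. the default route (0.0.0.0/0) via 192.168.1.1.  The MTU column is
 * reconstructed from the advertised MSS by adding 40 bytes of IPv4 + TCP
 * header overhead, as done in the fi->fib_advmss branch above.
 */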
2612
2613static const struct seq_operations fib_route_seq_ops = {
2614	.start  = fib_route_seq_start,
2615	.next   = fib_route_seq_next,
2616	.stop   = fib_route_seq_stop,
2617	.show   = fib_route_seq_show,
2618};
2619
2620static int fib_route_seq_open(struct inode *inode, struct file *file)
2621{
2622	return seq_open_net(inode, file, &fib_route_seq_ops,
2623			    sizeof(struct fib_route_iter));
2624}
2625
2626static const struct file_operations fib_route_fops = {
2627	.owner  = THIS_MODULE,
2628	.open   = fib_route_seq_open,
2629	.read   = seq_read,
2630	.llseek = seq_lseek,
2631	.release = seq_release_net,
2632};
2633
2634int __net_init fib_proc_init(struct net *net)
2635{
2636	if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
2637		goto out1;
2638
2639	if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
2640				  &fib_triestat_fops))
2641		goto out2;
2642
2643	if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
2644		goto out3;
2645
2646	return 0;
2647
2648out3:
2649	proc_net_remove(net, "fib_triestat");
2650out2:
2651	proc_net_remove(net, "fib_trie");
2652out1:
2653	return -ENOMEM;
2654}
2655
2656void __net_exit fib_proc_exit(struct net *net)
2657{
2658	proc_net_remove(net, "fib_trie");
2659	proc_net_remove(net, "fib_triestat");
2660	proc_net_remove(net, "route");
2661}
2662
2663#endif /* CONFIG_PROC_FS */
2664