xfrm_policy.c revision c4028958b6ecad064b1a6303a6a5906d4fe48d73
/*
 * xfrm_policy.c
 *
 * Changes:
 *	Mitsuru KANDA @USAGI
 * 	Kazunori MIYAZAWA @USAGI
 * 	Kunihiro Ishiguro <kunihiro@ipinfusion.com>
 * 		IPv6 support
 * 	Kazunori MIYAZAWA @USAGI
 * 	YOSHIFUJI Hideaki
 * 		Split up af-specific portion
 *	Derek Atkins <derek@ihtfp.com>		Add the post_input processor
 *
 */

#include <linux/slab.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>
#include <linux/notifier.h>
#include <linux/netdevice.h>
#include <linux/netfilter.h>
#include <linux/module.h>
#include <linux/cache.h>
#include <net/xfrm.h>
#include <net/ip.h>

#include "xfrm_hash.h"

DEFINE_MUTEX(xfrm_cfg_mutex);
EXPORT_SYMBOL(xfrm_cfg_mutex);

static DEFINE_RWLOCK(xfrm_policy_lock);

unsigned int xfrm_policy_count[XFRM_POLICY_MAX*2];
EXPORT_SYMBOL(xfrm_policy_count);

static DEFINE_RWLOCK(xfrm_policy_afinfo_lock);
static struct xfrm_policy_afinfo *xfrm_policy_afinfo[NPROTO];

static kmem_cache_t *xfrm_dst_cache __read_mostly;

static struct work_struct xfrm_policy_gc_work;
static HLIST_HEAD(xfrm_policy_gc_list);
static DEFINE_SPINLOCK(xfrm_policy_gc_lock);

static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family);
static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo);
static struct xfrm_policy_afinfo *xfrm_policy_lock_afinfo(unsigned int family);
static void xfrm_policy_unlock_afinfo(struct xfrm_policy_afinfo *afinfo);

int xfrm_register_type(struct xfrm_type *type, unsigned short family)
{
	struct xfrm_policy_afinfo *afinfo = xfrm_policy_lock_afinfo(family);
	struct xfrm_type **typemap;
	int err = 0;

	if (unlikely(afinfo == NULL))
		return -EAFNOSUPPORT;
	typemap = afinfo->type_map;

	if (likely(typemap[type->proto] == NULL))
		typemap[type->proto] = type;
	else
		err = -EEXIST;
	xfrm_policy_unlock_afinfo(afinfo);
	return err;
}
EXPORT_SYMBOL(xfrm_register_type);
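
/* Usage sketch (illustrative, not part of this file): a protocol module
 * registers its xfrm_type at init time and unregisters it on exit. The
 * identifiers below (my_esp_type and friends) are hypothetical; the real
 * in-tree examples are the esp4/ah4 modules.
 *
 *	static struct xfrm_type my_esp_type = {
 *		.description	= "MY-ESP4",
 *		.owner		= THIS_MODULE,
 *		.proto		= IPPROTO_ESP,
 *		.init_state	= my_esp_init_state,
 *		.destructor	= my_esp_destroy,
 *		.input		= my_esp_input,
 *		.output		= my_esp_output,
 *	};
 *
 *	static int __init my_esp_init(void)
 *	{
 *		// -EEXIST if another type already claims IPPROTO_ESP
 *		return xfrm_register_type(&my_esp_type, AF_INET);
 *	}
 *
 *	static void __exit my_esp_exit(void)
 *	{
 *		xfrm_unregister_type(&my_esp_type, AF_INET);
 *	}
 */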

int xfrm_unregister_type(struct xfrm_type *type, unsigned short family)
{
	struct xfrm_policy_afinfo *afinfo = xfrm_policy_lock_afinfo(family);
	struct xfrm_type **typemap;
	int err = 0;

	if (unlikely(afinfo == NULL))
		return -EAFNOSUPPORT;
	typemap = afinfo->type_map;

	if (unlikely(typemap[type->proto] != type))
		err = -ENOENT;
	else
		typemap[type->proto] = NULL;
	xfrm_policy_unlock_afinfo(afinfo);
	return err;
}
EXPORT_SYMBOL(xfrm_unregister_type);

struct xfrm_type *xfrm_get_type(u8 proto, unsigned short family)
{
	struct xfrm_policy_afinfo *afinfo;
	struct xfrm_type **typemap;
	struct xfrm_type *type;
	int modload_attempted = 0;

retry:
	afinfo = xfrm_policy_get_afinfo(family);
	if (unlikely(afinfo == NULL))
		return NULL;
	typemap = afinfo->type_map;

	type = typemap[proto];
	if (unlikely(type && !try_module_get(type->owner)))
		type = NULL;
	if (!type && !modload_attempted) {
		xfrm_policy_put_afinfo(afinfo);
		request_module("xfrm-type-%d-%d",
			       (int) family, (int) proto);
		modload_attempted = 1;
		goto retry;
	}

	xfrm_policy_put_afinfo(afinfo);
	return type;
}
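
/* Note on the request_module() format string above: with AF_INET (2) and
 * IPPROTO_ESP (50), the modprobe request is "xfrm-type-2-50". A sketch of
 * how a module could make itself loadable on demand (hypothetical module,
 * but the alias scheme follows the format string used here):
 *
 *	MODULE_ALIAS("xfrm-type-2-50");
 */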

int xfrm_dst_lookup(struct xfrm_dst **dst, struct flowi *fl,
		    unsigned short family)
{
	struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
	int err = 0;

	if (unlikely(afinfo == NULL))
		return -EAFNOSUPPORT;

	if (likely(afinfo->dst_lookup != NULL))
		err = afinfo->dst_lookup(dst, fl);
	else
		err = -EINVAL;
	xfrm_policy_put_afinfo(afinfo);
	return err;
}
EXPORT_SYMBOL(xfrm_dst_lookup);

void xfrm_put_type(struct xfrm_type *type)
{
	module_put(type->owner);
}

int xfrm_register_mode(struct xfrm_mode *mode, int family)
{
	struct xfrm_policy_afinfo *afinfo;
	struct xfrm_mode **modemap;
	int err;

	if (unlikely(mode->encap >= XFRM_MODE_MAX))
		return -EINVAL;

	afinfo = xfrm_policy_lock_afinfo(family);
	if (unlikely(afinfo == NULL))
		return -EAFNOSUPPORT;

	err = -EEXIST;
	modemap = afinfo->mode_map;
	if (likely(modemap[mode->encap] == NULL)) {
		modemap[mode->encap] = mode;
		err = 0;
	}

	xfrm_policy_unlock_afinfo(afinfo);
	return err;
}
EXPORT_SYMBOL(xfrm_register_mode);

int xfrm_unregister_mode(struct xfrm_mode *mode, int family)
{
	struct xfrm_policy_afinfo *afinfo;
	struct xfrm_mode **modemap;
	int err;

	if (unlikely(mode->encap >= XFRM_MODE_MAX))
		return -EINVAL;

	afinfo = xfrm_policy_lock_afinfo(family);
	if (unlikely(afinfo == NULL))
		return -EAFNOSUPPORT;

	err = -ENOENT;
	modemap = afinfo->mode_map;
	if (likely(modemap[mode->encap] == mode)) {
		modemap[mode->encap] = NULL;
		err = 0;
	}

	xfrm_policy_unlock_afinfo(afinfo);
	return err;
}
EXPORT_SYMBOL(xfrm_unregister_mode);

struct xfrm_mode *xfrm_get_mode(unsigned int encap, int family)
{
	struct xfrm_policy_afinfo *afinfo;
	struct xfrm_mode *mode;
	int modload_attempted = 0;

	if (unlikely(encap >= XFRM_MODE_MAX))
		return NULL;

retry:
	afinfo = xfrm_policy_get_afinfo(family);
	if (unlikely(afinfo == NULL))
		return NULL;

	mode = afinfo->mode_map[encap];
	if (unlikely(mode && !try_module_get(mode->owner)))
		mode = NULL;
	if (!mode && !modload_attempted) {
		xfrm_policy_put_afinfo(afinfo);
		request_module("xfrm-mode-%d-%d", family, encap);
		modload_attempted = 1;
		goto retry;
	}

	xfrm_policy_put_afinfo(afinfo);
	return mode;
}

void xfrm_put_mode(struct xfrm_mode *mode)
{
	module_put(mode->owner);
}
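
/* Mode handling mirrors type handling. A minimal sketch, assuming a
 * hypothetical module providing an IPv4 mode; the on-demand alias follows
 * the "xfrm-mode-%d-%d" format string used in xfrm_get_mode() (for
 * AF_INET (2) and XFRM_MODE_TUNNEL (1) that is "xfrm-mode-2-1"):
 *
 *	static struct xfrm_mode my_mode = {
 *		.owner	= THIS_MODULE,
 *		.encap	= XFRM_MODE_TUNNEL,
 *		.input	= my_mode_input,
 *		.output	= my_mode_output,
 *	};
 *
 *	err = xfrm_register_mode(&my_mode, AF_INET);
 *
 * Callers of xfrm_get_mode() hold a module reference on success and must
 * drop it with xfrm_put_mode() when done.
 */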

static inline unsigned long make_jiffies(long secs)
{
	if (secs >= (MAX_SCHEDULE_TIMEOUT-1)/HZ)
		return MAX_SCHEDULE_TIMEOUT-1;
	else
		return secs*HZ;
}
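
/* Worked example: with HZ=1000, MAX_SCHEDULE_TIMEOUT is LONG_MAX, so any
 * secs >= (LONG_MAX-1)/1000 is clamped to MAX_SCHEDULE_TIMEOUT-1; smaller
 * values convert directly, e.g. make_jiffies(5) == 5000 jiffies.
 */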

static void xfrm_policy_timer(unsigned long data)
{
	struct xfrm_policy *xp = (struct xfrm_policy*)data;
	unsigned long now = (unsigned long)xtime.tv_sec;
	long next = LONG_MAX;
	int warn = 0;
	int dir;

	read_lock(&xp->lock);

	if (xp->dead)
		goto out;

	dir = xfrm_policy_id2dir(xp->index);

	if (xp->lft.hard_add_expires_seconds) {
		long tmo = xp->lft.hard_add_expires_seconds +
			xp->curlft.add_time - now;
		if (tmo <= 0)
			goto expired;
		if (tmo < next)
			next = tmo;
	}
	if (xp->lft.hard_use_expires_seconds) {
		long tmo = xp->lft.hard_use_expires_seconds +
			(xp->curlft.use_time ? : xp->curlft.add_time) - now;
		if (tmo <= 0)
			goto expired;
		if (tmo < next)
			next = tmo;
	}
	if (xp->lft.soft_add_expires_seconds) {
		long tmo = xp->lft.soft_add_expires_seconds +
			xp->curlft.add_time - now;
		if (tmo <= 0) {
			warn = 1;
			tmo = XFRM_KM_TIMEOUT;
		}
		if (tmo < next)
			next = tmo;
	}
	if (xp->lft.soft_use_expires_seconds) {
		long tmo = xp->lft.soft_use_expires_seconds +
			(xp->curlft.use_time ? : xp->curlft.add_time) - now;
		if (tmo <= 0) {
			warn = 1;
			tmo = XFRM_KM_TIMEOUT;
		}
		if (tmo < next)
			next = tmo;
	}

	if (warn)
		km_policy_expired(xp, dir, 0, 0);
	if (next != LONG_MAX &&
	    !mod_timer(&xp->timer, jiffies + make_jiffies(next)))
		xfrm_pol_hold(xp);

out:
	read_unlock(&xp->lock);
	xfrm_pol_put(xp);
	return;

expired:
	read_unlock(&xp->lock);
	if (!xfrm_policy_delete(xp, dir))
		km_policy_expired(xp, dir, 1, 0);
	xfrm_pol_put(xp);
}


/* Allocate xfrm_policy. Not used here; it is supposed to be used by pfkeyv2
 * SPD calls.
 */

struct xfrm_policy *xfrm_policy_alloc(gfp_t gfp)
{
	struct xfrm_policy *policy;

	policy = kzalloc(sizeof(struct xfrm_policy), gfp);

	if (policy) {
		INIT_HLIST_NODE(&policy->bydst);
		INIT_HLIST_NODE(&policy->byidx);
		rwlock_init(&policy->lock);
		atomic_set(&policy->refcnt, 1);
		init_timer(&policy->timer);
		policy->timer.data = (unsigned long)policy;
		policy->timer.function = xfrm_policy_timer;
	}
	return policy;
}
EXPORT_SYMBOL(xfrm_policy_alloc);
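
/* A minimal allocation sketch (illustrative): the returned policy comes
 * with refcnt 1 and an initialized-but-unarmed timer, so it can be handed
 * straight to xfrm_policy_insert(), which arms the timer itself:
 *
 *	struct xfrm_policy *xp = xfrm_policy_alloc(GFP_KERNEL);
 *	if (!xp)
 *		return -ENOMEM;
 *	// fill in xp->selector, xp->action, xp->lft, ... here
 *	err = xfrm_policy_insert(XFRM_POLICY_OUT, xp, 1);
 */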

/* Destroy xfrm_policy: descendant resources must have been released
 * by this point. */

void __xfrm_policy_destroy(struct xfrm_policy *policy)
{
	BUG_ON(!policy->dead);

	BUG_ON(policy->bundles);

	if (del_timer(&policy->timer))
		BUG();

	security_xfrm_policy_free(policy);
	kfree(policy);
}
EXPORT_SYMBOL(__xfrm_policy_destroy);

static void xfrm_policy_gc_kill(struct xfrm_policy *policy)
{
	struct dst_entry *dst;

	while ((dst = policy->bundles) != NULL) {
		policy->bundles = dst->next;
		dst_free(dst);
	}

	if (del_timer(&policy->timer))
		atomic_dec(&policy->refcnt);

	if (atomic_read(&policy->refcnt) > 1)
		flow_cache_flush();

	xfrm_pol_put(policy);
}

static void xfrm_policy_gc_task(struct work_struct *work)
{
	struct xfrm_policy *policy;
	struct hlist_node *entry, *tmp;
	struct hlist_head gc_list;

	spin_lock_bh(&xfrm_policy_gc_lock);
	gc_list.first = xfrm_policy_gc_list.first;
	INIT_HLIST_HEAD(&xfrm_policy_gc_list);
	spin_unlock_bh(&xfrm_policy_gc_lock);

	hlist_for_each_entry_safe(policy, entry, tmp, &gc_list, bydst)
		xfrm_policy_gc_kill(policy);
}

/* Rule must be locked. Release descendant resources, announce
 * entry dead. The rule must already be unlinked from the lists.
 */

static void xfrm_policy_kill(struct xfrm_policy *policy)
{
	int dead;

	write_lock_bh(&policy->lock);
	dead = policy->dead;
	policy->dead = 1;
	write_unlock_bh(&policy->lock);

	if (unlikely(dead)) {
		WARN_ON(1);
		return;
	}

	spin_lock(&xfrm_policy_gc_lock);
	hlist_add_head(&policy->bydst, &xfrm_policy_gc_list);
	spin_unlock(&xfrm_policy_gc_lock);

	schedule_work(&xfrm_policy_gc_work);
}

struct xfrm_policy_hash {
	struct hlist_head	*table;
	unsigned int		hmask;
};

static struct hlist_head xfrm_policy_inexact[XFRM_POLICY_MAX*2];
static struct xfrm_policy_hash xfrm_policy_bydst[XFRM_POLICY_MAX*2] __read_mostly;
static struct hlist_head *xfrm_policy_byidx __read_mostly;
static unsigned int xfrm_idx_hmask __read_mostly;
static unsigned int xfrm_policy_hashmax __read_mostly = 1 * 1024 * 1024;

static inline unsigned int idx_hash(u32 index)
{
	return __idx_hash(index, xfrm_idx_hmask);
}

static struct hlist_head *policy_hash_bysel(struct xfrm_selector *sel, unsigned short family, int dir)
{
	unsigned int hmask = xfrm_policy_bydst[dir].hmask;
	unsigned int hash = __sel_hash(sel, family, hmask);

	return (hash == hmask + 1 ?
		&xfrm_policy_inexact[dir] :
		xfrm_policy_bydst[dir].table + hash);
}

static struct hlist_head *policy_hash_direct(xfrm_address_t *daddr, xfrm_address_t *saddr, unsigned short family, int dir)
{
	unsigned int hmask = xfrm_policy_bydst[dir].hmask;
	unsigned int hash = __addr_hash(daddr, saddr, family, hmask);

	return xfrm_policy_bydst[dir].table + hash;
}

static void xfrm_dst_hash_transfer(struct hlist_head *list,
				   struct hlist_head *ndsttable,
				   unsigned int nhashmask)
{
	struct hlist_node *entry, *tmp;
	struct xfrm_policy *pol;

	hlist_for_each_entry_safe(pol, entry, tmp, list, bydst) {
		unsigned int h;

		h = __addr_hash(&pol->selector.daddr, &pol->selector.saddr,
				pol->family, nhashmask);
		hlist_add_head(&pol->bydst, ndsttable+h);
	}
}

static void xfrm_idx_hash_transfer(struct hlist_head *list,
				   struct hlist_head *nidxtable,
				   unsigned int nhashmask)
{
	struct hlist_node *entry, *tmp;
	struct xfrm_policy *pol;

	hlist_for_each_entry_safe(pol, entry, tmp, list, byidx) {
		unsigned int h;

		h = __idx_hash(pol->index, nhashmask);
		hlist_add_head(&pol->byidx, nidxtable+h);
	}
}

static unsigned long xfrm_new_hash_mask(unsigned int old_hmask)
{
	return ((old_hmask + 1) << 1) - 1;
}
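
/* Worked example: the table doubles on each resize, e.g. hmask 7 (8
 * buckets) -> nhashmask 15 (16 buckets). xfrm_bydst_should_resize()
 * below triggers this once the policy count exceeds hmask, bounded
 * above by xfrm_policy_hashmax.
 */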

static void xfrm_bydst_resize(int dir)
{
	unsigned int hmask = xfrm_policy_bydst[dir].hmask;
	unsigned int nhashmask = xfrm_new_hash_mask(hmask);
	unsigned int nsize = (nhashmask + 1) * sizeof(struct hlist_head);
	struct hlist_head *odst = xfrm_policy_bydst[dir].table;
	struct hlist_head *ndst = xfrm_hash_alloc(nsize);
	int i;

	if (!ndst)
		return;

	write_lock_bh(&xfrm_policy_lock);

	for (i = hmask; i >= 0; i--)
		xfrm_dst_hash_transfer(odst + i, ndst, nhashmask);

	xfrm_policy_bydst[dir].table = ndst;
	xfrm_policy_bydst[dir].hmask = nhashmask;

	write_unlock_bh(&xfrm_policy_lock);

	xfrm_hash_free(odst, (hmask + 1) * sizeof(struct hlist_head));
}

static void xfrm_byidx_resize(int total)
{
	unsigned int hmask = xfrm_idx_hmask;
	unsigned int nhashmask = xfrm_new_hash_mask(hmask);
	unsigned int nsize = (nhashmask + 1) * sizeof(struct hlist_head);
	struct hlist_head *oidx = xfrm_policy_byidx;
	struct hlist_head *nidx = xfrm_hash_alloc(nsize);
	int i;

	if (!nidx)
		return;

	write_lock_bh(&xfrm_policy_lock);

	for (i = hmask; i >= 0; i--)
		xfrm_idx_hash_transfer(oidx + i, nidx, nhashmask);

	xfrm_policy_byidx = nidx;
	xfrm_idx_hmask = nhashmask;

	write_unlock_bh(&xfrm_policy_lock);

	xfrm_hash_free(oidx, (hmask + 1) * sizeof(struct hlist_head));
}

static inline int xfrm_bydst_should_resize(int dir, int *total)
{
	unsigned int cnt = xfrm_policy_count[dir];
	unsigned int hmask = xfrm_policy_bydst[dir].hmask;

	if (total)
		*total += cnt;

	if ((hmask + 1) < xfrm_policy_hashmax &&
	    cnt > hmask)
		return 1;

	return 0;
}

static inline int xfrm_byidx_should_resize(int total)
{
	unsigned int hmask = xfrm_idx_hmask;

	if ((hmask + 1) < xfrm_policy_hashmax &&
	    total > hmask)
		return 1;

	return 0;
}

static DEFINE_MUTEX(hash_resize_mutex);

static void xfrm_hash_resize(struct work_struct *__unused)
{
	int dir, total;

	mutex_lock(&hash_resize_mutex);

	total = 0;
	for (dir = 0; dir < XFRM_POLICY_MAX * 2; dir++) {
		if (xfrm_bydst_should_resize(dir, &total))
			xfrm_bydst_resize(dir);
	}
	if (xfrm_byidx_should_resize(total))
		xfrm_byidx_resize(total);

	mutex_unlock(&hash_resize_mutex);
}

static DECLARE_WORK(xfrm_hash_work, xfrm_hash_resize);

/* Generate a new index... KAME seems to generate them ordered by cost,
 * at the price of completely unpredictable rule ordering. That will not
 * do here. */
static u32 xfrm_gen_index(u8 type, int dir)
{
	static u32 idx_generator;

	for (;;) {
		struct hlist_node *entry;
		struct hlist_head *list;
		struct xfrm_policy *p;
		u32 idx;
		int found;

		idx = (idx_generator | dir);
		idx_generator += 8;
		if (idx == 0)
			idx = 8;
		list = xfrm_policy_byidx + idx_hash(idx);
		found = 0;
		hlist_for_each_entry(p, entry, list, byidx) {
			if (p->index == idx) {
				found = 1;
				break;
			}
		}
		if (!found)
			return idx;
	}
}
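
/* Worked example: idx_generator advances in steps of 8, and the low bits
 * carry the direction, so e.g. dir == XFRM_POLICY_OUT (1) yields the
 * candidate sequence 1, 9, 17, ... and xfrm_policy_id2dir() can recover
 * the direction from the index alone. Candidates already present in the
 * byidx hash are skipped.
 */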

static inline int selector_cmp(struct xfrm_selector *s1, struct xfrm_selector *s2)
{
	u32 *p1 = (u32 *) s1;
	u32 *p2 = (u32 *) s2;
	int len = sizeof(struct xfrm_selector) / sizeof(u32);
	int i;

	for (i = 0; i < len; i++) {
		if (p1[i] != p2[i])
			return 1;
	}

	return 0;
}

int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
{
	struct xfrm_policy *pol;
	struct xfrm_policy *delpol;
	struct hlist_head *chain;
	struct hlist_node *entry, *newpos, *last;
	struct dst_entry *gc_list;

	write_lock_bh(&xfrm_policy_lock);
	chain = policy_hash_bysel(&policy->selector, policy->family, dir);
	delpol = NULL;
	newpos = NULL;
	last = NULL;
	hlist_for_each_entry(pol, entry, chain, bydst) {
		if (!delpol &&
		    pol->type == policy->type &&
		    !selector_cmp(&pol->selector, &policy->selector) &&
		    xfrm_sec_ctx_match(pol->security, policy->security)) {
			if (excl) {
				write_unlock_bh(&xfrm_policy_lock);
				return -EEXIST;
			}
			delpol = pol;
			if (policy->priority > pol->priority)
				continue;
		} else if (policy->priority >= pol->priority) {
			last = &pol->bydst;
			continue;
		}
		if (!newpos)
			newpos = &pol->bydst;
		if (delpol)
			break;
		last = &pol->bydst;
	}
	if (!newpos)
		newpos = last;
	if (newpos)
		hlist_add_after(newpos, &policy->bydst);
	else
		hlist_add_head(&policy->bydst, chain);
	xfrm_pol_hold(policy);
	xfrm_policy_count[dir]++;
	atomic_inc(&flow_cache_genid);
	if (delpol) {
		hlist_del(&delpol->bydst);
		hlist_del(&delpol->byidx);
		xfrm_policy_count[dir]--;
	}
	policy->index = delpol ? delpol->index : xfrm_gen_index(policy->type, dir);
	hlist_add_head(&policy->byidx, xfrm_policy_byidx+idx_hash(policy->index));
	policy->curlft.add_time = (unsigned long)xtime.tv_sec;
	policy->curlft.use_time = 0;
	if (!mod_timer(&policy->timer, jiffies + HZ))
		xfrm_pol_hold(policy);
	write_unlock_bh(&xfrm_policy_lock);

	if (delpol)
		xfrm_policy_kill(delpol);
	else if (xfrm_bydst_should_resize(dir, NULL))
		schedule_work(&xfrm_hash_work);

	read_lock_bh(&xfrm_policy_lock);
	gc_list = NULL;
	entry = &policy->bydst;
	hlist_for_each_entry_continue(policy, entry, bydst) {
		struct dst_entry *dst;

		write_lock(&policy->lock);
		dst = policy->bundles;
		if (dst) {
			struct dst_entry *tail = dst;
			while (tail->next)
				tail = tail->next;
			tail->next = gc_list;
			gc_list = dst;

			policy->bundles = NULL;
		}
		write_unlock(&policy->lock);
	}
	read_unlock_bh(&xfrm_policy_lock);

	while (gc_list) {
		struct dst_entry *dst = gc_list;

		gc_list = dst->next;
		dst_free(dst);
	}

	return 0;
}
EXPORT_SYMBOL(xfrm_policy_insert);
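
/* Usage note (illustrative): an exact duplicate (same type, selector and
 * security context) replaces the existing entry and inherits its index;
 * with excl set the insert fails instead:
 *
 *	err = xfrm_policy_insert(XFRM_POLICY_OUT, xp, 1);
 *	if (err == -EEXIST)
 *		;	// an equivalent policy is already installed
 */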

struct xfrm_policy *xfrm_policy_bysel_ctx(u8 type, int dir,
					  struct xfrm_selector *sel,
					  struct xfrm_sec_ctx *ctx, int delete)
{
	struct xfrm_policy *pol, *ret;
	struct hlist_head *chain;
	struct hlist_node *entry;

	write_lock_bh(&xfrm_policy_lock);
	chain = policy_hash_bysel(sel, sel->family, dir);
	ret = NULL;
	hlist_for_each_entry(pol, entry, chain, bydst) {
		if (pol->type == type &&
		    !selector_cmp(sel, &pol->selector) &&
		    xfrm_sec_ctx_match(ctx, pol->security)) {
			xfrm_pol_hold(pol);
			if (delete) {
				hlist_del(&pol->bydst);
				hlist_del(&pol->byidx);
				xfrm_policy_count[dir]--;
			}
			ret = pol;
			break;
		}
	}
	write_unlock_bh(&xfrm_policy_lock);

	if (ret && delete) {
		atomic_inc(&flow_cache_genid);
		xfrm_policy_kill(ret);
	}
	return ret;
}
EXPORT_SYMBOL(xfrm_policy_bysel_ctx);

struct xfrm_policy *xfrm_policy_byid(u8 type, int dir, u32 id, int delete)
{
	struct xfrm_policy *pol, *ret;
	struct hlist_head *chain;
	struct hlist_node *entry;

	write_lock_bh(&xfrm_policy_lock);
	chain = xfrm_policy_byidx + idx_hash(id);
	ret = NULL;
	hlist_for_each_entry(pol, entry, chain, byidx) {
		if (pol->type == type && pol->index == id) {
			xfrm_pol_hold(pol);
			if (delete) {
				hlist_del(&pol->bydst);
				hlist_del(&pol->byidx);
				xfrm_policy_count[dir]--;
			}
			ret = pol;
			break;
		}
	}
	write_unlock_bh(&xfrm_policy_lock);

	if (ret && delete) {
		atomic_inc(&flow_cache_genid);
		xfrm_policy_kill(ret);
	}
	return ret;
}
EXPORT_SYMBOL(xfrm_policy_byid);

void xfrm_policy_flush(u8 type)
{
	int dir;

	write_lock_bh(&xfrm_policy_lock);
	for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
		struct xfrm_policy *pol;
		struct hlist_node *entry;
		int i, killed;

		killed = 0;
	again1:
		hlist_for_each_entry(pol, entry,
				     &xfrm_policy_inexact[dir], bydst) {
			if (pol->type != type)
				continue;
			hlist_del(&pol->bydst);
			hlist_del(&pol->byidx);
			write_unlock_bh(&xfrm_policy_lock);

			xfrm_policy_kill(pol);
			killed++;

			write_lock_bh(&xfrm_policy_lock);
			goto again1;
		}

		for (i = xfrm_policy_bydst[dir].hmask; i >= 0; i--) {
	again2:
			hlist_for_each_entry(pol, entry,
					     xfrm_policy_bydst[dir].table + i,
					     bydst) {
				if (pol->type != type)
					continue;
				hlist_del(&pol->bydst);
				hlist_del(&pol->byidx);
				write_unlock_bh(&xfrm_policy_lock);

				xfrm_policy_kill(pol);
				killed++;

				write_lock_bh(&xfrm_policy_lock);
				goto again2;
			}
		}

		xfrm_policy_count[dir] -= killed;
	}
	atomic_inc(&flow_cache_genid);
	write_unlock_bh(&xfrm_policy_lock);
}
EXPORT_SYMBOL(xfrm_policy_flush);

int xfrm_policy_walk(u8 type, int (*func)(struct xfrm_policy *, int, int, void*),
		     void *data)
{
	struct xfrm_policy *pol;
	struct hlist_node *entry;
	int dir, count, error;

	read_lock_bh(&xfrm_policy_lock);
	count = 0;
	for (dir = 0; dir < 2*XFRM_POLICY_MAX; dir++) {
		struct hlist_head *table = xfrm_policy_bydst[dir].table;
		int i;

		hlist_for_each_entry(pol, entry,
				     &xfrm_policy_inexact[dir], bydst) {
			if (pol->type == type)
				count++;
		}
		for (i = xfrm_policy_bydst[dir].hmask; i >= 0; i--) {
			hlist_for_each_entry(pol, entry, table + i, bydst) {
				if (pol->type == type)
					count++;
			}
		}
	}

	if (count == 0) {
		error = -ENOENT;
		goto out;
	}

	for (dir = 0; dir < 2*XFRM_POLICY_MAX; dir++) {
		struct hlist_head *table = xfrm_policy_bydst[dir].table;
		int i;

		hlist_for_each_entry(pol, entry,
				     &xfrm_policy_inexact[dir], bydst) {
			if (pol->type != type)
				continue;
			error = func(pol, dir % XFRM_POLICY_MAX, --count, data);
			if (error)
				goto out;
		}
		for (i = xfrm_policy_bydst[dir].hmask; i >= 0; i--) {
			hlist_for_each_entry(pol, entry, table + i, bydst) {
				if (pol->type != type)
					continue;
				error = func(pol, dir % XFRM_POLICY_MAX, --count, data);
				if (error)
					goto out;
			}
		}
	}
	error = 0;
out:
	read_unlock_bh(&xfrm_policy_lock);
	return error;
}
EXPORT_SYMBOL(xfrm_policy_walk);

/*
 * Find policy to apply to this flow.
 *
 * Returns 0 if policy found, otherwise a negative errno.
 */
static int xfrm_policy_match(struct xfrm_policy *pol, struct flowi *fl,
			     u8 type, u16 family, int dir)
{
	struct xfrm_selector *sel = &pol->selector;
	int match, ret = -ESRCH;

	if (pol->family != family ||
	    pol->type != type)
		return ret;

	match = xfrm_selector_match(sel, fl, family);
	if (match)
		ret = security_xfrm_policy_lookup(pol, fl->secid, dir);

	return ret;
}

static struct xfrm_policy *xfrm_policy_lookup_bytype(u8 type, struct flowi *fl,
						     u16 family, u8 dir)
{
	int err;
	struct xfrm_policy *pol, *ret;
	xfrm_address_t *daddr, *saddr;
	struct hlist_node *entry;
	struct hlist_head *chain;
	u32 priority = ~0U;

	daddr = xfrm_flowi_daddr(fl, family);
	saddr = xfrm_flowi_saddr(fl, family);
	if (unlikely(!daddr || !saddr))
		return NULL;

	read_lock_bh(&xfrm_policy_lock);
	chain = policy_hash_direct(daddr, saddr, family, dir);
	ret = NULL;
	hlist_for_each_entry(pol, entry, chain, bydst) {
		err = xfrm_policy_match(pol, fl, type, family, dir);
		if (err) {
			if (err == -ESRCH)
				continue;
			else {
				ret = ERR_PTR(err);
				goto fail;
			}
		} else {
			ret = pol;
			priority = ret->priority;
			break;
		}
	}
	chain = &xfrm_policy_inexact[dir];
	hlist_for_each_entry(pol, entry, chain, bydst) {
		err = xfrm_policy_match(pol, fl, type, family, dir);
		if (err) {
			if (err == -ESRCH)
				continue;
			else {
				ret = ERR_PTR(err);
				goto fail;
			}
		} else if (pol->priority < priority) {
			ret = pol;
			break;
		}
	}
	if (ret)
		xfrm_pol_hold(ret);
fail:
	read_unlock_bh(&xfrm_policy_lock);

	return ret;
}

static int xfrm_policy_lookup(struct flowi *fl, u16 family, u8 dir,
			       void **objp, atomic_t **obj_refp)
{
	struct xfrm_policy *pol;
	int err = 0;

#ifdef CONFIG_XFRM_SUB_POLICY
	pol = xfrm_policy_lookup_bytype(XFRM_POLICY_TYPE_SUB, fl, family, dir);
	if (IS_ERR(pol)) {
		err = PTR_ERR(pol);
		pol = NULL;
	}
	if (pol || err)
		goto end;
#endif
	pol = xfrm_policy_lookup_bytype(XFRM_POLICY_TYPE_MAIN, fl, family, dir);
	if (IS_ERR(pol)) {
		err = PTR_ERR(pol);
		pol = NULL;
	}
#ifdef CONFIG_XFRM_SUB_POLICY
end:
#endif
	if ((*objp = (void *) pol) != NULL)
		*obj_refp = &pol->refcnt;
	return err;
}

static inline int policy_to_flow_dir(int dir)
{
	if (XFRM_POLICY_IN == FLOW_DIR_IN &&
	    XFRM_POLICY_OUT == FLOW_DIR_OUT &&
	    XFRM_POLICY_FWD == FLOW_DIR_FWD)
		return dir;
	switch (dir) {
	default:
	case XFRM_POLICY_IN:
		return FLOW_DIR_IN;
	case XFRM_POLICY_OUT:
		return FLOW_DIR_OUT;
	case XFRM_POLICY_FWD:
		return FLOW_DIR_FWD;
	}
}

static struct xfrm_policy *xfrm_sk_policy_lookup(struct sock *sk, int dir, struct flowi *fl)
{
	struct xfrm_policy *pol;

	read_lock_bh(&xfrm_policy_lock);
	if ((pol = sk->sk_policy[dir]) != NULL) {
		int match = xfrm_selector_match(&pol->selector, fl,
						sk->sk_family);
		int err = 0;

		if (match) {
			err = security_xfrm_policy_lookup(pol, fl->secid,
					policy_to_flow_dir(dir));
			if (!err)
				xfrm_pol_hold(pol);
			else if (err == -ESRCH)
				pol = NULL;
			else
				pol = ERR_PTR(err);
		} else
			pol = NULL;
	}
	read_unlock_bh(&xfrm_policy_lock);
	return pol;
}

static void __xfrm_policy_link(struct xfrm_policy *pol, int dir)
{
	struct hlist_head *chain = policy_hash_bysel(&pol->selector,
						     pol->family, dir);

	hlist_add_head(&pol->bydst, chain);
	hlist_add_head(&pol->byidx, xfrm_policy_byidx+idx_hash(pol->index));
	xfrm_policy_count[dir]++;
	xfrm_pol_hold(pol);

	if (xfrm_bydst_should_resize(dir, NULL))
		schedule_work(&xfrm_hash_work);
}

static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
						int dir)
{
	if (hlist_unhashed(&pol->bydst))
		return NULL;

	hlist_del(&pol->bydst);
	hlist_del(&pol->byidx);
	xfrm_policy_count[dir]--;

	return pol;
}

int xfrm_policy_delete(struct xfrm_policy *pol, int dir)
{
	write_lock_bh(&xfrm_policy_lock);
	pol = __xfrm_policy_unlink(pol, dir);
	write_unlock_bh(&xfrm_policy_lock);
	if (pol) {
		if (dir < XFRM_POLICY_MAX)
			atomic_inc(&flow_cache_genid);
		xfrm_policy_kill(pol);
		return 0;
	}
	return -ENOENT;
}
EXPORT_SYMBOL(xfrm_policy_delete);

int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol)
{
	struct xfrm_policy *old_pol;

#ifdef CONFIG_XFRM_SUB_POLICY
	if (pol && pol->type != XFRM_POLICY_TYPE_MAIN)
		return -EINVAL;
#endif

	write_lock_bh(&xfrm_policy_lock);
	old_pol = sk->sk_policy[dir];
	sk->sk_policy[dir] = pol;
	if (pol) {
		pol->curlft.add_time = (unsigned long)xtime.tv_sec;
		pol->index = xfrm_gen_index(pol->type, XFRM_POLICY_MAX+dir);
		__xfrm_policy_link(pol, XFRM_POLICY_MAX+dir);
	}
	if (old_pol)
		__xfrm_policy_unlink(old_pol, XFRM_POLICY_MAX+dir);
	write_unlock_bh(&xfrm_policy_lock);

	if (old_pol) {
		xfrm_policy_kill(old_pol);
	}
	return 0;
}

static struct xfrm_policy *clone_policy(struct xfrm_policy *old, int dir)
{
	struct xfrm_policy *newp = xfrm_policy_alloc(GFP_ATOMIC);

	if (newp) {
		newp->selector = old->selector;
		if (security_xfrm_policy_clone(old, newp)) {
			kfree(newp);
			return NULL;  /* ENOMEM */
		}
		newp->lft = old->lft;
		newp->curlft = old->curlft;
		newp->action = old->action;
		newp->flags = old->flags;
		newp->xfrm_nr = old->xfrm_nr;
		newp->index = old->index;
		newp->type = old->type;
		memcpy(newp->xfrm_vec, old->xfrm_vec,
		       newp->xfrm_nr*sizeof(struct xfrm_tmpl));
		write_lock_bh(&xfrm_policy_lock);
		__xfrm_policy_link(newp, XFRM_POLICY_MAX+dir);
		write_unlock_bh(&xfrm_policy_lock);
		xfrm_pol_put(newp);
	}
	return newp;
}

int __xfrm_sk_clone_policy(struct sock *sk)
{
	struct xfrm_policy *p0 = sk->sk_policy[0],
			   *p1 = sk->sk_policy[1];

	sk->sk_policy[0] = sk->sk_policy[1] = NULL;
	if (p0 && (sk->sk_policy[0] = clone_policy(p0, 0)) == NULL)
		return -ENOMEM;
	if (p1 && (sk->sk_policy[1] = clone_policy(p1, 1)) == NULL)
		return -ENOMEM;
	return 0;
}

static int
xfrm_get_saddr(xfrm_address_t *local, xfrm_address_t *remote,
	       unsigned short family)
{
	int err;
	struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);

	if (unlikely(afinfo == NULL))
		return -EINVAL;
	err = afinfo->get_saddr(local, remote);
	xfrm_policy_put_afinfo(afinfo);
	return err;
}

/* Resolve the list of templates for the flow, given the policy. */

static int
xfrm_tmpl_resolve_one(struct xfrm_policy *policy, struct flowi *fl,
		      struct xfrm_state **xfrm,
		      unsigned short family)
{
	int nx;
	int i, error;
	xfrm_address_t *daddr = xfrm_flowi_daddr(fl, family);
	xfrm_address_t *saddr = xfrm_flowi_saddr(fl, family);
	xfrm_address_t tmp;

	for (nx=0, i = 0; i < policy->xfrm_nr; i++) {
		struct xfrm_state *x;
		xfrm_address_t *remote = daddr;
		xfrm_address_t *local  = saddr;
		struct xfrm_tmpl *tmpl = &policy->xfrm_vec[i];

		if (tmpl->mode == XFRM_MODE_TUNNEL) {
			remote = &tmpl->id.daddr;
			local = &tmpl->saddr;
			if (xfrm_addr_any(local, family)) {
				error = xfrm_get_saddr(&tmp, remote, family);
				if (error)
					goto fail;
				local = &tmp;
			}
		}

		x = xfrm_state_find(remote, local, fl, tmpl, policy, &error, family);

		if (x && x->km.state == XFRM_STATE_VALID) {
			xfrm[nx++] = x;
			daddr = remote;
			saddr = local;
			continue;
		}
		if (x) {
			error = (x->km.state == XFRM_STATE_ERROR ?
				 -EINVAL : -EAGAIN);
			xfrm_state_put(x);
		}

		if (!tmpl->optional)
			goto fail;
	}
	return nx;

fail:
	for (nx--; nx>=0; nx--)
		xfrm_state_put(xfrm[nx]);
	return error;
}

static int
xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, struct flowi *fl,
		  struct xfrm_state **xfrm,
		  unsigned short family)
{
	struct xfrm_state *tp[XFRM_MAX_DEPTH];
	struct xfrm_state **tpp = (npols > 1) ? tp : xfrm;
	int cnx = 0;
	int error;
	int ret;
	int i;

	for (i = 0; i < npols; i++) {
		if (cnx + pols[i]->xfrm_nr >= XFRM_MAX_DEPTH) {
			error = -ENOBUFS;
			goto fail;
		}

		ret = xfrm_tmpl_resolve_one(pols[i], fl, &tpp[cnx], family);
		if (ret < 0) {
			error = ret;
			goto fail;
		} else
			cnx += ret;
	}

	/* found states are sorted for outbound processing */
	if (npols > 1)
		xfrm_state_sort(xfrm, tpp, cnx, family);

	return cnx;

fail:
	for (cnx--; cnx>=0; cnx--)
		xfrm_state_put(tpp[cnx]);
	return error;

}

/* Find an existing bundle created under this policy that accepts the flow. */

static struct dst_entry *
xfrm_find_bundle(struct flowi *fl, struct xfrm_policy *policy, unsigned short family)
{
	struct dst_entry *x;
	struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
	if (unlikely(afinfo == NULL))
		return ERR_PTR(-EINVAL);
	x = afinfo->find_bundle(fl, policy);
	xfrm_policy_put_afinfo(afinfo);
	return x;
}

/* Allocate a chain of dst_entry's, attach known xfrm's, calculate
 * all the metrics... In short, bundle a bundle.
 */

static int
xfrm_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int nx,
		   struct flowi *fl, struct dst_entry **dst_p,
		   unsigned short family)
{
	int err;
	struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
	if (unlikely(afinfo == NULL))
		return -EINVAL;
	err = afinfo->bundle_create(policy, xfrm, nx, fl, dst_p);
	xfrm_policy_put_afinfo(afinfo);
	return err;
}


static int stale_bundle(struct dst_entry *dst);

/* Main function: finds/creates a bundle for a given flow.
 *
 * At the moment we eat a raw IP route, mostly to speed up lookups
 * on interfaces with IPsec disabled.
 */
int xfrm_lookup(struct dst_entry **dst_p, struct flowi *fl,
		struct sock *sk, int flags)
{
	struct xfrm_policy *policy;
	struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
	int npols;
	int pol_dead;
	int xfrm_nr;
	int pi;
	struct xfrm_state *xfrm[XFRM_MAX_DEPTH];
	struct dst_entry *dst, *dst_orig = *dst_p;
	int nx = 0;
	int err;
	u32 genid;
	u16 family;
	u8 dir = policy_to_flow_dir(XFRM_POLICY_OUT);

restart:
	genid = atomic_read(&flow_cache_genid);
	policy = NULL;
	for (pi = 0; pi < ARRAY_SIZE(pols); pi++)
		pols[pi] = NULL;
	npols = 0;
	pol_dead = 0;
	xfrm_nr = 0;

	if (sk && sk->sk_policy[1]) {
		policy = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl);
		if (IS_ERR(policy))
			return PTR_ERR(policy);
	}

	if (!policy) {
		/* To accelerate a bit...  */
		if ((dst_orig->flags & DST_NOXFRM) ||
		    !xfrm_policy_count[XFRM_POLICY_OUT])
			return 0;

		policy = flow_cache_lookup(fl, dst_orig->ops->family,
					   dir, xfrm_policy_lookup);
		if (IS_ERR(policy))
			return PTR_ERR(policy);
	}

	if (!policy)
		return 0;

	family = dst_orig->ops->family;
	policy->curlft.use_time = (unsigned long)xtime.tv_sec;
	pols[0] = policy;
	npols++;
	xfrm_nr += pols[0]->xfrm_nr;

	switch (policy->action) {
	case XFRM_POLICY_BLOCK:
		/* Prohibit the flow */
		err = -EPERM;
		goto error;

	case XFRM_POLICY_ALLOW:
#ifndef CONFIG_XFRM_SUB_POLICY
		if (policy->xfrm_nr == 0) {
			/* Flow passes not transformed. */
			xfrm_pol_put(policy);
			return 0;
		}
#endif

		/* Try to find matching bundle.
		 *
		 * LATER: help from flow cache. It is optional, this
		 * is required only for output policy.
		 */
		dst = xfrm_find_bundle(fl, policy, family);
		if (IS_ERR(dst)) {
			err = PTR_ERR(dst);
			goto error;
		}

		if (dst)
			break;

#ifdef CONFIG_XFRM_SUB_POLICY
		if (pols[0]->type != XFRM_POLICY_TYPE_MAIN) {
			pols[1] = xfrm_policy_lookup_bytype(XFRM_POLICY_TYPE_MAIN,
							    fl, family,
							    XFRM_POLICY_OUT);
			if (pols[1]) {
				if (IS_ERR(pols[1])) {
					err = PTR_ERR(pols[1]);
					goto error;
				}
				if (pols[1]->action == XFRM_POLICY_BLOCK) {
					err = -EPERM;
					goto error;
				}
				npols++;
				xfrm_nr += pols[1]->xfrm_nr;
			}
		}

		/*
		 * Neither flowi nor bundle information knows about the
		 * transformation template size, so when more than one
		 * policy is in use we can only tell whether all of them
		 * are bypass after they have been looked up. Note that
		 * the not-transformed bypass above is likewise guarded
		 * by the non-sub-policy configuration.
		 */
		if (xfrm_nr == 0) {
			/* Flow passes not transformed. */
			xfrm_pols_put(pols, npols);
			return 0;
		}

#endif
		nx = xfrm_tmpl_resolve(pols, npols, fl, xfrm, family);

		if (unlikely(nx<0)) {
			err = nx;
			if (err == -EAGAIN && flags) {
				DECLARE_WAITQUEUE(wait, current);

				add_wait_queue(&km_waitq, &wait);
				set_current_state(TASK_INTERRUPTIBLE);
				schedule();
				set_current_state(TASK_RUNNING);
				remove_wait_queue(&km_waitq, &wait);

				nx = xfrm_tmpl_resolve(pols, npols, fl, xfrm, family);

				if (nx == -EAGAIN && signal_pending(current)) {
					err = -ERESTART;
					goto error;
				}
				if (nx == -EAGAIN ||
				    genid != atomic_read(&flow_cache_genid)) {
					xfrm_pols_put(pols, npols);
					goto restart;
				}
				err = nx;
			}
			if (err < 0)
				goto error;
		}
		if (nx == 0) {
			/* Flow passes not transformed. */
			xfrm_pols_put(pols, npols);
			return 0;
		}

		dst = dst_orig;
		err = xfrm_bundle_create(policy, xfrm, nx, fl, &dst, family);

		if (unlikely(err)) {
			int i;
			for (i=0; i<nx; i++)
				xfrm_state_put(xfrm[i]);
			goto error;
		}

		for (pi = 0; pi < npols; pi++) {
			read_lock_bh(&pols[pi]->lock);
			pol_dead |= pols[pi]->dead;
			read_unlock_bh(&pols[pi]->lock);
		}

		write_lock_bh(&policy->lock);
		if (unlikely(pol_dead || stale_bundle(dst))) {
			/* Wow! While we worked on resolving, this
			 * policy has gone. Retry. It is not paranoia,
			 * we just cannot enlist a new bundle on a dead
			 * object. We can't enlist stale bundles either.
			 */
			write_unlock_bh(&policy->lock);
			if (dst)
				dst_free(dst);

			err = -EHOSTUNREACH;
			goto error;
		}
		dst->next = policy->bundles;
		policy->bundles = dst;
		dst_hold(dst);
		write_unlock_bh(&policy->lock);
	}
	*dst_p = dst;
	dst_release(dst_orig);
	xfrm_pols_put(pols, npols);
	return 0;

error:
	dst_release(dst_orig);
	xfrm_pols_put(pols, npols);
	*dst_p = NULL;
	return err;
}
EXPORT_SYMBOL(xfrm_lookup);
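
/* Caller sketch, modeled on __xfrm_route_forward() below: start from a
 * plain route in *dst_p and let xfrm_lookup() swap in a bundle when
 * policy requires transformation (on the transformed path it releases
 * the original dst; on error it sets *dst_p to NULL; with flags set it
 * may sleep on km_waitq waiting for SAs to be resolved):
 *
 *	err = xfrm_lookup(&skb->dst, &fl, NULL, 0);
 *	if (err)
 *		goto drop;
 */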

static inline int
xfrm_secpath_reject(int idx, struct sk_buff *skb, struct flowi *fl)
{
	struct xfrm_state *x;
	int err;

	if (!skb->sp || idx < 0 || idx >= skb->sp->len)
		return 0;
	x = skb->sp->xvec[idx];
	if (!x->type->reject)
		return 0;
	xfrm_state_hold(x);
	err = x->type->reject(x, skb, fl);
	xfrm_state_put(x);
	return err;
}

/* When skb is transformed back to its "native" form, we have to
 * check policy restrictions. At the moment we do this in a maximally
 * stupid way. Shame on me. :-) Of course, connected sockets must
 * have policy cached at them.
 */

static inline int
xfrm_state_ok(struct xfrm_tmpl *tmpl, struct xfrm_state *x,
	      unsigned short family)
{
	if (xfrm_state_kern(x))
		return tmpl->optional && !xfrm_state_addr_cmp(tmpl, x, family);
	return	x->id.proto == tmpl->id.proto &&
		(x->id.spi == tmpl->id.spi || !tmpl->id.spi) &&
		(x->props.reqid == tmpl->reqid || !tmpl->reqid) &&
		x->props.mode == tmpl->mode &&
		((tmpl->aalgos & (1<<x->props.aalgo)) ||
		 !(xfrm_id_proto_match(tmpl->id.proto, IPSEC_PROTO_ANY))) &&
		!(x->props.mode != XFRM_MODE_TRANSPORT &&
		  xfrm_state_addr_cmp(tmpl, x, family));
}

/*
 * 0 or more is returned when validation succeeds (either a bypass due to
 * an optional transport-mode template, or the next index after the secpath
 * state matched against the template).
 * -1 is returned when no matching template is found.
 * Otherwise "-2 - errored_index" is returned.
 */
static inline int
xfrm_policy_ok(struct xfrm_tmpl *tmpl, struct sec_path *sp, int start,
	       unsigned short family)
{
	int idx = start;

	if (tmpl->optional) {
		if (tmpl->mode == XFRM_MODE_TRANSPORT)
			return start;
	} else
		start = -1;
	for (; idx < sp->len; idx++) {
		if (xfrm_state_ok(tmpl, sp->xvec[idx], family))
			return ++idx;
		if (sp->xvec[idx]->props.mode != XFRM_MODE_TRANSPORT) {
			if (start == -1)
				start = -2-idx;
			break;
		}
	}
	return start;
}
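
/* Worked example: with a secpath of two states and a template matching
 * sp->xvec[0], xfrm_policy_ok() returns 1 (the next index to scan). An
 * optional transport-mode template returns start unchanged (bypass). A
 * required template with no match at all returns -1, and one that hits a
 * non-matching non-transport state at index k before matching returns
 * -2-k.
 */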

int
xfrm_decode_session(struct sk_buff *skb, struct flowi *fl, unsigned short family)
{
	struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
	int err;

	if (unlikely(afinfo == NULL))
		return -EAFNOSUPPORT;

	afinfo->decode_session(skb, fl);
	err = security_xfrm_decode_session(skb, &fl->secid);
	xfrm_policy_put_afinfo(afinfo);
	return err;
}
EXPORT_SYMBOL(xfrm_decode_session);

static inline int secpath_has_nontransport(struct sec_path *sp, int k, int *idxp)
{
	for (; k < sp->len; k++) {
		if (sp->xvec[k]->props.mode != XFRM_MODE_TRANSPORT) {
			*idxp = k;
			return 1;
		}
	}

	return 0;
}

int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
			unsigned short family)
{
	struct xfrm_policy *pol;
	struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
	int npols = 0;
	int xfrm_nr;
	int pi;
	struct flowi fl;
	u8 fl_dir = policy_to_flow_dir(dir);
	int xerr_idx = -1;

	if (xfrm_decode_session(skb, &fl, family) < 0)
		return 0;
	nf_nat_decode_session(skb, &fl, family);

	/* First, check used SAs against their selectors. */
	if (skb->sp) {
		int i;

		for (i=skb->sp->len-1; i>=0; i--) {
			struct xfrm_state *x = skb->sp->xvec[i];
			if (!xfrm_selector_match(&x->sel, &fl, family))
				return 0;
		}
	}

	pol = NULL;
	if (sk && sk->sk_policy[dir]) {
		pol = xfrm_sk_policy_lookup(sk, dir, &fl);
		if (IS_ERR(pol))
			return 0;
	}

	if (!pol)
		pol = flow_cache_lookup(&fl, family, fl_dir,
					xfrm_policy_lookup);

	if (IS_ERR(pol))
		return 0;

	if (!pol) {
		if (skb->sp && secpath_has_nontransport(skb->sp, 0, &xerr_idx)) {
			xfrm_secpath_reject(xerr_idx, skb, &fl);
			return 0;
		}
		return 1;
	}

	pol->curlft.use_time = (unsigned long)xtime.tv_sec;

	pols[0] = pol;
	npols++;
#ifdef CONFIG_XFRM_SUB_POLICY
	if (pols[0]->type != XFRM_POLICY_TYPE_MAIN) {
		pols[1] = xfrm_policy_lookup_bytype(XFRM_POLICY_TYPE_MAIN,
						    &fl, family,
						    XFRM_POLICY_IN);
		if (pols[1]) {
			if (IS_ERR(pols[1]))
				return 0;
			pols[1]->curlft.use_time = (unsigned long)xtime.tv_sec;
			npols++;
		}
	}
#endif

	if (pol->action == XFRM_POLICY_ALLOW) {
		struct sec_path *sp;
		static struct sec_path dummy;
		struct xfrm_tmpl *tp[XFRM_MAX_DEPTH];
		struct xfrm_tmpl *stp[XFRM_MAX_DEPTH];
		struct xfrm_tmpl **tpp = tp;
		int ti = 0;
		int i, k;

		if ((sp = skb->sp) == NULL)
			sp = &dummy;

		for (pi = 0; pi < npols; pi++) {
			if (pols[pi] != pol &&
			    pols[pi]->action != XFRM_POLICY_ALLOW)
				goto reject;
			if (ti + pols[pi]->xfrm_nr >= XFRM_MAX_DEPTH)
				goto reject_error;
			for (i = 0; i < pols[pi]->xfrm_nr; i++)
				tpp[ti++] = &pols[pi]->xfrm_vec[i];
		}
		xfrm_nr = ti;
		if (npols > 1) {
			xfrm_tmpl_sort(stp, tpp, xfrm_nr, family);
			tpp = stp;
		}

		/* For each tunnel xfrm, find the first matching tmpl.
		 * For each tmpl before that, find corresponding xfrm.
		 * Order is _important_. Later we will implement
		 * some barriers, but at the moment barriers
		 * are implied between each two transformations.
		 */
		for (i = xfrm_nr-1, k = 0; i >= 0; i--) {
			k = xfrm_policy_ok(tpp[i], sp, k, family);
			if (k < 0) {
				if (k < -1)
					/* "-2 - errored_index" returned */
					xerr_idx = -(2+k);
				goto reject;
			}
		}

		if (secpath_has_nontransport(sp, k, &xerr_idx))
			goto reject;

		xfrm_pols_put(pols, npols);
		return 1;
	}

reject:
	xfrm_secpath_reject(xerr_idx, skb, &fl);
reject_error:
	xfrm_pols_put(pols, npols);
	return 0;
}
EXPORT_SYMBOL(__xfrm_policy_check);

int __xfrm_route_forward(struct sk_buff *skb, unsigned short family)
{
	struct flowi fl;

	if (xfrm_decode_session(skb, &fl, family) < 0)
		return 0;

	return xfrm_lookup(&skb->dst, &fl, NULL, 0) == 0;
}
EXPORT_SYMBOL(__xfrm_route_forward);

/* Optimize later using cookies and generation ids. */

static struct dst_entry *xfrm_dst_check(struct dst_entry *dst, u32 cookie)
{
	/* Code (such as __xfrm4_bundle_create()) sets dst->obsolete
	 * to "-1" to force all XFRM destinations to get validated by
	 * dst_ops->check on every use.  We do this because when a
	 * normal route referenced by an XFRM dst is obsoleted we do
	 * not go looking around for all parent referencing XFRM dsts
	 * so that we can invalidate them.  It is just too much work.
	 * Instead we make the checks here on every use.  For example:
	 *
	 *	XFRM dst A --> IPv4 dst X
	 *
	 * X is the "xdst->route" of A (X is also the "dst->path" of A
	 * in this example).  If X is marked obsolete, "A" will not
	 * notice.  That's what we are validating here via the
	 * stale_bundle() check.
	 *
	 * When a policy's bundle is pruned, we dst_free() the XFRM
	 * dst which causes its ->obsolete field to be set to a
	 * positive non-zero integer.  If an XFRM dst has been pruned
	 * like this, we want to force a new route lookup.
	 */
	if (dst->obsolete < 0 && !stale_bundle(dst))
		return dst;

	return NULL;
}

static int stale_bundle(struct dst_entry *dst)
{
	return !xfrm_bundle_ok(NULL, (struct xfrm_dst *)dst, NULL, AF_UNSPEC, 0);
}

void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev)
{
	while ((dst = dst->child) && dst->xfrm && dst->dev == dev) {
		dst->dev = &loopback_dev;
		dev_hold(&loopback_dev);
		dev_put(dev);
	}
}
EXPORT_SYMBOL(xfrm_dst_ifdown);

static void xfrm_link_failure(struct sk_buff *skb)
{
	/* Impossible. Such a dst must be popped before it reaches the
	 * point of failure. */
	return;
}

static struct dst_entry *xfrm_negative_advice(struct dst_entry *dst)
{
	if (dst) {
		if (dst->obsolete) {
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}

static void prune_one_bundle(struct xfrm_policy *pol, int (*func)(struct dst_entry *), struct dst_entry **gc_list_p)
{
	struct dst_entry *dst, **dstp;

	write_lock(&pol->lock);
	dstp = &pol->bundles;
	while ((dst=*dstp) != NULL) {
		if (func(dst)) {
			*dstp = dst->next;
			dst->next = *gc_list_p;
			*gc_list_p = dst;
		} else {
			dstp = &dst->next;
		}
	}
	write_unlock(&pol->lock);
}

static void xfrm_prune_bundles(int (*func)(struct dst_entry *))
{
	struct dst_entry *gc_list = NULL;
	int dir;

	read_lock_bh(&xfrm_policy_lock);
	for (dir = 0; dir < XFRM_POLICY_MAX * 2; dir++) {
		struct xfrm_policy *pol;
		struct hlist_node *entry;
		struct hlist_head *table;
		int i;

		hlist_for_each_entry(pol, entry,
				     &xfrm_policy_inexact[dir], bydst)
			prune_one_bundle(pol, func, &gc_list);

		table = xfrm_policy_bydst[dir].table;
		for (i = xfrm_policy_bydst[dir].hmask; i >= 0; i--) {
			hlist_for_each_entry(pol, entry, table + i, bydst)
				prune_one_bundle(pol, func, &gc_list);
		}
	}
	read_unlock_bh(&xfrm_policy_lock);

	while (gc_list) {
		struct dst_entry *dst = gc_list;
		gc_list = dst->next;
		dst_free(dst);
	}
}

static int unused_bundle(struct dst_entry *dst)
{
	return !atomic_read(&dst->__refcnt);
}

static void __xfrm_garbage_collect(void)
{
	xfrm_prune_bundles(unused_bundle);
}

static int xfrm_flush_bundles(void)
{
	xfrm_prune_bundles(stale_bundle);
	return 0;
}

void xfrm_init_pmtu(struct dst_entry *dst)
{
	do {
		struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
		u32 pmtu, route_mtu_cached;

		pmtu = dst_mtu(dst->child);
		xdst->child_mtu_cached = pmtu;

		pmtu = xfrm_state_mtu(dst->xfrm, pmtu);

		route_mtu_cached = dst_mtu(xdst->route);
		xdst->route_mtu_cached = route_mtu_cached;

		if (pmtu > route_mtu_cached)
			pmtu = route_mtu_cached;

		dst->metrics[RTAX_MTU-1] = pmtu;
	} while ((dst = dst->next));
}

EXPORT_SYMBOL(xfrm_init_pmtu);

/* Check that the bundle accepts the flow and its components are
 * still valid.
 */

int xfrm_bundle_ok(struct xfrm_policy *pol, struct xfrm_dst *first,
		struct flowi *fl, int family, int strict)
{
	struct dst_entry *dst = &first->u.dst;
	struct xfrm_dst *last;
	u32 mtu;

	if (!dst_check(dst->path, ((struct xfrm_dst *)dst)->path_cookie) ||
	    (dst->dev && !netif_running(dst->dev)))
		return 0;

	last = NULL;

	do {
		struct xfrm_dst *xdst = (struct xfrm_dst *)dst;

		if (fl && !xfrm_selector_match(&dst->xfrm->sel, fl, family))
			return 0;
		if (fl && !security_xfrm_flow_state_match(fl, dst->xfrm, pol))
			return 0;
		if (dst->xfrm->km.state != XFRM_STATE_VALID)
			return 0;
		if (xdst->genid != dst->xfrm->genid)
			return 0;

		if (strict && fl && dst->xfrm->props.mode != XFRM_MODE_TUNNEL &&
		    !xfrm_state_addr_flow_check(dst->xfrm, fl, family))
			return 0;

		mtu = dst_mtu(dst->child);
		if (xdst->child_mtu_cached != mtu) {
			last = xdst;
			xdst->child_mtu_cached = mtu;
		}

		if (!dst_check(xdst->route, xdst->route_cookie))
			return 0;
		mtu = dst_mtu(xdst->route);
		if (xdst->route_mtu_cached != mtu) {
			last = xdst;
			xdst->route_mtu_cached = mtu;
		}

		dst = dst->child;
	} while (dst->xfrm);

	if (likely(!last))
		return 1;

	mtu = last->child_mtu_cached;
	for (;;) {
		dst = &last->u.dst;

		mtu = xfrm_state_mtu(dst->xfrm, mtu);
		if (mtu > last->route_mtu_cached)
			mtu = last->route_mtu_cached;
		dst->metrics[RTAX_MTU-1] = mtu;

		if (last == first)
			break;

		last = last->u.next;
		last->child_mtu_cached = mtu;
	}

	return 1;
}

EXPORT_SYMBOL(xfrm_bundle_ok);

int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo)
{
	int err = 0;
	if (unlikely(afinfo == NULL))
		return -EINVAL;
	if (unlikely(afinfo->family >= NPROTO))
		return -EAFNOSUPPORT;
	write_lock_bh(&xfrm_policy_afinfo_lock);
	if (unlikely(xfrm_policy_afinfo[afinfo->family] != NULL))
		err = -ENOBUFS;
	else {
		struct dst_ops *dst_ops = afinfo->dst_ops;
		if (likely(dst_ops->kmem_cachep == NULL))
			dst_ops->kmem_cachep = xfrm_dst_cache;
		if (likely(dst_ops->check == NULL))
			dst_ops->check = xfrm_dst_check;
		if (likely(dst_ops->negative_advice == NULL))
			dst_ops->negative_advice = xfrm_negative_advice;
		if (likely(dst_ops->link_failure == NULL))
			dst_ops->link_failure = xfrm_link_failure;
		if (likely(afinfo->garbage_collect == NULL))
			afinfo->garbage_collect = __xfrm_garbage_collect;
		xfrm_policy_afinfo[afinfo->family] = afinfo;
	}
	write_unlock_bh(&xfrm_policy_afinfo_lock);
	return err;
}
EXPORT_SYMBOL(xfrm_policy_register_afinfo);
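
/* Registration sketch (illustrative, patterned on the ipv4/ipv6 glue):
 * an address family provides its hooks once at boot, and the NULL checks
 * above let it pre-set any dst_ops callback it wants to override.
 * Hypothetical skeleton:
 *
 *	static struct xfrm_policy_afinfo my_policy_afinfo = {
 *		.family		= AF_INET,
 *		.dst_ops	= &my_xfrm_dst_ops,
 *		.dst_lookup	= my_dst_lookup,
 *		.get_saddr	= my_get_saddr,
 *		.find_bundle	= my_find_bundle,
 *		.bundle_create	= my_bundle_create,
 *		.decode_session	= my_decode_session,
 *	};
 *
 *	xfrm_policy_register_afinfo(&my_policy_afinfo);
 */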

int xfrm_policy_unregister_afinfo(struct xfrm_policy_afinfo *afinfo)
{
	int err = 0;
	if (unlikely(afinfo == NULL))
		return -EINVAL;
	if (unlikely(afinfo->family >= NPROTO))
		return -EAFNOSUPPORT;
	write_lock_bh(&xfrm_policy_afinfo_lock);
	if (likely(xfrm_policy_afinfo[afinfo->family] != NULL)) {
		if (unlikely(xfrm_policy_afinfo[afinfo->family] != afinfo))
			err = -EINVAL;
		else {
			struct dst_ops *dst_ops = afinfo->dst_ops;
			xfrm_policy_afinfo[afinfo->family] = NULL;
			dst_ops->kmem_cachep = NULL;
			dst_ops->check = NULL;
			dst_ops->negative_advice = NULL;
			dst_ops->link_failure = NULL;
			afinfo->garbage_collect = NULL;
		}
	}
	write_unlock_bh(&xfrm_policy_afinfo_lock);
	return err;
}
EXPORT_SYMBOL(xfrm_policy_unregister_afinfo);

static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family)
{
	struct xfrm_policy_afinfo *afinfo;
	if (unlikely(family >= NPROTO))
		return NULL;
	read_lock(&xfrm_policy_afinfo_lock);
	afinfo = xfrm_policy_afinfo[family];
	if (unlikely(!afinfo))
		read_unlock(&xfrm_policy_afinfo_lock);
	return afinfo;
}

static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo)
{
	read_unlock(&xfrm_policy_afinfo_lock);
}

static struct xfrm_policy_afinfo *xfrm_policy_lock_afinfo(unsigned int family)
{
	struct xfrm_policy_afinfo *afinfo;
	if (unlikely(family >= NPROTO))
		return NULL;
	write_lock_bh(&xfrm_policy_afinfo_lock);
	afinfo = xfrm_policy_afinfo[family];
	if (unlikely(!afinfo))
		write_unlock_bh(&xfrm_policy_afinfo_lock);
	return afinfo;
}

static void xfrm_policy_unlock_afinfo(struct xfrm_policy_afinfo *afinfo)
{
	write_unlock_bh(&xfrm_policy_afinfo_lock);
}

static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void *ptr)
{
	switch (event) {
	case NETDEV_DOWN:
		xfrm_flush_bundles();
	}
	return NOTIFY_DONE;
}

static struct notifier_block xfrm_dev_notifier = {
	.notifier_call	= xfrm_dev_event,
};

static void __init xfrm_policy_init(void)
{
	unsigned int hmask, sz;
	int dir;

	xfrm_dst_cache = kmem_cache_create("xfrm_dst_cache",
					   sizeof(struct xfrm_dst),
					   0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
					   NULL, NULL);

	hmask = 8 - 1;
	sz = (hmask+1) * sizeof(struct hlist_head);

	xfrm_policy_byidx = xfrm_hash_alloc(sz);
	xfrm_idx_hmask = hmask;
	if (!xfrm_policy_byidx)
		panic("XFRM: failed to allocate byidx hash\n");

	for (dir = 0; dir < XFRM_POLICY_MAX * 2; dir++) {
		struct xfrm_policy_hash *htab;

		INIT_HLIST_HEAD(&xfrm_policy_inexact[dir]);

		htab = &xfrm_policy_bydst[dir];
		htab->table = xfrm_hash_alloc(sz);
		htab->hmask = hmask;
		if (!htab->table)
			panic("XFRM: failed to allocate bydst hash\n");
	}

	INIT_WORK(&xfrm_policy_gc_work, xfrm_policy_gc_task);
	register_netdevice_notifier(&xfrm_dev_notifier);
}

void __init xfrm_init(void)
{
	xfrm_state_init();
	xfrm_policy_init();
	xfrm_input_init();
}