xfrm_policy.c revision 134b0fc544ba062498451611cb6f3e4454221b3d
/*
 * xfrm_policy.c
 *
 * Changes:
 *	Mitsuru KANDA @USAGI
 *	Kazunori MIYAZAWA @USAGI
 *	Kunihiro Ishiguro <kunihiro@ipinfusion.com>
 *		IPv6 support
 *	Kazunori MIYAZAWA @USAGI
 *	YOSHIFUJI Hideaki
 *		Split up af-specific portion
 *	Derek Atkins <derek@ihtfp.com>		Add the post_input processor
 *
 */

#include <linux/slab.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>
#include <linux/notifier.h>
#include <linux/netdevice.h>
#include <linux/netfilter.h>
#include <linux/module.h>
#include <linux/cache.h>
#include <net/xfrm.h>
#include <net/ip.h>

#include "xfrm_hash.h"

DEFINE_MUTEX(xfrm_cfg_mutex);
EXPORT_SYMBOL(xfrm_cfg_mutex);

static DEFINE_RWLOCK(xfrm_policy_lock);

unsigned int xfrm_policy_count[XFRM_POLICY_MAX*2];
EXPORT_SYMBOL(xfrm_policy_count);

static DEFINE_RWLOCK(xfrm_policy_afinfo_lock);
static struct xfrm_policy_afinfo *xfrm_policy_afinfo[NPROTO];

static kmem_cache_t *xfrm_dst_cache __read_mostly;

static struct work_struct xfrm_policy_gc_work;
static HLIST_HEAD(xfrm_policy_gc_list);
static DEFINE_SPINLOCK(xfrm_policy_gc_lock);

static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family);
static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo);
static struct xfrm_policy_afinfo *xfrm_policy_lock_afinfo(unsigned int family);
static void xfrm_policy_unlock_afinfo(struct xfrm_policy_afinfo *afinfo);

int xfrm_register_type(struct xfrm_type *type, unsigned short family)
{
	struct xfrm_policy_afinfo *afinfo = xfrm_policy_lock_afinfo(family);
	struct xfrm_type **typemap;
	int err = 0;

	if (unlikely(afinfo == NULL))
		return -EAFNOSUPPORT;
	typemap = afinfo->type_map;

	if (likely(typemap[type->proto] == NULL))
		typemap[type->proto] = type;
	else
		err = -EEXIST;
	xfrm_policy_unlock_afinfo(afinfo);
	return err;
}
EXPORT_SYMBOL(xfrm_register_type);

int xfrm_unregister_type(struct xfrm_type *type, unsigned short family)
{
	struct xfrm_policy_afinfo *afinfo = xfrm_policy_lock_afinfo(family);
	struct xfrm_type **typemap;
	int err = 0;

	if (unlikely(afinfo == NULL))
		return -EAFNOSUPPORT;
	typemap = afinfo->type_map;

	if (unlikely(typemap[type->proto] != type))
		err = -ENOENT;
	else
		typemap[type->proto] = NULL;
	xfrm_policy_unlock_afinfo(afinfo);
	return err;
}
EXPORT_SYMBOL(xfrm_unregister_type);

struct xfrm_type *xfrm_get_type(u8 proto, unsigned short family)
{
	struct xfrm_policy_afinfo *afinfo;
	struct xfrm_type **typemap;
	struct xfrm_type *type;
	int modload_attempted = 0;

retry:
	afinfo = xfrm_policy_get_afinfo(family);
	if (unlikely(afinfo == NULL))
		return NULL;
	typemap = afinfo->type_map;

	type = typemap[proto];
	if (unlikely(type && !try_module_get(type->owner)))
		type = NULL;
	if (!type && !modload_attempted) {
		xfrm_policy_put_afinfo(afinfo);
		request_module("xfrm-type-%d-%d",
			       (int) family, (int) proto);
		modload_attempted = 1;
		goto retry;
	}

	xfrm_policy_put_afinfo(afinfo);
	return type;
}
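
/* The "xfrm-type-%d-%d" string above forms a module alias; type modules
 * (ESP, AH, etc.) are expected to declare a matching MODULE_ALIAS so
 * that request_module() can autoload them on first use.  Only one
 * modprobe attempt is made per lookup.
 */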

int xfrm_dst_lookup(struct xfrm_dst **dst, struct flowi *fl,
		    unsigned short family)
{
	struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
	int err = 0;

	if (unlikely(afinfo == NULL))
		return -EAFNOSUPPORT;

	if (likely(afinfo->dst_lookup != NULL))
		err = afinfo->dst_lookup(dst, fl);
	else
		err = -EINVAL;
	xfrm_policy_put_afinfo(afinfo);
	return err;
}
EXPORT_SYMBOL(xfrm_dst_lookup);

void xfrm_put_type(struct xfrm_type *type)
{
	module_put(type->owner);
}

int xfrm_register_mode(struct xfrm_mode *mode, int family)
{
	struct xfrm_policy_afinfo *afinfo;
	struct xfrm_mode **modemap;
	int err;

	if (unlikely(mode->encap >= XFRM_MODE_MAX))
		return -EINVAL;

	afinfo = xfrm_policy_lock_afinfo(family);
	if (unlikely(afinfo == NULL))
		return -EAFNOSUPPORT;

	err = -EEXIST;
	modemap = afinfo->mode_map;
	if (likely(modemap[mode->encap] == NULL)) {
		modemap[mode->encap] = mode;
		err = 0;
	}

	xfrm_policy_unlock_afinfo(afinfo);
	return err;
}
EXPORT_SYMBOL(xfrm_register_mode);

int xfrm_unregister_mode(struct xfrm_mode *mode, int family)
{
	struct xfrm_policy_afinfo *afinfo;
	struct xfrm_mode **modemap;
	int err;

	if (unlikely(mode->encap >= XFRM_MODE_MAX))
		return -EINVAL;

	afinfo = xfrm_policy_lock_afinfo(family);
	if (unlikely(afinfo == NULL))
		return -EAFNOSUPPORT;

	err = -ENOENT;
	modemap = afinfo->mode_map;
	if (likely(modemap[mode->encap] == mode)) {
		modemap[mode->encap] = NULL;
		err = 0;
	}

	xfrm_policy_unlock_afinfo(afinfo);
	return err;
}
EXPORT_SYMBOL(xfrm_unregister_mode);

struct xfrm_mode *xfrm_get_mode(unsigned int encap, int family)
{
	struct xfrm_policy_afinfo *afinfo;
	struct xfrm_mode *mode;
	int modload_attempted = 0;

	if (unlikely(encap >= XFRM_MODE_MAX))
		return NULL;

retry:
	afinfo = xfrm_policy_get_afinfo(family);
	if (unlikely(afinfo == NULL))
		return NULL;

	mode = afinfo->mode_map[encap];
	if (unlikely(mode && !try_module_get(mode->owner)))
		mode = NULL;
	if (!mode && !modload_attempted) {
		xfrm_policy_put_afinfo(afinfo);
		request_module("xfrm-mode-%d-%d", family, encap);
		modload_attempted = 1;
		goto retry;
	}

	xfrm_policy_put_afinfo(afinfo);
	return mode;
}

void xfrm_put_mode(struct xfrm_mode *mode)
{
	module_put(mode->owner);
}

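/* Convert a relative timeout in seconds to jiffies, clamping so that
 * secs*HZ cannot overflow past MAX_SCHEDULE_TIMEOUT (e.g. on 32-bit
 * with HZ=1000, anything beyond roughly 24 days is clamped).
 */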
static inline unsigned long make_jiffies(long secs)
{
	if (secs >= (MAX_SCHEDULE_TIMEOUT-1)/HZ)
		return MAX_SCHEDULE_TIMEOUT-1;
	else
		return secs*HZ;
}

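/* Per-policy expiry timer.  Soft limits only notify the key manager via
 * km_policy_expired(..., hard=0) and re-arm the timer; hard limits
 * delete the policy outright.  A zero lifetime field means "no limit".
 */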
static void xfrm_policy_timer(unsigned long data)
{
	struct xfrm_policy *xp = (struct xfrm_policy*)data;
	unsigned long now = (unsigned long)xtime.tv_sec;
	long next = LONG_MAX;
	int warn = 0;
	int dir;

	read_lock(&xp->lock);

	if (xp->dead)
		goto out;

	dir = xfrm_policy_id2dir(xp->index);

	if (xp->lft.hard_add_expires_seconds) {
		long tmo = xp->lft.hard_add_expires_seconds +
			xp->curlft.add_time - now;
		if (tmo <= 0)
			goto expired;
		if (tmo < next)
			next = tmo;
	}
	if (xp->lft.hard_use_expires_seconds) {
		long tmo = xp->lft.hard_use_expires_seconds +
			(xp->curlft.use_time ? : xp->curlft.add_time) - now;
		if (tmo <= 0)
			goto expired;
		if (tmo < next)
			next = tmo;
	}
	if (xp->lft.soft_add_expires_seconds) {
		long tmo = xp->lft.soft_add_expires_seconds +
			xp->curlft.add_time - now;
		if (tmo <= 0) {
			warn = 1;
			tmo = XFRM_KM_TIMEOUT;
		}
		if (tmo < next)
			next = tmo;
	}
	if (xp->lft.soft_use_expires_seconds) {
		long tmo = xp->lft.soft_use_expires_seconds +
			(xp->curlft.use_time ? : xp->curlft.add_time) - now;
		if (tmo <= 0) {
			warn = 1;
			tmo = XFRM_KM_TIMEOUT;
		}
		if (tmo < next)
			next = tmo;
	}

	if (warn)
		km_policy_expired(xp, dir, 0, 0);
	if (next != LONG_MAX &&
	    !mod_timer(&xp->timer, jiffies + make_jiffies(next)))
		xfrm_pol_hold(xp);

out:
	read_unlock(&xp->lock);
	xfrm_pol_put(xp);
	return;

expired:
	read_unlock(&xp->lock);
	if (!xfrm_policy_delete(xp, dir))
		km_policy_expired(xp, dir, 1, 0);
	xfrm_pol_put(xp);
}


/* Allocate xfrm_policy. Not used here; it is supposed to be used by
 * pfkeyv2 SPD calls.
 */

struct xfrm_policy *xfrm_policy_alloc(gfp_t gfp)
{
	struct xfrm_policy *policy;

	policy = kzalloc(sizeof(struct xfrm_policy), gfp);

	if (policy) {
		INIT_HLIST_NODE(&policy->bydst);
		INIT_HLIST_NODE(&policy->byidx);
		rwlock_init(&policy->lock);
		atomic_set(&policy->refcnt, 1);
		init_timer(&policy->timer);
		policy->timer.data = (unsigned long)policy;
		policy->timer.function = xfrm_policy_timer;
	}
	return policy;
}
EXPORT_SYMBOL(xfrm_policy_alloc);

/* Destroy xfrm_policy: descendant resources must already have been
 * released by this point. */

void __xfrm_policy_destroy(struct xfrm_policy *policy)
{
	BUG_ON(!policy->dead);

	BUG_ON(policy->bundles);

	if (del_timer(&policy->timer))
		BUG();

	security_xfrm_policy_free(policy);
	kfree(policy);
}
EXPORT_SYMBOL(__xfrm_policy_destroy);

static void xfrm_policy_gc_kill(struct xfrm_policy *policy)
{
	struct dst_entry *dst;

	while ((dst = policy->bundles) != NULL) {
		policy->bundles = dst->next;
		dst_free(dst);
	}

	if (del_timer(&policy->timer))
		atomic_dec(&policy->refcnt);

	if (atomic_read(&policy->refcnt) > 1)
		flow_cache_flush();

	xfrm_pol_put(policy);
}

static void xfrm_policy_gc_task(void *data)
{
	struct xfrm_policy *policy;
	struct hlist_node *entry, *tmp;
	struct hlist_head gc_list;

	spin_lock_bh(&xfrm_policy_gc_lock);
	gc_list.first = xfrm_policy_gc_list.first;
	INIT_HLIST_HEAD(&xfrm_policy_gc_list);
	spin_unlock_bh(&xfrm_policy_gc_lock);

	hlist_for_each_entry_safe(policy, entry, tmp, &gc_list, bydst)
		xfrm_policy_gc_kill(policy);
}

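/* Policy teardown is two-stage: xfrm_policy_kill() below only marks the
 * policy dead and queues it on xfrm_policy_gc_list; the bundle pruning
 * and final xfrm_pol_put() happen later in xfrm_policy_gc_task() from
 * the workqueue, presumably because flow_cache_flush() may sleep.
 */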
/* Rule must be locked. Release descendant resources, announce
 * the entry dead. The rule must already be unlinked from lists by
 * this point.
 */

static void xfrm_policy_kill(struct xfrm_policy *policy)
{
	int dead;

	write_lock_bh(&policy->lock);
	dead = policy->dead;
	policy->dead = 1;
	write_unlock_bh(&policy->lock);

	if (unlikely(dead)) {
		WARN_ON(1);
		return;
	}

	spin_lock(&xfrm_policy_gc_lock);
	hlist_add_head(&policy->bydst, &xfrm_policy_gc_list);
	spin_unlock(&xfrm_policy_gc_lock);

	schedule_work(&xfrm_policy_gc_work);
}

struct xfrm_policy_hash {
	struct hlist_head	*table;
	unsigned int		hmask;
};

static struct hlist_head xfrm_policy_inexact[XFRM_POLICY_MAX*2];
static struct xfrm_policy_hash xfrm_policy_bydst[XFRM_POLICY_MAX*2] __read_mostly;
static struct hlist_head *xfrm_policy_byidx __read_mostly;
static unsigned int xfrm_idx_hmask __read_mostly;
static unsigned int xfrm_policy_hashmax __read_mostly = 1 * 1024 * 1024;

static inline unsigned int idx_hash(u32 index)
{
	return __idx_hash(index, xfrm_idx_hmask);
}

static struct hlist_head *policy_hash_bysel(struct xfrm_selector *sel, unsigned short family, int dir)
{
	unsigned int hmask = xfrm_policy_bydst[dir].hmask;
	unsigned int hash = __sel_hash(sel, family, hmask);

	return (hash == hmask + 1 ?
		&xfrm_policy_inexact[dir] :
		xfrm_policy_bydst[dir].table + hash);
}

static struct hlist_head *policy_hash_direct(xfrm_address_t *daddr, xfrm_address_t *saddr, unsigned short family, int dir)
{
	unsigned int hmask = xfrm_policy_bydst[dir].hmask;
	unsigned int hash = __addr_hash(daddr, saddr, family, hmask);

	return xfrm_policy_bydst[dir].table + hash;
}

static void xfrm_dst_hash_transfer(struct hlist_head *list,
				   struct hlist_head *ndsttable,
				   unsigned int nhashmask)
{
	struct hlist_node *entry, *tmp;
	struct xfrm_policy *pol;

	hlist_for_each_entry_safe(pol, entry, tmp, list, bydst) {
		unsigned int h;

		h = __addr_hash(&pol->selector.daddr, &pol->selector.saddr,
				pol->family, nhashmask);
		hlist_add_head(&pol->bydst, ndsttable+h);
	}
}

static void xfrm_idx_hash_transfer(struct hlist_head *list,
				   struct hlist_head *nidxtable,
				   unsigned int nhashmask)
{
	struct hlist_node *entry, *tmp;
	struct xfrm_policy *pol;

	hlist_for_each_entry_safe(pol, entry, tmp, list, byidx) {
		unsigned int h;

		h = __idx_hash(pol->index, nhashmask);
		hlist_add_head(&pol->byidx, nidxtable+h);
	}
}

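/* Grow a hash mask by doubling the table: an old hmask of 7 (8 buckets)
 * becomes 15 (16 buckets), and so on.
 */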
static unsigned long xfrm_new_hash_mask(unsigned int old_hmask)
{
	return ((old_hmask + 1) << 1) - 1;
}
470
471static void xfrm_bydst_resize(int dir)
472{
473	unsigned int hmask = xfrm_policy_bydst[dir].hmask;
474	unsigned int nhashmask = xfrm_new_hash_mask(hmask);
475	unsigned int nsize = (nhashmask + 1) * sizeof(struct hlist_head);
476	struct hlist_head *odst = xfrm_policy_bydst[dir].table;
477	struct hlist_head *ndst = xfrm_hash_alloc(nsize);
478	int i;
479
480	if (!ndst)
481		return;
482
483	write_lock_bh(&xfrm_policy_lock);
484
485	for (i = hmask; i >= 0; i--)
486		xfrm_dst_hash_transfer(odst + i, ndst, nhashmask);
487
488	xfrm_policy_bydst[dir].table = ndst;
489	xfrm_policy_bydst[dir].hmask = nhashmask;
490
491	write_unlock_bh(&xfrm_policy_lock);
492
493	xfrm_hash_free(odst, (hmask + 1) * sizeof(struct hlist_head));
494}
495
496static void xfrm_byidx_resize(int total)
497{
498	unsigned int hmask = xfrm_idx_hmask;
499	unsigned int nhashmask = xfrm_new_hash_mask(hmask);
500	unsigned int nsize = (nhashmask + 1) * sizeof(struct hlist_head);
501	struct hlist_head *oidx = xfrm_policy_byidx;
502	struct hlist_head *nidx = xfrm_hash_alloc(nsize);
503	int i;
504
505	if (!nidx)
506		return;
507
508	write_lock_bh(&xfrm_policy_lock);
509
510	for (i = hmask; i >= 0; i--)
511		xfrm_idx_hash_transfer(oidx + i, nidx, nhashmask);
512
513	xfrm_policy_byidx = nidx;
514	xfrm_idx_hmask = nhashmask;
515
516	write_unlock_bh(&xfrm_policy_lock);
517
518	xfrm_hash_free(oidx, (hmask + 1) * sizeof(struct hlist_head));
519}
520
521static inline int xfrm_bydst_should_resize(int dir, int *total)
522{
523	unsigned int cnt = xfrm_policy_count[dir];
524	unsigned int hmask = xfrm_policy_bydst[dir].hmask;
525
526	if (total)
527		*total += cnt;
528
529	if ((hmask + 1) < xfrm_policy_hashmax &&
530	    cnt > hmask)
531		return 1;
532
533	return 0;
534}
535
536static inline int xfrm_byidx_should_resize(int total)
537{
538	unsigned int hmask = xfrm_idx_hmask;
539
540	if ((hmask + 1) < xfrm_policy_hashmax &&
541	    total > hmask)
542		return 1;
543
544	return 0;
545}
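
/* The tables grow (they never shrink) once the number of entries exceeds
 * the number of buckets, capped at xfrm_policy_hashmax buckets.  Growth
 * is kicked off opportunistically from the insert paths via
 * schedule_work(&xfrm_hash_work).
 */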

static DEFINE_MUTEX(hash_resize_mutex);

static void xfrm_hash_resize(void *__unused)
{
	int dir, total;

	mutex_lock(&hash_resize_mutex);

	total = 0;
	for (dir = 0; dir < XFRM_POLICY_MAX * 2; dir++) {
		if (xfrm_bydst_should_resize(dir, &total))
			xfrm_bydst_resize(dir);
	}
	if (xfrm_byidx_should_resize(total))
		xfrm_byidx_resize(total);

	mutex_unlock(&hash_resize_mutex);
}

static DECLARE_WORK(xfrm_hash_work, xfrm_hash_resize, NULL);

/* Generate new index... KAME seems to generate them ordered by cost
 * of an absolute unpredictability of ordering of rules. This will not pass. */
static u32 xfrm_gen_index(u8 type, int dir)
{
	static u32 idx_generator;

	for (;;) {
		struct hlist_node *entry;
		struct hlist_head *list;
		struct xfrm_policy *p;
		u32 idx;
		int found;

		idx = (idx_generator | dir);
		idx_generator += 8;
		if (idx == 0)
			idx = 8;
		list = xfrm_policy_byidx + idx_hash(idx);
		found = 0;
		hlist_for_each_entry(p, entry, list, byidx) {
			if (p->index == idx) {
				found = 1;
				break;
			}
		}
		if (!found)
			return idx;
	}
}
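
/* Generated indices advance in steps of 8 with the direction folded into
 * the low bits, which is what lets xfrm_policy_id2dir() recover the
 * direction from an index alone.
 */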

static inline int selector_cmp(struct xfrm_selector *s1, struct xfrm_selector *s2)
{
	u32 *p1 = (u32 *) s1;
	u32 *p2 = (u32 *) s2;
	int len = sizeof(struct xfrm_selector) / sizeof(u32);
	int i;

	for (i = 0; i < len; i++) {
		if (p1[i] != p2[i])
			return 1;
	}

	return 0;
}
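
/* The word-by-word compare above treats the selector as opaque memory;
 * it relies on selectors being copied around whole, with any padding
 * bytes zeroed consistently.
 */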

int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
{
	struct xfrm_policy *pol;
	struct xfrm_policy *delpol;
	struct hlist_head *chain;
	struct hlist_node *entry, *newpos, *last;
	struct dst_entry *gc_list;

	write_lock_bh(&xfrm_policy_lock);
	chain = policy_hash_bysel(&policy->selector, policy->family, dir);
	delpol = NULL;
	newpos = NULL;
	last = NULL;
	hlist_for_each_entry(pol, entry, chain, bydst) {
		if (!delpol &&
		    pol->type == policy->type &&
		    !selector_cmp(&pol->selector, &policy->selector) &&
		    xfrm_sec_ctx_match(pol->security, policy->security)) {
			if (excl) {
				write_unlock_bh(&xfrm_policy_lock);
				return -EEXIST;
			}
			delpol = pol;
			if (policy->priority > pol->priority)
				continue;
		} else if (policy->priority >= pol->priority) {
			last = &pol->bydst;
			continue;
		}
		if (!newpos)
			newpos = &pol->bydst;
		if (delpol)
			break;
		last = &pol->bydst;
	}
	if (!newpos)
		newpos = last;
	if (newpos)
		hlist_add_after(newpos, &policy->bydst);
	else
		hlist_add_head(&policy->bydst, chain);
	xfrm_pol_hold(policy);
	xfrm_policy_count[dir]++;
	atomic_inc(&flow_cache_genid);
	if (delpol) {
		hlist_del(&delpol->bydst);
		hlist_del(&delpol->byidx);
		xfrm_policy_count[dir]--;
	}
	policy->index = delpol ? delpol->index : xfrm_gen_index(policy->type, dir);
	hlist_add_head(&policy->byidx, xfrm_policy_byidx+idx_hash(policy->index));
	policy->curlft.add_time = (unsigned long)xtime.tv_sec;
	policy->curlft.use_time = 0;
	if (!mod_timer(&policy->timer, jiffies + HZ))
		xfrm_pol_hold(policy);
	write_unlock_bh(&xfrm_policy_lock);

	if (delpol)
		xfrm_policy_kill(delpol);
	else if (xfrm_bydst_should_resize(dir, NULL))
		schedule_work(&xfrm_hash_work);

	read_lock_bh(&xfrm_policy_lock);
	gc_list = NULL;
	entry = &policy->bydst;
	hlist_for_each_entry_continue(policy, entry, bydst) {
		struct dst_entry *dst;

		write_lock(&policy->lock);
		dst = policy->bundles;
		if (dst) {
			struct dst_entry *tail = dst;
			while (tail->next)
				tail = tail->next;
			tail->next = gc_list;
			gc_list = dst;

			policy->bundles = NULL;
		}
		write_unlock(&policy->lock);
	}
	read_unlock_bh(&xfrm_policy_lock);

	while (gc_list) {
		struct dst_entry *dst = gc_list;

		gc_list = dst->next;
		dst_free(dst);
	}

	return 0;
}
EXPORT_SYMBOL(xfrm_policy_insert);

struct xfrm_policy *xfrm_policy_bysel_ctx(u8 type, int dir,
					  struct xfrm_selector *sel,
					  struct xfrm_sec_ctx *ctx, int delete)
{
	struct xfrm_policy *pol, *ret;
	struct hlist_head *chain;
	struct hlist_node *entry;

	write_lock_bh(&xfrm_policy_lock);
	chain = policy_hash_bysel(sel, sel->family, dir);
	ret = NULL;
	hlist_for_each_entry(pol, entry, chain, bydst) {
		if (pol->type == type &&
		    !selector_cmp(sel, &pol->selector) &&
		    xfrm_sec_ctx_match(ctx, pol->security)) {
			xfrm_pol_hold(pol);
			if (delete) {
				hlist_del(&pol->bydst);
				hlist_del(&pol->byidx);
				xfrm_policy_count[dir]--;
			}
			ret = pol;
			break;
		}
	}
	write_unlock_bh(&xfrm_policy_lock);

	if (ret && delete) {
		atomic_inc(&flow_cache_genid);
		xfrm_policy_kill(ret);
	}
	return ret;
}
EXPORT_SYMBOL(xfrm_policy_bysel_ctx);

struct xfrm_policy *xfrm_policy_byid(u8 type, int dir, u32 id, int delete)
{
	struct xfrm_policy *pol, *ret;
	struct hlist_head *chain;
	struct hlist_node *entry;

	write_lock_bh(&xfrm_policy_lock);
	chain = xfrm_policy_byidx + idx_hash(id);
	ret = NULL;
	hlist_for_each_entry(pol, entry, chain, byidx) {
		if (pol->type == type && pol->index == id) {
			xfrm_pol_hold(pol);
			if (delete) {
				hlist_del(&pol->bydst);
				hlist_del(&pol->byidx);
				xfrm_policy_count[dir]--;
			}
			ret = pol;
			break;
		}
	}
	write_unlock_bh(&xfrm_policy_lock);

	if (ret && delete) {
		atomic_inc(&flow_cache_genid);
		xfrm_policy_kill(ret);
	}
	return ret;
}
EXPORT_SYMBOL(xfrm_policy_byid);

void xfrm_policy_flush(u8 type)
{
	int dir;

	write_lock_bh(&xfrm_policy_lock);
	for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
		struct xfrm_policy *pol;
		struct hlist_node *entry;
		int i, killed;

		killed = 0;
	again1:
		hlist_for_each_entry(pol, entry,
				     &xfrm_policy_inexact[dir], bydst) {
			if (pol->type != type)
				continue;
			hlist_del(&pol->bydst);
			hlist_del(&pol->byidx);
			write_unlock_bh(&xfrm_policy_lock);

			xfrm_policy_kill(pol);
			killed++;

			write_lock_bh(&xfrm_policy_lock);
			goto again1;
		}

		for (i = xfrm_policy_bydst[dir].hmask; i >= 0; i--) {
	again2:
			hlist_for_each_entry(pol, entry,
					     xfrm_policy_bydst[dir].table + i,
					     bydst) {
				if (pol->type != type)
					continue;
				hlist_del(&pol->bydst);
				hlist_del(&pol->byidx);
				write_unlock_bh(&xfrm_policy_lock);

				xfrm_policy_kill(pol);
				killed++;

				write_lock_bh(&xfrm_policy_lock);
				goto again2;
			}
		}

		xfrm_policy_count[dir] -= killed;
	}
	atomic_inc(&flow_cache_genid);
	write_unlock_bh(&xfrm_policy_lock);
}
EXPORT_SYMBOL(xfrm_policy_flush);

int xfrm_policy_walk(u8 type, int (*func)(struct xfrm_policy *, int, int, void*),
		     void *data)
{
	struct xfrm_policy *pol;
	struct hlist_node *entry;
	int dir, count, error;

	read_lock_bh(&xfrm_policy_lock);
	count = 0;
	for (dir = 0; dir < 2*XFRM_POLICY_MAX; dir++) {
		struct hlist_head *table = xfrm_policy_bydst[dir].table;
		int i;

		hlist_for_each_entry(pol, entry,
				     &xfrm_policy_inexact[dir], bydst) {
			if (pol->type == type)
				count++;
		}
		for (i = xfrm_policy_bydst[dir].hmask; i >= 0; i--) {
			hlist_for_each_entry(pol, entry, table + i, bydst) {
				if (pol->type == type)
					count++;
			}
		}
	}

	if (count == 0) {
		error = -ENOENT;
		goto out;
	}

	for (dir = 0; dir < 2*XFRM_POLICY_MAX; dir++) {
		struct hlist_head *table = xfrm_policy_bydst[dir].table;
		int i;

		hlist_for_each_entry(pol, entry,
				     &xfrm_policy_inexact[dir], bydst) {
			if (pol->type != type)
				continue;
			error = func(pol, dir % XFRM_POLICY_MAX, --count, data);
			if (error)
				goto out;
		}
		for (i = xfrm_policy_bydst[dir].hmask; i >= 0; i--) {
			hlist_for_each_entry(pol, entry, table + i, bydst) {
				if (pol->type != type)
					continue;
				error = func(pol, dir % XFRM_POLICY_MAX, --count, data);
				if (error)
					goto out;
			}
		}
	}
	error = 0;
out:
	read_unlock_bh(&xfrm_policy_lock);
	return error;
}
EXPORT_SYMBOL(xfrm_policy_walk);

/*
 * Find policy to apply to this flow.
 *
 * Returns 0 if policy found, else an -errno.
 */
static int xfrm_policy_match(struct xfrm_policy *pol, struct flowi *fl,
			     u8 type, u16 family, int dir)
{
	struct xfrm_selector *sel = &pol->selector;
	int match, ret = -ESRCH;

	if (pol->family != family ||
	    pol->type != type)
		return ret;

	match = xfrm_selector_match(sel, fl, family);
	if (match)
		ret = security_xfrm_policy_lookup(pol, fl->secid, dir);

	return ret;
}

static struct xfrm_policy *xfrm_policy_lookup_bytype(u8 type, struct flowi *fl,
						     u16 family, u8 dir)
{
	int err;
	struct xfrm_policy *pol, *ret;
	xfrm_address_t *daddr, *saddr;
	struct hlist_node *entry;
	struct hlist_head *chain;
	u32 priority = ~0U;

	daddr = xfrm_flowi_daddr(fl, family);
	saddr = xfrm_flowi_saddr(fl, family);
	if (unlikely(!daddr || !saddr))
		return NULL;

	read_lock_bh(&xfrm_policy_lock);
	chain = policy_hash_direct(daddr, saddr, family, dir);
	ret = NULL;
	hlist_for_each_entry(pol, entry, chain, bydst) {
		err = xfrm_policy_match(pol, fl, type, family, dir);
		if (err) {
			if (err == -ESRCH)
				continue;
			else {
				ret = ERR_PTR(err);
				goto fail;
			}
		} else {
			ret = pol;
			priority = ret->priority;
			break;
		}
	}
	chain = &xfrm_policy_inexact[dir];
	hlist_for_each_entry(pol, entry, chain, bydst) {
		err = xfrm_policy_match(pol, fl, type, family, dir);
		if (err) {
			if (err == -ESRCH)
				continue;
			else {
				ret = ERR_PTR(err);
				goto fail;
			}
		} else if (pol->priority < priority) {
			ret = pol;
			break;
		}
	}
	if (ret)
		xfrm_pol_hold(ret);
fail:
	read_unlock_bh(&xfrm_policy_lock);

	return ret;
}

static int xfrm_policy_lookup(struct flowi *fl, u16 family, u8 dir,
			       void **objp, atomic_t **obj_refp)
{
	struct xfrm_policy *pol;
	int err = 0;

#ifdef CONFIG_XFRM_SUB_POLICY
	pol = xfrm_policy_lookup_bytype(XFRM_POLICY_TYPE_SUB, fl, family, dir);
	if (IS_ERR(pol)) {
		err = PTR_ERR(pol);
		pol = NULL;
	}
	if (pol || err)
		goto end;
#endif
	pol = xfrm_policy_lookup_bytype(XFRM_POLICY_TYPE_MAIN, fl, family, dir);
	if (IS_ERR(pol)) {
		err = PTR_ERR(pol);
		pol = NULL;
	}
#ifdef CONFIG_XFRM_SUB_POLICY
end:
#endif
	if ((*objp = (void *) pol) != NULL)
		*obj_refp = &pol->refcnt;
	return err;
}

static inline int policy_to_flow_dir(int dir)
{
	if (XFRM_POLICY_IN == FLOW_DIR_IN &&
	    XFRM_POLICY_OUT == FLOW_DIR_OUT &&
	    XFRM_POLICY_FWD == FLOW_DIR_FWD)
		return dir;
	switch (dir) {
	default:
	case XFRM_POLICY_IN:
		return FLOW_DIR_IN;
	case XFRM_POLICY_OUT:
		return FLOW_DIR_OUT;
	case XFRM_POLICY_FWD:
		return FLOW_DIR_FWD;
	}
}
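
/* The equality tests at the top are compile-time constants, so when the
 * XFRM_POLICY_* and FLOW_DIR_* values line up the compiler reduces the
 * whole function to "return dir"; the switch is only a fallback.
 */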

static struct xfrm_policy *xfrm_sk_policy_lookup(struct sock *sk, int dir, struct flowi *fl)
{
	struct xfrm_policy *pol;

	read_lock_bh(&xfrm_policy_lock);
	if ((pol = sk->sk_policy[dir]) != NULL) {
		int match = xfrm_selector_match(&pol->selector, fl,
						sk->sk_family);
		int err = 0;

		if (match)
			err = security_xfrm_policy_lookup(pol, fl->secid,
						policy_to_flow_dir(dir));

		if (match && !err)
			xfrm_pol_hold(pol);
		else
			pol = NULL;
	}
	read_unlock_bh(&xfrm_policy_lock);
	return pol;
}

static void __xfrm_policy_link(struct xfrm_policy *pol, int dir)
{
	struct hlist_head *chain = policy_hash_bysel(&pol->selector,
						     pol->family, dir);

	hlist_add_head(&pol->bydst, chain);
	hlist_add_head(&pol->byidx, xfrm_policy_byidx+idx_hash(pol->index));
	xfrm_policy_count[dir]++;
	xfrm_pol_hold(pol);

	if (xfrm_bydst_should_resize(dir, NULL))
		schedule_work(&xfrm_hash_work);
}

static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
						int dir)
{
	if (hlist_unhashed(&pol->bydst))
		return NULL;

	hlist_del(&pol->bydst);
	hlist_del(&pol->byidx);
	xfrm_policy_count[dir]--;

	return pol;
}

int xfrm_policy_delete(struct xfrm_policy *pol, int dir)
{
	write_lock_bh(&xfrm_policy_lock);
	pol = __xfrm_policy_unlink(pol, dir);
	write_unlock_bh(&xfrm_policy_lock);
	if (pol) {
		if (dir < XFRM_POLICY_MAX)
			atomic_inc(&flow_cache_genid);
		xfrm_policy_kill(pol);
		return 0;
	}
	return -ENOENT;
}
EXPORT_SYMBOL(xfrm_policy_delete);

int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol)
{
	struct xfrm_policy *old_pol;

#ifdef CONFIG_XFRM_SUB_POLICY
	if (pol && pol->type != XFRM_POLICY_TYPE_MAIN)
		return -EINVAL;
#endif

	write_lock_bh(&xfrm_policy_lock);
	old_pol = sk->sk_policy[dir];
	sk->sk_policy[dir] = pol;
	if (pol) {
		pol->curlft.add_time = (unsigned long)xtime.tv_sec;
		pol->index = xfrm_gen_index(pol->type, XFRM_POLICY_MAX+dir);
		__xfrm_policy_link(pol, XFRM_POLICY_MAX+dir);
	}
	if (old_pol)
		__xfrm_policy_unlink(old_pol, XFRM_POLICY_MAX+dir);
	write_unlock_bh(&xfrm_policy_lock);

	if (old_pol) {
		xfrm_policy_kill(old_pol);
	}
	return 0;
}

static struct xfrm_policy *clone_policy(struct xfrm_policy *old, int dir)
{
	struct xfrm_policy *newp = xfrm_policy_alloc(GFP_ATOMIC);

	if (newp) {
		newp->selector = old->selector;
		if (security_xfrm_policy_clone(old, newp)) {
			kfree(newp);
			return NULL;  /* ENOMEM */
		}
		newp->lft = old->lft;
		newp->curlft = old->curlft;
		newp->action = old->action;
		newp->flags = old->flags;
		newp->xfrm_nr = old->xfrm_nr;
		newp->index = old->index;
		newp->type = old->type;
		memcpy(newp->xfrm_vec, old->xfrm_vec,
		       newp->xfrm_nr*sizeof(struct xfrm_tmpl));
		write_lock_bh(&xfrm_policy_lock);
		__xfrm_policy_link(newp, XFRM_POLICY_MAX+dir);
		write_unlock_bh(&xfrm_policy_lock);
		xfrm_pol_put(newp);
	}
	return newp;
}

int __xfrm_sk_clone_policy(struct sock *sk)
{
	struct xfrm_policy *p0 = sk->sk_policy[0],
			   *p1 = sk->sk_policy[1];

	sk->sk_policy[0] = sk->sk_policy[1] = NULL;
	if (p0 && (sk->sk_policy[0] = clone_policy(p0, 0)) == NULL)
		return -ENOMEM;
	if (p1 && (sk->sk_policy[1] = clone_policy(p1, 1)) == NULL)
		return -ENOMEM;
	return 0;
}

static int
xfrm_get_saddr(xfrm_address_t *local, xfrm_address_t *remote,
	       unsigned short family)
{
	int err;
	struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);

	if (unlikely(afinfo == NULL))
		return -EINVAL;
	err = afinfo->get_saddr(local, remote);
	xfrm_policy_put_afinfo(afinfo);
	return err;
}

/* Resolve list of templates for the flow, given policy. */

static int
xfrm_tmpl_resolve_one(struct xfrm_policy *policy, struct flowi *fl,
		      struct xfrm_state **xfrm,
		      unsigned short family)
{
	int nx;
	int i, error;
	xfrm_address_t *daddr = xfrm_flowi_daddr(fl, family);
	xfrm_address_t *saddr = xfrm_flowi_saddr(fl, family);
	xfrm_address_t tmp;

	for (nx=0, i = 0; i < policy->xfrm_nr; i++) {
		struct xfrm_state *x;
		xfrm_address_t *remote = daddr;
		xfrm_address_t *local  = saddr;
		struct xfrm_tmpl *tmpl = &policy->xfrm_vec[i];

		if (tmpl->mode == XFRM_MODE_TUNNEL) {
			remote = &tmpl->id.daddr;
			local = &tmpl->saddr;
			if (xfrm_addr_any(local, family)) {
				error = xfrm_get_saddr(&tmp, remote, family);
				if (error)
					goto fail;
				local = &tmp;
			}
		}

		x = xfrm_state_find(remote, local, fl, tmpl, policy, &error, family);

		if (x && x->km.state == XFRM_STATE_VALID) {
			xfrm[nx++] = x;
			daddr = remote;
			saddr = local;
			continue;
		}
		if (x) {
			error = (x->km.state == XFRM_STATE_ERROR ?
				 -EINVAL : -EAGAIN);
			xfrm_state_put(x);
		}

		if (!tmpl->optional)
			goto fail;
	}
	return nx;

fail:
	for (nx--; nx>=0; nx--)
		xfrm_state_put(xfrm[nx]);
	return error;
}

static int
xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, struct flowi *fl,
		  struct xfrm_state **xfrm,
		  unsigned short family)
{
	struct xfrm_state *tp[XFRM_MAX_DEPTH];
	struct xfrm_state **tpp = (npols > 1) ? tp : xfrm;
	int cnx = 0;
	int error;
	int ret;
	int i;

	for (i = 0; i < npols; i++) {
		if (cnx + pols[i]->xfrm_nr >= XFRM_MAX_DEPTH) {
			error = -ENOBUFS;
			goto fail;
		}

		ret = xfrm_tmpl_resolve_one(pols[i], fl, &tpp[cnx], family);
		if (ret < 0) {
			error = ret;
			goto fail;
		} else
			cnx += ret;
	}

	/* found states are sorted for outbound processing */
	if (npols > 1)
		xfrm_state_sort(xfrm, tpp, cnx, family);

	return cnx;

fail:
	for (cnx--; cnx>=0; cnx--)
		xfrm_state_put(tpp[cnx]);
	return error;
}

/* Check that the bundle accepts the flow and its components are
 * still valid.
 */

static struct dst_entry *
xfrm_find_bundle(struct flowi *fl, struct xfrm_policy *policy, unsigned short family)
{
	struct dst_entry *x;
	struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
	if (unlikely(afinfo == NULL))
		return ERR_PTR(-EINVAL);
	x = afinfo->find_bundle(fl, policy);
	xfrm_policy_put_afinfo(afinfo);
	return x;
}

/* Allocate chain of dst_entry's, attach known xfrm's, calculate
 * all the metrics... In short, bundle a bundle.
 */

static int
xfrm_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int nx,
		   struct flowi *fl, struct dst_entry **dst_p,
		   unsigned short family)
{
	int err;
	struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
	if (unlikely(afinfo == NULL))
		return -EINVAL;
	err = afinfo->bundle_create(policy, xfrm, nx, fl, dst_p);
	xfrm_policy_put_afinfo(afinfo);
	return err;
}


static int stale_bundle(struct dst_entry *dst);

/* Main function: finds/creates a bundle for given flow.
 *
 * At the moment we eat a raw IP route. Mostly to speed up lookups
 * on interfaces with disabled IPsec.
 */
int xfrm_lookup(struct dst_entry **dst_p, struct flowi *fl,
		struct sock *sk, int flags)
{
	struct xfrm_policy *policy;
	struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
	int npols;
	int pol_dead;
	int xfrm_nr;
	int pi;
	struct xfrm_state *xfrm[XFRM_MAX_DEPTH];
	struct dst_entry *dst, *dst_orig = *dst_p;
	int nx = 0;
	int err;
	u32 genid;
	u16 family;
	u8 dir = policy_to_flow_dir(XFRM_POLICY_OUT);

restart:
	genid = atomic_read(&flow_cache_genid);
	policy = NULL;
	for (pi = 0; pi < ARRAY_SIZE(pols); pi++)
		pols[pi] = NULL;
	npols = 0;
	pol_dead = 0;
	xfrm_nr = 0;

	if (sk && sk->sk_policy[1])
		policy = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl);

	if (!policy) {
		/* To accelerate a bit...  */
		if ((dst_orig->flags & DST_NOXFRM) ||
		    !xfrm_policy_count[XFRM_POLICY_OUT])
			return 0;

		policy = flow_cache_lookup(fl, dst_orig->ops->family,
					   dir, xfrm_policy_lookup);
		if (IS_ERR(policy))
			return PTR_ERR(policy);
	}

	if (!policy)
		return 0;

	family = dst_orig->ops->family;
	policy->curlft.use_time = (unsigned long)xtime.tv_sec;
	pols[0] = policy;
	npols++;
	xfrm_nr += pols[0]->xfrm_nr;

	switch (policy->action) {
	case XFRM_POLICY_BLOCK:
		/* Prohibit the flow */
		err = -EPERM;
		goto error;

	case XFRM_POLICY_ALLOW:
#ifndef CONFIG_XFRM_SUB_POLICY
		if (policy->xfrm_nr == 0) {
			/* Flow passes not transformed. */
			xfrm_pol_put(policy);
			return 0;
		}
#endif

		/* Try to find matching bundle.
		 *
		 * LATER: help from flow cache. It is optional, this
		 * is required only for output policy.
		 */
		dst = xfrm_find_bundle(fl, policy, family);
		if (IS_ERR(dst)) {
			err = PTR_ERR(dst);
			goto error;
		}

		if (dst)
			break;

#ifdef CONFIG_XFRM_SUB_POLICY
		if (pols[0]->type != XFRM_POLICY_TYPE_MAIN) {
			pols[1] = xfrm_policy_lookup_bytype(XFRM_POLICY_TYPE_MAIN,
							    fl, family,
							    XFRM_POLICY_OUT);
			if (pols[1]) {
				if (IS_ERR(pols[1])) {
					err = PTR_ERR(pols[1]);
					goto error;
				}
				if (pols[1]->action == XFRM_POLICY_BLOCK) {
					err = -EPERM;
					goto error;
				}
				npols++;
				xfrm_nr += pols[1]->xfrm_nr;
			}
		}

		/*
		 * Neither flowi nor bundle information knows about
		 * the transformation template size. With more than one
		 * policy in use, we can only tell whether all of them
		 * are bypasses after they have been searched. Note that
		 * the not-transformed bypass above is likewise guarded
		 * by the non-sub-policy configuration.
		 */
		if (xfrm_nr == 0) {
			/* Flow passes not transformed. */
			xfrm_pols_put(pols, npols);
			return 0;
		}

#endif
		nx = xfrm_tmpl_resolve(pols, npols, fl, xfrm, family);

		if (unlikely(nx<0)) {
			err = nx;
			if (err == -EAGAIN && flags) {
				DECLARE_WAITQUEUE(wait, current);

				add_wait_queue(&km_waitq, &wait);
				set_current_state(TASK_INTERRUPTIBLE);
				schedule();
				set_current_state(TASK_RUNNING);
				remove_wait_queue(&km_waitq, &wait);

				nx = xfrm_tmpl_resolve(pols, npols, fl, xfrm, family);

				if (nx == -EAGAIN && signal_pending(current)) {
					err = -ERESTART;
					goto error;
				}
				if (nx == -EAGAIN ||
				    genid != atomic_read(&flow_cache_genid)) {
					xfrm_pols_put(pols, npols);
					goto restart;
				}
				err = nx;
			}
			if (err < 0)
				goto error;
		}
		if (nx == 0) {
			/* Flow passes not transformed. */
			xfrm_pols_put(pols, npols);
			return 0;
		}

		dst = dst_orig;
		err = xfrm_bundle_create(policy, xfrm, nx, fl, &dst, family);

		if (unlikely(err)) {
			int i;
			for (i=0; i<nx; i++)
				xfrm_state_put(xfrm[i]);
			goto error;
		}

		for (pi = 0; pi < npols; pi++) {
			read_lock_bh(&pols[pi]->lock);
			pol_dead |= pols[pi]->dead;
			read_unlock_bh(&pols[pi]->lock);
		}

		write_lock_bh(&policy->lock);
		if (unlikely(pol_dead || stale_bundle(dst))) {
			/* Wow! While we worked on resolving, this
			 * policy has gone. Retry. It is not paranoia,
			 * we just cannot enlist a new bundle to a dead
			 * object. We can't enlist stale bundles either.
			 */
			write_unlock_bh(&policy->lock);
			if (dst)
				dst_free(dst);

			err = -EHOSTUNREACH;
			goto error;
		}
		dst->next = policy->bundles;
		policy->bundles = dst;
		dst_hold(dst);
		write_unlock_bh(&policy->lock);
	}
	*dst_p = dst;
	dst_release(dst_orig);
	xfrm_pols_put(pols, npols);
	return 0;

error:
	dst_release(dst_orig);
	xfrm_pols_put(pols, npols);
	*dst_p = NULL;
	return err;
}
EXPORT_SYMBOL(xfrm_lookup);

static inline int
xfrm_secpath_reject(int idx, struct sk_buff *skb, struct flowi *fl)
{
	struct xfrm_state *x;
	int err;

	if (!skb->sp || idx < 0 || idx >= skb->sp->len)
		return 0;
	x = skb->sp->xvec[idx];
	if (!x->type->reject)
		return 0;
	xfrm_state_hold(x);
	err = x->type->reject(x, skb, fl);
	xfrm_state_put(x);
	return err;
}

/* When skb is transformed back to its "native" form, we have to
 * check policy restrictions. At the moment we do this in a maximally
 * stupid way. Shame on me. :-) Of course, connected sockets must
 * have policy cached at them.
 */

static inline int
xfrm_state_ok(struct xfrm_tmpl *tmpl, struct xfrm_state *x,
	      unsigned short family)
{
	if (xfrm_state_kern(x))
		return tmpl->optional && !xfrm_state_addr_cmp(tmpl, x, family);
	return	x->id.proto == tmpl->id.proto &&
		(x->id.spi == tmpl->id.spi || !tmpl->id.spi) &&
		(x->props.reqid == tmpl->reqid || !tmpl->reqid) &&
		x->props.mode == tmpl->mode &&
		((tmpl->aalgos & (1<<x->props.aalgo)) ||
		 !(xfrm_id_proto_match(tmpl->id.proto, IPSEC_PROTO_ANY))) &&
		!(x->props.mode != XFRM_MODE_TRANSPORT &&
		  xfrm_state_addr_cmp(tmpl, x, family));
}

/*
 * 0 or more than 0 is returned when validation succeeds (either a bypass
 * because of an optional transport-mode template, or the next index of
 * the secpath state matched against the template).
 * -1 is returned when no matching template is found.
 * Otherwise "-2 - errored_index" is returned.
 */
static inline int
xfrm_policy_ok(struct xfrm_tmpl *tmpl, struct sec_path *sp, int start,
	       unsigned short family)
{
	int idx = start;

	if (tmpl->optional) {
		if (tmpl->mode == XFRM_MODE_TRANSPORT)
			return start;
	} else
		start = -1;
	for (; idx < sp->len; idx++) {
		if (xfrm_state_ok(tmpl, sp->xvec[idx], family))
			return ++idx;
		if (sp->xvec[idx]->props.mode != XFRM_MODE_TRANSPORT) {
			if (start == -1)
				start = -2-idx;
			break;
		}
	}
	return start;
}
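
/* Example: a single non-optional template that matches sp->xvec[0]
 * makes xfrm_policy_ok() return 1 (the next index to scan); no match
 * at all yields -1; a mismatching non-transport state at index k
 * yields -2-k so the offending state can be rejected.
 */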

int
xfrm_decode_session(struct sk_buff *skb, struct flowi *fl, unsigned short family)
{
	struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
	int err;

	if (unlikely(afinfo == NULL))
		return -EAFNOSUPPORT;

	afinfo->decode_session(skb, fl);
	err = security_xfrm_decode_session(skb, &fl->secid);
	xfrm_policy_put_afinfo(afinfo);
	return err;
}
EXPORT_SYMBOL(xfrm_decode_session);

static inline int secpath_has_nontransport(struct sec_path *sp, int k, int *idxp)
{
	for (; k < sp->len; k++) {
		if (sp->xvec[k]->props.mode != XFRM_MODE_TRANSPORT) {
			*idxp = k;
			return 1;
		}
	}

	return 0;
}
1581
1582int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
1583			unsigned short family)
1584{
1585	struct xfrm_policy *pol;
1586	struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
1587	int npols = 0;
1588	int xfrm_nr;
1589	int pi;
1590	struct flowi fl;
1591	u8 fl_dir = policy_to_flow_dir(dir);
1592	int xerr_idx = -1;
1593
1594	if (xfrm_decode_session(skb, &fl, family) < 0)
1595		return 0;
1596	nf_nat_decode_session(skb, &fl, family);
1597
1598	/* First, check used SA against their selectors. */
1599	if (skb->sp) {
1600		int i;
1601
1602		for (i=skb->sp->len-1; i>=0; i--) {
1603			struct xfrm_state *x = skb->sp->xvec[i];
1604			if (!xfrm_selector_match(&x->sel, &fl, family))
1605				return 0;
1606		}
1607	}
1608
1609	pol = NULL;
1610	if (sk && sk->sk_policy[dir])
1611		pol = xfrm_sk_policy_lookup(sk, dir, &fl);
1612
1613	if (!pol)
1614		pol = flow_cache_lookup(&fl, family, fl_dir,
1615					xfrm_policy_lookup);
1616
1617	if (IS_ERR(pol))
1618		return 0;
1619
1620	if (!pol) {
1621		if (skb->sp && secpath_has_nontransport(skb->sp, 0, &xerr_idx)) {
1622			xfrm_secpath_reject(xerr_idx, skb, &fl);
1623			return 0;
1624		}
1625		return 1;
1626	}
1627
1628	pol->curlft.use_time = (unsigned long)xtime.tv_sec;
1629
1630	pols[0] = pol;
1631	npols ++;
1632#ifdef CONFIG_XFRM_SUB_POLICY
1633	if (pols[0]->type != XFRM_POLICY_TYPE_MAIN) {
1634		pols[1] = xfrm_policy_lookup_bytype(XFRM_POLICY_TYPE_MAIN,
1635						    &fl, family,
1636						    XFRM_POLICY_IN);
1637		if (pols[1]) {
1638			if (IS_ERR(pols[1]))
1639				return 0;
1640			pols[1]->curlft.use_time = (unsigned long)xtime.tv_sec;
1641			npols ++;
1642		}
1643	}
1644#endif
1645
1646	if (pol->action == XFRM_POLICY_ALLOW) {
1647		struct sec_path *sp;
1648		static struct sec_path dummy;
1649		struct xfrm_tmpl *tp[XFRM_MAX_DEPTH];
1650		struct xfrm_tmpl *stp[XFRM_MAX_DEPTH];
1651		struct xfrm_tmpl **tpp = tp;
1652		int ti = 0;
1653		int i, k;
1654
1655		if ((sp = skb->sp) == NULL)
1656			sp = &dummy;
1657
1658		for (pi = 0; pi < npols; pi++) {
1659			if (pols[pi] != pol &&
1660			    pols[pi]->action != XFRM_POLICY_ALLOW)
1661				goto reject;
1662			if (ti + pols[pi]->xfrm_nr >= XFRM_MAX_DEPTH)
1663				goto reject_error;
1664			for (i = 0; i < pols[pi]->xfrm_nr; i++)
1665				tpp[ti++] = &pols[pi]->xfrm_vec[i];
1666		}
1667		xfrm_nr = ti;
1668		if (npols > 1) {
1669			xfrm_tmpl_sort(stp, tpp, xfrm_nr, family);
1670			tpp = stp;
1671		}
1672
1673		/* For each tunnel xfrm, find the first matching tmpl.
1674		 * For each tmpl before that, find corresponding xfrm.
1675		 * Order is _important_. Later we will implement
1676		 * some barriers, but at the moment barriers
1677		 * are implied between each two transformations.
1678		 */
1679		for (i = xfrm_nr-1, k = 0; i >= 0; i--) {
1680			k = xfrm_policy_ok(tpp[i], sp, k, family);
1681			if (k < 0) {
1682				if (k < -1)
1683					/* "-2 - errored_index" returned */
1684					xerr_idx = -(2+k);
1685				goto reject;
1686			}
1687		}
1688
1689		if (secpath_has_nontransport(sp, k, &xerr_idx))
1690			goto reject;
1691
1692		xfrm_pols_put(pols, npols);
1693		return 1;
1694	}
1695
1696reject:
1697	xfrm_secpath_reject(xerr_idx, skb, &fl);
1698reject_error:
1699	xfrm_pols_put(pols, npols);
1700	return 0;
1701}
1702EXPORT_SYMBOL(__xfrm_policy_check);

int __xfrm_route_forward(struct sk_buff *skb, unsigned short family)
{
	struct flowi fl;

	if (xfrm_decode_session(skb, &fl, family) < 0)
		return 0;

	return xfrm_lookup(&skb->dst, &fl, NULL, 0) == 0;
}
EXPORT_SYMBOL(__xfrm_route_forward);

/* Optimize later using cookies and generation ids. */

static struct dst_entry *xfrm_dst_check(struct dst_entry *dst, u32 cookie)
{
	/* Code (such as __xfrm4_bundle_create()) sets dst->obsolete
	 * to "-1" to force all XFRM destinations to get validated by
	 * dst_ops->check on every use.  We do this because when a
	 * normal route referenced by an XFRM dst is obsoleted we do
	 * not go looking around for all parent referencing XFRM dsts
	 * so that we can invalidate them.  It is just too much work.
	 * Instead we make the checks here on every use.  For example:
	 *
	 *	XFRM dst A --> IPv4 dst X
	 *
	 * X is the "xdst->route" of A (X is also the "dst->path" of A
	 * in this example).  If X is marked obsolete, "A" will not
	 * notice.  That's what we are validating here via the
	 * stale_bundle() check.
	 *
	 * When a policy's bundle is pruned, we dst_free() the XFRM
	 * dst which causes its ->obsolete field to be set to a
	 * positive non-zero integer.  If an XFRM dst has been pruned
	 * like this, we want to force a new route lookup.
	 */
	if (dst->obsolete < 0 && !stale_bundle(dst))
		return dst;

	return NULL;
}

static int stale_bundle(struct dst_entry *dst)
{
	return !xfrm_bundle_ok((struct xfrm_dst *)dst, NULL, AF_UNSPEC, 0);
}

void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev)
{
	while ((dst = dst->child) && dst->xfrm && dst->dev == dev) {
		dst->dev = &loopback_dev;
		dev_hold(&loopback_dev);
		dev_put(dev);
	}
}
EXPORT_SYMBOL(xfrm_dst_ifdown);

static void xfrm_link_failure(struct sk_buff *skb)
{
	/* Impossible. Such dst must be popped before it reaches the
	 * point of failure. */
	return;
}

static struct dst_entry *xfrm_negative_advice(struct dst_entry *dst)
{
	if (dst) {
		if (dst->obsolete) {
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}

static void prune_one_bundle(struct xfrm_policy *pol, int (*func)(struct dst_entry *), struct dst_entry **gc_list_p)
{
	struct dst_entry *dst, **dstp;

	write_lock(&pol->lock);
	dstp = &pol->bundles;
	while ((dst=*dstp) != NULL) {
		if (func(dst)) {
			*dstp = dst->next;
			dst->next = *gc_list_p;
			*gc_list_p = dst;
		} else {
			dstp = &dst->next;
		}
	}
	write_unlock(&pol->lock);
}

static void xfrm_prune_bundles(int (*func)(struct dst_entry *))
{
	struct dst_entry *gc_list = NULL;
	int dir;

	read_lock_bh(&xfrm_policy_lock);
	for (dir = 0; dir < XFRM_POLICY_MAX * 2; dir++) {
		struct xfrm_policy *pol;
		struct hlist_node *entry;
		struct hlist_head *table;
		int i;

		hlist_for_each_entry(pol, entry,
				     &xfrm_policy_inexact[dir], bydst)
			prune_one_bundle(pol, func, &gc_list);

		table = xfrm_policy_bydst[dir].table;
		for (i = xfrm_policy_bydst[dir].hmask; i >= 0; i--) {
			hlist_for_each_entry(pol, entry, table + i, bydst)
				prune_one_bundle(pol, func, &gc_list);
		}
	}
	read_unlock_bh(&xfrm_policy_lock);

	while (gc_list) {
		struct dst_entry *dst = gc_list;
		gc_list = dst->next;
		dst_free(dst);
	}
}

static int unused_bundle(struct dst_entry *dst)
{
	return !atomic_read(&dst->__refcnt);
}

static void __xfrm_garbage_collect(void)
{
	xfrm_prune_bundles(unused_bundle);
}

static int xfrm_flush_bundles(void)
{
	xfrm_prune_bundles(stale_bundle);
	return 0;
}

void xfrm_init_pmtu(struct dst_entry *dst)
{
	do {
		struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
		u32 pmtu, route_mtu_cached;

		pmtu = dst_mtu(dst->child);
		xdst->child_mtu_cached = pmtu;

		pmtu = xfrm_state_mtu(dst->xfrm, pmtu);

		route_mtu_cached = dst_mtu(xdst->route);
		xdst->route_mtu_cached = route_mtu_cached;

		if (pmtu > route_mtu_cached)
			pmtu = route_mtu_cached;

		dst->metrics[RTAX_MTU-1] = pmtu;
	} while ((dst = dst->next));
}

EXPORT_SYMBOL(xfrm_init_pmtu);

/* Check that the bundle accepts the flow and its components are
 * still valid.
 */
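
/* Besides validating, xfrm_bundle_ok() refreshes the cached child and
 * route MTUs and, if any changed, walks back up from the deepest
 * changed dst propagating the minimum into each dst's RTAX_MTU metric.
 */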

int xfrm_bundle_ok(struct xfrm_dst *first, struct flowi *fl, int family, int strict)
{
	struct dst_entry *dst = &first->u.dst;
	struct xfrm_dst *last;
	u32 mtu;

	if (!dst_check(dst->path, ((struct xfrm_dst *)dst)->path_cookie) ||
	    (dst->dev && !netif_running(dst->dev)))
		return 0;

	last = NULL;

	do {
		struct xfrm_dst *xdst = (struct xfrm_dst *)dst;

		if (fl && !xfrm_selector_match(&dst->xfrm->sel, fl, family))
			return 0;
		if (fl && !security_xfrm_flow_state_match(fl, dst->xfrm))
			return 0;
		if (dst->xfrm->km.state != XFRM_STATE_VALID)
			return 0;
		if (xdst->genid != dst->xfrm->genid)
			return 0;

		if (strict && fl && dst->xfrm->props.mode != XFRM_MODE_TUNNEL &&
		    !xfrm_state_addr_flow_check(dst->xfrm, fl, family))
			return 0;

		mtu = dst_mtu(dst->child);
		if (xdst->child_mtu_cached != mtu) {
			last = xdst;
			xdst->child_mtu_cached = mtu;
		}

		if (!dst_check(xdst->route, xdst->route_cookie))
			return 0;
		mtu = dst_mtu(xdst->route);
		if (xdst->route_mtu_cached != mtu) {
			last = xdst;
			xdst->route_mtu_cached = mtu;
		}

		dst = dst->child;
	} while (dst->xfrm);

	if (likely(!last))
		return 1;

	mtu = last->child_mtu_cached;
	for (;;) {
		dst = &last->u.dst;

		mtu = xfrm_state_mtu(dst->xfrm, mtu);
		if (mtu > last->route_mtu_cached)
			mtu = last->route_mtu_cached;
		dst->metrics[RTAX_MTU-1] = mtu;

		if (last == first)
			break;

		last = last->u.next;
		last->child_mtu_cached = mtu;
	}

	return 1;
}

EXPORT_SYMBOL(xfrm_bundle_ok);

int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo)
{
	int err = 0;
	if (unlikely(afinfo == NULL))
		return -EINVAL;
	if (unlikely(afinfo->family >= NPROTO))
		return -EAFNOSUPPORT;
	write_lock_bh(&xfrm_policy_afinfo_lock);
	if (unlikely(xfrm_policy_afinfo[afinfo->family] != NULL))
		err = -ENOBUFS;
	else {
		struct dst_ops *dst_ops = afinfo->dst_ops;
		if (likely(dst_ops->kmem_cachep == NULL))
			dst_ops->kmem_cachep = xfrm_dst_cache;
		if (likely(dst_ops->check == NULL))
			dst_ops->check = xfrm_dst_check;
		if (likely(dst_ops->negative_advice == NULL))
			dst_ops->negative_advice = xfrm_negative_advice;
		if (likely(dst_ops->link_failure == NULL))
			dst_ops->link_failure = xfrm_link_failure;
		if (likely(afinfo->garbage_collect == NULL))
			afinfo->garbage_collect = __xfrm_garbage_collect;
		xfrm_policy_afinfo[afinfo->family] = afinfo;
	}
	write_unlock_bh(&xfrm_policy_afinfo_lock);
	return err;
}
EXPORT_SYMBOL(xfrm_policy_register_afinfo);

int xfrm_policy_unregister_afinfo(struct xfrm_policy_afinfo *afinfo)
{
	int err = 0;
	if (unlikely(afinfo == NULL))
		return -EINVAL;
	if (unlikely(afinfo->family >= NPROTO))
		return -EAFNOSUPPORT;
	write_lock_bh(&xfrm_policy_afinfo_lock);
	if (likely(xfrm_policy_afinfo[afinfo->family] != NULL)) {
		if (unlikely(xfrm_policy_afinfo[afinfo->family] != afinfo))
			err = -EINVAL;
		else {
			struct dst_ops *dst_ops = afinfo->dst_ops;
			xfrm_policy_afinfo[afinfo->family] = NULL;
			dst_ops->kmem_cachep = NULL;
			dst_ops->check = NULL;
			dst_ops->negative_advice = NULL;
			dst_ops->link_failure = NULL;
			afinfo->garbage_collect = NULL;
		}
	}
	write_unlock_bh(&xfrm_policy_afinfo_lock);
	return err;
}
EXPORT_SYMBOL(xfrm_policy_unregister_afinfo);

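/* xfrm_policy_get_afinfo() returns with xfrm_policy_afinfo_lock
 * read-held on success (it drops the lock itself only on failure);
 * xfrm_policy_put_afinfo() releases it.  The _lock/_unlock pair below
 * does the same with the write lock, for update-side callers such as
 * the type/mode map changes above.
 */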
static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family)
{
	struct xfrm_policy_afinfo *afinfo;
	if (unlikely(family >= NPROTO))
		return NULL;
	read_lock(&xfrm_policy_afinfo_lock);
	afinfo = xfrm_policy_afinfo[family];
	if (unlikely(!afinfo))
		read_unlock(&xfrm_policy_afinfo_lock);
	return afinfo;
}

static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo)
{
	read_unlock(&xfrm_policy_afinfo_lock);
}

static struct xfrm_policy_afinfo *xfrm_policy_lock_afinfo(unsigned int family)
{
	struct xfrm_policy_afinfo *afinfo;
	if (unlikely(family >= NPROTO))
		return NULL;
	write_lock_bh(&xfrm_policy_afinfo_lock);
	afinfo = xfrm_policy_afinfo[family];
	if (unlikely(!afinfo))
		write_unlock_bh(&xfrm_policy_afinfo_lock);
	return afinfo;
}

static void xfrm_policy_unlock_afinfo(struct xfrm_policy_afinfo *afinfo)
{
	write_unlock_bh(&xfrm_policy_afinfo_lock);
}
2026
2027static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void *ptr)
2028{
2029	switch (event) {
2030	case NETDEV_DOWN:
2031		xfrm_flush_bundles();
2032	}
2033	return NOTIFY_DONE;
2034}
2035
2036static struct notifier_block xfrm_dev_notifier = {
2037	xfrm_dev_event,
2038	NULL,
2039	0
2040};
2041
2042static void __init xfrm_policy_init(void)
2043{
2044	unsigned int hmask, sz;
2045	int dir;
2046
2047	xfrm_dst_cache = kmem_cache_create("xfrm_dst_cache",
2048					   sizeof(struct xfrm_dst),
2049					   0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
2050					   NULL, NULL);
2051
2052	hmask = 8 - 1;
2053	sz = (hmask+1) * sizeof(struct hlist_head);
2054
2055	xfrm_policy_byidx = xfrm_hash_alloc(sz);
2056	xfrm_idx_hmask = hmask;
2057	if (!xfrm_policy_byidx)
2058		panic("XFRM: failed to allocate byidx hash\n");
2059
2060	for (dir = 0; dir < XFRM_POLICY_MAX * 2; dir++) {
2061		struct xfrm_policy_hash *htab;
2062
2063		INIT_HLIST_HEAD(&xfrm_policy_inexact[dir]);
2064
2065		htab = &xfrm_policy_bydst[dir];
2066		htab->table = xfrm_hash_alloc(sz);
2067		htab->hmask = hmask;
2068		if (!htab->table)
2069			panic("XFRM: failed to allocate bydst hash\n");
2070	}
2071
2072	INIT_WORK(&xfrm_policy_gc_work, xfrm_policy_gc_task, NULL);
2073	register_netdevice_notifier(&xfrm_dev_notifier);
2074}
2075
2076void __init xfrm_init(void)
2077{
2078	xfrm_state_init();
2079	xfrm_policy_init();
2080	xfrm_input_init();
2081}
2082
2083