1/*
2 * net/sched/cls_flow.c		Generic flow classifier
3 *
4 * Copyright (c) 2007, 2008 Patrick McHardy <kaber@trash.net>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version 2
9 * of the License, or (at your option) any later version.
10 */
11
12#include <linux/kernel.h>
13#include <linux/init.h>
14#include <linux/list.h>
15#include <linux/jhash.h>
16#include <linux/random.h>
17#include <linux/pkt_cls.h>
18#include <linux/skbuff.h>
19#include <linux/in.h>
20#include <linux/ip.h>
21#include <linux/ipv6.h>
22#include <linux/if_vlan.h>
23#include <linux/slab.h>
24#include <linux/module.h>
25
26#include <net/pkt_cls.h>
27#include <net/ip.h>
28#include <net/route.h>
29#include <net/flow_keys.h>
30
31#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
32#include <net/netfilter/nf_conntrack.h>
33#endif
34
/* Root object stored in tcf_proto->root: the list of installed flow filters. */
struct flow_head {
	struct list_head	filters;	/* flow_filter entries, linked via ->list */
};
38
/* One configured flow filter; lives on flow_head.filters. */
struct flow_filter {
	struct list_head	list;		/* linkage in flow_head.filters */
	struct tcf_exts		exts;		/* extensions executed after a match */
	struct tcf_ematch_tree	ematches;	/* ematch tree checked before classifying */
	struct timer_list	perturb_timer;	/* runs flow_perturbation() */
	u32			perturb_period;	/* timer period in jiffies, 0 = no perturbation */
	u32			handle;		/* filter handle used for lookup in flow_get() */

	u32			nkeys;		/* number of bits set in keymask */
	u32			keymask;	/* bitmask of (1 << FLOW_KEY_*) selectors */
	u32			mode;		/* FLOW_MODE_MAP or FLOW_MODE_HASH */
	u32			mask;		/* map mode: AND mask applied to the key */
	u32			xor;		/* map mode: XOR applied after the mask */
	u32			rshift;		/* map mode: right shift applied after the XOR */
	u32			addend;		/* map mode: value added after the shift */
	u32			divisor;	/* if non-zero, classid is reduced modulo this */
	u32			baseclass;	/* class id the computed value is added to */
	u32			hashrnd;	/* hash mode: jhash2() seed, re-randomised by the timer */
};
58
/* Netlink attribute types that carry this classifier's action and police extensions. */
static const struct tcf_ext_map flow_ext_map = {
	.action	= TCA_FLOW_ACT,
	.police	= TCA_FLOW_POLICE,
};
63
64static inline u32 addr_fold(void *addr)
65{
66	unsigned long a = (unsigned long)addr;
67
68	return (a & 0xFFFFFFFF) ^ (BITS_PER_LONG > 32 ? a >> 32 : 0);
69}
70
71static u32 flow_get_src(const struct sk_buff *skb, const struct flow_keys *flow)
72{
73	if (flow->src)
74		return ntohl(flow->src);
75	return addr_fold(skb->sk);
76}
77
78static u32 flow_get_dst(const struct sk_buff *skb, const struct flow_keys *flow)
79{
80	if (flow->dst)
81		return ntohl(flow->dst);
82	return addr_fold(skb_dst(skb)) ^ (__force u16)skb->protocol;
83}
84
/* Protocol key: the ip_proto value recorded in the dissected flow keys. */
static u32 flow_get_proto(const struct sk_buff *skb, const struct flow_keys *flow)
{
	return flow->ip_proto;
}
89
90static u32 flow_get_proto_src(const struct sk_buff *skb, const struct flow_keys *flow)
91{
92	if (flow->ports)
93		return ntohs(flow->port16[0]);
94
95	return addr_fold(skb->sk);
96}
97
98static u32 flow_get_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow)
99{
100	if (flow->ports)
101		return ntohs(flow->port16[1]);
102
103	return addr_fold(skb_dst(skb)) ^ (__force u16)skb->protocol;
104}
105
/* Input interface key: the skb's skb_iif field. */
static u32 flow_get_iif(const struct sk_buff *skb)
{
	return skb->skb_iif;
}
110
/* Priority key: the skb's priority field. */
static u32 flow_get_priority(const struct sk_buff *skb)
{
	return skb->priority;
}
115
/* Mark key: the skb's mark field. */
static u32 flow_get_mark(const struct sk_buff *skb)
{
	return skb->mark;
}
120
121static u32 flow_get_nfct(const struct sk_buff *skb)
122{
123#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
124	return addr_fold(skb->nfct);
125#else
126	return 0;
127#endif
128}
129
/*
 * CTTUPLE(skb, member) - read a member of the packet's conntrack tuple.
 *
 * Evaluates to tuple.<member> of the tuplehash entry selected by
 * CTINFO2DIR(ctinfo) for the conntrack returned by nf_ct_get().
 * If nf_ct_get() returns NULL, or unconditionally when connection
 * tracking is not built, control jumps to a label named "fallback",
 * which every function using this macro must therefore define.
 */
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
#define CTTUPLE(skb, member)						\
({									\
	enum ip_conntrack_info ctinfo;					\
	const struct nf_conn *ct = nf_ct_get(skb, &ctinfo);		\
	if (ct == NULL)							\
		goto fallback;						\
	ct->tuplehash[CTINFO2DIR(ctinfo)].tuple.member;			\
})
#else
#define CTTUPLE(skb, member)						\
({									\
	goto fallback;							\
	0;								\
})
#endif
146
147static u32 flow_get_nfct_src(const struct sk_buff *skb, const struct flow_keys *flow)
148{
149	switch (skb->protocol) {
150	case htons(ETH_P_IP):
151		return ntohl(CTTUPLE(skb, src.u3.ip));
152	case htons(ETH_P_IPV6):
153		return ntohl(CTTUPLE(skb, src.u3.ip6[3]));
154	}
155fallback:
156	return flow_get_src(skb, flow);
157}
158
159static u32 flow_get_nfct_dst(const struct sk_buff *skb, const struct flow_keys *flow)
160{
161	switch (skb->protocol) {
162	case htons(ETH_P_IP):
163		return ntohl(CTTUPLE(skb, dst.u3.ip));
164	case htons(ETH_P_IPV6):
165		return ntohl(CTTUPLE(skb, dst.u3.ip6[3]));
166	}
167fallback:
168	return flow_get_dst(skb, flow);
169}
170
/*
 * Conntrack source port key: src.u.all of the conntrack tuple in host
 * byte order. CTTUPLE jumps to "fallback" when no conntrack state is
 * available, in which case flow_get_proto_src() supplies the value.
 */
static u32 flow_get_nfct_proto_src(const struct sk_buff *skb, const struct flow_keys *flow)
{
	return ntohs(CTTUPLE(skb, src.u.all));
fallback:
	return flow_get_proto_src(skb, flow);
}
177
/*
 * Conntrack destination port key: dst.u.all of the conntrack tuple in
 * host byte order. CTTUPLE jumps to "fallback" when no conntrack state
 * is available, in which case flow_get_proto_dst() supplies the value.
 */
static u32 flow_get_nfct_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow)
{
	return ntohs(CTTUPLE(skb, dst.u.all));
fallback:
	return flow_get_proto_dst(skb, flow);
}
184
185static u32 flow_get_rtclassid(const struct sk_buff *skb)
186{
187#ifdef CONFIG_IP_ROUTE_CLASSID
188	if (skb_dst(skb))
189		return skb_dst(skb)->tclassid;
190#endif
191	return 0;
192}
193
194static u32 flow_get_skuid(const struct sk_buff *skb)
195{
196	if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file) {
197		kuid_t skuid = skb->sk->sk_socket->file->f_cred->fsuid;
198		return from_kuid(&init_user_ns, skuid);
199	}
200	return 0;
201}
202
203static u32 flow_get_skgid(const struct sk_buff *skb)
204{
205	if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file) {
206		kgid_t skgid = skb->sk->sk_socket->file->f_cred->fsgid;
207		return from_kgid(&init_user_ns, skgid);
208	}
209	return 0;
210}
211
212static u32 flow_get_vlan_tag(const struct sk_buff *skb)
213{
214	u16 uninitialized_var(tag);
215
216	if (vlan_get_tag(skb, &tag) < 0)
217		return 0;
218	return tag & VLAN_VID_MASK;
219}
220
/*
 * Receive hash key: whatever skb_get_rxhash() returns for the packet.
 * Takes a non-const skb because skb_get_rxhash() is passed it as is.
 */
static u32 flow_get_rxhash(struct sk_buff *skb)
{
	return skb_get_rxhash(skb);
}
225
226static u32 flow_key_get(struct sk_buff *skb, int key, struct flow_keys *flow)
227{
228	switch (key) {
229	case FLOW_KEY_SRC:
230		return flow_get_src(skb, flow);
231	case FLOW_KEY_DST:
232		return flow_get_dst(skb, flow);
233	case FLOW_KEY_PROTO:
234		return flow_get_proto(skb, flow);
235	case FLOW_KEY_PROTO_SRC:
236		return flow_get_proto_src(skb, flow);
237	case FLOW_KEY_PROTO_DST:
238		return flow_get_proto_dst(skb, flow);
239	case FLOW_KEY_IIF:
240		return flow_get_iif(skb);
241	case FLOW_KEY_PRIORITY:
242		return flow_get_priority(skb);
243	case FLOW_KEY_MARK:
244		return flow_get_mark(skb);
245	case FLOW_KEY_NFCT:
246		return flow_get_nfct(skb);
247	case FLOW_KEY_NFCT_SRC:
248		return flow_get_nfct_src(skb, flow);
249	case FLOW_KEY_NFCT_DST:
250		return flow_get_nfct_dst(skb, flow);
251	case FLOW_KEY_NFCT_PROTO_SRC:
252		return flow_get_nfct_proto_src(skb, flow);
253	case FLOW_KEY_NFCT_PROTO_DST:
254		return flow_get_nfct_proto_dst(skb, flow);
255	case FLOW_KEY_RTCLASSID:
256		return flow_get_rtclassid(skb);
257	case FLOW_KEY_SKUID:
258		return flow_get_skuid(skb);
259	case FLOW_KEY_SKGID:
260		return flow_get_skgid(skb);
261	case FLOW_KEY_VLAN_TAG:
262		return flow_get_vlan_tag(skb);
263	case FLOW_KEY_RXHASH:
264		return flow_get_rxhash(skb);
265	default:
266		WARN_ON(1);
267		return 0;
268	}
269}
270
/*
 * Keys whose extractors read the dissected struct flow_keys;
 * flow_classify() only calls skb_flow_dissect() when the filter's
 * keymask contains at least one of them.
 */
#define FLOW_KEYS_NEEDED ((1 << FLOW_KEY_SRC) | 		\
			  (1 << FLOW_KEY_DST) |			\
			  (1 << FLOW_KEY_PROTO) |		\
			  (1 << FLOW_KEY_PROTO_SRC) |		\
			  (1 << FLOW_KEY_PROTO_DST) | 		\
			  (1 << FLOW_KEY_NFCT_SRC) |		\
			  (1 << FLOW_KEY_NFCT_DST) |		\
			  (1 << FLOW_KEY_NFCT_PROTO_SRC) |	\
			  (1 << FLOW_KEY_NFCT_PROTO_DST))
280
281static int flow_classify(struct sk_buff *skb, const struct tcf_proto *tp,
282			 struct tcf_result *res)
283{
284	struct flow_head *head = tp->root;
285	struct flow_filter *f;
286	u32 keymask;
287	u32 classid;
288	unsigned int n, key;
289	int r;
290
291	list_for_each_entry(f, &head->filters, list) {
292		u32 keys[FLOW_KEY_MAX + 1];
293		struct flow_keys flow_keys;
294
295		if (!tcf_em_tree_match(skb, &f->ematches, NULL))
296			continue;
297
298		keymask = f->keymask;
299		if (keymask & FLOW_KEYS_NEEDED)
300			skb_flow_dissect(skb, &flow_keys);
301
302		for (n = 0; n < f->nkeys; n++) {
303			key = ffs(keymask) - 1;
304			keymask &= ~(1 << key);
305			keys[n] = flow_key_get(skb, key, &flow_keys);
306		}
307
308		if (f->mode == FLOW_MODE_HASH)
309			classid = jhash2(keys, f->nkeys, f->hashrnd);
310		else {
311			classid = keys[0];
312			classid = (classid & f->mask) ^ f->xor;
313			classid = (classid >> f->rshift) + f->addend;
314		}
315
316		if (f->divisor)
317			classid %= f->divisor;
318
319		res->class   = 0;
320		res->classid = TC_H_MAKE(f->baseclass, f->baseclass + classid);
321
322		r = tcf_exts_exec(skb, &f->exts, res);
323		if (r < 0)
324			continue;
325		return r;
326	}
327	return -1;
328}
329
330static void flow_perturbation(unsigned long arg)
331{
332	struct flow_filter *f = (struct flow_filter *)arg;
333
334	get_random_bytes(&f->hashrnd, 4);
335	if (f->perturb_period)
336		mod_timer(&f->perturb_timer, jiffies + f->perturb_period);
337}
338
/* Netlink validation policy for the attributes nested inside TCA_OPTIONS. */
static const struct nla_policy flow_policy[TCA_FLOW_MAX + 1] = {
	[TCA_FLOW_KEYS]		= { .type = NLA_U32 },
	[TCA_FLOW_MODE]		= { .type = NLA_U32 },
	[TCA_FLOW_BASECLASS]	= { .type = NLA_U32 },
	[TCA_FLOW_RSHIFT]	= { .type = NLA_U32 },
	[TCA_FLOW_ADDEND]	= { .type = NLA_U32 },
	[TCA_FLOW_MASK]		= { .type = NLA_U32 },
	[TCA_FLOW_XOR]		= { .type = NLA_U32 },
	[TCA_FLOW_DIVISOR]	= { .type = NLA_U32 },
	[TCA_FLOW_ACT]		= { .type = NLA_NESTED },
	[TCA_FLOW_POLICE]	= { .type = NLA_NESTED },
	[TCA_FLOW_EMATCHES]	= { .type = NLA_NESTED },
	[TCA_FLOW_PERTURB]	= { .type = NLA_U32 },
};
353
354static int flow_change(struct net *net, struct sk_buff *in_skb,
355		       struct tcf_proto *tp, unsigned long base,
356		       u32 handle, struct nlattr **tca,
357		       unsigned long *arg)
358{
359	struct flow_head *head = tp->root;
360	struct flow_filter *f;
361	struct nlattr *opt = tca[TCA_OPTIONS];
362	struct nlattr *tb[TCA_FLOW_MAX + 1];
363	struct tcf_exts e;
364	struct tcf_ematch_tree t;
365	unsigned int nkeys = 0;
366	unsigned int perturb_period = 0;
367	u32 baseclass = 0;
368	u32 keymask = 0;
369	u32 mode;
370	int err;
371
372	if (opt == NULL)
373		return -EINVAL;
374
375	err = nla_parse_nested(tb, TCA_FLOW_MAX, opt, flow_policy);
376	if (err < 0)
377		return err;
378
379	if (tb[TCA_FLOW_BASECLASS]) {
380		baseclass = nla_get_u32(tb[TCA_FLOW_BASECLASS]);
381		if (TC_H_MIN(baseclass) == 0)
382			return -EINVAL;
383	}
384
385	if (tb[TCA_FLOW_KEYS]) {
386		keymask = nla_get_u32(tb[TCA_FLOW_KEYS]);
387
388		nkeys = hweight32(keymask);
389		if (nkeys == 0)
390			return -EINVAL;
391
392		if (fls(keymask) - 1 > FLOW_KEY_MAX)
393			return -EOPNOTSUPP;
394
395		if ((keymask & (FLOW_KEY_SKUID|FLOW_KEY_SKGID)) &&
396		    sk_user_ns(NETLINK_CB(in_skb).sk) != &init_user_ns)
397			return -EOPNOTSUPP;
398	}
399
400	err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, &flow_ext_map);
401	if (err < 0)
402		return err;
403
404	err = tcf_em_tree_validate(tp, tb[TCA_FLOW_EMATCHES], &t);
405	if (err < 0)
406		goto err1;
407
408	f = (struct flow_filter *)*arg;
409	if (f != NULL) {
410		err = -EINVAL;
411		if (f->handle != handle && handle)
412			goto err2;
413
414		mode = f->mode;
415		if (tb[TCA_FLOW_MODE])
416			mode = nla_get_u32(tb[TCA_FLOW_MODE]);
417		if (mode != FLOW_MODE_HASH && nkeys > 1)
418			goto err2;
419
420		if (mode == FLOW_MODE_HASH)
421			perturb_period = f->perturb_period;
422		if (tb[TCA_FLOW_PERTURB]) {
423			if (mode != FLOW_MODE_HASH)
424				goto err2;
425			perturb_period = nla_get_u32(tb[TCA_FLOW_PERTURB]) * HZ;
426		}
427	} else {
428		err = -EINVAL;
429		if (!handle)
430			goto err2;
431		if (!tb[TCA_FLOW_KEYS])
432			goto err2;
433
434		mode = FLOW_MODE_MAP;
435		if (tb[TCA_FLOW_MODE])
436			mode = nla_get_u32(tb[TCA_FLOW_MODE]);
437		if (mode != FLOW_MODE_HASH && nkeys > 1)
438			goto err2;
439
440		if (tb[TCA_FLOW_PERTURB]) {
441			if (mode != FLOW_MODE_HASH)
442				goto err2;
443			perturb_period = nla_get_u32(tb[TCA_FLOW_PERTURB]) * HZ;
444		}
445
446		if (TC_H_MAJ(baseclass) == 0)
447			baseclass = TC_H_MAKE(tp->q->handle, baseclass);
448		if (TC_H_MIN(baseclass) == 0)
449			baseclass = TC_H_MAKE(baseclass, 1);
450
451		err = -ENOBUFS;
452		f = kzalloc(sizeof(*f), GFP_KERNEL);
453		if (f == NULL)
454			goto err2;
455
456		f->handle = handle;
457		f->mask	  = ~0U;
458
459		get_random_bytes(&f->hashrnd, 4);
460		f->perturb_timer.function = flow_perturbation;
461		f->perturb_timer.data = (unsigned long)f;
462		init_timer_deferrable(&f->perturb_timer);
463	}
464
465	tcf_exts_change(tp, &f->exts, &e);
466	tcf_em_tree_change(tp, &f->ematches, &t);
467
468	tcf_tree_lock(tp);
469
470	if (tb[TCA_FLOW_KEYS]) {
471		f->keymask = keymask;
472		f->nkeys   = nkeys;
473	}
474
475	f->mode = mode;
476
477	if (tb[TCA_FLOW_MASK])
478		f->mask = nla_get_u32(tb[TCA_FLOW_MASK]);
479	if (tb[TCA_FLOW_XOR])
480		f->xor = nla_get_u32(tb[TCA_FLOW_XOR]);
481	if (tb[TCA_FLOW_RSHIFT])
482		f->rshift = nla_get_u32(tb[TCA_FLOW_RSHIFT]);
483	if (tb[TCA_FLOW_ADDEND])
484		f->addend = nla_get_u32(tb[TCA_FLOW_ADDEND]);
485
486	if (tb[TCA_FLOW_DIVISOR])
487		f->divisor = nla_get_u32(tb[TCA_FLOW_DIVISOR]);
488	if (baseclass)
489		f->baseclass = baseclass;
490
491	f->perturb_period = perturb_period;
492	del_timer(&f->perturb_timer);
493	if (perturb_period)
494		mod_timer(&f->perturb_timer, jiffies + perturb_period);
495
496	if (*arg == 0)
497		list_add_tail(&f->list, &head->filters);
498
499	tcf_tree_unlock(tp);
500
501	*arg = (unsigned long)f;
502	return 0;
503
504err2:
505	tcf_em_tree_destroy(tp, &t);
506err1:
507	tcf_exts_destroy(tp, &e);
508	return err;
509}
510
/*
 * Release a filter that has already been unlinked from the filter list:
 * stop its perturbation timer, destroy its extensions and ematch tree,
 * then free the filter itself.
 */
static void flow_destroy_filter(struct tcf_proto *tp, struct flow_filter *f)
{
	/* The timer callback dereferences f, so stop it before freeing. */
	del_timer_sync(&f->perturb_timer);
	tcf_exts_destroy(tp, &f->exts);
	tcf_em_tree_destroy(tp, &f->ematches);
	kfree(f);
}
518
519static int flow_delete(struct tcf_proto *tp, unsigned long arg)
520{
521	struct flow_filter *f = (struct flow_filter *)arg;
522
523	tcf_tree_lock(tp);
524	list_del(&f->list);
525	tcf_tree_unlock(tp);
526	flow_destroy_filter(tp, f);
527	return 0;
528}
529
530static int flow_init(struct tcf_proto *tp)
531{
532	struct flow_head *head;
533
534	head = kzalloc(sizeof(*head), GFP_KERNEL);
535	if (head == NULL)
536		return -ENOBUFS;
537	INIT_LIST_HEAD(&head->filters);
538	tp->root = head;
539	return 0;
540}
541
542static void flow_destroy(struct tcf_proto *tp)
543{
544	struct flow_head *head = tp->root;
545	struct flow_filter *f, *next;
546
547	list_for_each_entry_safe(f, next, &head->filters, list) {
548		list_del(&f->list);
549		flow_destroy_filter(tp, f);
550	}
551	kfree(head);
552}
553
554static unsigned long flow_get(struct tcf_proto *tp, u32 handle)
555{
556	struct flow_head *head = tp->root;
557	struct flow_filter *f;
558
559	list_for_each_entry(f, &head->filters, list)
560		if (f->handle == handle)
561			return (unsigned long)f;
562	return 0;
563}
564
/* Counterpart of flow_get(); flow_get() takes no reference, so nothing to release. */
static void flow_put(struct tcf_proto *tp, unsigned long f)
{
}
568
569static int flow_dump(struct tcf_proto *tp, unsigned long fh,
570		     struct sk_buff *skb, struct tcmsg *t)
571{
572	struct flow_filter *f = (struct flow_filter *)fh;
573	struct nlattr *nest;
574
575	if (f == NULL)
576		return skb->len;
577
578	t->tcm_handle = f->handle;
579
580	nest = nla_nest_start(skb, TCA_OPTIONS);
581	if (nest == NULL)
582		goto nla_put_failure;
583
584	if (nla_put_u32(skb, TCA_FLOW_KEYS, f->keymask) ||
585	    nla_put_u32(skb, TCA_FLOW_MODE, f->mode))
586		goto nla_put_failure;
587
588	if (f->mask != ~0 || f->xor != 0) {
589		if (nla_put_u32(skb, TCA_FLOW_MASK, f->mask) ||
590		    nla_put_u32(skb, TCA_FLOW_XOR, f->xor))
591			goto nla_put_failure;
592	}
593	if (f->rshift &&
594	    nla_put_u32(skb, TCA_FLOW_RSHIFT, f->rshift))
595		goto nla_put_failure;
596	if (f->addend &&
597	    nla_put_u32(skb, TCA_FLOW_ADDEND, f->addend))
598		goto nla_put_failure;
599
600	if (f->divisor &&
601	    nla_put_u32(skb, TCA_FLOW_DIVISOR, f->divisor))
602		goto nla_put_failure;
603	if (f->baseclass &&
604	    nla_put_u32(skb, TCA_FLOW_BASECLASS, f->baseclass))
605		goto nla_put_failure;
606
607	if (f->perturb_period &&
608	    nla_put_u32(skb, TCA_FLOW_PERTURB, f->perturb_period / HZ))
609		goto nla_put_failure;
610
611	if (tcf_exts_dump(skb, &f->exts, &flow_ext_map) < 0)
612		goto nla_put_failure;
613#ifdef CONFIG_NET_EMATCH
614	if (f->ematches.hdr.nmatches &&
615	    tcf_em_tree_dump(skb, &f->ematches, TCA_FLOW_EMATCHES) < 0)
616		goto nla_put_failure;
617#endif
618	nla_nest_end(skb, nest);
619
620	if (tcf_exts_dump_stats(skb, &f->exts, &flow_ext_map) < 0)
621		goto nla_put_failure;
622
623	return skb->len;
624
625nla_put_failure:
626	nlmsg_trim(skb, nest);
627	return -1;
628}
629
630static void flow_walk(struct tcf_proto *tp, struct tcf_walker *arg)
631{
632	struct flow_head *head = tp->root;
633	struct flow_filter *f;
634
635	list_for_each_entry(f, &head->filters, list) {
636		if (arg->count < arg->skip)
637			goto skip;
638		if (arg->fn(tp, (unsigned long)f, arg) < 0) {
639			arg->stop = 1;
640			break;
641		}
642skip:
643		arg->count++;
644	}
645}
646
/* Operations table registered for the "flow" classifier kind. */
static struct tcf_proto_ops cls_flow_ops __read_mostly = {
	.kind		= "flow",
	.classify	= flow_classify,
	.init		= flow_init,
	.destroy	= flow_destroy,
	.change		= flow_change,
	.delete		= flow_delete,
	.get		= flow_get,
	.put		= flow_put,
	.dump		= flow_dump,
	.walk		= flow_walk,
	.owner		= THIS_MODULE,
};
660
/* Module init: register the "flow" classifier ops; returns the registration result. */
static int __init cls_flow_init(void)
{
	return register_tcf_proto_ops(&cls_flow_ops);
}
665
/* Module exit: unregister the "flow" classifier ops. */
static void __exit cls_flow_exit(void)
{
	unregister_tcf_proto_ops(&cls_flow_ops);
}
670
/* Module entry/exit hooks and metadata. */
module_init(cls_flow_init);
module_exit(cls_flow_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
MODULE_DESCRIPTION("TC flow classifier");
677