datapath.c revision 8aa51d64c1f526e43b1e7f89fb8b98c2fd583f4b
1/*
2 * Copyright (c) 2007-2012 Nicira Networks.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of version 2 of the GNU General Public
6 * License as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301, USA
17 */
18
19#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20
21#include <linux/init.h>
22#include <linux/module.h>
23#include <linux/if_arp.h>
24#include <linux/if_vlan.h>
25#include <linux/in.h>
26#include <linux/ip.h>
27#include <linux/jhash.h>
28#include <linux/delay.h>
29#include <linux/time.h>
30#include <linux/etherdevice.h>
31#include <linux/genetlink.h>
32#include <linux/kernel.h>
33#include <linux/kthread.h>
34#include <linux/mutex.h>
35#include <linux/percpu.h>
36#include <linux/rcupdate.h>
37#include <linux/tcp.h>
38#include <linux/udp.h>
39#include <linux/ethtool.h>
40#include <linux/wait.h>
41#include <asm/div64.h>
42#include <linux/highmem.h>
43#include <linux/netfilter_bridge.h>
44#include <linux/netfilter_ipv4.h>
45#include <linux/inetdevice.h>
46#include <linux/list.h>
47#include <linux/openvswitch.h>
48#include <linux/rculist.h>
49#include <linux/dmi.h>
50#include <linux/workqueue.h>
51#include <net/genetlink.h>
52
53#include "datapath.h"
54#include "flow.h"
55#include "vport-internal_dev.h"
56
/**
 * DOC: Locking:
 *
 * Writes to device state (add/remove datapath, port, set operations on vports,
 * etc.) are protected by RTNL.
 *
 * Writes to other state (flow table modifications, set miscellaneous datapath
 * parameters, etc.) are protected by genl_mutex.  The RTNL lock nests inside
 * genl_mutex.
 *
 * Reads are protected by RCU.
 *
 * There are a few special cases (mostly stats) that have their own
 * synchronization, but they nest under all of the above and don't interact
 * with each other.
 */
73
/* Every datapath is linked onto this list so that all of them can be dumped.
 * Protected by genl_mutex.
 */
static LIST_HEAD(dps);

/* Ten minutes expressed in jiffies.  Presumably the period at which
 * rehash_flow_wq is rescheduled -- the user is outside this excerpt.
 */
#define REHASH_FLOW_INTERVAL (10 * 60 * HZ)
static void rehash_flow_table(struct work_struct *work);
static DECLARE_DELAYED_WORK(rehash_flow_wq, rehash_flow_table);

/* Forward declarations for helpers that are defined later in this file. */
static struct vport *new_vport(const struct vport_parms *parms);
static int queue_gso_packets(int dp_ifindex, struct sk_buff *skb,
			     const struct dp_upcall_info *upcall_info);
static int queue_userspace_packet(int dp_ifindex, struct sk_buff *skb,
				  const struct dp_upcall_info *upcall_info);
88
89/* Must be called with rcu_read_lock, genl_mutex, or RTNL lock. */
90static struct datapath *get_dp(int dp_ifindex)
91{
92	struct datapath *dp = NULL;
93	struct net_device *dev;
94
95	rcu_read_lock();
96	dev = dev_get_by_index_rcu(&init_net, dp_ifindex);
97	if (dev) {
98		struct vport *vport = ovs_internal_dev_get_vport(dev);
99		if (vport)
100			dp = vport->dp;
101	}
102	rcu_read_unlock();
103
104	return dp;
105}
106
107/* Must be called with rcu_read_lock or RTNL lock. */
108const char *ovs_dp_name(const struct datapath *dp)
109{
110	struct vport *vport = rcu_dereference_rtnl(dp->ports[OVSP_LOCAL]);
111	return vport->ops->get_name(vport);
112}
113
114static int get_dpifindex(struct datapath *dp)
115{
116	struct vport *local;
117	int ifindex;
118
119	rcu_read_lock();
120
121	local = rcu_dereference(dp->ports[OVSP_LOCAL]);
122	if (local)
123		ifindex = local->ops->get_ifindex(local);
124	else
125		ifindex = 0;
126
127	rcu_read_unlock();
128
129	return ifindex;
130}
131
132static void destroy_dp_rcu(struct rcu_head *rcu)
133{
134	struct datapath *dp = container_of(rcu, struct datapath, rcu);
135
136	ovs_flow_tbl_destroy((__force struct flow_table *)dp->table);
137	free_percpu(dp->stats_percpu);
138	kfree(dp);
139}
140
141/* Called with RTNL lock and genl_lock. */
142static struct vport *new_vport(const struct vport_parms *parms)
143{
144	struct vport *vport;
145
146	vport = ovs_vport_add(parms);
147	if (!IS_ERR(vport)) {
148		struct datapath *dp = parms->dp;
149
150		rcu_assign_pointer(dp->ports[parms->port_no], vport);
151		list_add(&vport->node, &dp->port_list);
152	}
153
154	return vport;
155}
156
157/* Called with RTNL lock. */
158void ovs_dp_detach_port(struct vport *p)
159{
160	ASSERT_RTNL();
161
162	/* First drop references to device. */
163	list_del(&p->node);
164	rcu_assign_pointer(p->dp->ports[p->port_no], NULL);
165
166	/* Then destroy it. */
167	ovs_vport_del(p);
168}
169
170/* Must be called with rcu_read_lock. */
171void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb)
172{
173	struct datapath *dp = p->dp;
174	struct sw_flow *flow;
175	struct dp_stats_percpu *stats;
176	struct sw_flow_key key;
177	u64 *stats_counter;
178	int error;
179	int key_len;
180
181	stats = per_cpu_ptr(dp->stats_percpu, smp_processor_id());
182
183	/* Extract flow from 'skb' into 'key'. */
184	error = ovs_flow_extract(skb, p->port_no, &key, &key_len);
185	if (unlikely(error)) {
186		kfree_skb(skb);
187		return;
188	}
189
190	/* Look up flow. */
191	flow = ovs_flow_tbl_lookup(rcu_dereference(dp->table), &key, key_len);
192	if (unlikely(!flow)) {
193		struct dp_upcall_info upcall;
194
195		upcall.cmd = OVS_PACKET_CMD_MISS;
196		upcall.key = &key;
197		upcall.userdata = NULL;
198		upcall.pid = p->upcall_pid;
199		ovs_dp_upcall(dp, skb, &upcall);
200		consume_skb(skb);
201		stats_counter = &stats->n_missed;
202		goto out;
203	}
204
205	OVS_CB(skb)->flow = flow;
206
207	stats_counter = &stats->n_hit;
208	ovs_flow_used(OVS_CB(skb)->flow, skb);
209	ovs_execute_actions(dp, skb);
210
211out:
212	/* Update datapath statistics. */
213	u64_stats_update_begin(&stats->sync);
214	(*stats_counter)++;
215	u64_stats_update_end(&stats->sync);
216}
217
218static struct genl_family dp_packet_genl_family = {
219	.id = GENL_ID_GENERATE,
220	.hdrsize = sizeof(struct ovs_header),
221	.name = OVS_PACKET_FAMILY,
222	.version = OVS_PACKET_VERSION,
223	.maxattr = OVS_PACKET_ATTR_MAX
224};
225
226int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb,
227	      const struct dp_upcall_info *upcall_info)
228{
229	struct dp_stats_percpu *stats;
230	int dp_ifindex;
231	int err;
232
233	if (upcall_info->pid == 0) {
234		err = -ENOTCONN;
235		goto err;
236	}
237
238	dp_ifindex = get_dpifindex(dp);
239	if (!dp_ifindex) {
240		err = -ENODEV;
241		goto err;
242	}
243
244	if (!skb_is_gso(skb))
245		err = queue_userspace_packet(dp_ifindex, skb, upcall_info);
246	else
247		err = queue_gso_packets(dp_ifindex, skb, upcall_info);
248	if (err)
249		goto err;
250
251	return 0;
252
253err:
254	stats = per_cpu_ptr(dp->stats_percpu, smp_processor_id());
255
256	u64_stats_update_begin(&stats->sync);
257	stats->n_lost++;
258	u64_stats_update_end(&stats->sync);
259
260	return err;
261}
262
263static int queue_gso_packets(int dp_ifindex, struct sk_buff *skb,
264			     const struct dp_upcall_info *upcall_info)
265{
266	struct dp_upcall_info later_info;
267	struct sw_flow_key later_key;
268	struct sk_buff *segs, *nskb;
269	int err;
270
271	segs = skb_gso_segment(skb, NETIF_F_SG | NETIF_F_HW_CSUM);
272	if (IS_ERR(skb))
273		return PTR_ERR(skb);
274
275	/* Queue all of the segments. */
276	skb = segs;
277	do {
278		err = queue_userspace_packet(dp_ifindex, skb, upcall_info);
279		if (err)
280			break;
281
282		if (skb == segs && skb_shinfo(skb)->gso_type & SKB_GSO_UDP) {
283			/* The initial flow key extracted by ovs_flow_extract()
284			 * in this case is for a first fragment, so we need to
285			 * properly mark later fragments.
286			 */
287			later_key = *upcall_info->key;
288			later_key.ip.frag = OVS_FRAG_TYPE_LATER;
289
290			later_info = *upcall_info;
291			later_info.key = &later_key;
292			upcall_info = &later_info;
293		}
294	} while ((skb = skb->next));
295
296	/* Free all of the segments. */
297	skb = segs;
298	do {
299		nskb = skb->next;
300		if (err)
301			kfree_skb(skb);
302		else
303			consume_skb(skb);
304	} while ((skb = nskb));
305	return err;
306}
307
308static int queue_userspace_packet(int dp_ifindex, struct sk_buff *skb,
309				  const struct dp_upcall_info *upcall_info)
310{
311	struct ovs_header *upcall;
312	struct sk_buff *nskb = NULL;
313	struct sk_buff *user_skb; /* to be queued to userspace */
314	struct nlattr *nla;
315	unsigned int len;
316	int err;
317
318	if (vlan_tx_tag_present(skb)) {
319		nskb = skb_clone(skb, GFP_ATOMIC);
320		if (!nskb)
321			return -ENOMEM;
322
323		nskb = __vlan_put_tag(nskb, vlan_tx_tag_get(nskb));
324		if (!nskb)
325			return -ENOMEM;
326
327		nskb->vlan_tci = 0;
328		skb = nskb;
329	}
330
331	if (nla_attr_size(skb->len) > USHRT_MAX) {
332		err = -EFBIG;
333		goto out;
334	}
335
336	len = sizeof(struct ovs_header);
337	len += nla_total_size(skb->len);
338	len += nla_total_size(FLOW_BUFSIZE);
339	if (upcall_info->cmd == OVS_PACKET_CMD_ACTION)
340		len += nla_total_size(8);
341
342	user_skb = genlmsg_new(len, GFP_ATOMIC);
343	if (!user_skb) {
344		err = -ENOMEM;
345		goto out;
346	}
347
348	upcall = genlmsg_put(user_skb, 0, 0, &dp_packet_genl_family,
349			     0, upcall_info->cmd);
350	upcall->dp_ifindex = dp_ifindex;
351
352	nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_KEY);
353	ovs_flow_to_nlattrs(upcall_info->key, user_skb);
354	nla_nest_end(user_skb, nla);
355
356	if (upcall_info->userdata)
357		nla_put_u64(user_skb, OVS_PACKET_ATTR_USERDATA,
358			    nla_get_u64(upcall_info->userdata));
359
360	nla = __nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, skb->len);
361
362	skb_copy_and_csum_dev(skb, nla_data(nla));
363
364	err = genlmsg_unicast(&init_net, user_skb, upcall_info->pid);
365
366out:
367	kfree_skb(nskb);
368	return err;
369}
370
371/* Called with genl_mutex. */
372static int flush_flows(int dp_ifindex)
373{
374	struct flow_table *old_table;
375	struct flow_table *new_table;
376	struct datapath *dp;
377
378	dp = get_dp(dp_ifindex);
379	if (!dp)
380		return -ENODEV;
381
382	old_table = genl_dereference(dp->table);
383	new_table = ovs_flow_tbl_alloc(TBL_MIN_BUCKETS);
384	if (!new_table)
385		return -ENOMEM;
386
387	rcu_assign_pointer(dp->table, new_table);
388
389	ovs_flow_tbl_deferred_destroy(old_table);
390	return 0;
391}
392
393static int validate_actions(const struct nlattr *attr,
394				const struct sw_flow_key *key, int depth);
395
396static int validate_sample(const struct nlattr *attr,
397				const struct sw_flow_key *key, int depth)
398{
399	const struct nlattr *attrs[OVS_SAMPLE_ATTR_MAX + 1];
400	const struct nlattr *probability, *actions;
401	const struct nlattr *a;
402	int rem;
403
404	memset(attrs, 0, sizeof(attrs));
405	nla_for_each_nested(a, attr, rem) {
406		int type = nla_type(a);
407		if (!type || type > OVS_SAMPLE_ATTR_MAX || attrs[type])
408			return -EINVAL;
409		attrs[type] = a;
410	}
411	if (rem)
412		return -EINVAL;
413
414	probability = attrs[OVS_SAMPLE_ATTR_PROBABILITY];
415	if (!probability || nla_len(probability) != sizeof(u32))
416		return -EINVAL;
417
418	actions = attrs[OVS_SAMPLE_ATTR_ACTIONS];
419	if (!actions || (nla_len(actions) && nla_len(actions) < NLA_HDRLEN))
420		return -EINVAL;
421	return validate_actions(actions, key, depth + 1);
422}
423
424static int validate_tp_port(const struct sw_flow_key *flow_key)
425{
426	if (flow_key->eth.type == htons(ETH_P_IP)) {
427		if (flow_key->ipv4.tp.src && flow_key->ipv4.tp.dst)
428			return 0;
429	} else if (flow_key->eth.type == htons(ETH_P_IPV6)) {
430		if (flow_key->ipv6.tp.src && flow_key->ipv6.tp.dst)
431			return 0;
432	}
433
434	return -EINVAL;
435}
436
437static int validate_set(const struct nlattr *a,
438			const struct sw_flow_key *flow_key)
439{
440	const struct nlattr *ovs_key = nla_data(a);
441	int key_type = nla_type(ovs_key);
442
443	/* There can be only one key in a action */
444	if (nla_total_size(nla_len(ovs_key)) != nla_len(a))
445		return -EINVAL;
446
447	if (key_type > OVS_KEY_ATTR_MAX ||
448	    nla_len(ovs_key) != ovs_key_lens[key_type])
449		return -EINVAL;
450
451	switch (key_type) {
452	const struct ovs_key_ipv4 *ipv4_key;
453
454	case OVS_KEY_ATTR_PRIORITY:
455	case OVS_KEY_ATTR_ETHERNET:
456		break;
457
458	case OVS_KEY_ATTR_IPV4:
459		if (flow_key->eth.type != htons(ETH_P_IP))
460			return -EINVAL;
461
462		if (!flow_key->ipv4.addr.src || !flow_key->ipv4.addr.dst)
463			return -EINVAL;
464
465		ipv4_key = nla_data(ovs_key);
466		if (ipv4_key->ipv4_proto != flow_key->ip.proto)
467			return -EINVAL;
468
469		if (ipv4_key->ipv4_frag != flow_key->ip.frag)
470			return -EINVAL;
471
472		break;
473
474	case OVS_KEY_ATTR_TCP:
475		if (flow_key->ip.proto != IPPROTO_TCP)
476			return -EINVAL;
477
478		return validate_tp_port(flow_key);
479
480	case OVS_KEY_ATTR_UDP:
481		if (flow_key->ip.proto != IPPROTO_UDP)
482			return -EINVAL;
483
484		return validate_tp_port(flow_key);
485
486	default:
487		return -EINVAL;
488	}
489
490	return 0;
491}
492
493static int validate_userspace(const struct nlattr *attr)
494{
495	static const struct nla_policy userspace_policy[OVS_USERSPACE_ATTR_MAX + 1] =	{
496		[OVS_USERSPACE_ATTR_PID] = {.type = NLA_U32 },
497		[OVS_USERSPACE_ATTR_USERDATA] = {.type = NLA_U64 },
498	};
499	struct nlattr *a[OVS_USERSPACE_ATTR_MAX + 1];
500	int error;
501
502	error = nla_parse_nested(a, OVS_USERSPACE_ATTR_MAX,
503				 attr, userspace_policy);
504	if (error)
505		return error;
506
507	if (!a[OVS_USERSPACE_ATTR_PID] ||
508	    !nla_get_u32(a[OVS_USERSPACE_ATTR_PID]))
509		return -EINVAL;
510
511	return 0;
512}
513
514static int validate_actions(const struct nlattr *attr,
515				const struct sw_flow_key *key,  int depth)
516{
517	const struct nlattr *a;
518	int rem, err;
519
520	if (depth >= SAMPLE_ACTION_DEPTH)
521		return -EOVERFLOW;
522
523	nla_for_each_nested(a, attr, rem) {
524		/* Expected argument lengths, (u32)-1 for variable length. */
525		static const u32 action_lens[OVS_ACTION_ATTR_MAX + 1] = {
526			[OVS_ACTION_ATTR_OUTPUT] = sizeof(u32),
527			[OVS_ACTION_ATTR_USERSPACE] = (u32)-1,
528			[OVS_ACTION_ATTR_PUSH_VLAN] = sizeof(struct ovs_action_push_vlan),
529			[OVS_ACTION_ATTR_POP_VLAN] = 0,
530			[OVS_ACTION_ATTR_SET] = (u32)-1,
531			[OVS_ACTION_ATTR_SAMPLE] = (u32)-1
532		};
533		const struct ovs_action_push_vlan *vlan;
534		int type = nla_type(a);
535
536		if (type > OVS_ACTION_ATTR_MAX ||
537		    (action_lens[type] != nla_len(a) &&
538		     action_lens[type] != (u32)-1))
539			return -EINVAL;
540
541		switch (type) {
542		case OVS_ACTION_ATTR_UNSPEC:
543			return -EINVAL;
544
545		case OVS_ACTION_ATTR_USERSPACE:
546			err = validate_userspace(a);
547			if (err)
548				return err;
549			break;
550
551		case OVS_ACTION_ATTR_OUTPUT:
552			if (nla_get_u32(a) >= DP_MAX_PORTS)
553				return -EINVAL;
554			break;
555
556
557		case OVS_ACTION_ATTR_POP_VLAN:
558			break;
559
560		case OVS_ACTION_ATTR_PUSH_VLAN:
561			vlan = nla_data(a);
562			if (vlan->vlan_tpid != htons(ETH_P_8021Q))
563				return -EINVAL;
564			if (!(vlan->vlan_tci & htons(VLAN_TAG_PRESENT)))
565				return -EINVAL;
566			break;
567
568		case OVS_ACTION_ATTR_SET:
569			err = validate_set(a, key);
570			if (err)
571				return err;
572			break;
573
574		case OVS_ACTION_ATTR_SAMPLE:
575			err = validate_sample(a, key, depth);
576			if (err)
577				return err;
578			break;
579
580		default:
581			return -EINVAL;
582		}
583	}
584
585	if (rem > 0)
586		return -EINVAL;
587
588	return 0;
589}
590
591static void clear_stats(struct sw_flow *flow)
592{
593	flow->used = 0;
594	flow->tcp_flags = 0;
595	flow->packet_count = 0;
596	flow->byte_count = 0;
597}
598
599static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
600{
601	struct ovs_header *ovs_header = info->userhdr;
602	struct nlattr **a = info->attrs;
603	struct sw_flow_actions *acts;
604	struct sk_buff *packet;
605	struct sw_flow *flow;
606	struct datapath *dp;
607	struct ethhdr *eth;
608	int len;
609	int err;
610	int key_len;
611
612	err = -EINVAL;
613	if (!a[OVS_PACKET_ATTR_PACKET] || !a[OVS_PACKET_ATTR_KEY] ||
614	    !a[OVS_PACKET_ATTR_ACTIONS] ||
615	    nla_len(a[OVS_PACKET_ATTR_PACKET]) < ETH_HLEN)
616		goto err;
617
618	len = nla_len(a[OVS_PACKET_ATTR_PACKET]);
619	packet = __dev_alloc_skb(NET_IP_ALIGN + len, GFP_KERNEL);
620	err = -ENOMEM;
621	if (!packet)
622		goto err;
623	skb_reserve(packet, NET_IP_ALIGN);
624
625	memcpy(__skb_put(packet, len), nla_data(a[OVS_PACKET_ATTR_PACKET]), len);
626
627	skb_reset_mac_header(packet);
628	eth = eth_hdr(packet);
629
630	/* Normally, setting the skb 'protocol' field would be handled by a
631	 * call to eth_type_trans(), but it assumes there's a sending
632	 * device, which we may not have. */
633	if (ntohs(eth->h_proto) >= 1536)
634		packet->protocol = eth->h_proto;
635	else
636		packet->protocol = htons(ETH_P_802_2);
637
638	/* Build an sw_flow for sending this packet. */
639	flow = ovs_flow_alloc();
640	err = PTR_ERR(flow);
641	if (IS_ERR(flow))
642		goto err_kfree_skb;
643
644	err = ovs_flow_extract(packet, -1, &flow->key, &key_len);
645	if (err)
646		goto err_flow_free;
647
648	err = ovs_flow_metadata_from_nlattrs(&flow->key.phy.priority,
649					     &flow->key.phy.in_port,
650					     a[OVS_PACKET_ATTR_KEY]);
651	if (err)
652		goto err_flow_free;
653
654	err = validate_actions(a[OVS_PACKET_ATTR_ACTIONS], &flow->key, 0);
655	if (err)
656		goto err_flow_free;
657
658	flow->hash = ovs_flow_hash(&flow->key, key_len);
659
660	acts = ovs_flow_actions_alloc(a[OVS_PACKET_ATTR_ACTIONS]);
661	err = PTR_ERR(acts);
662	if (IS_ERR(acts))
663		goto err_flow_free;
664	rcu_assign_pointer(flow->sf_acts, acts);
665
666	OVS_CB(packet)->flow = flow;
667	packet->priority = flow->key.phy.priority;
668
669	rcu_read_lock();
670	dp = get_dp(ovs_header->dp_ifindex);
671	err = -ENODEV;
672	if (!dp)
673		goto err_unlock;
674
675	local_bh_disable();
676	err = ovs_execute_actions(dp, packet);
677	local_bh_enable();
678	rcu_read_unlock();
679
680	ovs_flow_free(flow);
681	return err;
682
683err_unlock:
684	rcu_read_unlock();
685err_flow_free:
686	ovs_flow_free(flow);
687err_kfree_skb:
688	kfree_skb(packet);
689err:
690	return err;
691}
692
693static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = {
694	[OVS_PACKET_ATTR_PACKET] = { .type = NLA_UNSPEC },
695	[OVS_PACKET_ATTR_KEY] = { .type = NLA_NESTED },
696	[OVS_PACKET_ATTR_ACTIONS] = { .type = NLA_NESTED },
697};
698
699static struct genl_ops dp_packet_genl_ops[] = {
700	{ .cmd = OVS_PACKET_CMD_EXECUTE,
701	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
702	  .policy = packet_policy,
703	  .doit = ovs_packet_cmd_execute
704	}
705};
706
707static void get_dp_stats(struct datapath *dp, struct ovs_dp_stats *stats)
708{
709	int i;
710	struct flow_table *table = genl_dereference(dp->table);
711
712	stats->n_flows = ovs_flow_tbl_count(table);
713
714	stats->n_hit = stats->n_missed = stats->n_lost = 0;
715	for_each_possible_cpu(i) {
716		const struct dp_stats_percpu *percpu_stats;
717		struct dp_stats_percpu local_stats;
718		unsigned int start;
719
720		percpu_stats = per_cpu_ptr(dp->stats_percpu, i);
721
722		do {
723			start = u64_stats_fetch_begin_bh(&percpu_stats->sync);
724			local_stats = *percpu_stats;
725		} while (u64_stats_fetch_retry_bh(&percpu_stats->sync, start));
726
727		stats->n_hit += local_stats.n_hit;
728		stats->n_missed += local_stats.n_missed;
729		stats->n_lost += local_stats.n_lost;
730	}
731}
732
733static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = {
734	[OVS_FLOW_ATTR_KEY] = { .type = NLA_NESTED },
735	[OVS_FLOW_ATTR_ACTIONS] = { .type = NLA_NESTED },
736	[OVS_FLOW_ATTR_CLEAR] = { .type = NLA_FLAG },
737};
738
739static struct genl_family dp_flow_genl_family = {
740	.id = GENL_ID_GENERATE,
741	.hdrsize = sizeof(struct ovs_header),
742	.name = OVS_FLOW_FAMILY,
743	.version = OVS_FLOW_VERSION,
744	.maxattr = OVS_FLOW_ATTR_MAX
745};
746
747static struct genl_multicast_group ovs_dp_flow_multicast_group = {
748	.name = OVS_FLOW_MCGROUP
749};
750
751/* Called with genl_lock. */
752static int ovs_flow_cmd_fill_info(struct sw_flow *flow, struct datapath *dp,
753				  struct sk_buff *skb, u32 pid,
754				  u32 seq, u32 flags, u8 cmd)
755{
756	const int skb_orig_len = skb->len;
757	const struct sw_flow_actions *sf_acts;
758	struct ovs_flow_stats stats;
759	struct ovs_header *ovs_header;
760	struct nlattr *nla;
761	unsigned long used;
762	u8 tcp_flags;
763	int err;
764
765	sf_acts = rcu_dereference_protected(flow->sf_acts,
766					    lockdep_genl_is_held());
767
768	ovs_header = genlmsg_put(skb, pid, seq, &dp_flow_genl_family, flags, cmd);
769	if (!ovs_header)
770		return -EMSGSIZE;
771
772	ovs_header->dp_ifindex = get_dpifindex(dp);
773
774	nla = nla_nest_start(skb, OVS_FLOW_ATTR_KEY);
775	if (!nla)
776		goto nla_put_failure;
777	err = ovs_flow_to_nlattrs(&flow->key, skb);
778	if (err)
779		goto error;
780	nla_nest_end(skb, nla);
781
782	spin_lock_bh(&flow->lock);
783	used = flow->used;
784	stats.n_packets = flow->packet_count;
785	stats.n_bytes = flow->byte_count;
786	tcp_flags = flow->tcp_flags;
787	spin_unlock_bh(&flow->lock);
788
789	if (used)
790		NLA_PUT_U64(skb, OVS_FLOW_ATTR_USED, ovs_flow_used_time(used));
791
792	if (stats.n_packets)
793		NLA_PUT(skb, OVS_FLOW_ATTR_STATS,
794			sizeof(struct ovs_flow_stats), &stats);
795
796	if (tcp_flags)
797		NLA_PUT_U8(skb, OVS_FLOW_ATTR_TCP_FLAGS, tcp_flags);
798
799	/* If OVS_FLOW_ATTR_ACTIONS doesn't fit, skip dumping the actions if
800	 * this is the first flow to be dumped into 'skb'.  This is unusual for
801	 * Netlink but individual action lists can be longer than
802	 * NLMSG_GOODSIZE and thus entirely undumpable if we didn't do this.
803	 * The userspace caller can always fetch the actions separately if it
804	 * really wants them.  (Most userspace callers in fact don't care.)
805	 *
806	 * This can only fail for dump operations because the skb is always
807	 * properly sized for single flows.
808	 */
809	err = nla_put(skb, OVS_FLOW_ATTR_ACTIONS, sf_acts->actions_len,
810		      sf_acts->actions);
811	if (err < 0 && skb_orig_len)
812		goto error;
813
814	return genlmsg_end(skb, ovs_header);
815
816nla_put_failure:
817	err = -EMSGSIZE;
818error:
819	genlmsg_cancel(skb, ovs_header);
820	return err;
821}
822
823static struct sk_buff *ovs_flow_cmd_alloc_info(struct sw_flow *flow)
824{
825	const struct sw_flow_actions *sf_acts;
826	int len;
827
828	sf_acts = rcu_dereference_protected(flow->sf_acts,
829					    lockdep_genl_is_held());
830
831	/* OVS_FLOW_ATTR_KEY */
832	len = nla_total_size(FLOW_BUFSIZE);
833	/* OVS_FLOW_ATTR_ACTIONS */
834	len += nla_total_size(sf_acts->actions_len);
835	/* OVS_FLOW_ATTR_STATS */
836	len += nla_total_size(sizeof(struct ovs_flow_stats));
837	/* OVS_FLOW_ATTR_TCP_FLAGS */
838	len += nla_total_size(1);
839	/* OVS_FLOW_ATTR_USED */
840	len += nla_total_size(8);
841
842	len += NLMSG_ALIGN(sizeof(struct ovs_header));
843
844	return genlmsg_new(len, GFP_KERNEL);
845}
846
847static struct sk_buff *ovs_flow_cmd_build_info(struct sw_flow *flow,
848					       struct datapath *dp,
849					       u32 pid, u32 seq, u8 cmd)
850{
851	struct sk_buff *skb;
852	int retval;
853
854	skb = ovs_flow_cmd_alloc_info(flow);
855	if (!skb)
856		return ERR_PTR(-ENOMEM);
857
858	retval = ovs_flow_cmd_fill_info(flow, dp, skb, pid, seq, 0, cmd);
859	BUG_ON(retval < 0);
860	return skb;
861}
862
863static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
864{
865	struct nlattr **a = info->attrs;
866	struct ovs_header *ovs_header = info->userhdr;
867	struct sw_flow_key key;
868	struct sw_flow *flow;
869	struct sk_buff *reply;
870	struct datapath *dp;
871	struct flow_table *table;
872	int error;
873	int key_len;
874
875	/* Extract key. */
876	error = -EINVAL;
877	if (!a[OVS_FLOW_ATTR_KEY])
878		goto error;
879	error = ovs_flow_from_nlattrs(&key, &key_len, a[OVS_FLOW_ATTR_KEY]);
880	if (error)
881		goto error;
882
883	/* Validate actions. */
884	if (a[OVS_FLOW_ATTR_ACTIONS]) {
885		error = validate_actions(a[OVS_FLOW_ATTR_ACTIONS], &key,  0);
886		if (error)
887			goto error;
888	} else if (info->genlhdr->cmd == OVS_FLOW_CMD_NEW) {
889		error = -EINVAL;
890		goto error;
891	}
892
893	dp = get_dp(ovs_header->dp_ifindex);
894	error = -ENODEV;
895	if (!dp)
896		goto error;
897
898	table = genl_dereference(dp->table);
899	flow = ovs_flow_tbl_lookup(table, &key, key_len);
900	if (!flow) {
901		struct sw_flow_actions *acts;
902
903		/* Bail out if we're not allowed to create a new flow. */
904		error = -ENOENT;
905		if (info->genlhdr->cmd == OVS_FLOW_CMD_SET)
906			goto error;
907
908		/* Expand table, if necessary, to make room. */
909		if (ovs_flow_tbl_need_to_expand(table)) {
910			struct flow_table *new_table;
911
912			new_table = ovs_flow_tbl_expand(table);
913			if (!IS_ERR(new_table)) {
914				rcu_assign_pointer(dp->table, new_table);
915				ovs_flow_tbl_deferred_destroy(table);
916				table = genl_dereference(dp->table);
917			}
918		}
919
920		/* Allocate flow. */
921		flow = ovs_flow_alloc();
922		if (IS_ERR(flow)) {
923			error = PTR_ERR(flow);
924			goto error;
925		}
926		flow->key = key;
927		clear_stats(flow);
928
929		/* Obtain actions. */
930		acts = ovs_flow_actions_alloc(a[OVS_FLOW_ATTR_ACTIONS]);
931		error = PTR_ERR(acts);
932		if (IS_ERR(acts))
933			goto error_free_flow;
934		rcu_assign_pointer(flow->sf_acts, acts);
935
936		/* Put flow in bucket. */
937		flow->hash = ovs_flow_hash(&key, key_len);
938		ovs_flow_tbl_insert(table, flow);
939
940		reply = ovs_flow_cmd_build_info(flow, dp, info->snd_pid,
941						info->snd_seq,
942						OVS_FLOW_CMD_NEW);
943	} else {
944		/* We found a matching flow. */
945		struct sw_flow_actions *old_acts;
946		struct nlattr *acts_attrs;
947
948		/* Bail out if we're not allowed to modify an existing flow.
949		 * We accept NLM_F_CREATE in place of the intended NLM_F_EXCL
950		 * because Generic Netlink treats the latter as a dump
951		 * request.  We also accept NLM_F_EXCL in case that bug ever
952		 * gets fixed.
953		 */
954		error = -EEXIST;
955		if (info->genlhdr->cmd == OVS_FLOW_CMD_NEW &&
956		    info->nlhdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_EXCL))
957			goto error;
958
959		/* Update actions. */
960		old_acts = rcu_dereference_protected(flow->sf_acts,
961						     lockdep_genl_is_held());
962		acts_attrs = a[OVS_FLOW_ATTR_ACTIONS];
963		if (acts_attrs &&
964		   (old_acts->actions_len != nla_len(acts_attrs) ||
965		   memcmp(old_acts->actions, nla_data(acts_attrs),
966			  old_acts->actions_len))) {
967			struct sw_flow_actions *new_acts;
968
969			new_acts = ovs_flow_actions_alloc(acts_attrs);
970			error = PTR_ERR(new_acts);
971			if (IS_ERR(new_acts))
972				goto error;
973
974			rcu_assign_pointer(flow->sf_acts, new_acts);
975			ovs_flow_deferred_free_acts(old_acts);
976		}
977
978		reply = ovs_flow_cmd_build_info(flow, dp, info->snd_pid,
979					       info->snd_seq, OVS_FLOW_CMD_NEW);
980
981		/* Clear stats. */
982		if (a[OVS_FLOW_ATTR_CLEAR]) {
983			spin_lock_bh(&flow->lock);
984			clear_stats(flow);
985			spin_unlock_bh(&flow->lock);
986		}
987	}
988
989	if (!IS_ERR(reply))
990		genl_notify(reply, genl_info_net(info), info->snd_pid,
991			   ovs_dp_flow_multicast_group.id, info->nlhdr,
992			   GFP_KERNEL);
993	else
994		netlink_set_err(init_net.genl_sock, 0,
995				ovs_dp_flow_multicast_group.id, PTR_ERR(reply));
996	return 0;
997
998error_free_flow:
999	ovs_flow_free(flow);
1000error:
1001	return error;
1002}
1003
1004static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info)
1005{
1006	struct nlattr **a = info->attrs;
1007	struct ovs_header *ovs_header = info->userhdr;
1008	struct sw_flow_key key;
1009	struct sk_buff *reply;
1010	struct sw_flow *flow;
1011	struct datapath *dp;
1012	struct flow_table *table;
1013	int err;
1014	int key_len;
1015
1016	if (!a[OVS_FLOW_ATTR_KEY])
1017		return -EINVAL;
1018	err = ovs_flow_from_nlattrs(&key, &key_len, a[OVS_FLOW_ATTR_KEY]);
1019	if (err)
1020		return err;
1021
1022	dp = get_dp(ovs_header->dp_ifindex);
1023	if (!dp)
1024		return -ENODEV;
1025
1026	table = genl_dereference(dp->table);
1027	flow = ovs_flow_tbl_lookup(table, &key, key_len);
1028	if (!flow)
1029		return -ENOENT;
1030
1031	reply = ovs_flow_cmd_build_info(flow, dp, info->snd_pid,
1032					info->snd_seq, OVS_FLOW_CMD_NEW);
1033	if (IS_ERR(reply))
1034		return PTR_ERR(reply);
1035
1036	return genlmsg_reply(reply, info);
1037}
1038
1039static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
1040{
1041	struct nlattr **a = info->attrs;
1042	struct ovs_header *ovs_header = info->userhdr;
1043	struct sw_flow_key key;
1044	struct sk_buff *reply;
1045	struct sw_flow *flow;
1046	struct datapath *dp;
1047	struct flow_table *table;
1048	int err;
1049	int key_len;
1050
1051	if (!a[OVS_FLOW_ATTR_KEY])
1052		return flush_flows(ovs_header->dp_ifindex);
1053	err = ovs_flow_from_nlattrs(&key, &key_len, a[OVS_FLOW_ATTR_KEY]);
1054	if (err)
1055		return err;
1056
1057	dp = get_dp(ovs_header->dp_ifindex);
1058	if (!dp)
1059		return -ENODEV;
1060
1061	table = genl_dereference(dp->table);
1062	flow = ovs_flow_tbl_lookup(table, &key, key_len);
1063	if (!flow)
1064		return -ENOENT;
1065
1066	reply = ovs_flow_cmd_alloc_info(flow);
1067	if (!reply)
1068		return -ENOMEM;
1069
1070	ovs_flow_tbl_remove(table, flow);
1071
1072	err = ovs_flow_cmd_fill_info(flow, dp, reply, info->snd_pid,
1073				     info->snd_seq, 0, OVS_FLOW_CMD_DEL);
1074	BUG_ON(err < 0);
1075
1076	ovs_flow_deferred_free(flow);
1077
1078	genl_notify(reply, genl_info_net(info), info->snd_pid,
1079		    ovs_dp_flow_multicast_group.id, info->nlhdr, GFP_KERNEL);
1080	return 0;
1081}
1082
1083static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
1084{
1085	struct ovs_header *ovs_header = genlmsg_data(nlmsg_data(cb->nlh));
1086	struct datapath *dp;
1087	struct flow_table *table;
1088
1089	dp = get_dp(ovs_header->dp_ifindex);
1090	if (!dp)
1091		return -ENODEV;
1092
1093	table = genl_dereference(dp->table);
1094
1095	for (;;) {
1096		struct sw_flow *flow;
1097		u32 bucket, obj;
1098
1099		bucket = cb->args[0];
1100		obj = cb->args[1];
1101		flow = ovs_flow_tbl_next(table, &bucket, &obj);
1102		if (!flow)
1103			break;
1104
1105		if (ovs_flow_cmd_fill_info(flow, dp, skb,
1106					   NETLINK_CB(cb->skb).pid,
1107					   cb->nlh->nlmsg_seq, NLM_F_MULTI,
1108					   OVS_FLOW_CMD_NEW) < 0)
1109			break;
1110
1111		cb->args[0] = bucket;
1112		cb->args[1] = obj;
1113	}
1114	return skb->len;
1115}
1116
1117static struct genl_ops dp_flow_genl_ops[] = {
1118	{ .cmd = OVS_FLOW_CMD_NEW,
1119	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
1120	  .policy = flow_policy,
1121	  .doit = ovs_flow_cmd_new_or_set
1122	},
1123	{ .cmd = OVS_FLOW_CMD_DEL,
1124	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
1125	  .policy = flow_policy,
1126	  .doit = ovs_flow_cmd_del
1127	},
1128	{ .cmd = OVS_FLOW_CMD_GET,
1129	  .flags = 0,		    /* OK for unprivileged users. */
1130	  .policy = flow_policy,
1131	  .doit = ovs_flow_cmd_get,
1132	  .dumpit = ovs_flow_cmd_dump
1133	},
1134	{ .cmd = OVS_FLOW_CMD_SET,
1135	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
1136	  .policy = flow_policy,
1137	  .doit = ovs_flow_cmd_new_or_set,
1138	},
1139};
1140
1141static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = {
1142	[OVS_DP_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 },
1143	[OVS_DP_ATTR_UPCALL_PID] = { .type = NLA_U32 },
1144};
1145
1146static struct genl_family dp_datapath_genl_family = {
1147	.id = GENL_ID_GENERATE,
1148	.hdrsize = sizeof(struct ovs_header),
1149	.name = OVS_DATAPATH_FAMILY,
1150	.version = OVS_DATAPATH_VERSION,
1151	.maxattr = OVS_DP_ATTR_MAX
1152};
1153
1154static struct genl_multicast_group ovs_dp_datapath_multicast_group = {
1155	.name = OVS_DATAPATH_MCGROUP
1156};
1157
1158static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb,
1159				u32 pid, u32 seq, u32 flags, u8 cmd)
1160{
1161	struct ovs_header *ovs_header;
1162	struct ovs_dp_stats dp_stats;
1163	int err;
1164
1165	ovs_header = genlmsg_put(skb, pid, seq, &dp_datapath_genl_family,
1166				   flags, cmd);
1167	if (!ovs_header)
1168		goto error;
1169
1170	ovs_header->dp_ifindex = get_dpifindex(dp);
1171
1172	rcu_read_lock();
1173	err = nla_put_string(skb, OVS_DP_ATTR_NAME, ovs_dp_name(dp));
1174	rcu_read_unlock();
1175	if (err)
1176		goto nla_put_failure;
1177
1178	get_dp_stats(dp, &dp_stats);
1179	NLA_PUT(skb, OVS_DP_ATTR_STATS, sizeof(struct ovs_dp_stats), &dp_stats);
1180
1181	return genlmsg_end(skb, ovs_header);
1182
1183nla_put_failure:
1184	genlmsg_cancel(skb, ovs_header);
1185error:
1186	return -EMSGSIZE;
1187}
1188
1189static struct sk_buff *ovs_dp_cmd_build_info(struct datapath *dp, u32 pid,
1190					     u32 seq, u8 cmd)
1191{
1192	struct sk_buff *skb;
1193	int retval;
1194
1195	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
1196	if (!skb)
1197		return ERR_PTR(-ENOMEM);
1198
1199	retval = ovs_dp_cmd_fill_info(dp, skb, pid, seq, 0, cmd);
1200	if (retval < 0) {
1201		kfree_skb(skb);
1202		return ERR_PTR(retval);
1203	}
1204	return skb;
1205}
1206
1207/* Called with genl_mutex and optionally with RTNL lock also. */
1208static struct datapath *lookup_datapath(struct ovs_header *ovs_header,
1209					struct nlattr *a[OVS_DP_ATTR_MAX + 1])
1210{
1211	struct datapath *dp;
1212
1213	if (!a[OVS_DP_ATTR_NAME])
1214		dp = get_dp(ovs_header->dp_ifindex);
1215	else {
1216		struct vport *vport;
1217
1218		rcu_read_lock();
1219		vport = ovs_vport_locate(nla_data(a[OVS_DP_ATTR_NAME]));
1220		dp = vport && vport->port_no == OVSP_LOCAL ? vport->dp : NULL;
1221		rcu_read_unlock();
1222	}
1223	return dp ? dp : ERR_PTR(-ENODEV);
1224}
1225
1226static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
1227{
1228	struct nlattr **a = info->attrs;
1229	struct vport_parms parms;
1230	struct sk_buff *reply;
1231	struct datapath *dp;
1232	struct vport *vport;
1233	int err;
1234
1235	err = -EINVAL;
1236	if (!a[OVS_DP_ATTR_NAME] || !a[OVS_DP_ATTR_UPCALL_PID])
1237		goto err;
1238
1239	rtnl_lock();
1240	err = -ENODEV;
1241	if (!try_module_get(THIS_MODULE))
1242		goto err_unlock_rtnl;
1243
1244	err = -ENOMEM;
1245	dp = kzalloc(sizeof(*dp), GFP_KERNEL);
1246	if (dp == NULL)
1247		goto err_put_module;
1248	INIT_LIST_HEAD(&dp->port_list);
1249
1250	/* Allocate table. */
1251	err = -ENOMEM;
1252	rcu_assign_pointer(dp->table, ovs_flow_tbl_alloc(TBL_MIN_BUCKETS));
1253	if (!dp->table)
1254		goto err_free_dp;
1255
1256	dp->stats_percpu = alloc_percpu(struct dp_stats_percpu);
1257	if (!dp->stats_percpu) {
1258		err = -ENOMEM;
1259		goto err_destroy_table;
1260	}
1261
1262	/* Set up our datapath device. */
1263	parms.name = nla_data(a[OVS_DP_ATTR_NAME]);
1264	parms.type = OVS_VPORT_TYPE_INTERNAL;
1265	parms.options = NULL;
1266	parms.dp = dp;
1267	parms.port_no = OVSP_LOCAL;
1268	parms.upcall_pid = nla_get_u32(a[OVS_DP_ATTR_UPCALL_PID]);
1269
1270	vport = new_vport(&parms);
1271	if (IS_ERR(vport)) {
1272		err = PTR_ERR(vport);
1273		if (err == -EBUSY)
1274			err = -EEXIST;
1275
1276		goto err_destroy_percpu;
1277	}
1278
1279	reply = ovs_dp_cmd_build_info(dp, info->snd_pid,
1280				      info->snd_seq, OVS_DP_CMD_NEW);
1281	err = PTR_ERR(reply);
1282	if (IS_ERR(reply))
1283		goto err_destroy_local_port;
1284
1285	list_add_tail(&dp->list_node, &dps);
1286	rtnl_unlock();
1287
1288	genl_notify(reply, genl_info_net(info), info->snd_pid,
1289		    ovs_dp_datapath_multicast_group.id, info->nlhdr,
1290		    GFP_KERNEL);
1291	return 0;
1292
1293err_destroy_local_port:
1294	ovs_dp_detach_port(rtnl_dereference(dp->ports[OVSP_LOCAL]));
1295err_destroy_percpu:
1296	free_percpu(dp->stats_percpu);
1297err_destroy_table:
1298	ovs_flow_tbl_destroy(genl_dereference(dp->table));
1299err_free_dp:
1300	kfree(dp);
1301err_put_module:
1302	module_put(THIS_MODULE);
1303err_unlock_rtnl:
1304	rtnl_unlock();
1305err:
1306	return err;
1307}
1308
1309static int ovs_dp_cmd_del(struct sk_buff *skb, struct genl_info *info)
1310{
1311	struct vport *vport, *next_vport;
1312	struct sk_buff *reply;
1313	struct datapath *dp;
1314	int err;
1315
1316	rtnl_lock();
1317	dp = lookup_datapath(info->userhdr, info->attrs);
1318	err = PTR_ERR(dp);
1319	if (IS_ERR(dp))
1320		goto exit_unlock;
1321
1322	reply = ovs_dp_cmd_build_info(dp, info->snd_pid,
1323				      info->snd_seq, OVS_DP_CMD_DEL);
1324	err = PTR_ERR(reply);
1325	if (IS_ERR(reply))
1326		goto exit_unlock;
1327
1328	list_for_each_entry_safe(vport, next_vport, &dp->port_list, node)
1329		if (vport->port_no != OVSP_LOCAL)
1330			ovs_dp_detach_port(vport);
1331
1332	list_del(&dp->list_node);
1333	ovs_dp_detach_port(rtnl_dereference(dp->ports[OVSP_LOCAL]));
1334
1335	/* rtnl_unlock() will wait until all the references to devices that
1336	 * are pending unregistration have been dropped.  We do it here to
1337	 * ensure that any internal devices (which contain DP pointers) are
1338	 * fully destroyed before freeing the datapath.
1339	 */
1340	rtnl_unlock();
1341
1342	call_rcu(&dp->rcu, destroy_dp_rcu);
1343	module_put(THIS_MODULE);
1344
1345	genl_notify(reply, genl_info_net(info), info->snd_pid,
1346		    ovs_dp_datapath_multicast_group.id, info->nlhdr,
1347		    GFP_KERNEL);
1348
1349	return 0;
1350
1351exit_unlock:
1352	rtnl_unlock();
1353	return err;
1354}
1355
1356static int ovs_dp_cmd_set(struct sk_buff *skb, struct genl_info *info)
1357{
1358	struct sk_buff *reply;
1359	struct datapath *dp;
1360	int err;
1361
1362	dp = lookup_datapath(info->userhdr, info->attrs);
1363	if (IS_ERR(dp))
1364		return PTR_ERR(dp);
1365
1366	reply = ovs_dp_cmd_build_info(dp, info->snd_pid,
1367				      info->snd_seq, OVS_DP_CMD_NEW);
1368	if (IS_ERR(reply)) {
1369		err = PTR_ERR(reply);
1370		netlink_set_err(init_net.genl_sock, 0,
1371				ovs_dp_datapath_multicast_group.id, err);
1372		return 0;
1373	}
1374
1375	genl_notify(reply, genl_info_net(info), info->snd_pid,
1376		    ovs_dp_datapath_multicast_group.id, info->nlhdr,
1377		    GFP_KERNEL);
1378
1379	return 0;
1380}
1381
1382static int ovs_dp_cmd_get(struct sk_buff *skb, struct genl_info *info)
1383{
1384	struct sk_buff *reply;
1385	struct datapath *dp;
1386
1387	dp = lookup_datapath(info->userhdr, info->attrs);
1388	if (IS_ERR(dp))
1389		return PTR_ERR(dp);
1390
1391	reply = ovs_dp_cmd_build_info(dp, info->snd_pid,
1392				      info->snd_seq, OVS_DP_CMD_NEW);
1393	if (IS_ERR(reply))
1394		return PTR_ERR(reply);
1395
1396	return genlmsg_reply(reply, info);
1397}
1398
1399static int ovs_dp_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
1400{
1401	struct datapath *dp;
1402	int skip = cb->args[0];
1403	int i = 0;
1404
1405	list_for_each_entry(dp, &dps, list_node) {
1406		if (i >= skip &&
1407		    ovs_dp_cmd_fill_info(dp, skb, NETLINK_CB(cb->skb).pid,
1408					 cb->nlh->nlmsg_seq, NLM_F_MULTI,
1409					 OVS_DP_CMD_NEW) < 0)
1410			break;
1411		i++;
1412	}
1413
1414	cb->args[0] = i;
1415
1416	return skb->len;
1417}
1418
1419static struct genl_ops dp_datapath_genl_ops[] = {
1420	{ .cmd = OVS_DP_CMD_NEW,
1421	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
1422	  .policy = datapath_policy,
1423	  .doit = ovs_dp_cmd_new
1424	},
1425	{ .cmd = OVS_DP_CMD_DEL,
1426	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
1427	  .policy = datapath_policy,
1428	  .doit = ovs_dp_cmd_del
1429	},
1430	{ .cmd = OVS_DP_CMD_GET,
1431	  .flags = 0,		    /* OK for unprivileged users. */
1432	  .policy = datapath_policy,
1433	  .doit = ovs_dp_cmd_get,
1434	  .dumpit = ovs_dp_cmd_dump
1435	},
1436	{ .cmd = OVS_DP_CMD_SET,
1437	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
1438	  .policy = datapath_policy,
1439	  .doit = ovs_dp_cmd_set,
1440	},
1441};
1442
1443static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = {
1444	[OVS_VPORT_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 },
1445	[OVS_VPORT_ATTR_STATS] = { .len = sizeof(struct ovs_vport_stats) },
1446	[OVS_VPORT_ATTR_PORT_NO] = { .type = NLA_U32 },
1447	[OVS_VPORT_ATTR_TYPE] = { .type = NLA_U32 },
1448	[OVS_VPORT_ATTR_UPCALL_PID] = { .type = NLA_U32 },
1449	[OVS_VPORT_ATTR_OPTIONS] = { .type = NLA_NESTED },
1450};
1451
1452static struct genl_family dp_vport_genl_family = {
1453	.id = GENL_ID_GENERATE,
1454	.hdrsize = sizeof(struct ovs_header),
1455	.name = OVS_VPORT_FAMILY,
1456	.version = OVS_VPORT_VERSION,
1457	.maxattr = OVS_VPORT_ATTR_MAX
1458};
1459
1460struct genl_multicast_group ovs_dp_vport_multicast_group = {
1461	.name = OVS_VPORT_MCGROUP
1462};
1463
1464/* Called with RTNL lock or RCU read lock. */
1465static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb,
1466				   u32 pid, u32 seq, u32 flags, u8 cmd)
1467{
1468	struct ovs_header *ovs_header;
1469	struct ovs_vport_stats vport_stats;
1470	int err;
1471
1472	ovs_header = genlmsg_put(skb, pid, seq, &dp_vport_genl_family,
1473				 flags, cmd);
1474	if (!ovs_header)
1475		return -EMSGSIZE;
1476
1477	ovs_header->dp_ifindex = get_dpifindex(vport->dp);
1478
1479	NLA_PUT_U32(skb, OVS_VPORT_ATTR_PORT_NO, vport->port_no);
1480	NLA_PUT_U32(skb, OVS_VPORT_ATTR_TYPE, vport->ops->type);
1481	NLA_PUT_STRING(skb, OVS_VPORT_ATTR_NAME, vport->ops->get_name(vport));
1482	NLA_PUT_U32(skb, OVS_VPORT_ATTR_UPCALL_PID, vport->upcall_pid);
1483
1484	ovs_vport_get_stats(vport, &vport_stats);
1485	NLA_PUT(skb, OVS_VPORT_ATTR_STATS, sizeof(struct ovs_vport_stats),
1486		&vport_stats);
1487
1488	err = ovs_vport_get_options(vport, skb);
1489	if (err == -EMSGSIZE)
1490		goto error;
1491
1492	return genlmsg_end(skb, ovs_header);
1493
1494nla_put_failure:
1495	err = -EMSGSIZE;
1496error:
1497	genlmsg_cancel(skb, ovs_header);
1498	return err;
1499}
1500
1501/* Called with RTNL lock or RCU read lock. */
1502struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, u32 pid,
1503					 u32 seq, u8 cmd)
1504{
1505	struct sk_buff *skb;
1506	int retval;
1507
1508	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
1509	if (!skb)
1510		return ERR_PTR(-ENOMEM);
1511
1512	retval = ovs_vport_cmd_fill_info(vport, skb, pid, seq, 0, cmd);
1513	if (retval < 0) {
1514		kfree_skb(skb);
1515		return ERR_PTR(retval);
1516	}
1517	return skb;
1518}
1519
1520/* Called with RTNL lock or RCU read lock. */
1521static struct vport *lookup_vport(struct ovs_header *ovs_header,
1522				  struct nlattr *a[OVS_VPORT_ATTR_MAX + 1])
1523{
1524	struct datapath *dp;
1525	struct vport *vport;
1526
1527	if (a[OVS_VPORT_ATTR_NAME]) {
1528		vport = ovs_vport_locate(nla_data(a[OVS_VPORT_ATTR_NAME]));
1529		if (!vport)
1530			return ERR_PTR(-ENODEV);
1531		if (ovs_header->dp_ifindex &&
1532		    ovs_header->dp_ifindex != get_dpifindex(vport->dp))
1533			return ERR_PTR(-ENODEV);
1534		return vport;
1535	} else if (a[OVS_VPORT_ATTR_PORT_NO]) {
1536		u32 port_no = nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]);
1537
1538		if (port_no >= DP_MAX_PORTS)
1539			return ERR_PTR(-EFBIG);
1540
1541		dp = get_dp(ovs_header->dp_ifindex);
1542		if (!dp)
1543			return ERR_PTR(-ENODEV);
1544
1545		vport = rcu_dereference_rtnl(dp->ports[port_no]);
1546		if (!vport)
1547			return ERR_PTR(-ENOENT);
1548		return vport;
1549	} else
1550		return ERR_PTR(-EINVAL);
1551}
1552
1553static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info)
1554{
1555	struct nlattr **a = info->attrs;
1556	struct ovs_header *ovs_header = info->userhdr;
1557	struct vport_parms parms;
1558	struct sk_buff *reply;
1559	struct vport *vport;
1560	struct datapath *dp;
1561	u32 port_no;
1562	int err;
1563
1564	err = -EINVAL;
1565	if (!a[OVS_VPORT_ATTR_NAME] || !a[OVS_VPORT_ATTR_TYPE] ||
1566	    !a[OVS_VPORT_ATTR_UPCALL_PID])
1567		goto exit;
1568
1569	rtnl_lock();
1570	dp = get_dp(ovs_header->dp_ifindex);
1571	err = -ENODEV;
1572	if (!dp)
1573		goto exit_unlock;
1574
1575	if (a[OVS_VPORT_ATTR_PORT_NO]) {
1576		port_no = nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]);
1577
1578		err = -EFBIG;
1579		if (port_no >= DP_MAX_PORTS)
1580			goto exit_unlock;
1581
1582		vport = rtnl_dereference(dp->ports[port_no]);
1583		err = -EBUSY;
1584		if (vport)
1585			goto exit_unlock;
1586	} else {
1587		for (port_no = 1; ; port_no++) {
1588			if (port_no >= DP_MAX_PORTS) {
1589				err = -EFBIG;
1590				goto exit_unlock;
1591			}
1592			vport = rtnl_dereference(dp->ports[port_no]);
1593			if (!vport)
1594				break;
1595		}
1596	}
1597
1598	parms.name = nla_data(a[OVS_VPORT_ATTR_NAME]);
1599	parms.type = nla_get_u32(a[OVS_VPORT_ATTR_TYPE]);
1600	parms.options = a[OVS_VPORT_ATTR_OPTIONS];
1601	parms.dp = dp;
1602	parms.port_no = port_no;
1603	parms.upcall_pid = nla_get_u32(a[OVS_VPORT_ATTR_UPCALL_PID]);
1604
1605	vport = new_vport(&parms);
1606	err = PTR_ERR(vport);
1607	if (IS_ERR(vport))
1608		goto exit_unlock;
1609
1610	reply = ovs_vport_cmd_build_info(vport, info->snd_pid, info->snd_seq,
1611					 OVS_VPORT_CMD_NEW);
1612	if (IS_ERR(reply)) {
1613		err = PTR_ERR(reply);
1614		ovs_dp_detach_port(vport);
1615		goto exit_unlock;
1616	}
1617	genl_notify(reply, genl_info_net(info), info->snd_pid,
1618		    ovs_dp_vport_multicast_group.id, info->nlhdr, GFP_KERNEL);
1619
1620exit_unlock:
1621	rtnl_unlock();
1622exit:
1623	return err;
1624}
1625
1626static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info)
1627{
1628	struct nlattr **a = info->attrs;
1629	struct sk_buff *reply;
1630	struct vport *vport;
1631	int err;
1632
1633	rtnl_lock();
1634	vport = lookup_vport(info->userhdr, a);
1635	err = PTR_ERR(vport);
1636	if (IS_ERR(vport))
1637		goto exit_unlock;
1638
1639	err = 0;
1640	if (a[OVS_VPORT_ATTR_TYPE] &&
1641	    nla_get_u32(a[OVS_VPORT_ATTR_TYPE]) != vport->ops->type)
1642		err = -EINVAL;
1643
1644	if (!err && a[OVS_VPORT_ATTR_OPTIONS])
1645		err = ovs_vport_set_options(vport, a[OVS_VPORT_ATTR_OPTIONS]);
1646	if (!err && a[OVS_VPORT_ATTR_UPCALL_PID])
1647		vport->upcall_pid = nla_get_u32(a[OVS_VPORT_ATTR_UPCALL_PID]);
1648
1649	reply = ovs_vport_cmd_build_info(vport, info->snd_pid, info->snd_seq,
1650					 OVS_VPORT_CMD_NEW);
1651	if (IS_ERR(reply)) {
1652		netlink_set_err(init_net.genl_sock, 0,
1653				ovs_dp_vport_multicast_group.id, PTR_ERR(reply));
1654		goto exit_unlock;
1655	}
1656
1657	genl_notify(reply, genl_info_net(info), info->snd_pid,
1658		    ovs_dp_vport_multicast_group.id, info->nlhdr, GFP_KERNEL);
1659
1660exit_unlock:
1661	rtnl_unlock();
1662	return err;
1663}
1664
1665static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info)
1666{
1667	struct nlattr **a = info->attrs;
1668	struct sk_buff *reply;
1669	struct vport *vport;
1670	int err;
1671
1672	rtnl_lock();
1673	vport = lookup_vport(info->userhdr, a);
1674	err = PTR_ERR(vport);
1675	if (IS_ERR(vport))
1676		goto exit_unlock;
1677
1678	if (vport->port_no == OVSP_LOCAL) {
1679		err = -EINVAL;
1680		goto exit_unlock;
1681	}
1682
1683	reply = ovs_vport_cmd_build_info(vport, info->snd_pid, info->snd_seq,
1684					 OVS_VPORT_CMD_DEL);
1685	err = PTR_ERR(reply);
1686	if (IS_ERR(reply))
1687		goto exit_unlock;
1688
1689	ovs_dp_detach_port(vport);
1690
1691	genl_notify(reply, genl_info_net(info), info->snd_pid,
1692		    ovs_dp_vport_multicast_group.id, info->nlhdr, GFP_KERNEL);
1693
1694exit_unlock:
1695	rtnl_unlock();
1696	return err;
1697}
1698
1699static int ovs_vport_cmd_get(struct sk_buff *skb, struct genl_info *info)
1700{
1701	struct nlattr **a = info->attrs;
1702	struct ovs_header *ovs_header = info->userhdr;
1703	struct sk_buff *reply;
1704	struct vport *vport;
1705	int err;
1706
1707	rcu_read_lock();
1708	vport = lookup_vport(ovs_header, a);
1709	err = PTR_ERR(vport);
1710	if (IS_ERR(vport))
1711		goto exit_unlock;
1712
1713	reply = ovs_vport_cmd_build_info(vport, info->snd_pid, info->snd_seq,
1714					 OVS_VPORT_CMD_NEW);
1715	err = PTR_ERR(reply);
1716	if (IS_ERR(reply))
1717		goto exit_unlock;
1718
1719	rcu_read_unlock();
1720
1721	return genlmsg_reply(reply, info);
1722
1723exit_unlock:
1724	rcu_read_unlock();
1725	return err;
1726}
1727
1728static int ovs_vport_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
1729{
1730	struct ovs_header *ovs_header = genlmsg_data(nlmsg_data(cb->nlh));
1731	struct datapath *dp;
1732	u32 port_no;
1733	int retval;
1734
1735	dp = get_dp(ovs_header->dp_ifindex);
1736	if (!dp)
1737		return -ENODEV;
1738
1739	rcu_read_lock();
1740	for (port_no = cb->args[0]; port_no < DP_MAX_PORTS; port_no++) {
1741		struct vport *vport;
1742
1743		vport = rcu_dereference(dp->ports[port_no]);
1744		if (!vport)
1745			continue;
1746
1747		if (ovs_vport_cmd_fill_info(vport, skb, NETLINK_CB(cb->skb).pid,
1748					    cb->nlh->nlmsg_seq, NLM_F_MULTI,
1749					    OVS_VPORT_CMD_NEW) < 0)
1750			break;
1751	}
1752	rcu_read_unlock();
1753
1754	cb->args[0] = port_no;
1755	retval = skb->len;
1756
1757	return retval;
1758}
1759
1760static void rehash_flow_table(struct work_struct *work)
1761{
1762	struct datapath *dp;
1763
1764	genl_lock();
1765
1766	list_for_each_entry(dp, &dps, list_node) {
1767		struct flow_table *old_table = genl_dereference(dp->table);
1768		struct flow_table *new_table;
1769
1770		new_table = ovs_flow_tbl_rehash(old_table);
1771		if (!IS_ERR(new_table)) {
1772			rcu_assign_pointer(dp->table, new_table);
1773			ovs_flow_tbl_deferred_destroy(old_table);
1774		}
1775	}
1776
1777	genl_unlock();
1778
1779	schedule_delayed_work(&rehash_flow_wq, REHASH_FLOW_INTERVAL);
1780}
1781
1782static struct genl_ops dp_vport_genl_ops[] = {
1783	{ .cmd = OVS_VPORT_CMD_NEW,
1784	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
1785	  .policy = vport_policy,
1786	  .doit = ovs_vport_cmd_new
1787	},
1788	{ .cmd = OVS_VPORT_CMD_DEL,
1789	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
1790	  .policy = vport_policy,
1791	  .doit = ovs_vport_cmd_del
1792	},
1793	{ .cmd = OVS_VPORT_CMD_GET,
1794	  .flags = 0,		    /* OK for unprivileged users. */
1795	  .policy = vport_policy,
1796	  .doit = ovs_vport_cmd_get,
1797	  .dumpit = ovs_vport_cmd_dump
1798	},
1799	{ .cmd = OVS_VPORT_CMD_SET,
1800	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
1801	  .policy = vport_policy,
1802	  .doit = ovs_vport_cmd_set,
1803	},
1804};
1805
/* One row of the registration table walked by dp_register_genl().  The
 * member order matters: dp_genl_families may initialize rows positionally.
 */
struct genl_family_and_ops {
	struct genl_family *family;		/* Family to register. */
	struct genl_ops *ops;			/* Its operations array. */
	int n_ops;				/* Number of entries in 'ops'. */
	struct genl_multicast_group *group;	/* Optional group, or NULL. */
};
1812
1813static const struct genl_family_and_ops dp_genl_families[] = {
1814	{ &dp_datapath_genl_family,
1815	  dp_datapath_genl_ops, ARRAY_SIZE(dp_datapath_genl_ops),
1816	  &ovs_dp_datapath_multicast_group },
1817	{ &dp_vport_genl_family,
1818	  dp_vport_genl_ops, ARRAY_SIZE(dp_vport_genl_ops),
1819	  &ovs_dp_vport_multicast_group },
1820	{ &dp_flow_genl_family,
1821	  dp_flow_genl_ops, ARRAY_SIZE(dp_flow_genl_ops),
1822	  &ovs_dp_flow_multicast_group },
1823	{ &dp_packet_genl_family,
1824	  dp_packet_genl_ops, ARRAY_SIZE(dp_packet_genl_ops),
1825	  NULL },
1826};
1827
1828static void dp_unregister_genl(int n_families)
1829{
1830	int i;
1831
1832	for (i = 0; i < n_families; i++)
1833		genl_unregister_family(dp_genl_families[i].family);
1834}
1835
1836static int dp_register_genl(void)
1837{
1838	int n_registered;
1839	int err;
1840	int i;
1841
1842	n_registered = 0;
1843	for (i = 0; i < ARRAY_SIZE(dp_genl_families); i++) {
1844		const struct genl_family_and_ops *f = &dp_genl_families[i];
1845
1846		err = genl_register_family_with_ops(f->family, f->ops,
1847						    f->n_ops);
1848		if (err)
1849			goto error;
1850		n_registered++;
1851
1852		if (f->group) {
1853			err = genl_register_mc_group(f->family, f->group);
1854			if (err)
1855				goto error;
1856		}
1857	}
1858
1859	return 0;
1860
1861error:
1862	dp_unregister_genl(n_registered);
1863	return err;
1864}
1865
1866static int __init dp_init(void)
1867{
1868	struct sk_buff *dummy_skb;
1869	int err;
1870
1871	BUILD_BUG_ON(sizeof(struct ovs_skb_cb) > sizeof(dummy_skb->cb));
1872
1873	pr_info("Open vSwitch switching datapath\n");
1874
1875	err = ovs_flow_init();
1876	if (err)
1877		goto error;
1878
1879	err = ovs_vport_init();
1880	if (err)
1881		goto error_flow_exit;
1882
1883	err = register_netdevice_notifier(&ovs_dp_device_notifier);
1884	if (err)
1885		goto error_vport_exit;
1886
1887	err = dp_register_genl();
1888	if (err < 0)
1889		goto error_unreg_notifier;
1890
1891	schedule_delayed_work(&rehash_flow_wq, REHASH_FLOW_INTERVAL);
1892
1893	return 0;
1894
1895error_unreg_notifier:
1896	unregister_netdevice_notifier(&ovs_dp_device_notifier);
1897error_vport_exit:
1898	ovs_vport_exit();
1899error_flow_exit:
1900	ovs_flow_exit();
1901error:
1902	return err;
1903}
1904
1905static void dp_cleanup(void)
1906{
1907	cancel_delayed_work_sync(&rehash_flow_wq);
1908	rcu_barrier();
1909	dp_unregister_genl(ARRAY_SIZE(dp_genl_families));
1910	unregister_netdevice_notifier(&ovs_dp_device_notifier);
1911	ovs_vport_exit();
1912	ovs_flow_exit();
1913}
1914
1915module_init(dp_init);
1916module_exit(dp_cleanup);
1917
1918MODULE_DESCRIPTION("Open vSwitch switching datapath");
1919MODULE_LICENSE("GPL");
1920