af_packet.c revision 614f60fa9d73a9e8fdff3df83381907fea7c5649
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		PACKET - implements raw packet sockets.
7 *
8 * Authors:	Ross Biro
9 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
11 *
12 * Fixes:
13 *		Alan Cox	:	verify_area() now used correctly
14 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
15 *		Alan Cox	:	tidied skbuff lists.
16 *		Alan Cox	:	Now uses generic datagram routines I
17 *					added. Also fixed the peek/read crash
18 *					from all old Linux datagram code.
19 *		Alan Cox	:	Uses the improved datagram code.
20 *		Alan Cox	:	Added NULL's for socket options.
21 *		Alan Cox	:	Re-commented the code.
22 *		Alan Cox	:	Use new kernel side addressing
23 *		Rob Janssen	:	Correct MTU usage.
24 *		Dave Platt	:	Counter leaks caused by incorrect
25 *					interrupt locking and some slightly
26 *					dubious gcc output. Can you read
27 *					compiler: it said _VOLATILE_
28 *	Richard Kooijman	:	Timestamp fixes.
29 *		Alan Cox	:	New buffers. Use sk->mac.raw.
30 *		Alan Cox	:	sendmsg/recvmsg support.
31 *		Alan Cox	:	Protocol setting support
32 *	Alexey Kuznetsov	:	Untied from IPv4 stack.
33 *	Cyrus Durgin		:	Fixed kerneld for kmod.
34 *	Michal Ostrowski        :       Module initialization cleanup.
35 *         Ulises Alonso        :       Frame number limit removal and
36 *                                      packet_set_ring memory leak.
37 *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
38 *					The convention is that longer addresses
39 *					will simply extend the hardware address
40 *					byte arrays at the end of sockaddr_ll
41 *					and packet_mreq.
42 *		Johann Baudy	:	Added TX RING.
43 *
44 *		This program is free software; you can redistribute it and/or
45 *		modify it under the terms of the GNU General Public License
46 *		as published by the Free Software Foundation; either version
47 *		2 of the License, or (at your option) any later version.
48 *
49 */
50
51#include <linux/types.h>
52#include <linux/mm.h>
53#include <linux/capability.h>
54#include <linux/fcntl.h>
55#include <linux/socket.h>
56#include <linux/in.h>
57#include <linux/inet.h>
58#include <linux/netdevice.h>
59#include <linux/if_packet.h>
60#include <linux/wireless.h>
61#include <linux/kernel.h>
62#include <linux/kmod.h>
63#include <linux/slab.h>
64#include <net/net_namespace.h>
65#include <net/ip.h>
66#include <net/protocol.h>
67#include <linux/skbuff.h>
68#include <net/sock.h>
69#include <linux/errno.h>
70#include <linux/timer.h>
71#include <asm/system.h>
72#include <asm/uaccess.h>
73#include <asm/ioctls.h>
74#include <asm/page.h>
75#include <asm/cacheflush.h>
76#include <asm/io.h>
77#include <linux/proc_fs.h>
78#include <linux/seq_file.h>
79#include <linux/poll.h>
80#include <linux/module.h>
81#include <linux/init.h>
82#include <linux/mutex.h>
83#include <linux/if_vlan.h>
84#include <linux/virtio_net.h>
85#include <linux/errqueue.h>
86#include <linux/net_tstamp.h>
87
88#ifdef CONFIG_INET
89#include <net/inet_common.h>
90#endif
91
92/*
93   Assumptions:
94   - if the device has no dev->hard_header routine, it adds and removes the ll
95     header inside itself. In this case the ll header is invisible outside of
96     the device, but higher levels still should reserve dev->hard_header_len.
97     Some devices are clever enough to reallocate the skb when the header
98     will not fit into the reserved space (tunnel); others are not
99     (PPP).
100   - the packet socket receives packets with the ll header already pulled off,
101     so SOCK_RAW has to push it back.
102
103On receive:
104-----------
105
106Incoming, dev->hard_header!=NULL
107   mac_header -> ll header
108   data       -> data
109
110Outgoing, dev->hard_header!=NULL
111   mac_header -> ll header
112   data       -> ll header
113
114Incoming, dev->hard_header==NULL
115   mac_header -> UNKNOWN position. It is very likely that it points to the ll
116		 header. PPP does this, which is wrong, because it introduces
117		 asymmetry between the rx and tx paths.
118   data       -> data
119
120Outgoing, dev->hard_header==NULL
121   mac_header -> data. ll header is still not built!
122   data       -> data
123
124Summary
125  If dev->hard_header==NULL we are unlikely to restore a sensible ll header.
126
127
128On transmit:
129------------
130
131dev->hard_header != NULL
132   mac_header -> ll header
133   data       -> ll header
134
135dev->hard_header == NULL (ll header is added by device, we cannot control it)
136   mac_header -> data
137   data       -> data
138
139   We should set nh.raw on output to the correct position;
140   the packet classifier depends on it.
141 */
142
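/*
 * Illustrative sketch, not part of the original file: the user-space view of
 * the sockets implemented below.  A raw packet socket bound to one interface
 * (the name "eth0" is only an example) receives complete link-layer frames;
 * needs <sys/socket.h>, <linux/if_packet.h>, <linux/if_ether.h>, <net/if.h>
 * and CAP_NET_RAW:
 *
 *	char buf[2048];
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	struct sockaddr_ll sll = {
 *		.sll_family   = AF_PACKET,
 *		.sll_protocol = htons(ETH_P_ALL),
 *		.sll_ifindex  = if_nametoindex("eth0"),
 *	};
 *	bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 *	recvfrom(fd, buf, sizeof(buf), 0, NULL, NULL);
 */
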
143/* Private packet socket structures. */
144
145struct packet_mclist {
146	struct packet_mclist	*next;
147	int			ifindex;
148	int			count;
149	unsigned short		type;
150	unsigned short		alen;
151	unsigned char		addr[MAX_ADDR_LEN];
152};
153/* identical to struct packet_mreq except it has
154 * a longer address field.
155 */
156struct packet_mreq_max {
157	int		mr_ifindex;
158	unsigned short	mr_type;
159	unsigned short	mr_alen;
160	unsigned char	mr_address[MAX_ADDR_LEN];
161};
162
163static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
164		int closing, int tx_ring);
165
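/* One mmap()ed ring (RX or TX): pg_vec holds pg_vec_len blocks of pages, each
 * block containing frames_per_block frames of frame_size bytes.  head is the
 * next frame index to use, frame_max the last valid index, and pending counts
 * TX frames handed to the device but not yet released back to user space.
 */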
166struct packet_ring_buffer {
167	char			**pg_vec;
168	unsigned int		head;
169	unsigned int		frames_per_block;
170	unsigned int		frame_size;
171	unsigned int		frame_max;
172
173	unsigned int		pg_vec_order;
174	unsigned int		pg_vec_pages;
175	unsigned int		pg_vec_len;
176
177	atomic_t		pending;
178};
179
180struct packet_sock;
181static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
182
183static void packet_flush_mclist(struct sock *sk);
184
185struct packet_sock {
186	/* struct sock has to be the first member of packet_sock */
187	struct sock		sk;
188	struct tpacket_stats	stats;
189	struct packet_ring_buffer	rx_ring;
190	struct packet_ring_buffer	tx_ring;
191	int			copy_thresh;
192	spinlock_t		bind_lock;
193	struct mutex		pg_vec_lock;
194	unsigned int		running:1,	/* prot_hook is attached*/
195				auxdata:1,
196				origdev:1,
197				has_vnet_hdr:1;
198	int			ifindex;	/* bound device		*/
199	__be16			num;
200	struct packet_mclist	*mclist;
201	atomic_t		mapped;
202	enum tpacket_versions	tp_version;
203	unsigned int		tp_hdrlen;
204	unsigned int		tp_reserve;
205	unsigned int		tp_loss:1;
206	unsigned int		tp_tstamp;
207	struct packet_type	prot_hook ____cacheline_aligned_in_smp;
208};
209
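/* Per-skb private state kept in skb->cb while the packet waits in the receive
 * queue: the original packet length and the address that recvmsg() later
 * returns in msg_name.
 */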
210struct packet_skb_cb {
211	unsigned int origlen;
212	union {
213		struct sockaddr_pkt pkt;
214		struct sockaddr_ll ll;
215	} sa;
216};
217
218#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))
219
220static void __packet_set_status(struct packet_sock *po, void *frame, int status)
221{
222	union {
223		struct tpacket_hdr *h1;
224		struct tpacket2_hdr *h2;
225		void *raw;
226	} h;
227
228	h.raw = frame;
229	switch (po->tp_version) {
230	case TPACKET_V1:
231		h.h1->tp_status = status;
232		flush_dcache_page(virt_to_page(&h.h1->tp_status));
233		break;
234	case TPACKET_V2:
235		h.h2->tp_status = status;
236		flush_dcache_page(virt_to_page(&h.h2->tp_status));
237		break;
238	default:
239		pr_err("TPACKET version not supported\n");
240		BUG();
241	}
242
243	smp_wmb();
244}
245
246static int __packet_get_status(struct packet_sock *po, void *frame)
247{
248	union {
249		struct tpacket_hdr *h1;
250		struct tpacket2_hdr *h2;
251		void *raw;
252	} h;
253
254	smp_rmb();
255
256	h.raw = frame;
257	switch (po->tp_version) {
258	case TPACKET_V1:
259		flush_dcache_page(virt_to_page(&h.h1->tp_status));
260		return h.h1->tp_status;
261	case TPACKET_V2:
262		flush_dcache_page(virt_to_page(&h.h2->tp_status));
263		return h.h2->tp_status;
264	default:
265		pr_err("TPACKET version not supported\n");
266		BUG();
267		return 0;
268	}
269}
270
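/* Translate a frame index into its address within the ring: the index selects
 * a block in pg_vec and a frame inside that block.  The frame is returned only
 * if its status word matches the requested status, otherwise NULL.
 */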
271static void *packet_lookup_frame(struct packet_sock *po,
272		struct packet_ring_buffer *rb,
273		unsigned int position,
274		int status)
275{
276	unsigned int pg_vec_pos, frame_offset;
277	union {
278		struct tpacket_hdr *h1;
279		struct tpacket2_hdr *h2;
280		void *raw;
281	} h;
282
283	pg_vec_pos = position / rb->frames_per_block;
284	frame_offset = position % rb->frames_per_block;
285
286	h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size);
287
288	if (status != __packet_get_status(po, h.raw))
289		return NULL;
290
291	return h.raw;
292}
293
294static inline void *packet_current_frame(struct packet_sock *po,
295		struct packet_ring_buffer *rb,
296		int status)
297{
298	return packet_lookup_frame(po, rb, rb->head, status);
299}
300
301static inline void *packet_previous_frame(struct packet_sock *po,
302		struct packet_ring_buffer *rb,
303		int status)
304{
305	unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
306	return packet_lookup_frame(po, rb, previous, status);
307}
308
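/* Advance the ring head by one frame, wrapping back to 0 after frame_max. */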
309static inline void packet_increment_head(struct packet_ring_buffer *buff)
310{
311	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
312}
313
314static inline struct packet_sock *pkt_sk(struct sock *sk)
315{
316	return (struct packet_sock *)sk;
317}
318
319static void packet_sock_destruct(struct sock *sk)
320{
321	skb_queue_purge(&sk->sk_error_queue);
322
323	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
324	WARN_ON(atomic_read(&sk->sk_wmem_alloc));
325
326	if (!sock_flag(sk, SOCK_DEAD)) {
327		pr_err("Attempt to release alive packet socket: %p\n", sk);
328		return;
329	}
330
331	sk_refcnt_debug_dec(sk);
332}
333
334
335static const struct proto_ops packet_ops;
336
337static const struct proto_ops packet_ops_spkt;
338
339static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
340			   struct packet_type *pt, struct net_device *orig_dev)
341{
342	struct sock *sk;
343	struct sockaddr_pkt *spkt;
344
345	/*
346	 *	When we registered the protocol we saved the socket in the data
347	 *	field for just this event.
348	 */
349
350	sk = pt->af_packet_priv;
351
352	/*
353	 *	Yank back the headers [hope the device set this
354	 *	right or kerboom...]
355	 *
356	 *	Incoming packets have ll header pulled,
357	 *	push it back.
358	 *
359	 *	For outgoing ones skb->data == skb_mac_header(skb),
360	 *	so this procedure is a no-op.
361	 */
362
363	if (skb->pkt_type == PACKET_LOOPBACK)
364		goto out;
365
366	if (!net_eq(dev_net(dev), sock_net(sk)))
367		goto out;
368
369	skb = skb_share_check(skb, GFP_ATOMIC);
370	if (skb == NULL)
371		goto oom;
372
373	/* drop any routing info */
374	skb_dst_drop(skb);
375
376	/* drop conntrack reference */
377	nf_reset(skb);
378
379	spkt = &PACKET_SKB_CB(skb)->sa.pkt;
380
381	skb_push(skb, skb->data - skb_mac_header(skb));
382
383	/*
384	 *	The SOCK_PACKET socket receives _all_ frames.
385	 */
386
387	spkt->spkt_family = dev->type;
388	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
389	spkt->spkt_protocol = skb->protocol;
390
391	/*
392	 *	Charge the memory to the socket. This is done specifically
393	 *	to prevent sockets from using up all the memory.
394	 */
395
396	if (sock_queue_rcv_skb(sk, skb) == 0)
397		return 0;
398
399out:
400	kfree_skb(skb);
401oom:
402	return 0;
403}
404
405
406/*
407 *	Output a raw packet to a device layer. This bypasses all the other
408 *	protocol layers and you must therefore supply it with a complete frame
409 */
410
411static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
412			       struct msghdr *msg, size_t len)
413{
414	struct sock *sk = sock->sk;
415	struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
416	struct sk_buff *skb = NULL;
417	struct net_device *dev;
418	__be16 proto = 0;
419	int err;
420
421	/*
422	 *	Get and verify the address.
423	 */
424
425	if (saddr) {
426		if (msg->msg_namelen < sizeof(struct sockaddr))
427			return -EINVAL;
428		if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
429			proto = saddr->spkt_protocol;
430	} else
431		return -ENOTCONN;	/* SOCK_PACKET must be sent giving an address */
432
433	/*
434	 *	Find the device first to size check it
435	 */
436
437	saddr->spkt_device[13] = 0;
438retry:
439	rcu_read_lock();
440	dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
441	err = -ENODEV;
442	if (dev == NULL)
443		goto out_unlock;
444
445	err = -ENETDOWN;
446	if (!(dev->flags & IFF_UP))
447		goto out_unlock;
448
449	/*
450	 * You may not queue a frame bigger than the mtu. This is the lowest level
451	 * raw protocol and you must do your own fragmentation at this level.
452	 */
453
454	err = -EMSGSIZE;
455	if (len > dev->mtu + dev->hard_header_len)
456		goto out_unlock;
457
458	if (!skb) {
459		size_t reserved = LL_RESERVED_SPACE(dev);
460		unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
461
462		rcu_read_unlock();
463		skb = sock_wmalloc(sk, len + reserved, 0, GFP_KERNEL);
464		if (skb == NULL)
465			return -ENOBUFS;
466		/* FIXME: Save some space for broken drivers that write a hard
467		 * header at transmission time by themselves. PPP is the notable
468		 * one here. This should really be fixed at the driver level.
469		 */
470		skb_reserve(skb, reserved);
471		skb_reset_network_header(skb);
472
473		/* Try to align data part correctly */
474		if (hhlen) {
475			skb->data -= hhlen;
476			skb->tail -= hhlen;
477			if (len < hhlen)
478				skb_reset_network_header(skb);
479		}
480		err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
481		if (err)
482			goto out_free;
483		goto retry;
484	}
485
486
487	skb->protocol = proto;
488	skb->dev = dev;
489	skb->priority = sk->sk_priority;
490	skb->mark = sk->sk_mark;
491	err = sock_tx_timestamp(msg, sk, skb_tx(skb));
492	if (err < 0)
493		goto out_unlock;
494
495	dev_queue_xmit(skb);
496	rcu_read_unlock();
497	return len;
498
499out_unlock:
500	rcu_read_unlock();
501out_free:
502	kfree_skb(skb);
503	return err;
504}
505
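/* Run the socket's attached BPF filter, if any, against the skb under
 * rcu_read_lock_bh().  Returns the number of bytes to snap, 0 meaning drop;
 * with no filter attached the caller's default 'res' is returned unchanged.
 */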
506static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
507				      unsigned int res)
508{
509	struct sk_filter *filter;
510
511	rcu_read_lock_bh();
512	filter = rcu_dereference_bh(sk->sk_filter);
513	if (filter != NULL)
514		res = sk_run_filter(skb, filter->insns, filter->len);
515	rcu_read_unlock_bh();
516
517	return res;
518}
519
520/*
521   This function does lazy skb cloning in the hope that most packets
522   are discarded by BPF.
523
524   Note the tricky part: we DO mangle a shared skb! skb->data, skb->len
525   and skb->cb are mangled. It works because (and until) packets
526   falling here are owned by the current CPU. Output packets are cloned
527   by dev_queue_xmit_nit(), input packets are processed by net_bh
528   sequentially, so if we return the skb to its original state on exit,
529   we will not harm anyone.
530 */
531
532static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
533		      struct packet_type *pt, struct net_device *orig_dev)
534{
535	struct sock *sk;
536	struct sockaddr_ll *sll;
537	struct packet_sock *po;
538	u8 *skb_head = skb->data;
539	int skb_len = skb->len;
540	unsigned int snaplen, res;
541
542	if (skb->pkt_type == PACKET_LOOPBACK)
543		goto drop;
544
545	sk = pt->af_packet_priv;
546	po = pkt_sk(sk);
547
548	if (!net_eq(dev_net(dev), sock_net(sk)))
549		goto drop;
550
551	skb->dev = dev;
552
553	if (dev->header_ops) {
554		/* The device has an explicit notion of ll header,
555		   exported to higher levels.
556
557		   Otherwise, the device hides the details of its frame
558		   structure, so the corresponding packet head is
559		   never delivered to the user.
560		 */
561		if (sk->sk_type != SOCK_DGRAM)
562			skb_push(skb, skb->data - skb_mac_header(skb));
563		else if (skb->pkt_type == PACKET_OUTGOING) {
564			/* Special case: outgoing packets have ll header at head */
565			skb_pull(skb, skb_network_offset(skb));
566		}
567	}
568
569	snaplen = skb->len;
570
571	res = run_filter(skb, sk, snaplen);
572	if (!res)
573		goto drop_n_restore;
574	if (snaplen > res)
575		snaplen = res;
576
577	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
578	    (unsigned)sk->sk_rcvbuf)
579		goto drop_n_acct;
580
581	if (skb_shared(skb)) {
582		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
583		if (nskb == NULL)
584			goto drop_n_acct;
585
586		if (skb_head != skb->data) {
587			skb->data = skb_head;
588			skb->len = skb_len;
589		}
590		kfree_skb(skb);
591		skb = nskb;
592	}
593
594	BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
595		     sizeof(skb->cb));
596
597	sll = &PACKET_SKB_CB(skb)->sa.ll;
598	sll->sll_family = AF_PACKET;
599	sll->sll_hatype = dev->type;
600	sll->sll_protocol = skb->protocol;
601	sll->sll_pkttype = skb->pkt_type;
602	if (unlikely(po->origdev))
603		sll->sll_ifindex = orig_dev->ifindex;
604	else
605		sll->sll_ifindex = dev->ifindex;
606
607	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
608
609	PACKET_SKB_CB(skb)->origlen = skb->len;
610
611	if (pskb_trim(skb, snaplen))
612		goto drop_n_acct;
613
614	skb_set_owner_r(skb, sk);
615	skb->dev = NULL;
616	skb_dst_drop(skb);
617
618	/* drop conntrack reference */
619	nf_reset(skb);
620
621	spin_lock(&sk->sk_receive_queue.lock);
622	po->stats.tp_packets++;
623	skb->dropcount = atomic_read(&sk->sk_drops);
624	__skb_queue_tail(&sk->sk_receive_queue, skb);
625	spin_unlock(&sk->sk_receive_queue.lock);
626	sk->sk_data_ready(sk, skb->len);
627	return 0;
628
629drop_n_acct:
630	po->stats.tp_drops = atomic_inc_return(&sk->sk_drops);
631
632drop_n_restore:
633	if (skb_head != skb->data && skb_shared(skb)) {
634		skb->data = skb_head;
635		skb->len = skb_len;
636	}
637drop:
638	consume_skb(skb);
639	return 0;
640}
641
642static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
643		       struct packet_type *pt, struct net_device *orig_dev)
644{
645	struct sock *sk;
646	struct packet_sock *po;
647	struct sockaddr_ll *sll;
648	union {
649		struct tpacket_hdr *h1;
650		struct tpacket2_hdr *h2;
651		void *raw;
652	} h;
653	u8 *skb_head = skb->data;
654	int skb_len = skb->len;
655	unsigned int snaplen, res;
656	unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
657	unsigned short macoff, netoff, hdrlen;
658	struct sk_buff *copy_skb = NULL;
659	struct timeval tv;
660	struct timespec ts;
661	struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
662
663	if (skb->pkt_type == PACKET_LOOPBACK)
664		goto drop;
665
666	sk = pt->af_packet_priv;
667	po = pkt_sk(sk);
668
669	if (!net_eq(dev_net(dev), sock_net(sk)))
670		goto drop;
671
672	if (dev->header_ops) {
673		if (sk->sk_type != SOCK_DGRAM)
674			skb_push(skb, skb->data - skb_mac_header(skb));
675		else if (skb->pkt_type == PACKET_OUTGOING) {
676			/* Special case: outgoing packets have ll header at head */
677			skb_pull(skb, skb_network_offset(skb));
678		}
679	}
680
681	if (skb->ip_summed == CHECKSUM_PARTIAL)
682		status |= TP_STATUS_CSUMNOTREADY;
683
684	snaplen = skb->len;
685
686	res = run_filter(skb, sk, snaplen);
687	if (!res)
688		goto drop_n_restore;
689	if (snaplen > res)
690		snaplen = res;
691
692	if (sk->sk_type == SOCK_DGRAM) {
693		macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
694				  po->tp_reserve;
695	} else {
696		unsigned maclen = skb_network_offset(skb);
697		netoff = TPACKET_ALIGN(po->tp_hdrlen +
698				       (maclen < 16 ? 16 : maclen)) +
699			po->tp_reserve;
700		macoff = netoff - maclen;
701	}
702
703	if (macoff + snaplen > po->rx_ring.frame_size) {
704		if (po->copy_thresh &&
705		    atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
706		    (unsigned)sk->sk_rcvbuf) {
707			if (skb_shared(skb)) {
708				copy_skb = skb_clone(skb, GFP_ATOMIC);
709			} else {
710				copy_skb = skb_get(skb);
711				skb_head = skb->data;
712			}
713			if (copy_skb)
714				skb_set_owner_r(copy_skb, sk);
715		}
716		snaplen = po->rx_ring.frame_size - macoff;
717		if ((int)snaplen < 0)
718			snaplen = 0;
719	}
720
721	spin_lock(&sk->sk_receive_queue.lock);
722	h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL);
723	if (!h.raw)
724		goto ring_is_full;
725	packet_increment_head(&po->rx_ring);
726	po->stats.tp_packets++;
727	if (copy_skb) {
728		status |= TP_STATUS_COPY;
729		__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
730	}
731	if (!po->stats.tp_drops)
732		status &= ~TP_STATUS_LOSING;
733	spin_unlock(&sk->sk_receive_queue.lock);
734
735	skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
736
737	switch (po->tp_version) {
738	case TPACKET_V1:
739		h.h1->tp_len = skb->len;
740		h.h1->tp_snaplen = snaplen;
741		h.h1->tp_mac = macoff;
742		h.h1->tp_net = netoff;
743		if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
744				&& shhwtstamps->syststamp.tv64)
745			tv = ktime_to_timeval(shhwtstamps->syststamp);
746		else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
747				&& shhwtstamps->hwtstamp.tv64)
748			tv = ktime_to_timeval(shhwtstamps->hwtstamp);
749		else if (skb->tstamp.tv64)
750			tv = ktime_to_timeval(skb->tstamp);
751		else
752			do_gettimeofday(&tv);
753		h.h1->tp_sec = tv.tv_sec;
754		h.h1->tp_usec = tv.tv_usec;
755		hdrlen = sizeof(*h.h1);
756		break;
757	case TPACKET_V2:
758		h.h2->tp_len = skb->len;
759		h.h2->tp_snaplen = snaplen;
760		h.h2->tp_mac = macoff;
761		h.h2->tp_net = netoff;
762		if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
763				&& shhwtstamps->syststamp.tv64)
764			ts = ktime_to_timespec(shhwtstamps->syststamp);
765		else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
766				&& shhwtstamps->hwtstamp.tv64)
767			ts = ktime_to_timespec(shhwtstamps->hwtstamp);
768		else if (skb->tstamp.tv64)
769			ts = ktime_to_timespec(skb->tstamp);
770		else
771			getnstimeofday(&ts);
772		h.h2->tp_sec = ts.tv_sec;
773		h.h2->tp_nsec = ts.tv_nsec;
774		h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
775		hdrlen = sizeof(*h.h2);
776		break;
777	default:
778		BUG();
779	}
780
781	sll = h.raw + TPACKET_ALIGN(hdrlen);
782	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
783	sll->sll_family = AF_PACKET;
784	sll->sll_hatype = dev->type;
785	sll->sll_protocol = skb->protocol;
786	sll->sll_pkttype = skb->pkt_type;
787	if (unlikely(po->origdev))
788		sll->sll_ifindex = orig_dev->ifindex;
789	else
790		sll->sll_ifindex = dev->ifindex;
791
792	__packet_set_status(po, h.raw, status);
793	smp_mb();
794	{
795		struct page *p_start, *p_end;
796		u8 *h_end = h.raw + macoff + snaplen - 1;
797
798		p_start = virt_to_page(h.raw);
799		p_end = virt_to_page(h_end);
800		while (p_start <= p_end) {
801			flush_dcache_page(p_start);
802			p_start++;
803		}
804	}
805
806	sk->sk_data_ready(sk, 0);
807
808drop_n_restore:
809	if (skb_head != skb->data && skb_shared(skb)) {
810		skb->data = skb_head;
811		skb->len = skb_len;
812	}
813drop:
814	kfree_skb(skb);
815	return 0;
816
817ring_is_full:
818	po->stats.tp_drops++;
819	spin_unlock(&sk->sk_receive_queue.lock);
820
821	sk->sk_data_ready(sk, 0);
822	kfree_skb(copy_skb);
823	goto drop_n_restore;
824}
825
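/* Destructor for skbs built from the TX ring: once the device has released
 * the skb, hand the originating frame back to user space (TP_STATUS_AVAILABLE)
 * and drop the ring's pending count.
 */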
826static void tpacket_destruct_skb(struct sk_buff *skb)
827{
828	struct packet_sock *po = pkt_sk(skb->sk);
829	void *ph;
830
831	BUG_ON(skb == NULL);
832
833	if (likely(po->tx_ring.pg_vec)) {
834		ph = skb_shinfo(skb)->destructor_arg;
835		BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
836		BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
837		atomic_dec(&po->tx_ring.pending);
838		__packet_set_status(po, ph, TP_STATUS_AVAILABLE);
839	}
840
841	sock_wfree(skb);
842}
843
844static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
845		void *frame, struct net_device *dev, int size_max,
846		__be16 proto, unsigned char *addr)
847{
848	union {
849		struct tpacket_hdr *h1;
850		struct tpacket2_hdr *h2;
851		void *raw;
852	} ph;
853	int to_write, offset, len, tp_len, nr_frags, len_max;
854	struct socket *sock = po->sk.sk_socket;
855	struct page *page;
856	void *data;
857	int err;
858
859	ph.raw = frame;
860
861	skb->protocol = proto;
862	skb->dev = dev;
863	skb->priority = po->sk.sk_priority;
864	skb->mark = po->sk.sk_mark;
865	skb_shinfo(skb)->destructor_arg = ph.raw;
866
867	switch (po->tp_version) {
868	case TPACKET_V2:
869		tp_len = ph.h2->tp_len;
870		break;
871	default:
872		tp_len = ph.h1->tp_len;
873		break;
874	}
875	if (unlikely(tp_len > size_max)) {
876		pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
877		return -EMSGSIZE;
878	}
879
880	skb_reserve(skb, LL_RESERVED_SPACE(dev));
881	skb_reset_network_header(skb);
882
883	data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
884	to_write = tp_len;
885
886	if (sock->type == SOCK_DGRAM) {
887		err = dev_hard_header(skb, dev, ntohs(proto), addr,
888				NULL, tp_len);
889		if (unlikely(err < 0))
890			return -EINVAL;
891	} else if (dev->hard_header_len) {
892		/* net device doesn't like empty head */
893		if (unlikely(tp_len <= dev->hard_header_len)) {
894			pr_err("packet size is too short (%d < %d)\n",
895			       tp_len, dev->hard_header_len);
896			return -EINVAL;
897		}
898
899		skb_push(skb, dev->hard_header_len);
900		err = skb_store_bits(skb, 0, data,
901				dev->hard_header_len);
902		if (unlikely(err))
903			return err;
904
905		data += dev->hard_header_len;
906		to_write -= dev->hard_header_len;
907	}
908
909	err = -EFAULT;
910	page = virt_to_page(data);
911	offset = offset_in_page(data);
912	len_max = PAGE_SIZE - offset;
913	len = ((to_write > len_max) ? len_max : to_write);
914
915	skb->data_len = to_write;
916	skb->len += to_write;
917	skb->truesize += to_write;
918	atomic_add(to_write, &po->sk.sk_wmem_alloc);
919
920	while (likely(to_write)) {
921		nr_frags = skb_shinfo(skb)->nr_frags;
922
923		if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
924			pr_err("Packet exceed the number of skb frags(%lu)\n",
925			       MAX_SKB_FRAGS);
926			return -EFAULT;
927		}
928
929		flush_dcache_page(page);
930		get_page(page);
931		skb_fill_page_desc(skb,
932				nr_frags,
933				page++, offset, len);
934		to_write -= len;
935		offset = 0;
936		len_max = PAGE_SIZE;
937		len = ((to_write > len_max) ? len_max : to_write);
938	}
939
940	return tp_len;
941}
942
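/* TX ring send path: walk the ring picking up frames marked
 * TP_STATUS_SEND_REQUEST, build an skb whose fragments point at the frame's
 * pages and hand it to dev_queue_xmit(); tpacket_destruct_skb() marks the
 * frame available again once the device releases the skb.
 */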
943static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
944{
945	struct socket *sock;
946	struct sk_buff *skb;
947	struct net_device *dev;
948	__be16 proto;
949	int ifindex, err, reserve = 0;
950	void *ph;
951	struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
952	int tp_len, size_max;
953	unsigned char *addr;
954	int len_sum = 0;
955	int status = 0;
956
957	sock = po->sk.sk_socket;
958
959	mutex_lock(&po->pg_vec_lock);
960
961	err = -EBUSY;
962	if (saddr == NULL) {
963		ifindex	= po->ifindex;
964		proto	= po->num;
965		addr	= NULL;
966	} else {
967		err = -EINVAL;
968		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
969			goto out;
970		if (msg->msg_namelen < (saddr->sll_halen
971					+ offsetof(struct sockaddr_ll,
972						sll_addr)))
973			goto out;
974		ifindex	= saddr->sll_ifindex;
975		proto	= saddr->sll_protocol;
976		addr	= saddr->sll_addr;
977	}
978
979	dev = dev_get_by_index(sock_net(&po->sk), ifindex);
980	err = -ENXIO;
981	if (unlikely(dev == NULL))
982		goto out;
983
984	reserve = dev->hard_header_len;
985
986	err = -ENETDOWN;
987	if (unlikely(!(dev->flags & IFF_UP)))
988		goto out_put;
989
990	size_max = po->tx_ring.frame_size
991		- (po->tp_hdrlen - sizeof(struct sockaddr_ll));
992
993	if (size_max > dev->mtu + reserve)
994		size_max = dev->mtu + reserve;
995
996	do {
997		ph = packet_current_frame(po, &po->tx_ring,
998				TP_STATUS_SEND_REQUEST);
999
1000		if (unlikely(ph == NULL)) {
1001			schedule();
1002			continue;
1003		}
1004
1005		status = TP_STATUS_SEND_REQUEST;
1006		skb = sock_alloc_send_skb(&po->sk,
1007				LL_ALLOCATED_SPACE(dev)
1008				+ sizeof(struct sockaddr_ll),
1009				0, &err);
1010
1011		if (unlikely(skb == NULL))
1012			goto out_status;
1013
1014		tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
1015				addr);
1016
1017		if (unlikely(tp_len < 0)) {
1018			if (po->tp_loss) {
1019				__packet_set_status(po, ph,
1020						TP_STATUS_AVAILABLE);
1021				packet_increment_head(&po->tx_ring);
1022				kfree_skb(skb);
1023				continue;
1024			} else {
1025				status = TP_STATUS_WRONG_FORMAT;
1026				err = tp_len;
1027				goto out_status;
1028			}
1029		}
1030
1031		skb->destructor = tpacket_destruct_skb;
1032		__packet_set_status(po, ph, TP_STATUS_SENDING);
1033		atomic_inc(&po->tx_ring.pending);
1034
1035		status = TP_STATUS_SEND_REQUEST;
1036		err = dev_queue_xmit(skb);
1037		if (unlikely(err > 0)) {
1038			err = net_xmit_errno(err);
1039			if (err && __packet_get_status(po, ph) ==
1040				   TP_STATUS_AVAILABLE) {
1041				/* skb was destructed already */
1042				skb = NULL;
1043				goto out_status;
1044			}
1045			/*
1046			 * skb was dropped but not destructed yet;
1047			 * let's treat it like congestion or err < 0
1048			 */
1049			err = 0;
1050		}
1051		packet_increment_head(&po->tx_ring);
1052		len_sum += tp_len;
1053	} while (likely((ph != NULL) ||
1054			((!(msg->msg_flags & MSG_DONTWAIT)) &&
1055			 (atomic_read(&po->tx_ring.pending))))
1056		);
1057
1058	err = len_sum;
1059	goto out_put;
1060
1061out_status:
1062	__packet_set_status(po, ph, status);
1063	kfree_skb(skb);
1064out_put:
1065	dev_put(dev);
1066out:
1067	mutex_unlock(&po->pg_vec_lock);
1068	return err;
1069}
1070
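/* Allocate an skb for packet_snd(): up to 'linear' bytes in the linear area,
 * the remainder as paged data.  Packets that fit in a page (or requests with
 * no linear hint) are kept entirely linear.
 */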
1071static inline struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
1072					       size_t reserve, size_t len,
1073					       size_t linear, int noblock,
1074					       int *err)
1075{
1076	struct sk_buff *skb;
1077
1078	/* Under a page?  Don't bother with paged skb. */
1079	if (prepad + len < PAGE_SIZE || !linear)
1080		linear = len;
1081
1082	skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
1083				   err);
1084	if (!skb)
1085		return NULL;
1086
1087	skb_reserve(skb, reserve);
1088	skb_put(skb, linear);
1089	skb->data_len = len - linear;
1090	skb->len += len - linear;
1091
1092	return skb;
1093}
1094
1095static int packet_snd(struct socket *sock,
1096			  struct msghdr *msg, size_t len)
1097{
1098	struct sock *sk = sock->sk;
1099	struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
1100	struct sk_buff *skb;
1101	struct net_device *dev;
1102	__be16 proto;
1103	unsigned char *addr;
1104	int ifindex, err, reserve = 0;
1105	struct virtio_net_hdr vnet_hdr = { 0 };
1106	int offset = 0;
1107	int vnet_hdr_len;
1108	struct packet_sock *po = pkt_sk(sk);
1109	unsigned short gso_type = 0;
1110
1111	/*
1112	 *	Get and verify the address.
1113	 */
1114
1115	if (saddr == NULL) {
1116		ifindex	= po->ifindex;
1117		proto	= po->num;
1118		addr	= NULL;
1119	} else {
1120		err = -EINVAL;
1121		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
1122			goto out;
1123		if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
1124			goto out;
1125		ifindex	= saddr->sll_ifindex;
1126		proto	= saddr->sll_protocol;
1127		addr	= saddr->sll_addr;
1128	}
1129
1130
1131	dev = dev_get_by_index(sock_net(sk), ifindex);
1132	err = -ENXIO;
1133	if (dev == NULL)
1134		goto out_unlock;
1135	if (sock->type == SOCK_RAW)
1136		reserve = dev->hard_header_len;
1137
1138	err = -ENETDOWN;
1139	if (!(dev->flags & IFF_UP))
1140		goto out_unlock;
1141
1142	if (po->has_vnet_hdr) {
1143		vnet_hdr_len = sizeof(vnet_hdr);
1144
1145		err = -EINVAL;
1146		if (len < vnet_hdr_len)
1147			goto out_unlock;
1148
1149		len -= vnet_hdr_len;
1150
1151		err = memcpy_fromiovec((void *)&vnet_hdr, msg->msg_iov,
1152				       vnet_hdr_len);
1153		if (err < 0)
1154			goto out_unlock;
1155
1156		if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
1157		    (vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
1158		      vnet_hdr.hdr_len))
1159			vnet_hdr.hdr_len = vnet_hdr.csum_start +
1160						 vnet_hdr.csum_offset + 2;
1161
1162		err = -EINVAL;
1163		if (vnet_hdr.hdr_len > len)
1164			goto out_unlock;
1165
1166		if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
1167			switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
1168			case VIRTIO_NET_HDR_GSO_TCPV4:
1169				gso_type = SKB_GSO_TCPV4;
1170				break;
1171			case VIRTIO_NET_HDR_GSO_TCPV6:
1172				gso_type = SKB_GSO_TCPV6;
1173				break;
1174			case VIRTIO_NET_HDR_GSO_UDP:
1175				gso_type = SKB_GSO_UDP;
1176				break;
1177			default:
1178				goto out_unlock;
1179			}
1180
1181			if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
1182				gso_type |= SKB_GSO_TCP_ECN;
1183
1184			if (vnet_hdr.gso_size == 0)
1185				goto out_unlock;
1186
1187		}
1188	}
1189
1190	err = -EMSGSIZE;
1191	if (!gso_type && (len > dev->mtu+reserve))
1192		goto out_unlock;
1193
1194	err = -ENOBUFS;
1195	skb = packet_alloc_skb(sk, LL_ALLOCATED_SPACE(dev),
1196			       LL_RESERVED_SPACE(dev), len, vnet_hdr.hdr_len,
1197			       msg->msg_flags & MSG_DONTWAIT, &err);
1198	if (skb == NULL)
1199		goto out_unlock;
1200
1201	skb_set_network_header(skb, reserve);
1202
1203	err = -EINVAL;
1204	if (sock->type == SOCK_DGRAM &&
1205	    (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0)
1206		goto out_free;
1207
1208	/* Returns -EFAULT on error */
1209	err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
1210	if (err)
1211		goto out_free;
1212	err = sock_tx_timestamp(msg, sk, skb_tx(skb));
1213	if (err < 0)
1214		goto out_free;
1215
1216	skb->protocol = proto;
1217	skb->dev = dev;
1218	skb->priority = sk->sk_priority;
1219	skb->mark = sk->sk_mark;
1220
1221	if (po->has_vnet_hdr) {
1222		if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
1223			if (!skb_partial_csum_set(skb, vnet_hdr.csum_start,
1224						  vnet_hdr.csum_offset)) {
1225				err = -EINVAL;
1226				goto out_free;
1227			}
1228		}
1229
1230		skb_shinfo(skb)->gso_size = vnet_hdr.gso_size;
1231		skb_shinfo(skb)->gso_type = gso_type;
1232
1233		/* Header must be checked, and gso_segs computed. */
1234		skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
1235		skb_shinfo(skb)->gso_segs = 0;
1236
1237		len += vnet_hdr_len;
1238	}
1239
1240	/*
1241	 *	Now send it
1242	 */
1243
1244	err = dev_queue_xmit(skb);
1245	if (err > 0 && (err = net_xmit_errno(err)) != 0)
1246		goto out_unlock;
1247
1248	dev_put(dev);
1249
1250	return len;
1251
1252out_free:
1253	kfree_skb(skb);
1254out_unlock:
1255	if (dev)
1256		dev_put(dev);
1257out:
1258	return err;
1259}
1260
1261static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
1262		struct msghdr *msg, size_t len)
1263{
1264	struct sock *sk = sock->sk;
1265	struct packet_sock *po = pkt_sk(sk);
1266	if (po->tx_ring.pg_vec)
1267		return tpacket_snd(po, msg);
1268	else
1269		return packet_snd(sock, msg, len);
1270}
1271
1272/*
1273 *	Close a PACKET socket. This is fairly simple. We immediately go
1274 *	to 'closed' state and remove our protocol entry in the device list.
1275 */
1276
1277static int packet_release(struct socket *sock)
1278{
1279	struct sock *sk = sock->sk;
1280	struct packet_sock *po;
1281	struct net *net;
1282	struct tpacket_req req;
1283
1284	if (!sk)
1285		return 0;
1286
1287	net = sock_net(sk);
1288	po = pkt_sk(sk);
1289
1290	spin_lock_bh(&net->packet.sklist_lock);
1291	sk_del_node_init_rcu(sk);
1292	sock_prot_inuse_add(net, sk->sk_prot, -1);
1293	spin_unlock_bh(&net->packet.sklist_lock);
1294
1295	spin_lock(&po->bind_lock);
1296	if (po->running) {
1297		/*
1298		 * Remove from protocol table
1299		 */
1300		po->running = 0;
1301		po->num = 0;
1302		__dev_remove_pack(&po->prot_hook);
1303		__sock_put(sk);
1304	}
1305	spin_unlock(&po->bind_lock);
1306
1307	packet_flush_mclist(sk);
1308
1309	memset(&req, 0, sizeof(req));
1310
1311	if (po->rx_ring.pg_vec)
1312		packet_set_ring(sk, &req, 1, 0);
1313
1314	if (po->tx_ring.pg_vec)
1315		packet_set_ring(sk, &req, 1, 1);
1316
1317	synchronize_net();
1318	/*
1319	 *	Now the socket is dead. No more input will appear.
1320	 */
1321	sock_orphan(sk);
1322	sock->sk = NULL;
1323
1324	/* Purge queues */
1325
1326	skb_queue_purge(&sk->sk_receive_queue);
1327	sk_refcnt_debug_release(sk);
1328
1329	sock_put(sk);
1330	return 0;
1331}
1332
1333/*
1334 *	Attach a packet hook.
1335 */
1336
1337static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
1338{
1339	struct packet_sock *po = pkt_sk(sk);
1340	/*
1341	 *	Detach an existing hook if present.
1342	 */
1343
1344	lock_sock(sk);
1345
1346	spin_lock(&po->bind_lock);
1347	if (po->running) {
1348		__sock_put(sk);
1349		po->running = 0;
1350		po->num = 0;
1351		spin_unlock(&po->bind_lock);
1352		dev_remove_pack(&po->prot_hook);
1353		spin_lock(&po->bind_lock);
1354	}
1355
1356	po->num = protocol;
1357	po->prot_hook.type = protocol;
1358	po->prot_hook.dev = dev;
1359
1360	po->ifindex = dev ? dev->ifindex : 0;
1361
1362	if (protocol == 0)
1363		goto out_unlock;
1364
1365	if (!dev || (dev->flags & IFF_UP)) {
1366		dev_add_pack(&po->prot_hook);
1367		sock_hold(sk);
1368		po->running = 1;
1369	} else {
1370		sk->sk_err = ENETDOWN;
1371		if (!sock_flag(sk, SOCK_DEAD))
1372			sk->sk_error_report(sk);
1373	}
1374
1375out_unlock:
1376	spin_unlock(&po->bind_lock);
1377	release_sock(sk);
1378	return 0;
1379}
1380
1381/*
1382 *	Bind a packet socket to a device
1383 */
1384
1385static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
1386			    int addr_len)
1387{
1388	struct sock *sk = sock->sk;
1389	char name[15];
1390	struct net_device *dev;
1391	int err = -ENODEV;
1392
1393	/*
1394	 *	Check legality
1395	 */
1396
1397	if (addr_len != sizeof(struct sockaddr))
1398		return -EINVAL;
1399	strlcpy(name, uaddr->sa_data, sizeof(name));
1400
1401	dev = dev_get_by_name(sock_net(sk), name);
1402	if (dev) {
1403		err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
1404		dev_put(dev);
1405	}
1406	return err;
1407}
1408
1409static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1410{
1411	struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
1412	struct sock *sk = sock->sk;
1413	struct net_device *dev = NULL;
1414	int err;
1415
1416
1417	/*
1418	 *	Check legality
1419	 */
1420
1421	if (addr_len < sizeof(struct sockaddr_ll))
1422		return -EINVAL;
1423	if (sll->sll_family != AF_PACKET)
1424		return -EINVAL;
1425
1426	if (sll->sll_ifindex) {
1427		err = -ENODEV;
1428		dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1429		if (dev == NULL)
1430			goto out;
1431	}
1432	err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1433	if (dev)
1434		dev_put(dev);
1435
1436out:
1437	return err;
1438}
1439
1440static struct proto packet_proto = {
1441	.name	  = "PACKET",
1442	.owner	  = THIS_MODULE,
1443	.obj_size = sizeof(struct packet_sock),
1444};
1445
1446/*
1447 *	Create a packet of type SOCK_PACKET.
1448 */
1449
1450static int packet_create(struct net *net, struct socket *sock, int protocol,
1451			 int kern)
1452{
1453	struct sock *sk;
1454	struct packet_sock *po;
1455	__be16 proto = (__force __be16)protocol; /* weird, but documented */
1456	int err;
1457
1458	if (!capable(CAP_NET_RAW))
1459		return -EPERM;
1460	if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
1461	    sock->type != SOCK_PACKET)
1462		return -ESOCKTNOSUPPORT;
1463
1464	sock->state = SS_UNCONNECTED;
1465
1466	err = -ENOBUFS;
1467	sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
1468	if (sk == NULL)
1469		goto out;
1470
1471	sock->ops = &packet_ops;
1472	if (sock->type == SOCK_PACKET)
1473		sock->ops = &packet_ops_spkt;
1474
1475	sock_init_data(sock, sk);
1476
1477	po = pkt_sk(sk);
1478	sk->sk_family = PF_PACKET;
1479	po->num = proto;
1480
1481	sk->sk_destruct = packet_sock_destruct;
1482	sk_refcnt_debug_inc(sk);
1483
1484	/*
1485	 *	Attach a protocol block
1486	 */
1487
1488	spin_lock_init(&po->bind_lock);
1489	mutex_init(&po->pg_vec_lock);
1490	po->prot_hook.func = packet_rcv;
1491
1492	if (sock->type == SOCK_PACKET)
1493		po->prot_hook.func = packet_rcv_spkt;
1494
1495	po->prot_hook.af_packet_priv = sk;
1496
1497	if (proto) {
1498		po->prot_hook.type = proto;
1499		dev_add_pack(&po->prot_hook);
1500		sock_hold(sk);
1501		po->running = 1;
1502	}
1503
1504	spin_lock_bh(&net->packet.sklist_lock);
1505	sk_add_node_rcu(sk, &net->packet.sklist);
1506	sock_prot_inuse_add(net, &packet_proto, 1);
1507	spin_unlock_bh(&net->packet.sklist_lock);
1508
1509	return 0;
1510out:
1511	return err;
1512}
1513
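/* Dequeue one skb from the socket error queue (TX timestamps land here) and
 * copy its payload to the caller, with the attached sock_extended_err
 * delivered as a PACKET_TX_TIMESTAMP control message.
 */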
1514static int packet_recv_error(struct sock *sk, struct msghdr *msg, int len)
1515{
1516	struct sock_exterr_skb *serr;
1517	struct sk_buff *skb, *skb2;
1518	int copied, err;
1519
1520	err = -EAGAIN;
1521	skb = skb_dequeue(&sk->sk_error_queue);
1522	if (skb == NULL)
1523		goto out;
1524
1525	copied = skb->len;
1526	if (copied > len) {
1527		msg->msg_flags |= MSG_TRUNC;
1528		copied = len;
1529	}
1530	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1531	if (err)
1532		goto out_free_skb;
1533
1534	sock_recv_timestamp(msg, sk, skb);
1535
1536	serr = SKB_EXT_ERR(skb);
1537	put_cmsg(msg, SOL_PACKET, PACKET_TX_TIMESTAMP,
1538		 sizeof(serr->ee), &serr->ee);
1539
1540	msg->msg_flags |= MSG_ERRQUEUE;
1541	err = copied;
1542
1543	/* Reset and regenerate socket error */
1544	spin_lock_bh(&sk->sk_error_queue.lock);
1545	sk->sk_err = 0;
1546	if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
1547		sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
1548		spin_unlock_bh(&sk->sk_error_queue.lock);
1549		sk->sk_error_report(sk);
1550	} else
1551		spin_unlock_bh(&sk->sk_error_queue.lock);
1552
1553out_free_skb:
1554	kfree_skb(skb);
1555out:
1556	return err;
1557}
1558
1559/*
1560 *	Pull a packet from our receive queue and hand it to the user.
1561 *	If necessary we block.
1562 */
1563
1564static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
1565			  struct msghdr *msg, size_t len, int flags)
1566{
1567	struct sock *sk = sock->sk;
1568	struct sk_buff *skb;
1569	int copied, err;
1570	struct sockaddr_ll *sll;
1571	int vnet_hdr_len = 0;
1572
1573	err = -EINVAL;
1574	if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1575		goto out;
1576
1577#if 0
1578	/* What error should we return now? EUNATTACH? */
1579	if (pkt_sk(sk)->ifindex < 0)
1580		return -ENODEV;
1581#endif
1582
1583	if (flags & MSG_ERRQUEUE) {
1584		err = packet_recv_error(sk, msg, len);
1585		goto out;
1586	}
1587
1588	/*
1589	 *	Call the generic datagram receiver. This handles all sorts
1590	 *	of horrible races and re-entrancy so we can forget about it
1591	 *	in the protocol layers.
1592	 *
1593	 *	Now it will return ENETDOWN if the device has just gone down,
1594	 *	but then it will block.
1595	 */
1596
1597	skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1598
1599	/*
1600	 *	An error occurred, so return it. Because skb_recv_datagram()
1601	 *	handles the blocking, we don't need to see or worry about
1602	 *	blocking retries.
1603	 */
1604
1605	if (skb == NULL)
1606		goto out;
1607
1608	if (pkt_sk(sk)->has_vnet_hdr) {
1609		struct virtio_net_hdr vnet_hdr = { 0 };
1610
1611		err = -EINVAL;
1612		vnet_hdr_len = sizeof(vnet_hdr);
1613		if ((len -= vnet_hdr_len) < 0)
1614			goto out_free;
1615
1616		if (skb_is_gso(skb)) {
1617			struct skb_shared_info *sinfo = skb_shinfo(skb);
1618
1619			/* This is a hint as to how much should be linear. */
1620			vnet_hdr.hdr_len = skb_headlen(skb);
1621			vnet_hdr.gso_size = sinfo->gso_size;
1622			if (sinfo->gso_type & SKB_GSO_TCPV4)
1623				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
1624			else if (sinfo->gso_type & SKB_GSO_TCPV6)
1625				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
1626			else if (sinfo->gso_type & SKB_GSO_UDP)
1627				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
1628			else if (sinfo->gso_type & SKB_GSO_FCOE)
1629				goto out_free;
1630			else
1631				BUG();
1632			if (sinfo->gso_type & SKB_GSO_TCP_ECN)
1633				vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
1634		} else
1635			vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
1636
1637		if (skb->ip_summed == CHECKSUM_PARTIAL) {
1638			vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
1639			vnet_hdr.csum_start = skb->csum_start -
1640							skb_headroom(skb);
1641			vnet_hdr.csum_offset = skb->csum_offset;
1642		} /* else everything is zero */
1643
1644		err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr,
1645				     vnet_hdr_len);
1646		if (err < 0)
1647			goto out_free;
1648	}
1649
1650	/*
1651	 *	If the address length field is there to be filled in, we fill
1652	 *	it in now.
1653	 */
1654
1655	sll = &PACKET_SKB_CB(skb)->sa.ll;
1656	if (sock->type == SOCK_PACKET)
1657		msg->msg_namelen = sizeof(struct sockaddr_pkt);
1658	else
1659		msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
1660
1661	/*
1662	 *	You lose any data beyond the buffer you gave. If it worries a
1663	 *	user program, it can ask the device for its MTU anyway.
1664	 */
1665
1666	copied = skb->len;
1667	if (copied > len) {
1668		copied = len;
1669		msg->msg_flags |= MSG_TRUNC;
1670	}
1671
1672	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1673	if (err)
1674		goto out_free;
1675
1676	sock_recv_ts_and_drops(msg, sk, skb);
1677
1678	if (msg->msg_name)
1679		memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
1680		       msg->msg_namelen);
1681
1682	if (pkt_sk(sk)->auxdata) {
1683		struct tpacket_auxdata aux;
1684
1685		aux.tp_status = TP_STATUS_USER;
1686		if (skb->ip_summed == CHECKSUM_PARTIAL)
1687			aux.tp_status |= TP_STATUS_CSUMNOTREADY;
1688		aux.tp_len = PACKET_SKB_CB(skb)->origlen;
1689		aux.tp_snaplen = skb->len;
1690		aux.tp_mac = 0;
1691		aux.tp_net = skb_network_offset(skb);
1692		aux.tp_vlan_tci = vlan_tx_tag_get(skb);
1693
1694		put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
1695	}
1696
1697	/*
1698	 *	Free or return the buffer as appropriate. Again this
1699	 *	hides all the races and re-entrancy issues from us.
1700	 */
1701	err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1702
1703out_free:
1704	skb_free_datagram(sk, skb);
1705out:
1706	return err;
1707}
1708
1709static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1710			       int *uaddr_len, int peer)
1711{
1712	struct net_device *dev;
1713	struct sock *sk	= sock->sk;
1714
1715	if (peer)
1716		return -EOPNOTSUPP;
1717
1718	uaddr->sa_family = AF_PACKET;
1719	rcu_read_lock();
1720	dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
1721	if (dev)
1722		strlcpy(uaddr->sa_data, dev->name, 15);
1723	else
1724		memset(uaddr->sa_data, 0, 14);
1725	rcu_read_unlock();
1726	*uaddr_len = sizeof(*uaddr);
1727
1728	return 0;
1729}
1730
1731static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1732			  int *uaddr_len, int peer)
1733{
1734	struct net_device *dev;
1735	struct sock *sk = sock->sk;
1736	struct packet_sock *po = pkt_sk(sk);
1737	DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1738
1739	if (peer)
1740		return -EOPNOTSUPP;
1741
1742	sll->sll_family = AF_PACKET;
1743	sll->sll_ifindex = po->ifindex;
1744	sll->sll_protocol = po->num;
1745	rcu_read_lock();
1746	dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1747	if (dev) {
1748		sll->sll_hatype = dev->type;
1749		sll->sll_halen = dev->addr_len;
1750		memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1751	} else {
1752		sll->sll_hatype = 0;	/* Bad: we have no ARPHRD_UNSPEC */
1753		sll->sll_halen = 0;
1754	}
1755	rcu_read_unlock();
1756	*uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1757
1758	return 0;
1759}
1760
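/* Apply (what = 1) or remove (what = -1) a single membership entry on the
 * device: a multicast or unicast hardware address, or promiscuous/allmulti
 * mode.
 */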
1761static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
1762			 int what)
1763{
1764	switch (i->type) {
1765	case PACKET_MR_MULTICAST:
1766		if (i->alen != dev->addr_len)
1767			return -EINVAL;
1768		if (what > 0)
1769			return dev_mc_add(dev, i->addr);
1770		else
1771			return dev_mc_del(dev, i->addr);
1772		break;
1773	case PACKET_MR_PROMISC:
1774		return dev_set_promiscuity(dev, what);
1775		break;
1776	case PACKET_MR_ALLMULTI:
1777		return dev_set_allmulti(dev, what);
1778		break;
1779	case PACKET_MR_UNICAST:
1780		if (i->alen != dev->addr_len)
1781			return -EINVAL;
1782		if (what > 0)
1783			return dev_uc_add(dev, i->addr);
1784		else
1785			return dev_uc_del(dev, i->addr);
1786		break;
1787	default:
1788		break;
1789	}
1790	return 0;
1791}
1792
1793static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1794{
1795	for ( ; i; i = i->next) {
1796		if (i->ifindex == dev->ifindex)
1797			packet_dev_mc(dev, i, what);
1798	}
1799}
1800
1801static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1802{
1803	struct packet_sock *po = pkt_sk(sk);
1804	struct packet_mclist *ml, *i;
1805	struct net_device *dev;
1806	int err;
1807
1808	rtnl_lock();
1809
1810	err = -ENODEV;
1811	dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1812	if (!dev)
1813		goto done;
1814
1815	err = -EINVAL;
1816	if (mreq->mr_alen > dev->addr_len)
1817		goto done;
1818
1819	err = -ENOBUFS;
1820	i = kmalloc(sizeof(*i), GFP_KERNEL);
1821	if (i == NULL)
1822		goto done;
1823
1824	err = 0;
1825	for (ml = po->mclist; ml; ml = ml->next) {
1826		if (ml->ifindex == mreq->mr_ifindex &&
1827		    ml->type == mreq->mr_type &&
1828		    ml->alen == mreq->mr_alen &&
1829		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1830			ml->count++;
1831			/* Free the new element ... */
1832			kfree(i);
1833			goto done;
1834		}
1835	}
1836
1837	i->type = mreq->mr_type;
1838	i->ifindex = mreq->mr_ifindex;
1839	i->alen = mreq->mr_alen;
1840	memcpy(i->addr, mreq->mr_address, i->alen);
1841	i->count = 1;
1842	i->next = po->mclist;
1843	po->mclist = i;
1844	err = packet_dev_mc(dev, i, 1);
1845	if (err) {
1846		po->mclist = i->next;
1847		kfree(i);
1848	}
1849
1850done:
1851	rtnl_unlock();
1852	return err;
1853}
1854
1855static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1856{
1857	struct packet_mclist *ml, **mlp;
1858
1859	rtnl_lock();
1860
1861	for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
1862		if (ml->ifindex == mreq->mr_ifindex &&
1863		    ml->type == mreq->mr_type &&
1864		    ml->alen == mreq->mr_alen &&
1865		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1866			if (--ml->count == 0) {
1867				struct net_device *dev;
1868				*mlp = ml->next;
1869				dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
1870				if (dev)
1871					packet_dev_mc(dev, ml, -1);
1872				kfree(ml);
1873			}
1874			rtnl_unlock();
1875			return 0;
1876		}
1877	}
1878	rtnl_unlock();
1879	return -EADDRNOTAVAIL;
1880}
1881
1882static void packet_flush_mclist(struct sock *sk)
1883{
1884	struct packet_sock *po = pkt_sk(sk);
1885	struct packet_mclist *ml;
1886
1887	if (!po->mclist)
1888		return;
1889
1890	rtnl_lock();
1891	while ((ml = po->mclist) != NULL) {
1892		struct net_device *dev;
1893
1894		po->mclist = ml->next;
1895		dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
1896		if (dev != NULL)
1897			packet_dev_mc(dev, ml, -1);
1898		kfree(ml);
1899	}
1900	rtnl_unlock();
1901}
1902
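/*
 * Illustrative sketch, not from the original source, of the user-space setup
 * for the PACKET_RX_RING option handled below; the sizes are arbitrary
 * example values chosen so that tp_frame_nr equals
 * tp_block_nr * (tp_block_size / tp_frame_size):
 *
 *	struct tpacket_req req = {
 *		.tp_block_size = 4096,
 *		.tp_frame_size = 2048,
 *		.tp_block_nr   = 64,
 *		.tp_frame_nr   = 128,
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *	void *ring = mmap(NULL, req.tp_block_size * req.tp_block_nr,
 *			  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 */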
1903static int
1904packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1905{
1906	struct sock *sk = sock->sk;
1907	struct packet_sock *po = pkt_sk(sk);
1908	int ret;
1909
1910	if (level != SOL_PACKET)
1911		return -ENOPROTOOPT;
1912
1913	switch (optname) {
1914	case PACKET_ADD_MEMBERSHIP:
1915	case PACKET_DROP_MEMBERSHIP:
1916	{
1917		struct packet_mreq_max mreq;
1918		int len = optlen;
1919		memset(&mreq, 0, sizeof(mreq));
1920		if (len < sizeof(struct packet_mreq))
1921			return -EINVAL;
1922		if (len > sizeof(mreq))
1923			len = sizeof(mreq);
1924		if (copy_from_user(&mreq, optval, len))
1925			return -EFAULT;
1926		if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
1927			return -EINVAL;
1928		if (optname == PACKET_ADD_MEMBERSHIP)
1929			ret = packet_mc_add(sk, &mreq);
1930		else
1931			ret = packet_mc_drop(sk, &mreq);
1932		return ret;
1933	}
1934
1935	case PACKET_RX_RING:
1936	case PACKET_TX_RING:
1937	{
1938		struct tpacket_req req;
1939
1940		if (optlen < sizeof(req))
1941			return -EINVAL;
1942		if (pkt_sk(sk)->has_vnet_hdr)
1943			return -EINVAL;
1944		if (copy_from_user(&req, optval, sizeof(req)))
1945			return -EFAULT;
1946		return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING);
1947	}
1948	case PACKET_COPY_THRESH:
1949	{
1950		int val;
1951
1952		if (optlen != sizeof(val))
1953			return -EINVAL;
1954		if (copy_from_user(&val, optval, sizeof(val)))
1955			return -EFAULT;
1956
1957		pkt_sk(sk)->copy_thresh = val;
1958		return 0;
1959	}
1960	case PACKET_VERSION:
1961	{
1962		int val;
1963
1964		if (optlen != sizeof(val))
1965			return -EINVAL;
1966		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1967			return -EBUSY;
1968		if (copy_from_user(&val, optval, sizeof(val)))
1969			return -EFAULT;
1970		switch (val) {
1971		case TPACKET_V1:
1972		case TPACKET_V2:
1973			po->tp_version = val;
1974			return 0;
1975		default:
1976			return -EINVAL;
1977		}
1978	}
1979	case PACKET_RESERVE:
1980	{
1981		unsigned int val;
1982
1983		if (optlen != sizeof(val))
1984			return -EINVAL;
1985		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1986			return -EBUSY;
1987		if (copy_from_user(&val, optval, sizeof(val)))
1988			return -EFAULT;
1989		po->tp_reserve = val;
1990		return 0;
1991	}
1992	case PACKET_LOSS:
1993	{
1994		unsigned int val;
1995
1996		if (optlen != sizeof(val))
1997			return -EINVAL;
1998		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1999			return -EBUSY;
2000		if (copy_from_user(&val, optval, sizeof(val)))
2001			return -EFAULT;
2002		po->tp_loss = !!val;
2003		return 0;
2004	}
2005	case PACKET_AUXDATA:
2006	{
2007		int val;
2008
2009		if (optlen < sizeof(val))
2010			return -EINVAL;
2011		if (copy_from_user(&val, optval, sizeof(val)))
2012			return -EFAULT;
2013
2014		po->auxdata = !!val;
2015		return 0;
2016	}
2017	case PACKET_ORIGDEV:
2018	{
2019		int val;
2020
2021		if (optlen < sizeof(val))
2022			return -EINVAL;
2023		if (copy_from_user(&val, optval, sizeof(val)))
2024			return -EFAULT;
2025
2026		po->origdev = !!val;
2027		return 0;
2028	}
2029	case PACKET_VNET_HDR:
2030	{
2031		int val;
2032
2033		if (sock->type != SOCK_RAW)
2034			return -EINVAL;
2035		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
2036			return -EBUSY;
2037		if (optlen < sizeof(val))
2038			return -EINVAL;
2039		if (copy_from_user(&val, optval, sizeof(val)))
2040			return -EFAULT;
2041
2042		po->has_vnet_hdr = !!val;
2043		return 0;
2044	}
2045	case PACKET_TIMESTAMP:
2046	{
2047		int val;
2048
2049		if (optlen != sizeof(val))
2050			return -EINVAL;
2051		if (copy_from_user(&val, optval, sizeof(val)))
2052			return -EFAULT;
2053
2054		po->tp_tstamp = val;
2055		return 0;
2056	}
2057	default:
2058		return -ENOPROTOOPT;
2059	}
2060}
2061
2062static int packet_getsockopt(struct socket *sock, int level, int optname,
2063			     char __user *optval, int __user *optlen)
2064{
2065	int len;
2066	int val;
2067	struct sock *sk = sock->sk;
2068	struct packet_sock *po = pkt_sk(sk);
2069	void *data;
2070	struct tpacket_stats st;
2071
2072	if (level != SOL_PACKET)
2073		return -ENOPROTOOPT;
2074
2075	if (get_user(len, optlen))
2076		return -EFAULT;
2077
2078	if (len < 0)
2079		return -EINVAL;
2080
2081	switch (optname) {
2082	case PACKET_STATISTICS:
2083		if (len > sizeof(struct tpacket_stats))
2084			len = sizeof(struct tpacket_stats);
2085		spin_lock_bh(&sk->sk_receive_queue.lock);
2086		st = po->stats;
2087		memset(&po->stats, 0, sizeof(st));
2088		spin_unlock_bh(&sk->sk_receive_queue.lock);
2089		st.tp_packets += st.tp_drops;
2090
2091		data = &st;
2092		break;
2093	case PACKET_AUXDATA:
2094		if (len > sizeof(int))
2095			len = sizeof(int);
2096		val = po->auxdata;
2097
2098		data = &val;
2099		break;
2100	case PACKET_ORIGDEV:
2101		if (len > sizeof(int))
2102			len = sizeof(int);
2103		val = po->origdev;
2104
2105		data = &val;
2106		break;
2107	case PACKET_VNET_HDR:
2108		if (len > sizeof(int))
2109			len = sizeof(int);
2110		val = po->has_vnet_hdr;
2111
2112		data = &val;
2113		break;
2114	case PACKET_VERSION:
2115		if (len > sizeof(int))
2116			len = sizeof(int);
2117		val = po->tp_version;
2118		data = &val;
2119		break;
2120	case PACKET_HDRLEN:
2121		if (len > sizeof(int))
2122			len = sizeof(int);
2123		if (copy_from_user(&val, optval, len))
2124			return -EFAULT;
2125		switch (val) {
2126		case TPACKET_V1:
2127			val = sizeof(struct tpacket_hdr);
2128			break;
2129		case TPACKET_V2:
2130			val = sizeof(struct tpacket2_hdr);
2131			break;
2132		default:
2133			return -EINVAL;
2134		}
2135		data = &val;
2136		break;
2137	case PACKET_RESERVE:
2138		if (len > sizeof(unsigned int))
2139			len = sizeof(unsigned int);
2140		val = po->tp_reserve;
2141		data = &val;
2142		break;
2143	case PACKET_LOSS:
2144		if (len > sizeof(unsigned int))
2145			len = sizeof(unsigned int);
2146		val = po->tp_loss;
2147		data = &val;
2148		break;
2149	case PACKET_TIMESTAMP:
2150		if (len > sizeof(int))
2151			len = sizeof(int);
2152		val = po->tp_tstamp;
2153		data = &val;
2154		break;
2155	default:
2156		return -ENOPROTOOPT;
2157	}
2158
2159	if (put_user(len, optlen))
2160		return -EFAULT;
2161	if (copy_to_user(optval, data, len))
2162		return -EFAULT;
2163	return 0;
2164}
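/*
 * Usage sketch (userspace, illustrative only): PACKET_STATISTICS is
 * read-and-reset -- the counters are zeroed under the receive queue lock --
 * and the tp_packets value returned includes tp_drops:
 *
 *	struct tpacket_stats st;
 *	socklen_t len = sizeof(st);
 *
 *	if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len) == 0)
 *		printf("received %u, dropped %u\n", st.tp_packets, st.tp_drops);
 */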
2165
2166
2167static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
2168{
2169	struct sock *sk;
2170	struct hlist_node *node;
2171	struct net_device *dev = data;
2172	struct net *net = dev_net(dev);
2173
2174	rcu_read_lock();
2175	sk_for_each_rcu(sk, node, &net->packet.sklist) {
2176		struct packet_sock *po = pkt_sk(sk);
2177
2178		switch (msg) {
2179		case NETDEV_UNREGISTER:
2180			if (po->mclist)
2181				packet_dev_mclist(dev, po->mclist, -1);
2182			/* fallthrough */
2183
2184		case NETDEV_DOWN:
2185			if (dev->ifindex == po->ifindex) {
2186				spin_lock(&po->bind_lock);
2187				if (po->running) {
2188					__dev_remove_pack(&po->prot_hook);
2189					__sock_put(sk);
2190					po->running = 0;
2191					sk->sk_err = ENETDOWN;
2192					if (!sock_flag(sk, SOCK_DEAD))
2193						sk->sk_error_report(sk);
2194				}
2195				if (msg == NETDEV_UNREGISTER) {
2196					po->ifindex = -1;
2197					po->prot_hook.dev = NULL;
2198				}
2199				spin_unlock(&po->bind_lock);
2200			}
2201			break;
2202		case NETDEV_UP:
2203			if (dev->ifindex == po->ifindex) {
2204				spin_lock(&po->bind_lock);
2205				if (po->num && !po->running) {
2206					dev_add_pack(&po->prot_hook);
2207					sock_hold(sk);
2208					po->running = 1;
2209				}
2210				spin_unlock(&po->bind_lock);
2211			}
2212			break;
2213		}
2214	}
2215	rcu_read_unlock();
2216	return NOTIFY_DONE;
2217}
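/*
 * Behavioural note (illustrative, with a hypothetical helper): when the bound
 * device goes down or unregisters, the notifier above removes the protocol
 * hook and sets sk_err = ENETDOWN, so the next receive on the socket fails;
 * NETDEV_UP rearms the hook if the socket still has a protocol bound:
 *
 *	char buf[2048];
 *	if (recv(fd, buf, sizeof(buf), 0) < 0 && errno == ENETDOWN)
 *		handle_link_down(fd);	// hypothetical: rebind or wait for up
 */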
2218
2219
2220static int packet_ioctl(struct socket *sock, unsigned int cmd,
2221			unsigned long arg)
2222{
2223	struct sock *sk = sock->sk;
2224
2225	switch (cmd) {
2226	case SIOCOUTQ:
2227	{
2228		int amount = sk_wmem_alloc_get(sk);
2229
2230		return put_user(amount, (int __user *)arg);
2231	}
2232	case SIOCINQ:
2233	{
2234		struct sk_buff *skb;
2235		int amount = 0;
2236
2237		spin_lock_bh(&sk->sk_receive_queue.lock);
2238		skb = skb_peek(&sk->sk_receive_queue);
2239		if (skb)
2240			amount = skb->len;
2241		spin_unlock_bh(&sk->sk_receive_queue.lock);
2242		return put_user(amount, (int __user *)arg);
2243	}
2244	case SIOCGSTAMP:
2245		return sock_get_timestamp(sk, (struct timeval __user *)arg);
2246	case SIOCGSTAMPNS:
2247		return sock_get_timestampns(sk, (struct timespec __user *)arg);
2248
2249#ifdef CONFIG_INET
2250	case SIOCADDRT:
2251	case SIOCDELRT:
2252	case SIOCDARP:
2253	case SIOCGARP:
2254	case SIOCSARP:
2255	case SIOCGIFADDR:
2256	case SIOCSIFADDR:
2257	case SIOCGIFBRDADDR:
2258	case SIOCSIFBRDADDR:
2259	case SIOCGIFNETMASK:
2260	case SIOCSIFNETMASK:
2261	case SIOCGIFDSTADDR:
2262	case SIOCSIFDSTADDR:
2263	case SIOCSIFFLAGS:
2264		return inet_dgram_ops.ioctl(sock, cmd, arg);
2265#endif
2266
2267	default:
2268		return -ENOIOCTLCMD;
2269	}
2270	return 0;
2271}
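/*
 * Usage sketch (userspace, illustrative only): SIOCINQ reports the length of
 * the packet at the head of the receive queue, SIOCOUTQ the bytes still
 * queued for transmission:
 *
 *	#include <sys/ioctl.h>
 *	#include <linux/sockios.h>
 *
 *	int n;
 *	if (ioctl(fd, SIOCINQ, &n) == 0)
 *		printf("next packet: %d bytes\n", n);
 *	if (ioctl(fd, SIOCOUTQ, &n) == 0)
 *		printf("unsent: %d bytes\n", n);
 */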
2272
2273static unsigned int packet_poll(struct file *file, struct socket *sock,
2274				poll_table *wait)
2275{
2276	struct sock *sk = sock->sk;
2277	struct packet_sock *po = pkt_sk(sk);
2278	unsigned int mask = datagram_poll(file, sock, wait);
2279
2280	spin_lock_bh(&sk->sk_receive_queue.lock);
2281	if (po->rx_ring.pg_vec) {
2282		if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
2283			mask |= POLLIN | POLLRDNORM;
2284	}
2285	spin_unlock_bh(&sk->sk_receive_queue.lock);
2286	spin_lock_bh(&sk->sk_write_queue.lock);
2287	if (po->tx_ring.pg_vec) {
2288		if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
2289			mask |= POLLOUT | POLLWRNORM;
2290	}
2291	spin_unlock_bh(&sk->sk_write_queue.lock);
2292	return mask;
2293}
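/*
 * Usage sketch (userspace, illustrative only; "hdr" is assumed to point at
 * the next frame of an mmap()ed RX ring): poll() reports POLLIN as soon as
 * the kernel has handed a frame to user space, so a reader can block in
 * poll() while the frame it is waiting for is still TP_STATUS_KERNEL:
 *
 *	#include <poll.h>
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLRDNORM };
 *
 *	while (!(hdr->tp_status & TP_STATUS_USER))
 *		poll(&pfd, 1, -1);
 */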
2294
2295
/* Dirty? Well, I still have not found a better way to account
 * for user mmaps.
 */
2299
2300static void packet_mm_open(struct vm_area_struct *vma)
2301{
2302	struct file *file = vma->vm_file;
2303	struct socket *sock = file->private_data;
2304	struct sock *sk = sock->sk;
2305
2306	if (sk)
2307		atomic_inc(&pkt_sk(sk)->mapped);
2308}
2309
2310static void packet_mm_close(struct vm_area_struct *vma)
2311{
2312	struct file *file = vma->vm_file;
2313	struct socket *sock = file->private_data;
2314	struct sock *sk = sock->sk;
2315
2316	if (sk)
2317		atomic_dec(&pkt_sk(sk)->mapped);
2318}
2319
2320static const struct vm_operations_struct packet_mmap_ops = {
2321	.open	=	packet_mm_open,
2322	.close	=	packet_mm_close,
2323};
2324
2325static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
2326{
2327	int i;
2328
2329	for (i = 0; i < len; i++) {
2330		if (likely(pg_vec[i]))
2331			free_pages((unsigned long) pg_vec[i], order);
2332	}
2333	kfree(pg_vec);
2334}
2335
2336static inline char *alloc_one_pg_vec_page(unsigned long order)
2337{
2338	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN;
2339
2340	return (char *) __get_free_pages(gfp_flags, order);
2341}
2342
2343static char **alloc_pg_vec(struct tpacket_req *req, int order)
2344{
2345	unsigned int block_nr = req->tp_block_nr;
2346	char **pg_vec;
2347	int i;
2348
	pg_vec = kcalloc(block_nr, sizeof(char *), GFP_KERNEL);
2350	if (unlikely(!pg_vec))
2351		goto out;
2352
2353	for (i = 0; i < block_nr; i++) {
2354		pg_vec[i] = alloc_one_pg_vec_page(order);
2355		if (unlikely(!pg_vec[i]))
2356			goto out_free_pgvec;
2357	}
2358
2359out:
2360	return pg_vec;
2361
2362out_free_pgvec:
2363	free_pg_vec(pg_vec, order, block_nr);
2364	pg_vec = NULL;
2365	goto out;
2366}
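/*
 * Layout note (illustrative): pg_vec[] holds one pointer per ring block, and
 * each block is a physically contiguous allocation of 2^order pages.
 * packet_set_ring() computes order with get_order(tp_block_size); e.g. with
 * 4 KiB pages, tp_block_size = 65536 gives order 4 (16 pages per block).
 */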
2367
2368static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
2369		int closing, int tx_ring)
2370{
2371	char **pg_vec = NULL;
2372	struct packet_sock *po = pkt_sk(sk);
2373	int was_running, order = 0;
2374	struct packet_ring_buffer *rb;
2375	struct sk_buff_head *rb_queue;
2376	__be16 num;
2377	int err;
2378
2379	rb = tx_ring ? &po->tx_ring : &po->rx_ring;
2380	rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
2381
2382	err = -EBUSY;
2383	if (!closing) {
2384		if (atomic_read(&po->mapped))
2385			goto out;
2386		if (atomic_read(&rb->pending))
2387			goto out;
2388	}
2389
2390	if (req->tp_block_nr) {
2391		/* Sanity tests and some calculations */
2392		err = -EBUSY;
2393		if (unlikely(rb->pg_vec))
2394			goto out;
2395
2396		switch (po->tp_version) {
2397		case TPACKET_V1:
2398			po->tp_hdrlen = TPACKET_HDRLEN;
2399			break;
2400		case TPACKET_V2:
2401			po->tp_hdrlen = TPACKET2_HDRLEN;
2402			break;
2403		}
2404
2405		err = -EINVAL;
2406		if (unlikely((int)req->tp_block_size <= 0))
2407			goto out;
2408		if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
2409			goto out;
2410		if (unlikely(req->tp_frame_size < po->tp_hdrlen +
2411					po->tp_reserve))
2412			goto out;
2413		if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
2414			goto out;
2415
2416		rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
2417		if (unlikely(rb->frames_per_block <= 0))
2418			goto out;
2419		if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
2420					req->tp_frame_nr))
2421			goto out;
2422
2423		err = -ENOMEM;
2424		order = get_order(req->tp_block_size);
2425		pg_vec = alloc_pg_vec(req, order);
2426		if (unlikely(!pg_vec))
2427			goto out;
	} else {
		/* Zero blocks and frames: the caller is releasing the ring. */
		err = -EINVAL;
		if (unlikely(req->tp_frame_nr))
			goto out;
	}
2435
2436	lock_sock(sk);
2437
2438	/* Detach socket from network */
2439	spin_lock(&po->bind_lock);
2440	was_running = po->running;
2441	num = po->num;
2442	if (was_running) {
2443		__dev_remove_pack(&po->prot_hook);
2444		po->num = 0;
2445		po->running = 0;
2446		__sock_put(sk);
2447	}
2448	spin_unlock(&po->bind_lock);
2449
2450	synchronize_net();
2451
2452	err = -EBUSY;
2453	mutex_lock(&po->pg_vec_lock);
2454	if (closing || atomic_read(&po->mapped) == 0) {
2455		err = 0;
/* XC(a, b): store b in a and return the previous value of a (swap helper). */
#define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
2457		spin_lock_bh(&rb_queue->lock);
2458		pg_vec = XC(rb->pg_vec, pg_vec);
2459		rb->frame_max = (req->tp_frame_nr - 1);
2460		rb->head = 0;
2461		rb->frame_size = req->tp_frame_size;
2462		spin_unlock_bh(&rb_queue->lock);
2463
2464		order = XC(rb->pg_vec_order, order);
2465		req->tp_block_nr = XC(rb->pg_vec_len, req->tp_block_nr);
2466
2467		rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
2468		po->prot_hook.func = (po->rx_ring.pg_vec) ?
2469						tpacket_rcv : packet_rcv;
2470		skb_queue_purge(rb_queue);
2471#undef XC
2472		if (atomic_read(&po->mapped))
2473			pr_err("packet_mmap: vma is busy: %d\n",
2474			       atomic_read(&po->mapped));
2475	}
2476	mutex_unlock(&po->pg_vec_lock);
2477
2478	spin_lock(&po->bind_lock);
2479	if (was_running && !po->running) {
2480		sock_hold(sk);
2481		po->running = 1;
2482		po->num = num;
2483		dev_add_pack(&po->prot_hook);
2484	}
2485	spin_unlock(&po->bind_lock);
2486
2487	release_sock(sk);
2488
2489	if (pg_vec)
2490		free_pg_vec(pg_vec, order, req->tp_block_nr);
2491out:
2492	return err;
2493}
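/*
 * Usage sketch (userspace, illustrative only): a tpacket_req that passes the
 * sanity checks above -- tp_block_size a multiple of the page size,
 * tp_frame_size TPACKET_ALIGNMENT-aligned and large enough for the frame
 * header plus any PACKET_RESERVE headroom, and tp_frame_nr equal to
 * (tp_block_size / tp_frame_size) * tp_block_nr:
 *
 *	struct tpacket_req req = {
 *		.tp_block_size	= 1 << 16,	// 64 KiB, page aligned
 *		.tp_block_nr	= 64,
 *		.tp_frame_size	= 1 << 11,	// 2 KiB
 *		.tp_frame_nr	= (1 << 16) / (1 << 11) * 64,	// 2048
 *	};
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *
 * Passing a request with all four fields zero releases an existing ring once
 * it is no longer mapped (the tp_block_nr == 0 branch above).
 */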
2494
2495static int packet_mmap(struct file *file, struct socket *sock,
2496		struct vm_area_struct *vma)
2497{
2498	struct sock *sk = sock->sk;
2499	struct packet_sock *po = pkt_sk(sk);
2500	unsigned long size, expected_size;
2501	struct packet_ring_buffer *rb;
2502	unsigned long start;
2503	int err = -EINVAL;
2504	int i;
2505
2506	if (vma->vm_pgoff)
2507		return -EINVAL;
2508
2509	mutex_lock(&po->pg_vec_lock);
2510
2511	expected_size = 0;
2512	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2513		if (rb->pg_vec) {
2514			expected_size += rb->pg_vec_len
2515						* rb->pg_vec_pages
2516						* PAGE_SIZE;
2517		}
2518	}
2519
2520	if (expected_size == 0)
2521		goto out;
2522
2523	size = vma->vm_end - vma->vm_start;
2524	if (size != expected_size)
2525		goto out;
2526
2527	start = vma->vm_start;
2528	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2529		if (rb->pg_vec == NULL)
2530			continue;
2531
2532		for (i = 0; i < rb->pg_vec_len; i++) {
2533			struct page *page = virt_to_page(rb->pg_vec[i]);
2534			int pg_num;
2535
2536			for (pg_num = 0; pg_num < rb->pg_vec_pages;
2537					pg_num++, page++) {
2538				err = vm_insert_page(vma, start, page);
2539				if (unlikely(err))
2540					goto out;
2541				start += PAGE_SIZE;
2542			}
2543		}
2544	}
2545
2546	atomic_inc(&po->mapped);
2547	vma->vm_ops = &packet_mmap_ops;
2548	err = 0;
2549
2550out:
2551	mutex_unlock(&po->pg_vec_lock);
2552	return err;
2553}
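/*
 * Usage sketch (userspace, illustrative only; "req" is assumed to be the
 * tpacket_req used to create the ring): the RX and TX rings are mapped
 * together by a single mmap() at offset 0 whose length must equal their
 * combined size (RX first, then TX):
 *
 *	size_t len = (size_t)req.tp_block_size * req.tp_block_nr;  // RX only
 *	void *ring = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *
 *	// Frame i then starts at ring + i * req.tp_frame_size; its leading
 *	// struct tpacket_hdr (or tpacket2_hdr) tp_status field is handed back
 *	// to the kernel by resetting it to TP_STATUS_KERNEL.
 */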
2554
2555static const struct proto_ops packet_ops_spkt = {
2556	.family =	PF_PACKET,
2557	.owner =	THIS_MODULE,
2558	.release =	packet_release,
2559	.bind =		packet_bind_spkt,
2560	.connect =	sock_no_connect,
2561	.socketpair =	sock_no_socketpair,
2562	.accept =	sock_no_accept,
2563	.getname =	packet_getname_spkt,
2564	.poll =		datagram_poll,
2565	.ioctl =	packet_ioctl,
2566	.listen =	sock_no_listen,
2567	.shutdown =	sock_no_shutdown,
2568	.setsockopt =	sock_no_setsockopt,
2569	.getsockopt =	sock_no_getsockopt,
2570	.sendmsg =	packet_sendmsg_spkt,
2571	.recvmsg =	packet_recvmsg,
2572	.mmap =		sock_no_mmap,
2573	.sendpage =	sock_no_sendpage,
2574};
2575
2576static const struct proto_ops packet_ops = {
2577	.family =	PF_PACKET,
2578	.owner =	THIS_MODULE,
2579	.release =	packet_release,
2580	.bind =		packet_bind,
2581	.connect =	sock_no_connect,
2582	.socketpair =	sock_no_socketpair,
2583	.accept =	sock_no_accept,
2584	.getname =	packet_getname,
2585	.poll =		packet_poll,
2586	.ioctl =	packet_ioctl,
2587	.listen =	sock_no_listen,
2588	.shutdown =	sock_no_shutdown,
2589	.setsockopt =	packet_setsockopt,
2590	.getsockopt =	packet_getsockopt,
2591	.sendmsg =	packet_sendmsg,
2592	.recvmsg =	packet_recvmsg,
2593	.mmap =		packet_mmap,
2594	.sendpage =	sock_no_sendpage,
2595};
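/*
 * Usage sketch (userspace, illustrative only): sockets created as SOCK_RAW or
 * SOCK_DGRAM in this family use packet_ops above, while the legacy
 * SOCK_PACKET type is served by the reduced packet_ops_spkt table (no
 * setsockopt/getsockopt, no mmap):
 *
 *	#include <sys/socket.h>
 *	#include <linux/if_ether.h>
 *	#include <arpa/inet.h>
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 */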
2596
2597static const struct net_proto_family packet_family_ops = {
2598	.family =	PF_PACKET,
2599	.create =	packet_create,
2600	.owner	=	THIS_MODULE,
2601};
2602
2603static struct notifier_block packet_netdev_notifier = {
2604	.notifier_call =	packet_notifier,
2605};
2606
2607#ifdef CONFIG_PROC_FS
2608
2609static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
2610	__acquires(RCU)
2611{
2612	struct net *net = seq_file_net(seq);
2613
2614	rcu_read_lock();
2615	return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
2616}
2617
2618static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2619{
2620	struct net *net = seq_file_net(seq);
2621	return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
2622}
2623
2624static void packet_seq_stop(struct seq_file *seq, void *v)
2625	__releases(RCU)
2626{
2627	rcu_read_unlock();
2628}
2629
2630static int packet_seq_show(struct seq_file *seq, void *v)
2631{
2632	if (v == SEQ_START_TOKEN)
2633		seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
2634	else {
2635		struct sock *s = sk_entry(v);
2636		const struct packet_sock *po = pkt_sk(s);
2637
2638		seq_printf(seq,
2639			   "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
2640			   s,
2641			   atomic_read(&s->sk_refcnt),
2642			   s->sk_type,
2643			   ntohs(po->num),
2644			   po->ifindex,
2645			   po->running,
2646			   atomic_read(&s->sk_rmem_alloc),
2647			   sock_i_uid(s),
2648			   sock_i_ino(s));
2649	}
2650
2651	return 0;
2652}
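/*
 * Note (illustrative): each /proc/net/packet line emitted above shows, in
 * order, the socket address, refcount, socket type, protocol number (host
 * order), bound ifindex, whether the protocol hook is running, receive-queue
 * bytes, owning uid and inode -- matching the header printed for
 * SEQ_START_TOKEN.
 */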
2653
2654static const struct seq_operations packet_seq_ops = {
2655	.start	= packet_seq_start,
2656	.next	= packet_seq_next,
2657	.stop	= packet_seq_stop,
2658	.show	= packet_seq_show,
2659};
2660
2661static int packet_seq_open(struct inode *inode, struct file *file)
2662{
2663	return seq_open_net(inode, file, &packet_seq_ops,
2664			    sizeof(struct seq_net_private));
2665}
2666
2667static const struct file_operations packet_seq_fops = {
2668	.owner		= THIS_MODULE,
2669	.open		= packet_seq_open,
2670	.read		= seq_read,
2671	.llseek		= seq_lseek,
2672	.release	= seq_release_net,
2673};
2674
2675#endif
2676
2677static int __net_init packet_net_init(struct net *net)
2678{
2679	spin_lock_init(&net->packet.sklist_lock);
2680	INIT_HLIST_HEAD(&net->packet.sklist);
2681
2682	if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
2683		return -ENOMEM;
2684
2685	return 0;
2686}
2687
2688static void __net_exit packet_net_exit(struct net *net)
2689{
2690	proc_net_remove(net, "packet");
2691}
2692
2693static struct pernet_operations packet_net_ops = {
2694	.init = packet_net_init,
2695	.exit = packet_net_exit,
2696};
2697
2698
2699static void __exit packet_exit(void)
2700{
2701	unregister_netdevice_notifier(&packet_netdev_notifier);
2702	unregister_pernet_subsys(&packet_net_ops);
2703	sock_unregister(PF_PACKET);
2704	proto_unregister(&packet_proto);
2705}
2706
static int __init packet_init(void)
{
	int rc;

	rc = proto_register(&packet_proto, 0);
	if (rc)
		goto out;
	rc = sock_register(&packet_family_ops);
	if (rc)
		goto out_proto;
	rc = register_pernet_subsys(&packet_net_ops);
	if (rc)
		goto out_sock;
	rc = register_netdevice_notifier(&packet_netdev_notifier);
	if (rc)
		goto out_pernet;
	return 0;

out_pernet:
	unregister_pernet_subsys(&packet_net_ops);
out_sock:
	sock_unregister(PF_PACKET);
out_proto:
	proto_unregister(&packet_proto);
out:
	return rc;
}
2720
2721module_init(packet_init);
2722module_exit(packet_exit);
2723MODULE_LICENSE("GPL");
2724MODULE_ALIAS_NETPROTO(PF_PACKET);
2725