af_packet.c revision 1162563f82b434e3099c9e6c1bbdba846d792f0d
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		PACKET - implements raw packet sockets.
7 *
8 * Authors:	Ross Biro
9 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
11 *
12 * Fixes:
13 *		Alan Cox	:	verify_area() now used correctly
14 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
15 *		Alan Cox	:	tidied skbuff lists.
16 *		Alan Cox	:	Now uses generic datagram routines I
17 *					added. Also fixed the peek/read crash
18 *					from all old Linux datagram code.
19 *		Alan Cox	:	Uses the improved datagram code.
20 *		Alan Cox	:	Added NULL's for socket options.
21 *		Alan Cox	:	Re-commented the code.
22 *		Alan Cox	:	Use new kernel side addressing
23 *		Rob Janssen	:	Correct MTU usage.
24 *		Dave Platt	:	Counter leaks caused by incorrect
25 *					interrupt locking and some slightly
26 *					dubious gcc output. Can you read
27 *					compiler: it said _VOLATILE_
28 *	Richard Kooijman	:	Timestamp fixes.
29 *		Alan Cox	:	New buffers. Use sk->mac.raw.
30 *		Alan Cox	:	sendmsg/recvmsg support.
31 *		Alan Cox	:	Protocol setting support
32 *	Alexey Kuznetsov	:	Untied from IPv4 stack.
33 *	Cyrus Durgin		:	Fixed kerneld for kmod.
34 *	Michal Ostrowski        :       Module initialization cleanup.
35 *         Ulises Alonso        :       Frame number limit removal and
36 *                                      packet_set_ring memory leak.
37 *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
38 *					The convention is that longer addresses
39 *					will simply extend the hardware address
40 *					byte arrays at the end of sockaddr_ll
41 *					and packet_mreq.
42 *		Johann Baudy	:	Added TX RING.
43 *
44 *		This program is free software; you can redistribute it and/or
45 *		modify it under the terms of the GNU General Public License
46 *		as published by the Free Software Foundation; either version
47 *		2 of the License, or (at your option) any later version.
48 *
49 */
50
51#include <linux/types.h>
52#include <linux/mm.h>
53#include <linux/capability.h>
54#include <linux/fcntl.h>
55#include <linux/socket.h>
56#include <linux/in.h>
57#include <linux/inet.h>
58#include <linux/netdevice.h>
59#include <linux/if_packet.h>
60#include <linux/wireless.h>
61#include <linux/kernel.h>
62#include <linux/kmod.h>
63#include <net/net_namespace.h>
64#include <net/ip.h>
65#include <net/protocol.h>
66#include <linux/skbuff.h>
67#include <net/sock.h>
68#include <linux/errno.h>
69#include <linux/timer.h>
70#include <asm/system.h>
71#include <asm/uaccess.h>
72#include <asm/ioctls.h>
73#include <asm/page.h>
74#include <asm/cacheflush.h>
75#include <asm/io.h>
76#include <linux/proc_fs.h>
77#include <linux/seq_file.h>
78#include <linux/poll.h>
79#include <linux/module.h>
80#include <linux/init.h>
81#include <linux/mutex.h>
82#include <linux/if_vlan.h>
83#include <linux/virtio_net.h>
84
85#ifdef CONFIG_INET
86#include <net/inet_common.h>
87#endif
88
89/*
90   Assumptions:
91   - if a device has no dev->hard_header routine, it adds and removes the ll
92     header itself. In this case the ll header is invisible outside the device,
93     but higher levels still should reserve dev->hard_header_len.
94     Some devices are clever enough to reallocate the skb when the header
95     does not fit into the reserved space (tunnels); others are not
96     (PPP).
97   - a packet socket receives packets with the ll header already pulled,
98     so SOCK_RAW has to push it back.
99
100On receive:
101-----------
102
103Incoming, dev->hard_header!=NULL
104   mac_header -> ll header
105   data       -> data
106
107Outgoing, dev->hard_header!=NULL
108   mac_header -> ll header
109   data       -> ll header
110
111Incoming, dev->hard_header==NULL
112   mac_header -> UNKNOWN position. It very likely points to the ll
113		 header.  PPP does this, which is wrong, because it introduces
114		 asymmetry between the rx and tx paths.
115   data       -> data
116
117Outgoing, dev->hard_header==NULL
118   mac_header -> data. ll header is still not built!
119   data       -> data
120
121Summary
122  If dev->hard_header==NULL we are unlikely to restore a sensible ll header.
123
124
125On transmit:
126------------
127
128dev->hard_header != NULL
129   mac_header -> ll header
130   data       -> ll header
131
132dev->hard_header == NULL (ll header is added by device, we cannot control it)
133   mac_header -> data
134   data       -> data
135
136   We should set nh.raw on output to the correct position;
137   the packet classifier depends on it.
138 */
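/*
 * Purely illustrative userspace sketch (not part of this file, no error
 * handling) of the convention above: a SOCK_RAW packet socket delivers the
 * frame starting at the ll header, while a SOCK_DGRAM one delivers the
 * payload only and reports link-layer information through sockaddr_ll.
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	unsigned char buf[2048];
 *	struct sockaddr_ll from;
 *	socklen_t fromlen = sizeof(from);
 *	ssize_t n = recvfrom(fd, buf, sizeof(buf), 0,
 *			     (struct sockaddr *)&from, &fromlen);
 *
 * With SOCK_RAW, buf[0..n) begins with the link-layer (e.g. Ethernet)
 * header; with SOCK_DGRAM it would begin at the network header.
 */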
139
140/* Private packet socket structures. */
141
142struct packet_mclist {
143	struct packet_mclist	*next;
144	int			ifindex;
145	int			count;
146	unsigned short		type;
147	unsigned short		alen;
148	unsigned char		addr[MAX_ADDR_LEN];
149};
150/* identical to struct packet_mreq except it has
151 * a longer address field.
152 */
153struct packet_mreq_max {
154	int		mr_ifindex;
155	unsigned short	mr_type;
156	unsigned short	mr_alen;
157	unsigned char	mr_address[MAX_ADDR_LEN];
158};
159
160static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
161		int closing, int tx_ring);
162
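/*
 * The mmap()ed ring is a vector of pg_vec_len blocks, each a physically
 * contiguous allocation of 2^pg_vec_order pages holding frames_per_block
 * frames of frame_size bytes.  head is the index of the next frame to use,
 * frame_max is the highest valid frame index (tp_frame_nr - 1), and pending
 * counts TX frames handed to the device but not yet released by the skb
 * destructor.
 */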
163struct packet_ring_buffer {
164	char			**pg_vec;
165	unsigned int		head;
166	unsigned int		frames_per_block;
167	unsigned int		frame_size;
168	unsigned int		frame_max;
169
170	unsigned int		pg_vec_order;
171	unsigned int		pg_vec_pages;
172	unsigned int		pg_vec_len;
173
174	atomic_t		pending;
175};
176
177struct packet_sock;
178static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
179
180static void packet_flush_mclist(struct sock *sk);
181
182struct packet_sock {
183	/* struct sock has to be the first member of packet_sock */
184	struct sock		sk;
185	struct tpacket_stats	stats;
186	struct packet_ring_buffer	rx_ring;
187	struct packet_ring_buffer	tx_ring;
188	int			copy_thresh;
189	spinlock_t		bind_lock;
190	struct mutex		pg_vec_lock;
191	unsigned int		running:1,	/* prot_hook is attached*/
192				auxdata:1,
193				origdev:1,
194				has_vnet_hdr:1;
195	int			ifindex;	/* bound device		*/
196	__be16			num;
197	struct packet_mclist	*mclist;
198	atomic_t		mapped;
199	enum tpacket_versions	tp_version;
200	unsigned int		tp_hdrlen;
201	unsigned int		tp_reserve;
202	unsigned int		tp_loss:1;
203	struct packet_type	prot_hook ____cacheline_aligned_in_smp;
204};
205
206struct packet_skb_cb {
207	unsigned int origlen;
208	union {
209		struct sockaddr_pkt pkt;
210		struct sockaddr_ll ll;
211	} sa;
212};
213
214#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))
215
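/*
 * The tp_status word at the start of each ring frame is the ownership
 * handshake with userspace over the shared mapping: on RX the kernel fills
 * frames marked TP_STATUS_KERNEL and flips them to TP_STATUS_USER, on TX it
 * sends frames marked TP_STATUS_SEND_REQUEST and returns them as
 * TP_STATUS_AVAILABLE.  The smp_wmb()/smp_rmb() pair orders the status
 * update against the frame data; flush_dcache_page() keeps non-coherent
 * caches in sync with the user mapping.
 */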
216static void __packet_set_status(struct packet_sock *po, void *frame, int status)
217{
218	union {
219		struct tpacket_hdr *h1;
220		struct tpacket2_hdr *h2;
221		void *raw;
222	} h;
223
224	h.raw = frame;
225	switch (po->tp_version) {
226	case TPACKET_V1:
227		h.h1->tp_status = status;
228		flush_dcache_page(virt_to_page(&h.h1->tp_status));
229		break;
230	case TPACKET_V2:
231		h.h2->tp_status = status;
232		flush_dcache_page(virt_to_page(&h.h2->tp_status));
233		break;
234	default:
235		pr_err("TPACKET version not supported\n");
236		BUG();
237	}
238
239	smp_wmb();
240}
241
242static int __packet_get_status(struct packet_sock *po, void *frame)
243{
244	union {
245		struct tpacket_hdr *h1;
246		struct tpacket2_hdr *h2;
247		void *raw;
248	} h;
249
250	smp_rmb();
251
252	h.raw = frame;
253	switch (po->tp_version) {
254	case TPACKET_V1:
255		flush_dcache_page(virt_to_page(&h.h1->tp_status));
256		return h.h1->tp_status;
257	case TPACKET_V2:
258		flush_dcache_page(virt_to_page(&h.h2->tp_status));
259		return h.h2->tp_status;
260	default:
261		pr_err("TPACKET version not supported\n");
262		BUG();
263		return 0;
264	}
265}
266
267static void *packet_lookup_frame(struct packet_sock *po,
268		struct packet_ring_buffer *rb,
269		unsigned int position,
270		int status)
271{
272	unsigned int pg_vec_pos, frame_offset;
273	union {
274		struct tpacket_hdr *h1;
275		struct tpacket2_hdr *h2;
276		void *raw;
277	} h;
278
279	pg_vec_pos = position / rb->frames_per_block;
280	frame_offset = position % rb->frames_per_block;
281
282	h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size);
283
284	if (status != __packet_get_status(po, h.raw))
285		return NULL;
286
287	return h.raw;
288}
289
290static inline void *packet_current_frame(struct packet_sock *po,
291		struct packet_ring_buffer *rb,
292		int status)
293{
294	return packet_lookup_frame(po, rb, rb->head, status);
295}
296
297static inline void *packet_previous_frame(struct packet_sock *po,
298		struct packet_ring_buffer *rb,
299		int status)
300{
301	unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
302	return packet_lookup_frame(po, rb, previous, status);
303}
304
305static inline void packet_increment_head(struct packet_ring_buffer *buff)
306{
307	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
308}
309
310static inline struct packet_sock *pkt_sk(struct sock *sk)
311{
312	return (struct packet_sock *)sk;
313}
314
315static void packet_sock_destruct(struct sock *sk)
316{
317	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
318	WARN_ON(atomic_read(&sk->sk_wmem_alloc));
319
320	if (!sock_flag(sk, SOCK_DEAD)) {
321		pr_err("Attempt to release alive packet socket: %p\n", sk);
322		return;
323	}
324
325	sk_refcnt_debug_dec(sk);
326}
327
328
329static const struct proto_ops packet_ops;
330
331static const struct proto_ops packet_ops_spkt;
332
333static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
334			   struct packet_type *pt, struct net_device *orig_dev)
335{
336	struct sock *sk;
337	struct sockaddr_pkt *spkt;
338
339	/*
340	 *	When we registered the protocol we saved the socket in the data
341	 *	field for just this event.
342	 */
343
344	sk = pt->af_packet_priv;
345
346	/*
347	 *	Yank back the headers [hope the device set this
348	 *	right or kerboom...]
349	 *
350	 *	Incoming packets have ll header pulled,
351	 *	push it back.
352	 *
353	 *	For outgoing ones skb->data == skb_mac_header(skb)
354	 *	so this procedure is a no-op.
355	 */
356
357	if (skb->pkt_type == PACKET_LOOPBACK)
358		goto out;
359
360	if (!net_eq(dev_net(dev), sock_net(sk)))
361		goto out;
362
363	skb = skb_share_check(skb, GFP_ATOMIC);
364	if (skb == NULL)
365		goto oom;
366
367	/* drop any routing info */
368	skb_dst_drop(skb);
369
370	/* drop conntrack reference */
371	nf_reset(skb);
372
373	spkt = &PACKET_SKB_CB(skb)->sa.pkt;
374
375	skb_push(skb, skb->data - skb_mac_header(skb));
376
377	/*
378	 *	The SOCK_PACKET socket receives _all_ frames.
379	 */
380
381	spkt->spkt_family = dev->type;
382	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
383	spkt->spkt_protocol = skb->protocol;
384
385	/*
386	 *	Charge the memory to the socket. This is done specifically
387	 *	to prevent sockets from using up all the memory.
388	 */
389
390	if (sock_queue_rcv_skb(sk, skb) == 0)
391		return 0;
392
393out:
394	kfree_skb(skb);
395oom:
396	return 0;
397}
398
399
400/*
401 *	Output a raw packet to a device layer. This bypasses all the other
402 *	protocol layers and you must therefore supply it with a complete frame
403 */
404
405static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
406			       struct msghdr *msg, size_t len)
407{
408	struct sock *sk = sock->sk;
409	struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
410	struct sk_buff *skb = NULL;
411	struct net_device *dev;
412	__be16 proto = 0;
413	int err;
414
415	/*
416	 *	Get and verify the address.
417	 */
418
419	if (saddr) {
420		if (msg->msg_namelen < sizeof(struct sockaddr))
421			return -EINVAL;
422		if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
423			proto = saddr->spkt_protocol;
424	} else
425		return -ENOTCONN;	/* SOCK_PACKET must be sent giving an address */
426
427	/*
428	 *	Find the device first to size check it
429	 */
430
431	saddr->spkt_device[13] = 0;
432retry:
433	rcu_read_lock();
434	dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
435	err = -ENODEV;
436	if (dev == NULL)
437		goto out_unlock;
438
439	err = -ENETDOWN;
440	if (!(dev->flags & IFF_UP))
441		goto out_unlock;
442
443	/*
444	 * You may not queue a frame bigger than the mtu. This is the lowest level
445	 * raw protocol and you must do your own fragmentation at this level.
446	 */
447
448	err = -EMSGSIZE;
449	if (len > dev->mtu + dev->hard_header_len)
450		goto out_unlock;
451
452	if (!skb) {
453		size_t reserved = LL_RESERVED_SPACE(dev);
454		unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
455
456		rcu_read_unlock();
457		skb = sock_wmalloc(sk, len + reserved, 0, GFP_KERNEL);
458		if (skb == NULL)
459			return -ENOBUFS;
460		/* FIXME: Save some space for broken drivers that write a hard
461		 * header at transmission time by themselves. PPP is the notable
462		 * one here. This should really be fixed at the driver level.
463		 */
464		skb_reserve(skb, reserved);
465		skb_reset_network_header(skb);
466
467		/* Try to align data part correctly */
468		if (hhlen) {
469			skb->data -= hhlen;
470			skb->tail -= hhlen;
471			if (len < hhlen)
472				skb_reset_network_header(skb);
473		}
474		err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
475		if (err)
476			goto out_free;
477		goto retry;
478	}
479
480
481	skb->protocol = proto;
482	skb->dev = dev;
483	skb->priority = sk->sk_priority;
484	skb->mark = sk->sk_mark;
485
486	dev_queue_xmit(skb);
487	rcu_read_unlock();
488	return len;
489
490out_unlock:
491	rcu_read_unlock();
492out_free:
493	kfree_skb(skb);
494	return err;
495}
496
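/*
 * Run the socket's attached BPF filter (if any) on the skb.  Returns the
 * number of bytes to keep (the snap length) or 0 to drop the packet; when
 * no filter is attached the caller's default length is returned unchanged.
 */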
497static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
498				      unsigned int res)
499{
500	struct sk_filter *filter;
501
502	rcu_read_lock_bh();
503	filter = rcu_dereference_bh(sk->sk_filter);
504	if (filter != NULL)
505		res = sk_run_filter(skb, filter->insns, filter->len);
506	rcu_read_unlock_bh();
507
508	return res;
509}
510
511/*
512   This function does lazy skb cloning in the hope that most packets
513   are discarded by BPF.
514
515   Note the tricky part: we DO mangle the shared skb! skb->data, skb->len
516   and skb->cb are mangled. It works because (and until) packets
517   falling here are owned by the current CPU. Output packets are cloned
518   by dev_queue_xmit_nit(), input packets are processed by net_bh
519   sequentially, so if we restore the skb to its original state on exit,
520   we will not harm anyone.
521 */
522
523static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
524		      struct packet_type *pt, struct net_device *orig_dev)
525{
526	struct sock *sk;
527	struct sockaddr_ll *sll;
528	struct packet_sock *po;
529	u8 *skb_head = skb->data;
530	int skb_len = skb->len;
531	unsigned int snaplen, res;
532
533	if (skb->pkt_type == PACKET_LOOPBACK)
534		goto drop;
535
536	sk = pt->af_packet_priv;
537	po = pkt_sk(sk);
538
539	if (!net_eq(dev_net(dev), sock_net(sk)))
540		goto drop;
541
542	skb->dev = dev;
543
544	if (dev->header_ops) {
545		/* The device has an explicit notion of ll header,
546		   exported to higher levels.
547
548		   Otherwise, the device hides the details of its frame
549		   structure, so that the corresponding packet head is
550		   never delivered to the user.
551		 */
552		if (sk->sk_type != SOCK_DGRAM)
553			skb_push(skb, skb->data - skb_mac_header(skb));
554		else if (skb->pkt_type == PACKET_OUTGOING) {
555			/* Special case: outgoing packets have ll header at head */
556			skb_pull(skb, skb_network_offset(skb));
557		}
558	}
559
560	snaplen = skb->len;
561
562	res = run_filter(skb, sk, snaplen);
563	if (!res)
564		goto drop_n_restore;
565	if (snaplen > res)
566		snaplen = res;
567
568	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
569	    (unsigned)sk->sk_rcvbuf)
570		goto drop_n_acct;
571
572	if (skb_shared(skb)) {
573		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
574		if (nskb == NULL)
575			goto drop_n_acct;
576
577		if (skb_head != skb->data) {
578			skb->data = skb_head;
579			skb->len = skb_len;
580		}
581		kfree_skb(skb);
582		skb = nskb;
583	}
584
585	BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
586		     sizeof(skb->cb));
587
588	sll = &PACKET_SKB_CB(skb)->sa.ll;
589	sll->sll_family = AF_PACKET;
590	sll->sll_hatype = dev->type;
591	sll->sll_protocol = skb->protocol;
592	sll->sll_pkttype = skb->pkt_type;
593	if (unlikely(po->origdev))
594		sll->sll_ifindex = orig_dev->ifindex;
595	else
596		sll->sll_ifindex = dev->ifindex;
597
598	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
599
600	PACKET_SKB_CB(skb)->origlen = skb->len;
601
602	if (pskb_trim(skb, snaplen))
603		goto drop_n_acct;
604
605	skb_set_owner_r(skb, sk);
606	skb->dev = NULL;
607	skb_dst_drop(skb);
608
609	/* drop conntrack reference */
610	nf_reset(skb);
611
612	spin_lock(&sk->sk_receive_queue.lock);
613	po->stats.tp_packets++;
614	skb->dropcount = atomic_read(&sk->sk_drops);
615	__skb_queue_tail(&sk->sk_receive_queue, skb);
616	spin_unlock(&sk->sk_receive_queue.lock);
617	sk->sk_data_ready(sk, skb->len);
618	return 0;
619
620drop_n_acct:
621	po->stats.tp_drops = atomic_inc_return(&sk->sk_drops);
622
623drop_n_restore:
624	if (skb_head != skb->data && skb_shared(skb)) {
625		skb->data = skb_head;
626		skb->len = skb_len;
627	}
628drop:
629	consume_skb(skb);
630	return 0;
631}
632
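/*
 * RX path for a socket with a PACKET_RX_RING mapping: copy each accepted
 * packet into the next free frame of the mmap()ed ring, fill in the
 * tpacket{,2}_hdr and sockaddr_ll, and flip the frame to TP_STATUS_USER.
 * If the ring is full the packet is counted in tp_drops; if the packet is
 * longer than a frame and copy_thresh is set, a full copy is also queued
 * on sk_receive_queue (TP_STATUS_COPY).
 */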
633static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
634		       struct packet_type *pt, struct net_device *orig_dev)
635{
636	struct sock *sk;
637	struct packet_sock *po;
638	struct sockaddr_ll *sll;
639	union {
640		struct tpacket_hdr *h1;
641		struct tpacket2_hdr *h2;
642		void *raw;
643	} h;
644	u8 *skb_head = skb->data;
645	int skb_len = skb->len;
646	unsigned int snaplen, res;
647	unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
648	unsigned short macoff, netoff, hdrlen;
649	struct sk_buff *copy_skb = NULL;
650	struct timeval tv;
651	struct timespec ts;
652
653	if (skb->pkt_type == PACKET_LOOPBACK)
654		goto drop;
655
656	sk = pt->af_packet_priv;
657	po = pkt_sk(sk);
658
659	if (!net_eq(dev_net(dev), sock_net(sk)))
660		goto drop;
661
662	if (dev->header_ops) {
663		if (sk->sk_type != SOCK_DGRAM)
664			skb_push(skb, skb->data - skb_mac_header(skb));
665		else if (skb->pkt_type == PACKET_OUTGOING) {
666			/* Special case: outgoing packets have ll header at head */
667			skb_pull(skb, skb_network_offset(skb));
668		}
669	}
670
671	if (skb->ip_summed == CHECKSUM_PARTIAL)
672		status |= TP_STATUS_CSUMNOTREADY;
673
674	snaplen = skb->len;
675
676	res = run_filter(skb, sk, snaplen);
677	if (!res)
678		goto drop_n_restore;
679	if (snaplen > res)
680		snaplen = res;
681
682	if (sk->sk_type == SOCK_DGRAM) {
683		macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
684				  po->tp_reserve;
685	} else {
686		unsigned maclen = skb_network_offset(skb);
687		netoff = TPACKET_ALIGN(po->tp_hdrlen +
688				       (maclen < 16 ? 16 : maclen)) +
689			po->tp_reserve;
690		macoff = netoff - maclen;
691	}
692
693	if (macoff + snaplen > po->rx_ring.frame_size) {
694		if (po->copy_thresh &&
695		    atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
696		    (unsigned)sk->sk_rcvbuf) {
697			if (skb_shared(skb)) {
698				copy_skb = skb_clone(skb, GFP_ATOMIC);
699			} else {
700				copy_skb = skb_get(skb);
701				skb_head = skb->data;
702			}
703			if (copy_skb)
704				skb_set_owner_r(copy_skb, sk);
705		}
706		snaplen = po->rx_ring.frame_size - macoff;
707		if ((int)snaplen < 0)
708			snaplen = 0;
709	}
710
711	spin_lock(&sk->sk_receive_queue.lock);
712	h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL);
713	if (!h.raw)
714		goto ring_is_full;
715	packet_increment_head(&po->rx_ring);
716	po->stats.tp_packets++;
717	if (copy_skb) {
718		status |= TP_STATUS_COPY;
719		__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
720	}
721	if (!po->stats.tp_drops)
722		status &= ~TP_STATUS_LOSING;
723	spin_unlock(&sk->sk_receive_queue.lock);
724
725	skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
726
727	switch (po->tp_version) {
728	case TPACKET_V1:
729		h.h1->tp_len = skb->len;
730		h.h1->tp_snaplen = snaplen;
731		h.h1->tp_mac = macoff;
732		h.h1->tp_net = netoff;
733		if (skb->tstamp.tv64)
734			tv = ktime_to_timeval(skb->tstamp);
735		else
736			do_gettimeofday(&tv);
737		h.h1->tp_sec = tv.tv_sec;
738		h.h1->tp_usec = tv.tv_usec;
739		hdrlen = sizeof(*h.h1);
740		break;
741	case TPACKET_V2:
742		h.h2->tp_len = skb->len;
743		h.h2->tp_snaplen = snaplen;
744		h.h2->tp_mac = macoff;
745		h.h2->tp_net = netoff;
746		if (skb->tstamp.tv64)
747			ts = ktime_to_timespec(skb->tstamp);
748		else
749			getnstimeofday(&ts);
750		h.h2->tp_sec = ts.tv_sec;
751		h.h2->tp_nsec = ts.tv_nsec;
752		h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
753		hdrlen = sizeof(*h.h2);
754		break;
755	default:
756		BUG();
757	}
758
759	sll = h.raw + TPACKET_ALIGN(hdrlen);
760	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
761	sll->sll_family = AF_PACKET;
762	sll->sll_hatype = dev->type;
763	sll->sll_protocol = skb->protocol;
764	sll->sll_pkttype = skb->pkt_type;
765	if (unlikely(po->origdev))
766		sll->sll_ifindex = orig_dev->ifindex;
767	else
768		sll->sll_ifindex = dev->ifindex;
769
770	__packet_set_status(po, h.raw, status);
771	smp_mb();
772	{
773		struct page *p_start, *p_end;
774		u8 *h_end = h.raw + macoff + snaplen - 1;
775
776		p_start = virt_to_page(h.raw);
777		p_end = virt_to_page(h_end);
778		while (p_start <= p_end) {
779			flush_dcache_page(p_start);
780			p_start++;
781		}
782	}
783
784	sk->sk_data_ready(sk, 0);
785
786drop_n_restore:
787	if (skb_head != skb->data && skb_shared(skb)) {
788		skb->data = skb_head;
789		skb->len = skb_len;
790	}
791drop:
792	kfree_skb(skb);
793	return 0;
794
795ring_is_full:
796	po->stats.tp_drops++;
797	spin_unlock(&sk->sk_receive_queue.lock);
798
799	sk->sk_data_ready(sk, 0);
800	kfree_skb(copy_skb);
801	goto drop_n_restore;
802}
803
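/*
 * TX ring path.  Userspace fills a frame, marks it TP_STATUS_SEND_REQUEST
 * and calls sendmsg(); tpacket_snd() walks such frames, builds skbs whose
 * fragments point directly at the ring pages (tpacket_fill_skb()), marks
 * the frame TP_STATUS_SENDING and transmits it.  tpacket_destruct_skb()
 * runs when the skb is finally freed and hands the frame back to userspace
 * as TP_STATUS_AVAILABLE.
 */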
804static void tpacket_destruct_skb(struct sk_buff *skb)
805{
806	struct packet_sock *po = pkt_sk(skb->sk);
807	void *ph;
808
809	BUG_ON(skb == NULL);
810
811	if (likely(po->tx_ring.pg_vec)) {
812		ph = skb_shinfo(skb)->destructor_arg;
813		BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
814		BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
815		atomic_dec(&po->tx_ring.pending);
816		__packet_set_status(po, ph, TP_STATUS_AVAILABLE);
817	}
818
819	sock_wfree(skb);
820}
821
822static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
823		void *frame, struct net_device *dev, int size_max,
824		__be16 proto, unsigned char *addr)
825{
826	union {
827		struct tpacket_hdr *h1;
828		struct tpacket2_hdr *h2;
829		void *raw;
830	} ph;
831	int to_write, offset, len, tp_len, nr_frags, len_max;
832	struct socket *sock = po->sk.sk_socket;
833	struct page *page;
834	void *data;
835	int err;
836
837	ph.raw = frame;
838
839	skb->protocol = proto;
840	skb->dev = dev;
841	skb->priority = po->sk.sk_priority;
842	skb->mark = po->sk.sk_mark;
843	skb_shinfo(skb)->destructor_arg = ph.raw;
844
845	switch (po->tp_version) {
846	case TPACKET_V2:
847		tp_len = ph.h2->tp_len;
848		break;
849	default:
850		tp_len = ph.h1->tp_len;
851		break;
852	}
853	if (unlikely(tp_len > size_max)) {
854		pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
855		return -EMSGSIZE;
856	}
857
858	skb_reserve(skb, LL_RESERVED_SPACE(dev));
859	skb_reset_network_header(skb);
860
861	data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
862	to_write = tp_len;
863
864	if (sock->type == SOCK_DGRAM) {
865		err = dev_hard_header(skb, dev, ntohs(proto), addr,
866				NULL, tp_len);
867		if (unlikely(err < 0))
868			return -EINVAL;
869	} else if (dev->hard_header_len) {
870		/* net device doesn't like empty head */
871		if (unlikely(tp_len <= dev->hard_header_len)) {
872			pr_err("packet size is too short (%d < %d)\n",
873			       tp_len, dev->hard_header_len);
874			return -EINVAL;
875		}
876
877		skb_push(skb, dev->hard_header_len);
878		err = skb_store_bits(skb, 0, data,
879				dev->hard_header_len);
880		if (unlikely(err))
881			return err;
882
883		data += dev->hard_header_len;
884		to_write -= dev->hard_header_len;
885	}
886
887	err = -EFAULT;
888	page = virt_to_page(data);
889	offset = offset_in_page(data);
890	len_max = PAGE_SIZE - offset;
891	len = ((to_write > len_max) ? len_max : to_write);
892
893	skb->data_len = to_write;
894	skb->len += to_write;
895	skb->truesize += to_write;
896	atomic_add(to_write, &po->sk.sk_wmem_alloc);
897
898	while (likely(to_write)) {
899		nr_frags = skb_shinfo(skb)->nr_frags;
900
901		if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
902			pr_err("Packet exceeds the number of skb frags (%lu)\n",
903			       MAX_SKB_FRAGS);
904			return -EFAULT;
905		}
906
907		flush_dcache_page(page);
908		get_page(page);
909		skb_fill_page_desc(skb,
910				nr_frags,
911				page++, offset, len);
912		to_write -= len;
913		offset = 0;
914		len_max = PAGE_SIZE;
915		len = ((to_write > len_max) ? len_max : to_write);
916	}
917
918	return tp_len;
919}
920
921static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
922{
923	struct socket *sock;
924	struct sk_buff *skb;
925	struct net_device *dev;
926	__be16 proto;
927	int ifindex, err, reserve = 0;
928	void *ph;
929	struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
930	int tp_len, size_max;
931	unsigned char *addr;
932	int len_sum = 0;
933	int status = 0;
934
935	sock = po->sk.sk_socket;
936
937	mutex_lock(&po->pg_vec_lock);
938
939	err = -EBUSY;
940	if (saddr == NULL) {
941		ifindex	= po->ifindex;
942		proto	= po->num;
943		addr	= NULL;
944	} else {
945		err = -EINVAL;
946		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
947			goto out;
948		if (msg->msg_namelen < (saddr->sll_halen
949					+ offsetof(struct sockaddr_ll,
950						sll_addr)))
951			goto out;
952		ifindex	= saddr->sll_ifindex;
953		proto	= saddr->sll_protocol;
954		addr	= saddr->sll_addr;
955	}
956
957	dev = dev_get_by_index(sock_net(&po->sk), ifindex);
958	err = -ENXIO;
959	if (unlikely(dev == NULL))
960		goto out;
961
962	reserve = dev->hard_header_len;
963
964	err = -ENETDOWN;
965	if (unlikely(!(dev->flags & IFF_UP)))
966		goto out_put;
967
968	size_max = po->tx_ring.frame_size
969		- (po->tp_hdrlen - sizeof(struct sockaddr_ll));
970
971	if (size_max > dev->mtu + reserve)
972		size_max = dev->mtu + reserve;
973
974	do {
975		ph = packet_current_frame(po, &po->tx_ring,
976				TP_STATUS_SEND_REQUEST);
977
978		if (unlikely(ph == NULL)) {
979			schedule();
980			continue;
981		}
982
983		status = TP_STATUS_SEND_REQUEST;
984		skb = sock_alloc_send_skb(&po->sk,
985				LL_ALLOCATED_SPACE(dev)
986				+ sizeof(struct sockaddr_ll),
987				0, &err);
988
989		if (unlikely(skb == NULL))
990			goto out_status;
991
992		tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
993				addr);
994
995		if (unlikely(tp_len < 0)) {
996			if (po->tp_loss) {
997				__packet_set_status(po, ph,
998						TP_STATUS_AVAILABLE);
999				packet_increment_head(&po->tx_ring);
1000				kfree_skb(skb);
1001				continue;
1002			} else {
1003				status = TP_STATUS_WRONG_FORMAT;
1004				err = tp_len;
1005				goto out_status;
1006			}
1007		}
1008
1009		skb->destructor = tpacket_destruct_skb;
1010		__packet_set_status(po, ph, TP_STATUS_SENDING);
1011		atomic_inc(&po->tx_ring.pending);
1012
1013		status = TP_STATUS_SEND_REQUEST;
1014		err = dev_queue_xmit(skb);
1015		if (unlikely(err > 0)) {
1016			err = net_xmit_errno(err);
1017			if (err && __packet_get_status(po, ph) ==
1018				   TP_STATUS_AVAILABLE) {
1019				/* skb was destructed already */
1020				skb = NULL;
1021				goto out_status;
1022			}
1023			/*
1024			 * skb was dropped but not destructed yet;
1025			 * let's treat it like congestion or err < 0
1026			 */
1027			err = 0;
1028		}
1029		packet_increment_head(&po->tx_ring);
1030		len_sum += tp_len;
1031	} while (likely((ph != NULL) ||
1032			((!(msg->msg_flags & MSG_DONTWAIT)) &&
1033			 (atomic_read(&po->tx_ring.pending))))
1034		);
1035
1036	err = len_sum;
1037	goto out_put;
1038
1039out_status:
1040	__packet_set_status(po, ph, status);
1041	kfree_skb(skb);
1042out_put:
1043	dev_put(dev);
1044out:
1045	mutex_unlock(&po->pg_vec_lock);
1046	return err;
1047}
1048
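/* Allocate an skb with @linear bytes of linear data and the remainder as
 * paged data; small requests (under a page) are made entirely linear.
 */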
1049static inline struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
1050					       size_t reserve, size_t len,
1051					       size_t linear, int noblock,
1052					       int *err)
1053{
1054	struct sk_buff *skb;
1055
1056	/* Under a page?  Don't bother with paged skb. */
1057	if (prepad + len < PAGE_SIZE || !linear)
1058		linear = len;
1059
1060	skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
1061				   err);
1062	if (!skb)
1063		return NULL;
1064
1065	skb_reserve(skb, reserve);
1066	skb_put(skb, linear);
1067	skb->data_len = len - linear;
1068	skb->len += len - linear;
1069
1070	return skb;
1071}
1072
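/*
 * Ordinary (non-ring) transmit path: copy the user data into a freshly
 * allocated skb and queue it on the device.  If PACKET_VNET_HDR is enabled,
 * a struct virtio_net_hdr at the start of the buffer supplies checksum
 * offload and GSO metadata for the skb.
 */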
1073static int packet_snd(struct socket *sock,
1074			  struct msghdr *msg, size_t len)
1075{
1076	struct sock *sk = sock->sk;
1077	struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
1078	struct sk_buff *skb;
1079	struct net_device *dev;
1080	__be16 proto;
1081	unsigned char *addr;
1082	int ifindex, err, reserve = 0;
1083	struct virtio_net_hdr vnet_hdr = { 0 };
1084	int offset = 0;
1085	int vnet_hdr_len;
1086	struct packet_sock *po = pkt_sk(sk);
1087	unsigned short gso_type = 0;
1088
1089	/*
1090	 *	Get and verify the address.
1091	 */
1092
1093	if (saddr == NULL) {
1094		ifindex	= po->ifindex;
1095		proto	= po->num;
1096		addr	= NULL;
1097	} else {
1098		err = -EINVAL;
1099		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
1100			goto out;
1101		if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
1102			goto out;
1103		ifindex	= saddr->sll_ifindex;
1104		proto	= saddr->sll_protocol;
1105		addr	= saddr->sll_addr;
1106	}
1107
1108
1109	dev = dev_get_by_index(sock_net(sk), ifindex);
1110	err = -ENXIO;
1111	if (dev == NULL)
1112		goto out_unlock;
1113	if (sock->type == SOCK_RAW)
1114		reserve = dev->hard_header_len;
1115
1116	err = -ENETDOWN;
1117	if (!(dev->flags & IFF_UP))
1118		goto out_unlock;
1119
1120	if (po->has_vnet_hdr) {
1121		vnet_hdr_len = sizeof(vnet_hdr);
1122
1123		err = -EINVAL;
1124		if (len < vnet_hdr_len)
1125			goto out_unlock;
1126
1127		len -= vnet_hdr_len;
1128
1129		err = memcpy_fromiovec((void *)&vnet_hdr, msg->msg_iov,
1130				       vnet_hdr_len);
1131		if (err < 0)
1132			goto out_unlock;
1133
1134		if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
1135		    (vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
1136		      vnet_hdr.hdr_len))
1137			vnet_hdr.hdr_len = vnet_hdr.csum_start +
1138						 vnet_hdr.csum_offset + 2;
1139
1140		err = -EINVAL;
1141		if (vnet_hdr.hdr_len > len)
1142			goto out_unlock;
1143
1144		if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
1145			switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
1146			case VIRTIO_NET_HDR_GSO_TCPV4:
1147				gso_type = SKB_GSO_TCPV4;
1148				break;
1149			case VIRTIO_NET_HDR_GSO_TCPV6:
1150				gso_type = SKB_GSO_TCPV6;
1151				break;
1152			case VIRTIO_NET_HDR_GSO_UDP:
1153				gso_type = SKB_GSO_UDP;
1154				break;
1155			default:
1156				goto out_unlock;
1157			}
1158
1159			if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
1160				gso_type |= SKB_GSO_TCP_ECN;
1161
1162			if (vnet_hdr.gso_size == 0)
1163				goto out_unlock;
1164
1165		}
1166	}
1167
1168	err = -EMSGSIZE;
1169	if (!gso_type && (len > dev->mtu+reserve))
1170		goto out_unlock;
1171
1172	err = -ENOBUFS;
1173	skb = packet_alloc_skb(sk, LL_ALLOCATED_SPACE(dev),
1174			       LL_RESERVED_SPACE(dev), len, vnet_hdr.hdr_len,
1175			       msg->msg_flags & MSG_DONTWAIT, &err);
1176	if (skb == NULL)
1177		goto out_unlock;
1178
1179	skb_set_network_header(skb, reserve);
1180
1181	err = -EINVAL;
1182	if (sock->type == SOCK_DGRAM &&
1183	    (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0)
1184		goto out_free;
1185
1186	/* Returns -EFAULT on error */
1187	err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
1188	if (err)
1189		goto out_free;
1190
1191	skb->protocol = proto;
1192	skb->dev = dev;
1193	skb->priority = sk->sk_priority;
1194	skb->mark = sk->sk_mark;
1195
1196	if (po->has_vnet_hdr) {
1197		if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
1198			if (!skb_partial_csum_set(skb, vnet_hdr.csum_start,
1199						  vnet_hdr.csum_offset)) {
1200				err = -EINVAL;
1201				goto out_free;
1202			}
1203		}
1204
1205		skb_shinfo(skb)->gso_size = vnet_hdr.gso_size;
1206		skb_shinfo(skb)->gso_type = gso_type;
1207
1208		/* Header must be checked, and gso_segs computed. */
1209		skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
1210		skb_shinfo(skb)->gso_segs = 0;
1211
1212		len += vnet_hdr_len;
1213	}
1214
1215	/*
1216	 *	Now send it
1217	 */
1218
1219	err = dev_queue_xmit(skb);
1220	if (err > 0 && (err = net_xmit_errno(err)) != 0)
1221		goto out_unlock;
1222
1223	dev_put(dev);
1224
1225	return len;
1226
1227out_free:
1228	kfree_skb(skb);
1229out_unlock:
1230	if (dev)
1231		dev_put(dev);
1232out:
1233	return err;
1234}
1235
1236static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
1237		struct msghdr *msg, size_t len)
1238{
1239	struct sock *sk = sock->sk;
1240	struct packet_sock *po = pkt_sk(sk);
1241	if (po->tx_ring.pg_vec)
1242		return tpacket_snd(po, msg);
1243	else
1244		return packet_snd(sock, msg, len);
1245}
1246
1247/*
1248 *	Close a PACKET socket. This is fairly simple. We immediately go
1249 *	to 'closed' state and remove our protocol entry in the device list.
1250 */
1251
1252static int packet_release(struct socket *sock)
1253{
1254	struct sock *sk = sock->sk;
1255	struct packet_sock *po;
1256	struct net *net;
1257	struct tpacket_req req;
1258
1259	if (!sk)
1260		return 0;
1261
1262	net = sock_net(sk);
1263	po = pkt_sk(sk);
1264
1265	spin_lock_bh(&net->packet.sklist_lock);
1266	sk_del_node_init_rcu(sk);
1267	sock_prot_inuse_add(net, sk->sk_prot, -1);
1268	spin_unlock_bh(&net->packet.sklist_lock);
1269
1270	spin_lock(&po->bind_lock);
1271	if (po->running) {
1272		/*
1273		 * Remove from protocol table
1274		 */
1275		po->running = 0;
1276		po->num = 0;
1277		__dev_remove_pack(&po->prot_hook);
1278		__sock_put(sk);
1279	}
1280	spin_unlock(&po->bind_lock);
1281
1282	packet_flush_mclist(sk);
1283
1284	memset(&req, 0, sizeof(req));
1285
1286	if (po->rx_ring.pg_vec)
1287		packet_set_ring(sk, &req, 1, 0);
1288
1289	if (po->tx_ring.pg_vec)
1290		packet_set_ring(sk, &req, 1, 1);
1291
1292	synchronize_net();
1293	/*
1294	 *	Now the socket is dead. No more input will appear.
1295	 */
1296	sock_orphan(sk);
1297	sock->sk = NULL;
1298
1299	/* Purge queues */
1300
1301	skb_queue_purge(&sk->sk_receive_queue);
1302	sk_refcnt_debug_release(sk);
1303
1304	sock_put(sk);
1305	return 0;
1306}
1307
1308/*
1309 *	Attach a packet hook.
1310 */
1311
1312static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
1313{
1314	struct packet_sock *po = pkt_sk(sk);
1315	/*
1316	 *	Detach an existing hook if present.
1317	 */
1318
1319	lock_sock(sk);
1320
1321	spin_lock(&po->bind_lock);
1322	if (po->running) {
1323		__sock_put(sk);
1324		po->running = 0;
1325		po->num = 0;
1326		spin_unlock(&po->bind_lock);
1327		dev_remove_pack(&po->prot_hook);
1328		spin_lock(&po->bind_lock);
1329	}
1330
1331	po->num = protocol;
1332	po->prot_hook.type = protocol;
1333	po->prot_hook.dev = dev;
1334
1335	po->ifindex = dev ? dev->ifindex : 0;
1336
1337	if (protocol == 0)
1338		goto out_unlock;
1339
1340	if (!dev || (dev->flags & IFF_UP)) {
1341		dev_add_pack(&po->prot_hook);
1342		sock_hold(sk);
1343		po->running = 1;
1344	} else {
1345		sk->sk_err = ENETDOWN;
1346		if (!sock_flag(sk, SOCK_DEAD))
1347			sk->sk_error_report(sk);
1348	}
1349
1350out_unlock:
1351	spin_unlock(&po->bind_lock);
1352	release_sock(sk);
1353	return 0;
1354}
1355
1356/*
1357 *	Bind a packet socket to a device
1358 */
1359
1360static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
1361			    int addr_len)
1362{
1363	struct sock *sk = sock->sk;
1364	char name[15];
1365	struct net_device *dev;
1366	int err = -ENODEV;
1367
1368	/*
1369	 *	Check legality
1370	 */
1371
1372	if (addr_len != sizeof(struct sockaddr))
1373		return -EINVAL;
1374	strlcpy(name, uaddr->sa_data, sizeof(name));
1375
1376	dev = dev_get_by_name(sock_net(sk), name);
1377	if (dev) {
1378		err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
1379		dev_put(dev);
1380	}
1381	return err;
1382}
1383
1384static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1385{
1386	struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
1387	struct sock *sk = sock->sk;
1388	struct net_device *dev = NULL;
1389	int err;
1390
1391
1392	/*
1393	 *	Check legality
1394	 */
1395
1396	if (addr_len < sizeof(struct sockaddr_ll))
1397		return -EINVAL;
1398	if (sll->sll_family != AF_PACKET)
1399		return -EINVAL;
1400
1401	if (sll->sll_ifindex) {
1402		err = -ENODEV;
1403		dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1404		if (dev == NULL)
1405			goto out;
1406	}
1407	err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1408	if (dev)
1409		dev_put(dev);
1410
1411out:
1412	return err;
1413}
1414
1415static struct proto packet_proto = {
1416	.name	  = "PACKET",
1417	.owner	  = THIS_MODULE,
1418	.obj_size = sizeof(struct packet_sock),
1419};
1420
1421/*
1422 *	Create a packet of type SOCK_PACKET.
1423 */
1424
1425static int packet_create(struct net *net, struct socket *sock, int protocol,
1426			 int kern)
1427{
1428	struct sock *sk;
1429	struct packet_sock *po;
1430	__be16 proto = (__force __be16)protocol; /* weird, but documented */
1431	int err;
1432
1433	if (!capable(CAP_NET_RAW))
1434		return -EPERM;
1435	if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
1436	    sock->type != SOCK_PACKET)
1437		return -ESOCKTNOSUPPORT;
1438
1439	sock->state = SS_UNCONNECTED;
1440
1441	err = -ENOBUFS;
1442	sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
1443	if (sk == NULL)
1444		goto out;
1445
1446	sock->ops = &packet_ops;
1447	if (sock->type == SOCK_PACKET)
1448		sock->ops = &packet_ops_spkt;
1449
1450	sock_init_data(sock, sk);
1451
1452	po = pkt_sk(sk);
1453	sk->sk_family = PF_PACKET;
1454	po->num = proto;
1455
1456	sk->sk_destruct = packet_sock_destruct;
1457	sk_refcnt_debug_inc(sk);
1458
1459	/*
1460	 *	Attach a protocol block
1461	 */
1462
1463	spin_lock_init(&po->bind_lock);
1464	mutex_init(&po->pg_vec_lock);
1465	po->prot_hook.func = packet_rcv;
1466
1467	if (sock->type == SOCK_PACKET)
1468		po->prot_hook.func = packet_rcv_spkt;
1469
1470	po->prot_hook.af_packet_priv = sk;
1471
1472	if (proto) {
1473		po->prot_hook.type = proto;
1474		dev_add_pack(&po->prot_hook);
1475		sock_hold(sk);
1476		po->running = 1;
1477	}
1478
1479	spin_lock_bh(&net->packet.sklist_lock);
1480	sk_add_node_rcu(sk, &net->packet.sklist);
1481	sock_prot_inuse_add(net, &packet_proto, 1);
1482	spin_unlock_bh(&net->packet.sklist_lock);
1483
1484	return 0;
1485out:
1486	return err;
1487}
1488
1489/*
1490 *	Pull a packet from our receive queue and hand it to the user.
1491 *	If necessary we block.
1492 */
1493
1494static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
1495			  struct msghdr *msg, size_t len, int flags)
1496{
1497	struct sock *sk = sock->sk;
1498	struct sk_buff *skb;
1499	int copied, err;
1500	struct sockaddr_ll *sll;
1501	int vnet_hdr_len = 0;
1502
1503	err = -EINVAL;
1504	if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
1505		goto out;
1506
1507#if 0
1508	/* What error should we return now? EUNATTACH? */
1509	if (pkt_sk(sk)->ifindex < 0)
1510		return -ENODEV;
1511#endif
1512
1513	/*
1514	 *	Call the generic datagram receiver. This handles all sorts
1515	 *	of horrible races and re-entrancy so we can forget about it
1516	 *	in the protocol layers.
1517	 *
1518	 *	Now it will return ENETDOWN if the device has just gone down,
1519	 *	but then it will block.
1520	 */
1521
1522	skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1523
1524	/*
1525	 *	An error occurred so return it. Because skb_recv_datagram()
1526	 *	handles the blocking, we don't need to see or worry about blocking
1527	 *	retries.
1528	 */
1529
1530	if (skb == NULL)
1531		goto out;
1532
1533	if (pkt_sk(sk)->has_vnet_hdr) {
1534		struct virtio_net_hdr vnet_hdr = { 0 };
1535
1536		err = -EINVAL;
1537		vnet_hdr_len = sizeof(vnet_hdr);
1538		if ((len -= vnet_hdr_len) < 0)
1539			goto out_free;
1540
1541		if (skb_is_gso(skb)) {
1542			struct skb_shared_info *sinfo = skb_shinfo(skb);
1543
1544			/* This is a hint as to how much should be linear. */
1545			vnet_hdr.hdr_len = skb_headlen(skb);
1546			vnet_hdr.gso_size = sinfo->gso_size;
1547			if (sinfo->gso_type & SKB_GSO_TCPV4)
1548				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
1549			else if (sinfo->gso_type & SKB_GSO_TCPV6)
1550				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
1551			else if (sinfo->gso_type & SKB_GSO_UDP)
1552				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
1553			else if (sinfo->gso_type & SKB_GSO_FCOE)
1554				goto out_free;
1555			else
1556				BUG();
1557			if (sinfo->gso_type & SKB_GSO_TCP_ECN)
1558				vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
1559		} else
1560			vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
1561
1562		if (skb->ip_summed == CHECKSUM_PARTIAL) {
1563			vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
1564			vnet_hdr.csum_start = skb->csum_start -
1565							skb_headroom(skb);
1566			vnet_hdr.csum_offset = skb->csum_offset;
1567		} /* else everything is zero */
1568
1569		err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr,
1570				     vnet_hdr_len);
1571		if (err < 0)
1572			goto out_free;
1573	}
1574
1575	/*
1576	 *	If the address length field is there to be filled in, we fill
1577	 *	it in now.
1578	 */
1579
1580	sll = &PACKET_SKB_CB(skb)->sa.ll;
1581	if (sock->type == SOCK_PACKET)
1582		msg->msg_namelen = sizeof(struct sockaddr_pkt);
1583	else
1584		msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
1585
1586	/*
1587	 *	You lose any data beyond the buffer you gave. If that worries a
1588	 *	user program, it can ask the device for its MTU anyway.
1589	 */
1590
1591	copied = skb->len;
1592	if (copied > len) {
1593		copied = len;
1594		msg->msg_flags |= MSG_TRUNC;
1595	}
1596
1597	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1598	if (err)
1599		goto out_free;
1600
1601	sock_recv_ts_and_drops(msg, sk, skb);
1602
1603	if (msg->msg_name)
1604		memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
1605		       msg->msg_namelen);
1606
1607	if (pkt_sk(sk)->auxdata) {
1608		struct tpacket_auxdata aux;
1609
1610		aux.tp_status = TP_STATUS_USER;
1611		if (skb->ip_summed == CHECKSUM_PARTIAL)
1612			aux.tp_status |= TP_STATUS_CSUMNOTREADY;
1613		aux.tp_len = PACKET_SKB_CB(skb)->origlen;
1614		aux.tp_snaplen = skb->len;
1615		aux.tp_mac = 0;
1616		aux.tp_net = skb_network_offset(skb);
1617		aux.tp_vlan_tci = vlan_tx_tag_get(skb);
1618
1619		put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
1620	}
1621
1622	/*
1623	 *	Free or return the buffer as appropriate. Again this
1624	 *	hides all the races and re-entrancy issues from us.
1625	 */
1626	err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1627
1628out_free:
1629	skb_free_datagram(sk, skb);
1630out:
1631	return err;
1632}
1633
1634static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1635			       int *uaddr_len, int peer)
1636{
1637	struct net_device *dev;
1638	struct sock *sk	= sock->sk;
1639
1640	if (peer)
1641		return -EOPNOTSUPP;
1642
1643	uaddr->sa_family = AF_PACKET;
1644	rcu_read_lock();
1645	dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
1646	if (dev)
1647		strlcpy(uaddr->sa_data, dev->name, 15);
1648	else
1649		memset(uaddr->sa_data, 0, 14);
1650	rcu_read_unlock();
1651	*uaddr_len = sizeof(*uaddr);
1652
1653	return 0;
1654}
1655
1656static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1657			  int *uaddr_len, int peer)
1658{
1659	struct net_device *dev;
1660	struct sock *sk = sock->sk;
1661	struct packet_sock *po = pkt_sk(sk);
1662	DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1663
1664	if (peer)
1665		return -EOPNOTSUPP;
1666
1667	sll->sll_family = AF_PACKET;
1668	sll->sll_ifindex = po->ifindex;
1669	sll->sll_protocol = po->num;
1670	rcu_read_lock();
1671	dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1672	if (dev) {
1673		sll->sll_hatype = dev->type;
1674		sll->sll_halen = dev->addr_len;
1675		memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1676	} else {
1677		sll->sll_hatype = 0;	/* Bad: we have no ARPHRD_UNSPEC */
1678		sll->sll_halen = 0;
1679	}
1680	rcu_read_unlock();
1681	*uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1682
1683	return 0;
1684}
1685
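/* Apply one packet_mclist entry to a device: @what > 0 takes a reference
 * (add the multicast/unicast address or bump promiscuity/allmulti),
 * @what < 0 drops it again.
 */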
1686static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
1687			 int what)
1688{
1689	switch (i->type) {
1690	case PACKET_MR_MULTICAST:
1691		if (i->alen != dev->addr_len)
1692			return -EINVAL;
1693		if (what > 0)
1694			return dev_mc_add(dev, i->addr, i->alen, 0);
1695		else
1696			return dev_mc_delete(dev, i->addr, i->alen, 0);
1697		break;
1698	case PACKET_MR_PROMISC:
1699		return dev_set_promiscuity(dev, what);
1700		break;
1701	case PACKET_MR_ALLMULTI:
1702		return dev_set_allmulti(dev, what);
1703		break;
1704	case PACKET_MR_UNICAST:
1705		if (i->alen != dev->addr_len)
1706			return -EINVAL;
1707		if (what > 0)
1708			return dev_unicast_add(dev, i->addr);
1709		else
1710			return dev_unicast_delete(dev, i->addr);
1711		break;
1712	default:
1713		break;
1714	}
1715	return 0;
1716}
1717
1718static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1719{
1720	for ( ; i; i = i->next) {
1721		if (i->ifindex == dev->ifindex)
1722			packet_dev_mc(dev, i, what);
1723	}
1724}
1725
1726static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1727{
1728	struct packet_sock *po = pkt_sk(sk);
1729	struct packet_mclist *ml, *i;
1730	struct net_device *dev;
1731	int err;
1732
1733	rtnl_lock();
1734
1735	err = -ENODEV;
1736	dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1737	if (!dev)
1738		goto done;
1739
1740	err = -EINVAL;
1741	if (mreq->mr_alen > dev->addr_len)
1742		goto done;
1743
1744	err = -ENOBUFS;
1745	i = kmalloc(sizeof(*i), GFP_KERNEL);
1746	if (i == NULL)
1747		goto done;
1748
1749	err = 0;
1750	for (ml = po->mclist; ml; ml = ml->next) {
1751		if (ml->ifindex == mreq->mr_ifindex &&
1752		    ml->type == mreq->mr_type &&
1753		    ml->alen == mreq->mr_alen &&
1754		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1755			ml->count++;
1756			/* Free the new element ... */
1757			kfree(i);
1758			goto done;
1759		}
1760	}
1761
1762	i->type = mreq->mr_type;
1763	i->ifindex = mreq->mr_ifindex;
1764	i->alen = mreq->mr_alen;
1765	memcpy(i->addr, mreq->mr_address, i->alen);
1766	i->count = 1;
1767	i->next = po->mclist;
1768	po->mclist = i;
1769	err = packet_dev_mc(dev, i, 1);
1770	if (err) {
1771		po->mclist = i->next;
1772		kfree(i);
1773	}
1774
1775done:
1776	rtnl_unlock();
1777	return err;
1778}
1779
1780static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1781{
1782	struct packet_mclist *ml, **mlp;
1783
1784	rtnl_lock();
1785
1786	for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
1787		if (ml->ifindex == mreq->mr_ifindex &&
1788		    ml->type == mreq->mr_type &&
1789		    ml->alen == mreq->mr_alen &&
1790		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1791			if (--ml->count == 0) {
1792				struct net_device *dev;
1793				*mlp = ml->next;
1794				dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
1795				if (dev)
1796					packet_dev_mc(dev, ml, -1);
1797				kfree(ml);
1798			}
1799			rtnl_unlock();
1800			return 0;
1801		}
1802	}
1803	rtnl_unlock();
1804	return -EADDRNOTAVAIL;
1805}
1806
1807static void packet_flush_mclist(struct sock *sk)
1808{
1809	struct packet_sock *po = pkt_sk(sk);
1810	struct packet_mclist *ml;
1811
1812	if (!po->mclist)
1813		return;
1814
1815	rtnl_lock();
1816	while ((ml = po->mclist) != NULL) {
1817		struct net_device *dev;
1818
1819		po->mclist = ml->next;
1820		dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
1821		if (dev != NULL)
1822			packet_dev_mc(dev, ml, -1);
1823		kfree(ml);
1824	}
1825	rtnl_unlock();
1826}
1827
1828static int
1829packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1830{
1831	struct sock *sk = sock->sk;
1832	struct packet_sock *po = pkt_sk(sk);
1833	int ret;
1834
1835	if (level != SOL_PACKET)
1836		return -ENOPROTOOPT;
1837
1838	switch (optname) {
1839	case PACKET_ADD_MEMBERSHIP:
1840	case PACKET_DROP_MEMBERSHIP:
1841	{
1842		struct packet_mreq_max mreq;
1843		int len = optlen;
1844		memset(&mreq, 0, sizeof(mreq));
1845		if (len < sizeof(struct packet_mreq))
1846			return -EINVAL;
1847		if (len > sizeof(mreq))
1848			len = sizeof(mreq);
1849		if (copy_from_user(&mreq, optval, len))
1850			return -EFAULT;
1851		if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
1852			return -EINVAL;
1853		if (optname == PACKET_ADD_MEMBERSHIP)
1854			ret = packet_mc_add(sk, &mreq);
1855		else
1856			ret = packet_mc_drop(sk, &mreq);
1857		return ret;
1858	}
1859
1860	case PACKET_RX_RING:
1861	case PACKET_TX_RING:
1862	{
1863		struct tpacket_req req;
1864
1865		if (optlen < sizeof(req))
1866			return -EINVAL;
1867		if (pkt_sk(sk)->has_vnet_hdr)
1868			return -EINVAL;
1869		if (copy_from_user(&req, optval, sizeof(req)))
1870			return -EFAULT;
1871		return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING);
1872	}
1873	case PACKET_COPY_THRESH:
1874	{
1875		int val;
1876
1877		if (optlen != sizeof(val))
1878			return -EINVAL;
1879		if (copy_from_user(&val, optval, sizeof(val)))
1880			return -EFAULT;
1881
1882		pkt_sk(sk)->copy_thresh = val;
1883		return 0;
1884	}
1885	case PACKET_VERSION:
1886	{
1887		int val;
1888
1889		if (optlen != sizeof(val))
1890			return -EINVAL;
1891		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1892			return -EBUSY;
1893		if (copy_from_user(&val, optval, sizeof(val)))
1894			return -EFAULT;
1895		switch (val) {
1896		case TPACKET_V1:
1897		case TPACKET_V2:
1898			po->tp_version = val;
1899			return 0;
1900		default:
1901			return -EINVAL;
1902		}
1903	}
1904	case PACKET_RESERVE:
1905	{
1906		unsigned int val;
1907
1908		if (optlen != sizeof(val))
1909			return -EINVAL;
1910		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1911			return -EBUSY;
1912		if (copy_from_user(&val, optval, sizeof(val)))
1913			return -EFAULT;
1914		po->tp_reserve = val;
1915		return 0;
1916	}
1917	case PACKET_LOSS:
1918	{
1919		unsigned int val;
1920
1921		if (optlen != sizeof(val))
1922			return -EINVAL;
1923		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1924			return -EBUSY;
1925		if (copy_from_user(&val, optval, sizeof(val)))
1926			return -EFAULT;
1927		po->tp_loss = !!val;
1928		return 0;
1929	}
1930	case PACKET_AUXDATA:
1931	{
1932		int val;
1933
1934		if (optlen < sizeof(val))
1935			return -EINVAL;
1936		if (copy_from_user(&val, optval, sizeof(val)))
1937			return -EFAULT;
1938
1939		po->auxdata = !!val;
1940		return 0;
1941	}
1942	case PACKET_ORIGDEV:
1943	{
1944		int val;
1945
1946		if (optlen < sizeof(val))
1947			return -EINVAL;
1948		if (copy_from_user(&val, optval, sizeof(val)))
1949			return -EFAULT;
1950
1951		po->origdev = !!val;
1952		return 0;
1953	}
1954	case PACKET_VNET_HDR:
1955	{
1956		int val;
1957
1958		if (sock->type != SOCK_RAW)
1959			return -EINVAL;
1960		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1961			return -EBUSY;
1962		if (optlen < sizeof(val))
1963			return -EINVAL;
1964		if (copy_from_user(&val, optval, sizeof(val)))
1965			return -EFAULT;
1966
1967		po->has_vnet_hdr = !!val;
1968		return 0;
1969	}
1970	default:
1971		return -ENOPROTOOPT;
1972	}
1973}
1974
1975static int packet_getsockopt(struct socket *sock, int level, int optname,
1976			     char __user *optval, int __user *optlen)
1977{
1978	int len;
1979	int val;
1980	struct sock *sk = sock->sk;
1981	struct packet_sock *po = pkt_sk(sk);
1982	void *data;
1983	struct tpacket_stats st;
1984
1985	if (level != SOL_PACKET)
1986		return -ENOPROTOOPT;
1987
1988	if (get_user(len, optlen))
1989		return -EFAULT;
1990
1991	if (len < 0)
1992		return -EINVAL;
1993
1994	switch (optname) {
1995	case PACKET_STATISTICS:
1996		if (len > sizeof(struct tpacket_stats))
1997			len = sizeof(struct tpacket_stats);
1998		spin_lock_bh(&sk->sk_receive_queue.lock);
1999		st = po->stats;
2000		memset(&po->stats, 0, sizeof(st));
2001		spin_unlock_bh(&sk->sk_receive_queue.lock);
2002		st.tp_packets += st.tp_drops;
2003
2004		data = &st;
2005		break;
2006	case PACKET_AUXDATA:
2007		if (len > sizeof(int))
2008			len = sizeof(int);
2009		val = po->auxdata;
2010
2011		data = &val;
2012		break;
2013	case PACKET_ORIGDEV:
2014		if (len > sizeof(int))
2015			len = sizeof(int);
2016		val = po->origdev;
2017
2018		data = &val;
2019		break;
2020	case PACKET_VNET_HDR:
2021		if (len > sizeof(int))
2022			len = sizeof(int);
2023		val = po->has_vnet_hdr;
2024
2025		data = &val;
2026		break;
2027	case PACKET_VERSION:
2028		if (len > sizeof(int))
2029			len = sizeof(int);
2030		val = po->tp_version;
2031		data = &val;
2032		break;
2033	case PACKET_HDRLEN:
2034		if (len > sizeof(int))
2035			len = sizeof(int);
2036		if (copy_from_user(&val, optval, len))
2037			return -EFAULT;
2038		switch (val) {
2039		case TPACKET_V1:
2040			val = sizeof(struct tpacket_hdr);
2041			break;
2042		case TPACKET_V2:
2043			val = sizeof(struct tpacket2_hdr);
2044			break;
2045		default:
2046			return -EINVAL;
2047		}
2048		data = &val;
2049		break;
2050	case PACKET_RESERVE:
2051		if (len > sizeof(unsigned int))
2052			len = sizeof(unsigned int);
2053		val = po->tp_reserve;
2054		data = &val;
2055		break;
2056	case PACKET_LOSS:
2057		if (len > sizeof(unsigned int))
2058			len = sizeof(unsigned int);
2059		val = po->tp_loss;
2060		data = &val;
2061		break;
2062	default:
2063		return -ENOPROTOOPT;
2064	}
2065
2066	if (put_user(len, optlen))
2067		return -EFAULT;
2068	if (copy_to_user(optval, data, len))
2069		return -EFAULT;
2070	return 0;
2071}
2072
2073
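/*
 * Netdevice notifier: when a device a socket is bound to goes down or is
 * unregistered, detach the prot_hook (and drop multicast references on
 * unregister) and report ENETDOWN; when it comes back up, re-attach the
 * hook if the socket still has a protocol bound.
 */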
2074static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
2075{
2076	struct sock *sk;
2077	struct hlist_node *node;
2078	struct net_device *dev = data;
2079	struct net *net = dev_net(dev);
2080
2081	rcu_read_lock();
2082	sk_for_each_rcu(sk, node, &net->packet.sklist) {
2083		struct packet_sock *po = pkt_sk(sk);
2084
2085		switch (msg) {
2086		case NETDEV_UNREGISTER:
2087			if (po->mclist)
2088				packet_dev_mclist(dev, po->mclist, -1);
2089			/* fallthrough */
2090
2091		case NETDEV_DOWN:
2092			if (dev->ifindex == po->ifindex) {
2093				spin_lock(&po->bind_lock);
2094				if (po->running) {
2095					__dev_remove_pack(&po->prot_hook);
2096					__sock_put(sk);
2097					po->running = 0;
2098					sk->sk_err = ENETDOWN;
2099					if (!sock_flag(sk, SOCK_DEAD))
2100						sk->sk_error_report(sk);
2101				}
2102				if (msg == NETDEV_UNREGISTER) {
2103					po->ifindex = -1;
2104					po->prot_hook.dev = NULL;
2105				}
2106				spin_unlock(&po->bind_lock);
2107			}
2108			break;
2109		case NETDEV_UP:
2110			if (dev->ifindex == po->ifindex) {
2111				spin_lock(&po->bind_lock);
2112				if (po->num && !po->running) {
2113					dev_add_pack(&po->prot_hook);
2114					sock_hold(sk);
2115					po->running = 1;
2116				}
2117				spin_unlock(&po->bind_lock);
2118			}
2119			break;
2120		}
2121	}
2122	rcu_read_unlock();
2123	return NOTIFY_DONE;
2124}
2125
2126
2127static int packet_ioctl(struct socket *sock, unsigned int cmd,
2128			unsigned long arg)
2129{
2130	struct sock *sk = sock->sk;
2131
2132	switch (cmd) {
2133	case SIOCOUTQ:
2134	{
2135		int amount = sk_wmem_alloc_get(sk);
2136
2137		return put_user(amount, (int __user *)arg);
2138	}
2139	case SIOCINQ:
2140	{
2141		struct sk_buff *skb;
2142		int amount = 0;
2143
2144		spin_lock_bh(&sk->sk_receive_queue.lock);
2145		skb = skb_peek(&sk->sk_receive_queue);
2146		if (skb)
2147			amount = skb->len;
2148		spin_unlock_bh(&sk->sk_receive_queue.lock);
2149		return put_user(amount, (int __user *)arg);
2150	}
2151	case SIOCGSTAMP:
2152		return sock_get_timestamp(sk, (struct timeval __user *)arg);
2153	case SIOCGSTAMPNS:
2154		return sock_get_timestampns(sk, (struct timespec __user *)arg);
2155
2156#ifdef CONFIG_INET
2157	case SIOCADDRT:
2158	case SIOCDELRT:
2159	case SIOCDARP:
2160	case SIOCGARP:
2161	case SIOCSARP:
2162	case SIOCGIFADDR:
2163	case SIOCSIFADDR:
2164	case SIOCGIFBRDADDR:
2165	case SIOCSIFBRDADDR:
2166	case SIOCGIFNETMASK:
2167	case SIOCSIFNETMASK:
2168	case SIOCGIFDSTADDR:
2169	case SIOCSIFDSTADDR:
2170	case SIOCSIFFLAGS:
2171		if (!net_eq(sock_net(sk), &init_net))
2172			return -ENOIOCTLCMD;
2173		return inet_dgram_ops.ioctl(sock, cmd, arg);
2174#endif
2175
2176	default:
2177		return -ENOIOCTLCMD;
2178	}
2179	return 0;
2180}
2181
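/*
 * poll() for mmapped sockets.  Readable when the frame just behind the
 * rx ring head has been handed to user space (i.e. it is no longer
 * TP_STATUS_KERNEL); writable when the current tx ring frame is
 * TP_STATUS_AVAILABLE.  Non-ring traffic is covered by datagram_poll().
 */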
2182static unsigned int packet_poll(struct file *file, struct socket *sock,
2183				poll_table *wait)
2184{
2185	struct sock *sk = sock->sk;
2186	struct packet_sock *po = pkt_sk(sk);
2187	unsigned int mask = datagram_poll(file, sock, wait);
2188
2189	spin_lock_bh(&sk->sk_receive_queue.lock);
2190	if (po->rx_ring.pg_vec) {
2191		if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
2192			mask |= POLLIN | POLLRDNORM;
2193	}
2194	spin_unlock_bh(&sk->sk_receive_queue.lock);
2195	spin_lock_bh(&sk->sk_write_queue.lock);
2196	if (po->tx_ring.pg_vec) {
2197		if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
2198			mask |= POLLOUT | POLLWRNORM;
2199	}
2200	spin_unlock_bh(&sk->sk_write_queue.lock);
2201	return mask;
2202}
2203
2204
2205/* Dirty? Well, I still have not found a better way to account
2206 * for user mmaps.
2207 */
2208
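/*
 * vm_operations open/close hooks: po->mapped tracks the number of live
 * user mappings of the ring buffers, and packet_set_ring() refuses to
 * resize or free a ring while that count is non-zero.
 */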
2209static void packet_mm_open(struct vm_area_struct *vma)
2210{
2211	struct file *file = vma->vm_file;
2212	struct socket *sock = file->private_data;
2213	struct sock *sk = sock->sk;
2214
2215	if (sk)
2216		atomic_inc(&pkt_sk(sk)->mapped);
2217}
2218
2219static void packet_mm_close(struct vm_area_struct *vma)
2220{
2221	struct file *file = vma->vm_file;
2222	struct socket *sock = file->private_data;
2223	struct sock *sk = sock->sk;
2224
2225	if (sk)
2226		atomic_dec(&pkt_sk(sk)->mapped);
2227}
2228
2229static const struct vm_operations_struct packet_mmap_ops = {
2230	.open	=	packet_mm_open,
2231	.close	=	packet_mm_close,
2232};
2233
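/*
 * Ring memory is a vector of tp_block_nr pointers ("pg_vec"), each
 * pointing to a physically contiguous block of 2^order pages obtained
 * from __get_free_pages().  free_pg_vec() releases both the blocks and
 * the vector itself.
 */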
2234static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
2235{
2236	int i;
2237
2238	for (i = 0; i < len; i++) {
2239		if (likely(pg_vec[i]))
2240			free_pages((unsigned long) pg_vec[i], order);
2241	}
2242	kfree(pg_vec);
2243}
2244
2245static inline char *alloc_one_pg_vec_page(unsigned long order)
2246{
2247	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN;
2248
2249	return (char *) __get_free_pages(gfp_flags, order);
2250}
2251
2252static char **alloc_pg_vec(struct tpacket_req *req, int order)
2253{
2254	unsigned int block_nr = req->tp_block_nr;
2255	char **pg_vec;
2256	int i;
2257
2258	pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
2259	if (unlikely(!pg_vec))
2260		goto out;
2261
2262	for (i = 0; i < block_nr; i++) {
2263		pg_vec[i] = alloc_one_pg_vec_page(order);
2264		if (unlikely(!pg_vec[i]))
2265			goto out_free_pgvec;
2266	}
2267
2268out:
2269	return pg_vec;
2270
2271out_free_pgvec:
2272	free_pg_vec(pg_vec, order, block_nr);
2273	pg_vec = NULL;
2274	goto out;
2275}
2276
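/*
 * Set up or tear down an rx or tx ring.  The socket is detached from the
 * device while the old and new page vectors are swapped under
 * pg_vec_lock, then re-attached.  A rough sketch of how user space
 * drives this (illustrative sizes; tp_block_size must be a multiple of
 * PAGE_SIZE, tp_frame_size TPACKET_ALIGNMENT-aligned, and tp_frame_nr
 * equal to frames-per-block times tp_block_nr):
 *
 *	struct tpacket_req req = {
 *		.tp_block_size = 4096,
 *		.tp_frame_size = 2048,
 *		.tp_block_nr   = 4,
 *		.tp_frame_nr   = 8,
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *
 * A request with tp_block_nr == 0 and tp_frame_nr == 0 releases the ring.
 */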
2277static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
2278		int closing, int tx_ring)
2279{
2280	char **pg_vec = NULL;
2281	struct packet_sock *po = pkt_sk(sk);
2282	int was_running, order = 0;
2283	struct packet_ring_buffer *rb;
2284	struct sk_buff_head *rb_queue;
2285	__be16 num;
2286	int err;
2287
2288	rb = tx_ring ? &po->tx_ring : &po->rx_ring;
2289	rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
2290
2291	err = -EBUSY;
2292	if (!closing) {
2293		if (atomic_read(&po->mapped))
2294			goto out;
2295		if (atomic_read(&rb->pending))
2296			goto out;
2297	}
2298
2299	if (req->tp_block_nr) {
2300		/* Sanity tests and some calculations */
2301		err = -EBUSY;
2302		if (unlikely(rb->pg_vec))
2303			goto out;
2304
2305		switch (po->tp_version) {
2306		case TPACKET_V1:
2307			po->tp_hdrlen = TPACKET_HDRLEN;
2308			break;
2309		case TPACKET_V2:
2310			po->tp_hdrlen = TPACKET2_HDRLEN;
2311			break;
2312		}
2313
2314		err = -EINVAL;
2315		if (unlikely((int)req->tp_block_size <= 0))
2316			goto out;
2317		if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
2318			goto out;
2319		if (unlikely(req->tp_frame_size < po->tp_hdrlen +
2320					po->tp_reserve))
2321			goto out;
2322		if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
2323			goto out;
2324
2325		rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
2326		if (unlikely(rb->frames_per_block <= 0))
2327			goto out;
2328		if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
2329					req->tp_frame_nr))
2330			goto out;
2331
2332		err = -ENOMEM;
2333		order = get_order(req->tp_block_size);
2334		pg_vec = alloc_pg_vec(req, order);
2335		if (unlikely(!pg_vec))
2336			goto out;
2337	} else {
2338		/* No blocks requested: this is a teardown, so the
2339		 * frame count must also be zero. */
2340		err = -EINVAL;
2341		if (unlikely(req->tp_frame_nr))
2342			goto out;
2343	}
2344
2345	lock_sock(sk);
2346
2347	/* Detach socket from network */
2348	spin_lock(&po->bind_lock);
2349	was_running = po->running;
2350	num = po->num;
2351	if (was_running) {
2352		__dev_remove_pack(&po->prot_hook);
2353		po->num = 0;
2354		po->running = 0;
2355		__sock_put(sk);
2356	}
2357	spin_unlock(&po->bind_lock);
2358
2359	synchronize_net();
2360
2361	err = -EBUSY;
2362	mutex_lock(&po->pg_vec_lock);
2363	if (closing || atomic_read(&po->mapped) == 0) {
2364		err = 0;
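		/* XC(a, b): store b in a and return the old value of a. */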
2365#define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
2366		spin_lock_bh(&rb_queue->lock);
2367		pg_vec = XC(rb->pg_vec, pg_vec);
2368		rb->frame_max = (req->tp_frame_nr - 1);
2369		rb->head = 0;
2370		rb->frame_size = req->tp_frame_size;
2371		spin_unlock_bh(&rb_queue->lock);
2372
2373		order = XC(rb->pg_vec_order, order);
2374		req->tp_block_nr = XC(rb->pg_vec_len, req->tp_block_nr);
2375
2376		rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
2377		po->prot_hook.func = (po->rx_ring.pg_vec) ?
2378						tpacket_rcv : packet_rcv;
2379		skb_queue_purge(rb_queue);
2380#undef XC
2381		if (atomic_read(&po->mapped))
2382			pr_err("packet_mmap: vma is busy: %d\n",
2383			       atomic_read(&po->mapped));
2384	}
2385	mutex_unlock(&po->pg_vec_lock);
2386
2387	spin_lock(&po->bind_lock);
2388	if (was_running && !po->running) {
2389		sock_hold(sk);
2390		po->running = 1;
2391		po->num = num;
2392		dev_add_pack(&po->prot_hook);
2393	}
2394	spin_unlock(&po->bind_lock);
2395
2396	release_sock(sk);
2397
2398	if (pg_vec)
2399		free_pg_vec(pg_vec, order, req->tp_block_nr);
2400out:
2401	return err;
2402}
2403
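/*
 * Map the rx ring (if any) followed by the tx ring (if any) into a
 * single contiguous VMA.  The mapping must start at offset 0 and its
 * length must equal the combined size of both rings, e.g. (continuing
 * the sketch above, rx ring only):
 *
 *	void *ring = mmap(NULL, req.tp_block_size * req.tp_block_nr,
 *			  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 */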
2404static int packet_mmap(struct file *file, struct socket *sock,
2405		struct vm_area_struct *vma)
2406{
2407	struct sock *sk = sock->sk;
2408	struct packet_sock *po = pkt_sk(sk);
2409	unsigned long size, expected_size;
2410	struct packet_ring_buffer *rb;
2411	unsigned long start;
2412	int err = -EINVAL;
2413	int i;
2414
2415	if (vma->vm_pgoff)
2416		return -EINVAL;
2417
2418	mutex_lock(&po->pg_vec_lock);
2419
2420	expected_size = 0;
2421	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2422		if (rb->pg_vec) {
2423			expected_size += rb->pg_vec_len
2424						* rb->pg_vec_pages
2425						* PAGE_SIZE;
2426		}
2427	}
2428
2429	if (expected_size == 0)
2430		goto out;
2431
2432	size = vma->vm_end - vma->vm_start;
2433	if (size != expected_size)
2434		goto out;
2435
2436	start = vma->vm_start;
2437	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2438		if (rb->pg_vec == NULL)
2439			continue;
2440
2441		for (i = 0; i < rb->pg_vec_len; i++) {
2442			struct page *page = virt_to_page(rb->pg_vec[i]);
2443			int pg_num;
2444
2445			for (pg_num = 0; pg_num < rb->pg_vec_pages;
2446					pg_num++, page++) {
2447				err = vm_insert_page(vma, start, page);
2448				if (unlikely(err))
2449					goto out;
2450				start += PAGE_SIZE;
2451			}
2452		}
2453	}
2454
2455	atomic_inc(&po->mapped);
2456	vma->vm_ops = &packet_mmap_ops;
2457	err = 0;
2458
2459out:
2460	mutex_unlock(&po->pg_vec_lock);
2461	return err;
2462}
2463
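/*
 * Ops for the legacy SOCK_PACKET interface: no mmap ring support, plain
 * datagram_poll(), and the spkt variants of bind/getname/sendmsg.
 */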
2464static const struct proto_ops packet_ops_spkt = {
2465	.family =	PF_PACKET,
2466	.owner =	THIS_MODULE,
2467	.release =	packet_release,
2468	.bind =		packet_bind_spkt,
2469	.connect =	sock_no_connect,
2470	.socketpair =	sock_no_socketpair,
2471	.accept =	sock_no_accept,
2472	.getname =	packet_getname_spkt,
2473	.poll =		datagram_poll,
2474	.ioctl =	packet_ioctl,
2475	.listen =	sock_no_listen,
2476	.shutdown =	sock_no_shutdown,
2477	.setsockopt =	sock_no_setsockopt,
2478	.getsockopt =	sock_no_getsockopt,
2479	.sendmsg =	packet_sendmsg_spkt,
2480	.recvmsg =	packet_recvmsg,
2481	.mmap =		sock_no_mmap,
2482	.sendpage =	sock_no_sendpage,
2483};
2484
2485static const struct proto_ops packet_ops = {
2486	.family =	PF_PACKET,
2487	.owner =	THIS_MODULE,
2488	.release =	packet_release,
2489	.bind =		packet_bind,
2490	.connect =	sock_no_connect,
2491	.socketpair =	sock_no_socketpair,
2492	.accept =	sock_no_accept,
2493	.getname =	packet_getname,
2494	.poll =		packet_poll,
2495	.ioctl =	packet_ioctl,
2496	.listen =	sock_no_listen,
2497	.shutdown =	sock_no_shutdown,
2498	.setsockopt =	packet_setsockopt,
2499	.getsockopt =	packet_getsockopt,
2500	.sendmsg =	packet_sendmsg,
2501	.recvmsg =	packet_recvmsg,
2502	.mmap =		packet_mmap,
2503	.sendpage =	sock_no_sendpage,
2504};
2505
2506static const struct net_proto_family packet_family_ops = {
2507	.family =	PF_PACKET,
2508	.create =	packet_create,
2509	.owner	=	THIS_MODULE,
2510};
2511
2512static struct notifier_block packet_netdev_notifier = {
2513	.notifier_call =	packet_notifier,
2514};
2515
2516#ifdef CONFIG_PROC_FS
2517
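/* seq_file backend for /proc/net/packet (registered per netns below). */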
2518static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
2519	__acquires(RCU)
2520{
2521	struct net *net = seq_file_net(seq);
2522
2523	rcu_read_lock();
2524	return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
2525}
2526
2527static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2528{
2529	struct net *net = seq_file_net(seq);
2530	return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
2531}
2532
2533static void packet_seq_stop(struct seq_file *seq, void *v)
2534	__releases(RCU)
2535{
2536	rcu_read_unlock();
2537}
2538
2539static int packet_seq_show(struct seq_file *seq, void *v)
2540{
2541	if (v == SEQ_START_TOKEN)
2542		seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
2543	else {
2544		struct sock *s = sk_entry(v);
2545		const struct packet_sock *po = pkt_sk(s);
2546
2547		seq_printf(seq,
2548			   "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
2549			   s,
2550			   atomic_read(&s->sk_refcnt),
2551			   s->sk_type,
2552			   ntohs(po->num),
2553			   po->ifindex,
2554			   po->running,
2555			   atomic_read(&s->sk_rmem_alloc),
2556			   sock_i_uid(s),
2557			   sock_i_ino(s));
2558	}
2559
2560	return 0;
2561}
2562
2563static const struct seq_operations packet_seq_ops = {
2564	.start	= packet_seq_start,
2565	.next	= packet_seq_next,
2566	.stop	= packet_seq_stop,
2567	.show	= packet_seq_show,
2568};
2569
2570static int packet_seq_open(struct inode *inode, struct file *file)
2571{
2572	return seq_open_net(inode, file, &packet_seq_ops,
2573			    sizeof(struct seq_net_private));
2574}
2575
2576static const struct file_operations packet_seq_fops = {
2577	.owner		= THIS_MODULE,
2578	.open		= packet_seq_open,
2579	.read		= seq_read,
2580	.llseek		= seq_lseek,
2581	.release	= seq_release_net,
2582};
2583
2584#endif
2585
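/*
 * Per-namespace init/exit: set up the socket list and create the
 * /proc/net/packet entry; remove the proc entry when the namespace goes
 * away.
 */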
2586static int __net_init packet_net_init(struct net *net)
2587{
2588	spin_lock_init(&net->packet.sklist_lock);
2589	INIT_HLIST_HEAD(&net->packet.sklist);
2590
2591	if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
2592		return -ENOMEM;
2593
2594	return 0;
2595}
2596
2597static void __net_exit packet_net_exit(struct net *net)
2598{
2599	proc_net_remove(net, "packet");
2600}
2601
2602static struct pernet_operations packet_net_ops = {
2603	.init = packet_net_init,
2604	.exit = packet_net_exit,
2605};
2606
2607
2608static void __exit packet_exit(void)
2609{
2610	unregister_netdevice_notifier(&packet_netdev_notifier);
2611	unregister_pernet_subsys(&packet_net_ops);
2612	sock_unregister(PF_PACKET);
2613	proto_unregister(&packet_proto);
2614}
2615
2616static int __init packet_init(void)
2617{
2618	int rc = proto_register(&packet_proto, 0);
2619
2620	if (rc != 0)
2621		goto out;
2622
2623	sock_register(&packet_family_ops);
2624	register_pernet_subsys(&packet_net_ops);
2625	register_netdevice_notifier(&packet_netdev_notifier);
2626out:
2627	return rc;
2628}
2629
2630module_init(packet_init);
2631module_exit(packet_exit);
2632MODULE_LICENSE("GPL");
2633MODULE_ALIAS_NETPROTO(PF_PACKET);
2634