af_packet.c revision 2d37a186cedc51502dbee71c16ae0fbd9114d62c
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		PACKET - implements raw packet sockets.
7 *
8 * Authors:	Ross Biro
9 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
11 *
12 * Fixes:
13 *		Alan Cox	:	verify_area() now used correctly
14 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
15 *		Alan Cox	:	tidied skbuff lists.
16 *		Alan Cox	:	Now uses generic datagram routines I
17 *					added. Also fixed the peek/read crash
18 *					from all old Linux datagram code.
19 *		Alan Cox	:	Uses the improved datagram code.
20 *		Alan Cox	:	Added NULL's for socket options.
21 *		Alan Cox	:	Re-commented the code.
22 *		Alan Cox	:	Use new kernel side addressing
23 *		Rob Janssen	:	Correct MTU usage.
24 *		Dave Platt	:	Counter leaks caused by incorrect
25 *					interrupt locking and some slightly
26 *					dubious gcc output. Can you read
27 *					compiler: it said _VOLATILE_
28 *	Richard Kooijman	:	Timestamp fixes.
29 *		Alan Cox	:	New buffers. Use sk->mac.raw.
30 *		Alan Cox	:	sendmsg/recvmsg support.
31 *		Alan Cox	:	Protocol setting support
32 *	Alexey Kuznetsov	:	Untied from IPv4 stack.
33 *	Cyrus Durgin		:	Fixed kerneld for kmod.
34 *	Michal Ostrowski        :       Module initialization cleanup.
35 *         Ulises Alonso        :       Frame number limit removal and
36 *                                      packet_set_ring memory leak.
37 *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
38 *					The convention is that longer addresses
39 *					will simply extend the hardware address
40 *					byte arrays at the end of sockaddr_ll
41 *					and packet_mreq.
42 *		Johann Baudy	:	Added TX RING.
43 *
44 *		This program is free software; you can redistribute it and/or
45 *		modify it under the terms of the GNU General Public License
46 *		as published by the Free Software Foundation; either version
47 *		2 of the License, or (at your option) any later version.
48 *
49 */
50
51#include <linux/types.h>
52#include <linux/mm.h>
53#include <linux/capability.h>
54#include <linux/fcntl.h>
55#include <linux/socket.h>
56#include <linux/in.h>
57#include <linux/inet.h>
58#include <linux/netdevice.h>
59#include <linux/if_packet.h>
60#include <linux/wireless.h>
61#include <linux/kernel.h>
62#include <linux/kmod.h>
63#include <net/net_namespace.h>
64#include <net/ip.h>
65#include <net/protocol.h>
66#include <linux/skbuff.h>
67#include <net/sock.h>
68#include <linux/errno.h>
69#include <linux/timer.h>
70#include <asm/system.h>
71#include <asm/uaccess.h>
72#include <asm/ioctls.h>
73#include <asm/page.h>
74#include <asm/cacheflush.h>
75#include <asm/io.h>
76#include <linux/proc_fs.h>
77#include <linux/seq_file.h>
78#include <linux/poll.h>
79#include <linux/module.h>
80#include <linux/init.h>
81#include <linux/mutex.h>
82
83#ifdef CONFIG_INET
84#include <net/inet_common.h>
85#endif
86
87/*
88   Assumptions:
89   - if the device has no dev->hard_header routine, it adds and removes the
90     ll header inside itself. In this case the ll header is invisible outside
91     of the device, but higher levels should still reserve dev->hard_header_len.
92     Some devices are clever enough to reallocate the skb when the header
93     does not fit into the reserved space (tunnels); other ones are not
94     (PPP).
95   - a packet socket receives packets with the ll header already pulled,
96     so SOCK_RAW has to push it back.
97
98On receive:
99-----------
100
101Incoming, dev->hard_header!=NULL
102   mac_header -> ll header
103   data       -> data
104
105Outgoing, dev->hard_header!=NULL
106   mac_header -> ll header
107   data       -> ll header
108
109Incoming, dev->hard_header==NULL
110   mac_header -> UNKNOWN position. It is very likely that it points to the ll
111		 header.  PPP does this, which is wrong, because it introduces
112		 asymmetry between the rx and tx paths.
113   data       -> data
114
115Outgoing, dev->hard_header==NULL
116   mac_header -> data. ll header is still not built!
117   data       -> data
118
119Summary
120  If dev->hard_header==NULL we are unlikely to restore a sensible ll header.
121
122
123On transmit:
124------------
125
126dev->hard_header != NULL
127   mac_header -> ll header
128   data       -> ll header
129
130dev->hard_header == NULL (ll header is added by device, we cannot control it)
131   mac_header -> data
132   data       -> data
133
134   We should set nh.raw on output to the correct position;
135   the packet classifier depends on it.
136 */
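
/*
 * For orientation, a minimal user-space consumer of this code looks
 * roughly like the following sketch (names such as "eth0" are only
 * examples):
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	struct sockaddr_ll sll = {
 *		.sll_family   = AF_PACKET,
 *		.sll_protocol = htons(ETH_P_ALL),
 *		.sll_ifindex  = if_nametoindex("eth0"),
 *	};
 *	bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 *
 * With SOCK_RAW the ll header is part of the data read from the socket;
 * with SOCK_DGRAM it is stripped and reported via sockaddr_ll instead.
 */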
137
138/* Private packet socket structures. */
139
140struct packet_mclist {
141	struct packet_mclist	*next;
142	int			ifindex;
143	int			count;
144	unsigned short		type;
145	unsigned short		alen;
146	unsigned char		addr[MAX_ADDR_LEN];
147};
148/* identical to struct packet_mreq except it has
149 * a longer address field.
150 */
151struct packet_mreq_max {
152	int		mr_ifindex;
153	unsigned short	mr_type;
154	unsigned short	mr_alen;
155	unsigned char	mr_address[MAX_ADDR_LEN];
156};
157
158#ifdef CONFIG_PACKET_MMAP
159static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
160		int closing, int tx_ring);
161
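/*
 * One memory-mapped ring (rx or tx).  pg_vec is an array of pg_vec_len
 * blocks, each 2^pg_vec_order pages (pg_vec_pages pages) long, and every
 * block holds frames_per_block fixed-size frames of frame_size bytes.
 * head indexes the next frame to use and frame_max is the highest valid
 * frame index (tp_frame_nr - 1), so frame n lives at
 * pg_vec[n / frames_per_block] + (n % frames_per_block) * frame_size.
 */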
162struct packet_ring_buffer {
163	char			**pg_vec;
164	unsigned int		head;
165	unsigned int		frames_per_block;
166	unsigned int		frame_size;
167	unsigned int		frame_max;
168
169	unsigned int		pg_vec_order;
170	unsigned int		pg_vec_pages;
171	unsigned int		pg_vec_len;
172
173	atomic_t		pending;
174};
175
176struct packet_sock;
177static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
178#endif
179
180static void packet_flush_mclist(struct sock *sk);
181
182struct packet_sock {
183	/* struct sock has to be the first member of packet_sock */
184	struct sock		sk;
185	struct tpacket_stats	stats;
186#ifdef CONFIG_PACKET_MMAP
187	struct packet_ring_buffer	rx_ring;
188	struct packet_ring_buffer	tx_ring;
189	int			copy_thresh;
190#endif
191	struct packet_type	prot_hook;
192	spinlock_t		bind_lock;
193	struct mutex		pg_vec_lock;
194	unsigned int		running:1,	/* prot_hook is attached*/
195				auxdata:1,
196				origdev:1;
197	int			ifindex;	/* bound device		*/
198	__be16			num;
199	struct packet_mclist	*mclist;
200#ifdef CONFIG_PACKET_MMAP
201	atomic_t		mapped;
202	enum tpacket_versions	tp_version;
203	unsigned int		tp_hdrlen;
204	unsigned int		tp_reserve;
205	unsigned int		tp_loss:1;
206#endif
207};
208
209struct packet_skb_cb {
210	unsigned int origlen;
211	union {
212		struct sockaddr_pkt pkt;
213		struct sockaddr_ll ll;
214	} sa;
215};
216
217#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))
218
219#ifdef CONFIG_PACKET_MMAP
220
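/*
 * Each ring frame starts with a tp_status word that forms the handshake
 * with user space: on rx the kernel only fills frames whose status is
 * TP_STATUS_KERNEL and flips them to TP_STATUS_USER; on tx user space
 * marks frames TP_STATUS_SEND_REQUEST, the kernel sets TP_STATUS_SENDING
 * while the skb is in flight and TP_STATUS_AVAILABLE once it is freed.
 * The barriers below order the status update against the frame payload.
 */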
221static void __packet_set_status(struct packet_sock *po, void *frame, int status)
222{
223	union {
224		struct tpacket_hdr *h1;
225		struct tpacket2_hdr *h2;
226		void *raw;
227	} h;
228
229	h.raw = frame;
230	switch (po->tp_version) {
231	case TPACKET_V1:
232		h.h1->tp_status = status;
233		flush_dcache_page(virt_to_page(&h.h1->tp_status));
234		break;
235	case TPACKET_V2:
236		h.h2->tp_status = status;
237		flush_dcache_page(virt_to_page(&h.h2->tp_status));
238		break;
239	default:
240		pr_err("TPACKET version not supported\n");
241		BUG();
242	}
243
244	smp_wmb();
245}
246
247static int __packet_get_status(struct packet_sock *po, void *frame)
248{
249	union {
250		struct tpacket_hdr *h1;
251		struct tpacket2_hdr *h2;
252		void *raw;
253	} h;
254
255	smp_rmb();
256
257	h.raw = frame;
258	switch (po->tp_version) {
259	case TPACKET_V1:
260		flush_dcache_page(virt_to_page(&h.h1->tp_status));
261		return h.h1->tp_status;
262	case TPACKET_V2:
263		flush_dcache_page(virt_to_page(&h.h2->tp_status));
264		return h.h2->tp_status;
265	default:
266		pr_err("TPACKET version not supported\n");
267		BUG();
268		return 0;
269	}
270}
271
272static void *packet_lookup_frame(struct packet_sock *po,
273		struct packet_ring_buffer *rb,
274		unsigned int position,
275		int status)
276{
277	unsigned int pg_vec_pos, frame_offset;
278	union {
279		struct tpacket_hdr *h1;
280		struct tpacket2_hdr *h2;
281		void *raw;
282	} h;
283
284	pg_vec_pos = position / rb->frames_per_block;
285	frame_offset = position % rb->frames_per_block;
286
287	h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size);
288
289	if (status != __packet_get_status(po, h.raw))
290		return NULL;
291
292	return h.raw;
293}
294
295static inline void *packet_current_frame(struct packet_sock *po,
296		struct packet_ring_buffer *rb,
297		int status)
298{
299	return packet_lookup_frame(po, rb, rb->head, status);
300}
301
302static inline void *packet_previous_frame(struct packet_sock *po,
303		struct packet_ring_buffer *rb,
304		int status)
305{
306	unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
307	return packet_lookup_frame(po, rb, previous, status);
308}
309
310static inline void packet_increment_head(struct packet_ring_buffer *buff)
311{
312	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
313}
314
315#endif
316
317static inline struct packet_sock *pkt_sk(struct sock *sk)
318{
319	return (struct packet_sock *)sk;
320}
321
322static void packet_sock_destruct(struct sock *sk)
323{
324	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
325	WARN_ON(atomic_read(&sk->sk_wmem_alloc));
326
327	if (!sock_flag(sk, SOCK_DEAD)) {
328		pr_err("Attempt to release alive packet socket: %p\n", sk);
329		return;
330	}
331
332	sk_refcnt_debug_dec(sk);
333}
334
335
336static const struct proto_ops packet_ops;
337
338static const struct proto_ops packet_ops_spkt;
339
340static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
341			   struct packet_type *pt, struct net_device *orig_dev)
342{
343	struct sock *sk;
344	struct sockaddr_pkt *spkt;
345
346	/*
347	 *	When we registered the protocol we saved the socket in the data
348	 *	field for just this event.
349	 */
350
351	sk = pt->af_packet_priv;
352
353	/*
354	 *	Yank back the headers [hope the device set this
355	 *	right or kerboom...]
356	 *
357	 *	Incoming packets have the ll header pulled;
358	 *	push it back.
359	 *
360	 *	For outgoing ones skb->data == skb_mac_header(skb),
361	 *	so this procedure is a no-op.
362	 */
363
364	if (skb->pkt_type == PACKET_LOOPBACK)
365		goto out;
366
367	if (dev_net(dev) != sock_net(sk))
368		goto out;
369
370	skb = skb_share_check(skb, GFP_ATOMIC);
371	if (skb == NULL)
372		goto oom;
373
374	/* drop any routing info */
375	skb_dst_drop(skb);
376
377	/* drop conntrack reference */
378	nf_reset(skb);
379
380	spkt = &PACKET_SKB_CB(skb)->sa.pkt;
381
382	skb_push(skb, skb->data - skb_mac_header(skb));
383
384	/*
385	 *	The SOCK_PACKET socket receives _all_ frames.
386	 */
387
388	spkt->spkt_family = dev->type;
389	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
390	spkt->spkt_protocol = skb->protocol;
391
392	/*
393	 *	Charge the memory to the socket. This is done specifically
394	 *	to prevent sockets from using up all the memory.
395	 */
396
397	if (sock_queue_rcv_skb(sk, skb) == 0)
398		return 0;
399
400out:
401	kfree_skb(skb);
402oom:
403	return 0;
404}
405
406
407/*
408 *	Output a raw packet to a device layer. This bypasses all the other
409 *	protocol layers and you must therefore supply it with a complete frame
410 */
411
412static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
413			       struct msghdr *msg, size_t len)
414{
415	struct sock *sk = sock->sk;
416	struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
417	struct sk_buff *skb;
418	struct net_device *dev;
419	__be16 proto = 0;
420	int err;
421
422	/*
423	 *	Get and verify the address.
424	 */
425
426	if (saddr) {
427		if (msg->msg_namelen < sizeof(struct sockaddr))
428			return -EINVAL;
429		if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
430			proto = saddr->spkt_protocol;
431	} else
432		return -ENOTCONN;	/* SOCK_PACKET must be sent giving an address */
433
434	/*
435	 *	Find the device first to size check it
436	 */
437
438	saddr->spkt_device[13] = 0;
439	dev = dev_get_by_name(sock_net(sk), saddr->spkt_device);
440	err = -ENODEV;
441	if (dev == NULL)
442		goto out_unlock;
443
444	err = -ENETDOWN;
445	if (!(dev->flags & IFF_UP))
446		goto out_unlock;
447
448	/*
449	 * You may not queue a frame bigger than the mtu. This is the lowest level
450	 * raw protocol and you must do your own fragmentation at this level.
451	 */
452
453	err = -EMSGSIZE;
454	if (len > dev->mtu + dev->hard_header_len)
455		goto out_unlock;
456
457	err = -ENOBUFS;
458	skb = sock_wmalloc(sk, len + LL_RESERVED_SPACE(dev), 0, GFP_KERNEL);
459
460	/*
461	 * If the write buffer is full, then tough. At this level the user
462	 * gets to deal with the problem - do your own algorithmic backoffs.
463	 * That's far more flexible.
464	 */
465
466	if (skb == NULL)
467		goto out_unlock;
468
469	/*
470	 *	Fill it in
471	 */
472
473	/* FIXME: Save some space for broken drivers that write a
474	 * hard header at transmission time by themselves. PPP is the
475	 * notable one here. This should really be fixed at the driver level.
476	 */
477	skb_reserve(skb, LL_RESERVED_SPACE(dev));
478	skb_reset_network_header(skb);
479
480	/* Try to align data part correctly */
481	if (dev->header_ops) {
482		skb->data -= dev->hard_header_len;
483		skb->tail -= dev->hard_header_len;
484		if (len < dev->hard_header_len)
485			skb_reset_network_header(skb);
486	}
487
488	/* Returns -EFAULT on error */
489	err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
490	skb->protocol = proto;
491	skb->dev = dev;
492	skb->priority = sk->sk_priority;
493	skb->mark = sk->sk_mark;
494	if (err)
495		goto out_free;
496
497	/*
498	 *	Now send it
499	 */
500
501	dev_queue_xmit(skb);
502	dev_put(dev);
503	return len;
504
505out_free:
506	kfree_skb(skb);
507out_unlock:
508	if (dev)
509		dev_put(dev);
510	return err;
511}
512
513static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
514				      unsigned int res)
515{
516	struct sk_filter *filter;
517
518	rcu_read_lock_bh();
519	filter = rcu_dereference(sk->sk_filter);
520	if (filter != NULL)
521		res = sk_run_filter(skb, filter->insns, filter->len);
522	rcu_read_unlock_bh();
523
524	return res;
525}
526
527/*
528 * If we've lost frames since the last time we queued one to the
529 * sk_receive_queue, we need to record it here.
530 * This must be called under the protection of the socket lock
531 * to prevent racing with other softirqs and user space
532 */
533static inline void record_packet_gap(struct sk_buff *skb,
534					struct packet_sock *po)
535{
536	/*
537	 * We overload the mark field here, since we're about
538	 * to enqueue to a receive queue and nobody else will
539	 * use this field at this point.
540	 */
541	skb->mark = po->stats.tp_gap;
542	po->stats.tp_gap = 0;
543	return;
544
545}
546
547static inline __u32 check_packet_gap(struct sk_buff *skb)
548{
549	return skb->mark;
550}
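
/*
 * The gap recorded above travels with the skb (in skb->mark) until
 * packet_recvmsg() hands it to user space as a PACKET_GAPDATA control
 * message, so the reader can tell how many frames were dropped just
 * before this one.
 */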
551
552/*
553   This function does lazy skb cloning in the hope that most packets
554   are discarded by BPF.
555
556   Note the tricky part: we DO mangle a shared skb! skb->data, skb->len
557   and skb->cb are mangled. It works because (and until) packets
558   falling here are owned by the current CPU. Output packets are cloned
559   by dev_queue_xmit_nit(), input packets are processed by net_bh
560   sequentially, so if we return the skb to its original state on exit,
561   we will not harm anyone.
562 */
563
564static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
565		      struct packet_type *pt, struct net_device *orig_dev)
566{
567	struct sock *sk;
568	struct sockaddr_ll *sll;
569	struct packet_sock *po;
570	u8 *skb_head = skb->data;
571	int skb_len = skb->len;
572	unsigned int snaplen, res;
573
574	if (skb->pkt_type == PACKET_LOOPBACK)
575		goto drop;
576
577	sk = pt->af_packet_priv;
578	po = pkt_sk(sk);
579
580	if (dev_net(dev) != sock_net(sk))
581		goto drop;
582
583	skb->dev = dev;
584
585	if (dev->header_ops) {
586		/* The device has an explicit notion of ll header,
587		   exported to higher levels.
588
589		   Otherwise, the device hides the details of its frame
590		   structure, so that the corresponding packet head is
591		   never delivered to the user.
592		 */
593		if (sk->sk_type != SOCK_DGRAM)
594			skb_push(skb, skb->data - skb_mac_header(skb));
595		else if (skb->pkt_type == PACKET_OUTGOING) {
596			/* Special case: outgoing packets have ll header at head */
597			skb_pull(skb, skb_network_offset(skb));
598		}
599	}
600
601	snaplen = skb->len;
602
603	res = run_filter(skb, sk, snaplen);
604	if (!res)
605		goto drop_n_restore;
606	if (snaplen > res)
607		snaplen = res;
608
609	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
610	    (unsigned)sk->sk_rcvbuf)
611		goto drop_n_acct;
612
613	if (skb_shared(skb)) {
614		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
615		if (nskb == NULL)
616			goto drop_n_acct;
617
618		if (skb_head != skb->data) {
619			skb->data = skb_head;
620			skb->len = skb_len;
621		}
622		kfree_skb(skb);
623		skb = nskb;
624	}
625
626	BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
627		     sizeof(skb->cb));
628
629	sll = &PACKET_SKB_CB(skb)->sa.ll;
630	sll->sll_family = AF_PACKET;
631	sll->sll_hatype = dev->type;
632	sll->sll_protocol = skb->protocol;
633	sll->sll_pkttype = skb->pkt_type;
634	if (unlikely(po->origdev))
635		sll->sll_ifindex = orig_dev->ifindex;
636	else
637		sll->sll_ifindex = dev->ifindex;
638
639	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
640
641	PACKET_SKB_CB(skb)->origlen = skb->len;
642
643	if (pskb_trim(skb, snaplen))
644		goto drop_n_acct;
645
646	skb_set_owner_r(skb, sk);
647	skb->dev = NULL;
648	skb_dst_drop(skb);
649
650	/* drop conntrack reference */
651	nf_reset(skb);
652
653	spin_lock(&sk->sk_receive_queue.lock);
654	po->stats.tp_packets++;
655	record_packet_gap(skb, po);
656	__skb_queue_tail(&sk->sk_receive_queue, skb);
657	spin_unlock(&sk->sk_receive_queue.lock);
658	sk->sk_data_ready(sk, skb->len);
659	return 0;
660
661drop_n_acct:
662	spin_lock(&sk->sk_receive_queue.lock);
663	po->stats.tp_drops++;
664	po->stats.tp_gap++;
665	spin_unlock(&sk->sk_receive_queue.lock);
666
667drop_n_restore:
668	if (skb_head != skb->data && skb_shared(skb)) {
669		skb->data = skb_head;
670		skb->len = skb_len;
671	}
672drop:
673	consume_skb(skb);
674	return 0;
675}
676
677#ifdef CONFIG_PACKET_MMAP
678static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
679		       struct packet_type *pt, struct net_device *orig_dev)
680{
681	struct sock *sk;
682	struct packet_sock *po;
683	struct sockaddr_ll *sll;
684	union {
685		struct tpacket_hdr *h1;
686		struct tpacket2_hdr *h2;
687		void *raw;
688	} h;
689	u8 *skb_head = skb->data;
690	int skb_len = skb->len;
691	unsigned int snaplen, res;
692	unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
693	unsigned short macoff, netoff, hdrlen;
694	struct sk_buff *copy_skb = NULL;
695	struct timeval tv;
696	struct timespec ts;
697
698	if (skb->pkt_type == PACKET_LOOPBACK)
699		goto drop;
700
701	sk = pt->af_packet_priv;
702	po = pkt_sk(sk);
703
704	if (dev_net(dev) != sock_net(sk))
705		goto drop;
706
707	if (dev->header_ops) {
708		if (sk->sk_type != SOCK_DGRAM)
709			skb_push(skb, skb->data - skb_mac_header(skb));
710		else if (skb->pkt_type == PACKET_OUTGOING) {
711			/* Special case: outgoing packets have ll header at head */
712			skb_pull(skb, skb_network_offset(skb));
713		}
714	}
715
716	if (skb->ip_summed == CHECKSUM_PARTIAL)
717		status |= TP_STATUS_CSUMNOTREADY;
718
719	snaplen = skb->len;
720
721	res = run_filter(skb, sk, snaplen);
722	if (!res)
723		goto drop_n_restore;
724	if (snaplen > res)
725		snaplen = res;
726
727	if (sk->sk_type == SOCK_DGRAM) {
728		macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
729				  po->tp_reserve;
730	} else {
731		unsigned maclen = skb_network_offset(skb);
732		netoff = TPACKET_ALIGN(po->tp_hdrlen +
733				       (maclen < 16 ? 16 : maclen)) +
734			po->tp_reserve;
735		macoff = netoff - maclen;
736	}
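
	/*
	 * macoff/netoff are where the mac and network headers will start
	 * inside the ring frame: the tpacket header, tp_reserve padding and
	 * (for SOCK_RAW) room for the link-layer header come first, the
	 * captured data follows.
	 */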
737
738	if (macoff + snaplen > po->rx_ring.frame_size) {
739		if (po->copy_thresh &&
740		    atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
741		    (unsigned)sk->sk_rcvbuf) {
742			if (skb_shared(skb)) {
743				copy_skb = skb_clone(skb, GFP_ATOMIC);
744			} else {
745				copy_skb = skb_get(skb);
746				skb_head = skb->data;
747			}
748			if (copy_skb)
749				skb_set_owner_r(copy_skb, sk);
750		}
751		snaplen = po->rx_ring.frame_size - macoff;
752		if ((int)snaplen < 0)
753			snaplen = 0;
754	}
755
756	spin_lock(&sk->sk_receive_queue.lock);
757	h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL);
758	if (!h.raw)
759		goto ring_is_full;
760	packet_increment_head(&po->rx_ring);
761	po->stats.tp_packets++;
762	if (copy_skb) {
763		status |= TP_STATUS_COPY;
764		__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
765	}
766	if (!po->stats.tp_drops)
767		status &= ~TP_STATUS_LOSING;
768	spin_unlock(&sk->sk_receive_queue.lock);
769
770	skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
771
772	switch (po->tp_version) {
773	case TPACKET_V1:
774		h.h1->tp_len = skb->len;
775		h.h1->tp_snaplen = snaplen;
776		h.h1->tp_mac = macoff;
777		h.h1->tp_net = netoff;
778		if (skb->tstamp.tv64)
779			tv = ktime_to_timeval(skb->tstamp);
780		else
781			do_gettimeofday(&tv);
782		h.h1->tp_sec = tv.tv_sec;
783		h.h1->tp_usec = tv.tv_usec;
784		hdrlen = sizeof(*h.h1);
785		break;
786	case TPACKET_V2:
787		h.h2->tp_len = skb->len;
788		h.h2->tp_snaplen = snaplen;
789		h.h2->tp_mac = macoff;
790		h.h2->tp_net = netoff;
791		if (skb->tstamp.tv64)
792			ts = ktime_to_timespec(skb->tstamp);
793		else
794			getnstimeofday(&ts);
795		h.h2->tp_sec = ts.tv_sec;
796		h.h2->tp_nsec = ts.tv_nsec;
797		h.h2->tp_vlan_tci = skb->vlan_tci;
798		hdrlen = sizeof(*h.h2);
799		break;
800	default:
801		BUG();
802	}
803
804	sll = h.raw + TPACKET_ALIGN(hdrlen);
805	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
806	sll->sll_family = AF_PACKET;
807	sll->sll_hatype = dev->type;
808	sll->sll_protocol = skb->protocol;
809	sll->sll_pkttype = skb->pkt_type;
810	if (unlikely(po->origdev))
811		sll->sll_ifindex = orig_dev->ifindex;
812	else
813		sll->sll_ifindex = dev->ifindex;
814
815	__packet_set_status(po, h.raw, status);
816	smp_mb();
817	{
818		struct page *p_start, *p_end;
819		u8 *h_end = h.raw + macoff + snaplen - 1;
820
821		p_start = virt_to_page(h.raw);
822		p_end = virt_to_page(h_end);
823		while (p_start <= p_end) {
824			flush_dcache_page(p_start);
825			p_start++;
826		}
827	}
828
829	sk->sk_data_ready(sk, 0);
830
831drop_n_restore:
832	if (skb_head != skb->data && skb_shared(skb)) {
833		skb->data = skb_head;
834		skb->len = skb_len;
835	}
836drop:
837	kfree_skb(skb);
838	return 0;
839
840ring_is_full:
841	po->stats.tp_drops++;
842	po->stats.tp_gap++;
843	spin_unlock(&sk->sk_receive_queue.lock);
844
845	sk->sk_data_ready(sk, 0);
846	kfree_skb(copy_skb);
847	goto drop_n_restore;
848}
849
850static void tpacket_destruct_skb(struct sk_buff *skb)
851{
852	struct packet_sock *po = pkt_sk(skb->sk);
853	void *ph;
854
855	BUG_ON(skb == NULL);
856
857	if (likely(po->tx_ring.pg_vec)) {
858		ph = skb_shinfo(skb)->destructor_arg;
859		BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
860		BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
861		atomic_dec(&po->tx_ring.pending);
862		__packet_set_status(po, ph, TP_STATUS_AVAILABLE);
863	}
864
865	sock_wfree(skb);
866}
867
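/*
 * Build an skb for one TX_RING frame.  Beyond the (optional) hard header
 * the payload is not copied: the data is attached page by page as skb
 * frags pointing straight into the mmapped ring, which is why the frame
 * must stay TP_STATUS_SENDING until tpacket_destruct_skb() runs.
 */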
868static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
869		void *frame, struct net_device *dev, int size_max,
870		__be16 proto, unsigned char *addr)
871{
872	union {
873		struct tpacket_hdr *h1;
874		struct tpacket2_hdr *h2;
875		void *raw;
876	} ph;
877	int to_write, offset, len, tp_len, nr_frags, len_max;
878	struct socket *sock = po->sk.sk_socket;
879	struct page *page;
880	void *data;
881	int err;
882
883	ph.raw = frame;
884
885	skb->protocol = proto;
886	skb->dev = dev;
887	skb->priority = po->sk.sk_priority;
888	skb->mark = po->sk.sk_mark;
889	skb_shinfo(skb)->destructor_arg = ph.raw;
890
891	switch (po->tp_version) {
892	case TPACKET_V2:
893		tp_len = ph.h2->tp_len;
894		break;
895	default:
896		tp_len = ph.h1->tp_len;
897		break;
898	}
899	if (unlikely(tp_len > size_max)) {
900		pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
901		return -EMSGSIZE;
902	}
903
904	skb_reserve(skb, LL_RESERVED_SPACE(dev));
905	skb_reset_network_header(skb);
906
907	data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
908	to_write = tp_len;
909
910	if (sock->type == SOCK_DGRAM) {
911		err = dev_hard_header(skb, dev, ntohs(proto), addr,
912				NULL, tp_len);
913		if (unlikely(err < 0))
914			return -EINVAL;
915	} else if (dev->hard_header_len) {
916		/* net device doesn't like empty head */
917		if (unlikely(tp_len <= dev->hard_header_len)) {
918			pr_err("packet size is too short (%d < %d)\n",
919			       tp_len, dev->hard_header_len);
920			return -EINVAL;
921		}
922
923		skb_push(skb, dev->hard_header_len);
924		err = skb_store_bits(skb, 0, data,
925				dev->hard_header_len);
926		if (unlikely(err))
927			return err;
928
929		data += dev->hard_header_len;
930		to_write -= dev->hard_header_len;
931	}
932
933	err = -EFAULT;
934	page = virt_to_page(data);
935	offset = offset_in_page(data);
936	len_max = PAGE_SIZE - offset;
937	len = ((to_write > len_max) ? len_max : to_write);
938
939	skb->data_len = to_write;
940	skb->len += to_write;
941	skb->truesize += to_write;
942	atomic_add(to_write, &po->sk.sk_wmem_alloc);
943
944	while (likely(to_write)) {
945		nr_frags = skb_shinfo(skb)->nr_frags;
946
947		if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
948			pr_err("Packet exceeds the number of skb frags (%lu)\n",
949			       MAX_SKB_FRAGS);
950			return -EFAULT;
951		}
952
953		flush_dcache_page(page);
954		get_page(page);
955		skb_fill_page_desc(skb,
956				nr_frags,
957				page++, offset, len);
958		to_write -= len;
959		offset = 0;
960		len_max = PAGE_SIZE;
961		len = ((to_write > len_max) ? len_max : to_write);
962	}
963
964	return tp_len;
965}
966
967static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
968{
969	struct socket *sock;
970	struct sk_buff *skb;
971	struct net_device *dev;
972	__be16 proto;
973	int ifindex, err, reserve = 0;
974	void *ph;
975	struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
976	int tp_len, size_max;
977	unsigned char *addr;
978	int len_sum = 0;
979	int status = 0;
980
981	sock = po->sk.sk_socket;
982
983	mutex_lock(&po->pg_vec_lock);
984
985	err = -EBUSY;
986	if (saddr == NULL) {
987		ifindex	= po->ifindex;
988		proto	= po->num;
989		addr	= NULL;
990	} else {
991		err = -EINVAL;
992		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
993			goto out;
994		if (msg->msg_namelen < (saddr->sll_halen
995					+ offsetof(struct sockaddr_ll,
996						sll_addr)))
997			goto out;
998		ifindex	= saddr->sll_ifindex;
999		proto	= saddr->sll_protocol;
1000		addr	= saddr->sll_addr;
1001	}
1002
1003	dev = dev_get_by_index(sock_net(&po->sk), ifindex);
1004	err = -ENXIO;
1005	if (unlikely(dev == NULL))
1006		goto out;
1007
1008	reserve = dev->hard_header_len;
1009
1010	err = -ENETDOWN;
1011	if (unlikely(!(dev->flags & IFF_UP)))
1012		goto out_put;
1013
1014	size_max = po->tx_ring.frame_size
1015		- sizeof(struct skb_shared_info)
1016		- po->tp_hdrlen
1017		- LL_ALLOCATED_SPACE(dev)
1018		- sizeof(struct sockaddr_ll);
1019
1020	if (size_max > dev->mtu + reserve)
1021		size_max = dev->mtu + reserve;
1022
1023	do {
1024		ph = packet_current_frame(po, &po->tx_ring,
1025				TP_STATUS_SEND_REQUEST);
1026
1027		if (unlikely(ph == NULL)) {
1028			schedule();
1029			continue;
1030		}
1031
1032		status = TP_STATUS_SEND_REQUEST;
1033		skb = sock_alloc_send_skb(&po->sk,
1034				LL_ALLOCATED_SPACE(dev)
1035				+ sizeof(struct sockaddr_ll),
1036				0, &err);
1037
1038		if (unlikely(skb == NULL))
1039			goto out_status;
1040
1041		tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
1042				addr);
1043
1044		if (unlikely(tp_len < 0)) {
1045			if (po->tp_loss) {
1046				__packet_set_status(po, ph,
1047						TP_STATUS_AVAILABLE);
1048				packet_increment_head(&po->tx_ring);
1049				kfree_skb(skb);
1050				continue;
1051			} else {
1052				status = TP_STATUS_WRONG_FORMAT;
1053				err = tp_len;
1054				goto out_status;
1055			}
1056		}
1057
1058		skb->destructor = tpacket_destruct_skb;
1059		__packet_set_status(po, ph, TP_STATUS_SENDING);
1060		atomic_inc(&po->tx_ring.pending);
1061
1062		status = TP_STATUS_SEND_REQUEST;
1063		err = dev_queue_xmit(skb);
1064		if (unlikely(err > 0 && (err = net_xmit_errno(err)) != 0))
1065			goto out_xmit;
1066		packet_increment_head(&po->tx_ring);
1067		len_sum += tp_len;
1068	} while (likely((ph != NULL) || ((!(msg->msg_flags & MSG_DONTWAIT))
1069					&& (atomic_read(&po->tx_ring.pending))))
1070	      );
1071
1072	err = len_sum;
1073	goto out_put;
1074
1075out_xmit:
1076	skb->destructor = sock_wfree;
1077	atomic_dec(&po->tx_ring.pending);
1078out_status:
1079	__packet_set_status(po, ph, status);
1080	kfree_skb(skb);
1081out_put:
1082	dev_put(dev);
1083out:
1084	mutex_unlock(&po->pg_vec_lock);
1085	return err;
1086}
1087#endif
1088
1089static int packet_snd(struct socket *sock,
1090			  struct msghdr *msg, size_t len)
1091{
1092	struct sock *sk = sock->sk;
1093	struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
1094	struct sk_buff *skb;
1095	struct net_device *dev;
1096	__be16 proto;
1097	unsigned char *addr;
1098	int ifindex, err, reserve = 0;
1099
1100	/*
1101	 *	Get and verify the address.
1102	 */
1103
1104	if (saddr == NULL) {
1105		struct packet_sock *po = pkt_sk(sk);
1106
1107		ifindex	= po->ifindex;
1108		proto	= po->num;
1109		addr	= NULL;
1110	} else {
1111		err = -EINVAL;
1112		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
1113			goto out;
1114		if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
1115			goto out;
1116		ifindex	= saddr->sll_ifindex;
1117		proto	= saddr->sll_protocol;
1118		addr	= saddr->sll_addr;
1119	}
1120
1121
1122	dev = dev_get_by_index(sock_net(sk), ifindex);
1123	err = -ENXIO;
1124	if (dev == NULL)
1125		goto out_unlock;
1126	if (sock->type == SOCK_RAW)
1127		reserve = dev->hard_header_len;
1128
1129	err = -ENETDOWN;
1130	if (!(dev->flags & IFF_UP))
1131		goto out_unlock;
1132
1133	err = -EMSGSIZE;
1134	if (len > dev->mtu+reserve)
1135		goto out_unlock;
1136
1137	skb = sock_alloc_send_skb(sk, len + LL_ALLOCATED_SPACE(dev),
1138				msg->msg_flags & MSG_DONTWAIT, &err);
1139	if (skb == NULL)
1140		goto out_unlock;
1141
1142	skb_reserve(skb, LL_RESERVED_SPACE(dev));
1143	skb_reset_network_header(skb);
1144
1145	err = -EINVAL;
1146	if (sock->type == SOCK_DGRAM &&
1147	    dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len) < 0)
1148		goto out_free;
1149
1150	/* Returns -EFAULT on error */
1151	err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
1152	if (err)
1153		goto out_free;
1154
1155	skb->protocol = proto;
1156	skb->dev = dev;
1157	skb->priority = sk->sk_priority;
1158	skb->mark = sk->sk_mark;
1159
1160	/*
1161	 *	Now send it
1162	 */
1163
1164	err = dev_queue_xmit(skb);
1165	if (err > 0 && (err = net_xmit_errno(err)) != 0)
1166		goto out_unlock;
1167
1168	dev_put(dev);
1169
1170	return len;
1171
1172out_free:
1173	kfree_skb(skb);
1174out_unlock:
1175	if (dev)
1176		dev_put(dev);
1177out:
1178	return err;
1179}
1180
1181static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
1182		struct msghdr *msg, size_t len)
1183{
1184#ifdef CONFIG_PACKET_MMAP
1185	struct sock *sk = sock->sk;
1186	struct packet_sock *po = pkt_sk(sk);
1187	if (po->tx_ring.pg_vec)
1188		return tpacket_snd(po, msg);
1189	else
1190#endif
1191		return packet_snd(sock, msg, len);
1192}
1193
1194/*
1195 *	Close a PACKET socket. This is fairly simple. We immediately go
1196 *	to 'closed' state and remove our protocol entry in the device list.
1197 */
1198
1199static int packet_release(struct socket *sock)
1200{
1201	struct sock *sk = sock->sk;
1202	struct packet_sock *po;
1203	struct net *net;
1204#ifdef CONFIG_PACKET_MMAP
1205	struct tpacket_req req;
1206#endif
1207
1208	if (!sk)
1209		return 0;
1210
1211	net = sock_net(sk);
1212	po = pkt_sk(sk);
1213
1214	write_lock_bh(&net->packet.sklist_lock);
1215	sk_del_node_init(sk);
1216	sock_prot_inuse_add(net, sk->sk_prot, -1);
1217	write_unlock_bh(&net->packet.sklist_lock);
1218
1219	/*
1220	 *	Unhook packet receive handler.
1221	 */
1222
1223	if (po->running) {
1224		/*
1225		 *	Remove the protocol hook
1226		 */
1227		dev_remove_pack(&po->prot_hook);
1228		po->running = 0;
1229		po->num = 0;
1230		__sock_put(sk);
1231	}
1232
1233	packet_flush_mclist(sk);
1234
1235#ifdef CONFIG_PACKET_MMAP
1236	memset(&req, 0, sizeof(req));
1237
1238	if (po->rx_ring.pg_vec)
1239		packet_set_ring(sk, &req, 1, 0);
1240
1241	if (po->tx_ring.pg_vec)
1242		packet_set_ring(sk, &req, 1, 1);
1243#endif
1244
1245	/*
1246	 *	Now the socket is dead. No more input will appear.
1247	 */
1248
1249	sock_orphan(sk);
1250	sock->sk = NULL;
1251
1252	/* Purge queues */
1253
1254	skb_queue_purge(&sk->sk_receive_queue);
1255	sk_refcnt_debug_release(sk);
1256
1257	sock_put(sk);
1258	return 0;
1259}
1260
1261/*
1262 *	Attach a packet hook.
1263 */
1264
1265static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
1266{
1267	struct packet_sock *po = pkt_sk(sk);
1268	/*
1269	 *	Detach an existing hook if present.
1270	 */
1271
1272	lock_sock(sk);
1273
1274	spin_lock(&po->bind_lock);
1275	if (po->running) {
1276		__sock_put(sk);
1277		po->running = 0;
1278		po->num = 0;
1279		spin_unlock(&po->bind_lock);
1280		dev_remove_pack(&po->prot_hook);
1281		spin_lock(&po->bind_lock);
1282	}
1283
1284	po->num = protocol;
1285	po->prot_hook.type = protocol;
1286	po->prot_hook.dev = dev;
1287
1288	po->ifindex = dev ? dev->ifindex : 0;
1289
1290	if (protocol == 0)
1291		goto out_unlock;
1292
1293	if (!dev || (dev->flags & IFF_UP)) {
1294		dev_add_pack(&po->prot_hook);
1295		sock_hold(sk);
1296		po->running = 1;
1297	} else {
1298		sk->sk_err = ENETDOWN;
1299		if (!sock_flag(sk, SOCK_DEAD))
1300			sk->sk_error_report(sk);
1301	}
1302
1303out_unlock:
1304	spin_unlock(&po->bind_lock);
1305	release_sock(sk);
1306	return 0;
1307}
1308
1309/*
1310 *	Bind a packet socket to a device
1311 */
1312
1313static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
1314			    int addr_len)
1315{
1316	struct sock *sk = sock->sk;
1317	char name[15];
1318	struct net_device *dev;
1319	int err = -ENODEV;
1320
1321	/*
1322	 *	Check legality
1323	 */
1324
1325	if (addr_len != sizeof(struct sockaddr))
1326		return -EINVAL;
1327	strlcpy(name, uaddr->sa_data, sizeof(name));
1328
1329	dev = dev_get_by_name(sock_net(sk), name);
1330	if (dev) {
1331		err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
1332		dev_put(dev);
1333	}
1334	return err;
1335}
1336
1337static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1338{
1339	struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
1340	struct sock *sk = sock->sk;
1341	struct net_device *dev = NULL;
1342	int err;
1343
1344
1345	/*
1346	 *	Check legality
1347	 */
1348
1349	if (addr_len < sizeof(struct sockaddr_ll))
1350		return -EINVAL;
1351	if (sll->sll_family != AF_PACKET)
1352		return -EINVAL;
1353
1354	if (sll->sll_ifindex) {
1355		err = -ENODEV;
1356		dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1357		if (dev == NULL)
1358			goto out;
1359	}
1360	err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1361	if (dev)
1362		dev_put(dev);
1363
1364out:
1365	return err;
1366}
1367
1368static struct proto packet_proto = {
1369	.name	  = "PACKET",
1370	.owner	  = THIS_MODULE,
1371	.obj_size = sizeof(struct packet_sock),
1372};
1373
1374/*
1375 *	Create a packet of type SOCK_PACKET.
1376 */
1377
1378static int packet_create(struct net *net, struct socket *sock, int protocol)
1379{
1380	struct sock *sk;
1381	struct packet_sock *po;
1382	__be16 proto = (__force __be16)protocol; /* weird, but documented */
1383	int err;
1384
1385	if (!capable(CAP_NET_RAW))
1386		return -EPERM;
1387	if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
1388	    sock->type != SOCK_PACKET)
1389		return -ESOCKTNOSUPPORT;
1390
1391	sock->state = SS_UNCONNECTED;
1392
1393	err = -ENOBUFS;
1394	sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
1395	if (sk == NULL)
1396		goto out;
1397
1398	sock->ops = &packet_ops;
1399	if (sock->type == SOCK_PACKET)
1400		sock->ops = &packet_ops_spkt;
1401
1402	sock_init_data(sock, sk);
1403
1404	po = pkt_sk(sk);
1405	sk->sk_family = PF_PACKET;
1406	po->num = proto;
1407
1408	sk->sk_destruct = packet_sock_destruct;
1409	sk_refcnt_debug_inc(sk);
1410
1411	/*
1412	 *	Attach a protocol block
1413	 */
1414
1415	spin_lock_init(&po->bind_lock);
1416	mutex_init(&po->pg_vec_lock);
1417	po->prot_hook.func = packet_rcv;
1418
1419	if (sock->type == SOCK_PACKET)
1420		po->prot_hook.func = packet_rcv_spkt;
1421
1422	po->prot_hook.af_packet_priv = sk;
1423
1424	if (proto) {
1425		po->prot_hook.type = proto;
1426		dev_add_pack(&po->prot_hook);
1427		sock_hold(sk);
1428		po->running = 1;
1429	}
1430
1431	write_lock_bh(&net->packet.sklist_lock);
1432	sk_add_node(sk, &net->packet.sklist);
1433	sock_prot_inuse_add(net, &packet_proto, 1);
1434	write_unlock_bh(&net->packet.sklist_lock);
1435	return 0;
1436out:
1437	return err;
1438}
1439
1440/*
1441 *	Pull a packet from our receive queue and hand it to the user.
1442 *	If necessary we block.
1443 */
1444
1445static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
1446			  struct msghdr *msg, size_t len, int flags)
1447{
1448	struct sock *sk = sock->sk;
1449	struct sk_buff *skb;
1450	int copied, err;
1451	struct sockaddr_ll *sll;
1452	__u32 gap;
1453
1454	err = -EINVAL;
1455	if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
1456		goto out;
1457
1458#if 0
1459	/* What error should we return now? EUNATTACH? */
1460	if (pkt_sk(sk)->ifindex < 0)
1461		return -ENODEV;
1462#endif
1463
1464	/*
1465	 *	Call the generic datagram receiver. This handles all sorts
1466	 *	of horrible races and re-entrancy so we can forget about it
1467	 *	in the protocol layers.
1468	 *
1469	 *	Now it will return ENETDOWN if the device has just gone down,
1470	 *	but then it will block.
1471	 */
1472
1473	skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1474
1475	/*
1476	 *	An error occurred so return it. Because skb_recv_datagram()
1477	 *	handles the blocking, we don't need to see or worry about
1478	 *	blocking retries.
1479	 */
1480
1481	if (skb == NULL)
1482		goto out;
1483
1484	/*
1485	 *	If the address length field is there to be filled in, we fill
1486	 *	it in now.
1487	 */
1488
1489	sll = &PACKET_SKB_CB(skb)->sa.ll;
1490	if (sock->type == SOCK_PACKET)
1491		msg->msg_namelen = sizeof(struct sockaddr_pkt);
1492	else
1493		msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
1494
1495	/*
1496	 *	You lose any data beyond the buffer you gave. If it worries a
1497	 *	user program they can ask the device for its MTU anyway.
1498	 */
1499
1500	copied = skb->len;
1501	if (copied > len) {
1502		copied = len;
1503		msg->msg_flags |= MSG_TRUNC;
1504	}
1505
1506	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1507	if (err)
1508		goto out_free;
1509
1510	sock_recv_timestamp(msg, sk, skb);
1511
1512	if (msg->msg_name)
1513		memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
1514		       msg->msg_namelen);
1515
1516	if (pkt_sk(sk)->auxdata) {
1517		struct tpacket_auxdata aux;
1518
1519		aux.tp_status = TP_STATUS_USER;
1520		if (skb->ip_summed == CHECKSUM_PARTIAL)
1521			aux.tp_status |= TP_STATUS_CSUMNOTREADY;
1522		aux.tp_len = PACKET_SKB_CB(skb)->origlen;
1523		aux.tp_snaplen = skb->len;
1524		aux.tp_mac = 0;
1525		aux.tp_net = skb_network_offset(skb);
1526		aux.tp_vlan_tci = skb->vlan_tci;
1527
1528		put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
1529	}
1530
1531	gap = check_packet_gap(skb);
1532	if (gap)
1533		put_cmsg(msg, SOL_PACKET, PACKET_GAPDATA, sizeof(__u32), &gap);
1534
1535	/*
1536	 *	Free or return the buffer as appropriate. Again this
1537	 *	hides all the races and re-entrancy issues from us.
1538	 */
1539	err = (flags&MSG_TRUNC) ? skb->len : copied;
1540
1541out_free:
1542	skb_free_datagram(sk, skb);
1543out:
1544	return err;
1545}
1546
1547static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1548			       int *uaddr_len, int peer)
1549{
1550	struct net_device *dev;
1551	struct sock *sk	= sock->sk;
1552
1553	if (peer)
1554		return -EOPNOTSUPP;
1555
1556	uaddr->sa_family = AF_PACKET;
1557	dev = dev_get_by_index(sock_net(sk), pkt_sk(sk)->ifindex);
1558	if (dev) {
1559		strlcpy(uaddr->sa_data, dev->name, 15);
1560		dev_put(dev);
1561	} else
1562		memset(uaddr->sa_data, 0, 14);
1563	*uaddr_len = sizeof(*uaddr);
1564
1565	return 0;
1566}
1567
1568static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1569			  int *uaddr_len, int peer)
1570{
1571	struct net_device *dev;
1572	struct sock *sk = sock->sk;
1573	struct packet_sock *po = pkt_sk(sk);
1574	struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
1575
1576	if (peer)
1577		return -EOPNOTSUPP;
1578
1579	sll->sll_family = AF_PACKET;
1580	sll->sll_ifindex = po->ifindex;
1581	sll->sll_protocol = po->num;
1582	dev = dev_get_by_index(sock_net(sk), po->ifindex);
1583	if (dev) {
1584		sll->sll_hatype = dev->type;
1585		sll->sll_halen = dev->addr_len;
1586		memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1587		dev_put(dev);
1588	} else {
1589		sll->sll_hatype = 0;	/* Bad: we have no ARPHRD_UNSPEC */
1590		sll->sll_halen = 0;
1591	}
1592	*uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1593
1594	return 0;
1595}
1596
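/*
 * Apply (what > 0) or revert (what < 0) one membership entry on a device:
 * a multicast or unicast address, promiscuous mode, or all-multicast mode.
 */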
1597static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
1598			 int what)
1599{
1600	switch (i->type) {
1601	case PACKET_MR_MULTICAST:
1602		if (what > 0)
1603			return dev_mc_add(dev, i->addr, i->alen, 0);
1604		else
1605			return dev_mc_delete(dev, i->addr, i->alen, 0);
1606		break;
1607	case PACKET_MR_PROMISC:
1608		return dev_set_promiscuity(dev, what);
1609		break;
1610	case PACKET_MR_ALLMULTI:
1611		return dev_set_allmulti(dev, what);
1612		break;
1613	case PACKET_MR_UNICAST:
1614		if (what > 0)
1615			return dev_unicast_add(dev, i->addr);
1616		else
1617			return dev_unicast_delete(dev, i->addr);
1618		break;
1619	default:
1620		break;
1621	}
1622	return 0;
1623}
1624
1625static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1626{
1627	for ( ; i; i = i->next) {
1628		if (i->ifindex == dev->ifindex)
1629			packet_dev_mc(dev, i, what);
1630	}
1631}
1632
1633static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1634{
1635	struct packet_sock *po = pkt_sk(sk);
1636	struct packet_mclist *ml, *i;
1637	struct net_device *dev;
1638	int err;
1639
1640	rtnl_lock();
1641
1642	err = -ENODEV;
1643	dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1644	if (!dev)
1645		goto done;
1646
1647	err = -EINVAL;
1648	if (mreq->mr_alen > dev->addr_len)
1649		goto done;
1650
1651	err = -ENOBUFS;
1652	i = kmalloc(sizeof(*i), GFP_KERNEL);
1653	if (i == NULL)
1654		goto done;
1655
1656	err = 0;
1657	for (ml = po->mclist; ml; ml = ml->next) {
1658		if (ml->ifindex == mreq->mr_ifindex &&
1659		    ml->type == mreq->mr_type &&
1660		    ml->alen == mreq->mr_alen &&
1661		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1662			ml->count++;
1663			/* Free the new element ... */
1664			kfree(i);
1665			goto done;
1666		}
1667	}
1668
1669	i->type = mreq->mr_type;
1670	i->ifindex = mreq->mr_ifindex;
1671	i->alen = mreq->mr_alen;
1672	memcpy(i->addr, mreq->mr_address, i->alen);
1673	i->count = 1;
1674	i->next = po->mclist;
1675	po->mclist = i;
1676	err = packet_dev_mc(dev, i, 1);
1677	if (err) {
1678		po->mclist = i->next;
1679		kfree(i);
1680	}
1681
1682done:
1683	rtnl_unlock();
1684	return err;
1685}
1686
1687static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1688{
1689	struct packet_mclist *ml, **mlp;
1690
1691	rtnl_lock();
1692
1693	for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
1694		if (ml->ifindex == mreq->mr_ifindex &&
1695		    ml->type == mreq->mr_type &&
1696		    ml->alen == mreq->mr_alen &&
1697		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1698			if (--ml->count == 0) {
1699				struct net_device *dev;
1700				*mlp = ml->next;
1701				dev = dev_get_by_index(sock_net(sk), ml->ifindex);
1702				if (dev) {
1703					packet_dev_mc(dev, ml, -1);
1704					dev_put(dev);
1705				}
1706				kfree(ml);
1707			}
1708			rtnl_unlock();
1709			return 0;
1710		}
1711	}
1712	rtnl_unlock();
1713	return -EADDRNOTAVAIL;
1714}
1715
1716static void packet_flush_mclist(struct sock *sk)
1717{
1718	struct packet_sock *po = pkt_sk(sk);
1719	struct packet_mclist *ml;
1720
1721	if (!po->mclist)
1722		return;
1723
1724	rtnl_lock();
1725	while ((ml = po->mclist) != NULL) {
1726		struct net_device *dev;
1727
1728		po->mclist = ml->next;
1729		dev = dev_get_by_index(sock_net(sk), ml->ifindex);
1730		if (dev != NULL) {
1731			packet_dev_mc(dev, ml, -1);
1732			dev_put(dev);
1733		}
1734		kfree(ml);
1735	}
1736	rtnl_unlock();
1737}
1738
1739static int
1740packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1741{
1742	struct sock *sk = sock->sk;
1743	struct packet_sock *po = pkt_sk(sk);
1744	int ret;
1745
1746	if (level != SOL_PACKET)
1747		return -ENOPROTOOPT;
1748
1749	switch (optname) {
1750	case PACKET_ADD_MEMBERSHIP:
1751	case PACKET_DROP_MEMBERSHIP:
1752	{
1753		struct packet_mreq_max mreq;
1754		int len = optlen;
1755		memset(&mreq, 0, sizeof(mreq));
1756		if (len < sizeof(struct packet_mreq))
1757			return -EINVAL;
1758		if (len > sizeof(mreq))
1759			len = sizeof(mreq);
1760		if (copy_from_user(&mreq, optval, len))
1761			return -EFAULT;
1762		if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
1763			return -EINVAL;
1764		if (optname == PACKET_ADD_MEMBERSHIP)
1765			ret = packet_mc_add(sk, &mreq);
1766		else
1767			ret = packet_mc_drop(sk, &mreq);
1768		return ret;
1769	}
1770
1771#ifdef CONFIG_PACKET_MMAP
1772	case PACKET_RX_RING:
1773	case PACKET_TX_RING:
1774	{
1775		struct tpacket_req req;
1776
1777		if (optlen < sizeof(req))
1778			return -EINVAL;
1779		if (copy_from_user(&req, optval, sizeof(req)))
1780			return -EFAULT;
1781		return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING);
1782	}
1783	case PACKET_COPY_THRESH:
1784	{
1785		int val;
1786
1787		if (optlen != sizeof(val))
1788			return -EINVAL;
1789		if (copy_from_user(&val, optval, sizeof(val)))
1790			return -EFAULT;
1791
1792		pkt_sk(sk)->copy_thresh = val;
1793		return 0;
1794	}
1795	case PACKET_VERSION:
1796	{
1797		int val;
1798
1799		if (optlen != sizeof(val))
1800			return -EINVAL;
1801		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1802			return -EBUSY;
1803		if (copy_from_user(&val, optval, sizeof(val)))
1804			return -EFAULT;
1805		switch (val) {
1806		case TPACKET_V1:
1807		case TPACKET_V2:
1808			po->tp_version = val;
1809			return 0;
1810		default:
1811			return -EINVAL;
1812		}
1813	}
1814	case PACKET_RESERVE:
1815	{
1816		unsigned int val;
1817
1818		if (optlen != sizeof(val))
1819			return -EINVAL;
1820		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1821			return -EBUSY;
1822		if (copy_from_user(&val, optval, sizeof(val)))
1823			return -EFAULT;
1824		po->tp_reserve = val;
1825		return 0;
1826	}
1827	case PACKET_LOSS:
1828	{
1829		unsigned int val;
1830
1831		if (optlen != sizeof(val))
1832			return -EINVAL;
1833		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1834			return -EBUSY;
1835		if (copy_from_user(&val, optval, sizeof(val)))
1836			return -EFAULT;
1837		po->tp_loss = !!val;
1838		return 0;
1839	}
1840#endif
1841	case PACKET_AUXDATA:
1842	{
1843		int val;
1844
1845		if (optlen < sizeof(val))
1846			return -EINVAL;
1847		if (copy_from_user(&val, optval, sizeof(val)))
1848			return -EFAULT;
1849
1850		po->auxdata = !!val;
1851		return 0;
1852	}
1853	case PACKET_ORIGDEV:
1854	{
1855		int val;
1856
1857		if (optlen < sizeof(val))
1858			return -EINVAL;
1859		if (copy_from_user(&val, optval, sizeof(val)))
1860			return -EFAULT;
1861
1862		po->origdev = !!val;
1863		return 0;
1864	}
1865	default:
1866		return -ENOPROTOOPT;
1867	}
1868}
1869
1870static int packet_getsockopt(struct socket *sock, int level, int optname,
1871			     char __user *optval, int __user *optlen)
1872{
1873	int len;
1874	int val;
1875	struct sock *sk = sock->sk;
1876	struct packet_sock *po = pkt_sk(sk);
1877	void *data;
1878	struct tpacket_stats st;
1879
1880	if (level != SOL_PACKET)
1881		return -ENOPROTOOPT;
1882
1883	if (get_user(len, optlen))
1884		return -EFAULT;
1885
1886	if (len < 0)
1887		return -EINVAL;
1888
1889	switch (optname) {
1890	case PACKET_STATISTICS:
1891		if (len > sizeof(struct tpacket_stats))
1892			len = sizeof(struct tpacket_stats);
1893		spin_lock_bh(&sk->sk_receive_queue.lock);
1894		st = po->stats;
1895		memset(&po->stats, 0, sizeof(st));
1896		spin_unlock_bh(&sk->sk_receive_queue.lock);
1897		st.tp_packets += st.tp_drops;
1898
1899		data = &st;
1900		break;
1901	case PACKET_AUXDATA:
1902		if (len > sizeof(int))
1903			len = sizeof(int);
1904		val = po->auxdata;
1905
1906		data = &val;
1907		break;
1908	case PACKET_ORIGDEV:
1909		if (len > sizeof(int))
1910			len = sizeof(int);
1911		val = po->origdev;
1912
1913		data = &val;
1914		break;
1915#ifdef CONFIG_PACKET_MMAP
1916	case PACKET_VERSION:
1917		if (len > sizeof(int))
1918			len = sizeof(int);
1919		val = po->tp_version;
1920		data = &val;
1921		break;
1922	case PACKET_HDRLEN:
1923		if (len > sizeof(int))
1924			len = sizeof(int);
1925		if (copy_from_user(&val, optval, len))
1926			return -EFAULT;
1927		switch (val) {
1928		case TPACKET_V1:
1929			val = sizeof(struct tpacket_hdr);
1930			break;
1931		case TPACKET_V2:
1932			val = sizeof(struct tpacket2_hdr);
1933			break;
1934		default:
1935			return -EINVAL;
1936		}
1937		data = &val;
1938		break;
1939	case PACKET_RESERVE:
1940		if (len > sizeof(unsigned int))
1941			len = sizeof(unsigned int);
1942		val = po->tp_reserve;
1943		data = &val;
1944		break;
1945	case PACKET_LOSS:
1946		if (len > sizeof(unsigned int))
1947			len = sizeof(unsigned int);
1948		val = po->tp_loss;
1949		data = &val;
1950		break;
1951#endif
1952	default:
1953		return -ENOPROTOOPT;
1954	}
1955
1956	if (put_user(len, optlen))
1957		return -EFAULT;
1958	if (copy_to_user(optval, data, len))
1959		return -EFAULT;
1960	return 0;
1961}
1962
1963
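/*
 * Device notifier: drop multicast references and detach the prot_hook when
 * the bound device goes away or down, and re-attach the hook when the
 * device comes back up while the socket is still bound to a protocol.
 */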
1964static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
1965{
1966	struct sock *sk;
1967	struct hlist_node *node;
1968	struct net_device *dev = data;
1969	struct net *net = dev_net(dev);
1970
1971	read_lock(&net->packet.sklist_lock);
1972	sk_for_each(sk, node, &net->packet.sklist) {
1973		struct packet_sock *po = pkt_sk(sk);
1974
1975		switch (msg) {
1976		case NETDEV_UNREGISTER:
1977			if (po->mclist)
1978				packet_dev_mclist(dev, po->mclist, -1);
1979			/* fallthrough */
1980
1981		case NETDEV_DOWN:
1982			if (dev->ifindex == po->ifindex) {
1983				spin_lock(&po->bind_lock);
1984				if (po->running) {
1985					__dev_remove_pack(&po->prot_hook);
1986					__sock_put(sk);
1987					po->running = 0;
1988					sk->sk_err = ENETDOWN;
1989					if (!sock_flag(sk, SOCK_DEAD))
1990						sk->sk_error_report(sk);
1991				}
1992				if (msg == NETDEV_UNREGISTER) {
1993					po->ifindex = -1;
1994					po->prot_hook.dev = NULL;
1995				}
1996				spin_unlock(&po->bind_lock);
1997			}
1998			break;
1999		case NETDEV_UP:
2000			spin_lock(&po->bind_lock);
2001			if (dev->ifindex == po->ifindex && po->num &&
2002			    !po->running) {
2003				dev_add_pack(&po->prot_hook);
2004				sock_hold(sk);
2005				po->running = 1;
2006			}
2007			spin_unlock(&po->bind_lock);
2008			break;
2009		}
2010	}
2011	read_unlock(&net->packet.sklist_lock);
2012	return NOTIFY_DONE;
2013}
2014
2015
2016static int packet_ioctl(struct socket *sock, unsigned int cmd,
2017			unsigned long arg)
2018{
2019	struct sock *sk = sock->sk;
2020
2021	switch (cmd) {
2022	case SIOCOUTQ:
2023	{
2024		int amount = sk_wmem_alloc_get(sk);
2025
2026		return put_user(amount, (int __user *)arg);
2027	}
2028	case SIOCINQ:
2029	{
2030		struct sk_buff *skb;
2031		int amount = 0;
2032
2033		spin_lock_bh(&sk->sk_receive_queue.lock);
2034		skb = skb_peek(&sk->sk_receive_queue);
2035		if (skb)
2036			amount = skb->len;
2037		spin_unlock_bh(&sk->sk_receive_queue.lock);
2038		return put_user(amount, (int __user *)arg);
2039	}
2040	case SIOCGSTAMP:
2041		return sock_get_timestamp(sk, (struct timeval __user *)arg);
2042	case SIOCGSTAMPNS:
2043		return sock_get_timestampns(sk, (struct timespec __user *)arg);
2044
2045#ifdef CONFIG_INET
2046	case SIOCADDRT:
2047	case SIOCDELRT:
2048	case SIOCDARP:
2049	case SIOCGARP:
2050	case SIOCSARP:
2051	case SIOCGIFADDR:
2052	case SIOCSIFADDR:
2053	case SIOCGIFBRDADDR:
2054	case SIOCSIFBRDADDR:
2055	case SIOCGIFNETMASK:
2056	case SIOCSIFNETMASK:
2057	case SIOCGIFDSTADDR:
2058	case SIOCSIFDSTADDR:
2059	case SIOCSIFFLAGS:
2060		if (!net_eq(sock_net(sk), &init_net))
2061			return -ENOIOCTLCMD;
2062		return inet_dgram_ops.ioctl(sock, cmd, arg);
2063#endif
2064
2065	default:
2066		return -ENOIOCTLCMD;
2067	}
2068	return 0;
2069}
2070
2071#ifndef CONFIG_PACKET_MMAP
2072#define packet_mmap sock_no_mmap
2073#define packet_poll datagram_poll
2074#else
2075
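/*
 * poll() for mmapped rings, on top of the usual datagram_poll() semantics:
 * readable when the rx frame just before head is no longer owned by the
 * kernel (i.e. it has been handed to user space), writable when the
 * current tx frame is TP_STATUS_AVAILABLE.
 */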
2076static unsigned int packet_poll(struct file *file, struct socket *sock,
2077				poll_table *wait)
2078{
2079	struct sock *sk = sock->sk;
2080	struct packet_sock *po = pkt_sk(sk);
2081	unsigned int mask = datagram_poll(file, sock, wait);
2082
2083	spin_lock_bh(&sk->sk_receive_queue.lock);
2084	if (po->rx_ring.pg_vec) {
2085		if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
2086			mask |= POLLIN | POLLRDNORM;
2087	}
2088	spin_unlock_bh(&sk->sk_receive_queue.lock);
2089	spin_lock_bh(&sk->sk_write_queue.lock);
2090	if (po->tx_ring.pg_vec) {
2091		if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
2092			mask |= POLLOUT | POLLWRNORM;
2093	}
2094	spin_unlock_bh(&sk->sk_write_queue.lock);
2095	return mask;
2096}
2097
2098
2099/* Dirty? Well, I still have not learned a better way to account
2100 * for user mmaps.
2101 */
2102
2103static void packet_mm_open(struct vm_area_struct *vma)
2104{
2105	struct file *file = vma->vm_file;
2106	struct socket *sock = file->private_data;
2107	struct sock *sk = sock->sk;
2108
2109	if (sk)
2110		atomic_inc(&pkt_sk(sk)->mapped);
2111}
2112
2113static void packet_mm_close(struct vm_area_struct *vma)
2114{
2115	struct file *file = vma->vm_file;
2116	struct socket *sock = file->private_data;
2117	struct sock *sk = sock->sk;
2118
2119	if (sk)
2120		atomic_dec(&pkt_sk(sk)->mapped);
2121}
2122
2123static const struct vm_operations_struct packet_mmap_ops = {
2124	.open	=	packet_mm_open,
2125	.close	=	packet_mm_close,
2126};
2127
2128static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
2129{
2130	int i;
2131
2132	for (i = 0; i < len; i++) {
2133		if (likely(pg_vec[i]))
2134			free_pages((unsigned long) pg_vec[i], order);
2135	}
2136	kfree(pg_vec);
2137}
2138
2139static inline char *alloc_one_pg_vec_page(unsigned long order)
2140{
2141	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN;
2142
2143	return (char *) __get_free_pages(gfp_flags, order);
2144}
2145
2146static char **alloc_pg_vec(struct tpacket_req *req, int order)
2147{
2148	unsigned int block_nr = req->tp_block_nr;
2149	char **pg_vec;
2150	int i;
2151
2152	pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
2153	if (unlikely(!pg_vec))
2154		goto out;
2155
2156	for (i = 0; i < block_nr; i++) {
2157		pg_vec[i] = alloc_one_pg_vec_page(order);
2158		if (unlikely(!pg_vec[i]))
2159			goto out_free_pgvec;
2160	}
2161
2162out:
2163	return pg_vec;
2164
2165out_free_pgvec:
2166	free_pg_vec(pg_vec, order, block_nr);
2167	pg_vec = NULL;
2168	goto out;
2169}
2170
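/*
 * (Re)size one ring.  A request with tp_block_nr == 0 releases the existing
 * ring; otherwise the geometry is validated and new blocks are allocated.
 * The socket is temporarily unhooked from the packet_type list and the old
 * and new pg_vec are swapped under pg_vec_lock, so concurrent mmap()s and
 * the receive path never see a half-built ring.
 */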
2171static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
2172		int closing, int tx_ring)
2173{
2174	char **pg_vec = NULL;
2175	struct packet_sock *po = pkt_sk(sk);
2176	int was_running, order = 0;
2177	struct packet_ring_buffer *rb;
2178	struct sk_buff_head *rb_queue;
2179	__be16 num;
2180	int err;
2181
2182	rb = tx_ring ? &po->tx_ring : &po->rx_ring;
2183	rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
2184
2185	err = -EBUSY;
2186	if (!closing) {
2187		if (atomic_read(&po->mapped))
2188			goto out;
2189		if (atomic_read(&rb->pending))
2190			goto out;
2191	}
2192
2193	if (req->tp_block_nr) {
2194		/* Sanity tests and some calculations */
2195		err = -EBUSY;
2196		if (unlikely(rb->pg_vec))
2197			goto out;
2198
2199		switch (po->tp_version) {
2200		case TPACKET_V1:
2201			po->tp_hdrlen = TPACKET_HDRLEN;
2202			break;
2203		case TPACKET_V2:
2204			po->tp_hdrlen = TPACKET2_HDRLEN;
2205			break;
2206		}
2207
2208		err = -EINVAL;
2209		if (unlikely((int)req->tp_block_size <= 0))
2210			goto out;
2211		if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
2212			goto out;
2213		if (unlikely(req->tp_frame_size < po->tp_hdrlen +
2214					po->tp_reserve))
2215			goto out;
2216		if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
2217			goto out;
2218
2219		rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
2220		if (unlikely(rb->frames_per_block <= 0))
2221			goto out;
2222		if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
2223					req->tp_frame_nr))
2224			goto out;
2225
2226		err = -ENOMEM;
2227		order = get_order(req->tp_block_size);
2228		pg_vec = alloc_pg_vec(req, order);
2229		if (unlikely(!pg_vec))
2230			goto out;
2231	}
2232	/* Done: tp_block_nr == 0 releases the ring, so no frames may be requested */
2233	else {
2234		err = -EINVAL;
2235		if (unlikely(req->tp_frame_nr))
2236			goto out;
2237	}
2238
2239	lock_sock(sk);
2240
2241	/* Detach socket from network */
2242	spin_lock(&po->bind_lock);
2243	was_running = po->running;
2244	num = po->num;
2245	if (was_running) {
2246		__dev_remove_pack(&po->prot_hook);
2247		po->num = 0;
2248		po->running = 0;
2249		__sock_put(sk);
2250	}
2251	spin_unlock(&po->bind_lock);
2252
2253	synchronize_net();
2254
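	/*
	 * With the hook removed and synchronize_net() complete, no receive
	 * path can still be touching the old ring.  Swap in the new block
	 * vector under pg_vec_lock unless user space still has it mapped.
	 */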
2255	err = -EBUSY;
2256	mutex_lock(&po->pg_vec_lock);
2257	if (closing || atomic_read(&po->mapped) == 0) {
2258		err = 0;
2259#define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
2260		spin_lock_bh(&rb_queue->lock);
2261		pg_vec = XC(rb->pg_vec, pg_vec);
2262		rb->frame_max = (req->tp_frame_nr - 1);
2263		rb->head = 0;
2264		rb->frame_size = req->tp_frame_size;
2265		spin_unlock_bh(&rb_queue->lock);
2266
2267		order = XC(rb->pg_vec_order, order);
2268		req->tp_block_nr = XC(rb->pg_vec_len, req->tp_block_nr);
2269
2270		rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
2271		po->prot_hook.func = (po->rx_ring.pg_vec) ?
2272						tpacket_rcv : packet_rcv;
2273		skb_queue_purge(rb_queue);
2274#undef XC
2275		if (atomic_read(&po->mapped))
2276			pr_err("packet_mmap: vma is busy: %d\n",
2277			       atomic_read(&po->mapped));
2278	}
2279	mutex_unlock(&po->pg_vec_lock);
2280
2281	spin_lock(&po->bind_lock);
2282	if (was_running && !po->running) {
2283		sock_hold(sk);
2284		po->running = 1;
2285		po->num = num;
2286		dev_add_pack(&po->prot_hook);
2287	}
2288	spin_unlock(&po->bind_lock);
2289
2290	release_sock(sk);
2291
2292	if (pg_vec)
2293		free_pg_vec(pg_vec, order, req->tp_block_nr);
2294out:
2295	return err;
2296}
2297
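/*
 * packet_mmap: map the RX ring followed by the TX ring into one VMA.
 * The requested size must match the combined ring size exactly; every
 * page of every block is then inserted with vm_insert_page() and
 * po->mapped is bumped so the rings cannot change under the mapping.
 */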
2298static int packet_mmap(struct file *file, struct socket *sock,
2299		struct vm_area_struct *vma)
2300{
2301	struct sock *sk = sock->sk;
2302	struct packet_sock *po = pkt_sk(sk);
2303	unsigned long size, expected_size;
2304	struct packet_ring_buffer *rb;
2305	unsigned long start;
2306	int err = -EINVAL;
2307	int i;
2308
2309	if (vma->vm_pgoff)
2310		return -EINVAL;
2311
2312	mutex_lock(&po->pg_vec_lock);
2313
2314	expected_size = 0;
2315	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2316		if (rb->pg_vec) {
2317			expected_size += rb->pg_vec_len
2318						* rb->pg_vec_pages
2319						* PAGE_SIZE;
2320		}
2321	}
2322
2323	if (expected_size == 0)
2324		goto out;
2325
2326	size = vma->vm_end - vma->vm_start;
2327	if (size != expected_size)
2328		goto out;
2329
2330	start = vma->vm_start;
2331	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2332		if (rb->pg_vec == NULL)
2333			continue;
2334
2335		for (i = 0; i < rb->pg_vec_len; i++) {
2336			struct page *page = virt_to_page(rb->pg_vec[i]);
2337			int pg_num;
2338
2339			for (pg_num = 0; pg_num < rb->pg_vec_pages;
2340					pg_num++, page++) {
2341				err = vm_insert_page(vma, start, page);
2342				if (unlikely(err))
2343					goto out;
2344				start += PAGE_SIZE;
2345			}
2346		}
2347	}
2348
2349	atomic_inc(&po->mapped);
2350	vma->vm_ops = &packet_mmap_ops;
2351	err = 0;
2352
2353out:
2354	mutex_unlock(&po->pg_vec_lock);
2355	return err;
2356}
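/*
 * A minimal user-space sketch of how the RX ring above is typically
 * driven (illustrative only, not part of this file; the geometry values
 * are arbitrary examples that satisfy the checks in packet_set_ring()
 * on a 4 KiB page system):
 *
 *	struct tpacket_req req = {
 *		.tp_block_size = 4096,		// multiple of PAGE_SIZE
 *		.tp_block_nr   = 64,
 *		.tp_frame_size = 2048,		// multiple of TPACKET_ALIGNMENT
 *		.tp_frame_nr   = 128,		// (4096 / 2048) * 64
 *	};
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *	char *ring = mmap(NULL, req.tp_block_size * req.tp_block_nr,
 *			  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *
 * User space then polls the socket, walks the frames, and hands each
 * consumed frame back by resetting its tp_status to TP_STATUS_KERNEL.
 */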
2357#endif
2358
2359
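/*
 * Two ops tables: packet_ops_spkt serves the obsolete SOCK_PACKET
 * interface (no socket options, no mmap), while packet_ops serves
 * SOCK_RAW and SOCK_DGRAM PF_PACKET sockets and wires up the mmap and
 * poll handlers defined above when CONFIG_PACKET_MMAP is enabled.
 */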
2360static const struct proto_ops packet_ops_spkt = {
2361	.family =	PF_PACKET,
2362	.owner =	THIS_MODULE,
2363	.release =	packet_release,
2364	.bind =		packet_bind_spkt,
2365	.connect =	sock_no_connect,
2366	.socketpair =	sock_no_socketpair,
2367	.accept =	sock_no_accept,
2368	.getname =	packet_getname_spkt,
2369	.poll =		datagram_poll,
2370	.ioctl =	packet_ioctl,
2371	.listen =	sock_no_listen,
2372	.shutdown =	sock_no_shutdown,
2373	.setsockopt =	sock_no_setsockopt,
2374	.getsockopt =	sock_no_getsockopt,
2375	.sendmsg =	packet_sendmsg_spkt,
2376	.recvmsg =	packet_recvmsg,
2377	.mmap =		sock_no_mmap,
2378	.sendpage =	sock_no_sendpage,
2379};
2380
2381static const struct proto_ops packet_ops = {
2382	.family =	PF_PACKET,
2383	.owner =	THIS_MODULE,
2384	.release =	packet_release,
2385	.bind =		packet_bind,
2386	.connect =	sock_no_connect,
2387	.socketpair =	sock_no_socketpair,
2388	.accept =	sock_no_accept,
2389	.getname =	packet_getname,
2390	.poll =		packet_poll,
2391	.ioctl =	packet_ioctl,
2392	.listen =	sock_no_listen,
2393	.shutdown =	sock_no_shutdown,
2394	.setsockopt =	packet_setsockopt,
2395	.getsockopt =	packet_getsockopt,
2396	.sendmsg =	packet_sendmsg,
2397	.recvmsg =	packet_recvmsg,
2398	.mmap =		packet_mmap,
2399	.sendpage =	sock_no_sendpage,
2400};
2401
2402static struct net_proto_family packet_family_ops = {
2403	.family =	PF_PACKET,
2404	.create =	packet_create,
2405	.owner	=	THIS_MODULE,
2406};
2407
2408static struct notifier_block packet_netdev_notifier = {
2409	.notifier_call =	packet_notifier,
2410};
2411
2412#ifdef CONFIG_PROC_FS
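/*
 * /proc/net/packet: a seq_file walk over the per-namespace socket list,
 * taken under sklist_lock for the duration of the dump.
 */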
2413static inline struct sock *packet_seq_idx(struct net *net, loff_t off)
2414{
2415	struct sock *s;
2416	struct hlist_node *node;
2417
2418	sk_for_each(s, node, &net->packet.sklist) {
2419		if (!off--)
2420			return s;
2421	}
2422	return NULL;
2423}
2424
2425static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
2426	__acquires(seq_file_net(seq)->packet.sklist_lock)
2427{
2428	struct net *net = seq_file_net(seq);
2429	read_lock(&net->packet.sklist_lock);
2430	return *pos ? packet_seq_idx(net, *pos - 1) : SEQ_START_TOKEN;
2431}
2432
2433static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2434{
2435	struct net *net = seq_file_net(seq);
2436	++*pos;
2437	return  (v == SEQ_START_TOKEN)
2438		? sk_head(&net->packet.sklist)
2439		: sk_next((struct sock *)v) ;
2440}
2441
2442static void packet_seq_stop(struct seq_file *seq, void *v)
2443	__releases(seq_file_net(seq)->packet.sklist_lock)
2444{
2445	struct net *net = seq_file_net(seq);
2446	read_unlock(&net->packet.sklist_lock);
2447}
2448
2449static int packet_seq_show(struct seq_file *seq, void *v)
2450{
2451	if (v == SEQ_START_TOKEN)
2452		seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
2453	else {
2454		struct sock *s = v;
2455		const struct packet_sock *po = pkt_sk(s);
2456
2457		seq_printf(seq,
2458			   "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
2459			   s,
2460			   atomic_read(&s->sk_refcnt),
2461			   s->sk_type,
2462			   ntohs(po->num),
2463			   po->ifindex,
2464			   po->running,
2465			   atomic_read(&s->sk_rmem_alloc),
2466			   sock_i_uid(s),
2467			   sock_i_ino(s));
2468	}
2469
2470	return 0;
2471}
2472
2473static const struct seq_operations packet_seq_ops = {
2474	.start	= packet_seq_start,
2475	.next	= packet_seq_next,
2476	.stop	= packet_seq_stop,
2477	.show	= packet_seq_show,
2478};
2479
2480static int packet_seq_open(struct inode *inode, struct file *file)
2481{
2482	return seq_open_net(inode, file, &packet_seq_ops,
2483			    sizeof(struct seq_net_private));
2484}
2485
2486static const struct file_operations packet_seq_fops = {
2487	.owner		= THIS_MODULE,
2488	.open		= packet_seq_open,
2489	.read		= seq_read,
2490	.llseek		= seq_lseek,
2491	.release	= seq_release_net,
2492};
2493
2494#endif
2495
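/*
 * Per-namespace setup: initialise the socket list and register the
 * /proc/net/packet entry; teardown removes the proc entry again.
 */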
2496static int packet_net_init(struct net *net)
2497{
2498	rwlock_init(&net->packet.sklist_lock);
2499	INIT_HLIST_HEAD(&net->packet.sklist);
2500
2501	if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
2502		return -ENOMEM;
2503
2504	return 0;
2505}
2506
2507static void packet_net_exit(struct net *net)
2508{
2509	proc_net_remove(net, "packet");
2510}
2511
2512static struct pernet_operations packet_net_ops = {
2513	.init = packet_net_init,
2514	.exit = packet_net_exit,
2515};
2516
2517
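/*
 * Module bring-up registers the proto, the PF_PACKET socket family, the
 * per-net operations and the netdevice notifier; teardown undoes the
 * registrations in the reverse order.
 */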
2518static void __exit packet_exit(void)
2519{
2520	unregister_netdevice_notifier(&packet_netdev_notifier);
2521	unregister_pernet_subsys(&packet_net_ops);
2522	sock_unregister(PF_PACKET);
2523	proto_unregister(&packet_proto);
2524}
2525
2526static int __init packet_init(void)
2527{
2528	int rc = proto_register(&packet_proto, 0);
2529
2530	if (rc != 0)
2531		goto out;
2532
2533	sock_register(&packet_family_ops);
2534	register_pernet_subsys(&packet_net_ops);
2535	register_netdevice_notifier(&packet_netdev_notifier);
2536out:
2537	return rc;
2538}
2539
2540module_init(packet_init);
2541module_exit(packet_exit);
2542MODULE_LICENSE("GPL");
2543MODULE_ALIAS_NETPROTO(PF_PACKET);
2544