af_packet.c revision 3f378b684453f2a028eda463ce383370545d9cc9
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		PACKET - implements raw packet sockets.
7 *
8 * Authors:	Ross Biro
9 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
11 *
12 * Fixes:
13 *		Alan Cox	:	verify_area() now used correctly
14 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
15 *		Alan Cox	:	tidied skbuff lists.
16 *		Alan Cox	:	Now uses generic datagram routines I
17 *					added. Also fixed the peek/read crash
18 *					from all old Linux datagram code.
19 *		Alan Cox	:	Uses the improved datagram code.
20 *		Alan Cox	:	Added NULL's for socket options.
21 *		Alan Cox	:	Re-commented the code.
22 *		Alan Cox	:	Use new kernel side addressing
23 *		Rob Janssen	:	Correct MTU usage.
24 *		Dave Platt	:	Counter leaks caused by incorrect
25 *					interrupt locking and some slightly
26 *					dubious gcc output. Can you read
27 *					compiler: it said _VOLATILE_
28 *	Richard Kooijman	:	Timestamp fixes.
29 *		Alan Cox	:	New buffers. Use sk->mac.raw.
30 *		Alan Cox	:	sendmsg/recvmsg support.
31 *		Alan Cox	:	Protocol setting support
32 *	Alexey Kuznetsov	:	Untied from IPv4 stack.
33 *	Cyrus Durgin		:	Fixed kerneld for kmod.
34 *	Michal Ostrowski	:	Module initialization cleanup.
35 *	Ulises Alonso		:	Frame number limit removal and
36 *					packet_set_ring memory leak.
37 *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
38 *					The convention is that longer addresses
39 *					will simply extend the hardware address
40 *					byte arrays at the end of sockaddr_ll
41 *					and packet_mreq.
42 *		Johann Baudy	:	Added TX RING.
43 *
44 *		This program is free software; you can redistribute it and/or
45 *		modify it under the terms of the GNU General Public License
46 *		as published by the Free Software Foundation; either version
47 *		2 of the License, or (at your option) any later version.
48 *
49 */
50
51#include <linux/types.h>
52#include <linux/mm.h>
53#include <linux/capability.h>
54#include <linux/fcntl.h>
55#include <linux/socket.h>
56#include <linux/in.h>
57#include <linux/inet.h>
58#include <linux/netdevice.h>
59#include <linux/if_packet.h>
60#include <linux/wireless.h>
61#include <linux/kernel.h>
62#include <linux/kmod.h>
63#include <net/net_namespace.h>
64#include <net/ip.h>
65#include <net/protocol.h>
66#include <linux/skbuff.h>
67#include <net/sock.h>
68#include <linux/errno.h>
69#include <linux/timer.h>
70#include <asm/system.h>
71#include <asm/uaccess.h>
72#include <asm/ioctls.h>
73#include <asm/page.h>
74#include <asm/cacheflush.h>
75#include <asm/io.h>
76#include <linux/proc_fs.h>
77#include <linux/seq_file.h>
78#include <linux/poll.h>
79#include <linux/module.h>
80#include <linux/init.h>
81#include <linux/mutex.h>
82#include <linux/if_vlan.h>
83
84#ifdef CONFIG_INET
85#include <net/inet_common.h>
86#endif
87
88/*
89   Assumptions:
90   - if a device has no dev->hard_header routine, it adds and removes the ll
91     header itself. In this case the ll header is invisible outside the device,
92     but higher levels should still reserve dev->hard_header_len.
93     Some devices are clever enough to reallocate the skb when the header
94     will not fit into the reserved space (tunnels); others are not
95     (PPP).
96   - the packet socket receives packets with the ll header already pulled,
97     so SOCK_RAW has to push it back.
98
99On receive:
100-----------
101
102Incoming, dev->hard_header!=NULL
103   mac_header -> ll header
104   data       -> data
105
106Outgoing, dev->hard_header!=NULL
107   mac_header -> ll header
108   data       -> ll header
109
110Incoming, dev->hard_header==NULL
111   mac_header -> UNKNOWN position. It very likely points to the ll
112		 header.  PPP does this, which is wrong, because it introduces
113		 asymmetry between the rx and tx paths.
114   data       -> data
115
116Outgoing, dev->hard_header==NULL
117   mac_header -> data. ll header is still not built!
118   data       -> data
119
120Summary
121  If dev->hard_header==NULL we are unlikely to restore a sensible ll header.
122
123
124On transmit:
125------------
126
127dev->hard_header != NULL
128   mac_header -> ll header
129   data       -> ll header
130
131dev->hard_header == NULL (ll header is added by device, we cannot control it)
132   mac_header -> data
133   data       -> data
134
135   We should set nh.raw on output to the correct position;
136   the packet classifier depends on it.
137 */
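In user-space terms, the layout above means that a SOCK_RAW packet socket exchanges frames starting at the link-layer header, while SOCK_DGRAM hides that header: it is stripped on receive and rebuilt by the kernel on transmit. A minimal sketch of the two socket types (illustrative user-space code, not part of this file; requires CAP_NET_RAW):

#include <sys/socket.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <arpa/inet.h>

static int open_pf_packet_sockets(void)
{
	/* Frames are delivered/accepted complete, ll header included. */
	int raw = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));

	/* The ll header is removed on rx and built by the kernel on tx. */
	int dgram = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_ALL));

	return (raw < 0 || dgram < 0) ? -1 : 0;
}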
138
139/* Private packet socket structures. */
140
141struct packet_mclist {
142	struct packet_mclist	*next;
143	int			ifindex;
144	int			count;
145	unsigned short		type;
146	unsigned short		alen;
147	unsigned char		addr[MAX_ADDR_LEN];
148};
149/* identical to struct packet_mreq except it has
150 * a longer address field.
151 */
152struct packet_mreq_max {
153	int		mr_ifindex;
154	unsigned short	mr_type;
155	unsigned short	mr_alen;
156	unsigned char	mr_address[MAX_ADDR_LEN];
157};
158
159#ifdef CONFIG_PACKET_MMAP
160static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
161		int closing, int tx_ring);
162
163struct packet_ring_buffer {
164	char			**pg_vec;
165	unsigned int		head;
166	unsigned int		frames_per_block;
167	unsigned int		frame_size;
168	unsigned int		frame_max;
169
170	unsigned int		pg_vec_order;
171	unsigned int		pg_vec_pages;
172	unsigned int		pg_vec_len;
173
174	atomic_t		pending;
175};
176
177struct packet_sock;
178static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
179#endif
180
181static void packet_flush_mclist(struct sock *sk);
182
183struct packet_sock {
184	/* struct sock has to be the first member of packet_sock */
185	struct sock		sk;
186	struct tpacket_stats	stats;
187#ifdef CONFIG_PACKET_MMAP
188	struct packet_ring_buffer	rx_ring;
189	struct packet_ring_buffer	tx_ring;
190	int			copy_thresh;
191#endif
192	spinlock_t		bind_lock;
193	struct mutex		pg_vec_lock;
194	unsigned int		running:1,	/* prot_hook is attached*/
195				auxdata:1,
196				origdev:1;
197	int			ifindex;	/* bound device		*/
198	__be16			num;
199	struct packet_mclist	*mclist;
200#ifdef CONFIG_PACKET_MMAP
201	atomic_t		mapped;
202	enum tpacket_versions	tp_version;
203	unsigned int		tp_hdrlen;
204	unsigned int		tp_reserve;
205	unsigned int		tp_loss:1;
206#endif
207	struct packet_type	prot_hook ____cacheline_aligned_in_smp;
208};
209
210struct packet_skb_cb {
211	unsigned int origlen;
212	union {
213		struct sockaddr_pkt pkt;
214		struct sockaddr_ll ll;
215	} sa;
216};
217
218#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))
219
220#ifdef CONFIG_PACKET_MMAP
221
222static void __packet_set_status(struct packet_sock *po, void *frame, int status)
223{
224	union {
225		struct tpacket_hdr *h1;
226		struct tpacket2_hdr *h2;
227		void *raw;
228	} h;
229
230	h.raw = frame;
231	switch (po->tp_version) {
232	case TPACKET_V1:
233		h.h1->tp_status = status;
234		flush_dcache_page(virt_to_page(&h.h1->tp_status));
235		break;
236	case TPACKET_V2:
237		h.h2->tp_status = status;
238		flush_dcache_page(virt_to_page(&h.h2->tp_status));
239		break;
240	default:
241		pr_err("TPACKET version not supported\n");
242		BUG();
243	}
244
245	smp_wmb();
246}
247
248static int __packet_get_status(struct packet_sock *po, void *frame)
249{
250	union {
251		struct tpacket_hdr *h1;
252		struct tpacket2_hdr *h2;
253		void *raw;
254	} h;
255
256	smp_rmb();
257
258	h.raw = frame;
259	switch (po->tp_version) {
260	case TPACKET_V1:
261		flush_dcache_page(virt_to_page(&h.h1->tp_status));
262		return h.h1->tp_status;
263	case TPACKET_V2:
264		flush_dcache_page(virt_to_page(&h.h2->tp_status));
265		return h.h2->tp_status;
266	default:
267		pr_err("TPACKET version not supported\n");
268		BUG();
269		return 0;
270	}
271}
272
273static void *packet_lookup_frame(struct packet_sock *po,
274		struct packet_ring_buffer *rb,
275		unsigned int position,
276		int status)
277{
278	unsigned int pg_vec_pos, frame_offset;
279	union {
280		struct tpacket_hdr *h1;
281		struct tpacket2_hdr *h2;
282		void *raw;
283	} h;
284
285	pg_vec_pos = position / rb->frames_per_block;
286	frame_offset = position % rb->frames_per_block;
287
288	h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size);
289
290	if (status != __packet_get_status(po, h.raw))
291		return NULL;
292
293	return h.raw;
294}
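/* Worked example of the lookup arithmetic above, with hypothetical ring
 * parameters: tp_block_size = 8192 and tp_frame_size = 2048 give
 * frames_per_block = 4, so frame number 10 lives in pg_vec[10 / 4] = pg_vec[2]
 * at byte offset (10 % 4) * 2048 = 4096 within that block.
 */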
295
296static inline void *packet_current_frame(struct packet_sock *po,
297		struct packet_ring_buffer *rb,
298		int status)
299{
300	return packet_lookup_frame(po, rb, rb->head, status);
301}
302
303static inline void *packet_previous_frame(struct packet_sock *po,
304		struct packet_ring_buffer *rb,
305		int status)
306{
307	unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
308	return packet_lookup_frame(po, rb, previous, status);
309}
310
311static inline void packet_increment_head(struct packet_ring_buffer *buff)
312{
313	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
314}
315
316#endif
317
318static inline struct packet_sock *pkt_sk(struct sock *sk)
319{
320	return (struct packet_sock *)sk;
321}
322
323static void packet_sock_destruct(struct sock *sk)
324{
325	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
326	WARN_ON(atomic_read(&sk->sk_wmem_alloc));
327
328	if (!sock_flag(sk, SOCK_DEAD)) {
329		pr_err("Attempt to release alive packet socket: %p\n", sk);
330		return;
331	}
332
333	sk_refcnt_debug_dec(sk);
334}
335
336
337static const struct proto_ops packet_ops;
338
339static const struct proto_ops packet_ops_spkt;
340
341static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
342			   struct packet_type *pt, struct net_device *orig_dev)
343{
344	struct sock *sk;
345	struct sockaddr_pkt *spkt;
346
347	/*
348	 *	When we registered the protocol we saved the socket in the data
349	 *	field for just this event.
350	 */
351
352	sk = pt->af_packet_priv;
353
354	/*
355	 *	Yank back the headers [hope the device set this
356	 *	right or kerboom...]
357	 *
358	 *	Incoming packets have the ll header pulled,
359	 *	push it back.
360	 *
361	 *	For outgoing ones skb->data == skb_mac_header(skb),
362	 *	so this procedure is a noop.
363	 */
364
365	if (skb->pkt_type == PACKET_LOOPBACK)
366		goto out;
367
368	if (dev_net(dev) != sock_net(sk))
369		goto out;
370
371	skb = skb_share_check(skb, GFP_ATOMIC);
372	if (skb == NULL)
373		goto oom;
374
375	/* drop any routing info */
376	skb_dst_drop(skb);
377
378	/* drop conntrack reference */
379	nf_reset(skb);
380
381	spkt = &PACKET_SKB_CB(skb)->sa.pkt;
382
383	skb_push(skb, skb->data - skb_mac_header(skb));
384
385	/*
386	 *	The SOCK_PACKET socket receives _all_ frames.
387	 */
388
389	spkt->spkt_family = dev->type;
390	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
391	spkt->spkt_protocol = skb->protocol;
392
393	/*
394	 *	Charge the memory to the socket. This is done specifically
395	 *	to prevent sockets from using up all the memory.
396	 */
397
398	if (sock_queue_rcv_skb(sk, skb) == 0)
399		return 0;
400
401out:
402	kfree_skb(skb);
403oom:
404	return 0;
405}
406
407
408/*
409 *	Output a raw packet to a device layer. This bypasses all the other
410 *	protocol layers and you must therefore supply it with a complete frame
411 */
412
413static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
414			       struct msghdr *msg, size_t len)
415{
416	struct sock *sk = sock->sk;
417	struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
418	struct sk_buff *skb;
419	struct net_device *dev;
420	__be16 proto = 0;
421	int err;
422
423	/*
424	 *	Get and verify the address.
425	 */
426
427	if (saddr) {
428		if (msg->msg_namelen < sizeof(struct sockaddr))
429			return -EINVAL;
430		if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
431			proto = saddr->spkt_protocol;
432	} else
433		return -ENOTCONN;	/* SOCK_PACKET must be sent giving an address */
434
435	/*
436	 *	Find the device first to size check it
437	 */
438
439	saddr->spkt_device[13] = 0;
440	rcu_read_lock();
441	dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
442	err = -ENODEV;
443	if (dev == NULL)
444		goto out_unlock;
445
446	err = -ENETDOWN;
447	if (!(dev->flags & IFF_UP))
448		goto out_unlock;
449
450	/*
451	 * You may not queue a frame bigger than the mtu. This is the lowest level
452	 * raw protocol and you must do your own fragmentation at this level.
453	 */
454
455	err = -EMSGSIZE;
456	if (len > dev->mtu + dev->hard_header_len)
457		goto out_unlock;
458
459	err = -ENOBUFS;
460	skb = sock_wmalloc(sk, len + LL_RESERVED_SPACE(dev), 0, GFP_KERNEL);
461
462	/*
463	 * If the write buffer is full, then tough. At this level the user
464	 * gets to deal with the problem - do your own algorithmic backoffs.
465	 * That's far more flexible.
466	 */
467
468	if (skb == NULL)
469		goto out_unlock;
470
471	/*
472	 *	Fill it in
473	 */
474
475	/* FIXME: Save some space for broken drivers that write a
476	 * hard header at transmission time by themselves. PPP is the
477	 * notable one here. This should really be fixed at the driver level.
478	 */
479	skb_reserve(skb, LL_RESERVED_SPACE(dev));
480	skb_reset_network_header(skb);
481
482	/* Try to align data part correctly */
483	if (dev->header_ops) {
484		skb->data -= dev->hard_header_len;
485		skb->tail -= dev->hard_header_len;
486		if (len < dev->hard_header_len)
487			skb_reset_network_header(skb);
488	}
489
490	/* Returns -EFAULT on error */
491	err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
492	skb->protocol = proto;
493	skb->dev = dev;
494	skb->priority = sk->sk_priority;
495	skb->mark = sk->sk_mark;
496	if (err)
497		goto out_free;
498
499	/*
500	 *	Now send it
501	 */
502
503	dev_queue_xmit(skb);
504	rcu_read_unlock();
505	return len;
506
507out_free:
508	kfree_skb(skb);
509out_unlock:
510	rcu_read_unlock();
511	return err;
512}
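The sending side of this path, seen from user space: the caller names the device and protocol in a struct sockaddr_pkt and hands over a complete frame no larger than the device MTU plus its hard header. A minimal sketch (illustrative only; "eth0" and the frame contents are placeholders):

#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <arpa/inet.h>

static ssize_t spkt_send(int fd, const void *frame, size_t len)
{
	struct sockaddr_pkt spkt;

	memset(&spkt, 0, sizeof(spkt));
	spkt.spkt_family = AF_PACKET;
	memcpy(spkt.spkt_device, "eth0", 5);
	spkt.spkt_protocol = htons(ETH_P_IP);

	/* fd is a socket(AF_PACKET, SOCK_PACKET, htons(ETH_P_ALL)) socket. */
	return sendto(fd, frame, len, 0,
		      (struct sockaddr *)&spkt, sizeof(spkt));
}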
513
514static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
515				      unsigned int res)
516{
517	struct sk_filter *filter;
518
519	rcu_read_lock_bh();
520	filter = rcu_dereference(sk->sk_filter);
521	if (filter != NULL)
522		res = sk_run_filter(skb, filter->insns, filter->len);
523	rcu_read_unlock_bh();
524
525	return res;
526}
527
528/*
529   This function does lazy skb cloning in the hope that most packets
530   are discarded by BPF.
531
532   Note the tricky part: we DO mangle the shared skb! skb->data, skb->len
533   and skb->cb are mangled. It works because (and until) packets
534   falling here are owned by the current CPU. Output packets are cloned
535   by dev_queue_xmit_nit(), input packets are processed by net_bh
536   sequentially, so if we return the skb to its original state on exit,
537   we will not harm anyone.
538 */
539
540static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
541		      struct packet_type *pt, struct net_device *orig_dev)
542{
543	struct sock *sk;
544	struct sockaddr_ll *sll;
545	struct packet_sock *po;
546	u8 *skb_head = skb->data;
547	int skb_len = skb->len;
548	unsigned int snaplen, res;
549
550	if (skb->pkt_type == PACKET_LOOPBACK)
551		goto drop;
552
553	sk = pt->af_packet_priv;
554	po = pkt_sk(sk);
555
556	if (dev_net(dev) != sock_net(sk))
557		goto drop;
558
559	skb->dev = dev;
560
561	if (dev->header_ops) {
562		/* The device has an explicit notion of the ll header,
563		   exported to higher levels.
564
565		   Otherwise, the device hides the details of its frame
566		   structure, so the corresponding packet header is
567		   never delivered to the user.
568		 */
569		if (sk->sk_type != SOCK_DGRAM)
570			skb_push(skb, skb->data - skb_mac_header(skb));
571		else if (skb->pkt_type == PACKET_OUTGOING) {
572			/* Special case: outgoing packets have ll header at head */
573			skb_pull(skb, skb_network_offset(skb));
574		}
575	}
576
577	snaplen = skb->len;
578
579	res = run_filter(skb, sk, snaplen);
580	if (!res)
581		goto drop_n_restore;
582	if (snaplen > res)
583		snaplen = res;
584
585	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
586	    (unsigned)sk->sk_rcvbuf)
587		goto drop_n_acct;
588
589	if (skb_shared(skb)) {
590		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
591		if (nskb == NULL)
592			goto drop_n_acct;
593
594		if (skb_head != skb->data) {
595			skb->data = skb_head;
596			skb->len = skb_len;
597		}
598		kfree_skb(skb);
599		skb = nskb;
600	}
601
602	BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
603		     sizeof(skb->cb));
604
605	sll = &PACKET_SKB_CB(skb)->sa.ll;
606	sll->sll_family = AF_PACKET;
607	sll->sll_hatype = dev->type;
608	sll->sll_protocol = skb->protocol;
609	sll->sll_pkttype = skb->pkt_type;
610	if (unlikely(po->origdev))
611		sll->sll_ifindex = orig_dev->ifindex;
612	else
613		sll->sll_ifindex = dev->ifindex;
614
615	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
616
617	PACKET_SKB_CB(skb)->origlen = skb->len;
618
619	if (pskb_trim(skb, snaplen))
620		goto drop_n_acct;
621
622	skb_set_owner_r(skb, sk);
623	skb->dev = NULL;
624	skb_dst_drop(skb);
625
626	/* drop conntrack reference */
627	nf_reset(skb);
628
629	spin_lock(&sk->sk_receive_queue.lock);
630	po->stats.tp_packets++;
631	skb->dropcount = atomic_read(&sk->sk_drops);
632	__skb_queue_tail(&sk->sk_receive_queue, skb);
633	spin_unlock(&sk->sk_receive_queue.lock);
634	sk->sk_data_ready(sk, skb->len);
635	return 0;
636
637drop_n_acct:
638	po->stats.tp_drops = atomic_inc_return(&sk->sk_drops);
639
640drop_n_restore:
641	if (skb_head != skb->data && skb_shared(skb)) {
642		skb->data = skb_head;
643		skb->len = skb_len;
644	}
645drop:
646	consume_skb(skb);
647	return 0;
648}
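The filter consulted by run_filter() is the classic BPF program a user attaches with SO_ATTACH_FILTER; its return value becomes the snap length, and a return of zero drops the packet. A minimal sketch that accepts every packet but truncates it to 96 bytes (constants illustrative):

#include <sys/socket.h>
#include <linux/filter.h>

static int attach_trunc_filter(int fd)
{
	struct sock_filter insns[] = {
		{ BPF_RET | BPF_K, 0, 0, 96 },	/* accept, snap to 96 bytes */
	};
	struct sock_fprog prog = {
		.len	= sizeof(insns) / sizeof(insns[0]),
		.filter	= insns,
	};

	return setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER,
			  &prog, sizeof(prog));
}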
649
650#ifdef CONFIG_PACKET_MMAP
651static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
652		       struct packet_type *pt, struct net_device *orig_dev)
653{
654	struct sock *sk;
655	struct packet_sock *po;
656	struct sockaddr_ll *sll;
657	union {
658		struct tpacket_hdr *h1;
659		struct tpacket2_hdr *h2;
660		void *raw;
661	} h;
662	u8 *skb_head = skb->data;
663	int skb_len = skb->len;
664	unsigned int snaplen, res;
665	unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
666	unsigned short macoff, netoff, hdrlen;
667	struct sk_buff *copy_skb = NULL;
668	struct timeval tv;
669	struct timespec ts;
670
671	if (skb->pkt_type == PACKET_LOOPBACK)
672		goto drop;
673
674	sk = pt->af_packet_priv;
675	po = pkt_sk(sk);
676
677	if (dev_net(dev) != sock_net(sk))
678		goto drop;
679
680	if (dev->header_ops) {
681		if (sk->sk_type != SOCK_DGRAM)
682			skb_push(skb, skb->data - skb_mac_header(skb));
683		else if (skb->pkt_type == PACKET_OUTGOING) {
684			/* Special case: outgoing packets have ll header at head */
685			skb_pull(skb, skb_network_offset(skb));
686		}
687	}
688
689	if (skb->ip_summed == CHECKSUM_PARTIAL)
690		status |= TP_STATUS_CSUMNOTREADY;
691
692	snaplen = skb->len;
693
694	res = run_filter(skb, sk, snaplen);
695	if (!res)
696		goto drop_n_restore;
697	if (snaplen > res)
698		snaplen = res;
699
700	if (sk->sk_type == SOCK_DGRAM) {
701		macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
702				  po->tp_reserve;
703	} else {
704		unsigned maclen = skb_network_offset(skb);
705		netoff = TPACKET_ALIGN(po->tp_hdrlen +
706				       (maclen < 16 ? 16 : maclen)) +
707			po->tp_reserve;
708		macoff = netoff - maclen;
709	}
710
711	if (macoff + snaplen > po->rx_ring.frame_size) {
712		if (po->copy_thresh &&
713		    atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
714		    (unsigned)sk->sk_rcvbuf) {
715			if (skb_shared(skb)) {
716				copy_skb = skb_clone(skb, GFP_ATOMIC);
717			} else {
718				copy_skb = skb_get(skb);
719				skb_head = skb->data;
720			}
721			if (copy_skb)
722				skb_set_owner_r(copy_skb, sk);
723		}
724		snaplen = po->rx_ring.frame_size - macoff;
725		if ((int)snaplen < 0)
726			snaplen = 0;
727	}
728
729	spin_lock(&sk->sk_receive_queue.lock);
730	h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL);
731	if (!h.raw)
732		goto ring_is_full;
733	packet_increment_head(&po->rx_ring);
734	po->stats.tp_packets++;
735	if (copy_skb) {
736		status |= TP_STATUS_COPY;
737		__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
738	}
739	if (!po->stats.tp_drops)
740		status &= ~TP_STATUS_LOSING;
741	spin_unlock(&sk->sk_receive_queue.lock);
742
743	skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
744
745	switch (po->tp_version) {
746	case TPACKET_V1:
747		h.h1->tp_len = skb->len;
748		h.h1->tp_snaplen = snaplen;
749		h.h1->tp_mac = macoff;
750		h.h1->tp_net = netoff;
751		if (skb->tstamp.tv64)
752			tv = ktime_to_timeval(skb->tstamp);
753		else
754			do_gettimeofday(&tv);
755		h.h1->tp_sec = tv.tv_sec;
756		h.h1->tp_usec = tv.tv_usec;
757		hdrlen = sizeof(*h.h1);
758		break;
759	case TPACKET_V2:
760		h.h2->tp_len = skb->len;
761		h.h2->tp_snaplen = snaplen;
762		h.h2->tp_mac = macoff;
763		h.h2->tp_net = netoff;
764		if (skb->tstamp.tv64)
765			ts = ktime_to_timespec(skb->tstamp);
766		else
767			getnstimeofday(&ts);
768		h.h2->tp_sec = ts.tv_sec;
769		h.h2->tp_nsec = ts.tv_nsec;
770		h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
771		hdrlen = sizeof(*h.h2);
772		break;
773	default:
774		BUG();
775	}
776
777	sll = h.raw + TPACKET_ALIGN(hdrlen);
778	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
779	sll->sll_family = AF_PACKET;
780	sll->sll_hatype = dev->type;
781	sll->sll_protocol = skb->protocol;
782	sll->sll_pkttype = skb->pkt_type;
783	if (unlikely(po->origdev))
784		sll->sll_ifindex = orig_dev->ifindex;
785	else
786		sll->sll_ifindex = dev->ifindex;
787
788	__packet_set_status(po, h.raw, status);
789	smp_mb();
790	{
791		struct page *p_start, *p_end;
792		u8 *h_end = h.raw + macoff + snaplen - 1;
793
794		p_start = virt_to_page(h.raw);
795		p_end = virt_to_page(h_end);
796		while (p_start <= p_end) {
797			flush_dcache_page(p_start);
798			p_start++;
799		}
800	}
801
802	sk->sk_data_ready(sk, 0);
803
804drop_n_restore:
805	if (skb_head != skb->data && skb_shared(skb)) {
806		skb->data = skb_head;
807		skb->len = skb_len;
808	}
809drop:
810	kfree_skb(skb);
811	return 0;
812
813ring_is_full:
814	po->stats.tp_drops++;
815	spin_unlock(&sk->sk_receive_queue.lock);
816
817	sk->sk_data_ready(sk, 0);
818	kfree_skb(copy_skb);
819	goto drop_n_restore;
820}
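From user space, the frames filled in above are consumed by mapping the ring and watching tp_status: the kernel hands a frame over by setting TP_STATUS_USER, and the reader returns it by writing TP_STATUS_KERNEL. A minimal TPACKET_V1 sketch (sizes illustrative; tp_block_size is an exact multiple of tp_frame_size here, so frame i simply sits at ring + i * tp_frame_size):

#include <stddef.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <poll.h>

static int rx_ring_loop(int fd)
{
	struct tpacket_req req = {
		.tp_block_size	= 4096,
		.tp_block_nr	= 64,
		.tp_frame_size	= 2048,
		.tp_frame_nr	= 128,
	};
	unsigned int i = 0;
	char *ring;

	if (setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req)))
		return -1;

	ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (ring == MAP_FAILED)
		return -1;

	for (;;) {
		struct tpacket_hdr *hdr =
			(void *)(ring + i * req.tp_frame_size);
		struct pollfd pfd = { .fd = fd, .events = POLLIN };

		if (!(hdr->tp_status & TP_STATUS_USER)) {
			poll(&pfd, 1, -1);	/* wait for the next frame */
			continue;
		}

		/* Packet data starts hdr->tp_mac bytes into the frame;
		 * hdr->tp_snaplen bytes of it were copied. */

		hdr->tp_status = TP_STATUS_KERNEL;	/* give it back */
		i = (i + 1) % req.tp_frame_nr;
	}
}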
821
822static void tpacket_destruct_skb(struct sk_buff *skb)
823{
824	struct packet_sock *po = pkt_sk(skb->sk);
825	void *ph;
826
827	BUG_ON(skb == NULL);
828
829	if (likely(po->tx_ring.pg_vec)) {
830		ph = skb_shinfo(skb)->destructor_arg;
831		BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
832		BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
833		atomic_dec(&po->tx_ring.pending);
834		__packet_set_status(po, ph, TP_STATUS_AVAILABLE);
835	}
836
837	sock_wfree(skb);
838}
839
840static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
841		void *frame, struct net_device *dev, int size_max,
842		__be16 proto, unsigned char *addr)
843{
844	union {
845		struct tpacket_hdr *h1;
846		struct tpacket2_hdr *h2;
847		void *raw;
848	} ph;
849	int to_write, offset, len, tp_len, nr_frags, len_max;
850	struct socket *sock = po->sk.sk_socket;
851	struct page *page;
852	void *data;
853	int err;
854
855	ph.raw = frame;
856
857	skb->protocol = proto;
858	skb->dev = dev;
859	skb->priority = po->sk.sk_priority;
860	skb->mark = po->sk.sk_mark;
861	skb_shinfo(skb)->destructor_arg = ph.raw;
862
863	switch (po->tp_version) {
864	case TPACKET_V2:
865		tp_len = ph.h2->tp_len;
866		break;
867	default:
868		tp_len = ph.h1->tp_len;
869		break;
870	}
871	if (unlikely(tp_len > size_max)) {
872		pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
873		return -EMSGSIZE;
874	}
875
876	skb_reserve(skb, LL_RESERVED_SPACE(dev));
877	skb_reset_network_header(skb);
878
879	data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
880	to_write = tp_len;
881
882	if (sock->type == SOCK_DGRAM) {
883		err = dev_hard_header(skb, dev, ntohs(proto), addr,
884				NULL, tp_len);
885		if (unlikely(err < 0))
886			return -EINVAL;
887	} else if (dev->hard_header_len) {
888		/* net device doesn't like empty head */
889		if (unlikely(tp_len <= dev->hard_header_len)) {
890			pr_err("packet size is too short (%d < %d)\n",
891			       tp_len, dev->hard_header_len);
892			return -EINVAL;
893		}
894
895		skb_push(skb, dev->hard_header_len);
896		err = skb_store_bits(skb, 0, data,
897				dev->hard_header_len);
898		if (unlikely(err))
899			return err;
900
901		data += dev->hard_header_len;
902		to_write -= dev->hard_header_len;
903	}
904
905	err = -EFAULT;
906	page = virt_to_page(data);
907	offset = offset_in_page(data);
908	len_max = PAGE_SIZE - offset;
909	len = ((to_write > len_max) ? len_max : to_write);
910
911	skb->data_len = to_write;
912	skb->len += to_write;
913	skb->truesize += to_write;
914	atomic_add(to_write, &po->sk.sk_wmem_alloc);
915
916	while (likely(to_write)) {
917		nr_frags = skb_shinfo(skb)->nr_frags;
918
919		if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
920			pr_err("Packet exceed the number of skb frags(%lu)\n",
921			       MAX_SKB_FRAGS);
922			return -EFAULT;
923		}
924
925		flush_dcache_page(page);
926		get_page(page);
927		skb_fill_page_desc(skb,
928				nr_frags,
929				page++, offset, len);
930		to_write -= len;
931		offset = 0;
932		len_max = PAGE_SIZE;
933		len = ((to_write > len_max) ? len_max : to_write);
934	}
935
936	return tp_len;
937}
938
939static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
940{
941	struct socket *sock;
942	struct sk_buff *skb;
943	struct net_device *dev;
944	__be16 proto;
945	int ifindex, err, reserve = 0;
946	void *ph;
947	struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
948	int tp_len, size_max;
949	unsigned char *addr;
950	int len_sum = 0;
951	int status = 0;
952
953	sock = po->sk.sk_socket;
954
955	mutex_lock(&po->pg_vec_lock);
956
957	err = -EBUSY;
958	if (saddr == NULL) {
959		ifindex	= po->ifindex;
960		proto	= po->num;
961		addr	= NULL;
962	} else {
963		err = -EINVAL;
964		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
965			goto out;
966		if (msg->msg_namelen < (saddr->sll_halen
967					+ offsetof(struct sockaddr_ll,
968						sll_addr)))
969			goto out;
970		ifindex	= saddr->sll_ifindex;
971		proto	= saddr->sll_protocol;
972		addr	= saddr->sll_addr;
973	}
974
975	dev = dev_get_by_index(sock_net(&po->sk), ifindex);
976	err = -ENXIO;
977	if (unlikely(dev == NULL))
978		goto out;
979
980	reserve = dev->hard_header_len;
981
982	err = -ENETDOWN;
983	if (unlikely(!(dev->flags & IFF_UP)))
984		goto out_put;
985
986	size_max = po->tx_ring.frame_size
987		- (po->tp_hdrlen - sizeof(struct sockaddr_ll));
988
989	if (size_max > dev->mtu + reserve)
990		size_max = dev->mtu + reserve;
991
992	do {
993		ph = packet_current_frame(po, &po->tx_ring,
994				TP_STATUS_SEND_REQUEST);
995
996		if (unlikely(ph == NULL)) {
997			schedule();
998			continue;
999		}
1000
1001		status = TP_STATUS_SEND_REQUEST;
1002		skb = sock_alloc_send_skb(&po->sk,
1003				LL_ALLOCATED_SPACE(dev)
1004				+ sizeof(struct sockaddr_ll),
1005				0, &err);
1006
1007		if (unlikely(skb == NULL))
1008			goto out_status;
1009
1010		tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
1011				addr);
1012
1013		if (unlikely(tp_len < 0)) {
1014			if (po->tp_loss) {
1015				__packet_set_status(po, ph,
1016						TP_STATUS_AVAILABLE);
1017				packet_increment_head(&po->tx_ring);
1018				kfree_skb(skb);
1019				continue;
1020			} else {
1021				status = TP_STATUS_WRONG_FORMAT;
1022				err = tp_len;
1023				goto out_status;
1024			}
1025		}
1026
1027		skb->destructor = tpacket_destruct_skb;
1028		__packet_set_status(po, ph, TP_STATUS_SENDING);
1029		atomic_inc(&po->tx_ring.pending);
1030
1031		status = TP_STATUS_SEND_REQUEST;
1032		err = dev_queue_xmit(skb);
1033		if (unlikely(err > 0 && (err = net_xmit_errno(err)) != 0))
1034			goto out_xmit;
1035		packet_increment_head(&po->tx_ring);
1036		len_sum += tp_len;
1037	} while (likely((ph != NULL) || ((!(msg->msg_flags & MSG_DONTWAIT))
1038					&& (atomic_read(&po->tx_ring.pending))))
1039	      );
1040
1041	err = len_sum;
1042	goto out_put;
1043
1044out_xmit:
1045	skb->destructor = sock_wfree;
1046	atomic_dec(&po->tx_ring.pending);
1047out_status:
1048	__packet_set_status(po, ph, status);
1049	kfree_skb(skb);
1050out_put:
1051	dev_put(dev);
1052out:
1053	mutex_unlock(&po->pg_vec_lock);
1054	return err;
1055}
1056#endif
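The transmit ring driven by tpacket_snd() is used symmetrically from user space: the sender fills a frame behind the TPACKET_V1 header, marks it TP_STATUS_SEND_REQUEST and kicks the kernel with an empty send(). A minimal sketch, assuming a ring configured with PACKET_TX_RING and mapped as in the receive sketch above ("buf"/"len" are placeholders):

#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static ssize_t tx_ring_send_one(int fd, char *ring, unsigned int frame_size,
				unsigned int idx, const void *buf,
				unsigned int len)
{
	struct tpacket_hdr *hdr = (void *)(ring + idx * frame_size);
	char *data = (char *)hdr + TPACKET_HDRLEN - sizeof(struct sockaddr_ll);

	if (hdr->tp_status != TP_STATUS_AVAILABLE)
		return -1;	/* this frame is still owned by the kernel */

	memcpy(data, buf, len);
	hdr->tp_len = len;
	hdr->tp_status = TP_STATUS_SEND_REQUEST;

	/* One send() flushes every frame marked TP_STATUS_SEND_REQUEST. */
	return send(fd, NULL, 0, 0);
}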
1057
1058static int packet_snd(struct socket *sock,
1059			  struct msghdr *msg, size_t len)
1060{
1061	struct sock *sk = sock->sk;
1062	struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
1063	struct sk_buff *skb;
1064	struct net_device *dev;
1065	__be16 proto;
1066	unsigned char *addr;
1067	int ifindex, err, reserve = 0;
1068
1069	/*
1070	 *	Get and verify the address.
1071	 */
1072
1073	if (saddr == NULL) {
1074		struct packet_sock *po = pkt_sk(sk);
1075
1076		ifindex	= po->ifindex;
1077		proto	= po->num;
1078		addr	= NULL;
1079	} else {
1080		err = -EINVAL;
1081		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
1082			goto out;
1083		if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
1084			goto out;
1085		ifindex	= saddr->sll_ifindex;
1086		proto	= saddr->sll_protocol;
1087		addr	= saddr->sll_addr;
1088	}
1089
1090
1091	dev = dev_get_by_index(sock_net(sk), ifindex);
1092	err = -ENXIO;
1093	if (dev == NULL)
1094		goto out_unlock;
1095	if (sock->type == SOCK_RAW)
1096		reserve = dev->hard_header_len;
1097
1098	err = -ENETDOWN;
1099	if (!(dev->flags & IFF_UP))
1100		goto out_unlock;
1101
1102	err = -EMSGSIZE;
1103	if (len > dev->mtu+reserve)
1104		goto out_unlock;
1105
1106	skb = sock_alloc_send_skb(sk, len + LL_ALLOCATED_SPACE(dev),
1107				msg->msg_flags & MSG_DONTWAIT, &err);
1108	if (skb == NULL)
1109		goto out_unlock;
1110
1111	skb_reserve(skb, LL_RESERVED_SPACE(dev));
1112	skb_reset_network_header(skb);
1113
1114	err = -EINVAL;
1115	if (sock->type == SOCK_DGRAM &&
1116	    dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len) < 0)
1117		goto out_free;
1118
1119	/* Returns -EFAULT on error */
1120	err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
1121	if (err)
1122		goto out_free;
1123
1124	skb->protocol = proto;
1125	skb->dev = dev;
1126	skb->priority = sk->sk_priority;
1127	skb->mark = sk->sk_mark;
1128
1129	/*
1130	 *	Now send it
1131	 */
1132
1133	err = dev_queue_xmit(skb);
1134	if (err > 0 && (err = net_xmit_errno(err)) != 0)
1135		goto out_unlock;
1136
1137	dev_put(dev);
1138
1139	return len;
1140
1141out_free:
1142	kfree_skb(skb);
1143out_unlock:
1144	if (dev)
1145		dev_put(dev);
1146out:
1147	return err;
1148}
1149
1150static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
1151		struct msghdr *msg, size_t len)
1152{
1153#ifdef CONFIG_PACKET_MMAP
1154	struct sock *sk = sock->sk;
1155	struct packet_sock *po = pkt_sk(sk);
1156	if (po->tx_ring.pg_vec)
1157		return tpacket_snd(po, msg);
1158	else
1159#endif
1160		return packet_snd(sock, msg, len);
1161}
1162
1163/*
1164 *	Close a PACKET socket. This is fairly simple. We immediately go
1165 *	to 'closed' state and remove our protocol entry in the device list.
1166 */
1167
1168static int packet_release(struct socket *sock)
1169{
1170	struct sock *sk = sock->sk;
1171	struct packet_sock *po;
1172	struct net *net;
1173#ifdef CONFIG_PACKET_MMAP
1174	struct tpacket_req req;
1175#endif
1176
1177	if (!sk)
1178		return 0;
1179
1180	net = sock_net(sk);
1181	po = pkt_sk(sk);
1182
1183	write_lock_bh(&net->packet.sklist_lock);
1184	sk_del_node_init(sk);
1185	sock_prot_inuse_add(net, sk->sk_prot, -1);
1186	write_unlock_bh(&net->packet.sklist_lock);
1187
1188	/*
1189	 *	Unhook packet receive handler.
1190	 */
1191
1192	if (po->running) {
1193		/*
1194		 *	Remove the protocol hook
1195		 */
1196		dev_remove_pack(&po->prot_hook);
1197		po->running = 0;
1198		po->num = 0;
1199		__sock_put(sk);
1200	}
1201
1202	packet_flush_mclist(sk);
1203
1204#ifdef CONFIG_PACKET_MMAP
1205	memset(&req, 0, sizeof(req));
1206
1207	if (po->rx_ring.pg_vec)
1208		packet_set_ring(sk, &req, 1, 0);
1209
1210	if (po->tx_ring.pg_vec)
1211		packet_set_ring(sk, &req, 1, 1);
1212#endif
1213
1214	/*
1215	 *	Now the socket is dead. No more input will appear.
1216	 */
1217
1218	sock_orphan(sk);
1219	sock->sk = NULL;
1220
1221	/* Purge queues */
1222
1223	skb_queue_purge(&sk->sk_receive_queue);
1224	sk_refcnt_debug_release(sk);
1225
1226	sock_put(sk);
1227	return 0;
1228}
1229
1230/*
1231 *	Attach a packet hook.
1232 */
1233
1234static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
1235{
1236	struct packet_sock *po = pkt_sk(sk);
1237	/*
1238	 *	Detach an existing hook if present.
1239	 */
1240
1241	lock_sock(sk);
1242
1243	spin_lock(&po->bind_lock);
1244	if (po->running) {
1245		__sock_put(sk);
1246		po->running = 0;
1247		po->num = 0;
1248		spin_unlock(&po->bind_lock);
1249		dev_remove_pack(&po->prot_hook);
1250		spin_lock(&po->bind_lock);
1251	}
1252
1253	po->num = protocol;
1254	po->prot_hook.type = protocol;
1255	po->prot_hook.dev = dev;
1256
1257	po->ifindex = dev ? dev->ifindex : 0;
1258
1259	if (protocol == 0)
1260		goto out_unlock;
1261
1262	if (!dev || (dev->flags & IFF_UP)) {
1263		dev_add_pack(&po->prot_hook);
1264		sock_hold(sk);
1265		po->running = 1;
1266	} else {
1267		sk->sk_err = ENETDOWN;
1268		if (!sock_flag(sk, SOCK_DEAD))
1269			sk->sk_error_report(sk);
1270	}
1271
1272out_unlock:
1273	spin_unlock(&po->bind_lock);
1274	release_sock(sk);
1275	return 0;
1276}
1277
1278/*
1279 *	Bind a packet socket to a device
1280 */
1281
1282static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
1283			    int addr_len)
1284{
1285	struct sock *sk = sock->sk;
1286	char name[15];
1287	struct net_device *dev;
1288	int err = -ENODEV;
1289
1290	/*
1291	 *	Check legality
1292	 */
1293
1294	if (addr_len != sizeof(struct sockaddr))
1295		return -EINVAL;
1296	strlcpy(name, uaddr->sa_data, sizeof(name));
1297
1298	dev = dev_get_by_name(sock_net(sk), name);
1299	if (dev) {
1300		err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
1301		dev_put(dev);
1302	}
1303	return err;
1304}
1305
1306static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1307{
1308	struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
1309	struct sock *sk = sock->sk;
1310	struct net_device *dev = NULL;
1311	int err;
1312
1313
1314	/*
1315	 *	Check legality
1316	 */
1317
1318	if (addr_len < sizeof(struct sockaddr_ll))
1319		return -EINVAL;
1320	if (sll->sll_family != AF_PACKET)
1321		return -EINVAL;
1322
1323	if (sll->sll_ifindex) {
1324		err = -ENODEV;
1325		dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1326		if (dev == NULL)
1327			goto out;
1328	}
1329	err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1330	if (dev)
1331		dev_put(dev);
1332
1333out:
1334	return err;
1335}
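What packet_bind() expects from user space, in short: a struct sockaddr_ll carrying the family, an ifindex and, optionally, a protocol (zero keeps the current one). A minimal sketch ("eth0" is a placeholder interface name):

#include <string.h>
#include <net/if.h>
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <arpa/inet.h>

static int bind_packet_socket(int fd)
{
	struct sockaddr_ll sll;

	memset(&sll, 0, sizeof(sll));
	sll.sll_family   = AF_PACKET;
	sll.sll_protocol = htons(ETH_P_IP);
	sll.sll_ifindex  = if_nametoindex("eth0");

	return bind(fd, (struct sockaddr *)&sll, sizeof(sll));
}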
1336
1337static struct proto packet_proto = {
1338	.name	  = "PACKET",
1339	.owner	  = THIS_MODULE,
1340	.obj_size = sizeof(struct packet_sock),
1341};
1342
1343/*
1344 *	Create a packet of type SOCK_PACKET.
1345 */
1346
1347static int packet_create(struct net *net, struct socket *sock, int protocol,
1348			 int kern)
1349{
1350	struct sock *sk;
1351	struct packet_sock *po;
1352	__be16 proto = (__force __be16)protocol; /* weird, but documented */
1353	int err;
1354
1355	if (!capable(CAP_NET_RAW))
1356		return -EPERM;
1357	if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
1358	    sock->type != SOCK_PACKET)
1359		return -ESOCKTNOSUPPORT;
1360
1361	sock->state = SS_UNCONNECTED;
1362
1363	err = -ENOBUFS;
1364	sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
1365	if (sk == NULL)
1366		goto out;
1367
1368	sock->ops = &packet_ops;
1369	if (sock->type == SOCK_PACKET)
1370		sock->ops = &packet_ops_spkt;
1371
1372	sock_init_data(sock, sk);
1373
1374	po = pkt_sk(sk);
1375	sk->sk_family = PF_PACKET;
1376	po->num = proto;
1377
1378	sk->sk_destruct = packet_sock_destruct;
1379	sk_refcnt_debug_inc(sk);
1380
1381	/*
1382	 *	Attach a protocol block
1383	 */
1384
1385	spin_lock_init(&po->bind_lock);
1386	mutex_init(&po->pg_vec_lock);
1387	po->prot_hook.func = packet_rcv;
1388
1389	if (sock->type == SOCK_PACKET)
1390		po->prot_hook.func = packet_rcv_spkt;
1391
1392	po->prot_hook.af_packet_priv = sk;
1393
1394	if (proto) {
1395		po->prot_hook.type = proto;
1396		dev_add_pack(&po->prot_hook);
1397		sock_hold(sk);
1398		po->running = 1;
1399	}
1400
1401	write_lock_bh(&net->packet.sklist_lock);
1402	sk_add_node(sk, &net->packet.sklist);
1403	sock_prot_inuse_add(net, &packet_proto, 1);
1404	write_unlock_bh(&net->packet.sklist_lock);
1405	return 0;
1406out:
1407	return err;
1408}
1409
1410/*
1411 *	Pull a packet from our receive queue and hand it to the user.
1412 *	If necessary we block.
1413 */
1414
1415static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
1416			  struct msghdr *msg, size_t len, int flags)
1417{
1418	struct sock *sk = sock->sk;
1419	struct sk_buff *skb;
1420	int copied, err;
1421	struct sockaddr_ll *sll;
1422
1423	err = -EINVAL;
1424	if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
1425		goto out;
1426
1427#if 0
1428	/* What error should we return now? EUNATTACH? */
1429	if (pkt_sk(sk)->ifindex < 0)
1430		return -ENODEV;
1431#endif
1432
1433	/*
1434	 *	Call the generic datagram receiver. This handles all sorts
1435	 *	of horrible races and re-entrancy so we can forget about it
1436	 *	in the protocol layers.
1437	 *
1438	 *	Now it will return ENETDOWN if the device has just gone down,
1439	 *	but then it will block.
1440	 */
1441
1442	skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1443
1444	/*
1445	 *	An error occurred so return it. Because skb_recv_datagram()
1446	 *	handles the blocking, we don't need to see or worry about blocking
1447	 *	retries.
1448	 */
1449
1450	if (skb == NULL)
1451		goto out;
1452
1453	/*
1454	 *	If the address length field is there to be filled in, we fill
1455	 *	it in now.
1456	 */
1457
1458	sll = &PACKET_SKB_CB(skb)->sa.ll;
1459	if (sock->type == SOCK_PACKET)
1460		msg->msg_namelen = sizeof(struct sockaddr_pkt);
1461	else
1462		msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
1463
1464	/*
1465	 *	You lose any data beyond the buffer you gave. If this worries a
1466	 *	user program, it can ask the device for its MTU anyway.
1467	 */
1468
1469	copied = skb->len;
1470	if (copied > len) {
1471		copied = len;
1472		msg->msg_flags |= MSG_TRUNC;
1473	}
1474
1475	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1476	if (err)
1477		goto out_free;
1478
1479	sock_recv_ts_and_drops(msg, sk, skb);
1480
1481	if (msg->msg_name)
1482		memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
1483		       msg->msg_namelen);
1484
1485	if (pkt_sk(sk)->auxdata) {
1486		struct tpacket_auxdata aux;
1487
1488		aux.tp_status = TP_STATUS_USER;
1489		if (skb->ip_summed == CHECKSUM_PARTIAL)
1490			aux.tp_status |= TP_STATUS_CSUMNOTREADY;
1491		aux.tp_len = PACKET_SKB_CB(skb)->origlen;
1492		aux.tp_snaplen = skb->len;
1493		aux.tp_mac = 0;
1494		aux.tp_net = skb_network_offset(skb);
1495		aux.tp_vlan_tci = vlan_tx_tag_get(skb);
1496
1497		put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
1498	}
1499
1500	/*
1501	 *	Free or return the buffer as appropriate. Again this
1502	 *	hides all the races and re-entrancy issues from us.
1503	 */
1504	err = (flags&MSG_TRUNC) ? skb->len : copied;
1505
1506out_free:
1507	skb_free_datagram(sk, skb);
1508out:
1509	return err;
1510}
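The PACKET_AUXDATA control message assembled above is retrieved with recvmsg() once the option has been turned on. A minimal sketch of the user-space side (buffer handling illustrative):

#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <linux/if_packet.h>

static ssize_t recv_with_auxdata(int fd, void *buf, size_t len)
{
	int one = 1;
	union {
		struct cmsghdr cm;
		char space[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
	} ctrl;
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct msghdr msg = {
		.msg_iov	= &iov,
		.msg_iovlen	= 1,
		.msg_control	= &ctrl,
		.msg_controllen	= sizeof(ctrl),
	};
	struct cmsghdr *cmsg;
	ssize_t n;

	setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, &one, sizeof(one));

	n = recvmsg(fd, &msg, 0);
	if (n < 0)
		return n;

	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		if (cmsg->cmsg_level == SOL_PACKET &&
		    cmsg->cmsg_type == PACKET_AUXDATA) {
			struct tpacket_auxdata aux;

			memcpy(&aux, CMSG_DATA(cmsg), sizeof(aux));
			/* aux.tp_len is the original, untruncated length. */
		}
	}
	return n;
}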
1511
1512static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1513			       int *uaddr_len, int peer)
1514{
1515	struct net_device *dev;
1516	struct sock *sk	= sock->sk;
1517
1518	if (peer)
1519		return -EOPNOTSUPP;
1520
1521	uaddr->sa_family = AF_PACKET;
1522	rcu_read_lock();
1523	dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
1524	if (dev)
1525		strlcpy(uaddr->sa_data, dev->name, 15);
1526	else
1527		memset(uaddr->sa_data, 0, 14);
1528	rcu_read_unlock();
1529	*uaddr_len = sizeof(*uaddr);
1530
1531	return 0;
1532}
1533
1534static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1535			  int *uaddr_len, int peer)
1536{
1537	struct net_device *dev;
1538	struct sock *sk = sock->sk;
1539	struct packet_sock *po = pkt_sk(sk);
1540	struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
1541
1542	if (peer)
1543		return -EOPNOTSUPP;
1544
1545	sll->sll_family = AF_PACKET;
1546	sll->sll_ifindex = po->ifindex;
1547	sll->sll_protocol = po->num;
1548	rcu_read_lock();
1549	dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1550	if (dev) {
1551		sll->sll_hatype = dev->type;
1552		sll->sll_halen = dev->addr_len;
1553		memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1554	} else {
1555		sll->sll_hatype = 0;	/* Bad: we have no ARPHRD_UNSPEC */
1556		sll->sll_halen = 0;
1557	}
1558	rcu_read_unlock();
1559	*uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1560
1561	return 0;
1562}
1563
1564static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
1565			 int what)
1566{
1567	switch (i->type) {
1568	case PACKET_MR_MULTICAST:
1569		if (what > 0)
1570			return dev_mc_add(dev, i->addr, i->alen, 0);
1571		else
1572			return dev_mc_delete(dev, i->addr, i->alen, 0);
1573		break;
1574	case PACKET_MR_PROMISC:
1575		return dev_set_promiscuity(dev, what);
1576		break;
1577	case PACKET_MR_ALLMULTI:
1578		return dev_set_allmulti(dev, what);
1579		break;
1580	case PACKET_MR_UNICAST:
1581		if (what > 0)
1582			return dev_unicast_add(dev, i->addr);
1583		else
1584			return dev_unicast_delete(dev, i->addr);
1585		break;
1586	default:
1587		break;
1588	}
1589	return 0;
1590}
1591
1592static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1593{
1594	for ( ; i; i = i->next) {
1595		if (i->ifindex == dev->ifindex)
1596			packet_dev_mc(dev, i, what);
1597	}
1598}
1599
1600static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1601{
1602	struct packet_sock *po = pkt_sk(sk);
1603	struct packet_mclist *ml, *i;
1604	struct net_device *dev;
1605	int err;
1606
1607	rtnl_lock();
1608
1609	err = -ENODEV;
1610	dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1611	if (!dev)
1612		goto done;
1613
1614	err = -EINVAL;
1615	if (mreq->mr_alen > dev->addr_len)
1616		goto done;
1617
1618	err = -ENOBUFS;
1619	i = kmalloc(sizeof(*i), GFP_KERNEL);
1620	if (i == NULL)
1621		goto done;
1622
1623	err = 0;
1624	for (ml = po->mclist; ml; ml = ml->next) {
1625		if (ml->ifindex == mreq->mr_ifindex &&
1626		    ml->type == mreq->mr_type &&
1627		    ml->alen == mreq->mr_alen &&
1628		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1629			ml->count++;
1630			/* Free the new element ... */
1631			kfree(i);
1632			goto done;
1633		}
1634	}
1635
1636	i->type = mreq->mr_type;
1637	i->ifindex = mreq->mr_ifindex;
1638	i->alen = mreq->mr_alen;
1639	memcpy(i->addr, mreq->mr_address, i->alen);
1640	i->count = 1;
1641	i->next = po->mclist;
1642	po->mclist = i;
1643	err = packet_dev_mc(dev, i, 1);
1644	if (err) {
1645		po->mclist = i->next;
1646		kfree(i);
1647	}
1648
1649done:
1650	rtnl_unlock();
1651	return err;
1652}
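The membership machinery above is normally driven by a single setsockopt() call; PACKET_MR_PROMISC, for instance, keeps the interface in promiscuous mode for as long as the socket holds the membership. A minimal sketch ("eth0" is a placeholder):

#include <string.h>
#include <net/if.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static int enable_promisc(int fd)
{
	struct packet_mreq mreq;

	memset(&mreq, 0, sizeof(mreq));
	mreq.mr_ifindex = if_nametoindex("eth0");
	mreq.mr_type	= PACKET_MR_PROMISC;

	return setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
			  &mreq, sizeof(mreq));
}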
1653
1654static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1655{
1656	struct packet_mclist *ml, **mlp;
1657
1658	rtnl_lock();
1659
1660	for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
1661		if (ml->ifindex == mreq->mr_ifindex &&
1662		    ml->type == mreq->mr_type &&
1663		    ml->alen == mreq->mr_alen &&
1664		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1665			if (--ml->count == 0) {
1666				struct net_device *dev;
1667				*mlp = ml->next;
1668				dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
1669				if (dev)
1670					packet_dev_mc(dev, ml, -1);
1671				kfree(ml);
1672			}
1673			rtnl_unlock();
1674			return 0;
1675		}
1676	}
1677	rtnl_unlock();
1678	return -EADDRNOTAVAIL;
1679}
1680
1681static void packet_flush_mclist(struct sock *sk)
1682{
1683	struct packet_sock *po = pkt_sk(sk);
1684	struct packet_mclist *ml;
1685
1686	if (!po->mclist)
1687		return;
1688
1689	rtnl_lock();
1690	while ((ml = po->mclist) != NULL) {
1691		struct net_device *dev;
1692
1693		po->mclist = ml->next;
1694		dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
1695		if (dev != NULL)
1696			packet_dev_mc(dev, ml, -1);
1697		kfree(ml);
1698	}
1699	rtnl_unlock();
1700}
1701
1702static int
1703packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1704{
1705	struct sock *sk = sock->sk;
1706	struct packet_sock *po = pkt_sk(sk);
1707	int ret;
1708
1709	if (level != SOL_PACKET)
1710		return -ENOPROTOOPT;
1711
1712	switch (optname) {
1713	case PACKET_ADD_MEMBERSHIP:
1714	case PACKET_DROP_MEMBERSHIP:
1715	{
1716		struct packet_mreq_max mreq;
1717		int len = optlen;
1718		memset(&mreq, 0, sizeof(mreq));
1719		if (len < sizeof(struct packet_mreq))
1720			return -EINVAL;
1721		if (len > sizeof(mreq))
1722			len = sizeof(mreq);
1723		if (copy_from_user(&mreq, optval, len))
1724			return -EFAULT;
1725		if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
1726			return -EINVAL;
1727		if (optname == PACKET_ADD_MEMBERSHIP)
1728			ret = packet_mc_add(sk, &mreq);
1729		else
1730			ret = packet_mc_drop(sk, &mreq);
1731		return ret;
1732	}
1733
1734#ifdef CONFIG_PACKET_MMAP
1735	case PACKET_RX_RING:
1736	case PACKET_TX_RING:
1737	{
1738		struct tpacket_req req;
1739
1740		if (optlen < sizeof(req))
1741			return -EINVAL;
1742		if (copy_from_user(&req, optval, sizeof(req)))
1743			return -EFAULT;
1744		return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING);
1745	}
1746	case PACKET_COPY_THRESH:
1747	{
1748		int val;
1749
1750		if (optlen != sizeof(val))
1751			return -EINVAL;
1752		if (copy_from_user(&val, optval, sizeof(val)))
1753			return -EFAULT;
1754
1755		pkt_sk(sk)->copy_thresh = val;
1756		return 0;
1757	}
1758	case PACKET_VERSION:
1759	{
1760		int val;
1761
1762		if (optlen != sizeof(val))
1763			return -EINVAL;
1764		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1765			return -EBUSY;
1766		if (copy_from_user(&val, optval, sizeof(val)))
1767			return -EFAULT;
1768		switch (val) {
1769		case TPACKET_V1:
1770		case TPACKET_V2:
1771			po->tp_version = val;
1772			return 0;
1773		default:
1774			return -EINVAL;
1775		}
1776	}
1777	case PACKET_RESERVE:
1778	{
1779		unsigned int val;
1780
1781		if (optlen != sizeof(val))
1782			return -EINVAL;
1783		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1784			return -EBUSY;
1785		if (copy_from_user(&val, optval, sizeof(val)))
1786			return -EFAULT;
1787		po->tp_reserve = val;
1788		return 0;
1789	}
1790	case PACKET_LOSS:
1791	{
1792		unsigned int val;
1793
1794		if (optlen != sizeof(val))
1795			return -EINVAL;
1796		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1797			return -EBUSY;
1798		if (copy_from_user(&val, optval, sizeof(val)))
1799			return -EFAULT;
1800		po->tp_loss = !!val;
1801		return 0;
1802	}
1803#endif
1804	case PACKET_AUXDATA:
1805	{
1806		int val;
1807
1808		if (optlen < sizeof(val))
1809			return -EINVAL;
1810		if (copy_from_user(&val, optval, sizeof(val)))
1811			return -EFAULT;
1812
1813		po->auxdata = !!val;
1814		return 0;
1815	}
1816	case PACKET_ORIGDEV:
1817	{
1818		int val;
1819
1820		if (optlen < sizeof(val))
1821			return -EINVAL;
1822		if (copy_from_user(&val, optval, sizeof(val)))
1823			return -EFAULT;
1824
1825		po->origdev = !!val;
1826		return 0;
1827	}
1828	default:
1829		return -ENOPROTOOPT;
1830	}
1831}
1832
1833static int packet_getsockopt(struct socket *sock, int level, int optname,
1834			     char __user *optval, int __user *optlen)
1835{
1836	int len;
1837	int val;
1838	struct sock *sk = sock->sk;
1839	struct packet_sock *po = pkt_sk(sk);
1840	void *data;
1841	struct tpacket_stats st;
1842
1843	if (level != SOL_PACKET)
1844		return -ENOPROTOOPT;
1845
1846	if (get_user(len, optlen))
1847		return -EFAULT;
1848
1849	if (len < 0)
1850		return -EINVAL;
1851
1852	switch (optname) {
1853	case PACKET_STATISTICS:
1854		if (len > sizeof(struct tpacket_stats))
1855			len = sizeof(struct tpacket_stats);
1856		spin_lock_bh(&sk->sk_receive_queue.lock);
1857		st = po->stats;
1858		memset(&po->stats, 0, sizeof(st));
1859		spin_unlock_bh(&sk->sk_receive_queue.lock);
1860		st.tp_packets += st.tp_drops;
1861
1862		data = &st;
1863		break;
1864	case PACKET_AUXDATA:
1865		if (len > sizeof(int))
1866			len = sizeof(int);
1867		val = po->auxdata;
1868
1869		data = &val;
1870		break;
1871	case PACKET_ORIGDEV:
1872		if (len > sizeof(int))
1873			len = sizeof(int);
1874		val = po->origdev;
1875
1876		data = &val;
1877		break;
1878#ifdef CONFIG_PACKET_MMAP
1879	case PACKET_VERSION:
1880		if (len > sizeof(int))
1881			len = sizeof(int);
1882		val = po->tp_version;
1883		data = &val;
1884		break;
1885	case PACKET_HDRLEN:
1886		if (len > sizeof(int))
1887			len = sizeof(int);
1888		if (copy_from_user(&val, optval, len))
1889			return -EFAULT;
1890		switch (val) {
1891		case TPACKET_V1:
1892			val = sizeof(struct tpacket_hdr);
1893			break;
1894		case TPACKET_V2:
1895			val = sizeof(struct tpacket2_hdr);
1896			break;
1897		default:
1898			return -EINVAL;
1899		}
1900		data = &val;
1901		break;
1902	case PACKET_RESERVE:
1903		if (len > sizeof(unsigned int))
1904			len = sizeof(unsigned int);
1905		val = po->tp_reserve;
1906		data = &val;
1907		break;
1908	case PACKET_LOSS:
1909		if (len > sizeof(unsigned int))
1910			len = sizeof(unsigned int);
1911		val = po->tp_loss;
1912		data = &val;
1913		break;
1914#endif
1915	default:
1916		return -ENOPROTOOPT;
1917	}
1918
1919	if (put_user(len, optlen))
1920		return -EFAULT;
1921	if (copy_to_user(optval, data, len))
1922		return -EFAULT;
1923	return 0;
1924}
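Reading PACKET_STATISTICS from user space returns the counters and resets them, with tp_packets already including tp_drops as done above. A minimal sketch:

#include <sys/socket.h>
#include <linux/if_packet.h>

static int get_packet_stats(int fd, struct tpacket_stats *st)
{
	socklen_t len = sizeof(*st);

	return getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, st, &len);
}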
1925
1926
1927static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
1928{
1929	struct sock *sk;
1930	struct hlist_node *node;
1931	struct net_device *dev = data;
1932	struct net *net = dev_net(dev);
1933
1934	read_lock(&net->packet.sklist_lock);
1935	sk_for_each(sk, node, &net->packet.sklist) {
1936		struct packet_sock *po = pkt_sk(sk);
1937
1938		switch (msg) {
1939		case NETDEV_UNREGISTER:
1940			if (po->mclist)
1941				packet_dev_mclist(dev, po->mclist, -1);
1942			/* fallthrough */
1943
1944		case NETDEV_DOWN:
1945			if (dev->ifindex == po->ifindex) {
1946				spin_lock(&po->bind_lock);
1947				if (po->running) {
1948					__dev_remove_pack(&po->prot_hook);
1949					__sock_put(sk);
1950					po->running = 0;
1951					sk->sk_err = ENETDOWN;
1952					if (!sock_flag(sk, SOCK_DEAD))
1953						sk->sk_error_report(sk);
1954				}
1955				if (msg == NETDEV_UNREGISTER) {
1956					po->ifindex = -1;
1957					po->prot_hook.dev = NULL;
1958				}
1959				spin_unlock(&po->bind_lock);
1960			}
1961			break;
1962		case NETDEV_UP:
1963			spin_lock(&po->bind_lock);
1964			if (dev->ifindex == po->ifindex && po->num &&
1965			    !po->running) {
1966				dev_add_pack(&po->prot_hook);
1967				sock_hold(sk);
1968				po->running = 1;
1969			}
1970			spin_unlock(&po->bind_lock);
1971			break;
1972		}
1973	}
1974	read_unlock(&net->packet.sklist_lock);
1975	return NOTIFY_DONE;
1976}
1977
1978
1979static int packet_ioctl(struct socket *sock, unsigned int cmd,
1980			unsigned long arg)
1981{
1982	struct sock *sk = sock->sk;
1983
1984	switch (cmd) {
1985	case SIOCOUTQ:
1986	{
1987		int amount = sk_wmem_alloc_get(sk);
1988
1989		return put_user(amount, (int __user *)arg);
1990	}
1991	case SIOCINQ:
1992	{
1993		struct sk_buff *skb;
1994		int amount = 0;
1995
1996		spin_lock_bh(&sk->sk_receive_queue.lock);
1997		skb = skb_peek(&sk->sk_receive_queue);
1998		if (skb)
1999			amount = skb->len;
2000		spin_unlock_bh(&sk->sk_receive_queue.lock);
2001		return put_user(amount, (int __user *)arg);
2002	}
2003	case SIOCGSTAMP:
2004		return sock_get_timestamp(sk, (struct timeval __user *)arg);
2005	case SIOCGSTAMPNS:
2006		return sock_get_timestampns(sk, (struct timespec __user *)arg);
2007
2008#ifdef CONFIG_INET
2009	case SIOCADDRT:
2010	case SIOCDELRT:
2011	case SIOCDARP:
2012	case SIOCGARP:
2013	case SIOCSARP:
2014	case SIOCGIFADDR:
2015	case SIOCSIFADDR:
2016	case SIOCGIFBRDADDR:
2017	case SIOCSIFBRDADDR:
2018	case SIOCGIFNETMASK:
2019	case SIOCSIFNETMASK:
2020	case SIOCGIFDSTADDR:
2021	case SIOCSIFDSTADDR:
2022	case SIOCSIFFLAGS:
2023		if (!net_eq(sock_net(sk), &init_net))
2024			return -ENOIOCTLCMD;
2025		return inet_dgram_ops.ioctl(sock, cmd, arg);
2026#endif
2027
2028	default:
2029		return -ENOIOCTLCMD;
2030	}
2031	return 0;
2032}
2033
2034#ifndef CONFIG_PACKET_MMAP
2035#define packet_mmap sock_no_mmap
2036#define packet_poll datagram_poll
2037#else
2038
2039static unsigned int packet_poll(struct file *file, struct socket *sock,
2040				poll_table *wait)
2041{
2042	struct sock *sk = sock->sk;
2043	struct packet_sock *po = pkt_sk(sk);
2044	unsigned int mask = datagram_poll(file, sock, wait);
2045
2046	spin_lock_bh(&sk->sk_receive_queue.lock);
2047	if (po->rx_ring.pg_vec) {
2048		if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
2049			mask |= POLLIN | POLLRDNORM;
2050	}
2051	spin_unlock_bh(&sk->sk_receive_queue.lock);
2052	spin_lock_bh(&sk->sk_write_queue.lock);
2053	if (po->tx_ring.pg_vec) {
2054		if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
2055			mask |= POLLOUT | POLLWRNORM;
2056	}
2057	spin_unlock_bh(&sk->sk_write_queue.lock);
2058	return mask;
2059}
2060
2061
2062/* Dirty? Well, I still have not found a better way to account
2063 * for user mmaps.
2064 */
2065
2066static void packet_mm_open(struct vm_area_struct *vma)
2067{
2068	struct file *file = vma->vm_file;
2069	struct socket *sock = file->private_data;
2070	struct sock *sk = sock->sk;
2071
2072	if (sk)
2073		atomic_inc(&pkt_sk(sk)->mapped);
2074}
2075
2076static void packet_mm_close(struct vm_area_struct *vma)
2077{
2078	struct file *file = vma->vm_file;
2079	struct socket *sock = file->private_data;
2080	struct sock *sk = sock->sk;
2081
2082	if (sk)
2083		atomic_dec(&pkt_sk(sk)->mapped);
2084}
2085
2086static const struct vm_operations_struct packet_mmap_ops = {
2087	.open	=	packet_mm_open,
2088	.close	=	packet_mm_close,
2089};
2090
2091static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
2092{
2093	int i;
2094
2095	for (i = 0; i < len; i++) {
2096		if (likely(pg_vec[i]))
2097			free_pages((unsigned long) pg_vec[i], order);
2098	}
2099	kfree(pg_vec);
2100}
2101
2102static inline char *alloc_one_pg_vec_page(unsigned long order)
2103{
2104	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN;
2105
2106	return (char *) __get_free_pages(gfp_flags, order);
2107}
2108
2109static char **alloc_pg_vec(struct tpacket_req *req, int order)
2110{
2111	unsigned int block_nr = req->tp_block_nr;
2112	char **pg_vec;
2113	int i;
2114
2115	pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
2116	if (unlikely(!pg_vec))
2117		goto out;
2118
2119	for (i = 0; i < block_nr; i++) {
2120		pg_vec[i] = alloc_one_pg_vec_page(order);
2121		if (unlikely(!pg_vec[i]))
2122			goto out_free_pgvec;
2123	}
2124
2125out:
2126	return pg_vec;
2127
2128out_free_pgvec:
2129	free_pg_vec(pg_vec, order, block_nr);
2130	pg_vec = NULL;
2131	goto out;
2132}
2133
2134static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
2135		int closing, int tx_ring)
2136{
2137	char **pg_vec = NULL;
2138	struct packet_sock *po = pkt_sk(sk);
2139	int was_running, order = 0;
2140	struct packet_ring_buffer *rb;
2141	struct sk_buff_head *rb_queue;
2142	__be16 num;
2143	int err;
2144
2145	rb = tx_ring ? &po->tx_ring : &po->rx_ring;
2146	rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
2147
2148	err = -EBUSY;
2149	if (!closing) {
2150		if (atomic_read(&po->mapped))
2151			goto out;
2152		if (atomic_read(&rb->pending))
2153			goto out;
2154	}
2155
2156	if (req->tp_block_nr) {
2157		/* Sanity tests and some calculations */
2158		err = -EBUSY;
2159		if (unlikely(rb->pg_vec))
2160			goto out;
2161
2162		switch (po->tp_version) {
2163		case TPACKET_V1:
2164			po->tp_hdrlen = TPACKET_HDRLEN;
2165			break;
2166		case TPACKET_V2:
2167			po->tp_hdrlen = TPACKET2_HDRLEN;
2168			break;
2169		}
2170
2171		err = -EINVAL;
2172		if (unlikely((int)req->tp_block_size <= 0))
2173			goto out;
2174		if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
2175			goto out;
2176		if (unlikely(req->tp_frame_size < po->tp_hdrlen +
2177					po->tp_reserve))
2178			goto out;
2179		if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
2180			goto out;
2181
2182		rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
2183		if (unlikely(rb->frames_per_block <= 0))
2184			goto out;
2185		if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
2186					req->tp_frame_nr))
2187			goto out;
2188
2189		err = -ENOMEM;
2190		order = get_order(req->tp_block_size);
2191		pg_vec = alloc_pg_vec(req, order);
2192		if (unlikely(!pg_vec))
2193			goto out;
2194	} else {
2195		/* No blocks requested: the ring is being torn down, so no
2196		 * frames may be requested either. */
2197		err = -EINVAL;
2198		if (unlikely(req->tp_frame_nr))
2199			goto out;
2200	}
2201
2202	lock_sock(sk);
2203
2204	/* Detach socket from network */
2205	spin_lock(&po->bind_lock);
2206	was_running = po->running;
2207	num = po->num;
2208	if (was_running) {
2209		__dev_remove_pack(&po->prot_hook);
2210		po->num = 0;
2211		po->running = 0;
2212		__sock_put(sk);
2213	}
2214	spin_unlock(&po->bind_lock);
2215
2216	synchronize_net();
2217
2218	err = -EBUSY;
2219	mutex_lock(&po->pg_vec_lock);
2220	if (closing || atomic_read(&po->mapped) == 0) {
2221		err = 0;
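/* XC(a, b): store b in a and evaluate to a's previous value (a swap helper). */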
2222#define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
2223		spin_lock_bh(&rb_queue->lock);
2224		pg_vec = XC(rb->pg_vec, pg_vec);
2225		rb->frame_max = (req->tp_frame_nr - 1);
2226		rb->head = 0;
2227		rb->frame_size = req->tp_frame_size;
2228		spin_unlock_bh(&rb_queue->lock);
2229
2230		order = XC(rb->pg_vec_order, order);
2231		req->tp_block_nr = XC(rb->pg_vec_len, req->tp_block_nr);
2232
2233		rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
2234		po->prot_hook.func = (po->rx_ring.pg_vec) ?
2235						tpacket_rcv : packet_rcv;
2236		skb_queue_purge(rb_queue);
2237#undef XC
2238		if (atomic_read(&po->mapped))
2239			pr_err("packet_mmap: vma is busy: %d\n",
2240			       atomic_read(&po->mapped));
2241	}
2242	mutex_unlock(&po->pg_vec_lock);
2243
2244	spin_lock(&po->bind_lock);
2245	if (was_running && !po->running) {
2246		sock_hold(sk);
2247		po->running = 1;
2248		po->num = num;
2249		dev_add_pack(&po->prot_hook);
2250	}
2251	spin_unlock(&po->bind_lock);
2252
2253	release_sock(sk);
2254
2255	if (pg_vec)
2256		free_pg_vec(pg_vec, order, req->tp_block_nr);
2257out:
2258	return err;
2259}
2260
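/*
 * Map the configured RX and TX rings into the caller's address space.  The
 * requested length must exactly match the combined ring size, each block
 * page is inserted with vm_insert_page(), and po->mapped is raised so that
 * packet_set_ring() refuses to release the ring while it is mapped.
 */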
2261static int packet_mmap(struct file *file, struct socket *sock,
2262		struct vm_area_struct *vma)
2263{
2264	struct sock *sk = sock->sk;
2265	struct packet_sock *po = pkt_sk(sk);
2266	unsigned long size, expected_size;
2267	struct packet_ring_buffer *rb;
2268	unsigned long start;
2269	int err = -EINVAL;
2270	int i;
2271
2272	if (vma->vm_pgoff)
2273		return -EINVAL;
2274
2275	mutex_lock(&po->pg_vec_lock);
2276
2277	expected_size = 0;
2278	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2279		if (rb->pg_vec) {
2280			expected_size += rb->pg_vec_len
2281						* rb->pg_vec_pages
2282						* PAGE_SIZE;
2283		}
2284	}
2285
2286	if (expected_size == 0)
2287		goto out;
2288
2289	size = vma->vm_end - vma->vm_start;
2290	if (size != expected_size)
2291		goto out;
2292
2293	start = vma->vm_start;
2294	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2295		if (rb->pg_vec == NULL)
2296			continue;
2297
2298		for (i = 0; i < rb->pg_vec_len; i++) {
2299			struct page *page = virt_to_page(rb->pg_vec[i]);
2300			int pg_num;
2301
2302			for (pg_num = 0; pg_num < rb->pg_vec_pages;
2303					pg_num++, page++) {
2304				err = vm_insert_page(vma, start, page);
2305				if (unlikely(err))
2306					goto out;
2307				start += PAGE_SIZE;
2308			}
2309		}
2310	}
2311
2312	atomic_inc(&po->mapped);
2313	vma->vm_ops = &packet_mmap_ops;
2314	err = 0;
2315
2316out:
2317	mutex_unlock(&po->pg_vec_lock);
2318	return err;
2319}
2320#endif
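/*
 * Illustrative userspace sketch (example values assumed, error handling and
 * headers omitted, not part of this module) of how the ring setup and
 * mapping above are typically driven through PACKET_RX_RING:
 *
 *	struct tpacket_req req = {
 *		.tp_block_size	= 4096,		// multiple of PAGE_SIZE
 *		.tp_frame_size	= 2048,		// TPACKET_ALIGNMENT aligned
 *		.tp_block_nr	= 64,
 *		.tp_frame_nr	= 128,		// frames_per_block * tp_block_nr
 *	};
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *	void *ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
 *			  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 */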
2321
2322
2323static const struct proto_ops packet_ops_spkt = {
2324	.family =	PF_PACKET,
2325	.owner =	THIS_MODULE,
2326	.release =	packet_release,
2327	.bind =		packet_bind_spkt,
2328	.connect =	sock_no_connect,
2329	.socketpair =	sock_no_socketpair,
2330	.accept =	sock_no_accept,
2331	.getname =	packet_getname_spkt,
2332	.poll =		datagram_poll,
2333	.ioctl =	packet_ioctl,
2334	.listen =	sock_no_listen,
2335	.shutdown =	sock_no_shutdown,
2336	.setsockopt =	sock_no_setsockopt,
2337	.getsockopt =	sock_no_getsockopt,
2338	.sendmsg =	packet_sendmsg_spkt,
2339	.recvmsg =	packet_recvmsg,
2340	.mmap =		sock_no_mmap,
2341	.sendpage =	sock_no_sendpage,
2342};
2343
2344static const struct proto_ops packet_ops = {
2345	.family =	PF_PACKET,
2346	.owner =	THIS_MODULE,
2347	.release =	packet_release,
2348	.bind =		packet_bind,
2349	.connect =	sock_no_connect,
2350	.socketpair =	sock_no_socketpair,
2351	.accept =	sock_no_accept,
2352	.getname =	packet_getname,
2353	.poll =		packet_poll,
2354	.ioctl =	packet_ioctl,
2355	.listen =	sock_no_listen,
2356	.shutdown =	sock_no_shutdown,
2357	.setsockopt =	packet_setsockopt,
2358	.getsockopt =	packet_getsockopt,
2359	.sendmsg =	packet_sendmsg,
2360	.recvmsg =	packet_recvmsg,
2361	.mmap =		packet_mmap,
2362	.sendpage =	sock_no_sendpage,
2363};
2364
2365static const struct net_proto_family packet_family_ops = {
2366	.family =	PF_PACKET,
2367	.create =	packet_create,
2368	.owner	=	THIS_MODULE,
2369};
2370
2371static struct notifier_block packet_netdev_notifier = {
2372	.notifier_call =	packet_notifier,
2373};
2374
2375#ifdef CONFIG_PROC_FS
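/* /proc/net/packet: one line of state for every packet socket in the netns. */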
2376static inline struct sock *packet_seq_idx(struct net *net, loff_t off)
2377{
2378	struct sock *s;
2379	struct hlist_node *node;
2380
2381	sk_for_each(s, node, &net->packet.sklist) {
2382		if (!off--)
2383			return s;
2384	}
2385	return NULL;
2386}
2387
2388static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
2389	__acquires(seq_file_net(seq)->packet.sklist_lock)
2390{
2391	struct net *net = seq_file_net(seq);
2392	read_lock(&net->packet.sklist_lock);
2393	return *pos ? packet_seq_idx(net, *pos - 1) : SEQ_START_TOKEN;
2394}
2395
2396static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2397{
2398	struct net *net = seq_file_net(seq);
2399	++*pos;
2400	return (v == SEQ_START_TOKEN)
2401		? sk_head(&net->packet.sklist)
2402		: sk_next((struct sock *)v);
2403}
2404
2405static void packet_seq_stop(struct seq_file *seq, void *v)
2406	__releases(seq_file_net(seq)->packet.sklist_lock)
2407{
2408	struct net *net = seq_file_net(seq);
2409	read_unlock(&net->packet.sklist_lock);
2410}
2411
2412static int packet_seq_show(struct seq_file *seq, void *v)
2413{
2414	if (v == SEQ_START_TOKEN)
2415		seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
2416	else {
2417		struct sock *s = v;
2418		const struct packet_sock *po = pkt_sk(s);
2419
2420		seq_printf(seq,
2421			   "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
2422			   s,
2423			   atomic_read(&s->sk_refcnt),
2424			   s->sk_type,
2425			   ntohs(po->num),
2426			   po->ifindex,
2427			   po->running,
2428			   atomic_read(&s->sk_rmem_alloc),
2429			   sock_i_uid(s),
2430			   sock_i_ino(s));
2431	}
2432
2433	return 0;
2434}
2435
2436static const struct seq_operations packet_seq_ops = {
2437	.start	= packet_seq_start,
2438	.next	= packet_seq_next,
2439	.stop	= packet_seq_stop,
2440	.show	= packet_seq_show,
2441};
2442
2443static int packet_seq_open(struct inode *inode, struct file *file)
2444{
2445	return seq_open_net(inode, file, &packet_seq_ops,
2446			    sizeof(struct seq_net_private));
2447}
2448
2449static const struct file_operations packet_seq_fops = {
2450	.owner		= THIS_MODULE,
2451	.open		= packet_seq_open,
2452	.read		= seq_read,
2453	.llseek		= seq_lseek,
2454	.release	= seq_release_net,
2455};
2456
2457#endif
2458
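/* Per-namespace init: socket list, its lock and the /proc/net/packet entry. */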
2459static int packet_net_init(struct net *net)
2460{
2461	rwlock_init(&net->packet.sklist_lock);
2462	INIT_HLIST_HEAD(&net->packet.sklist);
2463
2464	if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
2465		return -ENOMEM;
2466
2467	return 0;
2468}
2469
2470static void packet_net_exit(struct net *net)
2471{
2472	proc_net_remove(net, "packet");
2473}
2474
2475static struct pernet_operations packet_net_ops = {
2476	.init = packet_net_init,
2477	.exit = packet_net_exit,
2478};
2479
2480
2481static void __exit packet_exit(void)
2482{
2483	unregister_netdevice_notifier(&packet_netdev_notifier);
2484	unregister_pernet_subsys(&packet_net_ops);
2485	sock_unregister(PF_PACKET);
2486	proto_unregister(&packet_proto);
2487}
2488
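/* Register the packet proto, the PF_PACKET family, pernet ops and the
 * netdevice notifier.
 */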
2489static int __init packet_init(void)
2490{
2491	int rc = proto_register(&packet_proto, 0);
2492
2493	if (rc != 0)
2494		goto out;
2495
2496	sock_register(&packet_family_ops);
2497	register_pernet_subsys(&packet_net_ops);
2498	register_netdevice_notifier(&packet_netdev_notifier);
2499out:
2500	return rc;
2501}
2502
2503module_init(packet_init);
2504module_exit(packet_exit);
2505MODULE_LICENSE("GPL");
2506MODULE_ALIAS_NETPROTO(PF_PACKET);
2507