af_packet.c revision 69e3c75f4d541a6eb151b3ef91f34033cb3ad6e1
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		PACKET - implements raw packet sockets.
7 *
8 * Authors:	Ross Biro
9 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
11 *
12 * Fixes:
13 *		Alan Cox	:	verify_area() now used correctly
14 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
15 *		Alan Cox	:	tidied skbuff lists.
16 *		Alan Cox	:	Now uses generic datagram routines I
17 *					added. Also fixed the peek/read crash
18 *					from all old Linux datagram code.
19 *		Alan Cox	:	Uses the improved datagram code.
20 *		Alan Cox	:	Added NULL's for socket options.
21 *		Alan Cox	:	Re-commented the code.
22 *		Alan Cox	:	Use new kernel side addressing
23 *		Rob Janssen	:	Correct MTU usage.
24 *		Dave Platt	:	Counter leaks caused by incorrect
25 *					interrupt locking and some slightly
26 *					dubious gcc output. Can you read
27 *					compiler: it said _VOLATILE_
28 *	Richard Kooijman	:	Timestamp fixes.
29 *		Alan Cox	:	New buffers. Use sk->mac.raw.
30 *		Alan Cox	:	sendmsg/recvmsg support.
31 *		Alan Cox	:	Protocol setting support
32 *	Alexey Kuznetsov	:	Untied from IPv4 stack.
33 *	Cyrus Durgin		:	Fixed kerneld for kmod.
34 *	Michal Ostrowski        :       Module initialization cleanup.
35 *         Ulises Alonso        :       Frame number limit removal and
36 *                                      packet_set_ring memory leak.
37 *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
38 *					The convention is that longer addresses
39 *					will simply extend the hardware address
40 *					byte arrays at the end of sockaddr_ll
41 *					and packet_mreq.
42 *		Johann Baudy	:	Added TX RING.
43 *
44 *		This program is free software; you can redistribute it and/or
45 *		modify it under the terms of the GNU General Public License
46 *		as published by the Free Software Foundation; either version
47 *		2 of the License, or (at your option) any later version.
48 *
49 */
50
51#include <linux/types.h>
52#include <linux/mm.h>
53#include <linux/capability.h>
54#include <linux/fcntl.h>
55#include <linux/socket.h>
56#include <linux/in.h>
57#include <linux/inet.h>
58#include <linux/netdevice.h>
59#include <linux/if_packet.h>
60#include <linux/wireless.h>
61#include <linux/kernel.h>
62#include <linux/kmod.h>
63#include <net/net_namespace.h>
64#include <net/ip.h>
65#include <net/protocol.h>
66#include <linux/skbuff.h>
67#include <net/sock.h>
68#include <linux/errno.h>
69#include <linux/timer.h>
70#include <asm/system.h>
71#include <asm/uaccess.h>
72#include <asm/ioctls.h>
73#include <asm/page.h>
74#include <asm/cacheflush.h>
75#include <asm/io.h>
76#include <linux/proc_fs.h>
77#include <linux/seq_file.h>
78#include <linux/poll.h>
79#include <linux/module.h>
80#include <linux/init.h>
81#include <linux/mutex.h>
82
83#ifdef CONFIG_INET
84#include <net/inet_common.h>
85#endif
86
87/*
88   Assumptions:
89   - if a device has no dev->hard_header routine, it adds and removes the ll
90     header itself. In this case the ll header is invisible outside the device,
91     but higher levels should still reserve dev->hard_header_len.
92     Some devices are clever enough to reallocate the skb when the header
93     does not fit into the reserved space (tunnels); others are not
94     (PPP).
95   - the packet socket receives packets with the ll header already pulled off,
96     so SOCK_RAW has to push it back.
97
98On receive:
99-----------
100
101Incoming, dev->hard_header!=NULL
102   mac_header -> ll header
103   data       -> data
104
105Outgoing, dev->hard_header!=NULL
106   mac_header -> ll header
107   data       -> ll header
108
109Incoming, dev->hard_header==NULL
110   mac_header -> UNKNOWN position. It is very likely that it points to the ll
111		 header.  PPP does this, which is wrong, because it introduces
112		 asymmetry between the rx and tx paths.
113   data       -> data
114
115Outgoing, dev->hard_header==NULL
116   mac_header -> data. ll header is still not built!
117   data       -> data
118
119Summary
120  If dev->hard_header==NULL we are unlikely to restore a sensible ll header.
121
122
123On transmit:
124------------
125
126dev->hard_header != NULL
127   mac_header -> ll header
128   data       -> ll header
129
130dev->hard_header == NULL (ll header is added by device, we cannot control it)
131   mac_header -> data
132   data       -> data
133
134   We should set nh.raw on output to the correct position,
135   the packet classifier depends on it.
136 */
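/*
 * Editor's note, illustrative only (not part of the original file): what the
 * conventions above mean for a userspace application.  A SOCK_RAW packet
 * socket returns the frame with the link-layer header still in front of the
 * data, while SOCK_DGRAM returns only the payload and reports the ll address
 * through sockaddr_ll.  ETH_P_ALL is merely one possible protocol value.
 *
 *	int raw   = socket(AF_PACKET, SOCK_RAW,   htons(ETH_P_ALL));
 *	int dgram = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_ALL));
 *	unsigned char buf[2048];
 *	struct sockaddr_ll from;
 *	socklen_t len = sizeof(from);
 *
 *	recvfrom(raw,   buf, sizeof(buf), 0, (struct sockaddr *)&from, &len);
 *		- buf now starts with the ll (e.g. Ethernet) header
 *	recvfrom(dgram, buf, sizeof(buf), 0, (struct sockaddr *)&from, &len);
 *		- buf now starts with the network-layer payload
 */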
137
138/* Private packet socket structures. */
139
140struct packet_mclist
141{
142	struct packet_mclist	*next;
143	int			ifindex;
144	int			count;
145	unsigned short		type;
146	unsigned short		alen;
147	unsigned char		addr[MAX_ADDR_LEN];
148};
149/* identical to struct packet_mreq except it has
150 * a longer address field.
151 */
152struct packet_mreq_max
153{
154	int		mr_ifindex;
155	unsigned short	mr_type;
156	unsigned short	mr_alen;
157	unsigned char	mr_address[MAX_ADDR_LEN];
158};
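/*
 * Editor's note, illustrative only: how userspace typically drives the
 * membership interface handled further below; the interface name and
 * membership type are arbitrary examples.
 *
 *	struct packet_mreq mreq = {
 *		.mr_ifindex = if_nametoindex("eth0"),
 *		.mr_type    = PACKET_MR_PROMISC,
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq, sizeof(mreq));
 */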
159
160#ifdef CONFIG_PACKET_MMAP
161static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
162		int closing, int tx_ring);
163
164struct packet_ring_buffer {
165	char *			*pg_vec;
166	unsigned int		head;
167	unsigned int		frames_per_block;
168	unsigned int		frame_size;
169	unsigned int		frame_max;
170
171	unsigned int		pg_vec_order;
172	unsigned int		pg_vec_pages;
173	unsigned int		pg_vec_len;
174
175	atomic_t		pending;
176};
177
178struct packet_sock;
179static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
180#endif
181
182static void packet_flush_mclist(struct sock *sk);
183
184struct packet_sock {
185	/* struct sock has to be the first member of packet_sock */
186	struct sock		sk;
187	struct tpacket_stats	stats;
188#ifdef CONFIG_PACKET_MMAP
189	struct packet_ring_buffer	rx_ring;
190	struct packet_ring_buffer	tx_ring;
191	int			copy_thresh;
192#endif
193	struct packet_type	prot_hook;
194	spinlock_t		bind_lock;
195	struct mutex		pg_vec_lock;
196	unsigned int		running:1,	/* prot_hook is attached*/
197				auxdata:1,
198				origdev:1;
199	int			ifindex;	/* bound device		*/
200	__be16			num;
201	struct packet_mclist	*mclist;
202#ifdef CONFIG_PACKET_MMAP
203	atomic_t		mapped;
204	enum tpacket_versions	tp_version;
205	unsigned int		tp_hdrlen;
206	unsigned int		tp_reserve;
207	unsigned int		tp_loss:1;
208#endif
209};
210
211struct packet_skb_cb {
212	unsigned int origlen;
213	union {
214		struct sockaddr_pkt pkt;
215		struct sockaddr_ll ll;
216	} sa;
217};
218
219#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))
220
221#ifdef CONFIG_PACKET_MMAP
222
223static void __packet_set_status(struct packet_sock *po, void *frame, int status)
224{
225	union {
226		struct tpacket_hdr *h1;
227		struct tpacket2_hdr *h2;
228		void *raw;
229	} h;
230
231	h.raw = frame;
232	switch (po->tp_version) {
233	case TPACKET_V1:
234		h.h1->tp_status = status;
235		flush_dcache_page(virt_to_page(&h.h1->tp_status));
236		break;
237	case TPACKET_V2:
238		h.h2->tp_status = status;
239		flush_dcache_page(virt_to_page(&h.h2->tp_status));
240		break;
241	default:
242		printk(KERN_ERR "TPACKET version not supported\n");
243		BUG();
244	}
245
246	smp_wmb();
247}
248
249static int __packet_get_status(struct packet_sock *po, void *frame)
250{
251	union {
252		struct tpacket_hdr *h1;
253		struct tpacket2_hdr *h2;
254		void *raw;
255	} h;
256
257	smp_rmb();
258
259	h.raw = frame;
260	switch (po->tp_version) {
261	case TPACKET_V1:
262		flush_dcache_page(virt_to_page(&h.h1->tp_status));
263		return h.h1->tp_status;
264	case TPACKET_V2:
265		flush_dcache_page(virt_to_page(&h.h2->tp_status));
266		return h.h2->tp_status;
267	default:
268		printk(KERN_ERR "TPACKET version not supported\n");
269		BUG();
270		return 0;
271	}
272}
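/*
 * Editor's note, illustrative only: the status word manipulated above is the
 * kernel's half of a handshake with userspace.  A typical reader of a mapped
 * RX ring does roughly the following (TPACKET_V1 layout assumed, and
 * handle_frame() is a placeholder for application code):
 *
 *	struct tpacket_hdr *hdr = (struct tpacket_hdr *)frame;
 *
 *	if (hdr->tp_status & TP_STATUS_USER) {
 *		handle_frame((char *)frame + hdr->tp_mac, hdr->tp_snaplen);
 *		hdr->tp_status = TP_STATUS_KERNEL;	hand the frame back
 *	}
 */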
273
274static void *packet_lookup_frame(struct packet_sock *po,
275		struct packet_ring_buffer *rb,
276		unsigned int position,
277		int status)
278{
279	unsigned int pg_vec_pos, frame_offset;
280	union {
281		struct tpacket_hdr *h1;
282		struct tpacket2_hdr *h2;
283		void *raw;
284	} h;
285
286	pg_vec_pos = position / rb->frames_per_block;
287	frame_offset = position % rb->frames_per_block;
288
289	h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size);
290
291	if (status != __packet_get_status(po, h.raw))
292		return NULL;
293
294	return h.raw;
295}
296
297static inline void *packet_current_frame(struct packet_sock *po,
298		struct packet_ring_buffer *rb,
299		int status)
300{
301	return packet_lookup_frame(po, rb, rb->head, status);
302}
303
304static inline void *packet_previous_frame(struct packet_sock *po,
305		struct packet_ring_buffer *rb,
306		int status)
307{
308	unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
309	return packet_lookup_frame(po, rb, previous, status);
310}
311
312static inline void packet_increment_head(struct packet_ring_buffer *buff)
313{
314	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
315}
316
317#endif
318
319static inline struct packet_sock *pkt_sk(struct sock *sk)
320{
321	return (struct packet_sock *)sk;
322}
323
324static void packet_sock_destruct(struct sock *sk)
325{
326	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
327	WARN_ON(atomic_read(&sk->sk_wmem_alloc));
328
329	if (!sock_flag(sk, SOCK_DEAD)) {
330		printk("Attempt to release alive packet socket: %p\n", sk);
331		return;
332	}
333
334	sk_refcnt_debug_dec(sk);
335}
336
337
338static const struct proto_ops packet_ops;
339
340static const struct proto_ops packet_ops_spkt;
341
342static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,  struct packet_type *pt, struct net_device *orig_dev)
343{
344	struct sock *sk;
345	struct sockaddr_pkt *spkt;
346
347	/*
348	 *	When we registered the protocol we saved the socket in the data
349	 *	field for just this event.
350	 */
351
352	sk = pt->af_packet_priv;
353
354	/*
355	 *	Yank back the headers [hope the device set this
356	 *	right or kerboom...]
357	 *
358	 *	Incoming packets have ll header pulled,
359	 *	push it back.
360	 *
361	 *	For outgoing ones skb->data == skb_mac_header(skb),
362	 *	so this procedure is a no-op.
363	 */
364
365	if (skb->pkt_type == PACKET_LOOPBACK)
366		goto out;
367
368	if (dev_net(dev) != sock_net(sk))
369		goto out;
370
371	if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
372		goto oom;
373
374	/* drop any routing info */
375	dst_release(skb->dst);
376	skb->dst = NULL;
377
378	/* drop conntrack reference */
379	nf_reset(skb);
380
381	spkt = &PACKET_SKB_CB(skb)->sa.pkt;
382
383	skb_push(skb, skb->data - skb_mac_header(skb));
384
385	/*
386	 *	The SOCK_PACKET socket receives _all_ frames.
387	 */
388
389	spkt->spkt_family = dev->type;
390	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
391	spkt->spkt_protocol = skb->protocol;
392
393	/*
394	 *	Charge the memory to the socket. This is done specifically
395	 *	to prevent sockets from using up all the memory.
396	 */
397
398	if (sock_queue_rcv_skb(sk,skb) == 0)
399		return 0;
400
401out:
402	kfree_skb(skb);
403oom:
404	return 0;
405}
406
407
408/*
409 *	Output a raw packet to a device layer. This bypasses all the other
410 *	protocol layers and you must therefore supply it with a complete frame
411 */
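/*
 * Editor's note, illustrative only: the obsolete SOCK_PACKET transmit path
 * below is driven from userspace roughly like this; the interface name is an
 * arbitrary example and "frame" must already be a complete link-layer frame.
 *
 *	struct sockaddr_pkt spkt = { .spkt_family = AF_PACKET };
 *	strncpy((char *)spkt.spkt_device, "eth0", sizeof(spkt.spkt_device));
 *	spkt.spkt_protocol = htons(ETH_P_IP);
 *	sendto(fd, frame, frame_len, 0,
 *	       (struct sockaddr *)&spkt, sizeof(spkt));
 */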
412
413static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
414			       struct msghdr *msg, size_t len)
415{
416	struct sock *sk = sock->sk;
417	struct sockaddr_pkt *saddr=(struct sockaddr_pkt *)msg->msg_name;
418	struct sk_buff *skb;
419	struct net_device *dev;
420	__be16 proto=0;
421	int err;
422
423	/*
424	 *	Get and verify the address.
425	 */
426
427	if (saddr)
428	{
429		if (msg->msg_namelen < sizeof(struct sockaddr))
430			return(-EINVAL);
431		if (msg->msg_namelen==sizeof(struct sockaddr_pkt))
432			proto=saddr->spkt_protocol;
433	}
434	else
435		return(-ENOTCONN);	/* SOCK_PACKET must be sent giving an address */
436
437	/*
438	 *	Find the device first to size check it
439	 */
440
441	saddr->spkt_device[13] = 0;
442	dev = dev_get_by_name(sock_net(sk), saddr->spkt_device);
443	err = -ENODEV;
444	if (dev == NULL)
445		goto out_unlock;
446
447	err = -ENETDOWN;
448	if (!(dev->flags & IFF_UP))
449		goto out_unlock;
450
451	/*
452	 *	You may not queue a frame bigger than the mtu. This is the lowest level
453	 *	raw protocol and you must do your own fragmentation at this level.
454	 */
455
456	err = -EMSGSIZE;
457	if (len > dev->mtu + dev->hard_header_len)
458		goto out_unlock;
459
460	err = -ENOBUFS;
461	skb = sock_wmalloc(sk, len + LL_RESERVED_SPACE(dev), 0, GFP_KERNEL);
462
463	/*
464	 *	If the write buffer is full, then tough. At this level the user gets to
465	 *	deal with the problem - do your own algorithmic backoffs. That's far
466	 *	more flexible.
467	 */
468
469	if (skb == NULL)
470		goto out_unlock;
471
472	/*
473	 *	Fill it in
474	 */
475
476	/* FIXME: Save some space for broken drivers that write a
477	 * hard header at transmission time by themselves. PPP is the
478	 * notable one here. This should really be fixed at the driver level.
479	 */
480	skb_reserve(skb, LL_RESERVED_SPACE(dev));
481	skb_reset_network_header(skb);
482
483	/* Try to align data part correctly */
484	if (dev->header_ops) {
485		skb->data -= dev->hard_header_len;
486		skb->tail -= dev->hard_header_len;
487		if (len < dev->hard_header_len)
488			skb_reset_network_header(skb);
489	}
490
491	/* Returns -EFAULT on error */
492	err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
493	skb->protocol = proto;
494	skb->dev = dev;
495	skb->priority = sk->sk_priority;
496	if (err)
497		goto out_free;
498
499	/*
500	 *	Now send it
501	 */
502
503	dev_queue_xmit(skb);
504	dev_put(dev);
505	return(len);
506
507out_free:
508	kfree_skb(skb);
509out_unlock:
510	if (dev)
511		dev_put(dev);
512	return err;
513}
514
515static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
516				      unsigned int res)
517{
518	struct sk_filter *filter;
519
520	rcu_read_lock_bh();
521	filter = rcu_dereference(sk->sk_filter);
522	if (filter != NULL)
523		res = sk_run_filter(skb, filter->insns, filter->len);
524	rcu_read_unlock_bh();
525
526	return res;
527}
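/*
 * Editor's note, illustrative only: the filter consulted above is the classic
 * BPF program a user attaches with SO_ATTACH_FILTER.  The one-instruction
 * program below is a trivial example that accepts every packet in full.
 *
 *	struct sock_filter code[] = {
 *		{ BPF_RET | BPF_K, 0, 0, 0xffffffff },	accept, no truncation
 *	};
 *	struct sock_fprog prog = { .len = 1, .filter = code };
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
 */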
528
529/*
530   This function does lazy skb cloning in the hope that most of the packets
531   are discarded by BPF.
532
533   Note the tricky part: we DO mangle a shared skb! skb->data, skb->len
534   and skb->cb are mangled. It works because (and as long as) packets
535   falling here are owned by the current CPU. Output packets are cloned
536   by dev_queue_xmit_nit(), input packets are processed by net_bh
537   sequentially, so if we restore the skb to its original state on exit,
538   we will not harm anyone.
539 */
540
541static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
542{
543	struct sock *sk;
544	struct sockaddr_ll *sll;
545	struct packet_sock *po;
546	u8 * skb_head = skb->data;
547	int skb_len = skb->len;
548	unsigned int snaplen, res;
549
550	if (skb->pkt_type == PACKET_LOOPBACK)
551		goto drop;
552
553	sk = pt->af_packet_priv;
554	po = pkt_sk(sk);
555
556	if (dev_net(dev) != sock_net(sk))
557		goto drop;
558
559	skb->dev = dev;
560
561	if (dev->header_ops) {
562		/* The device has an explicit notion of ll header,
563		   exported to higher levels.
564
565		   Otherwise, the device hides the details of its frame
566		   structure, so the corresponding packet head is
567		   never delivered to the user.
568		 */
569		if (sk->sk_type != SOCK_DGRAM)
570			skb_push(skb, skb->data - skb_mac_header(skb));
571		else if (skb->pkt_type == PACKET_OUTGOING) {
572			/* Special case: outgoing packets have ll header at head */
573			skb_pull(skb, skb_network_offset(skb));
574		}
575	}
576
577	snaplen = skb->len;
578
579	res = run_filter(skb, sk, snaplen);
580	if (!res)
581		goto drop_n_restore;
582	if (snaplen > res)
583		snaplen = res;
584
585	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
586	    (unsigned)sk->sk_rcvbuf)
587		goto drop_n_acct;
588
589	if (skb_shared(skb)) {
590		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
591		if (nskb == NULL)
592			goto drop_n_acct;
593
594		if (skb_head != skb->data) {
595			skb->data = skb_head;
596			skb->len = skb_len;
597		}
598		kfree_skb(skb);
599		skb = nskb;
600	}
601
602	BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
603		     sizeof(skb->cb));
604
605	sll = &PACKET_SKB_CB(skb)->sa.ll;
606	sll->sll_family = AF_PACKET;
607	sll->sll_hatype = dev->type;
608	sll->sll_protocol = skb->protocol;
609	sll->sll_pkttype = skb->pkt_type;
610	if (unlikely(po->origdev))
611		sll->sll_ifindex = orig_dev->ifindex;
612	else
613		sll->sll_ifindex = dev->ifindex;
614
615	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
616
617	PACKET_SKB_CB(skb)->origlen = skb->len;
618
619	if (pskb_trim(skb, snaplen))
620		goto drop_n_acct;
621
622	skb_set_owner_r(skb, sk);
623	skb->dev = NULL;
624	dst_release(skb->dst);
625	skb->dst = NULL;
626
627	/* drop conntrack reference */
628	nf_reset(skb);
629
630	spin_lock(&sk->sk_receive_queue.lock);
631	po->stats.tp_packets++;
632	__skb_queue_tail(&sk->sk_receive_queue, skb);
633	spin_unlock(&sk->sk_receive_queue.lock);
634	sk->sk_data_ready(sk, skb->len);
635	return 0;
636
637drop_n_acct:
638	spin_lock(&sk->sk_receive_queue.lock);
639	po->stats.tp_drops++;
640	spin_unlock(&sk->sk_receive_queue.lock);
641
642drop_n_restore:
643	if (skb_head != skb->data && skb_shared(skb)) {
644		skb->data = skb_head;
645		skb->len = skb_len;
646	}
647drop:
648	consume_skb(skb);
649	return 0;
650}
651
652#ifdef CONFIG_PACKET_MMAP
653static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
654{
655	struct sock *sk;
656	struct packet_sock *po;
657	struct sockaddr_ll *sll;
658	union {
659		struct tpacket_hdr *h1;
660		struct tpacket2_hdr *h2;
661		void *raw;
662	} h;
663	u8 * skb_head = skb->data;
664	int skb_len = skb->len;
665	unsigned int snaplen, res;
666	unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
667	unsigned short macoff, netoff, hdrlen;
668	struct sk_buff *copy_skb = NULL;
669	struct timeval tv;
670	struct timespec ts;
671
672	if (skb->pkt_type == PACKET_LOOPBACK)
673		goto drop;
674
675	sk = pt->af_packet_priv;
676	po = pkt_sk(sk);
677
678	if (dev_net(dev) != sock_net(sk))
679		goto drop;
680
681	if (dev->header_ops) {
682		if (sk->sk_type != SOCK_DGRAM)
683			skb_push(skb, skb->data - skb_mac_header(skb));
684		else if (skb->pkt_type == PACKET_OUTGOING) {
685			/* Special case: outgoing packets have ll header at head */
686			skb_pull(skb, skb_network_offset(skb));
687		}
688	}
689
690	if (skb->ip_summed == CHECKSUM_PARTIAL)
691		status |= TP_STATUS_CSUMNOTREADY;
692
693	snaplen = skb->len;
694
695	res = run_filter(skb, sk, snaplen);
696	if (!res)
697		goto drop_n_restore;
698	if (snaplen > res)
699		snaplen = res;
700
701	if (sk->sk_type == SOCK_DGRAM) {
702		macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
703				  po->tp_reserve;
704	} else {
705		unsigned maclen = skb_network_offset(skb);
706		netoff = TPACKET_ALIGN(po->tp_hdrlen +
707				       (maclen < 16 ? 16 : maclen)) +
708			po->tp_reserve;
709		macoff = netoff - maclen;
710	}
711
712	if (macoff + snaplen > po->rx_ring.frame_size) {
713		if (po->copy_thresh &&
714		    atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
715		    (unsigned)sk->sk_rcvbuf) {
716			if (skb_shared(skb)) {
717				copy_skb = skb_clone(skb, GFP_ATOMIC);
718			} else {
719				copy_skb = skb_get(skb);
720				skb_head = skb->data;
721			}
722			if (copy_skb)
723				skb_set_owner_r(copy_skb, sk);
724		}
725		snaplen = po->rx_ring.frame_size - macoff;
726		if ((int)snaplen < 0)
727			snaplen = 0;
728	}
729
730	spin_lock(&sk->sk_receive_queue.lock);
731	h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL);
732	if (!h.raw)
733		goto ring_is_full;
734	packet_increment_head(&po->rx_ring);
735	po->stats.tp_packets++;
736	if (copy_skb) {
737		status |= TP_STATUS_COPY;
738		__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
739	}
740	if (!po->stats.tp_drops)
741		status &= ~TP_STATUS_LOSING;
742	spin_unlock(&sk->sk_receive_queue.lock);
743
744	skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
745
746	switch (po->tp_version) {
747	case TPACKET_V1:
748		h.h1->tp_len = skb->len;
749		h.h1->tp_snaplen = snaplen;
750		h.h1->tp_mac = macoff;
751		h.h1->tp_net = netoff;
752		if (skb->tstamp.tv64)
753			tv = ktime_to_timeval(skb->tstamp);
754		else
755			do_gettimeofday(&tv);
756		h.h1->tp_sec = tv.tv_sec;
757		h.h1->tp_usec = tv.tv_usec;
758		hdrlen = sizeof(*h.h1);
759		break;
760	case TPACKET_V2:
761		h.h2->tp_len = skb->len;
762		h.h2->tp_snaplen = snaplen;
763		h.h2->tp_mac = macoff;
764		h.h2->tp_net = netoff;
765		if (skb->tstamp.tv64)
766			ts = ktime_to_timespec(skb->tstamp);
767		else
768			getnstimeofday(&ts);
769		h.h2->tp_sec = ts.tv_sec;
770		h.h2->tp_nsec = ts.tv_nsec;
771		h.h2->tp_vlan_tci = skb->vlan_tci;
772		hdrlen = sizeof(*h.h2);
773		break;
774	default:
775		BUG();
776	}
777
778	sll = h.raw + TPACKET_ALIGN(hdrlen);
779	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
780	sll->sll_family = AF_PACKET;
781	sll->sll_hatype = dev->type;
782	sll->sll_protocol = skb->protocol;
783	sll->sll_pkttype = skb->pkt_type;
784	if (unlikely(po->origdev))
785		sll->sll_ifindex = orig_dev->ifindex;
786	else
787		sll->sll_ifindex = dev->ifindex;
788
789	__packet_set_status(po, h.raw, status);
790	smp_mb();
791	{
792		struct page *p_start, *p_end;
793		u8 *h_end = h.raw + macoff + snaplen - 1;
794
795		p_start = virt_to_page(h.raw);
796		p_end = virt_to_page(h_end);
797		while (p_start <= p_end) {
798			flush_dcache_page(p_start);
799			p_start++;
800		}
801	}
802
803	sk->sk_data_ready(sk, 0);
804
805drop_n_restore:
806	if (skb_head != skb->data && skb_shared(skb)) {
807		skb->data = skb_head;
808		skb->len = skb_len;
809	}
810drop:
811	kfree_skb(skb);
812	return 0;
813
814ring_is_full:
815	po->stats.tp_drops++;
816	spin_unlock(&sk->sk_receive_queue.lock);
817
818	sk->sk_data_ready(sk, 0);
819	kfree_skb(copy_skb);
820	goto drop_n_restore;
821}
822
823static void tpacket_destruct_skb(struct sk_buff *skb)
824{
825	struct packet_sock *po = pkt_sk(skb->sk);
826	void * ph;
827
828	BUG_ON(skb == NULL);
829
830	if (likely(po->tx_ring.pg_vec)) {
831		ph = skb_shinfo(skb)->destructor_arg;
832		BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
833		BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
834		atomic_dec(&po->tx_ring.pending);
835		__packet_set_status(po, ph, TP_STATUS_AVAILABLE);
836	}
837
838	sock_wfree(skb);
839}
840
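/*
 * Copy one TX ring frame into an skb: tp_len is read from the ring frame
 * header and checked against size_max, the hard header is built via
 * dev_hard_header() for SOCK_DGRAM or copied out of the frame for SOCK_RAW,
 * and the remaining frame data is attached page by page as paged fragments
 * that reference the ring memory itself.
 */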
841static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff * skb,
842		void * frame, struct net_device *dev, int size_max,
843		__be16 proto, unsigned char * addr)
844{
845	union {
846		struct tpacket_hdr *h1;
847		struct tpacket2_hdr *h2;
848		void *raw;
849	} ph;
850	int to_write, offset, len, tp_len, nr_frags, len_max;
851	struct socket *sock = po->sk.sk_socket;
852	struct page *page;
853	void *data;
854	int err;
855
856	ph.raw = frame;
857
858	skb->protocol = proto;
859	skb->dev = dev;
860	skb->priority = po->sk.sk_priority;
861	skb_shinfo(skb)->destructor_arg = ph.raw;
862
863	switch (po->tp_version) {
864	case TPACKET_V2:
865		tp_len = ph.h2->tp_len;
866		break;
867	default:
868		tp_len = ph.h1->tp_len;
869		break;
870	}
871	if (unlikely(tp_len > size_max)) {
872		printk(KERN_ERR "packet size is too long (%d > %d)\n",
873				tp_len, size_max);
874		return -EMSGSIZE;
875	}
876
877	skb_reserve(skb, LL_RESERVED_SPACE(dev));
878	skb_reset_network_header(skb);
879
880	data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
881	to_write = tp_len;
882
883	if (sock->type == SOCK_DGRAM) {
884		err = dev_hard_header(skb, dev, ntohs(proto), addr,
885				NULL, tp_len);
886		if (unlikely(err < 0))
887			return -EINVAL;
888	} else if (dev->hard_header_len ) {
889		/* net device doesn't like empty head */
890		if (unlikely(tp_len <= dev->hard_header_len)) {
891			printk(KERN_ERR "packet size is too short "
892					"(%d < %d)\n", tp_len,
893					dev->hard_header_len);
894			return -EINVAL;
895		}
896
897		skb_push(skb, dev->hard_header_len);
898		err = skb_store_bits(skb, 0, data,
899				dev->hard_header_len);
900		if (unlikely(err))
901			return err;
902
903		data += dev->hard_header_len;
904		to_write -= dev->hard_header_len;
905	}
906
907	err = -EFAULT;
908	page = virt_to_page(data);
909	offset = offset_in_page(data);
910	len_max = PAGE_SIZE - offset;
911	len = ((to_write > len_max) ? len_max : to_write);
912
913	skb->data_len = to_write;
914	skb->len += to_write;
915	skb->truesize += to_write;
916	atomic_add(to_write, &po->sk.sk_wmem_alloc);
917
918	while (likely(to_write)) {
919		nr_frags = skb_shinfo(skb)->nr_frags;
920
921		if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
922			printk(KERN_ERR "Packet exceed the number "
923					"of skb frags(%lu)\n",
924					MAX_SKB_FRAGS);
925			return -EFAULT;
926		}
927
928		flush_dcache_page(page);
929		get_page(page);
930		skb_fill_page_desc(skb,
931				nr_frags,
932				page++, offset, len);
933		to_write -= len;
934		offset = 0;
935		len_max = PAGE_SIZE;
936		len = ((to_write > len_max) ? len_max : to_write);
937	}
938
939	return tp_len;
940}
941
942static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
943{
944	struct socket *sock;
945	struct sk_buff *skb;
946	struct net_device *dev;
947	__be16 proto;
948	int ifindex, err, reserve = 0;
949	void * ph;
950	struct sockaddr_ll *saddr=(struct sockaddr_ll *)msg->msg_name;
951	int tp_len, size_max;
952	unsigned char *addr;
953	int len_sum = 0;
954	int status = 0;
955
956	sock = po->sk.sk_socket;
957
958	mutex_lock(&po->pg_vec_lock);
959
960	err = -EBUSY;
961	if (saddr == NULL) {
962		ifindex	= po->ifindex;
963		proto	= po->num;
964		addr	= NULL;
965	} else {
966		err = -EINVAL;
967		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
968			goto out;
969		if (msg->msg_namelen < (saddr->sll_halen
970					+ offsetof(struct sockaddr_ll,
971						sll_addr)))
972			goto out;
973		ifindex	= saddr->sll_ifindex;
974		proto	= saddr->sll_protocol;
975		addr	= saddr->sll_addr;
976	}
977
978	dev = dev_get_by_index(sock_net(&po->sk), ifindex);
979	err = -ENXIO;
980	if (unlikely(dev == NULL))
981		goto out;
982
983	reserve = dev->hard_header_len;
984
985	err = -ENETDOWN;
986	if (unlikely(!(dev->flags & IFF_UP)))
987		goto out_put;
988
989	size_max = po->tx_ring.frame_size
990		- sizeof(struct skb_shared_info)
991		- po->tp_hdrlen
992		- LL_ALLOCATED_SPACE(dev)
993		- sizeof(struct sockaddr_ll);
994
995	if (size_max > dev->mtu + reserve)
996		size_max = dev->mtu + reserve;
997
998	do {
999		ph = packet_current_frame(po, &po->tx_ring,
1000				TP_STATUS_SEND_REQUEST);
1001
1002		if (unlikely(ph == NULL)) {
1003			schedule();
1004			continue;
1005		}
1006
1007		status = TP_STATUS_SEND_REQUEST;
1008		skb = sock_alloc_send_skb(&po->sk,
1009				LL_ALLOCATED_SPACE(dev)
1010				+ sizeof(struct sockaddr_ll),
1011				0, &err);
1012
1013		if (unlikely(skb == NULL))
1014			goto out_status;
1015
1016		tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
1017				addr);
1018
1019		if (unlikely(tp_len < 0)) {
1020			if (po->tp_loss) {
1021				__packet_set_status(po, ph,
1022						TP_STATUS_AVAILABLE);
1023				packet_increment_head(&po->tx_ring);
1024				kfree_skb(skb);
1025				continue;
1026			} else {
1027				status = TP_STATUS_WRONG_FORMAT;
1028				err = tp_len;
1029				goto out_status;
1030			}
1031		}
1032
1033		skb->destructor = tpacket_destruct_skb;
1034		__packet_set_status(po, ph, TP_STATUS_SENDING);
1035		atomic_inc(&po->tx_ring.pending);
1036
1037		status = TP_STATUS_SEND_REQUEST;
1038		err = dev_queue_xmit(skb);
1039		if (unlikely(err > 0 && (err = net_xmit_errno(err)) != 0))
1040			goto out_xmit;
1041		packet_increment_head(&po->tx_ring);
1042		len_sum += tp_len;
1043	}
1044	while (likely((ph != NULL) || ((!(msg->msg_flags & MSG_DONTWAIT))
1045					&& (atomic_read(&po->tx_ring.pending))))
1046	      );
1047
1048	err = len_sum;
1049	goto out_put;
1050
1051out_xmit:
1052	skb->destructor = sock_wfree;
1053	atomic_dec(&po->tx_ring.pending);
1054out_status:
1055	__packet_set_status(po, ph, status);
1056	kfree_skb(skb);
1057out_put:
1058	dev_put(dev);
1059out:
1060	mutex_unlock(&po->pg_vec_lock);
1061	return err;
1062}
1063#endif
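/*
 * Editor's note, illustrative only: how a sender is expected to drive the
 * PACKET_TX_RING path above.  The socket is assumed to be bound to an
 * interface already, the sizes are arbitrary examples (assuming 4 KiB pages),
 * and build_frame() stands in for application code that fills in a complete
 * frame and its length.
 *
 *	struct tpacket_req req = {
 *		.tp_block_size = 4096, .tp_block_nr = 64,
 *		.tp_frame_size = 2048, .tp_frame_nr  = 128,
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_TX_RING, &req, sizeof(req));
 *	char *ring = mmap(NULL, req.tp_block_size * req.tp_block_nr,
 *			  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *
 *	struct tpacket_hdr *hdr =
 *		(struct tpacket_hdr *)(ring + i * req.tp_frame_size);
 *	if (hdr->tp_status == TP_STATUS_AVAILABLE) {
 *		build_frame((char *)hdr + TPACKET_HDRLEN -
 *			    sizeof(struct sockaddr_ll), &hdr->tp_len);
 *		hdr->tp_status = TP_STATUS_SEND_REQUEST;
 *		send(fd, NULL, 0, 0);		kick the kernel
 *	}
 */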
1064
1065static int packet_snd(struct socket *sock,
1066			  struct msghdr *msg, size_t len)
1067{
1068	struct sock *sk = sock->sk;
1069	struct sockaddr_ll *saddr=(struct sockaddr_ll *)msg->msg_name;
1070	struct sk_buff *skb;
1071	struct net_device *dev;
1072	__be16 proto;
1073	unsigned char *addr;
1074	int ifindex, err, reserve = 0;
1075
1076	/*
1077	 *	Get and verify the address.
1078	 */
1079
1080	if (saddr == NULL) {
1081		struct packet_sock *po = pkt_sk(sk);
1082
1083		ifindex	= po->ifindex;
1084		proto	= po->num;
1085		addr	= NULL;
1086	} else {
1087		err = -EINVAL;
1088		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
1089			goto out;
1090		if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
1091			goto out;
1092		ifindex	= saddr->sll_ifindex;
1093		proto	= saddr->sll_protocol;
1094		addr	= saddr->sll_addr;
1095	}
1096
1097
1098	dev = dev_get_by_index(sock_net(sk), ifindex);
1099	err = -ENXIO;
1100	if (dev == NULL)
1101		goto out_unlock;
1102	if (sock->type == SOCK_RAW)
1103		reserve = dev->hard_header_len;
1104
1105	err = -ENETDOWN;
1106	if (!(dev->flags & IFF_UP))
1107		goto out_unlock;
1108
1109	err = -EMSGSIZE;
1110	if (len > dev->mtu+reserve)
1111		goto out_unlock;
1112
1113	skb = sock_alloc_send_skb(sk, len + LL_ALLOCATED_SPACE(dev),
1114				msg->msg_flags & MSG_DONTWAIT, &err);
1115	if (skb==NULL)
1116		goto out_unlock;
1117
1118	skb_reserve(skb, LL_RESERVED_SPACE(dev));
1119	skb_reset_network_header(skb);
1120
1121	err = -EINVAL;
1122	if (sock->type == SOCK_DGRAM &&
1123	    dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len) < 0)
1124		goto out_free;
1125
1126	/* Returns -EFAULT on error */
1127	err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
1128	if (err)
1129		goto out_free;
1130
1131	skb->protocol = proto;
1132	skb->dev = dev;
1133	skb->priority = sk->sk_priority;
1134
1135	/*
1136	 *	Now send it
1137	 */
1138
1139	err = dev_queue_xmit(skb);
1140	if (err > 0 && (err = net_xmit_errno(err)) != 0)
1141		goto out_unlock;
1142
1143	dev_put(dev);
1144
1145	return(len);
1146
1147out_free:
1148	kfree_skb(skb);
1149out_unlock:
1150	if (dev)
1151		dev_put(dev);
1152out:
1153	return err;
1154}
1155
1156static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
1157		struct msghdr *msg, size_t len)
1158{
1159#ifdef CONFIG_PACKET_MMAP
1160	struct sock *sk = sock->sk;
1161	struct packet_sock *po = pkt_sk(sk);
1162	if (po->tx_ring.pg_vec)
1163		return tpacket_snd(po, msg);
1164	else
1165#endif
1166		return packet_snd(sock, msg, len);
1167}
1168
1169/*
1170 *	Close a PACKET socket. This is fairly simple. We immediately go
1171 *	to 'closed' state and remove our protocol entry in the device list.
1172 */
1173
1174static int packet_release(struct socket *sock)
1175{
1176	struct sock *sk = sock->sk;
1177	struct packet_sock *po;
1178	struct net *net;
1179#ifdef CONFIG_PACKET_MMAP
1180	struct tpacket_req req;
1181#endif
1182
1183	if (!sk)
1184		return 0;
1185
1186	net = sock_net(sk);
1187	po = pkt_sk(sk);
1188
1189	write_lock_bh(&net->packet.sklist_lock);
1190	sk_del_node_init(sk);
1191	sock_prot_inuse_add(net, sk->sk_prot, -1);
1192	write_unlock_bh(&net->packet.sklist_lock);
1193
1194	/*
1195	 *	Unhook packet receive handler.
1196	 */
1197
1198	if (po->running) {
1199		/*
1200		 *	Remove the protocol hook
1201		 */
1202		dev_remove_pack(&po->prot_hook);
1203		po->running = 0;
1204		po->num = 0;
1205		__sock_put(sk);
1206	}
1207
1208	packet_flush_mclist(sk);
1209
1210#ifdef CONFIG_PACKET_MMAP
1211	memset(&req, 0, sizeof(req));
1212
1213	if (po->rx_ring.pg_vec)
1214		packet_set_ring(sk, &req, 1, 0);
1215
1216	if (po->tx_ring.pg_vec)
1217		packet_set_ring(sk, &req, 1, 1);
1218#endif
1219
1220	/*
1221	 *	Now the socket is dead. No more input will appear.
1222	 */
1223
1224	sock_orphan(sk);
1225	sock->sk = NULL;
1226
1227	/* Purge queues */
1228
1229	skb_queue_purge(&sk->sk_receive_queue);
1230	sk_refcnt_debug_release(sk);
1231
1232	sock_put(sk);
1233	return 0;
1234}
1235
1236/*
1237 *	Attach a packet hook.
1238 */
1239
1240static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
1241{
1242	struct packet_sock *po = pkt_sk(sk);
1243	/*
1244	 *	Detach an existing hook if present.
1245	 */
1246
1247	lock_sock(sk);
1248
1249	spin_lock(&po->bind_lock);
1250	if (po->running) {
1251		__sock_put(sk);
1252		po->running = 0;
1253		po->num = 0;
1254		spin_unlock(&po->bind_lock);
1255		dev_remove_pack(&po->prot_hook);
1256		spin_lock(&po->bind_lock);
1257	}
1258
1259	po->num = protocol;
1260	po->prot_hook.type = protocol;
1261	po->prot_hook.dev = dev;
1262
1263	po->ifindex = dev ? dev->ifindex : 0;
1264
1265	if (protocol == 0)
1266		goto out_unlock;
1267
1268	if (!dev || (dev->flags & IFF_UP)) {
1269		dev_add_pack(&po->prot_hook);
1270		sock_hold(sk);
1271		po->running = 1;
1272	} else {
1273		sk->sk_err = ENETDOWN;
1274		if (!sock_flag(sk, SOCK_DEAD))
1275			sk->sk_error_report(sk);
1276	}
1277
1278out_unlock:
1279	spin_unlock(&po->bind_lock);
1280	release_sock(sk);
1281	return 0;
1282}
1283
1284/*
1285 *	Bind a packet socket to a device
1286 */
1287
1288static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1289{
1290	struct sock *sk=sock->sk;
1291	char name[15];
1292	struct net_device *dev;
1293	int err = -ENODEV;
1294
1295	/*
1296	 *	Check legality
1297	 */
1298
1299	if (addr_len != sizeof(struct sockaddr))
1300		return -EINVAL;
1301	strlcpy(name,uaddr->sa_data,sizeof(name));
1302
1303	dev = dev_get_by_name(sock_net(sk), name);
1304	if (dev) {
1305		err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
1306		dev_put(dev);
1307	}
1308	return err;
1309}
1310
1311static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1312{
1313	struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
1314	struct sock *sk=sock->sk;
1315	struct net_device *dev = NULL;
1316	int err;
1317
1318
1319	/*
1320	 *	Check legality
1321	 */
1322
1323	if (addr_len < sizeof(struct sockaddr_ll))
1324		return -EINVAL;
1325	if (sll->sll_family != AF_PACKET)
1326		return -EINVAL;
1327
1328	if (sll->sll_ifindex) {
1329		err = -ENODEV;
1330		dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1331		if (dev == NULL)
1332			goto out;
1333	}
1334	err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1335	if (dev)
1336		dev_put(dev);
1337
1338out:
1339	return err;
1340}
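/*
 * Editor's note, illustrative only: binding a packet socket to a single
 * interface through the call above; "eth0" is an arbitrary example.
 *
 *	struct sockaddr_ll sll = {
 *		.sll_family   = AF_PACKET,
 *		.sll_protocol = htons(ETH_P_ALL),
 *		.sll_ifindex  = if_nametoindex("eth0"),
 *	};
 *	bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 */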
1341
1342static struct proto packet_proto = {
1343	.name	  = "PACKET",
1344	.owner	  = THIS_MODULE,
1345	.obj_size = sizeof(struct packet_sock),
1346};
1347
1348/*
1349 *	Create a packet socket (SOCK_RAW, SOCK_DGRAM or SOCK_PACKET).
1350 */
1351
1352static int packet_create(struct net *net, struct socket *sock, int protocol)
1353{
1354	struct sock *sk;
1355	struct packet_sock *po;
1356	__be16 proto = (__force __be16)protocol; /* weird, but documented */
1357	int err;
1358
1359	if (!capable(CAP_NET_RAW))
1360		return -EPERM;
1361	if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
1362	    sock->type != SOCK_PACKET)
1363		return -ESOCKTNOSUPPORT;
1364
1365	sock->state = SS_UNCONNECTED;
1366
1367	err = -ENOBUFS;
1368	sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
1369	if (sk == NULL)
1370		goto out;
1371
1372	sock->ops = &packet_ops;
1373	if (sock->type == SOCK_PACKET)
1374		sock->ops = &packet_ops_spkt;
1375
1376	sock_init_data(sock, sk);
1377
1378	po = pkt_sk(sk);
1379	sk->sk_family = PF_PACKET;
1380	po->num = proto;
1381
1382	sk->sk_destruct = packet_sock_destruct;
1383	sk_refcnt_debug_inc(sk);
1384
1385	/*
1386	 *	Attach a protocol block
1387	 */
1388
1389	spin_lock_init(&po->bind_lock);
1390	mutex_init(&po->pg_vec_lock);
1391	po->prot_hook.func = packet_rcv;
1392
1393	if (sock->type == SOCK_PACKET)
1394		po->prot_hook.func = packet_rcv_spkt;
1395
1396	po->prot_hook.af_packet_priv = sk;
1397
1398	if (proto) {
1399		po->prot_hook.type = proto;
1400		dev_add_pack(&po->prot_hook);
1401		sock_hold(sk);
1402		po->running = 1;
1403	}
1404
1405	write_lock_bh(&net->packet.sklist_lock);
1406	sk_add_node(sk, &net->packet.sklist);
1407	sock_prot_inuse_add(net, &packet_proto, 1);
1408	write_unlock_bh(&net->packet.sklist_lock);
1409	return(0);
1410out:
1411	return err;
1412}
1413
1414/*
1415 *	Pull a packet from our receive queue and hand it to the user.
1416 *	If necessary we block.
1417 */
1418
1419static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
1420			  struct msghdr *msg, size_t len, int flags)
1421{
1422	struct sock *sk = sock->sk;
1423	struct sk_buff *skb;
1424	int copied, err;
1425	struct sockaddr_ll *sll;
1426
1427	err = -EINVAL;
1428	if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
1429		goto out;
1430
1431#if 0
1432	/* What error should we return now? EUNATTACH? */
1433	if (pkt_sk(sk)->ifindex < 0)
1434		return -ENODEV;
1435#endif
1436
1437	/*
1438	 *	Call the generic datagram receiver. This handles all sorts
1439	 *	of horrible races and re-entrancy so we can forget about it
1440	 *	in the protocol layers.
1441	 *
1442	 *	Now it will return ENETDOWN, if the device has just gone down,
1443	 *	but then it will block.
1444	 */
1445
1446	skb=skb_recv_datagram(sk,flags,flags&MSG_DONTWAIT,&err);
1447
1448	/*
1449	 *	An error occurred so return it. Because skb_recv_datagram()
1450	 *	handles the blocking, we don't need to see or worry about
1451	 *	blocking retries.
1452	 */
1453
1454	if (skb == NULL)
1455		goto out;
1456
1457	/*
1458	 *	If the address length field is there to be filled in, we fill
1459	 *	it in now.
1460	 */
1461
1462	sll = &PACKET_SKB_CB(skb)->sa.ll;
1463	if (sock->type == SOCK_PACKET)
1464		msg->msg_namelen = sizeof(struct sockaddr_pkt);
1465	else
1466		msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
1467
1468	/*
1469	 *	You lose any data beyond the buffer you gave. If it worries a
1470	 *	user program they can ask the device for its MTU anyway.
1471	 */
1472
1473	copied = skb->len;
1474	if (copied > len)
1475	{
1476		copied=len;
1477		msg->msg_flags|=MSG_TRUNC;
1478	}
1479
1480	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1481	if (err)
1482		goto out_free;
1483
1484	sock_recv_timestamp(msg, sk, skb);
1485
1486	if (msg->msg_name)
1487		memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
1488		       msg->msg_namelen);
1489
1490	if (pkt_sk(sk)->auxdata) {
1491		struct tpacket_auxdata aux;
1492
1493		aux.tp_status = TP_STATUS_USER;
1494		if (skb->ip_summed == CHECKSUM_PARTIAL)
1495			aux.tp_status |= TP_STATUS_CSUMNOTREADY;
1496		aux.tp_len = PACKET_SKB_CB(skb)->origlen;
1497		aux.tp_snaplen = skb->len;
1498		aux.tp_mac = 0;
1499		aux.tp_net = skb_network_offset(skb);
1500		aux.tp_vlan_tci = skb->vlan_tci;
1501
1502		put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
1503	}
1504
1505	/*
1506	 *	Free or return the buffer as appropriate. Again this
1507	 *	hides all the races and re-entrancy issues from us.
1508	 */
1509	err = (flags&MSG_TRUNC) ? skb->len : copied;
1510
1511out_free:
1512	skb_free_datagram(sk, skb);
1513out:
1514	return err;
1515}
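/*
 * Editor's note, illustrative only: reading the PACKET_AUXDATA control
 * message generated above.  Error handling and the data iovec setup are
 * omitted.
 *
 *	int one = 1;
 *	setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, &one, sizeof(one));
 *
 *	char control[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
 *	struct msghdr msg = { .msg_control = control,
 *			      .msg_controllen = sizeof(control), };
 *	recvmsg(fd, &msg, 0);
 *	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
 *	if (cmsg && cmsg->cmsg_level == SOL_PACKET &&
 *	    cmsg->cmsg_type == PACKET_AUXDATA) {
 *		struct tpacket_auxdata *aux = (void *)CMSG_DATA(cmsg);
 *		original length is aux->tp_len, captured is aux->tp_snaplen
 *	}
 */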
1516
1517static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1518			       int *uaddr_len, int peer)
1519{
1520	struct net_device *dev;
1521	struct sock *sk	= sock->sk;
1522
1523	if (peer)
1524		return -EOPNOTSUPP;
1525
1526	uaddr->sa_family = AF_PACKET;
1527	dev = dev_get_by_index(sock_net(sk), pkt_sk(sk)->ifindex);
1528	if (dev) {
1529		strlcpy(uaddr->sa_data, dev->name, 15);
1530		dev_put(dev);
1531	} else
1532		memset(uaddr->sa_data, 0, 14);
1533	*uaddr_len = sizeof(*uaddr);
1534
1535	return 0;
1536}
1537
1538static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1539			  int *uaddr_len, int peer)
1540{
1541	struct net_device *dev;
1542	struct sock *sk = sock->sk;
1543	struct packet_sock *po = pkt_sk(sk);
1544	struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
1545
1546	if (peer)
1547		return -EOPNOTSUPP;
1548
1549	sll->sll_family = AF_PACKET;
1550	sll->sll_ifindex = po->ifindex;
1551	sll->sll_protocol = po->num;
1552	dev = dev_get_by_index(sock_net(sk), po->ifindex);
1553	if (dev) {
1554		sll->sll_hatype = dev->type;
1555		sll->sll_halen = dev->addr_len;
1556		memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1557		dev_put(dev);
1558	} else {
1559		sll->sll_hatype = 0;	/* Bad: we have no ARPHRD_UNSPEC */
1560		sll->sll_halen = 0;
1561	}
1562	*uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1563
1564	return 0;
1565}
1566
1567static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
1568			 int what)
1569{
1570	switch (i->type) {
1571	case PACKET_MR_MULTICAST:
1572		if (what > 0)
1573			dev_mc_add(dev, i->addr, i->alen, 0);
1574		else
1575			dev_mc_delete(dev, i->addr, i->alen, 0);
1576		break;
1577	case PACKET_MR_PROMISC:
1578		return dev_set_promiscuity(dev, what);
1579		break;
1580	case PACKET_MR_ALLMULTI:
1581		return dev_set_allmulti(dev, what);
1582		break;
1583	default:;
1584	}
1585	return 0;
1586}
1587
1588static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1589{
1590	for ( ; i; i=i->next) {
1591		if (i->ifindex == dev->ifindex)
1592			packet_dev_mc(dev, i, what);
1593	}
1594}
1595
1596static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1597{
1598	struct packet_sock *po = pkt_sk(sk);
1599	struct packet_mclist *ml, *i;
1600	struct net_device *dev;
1601	int err;
1602
1603	rtnl_lock();
1604
1605	err = -ENODEV;
1606	dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1607	if (!dev)
1608		goto done;
1609
1610	err = -EINVAL;
1611	if (mreq->mr_alen > dev->addr_len)
1612		goto done;
1613
1614	err = -ENOBUFS;
1615	i = kmalloc(sizeof(*i), GFP_KERNEL);
1616	if (i == NULL)
1617		goto done;
1618
1619	err = 0;
1620	for (ml = po->mclist; ml; ml = ml->next) {
1621		if (ml->ifindex == mreq->mr_ifindex &&
1622		    ml->type == mreq->mr_type &&
1623		    ml->alen == mreq->mr_alen &&
1624		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1625			ml->count++;
1626			/* Free the new element ... */
1627			kfree(i);
1628			goto done;
1629		}
1630	}
1631
1632	i->type = mreq->mr_type;
1633	i->ifindex = mreq->mr_ifindex;
1634	i->alen = mreq->mr_alen;
1635	memcpy(i->addr, mreq->mr_address, i->alen);
1636	i->count = 1;
1637	i->next = po->mclist;
1638	po->mclist = i;
1639	err = packet_dev_mc(dev, i, 1);
1640	if (err) {
1641		po->mclist = i->next;
1642		kfree(i);
1643	}
1644
1645done:
1646	rtnl_unlock();
1647	return err;
1648}
1649
1650static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1651{
1652	struct packet_mclist *ml, **mlp;
1653
1654	rtnl_lock();
1655
1656	for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
1657		if (ml->ifindex == mreq->mr_ifindex &&
1658		    ml->type == mreq->mr_type &&
1659		    ml->alen == mreq->mr_alen &&
1660		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1661			if (--ml->count == 0) {
1662				struct net_device *dev;
1663				*mlp = ml->next;
1664				dev = dev_get_by_index(sock_net(sk), ml->ifindex);
1665				if (dev) {
1666					packet_dev_mc(dev, ml, -1);
1667					dev_put(dev);
1668				}
1669				kfree(ml);
1670			}
1671			rtnl_unlock();
1672			return 0;
1673		}
1674	}
1675	rtnl_unlock();
1676	return -EADDRNOTAVAIL;
1677}
1678
1679static void packet_flush_mclist(struct sock *sk)
1680{
1681	struct packet_sock *po = pkt_sk(sk);
1682	struct packet_mclist *ml;
1683
1684	if (!po->mclist)
1685		return;
1686
1687	rtnl_lock();
1688	while ((ml = po->mclist) != NULL) {
1689		struct net_device *dev;
1690
1691		po->mclist = ml->next;
1692		if ((dev = dev_get_by_index(sock_net(sk), ml->ifindex)) != NULL) {
1693			packet_dev_mc(dev, ml, -1);
1694			dev_put(dev);
1695		}
1696		kfree(ml);
1697	}
1698	rtnl_unlock();
1699}
1700
1701static int
1702packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen)
1703{
1704	struct sock *sk = sock->sk;
1705	struct packet_sock *po = pkt_sk(sk);
1706	int ret;
1707
1708	if (level != SOL_PACKET)
1709		return -ENOPROTOOPT;
1710
1711	switch (optname) {
1712	case PACKET_ADD_MEMBERSHIP:
1713	case PACKET_DROP_MEMBERSHIP:
1714	{
1715		struct packet_mreq_max mreq;
1716		int len = optlen;
1717		memset(&mreq, 0, sizeof(mreq));
1718		if (len < sizeof(struct packet_mreq))
1719			return -EINVAL;
1720		if (len > sizeof(mreq))
1721			len = sizeof(mreq);
1722		if (copy_from_user(&mreq,optval,len))
1723			return -EFAULT;
1724		if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
1725			return -EINVAL;
1726		if (optname == PACKET_ADD_MEMBERSHIP)
1727			ret = packet_mc_add(sk, &mreq);
1728		else
1729			ret = packet_mc_drop(sk, &mreq);
1730		return ret;
1731	}
1732
1733#ifdef CONFIG_PACKET_MMAP
1734	case PACKET_RX_RING:
1735	case PACKET_TX_RING:
1736	{
1737		struct tpacket_req req;
1738
1739		if (optlen<sizeof(req))
1740			return -EINVAL;
1741		if (copy_from_user(&req,optval,sizeof(req)))
1742			return -EFAULT;
1743		return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING);
1744	}
1745	case PACKET_COPY_THRESH:
1746	{
1747		int val;
1748
1749		if (optlen!=sizeof(val))
1750			return -EINVAL;
1751		if (copy_from_user(&val,optval,sizeof(val)))
1752			return -EFAULT;
1753
1754		pkt_sk(sk)->copy_thresh = val;
1755		return 0;
1756	}
1757	case PACKET_VERSION:
1758	{
1759		int val;
1760
1761		if (optlen != sizeof(val))
1762			return -EINVAL;
1763		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1764			return -EBUSY;
1765		if (copy_from_user(&val, optval, sizeof(val)))
1766			return -EFAULT;
1767		switch (val) {
1768		case TPACKET_V1:
1769		case TPACKET_V2:
1770			po->tp_version = val;
1771			return 0;
1772		default:
1773			return -EINVAL;
1774		}
1775	}
1776	case PACKET_RESERVE:
1777	{
1778		unsigned int val;
1779
1780		if (optlen != sizeof(val))
1781			return -EINVAL;
1782		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1783			return -EBUSY;
1784		if (copy_from_user(&val, optval, sizeof(val)))
1785			return -EFAULT;
1786		po->tp_reserve = val;
1787		return 0;
1788	}
1789	case PACKET_LOSS:
1790	{
1791		unsigned int val;
1792
1793		if (optlen != sizeof(val))
1794			return -EINVAL;
1795		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1796			return -EBUSY;
1797		if (copy_from_user(&val, optval, sizeof(val)))
1798			return -EFAULT;
1799		po->tp_loss = !!val;
1800		return 0;
1801	}
1802#endif
1803	case PACKET_AUXDATA:
1804	{
1805		int val;
1806
1807		if (optlen < sizeof(val))
1808			return -EINVAL;
1809		if (copy_from_user(&val, optval, sizeof(val)))
1810			return -EFAULT;
1811
1812		po->auxdata = !!val;
1813		return 0;
1814	}
1815	case PACKET_ORIGDEV:
1816	{
1817		int val;
1818
1819		if (optlen < sizeof(val))
1820			return -EINVAL;
1821		if (copy_from_user(&val, optval, sizeof(val)))
1822			return -EFAULT;
1823
1824		po->origdev = !!val;
1825		return 0;
1826	}
1827	default:
1828		return -ENOPROTOOPT;
1829	}
1830}
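/*
 * Editor's note, illustrative only: PACKET_VERSION, PACKET_RESERVE and
 * PACKET_LOSS above return -EBUSY once a ring exists, so they have to be set
 * before PACKET_RX_RING / PACKET_TX_RING, e.g.:
 *
 *	int v = TPACKET_V2;
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &v, sizeof(v));
 */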
1831
1832static int packet_getsockopt(struct socket *sock, int level, int optname,
1833			     char __user *optval, int __user *optlen)
1834{
1835	int len;
1836	int val;
1837	struct sock *sk = sock->sk;
1838	struct packet_sock *po = pkt_sk(sk);
1839	void *data;
1840	struct tpacket_stats st;
1841
1842	if (level != SOL_PACKET)
1843		return -ENOPROTOOPT;
1844
1845	if (get_user(len, optlen))
1846		return -EFAULT;
1847
1848	if (len < 0)
1849		return -EINVAL;
1850
1851	switch (optname) {
1852	case PACKET_STATISTICS:
1853		if (len > sizeof(struct tpacket_stats))
1854			len = sizeof(struct tpacket_stats);
1855		spin_lock_bh(&sk->sk_receive_queue.lock);
1856		st = po->stats;
1857		memset(&po->stats, 0, sizeof(st));
1858		spin_unlock_bh(&sk->sk_receive_queue.lock);
1859		st.tp_packets += st.tp_drops;
1860
1861		data = &st;
1862		break;
1863	case PACKET_AUXDATA:
1864		if (len > sizeof(int))
1865			len = sizeof(int);
1866		val = po->auxdata;
1867
1868		data = &val;
1869		break;
1870	case PACKET_ORIGDEV:
1871		if (len > sizeof(int))
1872			len = sizeof(int);
1873		val = po->origdev;
1874
1875		data = &val;
1876		break;
1877#ifdef CONFIG_PACKET_MMAP
1878	case PACKET_VERSION:
1879		if (len > sizeof(int))
1880			len = sizeof(int);
1881		val = po->tp_version;
1882		data = &val;
1883		break;
1884	case PACKET_HDRLEN:
1885		if (len > sizeof(int))
1886			len = sizeof(int);
1887		if (copy_from_user(&val, optval, len))
1888			return -EFAULT;
1889		switch (val) {
1890		case TPACKET_V1:
1891			val = sizeof(struct tpacket_hdr);
1892			break;
1893		case TPACKET_V2:
1894			val = sizeof(struct tpacket2_hdr);
1895			break;
1896		default:
1897			return -EINVAL;
1898		}
1899		data = &val;
1900		break;
1901	case PACKET_RESERVE:
1902		if (len > sizeof(unsigned int))
1903			len = sizeof(unsigned int);
1904		val = po->tp_reserve;
1905		data = &val;
1906		break;
1907	case PACKET_LOSS:
1908		if (len > sizeof(unsigned int))
1909			len = sizeof(unsigned int);
1910		val = po->tp_loss;
1911		data = &val;
1912		break;
1913#endif
1914	default:
1915		return -ENOPROTOOPT;
1916	}
1917
1918	if (put_user(len, optlen))
1919		return -EFAULT;
1920	if (copy_to_user(optval, data, len))
1921		return -EFAULT;
1922	return 0;
1923}
1924
1925
1926static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
1927{
1928	struct sock *sk;
1929	struct hlist_node *node;
1930	struct net_device *dev = data;
1931	struct net *net = dev_net(dev);
1932
1933	read_lock(&net->packet.sklist_lock);
1934	sk_for_each(sk, node, &net->packet.sklist) {
1935		struct packet_sock *po = pkt_sk(sk);
1936
1937		switch (msg) {
1938		case NETDEV_UNREGISTER:
1939			if (po->mclist)
1940				packet_dev_mclist(dev, po->mclist, -1);
1941			/* fallthrough */
1942
1943		case NETDEV_DOWN:
1944			if (dev->ifindex == po->ifindex) {
1945				spin_lock(&po->bind_lock);
1946				if (po->running) {
1947					__dev_remove_pack(&po->prot_hook);
1948					__sock_put(sk);
1949					po->running = 0;
1950					sk->sk_err = ENETDOWN;
1951					if (!sock_flag(sk, SOCK_DEAD))
1952						sk->sk_error_report(sk);
1953				}
1954				if (msg == NETDEV_UNREGISTER) {
1955					po->ifindex = -1;
1956					po->prot_hook.dev = NULL;
1957				}
1958				spin_unlock(&po->bind_lock);
1959			}
1960			break;
1961		case NETDEV_UP:
1962			spin_lock(&po->bind_lock);
1963			if (dev->ifindex == po->ifindex && po->num &&
1964			    !po->running) {
1965				dev_add_pack(&po->prot_hook);
1966				sock_hold(sk);
1967				po->running = 1;
1968			}
1969			spin_unlock(&po->bind_lock);
1970			break;
1971		}
1972	}
1973	read_unlock(&net->packet.sklist_lock);
1974	return NOTIFY_DONE;
1975}
1976
1977
1978static int packet_ioctl(struct socket *sock, unsigned int cmd,
1979			unsigned long arg)
1980{
1981	struct sock *sk = sock->sk;
1982
1983	switch (cmd) {
1984		case SIOCOUTQ:
1985		{
1986			int amount = atomic_read(&sk->sk_wmem_alloc);
1987			return put_user(amount, (int __user *)arg);
1988		}
1989		case SIOCINQ:
1990		{
1991			struct sk_buff *skb;
1992			int amount = 0;
1993
1994			spin_lock_bh(&sk->sk_receive_queue.lock);
1995			skb = skb_peek(&sk->sk_receive_queue);
1996			if (skb)
1997				amount = skb->len;
1998			spin_unlock_bh(&sk->sk_receive_queue.lock);
1999			return put_user(amount, (int __user *)arg);
2000		}
2001		case SIOCGSTAMP:
2002			return sock_get_timestamp(sk, (struct timeval __user *)arg);
2003		case SIOCGSTAMPNS:
2004			return sock_get_timestampns(sk, (struct timespec __user *)arg);
2005
2006#ifdef CONFIG_INET
2007		case SIOCADDRT:
2008		case SIOCDELRT:
2009		case SIOCDARP:
2010		case SIOCGARP:
2011		case SIOCSARP:
2012		case SIOCGIFADDR:
2013		case SIOCSIFADDR:
2014		case SIOCGIFBRDADDR:
2015		case SIOCSIFBRDADDR:
2016		case SIOCGIFNETMASK:
2017		case SIOCSIFNETMASK:
2018		case SIOCGIFDSTADDR:
2019		case SIOCSIFDSTADDR:
2020		case SIOCSIFFLAGS:
2021			if (!net_eq(sock_net(sk), &init_net))
2022				return -ENOIOCTLCMD;
2023			return inet_dgram_ops.ioctl(sock, cmd, arg);
2024#endif
2025
2026		default:
2027			return -ENOIOCTLCMD;
2028	}
2029	return 0;
2030}
2031
2032#ifndef CONFIG_PACKET_MMAP
2033#define packet_mmap sock_no_mmap
2034#define packet_poll datagram_poll
2035#else
2036
2037static unsigned int packet_poll(struct file * file, struct socket *sock,
2038				poll_table *wait)
2039{
2040	struct sock *sk = sock->sk;
2041	struct packet_sock *po = pkt_sk(sk);
2042	unsigned int mask = datagram_poll(file, sock, wait);
2043
2044	spin_lock_bh(&sk->sk_receive_queue.lock);
2045	if (po->rx_ring.pg_vec) {
2046		if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
2047			mask |= POLLIN | POLLRDNORM;
2048	}
2049	spin_unlock_bh(&sk->sk_receive_queue.lock);
2050	spin_lock_bh(&sk->sk_write_queue.lock);
2051	if (po->tx_ring.pg_vec) {
2052		if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
2053			mask |= POLLOUT | POLLWRNORM;
2054	}
2055	spin_unlock_bh(&sk->sk_write_queue.lock);
2056	return mask;
2057}
2058
2059
2060/* Dirty? Well, I still have not learned a better way to account
2061 * for user mmaps.
2062 */
2063
2064static void packet_mm_open(struct vm_area_struct *vma)
2065{
2066	struct file *file = vma->vm_file;
2067	struct socket * sock = file->private_data;
2068	struct sock *sk = sock->sk;
2069
2070	if (sk)
2071		atomic_inc(&pkt_sk(sk)->mapped);
2072}
2073
2074static void packet_mm_close(struct vm_area_struct *vma)
2075{
2076	struct file *file = vma->vm_file;
2077	struct socket * sock = file->private_data;
2078	struct sock *sk = sock->sk;
2079
2080	if (sk)
2081		atomic_dec(&pkt_sk(sk)->mapped);
2082}
2083
2084static struct vm_operations_struct packet_mmap_ops = {
2085	.open =	packet_mm_open,
2086	.close =packet_mm_close,
2087};
2088
2089static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
2090{
2091	int i;
2092
2093	for (i = 0; i < len; i++) {
2094		if (likely(pg_vec[i]))
2095			free_pages((unsigned long) pg_vec[i], order);
2096	}
2097	kfree(pg_vec);
2098}
2099
2100static inline char *alloc_one_pg_vec_page(unsigned long order)
2101{
2102	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN;
2103
2104	return (char *) __get_free_pages(gfp_flags, order);
2105}
2106
2107static char **alloc_pg_vec(struct tpacket_req *req, int order)
2108{
2109	unsigned int block_nr = req->tp_block_nr;
2110	char **pg_vec;
2111	int i;
2112
2113	pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
2114	if (unlikely(!pg_vec))
2115		goto out;
2116
2117	for (i = 0; i < block_nr; i++) {
2118		pg_vec[i] = alloc_one_pg_vec_page(order);
2119		if (unlikely(!pg_vec[i]))
2120			goto out_free_pgvec;
2121	}
2122
2123out:
2124	return pg_vec;
2125
2126out_free_pgvec:
2127	free_pg_vec(pg_vec, order, block_nr);
2128	pg_vec = NULL;
2129	goto out;
2130}
2131
2132static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
2133		int closing, int tx_ring)
2134{
2135	char **pg_vec = NULL;
2136	struct packet_sock *po = pkt_sk(sk);
2137	int was_running, order = 0;
2138	struct packet_ring_buffer *rb;
2139	struct sk_buff_head *rb_queue;
2140	__be16 num;
2141	int err;
2142
2143	rb = tx_ring ? &po->tx_ring : &po->rx_ring;
2144	rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
2145
2146	err = -EBUSY;
2147	if (!closing) {
2148		if (atomic_read(&po->mapped))
2149			goto out;
2150		if (atomic_read(&rb->pending))
2151			goto out;
2152	}
2153
2154	if (req->tp_block_nr) {
2155		/* Sanity tests and some calculations */
2156		err = -EBUSY;
2157		if (unlikely(rb->pg_vec))
2158			goto out;
2159
2160		switch (po->tp_version) {
2161		case TPACKET_V1:
2162			po->tp_hdrlen = TPACKET_HDRLEN;
2163			break;
2164		case TPACKET_V2:
2165			po->tp_hdrlen = TPACKET2_HDRLEN;
2166			break;
2167		}
2168
2169		err = -EINVAL;
2170		if (unlikely((int)req->tp_block_size <= 0))
2171			goto out;
2172		if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
2173			goto out;
2174		if (unlikely(req->tp_frame_size < po->tp_hdrlen +
2175					po->tp_reserve))
2176			goto out;
2177		if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
2178			goto out;
2179
2180		rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
2181		if (unlikely(rb->frames_per_block <= 0))
2182			goto out;
2183		if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
2184					req->tp_frame_nr))
2185			goto out;
2186
2187		err = -ENOMEM;
2188		order = get_order(req->tp_block_size);
2189		pg_vec = alloc_pg_vec(req, order);
2190		if (unlikely(!pg_vec))
2191			goto out;
2192	}
2193	/* Done */
2194	else {
2195		err = -EINVAL;
2196		if (unlikely(req->tp_frame_nr))
2197			goto out;
2198	}
2199
2200	lock_sock(sk);
2201
2202	/* Detach socket from network */
2203	spin_lock(&po->bind_lock);
2204	was_running = po->running;
2205	num = po->num;
2206	if (was_running) {
2207		__dev_remove_pack(&po->prot_hook);
2208		po->num = 0;
2209		po->running = 0;
2210		__sock_put(sk);
2211	}
2212	spin_unlock(&po->bind_lock);
2213
2214	synchronize_net();
2215
2216	err = -EBUSY;
2217	mutex_lock(&po->pg_vec_lock);
2218	if (closing || atomic_read(&po->mapped) == 0) {
2219		err = 0;
2220#define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
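		/*
		 * XC() exchanges the ring's current value with the freshly
		 * prepared one and hands back the old value, so the previous
		 * pg_vec, page order and block count can be released further
		 * down, outside the queue lock.
		 */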
2221		spin_lock_bh(&rb_queue->lock);
2222		pg_vec = XC(rb->pg_vec, pg_vec);
2223		rb->frame_max = (req->tp_frame_nr - 1);
2224		rb->head = 0;
2225		rb->frame_size = req->tp_frame_size;
2226		spin_unlock_bh(&rb_queue->lock);
2227
2228		order = XC(rb->pg_vec_order, order);
2229		req->tp_block_nr = XC(rb->pg_vec_len, req->tp_block_nr);
2230
2231		rb->pg_vec_pages = req->tp_block_size / PAGE_SIZE;
2232		po->prot_hook.func = (po->rx_ring.pg_vec) ?
2233						tpacket_rcv : packet_rcv;
2234		skb_queue_purge(rb_queue);
2235#undef XC
2236		if (atomic_read(&po->mapped))
2237			printk(KERN_DEBUG "packet_mmap: vma is busy: %d\n",
2238						atomic_read(&po->mapped));
2239	}
2240	mutex_unlock(&po->pg_vec_lock);
2241
2242	spin_lock(&po->bind_lock);
2243	if (was_running && !po->running) {
2244		sock_hold(sk);
2245		po->running = 1;
2246		po->num = num;
2247		dev_add_pack(&po->prot_hook);
2248	}
2249	spin_unlock(&po->bind_lock);
2250
2251	release_sock(sk);
2252
2253	if (pg_vec)
2254		free_pg_vec(pg_vec, order, req->tp_block_nr);
2255out:
2256	return err;
2257}
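/*
 * packet_set_ring() is reached from packet_setsockopt() for PACKET_RX_RING
 * and PACKET_TX_RING.  An illustrative userspace sequence (not part of this
 * file; includes and error handling omitted) that satisfies the checks above
 * on a 4 KiB-page system:
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	struct tpacket_req req = {
 *		.tp_block_size = 4096,	// multiple of PAGE_SIZE
 *		.tp_frame_size = 2048,	// TPACKET_ALIGNMENT-aligned
 *		.tp_block_nr   = 64,
 *		.tp_frame_nr   = 128,	// (4096 / 2048) * 64
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *	void *ring = mmap(NULL, req.tp_block_size * req.tp_block_nr,
 *			  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 */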
2258
2259static int packet_mmap(struct file *file, struct socket *sock,
2260		struct vm_area_struct *vma)
2261{
2262	struct sock *sk = sock->sk;
2263	struct packet_sock *po = pkt_sk(sk);
2264	unsigned long size, expected_size;
2265	struct packet_ring_buffer *rb;
2266	unsigned long start;
2267	int err = -EINVAL;
2268	int i;
2269
2270	if (vma->vm_pgoff)
2271		return -EINVAL;
2272
2273	mutex_lock(&po->pg_vec_lock);
2274
2275	expected_size = 0;
2276	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2277		if (rb->pg_vec) {
2278			expected_size += rb->pg_vec_len
2279						* rb->pg_vec_pages
2280						* PAGE_SIZE;
2281		}
2282	}
2283
2284	if (expected_size == 0)
2285		goto out;
2286
2287	size = vma->vm_end - vma->vm_start;
2288	if (size != expected_size)
2289		goto out;
2290
2291	start = vma->vm_start;
2292	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2293		if (rb->pg_vec == NULL)
2294			continue;
2295
2296		for (i = 0; i < rb->pg_vec_len; i++) {
2297			struct page *page = virt_to_page(rb->pg_vec[i]);
2298			int pg_num;
2299
2300			for (pg_num = 0; pg_num < rb->pg_vec_pages;
2301					pg_num++, page++) {
2302				err = vm_insert_page(vma, start, page);
2303				if (unlikely(err))
2304					goto out;
2305				start += PAGE_SIZE;
2306			}
2307		}
2308	}
2309
2310	atomic_inc(&po->mapped);
2311	vma->vm_ops = &packet_mmap_ops;
2312	err = 0;
2313
2314out:
2315	mutex_unlock(&po->pg_vec_lock);
2316	return err;
2317}
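/*
 * The mapping must start at offset 0 and cover the RX ring followed by the
 * TX ring; its length must equal the combined size of both rings.  The loops
 * above iterate from &po->rx_ring to &po->tx_ring, which assumes the two
 * ring buffers are consecutive members of struct packet_sock.  Each page of
 * each block is installed individually with vm_insert_page().
 */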
2318#endif
2319
2320
2321static const struct proto_ops packet_ops_spkt = {
2322	.family =	PF_PACKET,
2323	.owner =	THIS_MODULE,
2324	.release =	packet_release,
2325	.bind =		packet_bind_spkt,
2326	.connect =	sock_no_connect,
2327	.socketpair =	sock_no_socketpair,
2328	.accept =	sock_no_accept,
2329	.getname =	packet_getname_spkt,
2330	.poll =		datagram_poll,
2331	.ioctl =	packet_ioctl,
2332	.listen =	sock_no_listen,
2333	.shutdown =	sock_no_shutdown,
2334	.setsockopt =	sock_no_setsockopt,
2335	.getsockopt =	sock_no_getsockopt,
2336	.sendmsg =	packet_sendmsg_spkt,
2337	.recvmsg =	packet_recvmsg,
2338	.mmap =		sock_no_mmap,
2339	.sendpage =	sock_no_sendpage,
2340};
2341
2342static const struct proto_ops packet_ops = {
2343	.family =	PF_PACKET,
2344	.owner =	THIS_MODULE,
2345	.release =	packet_release,
2346	.bind =		packet_bind,
2347	.connect =	sock_no_connect,
2348	.socketpair =	sock_no_socketpair,
2349	.accept =	sock_no_accept,
2350	.getname =	packet_getname,
2351	.poll =		packet_poll,
2352	.ioctl =	packet_ioctl,
2353	.listen =	sock_no_listen,
2354	.shutdown =	sock_no_shutdown,
2355	.setsockopt =	packet_setsockopt,
2356	.getsockopt =	packet_getsockopt,
2357	.sendmsg =	packet_sendmsg,
2358	.recvmsg =	packet_recvmsg,
2359	.mmap =		packet_mmap,
2360	.sendpage =	sock_no_sendpage,
2361};
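/*
 * Two ops tables: packet_ops_spkt serves the legacy SOCK_PACKET interface
 * and routes mmap/setsockopt to the sock_no_* stubs, while packet_ops
 * (SOCK_RAW/SOCK_DGRAM) exposes the full API, including the memory-mapped
 * rings via packet_mmap above.
 */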
2362
2363static struct net_proto_family packet_family_ops = {
2364	.family =	PF_PACKET,
2365	.create =	packet_create,
2366	.owner	=	THIS_MODULE,
2367};
2368
2369static struct notifier_block packet_netdev_notifier = {
2370	.notifier_call = packet_notifier,
2371};
2372
2373#ifdef CONFIG_PROC_FS
2374static inline struct sock *packet_seq_idx(struct net *net, loff_t off)
2375{
2376	struct sock *s;
2377	struct hlist_node *node;
2378
2379	sk_for_each(s, node, &net->packet.sklist) {
2380		if (!off--)
2381			return s;
2382	}
2383	return NULL;
2384}
2385
2386static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
2387	__acquires(seq_file_net(seq)->packet.sklist_lock)
2388{
2389	struct net *net = seq_file_net(seq);
2390	read_lock(&net->packet.sklist_lock);
2391	return *pos ? packet_seq_idx(net, *pos - 1) : SEQ_START_TOKEN;
2392}
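/*
 * The first ->start/->show pass emits the column header via SEQ_START_TOKEN;
 * every later position is therefore off by one, which is why
 * packet_seq_idx() is consulted with *pos - 1.
 */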
2393
2394static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2395{
2396	struct net *net = seq_file_net(seq);
2397	++*pos;
2398	return (v == SEQ_START_TOKEN)
2399		? sk_head(&net->packet.sklist)
2400		: sk_next((struct sock *)v);
2401}
2402
2403static void packet_seq_stop(struct seq_file *seq, void *v)
2404	__releases(seq_file_net(seq)->packet.sklist_lock)
2405{
2406	struct net *net = seq_file_net(seq);
2407	read_unlock(&net->packet.sklist_lock);
2408}
2409
2410static int packet_seq_show(struct seq_file *seq, void *v)
2411{
2412	if (v == SEQ_START_TOKEN)
2413		seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
2414	else {
2415		struct sock *s = v;
2416		const struct packet_sock *po = pkt_sk(s);
2417
2418		seq_printf(seq,
2419			   "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
2420			   s,
2421			   atomic_read(&s->sk_refcnt),
2422			   s->sk_type,
2423			   ntohs(po->num),
2424			   po->ifindex,
2425			   po->running,
2426			   atomic_read(&s->sk_rmem_alloc),
2427			   sock_i_uid(s),
2428			   sock_i_ino(s));
2429	}
2430
2431	return 0;
2432}
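/*
 * This produces one line per packet socket in /proc/net/packet, e.g.
 * (values illustrative):
 *
 *	sk       RefCnt Type Proto  Iface R Rmem   User   Inode
 *	f6a3c400 2      3    0003   2     1 0      0      7265
 *
 * Type 3 is SOCK_RAW, Proto 0003 is ETH_P_ALL, and R reports whether the
 * protocol hook is currently bound (po->running).
 */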
2433
2434static const struct seq_operations packet_seq_ops = {
2435	.start	= packet_seq_start,
2436	.next	= packet_seq_next,
2437	.stop	= packet_seq_stop,
2438	.show	= packet_seq_show,
2439};
2440
2441static int packet_seq_open(struct inode *inode, struct file *file)
2442{
2443	return seq_open_net(inode, file, &packet_seq_ops,
2444			    sizeof(struct seq_net_private));
2445}
2446
2447static const struct file_operations packet_seq_fops = {
2448	.owner		= THIS_MODULE,
2449	.open		= packet_seq_open,
2450	.read		= seq_read,
2451	.llseek		= seq_lseek,
2452	.release	= seq_release_net,
2453};
2454
2455#endif
2456
2457static int packet_net_init(struct net *net)
2458{
2459	rwlock_init(&net->packet.sklist_lock);
2460	INIT_HLIST_HEAD(&net->packet.sklist);
2461
2462	if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
2463		return -ENOMEM;
2464
2465	return 0;
2466}
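/*
 * packet_net_init()/packet_net_exit() run once per network namespace, so
 * each namespace gets its own socket list, list lock and /proc/net/packet
 * entry.
 */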
2467
2468static void packet_net_exit(struct net *net)
2469{
2470	proc_net_remove(net, "packet");
2471}
2472
2473static struct pernet_operations packet_net_ops = {
2474	.init = packet_net_init,
2475	.exit = packet_net_exit,
2476};
2477
2478
2479static void __exit packet_exit(void)
2480{
2481	unregister_netdevice_notifier(&packet_netdev_notifier);
2482	unregister_pernet_subsys(&packet_net_ops);
2483	sock_unregister(PF_PACKET);
2484	proto_unregister(&packet_proto);
2485}
2486
2487static int __init packet_init(void)
2488{
2489	int rc = proto_register(&packet_proto, 0);
2490
2491	if (rc != 0)
2492		goto out;
2493
2494	sock_register(&packet_family_ops);
2495	register_pernet_subsys(&packet_net_ops);
2496	register_netdevice_notifier(&packet_netdev_notifier);
2497out:
2498	return rc;
2499}
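/*
 * packet_exit() undoes these registrations in reverse order.  Only
 * proto_register()'s return value is checked here; the subsequent
 * registrations are assumed to succeed.
 */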
2500
2501module_init(packet_init);
2502module_exit(packet_exit);
2503MODULE_LICENSE("GPL");
2504MODULE_ALIAS_NETPROTO(PF_PACKET);
2505