af_packet.c revision 50f17787e9b0222ce65cc831407c3ba4790db3ff
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		PACKET - implements raw packet sockets.
7 *
8 * Version:	$Id: af_packet.c,v 1.61 2002/02/08 03:57:19 davem Exp $
9 *
10 * Authors:	Ross Biro
11 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
13 *
14 * Fixes:
15 *		Alan Cox	:	verify_area() now used correctly
16 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
17 *		Alan Cox	:	tidied skbuff lists.
18 *		Alan Cox	:	Now uses generic datagram routines I
19 *					added. Also fixed the peek/read crash
20 *					from all old Linux datagram code.
21 *		Alan Cox	:	Uses the improved datagram code.
22 *		Alan Cox	:	Added NULL's for socket options.
23 *		Alan Cox	:	Re-commented the code.
24 *		Alan Cox	:	Use new kernel side addressing
25 *		Rob Janssen	:	Correct MTU usage.
26 *		Dave Platt	:	Counter leaks caused by incorrect
27 *					interrupt locking and some slightly
28 *					dubious gcc output. Can you read
29 *					compiler: it said _VOLATILE_
30 *	Richard Kooijman	:	Timestamp fixes.
31 *		Alan Cox	:	New buffers. Use sk->mac.raw.
32 *		Alan Cox	:	sendmsg/recvmsg support.
33 *		Alan Cox	:	Protocol setting support
34 *	Alexey Kuznetsov	:	Untied from IPv4 stack.
35 *	Cyrus Durgin		:	Fixed kerneld for kmod.
36 *	Michal Ostrowski        :       Module initialization cleanup.
37 *         Ulises Alonso        :       Frame number limit removal and
38 *                                      packet_set_ring memory leak.
39 *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
40 *					The convention is that longer addresses
41 *					will simply extend the hardware address
42 *					byte arrays at the end of sockaddr_ll
43 *					and packet_mreq.
44 *
45 *		This program is free software; you can redistribute it and/or
46 *		modify it under the terms of the GNU General Public License
47 *		as published by the Free Software Foundation; either version
48 *		2 of the License, or (at your option) any later version.
49 *
50 */
51
52#include <linux/types.h>
53#include <linux/mm.h>
54#include <linux/capability.h>
55#include <linux/fcntl.h>
56#include <linux/socket.h>
57#include <linux/in.h>
58#include <linux/inet.h>
59#include <linux/netdevice.h>
60#include <linux/if_packet.h>
61#include <linux/wireless.h>
62#include <linux/kernel.h>
63#include <linux/kmod.h>
64#include <net/ip.h>
65#include <net/protocol.h>
66#include <linux/skbuff.h>
67#include <net/sock.h>
68#include <linux/errno.h>
69#include <linux/timer.h>
70#include <asm/system.h>
71#include <asm/uaccess.h>
72#include <asm/ioctls.h>
73#include <asm/page.h>
74#include <asm/cacheflush.h>
75#include <asm/io.h>
76#include <linux/proc_fs.h>
77#include <linux/seq_file.h>
78#include <linux/poll.h>
79#include <linux/module.h>
80#include <linux/init.h>
81
82#ifdef CONFIG_INET
83#include <net/inet_common.h>
84#endif
85
86/*
87   Assumptions:
88   - if a device has no dev->hard_header routine, it adds and removes the ll
89     header itself. In this case the ll header is invisible outside of the
90     device, but higher levels should still reserve dev->hard_header_len.
91     Some devices are clever enough to reallocate the skb when the header
92     will not fit into the reserved space (tunnel); others are silly
93     (PPP).
94   - packet sockets receive packets with the ll header already pulled off,
95     so SOCK_RAW should push it back on.
96
97On receive:
98-----------
99
100Incoming, dev->hard_header!=NULL
101   mac_header -> ll header
102   data       -> data
103
104Outgoing, dev->hard_header!=NULL
105   mac_header -> ll header
106   data       -> ll header
107
108Incoming, dev->hard_header==NULL
109   mac_header -> UNKNOWN position. It is very likely that it points to the ll
110		 header.  PPP does this, which is wrong, because it introduces
111		 asymmetry between the rx and tx paths.
112   data       -> data
113
114Outgoing, dev->hard_header==NULL
115   mac_header -> data. ll header is still not built!
116   data       -> data
117
118Summary
119  If dev->hard_header==NULL we are unlikely to restore a sensible ll header.
120
121
122On transmit:
123------------
124
125dev->hard_header != NULL
126   mac_header -> ll header
127   data       -> ll header
128
129dev->hard_header == NULL (ll header is added by device, we cannot control it)
130   mac_header -> data
131   data       -> data
132
133   We should set nh.raw on output to the correct position;
134   the packet classifier depends on it.
135 */
136
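/*
   Illustration only -- not compiled as part of this file.  A minimal
   user-space sketch, under the assumptions that CAP_NET_RAW is held and
   the buffer size is arbitrary, showing the contract described above:
   on SOCK_RAW the frame is delivered with the ll header pushed back in
   front of the data, while SOCK_DGRAM would deliver the payload only.

	#include <stdio.h>
	#include <sys/socket.h>
	#include <arpa/inet.h>
	#include <linux/if_packet.h>
	#include <linux/if_ether.h>

	int main(void)
	{
		int fd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
		unsigned char frame[2048];
		struct sockaddr_ll from;
		socklen_t fromlen = sizeof(from);
		ssize_t n;

		if (fd < 0)
			return 1;	// needs CAP_NET_RAW, cf. packet_create()
		n = recvfrom(fd, frame, sizeof(frame), 0,
			     (struct sockaddr *)&from, &fromlen);
		if (n >= 0)
			printf("%ld bytes, ifindex %d, hatype %d\n",
			       (long)n, from.sll_ifindex, from.sll_hatype);
		return 0;
	}
 */
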
137/* List of all packet sockets. */
138static HLIST_HEAD(packet_sklist);
139static DEFINE_RWLOCK(packet_sklist_lock);
140
141static atomic_t packet_socks_nr;
142
143
144/* Private packet socket structures. */
145
146struct packet_mclist
147{
148	struct packet_mclist	*next;
149	int			ifindex;
150	int			count;
151	unsigned short		type;
152	unsigned short		alen;
153	unsigned char		addr[MAX_ADDR_LEN];
154};
155/* identical to struct packet_mreq except it has
156 * a longer address field.
157 */
158struct packet_mreq_max
159{
160	int		mr_ifindex;
161	unsigned short	mr_type;
162	unsigned short	mr_alen;
163	unsigned char	mr_address[MAX_ADDR_LEN];
164};
165
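/* Illustration only: a hedged user-space sketch of the membership API
 * handled by packet_mc_add()/packet_mc_drop() below.  Applications pass
 * the ordinary struct packet_mreq; the kernel copies it into the larger
 * packet_mreq_max so that hardware addresses up to MAX_ADDR_LEN fit.
 * "fd" is assumed to be an open PF_PACKET socket and "ifindex" e.g. the
 * result of if_nametoindex(); PACKET_MR_MULTICAST would additionally
 * need mr_alen/mr_address filled in.
 *
 *	struct packet_mreq mreq;
 *
 *	memset(&mreq, 0, sizeof(mreq));
 *	mreq.mr_ifindex = ifindex;
 *	mreq.mr_type    = PACKET_MR_PROMISC;
 *	if (setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
 *		       &mreq, sizeof(mreq)) < 0)
 *		perror("PACKET_ADD_MEMBERSHIP");
 */
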
166#ifdef CONFIG_PACKET_MMAP
167static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing);
168#endif
169
170static void packet_flush_mclist(struct sock *sk);
171
172struct packet_sock {
173	/* struct sock has to be the first member of packet_sock */
174	struct sock		sk;
175	struct tpacket_stats	stats;
176#ifdef CONFIG_PACKET_MMAP
177	char *			*pg_vec;
178	unsigned int		head;
179	unsigned int            frames_per_block;
180	unsigned int		frame_size;
181	unsigned int		frame_max;
182	int			copy_thresh;
183#endif
184	struct packet_type	prot_hook;
185	spinlock_t		bind_lock;
186	unsigned int		running:1,	/* prot_hook is attached*/
187				auxdata:1,
188				origdev:1;
189	int			ifindex;	/* bound device		*/
190	__be16			num;
191	struct packet_mclist	*mclist;
192#ifdef CONFIG_PACKET_MMAP
193	atomic_t		mapped;
194	unsigned int            pg_vec_order;
195	unsigned int		pg_vec_pages;
196	unsigned int		pg_vec_len;
197#endif
198};
199
200struct packet_skb_cb {
201	unsigned int origlen;
202	union {
203		struct sockaddr_pkt pkt;
204		struct sockaddr_ll ll;
205	} sa;
206};
207
208#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))
209
210#ifdef CONFIG_PACKET_MMAP
211
212static inline struct tpacket_hdr *packet_lookup_frame(struct packet_sock *po, unsigned int position)
213{
214	unsigned int pg_vec_pos, frame_offset;
215
216	pg_vec_pos = position / po->frames_per_block;
217	frame_offset = position % po->frames_per_block;
218
219	return (struct tpacket_hdr *)(po->pg_vec[pg_vec_pos] + (frame_offset * po->frame_size));
220}
221#endif
222
223static inline struct packet_sock *pkt_sk(struct sock *sk)
224{
225	return (struct packet_sock *)sk;
226}
227
228static void packet_sock_destruct(struct sock *sk)
229{
230	BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
231	BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
232
233	if (!sock_flag(sk, SOCK_DEAD)) {
234		printk("Attempt to release alive packet socket: %p\n", sk);
235		return;
236	}
237
238	atomic_dec(&packet_socks_nr);
239#ifdef PACKET_REFCNT_DEBUG
240	printk(KERN_DEBUG "PACKET socket %p is free, %d are alive\n", sk, atomic_read(&packet_socks_nr));
241#endif
242}
243
244
245static const struct proto_ops packet_ops;
246
247static const struct proto_ops packet_ops_spkt;
248
249static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,  struct packet_type *pt, struct net_device *orig_dev)
250{
251	struct sock *sk;
252	struct sockaddr_pkt *spkt;
253
254	/*
255	 *	When we registered the protocol we saved the socket in the data
256	 *	field for just this event.
257	 */
258
259	sk = pt->af_packet_priv;
260
261	/*
262	 *	Yank back the headers [hope the device set this
263	 *	right or kerboom...]
264	 *
265	 *	Incoming packets have ll header pulled,
266	 *	push it back.
267	 *
268	 *	For outgoing ones skb->data == skb_mac_header(skb)
269	 *	so that this procedure is a no-op.
270	 */
271
272	if (skb->pkt_type == PACKET_LOOPBACK)
273		goto out;
274
275	if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
276		goto oom;
277
278	/* drop any routing info */
279	dst_release(skb->dst);
280	skb->dst = NULL;
281
282	/* drop conntrack reference */
283	nf_reset(skb);
284
285	spkt = &PACKET_SKB_CB(skb)->sa.pkt;
286
287	skb_push(skb, skb->data - skb_mac_header(skb));
288
289	/*
290	 *	The SOCK_PACKET socket receives _all_ frames.
291	 */
292
293	spkt->spkt_family = dev->type;
294	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
295	spkt->spkt_protocol = skb->protocol;
296
297	/*
298	 *	Charge the memory to the socket. This is done specifically
299	 *	to prevent sockets from using up all the memory.
300	 */
301
302	if (sock_queue_rcv_skb(sk,skb) == 0)
303		return 0;
304
305out:
306	kfree_skb(skb);
307oom:
308	return 0;
309}
310
311
312/*
313 *	Output a raw packet to a device layer. This bypasses all the other
314 *	protocol layers and you must therefore supply it with a complete frame
315 */
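
/* Illustration only: a hedged user-space sketch of this legacy SOCK_PACKET
 * transmit path.  It assumes "buf"/"len" already hold a complete
 * link-layer frame (destination MAC, source MAC, ethertype, payload) and
 * that an interface named "eth0" exists; nothing is built for the caller
 * on this path.
 *
 *	struct sockaddr_pkt spkt;
 *	int fd = socket(PF_PACKET, SOCK_PACKET, htons(ETH_P_ALL));
 *
 *	memset(&spkt, 0, sizeof(spkt));
 *	strncpy((char *)spkt.spkt_device, "eth0", sizeof(spkt.spkt_device));
 *	spkt.spkt_protocol = htons(ETH_P_IP);
 *	if (sendto(fd, buf, len, 0,
 *		   (struct sockaddr *)&spkt, sizeof(spkt)) < 0)
 *		perror("sendto");
 */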
316
317static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
318			       struct msghdr *msg, size_t len)
319{
320	struct sock *sk = sock->sk;
321	struct sockaddr_pkt *saddr=(struct sockaddr_pkt *)msg->msg_name;
322	struct sk_buff *skb;
323	struct net_device *dev;
324	__be16 proto=0;
325	int err;
326
327	/*
328	 *	Get and verify the address.
329	 */
330
331	if (saddr)
332	{
333		if (msg->msg_namelen < sizeof(struct sockaddr))
334			return(-EINVAL);
335		if (msg->msg_namelen==sizeof(struct sockaddr_pkt))
336			proto=saddr->spkt_protocol;
337	}
338	else
339		return(-ENOTCONN);	/* SOCK_PACKET must be sent giving an address */
340
341	/*
342	 *	Find the device first to size check it
343	 */
344
345	saddr->spkt_device[13] = 0;
346	dev = dev_get_by_name(saddr->spkt_device);
347	err = -ENODEV;
348	if (dev == NULL)
349		goto out_unlock;
350
351	err = -ENETDOWN;
352	if (!(dev->flags & IFF_UP))
353		goto out_unlock;
354
355	/*
356	 *	You may not queue a frame bigger than the mtu. This is the lowest level
357	 *	raw protocol and you must do your own fragmentation at this level.
358	 */
359
360	err = -EMSGSIZE;
361	if (len > dev->mtu + dev->hard_header_len)
362		goto out_unlock;
363
364	err = -ENOBUFS;
365	skb = sock_wmalloc(sk, len + LL_RESERVED_SPACE(dev), 0, GFP_KERNEL);
366
367	/*
368	 *	If the write buffer is full, then tough. At this level the user gets to
369	 *	deal with the problem - do your own algorithmic backoffs. That's far
370	 *	more flexible.
371	 */
372
373	if (skb == NULL)
374		goto out_unlock;
375
376	/*
377	 *	Fill it in
378	 */
379
380	/* FIXME: Save some space for broken drivers that write a
381	 * hard header at transmission time by themselves. PPP is the
382	 * notable one here. This should really be fixed at the driver level.
383	 */
384	skb_reserve(skb, LL_RESERVED_SPACE(dev));
385	skb_reset_network_header(skb);
386
387	/* Try to align data part correctly */
388	if (dev->hard_header) {
389		skb->data -= dev->hard_header_len;
390		skb->tail -= dev->hard_header_len;
391		if (len < dev->hard_header_len)
392			skb_reset_network_header(skb);
393	}
394
395	/* Returns -EFAULT on error */
396	err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
397	skb->protocol = proto;
398	skb->dev = dev;
399	skb->priority = sk->sk_priority;
400	if (err)
401		goto out_free;
402
403	/*
404	 *	Now send it
405	 */
406
407	dev_queue_xmit(skb);
408	dev_put(dev);
409	return(len);
410
411out_free:
412	kfree_skb(skb);
413out_unlock:
414	if (dev)
415		dev_put(dev);
416	return err;
417}
418
419static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
420				      unsigned int res)
421{
422	struct sk_filter *filter;
423
424	rcu_read_lock_bh();
425	filter = rcu_dereference(sk->sk_filter);
426	if (filter != NULL)
427		res = sk_run_filter(skb, filter->insns, filter->len);
428	rcu_read_unlock_bh();
429
430	return res;
431}
432
433/*
434   This function performs lazy skb cloning in the hope that most packets
435   are discarded by BPF.
436
437   Note the tricky part: we DO mangle the shared skb! skb->data, skb->len
438   and skb->cb are mangled. It works because (and until) packets
439   falling here are owned by the current CPU. Output packets are cloned
440   by dev_queue_xmit_nit(), input packets are processed by net_bh
441   sequentially, so that if we return the skb to its original state on exit,
442   we will not harm anyone.
443 */
444
445static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
446{
447	struct sock *sk;
448	struct sockaddr_ll *sll;
449	struct packet_sock *po;
450	u8 * skb_head = skb->data;
451	int skb_len = skb->len;
452	unsigned int snaplen, res;
453
454	if (skb->pkt_type == PACKET_LOOPBACK)
455		goto drop;
456
457	sk = pt->af_packet_priv;
458	po = pkt_sk(sk);
459
460	skb->dev = dev;
461
462	if (dev->hard_header) {
463		/* The device has an explicit notion of ll header,
464		   exported to higher levels.
465
466		   Otherwise, the device hides the details of its frame
467		   structure, so the corresponding packet header is
468		   never delivered to the user.
469		 */
470		if (sk->sk_type != SOCK_DGRAM)
471			skb_push(skb, skb->data - skb_mac_header(skb));
472		else if (skb->pkt_type == PACKET_OUTGOING) {
473			/* Special case: outgoing packets have ll header at head */
474			skb_pull(skb, skb_network_offset(skb));
475		}
476	}
477
478	snaplen = skb->len;
479
480	res = run_filter(skb, sk, snaplen);
481	if (!res)
482		goto drop_n_restore;
483	if (snaplen > res)
484		snaplen = res;
485
486	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
487	    (unsigned)sk->sk_rcvbuf)
488		goto drop_n_acct;
489
490	if (skb_shared(skb)) {
491		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
492		if (nskb == NULL)
493			goto drop_n_acct;
494
495		if (skb_head != skb->data) {
496			skb->data = skb_head;
497			skb->len = skb_len;
498		}
499		kfree_skb(skb);
500		skb = nskb;
501	}
502
503	BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
504		     sizeof(skb->cb));
505
506	sll = &PACKET_SKB_CB(skb)->sa.ll;
507	sll->sll_family = AF_PACKET;
508	sll->sll_hatype = dev->type;
509	sll->sll_protocol = skb->protocol;
510	sll->sll_pkttype = skb->pkt_type;
511	if (unlikely(po->origdev) && skb->pkt_type == PACKET_HOST)
512		sll->sll_ifindex = orig_dev->ifindex;
513	else
514		sll->sll_ifindex = dev->ifindex;
515	sll->sll_halen = 0;
516
517	if (dev->hard_header_parse)
518		sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr);
519
520	PACKET_SKB_CB(skb)->origlen = skb->len;
521
522	if (pskb_trim(skb, snaplen))
523		goto drop_n_acct;
524
525	skb_set_owner_r(skb, sk);
526	skb->dev = NULL;
527	dst_release(skb->dst);
528	skb->dst = NULL;
529
530	/* drop conntrack reference */
531	nf_reset(skb);
532
533	spin_lock(&sk->sk_receive_queue.lock);
534	po->stats.tp_packets++;
535	__skb_queue_tail(&sk->sk_receive_queue, skb);
536	spin_unlock(&sk->sk_receive_queue.lock);
537	sk->sk_data_ready(sk, skb->len);
538	return 0;
539
540drop_n_acct:
541	spin_lock(&sk->sk_receive_queue.lock);
542	po->stats.tp_drops++;
543	spin_unlock(&sk->sk_receive_queue.lock);
544
545drop_n_restore:
546	if (skb_head != skb->data && skb_shared(skb)) {
547		skb->data = skb_head;
548		skb->len = skb_len;
549	}
550drop:
551	kfree_skb(skb);
552	return 0;
553}
554
555#ifdef CONFIG_PACKET_MMAP
556static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
557{
558	struct sock *sk;
559	struct packet_sock *po;
560	struct sockaddr_ll *sll;
561	struct tpacket_hdr *h;
562	u8 * skb_head = skb->data;
563	int skb_len = skb->len;
564	unsigned int snaplen, res;
565	unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
566	unsigned short macoff, netoff;
567	struct sk_buff *copy_skb = NULL;
568	struct timeval tv;
569
570	if (skb->pkt_type == PACKET_LOOPBACK)
571		goto drop;
572
573	sk = pt->af_packet_priv;
574	po = pkt_sk(sk);
575
576	if (dev->hard_header) {
577		if (sk->sk_type != SOCK_DGRAM)
578			skb_push(skb, skb->data - skb_mac_header(skb));
579		else if (skb->pkt_type == PACKET_OUTGOING) {
580			/* Special case: outgoing packets have ll header at head */
581			skb_pull(skb, skb_network_offset(skb));
582		}
583	}
584
585	if (skb->ip_summed == CHECKSUM_PARTIAL)
586		status |= TP_STATUS_CSUMNOTREADY;
587
588	snaplen = skb->len;
589
590	res = run_filter(skb, sk, snaplen);
591	if (!res)
592		goto drop_n_restore;
593	if (snaplen > res)
594		snaplen = res;
595
596	if (sk->sk_type == SOCK_DGRAM) {
597		macoff = netoff = TPACKET_ALIGN(TPACKET_HDRLEN) + 16;
598	} else {
599		unsigned maclen = skb_network_offset(skb);
600		netoff = TPACKET_ALIGN(TPACKET_HDRLEN + (maclen < 16 ? 16 : maclen));
601		macoff = netoff - maclen;
602	}
603
604	if (macoff + snaplen > po->frame_size) {
605		if (po->copy_thresh &&
606		    atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
607		    (unsigned)sk->sk_rcvbuf) {
608			if (skb_shared(skb)) {
609				copy_skb = skb_clone(skb, GFP_ATOMIC);
610			} else {
611				copy_skb = skb_get(skb);
612				skb_head = skb->data;
613			}
614			if (copy_skb)
615				skb_set_owner_r(copy_skb, sk);
616		}
617		snaplen = po->frame_size - macoff;
618		if ((int)snaplen < 0)
619			snaplen = 0;
620	}
621
622	spin_lock(&sk->sk_receive_queue.lock);
623	h = packet_lookup_frame(po, po->head);
624
625	if (h->tp_status)
626		goto ring_is_full;
627	po->head = po->head != po->frame_max ? po->head+1 : 0;
628	po->stats.tp_packets++;
629	if (copy_skb) {
630		status |= TP_STATUS_COPY;
631		__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
632	}
633	if (!po->stats.tp_drops)
634		status &= ~TP_STATUS_LOSING;
635	spin_unlock(&sk->sk_receive_queue.lock);
636
637	skb_copy_bits(skb, 0, (u8*)h + macoff, snaplen);
638
639	h->tp_len = skb->len;
640	h->tp_snaplen = snaplen;
641	h->tp_mac = macoff;
642	h->tp_net = netoff;
643	if (skb->tstamp.tv64)
644		tv = ktime_to_timeval(skb->tstamp);
645	else
646		do_gettimeofday(&tv);
647	h->tp_sec = tv.tv_sec;
648	h->tp_usec = tv.tv_usec;
649
650	sll = (struct sockaddr_ll*)((u8*)h + TPACKET_ALIGN(sizeof(*h)));
651	sll->sll_halen = 0;
652	if (dev->hard_header_parse)
653		sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr);
654	sll->sll_family = AF_PACKET;
655	sll->sll_hatype = dev->type;
656	sll->sll_protocol = skb->protocol;
657	sll->sll_pkttype = skb->pkt_type;
658	if (unlikely(po->origdev) && skb->pkt_type == PACKET_HOST)
659		sll->sll_ifindex = orig_dev->ifindex;
660	else
661		sll->sll_ifindex = dev->ifindex;
662
663	h->tp_status = status;
664	smp_mb();
665
666	{
667		struct page *p_start, *p_end;
668		u8 *h_end = (u8 *)h + macoff + snaplen - 1;
669
670		p_start = virt_to_page(h);
671		p_end = virt_to_page(h_end);
672		while (p_start <= p_end) {
673			flush_dcache_page(p_start);
674			p_start++;
675		}
676	}
677
678	sk->sk_data_ready(sk, 0);
679
680drop_n_restore:
681	if (skb_head != skb->data && skb_shared(skb)) {
682		skb->data = skb_head;
683		skb->len = skb_len;
684	}
685drop:
686	kfree_skb(skb);
687	return 0;
688
689ring_is_full:
690	po->stats.tp_drops++;
691	spin_unlock(&sk->sk_receive_queue.lock);
692
693	sk->sk_data_ready(sk, 0);
694	if (copy_skb)
695		kfree_skb(copy_skb);
696	goto drop_n_restore;
697}
698
699#endif
700
701
702static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
703			  struct msghdr *msg, size_t len)
704{
705	struct sock *sk = sock->sk;
706	struct sockaddr_ll *saddr=(struct sockaddr_ll *)msg->msg_name;
707	struct sk_buff *skb;
708	struct net_device *dev;
709	__be16 proto;
710	unsigned char *addr;
711	int ifindex, err, reserve = 0;
712
713	/*
714	 *	Get and verify the address.
715	 */
716
717	if (saddr == NULL) {
718		struct packet_sock *po = pkt_sk(sk);
719
720		ifindex	= po->ifindex;
721		proto	= po->num;
722		addr	= NULL;
723	} else {
724		err = -EINVAL;
725		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
726			goto out;
727		if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
728			goto out;
729		ifindex	= saddr->sll_ifindex;
730		proto	= saddr->sll_protocol;
731		addr	= saddr->sll_addr;
732	}
733
734
735	dev = dev_get_by_index(ifindex);
736	err = -ENXIO;
737	if (dev == NULL)
738		goto out_unlock;
739	if (sock->type == SOCK_RAW)
740		reserve = dev->hard_header_len;
741
742	err = -ENETDOWN;
743	if (!(dev->flags & IFF_UP))
744		goto out_unlock;
745
746	err = -EMSGSIZE;
747	if (len > dev->mtu+reserve)
748		goto out_unlock;
749
750	skb = sock_alloc_send_skb(sk, len + LL_RESERVED_SPACE(dev),
751				msg->msg_flags & MSG_DONTWAIT, &err);
752	if (skb==NULL)
753		goto out_unlock;
754
755	skb_reserve(skb, LL_RESERVED_SPACE(dev));
756	skb_reset_network_header(skb);
757
758	if (dev->hard_header) {
759		int res;
760		err = -EINVAL;
761		res = dev->hard_header(skb, dev, ntohs(proto), addr, NULL, len);
762		if (sock->type != SOCK_DGRAM) {
763			skb_reset_tail_pointer(skb);
764			skb->len = 0;
765		} else if (res < 0)
766			goto out_free;
767	}
768
769	/* Returns -EFAULT on error */
770	err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
771	if (err)
772		goto out_free;
773
774	skb->protocol = proto;
775	skb->dev = dev;
776	skb->priority = sk->sk_priority;
777
778	/*
779	 *	Now send it
780	 */
781
782	err = dev_queue_xmit(skb);
783	if (err > 0 && (err = net_xmit_errno(err)) != 0)
784		goto out_unlock;
785
786	dev_put(dev);
787
788	return(len);
789
790out_free:
791	kfree_skb(skb);
792out_unlock:
793	if (dev)
794		dev_put(dev);
795out:
796	return err;
797}
798
799/*
800 *	Close a PACKET socket. This is fairly simple. We immediately go
801 *	to 'closed' state and remove our protocol entry in the device list.
802 */
803
804static int packet_release(struct socket *sock)
805{
806	struct sock *sk = sock->sk;
807	struct packet_sock *po;
808
809	if (!sk)
810		return 0;
811
812	po = pkt_sk(sk);
813
814	write_lock_bh(&packet_sklist_lock);
815	sk_del_node_init(sk);
816	write_unlock_bh(&packet_sklist_lock);
817
818	/*
819	 *	Unhook packet receive handler.
820	 */
821
822	if (po->running) {
823		/*
824		 *	Remove the protocol hook
825		 */
826		dev_remove_pack(&po->prot_hook);
827		po->running = 0;
828		po->num = 0;
829		__sock_put(sk);
830	}
831
832	packet_flush_mclist(sk);
833
834#ifdef CONFIG_PACKET_MMAP
835	if (po->pg_vec) {
836		struct tpacket_req req;
837		memset(&req, 0, sizeof(req));
838		packet_set_ring(sk, &req, 1);
839	}
840#endif
841
842	/*
843	 *	Now the socket is dead. No more input will appear.
844	 */
845
846	sock_orphan(sk);
847	sock->sk = NULL;
848
849	/* Purge queues */
850
851	skb_queue_purge(&sk->sk_receive_queue);
852
853	sock_put(sk);
854	return 0;
855}
856
857/*
858 *	Attach a packet hook.
859 */
860
861static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
862{
863	struct packet_sock *po = pkt_sk(sk);
864	/*
865	 *	Detach an existing hook if present.
866	 */
867
868	lock_sock(sk);
869
870	spin_lock(&po->bind_lock);
871	if (po->running) {
872		__sock_put(sk);
873		po->running = 0;
874		po->num = 0;
875		spin_unlock(&po->bind_lock);
876		dev_remove_pack(&po->prot_hook);
877		spin_lock(&po->bind_lock);
878	}
879
880	po->num = protocol;
881	po->prot_hook.type = protocol;
882	po->prot_hook.dev = dev;
883
884	po->ifindex = dev ? dev->ifindex : 0;
885
886	if (protocol == 0)
887		goto out_unlock;
888
889	if (dev) {
890		if (dev->flags&IFF_UP) {
891			dev_add_pack(&po->prot_hook);
892			sock_hold(sk);
893			po->running = 1;
894		} else {
895			sk->sk_err = ENETDOWN;
896			if (!sock_flag(sk, SOCK_DEAD))
897				sk->sk_error_report(sk);
898		}
899	} else {
900		dev_add_pack(&po->prot_hook);
901		sock_hold(sk);
902		po->running = 1;
903	}
904
905out_unlock:
906	spin_unlock(&po->bind_lock);
907	release_sock(sk);
908	return 0;
909}
910
911/*
912 *	Bind a packet socket to a device
913 */
914
915static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int addr_len)
916{
917	struct sock *sk=sock->sk;
918	char name[15];
919	struct net_device *dev;
920	int err = -ENODEV;
921
922	/*
923	 *	Check legality
924	 */
925
926	if (addr_len != sizeof(struct sockaddr))
927		return -EINVAL;
928	strlcpy(name,uaddr->sa_data,sizeof(name));
929
930	dev = dev_get_by_name(name);
931	if (dev) {
932		err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
933		dev_put(dev);
934	}
935	return err;
936}
937
938static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
939{
940	struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
941	struct sock *sk=sock->sk;
942	struct net_device *dev = NULL;
943	int err;
944
945
946	/*
947	 *	Check legality
948	 */
949
950	if (addr_len < sizeof(struct sockaddr_ll))
951		return -EINVAL;
952	if (sll->sll_family != AF_PACKET)
953		return -EINVAL;
954
955	if (sll->sll_ifindex) {
956		err = -ENODEV;
957		dev = dev_get_by_index(sll->sll_ifindex);
958		if (dev == NULL)
959			goto out;
960	}
961	err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
962	if (dev)
963		dev_put(dev);
964
965out:
966	return err;
967}
968
969static struct proto packet_proto = {
970	.name	  = "PACKET",
971	.owner	  = THIS_MODULE,
972	.obj_size = sizeof(struct packet_sock),
973};
974
975/*
976 *	Create a packet socket (SOCK_DGRAM, SOCK_RAW or the legacy SOCK_PACKET).
977 */
978
979static int packet_create(struct socket *sock, int protocol)
980{
981	struct sock *sk;
982	struct packet_sock *po;
983	__be16 proto = (__force __be16)protocol; /* weird, but documented */
984	int err;
985
986	if (!capable(CAP_NET_RAW))
987		return -EPERM;
988	if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
989	    sock->type != SOCK_PACKET)
990		return -ESOCKTNOSUPPORT;
991
992	sock->state = SS_UNCONNECTED;
993
994	err = -ENOBUFS;
995	sk = sk_alloc(PF_PACKET, GFP_KERNEL, &packet_proto, 1);
996	if (sk == NULL)
997		goto out;
998
999	sock->ops = &packet_ops;
1000	if (sock->type == SOCK_PACKET)
1001		sock->ops = &packet_ops_spkt;
1002
1003	sock_init_data(sock, sk);
1004
1005	po = pkt_sk(sk);
1006	sk->sk_family = PF_PACKET;
1007	po->num = proto;
1008
1009	sk->sk_destruct = packet_sock_destruct;
1010	atomic_inc(&packet_socks_nr);
1011
1012	/*
1013	 *	Attach a protocol block
1014	 */
1015
1016	spin_lock_init(&po->bind_lock);
1017	po->prot_hook.func = packet_rcv;
1018
1019	if (sock->type == SOCK_PACKET)
1020		po->prot_hook.func = packet_rcv_spkt;
1021
1022	po->prot_hook.af_packet_priv = sk;
1023
1024	if (proto) {
1025		po->prot_hook.type = proto;
1026		dev_add_pack(&po->prot_hook);
1027		sock_hold(sk);
1028		po->running = 1;
1029	}
1030
1031	write_lock_bh(&packet_sklist_lock);
1032	sk_add_node(sk, &packet_sklist);
1033	write_unlock_bh(&packet_sklist_lock);
1034	return(0);
1035out:
1036	return err;
1037}
1038
1039/*
1040 *	Pull a packet from our receive queue and hand it to the user.
1041 *	If necessary we block.
1042 */
1043
1044static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
1045			  struct msghdr *msg, size_t len, int flags)
1046{
1047	struct sock *sk = sock->sk;
1048	struct sk_buff *skb;
1049	int copied, err;
1050	struct sockaddr_ll *sll;
1051
1052	err = -EINVAL;
1053	if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
1054		goto out;
1055
1056#if 0
1057	/* What error should we return now? EUNATTACH? */
1058	if (pkt_sk(sk)->ifindex < 0)
1059		return -ENODEV;
1060#endif
1061
1062	/*
1063	 *	Call the generic datagram receiver. This handles all sorts
1064	 *	of horrible races and re-entrancy so we can forget about it
1065	 *	in the protocol layers.
1066	 *
1067	 *	Now it will return ENETDOWN, if the device has just gone down,
1068	 *	but then it will block.
1069	 */
1070
1071	skb=skb_recv_datagram(sk,flags,flags&MSG_DONTWAIT,&err);
1072
1073	/*
1074	 *	An error occurred, so return it. Because skb_recv_datagram()
1075	 *	handles the blocking for us, we don't need to worry about
1076	 *	blocking retries.
1077	 */
1078
1079	if (skb == NULL)
1080		goto out;
1081
1082	/*
1083	 *	If the address length field is there to be filled in, we fill
1084	 *	it in now.
1085	 */
1086
1087	sll = &PACKET_SKB_CB(skb)->sa.ll;
1088	if (sock->type == SOCK_PACKET)
1089		msg->msg_namelen = sizeof(struct sockaddr_pkt);
1090	else
1091		msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
1092
1093	/*
1094	 *	You lose any data beyond the buffer you gave. If it worries a
1095	 *	user program they can ask the device for its MTU anyway.
1096	 */
1097
1098	copied = skb->len;
1099	if (copied > len)
1100	{
1101		copied=len;
1102		msg->msg_flags|=MSG_TRUNC;
1103	}
1104
1105	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1106	if (err)
1107		goto out_free;
1108
1109	sock_recv_timestamp(msg, sk, skb);
1110
1111	if (msg->msg_name)
1112		memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
1113		       msg->msg_namelen);
1114
1115	if (pkt_sk(sk)->auxdata) {
1116		struct tpacket_auxdata aux;
1117
1118		aux.tp_status = TP_STATUS_USER;
1119		if (skb->ip_summed == CHECKSUM_PARTIAL)
1120			aux.tp_status |= TP_STATUS_CSUMNOTREADY;
1121		aux.tp_len = PACKET_SKB_CB(skb)->origlen;
1122		aux.tp_snaplen = skb->len;
1123		aux.tp_mac = 0;
1124		aux.tp_net = skb_network_offset(skb);
1125
1126		put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
1127	}
1128
1129	/*
1130	 *	Free or return the buffer as appropriate. Again this
1131	 *	hides all the races and re-entrancy issues from us.
1132	 */
1133	err = (flags&MSG_TRUNC) ? skb->len : copied;
1134
1135out_free:
1136	skb_free_datagram(sk, skb);
1137out:
1138	return err;
1139}
1140
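/* Illustration only: a hedged user-space sketch of consuming the
 * PACKET_AUXDATA control message that packet_recvmsg() above fills in.
 * "fd" is assumed to be a PF_PACKET socket on which the option has
 * already been enabled via setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, ...).
 *
 *	unsigned char buf[2048];
 *	char cbuf[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
 *	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
 *	struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1,
 *			      .msg_control = cbuf,
 *			      .msg_controllen = sizeof(cbuf) };
 *	struct cmsghdr *cmsg;
 *
 *	if (recvmsg(fd, &msg, 0) >= 0) {
 *		for (cmsg = CMSG_FIRSTHDR(&msg); cmsg;
 *		     cmsg = CMSG_NXTHDR(&msg, cmsg)) {
 *			if (cmsg->cmsg_level == SOL_PACKET &&
 *			    cmsg->cmsg_type == PACKET_AUXDATA) {
 *				struct tpacket_auxdata *aux =
 *					(void *)CMSG_DATA(cmsg);
 *
 *				printf("wire len %u, captured %u\n",
 *				       aux->tp_len, aux->tp_snaplen);
 *			}
 *		}
 *	}
 */
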
1141static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1142			       int *uaddr_len, int peer)
1143{
1144	struct net_device *dev;
1145	struct sock *sk	= sock->sk;
1146
1147	if (peer)
1148		return -EOPNOTSUPP;
1149
1150	uaddr->sa_family = AF_PACKET;
1151	dev = dev_get_by_index(pkt_sk(sk)->ifindex);
1152	if (dev) {
1153		strlcpy(uaddr->sa_data, dev->name, 15);
1154		dev_put(dev);
1155	} else
1156		memset(uaddr->sa_data, 0, 14);
1157	*uaddr_len = sizeof(*uaddr);
1158
1159	return 0;
1160}
1161
1162static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1163			  int *uaddr_len, int peer)
1164{
1165	struct net_device *dev;
1166	struct sock *sk = sock->sk;
1167	struct packet_sock *po = pkt_sk(sk);
1168	struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
1169
1170	if (peer)
1171		return -EOPNOTSUPP;
1172
1173	sll->sll_family = AF_PACKET;
1174	sll->sll_ifindex = po->ifindex;
1175	sll->sll_protocol = po->num;
1176	dev = dev_get_by_index(po->ifindex);
1177	if (dev) {
1178		sll->sll_hatype = dev->type;
1179		sll->sll_halen = dev->addr_len;
1180		memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1181		dev_put(dev);
1182	} else {
1183		sll->sll_hatype = 0;	/* Bad: we have no ARPHRD_UNSPEC */
1184		sll->sll_halen = 0;
1185	}
1186	*uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1187
1188	return 0;
1189}
1190
1191static void packet_dev_mc(struct net_device *dev, struct packet_mclist *i, int what)
1192{
1193	switch (i->type) {
1194	case PACKET_MR_MULTICAST:
1195		if (what > 0)
1196			dev_mc_add(dev, i->addr, i->alen, 0);
1197		else
1198			dev_mc_delete(dev, i->addr, i->alen, 0);
1199		break;
1200	case PACKET_MR_PROMISC:
1201		dev_set_promiscuity(dev, what);
1202		break;
1203	case PACKET_MR_ALLMULTI:
1204		dev_set_allmulti(dev, what);
1205		break;
1206	default:;
1207	}
1208}
1209
1210static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1211{
1212	for ( ; i; i=i->next) {
1213		if (i->ifindex == dev->ifindex)
1214			packet_dev_mc(dev, i, what);
1215	}
1216}
1217
1218static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1219{
1220	struct packet_sock *po = pkt_sk(sk);
1221	struct packet_mclist *ml, *i;
1222	struct net_device *dev;
1223	int err;
1224
1225	rtnl_lock();
1226
1227	err = -ENODEV;
1228	dev = __dev_get_by_index(mreq->mr_ifindex);
1229	if (!dev)
1230		goto done;
1231
1232	err = -EINVAL;
1233	if (mreq->mr_alen > dev->addr_len)
1234		goto done;
1235
1236	err = -ENOBUFS;
1237	i = kmalloc(sizeof(*i), GFP_KERNEL);
1238	if (i == NULL)
1239		goto done;
1240
1241	err = 0;
1242	for (ml = po->mclist; ml; ml = ml->next) {
1243		if (ml->ifindex == mreq->mr_ifindex &&
1244		    ml->type == mreq->mr_type &&
1245		    ml->alen == mreq->mr_alen &&
1246		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1247			ml->count++;
1248			/* Free the new element ... */
1249			kfree(i);
1250			goto done;
1251		}
1252	}
1253
1254	i->type = mreq->mr_type;
1255	i->ifindex = mreq->mr_ifindex;
1256	i->alen = mreq->mr_alen;
1257	memcpy(i->addr, mreq->mr_address, i->alen);
1258	i->count = 1;
1259	i->next = po->mclist;
1260	po->mclist = i;
1261	packet_dev_mc(dev, i, +1);
1262
1263done:
1264	rtnl_unlock();
1265	return err;
1266}
1267
1268static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1269{
1270	struct packet_mclist *ml, **mlp;
1271
1272	rtnl_lock();
1273
1274	for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
1275		if (ml->ifindex == mreq->mr_ifindex &&
1276		    ml->type == mreq->mr_type &&
1277		    ml->alen == mreq->mr_alen &&
1278		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1279			if (--ml->count == 0) {
1280				struct net_device *dev;
1281				*mlp = ml->next;
1282				dev = dev_get_by_index(ml->ifindex);
1283				if (dev) {
1284					packet_dev_mc(dev, ml, -1);
1285					dev_put(dev);
1286				}
1287				kfree(ml);
1288			}
1289			rtnl_unlock();
1290			return 0;
1291		}
1292	}
1293	rtnl_unlock();
1294	return -EADDRNOTAVAIL;
1295}
1296
1297static void packet_flush_mclist(struct sock *sk)
1298{
1299	struct packet_sock *po = pkt_sk(sk);
1300	struct packet_mclist *ml;
1301
1302	if (!po->mclist)
1303		return;
1304
1305	rtnl_lock();
1306	while ((ml = po->mclist) != NULL) {
1307		struct net_device *dev;
1308
1309		po->mclist = ml->next;
1310		if ((dev = dev_get_by_index(ml->ifindex)) != NULL) {
1311			packet_dev_mc(dev, ml, -1);
1312			dev_put(dev);
1313		}
1314		kfree(ml);
1315	}
1316	rtnl_unlock();
1317}
1318
1319static int
1320packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen)
1321{
1322	struct sock *sk = sock->sk;
1323	struct packet_sock *po = pkt_sk(sk);
1324	int ret;
1325
1326	if (level != SOL_PACKET)
1327		return -ENOPROTOOPT;
1328
1329	switch(optname)	{
1330	case PACKET_ADD_MEMBERSHIP:
1331	case PACKET_DROP_MEMBERSHIP:
1332	{
1333		struct packet_mreq_max mreq;
1334		int len = optlen;
1335		memset(&mreq, 0, sizeof(mreq));
1336		if (len < sizeof(struct packet_mreq))
1337			return -EINVAL;
1338		if (len > sizeof(mreq))
1339			len = sizeof(mreq);
1340		if (copy_from_user(&mreq,optval,len))
1341			return -EFAULT;
1342		if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
1343			return -EINVAL;
1344		if (optname == PACKET_ADD_MEMBERSHIP)
1345			ret = packet_mc_add(sk, &mreq);
1346		else
1347			ret = packet_mc_drop(sk, &mreq);
1348		return ret;
1349	}
1350
1351#ifdef CONFIG_PACKET_MMAP
1352	case PACKET_RX_RING:
1353	{
1354		struct tpacket_req req;
1355
1356		if (optlen<sizeof(req))
1357			return -EINVAL;
1358		if (copy_from_user(&req,optval,sizeof(req)))
1359			return -EFAULT;
1360		return packet_set_ring(sk, &req, 0);
1361	}
1362	case PACKET_COPY_THRESH:
1363	{
1364		int val;
1365
1366		if (optlen!=sizeof(val))
1367			return -EINVAL;
1368		if (copy_from_user(&val,optval,sizeof(val)))
1369			return -EFAULT;
1370
1371		pkt_sk(sk)->copy_thresh = val;
1372		return 0;
1373	}
1374#endif
1375	case PACKET_AUXDATA:
1376	{
1377		int val;
1378
1379		if (optlen < sizeof(val))
1380			return -EINVAL;
1381		if (copy_from_user(&val, optval, sizeof(val)))
1382			return -EFAULT;
1383
1384		po->auxdata = !!val;
1385		return 0;
1386	}
1387	case PACKET_ORIGDEV:
1388	{
1389		int val;
1390
1391		if (optlen < sizeof(val))
1392			return -EINVAL;
1393		if (copy_from_user(&val, optval, sizeof(val)))
1394			return -EFAULT;
1395
1396		po->origdev = !!val;
1397		return 0;
1398	}
1399	default:
1400		return -ENOPROTOOPT;
1401	}
1402}
1403
1404static int packet_getsockopt(struct socket *sock, int level, int optname,
1405			     char __user *optval, int __user *optlen)
1406{
1407	int len;
1408	int val;
1409	struct sock *sk = sock->sk;
1410	struct packet_sock *po = pkt_sk(sk);
1411	void *data;
1412	struct tpacket_stats st;
1413
1414	if (level != SOL_PACKET)
1415		return -ENOPROTOOPT;
1416
1417	if (get_user(len, optlen))
1418		return -EFAULT;
1419
1420	if (len < 0)
1421		return -EINVAL;
1422
1423	switch(optname)	{
1424	case PACKET_STATISTICS:
1425		if (len > sizeof(struct tpacket_stats))
1426			len = sizeof(struct tpacket_stats);
1427		spin_lock_bh(&sk->sk_receive_queue.lock);
1428		st = po->stats;
1429		memset(&po->stats, 0, sizeof(st));
1430		spin_unlock_bh(&sk->sk_receive_queue.lock);
1431		st.tp_packets += st.tp_drops;
1432
1433		data = &st;
1434		break;
1435	case PACKET_AUXDATA:
1436		if (len > sizeof(int))
1437			len = sizeof(int);
1438		val = po->auxdata;
1439
1440		data = &val;
1441		break;
1442	case PACKET_ORIGDEV:
1443		if (len > sizeof(int))
1444			len = sizeof(int);
1445		val = po->origdev;
1446
1447		data = &val;
1448		break;
1449	default:
1450		return -ENOPROTOOPT;
1451	}
1452
1453	if (put_user(len, optlen))
1454		return -EFAULT;
1455	if (copy_to_user(optval, data, len))
1456		return -EFAULT;
1457	return 0;
1458}
1459
1460
1461static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
1462{
1463	struct sock *sk;
1464	struct hlist_node *node;
1465	struct net_device *dev = data;
1466
1467	read_lock(&packet_sklist_lock);
1468	sk_for_each(sk, node, &packet_sklist) {
1469		struct packet_sock *po = pkt_sk(sk);
1470
1471		switch (msg) {
1472		case NETDEV_UNREGISTER:
1473			if (po->mclist)
1474				packet_dev_mclist(dev, po->mclist, -1);
1475			/* fallthrough */
1476
1477		case NETDEV_DOWN:
1478			if (dev->ifindex == po->ifindex) {
1479				spin_lock(&po->bind_lock);
1480				if (po->running) {
1481					__dev_remove_pack(&po->prot_hook);
1482					__sock_put(sk);
1483					po->running = 0;
1484					sk->sk_err = ENETDOWN;
1485					if (!sock_flag(sk, SOCK_DEAD))
1486						sk->sk_error_report(sk);
1487				}
1488				if (msg == NETDEV_UNREGISTER) {
1489					po->ifindex = -1;
1490					po->prot_hook.dev = NULL;
1491				}
1492				spin_unlock(&po->bind_lock);
1493			}
1494			break;
1495		case NETDEV_UP:
1496			spin_lock(&po->bind_lock);
1497			if (dev->ifindex == po->ifindex && po->num &&
1498			    !po->running) {
1499				dev_add_pack(&po->prot_hook);
1500				sock_hold(sk);
1501				po->running = 1;
1502			}
1503			spin_unlock(&po->bind_lock);
1504			break;
1505		}
1506	}
1507	read_unlock(&packet_sklist_lock);
1508	return NOTIFY_DONE;
1509}
1510
1511
1512static int packet_ioctl(struct socket *sock, unsigned int cmd,
1513			unsigned long arg)
1514{
1515	struct sock *sk = sock->sk;
1516
1517	switch(cmd) {
1518		case SIOCOUTQ:
1519		{
1520			int amount = atomic_read(&sk->sk_wmem_alloc);
1521			return put_user(amount, (int __user *)arg);
1522		}
1523		case SIOCINQ:
1524		{
1525			struct sk_buff *skb;
1526			int amount = 0;
1527
1528			spin_lock_bh(&sk->sk_receive_queue.lock);
1529			skb = skb_peek(&sk->sk_receive_queue);
1530			if (skb)
1531				amount = skb->len;
1532			spin_unlock_bh(&sk->sk_receive_queue.lock);
1533			return put_user(amount, (int __user *)arg);
1534		}
1535		case SIOCGSTAMP:
1536			return sock_get_timestamp(sk, (struct timeval __user *)arg);
1537		case SIOCGSTAMPNS:
1538			return sock_get_timestampns(sk, (struct timespec __user *)arg);
1539
1540#ifdef CONFIG_INET
1541		case SIOCADDRT:
1542		case SIOCDELRT:
1543		case SIOCDARP:
1544		case SIOCGARP:
1545		case SIOCSARP:
1546		case SIOCGIFADDR:
1547		case SIOCSIFADDR:
1548		case SIOCGIFBRDADDR:
1549		case SIOCSIFBRDADDR:
1550		case SIOCGIFNETMASK:
1551		case SIOCSIFNETMASK:
1552		case SIOCGIFDSTADDR:
1553		case SIOCSIFDSTADDR:
1554		case SIOCSIFFLAGS:
1555			return inet_dgram_ops.ioctl(sock, cmd, arg);
1556#endif
1557
1558		default:
1559			return -ENOIOCTLCMD;
1560	}
1561	return 0;
1562}
1563
1564#ifndef CONFIG_PACKET_MMAP
1565#define packet_mmap sock_no_mmap
1566#define packet_poll datagram_poll
1567#else
1568
1569static unsigned int packet_poll(struct file * file, struct socket *sock,
1570				poll_table *wait)
1571{
1572	struct sock *sk = sock->sk;
1573	struct packet_sock *po = pkt_sk(sk);
1574	unsigned int mask = datagram_poll(file, sock, wait);
1575
1576	spin_lock_bh(&sk->sk_receive_queue.lock);
1577	if (po->pg_vec) {
1578		unsigned last = po->head ? po->head-1 : po->frame_max;
1579		struct tpacket_hdr *h;
1580
1581		h = packet_lookup_frame(po, last);
1582
1583		if (h->tp_status)
1584			mask |= POLLIN | POLLRDNORM;
1585	}
1586	spin_unlock_bh(&sk->sk_receive_queue.lock);
1587	return mask;
1588}
1589
1590
1591/* Dirty? Well, I still have not learned a better way to account
1592 * for user mmaps.
1593 */
1594
1595static void packet_mm_open(struct vm_area_struct *vma)
1596{
1597	struct file *file = vma->vm_file;
1598	struct socket * sock = file->private_data;
1599	struct sock *sk = sock->sk;
1600
1601	if (sk)
1602		atomic_inc(&pkt_sk(sk)->mapped);
1603}
1604
1605static void packet_mm_close(struct vm_area_struct *vma)
1606{
1607	struct file *file = vma->vm_file;
1608	struct socket * sock = file->private_data;
1609	struct sock *sk = sock->sk;
1610
1611	if (sk)
1612		atomic_dec(&pkt_sk(sk)->mapped);
1613}
1614
1615static struct vm_operations_struct packet_mmap_ops = {
1616	.open =	packet_mm_open,
1617	.close =packet_mm_close,
1618};
1619
1620static inline struct page *pg_vec_endpage(char *one_pg_vec, unsigned int order)
1621{
1622	return virt_to_page(one_pg_vec + (PAGE_SIZE << order) - 1);
1623}
1624
1625static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
1626{
1627	int i;
1628
1629	for (i = 0; i < len; i++) {
1630		if (likely(pg_vec[i]))
1631			free_pages((unsigned long) pg_vec[i], order);
1632	}
1633	kfree(pg_vec);
1634}
1635
1636static inline char *alloc_one_pg_vec_page(unsigned long order)
1637{
1638	return (char *) __get_free_pages(GFP_KERNEL | __GFP_COMP | __GFP_ZERO,
1639					 order);
1640}
1641
1642static char **alloc_pg_vec(struct tpacket_req *req, int order)
1643{
1644	unsigned int block_nr = req->tp_block_nr;
1645	char **pg_vec;
1646	int i;
1647
1648	pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
1649	if (unlikely(!pg_vec))
1650		goto out;
1651
1652	for (i = 0; i < block_nr; i++) {
1653		pg_vec[i] = alloc_one_pg_vec_page(order);
1654		if (unlikely(!pg_vec[i]))
1655			goto out_free_pgvec;
1656	}
1657
1658out:
1659	return pg_vec;
1660
1661out_free_pgvec:
1662	free_pg_vec(pg_vec, order, block_nr);
1663	pg_vec = NULL;
1664	goto out;
1665}
1666
1667static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing)
1668{
1669	char **pg_vec = NULL;
1670	struct packet_sock *po = pkt_sk(sk);
1671	int was_running, order = 0;
1672	__be16 num;
1673	int err = 0;
1674
1675	if (req->tp_block_nr) {
1676		int i, l;
1677
1678		/* Sanity tests and some calculations */
1679
1680		if (unlikely(po->pg_vec))
1681			return -EBUSY;
1682
1683		if (unlikely((int)req->tp_block_size <= 0))
1684			return -EINVAL;
1685		if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
1686			return -EINVAL;
1687		if (unlikely(req->tp_frame_size < TPACKET_HDRLEN))
1688			return -EINVAL;
1689		if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
1690			return -EINVAL;
1691
1692		po->frames_per_block = req->tp_block_size/req->tp_frame_size;
1693		if (unlikely(po->frames_per_block <= 0))
1694			return -EINVAL;
1695		if (unlikely((po->frames_per_block * req->tp_block_nr) !=
1696			     req->tp_frame_nr))
1697			return -EINVAL;
1698
1699		err = -ENOMEM;
1700		order = get_order(req->tp_block_size);
1701		pg_vec = alloc_pg_vec(req, order);
1702		if (unlikely(!pg_vec))
1703			goto out;
1704
1705		l = 0;
1706		for (i = 0; i < req->tp_block_nr; i++) {
1707			char *ptr = pg_vec[i];
1708			struct tpacket_hdr *header;
1709			int k;
1710
1711			for (k = 0; k < po->frames_per_block; k++) {
1712				header = (struct tpacket_hdr *) ptr;
1713				header->tp_status = TP_STATUS_KERNEL;
1714				ptr += req->tp_frame_size;
1715			}
1716		}
1717		/* Done */
1718	} else {
1719		if (unlikely(req->tp_frame_nr))
1720			return -EINVAL;
1721	}
1722
1723	lock_sock(sk);
1724
1725	/* Detach socket from network */
1726	spin_lock(&po->bind_lock);
1727	was_running = po->running;
1728	num = po->num;
1729	if (was_running) {
1730		__dev_remove_pack(&po->prot_hook);
1731		po->num = 0;
1732		po->running = 0;
1733		__sock_put(sk);
1734	}
1735	spin_unlock(&po->bind_lock);
1736
1737	synchronize_net();
1738
1739	err = -EBUSY;
1740	if (closing || atomic_read(&po->mapped) == 0) {
1741		err = 0;
1742#define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
1743
1744		spin_lock_bh(&sk->sk_receive_queue.lock);
1745		pg_vec = XC(po->pg_vec, pg_vec);
1746		po->frame_max = (req->tp_frame_nr - 1);
1747		po->head = 0;
1748		po->frame_size = req->tp_frame_size;
1749		spin_unlock_bh(&sk->sk_receive_queue.lock);
1750
1751		order = XC(po->pg_vec_order, order);
1752		req->tp_block_nr = XC(po->pg_vec_len, req->tp_block_nr);
1753
1754		po->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
1755		po->prot_hook.func = po->pg_vec ? tpacket_rcv : packet_rcv;
1756		skb_queue_purge(&sk->sk_receive_queue);
1757#undef XC
1758		if (atomic_read(&po->mapped))
1759			printk(KERN_DEBUG "packet_mmap: vma is busy: %d\n", atomic_read(&po->mapped));
1760	}
1761
1762	spin_lock(&po->bind_lock);
1763	if (was_running && !po->running) {
1764		sock_hold(sk);
1765		po->running = 1;
1766		po->num = num;
1767		dev_add_pack(&po->prot_hook);
1768	}
1769	spin_unlock(&po->bind_lock);
1770
1771	release_sock(sk);
1772
1773	if (pg_vec)
1774		free_pg_vec(pg_vec, order, req->tp_block_nr);
1775out:
1776	return err;
1777}
1778
1779static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1780{
1781	struct sock *sk = sock->sk;
1782	struct packet_sock *po = pkt_sk(sk);
1783	unsigned long size;
1784	unsigned long start;
1785	int err = -EINVAL;
1786	int i;
1787
1788	if (vma->vm_pgoff)
1789		return -EINVAL;
1790
1791	size = vma->vm_end - vma->vm_start;
1792
1793	lock_sock(sk);
1794	if (po->pg_vec == NULL)
1795		goto out;
1796	if (size != po->pg_vec_len*po->pg_vec_pages*PAGE_SIZE)
1797		goto out;
1798
1799	start = vma->vm_start;
1800	for (i = 0; i < po->pg_vec_len; i++) {
1801		struct page *page = virt_to_page(po->pg_vec[i]);
1802		int pg_num;
1803
1804		for (pg_num = 0; pg_num < po->pg_vec_pages; pg_num++, page++) {
1805			err = vm_insert_page(vma, start, page);
1806			if (unlikely(err))
1807				goto out;
1808			start += PAGE_SIZE;
1809		}
1810	}
1811	atomic_inc(&po->mapped);
1812	vma->vm_ops = &packet_mmap_ops;
1813	err = 0;
1814
1815out:
1816	release_sock(sk);
1817	return err;
1818}
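
/* Illustration only: a hedged user-space sketch of the PACKET_RX_RING /
 * mmap() interface implemented by packet_set_ring() and packet_mmap()
 * above.  The ring geometry is an example assumption (4K pages, two
 * 2048-byte frames per block); "fd" is an open PF_PACKET socket.
 *
 *	struct tpacket_req req = {
 *		.tp_block_size	= 4096,		// multiple of PAGE_SIZE
 *		.tp_block_nr	= 64,
 *		.tp_frame_size	= 2048,		// multiple of TPACKET_ALIGNMENT
 *		.tp_frame_nr	= 64 * 2,	// tp_block_nr * frames per block
 *	};
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *	unsigned int i = 0;
 *	char *ring;
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *	ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
 *		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *
 *	for (;;) {
 *		struct tpacket_hdr *h = (void *)(ring + i * req.tp_frame_size);
 *
 *		while (!(h->tp_status & TP_STATUS_USER))
 *			poll(&pfd, 1, -1);		// wait for the kernel
 *		printf("frame: %u bytes at offset %u\n",
 *		       h->tp_snaplen, h->tp_mac);
 *		h->tp_status = TP_STATUS_KERNEL;	// hand the slot back
 *		i = (i + 1) % req.tp_frame_nr;
 *	}
 */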
1819#endif
1820
1821
1822static const struct proto_ops packet_ops_spkt = {
1823	.family =	PF_PACKET,
1824	.owner =	THIS_MODULE,
1825	.release =	packet_release,
1826	.bind =		packet_bind_spkt,
1827	.connect =	sock_no_connect,
1828	.socketpair =	sock_no_socketpair,
1829	.accept =	sock_no_accept,
1830	.getname =	packet_getname_spkt,
1831	.poll =		datagram_poll,
1832	.ioctl =	packet_ioctl,
1833	.listen =	sock_no_listen,
1834	.shutdown =	sock_no_shutdown,
1835	.setsockopt =	sock_no_setsockopt,
1836	.getsockopt =	sock_no_getsockopt,
1837	.sendmsg =	packet_sendmsg_spkt,
1838	.recvmsg =	packet_recvmsg,
1839	.mmap =		sock_no_mmap,
1840	.sendpage =	sock_no_sendpage,
1841};
1842
1843static const struct proto_ops packet_ops = {
1844	.family =	PF_PACKET,
1845	.owner =	THIS_MODULE,
1846	.release =	packet_release,
1847	.bind =		packet_bind,
1848	.connect =	sock_no_connect,
1849	.socketpair =	sock_no_socketpair,
1850	.accept =	sock_no_accept,
1851	.getname =	packet_getname,
1852	.poll =		packet_poll,
1853	.ioctl =	packet_ioctl,
1854	.listen =	sock_no_listen,
1855	.shutdown =	sock_no_shutdown,
1856	.setsockopt =	packet_setsockopt,
1857	.getsockopt =	packet_getsockopt,
1858	.sendmsg =	packet_sendmsg,
1859	.recvmsg =	packet_recvmsg,
1860	.mmap =		packet_mmap,
1861	.sendpage =	sock_no_sendpage,
1862};
1863
1864static struct net_proto_family packet_family_ops = {
1865	.family =	PF_PACKET,
1866	.create =	packet_create,
1867	.owner	=	THIS_MODULE,
1868};
1869
1870static struct notifier_block packet_netdev_notifier = {
1871	.notifier_call =packet_notifier,
1872};
1873
1874#ifdef CONFIG_PROC_FS
1875static inline struct sock *packet_seq_idx(loff_t off)
1876{
1877	struct sock *s;
1878	struct hlist_node *node;
1879
1880	sk_for_each(s, node, &packet_sklist) {
1881		if (!off--)
1882			return s;
1883	}
1884	return NULL;
1885}
1886
1887static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
1888{
1889	read_lock(&packet_sklist_lock);
1890	return *pos ? packet_seq_idx(*pos - 1) : SEQ_START_TOKEN;
1891}
1892
1893static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1894{
1895	++*pos;
1896	return  (v == SEQ_START_TOKEN)
1897		? sk_head(&packet_sklist)
1898		: sk_next((struct sock*)v) ;
1899}
1900
1901static void packet_seq_stop(struct seq_file *seq, void *v)
1902{
1903	read_unlock(&packet_sklist_lock);
1904}
1905
1906static int packet_seq_show(struct seq_file *seq, void *v)
1907{
1908	if (v == SEQ_START_TOKEN)
1909		seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
1910	else {
1911		struct sock *s = v;
1912		const struct packet_sock *po = pkt_sk(s);
1913
1914		seq_printf(seq,
1915			   "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
1916			   s,
1917			   atomic_read(&s->sk_refcnt),
1918			   s->sk_type,
1919			   ntohs(po->num),
1920			   po->ifindex,
1921			   po->running,
1922			   atomic_read(&s->sk_rmem_alloc),
1923			   sock_i_uid(s),
1924			   sock_i_ino(s) );
1925	}
1926
1927	return 0;
1928}
1929
1930static const struct seq_operations packet_seq_ops = {
1931	.start	= packet_seq_start,
1932	.next	= packet_seq_next,
1933	.stop	= packet_seq_stop,
1934	.show	= packet_seq_show,
1935};
1936
1937static int packet_seq_open(struct inode *inode, struct file *file)
1938{
1939	return seq_open(file, &packet_seq_ops);
1940}
1941
1942static const struct file_operations packet_seq_fops = {
1943	.owner		= THIS_MODULE,
1944	.open		= packet_seq_open,
1945	.read		= seq_read,
1946	.llseek		= seq_lseek,
1947	.release	= seq_release,
1948};
1949
1950#endif
1951
1952static void __exit packet_exit(void)
1953{
1954	proc_net_remove("packet");
1955	unregister_netdevice_notifier(&packet_netdev_notifier);
1956	sock_unregister(PF_PACKET);
1957	proto_unregister(&packet_proto);
1958}
1959
1960static int __init packet_init(void)
1961{
1962	int rc = proto_register(&packet_proto, 0);
1963
1964	if (rc != 0)
1965		goto out;
1966
1967	sock_register(&packet_family_ops);
1968	register_netdevice_notifier(&packet_netdev_notifier);
1969	proc_net_fops_create("packet", 0, &packet_seq_fops);
1970out:
1971	return rc;
1972}
1973
1974module_init(packet_init);
1975module_exit(packet_exit);
1976MODULE_LICENSE("GPL");
1977MODULE_ALIAS_NETPROTO(PF_PACKET);
1978