af_packet.c revision 1162563f82b434e3099c9e6c1bbdba846d792f0d
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		PACKET - implements raw packet sockets.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *		Alan Cox	:	verify_area() now used correctly
 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
 *		Alan Cox	:	tidied skbuff lists.
 *		Alan Cox	:	Now uses generic datagram routines I
 *					added. Also fixed the peek/read crash
 *					from all old Linux datagram code.
 *		Alan Cox	:	Uses the improved datagram code.
 *		Alan Cox	:	Added NULL's for socket options.
 *		Alan Cox	:	Re-commented the code.
 *		Alan Cox	:	Use new kernel side addressing
 *		Rob Janssen	:	Correct MTU usage.
 *		Dave Platt	:	Counter leaks caused by incorrect
 *					interrupt locking and some slightly
 *					dubious gcc output. Can you read
 *					compiler: it said _VOLATILE_
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	New buffers. Use sk->mac.raw.
 *		Alan Cox	:	sendmsg/recvmsg support.
 *		Alan Cox	:	Protocol setting support
 *		Alexey Kuznetsov:	Untied from IPv4 stack.
 *		Cyrus Durgin	:	Fixed kerneld for kmod.
 *		Michal Ostrowski:	Module initialization cleanup.
 *		Ulises Alonso	:	Frame number limit removal and
 *					packet_set_ring memory leak.
 *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
 *					The convention is that longer addresses
 *					will simply extend the hardware address
 *					byte arrays at the end of sockaddr_ll
 *					and packet_mreq.
 *		Johann Baudy	:	Added TX RING.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 */

#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>

#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif

/*
   Assumptions:
   - if a device has no dev->hard_header routine, it adds and removes the ll
     header inside itself. In this case the ll header is invisible outside
     of the device, but higher levels still should reserve
     dev->hard_header_len.
     Some devices are clever enough to reallocate the skb when the header
     will not fit into the reserved space (tunnels); others are silly
     (PPP).
   - a packet socket receives packets with the ll header pulled,
     so SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev->hard_header != NULL
   mac_header -> ll header
   data       -> data

Outgoing, dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

Incoming, dev->hard_header == NULL
   mac_header -> UNKNOWN position. It is very likely that it points to the
		 ll header. PPP does this, which is wrong, because it
		 introduces asymmetry between the rx and tx paths.
   data       -> data

Outgoing, dev->hard_header == NULL
   mac_header -> data. The ll header is still not built!
   data       -> data

Summary:
   If dev->hard_header == NULL we are unlikely to restore a sensible ll
   header.


On transmit:
------------

dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

dev->hard_header == NULL (ll header is added by the device, we cannot
			  control it)
   mac_header -> data
   data       -> data

   We should set nh.raw on output to the correct position,
   the packet classifier depends on it.
 */
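/*
 * Illustrative only (not part of the kernel build): a minimal sketch of
 * the userspace view described above, assuming a host with an Ethernet
 * interface and CAP_NET_RAW. With SOCK_RAW the ll (Ethernet) header is
 * present at the start of each received buffer; with SOCK_DGRAM it would
 * be removed and reported via sockaddr_ll instead.
 *
 *	#include <arpa/inet.h>
 *	#include <linux/if_ether.h>
 *	#include <linux/if_packet.h>
 *	#include <stdio.h>
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		char buf[2048];
 *		struct sockaddr_ll from;
 *		socklen_t fromlen = sizeof(from);
 *		int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *		ssize_t n;
 *
 *		if (fd < 0)
 *			return 1;
 *		n = recvfrom(fd, buf, sizeof(buf), 0,
 *			     (struct sockaddr *)&from, &fromlen);
 *		if (n >= (ssize_t)sizeof(struct ethhdr)) {
 *			struct ethhdr *eth = (struct ethhdr *)buf;
 *			printf("ifindex %d proto 0x%04x\n",
 *			       from.sll_ifindex, ntohs(eth->h_proto));
 *		}
 *		close(fd);
 *		return 0;
 *	}
 */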
/* Private packet socket structures. */

struct packet_mclist {
	struct packet_mclist	*next;
	int			ifindex;
	int			count;
	unsigned short		type;
	unsigned short		alen;
	unsigned char		addr[MAX_ADDR_LEN];
};
/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max {
	int		mr_ifindex;
	unsigned short	mr_type;
	unsigned short	mr_alen;
	unsigned char	mr_address[MAX_ADDR_LEN];
};

static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
		int closing, int tx_ring);

struct packet_ring_buffer {
	char			**pg_vec;
	unsigned int		head;
	unsigned int		frames_per_block;
	unsigned int		frame_size;
	unsigned int		frame_max;

	unsigned int		pg_vec_order;
	unsigned int		pg_vec_pages;
	unsigned int		pg_vec_len;

	atomic_t		pending;
};

struct packet_sock;
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);

static void packet_flush_mclist(struct sock *sk);

struct packet_sock {
	/* struct sock has to be the first member of packet_sock */
	struct sock		sk;
	struct tpacket_stats	stats;
	struct packet_ring_buffer	rx_ring;
	struct packet_ring_buffer	tx_ring;
	int			copy_thresh;
	spinlock_t		bind_lock;
	struct mutex		pg_vec_lock;
	unsigned int		running:1,	/* prot_hook is attached*/
				auxdata:1,
				origdev:1,
				has_vnet_hdr:1;
	int			ifindex;	/* bound device		*/
	__be16			num;
	struct packet_mclist	*mclist;
	atomic_t		mapped;
	enum tpacket_versions	tp_version;
	unsigned int		tp_hdrlen;
	unsigned int		tp_reserve;
	unsigned int		tp_loss:1;
	struct packet_type	prot_hook ____cacheline_aligned_in_smp;
};

struct packet_skb_cb {
	unsigned int origlen;
	union {
		struct sockaddr_pkt pkt;
		struct sockaddr_ll ll;
	} sa;
};

#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))

static void __packet_set_status(struct packet_sock *po, void *frame, int status)
{
	union {
		struct tpacket_hdr *h1;
		struct tpacket2_hdr *h2;
		void *raw;
	} h;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_status = status;
		flush_dcache_page(virt_to_page(&h.h1->tp_status));
		break;
	case TPACKET_V2:
		h.h2->tp_status = status;
		flush_dcache_page(virt_to_page(&h.h2->tp_status));
		break;
	default:
		pr_err("TPACKET version not supported\n");
		BUG();
	}

	smp_wmb();
}
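/*
 * The tp_status word in each frame header is the handshake with the
 * userspace side of the mmap()ed ring: the kernel fills a frame and
 * publishes it by storing TP_STATUS_USER (ordered by the smp_wmb()
 * above), and userspace hands the slot back by storing TP_STATUS_KERNEL.
 * A minimal consumer sketch, assuming a TPACKET_V2 RX ring already
 * mapped at "ring" with "nframes" frames of "frame_sz" bytes and the
 * socket in "pfd" (all hypothetical names; consume() is a placeholder):
 *
 *	unsigned int i = 0;
 *
 *	for (;;) {
 *		volatile struct tpacket2_hdr *hdr =
 *			(void *)(ring + i * frame_sz);
 *
 *		while (!(hdr->tp_status & TP_STATUS_USER))
 *			poll(&pfd, 1, -1);
 *		consume(ring + i * frame_sz + hdr->tp_mac, hdr->tp_snaplen);
 *		hdr->tp_status = TP_STATUS_KERNEL;
 *		i = (i + 1) % nframes;
 *	}
 */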
static int __packet_get_status(struct packet_sock *po, void *frame)
{
	union {
		struct tpacket_hdr *h1;
		struct tpacket2_hdr *h2;
		void *raw;
	} h;

	smp_rmb();

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		flush_dcache_page(virt_to_page(&h.h1->tp_status));
		return h.h1->tp_status;
	case TPACKET_V2:
		flush_dcache_page(virt_to_page(&h.h2->tp_status));
		return h.h2->tp_status;
	default:
		pr_err("TPACKET version not supported\n");
		BUG();
		return 0;
	}
}

static void *packet_lookup_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		unsigned int position,
		int status)
{
	unsigned int pg_vec_pos, frame_offset;
	union {
		struct tpacket_hdr *h1;
		struct tpacket2_hdr *h2;
		void *raw;
	} h;

	pg_vec_pos = position / rb->frames_per_block;
	frame_offset = position % rb->frames_per_block;

	h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size);

	if (status != __packet_get_status(po, h.raw))
		return NULL;

	return h.raw;
}

static inline void *packet_current_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status)
{
	return packet_lookup_frame(po, rb, rb->head, status);
}

static inline void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status)
{
	unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
	return packet_lookup_frame(po, rb, previous, status);
}

static inline void packet_increment_head(struct packet_ring_buffer *buff)
{
	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
}

static inline struct packet_sock *pkt_sk(struct sock *sk)
{
	return (struct packet_sock *)sk;
}

static void packet_sock_destruct(struct sock *sk)
{
	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
	WARN_ON(atomic_read(&sk->sk_wmem_alloc));

	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_err("Attempt to release alive packet socket: %p\n", sk);
		return;
	}

	sk_refcnt_debug_dec(sk);
}


static const struct proto_ops packet_ops;

static const struct proto_ops packet_ops_spkt;

static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
			   struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_pkt *spkt;

	/*
	 *	When we registered the protocol we saved the socket in the
	 *	data field for just this event.
	 */

	sk = pt->af_packet_priv;

	/*
	 *	Yank back the headers [hope the device set this
	 *	right or kerboom...]
	 *
	 *	Incoming packets have the ll header pulled,
	 *	push it back.
	 *
	 *	For outgoing ones skb->data == skb_mac_header(skb)
	 *	so that this procedure is a noop.
	 */

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto out;

	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto out;

	skb = skb_share_check(skb, GFP_ATOMIC);
	if (skb == NULL)
		goto oom;

	/* drop any routing info */
	skb_dst_drop(skb);

	/* drop conntrack reference */
	nf_reset(skb);

	spkt = &PACKET_SKB_CB(skb)->sa.pkt;

	skb_push(skb, skb->data - skb_mac_header(skb));

	/*
	 *	The SOCK_PACKET socket receives _all_ frames.
	 */

	spkt->spkt_family = dev->type;
	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
	spkt->spkt_protocol = skb->protocol;

	/*
	 *	Charge the memory to the socket. This is done specifically
	 *	to prevent sockets using all the memory up.
	 */

	if (sock_queue_rcv_skb(sk, skb) == 0)
		return 0;

out:
	kfree_skb(skb);
oom:
	return 0;
}
/*
 *	Output a raw packet to a device layer. This bypasses all the other
 *	protocol layers and you must therefore supply it with a complete
 *	frame.
 */

static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
			       struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
	struct sk_buff *skb = NULL;
	struct net_device *dev;
	__be16 proto = 0;
	int err;

	/*
	 *	Get and verify the address.
	 */

	if (saddr) {
		if (msg->msg_namelen < sizeof(struct sockaddr))
			return -EINVAL;
		if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
			proto = saddr->spkt_protocol;
	} else
		return -ENOTCONN;	/* SOCK_PACKET must be sent giving an address */

	/*
	 *	Find the device first to size check it
	 */

	saddr->spkt_device[13] = 0;
retry:
	rcu_read_lock();
	dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
	err = -ENODEV;
	if (dev == NULL)
		goto out_unlock;

	err = -ENETDOWN;
	if (!(dev->flags & IFF_UP))
		goto out_unlock;

	/*
	 * You may not queue a frame bigger than the mtu. This is the lowest
	 * level raw protocol and you must do your own fragmentation at this
	 * level.
	 */

	err = -EMSGSIZE;
	if (len > dev->mtu + dev->hard_header_len)
		goto out_unlock;

	if (!skb) {
		size_t reserved = LL_RESERVED_SPACE(dev);
		unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;

		rcu_read_unlock();
		skb = sock_wmalloc(sk, len + reserved, 0, GFP_KERNEL);
		if (skb == NULL)
			return -ENOBUFS;
		/* FIXME: Save some space for broken drivers that write a hard
		 * header at transmission time by themselves. PPP is the
		 * notable one here. This should really be fixed at the driver
		 * level.
		 */
		skb_reserve(skb, reserved);
		skb_reset_network_header(skb);

		/* Try to align data part correctly */
		if (hhlen) {
			skb->data -= hhlen;
			skb->tail -= hhlen;
			if (len < hhlen)
				skb_reset_network_header(skb);
		}
		err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
		if (err)
			goto out_free;
		goto retry;
	}


	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	dev_queue_xmit(skb);
	rcu_read_unlock();
	return len;

out_unlock:
	rcu_read_unlock();
out_free:
	kfree_skb(skb);
	return err;
}

static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
				      unsigned int res)
{
	struct sk_filter *filter;

	rcu_read_lock_bh();
	filter = rcu_dereference_bh(sk->sk_filter);
	if (filter != NULL)
		res = sk_run_filter(skb, filter->insns, filter->len);
	rcu_read_unlock_bh();

	return res;
}

/*
   This function does lazy skb cloning in the hope that most packets
   are discarded by BPF.

   Note the tricky part: we DO mangle a shared skb! skb->data, skb->len
   and skb->cb are mangled. This works because (and for as long as)
   packets arriving here are owned by the current CPU. Output packets are
   cloned by dev_queue_xmit_nit(), input packets are processed by net_bh
   sequentially, so if we return the skb to its original state on exit,
   we will not harm anyone.
 */
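/*
 * Illustrative only: the sk_filter consulted by run_filter() above is
 * the classic BPF program userspace attaches with SO_ATTACH_FILTER. A
 * minimal sketch that accepts every packet, truncated to 96 bytes (the
 * constant returned by BPF_RET is the snap length; 0 would drop the
 * packet). "fd" is an already-open packet socket:
 *
 *	#include <linux/filter.h>
 *	#include <sys/socket.h>
 *
 *	static struct sock_filter code[] = {
 *		{ BPF_RET | BPF_K, 0, 0, 96 },
 *	};
 *	static struct sock_fprog prog = {
 *		.len = sizeof(code) / sizeof(code[0]),
 *		.filter = code,
 *	};
 *
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
 */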
static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
		      struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_ll *sll;
	struct packet_sock *po;
	u8 *skb_head = skb->data;
	int skb_len = skb->len;
	unsigned int snaplen, res;

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto drop;

	sk = pt->af_packet_priv;
	po = pkt_sk(sk);

	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto drop;

	skb->dev = dev;

	if (dev->header_ops) {
		/* The device has an explicit notion of ll header,
		   exported to higher levels.

		   Otherwise, the device hides details of its frame
		   structure, so that the corresponding packet head is
		   never delivered to the user.
		 */
		if (sk->sk_type != SOCK_DGRAM)
			skb_push(skb, skb->data - skb_mac_header(skb));
		else if (skb->pkt_type == PACKET_OUTGOING) {
			/* Special case: outgoing packets have ll header at head */
			skb_pull(skb, skb_network_offset(skb));
		}
	}

	snaplen = skb->len;

	res = run_filter(skb, sk, snaplen);
	if (!res)
		goto drop_n_restore;
	if (snaplen > res)
		snaplen = res;

	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
	    (unsigned)sk->sk_rcvbuf)
		goto drop_n_acct;

	if (skb_shared(skb)) {
		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
		if (nskb == NULL)
			goto drop_n_acct;

		if (skb_head != skb->data) {
			skb->data = skb_head;
			skb->len = skb_len;
		}
		kfree_skb(skb);
		skb = nskb;
	}

	BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
		     sizeof(skb->cb));

	sll = &PACKET_SKB_CB(skb)->sa.ll;
	sll->sll_family = AF_PACKET;
	sll->sll_hatype = dev->type;
	sll->sll_protocol = skb->protocol;
	sll->sll_pkttype = skb->pkt_type;
	if (unlikely(po->origdev))
		sll->sll_ifindex = orig_dev->ifindex;
	else
		sll->sll_ifindex = dev->ifindex;

	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);

	PACKET_SKB_CB(skb)->origlen = skb->len;

	if (pskb_trim(skb, snaplen))
		goto drop_n_acct;

	skb_set_owner_r(skb, sk);
	skb->dev = NULL;
	skb_dst_drop(skb);

	/* drop conntrack reference */
	nf_reset(skb);

	spin_lock(&sk->sk_receive_queue.lock);
	po->stats.tp_packets++;
	skb->dropcount = atomic_read(&sk->sk_drops);
	__skb_queue_tail(&sk->sk_receive_queue, skb);
	spin_unlock(&sk->sk_receive_queue.lock);
	sk->sk_data_ready(sk, skb->len);
	return 0;

drop_n_acct:
	po->stats.tp_drops = atomic_inc_return(&sk->sk_drops);

drop_n_restore:
	if (skb_head != skb->data && skb_shared(skb)) {
		skb->data = skb_head;
		skb->len = skb_len;
	}
drop:
	consume_skb(skb);
	return 0;
}

static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct packet_sock *po;
	struct sockaddr_ll *sll;
	union {
		struct tpacket_hdr *h1;
		struct tpacket2_hdr *h2;
		void *raw;
	} h;
	u8 *skb_head = skb->data;
	int skb_len = skb->len;
	unsigned int snaplen, res;
	unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
	unsigned short macoff, netoff, hdrlen;
	struct sk_buff *copy_skb = NULL;
	struct timeval tv;
	struct timespec ts;

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto drop;

	sk = pt->af_packet_priv;
	po = pkt_sk(sk);

	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto drop;
	if (dev->header_ops) {
		if (sk->sk_type != SOCK_DGRAM)
			skb_push(skb, skb->data - skb_mac_header(skb));
		else if (skb->pkt_type == PACKET_OUTGOING) {
			/* Special case: outgoing packets have ll header at head */
			skb_pull(skb, skb_network_offset(skb));
		}
	}

	if (skb->ip_summed == CHECKSUM_PARTIAL)
		status |= TP_STATUS_CSUMNOTREADY;

	snaplen = skb->len;

	res = run_filter(skb, sk, snaplen);
	if (!res)
		goto drop_n_restore;
	if (snaplen > res)
		snaplen = res;

	if (sk->sk_type == SOCK_DGRAM) {
		macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
				  po->tp_reserve;
	} else {
		unsigned maclen = skb_network_offset(skb);
		netoff = TPACKET_ALIGN(po->tp_hdrlen +
				       (maclen < 16 ? 16 : maclen)) +
			po->tp_reserve;
		macoff = netoff - maclen;
	}

	if (macoff + snaplen > po->rx_ring.frame_size) {
		if (po->copy_thresh &&
		    atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
		    (unsigned)sk->sk_rcvbuf) {
			if (skb_shared(skb)) {
				copy_skb = skb_clone(skb, GFP_ATOMIC);
			} else {
				copy_skb = skb_get(skb);
				skb_head = skb->data;
			}
			if (copy_skb)
				skb_set_owner_r(copy_skb, sk);
		}
		snaplen = po->rx_ring.frame_size - macoff;
		if ((int)snaplen < 0)
			snaplen = 0;
	}

	spin_lock(&sk->sk_receive_queue.lock);
	h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL);
	if (!h.raw)
		goto ring_is_full;
	packet_increment_head(&po->rx_ring);
	po->stats.tp_packets++;
	if (copy_skb) {
		status |= TP_STATUS_COPY;
		__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
	}
	if (!po->stats.tp_drops)
		status &= ~TP_STATUS_LOSING;
	spin_unlock(&sk->sk_receive_queue.lock);

	skb_copy_bits(skb, 0, h.raw + macoff, snaplen);

	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_len = skb->len;
		h.h1->tp_snaplen = snaplen;
		h.h1->tp_mac = macoff;
		h.h1->tp_net = netoff;
		if (skb->tstamp.tv64)
			tv = ktime_to_timeval(skb->tstamp);
		else
			do_gettimeofday(&tv);
		h.h1->tp_sec = tv.tv_sec;
		h.h1->tp_usec = tv.tv_usec;
		hdrlen = sizeof(*h.h1);
		break;
	case TPACKET_V2:
		h.h2->tp_len = skb->len;
		h.h2->tp_snaplen = snaplen;
		h.h2->tp_mac = macoff;
		h.h2->tp_net = netoff;
		if (skb->tstamp.tv64)
			ts = ktime_to_timespec(skb->tstamp);
		else
			getnstimeofday(&ts);
		h.h2->tp_sec = ts.tv_sec;
		h.h2->tp_nsec = ts.tv_nsec;
		h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
		hdrlen = sizeof(*h.h2);
		break;
	default:
		BUG();
	}

	sll = h.raw + TPACKET_ALIGN(hdrlen);
	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
	sll->sll_family = AF_PACKET;
	sll->sll_hatype = dev->type;
	sll->sll_protocol = skb->protocol;
	sll->sll_pkttype = skb->pkt_type;
	if (unlikely(po->origdev))
		sll->sll_ifindex = orig_dev->ifindex;
	else
		sll->sll_ifindex = dev->ifindex;

	__packet_set_status(po, h.raw, status);
	smp_mb();
	{
		struct page *p_start, *p_end;
		u8 *h_end = h.raw + macoff + snaplen - 1;

		p_start = virt_to_page(h.raw);
		p_end = virt_to_page(h_end);
		while (p_start <= p_end) {
			flush_dcache_page(p_start);
			p_start++;
		}
	}

	sk->sk_data_ready(sk, 0);

drop_n_restore:
	if (skb_head != skb->data && skb_shared(skb)) {
		skb->data = skb_head;
		skb->len = skb_len;
	}
drop:
	kfree_skb(skb);
	return 0;

ring_is_full:
	po->stats.tp_drops++;
	spin_unlock(&sk->sk_receive_queue.lock);

	sk->sk_data_ready(sk, 0);
	kfree_skb(copy_skb);
	goto drop_n_restore;
}
static void tpacket_destruct_skb(struct sk_buff *skb)
{
	struct packet_sock *po = pkt_sk(skb->sk);
	void *ph;

	BUG_ON(skb == NULL);

	if (likely(po->tx_ring.pg_vec)) {
		ph = skb_shinfo(skb)->destructor_arg;
		BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
		BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
		atomic_dec(&po->tx_ring.pending);
		__packet_set_status(po, ph, TP_STATUS_AVAILABLE);
	}

	sock_wfree(skb);
}

static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
		void *frame, struct net_device *dev, int size_max,
		__be16 proto, unsigned char *addr)
{
	union {
		struct tpacket_hdr *h1;
		struct tpacket2_hdr *h2;
		void *raw;
	} ph;
	int to_write, offset, len, tp_len, nr_frags, len_max;
	struct socket *sock = po->sk.sk_socket;
	struct page *page;
	void *data;
	int err;

	ph.raw = frame;

	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = po->sk.sk_priority;
	skb->mark = po->sk.sk_mark;
	skb_shinfo(skb)->destructor_arg = ph.raw;

	switch (po->tp_version) {
	case TPACKET_V2:
		tp_len = ph.h2->tp_len;
		break;
	default:
		tp_len = ph.h1->tp_len;
		break;
	}
	if (unlikely(tp_len > size_max)) {
		pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
		return -EMSGSIZE;
	}

	skb_reserve(skb, LL_RESERVED_SPACE(dev));
	skb_reset_network_header(skb);

	data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
	to_write = tp_len;

	if (sock->type == SOCK_DGRAM) {
		err = dev_hard_header(skb, dev, ntohs(proto), addr,
				NULL, tp_len);
		if (unlikely(err < 0))
			return -EINVAL;
	} else if (dev->hard_header_len) {
		/* net device doesn't like empty head */
		if (unlikely(tp_len <= dev->hard_header_len)) {
			pr_err("packet size is too short (%d < %d)\n",
			       tp_len, dev->hard_header_len);
			return -EINVAL;
		}

		skb_push(skb, dev->hard_header_len);
		err = skb_store_bits(skb, 0, data,
				dev->hard_header_len);
		if (unlikely(err))
			return err;

		data += dev->hard_header_len;
		to_write -= dev->hard_header_len;
	}

	err = -EFAULT;
	page = virt_to_page(data);
	offset = offset_in_page(data);
	len_max = PAGE_SIZE - offset;
	len = ((to_write > len_max) ? len_max : to_write);

	skb->data_len = to_write;
	skb->len += to_write;
	skb->truesize += to_write;
	atomic_add(to_write, &po->sk.sk_wmem_alloc);

	while (likely(to_write)) {
		nr_frags = skb_shinfo(skb)->nr_frags;

		if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
			pr_err("Packet exceed the number of skb frags(%lu)\n",
			       MAX_SKB_FRAGS);
			return -EFAULT;
		}

		flush_dcache_page(page);
		get_page(page);
		skb_fill_page_desc(skb,
				nr_frags,
				page++, offset, len);
		to_write -= len;
		offset = 0;
		len_max = PAGE_SIZE;
		len = ((to_write > len_max) ? len_max : to_write);
	}

	return tp_len;
}
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
{
	struct socket *sock;
	struct sk_buff *skb;
	struct net_device *dev;
	__be16 proto;
	int ifindex, err, reserve = 0;
	void *ph;
	struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
	int tp_len, size_max;
	unsigned char *addr;
	int len_sum = 0;
	int status = 0;

	sock = po->sk.sk_socket;

	mutex_lock(&po->pg_vec_lock);

	err = -EBUSY;
	if (saddr == NULL) {
		ifindex	= po->ifindex;
		proto	= po->num;
		addr	= NULL;
	} else {
		err = -EINVAL;
		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
			goto out;
		if (msg->msg_namelen < (saddr->sll_halen
					+ offsetof(struct sockaddr_ll,
						sll_addr)))
			goto out;
		ifindex	= saddr->sll_ifindex;
		proto	= saddr->sll_protocol;
		addr	= saddr->sll_addr;
	}

	dev = dev_get_by_index(sock_net(&po->sk), ifindex);
	err = -ENXIO;
	if (unlikely(dev == NULL))
		goto out;

	reserve = dev->hard_header_len;

	err = -ENETDOWN;
	if (unlikely(!(dev->flags & IFF_UP)))
		goto out_put;

	size_max = po->tx_ring.frame_size
		- (po->tp_hdrlen - sizeof(struct sockaddr_ll));

	if (size_max > dev->mtu + reserve)
		size_max = dev->mtu + reserve;

	do {
		ph = packet_current_frame(po, &po->tx_ring,
				TP_STATUS_SEND_REQUEST);

		if (unlikely(ph == NULL)) {
			schedule();
			continue;
		}

		status = TP_STATUS_SEND_REQUEST;
		skb = sock_alloc_send_skb(&po->sk,
				LL_ALLOCATED_SPACE(dev)
				+ sizeof(struct sockaddr_ll),
				0, &err);

		if (unlikely(skb == NULL))
			goto out_status;

		tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
				addr);

		if (unlikely(tp_len < 0)) {
			if (po->tp_loss) {
				__packet_set_status(po, ph,
						TP_STATUS_AVAILABLE);
				packet_increment_head(&po->tx_ring);
				kfree_skb(skb);
				continue;
			} else {
				status = TP_STATUS_WRONG_FORMAT;
				err = tp_len;
				goto out_status;
			}
		}

		skb->destructor = tpacket_destruct_skb;
		__packet_set_status(po, ph, TP_STATUS_SENDING);
		atomic_inc(&po->tx_ring.pending);

		status = TP_STATUS_SEND_REQUEST;
		err = dev_queue_xmit(skb);
		if (unlikely(err > 0)) {
			err = net_xmit_errno(err);
			if (err && __packet_get_status(po, ph) ==
				   TP_STATUS_AVAILABLE) {
				/* skb was destructed already */
				skb = NULL;
				goto out_status;
			}
			/*
			 * skb was dropped but not destructed yet;
			 * let's treat it like congestion or err < 0
			 */
			err = 0;
		}
		packet_increment_head(&po->tx_ring);
		len_sum += tp_len;
	} while (likely((ph != NULL) ||
			((!(msg->msg_flags & MSG_DONTWAIT)) &&
			 (atomic_read(&po->tx_ring.pending))))
		);

	err = len_sum;
	goto out_put;

out_status:
	__packet_set_status(po, ph, status);
	kfree_skb(skb);
out_put:
	dev_put(dev);
out:
	mutex_unlock(&po->pg_vec_lock);
	return err;
}
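/*
 * Illustrative only: the userspace side of the TX ring loop above,
 * assuming a TPACKET_V1 TX ring already configured with PACKET_TX_RING
 * and mmap()ed at "ring" ("frame_sz", "i" and fill_frame() are
 * hypothetical). Each frame starts with struct tpacket_hdr; per
 * tpacket_fill_skb() the packet data sits at
 * TPACKET_HDRLEN - sizeof(struct sockaddr_ll) into the frame. The slot
 * is handed to the kernel by storing TP_STATUS_SEND_REQUEST and the ring
 * is kicked with a zero-length send():
 *
 *	struct tpacket_hdr *hdr = (void *)(ring + i * frame_sz);
 *	char *data = (char *)hdr + TPACKET_HDRLEN -
 *		     sizeof(struct sockaddr_ll);
 *
 *	if (hdr->tp_status == TP_STATUS_AVAILABLE) {
 *		hdr->tp_len = fill_frame(data);
 *		hdr->tp_status = TP_STATUS_SEND_REQUEST;
 *		send(fd, NULL, 0, 0);
 *	}
 */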
static inline struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
					       size_t reserve, size_t len,
					       size_t linear, int noblock,
					       int *err)
{
	struct sk_buff *skb;

	/* Under a page? Don't bother with paged skb. */
	if (prepad + len < PAGE_SIZE || !linear)
		linear = len;

	skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
				   err);
	if (!skb)
		return NULL;

	skb_reserve(skb, reserve);
	skb_put(skb, linear);
	skb->data_len = len - linear;
	skb->len += len - linear;

	return skb;
}

static int packet_snd(struct socket *sock,
		      struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
	struct sk_buff *skb;
	struct net_device *dev;
	__be16 proto;
	unsigned char *addr;
	int ifindex, err, reserve = 0;
	struct virtio_net_hdr vnet_hdr = { 0 };
	int offset = 0;
	int vnet_hdr_len;
	struct packet_sock *po = pkt_sk(sk);
	unsigned short gso_type = 0;

	/*
	 *	Get and verify the address.
	 */

	if (saddr == NULL) {
		ifindex	= po->ifindex;
		proto	= po->num;
		addr	= NULL;
	} else {
		err = -EINVAL;
		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
			goto out;
		if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
			goto out;
		ifindex	= saddr->sll_ifindex;
		proto	= saddr->sll_protocol;
		addr	= saddr->sll_addr;
	}


	dev = dev_get_by_index(sock_net(sk), ifindex);
	err = -ENXIO;
	if (dev == NULL)
		goto out_unlock;
	if (sock->type == SOCK_RAW)
		reserve = dev->hard_header_len;

	err = -ENETDOWN;
	if (!(dev->flags & IFF_UP))
		goto out_unlock;

	if (po->has_vnet_hdr) {
		vnet_hdr_len = sizeof(vnet_hdr);

		err = -EINVAL;
		if (len < vnet_hdr_len)
			goto out_unlock;

		len -= vnet_hdr_len;

		err = memcpy_fromiovec((void *)&vnet_hdr, msg->msg_iov,
				       vnet_hdr_len);
		if (err < 0)
			goto out_unlock;

		if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
		    (vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
		      vnet_hdr.hdr_len))
			vnet_hdr.hdr_len = vnet_hdr.csum_start +
						 vnet_hdr.csum_offset + 2;

		err = -EINVAL;
		if (vnet_hdr.hdr_len > len)
			goto out_unlock;

		if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
			switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
			case VIRTIO_NET_HDR_GSO_TCPV4:
				gso_type = SKB_GSO_TCPV4;
				break;
			case VIRTIO_NET_HDR_GSO_TCPV6:
				gso_type = SKB_GSO_TCPV6;
				break;
			case VIRTIO_NET_HDR_GSO_UDP:
				gso_type = SKB_GSO_UDP;
				break;
			default:
				goto out_unlock;
			}

			if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
				gso_type |= SKB_GSO_TCP_ECN;

			if (vnet_hdr.gso_size == 0)
				goto out_unlock;

		}
	}

	err = -EMSGSIZE;
	if (!gso_type && (len > dev->mtu+reserve))
		goto out_unlock;

	err = -ENOBUFS;
	skb = packet_alloc_skb(sk, LL_ALLOCATED_SPACE(dev),
			       LL_RESERVED_SPACE(dev), len, vnet_hdr.hdr_len,
			       msg->msg_flags & MSG_DONTWAIT, &err);
	if (skb == NULL)
		goto out_unlock;

	skb_set_network_header(skb, reserve);

	err = -EINVAL;
	if (sock->type == SOCK_DGRAM &&
	    (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0)
		goto out_free;

	/* Returns -EFAULT on error */
	err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
	if (err)
		goto out_free;

	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	if (po->has_vnet_hdr) {
		if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
			if (!skb_partial_csum_set(skb, vnet_hdr.csum_start,
						  vnet_hdr.csum_offset)) {
				err = -EINVAL;
				goto out_free;
			}
		}

		skb_shinfo(skb)->gso_size = vnet_hdr.gso_size;
		skb_shinfo(skb)->gso_type = gso_type;

		/* Header must be checked, and gso_segs computed. */
		skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
		skb_shinfo(skb)->gso_segs = 0;

		len += vnet_hdr_len;
	}

	/*
	 *	Now send it
	 */

	err = dev_queue_xmit(skb);
	if (err > 0 && (err = net_xmit_errno(err)) != 0)
		goto out_unlock;

	dev_put(dev);

	return len;

out_free:
	kfree_skb(skb);
out_unlock:
	if (dev)
		dev_put(dev);
out:
	return err;
}
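/*
 * Illustrative only: with PACKET_VNET_HDR enabled (see
 * packet_setsockopt() below), packet_snd() above expects each sendmsg()
 * payload to begin with a struct virtio_net_hdr describing checksum/GSO
 * state, followed by the frame itself. A minimal sketch sending one
 * frame with no offloads ("frame" and "frame_len" are hypothetical):
 *
 *	struct virtio_net_hdr vh = { .gso_type = VIRTIO_NET_HDR_GSO_NONE };
 *	struct iovec iov[2] = {
 *		{ .iov_base = &vh,   .iov_len = sizeof(vh) },
 *		{ .iov_base = frame, .iov_len = frame_len  },
 *	};
 *	struct msghdr mh = { .msg_iov = iov, .msg_iovlen = 2 };
 *
 *	sendmsg(fd, &mh, 0);
 */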
static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
			  struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	if (po->tx_ring.pg_vec)
		return tpacket_snd(po, msg);
	else
		return packet_snd(sock, msg, len);
}

/*
 *	Close a PACKET socket. This is fairly simple. We immediately go
 *	to 'closed' state and remove our protocol entry in the device list.
 */

static int packet_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po;
	struct net *net;
	struct tpacket_req req;

	if (!sk)
		return 0;

	net = sock_net(sk);
	po = pkt_sk(sk);

	spin_lock_bh(&net->packet.sklist_lock);
	sk_del_node_init_rcu(sk);
	sock_prot_inuse_add(net, sk->sk_prot, -1);
	spin_unlock_bh(&net->packet.sklist_lock);

	spin_lock(&po->bind_lock);
	if (po->running) {
		/*
		 *	Remove from protocol table
		 */
		po->running = 0;
		po->num = 0;
		__dev_remove_pack(&po->prot_hook);
		__sock_put(sk);
	}
	spin_unlock(&po->bind_lock);

	packet_flush_mclist(sk);

	memset(&req, 0, sizeof(req));

	if (po->rx_ring.pg_vec)
		packet_set_ring(sk, &req, 1, 0);

	if (po->tx_ring.pg_vec)
		packet_set_ring(sk, &req, 1, 1);

	synchronize_net();
	/*
	 *	Now the socket is dead. No more input will appear.
	 */
	sock_orphan(sk);
	sock->sk = NULL;

	/* Purge queues */

	skb_queue_purge(&sk->sk_receive_queue);
	sk_refcnt_debug_release(sk);

	sock_put(sk);
	return 0;
}

/*
 *	Attach a packet hook.
 */

static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
{
	struct packet_sock *po = pkt_sk(sk);
	/*
	 *	Detach an existing hook if present.
	 */

	lock_sock(sk);

	spin_lock(&po->bind_lock);
	if (po->running) {
		__sock_put(sk);
		po->running = 0;
		po->num = 0;
		spin_unlock(&po->bind_lock);
		dev_remove_pack(&po->prot_hook);
		spin_lock(&po->bind_lock);
	}

	po->num = protocol;
	po->prot_hook.type = protocol;
	po->prot_hook.dev = dev;
	po->ifindex = dev ? dev->ifindex : 0;

	if (protocol == 0)
		goto out_unlock;

	if (!dev || (dev->flags & IFF_UP)) {
		dev_add_pack(&po->prot_hook);
		sock_hold(sk);
		po->running = 1;
	} else {
		sk->sk_err = ENETDOWN;
		if (!sock_flag(sk, SOCK_DEAD))
			sk->sk_error_report(sk);
	}

out_unlock:
	spin_unlock(&po->bind_lock);
	release_sock(sk);
	return 0;
}

/*
 *	Bind a packet socket to a device
 */

static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
			    int addr_len)
{
	struct sock *sk = sock->sk;
	char name[15];
	struct net_device *dev;
	int err = -ENODEV;

	/*
	 *	Check legality
	 */

	if (addr_len != sizeof(struct sockaddr))
		return -EINVAL;
	strlcpy(name, uaddr->sa_data, sizeof(name));

	dev = dev_get_by_name(sock_net(sk), name);
	if (dev) {
		err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
		dev_put(dev);
	}
	return err;
}

static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
	struct sock *sk = sock->sk;
	struct net_device *dev = NULL;
	int err;


	/*
	 *	Check legality
	 */

	if (addr_len < sizeof(struct sockaddr_ll))
		return -EINVAL;
	if (sll->sll_family != AF_PACKET)
		return -EINVAL;

	if (sll->sll_ifindex) {
		err = -ENODEV;
		dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
		if (dev == NULL)
			goto out;
	}
	err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
	if (dev)
		dev_put(dev);

out:
	return err;
}

static struct proto packet_proto = {
	.name	  = "PACKET",
	.owner	  = THIS_MODULE,
	.obj_size = sizeof(struct packet_sock),
};
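/*
 * Illustrative only: the userspace side of packet_bind() above. Binding
 * restricts the socket to one interface and, optionally, one protocol;
 * per packet_do_bind(), an sll_protocol of 0 keeps the protocol given to
 * socket(). "eth0" and "fd" are hypothetical:
 *
 *	struct sockaddr_ll sll = {
 *		.sll_family   = AF_PACKET,
 *		.sll_protocol = htons(ETH_P_ALL),
 *		.sll_ifindex  = if_nametoindex("eth0"),
 *	};
 *
 *	bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 */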
/*
 *	Create a packet of type SOCK_PACKET.
 */

static int packet_create(struct net *net, struct socket *sock, int protocol,
			 int kern)
{
	struct sock *sk;
	struct packet_sock *po;
	__be16 proto = (__force __be16)protocol; /* weird, but documented */
	int err;

	if (!capable(CAP_NET_RAW))
		return -EPERM;
	if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
	    sock->type != SOCK_PACKET)
		return -ESOCKTNOSUPPORT;

	sock->state = SS_UNCONNECTED;

	err = -ENOBUFS;
	sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
	if (sk == NULL)
		goto out;

	sock->ops = &packet_ops;
	if (sock->type == SOCK_PACKET)
		sock->ops = &packet_ops_spkt;

	sock_init_data(sock, sk);

	po = pkt_sk(sk);
	sk->sk_family = PF_PACKET;
	po->num = proto;

	sk->sk_destruct = packet_sock_destruct;
	sk_refcnt_debug_inc(sk);

	/*
	 *	Attach a protocol block
	 */

	spin_lock_init(&po->bind_lock);
	mutex_init(&po->pg_vec_lock);
	po->prot_hook.func = packet_rcv;

	if (sock->type == SOCK_PACKET)
		po->prot_hook.func = packet_rcv_spkt;

	po->prot_hook.af_packet_priv = sk;

	if (proto) {
		po->prot_hook.type = proto;
		dev_add_pack(&po->prot_hook);
		sock_hold(sk);
		po->running = 1;
	}

	spin_lock_bh(&net->packet.sklist_lock);
	sk_add_node_rcu(sk, &net->packet.sklist);
	sock_prot_inuse_add(net, &packet_proto, 1);
	spin_unlock_bh(&net->packet.sklist_lock);

	return 0;
out:
	return err;
}

/*
 *	Pull a packet from our receive queue and hand it to the user.
 *	If necessary we block.
 */

static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
			  struct msghdr *msg, size_t len, int flags)
{
	struct sock *sk = sock->sk;
	struct sk_buff *skb;
	int copied, err;
	struct sockaddr_ll *sll;
	int vnet_hdr_len = 0;

	err = -EINVAL;
	if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
		goto out;

#if 0
	/* What error should we return now? EUNATTACH? */
	if (pkt_sk(sk)->ifindex < 0)
		return -ENODEV;
#endif

	/*
	 *	Call the generic datagram receiver. This handles all sorts
	 *	of horrible races and re-entrancy so we can forget about it
	 *	in the protocol layers.
	 *
	 *	Now it will return ENETDOWN if the device has just gone
	 *	down, but then it will block.
	 */

	skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);

	/*
	 *	An error occurred so return it. Because skb_recv_datagram()
	 *	handles the blocking, we don't have to see and worry about
	 *	blocking retries.
	 */

	if (skb == NULL)
		goto out;

	if (pkt_sk(sk)->has_vnet_hdr) {
		struct virtio_net_hdr vnet_hdr = { 0 };

		err = -EINVAL;
		vnet_hdr_len = sizeof(vnet_hdr);
		if ((len -= vnet_hdr_len) < 0)
			goto out_free;

		if (skb_is_gso(skb)) {
			struct skb_shared_info *sinfo = skb_shinfo(skb);

			/* This is a hint as to how much should be linear. */
			vnet_hdr.hdr_len = skb_headlen(skb);
			vnet_hdr.gso_size = sinfo->gso_size;
			if (sinfo->gso_type & SKB_GSO_TCPV4)
				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
			else if (sinfo->gso_type & SKB_GSO_TCPV6)
				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
			else if (sinfo->gso_type & SKB_GSO_UDP)
				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
			else if (sinfo->gso_type & SKB_GSO_FCOE)
				goto out_free;
			else
				BUG();
			if (sinfo->gso_type & SKB_GSO_TCP_ECN)
				vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
		} else
			vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;

		if (skb->ip_summed == CHECKSUM_PARTIAL) {
			vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
			vnet_hdr.csum_start = skb->csum_start -
							skb_headroom(skb);
			vnet_hdr.csum_offset = skb->csum_offset;
		} /* else everything is zero */

		err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr,
				     vnet_hdr_len);
		if (err < 0)
			goto out_free;
	}

	/*
	 *	If the address length field is there to be filled in, we
	 *	fill it in now.
	 */

	sll = &PACKET_SKB_CB(skb)->sa.ll;
	if (sock->type == SOCK_PACKET)
		msg->msg_namelen = sizeof(struct sockaddr_pkt);
	else
		msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);

	/*
	 *	You lose any data beyond the buffer you gave. If it worries
	 *	a user program they can ask the device for its MTU anyway.
	 */

	copied = skb->len;
	if (copied > len) {
		copied = len;
		msg->msg_flags |= MSG_TRUNC;
	}

	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
	if (err)
		goto out_free;

	sock_recv_ts_and_drops(msg, sk, skb);

	if (msg->msg_name)
		memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
		       msg->msg_namelen);

	if (pkt_sk(sk)->auxdata) {
		struct tpacket_auxdata aux;

		aux.tp_status = TP_STATUS_USER;
		if (skb->ip_summed == CHECKSUM_PARTIAL)
			aux.tp_status |= TP_STATUS_CSUMNOTREADY;
		aux.tp_len = PACKET_SKB_CB(skb)->origlen;
		aux.tp_snaplen = skb->len;
		aux.tp_mac = 0;
		aux.tp_net = skb_network_offset(skb);
		aux.tp_vlan_tci = vlan_tx_tag_get(skb);

		put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
	}

	/*
	 *	Free or return the buffer as appropriate. Again this
	 *	hides all the races and re-entrancy issues from us.
	 */
	err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);

out_free:
	skb_free_datagram(sk, skb);
out:
	return err;
}
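/*
 * Illustrative only: reading the tpacket_auxdata control message that
 * packet_recvmsg() above emits once PACKET_AUXDATA is enabled. In the
 * cmsg, tp_len is the original packet length and tp_snaplen is how much
 * was actually delivered. "buf" and "fd" are hypothetical:
 *
 *	int one = 1;
 *	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
 *	char cbuf[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
 *	struct msghdr mh = {
 *		.msg_iov = &iov, .msg_iovlen = 1,
 *		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
 *	};
 *	struct cmsghdr *cmsg;
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, &one, sizeof(one));
 *	recvmsg(fd, &mh, 0);
 *	for (cmsg = CMSG_FIRSTHDR(&mh); cmsg; cmsg = CMSG_NXTHDR(&mh, cmsg)) {
 *		if (cmsg->cmsg_level == SOL_PACKET &&
 *		    cmsg->cmsg_type == PACKET_AUXDATA) {
 *			struct tpacket_auxdata *aux =
 *				(void *)CMSG_DATA(cmsg);
 *			use_aux(aux->tp_len, aux->tp_snaplen);
 *		}
 *	}
 */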
static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
			       int *uaddr_len, int peer)
{
	struct net_device *dev;
	struct sock *sk = sock->sk;

	if (peer)
		return -EOPNOTSUPP;

	uaddr->sa_family = AF_PACKET;
	rcu_read_lock();
	dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
	if (dev)
		strlcpy(uaddr->sa_data, dev->name, 15);
	else
		memset(uaddr->sa_data, 0, 14);
	rcu_read_unlock();
	*uaddr_len = sizeof(*uaddr);

	return 0;
}

static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
			  int *uaddr_len, int peer)
{
	struct net_device *dev;
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);

	if (peer)
		return -EOPNOTSUPP;

	sll->sll_family = AF_PACKET;
	sll->sll_ifindex = po->ifindex;
	sll->sll_protocol = po->num;
	rcu_read_lock();
	dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
	if (dev) {
		sll->sll_hatype = dev->type;
		sll->sll_halen = dev->addr_len;
		memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
	} else {
		sll->sll_hatype = 0;	/* Bad: we have no ARPHRD_UNSPEC */
		sll->sll_halen = 0;
	}
	rcu_read_unlock();
	*uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;

	return 0;
}

static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
			 int what)
{
	switch (i->type) {
	case PACKET_MR_MULTICAST:
		if (i->alen != dev->addr_len)
			return -EINVAL;
		if (what > 0)
			return dev_mc_add(dev, i->addr, i->alen, 0);
		else
			return dev_mc_delete(dev, i->addr, i->alen, 0);
		break;
	case PACKET_MR_PROMISC:
		return dev_set_promiscuity(dev, what);
		break;
	case PACKET_MR_ALLMULTI:
		return dev_set_allmulti(dev, what);
		break;
	case PACKET_MR_UNICAST:
		if (i->alen != dev->addr_len)
			return -EINVAL;
		if (what > 0)
			return dev_unicast_add(dev, i->addr);
		else
			return dev_unicast_delete(dev, i->addr);
		break;
	default:
		break;
	}
	return 0;
}

static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
{
	for ( ; i; i = i->next) {
		if (i->ifindex == dev->ifindex)
			packet_dev_mc(dev, i, what);
	}
}

static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_mclist *ml, *i;
	struct net_device *dev;
	int err;

	rtnl_lock();

	err = -ENODEV;
	dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
	if (!dev)
		goto done;

	err = -EINVAL;
	if (mreq->mr_alen > dev->addr_len)
		goto done;

	err = -ENOBUFS;
	i = kmalloc(sizeof(*i), GFP_KERNEL);
	if (i == NULL)
		goto done;

	err = 0;
	for (ml = po->mclist; ml; ml = ml->next) {
		if (ml->ifindex == mreq->mr_ifindex &&
		    ml->type == mreq->mr_type &&
		    ml->alen == mreq->mr_alen &&
		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
			ml->count++;
			/* Free the new element ... */
			kfree(i);
			goto done;
		}
	}

	i->type = mreq->mr_type;
	i->ifindex = mreq->mr_ifindex;
	i->alen = mreq->mr_alen;
	memcpy(i->addr, mreq->mr_address, i->alen);
	i->count = 1;
	i->next = po->mclist;
	po->mclist = i;
	err = packet_dev_mc(dev, i, 1);
	if (err) {
		po->mclist = i->next;
		kfree(i);
	}

done:
	rtnl_unlock();
	return err;
}

static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
{
	struct packet_mclist *ml, **mlp;

	rtnl_lock();

	for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
		if (ml->ifindex == mreq->mr_ifindex &&
		    ml->type == mreq->mr_type &&
		    ml->alen == mreq->mr_alen &&
		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
			if (--ml->count == 0) {
				struct net_device *dev;
				*mlp = ml->next;
				dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
				if (dev)
					packet_dev_mc(dev, ml, -1);
				kfree(ml);
			}
			rtnl_unlock();
			return 0;
		}
	}
	rtnl_unlock();
	return -EADDRNOTAVAIL;
}

static void packet_flush_mclist(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_mclist *ml;

	if (!po->mclist)
		return;

	rtnl_lock();
	while ((ml = po->mclist) != NULL) {
		struct net_device *dev;

		po->mclist = ml->next;
		dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
		if (dev != NULL)
			packet_dev_mc(dev, ml, -1);
		kfree(ml);
	}
	rtnl_unlock();
}
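/*
 * Illustrative only: the userspace request that reaches packet_mc_add()
 * above through packet_setsockopt() below, here putting the interface
 * into promiscuous mode via the refcounted PACKET_MR_PROMISC membership
 * ("ifindex" and "fd" are hypothetical):
 *
 *	struct packet_mreq mreq = {
 *		.mr_ifindex = ifindex,
 *		.mr_type    = PACKET_MR_PROMISC,
 *	};
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
 *		   &mreq, sizeof(mreq));
 */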
static int
packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	int ret;

	if (level != SOL_PACKET)
		return -ENOPROTOOPT;

	switch (optname) {
	case PACKET_ADD_MEMBERSHIP:
	case PACKET_DROP_MEMBERSHIP:
	{
		struct packet_mreq_max mreq;
		int len = optlen;
		memset(&mreq, 0, sizeof(mreq));
		if (len < sizeof(struct packet_mreq))
			return -EINVAL;
		if (len > sizeof(mreq))
			len = sizeof(mreq);
		if (copy_from_user(&mreq, optval, len))
			return -EFAULT;
		if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
			return -EINVAL;
		if (optname == PACKET_ADD_MEMBERSHIP)
			ret = packet_mc_add(sk, &mreq);
		else
			ret = packet_mc_drop(sk, &mreq);
		return ret;
	}

	case PACKET_RX_RING:
	case PACKET_TX_RING:
	{
		struct tpacket_req req;

		if (optlen < sizeof(req))
			return -EINVAL;
		if (pkt_sk(sk)->has_vnet_hdr)
			return -EINVAL;
		if (copy_from_user(&req, optval, sizeof(req)))
			return -EFAULT;
		return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING);
	}
	case PACKET_COPY_THRESH:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		pkt_sk(sk)->copy_thresh = val;
		return 0;
	}
	case PACKET_VERSION:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
			return -EBUSY;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;
		switch (val) {
		case TPACKET_V1:
		case TPACKET_V2:
			po->tp_version = val;
			return 0;
		default:
			return -EINVAL;
		}
	}
	case PACKET_RESERVE:
	{
		unsigned int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
			return -EBUSY;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;
		po->tp_reserve = val;
		return 0;
	}
	case PACKET_LOSS:
	{
		unsigned int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
			return -EBUSY;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;
		po->tp_loss = !!val;
		return 0;
	}
	case PACKET_AUXDATA:
	{
		int val;

		if (optlen < sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		po->auxdata = !!val;
		return 0;
	}
	case PACKET_ORIGDEV:
	{
		int val;

		if (optlen < sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		po->origdev = !!val;
		return 0;
	}
	case PACKET_VNET_HDR:
	{
		int val;

		if (sock->type != SOCK_RAW)
			return -EINVAL;
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
			return -EBUSY;
		if (optlen < sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		po->has_vnet_hdr = !!val;
		return 0;
	}
	default:
		return -ENOPROTOOPT;
	}
}

static int packet_getsockopt(struct socket *sock, int level, int optname,
			     char __user *optval, int __user *optlen)
{
	int len;
	int val;
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	void *data;
	struct tpacket_stats st;

	if (level != SOL_PACKET)
		return -ENOPROTOOPT;

	if (get_user(len, optlen))
		return -EFAULT;

	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case PACKET_STATISTICS:
		if (len > sizeof(struct tpacket_stats))
			len = sizeof(struct tpacket_stats);
		spin_lock_bh(&sk->sk_receive_queue.lock);
		st = po->stats;
		memset(&po->stats, 0, sizeof(st));
		spin_unlock_bh(&sk->sk_receive_queue.lock);
		st.tp_packets += st.tp_drops;

		data = &st;
		break;
	case PACKET_AUXDATA:
		if (len > sizeof(int))
			len = sizeof(int);
		val = po->auxdata;

		data = &val;
		break;
	case PACKET_ORIGDEV:
		if (len > sizeof(int))
			len = sizeof(int);
		val = po->origdev;

		data = &val;
		break;
	case PACKET_VNET_HDR:
		if (len > sizeof(int))
			len = sizeof(int);
		val = po->has_vnet_hdr;

		data = &val;
		break;
	case PACKET_VERSION:
		if (len > sizeof(int))
			len = sizeof(int);
		val = po->tp_version;
		data = &val;
		break;
	case PACKET_HDRLEN:
		if (len > sizeof(int))
			len = sizeof(int);
		if (copy_from_user(&val, optval, len))
			return -EFAULT;
		switch (val) {
		case TPACKET_V1:
			val = sizeof(struct tpacket_hdr);
			break;
		case TPACKET_V2:
			val = sizeof(struct tpacket2_hdr);
			break;
		default:
			return -EINVAL;
		}
		data = &val;
		break;
	case PACKET_RESERVE:
		if (len > sizeof(unsigned int))
			len = sizeof(unsigned int);
		val = po->tp_reserve;
		data = &val;
		break;
	case PACKET_LOSS:
		if (len > sizeof(unsigned int))
			len = sizeof(unsigned int);
		val = po->tp_loss;
		data = &val;
		break;
	default:
		return -ENOPROTOOPT;
	}
	if (put_user(len, optlen))
		return -EFAULT;
	if (copy_to_user(optval, data, len))
		return -EFAULT;
	return 0;
}


static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
{
	struct sock *sk;
	struct hlist_node *node;
	struct net_device *dev = data;
	struct net *net = dev_net(dev);

	rcu_read_lock();
	sk_for_each_rcu(sk, node, &net->packet.sklist) {
		struct packet_sock *po = pkt_sk(sk);

		switch (msg) {
		case NETDEV_UNREGISTER:
			if (po->mclist)
				packet_dev_mclist(dev, po->mclist, -1);
			/* fallthrough */

		case NETDEV_DOWN:
			if (dev->ifindex == po->ifindex) {
				spin_lock(&po->bind_lock);
				if (po->running) {
					__dev_remove_pack(&po->prot_hook);
					__sock_put(sk);
					po->running = 0;
					sk->sk_err = ENETDOWN;
					if (!sock_flag(sk, SOCK_DEAD))
						sk->sk_error_report(sk);
				}
				if (msg == NETDEV_UNREGISTER) {
					po->ifindex = -1;
					po->prot_hook.dev = NULL;
				}
				spin_unlock(&po->bind_lock);
			}
			break;
		case NETDEV_UP:
			if (dev->ifindex == po->ifindex) {
				spin_lock(&po->bind_lock);
				if (po->num && !po->running) {
					dev_add_pack(&po->prot_hook);
					sock_hold(sk);
					po->running = 1;
				}
				spin_unlock(&po->bind_lock);
			}
			break;
		}
	}
	rcu_read_unlock();
	return NOTIFY_DONE;
}


static int packet_ioctl(struct socket *sock, unsigned int cmd,
			unsigned long arg)
{
	struct sock *sk = sock->sk;

	switch (cmd) {
	case SIOCOUTQ:
	{
		int amount = sk_wmem_alloc_get(sk);

		return put_user(amount, (int __user *)arg);
	}
	case SIOCINQ:
	{
		struct sk_buff *skb;
		int amount = 0;

		spin_lock_bh(&sk->sk_receive_queue.lock);
		skb = skb_peek(&sk->sk_receive_queue);
		if (skb)
			amount = skb->len;
		spin_unlock_bh(&sk->sk_receive_queue.lock);
		return put_user(amount, (int __user *)arg);
	}
	case SIOCGSTAMP:
		return sock_get_timestamp(sk, (struct timeval __user *)arg);
	case SIOCGSTAMPNS:
		return sock_get_timestampns(sk, (struct timespec __user *)arg);

#ifdef CONFIG_INET
	case SIOCADDRT:
	case SIOCDELRT:
	case SIOCDARP:
	case SIOCGARP:
	case SIOCSARP:
	case SIOCGIFADDR:
	case SIOCSIFADDR:
	case SIOCGIFBRDADDR:
	case SIOCSIFBRDADDR:
	case SIOCGIFNETMASK:
	case SIOCSIFNETMASK:
	case SIOCGIFDSTADDR:
	case SIOCSIFDSTADDR:
	case SIOCSIFFLAGS:
		if (!net_eq(sock_net(sk), &init_net))
			return -ENOIOCTLCMD;
		return inet_dgram_ops.ioctl(sock, cmd, arg);
#endif

	default:
		return -ENOIOCTLCMD;
	}
	return 0;
}

static unsigned int packet_poll(struct file *file, struct socket *sock,
				poll_table *wait)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned int mask = datagram_poll(file, sock, wait);

	spin_lock_bh(&sk->sk_receive_queue.lock);
	if (po->rx_ring.pg_vec) {
		if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
			mask |= POLLIN | POLLRDNORM;
	}
	spin_unlock_bh(&sk->sk_receive_queue.lock);
	spin_lock_bh(&sk->sk_write_queue.lock);
	if (po->tx_ring.pg_vec) {
		if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
			mask |= POLLOUT | POLLWRNORM;
	}
	spin_unlock_bh(&sk->sk_write_queue.lock);
	return mask;
}
/* Dirty? Well, I still have not learned a better way to account
 * for user mmaps.
 */

static void packet_mm_open(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct socket *sock = file->private_data;
	struct sock *sk = sock->sk;

	if (sk)
		atomic_inc(&pkt_sk(sk)->mapped);
}

static void packet_mm_close(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct socket *sock = file->private_data;
	struct sock *sk = sock->sk;

	if (sk)
		atomic_dec(&pkt_sk(sk)->mapped);
}

static const struct vm_operations_struct packet_mmap_ops = {
	.open	=	packet_mm_open,
	.close	=	packet_mm_close,
};

static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
{
	int i;

	for (i = 0; i < len; i++) {
		if (likely(pg_vec[i]))
			free_pages((unsigned long) pg_vec[i], order);
	}
	kfree(pg_vec);
}

static inline char *alloc_one_pg_vec_page(unsigned long order)
{
	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN;

	return (char *) __get_free_pages(gfp_flags, order);
}

static char **alloc_pg_vec(struct tpacket_req *req, int order)
{
	unsigned int block_nr = req->tp_block_nr;
	char **pg_vec;
	int i;

	pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
	if (unlikely(!pg_vec))
		goto out;

	for (i = 0; i < block_nr; i++) {
		pg_vec[i] = alloc_one_pg_vec_page(order);
		if (unlikely(!pg_vec[i]))
			goto out_free_pgvec;
	}

out:
	return pg_vec;

out_free_pgvec:
	free_pg_vec(pg_vec, order, block_nr);
	pg_vec = NULL;
	goto out;
}

static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
		int closing, int tx_ring)
{
	char **pg_vec = NULL;
	struct packet_sock *po = pkt_sk(sk);
	int was_running, order = 0;
	struct packet_ring_buffer *rb;
	struct sk_buff_head *rb_queue;
	__be16 num;
	int err;

	rb = tx_ring ? &po->tx_ring : &po->rx_ring;

static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
		int closing, int tx_ring)
{
	char **pg_vec = NULL;
	struct packet_sock *po = pkt_sk(sk);
	int was_running, order = 0;
	struct packet_ring_buffer *rb;
	struct sk_buff_head *rb_queue;
	__be16 num;
	int err;

	rb = tx_ring ? &po->tx_ring : &po->rx_ring;
	rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;

	err = -EBUSY;
	if (!closing) {
		if (atomic_read(&po->mapped))
			goto out;
		if (atomic_read(&rb->pending))
			goto out;
	}

	if (req->tp_block_nr) {
		/* Sanity tests and some calculations */
		err = -EBUSY;
		if (unlikely(rb->pg_vec))
			goto out;

		switch (po->tp_version) {
		case TPACKET_V1:
			po->tp_hdrlen = TPACKET_HDRLEN;
			break;
		case TPACKET_V2:
			po->tp_hdrlen = TPACKET2_HDRLEN;
			break;
		}

		err = -EINVAL;
		if (unlikely((int)req->tp_block_size <= 0))
			goto out;
		if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
			goto out;
		if (unlikely(req->tp_frame_size < po->tp_hdrlen +
					po->tp_reserve))
			goto out;
		if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
			goto out;

		rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
		if (unlikely(rb->frames_per_block <= 0))
			goto out;
		if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
					req->tp_frame_nr))
			goto out;

		err = -ENOMEM;
		order = get_order(req->tp_block_size);
		pg_vec = alloc_pg_vec(req, order);
		if (unlikely(!pg_vec))
			goto out;
	}
	/* Done */
	else {
		err = -EINVAL;
		if (unlikely(req->tp_frame_nr))
			goto out;
	}

	lock_sock(sk);

	/* Detach socket from network */
	spin_lock(&po->bind_lock);
	was_running = po->running;
	num = po->num;
	if (was_running) {
		__dev_remove_pack(&po->prot_hook);
		po->num = 0;
		po->running = 0;
		__sock_put(sk);
	}
	spin_unlock(&po->bind_lock);

	synchronize_net();

	err = -EBUSY;
	mutex_lock(&po->pg_vec_lock);
	if (closing || atomic_read(&po->mapped) == 0) {
		err = 0;
#define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
		spin_lock_bh(&rb_queue->lock);
		pg_vec = XC(rb->pg_vec, pg_vec);
		rb->frame_max = (req->tp_frame_nr - 1);
		rb->head = 0;
		rb->frame_size = req->tp_frame_size;
		spin_unlock_bh(&rb_queue->lock);

		order = XC(rb->pg_vec_order, order);
		req->tp_block_nr = XC(rb->pg_vec_len, req->tp_block_nr);

		rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
		po->prot_hook.func = (po->rx_ring.pg_vec) ?
						tpacket_rcv : packet_rcv;
		skb_queue_purge(rb_queue);
#undef XC
		if (atomic_read(&po->mapped))
			pr_err("packet_mmap: vma is busy: %d\n",
			       atomic_read(&po->mapped));
	}
	mutex_unlock(&po->pg_vec_lock);

	spin_lock(&po->bind_lock);
	if (was_running && !po->running) {
		sock_hold(sk);
		po->running = 1;
		po->num = num;
		dev_add_pack(&po->prot_hook);
	}
	spin_unlock(&po->bind_lock);

	release_sock(sk);

	if (pg_vec)
		free_pg_vec(pg_vec, order, req->tp_block_nr);
out:
	return err;
}
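
/*
 * Illustrative userspace sketch (not part of this file): a tpacket_req
 * that passes the sanity checks above, assuming a 4 KiB PAGE_SIZE and the
 * default TPACKET_V1 header.  The block size is a multiple of PAGE_SIZE,
 * the frame size is TPACKET_ALIGNMENT-aligned and large enough for the
 * header, and frames_per_block (4096/2048 = 2) times tp_block_nr equals
 * tp_frame_nr:
 *
 *	struct tpacket_req req = {
 *		.tp_block_size = 4096,
 *		.tp_block_nr   = 64,
 *		.tp_frame_size = 2048,
 *		.tp_frame_nr   = 128,
 *	};
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 */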

static int packet_mmap(struct file *file, struct socket *sock,
		struct vm_area_struct *vma)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned long size, expected_size;
	struct packet_ring_buffer *rb;
	unsigned long start;
	int err = -EINVAL;
	int i;

	if (vma->vm_pgoff)
		return -EINVAL;

	mutex_lock(&po->pg_vec_lock);

	expected_size = 0;
	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
		if (rb->pg_vec) {
			expected_size += rb->pg_vec_len
						* rb->pg_vec_pages
						* PAGE_SIZE;
		}
	}

	if (expected_size == 0)
		goto out;

	size = vma->vm_end - vma->vm_start;
	if (size != expected_size)
		goto out;

	start = vma->vm_start;
	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
		if (rb->pg_vec == NULL)
			continue;

		for (i = 0; i < rb->pg_vec_len; i++) {
			struct page *page = virt_to_page(rb->pg_vec[i]);
			int pg_num;

			for (pg_num = 0; pg_num < rb->pg_vec_pages;
					pg_num++, page++) {
				err = vm_insert_page(vma, start, page);
				if (unlikely(err))
					goto out;
				start += PAGE_SIZE;
			}
		}
	}

	atomic_inc(&po->mapped);
	vma->vm_ops = &packet_mmap_ops;
	err = 0;

out:
	mutex_unlock(&po->pg_vec_lock);
	return err;
}
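
/*
 * Illustrative userspace sketch (not part of this file): the rings are
 * mapped with a single mmap() at offset 0 whose length must equal the
 * combined size of the RX and TX rings, exactly as packet_mmap() checks
 * above; the RX ring, if present, comes first, immediately followed by
 * the TX ring.  "rx_req" and "tx_req" are the hypothetical tpacket_req
 * structures used to configure the two rings:
 *
 *	size_t len = rx_req.tp_block_size * rx_req.tp_block_nr +
 *		     tx_req.tp_block_size * tx_req.tp_block_nr;
 *	void *ring = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			  MAP_SHARED, fd, 0);
 */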

static const struct proto_ops packet_ops_spkt = {
	.family = PF_PACKET,
	.owner = THIS_MODULE,
	.release = packet_release,
	.bind = packet_bind_spkt,
	.connect = sock_no_connect,
	.socketpair = sock_no_socketpair,
	.accept = sock_no_accept,
	.getname = packet_getname_spkt,
	.poll = datagram_poll,
	.ioctl = packet_ioctl,
	.listen = sock_no_listen,
	.shutdown = sock_no_shutdown,
	.setsockopt = sock_no_setsockopt,
	.getsockopt = sock_no_getsockopt,
	.sendmsg = packet_sendmsg_spkt,
	.recvmsg = packet_recvmsg,
	.mmap = sock_no_mmap,
	.sendpage = sock_no_sendpage,
};

static const struct proto_ops packet_ops = {
	.family = PF_PACKET,
	.owner = THIS_MODULE,
	.release = packet_release,
	.bind = packet_bind,
	.connect = sock_no_connect,
	.socketpair = sock_no_socketpair,
	.accept = sock_no_accept,
	.getname = packet_getname,
	.poll = packet_poll,
	.ioctl = packet_ioctl,
	.listen = sock_no_listen,
	.shutdown = sock_no_shutdown,
	.setsockopt = packet_setsockopt,
	.getsockopt = packet_getsockopt,
	.sendmsg = packet_sendmsg,
	.recvmsg = packet_recvmsg,
	.mmap = packet_mmap,
	.sendpage = sock_no_sendpage,
};

static const struct net_proto_family packet_family_ops = {
	.family = PF_PACKET,
	.create = packet_create,
	.owner = THIS_MODULE,
};

static struct notifier_block packet_netdev_notifier = {
	.notifier_call = packet_notifier,
};

#ifdef CONFIG_PROC_FS

static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	struct net *net = seq_file_net(seq);

	rcu_read_lock();
	return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
}

static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct net *net = seq_file_net(seq);
	return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
}

static void packet_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}

static int packet_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
	else {
		struct sock *s = sk_entry(v);
		const struct packet_sock *po = pkt_sk(s);

		seq_printf(seq,
			   "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
			   s,
			   atomic_read(&s->sk_refcnt),
			   s->sk_type,
			   ntohs(po->num),
			   po->ifindex,
			   po->running,
			   atomic_read(&s->sk_rmem_alloc),
			   sock_i_uid(s),
			   sock_i_ino(s));
	}

	return 0;
}

static const struct seq_operations packet_seq_ops = {
	.start = packet_seq_start,
	.next = packet_seq_next,
	.stop = packet_seq_stop,
	.show = packet_seq_show,
};

static int packet_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &packet_seq_ops,
			    sizeof(struct seq_net_private));
}

static const struct file_operations packet_seq_fops = {
	.owner = THIS_MODULE,
	.open = packet_seq_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = seq_release_net,
};

#endif

static int __net_init packet_net_init(struct net *net)
{
	spin_lock_init(&net->packet.sklist_lock);
	INIT_HLIST_HEAD(&net->packet.sklist);

	if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
		return -ENOMEM;

	return 0;
}

static void __net_exit packet_net_exit(struct net *net)
{
	proc_net_remove(net, "packet");
}

static struct pernet_operations packet_net_ops = {
	.init = packet_net_init,
	.exit = packet_net_exit,
};


static void __exit packet_exit(void)
{
	unregister_netdevice_notifier(&packet_netdev_notifier);
	unregister_pernet_subsys(&packet_net_ops);
	sock_unregister(PF_PACKET);
	proto_unregister(&packet_proto);
}

static int __init packet_init(void)
{
	int rc = proto_register(&packet_proto, 0);

	if (rc != 0)
		goto out;

	sock_register(&packet_family_ops);
	register_pernet_subsys(&packet_net_ops);
	register_netdevice_notifier(&packet_netdev_notifier);
out:
	return rc;
}

module_init(packet_init);
module_exit(packet_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_PACKET);
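
/*
 * Illustrative userspace sketch (not part of this file): a socket
 * serviced by this module is created with, e.g.,
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *
 * which requires CAP_NET_RAW.  Each open packet socket then appears as
 * one row of /proc/net/packet, in the format printed by
 * packet_seq_show() above.
 */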