ip_output.c revision 4b30b1c6a3e58dc74f2dbb0aa39f16a23cfcdd56
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The Internet Protocol (IP) output module.
 *
 * Version:	$Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Donald Becker, <becker@super.org>
 *		Alan Cox, <Alan.Cox@linux.org>
 *		Richard Underwood
 *		Stefan Becker, <stefanb@yello.ping.de>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 *	See ip_input.c for original log
 *
 *	Fixes:
 *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
 *		Mike Kilburn	:	htons() missing in ip_build_xmit.
 *		Bradford Johnson:	Fix faulty handling of some frames when
 *					no route is found.
 *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
 *					(in case the packet is not accepted by
 *					output firewall rules)
 *		Mike McLagan	:	Routing by source
 *		Alexey Kuznetsov:	use new route cache
 *		Andi Kleen:		Fix broken PMTU recovery and remove
 *					some redundant tests.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *		Andi Kleen	: 	Replace ip_reply with ip_send_reply.
 *		Andi Kleen	:	Split fast and slow ip_build_xmit path
 *					for decreased register pressure on x86
 *					and more readability.
 *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
 *					silently drop skb instead of failing with -EPERM.
 *		Detlev Wengorz	:	Copy protocol for fragments.
 *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
 *					datagrams.
 *		Hirokazu Takahashi:	sendfile() on UDP works now.
45 */ 46 47#include <asm/uaccess.h> 48#include <asm/system.h> 49#include <linux/module.h> 50#include <linux/types.h> 51#include <linux/kernel.h> 52#include <linux/sched.h> 53#include <linux/mm.h> 54#include <linux/string.h> 55#include <linux/errno.h> 56#include <linux/config.h> 57 58#include <linux/socket.h> 59#include <linux/sockios.h> 60#include <linux/in.h> 61#include <linux/inet.h> 62#include <linux/netdevice.h> 63#include <linux/etherdevice.h> 64#include <linux/proc_fs.h> 65#include <linux/stat.h> 66#include <linux/init.h> 67 68#include <net/snmp.h> 69#include <net/ip.h> 70#include <net/protocol.h> 71#include <net/route.h> 72#include <linux/skbuff.h> 73#include <net/sock.h> 74#include <net/arp.h> 75#include <net/icmp.h> 76#include <net/checksum.h> 77#include <net/inetpeer.h> 78#include <net/checksum.h> 79#include <linux/igmp.h> 80#include <linux/netfilter_ipv4.h> 81#include <linux/netfilter_bridge.h> 82#include <linux/mroute.h> 83#include <linux/netlink.h> 84#include <linux/tcp.h> 85 86int sysctl_ip_default_ttl = IPDEFTTL; 87 88/* Generate a checksum for an outgoing IP datagram. */ 89__inline__ void ip_send_check(struct iphdr *iph) 90{ 91 iph->check = 0; 92 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); 93} 94 95/* dev_loopback_xmit for use with netfilter. */ 96static int ip_dev_loopback_xmit(struct sk_buff *newskb) 97{ 98 newskb->mac.raw = newskb->data; 99 __skb_pull(newskb, newskb->nh.raw - newskb->data); 100 newskb->pkt_type = PACKET_LOOPBACK; 101 newskb->ip_summed = CHECKSUM_UNNECESSARY; 102 BUG_TRAP(newskb->dst); 103 netif_rx(newskb); 104 return 0; 105} 106 107static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst) 108{ 109 int ttl = inet->uc_ttl; 110 111 if (ttl < 0) 112 ttl = dst_metric(dst, RTAX_HOPLIMIT); 113 return ttl; 114} 115 116/* 117 * Add an ip header to a skbuff and send it out. 
118 * 119 */ 120int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, 121 u32 saddr, u32 daddr, struct ip_options *opt) 122{ 123 struct inet_sock *inet = inet_sk(sk); 124 struct rtable *rt = (struct rtable *)skb->dst; 125 struct iphdr *iph; 126 127 /* Build the IP header. */ 128 if (opt) 129 iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr) + opt->optlen); 130 else 131 iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr)); 132 133 iph->version = 4; 134 iph->ihl = 5; 135 iph->tos = inet->tos; 136 if (ip_dont_fragment(sk, &rt->u.dst)) 137 iph->frag_off = htons(IP_DF); 138 else 139 iph->frag_off = 0; 140 iph->ttl = ip_select_ttl(inet, &rt->u.dst); 141 iph->daddr = rt->rt_dst; 142 iph->saddr = rt->rt_src; 143 iph->protocol = sk->sk_protocol; 144 iph->tot_len = htons(skb->len); 145 ip_select_ident(iph, &rt->u.dst, sk); 146 skb->nh.iph = iph; 147 148 if (opt && opt->optlen) { 149 iph->ihl += opt->optlen>>2; 150 ip_options_build(skb, opt, daddr, rt, 0); 151 } 152 ip_send_check(iph); 153 154 skb->priority = sk->sk_priority; 155 156 /* Send it out. */ 157 return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, 158 dst_output); 159} 160 161EXPORT_SYMBOL_GPL(ip_build_and_send_pkt); 162 163static inline int ip_finish_output2(struct sk_buff *skb) 164{ 165 struct dst_entry *dst = skb->dst; 166 struct hh_cache *hh = dst->hh; 167 struct net_device *dev = dst->dev; 168 int hh_len = LL_RESERVED_SPACE(dev); 169 170 /* Be paranoid, rather than too clever. 
*/ 171 if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) { 172 struct sk_buff *skb2; 173 174 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev)); 175 if (skb2 == NULL) { 176 kfree_skb(skb); 177 return -ENOMEM; 178 } 179 if (skb->sk) 180 skb_set_owner_w(skb2, skb->sk); 181 kfree_skb(skb); 182 skb = skb2; 183 } 184 185 if (hh) { 186 int hh_alen; 187 188 read_lock_bh(&hh->hh_lock); 189 hh_alen = HH_DATA_ALIGN(hh->hh_len); 190 memcpy(skb->data - hh_alen, hh->hh_data, hh_alen); 191 read_unlock_bh(&hh->hh_lock); 192 skb_push(skb, hh->hh_len); 193 return hh->hh_output(skb); 194 } else if (dst->neighbour) 195 return dst->neighbour->output(skb); 196 197 if (net_ratelimit()) 198 printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n"); 199 kfree_skb(skb); 200 return -EINVAL; 201} 202 203static inline int ip_finish_output(struct sk_buff *skb) 204{ 205 struct net_device *dev = skb->dst->dev; 206 207 skb->dev = dev; 208 skb->protocol = htons(ETH_P_IP); 209 210 return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev, 211 ip_finish_output2); 212} 213 214int ip_mc_output(struct sk_buff *skb) 215{ 216 struct sock *sk = skb->sk; 217 struct rtable *rt = (struct rtable*)skb->dst; 218 struct net_device *dev = rt->u.dst.dev; 219 220 /* 221 * If the indicated interface is up and running, send the packet. 222 */ 223 IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS); 224 225 skb->dev = dev; 226 skb->protocol = htons(ETH_P_IP); 227 228 /* 229 * Multicasts are looped back for other local users 230 */ 231 232 if (rt->rt_flags&RTCF_MULTICAST) { 233 if ((!sk || inet_sk(sk)->mc_loop) 234#ifdef CONFIG_IP_MROUTE 235 /* Small optimization: do not loopback not local frames, 236 which returned after forwarding; they will be dropped 237 by ip_mr_input in any case. 238 Note, that local frames are looped back to be delivered 239 to local recipients. 240 241 This check is duplicated in ip_mr_input at the moment. 
242 */ 243 && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED)) 244#endif 245 ) { 246 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); 247 if (newskb) 248 NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL, 249 newskb->dev, 250 ip_dev_loopback_xmit); 251 } 252 253 /* Multicasts with ttl 0 must not go beyond the host */ 254 255 if (skb->nh.iph->ttl == 0) { 256 kfree_skb(skb); 257 return 0; 258 } 259 } 260 261 if (rt->rt_flags&RTCF_BROADCAST) { 262 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); 263 if (newskb) 264 NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL, 265 newskb->dev, ip_dev_loopback_xmit); 266 } 267 268 if (skb->len > dst_mtu(&rt->u.dst)) 269 return ip_fragment(skb, ip_finish_output); 270 else 271 return ip_finish_output(skb); 272} 273 274int ip_output(struct sk_buff *skb) 275{ 276 IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS); 277 278 if (skb->len > dst_mtu(skb->dst) && 279 !(skb_shinfo(skb)->ufo_size || skb_shinfo(skb)->tso_size)) 280 return ip_fragment(skb, ip_finish_output); 281 else 282 return ip_finish_output(skb); 283} 284 285int ip_queue_xmit(struct sk_buff *skb, int ipfragok) 286{ 287 struct sock *sk = skb->sk; 288 struct inet_sock *inet = inet_sk(sk); 289 struct ip_options *opt = inet->opt; 290 struct rtable *rt; 291 struct iphdr *iph; 292 293 /* Skip all of this if the packet is already routed, 294 * f.e. by something like SCTP. 295 */ 296 rt = (struct rtable *) skb->dst; 297 if (rt != NULL) 298 goto packet_routed; 299 300 /* Make sure we can route this packet. */ 301 rt = (struct rtable *)__sk_dst_check(sk, 0); 302 if (rt == NULL) { 303 u32 daddr; 304 305 /* Use correct destination address if we have options. 
*/ 306 daddr = inet->daddr; 307 if(opt && opt->srr) 308 daddr = opt->faddr; 309 310 { 311 struct flowi fl = { .oif = sk->sk_bound_dev_if, 312 .nl_u = { .ip4_u = 313 { .daddr = daddr, 314 .saddr = inet->saddr, 315 .tos = RT_CONN_FLAGS(sk) } }, 316 .proto = sk->sk_protocol, 317 .uli_u = { .ports = 318 { .sport = inet->sport, 319 .dport = inet->dport } } }; 320 321 /* If this fails, retransmit mechanism of transport layer will 322 * keep trying until route appears or the connection times 323 * itself out. 324 */ 325 if (ip_route_output_flow(&rt, &fl, sk, 0)) 326 goto no_route; 327 } 328 sk_setup_caps(sk, &rt->u.dst); 329 } 330 skb->dst = dst_clone(&rt->u.dst); 331 332packet_routed: 333 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) 334 goto no_route; 335 336 /* OK, we know where to send it, allocate and build IP header. */ 337 iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0)); 338 *((__u16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff)); 339 iph->tot_len = htons(skb->len); 340 if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok) 341 iph->frag_off = htons(IP_DF); 342 else 343 iph->frag_off = 0; 344 iph->ttl = ip_select_ttl(inet, &rt->u.dst); 345 iph->protocol = sk->sk_protocol; 346 iph->saddr = rt->rt_src; 347 iph->daddr = rt->rt_dst; 348 skb->nh.iph = iph; 349 /* Transport layer set skb->h.foo itself. */ 350 351 if (opt && opt->optlen) { 352 iph->ihl += opt->optlen >> 2; 353 ip_options_build(skb, opt, inet->daddr, rt, 0); 354 } 355 356 ip_select_ident_more(iph, &rt->u.dst, sk, 357 (skb_shinfo(skb)->tso_segs ?: 1) - 1); 358 359 /* Add an IP checksum. 
*/ 360 ip_send_check(iph); 361 362 skb->priority = sk->sk_priority; 363 364 return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, 365 dst_output); 366 367no_route: 368 IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES); 369 kfree_skb(skb); 370 return -EHOSTUNREACH; 371} 372 373 374static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) 375{ 376 to->pkt_type = from->pkt_type; 377 to->priority = from->priority; 378 to->protocol = from->protocol; 379 dst_release(to->dst); 380 to->dst = dst_clone(from->dst); 381 to->dev = from->dev; 382 383 /* Copy the flags to each fragment. */ 384 IPCB(to)->flags = IPCB(from)->flags; 385 386#ifdef CONFIG_NET_SCHED 387 to->tc_index = from->tc_index; 388#endif 389#ifdef CONFIG_NETFILTER 390 to->nfmark = from->nfmark; 391 /* Connection association is same as pre-frag packet */ 392 nf_conntrack_put(to->nfct); 393 to->nfct = from->nfct; 394 nf_conntrack_get(to->nfct); 395 to->nfctinfo = from->nfctinfo; 396#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) 397 to->ipvs_property = from->ipvs_property; 398#endif 399#ifdef CONFIG_BRIDGE_NETFILTER 400 nf_bridge_put(to->nf_bridge); 401 to->nf_bridge = from->nf_bridge; 402 nf_bridge_get(to->nf_bridge); 403#endif 404#endif 405} 406 407/* 408 * This IP datagram is too large to be sent in one piece. Break it up into 409 * smaller pieces (each of size equal to IP header plus 410 * a block of the data of the original IP data part) that will yet fit in a 411 * single device frame, and queue such a frame for sending. 412 */ 413 414int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) 415{ 416 struct iphdr *iph; 417 int raw = 0; 418 int ptr; 419 struct net_device *dev; 420 struct sk_buff *skb2; 421 unsigned int mtu, hlen, left, len, ll_rs; 422 int offset; 423 int not_last_frag; 424 struct rtable *rt = (struct rtable*)skb->dst; 425 int err = 0; 426 427 dev = rt->u.dst.dev; 428 429 /* 430 * Point into the IP datagram header. 
431 */ 432 433 iph = skb->nh.iph; 434 435 if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) { 436 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, 437 htonl(dst_mtu(&rt->u.dst))); 438 kfree_skb(skb); 439 return -EMSGSIZE; 440 } 441 442 /* 443 * Setup starting values. 444 */ 445 446 hlen = iph->ihl * 4; 447 mtu = dst_mtu(&rt->u.dst) - hlen; /* Size of data space */ 448 449 /* When frag_list is given, use it. First, check its validity: 450 * some transformers could create wrong frag_list or break existing 451 * one, it is not prohibited. In this case fall back to copying. 452 * 453 * LATER: this step can be merged to real generation of fragments, 454 * we can switch to copy when see the first bad fragment. 455 */ 456 if (skb_shinfo(skb)->frag_list) { 457 struct sk_buff *frag; 458 int first_len = skb_pagelen(skb); 459 460 if (first_len - hlen > mtu || 461 ((first_len - hlen) & 7) || 462 (iph->frag_off & htons(IP_MF|IP_OFFSET)) || 463 skb_cloned(skb)) 464 goto slow_path; 465 466 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) { 467 /* Correct geometry. */ 468 if (frag->len > mtu || 469 ((frag->len & 7) && frag->next) || 470 skb_headroom(frag) < hlen) 471 goto slow_path; 472 473 /* Partially cloned skb? */ 474 if (skb_shared(frag)) 475 goto slow_path; 476 477 BUG_ON(frag->sk); 478 if (skb->sk) { 479 sock_hold(skb->sk); 480 frag->sk = skb->sk; 481 frag->destructor = sock_wfree; 482 skb->truesize -= frag->truesize; 483 } 484 } 485 486 /* Everything is OK. Generate! */ 487 488 err = 0; 489 offset = 0; 490 frag = skb_shinfo(skb)->frag_list; 491 skb_shinfo(skb)->frag_list = NULL; 492 skb->data_len = first_len - skb_headlen(skb); 493 skb->len = first_len; 494 iph->tot_len = htons(first_len); 495 iph->frag_off = htons(IP_MF); 496 ip_send_check(iph); 497 498 for (;;) { 499 /* Prepare header of the next frame, 500 * before previous one went down. 
*/ 501 if (frag) { 502 frag->ip_summed = CHECKSUM_NONE; 503 frag->h.raw = frag->data; 504 frag->nh.raw = __skb_push(frag, hlen); 505 memcpy(frag->nh.raw, iph, hlen); 506 iph = frag->nh.iph; 507 iph->tot_len = htons(frag->len); 508 ip_copy_metadata(frag, skb); 509 if (offset == 0) 510 ip_options_fragment(frag); 511 offset += skb->len - hlen; 512 iph->frag_off = htons(offset>>3); 513 if (frag->next != NULL) 514 iph->frag_off |= htons(IP_MF); 515 /* Ready, complete checksum */ 516 ip_send_check(iph); 517 } 518 519 err = output(skb); 520 521 if (err || !frag) 522 break; 523 524 skb = frag; 525 frag = skb->next; 526 skb->next = NULL; 527 } 528 529 if (err == 0) { 530 IP_INC_STATS(IPSTATS_MIB_FRAGOKS); 531 return 0; 532 } 533 534 while (frag) { 535 skb = frag->next; 536 kfree_skb(frag); 537 frag = skb; 538 } 539 IP_INC_STATS(IPSTATS_MIB_FRAGFAILS); 540 return err; 541 } 542 543slow_path: 544 left = skb->len - hlen; /* Space per frame */ 545 ptr = raw + hlen; /* Where to start from */ 546 547#ifdef CONFIG_BRIDGE_NETFILTER 548 /* for bridged IP traffic encapsulated inside f.e. a vlan header, 549 * we need to make room for the encapsulating header */ 550 ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, nf_bridge_pad(skb)); 551 mtu -= nf_bridge_pad(skb); 552#else 553 ll_rs = LL_RESERVED_SPACE(rt->u.dst.dev); 554#endif 555 /* 556 * Fragment the datagram. 557 */ 558 559 offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3; 560 not_last_frag = iph->frag_off & htons(IP_MF); 561 562 /* 563 * Keep copying data until we run out. 564 */ 565 566 while(left > 0) { 567 len = left; 568 /* IF: it doesn't fit, use 'mtu' - the data space left */ 569 if (len > mtu) 570 len = mtu; 571 /* IF: we are not sending upto and including the packet end 572 then align the next start on an eight byte boundary */ 573 if (len < left) { 574 len &= ~7; 575 } 576 /* 577 * Allocate buffer. 
578 */ 579 580 if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) { 581 NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n"); 582 err = -ENOMEM; 583 goto fail; 584 } 585 586 /* 587 * Set up data on packet 588 */ 589 590 ip_copy_metadata(skb2, skb); 591 skb_reserve(skb2, ll_rs); 592 skb_put(skb2, len + hlen); 593 skb2->nh.raw = skb2->data; 594 skb2->h.raw = skb2->data + hlen; 595 596 /* 597 * Charge the memory for the fragment to any owner 598 * it might possess 599 */ 600 601 if (skb->sk) 602 skb_set_owner_w(skb2, skb->sk); 603 604 /* 605 * Copy the packet header into the new buffer. 606 */ 607 608 memcpy(skb2->nh.raw, skb->data, hlen); 609 610 /* 611 * Copy a block of the IP datagram. 612 */ 613 if (skb_copy_bits(skb, ptr, skb2->h.raw, len)) 614 BUG(); 615 left -= len; 616 617 /* 618 * Fill in the new header fields. 619 */ 620 iph = skb2->nh.iph; 621 iph->frag_off = htons((offset >> 3)); 622 623 /* ANK: dirty, but effective trick. Upgrade options only if 624 * the segment to be fragmented was THE FIRST (otherwise, 625 * options are already fixed) and make it ONCE 626 * on the initial skb, so that all the following fragments 627 * will inherit fixed options. 628 */ 629 if (offset == 0) 630 ip_options_fragment(skb); 631 632 /* 633 * Added AC : If we are fragmenting a fragment that's not the 634 * last fragment then keep MF on each bit 635 */ 636 if (left > 0 || not_last_frag) 637 iph->frag_off |= htons(IP_MF); 638 ptr += len; 639 offset += len; 640 641 /* 642 * Put this fragment into the sending queue. 
643 */ 644 645 IP_INC_STATS(IPSTATS_MIB_FRAGCREATES); 646 647 iph->tot_len = htons(len + hlen); 648 649 ip_send_check(iph); 650 651 err = output(skb2); 652 if (err) 653 goto fail; 654 } 655 kfree_skb(skb); 656 IP_INC_STATS(IPSTATS_MIB_FRAGOKS); 657 return err; 658 659fail: 660 kfree_skb(skb); 661 IP_INC_STATS(IPSTATS_MIB_FRAGFAILS); 662 return err; 663} 664 665int 666ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) 667{ 668 struct iovec *iov = from; 669 670 if (skb->ip_summed == CHECKSUM_HW) { 671 if (memcpy_fromiovecend(to, iov, offset, len) < 0) 672 return -EFAULT; 673 } else { 674 unsigned int csum = 0; 675 if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0) 676 return -EFAULT; 677 skb->csum = csum_block_add(skb->csum, csum, odd); 678 } 679 return 0; 680} 681 682static inline unsigned int 683csum_page(struct page *page, int offset, int copy) 684{ 685 char *kaddr; 686 unsigned int csum; 687 kaddr = kmap(page); 688 csum = csum_partial(kaddr + offset, copy, 0); 689 kunmap(page); 690 return csum; 691} 692 693static inline int ip_ufo_append_data(struct sock *sk, 694 int getfrag(void *from, char *to, int offset, int len, 695 int odd, struct sk_buff *skb), 696 void *from, int length, int hh_len, int fragheaderlen, 697 int transhdrlen, int mtu,unsigned int flags) 698{ 699 struct sk_buff *skb; 700 int err; 701 702 /* There is support for UDP fragmentation offload by network 703 * device, so create one single skb packet containing complete 704 * udp datagram 705 */ 706 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) { 707 skb = sock_alloc_send_skb(sk, 708 hh_len + fragheaderlen + transhdrlen + 20, 709 (flags & MSG_DONTWAIT), &err); 710 711 if (skb == NULL) 712 return err; 713 714 /* reserve space for Hardware header */ 715 skb_reserve(skb, hh_len); 716 717 /* create space for UDP/IP header */ 718 skb_put(skb,fragheaderlen + transhdrlen); 719 720 /* initialize network header pointer */ 721 skb->nh.raw = 
skb->data; 722 723 /* initialize protocol header pointer */ 724 skb->h.raw = skb->data + fragheaderlen; 725 726 skb->ip_summed = CHECKSUM_HW; 727 skb->csum = 0; 728 sk->sk_sndmsg_off = 0; 729 } 730 731 err = skb_append_datato_frags(sk,skb, getfrag, from, 732 (length - transhdrlen)); 733 if (!err) { 734 /* specify the length of each IP datagram fragment*/ 735 skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen); 736 __skb_queue_tail(&sk->sk_write_queue, skb); 737 738 return 0; 739 } 740 /* There is not enough support do UFO , 741 * so follow normal path 742 */ 743 kfree_skb(skb); 744 return err; 745} 746 747/* 748 * ip_append_data() and ip_append_page() can make one large IP datagram 749 * from many pieces of data. Each pieces will be holded on the socket 750 * until ip_push_pending_frames() is called. Each piece can be a page 751 * or non-page data. 752 * 753 * Not only UDP, other transport protocols - e.g. raw sockets - can use 754 * this interface potentially. 755 * 756 * LATER: length must be adjusted by pad at tail, when it is required. 757 */ 758int ip_append_data(struct sock *sk, 759 int getfrag(void *from, char *to, int offset, int len, 760 int odd, struct sk_buff *skb), 761 void *from, int length, int transhdrlen, 762 struct ipcm_cookie *ipc, struct rtable *rt, 763 unsigned int flags) 764{ 765 struct inet_sock *inet = inet_sk(sk); 766 struct sk_buff *skb; 767 768 struct ip_options *opt = NULL; 769 int hh_len; 770 int exthdrlen; 771 int mtu; 772 int copy; 773 int err; 774 int offset = 0; 775 unsigned int maxfraglen, fragheaderlen; 776 int csummode = CHECKSUM_NONE; 777 778 if (flags&MSG_PROBE) 779 return 0; 780 781 if (skb_queue_empty(&sk->sk_write_queue)) { 782 /* 783 * setup for corking. 
784 */ 785 opt = ipc->opt; 786 if (opt) { 787 if (inet->cork.opt == NULL) { 788 inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation); 789 if (unlikely(inet->cork.opt == NULL)) 790 return -ENOBUFS; 791 } 792 memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen); 793 inet->cork.flags |= IPCORK_OPT; 794 inet->cork.addr = ipc->addr; 795 } 796 dst_hold(&rt->u.dst); 797 inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path); 798 inet->cork.rt = rt; 799 inet->cork.length = 0; 800 sk->sk_sndmsg_page = NULL; 801 sk->sk_sndmsg_off = 0; 802 if ((exthdrlen = rt->u.dst.header_len) != 0) { 803 length += exthdrlen; 804 transhdrlen += exthdrlen; 805 } 806 } else { 807 rt = inet->cork.rt; 808 if (inet->cork.flags & IPCORK_OPT) 809 opt = inet->cork.opt; 810 811 transhdrlen = 0; 812 exthdrlen = 0; 813 mtu = inet->cork.fragsize; 814 } 815 hh_len = LL_RESERVED_SPACE(rt->u.dst.dev); 816 817 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); 818 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; 819 820 if (inet->cork.length + length > 0xFFFF - fragheaderlen) { 821 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen); 822 return -EMSGSIZE; 823 } 824 825 /* 826 * transhdrlen > 0 means that this is the first fragment and we wish 827 * it won't be fragmented in the future. 828 */ 829 if (transhdrlen && 830 length + fragheaderlen <= mtu && 831 rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) && 832 !exthdrlen) 833 csummode = CHECKSUM_HW; 834 835 inet->cork.length += length; 836 if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) && 837 (rt->u.dst.dev->features & NETIF_F_UFO)) { 838 839 if(ip_ufo_append_data(sk, getfrag, from, length, hh_len, 840 fragheaderlen, transhdrlen, mtu, flags)) 841 goto error; 842 843 return 0; 844 } 845 846 /* So, what's going on in the loop below? 
847 * 848 * We use calculated fragment length to generate chained skb, 849 * each of segments is IP fragment ready for sending to network after 850 * adding appropriate IP header. 851 */ 852 853 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) 854 goto alloc_new_skb; 855 856 while (length > 0) { 857 /* Check if the remaining data fits into current packet. */ 858 copy = mtu - skb->len; 859 if (copy < length) 860 copy = maxfraglen - skb->len; 861 if (copy <= 0) { 862 char *data; 863 unsigned int datalen; 864 unsigned int fraglen; 865 unsigned int fraggap; 866 unsigned int alloclen; 867 struct sk_buff *skb_prev; 868alloc_new_skb: 869 skb_prev = skb; 870 if (skb_prev) 871 fraggap = skb_prev->len - maxfraglen; 872 else 873 fraggap = 0; 874 875 /* 876 * If remaining data exceeds the mtu, 877 * we know we need more fragment(s). 878 */ 879 datalen = length + fraggap; 880 if (datalen > mtu - fragheaderlen) 881 datalen = maxfraglen - fragheaderlen; 882 fraglen = datalen + fragheaderlen; 883 884 if ((flags & MSG_MORE) && 885 !(rt->u.dst.dev->features&NETIF_F_SG)) 886 alloclen = mtu; 887 else 888 alloclen = datalen + fragheaderlen; 889 890 /* The last fragment gets additional space at tail. 891 * Note, with MSG_MORE we overallocate on fragments, 892 * because we have no idea what fragment will be 893 * the last. 894 */ 895 if (datalen == length) 896 alloclen += rt->u.dst.trailer_len; 897 898 if (transhdrlen) { 899 skb = sock_alloc_send_skb(sk, 900 alloclen + hh_len + 15, 901 (flags & MSG_DONTWAIT), &err); 902 } else { 903 skb = NULL; 904 if (atomic_read(&sk->sk_wmem_alloc) <= 905 2 * sk->sk_sndbuf) 906 skb = sock_wmalloc(sk, 907 alloclen + hh_len + 15, 1, 908 sk->sk_allocation); 909 if (unlikely(skb == NULL)) 910 err = -ENOBUFS; 911 } 912 if (skb == NULL) 913 goto error; 914 915 /* 916 * Fill in the control structures 917 */ 918 skb->ip_summed = csummode; 919 skb->csum = 0; 920 skb_reserve(skb, hh_len); 921 922 /* 923 * Find where to start putting bytes. 
924 */ 925 data = skb_put(skb, fraglen); 926 skb->nh.raw = data + exthdrlen; 927 data += fragheaderlen; 928 skb->h.raw = data + exthdrlen; 929 930 if (fraggap) { 931 skb->csum = skb_copy_and_csum_bits( 932 skb_prev, maxfraglen, 933 data + transhdrlen, fraggap, 0); 934 skb_prev->csum = csum_sub(skb_prev->csum, 935 skb->csum); 936 data += fraggap; 937 skb_trim(skb_prev, maxfraglen); 938 } 939 940 copy = datalen - transhdrlen - fraggap; 941 if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) { 942 err = -EFAULT; 943 kfree_skb(skb); 944 goto error; 945 } 946 947 offset += copy; 948 length -= datalen - fraggap; 949 transhdrlen = 0; 950 exthdrlen = 0; 951 csummode = CHECKSUM_NONE; 952 953 /* 954 * Put the packet on the pending queue. 955 */ 956 __skb_queue_tail(&sk->sk_write_queue, skb); 957 continue; 958 } 959 960 if (copy > length) 961 copy = length; 962 963 if (!(rt->u.dst.dev->features&NETIF_F_SG)) { 964 unsigned int off; 965 966 off = skb->len; 967 if (getfrag(from, skb_put(skb, copy), 968 offset, copy, off, skb) < 0) { 969 __skb_trim(skb, off); 970 err = -EFAULT; 971 goto error; 972 } 973 } else { 974 int i = skb_shinfo(skb)->nr_frags; 975 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1]; 976 struct page *page = sk->sk_sndmsg_page; 977 int off = sk->sk_sndmsg_off; 978 unsigned int left; 979 980 if (page && (left = PAGE_SIZE - off) > 0) { 981 if (copy >= left) 982 copy = left; 983 if (page != frag->page) { 984 if (i == MAX_SKB_FRAGS) { 985 err = -EMSGSIZE; 986 goto error; 987 } 988 get_page(page); 989 skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0); 990 frag = &skb_shinfo(skb)->frags[i]; 991 } 992 } else if (i < MAX_SKB_FRAGS) { 993 if (copy > PAGE_SIZE) 994 copy = PAGE_SIZE; 995 page = alloc_pages(sk->sk_allocation, 0); 996 if (page == NULL) { 997 err = -ENOMEM; 998 goto error; 999 } 1000 sk->sk_sndmsg_page = page; 1001 sk->sk_sndmsg_off = 0; 1002 1003 skb_fill_page_desc(skb, i, page, 0, 0); 1004 frag = 
&skb_shinfo(skb)->frags[i]; 1005 skb->truesize += PAGE_SIZE; 1006 atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc); 1007 } else { 1008 err = -EMSGSIZE; 1009 goto error; 1010 } 1011 if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) { 1012 err = -EFAULT; 1013 goto error; 1014 } 1015 sk->sk_sndmsg_off += copy; 1016 frag->size += copy; 1017 skb->len += copy; 1018 skb->data_len += copy; 1019 } 1020 offset += copy; 1021 length -= copy; 1022 } 1023 1024 return 0; 1025 1026error: 1027 inet->cork.length -= length; 1028 IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS); 1029 return err; 1030} 1031 1032ssize_t ip_append_page(struct sock *sk, struct page *page, 1033 int offset, size_t size, int flags) 1034{ 1035 struct inet_sock *inet = inet_sk(sk); 1036 struct sk_buff *skb; 1037 struct rtable *rt; 1038 struct ip_options *opt = NULL; 1039 int hh_len; 1040 int mtu; 1041 int len; 1042 int err; 1043 unsigned int maxfraglen, fragheaderlen, fraggap; 1044 1045 if (inet->hdrincl) 1046 return -EPERM; 1047 1048 if (flags&MSG_PROBE) 1049 return 0; 1050 1051 if (skb_queue_empty(&sk->sk_write_queue)) 1052 return -EINVAL; 1053 1054 rt = inet->cork.rt; 1055 if (inet->cork.flags & IPCORK_OPT) 1056 opt = inet->cork.opt; 1057 1058 if (!(rt->u.dst.dev->features&NETIF_F_SG)) 1059 return -EOPNOTSUPP; 1060 1061 hh_len = LL_RESERVED_SPACE(rt->u.dst.dev); 1062 mtu = inet->cork.fragsize; 1063 1064 fragheaderlen = sizeof(struct iphdr) + (opt ? 
opt->optlen : 0); 1065 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; 1066 1067 if (inet->cork.length + size > 0xFFFF - fragheaderlen) { 1068 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu); 1069 return -EMSGSIZE; 1070 } 1071 1072 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) 1073 return -EINVAL; 1074 1075 inet->cork.length += size; 1076 if ((sk->sk_protocol == IPPROTO_UDP) && 1077 (rt->u.dst.dev->features & NETIF_F_UFO)) 1078 skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen); 1079 1080 1081 while (size > 0) { 1082 int i; 1083 1084 if (skb_shinfo(skb)->ufo_size) 1085 len = size; 1086 else { 1087 1088 /* Check if the remaining data fits into current packet. */ 1089 len = mtu - skb->len; 1090 if (len < size) 1091 len = maxfraglen - skb->len; 1092 } 1093 if (len <= 0) { 1094 struct sk_buff *skb_prev; 1095 char *data; 1096 struct iphdr *iph; 1097 int alloclen; 1098 1099 skb_prev = skb; 1100 fraggap = skb_prev->len - maxfraglen; 1101 1102 alloclen = fragheaderlen + hh_len + fraggap + 15; 1103 skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation); 1104 if (unlikely(!skb)) { 1105 err = -ENOBUFS; 1106 goto error; 1107 } 1108 1109 /* 1110 * Fill in the control structures 1111 */ 1112 skb->ip_summed = CHECKSUM_NONE; 1113 skb->csum = 0; 1114 skb_reserve(skb, hh_len); 1115 1116 /* 1117 * Find where to start putting bytes. 1118 */ 1119 data = skb_put(skb, fragheaderlen + fraggap); 1120 skb->nh.iph = iph = (struct iphdr *)data; 1121 data += fragheaderlen; 1122 skb->h.raw = data; 1123 1124 if (fraggap) { 1125 skb->csum = skb_copy_and_csum_bits( 1126 skb_prev, maxfraglen, 1127 data, fraggap, 0); 1128 skb_prev->csum = csum_sub(skb_prev->csum, 1129 skb->csum); 1130 skb_trim(skb_prev, maxfraglen); 1131 } 1132 1133 /* 1134 * Put the packet on the pending queue. 
1135 */ 1136 __skb_queue_tail(&sk->sk_write_queue, skb); 1137 continue; 1138 } 1139 1140 i = skb_shinfo(skb)->nr_frags; 1141 if (len > size) 1142 len = size; 1143 if (skb_can_coalesce(skb, i, page, offset)) { 1144 skb_shinfo(skb)->frags[i-1].size += len; 1145 } else if (i < MAX_SKB_FRAGS) { 1146 get_page(page); 1147 skb_fill_page_desc(skb, i, page, offset, len); 1148 } else { 1149 err = -EMSGSIZE; 1150 goto error; 1151 } 1152 1153 if (skb->ip_summed == CHECKSUM_NONE) { 1154 unsigned int csum; 1155 csum = csum_page(page, offset, len); 1156 skb->csum = csum_block_add(skb->csum, csum, skb->len); 1157 } 1158 1159 skb->len += len; 1160 skb->data_len += len; 1161 offset += len; 1162 size -= len; 1163 } 1164 return 0; 1165 1166error: 1167 inet->cork.length -= size; 1168 IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS); 1169 return err; 1170} 1171 1172/* 1173 * Combined all pending IP fragments on the socket as one IP datagram 1174 * and push them out. 1175 */ 1176int ip_push_pending_frames(struct sock *sk) 1177{ 1178 struct sk_buff *skb, *tmp_skb; 1179 struct sk_buff **tail_skb; 1180 struct inet_sock *inet = inet_sk(sk); 1181 struct ip_options *opt = NULL; 1182 struct rtable *rt = inet->cork.rt; 1183 struct iphdr *iph; 1184 int df = 0; 1185 __u8 ttl; 1186 int err = 0; 1187 1188 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL) 1189 goto out; 1190 tail_skb = &(skb_shinfo(skb)->frag_list); 1191 1192 /* move skb->data to ip header from ext header */ 1193 if (skb->data < skb->nh.raw) 1194 __skb_pull(skb, skb->nh.raw - skb->data); 1195 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { 1196 __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw); 1197 *tail_skb = tmp_skb; 1198 tail_skb = &(tmp_skb->next); 1199 skb->len += tmp_skb->len; 1200 skb->data_len += tmp_skb->len; 1201 skb->truesize += tmp_skb->truesize; 1202 __sock_put(tmp_skb->sk); 1203 tmp_skb->destructor = NULL; 1204 tmp_skb->sk = NULL; 1205 } 1206 1207 /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we 
allow 1208 * to fragment the frame generated here. No matter, what transforms 1209 * how transforms change size of the packet, it will come out. 1210 */ 1211 if (inet->pmtudisc != IP_PMTUDISC_DO) 1212 skb->local_df = 1; 1213 1214 /* DF bit is set when we want to see DF on outgoing frames. 1215 * If local_df is set too, we still allow to fragment this frame 1216 * locally. */ 1217 if (inet->pmtudisc == IP_PMTUDISC_DO || 1218 (skb->len <= dst_mtu(&rt->u.dst) && 1219 ip_dont_fragment(sk, &rt->u.dst))) 1220 df = htons(IP_DF); 1221 1222 if (inet->cork.flags & IPCORK_OPT) 1223 opt = inet->cork.opt; 1224 1225 if (rt->rt_type == RTN_MULTICAST) 1226 ttl = inet->mc_ttl; 1227 else 1228 ttl = ip_select_ttl(inet, &rt->u.dst); 1229 1230 iph = (struct iphdr *)skb->data; 1231 iph->version = 4; 1232 iph->ihl = 5; 1233 if (opt) { 1234 iph->ihl += opt->optlen>>2; 1235 ip_options_build(skb, opt, inet->cork.addr, rt, 0); 1236 } 1237 iph->tos = inet->tos; 1238 iph->tot_len = htons(skb->len); 1239 iph->frag_off = df; 1240 if (!df) { 1241 __ip_select_ident(iph, &rt->u.dst, 0); 1242 } else { 1243 iph->id = htons(inet->id++); 1244 } 1245 iph->ttl = ttl; 1246 iph->protocol = sk->sk_protocol; 1247 iph->saddr = rt->rt_src; 1248 iph->daddr = rt->rt_dst; 1249 ip_send_check(iph); 1250 1251 skb->priority = sk->sk_priority; 1252 skb->dst = dst_clone(&rt->u.dst); 1253 1254 /* Netfilter gets whole the not fragmented skb. */ 1255 err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, 1256 skb->dst->dev, dst_output); 1257 if (err) { 1258 if (err > 0) 1259 err = inet->recverr ? net_xmit_errno(err) : 0; 1260 if (err) 1261 goto error; 1262 } 1263 1264out: 1265 inet->cork.flags &= ~IPCORK_OPT; 1266 kfree(inet->cork.opt); 1267 inet->cork.opt = NULL; 1268 if (inet->cork.rt) { 1269 ip_rt_put(inet->cork.rt); 1270 inet->cork.rt = NULL; 1271 } 1272 return err; 1273 1274error: 1275 IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS); 1276 goto out; 1277} 1278 1279/* 1280 * Throw away all pending data on the socket. 
1281 */ 1282void ip_flush_pending_frames(struct sock *sk) 1283{ 1284 struct inet_sock *inet = inet_sk(sk); 1285 struct sk_buff *skb; 1286 1287 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) 1288 kfree_skb(skb); 1289 1290 inet->cork.flags &= ~IPCORK_OPT; 1291 kfree(inet->cork.opt); 1292 inet->cork.opt = NULL; 1293 if (inet->cork.rt) { 1294 ip_rt_put(inet->cork.rt); 1295 inet->cork.rt = NULL; 1296 } 1297} 1298 1299 1300/* 1301 * Fetch data from kernel space and fill in checksum if needed. 1302 */ 1303static int ip_reply_glue_bits(void *dptr, char *to, int offset, 1304 int len, int odd, struct sk_buff *skb) 1305{ 1306 unsigned int csum; 1307 1308 csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0); 1309 skb->csum = csum_block_add(skb->csum, csum, odd); 1310 return 0; 1311} 1312 1313/* 1314 * Generic function to send a packet as reply to another packet. 1315 * Used to send TCP resets so far. ICMP should use this function too. 1316 * 1317 * Should run single threaded per socket because it uses the sock 1318 * structure to pass arguments. 1319 * 1320 * LATER: switch from ip_build_xmit to ip_append_* 1321 */ 1322void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg, 1323 unsigned int len) 1324{ 1325 struct inet_sock *inet = inet_sk(sk); 1326 struct { 1327 struct ip_options opt; 1328 char data[40]; 1329 } replyopts; 1330 struct ipcm_cookie ipc; 1331 u32 daddr; 1332 struct rtable *rt = (struct rtable*)skb->dst; 1333 1334 if (ip_options_echo(&replyopts.opt, skb)) 1335 return; 1336 1337 daddr = ipc.addr = rt->rt_src; 1338 ipc.opt = NULL; 1339 1340 if (replyopts.opt.optlen) { 1341 ipc.opt = &replyopts.opt; 1342 1343 if (ipc.opt->srr) 1344 daddr = replyopts.opt.faddr; 1345 } 1346 1347 { 1348 struct flowi fl = { .nl_u = { .ip4_u = 1349 { .daddr = daddr, 1350 .saddr = rt->rt_spec_dst, 1351 .tos = RT_TOS(skb->nh.iph->tos) } }, 1352 /* Not quite clean, but right. 
*/ 1353 .uli_u = { .ports = 1354 { .sport = skb->h.th->dest, 1355 .dport = skb->h.th->source } }, 1356 .proto = sk->sk_protocol }; 1357 if (ip_route_output_key(&rt, &fl)) 1358 return; 1359 } 1360 1361 /* And let IP do all the hard work. 1362 1363 This chunk is not reenterable, hence spinlock. 1364 Note that it uses the fact, that this function is called 1365 with locally disabled BH and that sk cannot be already spinlocked. 1366 */ 1367 bh_lock_sock(sk); 1368 inet->tos = skb->nh.iph->tos; 1369 sk->sk_priority = skb->priority; 1370 sk->sk_protocol = skb->nh.iph->protocol; 1371 ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0, 1372 &ipc, rt, MSG_DONTWAIT); 1373 if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) { 1374 if (arg->csumoffset >= 0) 1375 *((u16 *)skb->h.raw + arg->csumoffset) = csum_fold(csum_add(skb->csum, arg->csum)); 1376 skb->ip_summed = CHECKSUM_NONE; 1377 ip_push_pending_frames(sk); 1378 } 1379 1380 bh_unlock_sock(sk); 1381 1382 ip_rt_put(rt); 1383} 1384 1385void __init ip_init(void) 1386{ 1387 ip_rt_init(); 1388 inet_initpeers(); 1389 1390#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS) 1391 igmp_mc_proc_init(); 1392#endif 1393} 1394 1395EXPORT_SYMBOL(ip_fragment); 1396EXPORT_SYMBOL(ip_generic_getfrag); 1397EXPORT_SYMBOL(ip_queue_xmit); 1398EXPORT_SYMBOL(ip_send_check); 1399