ip_output.c revision f2c31e32b378a6653f8de606149d963baf11d7d3
1/* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * The Internet Protocol (IP) output module. 7 * 8 * Authors: Ross Biro 9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 10 * Donald Becker, <becker@super.org> 11 * Alan Cox, <Alan.Cox@linux.org> 12 * Richard Underwood 13 * Stefan Becker, <stefanb@yello.ping.de> 14 * Jorge Cwik, <jorge@laser.satlink.net> 15 * Arnt Gulbrandsen, <agulbra@nvg.unit.no> 16 * Hirokazu Takahashi, <taka@valinux.co.jp> 17 * 18 * See ip_input.c for original log 19 * 20 * Fixes: 21 * Alan Cox : Missing nonblock feature in ip_build_xmit. 22 * Mike Kilburn : htons() missing in ip_build_xmit. 23 * Bradford Johnson: Fix faulty handling of some frames when 24 * no route is found. 25 * Alexander Demenshin: Missing sk/skb free in ip_queue_xmit 26 * (in case if packet not accepted by 27 * output firewall rules) 28 * Mike McLagan : Routing by source 29 * Alexey Kuznetsov: use new route cache 30 * Andi Kleen: Fix broken PMTU recovery and remove 31 * some redundant tests. 32 * Vitaly E. Lavrov : Transparent proxy revived after year coma. 33 * Andi Kleen : Replace ip_reply with ip_send_reply. 34 * Andi Kleen : Split fast and slow ip_build_xmit path 35 * for decreased register pressure on x86 36 * and more readibility. 37 * Marc Boucher : When call_out_firewall returns FW_QUEUE, 38 * silently drop skb instead of failing with -EPERM. 39 * Detlev Wengorz : Copy protocol for fragments. 40 * Hirokazu Takahashi: HW checksumming for outgoing UDP 41 * datagrams. 42 * Hirokazu Takahashi: sendfile() on UDP works now. 
43 */ 44 45#include <asm/uaccess.h> 46#include <asm/system.h> 47#include <linux/module.h> 48#include <linux/types.h> 49#include <linux/kernel.h> 50#include <linux/mm.h> 51#include <linux/string.h> 52#include <linux/errno.h> 53#include <linux/highmem.h> 54#include <linux/slab.h> 55 56#include <linux/socket.h> 57#include <linux/sockios.h> 58#include <linux/in.h> 59#include <linux/inet.h> 60#include <linux/netdevice.h> 61#include <linux/etherdevice.h> 62#include <linux/proc_fs.h> 63#include <linux/stat.h> 64#include <linux/init.h> 65 66#include <net/snmp.h> 67#include <net/ip.h> 68#include <net/protocol.h> 69#include <net/route.h> 70#include <net/xfrm.h> 71#include <linux/skbuff.h> 72#include <net/sock.h> 73#include <net/arp.h> 74#include <net/icmp.h> 75#include <net/checksum.h> 76#include <net/inetpeer.h> 77#include <linux/igmp.h> 78#include <linux/netfilter_ipv4.h> 79#include <linux/netfilter_bridge.h> 80#include <linux/mroute.h> 81#include <linux/netlink.h> 82#include <linux/tcp.h> 83 84int sysctl_ip_default_ttl __read_mostly = IPDEFTTL; 85EXPORT_SYMBOL(sysctl_ip_default_ttl); 86 87/* Generate a checksum for an outgoing IP datagram. */ 88__inline__ void ip_send_check(struct iphdr *iph) 89{ 90 iph->check = 0; 91 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); 92} 93EXPORT_SYMBOL(ip_send_check); 94 95int __ip_local_out(struct sk_buff *skb) 96{ 97 struct iphdr *iph = ip_hdr(skb); 98 99 iph->tot_len = htons(skb->len); 100 ip_send_check(iph); 101 return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL, 102 skb_dst(skb)->dev, dst_output); 103} 104 105int ip_local_out(struct sk_buff *skb) 106{ 107 int err; 108 109 err = __ip_local_out(skb); 110 if (likely(err == 1)) 111 err = dst_output(skb); 112 113 return err; 114} 115EXPORT_SYMBOL_GPL(ip_local_out); 116 117/* dev_loopback_xmit for use with netfilter. 
*/
/*
 * Feed @newskb back into the local receive path.  The MAC header is reset
 * to the current data pointer, the data pointer is moved up to the network
 * header, and the packet is marked PACKET_LOOPBACK with its checksum
 * flagged as not needing verification before being queued with
 * netif_rx_ni().  Always returns 0.
 */
static int ip_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(newskb));
	netif_rx_ni(newskb);
	return 0;
}

/*
 * Pick the TTL for an outgoing packet: the socket's uc_ttl when it is
 * non-negative, otherwise the hop limit reported for the route @dst.
 */
static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
	int ttl = inet->uc_ttl;

	if (ttl < 0)
		ttl = ip4_dst_hoplimit(dst);
	return ttl;
}

/*
 *		Add an ip header to a skbuff and send it out.
 *
 *	@skb:	packet to send; its route is read with skb_rtable() and
 *		dereferenced without a NULL check, so it must already be set.
 *	@sk:	socket supplying TOS, TTL, protocol, priority and mark.
 *	@saddr:	source address written to the header.
 *	@daddr:	destination address; replaced by opt->opt.faddr in the
 *		header when @opt carries a source route (opt.srr).
 *	@opt:	IP options to append after the base header, or NULL.
 *
 *	Returns the result of ip_local_out().
 */
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
			  __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
{
	struct inet_sock *inet = inet_sk(sk);
	struct rtable *rt = skb_rtable(skb);
	struct iphdr *iph;

	/* Build the IP header. */
	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	iph->version  = 4;
	iph->ihl      = 5;
	iph->tos      = inet->tos;
	if (ip_dont_fragment(sk, &rt->dst))
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = ip_select_ttl(inet, &rt->dst);
	iph->daddr    = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
	iph->saddr    = saddr;
	iph->protocol = sk->sk_protocol;
	ip_select_ident(iph, &rt->dst, sk);

	if (opt && opt->opt.optlen) {
		/* ihl counts 32-bit words, optlen counts bytes. */
		iph->ihl += opt->opt.optlen>>2;
		ip_options_build(skb, &opt->opt, daddr, rt, 0);
	}

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	/* Send it out. */
	return ip_local_out(skb);
}
EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);

/*
 * Last step of IP output: update the multicast/broadcast output counters,
 * make sure the skb has headroom for the link-layer header, and pass the
 * packet to the route's neighbour entry.
 *
 * Returns the neigh_output() result, -ENOMEM if the headroom reallocation
 * fails, or -EINVAL when the route has no neighbour.  On the error paths
 * the skb is freed here.
 */
static inline int ip_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct rtable *rt = (struct rtable *)dst;
	struct net_device *dev = dst->dev;
	unsigned int hh_len = LL_RESERVED_SPACE(dev);
	struct neighbour *neigh;

	if (rt->rt_type == RTN_MULTICAST) {
		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
	} else if (rt->rt_type == RTN_BROADCAST)
		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);

	/* Be paranoid, rather than too clever. */
	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
		struct sk_buff *skb2;

		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
		if (skb2 == NULL) {
			kfree_skb(skb);
			return -ENOMEM;
		}
		/* Keep the replacement charged to the same socket. */
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		kfree_skb(skb);
		skb = skb2;
	}

	/* The neighbour pointer is only used inside the RCU read section. */
	rcu_read_lock();
	neigh = dst_get_neighbour(dst);
	if (neigh) {
		int res = neigh_output(neigh, skb);

		rcu_read_unlock();
		return res;
	}
	rcu_read_unlock();

	if (net_ratelimit())
		printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
	kfree_skb(skb);
	return -EINVAL;
}

/*
 * MTU to apply to @skb: the raw device MTU when the owning socket uses
 * IP_PMTUDISC_PROBE, otherwise the MTU of the attached route.  An skb
 * without a socket always gets the route MTU.
 */
static inline int ip_skb_dst_mtu(struct sk_buff *skb)
{
	struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;

	return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
}

/*
 * Continuation run after the POST_ROUTING hook: fragment the packet if it
 * is larger than the MTU and is not a GSO packet, otherwise transmit it
 * directly through ip_finish_output2().
 */
static int ip_finish_output(struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm != NULL) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(skb);
	}
#endif
	if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
		return ip_fragment(skb, ip_finish_output2);
	else
		return ip_finish_output2(skb);
}

/*
 * Output routine for routes flagged multicast or broadcast.  In addition
 * to sending @skb through the POST_ROUTING hook, a clone is looped back
 * locally (via ip_dev_loopback_xmit) when the checks below allow it.
 * A clone allocation failure is silently tolerated: the original packet
 * is still sent.
 */
int ip_mc_output(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct rtable *rt = skb_rtable(skb);
	struct net_device *dev = rt->dst.dev;

	/*
	 *	If the indicated interface is up and running, send the packet.
	 */
	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	/*
	 *	Multicasts are looped back for other local users
	 */

	if (rt->rt_flags&RTCF_MULTICAST) {
		if (sk_mc_loop(sk)
#ifdef CONFIG_IP_MROUTE
		/* Small optimization: do not loop back non-local frames
		   which returned after forwarding; they will be dropped
		   by ip_mr_input in any case.
		   Note that local frames are looped back to be delivered
		   to local recipients.

		   This check is duplicated in ip_mr_input at the moment.
		 */
		    &&
		    ((rt->rt_flags & RTCF_LOCAL) ||
		     !(IPCB(skb)->flags & IPSKB_FORWARDED))
#endif
		   ) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
			if (newskb)
				NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					ip_dev_loopback_xmit);
		}

		/* Multicasts with ttl 0 must not go beyond the host */

		if (ip_hdr(skb)->ttl == 0) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (rt->rt_flags&RTCF_BROADCAST) {
		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
		if (newskb)
			NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
				NULL, newskb->dev, ip_dev_loopback_xmit);
	}

	/* The hook is skipped for packets already flagged IPSKB_REROUTED. */
	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
			    skb->dev, ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}

/*
 * Standard unicast output routine: account the packet, bind it to the
 * route's output device and run the POST_ROUTING hook (skipped for
 * packets flagged IPSKB_REROUTED) with ip_finish_output as continuation.
 */
int ip_output(struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;

	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}

/*
 * Route (if necessary), build the IP header for, and transmit a packet
 * that belongs to a connected socket.
 *
 * @skb: packet to send; skb->sk must be set, it is used unconditionally.
 * @fl:  flow key; only the IPv4 member (fl->u.ip4) is used.  It is handed
 *       to ip_route_output_ports() when a lookup is needed, and its
 *       saddr/daddr are copied into the header.
 *
 * Returns the ip_local_out() result, or -EHOSTUNREACH (after freeing the
 * skb) when no route is available or a strict source route does not match
 * the route's gateway.
 *
 * The whole function runs inside an RCU read section because the socket
 * options are obtained with rcu_dereference() and the route is attached
 * to the skb with skb_dst_set_noref().
 */
int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
{
	struct sock *sk = skb->sk;
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_rcu *inet_opt;
	struct flowi4 *fl4;
	struct rtable *rt;
	struct iphdr *iph;
	int res;

	/* Skip all of this if the packet is already routed,
	 * f.e. by something like SCTP.
	 */
	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	fl4 = &fl->u.ip4;
	rt = skb_rtable(skb);
	if (rt != NULL)
		goto packet_routed;

	/* Make sure we can route this packet. */
	rt = (struct rtable *)__sk_dst_check(sk, 0);
	if (rt == NULL) {
		__be32 daddr;

		/* Use correct destination address if we have options. */
		daddr = inet->inet_daddr;
		if (inet_opt && inet_opt->opt.srr)
			daddr = inet_opt->opt.faddr;

		/* If this fails, retransmit mechanism of transport layer will
		 * keep trying until route appears or the connection times
		 * itself out.
		 */
		rt = ip_route_output_ports(sock_net(sk), fl4, sk,
					   daddr, inet->inet_saddr,
					   inet->inet_dport,
					   inet->inet_sport,
					   sk->sk_protocol,
					   RT_CONN_FLAGS(sk),
					   sk->sk_bound_dev_if);
		if (IS_ERR(rt))
			goto no_route;
		sk_setup_caps(sk, &rt->dst);
	}
	skb_dst_set_noref(skb, &rt->dst);

packet_routed:
	if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
		goto no_route;

	/* OK, we know where to send it, allocate and build IP header. */
	skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	/* One 16-bit store sets version (4), ihl (5) and the TOS byte. */
	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
	if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = ip_select_ttl(inet, &rt->dst);
	iph->protocol = sk->sk_protocol;
	iph->saddr    = fl4->saddr;
	iph->daddr    = fl4->daddr;
	/* Transport layer set skb->h.foo itself. */

	if (inet_opt && inet_opt->opt.optlen) {
		/* ihl counts 32-bit words, optlen counts bytes. */
		iph->ihl += inet_opt->opt.optlen >> 2;
		ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
	}

	/* The last argument is gso_segs - 1, or 0 when gso_segs is unset. */
	ip_select_ident_more(iph, &rt->dst, sk,
			     (skb_shinfo(skb)->gso_segs ?: 1) - 1);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	res = ip_local_out(skb);
	rcu_read_unlock();
	return res;

no_route:
	rcu_read_unlock();
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EHOSTUNREACH;
}
EXPORT_SYMBOL(ip_queue_xmit);


/*
 * Copy the per-packet metadata a fragment must share with the packet it
 * was cut from: packet type, priority, protocol, route, device, mark, the
 * IP control-block flags, and the config-dependent traffic-control,
 * netfilter, IPVS and security-mark state.  Any route already attached
 * to @to is dropped first.
 */
static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_copy(to, from);
	to->dev = from->dev;
	to->mark = from->mark;

	/* Copy the flags to each fragment. */
	IPCB(to)->flags = IPCB(from)->flags;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
	to->ipvs_property = from->ipvs_property;
#endif
	skb_copy_secmark(to, from);
}

/*
 *	This IP datagram is too large to be sent in one piece.  Break it up into
 *	smaller pieces (each of size equal to IP header plus
 *	a block of the data of the original IP data part) that will yet fit in a
 *	single device frame, and queue such a frame for sending.
443 */ 444 445int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) 446{ 447 struct iphdr *iph; 448 int ptr; 449 struct net_device *dev; 450 struct sk_buff *skb2; 451 unsigned int mtu, hlen, left, len, ll_rs; 452 int offset; 453 __be16 not_last_frag; 454 struct rtable *rt = skb_rtable(skb); 455 int err = 0; 456 457 dev = rt->dst.dev; 458 459 /* 460 * Point into the IP datagram header. 461 */ 462 463 iph = ip_hdr(skb); 464 465 if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) { 466 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); 467 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, 468 htonl(ip_skb_dst_mtu(skb))); 469 kfree_skb(skb); 470 return -EMSGSIZE; 471 } 472 473 /* 474 * Setup starting values. 475 */ 476 477 hlen = iph->ihl * 4; 478 mtu = dst_mtu(&rt->dst) - hlen; /* Size of data space */ 479#ifdef CONFIG_BRIDGE_NETFILTER 480 if (skb->nf_bridge) 481 mtu -= nf_bridge_mtu_reduction(skb); 482#endif 483 IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE; 484 485 /* When frag_list is given, use it. First, check its validity: 486 * some transformers could create wrong frag_list or break existing 487 * one, it is not prohibited. In this case fall back to copying. 488 * 489 * LATER: this step can be merged to real generation of fragments, 490 * we can switch to copy when see the first bad fragment. 491 */ 492 if (skb_has_frag_list(skb)) { 493 struct sk_buff *frag, *frag2; 494 int first_len = skb_pagelen(skb); 495 496 if (first_len - hlen > mtu || 497 ((first_len - hlen) & 7) || 498 ip_is_fragment(iph) || 499 skb_cloned(skb)) 500 goto slow_path; 501 502 skb_walk_frags(skb, frag) { 503 /* Correct geometry. */ 504 if (frag->len > mtu || 505 ((frag->len & 7) && frag->next) || 506 skb_headroom(frag) < hlen) 507 goto slow_path_clean; 508 509 /* Partially cloned skb? 
*/ 510 if (skb_shared(frag)) 511 goto slow_path_clean; 512 513 BUG_ON(frag->sk); 514 if (skb->sk) { 515 frag->sk = skb->sk; 516 frag->destructor = sock_wfree; 517 } 518 skb->truesize -= frag->truesize; 519 } 520 521 /* Everything is OK. Generate! */ 522 523 err = 0; 524 offset = 0; 525 frag = skb_shinfo(skb)->frag_list; 526 skb_frag_list_init(skb); 527 skb->data_len = first_len - skb_headlen(skb); 528 skb->len = first_len; 529 iph->tot_len = htons(first_len); 530 iph->frag_off = htons(IP_MF); 531 ip_send_check(iph); 532 533 for (;;) { 534 /* Prepare header of the next frame, 535 * before previous one went down. */ 536 if (frag) { 537 frag->ip_summed = CHECKSUM_NONE; 538 skb_reset_transport_header(frag); 539 __skb_push(frag, hlen); 540 skb_reset_network_header(frag); 541 memcpy(skb_network_header(frag), iph, hlen); 542 iph = ip_hdr(frag); 543 iph->tot_len = htons(frag->len); 544 ip_copy_metadata(frag, skb); 545 if (offset == 0) 546 ip_options_fragment(frag); 547 offset += skb->len - hlen; 548 iph->frag_off = htons(offset>>3); 549 if (frag->next != NULL) 550 iph->frag_off |= htons(IP_MF); 551 /* Ready, complete checksum */ 552 ip_send_check(iph); 553 } 554 555 err = output(skb); 556 557 if (!err) 558 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES); 559 if (err || !frag) 560 break; 561 562 skb = frag; 563 frag = skb->next; 564 skb->next = NULL; 565 } 566 567 if (err == 0) { 568 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS); 569 return 0; 570 } 571 572 while (frag) { 573 skb = frag->next; 574 kfree_skb(frag); 575 frag = skb; 576 } 577 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); 578 return err; 579 580slow_path_clean: 581 skb_walk_frags(skb, frag2) { 582 if (frag2 == frag) 583 break; 584 frag2->sk = NULL; 585 frag2->destructor = NULL; 586 skb->truesize += frag2->truesize; 587 } 588 } 589 590slow_path: 591 left = skb->len - hlen; /* Space per frame */ 592 ptr = hlen; /* Where to start from */ 593 594 /* for bridged IP traffic encapsulated inside f.e. 
a vlan header, 595 * we need to make room for the encapsulating header 596 */ 597 ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb)); 598 599 /* 600 * Fragment the datagram. 601 */ 602 603 offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3; 604 not_last_frag = iph->frag_off & htons(IP_MF); 605 606 /* 607 * Keep copying data until we run out. 608 */ 609 610 while (left > 0) { 611 len = left; 612 /* IF: it doesn't fit, use 'mtu' - the data space left */ 613 if (len > mtu) 614 len = mtu; 615 /* IF: we are not sending up to and including the packet end 616 then align the next start on an eight byte boundary */ 617 if (len < left) { 618 len &= ~7; 619 } 620 /* 621 * Allocate buffer. 622 */ 623 624 if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) { 625 NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n"); 626 err = -ENOMEM; 627 goto fail; 628 } 629 630 /* 631 * Set up data on packet 632 */ 633 634 ip_copy_metadata(skb2, skb); 635 skb_reserve(skb2, ll_rs); 636 skb_put(skb2, len + hlen); 637 skb_reset_network_header(skb2); 638 skb2->transport_header = skb2->network_header + hlen; 639 640 /* 641 * Charge the memory for the fragment to any owner 642 * it might possess 643 */ 644 645 if (skb->sk) 646 skb_set_owner_w(skb2, skb->sk); 647 648 /* 649 * Copy the packet header into the new buffer. 650 */ 651 652 skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen); 653 654 /* 655 * Copy a block of the IP datagram. 656 */ 657 if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len)) 658 BUG(); 659 left -= len; 660 661 /* 662 * Fill in the new header fields. 663 */ 664 iph = ip_hdr(skb2); 665 iph->frag_off = htons((offset >> 3)); 666 667 /* ANK: dirty, but effective trick. Upgrade options only if 668 * the segment to be fragmented was THE FIRST (otherwise, 669 * options are already fixed) and make it ONCE 670 * on the initial skb, so that all the following fragments 671 * will inherit fixed options. 
672 */ 673 if (offset == 0) 674 ip_options_fragment(skb); 675 676 /* 677 * Added AC : If we are fragmenting a fragment that's not the 678 * last fragment then keep MF on each bit 679 */ 680 if (left > 0 || not_last_frag) 681 iph->frag_off |= htons(IP_MF); 682 ptr += len; 683 offset += len; 684 685 /* 686 * Put this fragment into the sending queue. 687 */ 688 iph->tot_len = htons(len + hlen); 689 690 ip_send_check(iph); 691 692 err = output(skb2); 693 if (err) 694 goto fail; 695 696 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES); 697 } 698 kfree_skb(skb); 699 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS); 700 return err; 701 702fail: 703 kfree_skb(skb); 704 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); 705 return err; 706} 707EXPORT_SYMBOL(ip_fragment); 708 709int 710ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) 711{ 712 struct iovec *iov = from; 713 714 if (skb->ip_summed == CHECKSUM_PARTIAL) { 715 if (memcpy_fromiovecend(to, iov, offset, len) < 0) 716 return -EFAULT; 717 } else { 718 __wsum csum = 0; 719 if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0) 720 return -EFAULT; 721 skb->csum = csum_block_add(skb->csum, csum, odd); 722 } 723 return 0; 724} 725EXPORT_SYMBOL(ip_generic_getfrag); 726 727static inline __wsum 728csum_page(struct page *page, int offset, int copy) 729{ 730 char *kaddr; 731 __wsum csum; 732 kaddr = kmap(page); 733 csum = csum_partial(kaddr + offset, copy, 0); 734 kunmap(page); 735 return csum; 736} 737 738static inline int ip_ufo_append_data(struct sock *sk, 739 struct sk_buff_head *queue, 740 int getfrag(void *from, char *to, int offset, int len, 741 int odd, struct sk_buff *skb), 742 void *from, int length, int hh_len, int fragheaderlen, 743 int transhdrlen, int maxfraglen, unsigned int flags) 744{ 745 struct sk_buff *skb; 746 int err; 747 748 /* There is support for UDP fragmentation offload by network 749 * device, so create one single skb packet containing 
complete 750 * udp datagram 751 */ 752 if ((skb = skb_peek_tail(queue)) == NULL) { 753 skb = sock_alloc_send_skb(sk, 754 hh_len + fragheaderlen + transhdrlen + 20, 755 (flags & MSG_DONTWAIT), &err); 756 757 if (skb == NULL) 758 return err; 759 760 /* reserve space for Hardware header */ 761 skb_reserve(skb, hh_len); 762 763 /* create space for UDP/IP header */ 764 skb_put(skb, fragheaderlen + transhdrlen); 765 766 /* initialize network header pointer */ 767 skb_reset_network_header(skb); 768 769 /* initialize protocol header pointer */ 770 skb->transport_header = skb->network_header + fragheaderlen; 771 772 skb->ip_summed = CHECKSUM_PARTIAL; 773 skb->csum = 0; 774 775 /* specify the length of each IP datagram fragment */ 776 skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen; 777 skb_shinfo(skb)->gso_type = SKB_GSO_UDP; 778 __skb_queue_tail(queue, skb); 779 } 780 781 return skb_append_datato_frags(sk, skb, getfrag, from, 782 (length - transhdrlen)); 783} 784 785static int __ip_append_data(struct sock *sk, 786 struct flowi4 *fl4, 787 struct sk_buff_head *queue, 788 struct inet_cork *cork, 789 int getfrag(void *from, char *to, int offset, 790 int len, int odd, struct sk_buff *skb), 791 void *from, int length, int transhdrlen, 792 unsigned int flags) 793{ 794 struct inet_sock *inet = inet_sk(sk); 795 struct sk_buff *skb; 796 797 struct ip_options *opt = cork->opt; 798 int hh_len; 799 int exthdrlen; 800 int mtu; 801 int copy; 802 int err; 803 int offset = 0; 804 unsigned int maxfraglen, fragheaderlen; 805 int csummode = CHECKSUM_NONE; 806 struct rtable *rt = (struct rtable *)cork->dst; 807 808 skb = skb_peek_tail(queue); 809 810 exthdrlen = !skb ? rt->dst.header_len : 0; 811 mtu = cork->fragsize; 812 813 hh_len = LL_RESERVED_SPACE(rt->dst.dev); 814 815 fragheaderlen = sizeof(struct iphdr) + (opt ? 
opt->optlen : 0); 816 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; 817 818 if (cork->length + length > 0xFFFF - fragheaderlen) { 819 ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, 820 mtu-exthdrlen); 821 return -EMSGSIZE; 822 } 823 824 /* 825 * transhdrlen > 0 means that this is the first fragment and we wish 826 * it won't be fragmented in the future. 827 */ 828 if (transhdrlen && 829 length + fragheaderlen <= mtu && 830 rt->dst.dev->features & NETIF_F_V4_CSUM && 831 !exthdrlen) 832 csummode = CHECKSUM_PARTIAL; 833 834 cork->length += length; 835 if (((length > mtu) || (skb && skb_is_gso(skb))) && 836 (sk->sk_protocol == IPPROTO_UDP) && 837 (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) { 838 err = ip_ufo_append_data(sk, queue, getfrag, from, length, 839 hh_len, fragheaderlen, transhdrlen, 840 maxfraglen, flags); 841 if (err) 842 goto error; 843 return 0; 844 } 845 846 /* So, what's going on in the loop below? 847 * 848 * We use calculated fragment length to generate chained skb, 849 * each of segments is IP fragment ready for sending to network after 850 * adding appropriate IP header. 851 */ 852 853 if (!skb) 854 goto alloc_new_skb; 855 856 while (length > 0) { 857 /* Check if the remaining data fits into current packet. */ 858 copy = mtu - skb->len; 859 if (copy < length) 860 copy = maxfraglen - skb->len; 861 if (copy <= 0) { 862 char *data; 863 unsigned int datalen; 864 unsigned int fraglen; 865 unsigned int fraggap; 866 unsigned int alloclen; 867 struct sk_buff *skb_prev; 868alloc_new_skb: 869 skb_prev = skb; 870 if (skb_prev) 871 fraggap = skb_prev->len - maxfraglen; 872 else 873 fraggap = 0; 874 875 /* 876 * If remaining data exceeds the mtu, 877 * we know we need more fragment(s). 
878 */ 879 datalen = length + fraggap; 880 if (datalen > mtu - fragheaderlen) 881 datalen = maxfraglen - fragheaderlen; 882 fraglen = datalen + fragheaderlen; 883 884 if ((flags & MSG_MORE) && 885 !(rt->dst.dev->features&NETIF_F_SG)) 886 alloclen = mtu; 887 else 888 alloclen = fraglen; 889 890 alloclen += exthdrlen; 891 892 /* The last fragment gets additional space at tail. 893 * Note, with MSG_MORE we overallocate on fragments, 894 * because we have no idea what fragment will be 895 * the last. 896 */ 897 if (datalen == length + fraggap) 898 alloclen += rt->dst.trailer_len; 899 900 if (transhdrlen) { 901 skb = sock_alloc_send_skb(sk, 902 alloclen + hh_len + 15, 903 (flags & MSG_DONTWAIT), &err); 904 } else { 905 skb = NULL; 906 if (atomic_read(&sk->sk_wmem_alloc) <= 907 2 * sk->sk_sndbuf) 908 skb = sock_wmalloc(sk, 909 alloclen + hh_len + 15, 1, 910 sk->sk_allocation); 911 if (unlikely(skb == NULL)) 912 err = -ENOBUFS; 913 else 914 /* only the initial fragment is 915 time stamped */ 916 cork->tx_flags = 0; 917 } 918 if (skb == NULL) 919 goto error; 920 921 /* 922 * Fill in the control structures 923 */ 924 skb->ip_summed = csummode; 925 skb->csum = 0; 926 skb_reserve(skb, hh_len); 927 skb_shinfo(skb)->tx_flags = cork->tx_flags; 928 929 /* 930 * Find where to start putting bytes. 
931 */ 932 data = skb_put(skb, fraglen + exthdrlen); 933 skb_set_network_header(skb, exthdrlen); 934 skb->transport_header = (skb->network_header + 935 fragheaderlen); 936 data += fragheaderlen + exthdrlen; 937 938 if (fraggap) { 939 skb->csum = skb_copy_and_csum_bits( 940 skb_prev, maxfraglen, 941 data + transhdrlen, fraggap, 0); 942 skb_prev->csum = csum_sub(skb_prev->csum, 943 skb->csum); 944 data += fraggap; 945 pskb_trim_unique(skb_prev, maxfraglen); 946 } 947 948 copy = datalen - transhdrlen - fraggap; 949 if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) { 950 err = -EFAULT; 951 kfree_skb(skb); 952 goto error; 953 } 954 955 offset += copy; 956 length -= datalen - fraggap; 957 transhdrlen = 0; 958 exthdrlen = 0; 959 csummode = CHECKSUM_NONE; 960 961 /* 962 * Put the packet on the pending queue. 963 */ 964 __skb_queue_tail(queue, skb); 965 continue; 966 } 967 968 if (copy > length) 969 copy = length; 970 971 if (!(rt->dst.dev->features&NETIF_F_SG)) { 972 unsigned int off; 973 974 off = skb->len; 975 if (getfrag(from, skb_put(skb, copy), 976 offset, copy, off, skb) < 0) { 977 __skb_trim(skb, off); 978 err = -EFAULT; 979 goto error; 980 } 981 } else { 982 int i = skb_shinfo(skb)->nr_frags; 983 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1]; 984 struct page *page = cork->page; 985 int off = cork->off; 986 unsigned int left; 987 988 if (page && (left = PAGE_SIZE - off) > 0) { 989 if (copy >= left) 990 copy = left; 991 if (page != frag->page) { 992 if (i == MAX_SKB_FRAGS) { 993 err = -EMSGSIZE; 994 goto error; 995 } 996 get_page(page); 997 skb_fill_page_desc(skb, i, page, off, 0); 998 frag = &skb_shinfo(skb)->frags[i]; 999 } 1000 } else if (i < MAX_SKB_FRAGS) { 1001 if (copy > PAGE_SIZE) 1002 copy = PAGE_SIZE; 1003 page = alloc_pages(sk->sk_allocation, 0); 1004 if (page == NULL) { 1005 err = -ENOMEM; 1006 goto error; 1007 } 1008 cork->page = page; 1009 cork->off = 0; 1010 1011 skb_fill_page_desc(skb, i, page, 0, 0); 1012 frag = 
&skb_shinfo(skb)->frags[i]; 1013 } else { 1014 err = -EMSGSIZE; 1015 goto error; 1016 } 1017 if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) { 1018 err = -EFAULT; 1019 goto error; 1020 } 1021 cork->off += copy; 1022 frag->size += copy; 1023 skb->len += copy; 1024 skb->data_len += copy; 1025 skb->truesize += copy; 1026 atomic_add(copy, &sk->sk_wmem_alloc); 1027 } 1028 offset += copy; 1029 length -= copy; 1030 } 1031 1032 return 0; 1033 1034error: 1035 cork->length -= length; 1036 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); 1037 return err; 1038} 1039 1040static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, 1041 struct ipcm_cookie *ipc, struct rtable **rtp) 1042{ 1043 struct inet_sock *inet = inet_sk(sk); 1044 struct ip_options_rcu *opt; 1045 struct rtable *rt; 1046 1047 /* 1048 * setup for corking. 1049 */ 1050 opt = ipc->opt; 1051 if (opt) { 1052 if (cork->opt == NULL) { 1053 cork->opt = kmalloc(sizeof(struct ip_options) + 40, 1054 sk->sk_allocation); 1055 if (unlikely(cork->opt == NULL)) 1056 return -ENOBUFS; 1057 } 1058 memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen); 1059 cork->flags |= IPCORK_OPT; 1060 cork->addr = ipc->addr; 1061 } 1062 rt = *rtp; 1063 if (unlikely(!rt)) 1064 return -EFAULT; 1065 /* 1066 * We steal reference to this route, caller should not release it 1067 */ 1068 *rtp = NULL; 1069 cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ? 1070 rt->dst.dev->mtu : dst_mtu(&rt->dst); 1071 cork->dst = &rt->dst; 1072 cork->length = 0; 1073 cork->tx_flags = ipc->tx_flags; 1074 cork->page = NULL; 1075 cork->off = 0; 1076 1077 return 0; 1078} 1079 1080/* 1081 * ip_append_data() and ip_append_page() can make one large IP datagram 1082 * from many pieces of data. Each pieces will be holded on the socket 1083 * until ip_push_pending_frames() is called. Each piece can be a page 1084 * or non-page data. 
 *
 *	Not only UDP, other transport protocols - e.g. raw sockets - can use
 *	this interface potentially.
 *
 *	LATER: length must be adjusted by pad at tail, when it is required.
 *
 *	ip_append_data() sets up the socket's cork state when the write
 *	queue is empty (taking over the route in *rtp); otherwise the data
 *	continues the pending datagram and @transhdrlen is ignored.
 *	With MSG_PROBE in @flags nothing is queued and 0 is returned.
 *	Returns 0 on success or a negative errno.
 */
int ip_append_data(struct sock *sk, struct flowi4 *fl4,
		   int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
		   void *from, int length, int transhdrlen,
		   struct ipcm_cookie *ipc, struct rtable **rtp,
		   unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	int err;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue)) {
		err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
		if (err)
			return err;
	} else {
		/* Only the first piece carries a transport header. */
		transhdrlen = 0;
	}

	return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag,
				from, length, transhdrlen, flags);
}

/*
 * Append @size bytes of @page, starting at @offset, to the datagram that
 * is already pending on the socket's write queue.  The page is referenced
 * from the skb's fragment array rather than copied.
 *
 * Returns 0 on success, or:
 *   -EPERM       the socket has hdrincl set;
 *   -EINVAL      there is no pending datagram to extend;
 *   -EOPNOTSUPP  the output device lacks NETIF_F_SG;
 *   -EMSGSIZE    the datagram would exceed 0xFFFF bytes, or the skb has
 *                no free fragment slot;
 *   -ENOBUFS     a new skb could not be allocated.
 * With MSG_PROBE in @flags nothing is done and 0 is returned.
 * On the error paths inside the loop, the bytes not appended are
 * subtracted from cork->length again.
 */
ssize_t	ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
		       int offset, size_t size, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;
	struct rtable *rt;
	struct ip_options *opt = NULL;
	struct inet_cork *cork;
	int hh_len;
	int mtu;
	int len;
	int err;
	unsigned int maxfraglen, fragheaderlen, fraggap;

	if (inet->hdrincl)
		return -EPERM;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue))
		return -EINVAL;

	cork = &inet->cork.base;
	rt = (struct rtable *)cork->dst;
	if (cork->flags & IPCORK_OPT)
		opt = cork->opt;

	if (!(rt->dst.dev->features&NETIF_F_SG))
		return -EOPNOTSUPP;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
	mtu = cork->fragsize;

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	/* Largest fragment whose payload is a multiple of 8 bytes. */
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	if (cork->length + size > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu);
		return -EMSGSIZE;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		return -EINVAL;

	cork->length += size;
	/* Mark the skb as UDP GSO when the device offers UFO. */
	if ((size + skb->len > mtu) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO)) {
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
	}


	while (size > 0) {
		int i;

		if (skb_is_gso(skb))
			len = size;
		else {

			/* Check if the remaining data fits into current packet. */
			len = mtu - skb->len;
			if (len < size)
				len = maxfraglen - skb->len;
		}
		if (len <= 0) {
			/* Tail skb is full: start a new one for the next fragment. */
			struct sk_buff *skb_prev;
			int alloclen;

			skb_prev = skb;
			/* Bytes of the previous skb beyond maxfraglen. */
			fraggap = skb_prev->len - maxfraglen;

			alloclen = fragheaderlen + hh_len + fraggap + 15;
			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
			if (unlikely(!skb)) {
				err = -ENOBUFS;
				goto error;
			}

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
			skb_reserve(skb, hh_len);

			/*
			 *	Find where to start putting bytes.
			 */
			skb_put(skb, fragheaderlen + fraggap);
			skb_reset_network_header(skb);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				/*
				 * Move the overhang from the previous skb and
				 * shift its checksum contribution with it.
				 */
				skb->csum = skb_copy_and_csum_bits(skb_prev,
								   maxfraglen,
						    skb_transport_header(skb),
								   fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		i = skb_shinfo(skb)->nr_frags;
		if (len > size)
			len = size;
		/* Extend the last fragment when possible, else add a new one. */
		if (skb_can_coalesce(skb, i, page, offset)) {
			skb_shinfo(skb)->frags[i-1].size += len;
		} else if (i < MAX_SKB_FRAGS) {
			get_page(page);
			skb_fill_page_desc(skb, i, page, offset, len);
		} else {
			err = -EMSGSIZE;
			goto error;
		}

		if (skb->ip_summed == CHECKSUM_NONE) {
			__wsum csum;
			csum = csum_page(page, offset, len);
			skb->csum = csum_block_add(skb->csum, csum, skb->len);
		}

		/* Account the new bytes on the skb and against the socket. */
		skb->len += len;
		skb->data_len += len;
		skb->truesize += len;
		atomic_add(len, &sk->sk_wmem_alloc);
		offset += len;
		size -= len;
	}
	return 0;

error:
	cork->length -= size;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}

/*
 * Release what a cork holds: clear the IPCORK_OPT flag, free the option
 * buffer and drop the route reference.  Both pointers are reset to NULL
 * afterwards.
 */
static void ip_cork_release(struct inet_cork *cork)
{
	cork->flags &= ~IPCORK_OPT;
	kfree(cork->opt);
	cork->opt = NULL;
	dst_release(cork->dst);
	cork->dst = NULL;
}

/*
 *	Combine all pending IP fragments on the socket as one IP datagram
 *	and push them out.
1273 */ 1274struct sk_buff *__ip_make_skb(struct sock *sk, 1275 struct flowi4 *fl4, 1276 struct sk_buff_head *queue, 1277 struct inet_cork *cork) 1278{ 1279 struct sk_buff *skb, *tmp_skb; 1280 struct sk_buff **tail_skb; 1281 struct inet_sock *inet = inet_sk(sk); 1282 struct net *net = sock_net(sk); 1283 struct ip_options *opt = NULL; 1284 struct rtable *rt = (struct rtable *)cork->dst; 1285 struct iphdr *iph; 1286 __be16 df = 0; 1287 __u8 ttl; 1288 1289 if ((skb = __skb_dequeue(queue)) == NULL) 1290 goto out; 1291 tail_skb = &(skb_shinfo(skb)->frag_list); 1292 1293 /* move skb->data to ip header from ext header */ 1294 if (skb->data < skb_network_header(skb)) 1295 __skb_pull(skb, skb_network_offset(skb)); 1296 while ((tmp_skb = __skb_dequeue(queue)) != NULL) { 1297 __skb_pull(tmp_skb, skb_network_header_len(skb)); 1298 *tail_skb = tmp_skb; 1299 tail_skb = &(tmp_skb->next); 1300 skb->len += tmp_skb->len; 1301 skb->data_len += tmp_skb->len; 1302 skb->truesize += tmp_skb->truesize; 1303 tmp_skb->destructor = NULL; 1304 tmp_skb->sk = NULL; 1305 } 1306 1307 /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow 1308 * to fragment the frame generated here. No matter, what transforms 1309 * how transforms change size of the packet, it will come out. 1310 */ 1311 if (inet->pmtudisc < IP_PMTUDISC_DO) 1312 skb->local_df = 1; 1313 1314 /* DF bit is set when we want to see DF on outgoing frames. 1315 * If local_df is set too, we still allow to fragment this frame 1316 * locally. 
*/ 1317 if (inet->pmtudisc >= IP_PMTUDISC_DO || 1318 (skb->len <= dst_mtu(&rt->dst) && 1319 ip_dont_fragment(sk, &rt->dst))) 1320 df = htons(IP_DF); 1321 1322 if (cork->flags & IPCORK_OPT) 1323 opt = cork->opt; 1324 1325 if (rt->rt_type == RTN_MULTICAST) 1326 ttl = inet->mc_ttl; 1327 else 1328 ttl = ip_select_ttl(inet, &rt->dst); 1329 1330 iph = (struct iphdr *)skb->data; 1331 iph->version = 4; 1332 iph->ihl = 5; 1333 iph->tos = inet->tos; 1334 iph->frag_off = df; 1335 ip_select_ident(iph, &rt->dst, sk); 1336 iph->ttl = ttl; 1337 iph->protocol = sk->sk_protocol; 1338 iph->saddr = fl4->saddr; 1339 iph->daddr = fl4->daddr; 1340 1341 if (opt) { 1342 iph->ihl += opt->optlen>>2; 1343 ip_options_build(skb, opt, cork->addr, rt, 0); 1344 } 1345 1346 skb->priority = sk->sk_priority; 1347 skb->mark = sk->sk_mark; 1348 /* 1349 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec 1350 * on dst refcount 1351 */ 1352 cork->dst = NULL; 1353 skb_dst_set(skb, &rt->dst); 1354 1355 if (iph->protocol == IPPROTO_ICMP) 1356 icmp_out_count(net, ((struct icmphdr *) 1357 skb_transport_header(skb))->type); 1358 1359 ip_cork_release(cork); 1360out: 1361 return skb; 1362} 1363 1364int ip_send_skb(struct sk_buff *skb) 1365{ 1366 struct net *net = sock_net(skb->sk); 1367 int err; 1368 1369 err = ip_local_out(skb); 1370 if (err) { 1371 if (err > 0) 1372 err = net_xmit_errno(err); 1373 if (err) 1374 IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS); 1375 } 1376 1377 return err; 1378} 1379 1380int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4) 1381{ 1382 struct sk_buff *skb; 1383 1384 skb = ip_finish_skb(sk, fl4); 1385 if (!skb) 1386 return 0; 1387 1388 /* Netfilter gets whole the not fragmented skb. */ 1389 return ip_send_skb(skb); 1390} 1391 1392/* 1393 * Throw away all pending data on the socket. 
1394 */ 1395static void __ip_flush_pending_frames(struct sock *sk, 1396 struct sk_buff_head *queue, 1397 struct inet_cork *cork) 1398{ 1399 struct sk_buff *skb; 1400 1401 while ((skb = __skb_dequeue_tail(queue)) != NULL) 1402 kfree_skb(skb); 1403 1404 ip_cork_release(cork); 1405} 1406 1407void ip_flush_pending_frames(struct sock *sk) 1408{ 1409 __ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base); 1410} 1411 1412struct sk_buff *ip_make_skb(struct sock *sk, 1413 struct flowi4 *fl4, 1414 int getfrag(void *from, char *to, int offset, 1415 int len, int odd, struct sk_buff *skb), 1416 void *from, int length, int transhdrlen, 1417 struct ipcm_cookie *ipc, struct rtable **rtp, 1418 unsigned int flags) 1419{ 1420 struct inet_cork cork; 1421 struct sk_buff_head queue; 1422 int err; 1423 1424 if (flags & MSG_PROBE) 1425 return NULL; 1426 1427 __skb_queue_head_init(&queue); 1428 1429 cork.flags = 0; 1430 cork.addr = 0; 1431 cork.opt = NULL; 1432 err = ip_setup_cork(sk, &cork, ipc, rtp); 1433 if (err) 1434 return ERR_PTR(err); 1435 1436 err = __ip_append_data(sk, fl4, &queue, &cork, getfrag, 1437 from, length, transhdrlen, flags); 1438 if (err) { 1439 __ip_flush_pending_frames(sk, &queue, &cork); 1440 return ERR_PTR(err); 1441 } 1442 1443 return __ip_make_skb(sk, fl4, &queue, &cork); 1444} 1445 1446/* 1447 * Fetch data from kernel space and fill in checksum if needed. 1448 */ 1449static int ip_reply_glue_bits(void *dptr, char *to, int offset, 1450 int len, int odd, struct sk_buff *skb) 1451{ 1452 __wsum csum; 1453 1454 csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0); 1455 skb->csum = csum_block_add(skb->csum, csum, odd); 1456 return 0; 1457} 1458 1459/* 1460 * Generic function to send a packet as reply to another packet. 1461 * Used to send TCP resets so far. ICMP should use this function too. 1462 * 1463 * Should run single threaded per socket because it uses the sock 1464 * structure to pass arguments. 
1465 */ 1466void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr, 1467 struct ip_reply_arg *arg, unsigned int len) 1468{ 1469 struct inet_sock *inet = inet_sk(sk); 1470 struct ip_options_data replyopts; 1471 struct ipcm_cookie ipc; 1472 struct flowi4 fl4; 1473 struct rtable *rt = skb_rtable(skb); 1474 1475 if (ip_options_echo(&replyopts.opt.opt, skb)) 1476 return; 1477 1478 ipc.addr = daddr; 1479 ipc.opt = NULL; 1480 ipc.tx_flags = 0; 1481 1482 if (replyopts.opt.opt.optlen) { 1483 ipc.opt = &replyopts.opt; 1484 1485 if (replyopts.opt.opt.srr) 1486 daddr = replyopts.opt.opt.faddr; 1487 } 1488 1489 flowi4_init_output(&fl4, arg->bound_dev_if, 0, 1490 RT_TOS(ip_hdr(skb)->tos), 1491 RT_SCOPE_UNIVERSE, sk->sk_protocol, 1492 ip_reply_arg_flowi_flags(arg), 1493 daddr, rt->rt_spec_dst, 1494 tcp_hdr(skb)->source, tcp_hdr(skb)->dest); 1495 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); 1496 rt = ip_route_output_key(sock_net(sk), &fl4); 1497 if (IS_ERR(rt)) 1498 return; 1499 1500 /* And let IP do all the hard work. 1501 1502 This chunk is not reenterable, hence spinlock. 1503 Note that it uses the fact, that this function is called 1504 with locally disabled BH and that sk cannot be already spinlocked. 
1505 */ 1506 bh_lock_sock(sk); 1507 inet->tos = ip_hdr(skb)->tos; 1508 sk->sk_priority = skb->priority; 1509 sk->sk_protocol = ip_hdr(skb)->protocol; 1510 sk->sk_bound_dev_if = arg->bound_dev_if; 1511 ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0, 1512 &ipc, &rt, MSG_DONTWAIT); 1513 if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) { 1514 if (arg->csumoffset >= 0) 1515 *((__sum16 *)skb_transport_header(skb) + 1516 arg->csumoffset) = csum_fold(csum_add(skb->csum, 1517 arg->csum)); 1518 skb->ip_summed = CHECKSUM_NONE; 1519 ip_push_pending_frames(sk, &fl4); 1520 } 1521 1522 bh_unlock_sock(sk); 1523 1524 ip_rt_put(rt); 1525} 1526 1527void __init ip_init(void) 1528{ 1529 ip_rt_init(); 1530 inet_initpeers(); 1531 1532#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS) 1533 igmp_mc_proc_init(); 1534#endif 1535} 1536