nf_conntrack_reasm.c revision e97c3e278e951501c2f385de70c3ceacdea78c4a
/*
 * IPv6 fragment reassembly for connection tracking
 *
 * Copyright (C)2004 USAGI/WIDE Project
 *
 * Author:
 *	Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
 *
 * Based on: net/ipv6/reassembly.c
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */

#include <linux/errno.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/jiffies.h>
#include <linux/net.h>
#include <linux/list.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/ipv6.h>
#include <linux/icmpv6.h>
#include <linux/random.h>
#include <linux/slab.h>

#include <net/sock.h>
#include <net/snmp.h>
#include <net/inet_frag.h>

#include <net/ipv6.h>
#include <net/protocol.h>
#include <net/transp_v6.h>
#include <net/rawv6.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
#include <linux/sysctl.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>
#include <linux/kernel.h>
#include <linux/module.h>


struct nf_ct_frag6_skb_cb
{
	struct inet6_skb_parm	h;
	int			offset;
	struct sk_buff		*orig;
};

#define NFCT_FRAG6_CB(skb)	((struct nf_ct_frag6_skb_cb *)((skb)->cb))

struct nf_ct_frag6_queue
{
	struct inet_frag_queue	q;

	__be32			id;		/* fragment id		*/
	u32			user;
	struct in6_addr		saddr;
	struct in6_addr		daddr;

	unsigned int		csum;
	__u16			nhoffset;
};

static struct inet_frags nf_frags;
static struct netns_frags nf_init_frags;

#ifdef CONFIG_SYSCTL
struct ctl_table nf_ct_frag6_sysctl_table[] = {
	{
		.procname	= "nf_conntrack_frag6_timeout",
		.data		= &nf_init_frags.timeout,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "nf_conntrack_frag6_low_thresh",
		.data		= &nf_init_frags.low_thresh,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "nf_conntrack_frag6_high_thresh",
		.data		= &nf_init_frags.high_thresh,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};

static struct ctl_table_header *nf_ct_frag6_sysctl_header;
#endif

static unsigned int nf_hashfn(struct inet_frag_queue *q)
{
	const struct nf_ct_frag6_queue *nq;

	nq = container_of(q, struct nf_ct_frag6_queue, q);
	return inet6_hash_frag(nq->id, &nq->saddr, &nq->daddr, nf_frags.rnd);
}

static void nf_skb_free(struct sk_buff *skb)
{
	if (NFCT_FRAG6_CB(skb)->orig)
		kfree_skb(NFCT_FRAG6_CB(skb)->orig);
}
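/* The cb overlay above is how this file tracks fragments: the defrag
 * path works on clones, and each queued clone remembers its reassembly
 * offset plus a pointer back to the original skb in the skb->cb scratch
 * area.  A minimal sketch of the lifecycle implemented by
 * nf_ct_frag6_gather() and nf_skb_free() below ("skb" being an inbound
 * fragment):
 *
 *	clone = skb_clone(skb, GFP_ATOMIC);
 *	NFCT_FRAG6_CB(clone)->orig = skb;
 *	... clone sits on fq->q.fragments until reassembly ...
 *	kfree_skb(NFCT_FRAG6_CB(clone)->orig);	(done by nf_skb_free()
 *						 if the queue dies first)
 */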
/* Destruction primitives. */

static __inline__ void fq_put(struct nf_ct_frag6_queue *fq)
{
	inet_frag_put(&fq->q, &nf_frags);
}

/* Kill fq entry. It is not destroyed immediately,
 * because the caller (and someone else) still holds a reference.
 */
static __inline__ void fq_kill(struct nf_ct_frag6_queue *fq)
{
	inet_frag_kill(&fq->q, &nf_frags);
}

static void nf_ct_frag6_evictor(void)
{
	local_bh_disable();
	inet_frag_evictor(&nf_init_frags, &nf_frags);
	local_bh_enable();
}

static void nf_ct_frag6_expire(unsigned long data)
{
	struct nf_ct_frag6_queue *fq;

	fq = container_of((struct inet_frag_queue *)data,
			  struct nf_ct_frag6_queue, q);

	spin_lock(&fq->q.lock);

	if (fq->q.last_in & INET_FRAG_COMPLETE)
		goto out;

	fq_kill(fq);

out:
	spin_unlock(&fq->q.lock);
	fq_put(fq);
}

/* Creation primitives. */

static __inline__ struct nf_ct_frag6_queue *
fq_find(__be32 id, u32 user, struct in6_addr *src, struct in6_addr *dst)
{
	struct inet_frag_queue *q;
	struct ip6_create_arg arg;
	unsigned int hash;

	arg.id = id;
	arg.user = user;
	arg.src = src;
	arg.dst = dst;

	read_lock_bh(&nf_frags.lock);
	hash = inet6_hash_frag(id, src, dst, nf_frags.rnd);

	/* inet_frag_find() drops nf_frags.lock before returning, so only
	 * bottom halves need to be re-enabled here.
	 */
	q = inet_frag_find(&nf_init_frags, &nf_frags, &arg, hash);
	local_bh_enable();
	if (q == NULL)
		goto oom;

	return container_of(q, struct nf_ct_frag6_queue, q);

oom:
	pr_debug("Can't alloc new queue\n");
	return NULL;
}
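/* A reassembly queue is keyed on (fragment id, defrag user, source,
 * destination); the comparison itself is ip6_frag_match(), shared with
 * net/ipv6/reassembly.c and wired up in nf_ct_frag6_init() below.
 * Roughly, a fragment lands in an existing queue iff:
 *
 *	fq->id == arg->id && fq->user == arg->user &&
 *	ipv6_addr_equal(&fq->saddr, arg->src) &&
 *	ipv6_addr_equal(&fq->daddr, arg->dst)
 */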
static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb,
			     const struct frag_hdr *fhdr, int nhoff)
{
	struct sk_buff *prev, *next;
	int offset, end;

	if (fq->q.last_in & INET_FRAG_COMPLETE) {
		pr_debug("Already completed\n");
		goto err;
	}

	offset = ntohs(fhdr->frag_off) & ~0x7;
	end = offset + (ntohs(ipv6_hdr(skb)->payload_len) -
			((u8 *)(fhdr + 1) - (u8 *)(ipv6_hdr(skb) + 1)));

	if ((unsigned int)end > IPV6_MAXPLEN) {
		pr_debug("offset is too large.\n");
		return -1;
	}

	if (skb->ip_summed == CHECKSUM_COMPLETE) {
		const unsigned char *nh = skb_network_header(skb);
		skb->csum = csum_sub(skb->csum,
				     csum_partial(nh, (u8 *)(fhdr + 1) - nh,
						  0));
	}

	/* Is this the final fragment? */
	if (!(fhdr->frag_off & htons(IP6_MF))) {
		/* If we already have some bits beyond end
		 * or have different end, the segment is corrupted.
		 */
		if (end < fq->q.len ||
		    ((fq->q.last_in & INET_FRAG_LAST_IN) && end != fq->q.len)) {
			pr_debug("already received last fragment\n");
			goto err;
		}
		fq->q.last_in |= INET_FRAG_LAST_IN;
		fq->q.len = end;
	} else {
		/* Check if the fragment is rounded to 8 bytes.
		 * Required by the RFC.
		 */
		if (end & 0x7) {
			/* RFC2460 says always send parameter problem in
			 * this case. -DaveM
			 */
			pr_debug("end of fragment not rounded to 8 bytes.\n");
			return -1;
		}
		if (end > fq->q.len) {
			/* Some bits beyond end -> corruption. */
			if (fq->q.last_in & INET_FRAG_LAST_IN) {
				pr_debug("last packet already reached.\n");
				goto err;
			}
			fq->q.len = end;
		}
	}

	if (end == offset)
		goto err;

	/* Point into the IP datagram 'data' part. */
	if (!pskb_pull(skb, (u8 *) (fhdr + 1) - skb->data)) {
		pr_debug("queue: message is too short.\n");
		goto err;
	}
	if (pskb_trim_rcsum(skb, end - offset)) {
		pr_debug("Can't trim\n");
		goto err;
	}

	/* Find out which fragments are in front and at the back of us
	 * in the chain of fragments so far.  We must know where to put
	 * this fragment, right?
	 */
	prev = fq->q.fragments_tail;
	if (!prev || NFCT_FRAG6_CB(prev)->offset < offset) {
		next = NULL;
		goto found;
	}
	prev = NULL;
	for (next = fq->q.fragments; next != NULL; next = next->next) {
		if (NFCT_FRAG6_CB(next)->offset >= offset)
			break;	/* bingo! */
		prev = next;
	}

found:
	/* RFC5722, Section 4:
	 *				When reassembling an IPv6 datagram, if
	 *   one or more of its constituent fragments is determined to be an
	 *   overlapping fragment, the entire datagram (and any constituent
	 *   fragments, including those not yet received) MUST be silently
	 *   discarded.
	 */

	/* Check for overlap with the preceding fragment. */
	if (prev &&
	    (NFCT_FRAG6_CB(prev)->offset + prev->len) - offset > 0)
		goto discard_fq;

	/* Look for overlap with the succeeding segment. */
	if (next && NFCT_FRAG6_CB(next)->offset < end)
		goto discard_fq;

	NFCT_FRAG6_CB(skb)->offset = offset;

	/* Insert this fragment in the chain of fragments. */
	skb->next = next;
	if (!next)
		fq->q.fragments_tail = skb;
	if (prev)
		prev->next = skb;
	else
		fq->q.fragments = skb;

	skb->dev = NULL;
	fq->q.stamp = skb->tstamp;
	fq->q.meat += skb->len;
	atomic_add(skb->truesize, &nf_init_frags.mem);

	/* The first fragment.
	 * nhoffset is obtained from the first fragment, of course.
	 */
	if (offset == 0) {
		fq->nhoffset = nhoff;
		fq->q.last_in |= INET_FRAG_FIRST_IN;
	}
	write_lock(&nf_frags.lock);
	list_move_tail(&fq->q.lru_list, &nf_init_frags.lru_list);
	write_unlock(&nf_frags.lock);
	return 0;

discard_fq:
	fq_kill(fq);
err:
	return -1;
}
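/* Worked example for the offset/end bookkeeping in nf_ct_frag6_queue()
 * (hypothetical numbers): a sender behind a 1500-byte MTU splits a
 * 2000-byte payload into a first fragment carrying bytes 0..1447
 * (frag_off = 0, MF set) and a last one carrying bytes 1448..1999
 * (frag_off = 1448, MF clear).  For the second fragment:
 *
 *	offset = ntohs(fhdr->frag_off) & ~0x7	-> 1448
 *	end    = offset + bytes after fhdr	-> 2000
 *
 * MF is clear, so fq->q.len becomes 2000 and INET_FRAG_LAST_IN is set;
 * nf_ct_frag6_gather() triggers reassembly once fq->q.meat (the number
 * of bytes actually queued) also reaches 2000.  Any fragment whose
 * [offset, end) range intersects an already-queued fragment kills the
 * whole queue, per the RFC 5722 rule quoted above.
 */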
/*
 *	Check if this packet is complete.
 *	Returns NULL on failure for any reason, or a pointer to the
 *	reassembled sk_buff on success.
 *
 *	It is called with the fq lock held, and the caller must check that
 *	the queue is eligible for reassembly, i.e. it is not COMPLETE and
 *	both the first and the last fragments have arrived and all the
 *	bits are here.
 */
static struct sk_buff *
nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev)
{
	struct sk_buff *fp, *op, *head = fq->q.fragments;
	int payload_len;

	fq_kill(fq);

	WARN_ON(head == NULL);
	WARN_ON(NFCT_FRAG6_CB(head)->offset != 0);

	/* Unfragmented part is taken from the first segment. */
	payload_len = ((head->data - skb_network_header(head)) -
		       sizeof(struct ipv6hdr) + fq->q.len -
		       sizeof(struct frag_hdr));
	if (payload_len > IPV6_MAXPLEN) {
		pr_debug("payload len is too large.\n");
		goto out_oversize;
	}

	/* Head of list must not be cloned. */
	if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC)) {
		pr_debug("skb is cloned but can't expand head");
		goto out_oom;
	}

	/* If the first fragment is fragmented itself, we split
	 * it into two chunks: the first with data and paged part
	 * and the second, holding only fragments.
	 */
	if (skb_has_frag_list(head)) {
		struct sk_buff *clone;
		int i, plen = 0;

		clone = alloc_skb(0, GFP_ATOMIC);
		if (clone == NULL) {
			pr_debug("Can't alloc skb\n");
			goto out_oom;
		}
		clone->next = head->next;
		head->next = clone;
		skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
		skb_frag_list_init(head);
		for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
			plen += skb_shinfo(head)->frags[i].size;
		clone->len = clone->data_len = head->data_len - plen;
		head->data_len -= clone->len;
		head->len -= clone->len;
		clone->csum = 0;
		clone->ip_summed = head->ip_summed;

		NFCT_FRAG6_CB(clone)->orig = NULL;
		atomic_add(clone->truesize, &nf_init_frags.mem);
	}

	/* We have to remove the fragment header from the datagram and to
	 * relocate the header in order to calculate the ICV correctly. */
	skb_network_header(head)[fq->nhoffset] = skb_transport_header(head)[0];
	memmove(head->head + sizeof(struct frag_hdr), head->head,
		(head->data - head->head) - sizeof(struct frag_hdr));
	head->mac_header += sizeof(struct frag_hdr);
	head->network_header += sizeof(struct frag_hdr);

	skb_shinfo(head)->frag_list = head->next;
	skb_reset_transport_header(head);
	skb_push(head, head->data - skb_network_header(head));

	for (fp = head->next; fp; fp = fp->next) {
		head->data_len += fp->len;
		head->len += fp->len;
		if (head->ip_summed != fp->ip_summed)
			head->ip_summed = CHECKSUM_NONE;
		else if (head->ip_summed == CHECKSUM_COMPLETE)
			head->csum = csum_add(head->csum, fp->csum);
		head->truesize += fp->truesize;
	}
	atomic_sub(head->truesize, &nf_init_frags.mem);

	head->next = NULL;
	head->dev = dev;
	head->tstamp = fq->q.stamp;
	ipv6_hdr(head)->payload_len = htons(payload_len);

	/* Yes, and fold redundant checksum back. 8) */
	if (head->ip_summed == CHECKSUM_COMPLETE)
		head->csum = csum_partial(skb_network_header(head),
					  skb_network_header_len(head),
					  head->csum);

	fq->q.fragments = NULL;
	fq->q.fragments_tail = NULL;

	/* All original skbs are linked into the NFCT_FRAG6_CB(head).orig
	 * chain. */
	fp = skb_shinfo(head)->frag_list;
	if (fp && NFCT_FRAG6_CB(fp)->orig == NULL)
		/* The head skb was divided into two skbs above. */
		fp = fp->next;

	op = NFCT_FRAG6_CB(head)->orig;
	for (; fp; fp = fp->next) {
		struct sk_buff *orig = NFCT_FRAG6_CB(fp)->orig;

		op->next = orig;
		op = orig;
		NFCT_FRAG6_CB(fp)->orig = NULL;
	}

	return head;

out_oversize:
	if (net_ratelimit())
		printk(KERN_DEBUG "nf_ct_frag6_reasm: payload len = %d\n",
		       payload_len);
	goto out_fail;
out_oom:
	if (net_ratelimit())
		printk(KERN_DEBUG "nf_ct_frag6_reasm: no memory for reassembly\n");
out_fail:
	return NULL;
}
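/* Unpacking the payload_len computation above with hypothetical
 * numbers: if the head fragment carries no extension headers besides
 * the fragment header, head->data (already pulled past the fragment
 * header by nf_ct_frag6_queue()) sits 48 bytes past the network
 * header.  With fq->q.len = 2000 reassembled data bytes:
 *
 *	payload_len = (48 - 40) + 2000 - 8 = 2000
 *
 * which is exactly what belongs in ipv6_hdr(head)->payload_len once
 * the 8-byte fragment header has been stripped by the memmove() that
 * follows.
 */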
/*
 *	Find the header just before the Fragment Header.
 *
 *	On success, return 0 and set:
 *	(*prevhdrp): the value of the "Next Header" field in the header
 *		     just before the Fragment Header.
 *	(*prevhoff): the offset of the "Next Header" field in the header
 *		     just before the Fragment Header.
 *	(*fhoff)   : the offset of the Fragment Header.
 *
 *	Based on ipv6_skip_exthdr() in net/ipv6/exthdrs_core.c
 *
 */
static int
find_prev_fhdr(struct sk_buff *skb, u8 *prevhdrp, int *prevhoff, int *fhoff)
{
	u8 nexthdr = ipv6_hdr(skb)->nexthdr;
	const int netoff = skb_network_offset(skb);
	u8 prev_nhoff = netoff + offsetof(struct ipv6hdr, nexthdr);
	int start = netoff + sizeof(struct ipv6hdr);
	int len = skb->len - start;
	u8 prevhdr = NEXTHDR_IPV6;

	while (nexthdr != NEXTHDR_FRAGMENT) {
		struct ipv6_opt_hdr hdr;
		int hdrlen;

		if (!ipv6_ext_hdr(nexthdr)) {
			return -1;
		}
		if (nexthdr == NEXTHDR_NONE) {
			pr_debug("next header is none\n");
			return -1;
		}
		if (len < (int)sizeof(struct ipv6_opt_hdr)) {
			pr_debug("too short\n");
			return -1;
		}
		if (skb_copy_bits(skb, start, &hdr, sizeof(hdr)))
			BUG();
		if (nexthdr == NEXTHDR_AUTH)
			hdrlen = (hdr.hdrlen + 2) << 2;
		else
			hdrlen = ipv6_optlen(&hdr);

		prevhdr = nexthdr;
		prev_nhoff = start;

		nexthdr = hdr.nexthdr;
		len -= hdrlen;
		start += hdrlen;
	}

	if (len < 0)
		return -1;

	*prevhdrp = prevhdr;
	*prevhoff = prev_nhoff;
	*fhoff = start;

	return 0;
}

struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb, u32 user)
{
	struct sk_buff *clone;
	struct net_device *dev = skb->dev;
	struct frag_hdr *fhdr;
	struct nf_ct_frag6_queue *fq;
	struct ipv6hdr *hdr;
	int fhoff, nhoff;
	u8 prevhdr;
	struct sk_buff *ret_skb = NULL;

	/* Jumbo payload inhibits frag. header */
	if (ipv6_hdr(skb)->payload_len == 0) {
		pr_debug("payload len = 0\n");
		return skb;
	}

	if (find_prev_fhdr(skb, &prevhdr, &nhoff, &fhoff) < 0)
		return skb;

	clone = skb_clone(skb, GFP_ATOMIC);
	if (clone == NULL) {
		pr_debug("Can't clone skb\n");
		return skb;
	}

	NFCT_FRAG6_CB(clone)->orig = skb;

	if (!pskb_may_pull(clone, fhoff + sizeof(*fhdr))) {
		pr_debug("message is too short.\n");
		goto ret_orig;
	}

	skb_set_transport_header(clone, fhoff);
	hdr = ipv6_hdr(clone);
	fhdr = (struct frag_hdr *)skb_transport_header(clone);

	/* Over the high threshold: shrink the LRU before queueing more. */
	if (atomic_read(&nf_init_frags.mem) > nf_init_frags.high_thresh)
		nf_ct_frag6_evictor();

	fq = fq_find(fhdr->identification, user, &hdr->saddr, &hdr->daddr);
	if (fq == NULL) {
		pr_debug("Can't find and can't create new queue\n");
		goto ret_orig;
	}

	spin_lock_bh(&fq->q.lock);

	if (nf_ct_frag6_queue(fq, clone, fhdr, nhoff) < 0) {
		spin_unlock_bh(&fq->q.lock);
		pr_debug("Can't insert skb to queue\n");
		fq_put(fq);
		goto ret_orig;
	}

	if (fq->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
	    fq->q.meat == fq->q.len) {
		ret_skb = nf_ct_frag6_reasm(fq, dev);
		if (ret_skb == NULL)
			pr_debug("Can't reassemble fragmented packets\n");
	}
	spin_unlock_bh(&fq->q.lock);

	fq_put(fq);
	return ret_skb;

ret_orig:
	kfree_skb(clone);
	return skb;
}

void nf_ct_frag6_output(unsigned int hooknum, struct sk_buff *skb,
			struct net_device *in, struct net_device *out,
			int (*okfn)(struct sk_buff *))
{
	struct sk_buff *s, *s2;

	for (s = NFCT_FRAG6_CB(skb)->orig; s;) {
		nf_conntrack_put_reasm(s->nfct_reasm);
		nf_conntrack_get_reasm(skb);
		s->nfct_reasm = skb;

		s2 = s->next;
		s->next = NULL;

		NF_HOOK_THRESH(NFPROTO_IPV6, hooknum, s, in, out, okfn,
			       NF_IP6_PRI_CONNTRACK_DEFRAG + 1);
		s = s2;
	}
	nf_conntrack_put_reasm(skb);
}
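/* Re-injection note: the loop above hands every original fragment back
 * to the IPv6 hooks at priority NF_IP6_PRI_CONNTRACK_DEFRAG + 1, i.e.
 * immediately after the defrag hook itself, with s->nfct_reasm pointing
 * at the reassembled datagram.  Conntrack and all later hooks therefore
 * see the individual fragments, each carrying a reference to the full
 * reassembled packet; the final nf_conntrack_put_reasm() releases the
 * reference passed in by the caller, since each fragment now holds its
 * own via nf_conntrack_get_reasm().
 */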
int nf_ct_frag6_init(void)
{
	nf_frags.hashfn = nf_hashfn;
	nf_frags.constructor = ip6_frag_init;
	nf_frags.destructor = NULL;
	nf_frags.skb_free = nf_skb_free;
	nf_frags.qsize = sizeof(struct nf_ct_frag6_queue);
	nf_frags.match = ip6_frag_match;
	nf_frags.frag_expire = nf_ct_frag6_expire;
	nf_frags.secret_interval = 10 * 60 * HZ;
	nf_init_frags.timeout = IPV6_FRAG_TIMEOUT;
	nf_init_frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
	nf_init_frags.low_thresh = IPV6_FRAG_LOW_THRESH;
	inet_frags_init_net(&nf_init_frags);
	inet_frags_init(&nf_frags);

#ifdef CONFIG_SYSCTL
	nf_ct_frag6_sysctl_header = register_sysctl_paths(nf_net_netfilter_sysctl_path,
							  nf_ct_frag6_sysctl_table);
	if (!nf_ct_frag6_sysctl_header) {
		inet_frags_fini(&nf_frags);
		return -ENOMEM;
	}
#endif

	return 0;
}

void nf_ct_frag6_cleanup(void)
{
#ifdef CONFIG_SYSCTL
	unregister_sysctl_table(nf_ct_frag6_sysctl_header);
	nf_ct_frag6_sysctl_header = NULL;
#endif

	inet_frags_fini(&nf_frags);

	/* Dropping low_thresh to 0 makes the evictor flush every
	 * remaining queue. */
	nf_init_frags.low_thresh = 0;
	nf_ct_frag6_evictor();
}
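/* Usage sketch: how a netfilter hook is expected to drive this API.
 * This mirrors the ipv6_defrag hook in nf_defrag_ipv6_hooks.c, but the
 * code below is a simplified, hypothetical illustration rather than a
 * copy of that file:
 *
 *	static unsigned int ipv6_defrag_sketch(unsigned int hooknum,
 *					       struct sk_buff *skb,
 *					       const struct net_device *in,
 *					       const struct net_device *out,
 *					       int (*okfn)(struct sk_buff *))
 *	{
 *		struct sk_buff *reasm;
 *
 *		reasm = nf_ct_frag6_gather(skb, IP6_DEFRAG_CONNTRACK_IN);
 *		if (reasm == NULL)	// fragment queued, datagram incomplete
 *			return NF_STOLEN;
 *		if (reasm == skb)	// not fragmented, or defrag failed
 *			return NF_ACCEPT;
 *
 *		// reassembly finished: re-inject the original fragments
 *		nf_ct_frag6_output(hooknum, reasm, (struct net_device *)in,
 *				   (struct net_device *)out, okfn);
 *		return NF_STOLEN;
 *	}
 */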