ipoib_main.c revision 9217b27b12eb5ab910d14b3376c2b6cd13d87711
1/* 2 * Copyright (c) 2004 Topspin Communications. All rights reserved. 3 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. 4 * Copyright (c) 2004 Voltaire, Inc. All rights reserved. 5 * 6 * This software is available to you under a choice of one of two 7 * licenses. You may choose to be licensed under the terms of the GNU 8 * General Public License (GPL) Version 2, available from the file 9 * COPYING in the main directory of this source tree, or the 10 * OpenIB.org BSD license below: 11 * 12 * Redistribution and use in source and binary forms, with or 13 * without modification, are permitted provided that the following 14 * conditions are met: 15 * 16 * - Redistributions of source code must retain the above 17 * copyright notice, this list of conditions and the following 18 * disclaimer. 19 * 20 * - Redistributions in binary form must reproduce the above 21 * copyright notice, this list of conditions and the following 22 * disclaimer in the documentation and/or other materials 23 * provided with the distribution. 24 * 25 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 26 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 27 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 28 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 29 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 30 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 31 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 32 * SOFTWARE. 33 * 34 * $Id: ipoib_main.c 1377 2004-12-23 19:57:12Z roland $ 35 */ 36 37#include "ipoib.h" 38 39#include <linux/module.h> 40 41#include <linux/init.h> 42#include <linux/slab.h> 43#include <linux/vmalloc.h> 44#include <linux/kernel.h> 45 46#include <linux/if_arp.h> /* For ARPHRD_xxx */ 47 48#include <linux/ip.h> 49#include <linux/in.h> 50 51#include <net/dst.h> 52 53MODULE_AUTHOR("Roland Dreier"); 54MODULE_DESCRIPTION("IP-over-InfiniBand net driver"); 55MODULE_LICENSE("Dual BSD/GPL"); 56 57int ipoib_sendq_size __read_mostly = IPOIB_TX_RING_SIZE; 58int ipoib_recvq_size __read_mostly = IPOIB_RX_RING_SIZE; 59 60module_param_named(send_queue_size, ipoib_sendq_size, int, 0444); 61MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue"); 62module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444); 63MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue"); 64 65#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG 66int ipoib_debug_level; 67 68module_param_named(debug_level, ipoib_debug_level, int, 0644); 69MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0"); 70#endif 71 72struct ipoib_path_iter { 73 struct net_device *dev; 74 struct ipoib_path path; 75}; 76 77static const u8 ipv4_bcast_addr[] = { 78 0x00, 0xff, 0xff, 0xff, 79 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00, 80 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff 81}; 82 83struct workqueue_struct *ipoib_workqueue; 84 85static void ipoib_add_one(struct ib_device *device); 86static void ipoib_remove_one(struct ib_device *device); 87 88static struct ib_client ipoib_client = { 89 .name = "ipoib", 90 .add = ipoib_add_one, 91 .remove = ipoib_remove_one 92}; 93 94int ipoib_open(struct net_device *dev) 95{ 96 struct ipoib_dev_priv *priv = netdev_priv(dev); 97 98 ipoib_dbg(priv, "bringing up interface\n"); 99 100 set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); 101 102 if (ipoib_pkey_dev_delay_open(dev)) 103 return 0; 104 105 if (ipoib_ib_dev_open(dev)) 106 return -EINVAL; 107 108 if (ipoib_ib_dev_up(dev)) { 109 ipoib_ib_dev_stop(dev); 110 return -EINVAL; 111 } 112 113 if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { 114 struct ipoib_dev_priv *cpriv; 115 116 /* Bring up any child interfaces too */ 117 mutex_lock(&priv->vlan_mutex); 118 list_for_each_entry(cpriv, &priv->child_intfs, list) { 119 int flags; 120 121 flags = cpriv->dev->flags; 122 if (flags & IFF_UP) 123 continue; 124 125 dev_change_flags(cpriv->dev, flags | IFF_UP); 126 } 127 mutex_unlock(&priv->vlan_mutex); 128 } 129 130 netif_start_queue(dev); 131 132 return 0; 133} 134 135static int ipoib_stop(struct net_device *dev) 136{ 137 struct ipoib_dev_priv *priv = netdev_priv(dev); 138 139 ipoib_dbg(priv, "stopping interface\n"); 140 141 clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); 142 143 netif_stop_queue(dev); 144 145 /* 146 * Now flush workqueue to make sure a scheduled task doesn't 147 * bring our internal state back up. 148 */ 149 flush_workqueue(ipoib_workqueue); 150 151 ipoib_ib_dev_down(dev, 1); 152 ipoib_ib_dev_stop(dev); 153 154 if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { 155 struct ipoib_dev_priv *cpriv; 156 157 /* Bring down any child interfaces too */ 158 mutex_lock(&priv->vlan_mutex); 159 list_for_each_entry(cpriv, &priv->child_intfs, list) { 160 int flags; 161 162 flags = cpriv->dev->flags; 163 if (!(flags & IFF_UP)) 164 continue; 165 166 dev_change_flags(cpriv->dev, flags & ~IFF_UP); 167 } 168 mutex_unlock(&priv->vlan_mutex); 169 } 170 171 return 0; 172} 173 174static int ipoib_change_mtu(struct net_device *dev, int new_mtu) 175{ 176 struct ipoib_dev_priv *priv = netdev_priv(dev); 177 178 if (new_mtu > IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN) 179 return -EINVAL; 180 181 priv->admin_mtu = new_mtu; 182 183 dev->mtu = min(priv->mcast_mtu, priv->admin_mtu); 184 185 return 0; 186} 187 188static struct ipoib_path *__path_find(struct net_device *dev, void *gid) 189{ 190 struct ipoib_dev_priv *priv = netdev_priv(dev); 191 struct rb_node *n = priv->path_tree.rb_node; 192 struct ipoib_path *path; 193 int ret; 194 195 while (n) { 196 path = rb_entry(n, struct ipoib_path, rb_node); 197 198 ret = memcmp(gid, path->pathrec.dgid.raw, 199 sizeof (union ib_gid)); 200 201 if (ret < 0) 202 n = n->rb_left; 203 else if (ret > 0) 204 n = n->rb_right; 205 else 206 return path; 207 } 208 209 return NULL; 210} 211 212static int __path_add(struct net_device *dev, struct ipoib_path *path) 213{ 214 struct ipoib_dev_priv *priv = netdev_priv(dev); 215 struct rb_node **n = &priv->path_tree.rb_node; 216 struct rb_node *pn = NULL; 217 struct ipoib_path *tpath; 218 int ret; 219 220 while (*n) { 221 pn = *n; 222 tpath = rb_entry(pn, struct ipoib_path, rb_node); 223 224 ret = memcmp(path->pathrec.dgid.raw, tpath->pathrec.dgid.raw, 225 sizeof (union ib_gid)); 226 if (ret < 0) 227 n = &pn->rb_left; 228 else if (ret > 0) 229 n = &pn->rb_right; 230 else 231 return -EEXIST; 232 } 233 234 rb_link_node(&path->rb_node, pn, n); 235 rb_insert_color(&path->rb_node, &priv->path_tree); 236 237 list_add_tail(&path->list, &priv->path_list); 238 239 return 0; 240} 241 242static void path_free(struct net_device *dev, struct ipoib_path *path) 243{ 244 struct ipoib_dev_priv *priv = netdev_priv(dev); 245 struct ipoib_neigh *neigh, *tn; 246 struct sk_buff *skb; 247 unsigned long flags; 248 249 while ((skb = __skb_dequeue(&path->queue))) 250 dev_kfree_skb_irq(skb); 251 252 spin_lock_irqsave(&priv->lock, flags); 253 254 list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) { 255 /* 256 * It's safe to call ipoib_put_ah() inside priv->lock 257 * here, because we know that path->ah will always 258 * hold one more reference, so ipoib_put_ah() will 259 * never do more than decrement the ref count. 260 */ 261 if (neigh->ah) 262 ipoib_put_ah(neigh->ah); 263 264 ipoib_neigh_free(neigh); 265 } 266 267 spin_unlock_irqrestore(&priv->lock, flags); 268 269 if (path->ah) 270 ipoib_put_ah(path->ah); 271 272 kfree(path); 273} 274 275#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG 276 277struct ipoib_path_iter *ipoib_path_iter_init(struct net_device *dev) 278{ 279 struct ipoib_path_iter *iter; 280 281 iter = kmalloc(sizeof *iter, GFP_KERNEL); 282 if (!iter) 283 return NULL; 284 285 iter->dev = dev; 286 memset(iter->path.pathrec.dgid.raw, 0, 16); 287 288 if (ipoib_path_iter_next(iter)) { 289 kfree(iter); 290 return NULL; 291 } 292 293 return iter; 294} 295 296int ipoib_path_iter_next(struct ipoib_path_iter *iter) 297{ 298 struct ipoib_dev_priv *priv = netdev_priv(iter->dev); 299 struct rb_node *n; 300 struct ipoib_path *path; 301 int ret = 1; 302 303 spin_lock_irq(&priv->lock); 304 305 n = rb_first(&priv->path_tree); 306 307 while (n) { 308 path = rb_entry(n, struct ipoib_path, rb_node); 309 310 if (memcmp(iter->path.pathrec.dgid.raw, path->pathrec.dgid.raw, 311 sizeof (union ib_gid)) < 0) { 312 iter->path = *path; 313 ret = 0; 314 break; 315 } 316 317 n = rb_next(n); 318 } 319 320 spin_unlock_irq(&priv->lock); 321 322 return ret; 323} 324 325void ipoib_path_iter_read(struct ipoib_path_iter *iter, 326 struct ipoib_path *path) 327{ 328 *path = iter->path; 329} 330 331#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */ 332 333void ipoib_flush_paths(struct net_device *dev) 334{ 335 struct ipoib_dev_priv *priv = netdev_priv(dev); 336 struct ipoib_path *path, *tp; 337 LIST_HEAD(remove_list); 338 339 spin_lock_irq(&priv->tx_lock); 340 spin_lock(&priv->lock); 341 342 list_splice(&priv->path_list, &remove_list); 343 INIT_LIST_HEAD(&priv->path_list); 344 345 list_for_each_entry(path, &remove_list, list) 346 rb_erase(&path->rb_node, &priv->path_tree); 347 348 list_for_each_entry_safe(path, tp, &remove_list, list) { 349 if (path->query) 350 ib_sa_cancel_query(path->query_id, path->query); 351 spin_unlock(&priv->lock); 352 spin_unlock_irq(&priv->tx_lock); 353 wait_for_completion(&path->done); 354 path_free(dev, path); 355 spin_lock_irq(&priv->tx_lock); 356 spin_lock(&priv->lock); 357 } 358 spin_unlock(&priv->lock); 359 spin_unlock_irq(&priv->tx_lock); 360} 361 362static void path_rec_completion(int status, 363 struct ib_sa_path_rec *pathrec, 364 void *path_ptr) 365{ 366 struct ipoib_path *path = path_ptr; 367 struct net_device *dev = path->dev; 368 struct ipoib_dev_priv *priv = netdev_priv(dev); 369 struct ipoib_ah *ah = NULL; 370 struct ipoib_neigh *neigh; 371 struct sk_buff_head skqueue; 372 struct sk_buff *skb; 373 unsigned long flags; 374 375 if (pathrec) 376 ipoib_dbg(priv, "PathRec LID 0x%04x for GID " IPOIB_GID_FMT "\n", 377 be16_to_cpu(pathrec->dlid), IPOIB_GID_ARG(pathrec->dgid)); 378 else 379 ipoib_dbg(priv, "PathRec status %d for GID " IPOIB_GID_FMT "\n", 380 status, IPOIB_GID_ARG(path->pathrec.dgid)); 381 382 skb_queue_head_init(&skqueue); 383 384 if (!status) { 385 struct ib_ah_attr av = { 386 .dlid = be16_to_cpu(pathrec->dlid), 387 .sl = pathrec->sl, 388 .port_num = priv->port, 389 .static_rate = pathrec->rate 390 }; 391 392 ah = ipoib_create_ah(dev, priv->pd, &av); 393 } 394 395 spin_lock_irqsave(&priv->lock, flags); 396 397 path->ah = ah; 398 399 if (ah) { 400 path->pathrec = *pathrec; 401 402 ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n", 403 ah, be16_to_cpu(pathrec->dlid), pathrec->sl); 404 405 while ((skb = __skb_dequeue(&path->queue))) 406 __skb_queue_tail(&skqueue, skb); 407 408 list_for_each_entry(neigh, &path->neigh_list, list) { 409 kref_get(&path->ah->ref); 410 neigh->ah = path->ah; 411 memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw, 412 sizeof(union ib_gid)); 413 414 while ((skb = __skb_dequeue(&neigh->queue))) 415 __skb_queue_tail(&skqueue, skb); 416 } 417 } 418 419 path->query = NULL; 420 complete(&path->done); 421 422 spin_unlock_irqrestore(&priv->lock, flags); 423 424 while ((skb = __skb_dequeue(&skqueue))) { 425 skb->dev = dev; 426 if (dev_queue_xmit(skb)) 427 ipoib_warn(priv, "dev_queue_xmit failed " 428 "to requeue packet\n"); 429 } 430} 431 432static struct ipoib_path *path_rec_create(struct net_device *dev, void *gid) 433{ 434 struct ipoib_dev_priv *priv = netdev_priv(dev); 435 struct ipoib_path *path; 436 437 path = kzalloc(sizeof *path, GFP_ATOMIC); 438 if (!path) 439 return NULL; 440 441 path->dev = dev; 442 443 skb_queue_head_init(&path->queue); 444 445 INIT_LIST_HEAD(&path->neigh_list); 446 447 memcpy(path->pathrec.dgid.raw, gid, sizeof (union ib_gid)); 448 path->pathrec.sgid = priv->local_gid; 449 path->pathrec.pkey = cpu_to_be16(priv->pkey); 450 path->pathrec.numb_path = 1; 451 452 return path; 453} 454 455static int path_rec_start(struct net_device *dev, 456 struct ipoib_path *path) 457{ 458 struct ipoib_dev_priv *priv = netdev_priv(dev); 459 460 ipoib_dbg(priv, "Start path record lookup for " IPOIB_GID_FMT "\n", 461 IPOIB_GID_ARG(path->pathrec.dgid)); 462 463 init_completion(&path->done); 464 465 path->query_id = 466 ib_sa_path_rec_get(priv->ca, priv->port, 467 &path->pathrec, 468 IB_SA_PATH_REC_DGID | 469 IB_SA_PATH_REC_SGID | 470 IB_SA_PATH_REC_NUMB_PATH | 471 IB_SA_PATH_REC_PKEY, 472 1000, GFP_ATOMIC, 473 path_rec_completion, 474 path, &path->query); 475 if (path->query_id < 0) { 476 ipoib_warn(priv, "ib_sa_path_rec_get failed\n"); 477 path->query = NULL; 478 return path->query_id; 479 } 480 481 return 0; 482} 483 484static void neigh_add_path(struct sk_buff *skb, struct net_device *dev) 485{ 486 struct ipoib_dev_priv *priv = netdev_priv(dev); 487 struct ipoib_path *path; 488 struct ipoib_neigh *neigh; 489 490 neigh = ipoib_neigh_alloc(skb->dst->neighbour); 491 if (!neigh) { 492 ++priv->stats.tx_dropped; 493 dev_kfree_skb_any(skb); 494 return; 495 } 496 497 skb_queue_head_init(&neigh->queue); 498 499 /* 500 * We can only be called from ipoib_start_xmit, so we're 501 * inside tx_lock -- no need to save/restore flags. 502 */ 503 spin_lock(&priv->lock); 504 505 path = __path_find(dev, skb->dst->neighbour->ha + 4); 506 if (!path) { 507 path = path_rec_create(dev, skb->dst->neighbour->ha + 4); 508 if (!path) 509 goto err_path; 510 511 __path_add(dev, path); 512 } 513 514 list_add_tail(&neigh->list, &path->neigh_list); 515 516 if (path->ah) { 517 kref_get(&path->ah->ref); 518 neigh->ah = path->ah; 519 memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw, 520 sizeof(union ib_gid)); 521 522 ipoib_send(dev, skb, path->ah, 523 be32_to_cpup((__be32 *) skb->dst->neighbour->ha)); 524 } else { 525 neigh->ah = NULL; 526 __skb_queue_tail(&neigh->queue, skb); 527 528 if (!path->query && path_rec_start(dev, path)) 529 goto err_list; 530 } 531 532 spin_unlock(&priv->lock); 533 return; 534 535err_list: 536 list_del(&neigh->list); 537 538err_path: 539 ipoib_neigh_free(neigh); 540 ++priv->stats.tx_dropped; 541 dev_kfree_skb_any(skb); 542 543 spin_unlock(&priv->lock); 544} 545 546static void ipoib_path_lookup(struct sk_buff *skb, struct net_device *dev) 547{ 548 struct ipoib_dev_priv *priv = netdev_priv(skb->dev); 549 550 /* Look up path record for unicasts */ 551 if (skb->dst->neighbour->ha[4] != 0xff) { 552 neigh_add_path(skb, dev); 553 return; 554 } 555 556 /* Add in the P_Key for multicasts */ 557 skb->dst->neighbour->ha[8] = (priv->pkey >> 8) & 0xff; 558 skb->dst->neighbour->ha[9] = priv->pkey & 0xff; 559 ipoib_mcast_send(dev, skb->dst->neighbour->ha + 4, skb); 560} 561 562static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, 563 struct ipoib_pseudoheader *phdr) 564{ 565 struct ipoib_dev_priv *priv = netdev_priv(dev); 566 struct ipoib_path *path; 567 568 /* 569 * We can only be called from ipoib_start_xmit, so we're 570 * inside tx_lock -- no need to save/restore flags. 571 */ 572 spin_lock(&priv->lock); 573 574 path = __path_find(dev, phdr->hwaddr + 4); 575 if (!path) { 576 path = path_rec_create(dev, phdr->hwaddr + 4); 577 if (path) { 578 /* put pseudoheader back on for next time */ 579 skb_push(skb, sizeof *phdr); 580 __skb_queue_tail(&path->queue, skb); 581 582 if (path_rec_start(dev, path)) { 583 spin_unlock(&priv->lock); 584 path_free(dev, path); 585 return; 586 } else 587 __path_add(dev, path); 588 } else { 589 ++priv->stats.tx_dropped; 590 dev_kfree_skb_any(skb); 591 } 592 593 spin_unlock(&priv->lock); 594 return; 595 } 596 597 if (path->ah) { 598 ipoib_dbg(priv, "Send unicast ARP to %04x\n", 599 be16_to_cpu(path->pathrec.dlid)); 600 601 ipoib_send(dev, skb, path->ah, 602 be32_to_cpup((__be32 *) phdr->hwaddr)); 603 } else if ((path->query || !path_rec_start(dev, path)) && 604 skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) { 605 /* put pseudoheader back on for next time */ 606 skb_push(skb, sizeof *phdr); 607 __skb_queue_tail(&path->queue, skb); 608 } else { 609 ++priv->stats.tx_dropped; 610 dev_kfree_skb_any(skb); 611 } 612 613 spin_unlock(&priv->lock); 614} 615 616static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev) 617{ 618 struct ipoib_dev_priv *priv = netdev_priv(dev); 619 struct ipoib_neigh *neigh; 620 unsigned long flags; 621 622 if (!spin_trylock_irqsave(&priv->tx_lock, flags)) 623 return NETDEV_TX_LOCKED; 624 625 /* 626 * Check if our queue is stopped. Since we have the LLTX bit 627 * set, we can't rely on netif_stop_queue() preventing our 628 * xmit function from being called with a full queue. 629 */ 630 if (unlikely(netif_queue_stopped(dev))) { 631 spin_unlock_irqrestore(&priv->tx_lock, flags); 632 return NETDEV_TX_BUSY; 633 } 634 635 if (skb->dst && skb->dst->neighbour) { 636 if (unlikely(!*to_ipoib_neigh(skb->dst->neighbour))) { 637 ipoib_path_lookup(skb, dev); 638 goto out; 639 } 640 641 neigh = *to_ipoib_neigh(skb->dst->neighbour); 642 643 if (likely(neigh->ah)) { 644 if (unlikely(memcmp(&neigh->dgid.raw, 645 skb->dst->neighbour->ha + 4, 646 sizeof(union ib_gid)))) { 647 spin_lock(&priv->lock); 648 /* 649 * It's safe to call ipoib_put_ah() inside 650 * priv->lock here, because we know that 651 * path->ah will always hold one more reference, 652 * so ipoib_put_ah() will never do more than 653 * decrement the ref count. 654 */ 655 ipoib_put_ah(neigh->ah); 656 list_del(&neigh->list); 657 ipoib_neigh_free(neigh); 658 spin_unlock(&priv->lock); 659 ipoib_path_lookup(skb, dev); 660 goto out; 661 } 662 663 ipoib_send(dev, skb, neigh->ah, 664 be32_to_cpup((__be32 *) skb->dst->neighbour->ha)); 665 goto out; 666 } 667 668 if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) { 669 spin_lock(&priv->lock); 670 __skb_queue_tail(&neigh->queue, skb); 671 spin_unlock(&priv->lock); 672 } else { 673 ++priv->stats.tx_dropped; 674 dev_kfree_skb_any(skb); 675 } 676 } else { 677 struct ipoib_pseudoheader *phdr = 678 (struct ipoib_pseudoheader *) skb->data; 679 skb_pull(skb, sizeof *phdr); 680 681 if (phdr->hwaddr[4] == 0xff) { 682 /* Add in the P_Key for multicast*/ 683 phdr->hwaddr[8] = (priv->pkey >> 8) & 0xff; 684 phdr->hwaddr[9] = priv->pkey & 0xff; 685 686 ipoib_mcast_send(dev, phdr->hwaddr + 4, skb); 687 } else { 688 /* unicast GID -- should be ARP or RARP reply */ 689 690 if ((be16_to_cpup((__be16 *) skb->data) != ETH_P_ARP) && 691 (be16_to_cpup((__be16 *) skb->data) != ETH_P_RARP)) { 692 ipoib_warn(priv, "Unicast, no %s: type %04x, QPN %06x " 693 IPOIB_GID_FMT "\n", 694 skb->dst ? "neigh" : "dst", 695 be16_to_cpup((__be16 *) skb->data), 696 be32_to_cpup((__be32 *) phdr->hwaddr), 697 IPOIB_GID_RAW_ARG(phdr->hwaddr + 4)); 698 dev_kfree_skb_any(skb); 699 ++priv->stats.tx_dropped; 700 goto out; 701 } 702 703 unicast_arp_send(skb, dev, phdr); 704 } 705 } 706 707out: 708 spin_unlock_irqrestore(&priv->tx_lock, flags); 709 710 return NETDEV_TX_OK; 711} 712 713static struct net_device_stats *ipoib_get_stats(struct net_device *dev) 714{ 715 struct ipoib_dev_priv *priv = netdev_priv(dev); 716 717 return &priv->stats; 718} 719 720static void ipoib_timeout(struct net_device *dev) 721{ 722 struct ipoib_dev_priv *priv = netdev_priv(dev); 723 724 ipoib_warn(priv, "transmit timeout: latency %d msecs\n", 725 jiffies_to_msecs(jiffies - dev->trans_start)); 726 ipoib_warn(priv, "queue stopped %d, tx_head %u, tx_tail %u\n", 727 netif_queue_stopped(dev), 728 priv->tx_head, priv->tx_tail); 729 /* XXX reset QP, etc. */ 730} 731 732static int ipoib_hard_header(struct sk_buff *skb, 733 struct net_device *dev, 734 unsigned short type, 735 void *daddr, void *saddr, unsigned len) 736{ 737 struct ipoib_header *header; 738 739 header = (struct ipoib_header *) skb_push(skb, sizeof *header); 740 741 header->proto = htons(type); 742 header->reserved = 0; 743 744 /* 745 * If we don't have a neighbour structure, stuff the 746 * destination address onto the front of the skb so we can 747 * figure out where to send the packet later. 748 */ 749 if ((!skb->dst || !skb->dst->neighbour) && daddr) { 750 struct ipoib_pseudoheader *phdr = 751 (struct ipoib_pseudoheader *) skb_push(skb, sizeof *phdr); 752 memcpy(phdr->hwaddr, daddr, INFINIBAND_ALEN); 753 } 754 755 return 0; 756} 757 758static void ipoib_set_mcast_list(struct net_device *dev) 759{ 760 struct ipoib_dev_priv *priv = netdev_priv(dev); 761 762 if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) { 763 ipoib_dbg(priv, "IPOIB_FLAG_OPER_UP not set"); 764 return; 765 } 766 767 queue_work(ipoib_workqueue, &priv->restart_task); 768} 769 770static void ipoib_neigh_destructor(struct neighbour *n) 771{ 772 struct ipoib_neigh *neigh; 773 struct ipoib_dev_priv *priv = netdev_priv(n->dev); 774 unsigned long flags; 775 struct ipoib_ah *ah = NULL; 776 777 ipoib_dbg(priv, 778 "neigh_destructor for %06x " IPOIB_GID_FMT "\n", 779 be32_to_cpup((__be32 *) n->ha), 780 IPOIB_GID_RAW_ARG(n->ha + 4)); 781 782 spin_lock_irqsave(&priv->lock, flags); 783 784 neigh = *to_ipoib_neigh(n); 785 if (neigh) { 786 if (neigh->ah) 787 ah = neigh->ah; 788 list_del(&neigh->list); 789 ipoib_neigh_free(neigh); 790 } 791 792 spin_unlock_irqrestore(&priv->lock, flags); 793 794 if (ah) 795 ipoib_put_ah(ah); 796} 797 798struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neighbour) 799{ 800 struct ipoib_neigh *neigh; 801 802 neigh = kmalloc(sizeof *neigh, GFP_ATOMIC); 803 if (!neigh) 804 return NULL; 805 806 neigh->neighbour = neighbour; 807 *to_ipoib_neigh(neighbour) = neigh; 808 809 return neigh; 810} 811 812void ipoib_neigh_free(struct ipoib_neigh *neigh) 813{ 814 *to_ipoib_neigh(neigh->neighbour) = NULL; 815 kfree(neigh); 816} 817 818static int ipoib_neigh_setup_dev(struct net_device *dev, struct neigh_parms *parms) 819{ 820 parms->neigh_destructor = ipoib_neigh_destructor; 821 822 return 0; 823} 824 825int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port) 826{ 827 struct ipoib_dev_priv *priv = netdev_priv(dev); 828 829 /* Allocate RX/TX "rings" to hold queued skbs */ 830 priv->rx_ring = kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring, 831 GFP_KERNEL); 832 if (!priv->rx_ring) { 833 printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n", 834 ca->name, ipoib_recvq_size); 835 goto out; 836 } 837 838 priv->tx_ring = kzalloc(ipoib_sendq_size * sizeof *priv->tx_ring, 839 GFP_KERNEL); 840 if (!priv->tx_ring) { 841 printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n", 842 ca->name, ipoib_sendq_size); 843 goto out_rx_ring_cleanup; 844 } 845 846 /* priv->tx_head & tx_tail are already 0 */ 847 848 if (ipoib_ib_dev_init(dev, ca, port)) 849 goto out_tx_ring_cleanup; 850 851 return 0; 852 853out_tx_ring_cleanup: 854 kfree(priv->tx_ring); 855 856out_rx_ring_cleanup: 857 kfree(priv->rx_ring); 858 859out: 860 return -ENOMEM; 861} 862 863void ipoib_dev_cleanup(struct net_device *dev) 864{ 865 struct ipoib_dev_priv *priv = netdev_priv(dev), *cpriv, *tcpriv; 866 867 ipoib_delete_debug_files(dev); 868 869 /* Delete any child interfaces first */ 870 list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) { 871 unregister_netdev(cpriv->dev); 872 ipoib_dev_cleanup(cpriv->dev); 873 free_netdev(cpriv->dev); 874 } 875 876 ipoib_ib_dev_cleanup(dev); 877 878 kfree(priv->rx_ring); 879 kfree(priv->tx_ring); 880 881 priv->rx_ring = NULL; 882 priv->tx_ring = NULL; 883} 884 885static void ipoib_setup(struct net_device *dev) 886{ 887 struct ipoib_dev_priv *priv = netdev_priv(dev); 888 889 dev->open = ipoib_open; 890 dev->stop = ipoib_stop; 891 dev->change_mtu = ipoib_change_mtu; 892 dev->hard_start_xmit = ipoib_start_xmit; 893 dev->get_stats = ipoib_get_stats; 894 dev->tx_timeout = ipoib_timeout; 895 dev->hard_header = ipoib_hard_header; 896 dev->set_multicast_list = ipoib_set_mcast_list; 897 dev->neigh_setup = ipoib_neigh_setup_dev; 898 899 dev->watchdog_timeo = HZ; 900 901 dev->flags |= IFF_BROADCAST | IFF_MULTICAST; 902 903 /* 904 * We add in INFINIBAND_ALEN to allow for the destination 905 * address "pseudoheader" for skbs without neighbour struct. 906 */ 907 dev->hard_header_len = IPOIB_ENCAP_LEN + INFINIBAND_ALEN; 908 dev->addr_len = INFINIBAND_ALEN; 909 dev->type = ARPHRD_INFINIBAND; 910 dev->tx_queue_len = ipoib_sendq_size * 2; 911 dev->features = NETIF_F_VLAN_CHALLENGED | NETIF_F_LLTX; 912 913 /* MTU will be reset when mcast join happens */ 914 dev->mtu = IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN; 915 priv->mcast_mtu = priv->admin_mtu = dev->mtu; 916 917 memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN); 918 919 netif_carrier_off(dev); 920 921 SET_MODULE_OWNER(dev); 922 923 priv->dev = dev; 924 925 spin_lock_init(&priv->lock); 926 spin_lock_init(&priv->tx_lock); 927 928 mutex_init(&priv->mcast_mutex); 929 mutex_init(&priv->vlan_mutex); 930 931 INIT_LIST_HEAD(&priv->path_list); 932 INIT_LIST_HEAD(&priv->child_intfs); 933 INIT_LIST_HEAD(&priv->dead_ahs); 934 INIT_LIST_HEAD(&priv->multicast_list); 935 936 INIT_WORK(&priv->pkey_task, ipoib_pkey_poll, priv->dev); 937 INIT_WORK(&priv->mcast_task, ipoib_mcast_join_task, priv->dev); 938 INIT_WORK(&priv->flush_task, ipoib_ib_dev_flush, priv->dev); 939 INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task, priv->dev); 940 INIT_WORK(&priv->ah_reap_task, ipoib_reap_ah, priv->dev); 941} 942 943struct ipoib_dev_priv *ipoib_intf_alloc(const char *name) 944{ 945 struct net_device *dev; 946 947 dev = alloc_netdev((int) sizeof (struct ipoib_dev_priv), name, 948 ipoib_setup); 949 if (!dev) 950 return NULL; 951 952 return netdev_priv(dev); 953} 954 955static ssize_t show_pkey(struct class_device *cdev, char *buf) 956{ 957 struct ipoib_dev_priv *priv = 958 netdev_priv(container_of(cdev, struct net_device, class_dev)); 959 960 return sprintf(buf, "0x%04x\n", priv->pkey); 961} 962static CLASS_DEVICE_ATTR(pkey, S_IRUGO, show_pkey, NULL); 963 964static ssize_t create_child(struct class_device *cdev, 965 const char *buf, size_t count) 966{ 967 int pkey; 968 int ret; 969 970 if (sscanf(buf, "%i", &pkey) != 1) 971 return -EINVAL; 972 973 if (pkey < 0 || pkey > 0xffff) 974 return -EINVAL; 975 976 /* 977 * Set the full membership bit, so that we join the right 978 * broadcast group, etc. 979 */ 980 pkey |= 0x8000; 981 982 ret = ipoib_vlan_add(container_of(cdev, struct net_device, class_dev), 983 pkey); 984 985 return ret ? ret : count; 986} 987static CLASS_DEVICE_ATTR(create_child, S_IWUGO, NULL, create_child); 988 989static ssize_t delete_child(struct class_device *cdev, 990 const char *buf, size_t count) 991{ 992 int pkey; 993 int ret; 994 995 if (sscanf(buf, "%i", &pkey) != 1) 996 return -EINVAL; 997 998 if (pkey < 0 || pkey > 0xffff) 999 return -EINVAL; 1000 1001 ret = ipoib_vlan_delete(container_of(cdev, struct net_device, class_dev), 1002 pkey); 1003 1004 return ret ? ret : count; 1005 1006} 1007static CLASS_DEVICE_ATTR(delete_child, S_IWUGO, NULL, delete_child); 1008 1009int ipoib_add_pkey_attr(struct net_device *dev) 1010{ 1011 return class_device_create_file(&dev->class_dev, 1012 &class_device_attr_pkey); 1013} 1014 1015static struct net_device *ipoib_add_port(const char *format, 1016 struct ib_device *hca, u8 port) 1017{ 1018 struct ipoib_dev_priv *priv; 1019 int result = -ENOMEM; 1020 1021 priv = ipoib_intf_alloc(format); 1022 if (!priv) 1023 goto alloc_mem_failed; 1024 1025 SET_NETDEV_DEV(priv->dev, hca->dma_device); 1026 1027 result = ib_query_pkey(hca, port, 0, &priv->pkey); 1028 if (result) { 1029 printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n", 1030 hca->name, port, result); 1031 goto alloc_mem_failed; 1032 } 1033 1034 /* 1035 * Set the full membership bit, so that we join the right 1036 * broadcast group, etc. 1037 */ 1038 priv->pkey |= 0x8000; 1039 1040 priv->dev->broadcast[8] = priv->pkey >> 8; 1041 priv->dev->broadcast[9] = priv->pkey & 0xff; 1042 1043 result = ib_query_gid(hca, port, 0, &priv->local_gid); 1044 if (result) { 1045 printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n", 1046 hca->name, port, result); 1047 goto alloc_mem_failed; 1048 } else 1049 memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid)); 1050 1051 1052 result = ipoib_dev_init(priv->dev, hca, port); 1053 if (result < 0) { 1054 printk(KERN_WARNING "%s: failed to initialize port %d (ret = %d)\n", 1055 hca->name, port, result); 1056 goto device_init_failed; 1057 } 1058 1059 INIT_IB_EVENT_HANDLER(&priv->event_handler, 1060 priv->ca, ipoib_event); 1061 result = ib_register_event_handler(&priv->event_handler); 1062 if (result < 0) { 1063 printk(KERN_WARNING "%s: ib_register_event_handler failed for " 1064 "port %d (ret = %d)\n", 1065 hca->name, port, result); 1066 goto event_failed; 1067 } 1068 1069 result = register_netdev(priv->dev); 1070 if (result) { 1071 printk(KERN_WARNING "%s: couldn't register ipoib port %d; error %d\n", 1072 hca->name, port, result); 1073 goto register_failed; 1074 } 1075 1076 ipoib_create_debug_files(priv->dev); 1077 1078 if (ipoib_add_pkey_attr(priv->dev)) 1079 goto sysfs_failed; 1080 if (class_device_create_file(&priv->dev->class_dev, 1081 &class_device_attr_create_child)) 1082 goto sysfs_failed; 1083 if (class_device_create_file(&priv->dev->class_dev, 1084 &class_device_attr_delete_child)) 1085 goto sysfs_failed; 1086 1087 return priv->dev; 1088 1089sysfs_failed: 1090 ipoib_delete_debug_files(priv->dev); 1091 unregister_netdev(priv->dev); 1092 1093register_failed: 1094 ib_unregister_event_handler(&priv->event_handler); 1095 flush_scheduled_work(); 1096 1097event_failed: 1098 ipoib_dev_cleanup(priv->dev); 1099 1100device_init_failed: 1101 free_netdev(priv->dev); 1102 1103alloc_mem_failed: 1104 return ERR_PTR(result); 1105} 1106 1107static void ipoib_add_one(struct ib_device *device) 1108{ 1109 struct list_head *dev_list; 1110 struct net_device *dev; 1111 struct ipoib_dev_priv *priv; 1112 int s, e, p; 1113 1114 dev_list = kmalloc(sizeof *dev_list, GFP_KERNEL); 1115 if (!dev_list) 1116 return; 1117 1118 INIT_LIST_HEAD(dev_list); 1119 1120 if (device->node_type == IB_NODE_SWITCH) { 1121 s = 0; 1122 e = 0; 1123 } else { 1124 s = 1; 1125 e = device->phys_port_cnt; 1126 } 1127 1128 for (p = s; p <= e; ++p) { 1129 dev = ipoib_add_port("ib%d", device, p); 1130 if (!IS_ERR(dev)) { 1131 priv = netdev_priv(dev); 1132 list_add_tail(&priv->list, dev_list); 1133 } 1134 } 1135 1136 ib_set_client_data(device, &ipoib_client, dev_list); 1137} 1138 1139static void ipoib_remove_one(struct ib_device *device) 1140{ 1141 struct ipoib_dev_priv *priv, *tmp; 1142 struct list_head *dev_list; 1143 1144 dev_list = ib_get_client_data(device, &ipoib_client); 1145 1146 list_for_each_entry_safe(priv, tmp, dev_list, list) { 1147 ib_unregister_event_handler(&priv->event_handler); 1148 flush_scheduled_work(); 1149 1150 unregister_netdev(priv->dev); 1151 ipoib_dev_cleanup(priv->dev); 1152 free_netdev(priv->dev); 1153 } 1154 1155 kfree(dev_list); 1156} 1157 1158static int __init ipoib_init_module(void) 1159{ 1160 int ret; 1161 1162 ipoib_recvq_size = roundup_pow_of_two(ipoib_recvq_size); 1163 ipoib_recvq_size = min(ipoib_recvq_size, IPOIB_MAX_QUEUE_SIZE); 1164 ipoib_recvq_size = max(ipoib_recvq_size, IPOIB_MIN_QUEUE_SIZE); 1165 1166 ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size); 1167 ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE); 1168 ipoib_sendq_size = max(ipoib_sendq_size, IPOIB_MIN_QUEUE_SIZE); 1169 1170 ret = ipoib_register_debugfs(); 1171 if (ret) 1172 return ret; 1173 1174 /* 1175 * We create our own workqueue mainly because we want to be 1176 * able to flush it when devices are being removed. We can't 1177 * use schedule_work()/flush_scheduled_work() because both 1178 * unregister_netdev() and linkwatch_event take the rtnl lock, 1179 * so flush_scheduled_work() can deadlock during device 1180 * removal. 1181 */ 1182 ipoib_workqueue = create_singlethread_workqueue("ipoib"); 1183 if (!ipoib_workqueue) { 1184 ret = -ENOMEM; 1185 goto err_fs; 1186 } 1187 1188 ret = ib_register_client(&ipoib_client); 1189 if (ret) 1190 goto err_wq; 1191 1192 return 0; 1193 1194err_wq: 1195 destroy_workqueue(ipoib_workqueue); 1196 1197err_fs: 1198 ipoib_unregister_debugfs(); 1199 1200 return ret; 1201} 1202 1203static void __exit ipoib_cleanup_module(void) 1204{ 1205 ib_unregister_client(&ipoib_client); 1206 ipoib_unregister_debugfs(); 1207 destroy_workqueue(ipoib_workqueue); 1208} 1209 1210module_init(ipoib_init_module); 1211module_exit(ipoib_cleanup_module); 1212