ipoib_main.c revision 8a7f752125a930a83f4d8dfe37fa5a081ab19d31
1/* 2 * Copyright (c) 2004 Topspin Communications. All rights reserved. 3 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. 4 * Copyright (c) 2004 Voltaire, Inc. All rights reserved. 5 * 6 * This software is available to you under a choice of one of two 7 * licenses. You may choose to be licensed under the terms of the GNU 8 * General Public License (GPL) Version 2, available from the file 9 * COPYING in the main directory of this source tree, or the 10 * OpenIB.org BSD license below: 11 * 12 * Redistribution and use in source and binary forms, with or 13 * without modification, are permitted provided that the following 14 * conditions are met: 15 * 16 * - Redistributions of source code must retain the above 17 * copyright notice, this list of conditions and the following 18 * disclaimer. 19 * 20 * - Redistributions in binary form must reproduce the above 21 * copyright notice, this list of conditions and the following 22 * disclaimer in the documentation and/or other materials 23 * provided with the distribution. 24 * 25 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 26 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 27 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 28 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 29 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 30 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 31 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 32 * SOFTWARE. 33 * 34 * $Id: ipoib_main.c 1377 2004-12-23 19:57:12Z roland $ 35 */ 36 37#include "ipoib.h" 38 39#include <linux/module.h> 40 41#include <linux/init.h> 42#include <linux/slab.h> 43#include <linux/vmalloc.h> 44#include <linux/kernel.h> 45 46#include <linux/if_arp.h> /* For ARPHRD_xxx */ 47 48#include <linux/ip.h> 49#include <linux/in.h> 50 51#include <net/dst.h> 52 53MODULE_AUTHOR("Roland Dreier"); 54MODULE_DESCRIPTION("IP-over-InfiniBand net driver"); 55MODULE_LICENSE("Dual BSD/GPL"); 56 57int ipoib_sendq_size __read_mostly = IPOIB_TX_RING_SIZE; 58int ipoib_recvq_size __read_mostly = IPOIB_RX_RING_SIZE; 59 60module_param_named(send_queue_size, ipoib_sendq_size, int, 0444); 61MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue"); 62module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444); 63MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue"); 64 65#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG 66int ipoib_debug_level; 67 68module_param_named(debug_level, ipoib_debug_level, int, 0644); 69MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0"); 70#endif 71 72struct ipoib_path_iter { 73 struct net_device *dev; 74 struct ipoib_path path; 75}; 76 77static const u8 ipv4_bcast_addr[] = { 78 0x00, 0xff, 0xff, 0xff, 79 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00, 80 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff 81}; 82 83struct workqueue_struct *ipoib_workqueue; 84 85static void ipoib_add_one(struct ib_device *device); 86static void ipoib_remove_one(struct ib_device *device); 87 88static struct ib_client ipoib_client = { 89 .name = "ipoib", 90 .add = ipoib_add_one, 91 .remove = ipoib_remove_one 92}; 93 94int ipoib_open(struct net_device *dev) 95{ 96 struct ipoib_dev_priv *priv = netdev_priv(dev); 97 98 ipoib_dbg(priv, "bringing up interface\n"); 99 100 set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); 101 102 if (ipoib_pkey_dev_delay_open(dev)) 103 return 0; 104 105 if (ipoib_ib_dev_open(dev)) 106 return -EINVAL; 107 108 if (ipoib_ib_dev_up(dev)) { 109 ipoib_ib_dev_stop(dev); 110 return -EINVAL; 111 } 112 113 if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { 114 struct ipoib_dev_priv *cpriv; 115 116 /* Bring up any child interfaces too */ 117 mutex_lock(&priv->vlan_mutex); 118 list_for_each_entry(cpriv, &priv->child_intfs, list) { 119 int flags; 120 121 flags = cpriv->dev->flags; 122 if (flags & IFF_UP) 123 continue; 124 125 dev_change_flags(cpriv->dev, flags | IFF_UP); 126 } 127 mutex_unlock(&priv->vlan_mutex); 128 } 129 130 netif_start_queue(dev); 131 132 return 0; 133} 134 135static int ipoib_stop(struct net_device *dev) 136{ 137 struct ipoib_dev_priv *priv = netdev_priv(dev); 138 139 ipoib_dbg(priv, "stopping interface\n"); 140 141 clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); 142 143 netif_stop_queue(dev); 144 145 /* 146 * Now flush workqueue to make sure a scheduled task doesn't 147 * bring our internal state back up. 148 */ 149 flush_workqueue(ipoib_workqueue); 150 151 ipoib_ib_dev_down(dev, 1); 152 ipoib_ib_dev_stop(dev); 153 154 if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { 155 struct ipoib_dev_priv *cpriv; 156 157 /* Bring down any child interfaces too */ 158 mutex_lock(&priv->vlan_mutex); 159 list_for_each_entry(cpriv, &priv->child_intfs, list) { 160 int flags; 161 162 flags = cpriv->dev->flags; 163 if (!(flags & IFF_UP)) 164 continue; 165 166 dev_change_flags(cpriv->dev, flags & ~IFF_UP); 167 } 168 mutex_unlock(&priv->vlan_mutex); 169 } 170 171 return 0; 172} 173 174static int ipoib_change_mtu(struct net_device *dev, int new_mtu) 175{ 176 struct ipoib_dev_priv *priv = netdev_priv(dev); 177 178 if (new_mtu > IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN) 179 return -EINVAL; 180 181 priv->admin_mtu = new_mtu; 182 183 dev->mtu = min(priv->mcast_mtu, priv->admin_mtu); 184 185 return 0; 186} 187 188static struct ipoib_path *__path_find(struct net_device *dev, void *gid) 189{ 190 struct ipoib_dev_priv *priv = netdev_priv(dev); 191 struct rb_node *n = priv->path_tree.rb_node; 192 struct ipoib_path *path; 193 int ret; 194 195 while (n) { 196 path = rb_entry(n, struct ipoib_path, rb_node); 197 198 ret = memcmp(gid, path->pathrec.dgid.raw, 199 sizeof (union ib_gid)); 200 201 if (ret < 0) 202 n = n->rb_left; 203 else if (ret > 0) 204 n = n->rb_right; 205 else 206 return path; 207 } 208 209 return NULL; 210} 211 212static int __path_add(struct net_device *dev, struct ipoib_path *path) 213{ 214 struct ipoib_dev_priv *priv = netdev_priv(dev); 215 struct rb_node **n = &priv->path_tree.rb_node; 216 struct rb_node *pn = NULL; 217 struct ipoib_path *tpath; 218 int ret; 219 220 while (*n) { 221 pn = *n; 222 tpath = rb_entry(pn, struct ipoib_path, rb_node); 223 224 ret = memcmp(path->pathrec.dgid.raw, tpath->pathrec.dgid.raw, 225 sizeof (union ib_gid)); 226 if (ret < 0) 227 n = &pn->rb_left; 228 else if (ret > 0) 229 n = &pn->rb_right; 230 else 231 return -EEXIST; 232 } 233 234 rb_link_node(&path->rb_node, pn, n); 235 rb_insert_color(&path->rb_node, &priv->path_tree); 236 237 list_add_tail(&path->list, &priv->path_list); 238 239 return 0; 240} 241 242static void path_free(struct net_device *dev, struct ipoib_path *path) 243{ 244 struct ipoib_dev_priv *priv = netdev_priv(dev); 245 struct ipoib_neigh *neigh, *tn; 246 struct sk_buff *skb; 247 unsigned long flags; 248 249 while ((skb = __skb_dequeue(&path->queue))) 250 dev_kfree_skb_irq(skb); 251 252 spin_lock_irqsave(&priv->lock, flags); 253 254 list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) { 255 /* 256 * It's safe to call ipoib_put_ah() inside priv->lock 257 * here, because we know that path->ah will always 258 * hold one more reference, so ipoib_put_ah() will 259 * never do more than decrement the ref count. 260 */ 261 if (neigh->ah) 262 ipoib_put_ah(neigh->ah); 263 264 ipoib_neigh_free(neigh); 265 } 266 267 spin_unlock_irqrestore(&priv->lock, flags); 268 269 if (path->ah) 270 ipoib_put_ah(path->ah); 271 272 kfree(path); 273} 274 275#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG 276 277struct ipoib_path_iter *ipoib_path_iter_init(struct net_device *dev) 278{ 279 struct ipoib_path_iter *iter; 280 281 iter = kmalloc(sizeof *iter, GFP_KERNEL); 282 if (!iter) 283 return NULL; 284 285 iter->dev = dev; 286 memset(iter->path.pathrec.dgid.raw, 0, 16); 287 288 if (ipoib_path_iter_next(iter)) { 289 kfree(iter); 290 return NULL; 291 } 292 293 return iter; 294} 295 296int ipoib_path_iter_next(struct ipoib_path_iter *iter) 297{ 298 struct ipoib_dev_priv *priv = netdev_priv(iter->dev); 299 struct rb_node *n; 300 struct ipoib_path *path; 301 int ret = 1; 302 303 spin_lock_irq(&priv->lock); 304 305 n = rb_first(&priv->path_tree); 306 307 while (n) { 308 path = rb_entry(n, struct ipoib_path, rb_node); 309 310 if (memcmp(iter->path.pathrec.dgid.raw, path->pathrec.dgid.raw, 311 sizeof (union ib_gid)) < 0) { 312 iter->path = *path; 313 ret = 0; 314 break; 315 } 316 317 n = rb_next(n); 318 } 319 320 spin_unlock_irq(&priv->lock); 321 322 return ret; 323} 324 325void ipoib_path_iter_read(struct ipoib_path_iter *iter, 326 struct ipoib_path *path) 327{ 328 *path = iter->path; 329} 330 331#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */ 332 333void ipoib_flush_paths(struct net_device *dev) 334{ 335 struct ipoib_dev_priv *priv = netdev_priv(dev); 336 struct ipoib_path *path, *tp; 337 LIST_HEAD(remove_list); 338 339 spin_lock_irq(&priv->lock); 340 341 list_splice(&priv->path_list, &remove_list); 342 INIT_LIST_HEAD(&priv->path_list); 343 344 list_for_each_entry(path, &remove_list, list) 345 rb_erase(&path->rb_node, &priv->path_tree); 346 347 list_for_each_entry_safe(path, tp, &remove_list, list) { 348 if (path->query) 349 ib_sa_cancel_query(path->query_id, path->query); 350 spin_unlock_irq(&priv->lock); 351 wait_for_completion(&path->done); 352 path_free(dev, path); 353 spin_lock_irq(&priv->lock); 354 } 355 spin_unlock_irq(&priv->lock); 356} 357 358static void path_rec_completion(int status, 359 struct ib_sa_path_rec *pathrec, 360 void *path_ptr) 361{ 362 struct ipoib_path *path = path_ptr; 363 struct net_device *dev = path->dev; 364 struct ipoib_dev_priv *priv = netdev_priv(dev); 365 struct ipoib_ah *ah = NULL; 366 struct ipoib_neigh *neigh; 367 struct sk_buff_head skqueue; 368 struct sk_buff *skb; 369 unsigned long flags; 370 371 if (pathrec) 372 ipoib_dbg(priv, "PathRec LID 0x%04x for GID " IPOIB_GID_FMT "\n", 373 be16_to_cpu(pathrec->dlid), IPOIB_GID_ARG(pathrec->dgid)); 374 else 375 ipoib_dbg(priv, "PathRec status %d for GID " IPOIB_GID_FMT "\n", 376 status, IPOIB_GID_ARG(path->pathrec.dgid)); 377 378 skb_queue_head_init(&skqueue); 379 380 if (!status) { 381 struct ib_ah_attr av = { 382 .dlid = be16_to_cpu(pathrec->dlid), 383 .sl = pathrec->sl, 384 .port_num = priv->port, 385 .static_rate = pathrec->rate 386 }; 387 388 ah = ipoib_create_ah(dev, priv->pd, &av); 389 } 390 391 spin_lock_irqsave(&priv->lock, flags); 392 393 path->ah = ah; 394 395 if (ah) { 396 path->pathrec = *pathrec; 397 398 ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n", 399 ah, be16_to_cpu(pathrec->dlid), pathrec->sl); 400 401 while ((skb = __skb_dequeue(&path->queue))) 402 __skb_queue_tail(&skqueue, skb); 403 404 list_for_each_entry(neigh, &path->neigh_list, list) { 405 kref_get(&path->ah->ref); 406 neigh->ah = path->ah; 407 memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw, 408 sizeof(union ib_gid)); 409 410 while ((skb = __skb_dequeue(&neigh->queue))) 411 __skb_queue_tail(&skqueue, skb); 412 } 413 } 414 415 path->query = NULL; 416 complete(&path->done); 417 418 spin_unlock_irqrestore(&priv->lock, flags); 419 420 while ((skb = __skb_dequeue(&skqueue))) { 421 skb->dev = dev; 422 if (dev_queue_xmit(skb)) 423 ipoib_warn(priv, "dev_queue_xmit failed " 424 "to requeue packet\n"); 425 } 426} 427 428static struct ipoib_path *path_rec_create(struct net_device *dev, void *gid) 429{ 430 struct ipoib_dev_priv *priv = netdev_priv(dev); 431 struct ipoib_path *path; 432 433 path = kzalloc(sizeof *path, GFP_ATOMIC); 434 if (!path) 435 return NULL; 436 437 path->dev = dev; 438 439 skb_queue_head_init(&path->queue); 440 441 INIT_LIST_HEAD(&path->neigh_list); 442 443 memcpy(path->pathrec.dgid.raw, gid, sizeof (union ib_gid)); 444 path->pathrec.sgid = priv->local_gid; 445 path->pathrec.pkey = cpu_to_be16(priv->pkey); 446 path->pathrec.numb_path = 1; 447 448 return path; 449} 450 451static int path_rec_start(struct net_device *dev, 452 struct ipoib_path *path) 453{ 454 struct ipoib_dev_priv *priv = netdev_priv(dev); 455 456 ipoib_dbg(priv, "Start path record lookup for " IPOIB_GID_FMT "\n", 457 IPOIB_GID_ARG(path->pathrec.dgid)); 458 459 init_completion(&path->done); 460 461 path->query_id = 462 ib_sa_path_rec_get(priv->ca, priv->port, 463 &path->pathrec, 464 IB_SA_PATH_REC_DGID | 465 IB_SA_PATH_REC_SGID | 466 IB_SA_PATH_REC_NUMB_PATH | 467 IB_SA_PATH_REC_PKEY, 468 1000, GFP_ATOMIC, 469 path_rec_completion, 470 path, &path->query); 471 if (path->query_id < 0) { 472 ipoib_warn(priv, "ib_sa_path_rec_get failed\n"); 473 path->query = NULL; 474 return path->query_id; 475 } 476 477 return 0; 478} 479 480static void neigh_add_path(struct sk_buff *skb, struct net_device *dev) 481{ 482 struct ipoib_dev_priv *priv = netdev_priv(dev); 483 struct ipoib_path *path; 484 struct ipoib_neigh *neigh; 485 486 neigh = ipoib_neigh_alloc(skb->dst->neighbour); 487 if (!neigh) { 488 ++priv->stats.tx_dropped; 489 dev_kfree_skb_any(skb); 490 return; 491 } 492 493 skb_queue_head_init(&neigh->queue); 494 495 /* 496 * We can only be called from ipoib_start_xmit, so we're 497 * inside tx_lock -- no need to save/restore flags. 498 */ 499 spin_lock(&priv->lock); 500 501 path = __path_find(dev, skb->dst->neighbour->ha + 4); 502 if (!path) { 503 path = path_rec_create(dev, skb->dst->neighbour->ha + 4); 504 if (!path) 505 goto err_path; 506 507 __path_add(dev, path); 508 } 509 510 list_add_tail(&neigh->list, &path->neigh_list); 511 512 if (path->ah) { 513 kref_get(&path->ah->ref); 514 neigh->ah = path->ah; 515 memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw, 516 sizeof(union ib_gid)); 517 518 ipoib_send(dev, skb, path->ah, 519 be32_to_cpup((__be32 *) skb->dst->neighbour->ha)); 520 } else { 521 neigh->ah = NULL; 522 __skb_queue_tail(&neigh->queue, skb); 523 524 if (!path->query && path_rec_start(dev, path)) 525 goto err_list; 526 } 527 528 spin_unlock(&priv->lock); 529 return; 530 531err_list: 532 list_del(&neigh->list); 533 534err_path: 535 ipoib_neigh_free(neigh); 536 ++priv->stats.tx_dropped; 537 dev_kfree_skb_any(skb); 538 539 spin_unlock(&priv->lock); 540} 541 542static void ipoib_path_lookup(struct sk_buff *skb, struct net_device *dev) 543{ 544 struct ipoib_dev_priv *priv = netdev_priv(skb->dev); 545 546 /* Look up path record for unicasts */ 547 if (skb->dst->neighbour->ha[4] != 0xff) { 548 neigh_add_path(skb, dev); 549 return; 550 } 551 552 /* Add in the P_Key for multicasts */ 553 skb->dst->neighbour->ha[8] = (priv->pkey >> 8) & 0xff; 554 skb->dst->neighbour->ha[9] = priv->pkey & 0xff; 555 ipoib_mcast_send(dev, skb->dst->neighbour->ha + 4, skb); 556} 557 558static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, 559 struct ipoib_pseudoheader *phdr) 560{ 561 struct ipoib_dev_priv *priv = netdev_priv(dev); 562 struct ipoib_path *path; 563 564 /* 565 * We can only be called from ipoib_start_xmit, so we're 566 * inside tx_lock -- no need to save/restore flags. 567 */ 568 spin_lock(&priv->lock); 569 570 path = __path_find(dev, phdr->hwaddr + 4); 571 if (!path) { 572 path = path_rec_create(dev, phdr->hwaddr + 4); 573 if (path) { 574 /* put pseudoheader back on for next time */ 575 skb_push(skb, sizeof *phdr); 576 __skb_queue_tail(&path->queue, skb); 577 578 if (path_rec_start(dev, path)) { 579 spin_unlock(&priv->lock); 580 path_free(dev, path); 581 return; 582 } else 583 __path_add(dev, path); 584 } else { 585 ++priv->stats.tx_dropped; 586 dev_kfree_skb_any(skb); 587 } 588 589 spin_unlock(&priv->lock); 590 return; 591 } 592 593 if (path->ah) { 594 ipoib_dbg(priv, "Send unicast ARP to %04x\n", 595 be16_to_cpu(path->pathrec.dlid)); 596 597 ipoib_send(dev, skb, path->ah, 598 be32_to_cpup((__be32 *) phdr->hwaddr)); 599 } else if ((path->query || !path_rec_start(dev, path)) && 600 skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) { 601 /* put pseudoheader back on for next time */ 602 skb_push(skb, sizeof *phdr); 603 __skb_queue_tail(&path->queue, skb); 604 } else { 605 ++priv->stats.tx_dropped; 606 dev_kfree_skb_any(skb); 607 } 608 609 spin_unlock(&priv->lock); 610} 611 612static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev) 613{ 614 struct ipoib_dev_priv *priv = netdev_priv(dev); 615 struct ipoib_neigh *neigh; 616 unsigned long flags; 617 618 if (!spin_trylock_irqsave(&priv->tx_lock, flags)) 619 return NETDEV_TX_LOCKED; 620 621 /* 622 * Check if our queue is stopped. Since we have the LLTX bit 623 * set, we can't rely on netif_stop_queue() preventing our 624 * xmit function from being called with a full queue. 625 */ 626 if (unlikely(netif_queue_stopped(dev))) { 627 spin_unlock_irqrestore(&priv->tx_lock, flags); 628 return NETDEV_TX_BUSY; 629 } 630 631 if (skb->dst && skb->dst->neighbour) { 632 if (unlikely(!*to_ipoib_neigh(skb->dst->neighbour))) { 633 ipoib_path_lookup(skb, dev); 634 goto out; 635 } 636 637 neigh = *to_ipoib_neigh(skb->dst->neighbour); 638 639 if (likely(neigh->ah)) { 640 if (unlikely(memcmp(&neigh->dgid.raw, 641 skb->dst->neighbour->ha + 4, 642 sizeof(union ib_gid)))) { 643 spin_lock(&priv->lock); 644 /* 645 * It's safe to call ipoib_put_ah() inside 646 * priv->lock here, because we know that 647 * path->ah will always hold one more reference, 648 * so ipoib_put_ah() will never do more than 649 * decrement the ref count. 650 */ 651 ipoib_put_ah(neigh->ah); 652 list_del(&neigh->list); 653 ipoib_neigh_free(neigh); 654 spin_unlock(&priv->lock); 655 ipoib_path_lookup(skb, dev); 656 goto out; 657 } 658 659 ipoib_send(dev, skb, neigh->ah, 660 be32_to_cpup((__be32 *) skb->dst->neighbour->ha)); 661 goto out; 662 } 663 664 if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) { 665 spin_lock(&priv->lock); 666 __skb_queue_tail(&neigh->queue, skb); 667 spin_unlock(&priv->lock); 668 } else { 669 ++priv->stats.tx_dropped; 670 dev_kfree_skb_any(skb); 671 } 672 } else { 673 struct ipoib_pseudoheader *phdr = 674 (struct ipoib_pseudoheader *) skb->data; 675 skb_pull(skb, sizeof *phdr); 676 677 if (phdr->hwaddr[4] == 0xff) { 678 /* Add in the P_Key for multicast*/ 679 phdr->hwaddr[8] = (priv->pkey >> 8) & 0xff; 680 phdr->hwaddr[9] = priv->pkey & 0xff; 681 682 ipoib_mcast_send(dev, phdr->hwaddr + 4, skb); 683 } else { 684 /* unicast GID -- should be ARP or RARP reply */ 685 686 if ((be16_to_cpup((__be16 *) skb->data) != ETH_P_ARP) && 687 (be16_to_cpup((__be16 *) skb->data) != ETH_P_RARP)) { 688 ipoib_warn(priv, "Unicast, no %s: type %04x, QPN %06x " 689 IPOIB_GID_FMT "\n", 690 skb->dst ? "neigh" : "dst", 691 be16_to_cpup((__be16 *) skb->data), 692 be32_to_cpup((__be32 *) phdr->hwaddr), 693 IPOIB_GID_RAW_ARG(phdr->hwaddr + 4)); 694 dev_kfree_skb_any(skb); 695 ++priv->stats.tx_dropped; 696 goto out; 697 } 698 699 unicast_arp_send(skb, dev, phdr); 700 } 701 } 702 703out: 704 spin_unlock_irqrestore(&priv->tx_lock, flags); 705 706 return NETDEV_TX_OK; 707} 708 709static struct net_device_stats *ipoib_get_stats(struct net_device *dev) 710{ 711 struct ipoib_dev_priv *priv = netdev_priv(dev); 712 713 return &priv->stats; 714} 715 716static void ipoib_timeout(struct net_device *dev) 717{ 718 struct ipoib_dev_priv *priv = netdev_priv(dev); 719 720 ipoib_warn(priv, "transmit timeout: latency %d msecs\n", 721 jiffies_to_msecs(jiffies - dev->trans_start)); 722 ipoib_warn(priv, "queue stopped %d, tx_head %u, tx_tail %u\n", 723 netif_queue_stopped(dev), 724 priv->tx_head, priv->tx_tail); 725 /* XXX reset QP, etc. */ 726} 727 728static int ipoib_hard_header(struct sk_buff *skb, 729 struct net_device *dev, 730 unsigned short type, 731 void *daddr, void *saddr, unsigned len) 732{ 733 struct ipoib_header *header; 734 735 header = (struct ipoib_header *) skb_push(skb, sizeof *header); 736 737 header->proto = htons(type); 738 header->reserved = 0; 739 740 /* 741 * If we don't have a neighbour structure, stuff the 742 * destination address onto the front of the skb so we can 743 * figure out where to send the packet later. 744 */ 745 if ((!skb->dst || !skb->dst->neighbour) && daddr) { 746 struct ipoib_pseudoheader *phdr = 747 (struct ipoib_pseudoheader *) skb_push(skb, sizeof *phdr); 748 memcpy(phdr->hwaddr, daddr, INFINIBAND_ALEN); 749 } 750 751 return 0; 752} 753 754static void ipoib_set_mcast_list(struct net_device *dev) 755{ 756 struct ipoib_dev_priv *priv = netdev_priv(dev); 757 758 if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) { 759 ipoib_dbg(priv, "IPOIB_FLAG_OPER_UP not set"); 760 return; 761 } 762 763 queue_work(ipoib_workqueue, &priv->restart_task); 764} 765 766static void ipoib_neigh_destructor(struct neighbour *n) 767{ 768 struct ipoib_neigh *neigh; 769 struct ipoib_dev_priv *priv = netdev_priv(n->dev); 770 unsigned long flags; 771 struct ipoib_ah *ah = NULL; 772 773 ipoib_dbg(priv, 774 "neigh_destructor for %06x " IPOIB_GID_FMT "\n", 775 be32_to_cpup((__be32 *) n->ha), 776 IPOIB_GID_RAW_ARG(n->ha + 4)); 777 778 spin_lock_irqsave(&priv->lock, flags); 779 780 neigh = *to_ipoib_neigh(n); 781 if (neigh) { 782 if (neigh->ah) 783 ah = neigh->ah; 784 list_del(&neigh->list); 785 ipoib_neigh_free(neigh); 786 } 787 788 spin_unlock_irqrestore(&priv->lock, flags); 789 790 if (ah) 791 ipoib_put_ah(ah); 792} 793 794struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neighbour) 795{ 796 struct ipoib_neigh *neigh; 797 798 neigh = kmalloc(sizeof *neigh, GFP_ATOMIC); 799 if (!neigh) 800 return NULL; 801 802 neigh->neighbour = neighbour; 803 *to_ipoib_neigh(neighbour) = neigh; 804 805 return neigh; 806} 807 808void ipoib_neigh_free(struct ipoib_neigh *neigh) 809{ 810 *to_ipoib_neigh(neigh->neighbour) = NULL; 811 kfree(neigh); 812} 813 814static int ipoib_neigh_setup_dev(struct net_device *dev, struct neigh_parms *parms) 815{ 816 parms->neigh_destructor = ipoib_neigh_destructor; 817 818 return 0; 819} 820 821int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port) 822{ 823 struct ipoib_dev_priv *priv = netdev_priv(dev); 824 825 /* Allocate RX/TX "rings" to hold queued skbs */ 826 priv->rx_ring = kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring, 827 GFP_KERNEL); 828 if (!priv->rx_ring) { 829 printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n", 830 ca->name, ipoib_recvq_size); 831 goto out; 832 } 833 834 priv->tx_ring = kzalloc(ipoib_sendq_size * sizeof *priv->tx_ring, 835 GFP_KERNEL); 836 if (!priv->tx_ring) { 837 printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n", 838 ca->name, ipoib_sendq_size); 839 goto out_rx_ring_cleanup; 840 } 841 842 /* priv->tx_head & tx_tail are already 0 */ 843 844 if (ipoib_ib_dev_init(dev, ca, port)) 845 goto out_tx_ring_cleanup; 846 847 return 0; 848 849out_tx_ring_cleanup: 850 kfree(priv->tx_ring); 851 852out_rx_ring_cleanup: 853 kfree(priv->rx_ring); 854 855out: 856 return -ENOMEM; 857} 858 859void ipoib_dev_cleanup(struct net_device *dev) 860{ 861 struct ipoib_dev_priv *priv = netdev_priv(dev), *cpriv, *tcpriv; 862 863 ipoib_delete_debug_files(dev); 864 865 /* Delete any child interfaces first */ 866 list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) { 867 unregister_netdev(cpriv->dev); 868 ipoib_dev_cleanup(cpriv->dev); 869 free_netdev(cpriv->dev); 870 } 871 872 ipoib_ib_dev_cleanup(dev); 873 874 kfree(priv->rx_ring); 875 kfree(priv->tx_ring); 876 877 priv->rx_ring = NULL; 878 priv->tx_ring = NULL; 879} 880 881static void ipoib_setup(struct net_device *dev) 882{ 883 struct ipoib_dev_priv *priv = netdev_priv(dev); 884 885 dev->open = ipoib_open; 886 dev->stop = ipoib_stop; 887 dev->change_mtu = ipoib_change_mtu; 888 dev->hard_start_xmit = ipoib_start_xmit; 889 dev->get_stats = ipoib_get_stats; 890 dev->tx_timeout = ipoib_timeout; 891 dev->hard_header = ipoib_hard_header; 892 dev->set_multicast_list = ipoib_set_mcast_list; 893 dev->neigh_setup = ipoib_neigh_setup_dev; 894 895 dev->watchdog_timeo = HZ; 896 897 dev->flags |= IFF_BROADCAST | IFF_MULTICAST; 898 899 /* 900 * We add in INFINIBAND_ALEN to allow for the destination 901 * address "pseudoheader" for skbs without neighbour struct. 902 */ 903 dev->hard_header_len = IPOIB_ENCAP_LEN + INFINIBAND_ALEN; 904 dev->addr_len = INFINIBAND_ALEN; 905 dev->type = ARPHRD_INFINIBAND; 906 dev->tx_queue_len = ipoib_sendq_size * 2; 907 dev->features = NETIF_F_VLAN_CHALLENGED | NETIF_F_LLTX; 908 909 /* MTU will be reset when mcast join happens */ 910 dev->mtu = IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN; 911 priv->mcast_mtu = priv->admin_mtu = dev->mtu; 912 913 memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN); 914 915 netif_carrier_off(dev); 916 917 SET_MODULE_OWNER(dev); 918 919 priv->dev = dev; 920 921 spin_lock_init(&priv->lock); 922 spin_lock_init(&priv->tx_lock); 923 924 mutex_init(&priv->mcast_mutex); 925 mutex_init(&priv->vlan_mutex); 926 927 INIT_LIST_HEAD(&priv->path_list); 928 INIT_LIST_HEAD(&priv->child_intfs); 929 INIT_LIST_HEAD(&priv->dead_ahs); 930 INIT_LIST_HEAD(&priv->multicast_list); 931 932 INIT_WORK(&priv->pkey_task, ipoib_pkey_poll, priv->dev); 933 INIT_WORK(&priv->mcast_task, ipoib_mcast_join_task, priv->dev); 934 INIT_WORK(&priv->flush_task, ipoib_ib_dev_flush, priv->dev); 935 INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task, priv->dev); 936 INIT_WORK(&priv->ah_reap_task, ipoib_reap_ah, priv->dev); 937} 938 939struct ipoib_dev_priv *ipoib_intf_alloc(const char *name) 940{ 941 struct net_device *dev; 942 943 dev = alloc_netdev((int) sizeof (struct ipoib_dev_priv), name, 944 ipoib_setup); 945 if (!dev) 946 return NULL; 947 948 return netdev_priv(dev); 949} 950 951static ssize_t show_pkey(struct class_device *cdev, char *buf) 952{ 953 struct ipoib_dev_priv *priv = 954 netdev_priv(container_of(cdev, struct net_device, class_dev)); 955 956 return sprintf(buf, "0x%04x\n", priv->pkey); 957} 958static CLASS_DEVICE_ATTR(pkey, S_IRUGO, show_pkey, NULL); 959 960static ssize_t create_child(struct class_device *cdev, 961 const char *buf, size_t count) 962{ 963 int pkey; 964 int ret; 965 966 if (sscanf(buf, "%i", &pkey) != 1) 967 return -EINVAL; 968 969 if (pkey < 0 || pkey > 0xffff) 970 return -EINVAL; 971 972 /* 973 * Set the full membership bit, so that we join the right 974 * broadcast group, etc. 975 */ 976 pkey |= 0x8000; 977 978 ret = ipoib_vlan_add(container_of(cdev, struct net_device, class_dev), 979 pkey); 980 981 return ret ? ret : count; 982} 983static CLASS_DEVICE_ATTR(create_child, S_IWUGO, NULL, create_child); 984 985static ssize_t delete_child(struct class_device *cdev, 986 const char *buf, size_t count) 987{ 988 int pkey; 989 int ret; 990 991 if (sscanf(buf, "%i", &pkey) != 1) 992 return -EINVAL; 993 994 if (pkey < 0 || pkey > 0xffff) 995 return -EINVAL; 996 997 ret = ipoib_vlan_delete(container_of(cdev, struct net_device, class_dev), 998 pkey); 999 1000 return ret ? ret : count; 1001 1002} 1003static CLASS_DEVICE_ATTR(delete_child, S_IWUGO, NULL, delete_child); 1004 1005int ipoib_add_pkey_attr(struct net_device *dev) 1006{ 1007 return class_device_create_file(&dev->class_dev, 1008 &class_device_attr_pkey); 1009} 1010 1011static struct net_device *ipoib_add_port(const char *format, 1012 struct ib_device *hca, u8 port) 1013{ 1014 struct ipoib_dev_priv *priv; 1015 int result = -ENOMEM; 1016 1017 priv = ipoib_intf_alloc(format); 1018 if (!priv) 1019 goto alloc_mem_failed; 1020 1021 SET_NETDEV_DEV(priv->dev, hca->dma_device); 1022 1023 result = ib_query_pkey(hca, port, 0, &priv->pkey); 1024 if (result) { 1025 printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n", 1026 hca->name, port, result); 1027 goto alloc_mem_failed; 1028 } 1029 1030 /* 1031 * Set the full membership bit, so that we join the right 1032 * broadcast group, etc. 1033 */ 1034 priv->pkey |= 0x8000; 1035 1036 priv->dev->broadcast[8] = priv->pkey >> 8; 1037 priv->dev->broadcast[9] = priv->pkey & 0xff; 1038 1039 result = ib_query_gid(hca, port, 0, &priv->local_gid); 1040 if (result) { 1041 printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n", 1042 hca->name, port, result); 1043 goto alloc_mem_failed; 1044 } else 1045 memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid)); 1046 1047 1048 result = ipoib_dev_init(priv->dev, hca, port); 1049 if (result < 0) { 1050 printk(KERN_WARNING "%s: failed to initialize port %d (ret = %d)\n", 1051 hca->name, port, result); 1052 goto device_init_failed; 1053 } 1054 1055 INIT_IB_EVENT_HANDLER(&priv->event_handler, 1056 priv->ca, ipoib_event); 1057 result = ib_register_event_handler(&priv->event_handler); 1058 if (result < 0) { 1059 printk(KERN_WARNING "%s: ib_register_event_handler failed for " 1060 "port %d (ret = %d)\n", 1061 hca->name, port, result); 1062 goto event_failed; 1063 } 1064 1065 result = register_netdev(priv->dev); 1066 if (result) { 1067 printk(KERN_WARNING "%s: couldn't register ipoib port %d; error %d\n", 1068 hca->name, port, result); 1069 goto register_failed; 1070 } 1071 1072 ipoib_create_debug_files(priv->dev); 1073 1074 if (ipoib_add_pkey_attr(priv->dev)) 1075 goto sysfs_failed; 1076 if (class_device_create_file(&priv->dev->class_dev, 1077 &class_device_attr_create_child)) 1078 goto sysfs_failed; 1079 if (class_device_create_file(&priv->dev->class_dev, 1080 &class_device_attr_delete_child)) 1081 goto sysfs_failed; 1082 1083 return priv->dev; 1084 1085sysfs_failed: 1086 ipoib_delete_debug_files(priv->dev); 1087 unregister_netdev(priv->dev); 1088 1089register_failed: 1090 ib_unregister_event_handler(&priv->event_handler); 1091 flush_scheduled_work(); 1092 1093event_failed: 1094 ipoib_dev_cleanup(priv->dev); 1095 1096device_init_failed: 1097 free_netdev(priv->dev); 1098 1099alloc_mem_failed: 1100 return ERR_PTR(result); 1101} 1102 1103static void ipoib_add_one(struct ib_device *device) 1104{ 1105 struct list_head *dev_list; 1106 struct net_device *dev; 1107 struct ipoib_dev_priv *priv; 1108 int s, e, p; 1109 1110 dev_list = kmalloc(sizeof *dev_list, GFP_KERNEL); 1111 if (!dev_list) 1112 return; 1113 1114 INIT_LIST_HEAD(dev_list); 1115 1116 if (device->node_type == IB_NODE_SWITCH) { 1117 s = 0; 1118 e = 0; 1119 } else { 1120 s = 1; 1121 e = device->phys_port_cnt; 1122 } 1123 1124 for (p = s; p <= e; ++p) { 1125 dev = ipoib_add_port("ib%d", device, p); 1126 if (!IS_ERR(dev)) { 1127 priv = netdev_priv(dev); 1128 list_add_tail(&priv->list, dev_list); 1129 } 1130 } 1131 1132 ib_set_client_data(device, &ipoib_client, dev_list); 1133} 1134 1135static void ipoib_remove_one(struct ib_device *device) 1136{ 1137 struct ipoib_dev_priv *priv, *tmp; 1138 struct list_head *dev_list; 1139 1140 dev_list = ib_get_client_data(device, &ipoib_client); 1141 1142 list_for_each_entry_safe(priv, tmp, dev_list, list) { 1143 ib_unregister_event_handler(&priv->event_handler); 1144 flush_scheduled_work(); 1145 1146 unregister_netdev(priv->dev); 1147 ipoib_dev_cleanup(priv->dev); 1148 free_netdev(priv->dev); 1149 } 1150 1151 kfree(dev_list); 1152} 1153 1154static int __init ipoib_init_module(void) 1155{ 1156 int ret; 1157 1158 ipoib_recvq_size = roundup_pow_of_two(ipoib_recvq_size); 1159 ipoib_recvq_size = min(ipoib_recvq_size, IPOIB_MAX_QUEUE_SIZE); 1160 ipoib_recvq_size = max(ipoib_recvq_size, IPOIB_MIN_QUEUE_SIZE); 1161 1162 ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size); 1163 ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE); 1164 ipoib_sendq_size = max(ipoib_sendq_size, IPOIB_MIN_QUEUE_SIZE); 1165 1166 ret = ipoib_register_debugfs(); 1167 if (ret) 1168 return ret; 1169 1170 /* 1171 * We create our own workqueue mainly because we want to be 1172 * able to flush it when devices are being removed. We can't 1173 * use schedule_work()/flush_scheduled_work() because both 1174 * unregister_netdev() and linkwatch_event take the rtnl lock, 1175 * so flush_scheduled_work() can deadlock during device 1176 * removal. 1177 */ 1178 ipoib_workqueue = create_singlethread_workqueue("ipoib"); 1179 if (!ipoib_workqueue) { 1180 ret = -ENOMEM; 1181 goto err_fs; 1182 } 1183 1184 ret = ib_register_client(&ipoib_client); 1185 if (ret) 1186 goto err_wq; 1187 1188 return 0; 1189 1190err_wq: 1191 destroy_workqueue(ipoib_workqueue); 1192 1193err_fs: 1194 ipoib_unregister_debugfs(); 1195 1196 return ret; 1197} 1198 1199static void __exit ipoib_cleanup_module(void) 1200{ 1201 ib_unregister_client(&ipoib_client); 1202 ipoib_unregister_debugfs(); 1203 destroy_workqueue(ipoib_workqueue); 1204} 1205 1206module_init(ipoib_init_module); 1207module_exit(ipoib_cleanup_module); 1208