l2t.c revision e48f129c2f200dde8899f6ea5c6e7173674fc482
/*
 * Copyright (c) 2003-2008 Chelsio, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/if.h>
#include <linux/if_vlan.h>
#include <linux/jhash.h>
#include <linux/slab.h>
#include <net/neighbour.h>
#include "common.h"
#include "t3cdev.h"
#include "cxgb3_defs.h"
#include "l2t.h"
#include "t3_cpl.h"
#include "firmware_exports.h"

#define VLAN_NONE 0xfff

/*
 * Module locking notes:  There is a RW lock protecting the L2 table as a
 * whole plus a spinlock per L2T entry.  Entry lookups and allocations happen
 * under the protection of the table lock, individual entry changes happen
 * while holding that entry's spinlock.  The table lock nests outside the
 * entry locks.  Allocations of new entries take the table lock as writers so
 * no other lookups can happen while allocating new entries.  Entry updates
 * take the table lock as readers so multiple entries can be updated in
 * parallel.  An L2T entry can be dropped by decrementing its reference count
 * and therefore can happen in parallel with entry allocation but no entry
 * can change state or increment its ref count during allocation as both of
 * these perform lookups.
 */

static inline unsigned int vlan_prio(const struct l2t_entry *e)
{
	return e->vlan >> 13;
}

static inline unsigned int arp_hash(u32 key, int ifindex,
				    const struct l2t_data *d)
{
	return jhash_2words(key, ifindex, 0) & (d->nentries - 1);
}

static inline void neigh_replace(struct l2t_entry *e, struct neighbour *n)
{
	neigh_hold(n);
	if (e->neigh)
		neigh_release(e->neigh);
	e->neigh = n;
}
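/*
 * Note on arp_hash(): the mask (d->nentries - 1) always yields an index
 * below d->nentries, but it only distributes keys uniformly when the table
 * capacity is a power of two.  A minimal sketch of the locking discipline
 * described in the notes above, using a hypothetical helper purely for
 * illustration (not part of this driver):
 */
#if 0
static void locking_sketch(struct l2t_data *d, struct l2t_entry *e)
{
	write_lock_bh(&d->lock);	/* table lock as writer: allocation */
	/* ... alloc_l2e(d) and link the entry into d->l2tab[hash] ... */
	write_unlock_bh(&d->lock);

	read_lock_bh(&d->lock);		/* table lock as reader: update */
	spin_lock(&e->lock);		/* entry lock nests inside */
	/* ... change e->state, e->neigh, e->dmac ... */
	spin_unlock(&e->lock);
	read_unlock_bh(&d->lock);
}
#endif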
/*
 * Set up an L2T entry and send any packets waiting in the arp queue.  The
 * supplied skb is used for the CPL_L2T_WRITE_REQ.  Must be called with the
 * entry locked.
 */
static int setup_l2e_send_pending(struct t3cdev *dev, struct sk_buff *skb,
				  struct l2t_entry *e)
{
	struct cpl_l2t_write_req *req;
	struct sk_buff *tmp;

	if (!skb) {
		skb = alloc_skb(sizeof(*req), GFP_ATOMIC);
		if (!skb)
			return -ENOMEM;
	}

	req = (struct cpl_l2t_write_req *)__skb_put(skb, sizeof(*req));
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_L2T_WRITE_REQ, e->idx));
	req->params = htonl(V_L2T_W_IDX(e->idx) | V_L2T_W_IFF(e->smt_idx) |
			    V_L2T_W_VLAN(e->vlan & VLAN_VID_MASK) |
			    V_L2T_W_PRIO(vlan_prio(e)));
	memcpy(e->dmac, e->neigh->ha, sizeof(e->dmac));
	memcpy(req->dst_mac, e->dmac, sizeof(req->dst_mac));
	skb->priority = CPL_PRIORITY_CONTROL;
	cxgb3_ofld_send(dev, skb);

	skb_queue_walk_safe(&e->arpq, skb, tmp) {
		__skb_unlink(skb, &e->arpq);
		cxgb3_ofld_send(dev, skb);
	}
	e->state = L2T_STATE_VALID;

	return 0;
}

/*
 * Add a packet to an L2T entry's queue of packets awaiting resolution.
 * Must be called with the entry's lock held.
 */
static inline void arpq_enqueue(struct l2t_entry *e, struct sk_buff *skb)
{
	__skb_queue_tail(&e->arpq, skb);
}

int t3_l2t_send_slow(struct t3cdev *dev, struct sk_buff *skb,
		     struct l2t_entry *e)
{
again:
	switch (e->state) {
	case L2T_STATE_STALE:	/* entry is stale, kick off revalidation */
		neigh_event_send(e->neigh, NULL);
		spin_lock_bh(&e->lock);
		if (e->state == L2T_STATE_STALE)
			e->state = L2T_STATE_VALID;
		spin_unlock_bh(&e->lock);
		/* fall through */
	case L2T_STATE_VALID:	/* fast-path, send the packet on */
		return cxgb3_ofld_send(dev, skb);
	case L2T_STATE_RESOLVING:
		spin_lock_bh(&e->lock);
		if (e->state != L2T_STATE_RESOLVING) {
			/* ARP already completed */
			spin_unlock_bh(&e->lock);
			goto again;
		}
		arpq_enqueue(e, skb);
		spin_unlock_bh(&e->lock);

		/*
		 * Only the first packet added to the arpq should kick off
		 * resolution.  However, because the alloc_skb below can fail,
		 * we allow each packet added to the arpq to retry resolution
		 * as a way of recovering from transient memory exhaustion.
		 * A better way would be to use a work request to retry L2T
		 * entries when there's no memory.
		 */
		if (!neigh_event_send(e->neigh, NULL)) {
			skb = alloc_skb(sizeof(struct cpl_l2t_write_req),
					GFP_ATOMIC);
			if (!skb)
				break;

			spin_lock_bh(&e->lock);
			if (!skb_queue_empty(&e->arpq))
				setup_l2e_send_pending(dev, skb, e);
			else	/* we lost the race */
				__kfree_skb(skb);
			spin_unlock_bh(&e->lock);
		}
	}
	return 0;
}

EXPORT_SYMBOL(t3_l2t_send_slow);
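/*
 * Callers normally reach t3_l2t_send_slow() through the l2t_send() fast-path
 * wrapper in l2t.h, which short-circuits entries already in L2T_STATE_VALID.
 * A minimal caller sketch (hypothetical, for illustration only):
 */
#if 0
static int example_offload_send(struct t3cdev *dev, struct sk_buff *skb,
				struct l2t_entry *e)
{
	/* Valid entries go straight out; everything else queues on
	 * e->arpq and kicks neighbour resolution as needed. */
	if (likely(e->state == L2T_STATE_VALID))
		return cxgb3_ofld_send(dev, skb);
	return t3_l2t_send_slow(dev, skb, e);
}
#endif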
void t3_l2t_send_event(struct t3cdev *dev, struct l2t_entry *e)
{
again:
	switch (e->state) {
	case L2T_STATE_STALE:	/* entry is stale, kick off revalidation */
		neigh_event_send(e->neigh, NULL);
		spin_lock_bh(&e->lock);
		if (e->state == L2T_STATE_STALE)
			e->state = L2T_STATE_VALID;
		spin_unlock_bh(&e->lock);
		return;
	case L2T_STATE_VALID:	/* fast-path, send the packet on */
		return;
	case L2T_STATE_RESOLVING:
		spin_lock_bh(&e->lock);
		if (e->state != L2T_STATE_RESOLVING) {
			/* ARP already completed */
			spin_unlock_bh(&e->lock);
			goto again;
		}
		spin_unlock_bh(&e->lock);

		/*
		 * Only the first packet added to the arpq should kick off
		 * resolution.  Unlike t3_l2t_send_slow() there is no packet
		 * to queue here, so simply re-kick neighbour resolution.
		 */
		neigh_event_send(e->neigh, NULL);
	}
}

EXPORT_SYMBOL(t3_l2t_send_event);

/*
 * Allocate a free L2T entry.  Must be called with l2t_data.lock held.
 */
static struct l2t_entry *alloc_l2e(struct l2t_data *d)
{
	struct l2t_entry *end, *e, **p;

	if (!atomic_read(&d->nfree))
		return NULL;

	/* there's definitely a free entry */
	for (e = d->rover, end = &d->l2tab[d->nentries]; e != end; ++e)
		if (atomic_read(&e->refcnt) == 0)
			goto found;

	for (e = &d->l2tab[1]; atomic_read(&e->refcnt); ++e)
		;
found:
	d->rover = e + 1;
	atomic_dec(&d->nfree);

	/*
	 * The entry we found may be an inactive entry that is
	 * presently in the hash table.  We need to remove it.
	 */
	if (e->state != L2T_STATE_UNUSED) {
		int hash = arp_hash(e->addr, e->ifindex, d);

		for (p = &d->l2tab[hash].first; *p; p = &(*p)->next)
			if (*p == e) {
				*p = e->next;
				break;
			}
		e->state = L2T_STATE_UNUSED;
	}
	return e;
}

/*
 * Called when an L2T entry has no more users.  The entry is left in the hash
 * table since it is likely to be reused but we also bump nfree to indicate
 * that the entry can be reallocated for a different neighbor.  We also drop
 * the existing neighbor reference in case the neighbor is going away and is
 * waiting on our reference.
 *
 * Because entries can be reallocated to other neighbors once their ref count
 * drops to 0 we need to take the entry's lock to avoid races with a new
 * incarnation.
 */
void t3_l2e_free(struct l2t_data *d, struct l2t_entry *e)
{
	spin_lock_bh(&e->lock);
	if (atomic_read(&e->refcnt) == 0) {	/* hasn't been recycled */
		if (e->neigh) {
			neigh_release(e->neigh);
			e->neigh = NULL;
		}
	}
	spin_unlock_bh(&e->lock);
	atomic_inc(&d->nfree);
}

EXPORT_SYMBOL(t3_l2e_free);
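/*
 * Reference-counting sketch: users do not call t3_l2e_free() directly; the
 * l2t_release() helper in l2t.h drops the last reference and invokes it.
 * A hypothetical user of the entry lifetime (illustration only, not part of
 * this driver):
 */
#if 0
static void example_entry_lifetime(struct t3cdev *dev, struct neighbour *n,
				   struct net_device *ndev)
{
	struct l2t_entry *e = t3_l2t_get(dev, n, ndev);	/* takes a ref */

	if (!e)
		return;
	/* ... use e->idx in offload work requests ... */
	l2t_release(dev, e);	/* last ref drops to 0 -> t3_l2e_free() */
}
#endif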
/*
 * Update an L2T entry that was previously used for the same next hop as neigh.
 * Must be called with softirqs disabled.
 */
static inline void reuse_entry(struct l2t_entry *e, struct neighbour *neigh)
{
	unsigned int nud_state;

	spin_lock(&e->lock);	/* avoid race with t3_l2e_free */

	if (neigh != e->neigh)
		neigh_replace(e, neigh);
	nud_state = neigh->nud_state;
	if (memcmp(e->dmac, neigh->ha, sizeof(e->dmac)) ||
	    !(nud_state & NUD_VALID))
		e->state = L2T_STATE_RESOLVING;
	else if (nud_state & NUD_CONNECTED)
		e->state = L2T_STATE_VALID;
	else
		e->state = L2T_STATE_STALE;
	spin_unlock(&e->lock);
}

struct l2t_entry *t3_l2t_get(struct t3cdev *cdev, struct neighbour *neigh,
			     struct net_device *dev)
{
	struct l2t_entry *e = NULL;
	struct l2t_data *d;
	int hash;
	u32 addr = *(u32 *) neigh->primary_key;
	int ifidx = neigh->dev->ifindex;
	struct port_info *p = netdev_priv(dev);
	int smt_idx = p->port_id;

	rcu_read_lock();
	d = L2DATA(cdev);
	if (!d)
		goto done_rcu;

	hash = arp_hash(addr, ifidx, d);

	write_lock_bh(&d->lock);
	for (e = d->l2tab[hash].first; e; e = e->next)
		if (e->addr == addr && e->ifindex == ifidx &&
		    e->smt_idx == smt_idx) {
			l2t_hold(d, e);
			if (atomic_read(&e->refcnt) == 1)
				reuse_entry(e, neigh);
			goto done;
		}

	/* Need to allocate a new entry */
	e = alloc_l2e(d);
	if (e) {
		spin_lock(&e->lock);	/* avoid race with t3_l2e_free */
		e->next = d->l2tab[hash].first;
		d->l2tab[hash].first = e;
		e->state = L2T_STATE_RESOLVING;
		e->addr = addr;
		e->ifindex = ifidx;
		e->smt_idx = smt_idx;
		atomic_set(&e->refcnt, 1);
		neigh_replace(e, neigh);
		if (neigh->dev->priv_flags & IFF_802_1Q_VLAN)
			e->vlan = vlan_dev_vlan_id(neigh->dev);
		else
			e->vlan = VLAN_NONE;
		spin_unlock(&e->lock);
	}
done:
	write_unlock_bh(&d->lock);
done_rcu:
	rcu_read_unlock();
	return e;
}

EXPORT_SYMBOL(t3_l2t_get);

/*
 * Called when address resolution fails for an L2T entry to handle packets
 * on the arpq head.  If a packet specifies a failure handler it is invoked,
 * otherwise the packet is sent to the offload device.
 *
 * XXX: maybe we should abandon the latter behavior and just require a failure
 * handler.
 */
static void handle_failed_resolution(struct t3cdev *dev, struct sk_buff_head *arpq)
{
	struct sk_buff *skb, *tmp;

	skb_queue_walk_safe(arpq, skb, tmp) {
		struct l2t_skb_cb *cb = L2T_SKB_CB(skb);

		__skb_unlink(skb, arpq);
		if (cb->arp_failure_handler)
			cb->arp_failure_handler(dev, skb);
		else
			cxgb3_ofld_send(dev, skb);
	}
}
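/*
 * Senders opt in to failure notification by setting a handler in the skb's
 * control block via set_arp_failure_handler() from l2t.h before handing the
 * skb to t3_l2t_send_slow().  A hypothetical handler (illustration only,
 * not part of this driver):
 */
#if 0
static void example_arp_failure(struct t3cdev *dev, struct sk_buff *skb)
{
	/* Resolution failed; drop the queued request instead of sending. */
	kfree_skb(skb);
}

static void example_queue_with_handler(struct t3cdev *dev,
				       struct sk_buff *skb,
				       struct l2t_entry *e)
{
	set_arp_failure_handler(skb, example_arp_failure);
	t3_l2t_send_slow(dev, skb, e);
}
#endif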
/*
 * Called when the host's ARP layer makes a change to some entry that is
 * loaded into the HW L2 table.
 */
void t3_l2t_update(struct t3cdev *dev, struct neighbour *neigh)
{
	struct sk_buff_head arpq;
	struct l2t_entry *e;
	struct l2t_data *d = L2DATA(dev);
	u32 addr = *(u32 *) neigh->primary_key;
	int ifidx = neigh->dev->ifindex;
	int hash = arp_hash(addr, ifidx, d);

	read_lock_bh(&d->lock);
	for (e = d->l2tab[hash].first; e; e = e->next)
		if (e->addr == addr && e->ifindex == ifidx) {
			spin_lock(&e->lock);
			goto found;
		}
	read_unlock_bh(&d->lock);
	return;

found:
	__skb_queue_head_init(&arpq);

	/*
	 * Drop the table lock without re-enabling softirqs; they stay
	 * disabled until the entry lock is released with spin_unlock_bh()
	 * below.
	 */
	read_unlock(&d->lock);
	if (atomic_read(&e->refcnt)) {
		if (neigh != e->neigh)
			neigh_replace(e, neigh);

		if (e->state == L2T_STATE_RESOLVING) {
			if (neigh->nud_state & NUD_FAILED) {
				skb_queue_splice_init(&e->arpq, &arpq);
			} else if (neigh->nud_state & (NUD_CONNECTED|NUD_STALE))
				setup_l2e_send_pending(dev, NULL, e);
		} else {
			e->state = neigh->nud_state & NUD_CONNECTED ?
			    L2T_STATE_VALID : L2T_STATE_STALE;
			if (memcmp(e->dmac, neigh->ha, sizeof(e->dmac)))
				setup_l2e_send_pending(dev, NULL, e);
		}
	}
	spin_unlock_bh(&e->lock);

	if (!skb_queue_empty(&arpq))
		handle_failed_resolution(dev, &arpq);
}

struct l2t_data *t3_init_l2t(unsigned int l2t_capacity)
{
	struct l2t_data *d;
	int i, size = sizeof(*d) + l2t_capacity * sizeof(struct l2t_entry);

	d = cxgb_alloc_mem(size);
	if (!d)
		return NULL;

	d->nentries = l2t_capacity;
	d->rover = &d->l2tab[1];	/* entry 0 is not used */
	atomic_set(&d->nfree, l2t_capacity - 1);
	rwlock_init(&d->lock);

	for (i = 0; i < l2t_capacity; ++i) {
		d->l2tab[i].idx = i;
		d->l2tab[i].state = L2T_STATE_UNUSED;
		__skb_queue_head_init(&d->l2tab[i].arpq);
		spin_lock_init(&d->l2tab[i].lock);
		atomic_set(&d->l2tab[i].refcnt, 0);
	}
	return d;
}

void t3_free_l2t(struct l2t_data *d)
{
	cxgb_free_mem(d);
}
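/*
 * Initialization sketch: arp_hash() masks with (nentries - 1), which keeps
 * indices in range for any capacity but only hashes uniformly when the
 * capacity is a power of two; the cxgb3 offload setup derives the actual
 * capacity from hardware parameters.  A hypothetical setup/teardown pair
 * (illustration only, not part of this driver; the capacity value is made
 * up):
 */
#if 0
static struct l2t_data *example_l2t_setup(void)
{
	struct l2t_data *d = t3_init_l2t(2048);	/* power-of-two capacity */

	if (!d)
		return NULL;
	/* ... publish d through the t3cdev so L2DATA() can find it ... */
	return d;
}

static void example_l2t_teardown(struct l2t_data *d)
{
	/* ... unpublish and wait for readers before freeing ... */
	t3_free_l2t(d);
}
#endif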