ipath_rc.c revision 3859e39d75b72f35f7d38c618fbbacb39a440c22
/*
 * Copyright (c) 2006 QLogic, Inc. All rights reserved.
 * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "ipath_verbs.h"
#include "ipath_kernel.h"

/* cut down ridiculously long IB macro names */
#define OP(x) IB_OPCODE_RC_##x

static u32 restart_sge(struct ipath_sge_state *ss, struct ipath_swqe *wqe,
		       u32 psn, u32 pmtu)
{
	u32 len;

	len = ((psn - wqe->psn) & IPATH_PSN_MASK) * pmtu;
	ss->sge = wqe->sg_list[0];
	ss->sg_list = wqe->sg_list + 1;
	ss->num_sge = wqe->wr.num_sge;
	ipath_skip_sge(ss, len);
	return wqe->length - len;
}

/**
 * ipath_init_restart - initialize the qp->s_sge after a restart
 * @qp: the QP whose SGE we're restarting
 * @wqe: the work queue to initialize the QP's SGE from
 *
 * The QP s_lock should be held and interrupts disabled.
 */
static void ipath_init_restart(struct ipath_qp *qp, struct ipath_swqe *wqe)
{
	struct ipath_ibdev *dev;

	qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn,
				ib_mtu_enum_to_int(qp->path_mtu));
	dev = to_idev(qp->ibqp.device);
	spin_lock(&dev->pending_lock);
	if (list_empty(&qp->timerwait))
		list_add_tail(&qp->timerwait,
			      &dev->pending[dev->pending_index]);
	spin_unlock(&dev->pending_lock);
}

/**
 * ipath_make_rc_ack - construct a response packet (ACK, NAK, or RDMA read)
 * @qp: a pointer to the QP
 * @ohdr: a pointer to the IB header being constructed
 * @pmtu: the path MTU
 *
 * Return 1 if constructed; otherwise, return 0.
 * Note that we are in the responder's side of the QP context.
 * Note the QP s_lock must be held.
 */
static int ipath_make_rc_ack(struct ipath_qp *qp,
			     struct ipath_other_headers *ohdr,
			     u32 pmtu, u32 *bth0p, u32 *bth2p)
{
	struct ipath_ack_entry *e;
	u32 hwords;
	u32 len;
	u32 bth0;
	u32 bth2;

	/* header size in 32-bit words LRH+BTH = (8+12)/4. */
	hwords = 5;
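
	/*
	 * Editor's note: the optional headers built below (AETH,
	 * AtomicAckETH) each bump hwords by their size in 32-bit words;
	 * the final count is stored in qp->s_hdrwords and used when the
	 * packet is actually sent.
	 */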

	switch (qp->s_ack_state) {
	case OP(RDMA_READ_RESPONSE_LAST):
	case OP(RDMA_READ_RESPONSE_ONLY):
	case OP(ATOMIC_ACKNOWLEDGE):
		qp->s_ack_state = OP(ACKNOWLEDGE);
		/* FALLTHROUGH */
	case OP(ACKNOWLEDGE):
		/* Check for no next entry in the queue. */
		if (qp->r_head_ack_queue == qp->s_tail_ack_queue) {
			if (qp->s_flags & IPATH_S_ACK_PENDING)
				goto normal;
			goto bail;
		}

		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
		if (e->opcode == OP(RDMA_READ_REQUEST)) {
			/* Copy SGE state in case we need to resend */
			qp->s_ack_rdma_sge = e->rdma_sge;
			qp->s_cur_sge = &qp->s_ack_rdma_sge;
			len = e->rdma_sge.sge.sge_length;
			if (len > pmtu) {
				len = pmtu;
				qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST);
			} else {
				qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY);
				if (++qp->s_tail_ack_queue >
				    IPATH_MAX_RDMA_ATOMIC)
					qp->s_tail_ack_queue = 0;
			}
			ohdr->u.aeth = ipath_compute_aeth(qp);
			hwords++;
			qp->s_ack_rdma_psn = e->psn;
			bth2 = qp->s_ack_rdma_psn++ & IPATH_PSN_MASK;
		} else {
			/* COMPARE_SWAP or FETCH_ADD */
			qp->s_cur_sge = NULL;
			len = 0;
			qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
			ohdr->u.at.aeth = ipath_compute_aeth(qp);
			ohdr->u.at.atomic_ack_eth[0] =
				cpu_to_be32(e->atomic_data >> 32);
			ohdr->u.at.atomic_ack_eth[1] =
				cpu_to_be32(e->atomic_data);
			hwords += sizeof(ohdr->u.at) / sizeof(u32);
			bth2 = e->psn;
			if (++qp->s_tail_ack_queue > IPATH_MAX_RDMA_ATOMIC)
				qp->s_tail_ack_queue = 0;
		}
		bth0 = qp->s_ack_state << 24;
		break;

	case OP(RDMA_READ_RESPONSE_FIRST):
		qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);
		/* FALLTHROUGH */
	case OP(RDMA_READ_RESPONSE_MIDDLE):
		len = qp->s_ack_rdma_sge.sge.sge_length;
		if (len > pmtu)
			len = pmtu;
		else {
			ohdr->u.aeth = ipath_compute_aeth(qp);
			hwords++;
			qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
			if (++qp->s_tail_ack_queue > IPATH_MAX_RDMA_ATOMIC)
				qp->s_tail_ack_queue = 0;
		}
		bth0 = qp->s_ack_state << 24;
		bth2 = qp->s_ack_rdma_psn++ & IPATH_PSN_MASK;
		break;

	default:
	normal:
		/*
		 * Send a regular ACK.
		 * Set s_ack_state to ATOMIC_ACKNOWLEDGE so that we wait
		 * until after this ACK has been sent before going back to
		 * ACKNOWLEDGE (see the fall-through cases above).
		 */
		qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
		qp->s_flags &= ~IPATH_S_ACK_PENDING;
		qp->s_cur_sge = NULL;
		if (qp->s_nak_state)
			ohdr->u.aeth =
				cpu_to_be32((qp->r_msn & IPATH_MSN_MASK) |
					    (qp->s_nak_state <<
					     IPATH_AETH_CREDIT_SHIFT));
		else
			ohdr->u.aeth = ipath_compute_aeth(qp);
		hwords++;
		len = 0;
		bth0 = OP(ACKNOWLEDGE) << 24;
		bth2 = qp->s_ack_psn & IPATH_PSN_MASK;
	}
	qp->s_hdrwords = hwords;
	qp->s_cur_size = len;
	*bth0p = bth0;
	*bth2p = bth2;
	return 1;

bail:
	return 0;
}
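
/*
 * Editor's note (illustrative, not from the original source): PSNs are
 * 24-bit values, so ordering tests go through ipath_cmp24(), which is
 * assumed to return a signed difference modulo 2^24.  For example,
 * ipath_cmp24(0x000001, 0xfffffe) > 0, i.e. PSN 1 is "after" PSN
 * 0xfffffe once the counter wraps.
 */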

/**
 * ipath_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC)
 * @qp: a pointer to the QP
 * @ohdr: a pointer to the IB header being constructed
 * @pmtu: the path MTU
 * @bth0p: pointer to the BTH opcode word
 * @bth2p: pointer to the BTH PSN word
 *
 * Return 1 if constructed; otherwise, return 0.
 * Note the QP s_lock must be held and interrupts disabled.
 */
int ipath_make_rc_req(struct ipath_qp *qp,
		      struct ipath_other_headers *ohdr,
		      u32 pmtu, u32 *bth0p, u32 *bth2p)
{
	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
	struct ipath_sge_state *ss;
	struct ipath_swqe *wqe;
	u32 hwords;
	u32 len;
	u32 bth0;
	u32 bth2;
	char newreq;

	/* Sending responses takes priority over sending requests. */
	if ((qp->r_head_ack_queue != qp->s_tail_ack_queue ||
	     (qp->s_flags & IPATH_S_ACK_PENDING) ||
	     qp->s_ack_state != IB_OPCODE_RC_ACKNOWLEDGE) &&
	    ipath_make_rc_ack(qp, ohdr, pmtu, bth0p, bth2p))
		goto done;

	if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) ||
	    qp->s_rnr_timeout)
		goto bail;

	/* Limit the number of packets sent without an ACK. */
	if (ipath_cmp24(qp->s_psn, qp->s_last_psn + IPATH_PSN_CREDIT) > 0) {
		qp->s_wait_credit = 1;
		dev->n_rc_stalls++;
		spin_lock(&dev->pending_lock);
		if (list_empty(&qp->timerwait))
			list_add_tail(&qp->timerwait,
				      &dev->pending[dev->pending_index]);
		spin_unlock(&dev->pending_lock);
		goto bail;
	}

	/* header size in 32-bit words LRH+BTH = (8+12)/4. */
	hwords = 5;
	bth0 = 0;

	/* Send a request. */
	wqe = get_swqe_ptr(qp, qp->s_cur);
	switch (qp->s_state) {
	default:
		/*
		 * Resend an old request or start a new one.
		 *
		 * We keep track of the current SWQE so that
		 * we don't reset the "furthest progress" state
		 * if we need to back up.
		 */
		newreq = 0;
		if (qp->s_cur == qp->s_tail) {
			/* Check if send work queue is empty. */
			if (qp->s_tail == qp->s_head)
				goto bail;
			/*
			 * If a fence is requested, wait for previous
			 * RDMA read and atomic operations to finish.
			 */
			if ((wqe->wr.send_flags & IB_SEND_FENCE) &&
			    qp->s_num_rd_atomic) {
				qp->s_flags |= IPATH_S_FENCE_PENDING;
				goto bail;
			}
			wqe->psn = qp->s_next_psn;
			newreq = 1;
		}
		/*
		 * Note that we have to be careful not to modify the
		 * original work request since we may need to resend
		 * it.
		 */
		len = wqe->length;
		ss = &qp->s_sge;
		bth2 = 0;
		switch (wqe->wr.opcode) {
		case IB_WR_SEND:
		case IB_WR_SEND_WITH_IMM:
			/* If no credit, return. */
			if (qp->s_lsn != (u32) -1 &&
			    ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0)
				goto bail;
			wqe->lpsn = wqe->psn;
			if (len > pmtu) {
				wqe->lpsn += (len - 1) / pmtu;
				qp->s_state = OP(SEND_FIRST);
				len = pmtu;
				break;
			}
			if (wqe->wr.opcode == IB_WR_SEND)
				qp->s_state = OP(SEND_ONLY);
			else {
				qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE);
				/* Immediate data comes after the BTH */
				ohdr->u.imm_data = wqe->wr.imm_data;
				hwords += 1;
			}
			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
				bth0 |= 1 << 23;
			bth2 = 1 << 31;	/* Request ACK. */
			if (++qp->s_cur == qp->s_size)
				qp->s_cur = 0;
			break;
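
		/*
		 * Editor's note (worked example): for a multi-packet
		 * request, lpsn = psn + (len - 1) / pmtu above gives the
		 * PSN of the last packet.  E.g. len = 10000 and
		 * pmtu = 4096 takes 3 packets, so lpsn = psn + 2.
		 */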

		case IB_WR_RDMA_WRITE:
			if (newreq && qp->s_lsn != (u32) -1)
				qp->s_lsn++;
			/* FALLTHROUGH */
		case IB_WR_RDMA_WRITE_WITH_IMM:
			/* If no credit, return. */
			if (qp->s_lsn != (u32) -1 &&
			    ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0)
				goto bail;
			ohdr->u.rc.reth.vaddr =
				cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
			ohdr->u.rc.reth.rkey =
				cpu_to_be32(wqe->wr.wr.rdma.rkey);
			ohdr->u.rc.reth.length = cpu_to_be32(len);
			hwords += sizeof(struct ib_reth) / sizeof(u32);
			wqe->lpsn = wqe->psn;
			if (len > pmtu) {
				wqe->lpsn += (len - 1) / pmtu;
				qp->s_state = OP(RDMA_WRITE_FIRST);
				len = pmtu;
				break;
			}
			if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
				qp->s_state = OP(RDMA_WRITE_ONLY);
			else {
				qp->s_state =
					OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
				/* Immediate data comes after RETH */
				ohdr->u.rc.imm_data = wqe->wr.imm_data;
				hwords += 1;
				if (wqe->wr.send_flags & IB_SEND_SOLICITED)
					bth0 |= 1 << 23;
			}
			bth2 = 1 << 31;	/* Request ACK. */
			if (++qp->s_cur == qp->s_size)
				qp->s_cur = 0;
			break;

		case IB_WR_RDMA_READ:
			/*
			 * Don't allow more operations to be started
			 * than the QP limits allow.
			 */
			if (newreq) {
				if (qp->s_num_rd_atomic >=
				    qp->s_max_rd_atomic) {
					qp->s_flags |= IPATH_S_RDMAR_PENDING;
					goto bail;
				}
				qp->s_num_rd_atomic++;
				if (qp->s_lsn != (u32) -1)
					qp->s_lsn++;
				/*
				 * Adjust s_next_psn to count the
				 * expected number of responses.
				 */
				if (len > pmtu)
					qp->s_next_psn += (len - 1) / pmtu;
				wqe->lpsn = qp->s_next_psn++;
			}
			ohdr->u.rc.reth.vaddr =
				cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
			ohdr->u.rc.reth.rkey =
				cpu_to_be32(wqe->wr.wr.rdma.rkey);
			ohdr->u.rc.reth.length = cpu_to_be32(len);
			qp->s_state = OP(RDMA_READ_REQUEST);
			hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
			ss = NULL;
			len = 0;
			if (++qp->s_cur == qp->s_size)
				qp->s_cur = 0;
			break;
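
		/*
		 * Editor's note (worked example): an RDMA read consumes
		 * one PSN per expected response packet, so for
		 * len = 10000 and pmtu = 4096 the code above yields three
		 * response PSNs psn..psn+2, wqe->lpsn = psn + 2, and
		 * s_next_psn = psn + 3.
		 */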

		case IB_WR_ATOMIC_CMP_AND_SWP:
		case IB_WR_ATOMIC_FETCH_AND_ADD:
			/*
			 * Don't allow more operations to be started
			 * than the QP limits allow.
			 */
			if (newreq) {
				if (qp->s_num_rd_atomic >=
				    qp->s_max_rd_atomic) {
					qp->s_flags |= IPATH_S_RDMAR_PENDING;
					goto bail;
				}
				qp->s_num_rd_atomic++;
				if (qp->s_lsn != (u32) -1)
					qp->s_lsn++;
				wqe->lpsn = wqe->psn;
			}
			if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
				qp->s_state = OP(COMPARE_SWAP);
				ohdr->u.atomic_eth.swap_data = cpu_to_be64(
					wqe->wr.wr.atomic.swap);
				ohdr->u.atomic_eth.compare_data = cpu_to_be64(
					wqe->wr.wr.atomic.compare_add);
			} else {
				qp->s_state = OP(FETCH_ADD);
				ohdr->u.atomic_eth.swap_data = cpu_to_be64(
					wqe->wr.wr.atomic.compare_add);
				ohdr->u.atomic_eth.compare_data = 0;
			}
			ohdr->u.atomic_eth.vaddr[0] = cpu_to_be32(
				wqe->wr.wr.atomic.remote_addr >> 32);
			ohdr->u.atomic_eth.vaddr[1] = cpu_to_be32(
				wqe->wr.wr.atomic.remote_addr);
			ohdr->u.atomic_eth.rkey = cpu_to_be32(
				wqe->wr.wr.atomic.rkey);
			hwords += sizeof(struct ib_atomic_eth) / sizeof(u32);
			ss = NULL;
			len = 0;
			if (++qp->s_cur == qp->s_size)
				qp->s_cur = 0;
			break;

		default:
			goto bail;
		}
		qp->s_sge.sge = wqe->sg_list[0];
		qp->s_sge.sg_list = wqe->sg_list + 1;
		qp->s_sge.num_sge = wqe->wr.num_sge;
		qp->s_len = wqe->length;
		if (newreq) {
			qp->s_tail++;
			if (qp->s_tail >= qp->s_size)
				qp->s_tail = 0;
		}
		bth2 |= qp->s_psn & IPATH_PSN_MASK;
		if (wqe->wr.opcode == IB_WR_RDMA_READ)
			qp->s_psn = wqe->lpsn + 1;
		else {
			qp->s_psn++;
			if ((int)(qp->s_psn - qp->s_next_psn) > 0)
				qp->s_next_psn = qp->s_psn;
		}
		/*
		 * Put the QP on the pending list so lost ACKs will cause
		 * a retry.  More than one request can be pending so the
		 * QP may already be on the dev->pending list.
		 */
		spin_lock(&dev->pending_lock);
		if (list_empty(&qp->timerwait))
			list_add_tail(&qp->timerwait,
				      &dev->pending[dev->pending_index]);
		spin_unlock(&dev->pending_lock);
		break;

	case OP(RDMA_READ_RESPONSE_FIRST):
		/*
		 * This case can only happen if a send is restarted.
		 * See ipath_restart_rc().
		 */
		ipath_init_restart(qp, wqe);
		/* FALLTHROUGH */
	case OP(SEND_FIRST):
		qp->s_state = OP(SEND_MIDDLE);
		/* FALLTHROUGH */
	case OP(SEND_MIDDLE):
		bth2 = qp->s_psn++ & IPATH_PSN_MASK;
		if ((int)(qp->s_psn - qp->s_next_psn) > 0)
			qp->s_next_psn = qp->s_psn;
		ss = &qp->s_sge;
		len = qp->s_len;
		if (len > pmtu) {
			len = pmtu;
			break;
		}
		if (wqe->wr.opcode == IB_WR_SEND)
			qp->s_state = OP(SEND_LAST);
		else {
			qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
			/* Immediate data comes after the BTH */
			ohdr->u.imm_data = wqe->wr.imm_data;
			hwords += 1;
		}
		if (wqe->wr.send_flags & IB_SEND_SOLICITED)
			bth0 |= 1 << 23;
		bth2 |= 1 << 31;	/* Request ACK. */
		qp->s_cur++;
		if (qp->s_cur >= qp->s_size)
			qp->s_cur = 0;
		break;
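
	/*
	 * Editor's note: a requester never sets s_state to a response
	 * opcode in the normal send path, so reset_psn() reuses
	 * RDMA_READ_RESPONSE_{FIRST,LAST,MIDDLE} (the case above and the
	 * two below) as markers meaning "restart in the middle of a send,
	 * RDMA write, or RDMA read" respectively.
	 */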

	case OP(RDMA_READ_RESPONSE_LAST):
		/*
		 * This case can only happen if an RDMA write is restarted.
		 * See ipath_restart_rc().
		 */
		ipath_init_restart(qp, wqe);
		/* FALLTHROUGH */
	case OP(RDMA_WRITE_FIRST):
		qp->s_state = OP(RDMA_WRITE_MIDDLE);
		/* FALLTHROUGH */
	case OP(RDMA_WRITE_MIDDLE):
		bth2 = qp->s_psn++ & IPATH_PSN_MASK;
		if ((int)(qp->s_psn - qp->s_next_psn) > 0)
			qp->s_next_psn = qp->s_psn;
		ss = &qp->s_sge;
		len = qp->s_len;
		if (len > pmtu) {
			len = pmtu;
			break;
		}
		if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
			qp->s_state = OP(RDMA_WRITE_LAST);
		else {
			qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
			/* Immediate data comes after the BTH */
			ohdr->u.imm_data = wqe->wr.imm_data;
			hwords += 1;
			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
				bth0 |= 1 << 23;
		}
		bth2 |= 1 << 31;	/* Request ACK. */
		qp->s_cur++;
		if (qp->s_cur >= qp->s_size)
			qp->s_cur = 0;
		break;

	case OP(RDMA_READ_RESPONSE_MIDDLE):
		/*
		 * This case can only happen if an RDMA read is restarted.
		 * See ipath_restart_rc().
		 */
		ipath_init_restart(qp, wqe);
		len = ((qp->s_psn - wqe->psn) & IPATH_PSN_MASK) * pmtu;
		ohdr->u.rc.reth.vaddr =
			cpu_to_be64(wqe->wr.wr.rdma.remote_addr + len);
		ohdr->u.rc.reth.rkey =
			cpu_to_be32(wqe->wr.wr.rdma.rkey);
		ohdr->u.rc.reth.length = cpu_to_be32(qp->s_len);
		qp->s_state = OP(RDMA_READ_REQUEST);
		hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
		bth2 = qp->s_psn++ & IPATH_PSN_MASK;
		if ((int)(qp->s_psn - qp->s_next_psn) > 0)
			qp->s_next_psn = qp->s_psn;
		ss = NULL;
		len = 0;
		qp->s_cur++;
		if (qp->s_cur == qp->s_size)
			qp->s_cur = 0;
		break;
	}
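
	/*
	 * Editor's note: bit 31 of the BTH PSN word is the AckReq bit
	 * (hence the "Request ACK" assignments above).  It is also forced
	 * on below when the unacknowledged window is almost exhausted, so
	 * the responder acks promptly and the IPATH_PSN_CREDIT stall at
	 * the top of this function stays short.
	 */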
	if (ipath_cmp24(qp->s_psn, qp->s_last_psn + IPATH_PSN_CREDIT - 1) >= 0)
		bth2 |= 1 << 31;	/* Request ACK. */
	qp->s_len -= len;
	qp->s_hdrwords = hwords;
	qp->s_cur_sge = ss;
	qp->s_cur_size = len;
	*bth0p = bth0 | (qp->s_state << 24);
	*bth2p = bth2;
done:
	return 1;

bail:
	return 0;
}

/**
 * send_rc_ack - Construct an ACK packet and send it
 * @qp: a pointer to the QP
 *
 * This is called from ipath_rc_rcv() and only uses the receive
 * side QP state.
 * Note that RDMA reads and atomics are handled in the
 * send side QP state and tasklet.
 */
static void send_rc_ack(struct ipath_qp *qp)
{
	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
	u16 lrh0;
	u32 bth0;
	u32 hwords;
	struct ipath_ib_header hdr;
	struct ipath_other_headers *ohdr;

	/* Don't send ACK or NAK if an RDMA read or atomic is pending. */
	if (qp->r_head_ack_queue != qp->s_tail_ack_queue)
		goto queue_ack;

	/* Construct the header. */
	ohdr = &hdr.u.oth;
	lrh0 = IPATH_LRH_BTH;
	/* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4. */
	hwords = 6;
	if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
		hwords += ipath_make_grh(dev, &hdr.u.l.grh,
					 &qp->remote_ah_attr.grh,
					 hwords, 0);
		ohdr = &hdr.u.l.oth;
		lrh0 = IPATH_LRH_GRH;
	}
	/* read pkey_index w/o lock (it's atomic) */
	bth0 = ipath_get_pkey(dev->dd, qp->s_pkey_index) |
		OP(ACKNOWLEDGE) << 24;
	if (qp->r_nak_state)
		ohdr->u.aeth = cpu_to_be32((qp->r_msn & IPATH_MSN_MASK) |
					   (qp->r_nak_state <<
					    IPATH_AETH_CREDIT_SHIFT));
	else
		ohdr->u.aeth = ipath_compute_aeth(qp);
	lrh0 |= qp->remote_ah_attr.sl << 4;
	hdr.lrh[0] = cpu_to_be16(lrh0);
	hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
	hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC);
	hdr.lrh[3] = cpu_to_be16(dev->dd->ipath_lid);
	ohdr->bth[0] = cpu_to_be32(bth0);
	ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
	ohdr->bth[2] = cpu_to_be32(qp->r_ack_psn & IPATH_PSN_MASK);

	/*
	 * If we can send the ACK, clear the ACK state.
	 */
	if (ipath_verbs_send(dev->dd, hwords, (u32 *) &hdr, 0, NULL) == 0) {
		dev->n_unicast_xmit++;
		goto done;
	}

	/*
	 * We are out of PIO buffers at the moment.
	 * Pass responsibility for sending the ACK to the
	 * send tasklet so that when a PIO buffer becomes
	 * available, the ACK is sent ahead of other outgoing
	 * packets.
	 */
	dev->n_rc_qacks++;

queue_ack:
	spin_lock_irq(&qp->s_lock);
	qp->s_flags |= IPATH_S_ACK_PENDING;
	qp->s_nak_state = qp->r_nak_state;
	qp->s_ack_psn = qp->r_ack_psn;
	spin_unlock_irq(&qp->s_lock);

	/* Call ipath_do_rc_send() in another thread. */
	tasklet_hi_schedule(&qp->s_task);

done:
	return;
}
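
/*
 * Editor's note (illustrative): reset_psn() below walks the send queue
 * from s_last looking for the WQE that contains @psn.  For example, if
 * the WQE at s_last is an RDMA write covering PSNs 10..14 and psn == 12,
 * the loop leaves s_cur at that WQE and sets s_state to
 * OP(RDMA_READ_RESPONSE_LAST) so ipath_make_rc_req() restarts in the
 * middle of the write.
 */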

/**
 * reset_psn - reset the QP state to send starting from PSN
 * @qp: the QP
 * @psn: the packet sequence number to restart at
 *
 * This is called from ipath_rc_rcv() to process an incoming RC ACK
 * for the given QP.
 * Called at interrupt level with the QP s_lock held.
 */
static void reset_psn(struct ipath_qp *qp, u32 psn)
{
	u32 n = qp->s_last;
	struct ipath_swqe *wqe = get_swqe_ptr(qp, n);
	u32 opcode;

	qp->s_cur = n;

	/*
	 * If we are starting the request from the beginning,
	 * let the normal send code handle initialization.
	 */
	if (ipath_cmp24(psn, wqe->psn) <= 0) {
		qp->s_state = OP(SEND_LAST);
		goto done;
	}

	/* Find the work request opcode corresponding to the given PSN. */
	opcode = wqe->wr.opcode;
	for (;;) {
		int diff;

		if (++n == qp->s_size)
			n = 0;
		if (n == qp->s_tail)
			break;
		wqe = get_swqe_ptr(qp, n);
		diff = ipath_cmp24(psn, wqe->psn);
		if (diff < 0)
			break;
		qp->s_cur = n;
		/*
		 * If we are starting the request from the beginning,
		 * let the normal send code handle initialization.
		 */
		if (diff == 0) {
			qp->s_state = OP(SEND_LAST);
			goto done;
		}
		opcode = wqe->wr.opcode;
	}

	/*
	 * Set the state to restart in the middle of a request.
	 * Don't change the s_sge, s_cur_sge, or s_cur_size.
	 * See ipath_do_rc_send().
	 */
	switch (opcode) {
	case IB_WR_SEND:
	case IB_WR_SEND_WITH_IMM:
		qp->s_state = OP(RDMA_READ_RESPONSE_FIRST);
		break;

	case IB_WR_RDMA_WRITE:
	case IB_WR_RDMA_WRITE_WITH_IMM:
		qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
		break;

	case IB_WR_RDMA_READ:
		qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
		break;

	default:
		/*
		 * This case shouldn't happen since there is only
		 * one PSN per request.
		 */
		qp->s_state = OP(SEND_LAST);
	}
done:
	qp->s_psn = psn;
}

/**
 * ipath_restart_rc - back up requester to resend the last un-ACKed request
 * @qp: the QP to restart
 * @psn: packet sequence number for the request
 * @wc: the work completion request
 *
 * The QP s_lock should be held and interrupts disabled.
 */
void ipath_restart_rc(struct ipath_qp *qp, u32 psn, struct ib_wc *wc)
{
	struct ipath_swqe *wqe = get_swqe_ptr(qp, qp->s_last);
	struct ipath_ibdev *dev;

	if (qp->s_retry == 0) {
		wc->wr_id = wqe->wr.wr_id;
		wc->status = IB_WC_RETRY_EXC_ERR;
		wc->opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
		wc->vendor_err = 0;
		wc->byte_len = 0;
		wc->qp = &qp->ibqp;
		wc->src_qp = qp->remote_qpn;
		wc->pkey_index = 0;
		wc->slid = qp->remote_ah_attr.dlid;
		wc->sl = qp->remote_ah_attr.sl;
		wc->dlid_path_bits = 0;
		wc->port_num = 0;
		ipath_sqerror_qp(qp, wc);
		goto bail;
	}
	qp->s_retry--;

	/*
	 * Remove the QP from the timeout queue.
	 * Note: it may already have been removed by ipath_ib_timer().
	 */
	dev = to_idev(qp->ibqp.device);
	spin_lock(&dev->pending_lock);
	if (!list_empty(&qp->timerwait))
		list_del_init(&qp->timerwait);
	spin_unlock(&dev->pending_lock);

	if (wqe->wr.opcode == IB_WR_RDMA_READ)
		dev->n_rc_resends++;
	else
		dev->n_rc_resends += (int)qp->s_psn - (int)psn;

	reset_psn(qp, psn);
	tasklet_hi_schedule(&qp->s_task);

bail:
	return;
}

static inline void update_last_psn(struct ipath_qp *qp, u32 psn)
{
	if (qp->s_wait_credit) {
		qp->s_wait_credit = 0;
		tasklet_hi_schedule(&qp->s_task);
	}
	qp->s_last_psn = psn;
}
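
/*
 * Editor's note: besides recording the last ACKed PSN, update_last_psn()
 * above restarts a sender that stalled on the IPATH_PSN_CREDIT window
 * check in ipath_make_rc_req() by clearing s_wait_credit and kicking the
 * send tasklet.
 */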

/**
 * do_rc_ack - process an incoming RC ACK
 * @qp: the QP the ACK came in on
 * @psn: the packet sequence number of the ACK
 * @opcode: the opcode of the request that resulted in the ACK
 *
 * This is called from ipath_rc_rcv_resp() to process an incoming RC ACK
 * for the given QP.
 * Called at interrupt level with the QP s_lock held and interrupts disabled.
 * Returns 1 if OK, 0 if current operation should be aborted (NAK).
 */
static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode)
{
	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
	struct ib_wc wc;
	struct ipath_swqe *wqe;
	int ret = 0;
	u32 ack_psn;

	/*
	 * Remove the QP from the timeout queue (or RNR timeout queue).
	 * If ipath_ib_timer() has already removed it,
	 * it's OK since we hold the QP s_lock and ipath_restart_rc()
	 * just won't find anything to restart if we ACK everything.
	 */
	spin_lock(&dev->pending_lock);
	if (!list_empty(&qp->timerwait))
		list_del_init(&qp->timerwait);
	spin_unlock(&dev->pending_lock);

	/*
	 * Note that NAKs implicitly ACK outstanding SEND and RDMA write
	 * requests and implicitly NAK RDMA read and atomic requests issued
	 * before the NAK'ed request.  The MSN won't include the NAK'ed
	 * request but will include the ACK'ed request(s).
	 */
	ack_psn = psn;
	if (aeth >> 29)
		ack_psn--;
	wqe = get_swqe_ptr(qp, qp->s_last);

	/*
	 * The MSN might be for a later WQE than the PSN indicates so
	 * only complete WQEs that the PSN finishes.
	 */
	while (ipath_cmp24(ack_psn, wqe->lpsn) >= 0) {
		/*
		 * If this request is an RDMA read or atomic, and the ACK is
		 * for a later operation, this ACK NAKs the RDMA read or
		 * atomic.  In other words, only a RDMA_READ_LAST or ONLY
		 * can ACK an RDMA read and likewise for atomic ops.  Note
		 * that the NAK case can only happen if relaxed ordering is
		 * used and requests are sent after an RDMA read or atomic
		 * is sent but before the response is received.
		 */
		if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
		     (opcode != OP(RDMA_READ_RESPONSE_LAST) ||
		      ipath_cmp24(ack_psn, wqe->lpsn) != 0)) ||
		    ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
		      wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
		     (opcode != OP(ATOMIC_ACKNOWLEDGE) ||
		      ipath_cmp24(wqe->psn, psn) != 0))) {
			/*
			 * The last valid PSN seen is the previous
			 * request's.
			 */
			update_last_psn(qp, wqe->psn - 1);
			/* Retry this request. */
			ipath_restart_rc(qp, wqe->psn, &wc);
			/*
			 * No need to process the ACK/NAK since we are
			 * restarting an earlier request.
			 */
			goto bail;
		}
		if (qp->s_num_rd_atomic &&
		    (wqe->wr.opcode == IB_WR_RDMA_READ ||
		     wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
		     wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) {
			qp->s_num_rd_atomic--;
			/* Restart sending task if fence is complete */
			if ((qp->s_flags & IPATH_S_FENCE_PENDING) &&
			    !qp->s_num_rd_atomic) {
				qp->s_flags &= ~IPATH_S_FENCE_PENDING;
				tasklet_hi_schedule(&qp->s_task);
			} else if (qp->s_flags & IPATH_S_RDMAR_PENDING) {
				qp->s_flags &= ~IPATH_S_RDMAR_PENDING;
				tasklet_hi_schedule(&qp->s_task);
			}
		}
		/* Post a send completion queue entry if requested. */
		if (!(qp->s_flags & IPATH_S_SIGNAL_REQ_WR) ||
		    (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
			wc.wr_id = wqe->wr.wr_id;
			wc.status = IB_WC_SUCCESS;
			wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
			wc.vendor_err = 0;
			wc.byte_len = wqe->length;
			wc.qp = &qp->ibqp;
			wc.src_qp = qp->remote_qpn;
			wc.pkey_index = 0;
			wc.slid = qp->remote_ah_attr.dlid;
			wc.sl = qp->remote_ah_attr.sl;
			wc.dlid_path_bits = 0;
			wc.port_num = 0;
			ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0);
		}
		qp->s_retry = qp->s_retry_cnt;
		/*
		 * If we are completing a request which is in the process of
		 * being resent, we can stop resending it since we know the
		 * responder has already seen it.
		 */
		if (qp->s_last == qp->s_cur) {
			if (++qp->s_cur >= qp->s_size)
				qp->s_cur = 0;
			wqe = get_swqe_ptr(qp, qp->s_cur);
			qp->s_state = OP(SEND_LAST);
			qp->s_psn = wqe->psn;
		}
		if (++qp->s_last >= qp->s_size)
			qp->s_last = 0;
		wqe = get_swqe_ptr(qp, qp->s_last);
		if (qp->s_last == qp->s_tail)
			break;
	}
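
	/*
	 * Editor's note: the AETH is 8 bits of syndrome plus a 24-bit MSN.
	 * The top 3 syndrome bits (aeth >> 29) select ACK (0), RNR NAK (1),
	 * or NAK (3); the remaining 5 syndrome bits, extracted with
	 * IPATH_AETH_CREDIT_SHIFT/MASK below, carry the credit count, RNR
	 * timer, or NAK code respectively.
	 */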
	switch (aeth >> 29) {
	case 0:		/* ACK */
		dev->n_rc_acks++;
		/* If this is a partial ACK, reset the retransmit timer. */
		if (qp->s_last != qp->s_tail) {
			spin_lock(&dev->pending_lock);
			list_add_tail(&qp->timerwait,
				      &dev->pending[dev->pending_index]);
			spin_unlock(&dev->pending_lock);
		}
		ipath_get_credit(qp, aeth);
		qp->s_rnr_retry = qp->s_rnr_retry_cnt;
		qp->s_retry = qp->s_retry_cnt;
		update_last_psn(qp, psn);
		ret = 1;
		goto bail;

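	/*
	 * Editor's note: an RNR retry count of 7 means "retry forever"
	 * (an IBA convention), which is why the decrement below is
	 * skipped when s_rnr_retry_cnt == 7.
	 */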
	case 1:		/* RNR NAK */
		dev->n_rnr_naks++;
		if (qp->s_rnr_retry == 0) {
			if (qp->s_last == qp->s_tail)
				goto bail;

			wc.status = IB_WC_RNR_RETRY_EXC_ERR;
			goto class_b;
		}
		if (qp->s_rnr_retry_cnt < 7)
			qp->s_rnr_retry--;
		if (qp->s_last == qp->s_tail)
			goto bail;

		/* The last valid PSN is the previous PSN. */
		update_last_psn(qp, psn - 1);

		dev->n_rc_resends += (int)qp->s_psn - (int)psn;

		reset_psn(qp, psn);

		qp->s_rnr_timeout =
			ib_ipath_rnr_table[(aeth >> IPATH_AETH_CREDIT_SHIFT) &
					   IPATH_AETH_CREDIT_MASK];
		ipath_insert_rnr_queue(qp);
		goto bail;

	case 3:		/* NAK */
		/* The last valid PSN seen is the previous request's. */
		if (qp->s_last != qp->s_tail)
			update_last_psn(qp, wqe->psn - 1);
		switch ((aeth >> IPATH_AETH_CREDIT_SHIFT) &
			IPATH_AETH_CREDIT_MASK) {
		case 0:	/* PSN sequence error */
			dev->n_seq_naks++;
			/*
			 * Back up to the responder's expected PSN.  XXX
			 * Note that we might get a NAK in the middle of an
			 * RDMA READ response which terminates the RDMA
			 * READ.
			 */
			if (qp->s_last == qp->s_tail)
				break;

			if (ipath_cmp24(psn, wqe->psn) < 0)
				break;

			/* Retry the request. */
			ipath_restart_rc(qp, psn, &wc);
			break;

		case 1:	/* Invalid Request */
			wc.status = IB_WC_REM_INV_REQ_ERR;
			dev->n_other_naks++;
			goto class_b;

		case 2:	/* Remote Access Error */
			wc.status = IB_WC_REM_ACCESS_ERR;
			dev->n_other_naks++;
			goto class_b;

		case 3:	/* Remote Operation Error */
			wc.status = IB_WC_REM_OP_ERR;
			dev->n_other_naks++;
		class_b:
			wc.wr_id = wqe->wr.wr_id;
			wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
			wc.vendor_err = 0;
			wc.byte_len = 0;
			wc.qp = &qp->ibqp;
			wc.src_qp = qp->remote_qpn;
			wc.pkey_index = 0;
			wc.slid = qp->remote_ah_attr.dlid;
			wc.sl = qp->remote_ah_attr.sl;
			wc.dlid_path_bits = 0;
			wc.port_num = 0;
			ipath_sqerror_qp(qp, &wc);
			break;

		default:
			/* Ignore other reserved NAK error codes */
			goto reserved;
		}
		qp->s_rnr_retry = qp->s_rnr_retry_cnt;
		goto bail;

	default:		/* 2: reserved */
	reserved:
		/* Ignore reserved NAK codes. */
		goto bail;
	}

bail:
	return ret;
}

/**
 * ipath_rc_rcv_resp - process an incoming RC response packet
 * @dev: the device this packet came in on
 * @ohdr: the other headers for this packet
 * @data: the packet data
 * @tlen: the packet length
 * @qp: the QP for this packet
 * @opcode: the opcode for this packet
 * @psn: the packet sequence number for this packet
 * @hdrsize: the header length
 * @pmtu: the path MTU
 * @header_in_data: true if part of the header data is in the data buffer
 *
 * This is called from ipath_rc_rcv() to process an incoming RC response
 * packet for the given QP.
 * Called at interrupt level.
 */
static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev,
				     struct ipath_other_headers *ohdr,
				     void *data, u32 tlen,
				     struct ipath_qp *qp,
				     u32 opcode,
				     u32 psn, u32 hdrsize, u32 pmtu,
				     int header_in_data)
{
	struct ipath_swqe *wqe;
	unsigned long flags;
	struct ib_wc wc;
	int diff;
	u32 pad;
	u32 aeth;

	spin_lock_irqsave(&qp->s_lock, flags);

	/* Ignore invalid responses. */
	if (ipath_cmp24(psn, qp->s_next_psn) >= 0)
		goto ack_done;

	/* Ignore duplicate responses. */
	diff = ipath_cmp24(psn, qp->s_last_psn);
	if (unlikely(diff <= 0)) {
		/* Update credits for "ghost" ACKs */
		if (diff == 0 && opcode == OP(ACKNOWLEDGE)) {
			if (!header_in_data)
				aeth = be32_to_cpu(ohdr->u.aeth);
			else {
				aeth = be32_to_cpu(((__be32 *) data)[0]);
				data += sizeof(__be32);
			}
			if ((aeth >> 29) == 0)
				ipath_get_credit(qp, aeth);
		}
		goto ack_done;
	}
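
	/*
	 * Editor's note: a "ghost" ACK above is a duplicate ACK for the
	 * last PSN already acknowledged; its payload is stale but the
	 * AETH may still carry fresh flow-control credits, so those are
	 * harvested before the packet is dropped.
	 */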

	if (unlikely(qp->s_last == qp->s_tail))
		goto ack_done;
	wqe = get_swqe_ptr(qp, qp->s_last);

	switch (opcode) {
	case OP(ACKNOWLEDGE):
	case OP(ATOMIC_ACKNOWLEDGE):
	case OP(RDMA_READ_RESPONSE_FIRST):
		if (!header_in_data)
			aeth = be32_to_cpu(ohdr->u.aeth);
		else {
			aeth = be32_to_cpu(((__be32 *) data)[0]);
			data += sizeof(__be32);
		}
		if (opcode == OP(ATOMIC_ACKNOWLEDGE)) {
			u64 val;

			if (!header_in_data) {
				__be32 *p = ohdr->u.at.atomic_ack_eth;

				val = ((u64) be32_to_cpu(p[0]) << 32) |
					be32_to_cpu(p[1]);
			} else
				val = be64_to_cpu(((__be64 *) data)[0]);
			*(u64 *) wqe->sg_list[0].vaddr = val;
		}
		if (!do_rc_ack(qp, aeth, psn, opcode) ||
		    opcode != OP(RDMA_READ_RESPONSE_FIRST))
			goto ack_done;
		hdrsize += 4;
		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
			goto ack_done;
		/*
		 * If this is a response to a resent RDMA read, we
		 * have to be careful to copy the data to the right
		 * location.
		 */
		qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
						  wqe, psn, pmtu);
		goto read_middle;

	case OP(RDMA_READ_RESPONSE_MIDDLE):
		/* no AETH, no ACK */
		if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) {
			dev->n_rdma_seq++;
			ipath_restart_rc(qp, qp->s_last_psn + 1, &wc);
			goto ack_done;
		}
		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
			goto ack_done;
	read_middle:
		if (unlikely(tlen != (hdrsize + pmtu + 4)))
			goto ack_done;
		if (unlikely(pmtu >= qp->s_rdma_read_len))
			goto ack_done;

		/* We got a response so update the timeout. */
		spin_lock(&dev->pending_lock);
		if (qp->s_rnr_timeout == 0 && !list_empty(&qp->timerwait))
			list_move_tail(&qp->timerwait,
				       &dev->pending[dev->pending_index]);
		spin_unlock(&dev->pending_lock);
		/*
		 * Update the RDMA receive state but do the copy w/o
		 * holding the locks and blocking interrupts.
		 */
		qp->s_rdma_read_len -= pmtu;
		update_last_psn(qp, psn);
		spin_unlock_irqrestore(&qp->s_lock, flags);
		ipath_copy_sge(&qp->s_rdma_read_sge, data, pmtu);
		goto bail;

	case OP(RDMA_READ_RESPONSE_ONLY):
		if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) {
			dev->n_rdma_seq++;
			ipath_restart_rc(qp, qp->s_last_psn + 1, &wc);
			goto ack_done;
		}
		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
			goto ack_done;
		/*
		 * If this is a response to a resent RDMA read, we
		 * have to be careful to copy the data to the right
		 * location.
		 * XXX should check PSN and wqe opcode first.
		 */
		qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
						  wqe, psn, pmtu);
		goto read_last;

	case OP(RDMA_READ_RESPONSE_LAST):
		/* ACKs READ req. */
		if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) {
			dev->n_rdma_seq++;
			ipath_restart_rc(qp, qp->s_last_psn + 1, &wc);
			goto ack_done;
		}
		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
			goto ack_done;
	read_last:
		/*
		 * Get the number of bytes the message was padded by.
		 */
		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
		/*
		 * Check that the data size is >= 1 && <= pmtu.
		 * Remember to account for the AETH header (4) and
		 * ICRC (4).
		 */
		if (unlikely(tlen <= (hdrsize + pad + 8))) {
			/* XXX Need to generate an error CQ entry. */
			goto ack_done;
		}
		tlen -= hdrsize + pad + 8;
		if (unlikely(tlen != qp->s_rdma_read_len)) {
			/* XXX Need to generate an error CQ entry. */
			goto ack_done;
		}
		if (!header_in_data)
			aeth = be32_to_cpu(ohdr->u.aeth);
		else {
			aeth = be32_to_cpu(((__be32 *) data)[0]);
			data += sizeof(__be32);
		}
		ipath_copy_sge(&qp->s_rdma_read_sge, data, tlen);
		(void) do_rc_ack(qp, aeth, psn, OP(RDMA_READ_RESPONSE_LAST));
		goto ack_done;
	}

ack_done:
	spin_unlock_irqrestore(&qp->s_lock, flags);
bail:
	return;
}

/**
 * ipath_rc_rcv_error - process an incoming duplicate or error RC packet
 * @dev: the device this packet came in on
 * @ohdr: the other headers for this packet
 * @data: the packet data
 * @qp: the QP for this packet
 * @opcode: the opcode for this packet
 * @psn: the packet sequence number for this packet
 * @diff: the difference between the PSN and the expected PSN
 * @header_in_data: true if part of the header data is in the data buffer
 *
 * This is called from ipath_rc_rcv() to process an unexpected
 * incoming RC packet for the given QP.
 * Called at interrupt level.
 * Return 1 if no more processing is needed; otherwise return 0 to
 * schedule a response to be sent.
 */
static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev,
				     struct ipath_other_headers *ohdr,
				     void *data,
				     struct ipath_qp *qp,
				     u32 opcode,
				     u32 psn,
				     int diff,
				     int header_in_data)
{
	struct ipath_ack_entry *e;
	u8 i, prev;
	int old_req;

	if (diff > 0) {
		/*
		 * Packet sequence error.
		 * A NAK will ACK earlier sends and RDMA writes.
		 * Don't queue the NAK if we already sent one.
		 */
		if (!qp->r_nak_state) {
			qp->r_nak_state = IB_NAK_PSN_ERROR;
			/* Use the expected PSN. */
			qp->r_ack_psn = qp->r_psn;
			goto send_ack;
		}
		goto done;
	}
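
	/*
	 * Editor's note: s_ack_queue is a small ring of
	 * IPATH_MAX_RDMA_ATOMIC + 1 entries indexed by r_head_ack_queue
	 * (producer, receive side) and s_tail_ack_queue (consumer, send
	 * tasklet); head == tail means no responses are pending.  The
	 * loop below scans it backwards from the head for the duplicate
	 * PSN.
	 */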

	/*
	 * Handle a duplicate request.  Don't re-execute SEND, RDMA
	 * write or atomic op.  Don't NAK errors, just silently drop
	 * the duplicate request.  Note that r_sge, r_len, and
	 * r_rcv_len may be in use so don't modify them.
	 *
	 * We are supposed to ACK the earliest duplicate PSN but we
	 * can coalesce an outstanding duplicate ACK.  We have to
	 * send the earliest so that RDMA reads can be restarted at
	 * the requester's expected PSN.
	 *
	 * First, find where this duplicate PSN falls within the
	 * ACKs previously sent.
	 */
	psn &= IPATH_PSN_MASK;
	e = NULL;
	old_req = 1;
	spin_lock_irq(&qp->s_lock);
	for (i = qp->r_head_ack_queue; ; i = prev) {
		if (i == qp->s_tail_ack_queue)
			old_req = 0;
		if (i)
			prev = i - 1;
		else
			prev = IPATH_MAX_RDMA_ATOMIC;
		if (prev == qp->r_head_ack_queue) {
			e = NULL;
			break;
		}
		e = &qp->s_ack_queue[prev];
		if (!e->opcode) {
			e = NULL;
			break;
		}
		if (ipath_cmp24(psn, e->psn) >= 0)
			break;
	}
	switch (opcode) {
	case OP(RDMA_READ_REQUEST): {
		struct ib_reth *reth;
		u32 offset;
		u32 len;

		/*
		 * If we didn't find the RDMA read request in the ack queue,
		 * or the send tasklet is already backed up to send an
		 * earlier entry, we can ignore this request.
		 */
		if (!e || e->opcode != OP(RDMA_READ_REQUEST) || old_req)
			goto unlock_done;
		/* RETH comes after BTH */
		if (!header_in_data)
			reth = &ohdr->u.rc.reth;
		else {
			reth = (struct ib_reth *)data;
			data += sizeof(*reth);
		}
		/*
		 * Address range must be a subset of the original
		 * request and start on pmtu boundaries.
		 * We reuse the old ack_queue slot since the requester
		 * should not back up and request an earlier PSN for the
		 * same request.
		 */
		offset = ((psn - e->psn) & IPATH_PSN_MASK) *
			ib_mtu_enum_to_int(qp->path_mtu);
		len = be32_to_cpu(reth->length);
		if (unlikely(offset + len > e->rdma_sge.sge.sge_length))
			goto unlock_done;
		if (len != 0) {
			u32 rkey = be32_to_cpu(reth->rkey);
			u64 vaddr = be64_to_cpu(reth->vaddr);
			int ok;

			ok = ipath_rkey_ok(qp, &e->rdma_sge,
					   len, vaddr, rkey,
					   IB_ACCESS_REMOTE_READ);
			if (unlikely(!ok))
				goto unlock_done;
		} else {
			e->rdma_sge.sg_list = NULL;
			e->rdma_sge.num_sge = 0;
			e->rdma_sge.sge.mr = NULL;
			e->rdma_sge.sge.vaddr = NULL;
			e->rdma_sge.sge.length = 0;
			e->rdma_sge.sge.sge_length = 0;
		}
		e->psn = psn;
		qp->s_ack_state = OP(ACKNOWLEDGE);
		qp->s_tail_ack_queue = prev;
		break;
	}

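	/*
	 * Editor's note (worked example for the offset math above): each
	 * PSN of a read response covers one pmtu of data, so a duplicate
	 * read request whose PSN is 3 past the original request's PSN
	 * restarts the read 3 * pmtu bytes into the originally validated
	 * range.
	 */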
	case OP(COMPARE_SWAP):
	case OP(FETCH_ADD): {
		/*
		 * If we didn't find the atomic request in the ack queue
		 * or the send tasklet is already backed up to send an
		 * earlier entry, we can ignore this request.
		 */
		if (!e || e->opcode != (u8) opcode || old_req)
			goto unlock_done;
		qp->s_ack_state = OP(ACKNOWLEDGE);
		qp->s_tail_ack_queue = prev;
		break;
	}

	default:
		if (old_req)
			goto unlock_done;
		/*
		 * Resend the most recent ACK if this request is
		 * after all the previous RDMA reads and atomics.
		 */
		if (i == qp->r_head_ack_queue) {
			spin_unlock_irq(&qp->s_lock);
			qp->r_nak_state = 0;
			qp->r_ack_psn = qp->r_psn - 1;
			goto send_ack;
		}
		/*
		 * Resend the RDMA read or atomic op which
		 * ACKs this duplicate request.
		 */
		qp->s_ack_state = OP(ACKNOWLEDGE);
		qp->s_tail_ack_queue = i;
		break;
	}
	qp->r_nak_state = 0;
	spin_unlock_irq(&qp->s_lock);
	tasklet_hi_schedule(&qp->s_task);

unlock_done:
	spin_unlock_irq(&qp->s_lock);
done:
	return 1;

send_ack:
	return 0;
}

static void ipath_rc_error(struct ipath_qp *qp, enum ib_wc_status err)
{
	spin_lock_irq(&qp->s_lock);
	qp->state = IB_QPS_ERR;
	ipath_error_qp(qp, err);
	spin_unlock_irq(&qp->s_lock);
}

/**
 * ipath_rc_rcv - process an incoming RC packet
 * @dev: the device this packet came in on
 * @hdr: the header of this packet
 * @has_grh: true if the header has a GRH
 * @data: the packet data
 * @tlen: the packet length
 * @qp: the QP for this packet
 *
 * This is called from ipath_qp_rcv() to process an incoming RC packet
 * for the given QP.
 * Called at interrupt level.
 */
void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
		  int has_grh, void *data, u32 tlen, struct ipath_qp *qp)
{
	struct ipath_other_headers *ohdr;
	u32 opcode;
	u32 hdrsize;
	u32 psn;
	u32 pad;
	struct ib_wc wc;
	u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
	int diff;
	struct ib_reth *reth;
	int header_in_data;

	/* Validate the SLID. See Ch. 9.6.1.5 */
	if (unlikely(be16_to_cpu(hdr->lrh[3]) != qp->remote_ah_attr.dlid))
		goto done;

	/* Check for GRH */
	if (!has_grh) {
		ohdr = &hdr->u.oth;
		hdrsize = 8 + 12;	/* LRH + BTH */
		psn = be32_to_cpu(ohdr->bth[2]);
		header_in_data = 0;
	} else {
		ohdr = &hdr->u.l.oth;
		hdrsize = 8 + 40 + 12;	/* LRH + GRH + BTH */
		/*
		 * The header with GRH is 60 bytes and the core driver sets
		 * the eager header buffer size to 56 bytes so the last 4
		 * bytes of the BTH header (PSN) are in the data buffer.
		 */
		header_in_data = dev->dd->ipath_rcvhdrentsize == 16;
		if (header_in_data) {
			psn = be32_to_cpu(((__be32 *) data)[0]);
			data += sizeof(__be32);
		} else
			psn = be32_to_cpu(ohdr->bth[2]);
	}

	/*
	 * Process responses (ACKs) before anything else.  Note that the
	 * packet sequence number will be for something in the send work
	 * queue rather than the expected receive packet sequence number.
	 * In other words, this QP is the requester.
	 */
	opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
	if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
	    opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
		ipath_rc_rcv_resp(dev, ohdr, data, tlen, qp, opcode, psn,
				  hdrsize, pmtu, header_in_data);
		goto done;
	}
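
	/*
	 * Editor's note: the range check above relies on the IBA RC opcode
	 * numbering, in which the responder-to-requester opcodes
	 * (RDMA_READ_RESPONSE_FIRST through ATOMIC_ACKNOWLEDGE) form one
	 * contiguous block.
	 */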

	/* Compute 24 bits worth of difference. */
	diff = ipath_cmp24(psn, qp->r_psn);
	if (unlikely(diff)) {
		if (ipath_rc_rcv_error(dev, ohdr, data, qp, opcode,
				       psn, diff, header_in_data))
			goto done;
		goto send_ack;
	}

	/* Check for opcode sequence errors. */
	switch (qp->r_state) {
	case OP(SEND_FIRST):
	case OP(SEND_MIDDLE):
		if (opcode == OP(SEND_MIDDLE) ||
		    opcode == OP(SEND_LAST) ||
		    opcode == OP(SEND_LAST_WITH_IMMEDIATE))
			break;
	nack_inv:
		ipath_rc_error(qp, IB_WC_REM_INV_REQ_ERR);
		qp->r_nak_state = IB_NAK_INVALID_REQUEST;
		qp->r_ack_psn = qp->r_psn;
		goto send_ack;

	case OP(RDMA_WRITE_FIRST):
	case OP(RDMA_WRITE_MIDDLE):
		if (opcode == OP(RDMA_WRITE_MIDDLE) ||
		    opcode == OP(RDMA_WRITE_LAST) ||
		    opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
			break;
		goto nack_inv;

	default:
		if (opcode == OP(SEND_MIDDLE) ||
		    opcode == OP(SEND_LAST) ||
		    opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
		    opcode == OP(RDMA_WRITE_MIDDLE) ||
		    opcode == OP(RDMA_WRITE_LAST) ||
		    opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
			goto nack_inv;
		/*
		 * Note that it is up to the requester to not send a new
		 * RDMA read or atomic operation before receiving an ACK
		 * for the previous operation.
		 */
		break;
	}

	wc.imm_data = 0;
	wc.wc_flags = 0;

	/* OK, process the packet. */
	switch (opcode) {
	case OP(SEND_FIRST):
		if (!ipath_get_rwqe(qp, 0)) {
		rnr_nak:
			/*
			 * An RNR NAK will ACK earlier sends and RDMA writes.
			 * Don't queue the NAK if an RDMA read or atomic
			 * is pending though.
			 */
			if (qp->r_nak_state)
				goto done;
			qp->r_nak_state = IB_RNR_NAK | qp->r_min_rnr_timer;
			qp->r_ack_psn = qp->r_psn;
			goto send_ack;
		}
		qp->r_rcv_len = 0;
		/* FALLTHROUGH */
	case OP(SEND_MIDDLE):
	case OP(RDMA_WRITE_MIDDLE):
	send_middle:
		/* Check for invalid length (PMTU) or overrunning the posted rwqe len. */
		if (unlikely(tlen != (hdrsize + pmtu + 4)))
			goto nack_inv;
		qp->r_rcv_len += pmtu;
		if (unlikely(qp->r_rcv_len > qp->r_len))
			goto nack_inv;
		ipath_copy_sge(&qp->r_sge, data, pmtu);
		break;

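	/*
	 * Editor's note (for the length math in "send_last" below): the
	 * BTH PadCnt field, (bth[0] >> 20) & 3, gives the bytes of pad
	 * added to the payload, and the trailing 4 bytes are the ICRC.
	 * E.g. a 13-byte last payload is padded to 16, so
	 * tlen == hdrsize + 16 + 4 and pad == 3.
	 */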
	case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
		/* consume RWQE */
		if (!ipath_get_rwqe(qp, 1))
			goto rnr_nak;
		goto send_last_imm;

	case OP(SEND_ONLY):
	case OP(SEND_ONLY_WITH_IMMEDIATE):
		if (!ipath_get_rwqe(qp, 0))
			goto rnr_nak;
		qp->r_rcv_len = 0;
		if (opcode == OP(SEND_ONLY))
			goto send_last;
		/* FALLTHROUGH */
	case OP(SEND_LAST_WITH_IMMEDIATE):
	send_last_imm:
		if (header_in_data) {
			wc.imm_data = *(__be32 *) data;
			data += sizeof(__be32);
		} else {
			/* Immediate data comes after BTH */
			wc.imm_data = ohdr->u.imm_data;
		}
		hdrsize += 4;
		wc.wc_flags = IB_WC_WITH_IMM;
		/* FALLTHROUGH */
	case OP(SEND_LAST):
	case OP(RDMA_WRITE_LAST):
	send_last:
		/* Get the number of bytes the message was padded by. */
		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
		/* Check for invalid length. */
		/* XXX LAST len should be >= 1 */
		if (unlikely(tlen < (hdrsize + pad + 4)))
			goto nack_inv;
		/* Don't count the CRC. */
		tlen -= (hdrsize + pad + 4);
		wc.byte_len = tlen + qp->r_rcv_len;
		if (unlikely(wc.byte_len > qp->r_len))
			goto nack_inv;
		ipath_copy_sge(&qp->r_sge, data, tlen);
		qp->r_msn++;
		if (!qp->r_wrid_valid)
			break;
		qp->r_wrid_valid = 0;
		wc.wr_id = qp->r_wr_id;
		wc.status = IB_WC_SUCCESS;
		wc.opcode = IB_WC_RECV;
		wc.vendor_err = 0;
		wc.qp = &qp->ibqp;
		wc.src_qp = qp->remote_qpn;
		wc.pkey_index = 0;
		wc.slid = qp->remote_ah_attr.dlid;
		wc.sl = qp->remote_ah_attr.sl;
		wc.dlid_path_bits = 0;
		wc.port_num = 0;
		/* Signal completion event if the solicited bit is set. */
		ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
			       (ohdr->bth[0] &
				__constant_cpu_to_be32(1 << 23)) != 0);
		break;

	case OP(RDMA_WRITE_FIRST):
	case OP(RDMA_WRITE_ONLY):
	case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
		/* consume RWQE */
		/* RETH comes after BTH */
		if (!header_in_data)
			reth = &ohdr->u.rc.reth;
		else {
			reth = (struct ib_reth *)data;
			data += sizeof(*reth);
		}
		hdrsize += sizeof(*reth);
		qp->r_len = be32_to_cpu(reth->length);
		qp->r_rcv_len = 0;
		if (qp->r_len != 0) {
			u32 rkey = be32_to_cpu(reth->rkey);
			u64 vaddr = be64_to_cpu(reth->vaddr);
			int ok;

			/* Check rkey & NAK */
			ok = ipath_rkey_ok(qp, &qp->r_sge,
					   qp->r_len, vaddr, rkey,
					   IB_ACCESS_REMOTE_WRITE);
			if (unlikely(!ok))
				goto nack_acc;
		} else {
			qp->r_sge.sg_list = NULL;
			qp->r_sge.sge.mr = NULL;
			qp->r_sge.sge.vaddr = NULL;
			qp->r_sge.sge.length = 0;
			qp->r_sge.sge.sge_length = 0;
		}
		if (unlikely(!(qp->qp_access_flags &
			       IB_ACCESS_REMOTE_WRITE)))
			goto nack_acc;
		if (opcode == OP(RDMA_WRITE_FIRST))
			goto send_middle;
		else if (opcode == OP(RDMA_WRITE_ONLY))
			goto send_last;
		if (!ipath_get_rwqe(qp, 1))
			goto rnr_nak;
		goto send_last_imm;

	case OP(RDMA_READ_REQUEST): {
		struct ipath_ack_entry *e;
		u32 len;
		u8 next;

		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
			goto nack_acc;
		next = qp->r_head_ack_queue + 1;
		if (next > IPATH_MAX_RDMA_ATOMIC)
			next = 0;
		if (unlikely(next == qp->s_tail_ack_queue))
			goto nack_inv;
		e = &qp->s_ack_queue[qp->r_head_ack_queue];
		/* RETH comes after BTH */
		if (!header_in_data)
			reth = &ohdr->u.rc.reth;
		else {
			reth = (struct ib_reth *)data;
			data += sizeof(*reth);
		}
		len = be32_to_cpu(reth->length);
		if (len) {
			u32 rkey = be32_to_cpu(reth->rkey);
			u64 vaddr = be64_to_cpu(reth->vaddr);
			int ok;

			/* Check rkey & NAK */
			ok = ipath_rkey_ok(qp, &e->rdma_sge, len, vaddr,
					   rkey, IB_ACCESS_REMOTE_READ);
			if (unlikely(!ok))
				goto nack_acc;
			/*
			 * Update the next expected PSN.  We add 1 later
			 * below, so only add the remainder here.
			 */
			if (len > pmtu)
				qp->r_psn += (len - 1) / pmtu;
		} else {
			e->rdma_sge.sg_list = NULL;
			e->rdma_sge.num_sge = 0;
			e->rdma_sge.sge.mr = NULL;
			e->rdma_sge.sge.vaddr = NULL;
			e->rdma_sge.sge.length = 0;
			e->rdma_sge.sge.sge_length = 0;
		}
		e->opcode = opcode;
		e->psn = psn;
		/*
		 * We need to increment the MSN here instead of when we
		 * finish sending the result since a duplicate request would
		 * increment it more than once.
		 */
		qp->r_msn++;
		qp->r_psn++;
		qp->r_state = opcode;
		qp->r_nak_state = 0;
		barrier();
		qp->r_head_ack_queue = next;

		/* Call ipath_do_rc_send() in another thread. */
		tasklet_hi_schedule(&qp->s_task);

		goto done;
	}

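	/*
	 * Editor's note on the atomic cases below: FETCH_ADD returns the
	 * pre-add value by subtracting sdata from atomic64_add_return(),
	 * and COMPARE_SWAP uses cmpxchg(), which returns the old value
	 * whether or not the swap happened.  The barrier() before
	 * publishing r_head_ack_queue is presumably there so the ack
	 * entry is fully written before the send tasklet can observe the
	 * new head.
	 */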
	case OP(COMPARE_SWAP):
	case OP(FETCH_ADD): {
		struct ib_atomic_eth *ateth;
		struct ipath_ack_entry *e;
		u64 vaddr;
		atomic64_t *maddr;
		u64 sdata;
		u32 rkey;
		u8 next;

		if (unlikely(!(qp->qp_access_flags &
			       IB_ACCESS_REMOTE_ATOMIC)))
			goto nack_acc;
		next = qp->r_head_ack_queue + 1;
		if (next > IPATH_MAX_RDMA_ATOMIC)
			next = 0;
		if (unlikely(next == qp->s_tail_ack_queue))
			goto nack_inv;
		if (!header_in_data)
			ateth = &ohdr->u.atomic_eth;
		else
			ateth = (struct ib_atomic_eth *)data;
		vaddr = ((u64) be32_to_cpu(ateth->vaddr[0]) << 32) |
			be32_to_cpu(ateth->vaddr[1]);
		if (unlikely(vaddr & (sizeof(u64) - 1)))
			goto nack_inv;
		rkey = be32_to_cpu(ateth->rkey);
		/* Check rkey & NAK */
		if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge,
					    sizeof(u64), vaddr, rkey,
					    IB_ACCESS_REMOTE_ATOMIC)))
			goto nack_acc;
		/* Perform atomic OP and save result. */
		maddr = (atomic64_t *) qp->r_sge.sge.vaddr;
		sdata = be64_to_cpu(ateth->swap_data);
		e = &qp->s_ack_queue[qp->r_head_ack_queue];
		e->atomic_data = (opcode == OP(FETCH_ADD)) ?
			(u64) atomic64_add_return(sdata, maddr) - sdata :
			(u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr,
				      be64_to_cpu(ateth->compare_data),
				      sdata);
		e->opcode = opcode;
		e->psn = psn & IPATH_PSN_MASK;
		qp->r_msn++;
		qp->r_psn++;
		qp->r_state = opcode;
		qp->r_nak_state = 0;
		barrier();
		qp->r_head_ack_queue = next;

		/* Call ipath_do_rc_send() in another thread. */
		tasklet_hi_schedule(&qp->s_task);

		goto done;
	}

	default:
		/* NAK unknown opcodes. */
		goto nack_inv;
	}
	qp->r_psn++;
	qp->r_state = opcode;
	qp->r_ack_psn = psn;
	qp->r_nak_state = 0;
	/* Send an ACK if requested or required. */
	if (psn & (1 << 31))
		goto send_ack;
	goto done;

nack_acc:
	ipath_rc_error(qp, IB_WC_REM_ACCESS_ERR);
	qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
	qp->r_ack_psn = qp->r_psn;

send_ack:
	send_rc_ack(qp);

done:
	return;
}