ipoib_cm.c revision 7143740d26098aca84ecc7376ccfe2c58fd0412e
1/*
2 * Copyright (c) 2006 Mellanox Technologies. All rights reserved
3 *
4 * This software is available to you under a choice of one of two
5 * licenses.  You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 *     Redistribution and use in source and binary forms, with or
11 *     without modification, are permitted provided that the following
12 *     conditions are met:
13 *
14 *      - Redistributions of source code must retain the above
15 *        copyright notice, this list of conditions and the following
16 *        disclaimer.
17 *
18 *      - Redistributions in binary form must reproduce the above
19 *        copyright notice, this list of conditions and the following
20 *        disclaimer in the documentation and/or other materials
21 *        provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 * $Id$
33 */
34
35#include <rdma/ib_cm.h>
36#include <rdma/ib_cache.h>
37#include <net/dst.h>
38#include <net/icmp.h>
39#include <linux/icmpv6.h>
40#include <linux/delay.h>
41
42#include "ipoib.h"
43
44int ipoib_max_conn_qp = 128;
45
46module_param_named(max_nonsrq_conn_qp, ipoib_max_conn_qp, int, 0444);
47MODULE_PARM_DESC(max_nonsrq_conn_qp,
48		 "Max number of connected-mode QPs per interface "
49		 "(applied only if shared receive queue is not available)");
50
51#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
52static int data_debug_level;
53
54module_param_named(cm_data_debug_level, data_debug_level, int, 0644);
55MODULE_PARM_DESC(cm_data_debug_level,
56		 "Enable data path debug tracing for connected mode if > 0");
57#endif
58
59#define IPOIB_CM_IETF_ID 0x1000000000000000ULL
60
61#define IPOIB_CM_RX_UPDATE_TIME (256 * HZ)
62#define IPOIB_CM_RX_TIMEOUT     (2 * 256 * HZ)
63#define IPOIB_CM_RX_DELAY       (3 * 256 * HZ)
64#define IPOIB_CM_RX_UPDATE_MASK (0x3)
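/*
 * These constants drive passive-side (RX) connection aging: each
 * ipoib_cm_rx is timestamped in p->jiffies, the timestamp is refreshed at
 * most once per IPOIB_CM_RX_UPDATE_TIME (and only for completions whose
 * wr_id has the IPOIB_CM_RX_UPDATE_MASK bits clear, to bound the refresh
 * cost), and the stale task, requeued every IPOIB_CM_RX_DELAY, moves
 * connections that have been idle longer than IPOIB_CM_RX_TIMEOUT into
 * the error state.
 */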
65
66static struct ib_qp_attr ipoib_cm_err_attr = {
67	.qp_state = IB_QPS_ERR
68};
69
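/*
 * To detect when the receives posted to dying RX QPs have been flushed,
 * ipoib_cm_start_rx_drain() posts a dummy send with this well-known wr_id
 * to one QP on the flush list (all of them are already in the error state
 * and share priv->cq); the resulting flush completion is recognized in
 * ipoib_cm_handle_rx_wc() and the drained QPs are then moved to the reap
 * list.
 */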
70#define IPOIB_CM_RX_DRAIN_WRID 0xffffffff
71
72static struct ib_send_wr ipoib_cm_rx_drain_wr = {
73	.wr_id = IPOIB_CM_RX_DRAIN_WRID,
74	.opcode = IB_WR_SEND,
75};
76
77static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
78			       struct ib_cm_event *event);
79
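/*
 * A connected-mode receive buffer is a linear head of IPOIB_CM_HEAD_SIZE
 * bytes (mapping[0]) followed by full pages attached as skb fragments
 * (mapping[1..frags]); the helpers below map, post and unmap buffers in
 * that layout.
 */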
80static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, int frags,
81				  u64 mapping[IPOIB_CM_RX_SG])
82{
83	int i;
84
85	ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE);
86
87	for (i = 0; i < frags; ++i)
88		ib_dma_unmap_single(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE);
89}
90
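/*
 * Receive work request IDs encode the ring slot together with the
 * IPOIB_OP_CM and IPOIB_OP_RECV flags; ipoib_cm_handle_rx_wc() masks the
 * flags off again to recover the ring index.
 */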
91static int ipoib_cm_post_receive_srq(struct net_device *dev, int id)
92{
93	struct ipoib_dev_priv *priv = netdev_priv(dev);
94	struct ib_recv_wr *bad_wr;
95	int i, ret;
96
97	priv->cm.rx_wr.wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV;
98
99	for (i = 0; i < priv->cm.num_frags; ++i)
100		priv->cm.rx_sge[i].addr = priv->cm.srq_ring[id].mapping[i];
101
102	ret = ib_post_srq_recv(priv->cm.srq, &priv->cm.rx_wr, &bad_wr);
103	if (unlikely(ret)) {
104		ipoib_warn(priv, "post srq failed for buf %d (%d)\n", id, ret);
105		ipoib_cm_dma_unmap_rx(priv, priv->cm.num_frags - 1,
106				      priv->cm.srq_ring[id].mapping);
107		dev_kfree_skb_any(priv->cm.srq_ring[id].skb);
108		priv->cm.srq_ring[id].skb = NULL;
109	}
110
111	return ret;
112}
113
114static int ipoib_cm_post_receive_nonsrq(struct net_device *dev,
115					struct ipoib_cm_rx *rx, int id)
116{
117	struct ipoib_dev_priv *priv = netdev_priv(dev);
118	struct ib_recv_wr *bad_wr;
119	int i, ret;
120
121	priv->cm.rx_wr.wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV;
122
123	for (i = 0; i < IPOIB_CM_RX_SG; ++i)
124		priv->cm.rx_sge[i].addr = rx->rx_ring[id].mapping[i];
125
126	ret = ib_post_recv(rx->qp, &priv->cm.rx_wr, &bad_wr);
127	if (unlikely(ret)) {
128		ipoib_warn(priv, "post recv failed for buf %d (%d)\n", id, ret);
129		ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1,
130				      rx->rx_ring[id].mapping);
131		dev_kfree_skb_any(rx->rx_ring[id].skb);
132		rx->rx_ring[id].skb = NULL;
133	}
134
135	return ret;
136}
137
138static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev,
139					     struct ipoib_cm_rx_buf *rx_ring,
140					     int id, int frags,
141					     u64 mapping[IPOIB_CM_RX_SG])
142{
143	struct ipoib_dev_priv *priv = netdev_priv(dev);
144	struct sk_buff *skb;
145	int i;
146
147	skb = dev_alloc_skb(IPOIB_CM_HEAD_SIZE + 12);
148	if (unlikely(!skb))
149		return NULL;
150
151	/*
152	 * IPoIB adds a 4 byte header. So we need 12 more bytes to align the
153	 * IP header to a multiple of 16.
154	 */
155	skb_reserve(skb, 12);
156
157	mapping[0] = ib_dma_map_single(priv->ca, skb->data, IPOIB_CM_HEAD_SIZE,
158				       DMA_FROM_DEVICE);
159	if (unlikely(ib_dma_mapping_error(priv->ca, mapping[0]))) {
160		dev_kfree_skb_any(skb);
161		return NULL;
162	}
163
164	for (i = 0; i < frags; i++) {
165		struct page *page = alloc_page(GFP_ATOMIC);
166
167		if (!page)
168			goto partial_error;
169		skb_fill_page_desc(skb, i, page, 0, PAGE_SIZE);
170
171		mapping[i + 1] = ib_dma_map_page(priv->ca, skb_shinfo(skb)->frags[i].page,
172						 0, PAGE_SIZE, DMA_FROM_DEVICE);
173		if (unlikely(ib_dma_mapping_error(priv->ca, mapping[i + 1])))
174			goto partial_error;
175	}
176
177	rx_ring[id].skb = skb;
178	return skb;
179
180partial_error:
181
182	ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE);
183
184	for (; i > 0; --i)
185		ib_dma_unmap_single(priv->ca, mapping[i], PAGE_SIZE, DMA_FROM_DEVICE);
186
187	dev_kfree_skb_any(skb);
188	return NULL;
189}
190
191static void ipoib_cm_free_rx_ring(struct net_device *dev,
192				  struct ipoib_cm_rx_buf *rx_ring)
193{
194	struct ipoib_dev_priv *priv = netdev_priv(dev);
195	int i;
196
197	for (i = 0; i < ipoib_recvq_size; ++i)
198		if (rx_ring[i].skb) {
199			ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1,
200					      rx_ring[i].mapping);
201			dev_kfree_skb_any(rx_ring[i].skb);
202		}
203
204	kfree(rx_ring);
205}
206
207static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv *priv)
208{
209	struct ib_send_wr *bad_wr;
210	struct ipoib_cm_rx *p;
211
212	/* We only reserved 1 extra slot in CQ for drain WRs, so
213	 * make sure we have at most 1 outstanding WR. */
214	if (list_empty(&priv->cm.rx_flush_list) ||
215	    !list_empty(&priv->cm.rx_drain_list))
216		return;
217
218	/*
219	 * QPs on the flush list are already in the error state, so a "flush
220	 * error" WC will be generated immediately for each WR we post.
221	 */
222	p = list_entry(priv->cm.rx_flush_list.next, typeof(*p), list);
223	if (ib_post_send(p->qp, &ipoib_cm_rx_drain_wr, &bad_wr))
224		ipoib_warn(priv, "failed to post drain wr\n");
225
226	list_splice_init(&priv->cm.rx_flush_list, &priv->cm.rx_drain_list);
227}
228
229static void ipoib_cm_rx_event_handler(struct ib_event *event, void *ctx)
230{
231	struct ipoib_cm_rx *p = ctx;
232	struct ipoib_dev_priv *priv = netdev_priv(p->dev);
233	unsigned long flags;
234
235	if (event->event != IB_EVENT_QP_LAST_WQE_REACHED)
236		return;
237
238	spin_lock_irqsave(&priv->lock, flags);
239	list_move(&p->list, &priv->cm.rx_flush_list);
240	p->state = IPOIB_CM_RX_FLUSH;
241	ipoib_cm_start_rx_drain(priv);
242	spin_unlock_irqrestore(&priv->lock, flags);
243}
244
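/*
 * The RX QP gets a send queue of depth one solely so that
 * ipoib_cm_start_rx_drain() can post the drain WR to it; when an SRQ is
 * available the receive caps are left at zero, since receive buffers are
 * posted to the shared SRQ instead.
 */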
245static struct ib_qp *ipoib_cm_create_rx_qp(struct net_device *dev,
246					   struct ipoib_cm_rx *p)
247{
248	struct ipoib_dev_priv *priv = netdev_priv(dev);
249	struct ib_qp_init_attr attr = {
250		.event_handler = ipoib_cm_rx_event_handler,
251		.send_cq = priv->cq, /* For drain WR */
252		.recv_cq = priv->cq,
253		.srq = priv->cm.srq,
254		.cap.max_send_wr = 1, /* For drain WR */
255		.cap.max_send_sge = 1, /* FIXME: 0 Seems not to work */
256		.sq_sig_type = IB_SIGNAL_ALL_WR,
257		.qp_type = IB_QPT_RC,
258		.qp_context = p,
259	};
260
261	if (!ipoib_cm_has_srq(dev)) {
262		attr.cap.max_recv_wr  = ipoib_recvq_size;
263		attr.cap.max_recv_sge = IPOIB_CM_RX_SG;
264	}
265
266	return ib_create_qp(priv->pd, &attr);
267}
268
269static int ipoib_cm_modify_rx_qp(struct net_device *dev,
270				 struct ib_cm_id *cm_id, struct ib_qp *qp,
271				 unsigned psn)
272{
273	struct ipoib_dev_priv *priv = netdev_priv(dev);
274	struct ib_qp_attr qp_attr;
275	int qp_attr_mask, ret;
276
277	qp_attr.qp_state = IB_QPS_INIT;
278	ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
279	if (ret) {
280		ipoib_warn(priv, "failed to init QP attr for INIT: %d\n", ret);
281		return ret;
282	}
283	ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
284	if (ret) {
285		ipoib_warn(priv, "failed to modify QP to INIT: %d\n", ret);
286		return ret;
287	}
288	qp_attr.qp_state = IB_QPS_RTR;
289	ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
290	if (ret) {
291		ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret);
292		return ret;
293	}
294	qp_attr.rq_psn = psn;
295	ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
296	if (ret) {
297		ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret);
298		return ret;
299	}
300
301	/*
302	 * Current Mellanox HCA firmware won't generate completions
303	 * with error for drain WRs unless the QP has been moved to
304	 * RTS first. This work-around leaves a window where a QP has
305	 * moved to error asynchronously, but this will eventually get
306	 * fixed in firmware, so let's not error out if modify QP
307	 * fails.
308	 */
309	qp_attr.qp_state = IB_QPS_RTS;
310	ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
311	if (ret) {
312		ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret);
313		return 0;
314	}
315	ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
316	if (ret) {
317		ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret);
318		return 0;
319	}
320
321	return 0;
322}
323
324static int ipoib_cm_nonsrq_init_rx(struct net_device *dev, struct ib_cm_id *cm_id,
325				   struct ipoib_cm_rx *rx)
326{
327	struct ipoib_dev_priv *priv = netdev_priv(dev);
328	int ret;
329	int i;
330
331	rx->rx_ring = kcalloc(ipoib_recvq_size, sizeof *rx->rx_ring, GFP_KERNEL);
332	if (!rx->rx_ring)
333		return -ENOMEM;
334
335	spin_lock_irq(&priv->lock);
336
337	if (priv->cm.nonsrq_conn_qp >= ipoib_max_conn_qp) {
338		spin_unlock_irq(&priv->lock);
339		ib_send_cm_rej(cm_id, IB_CM_REJ_NO_QP, NULL, 0, NULL, 0);
340		ret = -EINVAL;
341		goto err_free;
342	} else
343		++priv->cm.nonsrq_conn_qp;
344
345	spin_unlock_irq(&priv->lock);
346
347	for (i = 0; i < ipoib_recvq_size; ++i) {
348		if (!ipoib_cm_alloc_rx_skb(dev, rx->rx_ring, i, IPOIB_CM_RX_SG - 1,
349					   rx->rx_ring[i].mapping)) {
350			ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
351			ret = -ENOMEM;
352			goto err_count;
353		}
354		ret = ipoib_cm_post_receive_nonsrq(dev, rx, i);
355		if (ret) {
356			ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq "
357				   "failed for buf %d\n", i);
358			ret = -EIO;
359			goto err_count;
360		}
361	}
362
363	rx->recv_count = ipoib_recvq_size;
364
365	return 0;
366
367err_count:
368	spin_lock_irq(&priv->lock);
369	--priv->cm.nonsrq_conn_qp;
370	spin_unlock_irq(&priv->lock);
371
372err_free:
373	ipoib_cm_free_rx_ring(dev, rx->rx_ring);
374
375	return ret;
376}
377
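/*
 * Our datagram QPN and receive buffer size are advertised as REP private
 * data; the active side reads the advertised buffer size in
 * ipoib_cm_rep_handler() and uses it as the maximum packet length for the
 * connection.
 */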
378static int ipoib_cm_send_rep(struct net_device *dev, struct ib_cm_id *cm_id,
379			     struct ib_qp *qp, struct ib_cm_req_event_param *req,
380			     unsigned psn)
381{
382	struct ipoib_dev_priv *priv = netdev_priv(dev);
383	struct ipoib_cm_data data = {};
384	struct ib_cm_rep_param rep = {};
385
386	data.qpn = cpu_to_be32(priv->qp->qp_num);
387	data.mtu = cpu_to_be32(IPOIB_CM_BUF_SIZE);
388
389	rep.private_data = &data;
390	rep.private_data_len = sizeof data;
391	rep.flow_control = 0;
392	rep.rnr_retry_count = req->rnr_retry_count;
393	rep.srq = ipoib_cm_has_srq(dev);
394	rep.qp_num = qp->qp_num;
395	rep.starting_psn = psn;
396	return ib_send_cm_rep(cm_id, &rep);
397}
398
399static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
400{
401	struct net_device *dev = cm_id->context;
402	struct ipoib_dev_priv *priv = netdev_priv(dev);
403	struct ipoib_cm_rx *p;
404	unsigned psn;
405	int ret;
406
407	ipoib_dbg(priv, "REQ arrived\n");
408	p = kzalloc(sizeof *p, GFP_KERNEL);
409	if (!p)
410		return -ENOMEM;
411	p->dev = dev;
412	p->id = cm_id;
413	cm_id->context = p;
414	p->state = IPOIB_CM_RX_LIVE;
415	p->jiffies = jiffies;
416	INIT_LIST_HEAD(&p->list);
417
418	p->qp = ipoib_cm_create_rx_qp(dev, p);
419	if (IS_ERR(p->qp)) {
420		ret = PTR_ERR(p->qp);
421		goto err_qp;
422	}
423
424	psn = random32() & 0xffffff;
425	ret = ipoib_cm_modify_rx_qp(dev, cm_id, p->qp, psn);
426	if (ret)
427		goto err_modify;
428
429	if (!ipoib_cm_has_srq(dev)) {
430		ret = ipoib_cm_nonsrq_init_rx(dev, cm_id, p);
431		if (ret)
432			goto err_modify;
433	}
434
435	spin_lock_irq(&priv->lock);
436	queue_delayed_work(ipoib_workqueue,
437			   &priv->cm.stale_task, IPOIB_CM_RX_DELAY);
438	/* Add this entry to the passive_ids list head, but do not re-add it
439	 * if IB_EVENT_QP_LAST_WQE_REACHED has already moved it to the flush list. */
440	p->jiffies = jiffies;
441	if (p->state == IPOIB_CM_RX_LIVE)
442		list_move(&p->list, &priv->cm.passive_ids);
443	spin_unlock_irq(&priv->lock);
444
445	ret = ipoib_cm_send_rep(dev, cm_id, p->qp, &event->param.req_rcvd, psn);
446	if (ret) {
447		ipoib_warn(priv, "failed to send REP: %d\n", ret);
448		if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE))
449			ipoib_warn(priv, "unable to move qp to error state\n");
450	}
451	return 0;
452
453err_modify:
454	ib_destroy_qp(p->qp);
455err_qp:
456	kfree(p);
457	return ret;
458}
459
460static int ipoib_cm_rx_handler(struct ib_cm_id *cm_id,
461			       struct ib_cm_event *event)
462{
463	struct ipoib_cm_rx *p;
464	struct ipoib_dev_priv *priv;
465
466	switch (event->event) {
467	case IB_CM_REQ_RECEIVED:
468		return ipoib_cm_req_handler(cm_id, event);
469	case IB_CM_DREQ_RECEIVED:
470		p = cm_id->context;
471		ib_send_cm_drep(cm_id, NULL, 0);
472		/* Fall through */
473	case IB_CM_REJ_RECEIVED:
474		p = cm_id->context;
475		priv = netdev_priv(p->dev);
476		if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE))
477			ipoib_warn(priv, "unable to move qp to error state\n");
478		/* Fall through */
479	default:
480		return 0;
481	}
482}
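
/*
 * The skb just received was allocated with a full set of fragment pages;
 * skb_put_frags() trims its length to the number of bytes actually
 * received and hands any unneeded pages over to "toskb" (the freshly
 * allocated replacement buffer), so the ring slot stays fully populated.
 */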
483/* Adjust length of skb with fragments to match received data */
484static void skb_put_frags(struct sk_buff *skb, unsigned int hdr_space,
485			  unsigned int length, struct sk_buff *toskb)
486{
487	int i, num_frags;
488	unsigned int size;
489
490	/* put header into skb */
491	size = min(length, hdr_space);
492	skb->tail += size;
493	skb->len += size;
494	length -= size;
495
496	num_frags = skb_shinfo(skb)->nr_frags;
497	for (i = 0; i < num_frags; i++) {
498		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
499
500		if (length == 0) {
501			/* don't need this page */
502			skb_fill_page_desc(toskb, i, frag->page, 0, PAGE_SIZE);
503			--skb_shinfo(skb)->nr_frags;
504		} else {
505			size = min(length, (unsigned) PAGE_SIZE);
506
507			frag->size = size;
508			skb->data_len += size;
509			skb->truesize += size;
510			skb->len += size;
511			length -= size;
512		}
513	}
514}
515
516void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
517{
518	struct ipoib_dev_priv *priv = netdev_priv(dev);
519	struct ipoib_cm_rx_buf *rx_ring;
520	unsigned int wr_id = wc->wr_id & ~(IPOIB_OP_CM | IPOIB_OP_RECV);
521	struct sk_buff *skb, *newskb;
522	struct ipoib_cm_rx *p;
523	unsigned long flags;
524	u64 mapping[IPOIB_CM_RX_SG];
525	int frags;
526	int has_srq;
527
528	ipoib_dbg_data(priv, "cm recv completion: id %d, status: %d\n",
529		       wr_id, wc->status);
530
531	if (unlikely(wr_id >= ipoib_recvq_size)) {
532		if (wr_id == (IPOIB_CM_RX_DRAIN_WRID & ~(IPOIB_OP_CM | IPOIB_OP_RECV))) {
533			spin_lock_irqsave(&priv->lock, flags);
534			list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list);
535			ipoib_cm_start_rx_drain(priv);
536			queue_work(ipoib_workqueue, &priv->cm.rx_reap_task);
537			spin_unlock_irqrestore(&priv->lock, flags);
538		} else
539			ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n",
540				   wr_id, ipoib_recvq_size);
541		return;
542	}
543
544	p = wc->qp->qp_context;
545
546	has_srq = ipoib_cm_has_srq(dev);
547	rx_ring = has_srq ? priv->cm.srq_ring : p->rx_ring;
548
549	skb = rx_ring[wr_id].skb;
550
551	if (unlikely(wc->status != IB_WC_SUCCESS)) {
552		ipoib_dbg(priv, "cm recv error "
553			   "(status=%d, wrid=%d vend_err %x)\n",
554			   wc->status, wr_id, wc->vendor_err);
555		++dev->stats.rx_dropped;
556		if (has_srq)
557			goto repost;
558		else {
559			if (!--p->recv_count) {
560				spin_lock_irqsave(&priv->lock, flags);
561				list_move(&p->list, &priv->cm.rx_reap_list);
562				spin_unlock_irqrestore(&priv->lock, flags);
563				queue_work(ipoib_workqueue, &priv->cm.rx_reap_task);
564			}
565			return;
566		}
567	}
568
569	if (unlikely(!(wr_id & IPOIB_CM_RX_UPDATE_MASK))) {
570		if (p && time_after_eq(jiffies, p->jiffies + IPOIB_CM_RX_UPDATE_TIME)) {
571			spin_lock_irqsave(&priv->lock, flags);
572			p->jiffies = jiffies;
573			/* Move this entry to the list head, but do not re-add it
574			 * if it has already been moved off the list. */
575			if (p->state == IPOIB_CM_RX_LIVE)
576				list_move(&p->list, &priv->cm.passive_ids);
577			spin_unlock_irqrestore(&priv->lock, flags);
578		}
579	}
580
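	/*
	 * Number of full pages needed beyond the linear head for this
	 * packet: bytes that did not fit into IPOIB_CM_HEAD_SIZE, rounded
	 * up to whole pages.
	 */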
581	frags = PAGE_ALIGN(wc->byte_len - min(wc->byte_len,
582					      (unsigned)IPOIB_CM_HEAD_SIZE)) / PAGE_SIZE;
583
584	newskb = ipoib_cm_alloc_rx_skb(dev, rx_ring, wr_id, frags, mapping);
585	if (unlikely(!newskb)) {
586		/*
587		 * If we can't allocate a new RX buffer, dump
588		 * this packet and reuse the old buffer.
589		 */
590		ipoib_dbg(priv, "failed to allocate receive buffer %d\n", wr_id);
591		++dev->stats.rx_dropped;
592		goto repost;
593	}
594
595	ipoib_cm_dma_unmap_rx(priv, frags, rx_ring[wr_id].mapping);
596	memcpy(rx_ring[wr_id].mapping, mapping, (frags + 1) * sizeof *mapping);
597
598	ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
599		       wc->byte_len, wc->slid);
600
601	skb_put_frags(skb, IPOIB_CM_HEAD_SIZE, wc->byte_len, newskb);
602
603	skb->protocol = ((struct ipoib_header *) skb->data)->proto;
604	skb_reset_mac_header(skb);
605	skb_pull(skb, IPOIB_ENCAP_LEN);
606
607	dev->last_rx = jiffies;
608	++dev->stats.rx_packets;
609	dev->stats.rx_bytes += skb->len;
610
611	skb->dev = dev;
612	/* XXX get correct PACKET_ type here */
613	skb->pkt_type = PACKET_HOST;
614	netif_receive_skb(skb);
615
616repost:
617	if (has_srq) {
618		if (unlikely(ipoib_cm_post_receive_srq(dev, wr_id)))
619			ipoib_warn(priv, "ipoib_cm_post_receive_srq failed "
620				   "for buf %d\n", wr_id);
621	} else {
622		if (unlikely(ipoib_cm_post_receive_nonsrq(dev, p, wr_id))) {
623			--p->recv_count;
624			ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq failed "
625				   "for buf %d\n", wr_id);
626		}
627	}
628}
629
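/*
 * Connected-mode transmits use a single gather entry covering the linear
 * skb data; ipoib_cm_send() below drops anything longer than the
 * connection MTU learned from the REP before posting.
 */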
630static inline int post_send(struct ipoib_dev_priv *priv,
631			    struct ipoib_cm_tx *tx,
632			    unsigned int wr_id,
633			    u64 addr, int len)
634{
635	struct ib_send_wr *bad_wr;
636
637	priv->tx_sge[0].addr          = addr;
638	priv->tx_sge[0].length        = len;
639
640	priv->tx_wr.wr_id	= wr_id | IPOIB_OP_CM;
641
642	return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr);
643}
644
645void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx)
646{
647	struct ipoib_dev_priv *priv = netdev_priv(dev);
648	struct ipoib_tx_buf *tx_req;
649	u64 addr;
650
651	if (unlikely(skb->len > tx->mtu)) {
652		ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
653			   skb->len, tx->mtu);
654		++dev->stats.tx_dropped;
655		++dev->stats.tx_errors;
656		ipoib_cm_skb_too_long(dev, skb, tx->mtu - IPOIB_ENCAP_LEN);
657		return;
658	}
659
660	ipoib_dbg_data(priv, "sending packet: head 0x%x length %d connection 0x%x\n",
661		       tx->tx_head, skb->len, tx->qp->qp_num);
662
663	/*
664	 * We put the skb into the tx_ring _before_ we call post_send()
665	 * because it's entirely possible that the completion handler will
666	 * run before we execute anything after the post_send().  That
667	 * means we have to make sure everything is properly recorded and
668	 * our state is consistent before we call post_send().
669	 */
670	tx_req = &tx->tx_ring[tx->tx_head & (ipoib_sendq_size - 1)];
671	tx_req->skb = skb;
672	addr = ib_dma_map_single(priv->ca, skb->data, skb->len, DMA_TO_DEVICE);
673	if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
674		++dev->stats.tx_errors;
675		dev_kfree_skb_any(skb);
676		return;
677	}
678
679	tx_req->mapping[0] = addr;
680
681	if (unlikely(post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1),
682			       addr, skb->len))) {
683		ipoib_warn(priv, "post_send failed\n");
684		++dev->stats.tx_errors;
685		ib_dma_unmap_single(priv->ca, addr, skb->len, DMA_TO_DEVICE);
686		dev_kfree_skb_any(skb);
687	} else {
688		dev->trans_start = jiffies;
689		++tx->tx_head;
690
691		if (++priv->tx_outstanding == ipoib_sendq_size) {
692			ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n",
693				  tx->qp->qp_num);
694			netif_stop_queue(dev);
695		}
696	}
697}
698
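/*
 * ipoib_cm_send() stops the net queue once tx_outstanding reaches
 * ipoib_sendq_size; it is woken again here when the ring drains back to
 * half of that, provided the interface is still administratively up.
 */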
699void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
700{
701	struct ipoib_dev_priv *priv = netdev_priv(dev);
702	struct ipoib_cm_tx *tx = wc->qp->qp_context;
703	unsigned int wr_id = wc->wr_id & ~IPOIB_OP_CM;
704	struct ipoib_tx_buf *tx_req;
705	unsigned long flags;
706
707	ipoib_dbg_data(priv, "cm send completion: id %d, status: %d\n",
708		       wr_id, wc->status);
709
710	if (unlikely(wr_id >= ipoib_sendq_size)) {
711		ipoib_warn(priv, "cm send completion event with wrid %d (> %d)\n",
712			   wr_id, ipoib_sendq_size);
713		return;
714	}
715
716	tx_req = &tx->tx_ring[wr_id];
717
718	ib_dma_unmap_single(priv->ca, tx_req->mapping[0], tx_req->skb->len, DMA_TO_DEVICE);
719
720	/* FIXME: is this right? Shouldn't we only increment on success? */
721	++dev->stats.tx_packets;
722	dev->stats.tx_bytes += tx_req->skb->len;
723
724	dev_kfree_skb_any(tx_req->skb);
725
726	spin_lock_irqsave(&priv->tx_lock, flags);
727	++tx->tx_tail;
728	if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
729	    netif_queue_stopped(dev) &&
730	    test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
731		netif_wake_queue(dev);
732
733	if (wc->status != IB_WC_SUCCESS &&
734	    wc->status != IB_WC_WR_FLUSH_ERR) {
735		struct ipoib_neigh *neigh;
736
737		ipoib_dbg(priv, "failed cm send event "
738			   "(status=%d, wrid=%d vend_err %x)\n",
739			   wc->status, wr_id, wc->vendor_err);
740
741		spin_lock(&priv->lock);
742		neigh = tx->neigh;
743
744		if (neigh) {
745			neigh->cm = NULL;
746			list_del(&neigh->list);
747			if (neigh->ah)
748				ipoib_put_ah(neigh->ah);
749			ipoib_neigh_free(dev, neigh);
750
751			tx->neigh = NULL;
752		}
753
754		if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
755			list_move(&tx->list, &priv->cm.reap_list);
756			queue_work(ipoib_workqueue, &priv->cm.reap_task);
757		}
758
759		clear_bit(IPOIB_FLAG_OPER_UP, &tx->flags);
760
761		spin_unlock(&priv->lock);
762	}
763
764	spin_unlock_irqrestore(&priv->tx_lock, flags);
765}
766
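/*
 * The passive side listens on a service ID formed from IPOIB_CM_IETF_ID
 * and its own datagram QP number; ipoib_cm_send_req() builds the same ID
 * from the remote QPN taken out of the peer's hardware address.
 */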
767int ipoib_cm_dev_open(struct net_device *dev)
768{
769	struct ipoib_dev_priv *priv = netdev_priv(dev);
770	int ret;
771
772	if (!IPOIB_CM_SUPPORTED(dev->dev_addr))
773		return 0;
774
775	priv->cm.id = ib_create_cm_id(priv->ca, ipoib_cm_rx_handler, dev);
776	if (IS_ERR(priv->cm.id)) {
777		printk(KERN_WARNING "%s: failed to create CM ID\n", priv->ca->name);
778		ret = PTR_ERR(priv->cm.id);
779		goto err_cm;
780	}
781
782	ret = ib_cm_listen(priv->cm.id, cpu_to_be64(IPOIB_CM_IETF_ID | priv->qp->qp_num),
783			   0, NULL);
784	if (ret) {
785		printk(KERN_WARNING "%s: failed to listen on ID 0x%llx\n", priv->ca->name,
786		       IPOIB_CM_IETF_ID | priv->qp->qp_num);
787		goto err_listen;
788	}
789
790	return 0;
791
792err_listen:
793	ib_destroy_cm_id(priv->cm.id);
794err_cm:
795	priv->cm.id = NULL;
796	return ret;
797}
798
799static void ipoib_cm_free_rx_reap_list(struct net_device *dev)
800{
801	struct ipoib_dev_priv *priv = netdev_priv(dev);
802	struct ipoib_cm_rx *rx, *n;
803	LIST_HEAD(list);
804
805	spin_lock_irq(&priv->lock);
806	list_splice_init(&priv->cm.rx_reap_list, &list);
807	spin_unlock_irq(&priv->lock);
808
809	list_for_each_entry_safe(rx, n, &list, list) {
810		ib_destroy_cm_id(rx->id);
811		ib_destroy_qp(rx->qp);
812		if (!ipoib_cm_has_srq(dev)) {
813			ipoib_cm_free_rx_ring(priv->dev, rx->rx_ring);
814			spin_lock_irq(&priv->lock);
815			--priv->cm.nonsrq_conn_qp;
816			spin_unlock_irq(&priv->lock);
817		}
818		kfree(rx);
819	}
820}
821
822void ipoib_cm_dev_stop(struct net_device *dev)
823{
824	struct ipoib_dev_priv *priv = netdev_priv(dev);
825	struct ipoib_cm_rx *p;
826	unsigned long begin;
827	LIST_HEAD(list);
828	int ret;
829
830	if (!IPOIB_CM_SUPPORTED(dev->dev_addr) || !priv->cm.id)
831		return;
832
833	ib_destroy_cm_id(priv->cm.id);
834	priv->cm.id = NULL;
835
836	spin_lock_irq(&priv->lock);
837	while (!list_empty(&priv->cm.passive_ids)) {
838		p = list_entry(priv->cm.passive_ids.next, typeof(*p), list);
839		list_move(&p->list, &priv->cm.rx_error_list);
840		p->state = IPOIB_CM_RX_ERROR;
841		spin_unlock_irq(&priv->lock);
842		ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE);
843		if (ret)
844			ipoib_warn(priv, "unable to move qp to error state: %d\n", ret);
845		spin_lock_irq(&priv->lock);
846	}
847
848	/* Wait for all RX to be drained */
849	begin = jiffies;
850
851	while (!list_empty(&priv->cm.rx_error_list) ||
852	       !list_empty(&priv->cm.rx_flush_list) ||
853	       !list_empty(&priv->cm.rx_drain_list)) {
854		if (time_after(jiffies, begin + 5 * HZ)) {
855			ipoib_warn(priv, "RX drain timing out\n");
856
857			/*
858			 * assume the HW is wedged and just free up everything.
859			 */
860			list_splice_init(&priv->cm.rx_flush_list, &list);
861			list_splice_init(&priv->cm.rx_error_list, &list);
862			list_splice_init(&priv->cm.rx_drain_list, &list);
863			break;
864		}
865		spin_unlock_irq(&priv->lock);
866		msleep(1);
867		ipoib_drain_cq(dev);
868		spin_lock_irq(&priv->lock);
869	}
870
871	spin_unlock_irq(&priv->lock);
872
873	ipoib_cm_free_rx_reap_list(dev);
874
875	cancel_delayed_work(&priv->cm.stale_task);
876}
877
878static int ipoib_cm_rep_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
879{
880	struct ipoib_cm_tx *p = cm_id->context;
881	struct ipoib_dev_priv *priv = netdev_priv(p->dev);
882	struct ipoib_cm_data *data = event->private_data;
883	struct sk_buff_head skqueue;
884	struct ib_qp_attr qp_attr;
885	int qp_attr_mask, ret;
886	struct sk_buff *skb;
887
888	p->mtu = be32_to_cpu(data->mtu);
889
890	if (p->mtu <= IPOIB_ENCAP_LEN) {
891		ipoib_warn(priv, "Rejecting connection: mtu %d <= %d\n",
892			   p->mtu, IPOIB_ENCAP_LEN);
893		return -EINVAL;
894	}
895
896	qp_attr.qp_state = IB_QPS_RTR;
897	ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
898	if (ret) {
899		ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret);
900		return ret;
901	}
902
903	qp_attr.rq_psn = 0 /* FIXME */;
904	ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask);
905	if (ret) {
906		ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret);
907		return ret;
908	}
909
910	qp_attr.qp_state = IB_QPS_RTS;
911	ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
912	if (ret) {
913		ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret);
914		return ret;
915	}
916	ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask);
917	if (ret) {
918		ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret);
919		return ret;
920	}
921
922	skb_queue_head_init(&skqueue);
923
924	spin_lock_irq(&priv->lock);
925	set_bit(IPOIB_FLAG_OPER_UP, &p->flags);
926	if (p->neigh)
927		while ((skb = __skb_dequeue(&p->neigh->queue)))
928			__skb_queue_tail(&skqueue, skb);
929	spin_unlock_irq(&priv->lock);
930
931	while ((skb = __skb_dequeue(&skqueue))) {
932		skb->dev = p->dev;
933		if (dev_queue_xmit(skb))
934			ipoib_warn(priv, "dev_queue_xmit failed "
935				   "to requeue packet\n");
936	}
937
938	ret = ib_send_cm_rtu(cm_id, NULL, 0);
939	if (ret) {
940		ipoib_warn(priv, "failed to send RTU: %d\n", ret);
941		return ret;
942	}
943	return 0;
944}
945
946static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_cm_tx *tx)
947{
948	struct ipoib_dev_priv *priv = netdev_priv(dev);
949	struct ib_qp_init_attr attr = {
950		.send_cq		= priv->cq,
951		.recv_cq		= priv->cq,
952		.srq			= priv->cm.srq,
953		.cap.max_send_wr	= ipoib_sendq_size,
954		.cap.max_send_sge	= 1,
955		.sq_sig_type		= IB_SIGNAL_ALL_WR,
956		.qp_type		= IB_QPT_RC,
957		.qp_context		= tx
958	};
959
960	return ib_create_qp(priv->pd, &attr);
961}
962
963static int ipoib_cm_send_req(struct net_device *dev,
964			     struct ib_cm_id *id, struct ib_qp *qp,
965			     u32 qpn,
966			     struct ib_sa_path_rec *pathrec)
967{
968	struct ipoib_dev_priv *priv = netdev_priv(dev);
969	struct ipoib_cm_data data = {};
970	struct ib_cm_req_param req = {};
971
972	data.qpn = cpu_to_be32(priv->qp->qp_num);
973	data.mtu = cpu_to_be32(IPOIB_CM_BUF_SIZE);
974
975	req.primary_path		= pathrec;
976	req.alternate_path		= NULL;
977	req.service_id			= cpu_to_be64(IPOIB_CM_IETF_ID | qpn);
978	req.qp_num			= qp->qp_num;
979	req.qp_type			= qp->qp_type;
980	req.private_data		= &data;
981	req.private_data_len		= sizeof data;
982	req.flow_control		= 0;
983
984	req.starting_psn		= 0; /* FIXME */
985
986	/*
987	 * Pick some arbitrary defaults here; we could make these
988	 * module parameters if anyone cared about setting them.
989	 */
990	req.responder_resources		= 4;
991	req.remote_cm_response_timeout	= 20;
992	req.local_cm_response_timeout	= 20;
993	req.retry_count			= 0; /* RFC draft warns against retries */
994	req.rnr_retry_count		= 0; /* RFC draft warns against retries */
995	req.max_cm_retries		= 15;
996	req.srq				= ipoib_cm_has_srq(dev);
997	return ib_send_cm_req(id, &req);
998}
999
1000static int ipoib_cm_modify_tx_init(struct net_device *dev,
1001				  struct ib_cm_id *cm_id, struct ib_qp *qp)
1002{
1003	struct ipoib_dev_priv *priv = netdev_priv(dev);
1004	struct ib_qp_attr qp_attr;
1005	int qp_attr_mask, ret;
1006	ret = ib_find_cached_pkey(priv->ca, priv->port, priv->pkey, &qp_attr.pkey_index);
1007	if (ret) {
1008		ipoib_warn(priv, "pkey 0x%x not in cache: %d\n", priv->pkey, ret);
1009		return ret;
1010	}
1011
1012	qp_attr.qp_state = IB_QPS_INIT;
1013	qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE;
1014	qp_attr.port_num = priv->port;
1015	qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | IB_QP_PORT;
1016
1017	ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
1018	if (ret) {
1019		ipoib_warn(priv, "failed to modify tx QP to INIT: %d\n", ret);
1020		return ret;
1021	}
1022	return 0;
1023}
1024
1025static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn,
1026			    struct ib_sa_path_rec *pathrec)
1027{
1028	struct ipoib_dev_priv *priv = netdev_priv(p->dev);
1029	int ret;
1030
1031	p->tx_ring = kzalloc(ipoib_sendq_size * sizeof *p->tx_ring,
1032				GFP_KERNEL);
1033	if (!p->tx_ring) {
1034		ipoib_warn(priv, "failed to allocate tx ring\n");
1035		ret = -ENOMEM;
1036		goto err_tx;
1037	}
1038
1039	p->qp = ipoib_cm_create_tx_qp(p->dev, p);
1040	if (IS_ERR(p->qp)) {
1041		ret = PTR_ERR(p->qp);
1042		ipoib_warn(priv, "failed to allocate tx qp: %d\n", ret);
1043		goto err_qp;
1044	}
1045
1046	p->id = ib_create_cm_id(priv->ca, ipoib_cm_tx_handler, p);
1047	if (IS_ERR(p->id)) {
1048		ret = PTR_ERR(p->id);
1049		ipoib_warn(priv, "failed to create tx cm id: %d\n", ret);
1050		goto err_id;
1051	}
1052
1053	ret = ipoib_cm_modify_tx_init(p->dev, p->id,  p->qp);
1054	if (ret) {
1055		ipoib_warn(priv, "failed to modify tx qp to rtr: %d\n", ret);
1056		goto err_modify;
1057	}
1058
1059	ret = ipoib_cm_send_req(p->dev, p->id, p->qp, qpn, pathrec);
1060	if (ret) {
1061		ipoib_warn(priv, "failed to send cm req: %d\n", ret);
1062		goto err_send_cm;
1063	}
1064
1065	ipoib_dbg(priv, "Request connection 0x%x for gid " IPOIB_GID_FMT " qpn 0x%x\n",
1066		  p->qp->qp_num, IPOIB_GID_ARG(pathrec->dgid), qpn);
1067
1068	return 0;
1069
1070err_send_cm:
1071err_modify:
1072	ib_destroy_cm_id(p->id);
1073err_id:
1074	p->id = NULL;
1075	ib_destroy_qp(p->qp);
1076err_qp:
1077	p->qp = NULL;
1078err_tx:
1079	return ret;
1080}
1081
1082static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p)
1083{
1084	struct ipoib_dev_priv *priv = netdev_priv(p->dev);
1085	struct ipoib_tx_buf *tx_req;
1086	unsigned long flags;
1087	unsigned long begin;
1088
1089	ipoib_dbg(priv, "Destroy active connection 0x%x head 0x%x tail 0x%x\n",
1090		  p->qp ? p->qp->qp_num : 0, p->tx_head, p->tx_tail);
1091
1092	if (p->id)
1093		ib_destroy_cm_id(p->id);
1094
1095	if (p->tx_ring) {
1096		/* Wait for all sends to complete */
1097		begin = jiffies;
1098		while ((int) p->tx_tail - (int) p->tx_head < 0) {
1099			if (time_after(jiffies, begin + 5 * HZ)) {
1100				ipoib_warn(priv, "timing out; %d sends not completed\n",
1101					   p->tx_head - p->tx_tail);
1102				goto timeout;
1103			}
1104
1105			msleep(1);
1106		}
1107	}
1108
1109timeout:
1110
1111	while ((int) p->tx_tail - (int) p->tx_head < 0) {
1112		tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)];
1113		ib_dma_unmap_single(priv->ca, tx_req->mapping[0], tx_req->skb->len,
1114				    DMA_TO_DEVICE);
1115		dev_kfree_skb_any(tx_req->skb);
1116		++p->tx_tail;
1117		spin_lock_irqsave(&priv->tx_lock, flags);
1118		if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
1119		    netif_queue_stopped(p->dev) &&
1120		    test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
1121			netif_wake_queue(p->dev);
1122		spin_unlock_irqrestore(&priv->tx_lock, flags);
1123	}
1124
1125	if (p->qp)
1126		ib_destroy_qp(p->qp);
1127
1128	kfree(p->tx_ring);
1129	kfree(p);
1130}
1131
1132static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
1133			       struct ib_cm_event *event)
1134{
1135	struct ipoib_cm_tx *tx = cm_id->context;
1136	struct ipoib_dev_priv *priv = netdev_priv(tx->dev);
1137	struct net_device *dev = priv->dev;
1138	struct ipoib_neigh *neigh;
1139	int ret;
1140
1141	switch (event->event) {
1142	case IB_CM_DREQ_RECEIVED:
1143		ipoib_dbg(priv, "DREQ received.\n");
1144		ib_send_cm_drep(cm_id, NULL, 0);
1145		break;
1146	case IB_CM_REP_RECEIVED:
1147		ipoib_dbg(priv, "REP received.\n");
1148		ret = ipoib_cm_rep_handler(cm_id, event);
1149		if (ret)
1150			ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED,
1151				       NULL, 0, NULL, 0);
1152		break;
1153	case IB_CM_REQ_ERROR:
1154	case IB_CM_REJ_RECEIVED:
1155	case IB_CM_TIMEWAIT_EXIT:
1156		ipoib_dbg(priv, "CM error %d.\n", event->event);
1157		spin_lock_irq(&priv->tx_lock);
1158		spin_lock(&priv->lock);
1159		neigh = tx->neigh;
1160
1161		if (neigh) {
1162			neigh->cm = NULL;
1163			list_del(&neigh->list);
1164			if (neigh->ah)
1165				ipoib_put_ah(neigh->ah);
1166			ipoib_neigh_free(dev, neigh);
1167
1168			tx->neigh = NULL;
1169		}
1170
1171		if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
1172			list_move(&tx->list, &priv->cm.reap_list);
1173			queue_work(ipoib_workqueue, &priv->cm.reap_task);
1174		}
1175
1176		spin_unlock(&priv->lock);
1177		spin_unlock_irq(&priv->tx_lock);
1178		break;
1179	default:
1180		break;
1181	}
1182
1183	return 0;
1184}
1185
1186struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path,
1187				       struct ipoib_neigh *neigh)
1188{
1189	struct ipoib_dev_priv *priv = netdev_priv(dev);
1190	struct ipoib_cm_tx *tx;
1191
1192	tx = kzalloc(sizeof *tx, GFP_ATOMIC);
1193	if (!tx)
1194		return NULL;
1195
1196	neigh->cm = tx;
1197	tx->neigh = neigh;
1198	tx->path = path;
1199	tx->dev = dev;
1200	list_add(&tx->list, &priv->cm.start_list);
1201	set_bit(IPOIB_FLAG_INITIALIZED, &tx->flags);
1202	queue_work(ipoib_workqueue, &priv->cm.start_task);
1203	return tx;
1204}
1205
1206void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx)
1207{
1208	struct ipoib_dev_priv *priv = netdev_priv(tx->dev);
1209	if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
1210		list_move(&tx->list, &priv->cm.reap_list);
1211		queue_work(ipoib_workqueue, &priv->cm.reap_task);
1212		ipoib_dbg(priv, "Reap connection for gid " IPOIB_GID_FMT "\n",
1213			  IPOIB_GID_ARG(tx->neigh->dgid));
1214		tx->neigh = NULL;
1215	}
1216}
1217
1218static void ipoib_cm_tx_start(struct work_struct *work)
1219{
1220	struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
1221						   cm.start_task);
1222	struct net_device *dev = priv->dev;
1223	struct ipoib_neigh *neigh;
1224	struct ipoib_cm_tx *p;
1225	unsigned long flags;
1226	int ret;
1227
1228	struct ib_sa_path_rec pathrec;
1229	u32 qpn;
1230
1231	spin_lock_irqsave(&priv->tx_lock, flags);
1232	spin_lock(&priv->lock);
1233	while (!list_empty(&priv->cm.start_list)) {
1234		p = list_entry(priv->cm.start_list.next, typeof(*p), list);
1235		list_del_init(&p->list);
1236		neigh = p->neigh;
1237		qpn = IPOIB_QPN(neigh->neighbour->ha);
1238		memcpy(&pathrec, &p->path->pathrec, sizeof pathrec);
1239		spin_unlock(&priv->lock);
1240		spin_unlock_irqrestore(&priv->tx_lock, flags);
1241		ret = ipoib_cm_tx_init(p, qpn, &pathrec);
1242		spin_lock_irqsave(&priv->tx_lock, flags);
1243		spin_lock(&priv->lock);
1244		if (ret) {
1245			neigh = p->neigh;
1246			if (neigh) {
1247				neigh->cm = NULL;
1248				list_del(&neigh->list);
1249				if (neigh->ah)
1250					ipoib_put_ah(neigh->ah);
1251				ipoib_neigh_free(dev, neigh);
1252			}
1253			list_del(&p->list);
1254			kfree(p);
1255		}
1256	}
1257	spin_unlock(&priv->lock);
1258	spin_unlock_irqrestore(&priv->tx_lock, flags);
1259}
1260
1261static void ipoib_cm_tx_reap(struct work_struct *work)
1262{
1263	struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
1264						   cm.reap_task);
1265	struct ipoib_cm_tx *p;
1266
1267	spin_lock_irq(&priv->tx_lock);
1268	spin_lock(&priv->lock);
1269	while (!list_empty(&priv->cm.reap_list)) {
1270		p = list_entry(priv->cm.reap_list.next, typeof(*p), list);
1271		list_del(&p->list);
1272		spin_unlock(&priv->lock);
1273		spin_unlock_irq(&priv->tx_lock);
1274		ipoib_cm_tx_destroy(p);
1275		spin_lock_irq(&priv->tx_lock);
1276		spin_lock(&priv->lock);
1277	}
1278	spin_unlock(&priv->lock);
1279	spin_unlock_irq(&priv->tx_lock);
1280}
1281
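/*
 * Oversized packets queued by ipoib_cm_skb_too_long() are handled here:
 * an ICMP "fragmentation needed" or ICMPv6 "packet too big" message is
 * generated for each one from workqueue context and the packet is then
 * dropped.
 */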
1282static void ipoib_cm_skb_reap(struct work_struct *work)
1283{
1284	struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
1285						   cm.skb_task);
1286	struct sk_buff *skb;
1287
1288	unsigned mtu = priv->mcast_mtu;
1289
1290	spin_lock_irq(&priv->tx_lock);
1291	spin_lock(&priv->lock);
1292	while ((skb = skb_dequeue(&priv->cm.skb_queue))) {
1293		spin_unlock(&priv->lock);
1294		spin_unlock_irq(&priv->tx_lock);
1295		if (skb->protocol == htons(ETH_P_IP))
1296			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
1297#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
1298		else if (skb->protocol == htons(ETH_P_IPV6))
1299			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, priv->dev);
1300#endif
1301		dev_kfree_skb_any(skb);
1302		spin_lock_irq(&priv->tx_lock);
1303		spin_lock(&priv->lock);
1304	}
1305	spin_unlock(&priv->lock);
1306	spin_unlock_irq(&priv->tx_lock);
1307}
1308
1309void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb,
1310			   unsigned int mtu)
1311{
1312	struct ipoib_dev_priv *priv = netdev_priv(dev);
1313	int e = skb_queue_empty(&priv->cm.skb_queue);
1314
1315	if (skb->dst)
1316		skb->dst->ops->update_pmtu(skb->dst, mtu);
1317
1318	skb_queue_tail(&priv->cm.skb_queue, skb);
1319	if (e)
1320		queue_work(ipoib_workqueue, &priv->cm.skb_task);
1321}
1322
1323static void ipoib_cm_rx_reap(struct work_struct *work)
1324{
1325	ipoib_cm_free_rx_reap_list(container_of(work, struct ipoib_dev_priv,
1326						cm.rx_reap_task)->dev);
1327}
1328
1329static void ipoib_cm_stale_task(struct work_struct *work)
1330{
1331	struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
1332						   cm.stale_task.work);
1333	struct ipoib_cm_rx *p;
1334	int ret;
1335
1336	spin_lock_irq(&priv->lock);
1337	while (!list_empty(&priv->cm.passive_ids)) {
1338		/* The list is sorted by LRU: start from the tail and stop
1339		 * when we see a recently used entry. */
1340		p = list_entry(priv->cm.passive_ids.prev, typeof(*p), list);
1341		if (time_before_eq(jiffies, p->jiffies + IPOIB_CM_RX_TIMEOUT))
1342			break;
1343		list_move(&p->list, &priv->cm.rx_error_list);
1344		p->state = IPOIB_CM_RX_ERROR;
1345		spin_unlock_irq(&priv->lock);
1346		ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE);
1347		if (ret)
1348			ipoib_warn(priv, "unable to move qp to error state: %d\n", ret);
1349		spin_lock_irq(&priv->lock);
1350	}
1351
1352	if (!list_empty(&priv->cm.passive_ids))
1353		queue_delayed_work(ipoib_workqueue,
1354				   &priv->cm.stale_task, IPOIB_CM_RX_DELAY);
1355	spin_unlock_irq(&priv->lock);
1356}
1357
1358
1359static ssize_t show_mode(struct device *d, struct device_attribute *attr,
1360			 char *buf)
1361{
1362	struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(d));
1363
1364	if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags))
1365		return sprintf(buf, "connected\n");
1366	else
1367		return sprintf(buf, "datagram\n");
1368}
1369
1370static ssize_t set_mode(struct device *d, struct device_attribute *attr,
1371			const char *buf, size_t count)
1372{
1373	struct net_device *dev = to_net_dev(d);
1374	struct ipoib_dev_priv *priv = netdev_priv(dev);
1375
1376	/* flush paths if we switch modes so that connections are restarted */
1377	if (IPOIB_CM_SUPPORTED(dev->dev_addr) && !strcmp(buf, "connected\n")) {
1378		set_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
1379		ipoib_warn(priv, "enabling connected mode "
1380			   "will cause multicast packet drops\n");
1381		ipoib_flush_paths(dev);
1382		return count;
1383	}
1384
1385	if (!strcmp(buf, "datagram\n")) {
1386		clear_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
1387		dev->mtu = min(priv->mcast_mtu, dev->mtu);
1388		ipoib_flush_paths(dev);
1389		return count;
1390	}
1391
1392	return -EINVAL;
1393}
1394
1395static DEVICE_ATTR(mode, S_IWUSR | S_IRUGO, show_mode, set_mode);
1396
1397int ipoib_cm_add_mode_attr(struct net_device *dev)
1398{
1399	return device_create_file(&dev->dev, &dev_attr_mode);
1400}
1401
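/*
 * If the HCA does not implement SRQs (ib_create_srq() returns -ENOSYS),
 * priv->cm.srq is left NULL and the driver falls back to per-connection
 * receive rings, limited by the max_nonsrq_conn_qp module parameter.
 */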
1402static void ipoib_cm_create_srq(struct net_device *dev, int max_sge)
1403{
1404	struct ipoib_dev_priv *priv = netdev_priv(dev);
1405	struct ib_srq_init_attr srq_init_attr = {
1406		.attr = {
1407			.max_wr  = ipoib_recvq_size,
1408			.max_sge = max_sge
1409		}
1410	};
1411
1412	priv->cm.srq = ib_create_srq(priv->pd, &srq_init_attr);
1413	if (IS_ERR(priv->cm.srq)) {
1414		if (PTR_ERR(priv->cm.srq) != -ENOSYS)
1415			printk(KERN_WARNING "%s: failed to allocate SRQ, error %ld\n",
1416			       priv->ca->name, PTR_ERR(priv->cm.srq));
1417		priv->cm.srq = NULL;
1418		return;
1419	}
1420
1421	priv->cm.srq_ring = kzalloc(ipoib_recvq_size * sizeof *priv->cm.srq_ring,
1422				    GFP_KERNEL);
1423	if (!priv->cm.srq_ring) {
1424		printk(KERN_WARNING "%s: failed to allocate CM SRQ ring (%d entries)\n",
1425		       priv->ca->name, ipoib_recvq_size);
1426		ib_destroy_srq(priv->cm.srq);
1427		priv->cm.srq = NULL;
1428	}
1429}
1430
1431int ipoib_cm_dev_init(struct net_device *dev)
1432{
1433	struct ipoib_dev_priv *priv = netdev_priv(dev);
1434	int i, ret;
1435	struct ib_device_attr attr;
1436
1437	INIT_LIST_HEAD(&priv->cm.passive_ids);
1438	INIT_LIST_HEAD(&priv->cm.reap_list);
1439	INIT_LIST_HEAD(&priv->cm.start_list);
1440	INIT_LIST_HEAD(&priv->cm.rx_error_list);
1441	INIT_LIST_HEAD(&priv->cm.rx_flush_list);
1442	INIT_LIST_HEAD(&priv->cm.rx_drain_list);
1443	INIT_LIST_HEAD(&priv->cm.rx_reap_list);
1444	INIT_WORK(&priv->cm.start_task, ipoib_cm_tx_start);
1445	INIT_WORK(&priv->cm.reap_task, ipoib_cm_tx_reap);
1446	INIT_WORK(&priv->cm.skb_task, ipoib_cm_skb_reap);
1447	INIT_WORK(&priv->cm.rx_reap_task, ipoib_cm_rx_reap);
1448	INIT_DELAYED_WORK(&priv->cm.stale_task, ipoib_cm_stale_task);
1449
1450	skb_queue_head_init(&priv->cm.skb_queue);
1451
1452	ret = ib_query_device(priv->ca, &attr);
1453	if (ret) {
1454		printk(KERN_WARNING "ib_query_device() failed with %d\n", ret);
1455		return ret;
1456	}
1457
1458	ipoib_dbg(priv, "max_srq_sge=%d\n", attr.max_srq_sge);
1459
1460	attr.max_srq_sge = min_t(int, IPOIB_CM_RX_SG, attr.max_srq_sge);
1461	ipoib_cm_create_srq(dev, attr.max_srq_sge);
1462	if (ipoib_cm_has_srq(dev)) {
1463		priv->cm.max_cm_mtu = attr.max_srq_sge * PAGE_SIZE - 0x10;
1464		priv->cm.num_frags  = attr.max_srq_sge;
1465		ipoib_dbg(priv, "max_cm_mtu = 0x%x, num_frags=%d\n",
1466			  priv->cm.max_cm_mtu, priv->cm.num_frags);
1467	} else {
1468		priv->cm.max_cm_mtu = IPOIB_CM_MTU;
1469		priv->cm.num_frags  = IPOIB_CM_RX_SG;
1470	}
1471
1472	for (i = 0; i < priv->cm.num_frags; ++i)
1473		priv->cm.rx_sge[i].lkey	= priv->mr->lkey;
1474
1475	priv->cm.rx_sge[0].length = IPOIB_CM_HEAD_SIZE;
1476	for (i = 1; i < priv->cm.num_frags; ++i)
1477		priv->cm.rx_sge[i].length = PAGE_SIZE;
1478	priv->cm.rx_wr.next = NULL;
1479	priv->cm.rx_wr.sg_list = priv->cm.rx_sge;
1480	priv->cm.rx_wr.num_sge = priv->cm.num_frags;
1481
1482	if (ipoib_cm_has_srq(dev)) {
1483		for (i = 0; i < ipoib_recvq_size; ++i) {
1484			if (!ipoib_cm_alloc_rx_skb(dev, priv->cm.srq_ring, i,
1485						   priv->cm.num_frags - 1,
1486						   priv->cm.srq_ring[i].mapping)) {
1487				ipoib_warn(priv, "failed to allocate "
1488					   "receive buffer %d\n", i);
1489				ipoib_cm_dev_cleanup(dev);
1490				return -ENOMEM;
1491			}
1492
1493			if (ipoib_cm_post_receive_srq(dev, i)) {
1494				ipoib_warn(priv, "ipoib_cm_post_receive_srq "
1495					   "failed for buf %d\n", i);
1496				ipoib_cm_dev_cleanup(dev);
1497				return -EIO;
1498			}
1499		}
1500	}
1501
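	/*
	 * Flag the first byte of the hardware address so that
	 * IPOIB_CM_SUPPORTED() checks on this address see that connected
	 * mode (RC) is supported.
	 */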
1502	priv->dev->dev_addr[0] = IPOIB_FLAGS_RC;
1503	return 0;
1504}
1505
1506void ipoib_cm_dev_cleanup(struct net_device *dev)
1507{
1508	struct ipoib_dev_priv *priv = netdev_priv(dev);
1509	int ret;
1510
1511	if (!priv->cm.srq)
1512		return;
1513
1514	ipoib_dbg(priv, "Cleanup ipoib connected mode.\n");
1515
1516	ret = ib_destroy_srq(priv->cm.srq);
1517	if (ret)
1518		ipoib_warn(priv, "ib_destroy_srq failed: %d\n", ret);
1519
1520	priv->cm.srq = NULL;
1521	if (!priv->cm.srq_ring)
1522		return;
1523
1524	ipoib_cm_free_rx_ring(dev, priv->cm.srq_ring);
1525	priv->cm.srq_ring = NULL;
1526}
1527