/*
 * SN Platform GRU Driver
 *
 *              KERNEL SERVICES THAT USE THE GRU
 *
 *  Copyright (c) 2008 Silicon Graphics, Inc.  All Rights Reserved.
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
 */

#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/spinlock.h>
#include <linux/device.h>
#include <linux/miscdevice.h>
#include <linux/proc_fs.h>
#include <linux/interrupt.h>
#include <linux/uaccess.h>
#include <linux/delay.h>
#include <linux/export.h>
#include <asm/io_apic.h>
#include "gru.h"
#include "grulib.h"
#include "grutables.h"
#include "grukservices.h"
#include "gru_instructions.h"
#include <asm/uv/uv_hub.h>

/*
 * Kernel GRU Usage
 *
 * The following is an interim algorithm for management of kernel GRU
 * resources. This will likely be replaced when we better understand the
 * kernel/user requirements.
 *
 * Blade percpu resources reserved for kernel use. These resources are
 * reserved whenever the kernel context for the blade is loaded. Note
 * that the kernel context is not guaranteed to be always available. It is
 * loaded on demand & can be stolen by a user if the user demand exceeds the
 * kernel demand. The kernel can always reload the kernel context but
 * a SLEEP may be required!
 *
 * Async Overview:
 *
 * 	Each blade has one "kernel context" that owns GRU kernel resources
 * 	located on the blade. Kernel drivers use GRU resources in this context
 * 	for sending messages, zeroing memory, etc.
 *
 * 	The kernel context is dynamically loaded on demand. If it is not in
 * 	use by the kernel, the kernel context can be unloaded & given to a user.
 * 	The kernel context will be reloaded when needed. This may require that
 * 	a context be stolen from a user.
 * 		NOTE: frequent unloading/reloading of the kernel context is
 * 		expensive. We are depending on batch schedulers, cpusets, sane
 * 		drivers or some other mechanism to prevent the need for frequent
 *	 	stealing/reloading.
 *
 * 	The kernel context consists of two parts:
 * 		- 1 CB & a few DSRs that are reserved for each cpu on the blade.
 * 		  Each cpu has its own private resources & does not share them
 * 		  with other cpus. These resources are used serially, i.e.,
 * 		  locked, used & unlocked on each call to a function in
 * 		  grukservices.
 * 		  	(Now that we have dynamic loading of kernel contexts, I
 * 		  	 may rethink this & allow sharing between cpus....)
 *
 *		- Additional resources can be reserved long term & used directly
 *		  by UV drivers located in the kernel. Drivers using these GRU
 *		  resources can use asynchronous GRU instructions that send
 *		  interrupts on completion.
 *		  	- these resources must be explicitly locked/unlocked
 *		  	- locked resources prevent (obviously) the kernel
 *		  	  context from being unloaded.
 *			- drivers using these resources directly issue their own
 *			  GRU instructions and must wait for/check completion.
 *
 * 		  When these resources are reserved, the caller can optionally
 * 		  associate a wait_queue with the resources and use asynchronous
 * 		  GRU instructions. When an async GRU instruction completes, the
 * 		  driver will do a wakeup on the event.
 *
 */
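/*
 * Illustrative sketch (not part of the driver) of how a kernel driver might
 * use the async interface below; it mirrors quicktest2(). "blade_id" and
 * "buf" are hypothetical, and the completion is signalled by the GRU
 * interrupt handler elsewhere in the driver:
 *
 *	static DECLARE_COMPLETION(cmp);
 *	unsigned long han;
 *	void *cb;
 *
 *	han = gru_reserve_async_resources(blade_id, 4, 0, &cmp);
 *	if (!han)
 *		return -EBUSY;
 *	gru_lock_async_resource(han, &cb, NULL);
 *	gru_vset(cb, uv_gpa(buf), 0, XTYPE_DW, 4, 1, IMA_INTERRUPT);
 *	gru_wait_async_cbr(han);
 *	gru_unlock_async_resource(han);
 *	gru_release_async_resources(han);
 */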


#define ASYNC_HAN_TO_BID(h)	((h) - 1)
#define ASYNC_BID_TO_HAN(b)	((b) + 1)
#define ASYNC_HAN_TO_BS(h)	gru_base[ASYNC_HAN_TO_BID(h)]

#define GRU_NUM_KERNEL_CBR	1
#define GRU_NUM_KERNEL_DSR_BYTES 256
#define GRU_NUM_KERNEL_DSR_CL	(GRU_NUM_KERNEL_DSR_BYTES /		\
					GRU_CACHE_LINE_BYTES)

/* GRU instruction attributes for all instructions */
#define IMA			IMA_CB_DELAY

/* GRU cacheline size is always 64 bytes - even on arches with 128 byte lines */
#define __gru_cacheline_aligned__                               \
	__attribute__((__aligned__(GRU_CACHE_LINE_BYTES)))

#define MAGIC	0x1234567887654321UL

/* Default retry count for GRU errors on kernel instructions */
#define EXCEPTION_RETRY_LIMIT	3

/* Status of message queue sections */
#define MQS_EMPTY		0
#define MQS_FULL		1
#define MQS_NOOP		2

/*----------------- RESOURCE MANAGEMENT -------------------------------------*/
/* optimized for x86_64 */
struct message_queue {
	union gru_mesqhead	head __gru_cacheline_aligned__;	/* CL 0 */
	int			qlines;				/* DW 1 */
	long			hstatus[2];
	void			*next __gru_cacheline_aligned__;/* CL 1 */
	void			*limit;
	void			*start;
	void			*start2;
	char			data ____cacheline_aligned;	/* CL 2 */
};

/* First word in every message - used by mesq interface */
struct message_header {
	char	present;
	char	present2;
	char	lines;
	char	fill;
};

#define HSTATUS(mq, h)	((mq) + offsetof(struct message_queue, hstatus[h]))

/*
 * Reload the blade's kernel context into a GRU chiplet. Called holding
 * the bs_kgts_sema for READ. Will steal user contexts if necessary.
 */
static void gru_load_kernel_context(struct gru_blade_state *bs, int blade_id)
{
	struct gru_state *gru;
	struct gru_thread_state *kgts;
	void *vaddr;
	int ctxnum, ncpus;

	up_read(&bs->bs_kgts_sema);
	down_write(&bs->bs_kgts_sema);

	if (!bs->bs_kgts) {
		bs->bs_kgts = gru_alloc_gts(NULL, 0, 0, 0, 0, 0);
		bs->bs_kgts->ts_user_blade_id = blade_id;
	}
	kgts = bs->bs_kgts;

	if (!kgts->ts_gru) {
		STAT(load_kernel_context);
		ncpus = uv_blade_nr_possible_cpus(blade_id);
		kgts->ts_cbr_au_count = GRU_CB_COUNT_TO_AU(
			GRU_NUM_KERNEL_CBR * ncpus + bs->bs_async_cbrs);
		kgts->ts_dsr_au_count = GRU_DS_BYTES_TO_AU(
			GRU_NUM_KERNEL_DSR_BYTES * ncpus +
				bs->bs_async_dsr_bytes);
		while (!gru_assign_gru_context(kgts)) {
			msleep(1);
			gru_steal_context(kgts);
		}
		gru_load_context(kgts);
		gru = bs->bs_kgts->ts_gru;
		vaddr = gru->gs_gru_base_vaddr;
		ctxnum = kgts->ts_ctxnum;
		bs->kernel_cb = get_gseg_base_address_cb(vaddr, ctxnum, 0);
		bs->kernel_dsr = get_gseg_base_address_ds(vaddr, ctxnum, 0);
	}
	downgrade_write(&bs->bs_kgts_sema);
}

/*
 * Free all kernel contexts that are not currently in use.
 *   Returns 0 if all freed, else the number of contexts still in use.
 */
static int gru_free_kernel_contexts(void)
{
	struct gru_blade_state *bs;
	struct gru_thread_state *kgts;
	int bid, ret = 0;

	for (bid = 0; bid < GRU_MAX_BLADES; bid++) {
		bs = gru_base[bid];
		if (!bs)
			continue;

		/* Ignore busy contexts. Don't want to block here.  */
		if (down_write_trylock(&bs->bs_kgts_sema)) {
			kgts = bs->bs_kgts;
			if (kgts && kgts->ts_gru)
				gru_unload_context(kgts, 0);
			bs->bs_kgts = NULL;
			up_write(&bs->bs_kgts_sema);
			kfree(kgts);
		} else {
			ret++;
		}
	}
	return ret;
}

/*
 * Lock & load the kernel context for the specified blade.
 */
static struct gru_blade_state *gru_lock_kernel_context(int blade_id)
{
	struct gru_blade_state *bs;
	int bid;

	STAT(lock_kernel_context);
again:
	bid = blade_id < 0 ? uv_numa_blade_id() : blade_id;
	bs = gru_base[bid];

	/* Handle the case where migration occurred while waiting for the sema */
	down_read(&bs->bs_kgts_sema);
	if (blade_id < 0 && bid != uv_numa_blade_id()) {
		up_read(&bs->bs_kgts_sema);
		goto again;
	}
	if (!bs->bs_kgts || !bs->bs_kgts->ts_gru)
		gru_load_kernel_context(bs, bid);
	return bs;
}

/*
 * Unlock the kernel context for the specified blade. Context is not
 * unloaded but may be stolen before next use.
 */
static void gru_unlock_kernel_context(int blade_id)
{
	struct gru_blade_state *bs;

	bs = gru_base[blade_id];
	up_read(&bs->bs_kgts_sema);
	STAT(unlock_kernel_context);
}

/*
 * Reserve & get pointers to the DSR/CBRs reserved for the current cpu.
 * 	- returns with preemption disabled
 */
static int gru_get_cpu_resources(int dsr_bytes, void **cb, void **dsr)
{
	struct gru_blade_state *bs;
	int lcpu;

	BUG_ON(dsr_bytes > GRU_NUM_KERNEL_DSR_BYTES);
	preempt_disable();
	bs = gru_lock_kernel_context(-1);
	lcpu = uv_blade_processor_id();
	*cb = bs->kernel_cb + lcpu * GRU_HANDLE_STRIDE;
	*dsr = bs->kernel_dsr + lcpu * GRU_NUM_KERNEL_DSR_BYTES;
	return 0;
}

/*
 * Free the current cpu's reserved DSR/CBR resources.
 */
static void gru_free_cpu_resources(void *cb, void *dsr)
{
	gru_unlock_kernel_context(uv_numa_blade_id());
	preempt_enable();
}

/*
 * Reserve GRU resources to be used asynchronously.
 *   Note: currently supports only 1 reservation per blade.
 *
 * 	input:
 * 		blade_id  - blade on which resources should be reserved
 * 		cbrs	  - number of CBRs
 * 		dsr_bytes - number of DSR bytes needed
 *	output:
 *		handle to identify resource
 *		(0 = async resources already reserved)
 */
unsigned long gru_reserve_async_resources(int blade_id, int cbrs, int dsr_bytes,
			struct completion *cmp)
{
	struct gru_blade_state *bs;
	struct gru_thread_state *kgts;
	int ret = 0;

	bs = gru_base[blade_id];

	down_write(&bs->bs_kgts_sema);

	/* Verify no resources already reserved */
	if (bs->bs_async_dsr_bytes + bs->bs_async_cbrs)
		goto done;
	bs->bs_async_dsr_bytes = dsr_bytes;
	bs->bs_async_cbrs = cbrs;
	bs->bs_async_wq = cmp;
	kgts = bs->bs_kgts;

	/* Resources changed. Unload context if already loaded */
	if (kgts && kgts->ts_gru)
		gru_unload_context(kgts, 0);
	ret = ASYNC_BID_TO_HAN(blade_id);

done:
	up_write(&bs->bs_kgts_sema);
	return ret;
}

/*
 * Release async resources previously reserved.
 *
 *	input:
 *		han - handle to identify resources
 */
void gru_release_async_resources(unsigned long han)
{
	struct gru_blade_state *bs = ASYNC_HAN_TO_BS(han);

	down_write(&bs->bs_kgts_sema);
	bs->bs_async_dsr_bytes = 0;
	bs->bs_async_cbrs = 0;
	bs->bs_async_wq = NULL;
	up_write(&bs->bs_kgts_sema);
}

/*
 * Wait for async GRU instructions to complete.
 *
 *	input:
 *		han - handle to identify resources
 */
void gru_wait_async_cbr(unsigned long han)
{
	struct gru_blade_state *bs = ASYNC_HAN_TO_BS(han);

	wait_for_completion(bs->bs_async_wq);
	mb();
}

/*
 * Lock previously reserved async GRU resources
 *
 *	input:
 *		han - handle to identify resources
 *	output:
 *		cb  - pointer to first CBR
 *		dsr - pointer to first DSR
 */
void gru_lock_async_resource(unsigned long han, void **cb, void **dsr)
{
	struct gru_blade_state *bs = ASYNC_HAN_TO_BS(han);
	int blade_id = ASYNC_HAN_TO_BID(han);
	int ncpus;

	gru_lock_kernel_context(blade_id);
	ncpus = uv_blade_nr_possible_cpus(blade_id);
	if (cb)
		*cb = bs->kernel_cb + ncpus * GRU_HANDLE_STRIDE;
	if (dsr)
		*dsr = bs->kernel_dsr + ncpus * GRU_NUM_KERNEL_DSR_BYTES;
}

/*
 * Unlock previously reserved async GRU resources
 *
 *	input:
 *		han - handle to identify resources
 */
void gru_unlock_async_resource(unsigned long han)
{
	int blade_id = ASYNC_HAN_TO_BID(han);

	gru_unlock_kernel_context(blade_id);
}

/*----------------------------------------------------------------------*/
int gru_get_cb_exception_detail(void *cb,
		struct control_block_extended_exc_detail *excdet)
{
	struct gru_control_block_extended *cbe;
	struct gru_thread_state *kgts = NULL;
	unsigned long off;
	int cbrnum, bid;

	/*
	 * Locate kgts for cb. This algorithm is SLOW but
	 * this function is rarely called (i.e., almost never).
	 * Performance does not matter.
	 */
	for_each_possible_blade(bid) {
		if (!gru_base[bid])
			break;
		kgts = gru_base[bid]->bs_kgts;
		if (!kgts || !kgts->ts_gru)
			continue;
		off = cb - kgts->ts_gru->gs_gru_base_vaddr;
		if (off < GRU_SIZE)
			break;
		kgts = NULL;
	}
	BUG_ON(!kgts);
	cbrnum = thread_cbr_number(kgts, get_cb_number(cb));
	cbe = get_cbe(GRUBASE(cb), cbrnum);
	gru_flush_cache(cbe);	/* CBE not coherent */
	sync_core();
	excdet->opc = cbe->opccpy;
	excdet->exopc = cbe->exopccpy;
	excdet->ecause = cbe->ecause;
	excdet->exceptdet0 = cbe->idef1upd;
	excdet->exceptdet1 = cbe->idef3upd;
	gru_flush_cache(cbe);
	return 0;
}

char *gru_get_cb_exception_detail_str(int ret, void *cb,
				      char *buf, int size)
{
	struct gru_control_block_status *gen = (void *)cb;
	struct control_block_extended_exc_detail excdet;

	if (ret > 0 && gen->istatus == CBS_EXCEPTION) {
		gru_get_cb_exception_detail(cb, &excdet);
		snprintf(buf, size,
			"GRU:%d exception: cb %p, opc %d, exopc %d, ecause 0x%x, "
			"excdet0 0x%lx, excdet1 0x%x", smp_processor_id(),
			gen, excdet.opc, excdet.exopc, excdet.ecause,
			excdet.exceptdet0, excdet.exceptdet1);
	} else {
		snprintf(buf, size, "No exception");
	}
	return buf;
}

static int gru_wait_idle_or_exception(struct gru_control_block_status *gen)
{
	while (gen->istatus >= CBS_ACTIVE) {
		cpu_relax();
		barrier();
	}
	return gen->istatus;
}

static int gru_retry_exception(void *cb)
{
	struct gru_control_block_status *gen = (void *)cb;
	struct control_block_extended_exc_detail excdet;
	int retry = EXCEPTION_RETRY_LIMIT;

	while (1)  {
		if (gru_wait_idle_or_exception(gen) == CBS_IDLE)
			return CBS_IDLE;
		if (gru_get_cb_message_queue_substatus(cb))
			return CBS_EXCEPTION;
		gru_get_cb_exception_detail(cb, &excdet);
		if ((excdet.ecause & ~EXCEPTION_RETRY_BITS) ||
				(excdet.cbrexecstatus & CBR_EXS_ABORT_OCC))
			break;
		if (retry-- == 0)
			break;
		gen->icmd = 1;
		gru_flush_cache(gen);
	}
	return CBS_EXCEPTION;
}

int gru_check_status_proc(void *cb)
{
	struct gru_control_block_status *gen = (void *)cb;
	int ret;

	ret = gen->istatus;
	if (ret == CBS_EXCEPTION)
		ret = gru_retry_exception(cb);
	rmb();
	return ret;
}

int gru_wait_proc(void *cb)
{
	struct gru_control_block_status *gen = (void *)cb;
	int ret;

	ret = gru_wait_idle_or_exception(gen);
	if (ret == CBS_EXCEPTION)
		ret = gru_retry_exception(cb);
	rmb();
	return ret;
}

void gru_abort(int ret, void *cb, char *str)
{
	char buf[GRU_EXC_STR_SIZE];

	panic("GRU FATAL ERROR: %s - %s\n", str,
	      gru_get_cb_exception_detail_str(ret, cb, buf, sizeof(buf)));
}

void gru_wait_abort_proc(void *cb)
{
	int ret;

	ret = gru_wait_proc(cb);
	if (ret)
		gru_abort(ret, cb, "gru_wait_abort");
}


/*------------------------------ MESSAGE QUEUES -----------------------------*/

/* Internal status. These are NOT returned to the user. */
#define MQIE_AGAIN		-1	/* try again */


/*
 * Save/restore the "present" flag that is in the second line of 2-line
 * messages
 */
static inline int get_present2(void *p)
{
	struct message_header *mhdr = p + GRU_CACHE_LINE_BYTES;
	return mhdr->present;
}

static inline void restore_present2(void *p, int val)
{
	struct message_header *mhdr = p + GRU_CACHE_LINE_BYTES;
	mhdr->present = val;
}

/*
 * Create a message queue.
 * 	qlines - message queue size in cache lines. Includes 2-line header.
 */
int gru_create_message_queue(struct gru_message_queue_desc *mqd,
		void *p, unsigned int bytes, int nasid, int vector, int apicid)
{
	struct message_queue *mq = p;
	unsigned int qlines;

	qlines = bytes / GRU_CACHE_LINE_BYTES - 2;
	memset(mq, 0, bytes);
	mq->start = &mq->data;
	mq->start2 = &mq->data + (qlines / 2 - 1) * GRU_CACHE_LINE_BYTES;
	mq->next = &mq->data;
	mq->limit = &mq->data + (qlines - 2) * GRU_CACHE_LINE_BYTES;
	mq->qlines = qlines;
	mq->hstatus[0] = 0;
	mq->hstatus[1] = 1;
	mq->head = gru_mesq_head(2, qlines / 2 + 1);
	mqd->mq = mq;
	mqd->mq_gpa = uv_gpa(mq);
	mqd->qlines = qlines;
	mqd->interrupt_pnode = nasid >> 1;
	mqd->interrupt_vector = vector;
	mqd->interrupt_apicid = apicid;
	return 0;
}
EXPORT_SYMBOL_GPL(gru_create_message_queue);
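
/*
 * Illustrative sketch (not part of the driver), modeled on quicktest1():
 * carve a cacheline-aligned message queue out of kernel memory and describe
 * it with a gru_message_queue_desc. The alignment expression matches the
 * ALIGNUP() helper used by the quicktests below. Passing 0 for nasid,
 * vector and apicid leaves the queue without a delivery interrupt:
 *
 *	struct gru_message_queue_desc mqd;
 *	void *p, *mq;
 *
 *	p = kmalloc(4096, GFP_KERNEL);
 *	if (!p)
 *		return -ENOMEM;
 *	mq = (void *)(((unsigned long)p + 1023) & ~1023UL);
 *	gru_create_message_queue(&mqd, mq, 8 * GRU_CACHE_LINE_BYTES, 0, 0, 0);
 */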

/*
 * Send a NOOP message to a message queue
 * 	Returns:
 * 		 0 - if queue is full after the send. This is the normal case
 * 		     but various races can change this.
 *		-1 - if mesq sent successfully but queue not full
 *		>0 - unexpected error. MQE_xxx returned
 */
static int send_noop_message(void *cb, struct gru_message_queue_desc *mqd,
				void *mesg)
{
	const struct message_header noop_header = {
					.present = MQS_NOOP, .lines = 1};
	unsigned long m;
	int substatus, ret;
	struct message_header save_mhdr, *mhdr = mesg;

	STAT(mesq_noop);
	save_mhdr = *mhdr;
	*mhdr = noop_header;
	gru_mesq(cb, mqd->mq_gpa, gru_get_tri(mhdr), 1, IMA);
	ret = gru_wait(cb);

	if (ret) {
		substatus = gru_get_cb_message_queue_substatus(cb);
		switch (substatus) {
		case CBSS_NO_ERROR:
			STAT(mesq_noop_unexpected_error);
			ret = MQE_UNEXPECTED_CB_ERR;
			break;
		case CBSS_LB_OVERFLOWED:
			STAT(mesq_noop_lb_overflow);
			ret = MQE_CONGESTION;
			break;
		case CBSS_QLIMIT_REACHED:
			STAT(mesq_noop_qlimit_reached);
			ret = 0;
			break;
		case CBSS_AMO_NACKED:
			STAT(mesq_noop_amo_nacked);
			ret = MQE_CONGESTION;
			break;
		case CBSS_PUT_NACKED:
			STAT(mesq_noop_put_nacked);
			m = mqd->mq_gpa + (gru_get_amo_value_head(cb) << 6);
			gru_vstore(cb, m, gru_get_tri(mesg), XTYPE_CL, 1, 1,
						IMA);
			if (gru_wait(cb) == CBS_IDLE)
				ret = MQIE_AGAIN;
			else
				ret = MQE_UNEXPECTED_CB_ERR;
			break;
		case CBSS_PAGE_OVERFLOW:
			STAT(mesq_noop_page_overflow);
			/* fallthru */
		default:
			BUG();
		}
	}
	*mhdr = save_mhdr;
	return ret;
}

/*
 * Handle a gru_mesq full.
 */
static int send_message_queue_full(void *cb, struct gru_message_queue_desc *mqd,
				void *mesg, int lines)
{
	union gru_mesqhead mqh;
	unsigned int limit, head;
	unsigned long avalue;
	int half, qlines;

	/* Determine if switching to first/second half of q */
	avalue = gru_get_amo_value(cb);
	head = gru_get_amo_value_head(cb);
	limit = gru_get_amo_value_limit(cb);

	qlines = mqd->qlines;
	half = (limit != qlines);

	if (half)
		mqh = gru_mesq_head(qlines / 2 + 1, qlines);
	else
		mqh = gru_mesq_head(2, qlines / 2 + 1);

	/* Try to get lock for switching head pointer */
	gru_gamir(cb, EOP_IR_CLR, HSTATUS(mqd->mq_gpa, half), XTYPE_DW, IMA);
	if (gru_wait(cb) != CBS_IDLE)
		goto cberr;
	if (!gru_get_amo_value(cb)) {
		STAT(mesq_qf_locked);
		return MQE_QUEUE_FULL;
	}

	/* Got the lock. Send optional NOP if queue not full. */
	if (head != limit) {
		if (send_noop_message(cb, mqd, mesg)) {
			gru_gamir(cb, EOP_IR_INC, HSTATUS(mqd->mq_gpa, half),
					XTYPE_DW, IMA);
			if (gru_wait(cb) != CBS_IDLE)
				goto cberr;
			STAT(mesq_qf_noop_not_full);
			return MQIE_AGAIN;
		}
		avalue++;
	}

	/* Then flip queuehead to other half of queue. */
	gru_gamer(cb, EOP_ERR_CSWAP, mqd->mq_gpa, XTYPE_DW, mqh.val, avalue,
							IMA);
	if (gru_wait(cb) != CBS_IDLE)
		goto cberr;

	/* If swapping the queue head was not successful, clear the hstatus lock */
	if (gru_get_amo_value(cb) != avalue) {
		STAT(mesq_qf_switch_head_failed);
		gru_gamir(cb, EOP_IR_INC, HSTATUS(mqd->mq_gpa, half), XTYPE_DW,
							IMA);
		if (gru_wait(cb) != CBS_IDLE)
			goto cberr;
	}
	return MQIE_AGAIN;
cberr:
	STAT(mesq_qf_unexpected_error);
	return MQE_UNEXPECTED_CB_ERR;
}

/*
 * Handle a PUT failure. Note: if the message was a 2-line message, one of
 * the lines might have been written successfully. Before resending the
 * message, "present" must be cleared in BOTH lines to prevent the receiver
 * from prematurely seeing the full message.
 */
static int send_message_put_nacked(void *cb, struct gru_message_queue_desc *mqd,
			void *mesg, int lines)
{
	unsigned long m, *val = mesg, gpa, save;
	int ret;

	m = mqd->mq_gpa + (gru_get_amo_value_head(cb) << 6);
	if (lines == 2) {
		gru_vset(cb, m, 0, XTYPE_CL, lines, 1, IMA);
		if (gru_wait(cb) != CBS_IDLE)
			return MQE_UNEXPECTED_CB_ERR;
	}
	gru_vstore(cb, m, gru_get_tri(mesg), XTYPE_CL, lines, 1, IMA);
	if (gru_wait(cb) != CBS_IDLE)
		return MQE_UNEXPECTED_CB_ERR;

	if (!mqd->interrupt_vector)
		return MQE_OK;

	/*
	 * Send a cross-partition interrupt to the SSI that contains the target
	 * message queue. Normally, the interrupt is automatically delivered by
	 * hardware but some error conditions require explicit delivery.
	 * Use the GRU to deliver the interrupt. Otherwise partition failures
	 * could cause unrecovered errors.
	 */
	gpa = uv_global_gru_mmr_address(mqd->interrupt_pnode, UVH_IPI_INT);
	save = *val;
	*val = uv_hub_ipi_value(mqd->interrupt_apicid, mqd->interrupt_vector,
				dest_Fixed);
	gru_vstore_phys(cb, gpa, gru_get_tri(mesg), IAA_REGISTER, IMA);
	ret = gru_wait(cb);
	*val = save;
	if (ret != CBS_IDLE)
		return MQE_UNEXPECTED_CB_ERR;
	return MQE_OK;
}

/*
 * Handle a gru_mesq failure. Some of these failures are software recoverable
 * or retryable.
 */
static int send_message_failure(void *cb, struct gru_message_queue_desc *mqd,
				void *mesg, int lines)
{
	int substatus, ret = 0;

	substatus = gru_get_cb_message_queue_substatus(cb);
	switch (substatus) {
	case CBSS_NO_ERROR:
		STAT(mesq_send_unexpected_error);
		ret = MQE_UNEXPECTED_CB_ERR;
		break;
	case CBSS_LB_OVERFLOWED:
		STAT(mesq_send_lb_overflow);
		ret = MQE_CONGESTION;
		break;
	case CBSS_QLIMIT_REACHED:
		STAT(mesq_send_qlimit_reached);
		ret = send_message_queue_full(cb, mqd, mesg, lines);
		break;
	case CBSS_AMO_NACKED:
		STAT(mesq_send_amo_nacked);
		ret = MQE_CONGESTION;
		break;
	case CBSS_PUT_NACKED:
		STAT(mesq_send_put_nacked);
		ret = send_message_put_nacked(cb, mqd, mesg, lines);
		break;
	case CBSS_PAGE_OVERFLOW:
		STAT(mesq_page_overflow);
		/* fallthru */
	default:
		BUG();
	}
	return ret;
}

/*
 * Send a message to a message queue
 * 	mqd	message queue descriptor
 * 	mesg	message. Must be a vaddr within a GSEG
 * 	bytes	message size (<= 2 CL)
 */
int gru_send_message_gpa(struct gru_message_queue_desc *mqd, void *mesg,
				unsigned int bytes)
{
	struct message_header *mhdr;
	void *cb;
	void *dsr;
	int istatus, clines, ret;

	STAT(mesq_send);
	BUG_ON(bytes < sizeof(int) || bytes > 2 * GRU_CACHE_LINE_BYTES);

	clines = DIV_ROUND_UP(bytes, GRU_CACHE_LINE_BYTES);
	if (gru_get_cpu_resources(bytes, &cb, &dsr))
		return MQE_BUG_NO_RESOURCES;
	memcpy(dsr, mesg, bytes);
	mhdr = dsr;
	mhdr->present = MQS_FULL;
	mhdr->lines = clines;
	if (clines == 2) {
		mhdr->present2 = get_present2(mhdr);
		restore_present2(mhdr, MQS_FULL);
	}

	do {
		ret = MQE_OK;
		gru_mesq(cb, mqd->mq_gpa, gru_get_tri(mhdr), clines, IMA);
		istatus = gru_wait(cb);
		if (istatus != CBS_IDLE)
			ret = send_message_failure(cb, mqd, dsr, clines);
	} while (ret == MQIE_AGAIN);
	gru_free_cpu_resources(cb, dsr);

	if (ret)
		STAT(mesq_send_failed);
	return ret;
}
EXPORT_SYMBOL_GPL(gru_send_message_gpa);
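
/*
 * Illustrative sketch (not part of the driver) of the send side, modeled
 * on quicktest1(). "mqd" and "mes" are hypothetical; callers typically
 * retry on MQE_CONGESTION and treat MQE_QUEUE_FULL as backpressure to be
 * retried later:
 *
 *	int ret;
 *
 *	do {
 *		ret = gru_send_message_gpa(&mqd, mes, sizeof(mes));
 *	} while (ret == MQE_CONGESTION);
 *	if (ret)
 *		return ret;
 */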

/*
 * Advance the receive pointer for the queue to the next message.
 */
void gru_free_message(struct gru_message_queue_desc *mqd, void *mesg)
{
	struct message_queue *mq = mqd->mq;
	struct message_header *mhdr = mq->next;
	void *next, *pnext;
	int half = -1;
	int lines = mhdr->lines;

	if (lines == 2)
		restore_present2(mhdr, MQS_EMPTY);
	mhdr->present = MQS_EMPTY;

	pnext = mq->next;
	next = pnext + GRU_CACHE_LINE_BYTES * lines;
	if (next == mq->limit) {
		next = mq->start;
		half = 1;
	} else if (pnext < mq->start2 && next >= mq->start2) {
		half = 0;
	}

	if (half >= 0)
		mq->hstatus[half] = 1;
	mq->next = next;
}
EXPORT_SYMBOL_GPL(gru_free_message);

/*
 * Get next message from message queue. Return NULL if no message
 * present. The user must call gru_free_message() to move to the next
 * message.
 * 	mqd	message queue descriptor
 */
void *gru_get_next_message(struct gru_message_queue_desc *mqd)
{
	struct message_queue *mq = mqd->mq;
	struct message_header *mhdr = mq->next;
	int present = mhdr->present;

	/* skip NOOP messages */
	while (present == MQS_NOOP) {
		gru_free_message(mqd, mhdr);
		mhdr = mq->next;
		present = mhdr->present;
	}

	/* Wait for both halves of 2 line messages */
	if (present == MQS_FULL && mhdr->lines == 2 &&
				get_present2(mhdr) == MQS_EMPTY)
		present = MQS_EMPTY;

	if (!present) {
		STAT(mesq_receive_none);
		return NULL;
	}

	if (mhdr->lines == 2)
		restore_present2(mhdr, mhdr->present2);

	STAT(mesq_receive);
	return mhdr;
}
EXPORT_SYMBOL_GPL(gru_get_next_message);
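
/*
 * Illustrative sketch (not part of the driver) of the receive side, modeled
 * on quicktest1(): poll for messages and release each slot after it has been
 * consumed. "mqd" and process_message() are hypothetical:
 *
 *	void *m;
 *
 *	while ((m = gru_get_next_message(&mqd)) != NULL) {
 *		process_message(m);
 *		gru_free_message(&mqd, m);
 *	}
 */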

/* ---------------------- GRU DATA COPY FUNCTIONS ---------------------------*/

/*
 * Load a DW from a global GPA. The GPA can be a memory or MMR address.
 */
int gru_read_gpa(unsigned long *value, unsigned long gpa)
{
	void *cb;
	void *dsr;
	int ret, iaa;

	STAT(read_gpa);
	if (gru_get_cpu_resources(GRU_NUM_KERNEL_DSR_BYTES, &cb, &dsr))
		return MQE_BUG_NO_RESOURCES;
	iaa = gpa >> 62;
	gru_vload_phys(cb, gpa, gru_get_tri(dsr), iaa, IMA);
	ret = gru_wait(cb);
	if (ret == CBS_IDLE)
		*value = *(unsigned long *)dsr;
	gru_free_cpu_resources(cb, dsr);
	return ret;
}
EXPORT_SYMBOL_GPL(gru_read_gpa);
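
/*
 * Illustrative sketch (not part of the driver): read one DW by global
 * physical address. "remote_gpa" is hypothetical; uv_gpa() can be used to
 * form a GPA for local kernel memory. A nonzero return indicates failure:
 *
 *	unsigned long value;
 *
 *	if (gru_read_gpa(&value, remote_gpa))
 *		return -EIO;
 */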


/*
 * Copy a block of data using the GRU resources
 */
int gru_copy_gpa(unsigned long dest_gpa, unsigned long src_gpa,
				unsigned int bytes)
{
	void *cb;
	void *dsr;
	int ret;

	STAT(copy_gpa);
	if (gru_get_cpu_resources(GRU_NUM_KERNEL_DSR_BYTES, &cb, &dsr))
		return MQE_BUG_NO_RESOURCES;
	gru_bcopy(cb, src_gpa, dest_gpa, gru_get_tri(dsr),
		  XTYPE_B, bytes, GRU_NUM_KERNEL_DSR_CL, IMA);
	ret = gru_wait(cb);
	gru_free_cpu_resources(cb, dsr);
	return ret;
}
EXPORT_SYMBOL_GPL(gru_copy_gpa);
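
/*
 * Illustrative sketch (not part of the driver), modeled on quicktest3():
 * copy between two kernel buffers by global physical address. "src", "dst"
 * and "nbytes" are hypothetical:
 *
 *	if (gru_copy_gpa(uv_gpa(dst), uv_gpa(src), nbytes))
 *		return -EIO;
 */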

/* ------------------- KERNEL QUICKTESTS RUN AT STARTUP ----------------*/
/* 	Temp - will delete after we gain confidence in the GRU		*/

static int quicktest0(unsigned long arg)
{
	unsigned long word0;
	unsigned long word1;
	void *cb;
	void *dsr;
	unsigned long *p;
	int ret = -EIO;

	if (gru_get_cpu_resources(GRU_CACHE_LINE_BYTES, &cb, &dsr))
		return MQE_BUG_NO_RESOURCES;
	p = dsr;
	word0 = MAGIC;
	word1 = 0;

	gru_vload(cb, uv_gpa(&word0), gru_get_tri(dsr), XTYPE_DW, 1, 1, IMA);
	if (gru_wait(cb) != CBS_IDLE) {
		printk(KERN_DEBUG "GRU:%d quicktest0: CBR failure 1\n", smp_processor_id());
		goto done;
	}

	if (*p != MAGIC) {
		printk(KERN_DEBUG "GRU:%d quicktest0 bad magic 0x%lx\n", smp_processor_id(), *p);
		goto done;
	}
	gru_vstore(cb, uv_gpa(&word1), gru_get_tri(dsr), XTYPE_DW, 1, 1, IMA);
	if (gru_wait(cb) != CBS_IDLE) {
		printk(KERN_DEBUG "GRU:%d quicktest0: CBR failure 2\n", smp_processor_id());
		goto done;
	}

	if (word0 != word1 || word1 != MAGIC) {
		printk(KERN_DEBUG
		       "GRU:%d quicktest0 err: found 0x%lx, expected 0x%lx\n",
		       smp_processor_id(), word1, MAGIC);
		goto done;
	}
	ret = 0;

done:
	gru_free_cpu_resources(cb, dsr);
	return ret;
}

#define ALIGNUP(p, q)	((void *)(((unsigned long)(p) + (q) - 1) & ~((q) - 1)))

static int quicktest1(unsigned long arg)
{
	struct gru_message_queue_desc mqd;
	void *p, *mq;
	unsigned long *dw;
	int i, ret = -EIO;
	char mes[GRU_CACHE_LINE_BYTES], *m;

	/* Need a 1K cacheline-aligned buffer that does not cross a page boundary */
	p = kmalloc(4096, GFP_KERNEL);
	if (p == NULL)
		return -ENOMEM;
	mq = ALIGNUP(p, 1024);
	memset(mes, 0xee, sizeof(mes));
	dw = mq;

	gru_create_message_queue(&mqd, mq, 8 * GRU_CACHE_LINE_BYTES, 0, 0, 0);
	for (i = 0; i < 6; i++) {
		mes[8] = i;
		do {
			ret = gru_send_message_gpa(&mqd, mes, sizeof(mes));
		} while (ret == MQE_CONGESTION);
		if (ret)
			break;
	}
	if (ret != MQE_QUEUE_FULL || i != 4) {
		printk(KERN_DEBUG "GRU:%d quicktest1: unexpected status %d, i %d\n",
		       smp_processor_id(), ret, i);
		goto done;
	}

	for (i = 0; i < 6; i++) {
		m = gru_get_next_message(&mqd);
		if (!m || m[8] != i)
			break;
		gru_free_message(&mqd, m);
	}
	if (i != 4) {
		printk(KERN_DEBUG "GRU:%d quicktest1: bad message, i %d, m %p, m8 %d\n",
			smp_processor_id(), i, m, m ? m[8] : -1);
		goto done;
	}
	ret = 0;

done:
	kfree(p);
	return ret;
}

static int quicktest2(unsigned long arg)
{
	static DECLARE_COMPLETION(cmp);
	unsigned long han;
	int blade_id = 0;
	int numcb = 4;
	int ret = 0;
	unsigned long *buf;
	void *cb0, *cb;
	struct gru_control_block_status *gen;
	int i, k, istatus, bytes;

	bytes = numcb * 4 * 8;
	buf = kmalloc(bytes, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	ret = -EBUSY;
	han = gru_reserve_async_resources(blade_id, numcb, 0, &cmp);
	if (!han)
		goto done;

	gru_lock_async_resource(han, &cb0, NULL);
	memset(buf, 0xee, bytes);
	for (i = 0; i < numcb; i++)
		gru_vset(cb0 + i * GRU_HANDLE_STRIDE, uv_gpa(&buf[i * 4]), 0,
				XTYPE_DW, 4, 1, IMA_INTERRUPT);

	ret = 0;
	k = numcb;
	do {
		gru_wait_async_cbr(han);
		for (i = 0; i < numcb; i++) {
			cb = cb0 + i * GRU_HANDLE_STRIDE;
			istatus = gru_check_status(cb);
			if (istatus != CBS_ACTIVE && istatus != CBS_CALL_OS)
				break;
		}
		if (i == numcb)
			continue;
		if (istatus != CBS_IDLE) {
			printk(KERN_DEBUG "GRU:%d quicktest2: cb %d, exception\n", smp_processor_id(), i);
			ret = -EFAULT;
		} else if (buf[4 * i] || buf[4 * i + 1] || buf[4 * i + 2] ||
				buf[4 * i + 3]) {
			printk(KERN_DEBUG "GRU:%d quicktest2: cb %d, buf 0x%lx, 0x%lx, 0x%lx, 0x%lx\n",
			       smp_processor_id(), i, buf[4 * i], buf[4 * i + 1], buf[4 * i + 2], buf[4 * i + 3]);
			ret = -EIO;
		}
		k--;
		gen = cb;
		gen->istatus = CBS_CALL_OS; /* don't handle this CBR again */
	} while (k);
	BUG_ON(cmp.done);

	gru_unlock_async_resource(han);
	gru_release_async_resources(han);
done:
	kfree(buf);
	return ret;
}

#define BUFSIZE 200
static int quicktest3(unsigned long arg)
{
	char buf1[BUFSIZE], buf2[BUFSIZE];
	int ret = 0;

	memset(buf2, 0, sizeof(buf2));
	memset(buf1, get_cycles() & 255, sizeof(buf1));
	gru_copy_gpa(uv_gpa(buf2), uv_gpa(buf1), BUFSIZE);
	if (memcmp(buf1, buf2, BUFSIZE)) {
		printk(KERN_DEBUG "GRU:%d quicktest3 error\n", smp_processor_id());
		ret = -EIO;
	}
	return ret;
}

/*
 * Debugging only. User hook for various kernel tests
 * of driver & gru.
 */
int gru_ktest(unsigned long arg)
{
	int ret = -EINVAL;

	switch (arg & 0xff) {
	case 0:
		ret = quicktest0(arg);
		break;
	case 1:
		ret = quicktest1(arg);
		break;
	case 2:
		ret = quicktest2(arg);
		break;
	case 3:
		ret = quicktest3(arg);
		break;
	case 99:
		ret = gru_free_kernel_contexts();
		break;
	}
	return ret;
}

int gru_kservices_init(void)
{
	return 0;
}

void gru_kservices_exit(void)
{
	if (gru_free_kernel_contexts())
		BUG();
}
