nv50_program.c revision ea4b09cbcbd9db82648ab30f18c0f46a66ab9f69
1#include "pipe/p_context.h"
2#include "pipe/p_defines.h"
3#include "pipe/p_state.h"
4#include "pipe/p_inlines.h"
5
6#include "pipe/p_shader_tokens.h"
7#include "tgsi/util/tgsi_parse.h"
8#include "tgsi/util/tgsi_util.h"
9
10#include "nv50_context.h"
11#include "nv50_state.h"
12
/* Number of hardware temp slots tracked by the allocator (size of
 * nv50_pc::r_temp). */
#define NV50_SU_MAX_TEMP 64
14
/* TODO:
 *
 * ARL, LIT - not handled by the opcode switch yet.
 *
 * MSB - Like MAD, but MUL+SUB
 * 	- Drop the dedicated emitter and instead introduce a way to negate
 * 	  args for ops that support it.
 *
 * Look into inlining IMMD for ops other than MOV (make it general?)
 * 	- Maybe even relax restrictions a bit: can't do P_RESULT + P_IMMD,
 * 	  but can emit to P_TEMP first, then MOV later. NVIDIA does this.
 *
 * Verify half-insns work where expected, and force-disable them where they
 * don't work. MUL has them forcibly disabled at the moment, as that fixes
 * POW.
 */
/* One scalar register operand as tracked by the translator. */
struct nv50_reg {
	/* Register file the operand lives in. */
	enum {
		P_TEMP,
		P_ATTR,
		P_RESULT,
		P_CONST,
		P_IMMD
	} type;
	/* TGSI register index this entry was created for; -1 for registers
	 * created internally by alloc_temp()/alloc_immd(). */
	int index;

	/* Hardware register / constant slot number.  For P_TEMP, -1 means
	 * "not assigned yet" (see alloc_reg()). */
	int hw;
	/* Not referenced anywhere in this file - presumably a negate flag;
	 * TODO confirm. */
	int neg;
};
42
/* Per-translation state ("program compiler" context). */
struct nv50_pc {
	/* Program being translated; receives the emitted instructions. */
	struct nv50_program *p;

	/* hw resources */
	/* Owner of each hardware temp slot; NULL means the slot is free. */
	struct nv50_reg *r_temp[NV50_SU_MAX_TEMP];

	/* tgsi resources */
	/* Each array below holds 4 entries (one per component) per TGSI
	 * register; the matching *_nr is the number of TGSI registers. */
	struct nv50_reg *temp;
	int temp_nr;
	struct nv50_reg *attr;
	int attr_nr;
	struct nv50_reg *result;
	int result_nr;
	struct nv50_reg *param;
	int param_nr;
	struct nv50_reg *immd;
	/* Immediate values, 4 floats per entry; grown by ctor_immd(). */
	float *immd_buf;
	int immd_nr;

	/* Scratch temps handed out by temp_temp() and released in bulk by
	 * kill_temp_temp(). */
	struct nv50_reg *temp_temp[8];
	unsigned temp_temp_nr;
};
65
66static void
67alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
68{
69	int i;
70
71	if (reg->type != P_TEMP)
72		return;
73
74	if (reg->hw >= 0) {
75		/*XXX: do this here too to catch FP temp-as-attr usage..
76		 *     not clean, but works */
77		if (pc->p->cfg.high_temp < (reg->hw + 1))
78			pc->p->cfg.high_temp = reg->hw + 1;
79		return;
80	}
81
82	for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
83		if (!(pc->r_temp[i])) {
84			pc->r_temp[i] = reg;
85			reg->hw = i;
86			if (pc->p->cfg.high_temp < (i + 1))
87				pc->p->cfg.high_temp = i + 1;
88			return;
89		}
90	}
91
92	assert(0);
93}
94
95static struct nv50_reg *
96alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
97{
98	struct nv50_reg *r;
99	int i;
100
101	if (dst && dst->type == P_TEMP && dst->hw == -1)
102		return dst;
103
104	for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
105		if (!pc->r_temp[i]) {
106			r = CALLOC_STRUCT(nv50_reg);
107			r->type = P_TEMP;
108			r->index = -1;
109			r->hw = i;
110			pc->r_temp[i] = r;
111			return r;
112		}
113	}
114
115	assert(0);
116	return NULL;
117}
118
119static void
120free_temp(struct nv50_pc *pc, struct nv50_reg *r)
121{
122	if (r->index == -1) {
123		FREE(pc->r_temp[r->hw]);
124		pc->r_temp[r->hw] = NULL;
125	}
126}
127
128static struct nv50_reg *
129temp_temp(struct nv50_pc *pc)
130{
131	if (pc->temp_temp_nr >= 8)
132		assert(0);
133
134	pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL);
135	return pc->temp_temp[pc->temp_temp_nr++];
136}
137
138static void
139kill_temp_temp(struct nv50_pc *pc)
140{
141	int i;
142
143	for (i = 0; i < pc->temp_temp_nr; i++)
144		free_temp(pc, pc->temp_temp[i]);
145	pc->temp_temp_nr = 0;
146}
147
148static int
149ctor_immd(struct nv50_pc *pc, float x, float y, float z, float w)
150{
151	pc->immd_buf = realloc(pc->immd_buf, (pc->immd_nr + 1) * 4 *
152					     sizeof(float));
153	pc->immd_buf[(pc->immd_nr * 4) + 0] = x;
154	pc->immd_buf[(pc->immd_nr * 4) + 1] = x;
155	pc->immd_buf[(pc->immd_nr * 4) + 2] = x;
156	pc->immd_buf[(pc->immd_nr * 4) + 3] = x;
157
158	return pc->immd_nr++;
159}
160
161static struct nv50_reg *
162alloc_immd(struct nv50_pc *pc, float f)
163{
164	struct nv50_reg *r = CALLOC_STRUCT(nv50_reg);
165	unsigned hw;
166
167	hw = ctor_immd(pc, f, 0, 0, 0);
168	r->type = P_IMMD;
169	r->hw = hw;
170	r->index = -1;
171	return r;
172}
173
174static struct nv50_reg *
175tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
176{
177	switch (dst->DstRegister.File) {
178	case TGSI_FILE_TEMPORARY:
179		return &pc->temp[dst->DstRegister.Index * 4 + c];
180	case TGSI_FILE_OUTPUT:
181		return &pc->result[dst->DstRegister.Index * 4 + c];
182	case TGSI_FILE_NULL:
183		return NULL;
184	default:
185		break;
186	}
187
188	return NULL;
189}
190
191static struct nv50_reg *
192tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src)
193{
194	struct nv50_reg *r = NULL;
195	unsigned c;
196
197	c = tgsi_util_get_full_src_register_extswizzle(src, chan);
198	switch (c) {
199	case TGSI_EXTSWIZZLE_X:
200	case TGSI_EXTSWIZZLE_Y:
201	case TGSI_EXTSWIZZLE_Z:
202	case TGSI_EXTSWIZZLE_W:
203		switch (src->SrcRegister.File) {
204		case TGSI_FILE_INPUT:
205			r = &pc->attr[src->SrcRegister.Index * 4 + c];
206			break;
207		case TGSI_FILE_TEMPORARY:
208			r = &pc->temp[src->SrcRegister.Index * 4 + c];
209			break;
210		case TGSI_FILE_CONSTANT:
211			r = &pc->param[src->SrcRegister.Index * 4 + c];
212			break;
213		case TGSI_FILE_IMMEDIATE:
214			r = &pc->immd[src->SrcRegister.Index * 4 + c];
215			break;
216		default:
217			assert(0);
218			break;
219		}
220		break;
221	case TGSI_EXTSWIZZLE_ZERO:
222		r = alloc_immd(pc, 0.0);
223		break;
224	case TGSI_EXTSWIZZLE_ONE:
225		r = alloc_immd(pc, 1.0);
226		break;
227	default:
228		assert(0);
229		break;
230	}
231
232	switch (tgsi_util_get_full_src_register_sign_mode(src, chan)) {
233	case TGSI_UTIL_SIGN_KEEP:
234		break;
235	default:
236		assert(0);
237		break;
238	}
239
240	return r;
241}
242
243static void
244emit(struct nv50_pc *pc, unsigned *inst)
245{
246	struct nv50_program *p = pc->p;
247
248	if (inst[0] & 1) {
249		p->insns_nr += 2;
250		p->insns = realloc(p->insns, sizeof(unsigned) * p->insns_nr);
251		memcpy(p->insns + (p->insns_nr - 2), inst, sizeof(unsigned)*2);
252	} else {
253		p->insns_nr += 1;
254		p->insns = realloc(p->insns, sizeof(unsigned) * p->insns_nr);
255		memcpy(p->insns + (p->insns_nr - 1), inst, sizeof(unsigned));
256	}
257}
258
/* Forward declaration: set_pred() and set_pred_wr() below call set_long(),
 * which in turn calls both of them. */
static INLINE void set_long(struct nv50_pc *, unsigned *);
260
261static boolean
262is_long(unsigned *inst)
263{
264	if (inst[0] & 1)
265		return TRUE;
266	return FALSE;
267}
268
269static boolean
270is_immd(unsigned *inst)
271{
272	if (is_long(inst) && (inst[1] & 3) == 3)
273		return TRUE;
274	return FALSE;
275}
276
277static INLINE void
278set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx, unsigned *inst)
279{
280	set_long(pc, inst);
281	inst[1] &= ~((0x1f << 7) | (0x3 << 12));
282	inst[1] |= (pred << 7) | (idx << 12);
283}
284
285static INLINE void
286set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx, unsigned *inst)
287{
288	set_long(pc, inst);
289	inst[1] &= ~((0x3 << 4) | (1 << 6));
290	inst[1] |= (idx << 4) | (on << 6);
291}
292
293static INLINE void
294set_long(struct nv50_pc *pc, unsigned *inst)
295{
296	if (is_long(inst))
297		return;
298
299	inst[0] |= 1;
300	set_pred(pc, 0xf, 0, inst);
301	set_pred_wr(pc, 0, 0, inst);
302}
303
304static INLINE void
305set_dst(struct nv50_pc *pc, struct nv50_reg *dst, unsigned *inst)
306{
307	if (dst->type == P_RESULT) {
308		set_long(pc, inst);
309		inst[1] |= 0x00000008;
310	}
311
312	alloc_reg(pc, dst);
313	inst[0] |= (dst->hw << 2);
314}
315
316static INLINE void
317set_immd(struct nv50_pc *pc, struct nv50_reg *imm, unsigned *inst)
318{
319	unsigned val = fui(pc->immd_buf[imm->hw]); /* XXX */
320
321	set_long(pc, inst);
322	/*XXX: can't be predicated - bits overlap.. catch cases where both
323	 *     are required and avoid them. */
324	set_pred(pc, 0, 0, inst);
325	set_pred_wr(pc, 0, 0, inst);
326
327	inst[1] |= 0x00000002 | 0x00000001;
328	inst[0] |= (val & 0x3f) << 16;
329	inst[1] |= (val >> 6) << 2;
330}
331
332static void
333emit_interp(struct nv50_pc *pc, struct nv50_reg *dst,
334	    struct nv50_reg *src, struct nv50_reg *iv, boolean noperspective)
335{
336	unsigned inst[2] = { 0, 0 };
337
338	inst[0] |= 0x80000000;
339	set_dst(pc, dst, inst);
340	alloc_reg(pc, iv);
341	inst[0] |= (iv->hw << 9);
342	alloc_reg(pc, src);
343	inst[0] |= (src->hw << 16);
344	if (noperspective)
345		inst[0] |= (1 << 25);
346
347	emit(pc, inst);
348}
349
350static void
351set_cseg(struct nv50_pc *pc, struct nv50_reg *src, unsigned *inst)
352{
353	set_long(pc, inst);
354	if (src->type == P_IMMD) {
355		inst[1] |= (NV50_CB_PMISC << 22);
356	} else {
357		if (pc->p->type == NV50_PROG_VERTEX)
358			inst[1] |= (NV50_CB_PVP << 22);
359		else
360			inst[1] |= (NV50_CB_PFP << 22);
361	}
362}
363
364static void
365emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
366{
367	unsigned inst[2] = { 0, 0 };
368
369	inst[0] |= 0x10000000;
370
371	set_dst(pc, dst, inst);
372
373	if (dst->type != P_RESULT && src->type == P_IMMD) {
374		set_immd(pc, src, inst);
375		/*XXX: 32-bit, but steals part of "half" reg space - need to
376		 *     catch and handle this case if/when we do half-regs
377		 */
378		inst[0] |= 0x00008000;
379	} else
380	if (src->type == P_IMMD || src->type == P_CONST) {
381		set_long(pc, inst);
382		set_cseg(pc, src, inst);
383		inst[0] |= (src->hw << 9);
384		inst[1] |= 0x20000000; /* src0 const? */
385	} else {
386		if (src->type == P_ATTR) {
387			set_long(pc, inst);
388			inst[1] |= 0x00200000;
389		}
390
391		alloc_reg(pc, src);
392		inst[0] |= (src->hw << 9);
393	}
394
395	/* We really should support "half" instructions here at some point,
396	 * but I don't feel confident enough about them yet.
397	 */
398	set_long(pc, inst);
399	if (is_long(inst) && !is_immd(inst)) {
400		inst[1] |= 0x04000000; /* 32-bit */
401		inst[1] |= 0x0003c000; /* "subsubop" 0xf == mov */
402	}
403
404	emit(pc, inst);
405}
406
407static boolean
408check_swap_src_0_1(struct nv50_pc *pc,
409		   struct nv50_reg **s0, struct nv50_reg **s1)
410{
411	struct nv50_reg *src0 = *s0, *src1 = *s1;
412
413	if (src0->type == P_CONST) {
414		if (src1->type != P_CONST) {
415			*s0 = src1;
416			*s1 = src0;
417			return TRUE;
418		}
419	} else
420	if (src1->type == P_ATTR) {
421		if (src0->type != P_ATTR) {
422			*s0 = src1;
423			*s1 = src0;
424			return TRUE;
425		}
426	}
427
428	return FALSE;
429}
430
431static void
432set_src_0(struct nv50_pc *pc, struct nv50_reg *src, unsigned *inst)
433{
434	if (src->type == P_ATTR) {
435		set_long(pc, inst);
436		inst[1] |= 0x00200000;
437	} else
438	if (src->type == P_CONST || src->type == P_IMMD) {
439		struct nv50_reg *temp = temp_temp(pc);
440
441		emit_mov(pc, temp, src);
442		src = temp;
443	}
444
445	alloc_reg(pc, src);
446	inst[0] |= (src->hw << 9);
447}
448
449static void
450set_src_1(struct nv50_pc *pc, struct nv50_reg *src, unsigned *inst)
451{
452	if (src->type == P_ATTR) {
453		struct nv50_reg *temp = temp_temp(pc);
454
455		emit_mov(pc, temp, src);
456		src = temp;
457	} else
458	if (src->type == P_CONST || src->type == P_IMMD) {
459		set_cseg(pc, src, inst);
460		inst[0] |= 0x00800000;
461	}
462
463	alloc_reg(pc, src);
464	inst[0] |= (src->hw << 16);
465}
466
467static void
468set_src_2(struct nv50_pc *pc, struct nv50_reg *src, unsigned *inst)
469{
470	set_long(pc, inst);
471
472	if (src->type == P_ATTR) {
473		struct nv50_reg *temp = temp_temp(pc);
474
475		emit_mov(pc, temp, src);
476		src = temp;
477	} else
478	if (src->type == P_CONST || src->type == P_IMMD) {
479		set_cseg(pc, src, inst);
480		inst[0] |= 0x01000000;
481	}
482
483	alloc_reg(pc, src);
484	inst[1] |= (src->hw << 14);
485}
486
/* Emit "dst = src0 * src1" (opcode bits 0xc0000000), always in the long
 * encoding.  The sources may be swapped by check_swap_src_0_1().
 */
static void
emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
	 struct nv50_reg *src1)
{
	unsigned op[2] = { 0xc0000000, 0 };

	set_long(pc, op);

	check_swap_src_0_1(pc, &src0, &src1);
	set_dst(pc, dst, op);
	set_src_0(pc, src0, op);
	set_src_1(pc, src1, op);

	emit(pc, op);
}
503
/* Emit "dst = src0 + src1" (opcode bits 0xb0000000).
 * If encoding dst/src0 forced the long form, the second addend is placed
 * in the src2 slot; otherwise it goes in the src1 slot.
 */
static void
emit_add(struct nv50_pc *pc, struct nv50_reg *dst,
	 struct nv50_reg *src0, struct nv50_reg *src1)
{
	unsigned op[2] = { 0xb0000000, 0 };

	check_swap_src_0_1(pc, &src0, &src1);
	set_dst(pc, dst, op);
	set_src_0(pc, src0, op);

	if (!is_long(op))
		set_src_1(pc, src1, op);
	else
		set_src_2(pc, src1, op);

	emit(pc, op);
}
522
/* Emit a min/max-style operation: opcode bits 0xb0000000 with the
 * sub-operation "sub" placed at bit 29 of the second word (callers in this
 * file pass 4 for MAX and 5 for MIN).
 */
static void
emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst,
	    struct nv50_reg *src0, struct nv50_reg *src1)
{
	unsigned op[2] = { 0xb0000000, 0 };

	set_long(pc, op);
	op[1] |= (sub << 29);

	check_swap_src_0_1(pc, &src0, &src1);
	set_dst(pc, dst, op);
	set_src_0(pc, src0, op);
	set_src_1(pc, src1, op);

	emit(pc, op);
}
540
541static void
542emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
543	 struct nv50_reg *src1)
544{
545	unsigned inst[2] = { 0, 0 };
546
547	inst[0] |= 0xb0000000;
548
549	set_long(pc, inst);
550	if (check_swap_src_0_1(pc, &src0, &src1))
551		inst[1] |= 0x04000000;
552	else
553		inst[1] |= 0x08000000;
554
555	set_dst(pc, dst, inst);
556	set_src_0(pc, src0, inst);
557	set_src_2(pc, src1, inst);
558
559	emit(pc, inst);
560}
561
/* Emit "dst = src0 * src1 + src2" (opcode bits 0xe0000000).
 * The two multiplicands may be swapped by check_swap_src_0_1().
 */
static void
emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
	 struct nv50_reg *src1, struct nv50_reg *src2)
{
	unsigned op[2] = { 0xe0000000, 0 };

	check_swap_src_0_1(pc, &src0, &src1);

	set_dst(pc, dst, op);
	set_src_0(pc, src0, op);
	set_src_1(pc, src1, op);
	set_src_2(pc, src2, op);

	emit(pc, op);
}
578
/* Emit "dst = src0 * src1 - src2": the MAD opcode (0xe0000000) with the
 * subtract modifier set in the second word.
 */
static void
emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
	 struct nv50_reg *src1, struct nv50_reg *src2)
{
	unsigned op[2] = { 0xe0000000, 0 };

	set_long(pc, op);
	op[1] |= 0x08000000; /* src0 * src1 - src2 */

	check_swap_src_0_1(pc, &src0, &src1);

	set_dst(pc, dst, op);
	set_src_0(pc, src0, op);
	set_src_1(pc, src1, op);
	set_src_2(pc, src2, op);

	emit(pc, op);
}
597
/* Emit a single-source function op (opcode bits 0x90000000).
 * A non-zero "sub" forces the long encoding and is stored at bit 29 of the
 * second word.  Callers in this file use 0 = RCP, 2 = RSQ, 3 = LG2,
 * 4 = SIN, 5 = COS, 6 = EX2.
 */
static void
emit_flop(struct nv50_pc *pc, unsigned sub,
	  struct nv50_reg *dst, struct nv50_reg *src)
{
	unsigned op[2] = { 0x90000000, 0 };

	if (sub != 0) {
		set_long(pc, op);
		op[1] |= (sub << 29);
	}

	set_dst(pc, dst, op);
	set_src_0(pc, src, op);

	emit(pc, op);
}
615
/* Emit the instruction that callers issue on an operand before the EX2
 * flop (see the EX2 and POW opcode handlers): opcode bits 0xb0000000 with
 * sub-op 6 and bit 14 set in the second word.
 */
static void
emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	unsigned op[2] = { 0xb0000000, 0 };

	set_dst(pc, dst, op);
	set_src_0(pc, src, op);

	set_long(pc, op);
	op[1] |= (6 << 29) | 0x00004000;

	emit(pc, op);
}
/* When non-zero, emit_set() may swap its two sources and substitute the
 * mirrored comparison op.  Disabled for now.
 * XXX: enabling it gives inaccurate results - cause unknown. */
#define ALLOW_SET_SWAP 0
632
633static void
634emit_set(struct nv50_pc *pc, unsigned c_op, struct nv50_reg *dst,
635	 struct nv50_reg *src0, struct nv50_reg *src1)
636{
637	unsigned inst[2] = { 0, 0 };
638#if ALLOW_SET_SWAP
639	unsigned inv_cop[8] = { 0, 6, 2, 4, 3, 5, 1, 7 };
640#endif
641	struct nv50_reg *rdst;
642
643#if ALLOW_SET_SWAP
644	assert(c_op <= 7);
645	if (check_swap_src_0_1(pc, &src0, &src1))
646		c_op = inv_cop[c_op];
647#endif
648
649	rdst = dst;
650	if (dst->type != P_TEMP)
651		dst = alloc_temp(pc, NULL);
652
653	/* set.u32 */
654	set_long(pc, inst);
655	inst[0] |= 0xb0000000;
656	inst[1] |= (3 << 29);
657	inst[1] |= (c_op << 14);
658	/*XXX: breaks things, .u32 by default?
659	 *     decuda will disasm as .u16 and use .lo/.hi regs, but this
660	 *     doesn't seem to match what the hw actually does.
661	inst[1] |= 0x04000000; << breaks things.. .u32 by default?
662	 */
663	set_dst(pc, dst, inst);
664	set_src_0(pc, src0, inst);
665	set_src_1(pc, src1, inst);
666	emit(pc, inst);
667
668	/* cvt.f32.u32 */
669	inst[0] = 0xa0000001;
670	inst[1] = 0x64014780;
671	set_dst(pc, rdst, inst);
672	set_src_0(pc, dst, inst);
673	emit(pc, inst);
674
675	if (dst != rdst)
676		free_temp(pc, dst);
677}
678
/* Emit the cvt instruction used to implement FLR (floor): a 32-bit,
 * integer-mode conversion of an f32 source.
 */
static void
emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	unsigned op[2] = { 0xa0000000 /* cvt */, 0 };

	set_long(pc, op);

	op[1] |= (6 << 29);		/* cvt */
	op[1] |= 0x08000000;		/* integer mode */
	op[1] |= 0x04000000;		/* 32 bit */
	op[1] |= ((0x1 << 3)) << 14;	/* .rn */
	op[1] |= (1 << 14);		/* src .f32 */

	set_dst(pc, dst, op);
	set_src_0(pc, src, op);

	emit(pc, op);
}
696
697static boolean
698nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
699{
700	const struct tgsi_full_instruction *inst = &tok->FullInstruction;
701	struct nv50_reg *rdst[4], *dst[4], *src[3][4], *temp;
702	unsigned mask, sat;
703	int i, c;
704
705	NOUVEAU_ERR("insn %p\n", tok);
706
707	mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
708	sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE;
709
710	for (c = 0; c < 4; c++) {
711		if (mask & (1 << c))
712			dst[c] = tgsi_dst(pc, c, &inst->FullDstRegisters[0]);
713		else
714			dst[c] = NULL;
715	}
716
717	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
718		for (c = 0; c < 4; c++)
719			src[i][c] = tgsi_src(pc, c, &inst->FullSrcRegisters[i]);
720	}
721
722	if (sat) {
723		for (c = 0; c < 4; c++) {
724			rdst[c] = dst[c];
725			dst[c] = temp_temp(pc);
726		}
727	}
728
729	switch (inst->Instruction.Opcode) {
730	case TGSI_OPCODE_ABS:
731		for (c = 0; c < 4; c++) {
732			unsigned inst[2] = { 0, 0 };
733
734			inst[0] = 0xa0000000; /* cvt */
735			set_long(pc, inst);
736			inst[1] |= (6 << 29); /* cvt */
737			inst[1] |= 0x04000000; /* 32 bit */
738			inst[1] |= (1 << 14); /* src .f32 */
739			inst[1] |= ((1 << 6) << 14); /* .abs */
740			set_dst(pc, dst[c], inst);
741			set_src_0(pc, src[0][c], inst);
742			emit(pc, inst);
743		}
744		break;
745	case TGSI_OPCODE_ADD:
746		for (c = 0; c < 4; c++) {
747			if (!(mask & (1 << c)))
748				continue;
749			emit_add(pc, dst[c], src[0][c], src[1][c]);
750		}
751		break;
752	case TGSI_OPCODE_COS:
753		for (c = 0; c < 4; c++) {
754			if (!(mask & (1 << c)))
755				continue;
756			emit_flop(pc, 5, dst[c], src[0][c]);
757		}
758		break;
759	case TGSI_OPCODE_DP3:
760		temp = alloc_temp(pc, NULL);
761		emit_mul(pc, temp, src[0][0], src[1][0]);
762		emit_mad(pc, temp, src[0][1], src[1][1], temp);
763		emit_mad(pc, temp, src[0][2], src[1][2], temp);
764		for (c = 0; c < 4; c++) {
765			if (!(mask & (1 << c)))
766				continue;
767			emit_mov(pc, dst[c], temp);
768		}
769		free_temp(pc, temp);
770		break;
771	case TGSI_OPCODE_DP4:
772		temp = alloc_temp(pc, NULL);
773		emit_mul(pc, temp, src[0][0], src[1][0]);
774		emit_mad(pc, temp, src[0][1], src[1][1], temp);
775		emit_mad(pc, temp, src[0][2], src[1][2], temp);
776		emit_mad(pc, temp, src[0][3], src[1][3], temp);
777		for (c = 0; c < 4; c++) {
778			if (!(mask & (1 << c)))
779				continue;
780			emit_mov(pc, dst[c], temp);
781		}
782		free_temp(pc, temp);
783		break;
784	case TGSI_OPCODE_DPH:
785		temp = alloc_temp(pc, NULL);
786		emit_mul(pc, temp, src[0][0], src[1][0]);
787		emit_mad(pc, temp, src[0][1], src[1][1], temp);
788		emit_mad(pc, temp, src[0][2], src[1][2], temp);
789		emit_add(pc, temp, src[1][3], temp);
790		for (c = 0; c < 4; c++) {
791			if (!(mask & (1 << c)))
792				continue;
793			emit_mov(pc, dst[c], temp);
794		}
795		free_temp(pc, temp);
796		break;
797	case TGSI_OPCODE_DST:
798	{
799		struct nv50_reg *one = alloc_immd(pc, 1.0);
800		emit_mov(pc, dst[0], one);
801		emit_mul(pc, dst[1], src[0][1], src[1][1]);
802		emit_mov(pc, dst[2], src[0][2]);
803		emit_mov(pc, dst[3], src[1][3]);
804		FREE(one);
805	}
806		break;
807	case TGSI_OPCODE_EX2:
808		temp = alloc_temp(pc, NULL);
809		for (c = 0; c < 4; c++) {
810			if (!(mask & (1 << c)))
811				continue;
812			emit_preex2(pc, temp, src[0][c]);
813			emit_flop(pc, 6, dst[c], temp);
814		}
815		free_temp(pc, temp);
816		break;
817	case TGSI_OPCODE_FLR:
818		for (c = 0; c < 4; c++) {
819			if (!(mask & (1 << c)))
820				continue;
821			emit_flr(pc, dst[c], src[0][c]);
822		}
823		break;
824	case TGSI_OPCODE_FRC:
825		temp = alloc_temp(pc, NULL);
826		for (c = 0; c < 4; c++) {
827			if (!(mask & (1 << c)))
828				continue;
829			emit_flr(pc, temp, src[0][c]);
830			emit_sub(pc, dst[c], src[0][c], temp);
831		}
832		free_temp(pc, temp);
833		break;
834	case TGSI_OPCODE_LG2:
835		for (c = 0; c < 4; c++) {
836			if (!(mask & (1 << c)))
837				continue;
838			emit_flop(pc, 3, dst[c], src[0][c]);
839		}
840		break;
841	case TGSI_OPCODE_MAD:
842		for (c = 0; c < 4; c++) {
843			if (!(mask & (1 << c)))
844				continue;
845			emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]);
846		}
847		break;
848	case TGSI_OPCODE_MAX:
849		for (c = 0; c < 4; c++) {
850			if (!(mask & (1 << c)))
851				continue;
852			emit_minmax(pc, 4, dst[c], src[0][c], src[1][c]);
853		}
854		break;
855	case TGSI_OPCODE_MIN:
856		for (c = 0; c < 4; c++) {
857			if (!(mask & (1 << c)))
858				continue;
859			emit_minmax(pc, 5, dst[c], src[0][c], src[1][c]);
860		}
861		break;
862	case TGSI_OPCODE_MOV:
863		for (c = 0; c < 4; c++) {
864			if (!(mask & (1 << c)))
865				continue;
866			emit_mov(pc, dst[c], src[0][c]);
867		}
868		break;
869	case TGSI_OPCODE_MUL:
870		for (c = 0; c < 4; c++) {
871			if (!(mask & (1 << c)))
872				continue;
873			emit_mul(pc, dst[c], src[0][c], src[1][c]);
874		}
875		break;
876	case TGSI_OPCODE_POW:
877		temp = alloc_temp(pc, NULL);
878		emit_flop(pc, 3, temp, src[0][0]);
879		emit_mul(pc, temp, temp, src[1][0]);
880		emit_preex2(pc, temp, temp);
881		emit_flop(pc, 6, temp, temp);
882		for (c = 0; c < 4; c++) {
883			if (!(mask & (1 << c)))
884				continue;
885			emit_mov(pc, dst[c], temp);
886		}
887		free_temp(pc, temp);
888		break;
889	case TGSI_OPCODE_RCP:
890		for (c = 0; c < 4; c++) {
891			if (!(mask & (1 << c)))
892				continue;
893			emit_flop(pc, 0, dst[c], src[0][c]);
894		}
895		break;
896	case TGSI_OPCODE_RSQ:
897		for (c = 0; c < 4; c++) {
898			if (!(mask & (1 << c)))
899				continue;
900			emit_flop(pc, 2, dst[c], src[0][c]);
901		}
902		break;
903	case TGSI_OPCODE_SGE:
904		for (c = 0; c < 4; c++) {
905			if (!(mask & (1 << c)))
906				continue;
907			emit_set(pc, 6, dst[c], src[0][c], src[1][c]);
908		}
909		break;
910	case TGSI_OPCODE_SIN:
911		for (c = 0; c < 4; c++) {
912			if (!(mask & (1 << c)))
913				continue;
914			emit_flop(pc, 4, dst[c], src[0][c]);
915		}
916		break;
917	case TGSI_OPCODE_SLT:
918		for (c = 0; c < 4; c++) {
919			if (!(mask & (1 << c)))
920				continue;
921			emit_set(pc, 1, dst[c], src[0][c], src[1][c]);
922		}
923		break;
924	case TGSI_OPCODE_SUB:
925		for (c = 0; c < 4; c++) {
926			if (!(mask & (1 << c)))
927				continue;
928			emit_sub(pc, dst[c], src[0][c], src[1][c]);
929		}
930		break;
931	case TGSI_OPCODE_XPD:
932		temp = alloc_temp(pc, NULL);
933		emit_mul(pc, temp, src[0][2], src[1][1]);
934		emit_msb(pc, dst[0], src[0][1], src[1][2], temp);
935		emit_mul(pc, temp, src[0][0], src[1][2]);
936		emit_msb(pc, dst[1], src[0][2], src[1][0], temp);
937		emit_mul(pc, temp, src[0][1], src[1][0]);
938		emit_msb(pc, dst[2], src[0][0], src[1][1], temp);
939		free_temp(pc, temp);
940		break;
941	case TGSI_OPCODE_END:
942		break;
943	default:
944		NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode);
945		return FALSE;
946	}
947
948	if (sat) {
949		for (c = 0; c < 4; c++) {
950			unsigned inst[2] = { 0, 0 };
951
952			if (!(mask & (1 << c)))
953				continue;
954
955			inst[0] = 0xa0000000; /* cvt */
956			set_long(pc, inst);
957			inst[1] |= (6 << 29); /* cvt */
958			inst[1] |= 0x04000000; /* 32 bit */
959			inst[1] |= (1 << 14); /* src .f32 */
960			inst[1] |= ((1 << 5) << 14); /* .sat */
961			set_dst(pc, rdst[c], inst);
962			set_src_0(pc, dst[c], inst);
963			emit(pc, inst);
964		}
965	}
966
967	kill_temp_temp(pc);
968	return TRUE;
969}
970
/* First pass over the TGSI token stream: collect immediates and register
 * declarations, then build the per-component nv50_reg tables in pc.
 *
 * For fragment programs this pass also emits the interpolation
 * instructions that load each input attribute into temps.
 *
 * Returns TRUE on success, FALSE on an unsupported declaration file or an
 * allocation failure.
 * NOTE(review): tables allocated before a failure are not freed here.
 */
static boolean
nv50_program_tx_prep(struct nv50_pc *pc)
{
	struct tgsi_parse_context p;
	boolean ret = FALSE;
	unsigned i, c;

	tgsi_parse_init(&p, pc->p->pipe.tokens);
	while (!tgsi_parse_end_of_tokens(&p)) {
		const union tgsi_full_token *tok = &p.FullToken;

		tgsi_parse_token(&p);
		switch (tok->Token.Type) {
		case TGSI_TOKEN_TYPE_IMMEDIATE:
		{
			const struct tgsi_full_immediate *imm =
				&p.FullToken.FullImmediate;

			/* append the immediate's four floats to immd_buf */
			ctor_immd(pc, imm->u.ImmediateFloat32[0].Float,
				      imm->u.ImmediateFloat32[1].Float,
				      imm->u.ImmediateFloat32[2].Float,
				      imm->u.ImmediateFloat32[3].Float);
		}
			break;
		case TGSI_TOKEN_TYPE_DECLARATION:
		{
			const struct tgsi_full_declaration *d;
			unsigned last;

			d = &p.FullToken.FullDeclaration;
			last = d->u.DeclarationRange.Last;

			/* track the highest declared index + 1 per file */
			switch (d->Declaration.File) {
			case TGSI_FILE_TEMPORARY:
				if (pc->temp_nr < (last + 1))
					pc->temp_nr = last + 1;
				break;
			case TGSI_FILE_OUTPUT:
				if (pc->result_nr < (last + 1))
					pc->result_nr = last + 1;
				break;
			case TGSI_FILE_INPUT:
				if (pc->attr_nr < (last + 1))
					pc->attr_nr = last + 1;
				break;
			case TGSI_FILE_CONSTANT:
				if (pc->param_nr < (last + 1))
					pc->param_nr = last + 1;
				break;
			default:
				NOUVEAU_ERR("bad decl file %d\n",
					    d->Declaration.File);
				goto out_err;
			}
		}
			break;
		case TGSI_TOKEN_TYPE_INSTRUCTION:
			/* instructions are handled in the second pass */
			break;
		default:
			break;
		}
	}

	/* TGSI temporaries: hw == -1 until alloc_reg() assigns a slot */
	NOUVEAU_ERR("%d temps\n", pc->temp_nr);
	if (pc->temp_nr) {
		pc->temp = calloc(pc->temp_nr * 4, sizeof(struct nv50_reg));
		if (!pc->temp)
			goto out_err;

		for (i = 0; i < pc->temp_nr; i++) {
			for (c = 0; c < 4; c++) {
				pc->temp[i*4+c].type = P_TEMP;
				pc->temp[i*4+c].hw = -1;
				pc->temp[i*4+c].index = i;
			}
		}
	}

	NOUVEAU_ERR("%d attrib regs\n", pc->attr_nr);
	if (pc->attr_nr) {
		struct nv50_reg *iv = NULL, *tmp = NULL;
		int aid = 0;

		pc->attr = calloc(pc->attr_nr * 4, sizeof(struct nv50_reg));
		if (!pc->attr)
			goto out_err;

		if (pc->p->type == NV50_PROG_FRAGMENT) {
			/* temp used as the interpolation operand below */
			iv = alloc_temp(pc, NULL);
			aid++;
		}

		for (i = 0; i < pc->attr_nr; i++) {
			struct nv50_reg *a = &pc->attr[i*4];

			for (c = 0; c < 4; c++) {
				if (pc->p->type == NV50_PROG_FRAGMENT) {
					/* FP inputs live in temps: reserve
					 * one and copy its description; the
					 * allocated reg stays in r_temp to
					 * keep the hardware slot occupied */
					struct nv50_reg *at =
						alloc_temp(pc, NULL);
					pc->attr[i*4+c].type = at->type;
					pc->attr[i*4+c].hw = at->hw;
					pc->attr[i*4+c].index = at->index;
				} else {
					/* VP inputs: number them
					 * consecutively and flag each one in
					 * the attribute bitmask */
					pc->p->cfg.vp.attr[aid/32] |=
						(1 << (aid % 32));
					pc->attr[i*4+c].type = P_ATTR;
					pc->attr[i*4+c].hw = aid++;
					pc->attr[i*4+c].index = i;
				}
			}

			if (pc->p->type != NV50_PROG_FRAGMENT)
				continue;

			/* fragment program: interpolate iv, emit a flop
			 * with sub-op 0 (the same encoding RCP uses) into
			 * tmp, then interpolate the four components using
			 * tmp */
			emit_interp(pc, iv, iv, iv, FALSE);
			tmp = alloc_temp(pc, NULL);
			{
				unsigned inst[2] = { 0, 0 };
				inst[0]  = 0x90000000;
				inst[0] |= (tmp->hw << 2);
				emit(pc, inst);
			}
			emit_interp(pc, &a[0], &a[0], tmp, TRUE);
			emit_interp(pc, &a[1], &a[1], tmp, TRUE);
			emit_interp(pc, &a[2], &a[2], tmp, TRUE);
			emit_interp(pc, &a[3], &a[3], tmp, TRUE);
			free_temp(pc, tmp);
		}

		if (iv)
			free_temp(pc, iv);
	}

	/* outputs: numbered consecutively; fragment program results are
	 * P_TEMP registers with a pre-assigned hw index */
	NOUVEAU_ERR("%d result regs\n", pc->result_nr);
	if (pc->result_nr) {
		int rid = 0;

		pc->result = calloc(pc->result_nr * 4, sizeof(struct nv50_reg));
		if (!pc->result)
			goto out_err;

		for (i = 0; i < pc->result_nr; i++) {
			for (c = 0; c < 4; c++) {
				if (pc->p->type == NV50_PROG_FRAGMENT)
					pc->result[i*4+c].type = P_TEMP;
				else
					pc->result[i*4+c].type = P_RESULT;
				pc->result[i*4+c].hw = rid++;
				pc->result[i*4+c].index = i;
			}
		}
	}

	/* constants: one consecutive slot per component */
	NOUVEAU_ERR("%d param regs\n", pc->param_nr);
	if (pc->param_nr) {
		int rid = 0;

		pc->param = calloc(pc->param_nr * 4, sizeof(struct nv50_reg));
		if (!pc->param)
			goto out_err;

		for (i = 0; i < pc->param_nr; i++) {
			for (c = 0; c < 4; c++) {
				pc->param[i*4+c].type = P_CONST;
				pc->param[i*4+c].hw = rid++;
				pc->param[i*4+c].index = i;
			}
		}
	}

	/* immediates: hw is the scalar index into immd_buf */
	if (pc->immd_nr) {
		int rid = 0;

		pc->immd = calloc(pc->immd_nr * 4, sizeof(struct nv50_reg));
		if (!pc->immd)
			goto out_err;

		for (i = 0; i < pc->immd_nr; i++) {
			for (c = 0; c < 4; c++) {
				pc->immd[i*4+c].type = P_IMMD;
				pc->immd[i*4+c].hw = rid++;
				pc->immd[i*4+c].index = i;
			}
		}
	}

	ret = TRUE;
out_err:
	tgsi_parse_free(&p);
	return ret;
}
1162
1163static boolean
1164nv50_program_tx(struct nv50_program *p)
1165{
1166	struct tgsi_parse_context parse;
1167	struct nv50_pc *pc;
1168	boolean ret;
1169
1170	pc = CALLOC_STRUCT(nv50_pc);
1171	if (!pc)
1172		return FALSE;
1173	pc->p = p;
1174	pc->p->cfg.high_temp = 4;
1175
1176	ret = nv50_program_tx_prep(pc);
1177	if (ret == FALSE)
1178		goto out_cleanup;
1179
1180	tgsi_parse_init(&parse, pc->p->pipe.tokens);
1181	while (!tgsi_parse_end_of_tokens(&parse)) {
1182		const union tgsi_full_token *tok = &parse.FullToken;
1183
1184		tgsi_parse_token(&parse);
1185
1186		switch (tok->Token.Type) {
1187		case TGSI_TOKEN_TYPE_INSTRUCTION:
1188			ret = nv50_program_tx_insn(pc, tok);
1189			if (ret == FALSE)
1190				goto out_err;
1191			break;
1192		default:
1193			break;
1194		}
1195	}
1196
1197	p->immd_nr = pc->immd_nr * 4;
1198	p->immd = pc->immd_buf;
1199
1200out_err:
1201	tgsi_parse_free(&parse);
1202
1203out_cleanup:
1204	return ret;
1205}
1206
1207static void
1208nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p)
1209{
1210	int i;
1211
1212	if (nv50_program_tx(p) == FALSE)
1213		assert(0);
1214	/* *not* sufficient, it's fine if last inst is long and
1215	 * NOT immd - otherwise it's fucked fucked fucked */
1216	p->insns[p->insns_nr - 1] |= 0x00000001;
1217
1218	if (p->type == NV50_PROG_VERTEX) {
1219	for (i = 0; i < p->insns_nr; i++)
1220		NOUVEAU_ERR("VP0x%08x\n", p->insns[i]);
1221	} else {
1222	for (i = 0; i < p->insns_nr; i++)
1223		NOUVEAU_ERR("FP0x%08x\n", p->insns[i]);
1224	}
1225
1226	p->translated = TRUE;
1227}
1228
1229static void
1230nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)
1231{
1232	int i;
1233
1234	for (i = 0; i < p->immd_nr; i++) {
1235		BEGIN_RING(tesla, 0x0f00, 2);
1236		OUT_RING  ((NV50_CB_PMISC << 16) | (i << 8));
1237		OUT_RING  (fui(p->immd[i]));
1238	}
1239}
1240
1241static void
1242nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
1243{
1244	struct pipe_winsys *ws = nv50->pipe.winsys;
1245	void *map;
1246
1247	if (!p->buffer)
1248		p->buffer = ws->buffer_create(ws, 0x100, 0, p->insns_nr * 4);
1249	map = ws->buffer_map(ws, p->buffer, PIPE_BUFFER_USAGE_CPU_WRITE);
1250	memcpy(map, p->insns, p->insns_nr * 4);
1251	ws->buffer_unmap(ws, p->buffer);
1252}
1253
/* Make the current vertex program ready for use: translate it if that has
 * not happened yet, upload its immediates and code, then build and emit a
 * state object carrying the code address, the attribute masks, the temp
 * count and the start offset.
 */
void
nv50_vertprog_validate(struct nv50_context *nv50)
{
	struct nouveau_grobj *tesla = nv50->screen->tesla;
	struct nv50_program *p = nv50->vertprog;
	struct nouveau_stateobj *so;

	if (!p->translated) {
		nv50_program_validate(nv50, p);
		if (!p->translated)
			assert(0);
	}

	nv50_program_validate_data(nv50, p);
	nv50_program_validate_code(nv50, p);

	so = so_new(11, 2);
	/* code address: high and low halves as two relocations */
	so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2);
	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
		  NOUVEAU_BO_HIGH, 0, 0);
	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
		  NOUVEAU_BO_LOW, 0, 0);
	/* bitmask of the input attributes in use (see tx_prep) */
	so_method(so, tesla, 0x1650, 2);
	so_data  (so, p->cfg.vp.attr[0]);
	so_data  (so, p->cfg.vp.attr[1]);
	/* NOTE(review): meaning of the constant 8 is not visible in this
	 * file; the second word is the number of temps used */
	so_method(so, tesla, 0x16ac, 2);
	so_data  (so, 8);
	so_data  (so, p->cfg.high_temp);
	so_method(so, tesla, 0x140c, 1);
	so_data  (so, 0); /* program start offset */
	so_emit(nv50->screen->nvws, so);
	so_ref(NULL, &so);
}
1287
/* Make the current fragment program ready for use: translate it if that
 * has not happened yet, upload its immediates and code, then build and
 * emit a state object carrying the code address, the temp count and the
 * start offset.
 */
void
nv50_fragprog_validate(struct nv50_context *nv50)
{
	struct nouveau_grobj *tesla = nv50->screen->tesla;
	struct nv50_program *p = nv50->fragprog;
	struct nouveau_stateobj *so;

	if (!p->translated) {
		nv50_program_validate(nv50, p);
		if (!p->translated)
			assert(0);
	}

	nv50_program_validate_data(nv50, p);
	nv50_program_validate_code(nv50, p);

	so = so_new(7, 2);
	/* code address: high and low halves as two relocations */
	so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2);
	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
		  NOUVEAU_BO_HIGH, 0, 0);
	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
		  NOUVEAU_BO_LOW, 0, 0);
	/* number of temps used by the program */
	so_method(so, tesla, 0x198c, 1);
	so_data  (so, p->cfg.high_temp);
	so_method(so, tesla, 0x1414, 1);
	so_data  (so, 0); /* program start offset */
	so_emit(nv50->screen->nvws, so);
	so_ref(NULL, &so);
}
1317
1318void
1319nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
1320{
1321	struct pipe_winsys *ws = nv50->pipe.winsys;
1322
1323	if (p->insns_nr) {
1324		if (p->insns)
1325			FREE(p->insns);
1326		p->insns_nr = 0;
1327	}
1328
1329	if (p->buffer)
1330		pipe_buffer_reference(ws, &p->buffer, NULL);
1331
1332	p->translated = 0;
1333}
1334
1335