nv50_program.c revision 33e4d30d50344be26398a51365bea1be37487403
1#include "pipe/p_context.h"
2#include "pipe/p_defines.h"
3#include "pipe/p_state.h"
4#include "pipe/p_inlines.h"
5
6#include "pipe/p_shader_tokens.h"
7#include "tgsi/util/tgsi_parse.h"
8#include "tgsi/util/tgsi_util.h"
9
10#include "nv50_context.h"
11#include "nv50_state.h"
12
13#define NV50_SU_MAX_TEMP 64
14
15/* ARL
16 * LIT - other buggery
17 * POW
18 * SWZ - negation ARGH
19 * SAT
20 *
21 * MSB - Like MAD, but MUL+SUB
22 * 	- Fuck it off, introduce a way to negate args for ops that
23 * 	  support it.
24 *
25 * Look into inlining IMMD for ops other than MOV
26 * 	- Maybe even relax restrictions a bit, can't do P_RESULT + P_IMMD,
27 * 	  but can emit to P_TEMP first - then MOV later. NVIDIA does this
28 */
/*
 * One scalar register operand handled by the code generator.
 */
struct nv50_reg {
	/* Register file this operand lives in. */
	enum {
		P_TEMP,
		P_ATTR,
		P_RESULT,
		P_CONST,
		P_IMMD
	} type;

	/*
	 * Index of the TGSI register this component belongs to, or -1 for
	 * registers created on the fly by alloc_temp()/alloc_immd().
	 */
	int index;

	/* Hardware slot; -1 marks a P_TEMP that has no slot assigned yet. */
	int hw;

	/* NOTE(review): never read or written by the code in this file. */
	int neg;
};
42
43struct nv50_pc {
44	struct nv50_program *p;
45
46	/* hw resources */
47	struct nv50_reg *r_temp[NV50_SU_MAX_TEMP];
48
49	/* tgsi resources */
50	struct nv50_reg *temp;
51	int temp_nr;
52	struct nv50_reg *attr;
53	int attr_nr;
54	struct nv50_reg *result;
55	int result_nr;
56	struct nv50_reg *param;
57	int param_nr;
58	struct nv50_reg *immd;
59	float *immd_buf;
60	int immd_nr;
61
62	struct nv50_reg *temp_temp[8];
63	unsigned temp_temp_nr;
64};
65
66static void
67alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
68{
69	int i;
70
71	if (reg->type != P_TEMP)
72		return;
73
74	if (reg->hw >= 0) {
75		/*XXX: do this here too to catch FP temp-as-attr usage..
76		 *     not clean, but works */
77		if (pc->p->cfg.high_temp < (reg->hw + 1))
78			pc->p->cfg.high_temp = reg->hw + 1;
79		return;
80	}
81
82	for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
83		if (!(pc->r_temp[i])) {
84			pc->r_temp[i] = reg;
85			reg->hw = i;
86			if (pc->p->cfg.high_temp < (i + 1))
87				pc->p->cfg.high_temp = i + 1;
88			return;
89		}
90	}
91
92	assert(0);
93}
94
95static struct nv50_reg *
96alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
97{
98	struct nv50_reg *r;
99	int i;
100
101	if (dst && dst->type == P_TEMP && dst->hw == -1)
102		return dst;
103
104	for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
105		if (!pc->r_temp[i]) {
106			r = CALLOC_STRUCT(nv50_reg);
107			r->type = P_TEMP;
108			r->index = -1;
109			r->hw = i;
110			pc->r_temp[i] = r;
111			return r;
112		}
113	}
114
115	assert(0);
116	return NULL;
117}
118
119static void
120free_temp(struct nv50_pc *pc, struct nv50_reg *r)
121{
122	if (r->index == -1) {
123		FREE(pc->r_temp[r->hw]);
124		pc->r_temp[r->hw] = NULL;
125	}
126}
127
128static struct nv50_reg *
129temp_temp(struct nv50_pc *pc)
130{
131	if (pc->temp_temp_nr >= 8)
132		assert(0);
133
134	pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL);
135	return pc->temp_temp[pc->temp_temp_nr++];
136}
137
138static void
139kill_temp_temp(struct nv50_pc *pc)
140{
141	int i;
142
143	for (i = 0; i < pc->temp_temp_nr; i++)
144		free_temp(pc, pc->temp_temp[i]);
145	pc->temp_temp_nr = 0;
146}
147
148static int
149ctor_immd(struct nv50_pc *pc, float x, float y, float z, float w)
150{
151	pc->immd_buf = realloc(pc->immd_buf, (pc->immd_nr + 1) * 4 *
152					     sizeof(float));
153	pc->immd_buf[(pc->immd_nr * 4) + 0] = x;
154	pc->immd_buf[(pc->immd_nr * 4) + 1] = x;
155	pc->immd_buf[(pc->immd_nr * 4) + 2] = x;
156	pc->immd_buf[(pc->immd_nr * 4) + 3] = x;
157
158	return pc->immd_nr++;
159}
160
161static struct nv50_reg *
162alloc_immd(struct nv50_pc *pc, float f)
163{
164	struct nv50_reg *r = CALLOC_STRUCT(nv50_reg);
165	unsigned hw;
166
167	hw = ctor_immd(pc, f, 0, 0, 0);
168	r->type = P_IMMD;
169	r->hw = hw;
170	r->index = -1;
171	return r;
172}
173
174static struct nv50_reg *
175tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
176{
177	switch (dst->DstRegister.File) {
178	case TGSI_FILE_TEMPORARY:
179		return &pc->temp[dst->DstRegister.Index * 4 + c];
180	case TGSI_FILE_OUTPUT:
181		return &pc->result[dst->DstRegister.Index * 4 + c];
182	case TGSI_FILE_NULL:
183		return NULL;
184	default:
185		break;
186	}
187
188	return NULL;
189}
190
191static struct nv50_reg *
192tgsi_src(struct nv50_pc *pc, int c, const struct tgsi_full_src_register *src)
193{
194	/* Handle swizzling */
195	switch (c) {
196	case 0: c = src->SrcRegister.SwizzleX; break;
197	case 1: c = src->SrcRegister.SwizzleY; break;
198	case 2: c = src->SrcRegister.SwizzleZ; break;
199	case 3: c = src->SrcRegister.SwizzleW; break;
200	default:
201		assert(0);
202	}
203
204	switch (src->SrcRegister.File) {
205	case TGSI_FILE_INPUT:
206		return &pc->attr[src->SrcRegister.Index * 4 + c];
207	case TGSI_FILE_TEMPORARY:
208		return &pc->temp[src->SrcRegister.Index * 4 + c];
209	case TGSI_FILE_CONSTANT:
210		return &pc->param[src->SrcRegister.Index * 4 + c];
211	case TGSI_FILE_IMMEDIATE:
212		return &pc->immd[src->SrcRegister.Index * 4 + c];
213	default:
214		break;
215	}
216
217	return NULL;
218}
219
220static void
221emit(struct nv50_pc *pc, unsigned *inst)
222{
223	struct nv50_program *p = pc->p;
224
225	if (inst[0] & 1) {
226		p->insns_nr += 2;
227		p->insns = realloc(p->insns, sizeof(unsigned) * p->insns_nr);
228		memcpy(p->insns + (p->insns_nr - 2), inst, sizeof(unsigned)*2);
229	} else {
230		p->insns_nr += 1;
231		p->insns = realloc(p->insns, sizeof(unsigned) * p->insns_nr);
232		memcpy(p->insns + (p->insns_nr - 1), inst, sizeof(unsigned));
233	}
234}
235
236static INLINE void set_long(struct nv50_pc *, unsigned *);
237
238static boolean
239is_long(unsigned *inst)
240{
241	if (inst[0] & 1)
242		return TRUE;
243	return FALSE;
244}
245
246static boolean
247is_immd(unsigned *inst)
248{
249	if (is_long(inst) && (inst[1] & 3) == 3)
250		return TRUE;
251	return FALSE;
252}
253
254static INLINE void
255set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx, unsigned *inst)
256{
257	set_long(pc, inst);
258	inst[1] &= ~((0x1f << 7) | (0x3 << 12));
259	inst[1] |= (pred << 7) | (idx << 12);
260}
261
262static INLINE void
263set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx, unsigned *inst)
264{
265	set_long(pc, inst);
266	inst[1] &= ~((0x3 << 4) | (1 << 6));
267	inst[1] |= (idx << 4) | (on << 6);
268}
269
270static INLINE void
271set_long(struct nv50_pc *pc, unsigned *inst)
272{
273	if (is_long(inst))
274		return;
275
276	inst[0] |= 1;
277	set_pred(pc, 0xf, 0, inst);
278	set_pred_wr(pc, 0, 0, inst);
279}
280
281static INLINE void
282set_dst(struct nv50_pc *pc, struct nv50_reg *dst, unsigned *inst)
283{
284	if (dst->type == P_RESULT) {
285		set_long(pc, inst);
286		inst[1] |= 0x00000008;
287	}
288
289	alloc_reg(pc, dst);
290	inst[0] |= (dst->hw << 2);
291}
292
293static INLINE void
294set_immd(struct nv50_pc *pc, struct nv50_reg *imm, unsigned *inst)
295{
296	unsigned val = fui(pc->immd_buf[imm->hw]); /* XXX */
297
298	set_long(pc, inst);
299	/*XXX: can't be predicated - bits overlap.. catch cases where both
300	 *     are required and avoid them. */
301	set_pred(pc, 0, 0, inst);
302	set_pred_wr(pc, 0, 0, inst);
303
304	inst[1] |= 0x00000002 | 0x00000001;
305	inst[0] |= (val & 0x3f) << 16;
306	inst[1] |= (val >> 6) << 2;
307}
308
309static void
310emit_interp(struct nv50_pc *pc, struct nv50_reg *dst,
311	    struct nv50_reg *src, struct nv50_reg *iv, boolean noperspective)
312{
313	unsigned inst[2] = { 0, 0 };
314
315	inst[0] |= 0x80000000;
316	set_dst(pc, dst, inst);
317	alloc_reg(pc, iv);
318	inst[0] |= (iv->hw << 9);
319	alloc_reg(pc, src);
320	inst[0] |= (src->hw << 16);
321	if (noperspective)
322		inst[0] |= (1 << 25);
323
324	emit(pc, inst);
325}
326
327static void
328set_cseg(struct nv50_pc *pc, struct nv50_reg *src, unsigned *inst)
329{
330	set_long(pc, inst);
331	if (src->type == P_IMMD) {
332		inst[1] |= (NV50_CB_PMISC << 22);
333	} else {
334		if (pc->p->type == NV50_PROG_VERTEX)
335			inst[1] |= (NV50_CB_PVP << 22);
336		else
337			inst[1] |= (NV50_CB_PFP << 22);
338	}
339}
340
341static void
342emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
343{
344	unsigned inst[2] = { 0, 0 };
345
346	inst[0] |= 0x10000000;
347
348	set_dst(pc, dst, inst);
349
350	if (dst->type != P_RESULT && src->type == P_IMMD) {
351		set_immd(pc, src, inst);
352		/*XXX: 32-bit, but steals part of "half" reg space - need to
353		 *     catch and handle this case if/when we do half-regs
354		 */
355		inst[0] |= 0x00008000;
356	} else
357	if (src->type == P_IMMD || src->type == P_CONST) {
358		set_long(pc, inst);
359		set_cseg(pc, src, inst);
360		inst[0] |= (src->hw << 9);
361		inst[1] |= 0x20000000; /* src0 const? */
362	} else {
363		if (src->type == P_ATTR) {
364			set_long(pc, inst);
365			inst[1] |= 0x00200000;
366		}
367
368		alloc_reg(pc, src);
369		inst[0] |= (src->hw << 9);
370	}
371
372	/* We really should support "half" instructions here at some point,
373	 * but I don't feel confident enough about them yet.
374	 */
375	set_long(pc, inst);
376	if (is_long(inst) && !is_immd(inst)) {
377		inst[1] |= 0x04000000; /* 32-bit */
378		inst[1] |= 0x0003c000; /* "subsubop" 0xf == mov */
379	}
380
381	emit(pc, inst);
382}
383
384static boolean
385check_swap_src_0_1(struct nv50_pc *pc,
386		   struct nv50_reg **s0, struct nv50_reg **s1)
387{
388	struct nv50_reg *src0 = *s0, *src1 = *s1;
389
390	if (src0->type == P_CONST) {
391		if (src1->type != P_CONST) {
392			*s0 = src1;
393			*s1 = src0;
394			return TRUE;
395		}
396	} else
397	if (src1->type == P_ATTR) {
398		if (src0->type != P_ATTR) {
399			*s0 = src1;
400			*s1 = src0;
401			return TRUE;
402		}
403	}
404
405	return FALSE;
406}
407
408static void
409set_src_0(struct nv50_pc *pc, struct nv50_reg *src, unsigned *inst)
410{
411	if (src->type == P_ATTR) {
412		set_long(pc, inst);
413		inst[1] |= 0x00200000;
414	} else
415	if (src->type == P_CONST || src->type == P_IMMD) {
416		struct nv50_reg *temp = temp_temp(pc);
417
418		emit_mov(pc, temp, src);
419		src = temp;
420	}
421
422	alloc_reg(pc, src);
423	inst[0] |= (src->hw << 9);
424}
425
426static void
427set_src_1(struct nv50_pc *pc, struct nv50_reg *src, unsigned *inst)
428{
429	if (src->type == P_ATTR) {
430		struct nv50_reg *temp = temp_temp(pc);
431
432		emit_mov(pc, temp, src);
433		src = temp;
434	} else
435	if (src->type == P_CONST || src->type == P_IMMD) {
436		set_cseg(pc, src, inst);
437		inst[0] |= 0x00800000;
438	}
439
440	alloc_reg(pc, src);
441	inst[0] |= (src->hw << 16);
442}
443
444static void
445set_src_2(struct nv50_pc *pc, struct nv50_reg *src, unsigned *inst)
446{
447	set_long(pc, inst);
448
449	if (src->type == P_ATTR) {
450		struct nv50_reg *temp = temp_temp(pc);
451
452		emit_mov(pc, temp, src);
453		src = temp;
454	} else
455	if (src->type == P_CONST || src->type == P_IMMD) {
456		set_cseg(pc, src, inst);
457		inst[0] |= 0x01000000;
458	}
459
460	alloc_reg(pc, src);
461	inst[1] |= (src->hw << 14);
462}
463
/* Emit dst = src0 * src1 (opcode word 0xc0000000). */
static void
emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
	 struct nv50_reg *src1)
{
	unsigned inst[2] = { 0xc0000000, 0 };

	/* Operand order is irrelevant, so the result of the swap is unused */
	check_swap_src_0_1(pc, &src0, &src1);

	set_dst(pc, dst, inst);
	set_src_0(pc, src0, inst);
	set_src_1(pc, src1, inst);

	emit(pc, inst);
}
479
/*
 * Emit dst = src0 + src1 (opcode word 0xb0000000).  The second operand
 * goes into the src2 field when the instruction has become long by the
 * time it is encoded, and into the src1 field otherwise.
 */
static void
emit_add(struct nv50_pc *pc, struct nv50_reg *dst,
	 struct nv50_reg *src0, struct nv50_reg *src1)
{
	unsigned inst[2] = { 0xb0000000, 0 };

	check_swap_src_0_1(pc, &src0, &src1);

	set_dst(pc, dst, inst);
	set_src_0(pc, src0, inst);

	if (!is_long(inst))
		set_src_1(pc, src1, inst);
	else
		set_src_2(pc, src1, inst);

	emit(pc, inst);
}
498
/*
 * Emit a long 0xb0000000 instruction whose sub-opcode (bits 29+ of the
 * second word) is given by sub; callers pass 4 for MAX and 5 for MIN.
 */
static void
emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst,
	    struct nv50_reg *src0, struct nv50_reg *src1)
{
	unsigned inst[2] = { 0xb0000000, 0 };

	set_long(pc, inst);
	inst[1] |= (sub << 29);

	check_swap_src_0_1(pc, &src0, &src1);

	set_dst(pc, dst, inst);
	set_src_0(pc, src0, inst);
	set_src_1(pc, src1, inst);

	emit(pc, inst);
}
516
517static void
518emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
519	 struct nv50_reg *src1)
520{
521	unsigned inst[2] = { 0, 0 };
522
523	inst[0] |= 0xb0000000;
524
525	set_long(pc, inst);
526	if (check_swap_src_0_1(pc, &src0, &src1))
527		inst[1] |= 0x04000000;
528	else
529		inst[1] |= 0x08000000;
530
531	set_dst(pc, dst, inst);
532	set_src_0(pc, src0, inst);
533	set_src_2(pc, src1, inst);
534
535	emit(pc, inst);
536}
537
/* Emit dst = src0 * src1 + src2 (opcode word 0xe0000000). */
static void
emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
	 struct nv50_reg *src1, struct nv50_reg *src2)
{
	unsigned inst[2] = { 0xe0000000, 0 };

	check_swap_src_0_1(pc, &src0, &src1);

	set_dst(pc, dst, inst);
	set_src_0(pc, src0, inst);
	set_src_1(pc, src1, inst);
	set_src_2(pc, src2, inst);

	emit(pc, inst);
}
554
/* Emit dst = src0 * src1 - src2 (long 0xe0000000 with the sub flag). */
static void
emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
	 struct nv50_reg *src1, struct nv50_reg *src2)
{
	unsigned inst[2] = { 0xe0000000, 0 };

	set_long(pc, inst);
	inst[1] |= 0x08000000; /* src0 * src1 - src2 */

	check_swap_src_0_1(pc, &src0, &src1);

	set_dst(pc, dst, inst);
	set_src_0(pc, src0, inst);
	set_src_1(pc, src1, inst);
	set_src_2(pc, src2, inst);

	emit(pc, inst);
}
573
/*
 * Emit a one-operand 0x90000000 instruction.  A non-zero sub selects the
 * operation through bits 29+ of the second word and forces the long
 * form; callers use 0 RCP, 2 RSQ, 3 LG2, 4 SIN, 5 COS and 6 for EX2.
 */
static void
emit_flop(struct nv50_pc *pc, unsigned sub,
	  struct nv50_reg *dst, struct nv50_reg *src)
{
	unsigned inst[2] = { 0x90000000, 0 };

	if (sub != 0) {
		set_long(pc, inst);
		inst[1] |= (sub << 29);
	}

	set_dst(pc, dst, inst);
	set_src_0(pc, src, inst);

	emit(pc, inst);
}
591
/*
 * Emit the instruction that prepares the operand of EX2; the caller
 * feeds its result into emit_flop(pc, 6, ...).
 */
static void
emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	unsigned inst[2] = { 0xb0000000, 0 };

	set_dst(pc, dst, inst);
	set_src_0(pc, src, inst);

	set_long(pc, inst);
	inst[1] |= (6u << 29) | 0x00004000;

	emit(pc, inst);
}
606/*XXX: inaccurate results.. why? */
607#define ALLOW_SET_SWAP 0
608
609static void
610emit_set(struct nv50_pc *pc, unsigned c_op, struct nv50_reg *dst,
611	 struct nv50_reg *src0, struct nv50_reg *src1)
612{
613	unsigned inst[2] = { 0, 0 };
614#if ALLOW_SET_SWAP
615	unsigned inv_cop[8] = { 0, 6, 2, 4, 3, 5, 1, 7 };
616#endif
617	struct nv50_reg *rdst;
618
619#if ALLOW_SET_SWAP
620	assert(c_op <= 7);
621	if (check_swap_src_0_1(pc, &src0, &src1))
622		c_op = inv_cop[c_op];
623#endif
624
625	rdst = dst;
626	if (dst->type != P_TEMP)
627		dst = alloc_temp(pc, NULL);
628
629	/* set.u32 */
630	set_long(pc, inst);
631	inst[0] |= 0xb0000000;
632	inst[1] |= (3 << 29);
633	inst[1] |= (c_op << 14);
634	/*XXX: breaks things, .u32 by default?
635	 *     decuda will disasm as .u16 and use .lo/.hi regs, but this
636	 *     doesn't seem to match what the hw actually does.
637	inst[1] |= 0x04000000; << breaks things.. .u32 by default?
638	 */
639	set_dst(pc, dst, inst);
640	set_src_0(pc, src0, inst);
641	set_src_1(pc, src1, inst);
642	emit(pc, inst);
643
644	/* cvt.f32.u32 */
645	inst[0] = 0xa0000001;
646	inst[1] = 0x64014780;
647	set_dst(pc, rdst, inst);
648	set_src_0(pc, dst, inst);
649	emit(pc, inst);
650
651	if (dst != rdst)
652		free_temp(pc, dst);
653}
654
/*
 * Emit dst = floor(src) as a long cvt instruction.
 *
 * The opcode word is loaded before set_long() is called.  Assigning it
 * afterwards wiped the long bit again, so emit() wrote a single word and
 * silently dropped every flag placed in inst[1].
 */
static void
emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	unsigned inst[2] = { 0xa0000000, 0 }; /* cvt */

	set_long(pc, inst);
	inst[1] |= (6u << 29); /* cvt */
	inst[1] |= 0x08000000; /* integer mode */
	inst[1] |= 0x04000000; /* 32 bit */
	inst[1] |= ((0x1 << 3)) << 14; /* .rn */
	inst[1] |= (1 << 14); /* src .f32 */
	set_dst(pc, dst, inst);
	set_src_0(pc, src, inst);

	emit(pc, inst);
}
672
673static boolean
674nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
675{
676	const struct tgsi_full_instruction *inst = &tok->FullInstruction;
677	struct nv50_reg *dst[4], *src[3][4], *temp;
678	unsigned mask;
679	int i, c;
680
681	NOUVEAU_ERR("insn %p\n", tok);
682
683	mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
684
685	for (c = 0; c < 4; c++) {
686		if (mask & (1 << c))
687			dst[c] = tgsi_dst(pc, c, &inst->FullDstRegisters[0]);
688		else
689			dst[c] = NULL;
690	}
691
692	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
693		for (c = 0; c < 4; c++)
694			src[i][c] = tgsi_src(pc, c, &inst->FullSrcRegisters[i]);
695	}
696
697	switch (inst->Instruction.Opcode) {
698	case TGSI_OPCODE_ABS:
699		for (c = 0; c < 4; c++) {
700			unsigned inst[2] = { 0, 0 };
701
702			set_long(pc, inst);
703			inst[0] = 0xa0000000; /* cvt */
704			inst[1] |= (6 << 29); /* cvt */
705			inst[1] |= 0x04000000; /* 32 bit */
706			inst[1] |= (1 << 14); /* src .f32 */
707			inst[1] |= ((1 << 6) << 14); /* .abs */
708			set_dst(pc, dst[c], inst);
709			set_src_0(pc, src[0][c], inst);
710			emit(pc, inst);
711		}
712		break;
713	case TGSI_OPCODE_ADD:
714		for (c = 0; c < 4; c++) {
715			if (!(mask & (1 << c)))
716				continue;
717			emit_add(pc, dst[c], src[0][c], src[1][c]);
718		}
719		break;
720	case TGSI_OPCODE_COS:
721		for (c = 0; c < 4; c++) {
722			if (!(mask & (1 << c)))
723				continue;
724			emit_flop(pc, 5, dst[c], src[0][c]);
725		}
726		break;
727	case TGSI_OPCODE_DP3:
728		temp = alloc_temp(pc, NULL);
729		emit_mul(pc, temp, src[0][0], src[1][0]);
730		emit_mad(pc, temp, src[0][1], src[1][1], temp);
731		emit_mad(pc, temp, src[0][2], src[1][2], temp);
732		for (c = 0; c < 4; c++) {
733			if (!(mask & (1 << c)))
734				continue;
735			emit_mov(pc, dst[c], temp);
736		}
737		free_temp(pc, temp);
738		break;
739	case TGSI_OPCODE_DP4:
740		temp = alloc_temp(pc, NULL);
741		emit_mul(pc, temp, src[0][0], src[1][0]);
742		emit_mad(pc, temp, src[0][1], src[1][1], temp);
743		emit_mad(pc, temp, src[0][2], src[1][2], temp);
744		emit_mad(pc, temp, src[0][3], src[1][3], temp);
745		for (c = 0; c < 4; c++) {
746			if (!(mask & (1 << c)))
747				continue;
748			emit_mov(pc, dst[c], temp);
749		}
750		free_temp(pc, temp);
751		break;
752	case TGSI_OPCODE_DPH:
753		temp = alloc_temp(pc, NULL);
754		emit_mul(pc, temp, src[0][0], src[1][0]);
755		emit_mad(pc, temp, src[0][1], src[1][1], temp);
756		emit_mad(pc, temp, src[0][2], src[1][2], temp);
757		emit_add(pc, temp, src[1][3], temp);
758		for (c = 0; c < 4; c++) {
759			if (!(mask & (1 << c)))
760				continue;
761			emit_mov(pc, dst[c], temp);
762		}
763		free_temp(pc, temp);
764		break;
765	case TGSI_OPCODE_DST:
766	{
767		struct nv50_reg *one = alloc_immd(pc, 1.0);
768		emit_mov(pc, dst[0], one);
769		emit_mul(pc, dst[1], src[0][1], src[1][1]);
770		emit_mov(pc, dst[2], src[0][2]);
771		emit_mov(pc, dst[3], src[1][3]);
772		FREE(one);
773	}
774		break;
775	case TGSI_OPCODE_EX2:
776		temp = alloc_temp(pc, NULL);
777		for (c = 0; c < 4; c++) {
778			if (!(mask & (1 << c)))
779				continue;
780			emit_preex2(pc, temp, src[0][c]);
781			emit_flop(pc, 6, dst[c], temp);
782		}
783		free_temp(pc, temp);
784		break;
785	case TGSI_OPCODE_FLR:
786		for (c = 0; c < 4; c++) {
787			if (!(mask & (1 << c)))
788				continue;
789			emit_flr(pc, dst[c], src[0][c]);
790		}
791		break;
792	case TGSI_OPCODE_FRC:
793		temp = alloc_temp(pc, NULL);
794		for (c = 0; c < 4; c++) {
795			if (!(mask & (1 << c)))
796				continue;
797			emit_flr(pc, temp, src[0][c]);
798			emit_sub(pc, dst[c], src[0][c], temp);
799		}
800		free_temp(pc, temp);
801		break;
802	case TGSI_OPCODE_LG2:
803		for (c = 0; c < 4; c++) {
804			if (!(mask & (1 << c)))
805				continue;
806			emit_flop(pc, 3, dst[c], src[0][c]);
807		}
808		break;
809	case TGSI_OPCODE_MAD:
810		for (c = 0; c < 4; c++) {
811			if (!(mask & (1 << c)))
812				continue;
813			emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]);
814		}
815		break;
816	case TGSI_OPCODE_MAX:
817		for (c = 0; c < 4; c++) {
818			if (!(mask & (1 << c)))
819				continue;
820			emit_minmax(pc, 4, dst[c], src[0][c], src[1][c]);
821		}
822		break;
823	case TGSI_OPCODE_MIN:
824		for (c = 0; c < 4; c++) {
825			if (!(mask & (1 << c)))
826				continue;
827			emit_minmax(pc, 5, dst[c], src[0][c], src[1][c]);
828		}
829		break;
830	case TGSI_OPCODE_MOV:
831		for (c = 0; c < 4; c++) {
832			if (!(mask & (1 << c)))
833				continue;
834			emit_mov(pc, dst[c], src[0][c]);
835		}
836		break;
837	case TGSI_OPCODE_MUL:
838		for (c = 0; c < 4; c++) {
839			if (!(mask & (1 << c)))
840				continue;
841			emit_mul(pc, dst[c], src[0][c], src[1][c]);
842		}
843		break;
844	case TGSI_OPCODE_RCP:
845		for (c = 0; c < 4; c++) {
846			if (!(mask & (1 << c)))
847				continue;
848			emit_flop(pc, 0, dst[c], src[0][c]);
849		}
850		break;
851	case TGSI_OPCODE_RSQ:
852		for (c = 0; c < 4; c++) {
853			if (!(mask & (1 << c)))
854				continue;
855			emit_flop(pc, 2, dst[c], src[0][c]);
856		}
857		break;
858	case TGSI_OPCODE_SGE:
859		for (c = 0; c < 4; c++) {
860			if (!(mask & (1 << c)))
861				continue;
862			emit_set(pc, 6, dst[c], src[0][c], src[1][c]);
863		}
864		break;
865	case TGSI_OPCODE_SIN:
866		for (c = 0; c < 4; c++) {
867			if (!(mask & (1 << c)))
868				continue;
869			emit_flop(pc, 4, dst[c], src[0][c]);
870		}
871		break;
872	case TGSI_OPCODE_SLT:
873		for (c = 0; c < 4; c++) {
874			if (!(mask & (1 << c)))
875				continue;
876			emit_set(pc, 1, dst[c], src[0][c], src[1][c]);
877		}
878		break;
879	case TGSI_OPCODE_SUB:
880		for (c = 0; c < 4; c++) {
881			if (!(mask & (1 << c)))
882				continue;
883			emit_sub(pc, dst[c], src[0][c], src[1][c]);
884		}
885		break;
886	case TGSI_OPCODE_XPD:
887		temp = alloc_temp(pc, NULL);
888		emit_mul(pc, temp, src[0][2], src[1][1]);
889		emit_msb(pc, dst[0], src[0][1], src[1][2], temp);
890		emit_mul(pc, temp, src[0][0], src[1][2]);
891		emit_msb(pc, dst[1], src[0][2], src[1][0], temp);
892		emit_mul(pc, temp, src[0][1], src[1][0]);
893		emit_msb(pc, dst[2], src[0][0], src[1][1], temp);
894		free_temp(pc, temp);
895		break;
896	case TGSI_OPCODE_END:
897		break;
898	default:
899		NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode);
900		return FALSE;
901	}
902
903	kill_temp_temp(pc);
904	return TRUE;
905}
906
907static boolean
908nv50_program_tx_prep(struct nv50_pc *pc)
909{
910	struct tgsi_parse_context p;
911	boolean ret = FALSE;
912	unsigned i, c;
913
914	tgsi_parse_init(&p, pc->p->pipe.tokens);
915	while (!tgsi_parse_end_of_tokens(&p)) {
916		const union tgsi_full_token *tok = &p.FullToken;
917
918		tgsi_parse_token(&p);
919		switch (tok->Token.Type) {
920		case TGSI_TOKEN_TYPE_IMMEDIATE:
921		{
922			const struct tgsi_full_immediate *imm =
923				&p.FullToken.FullImmediate;
924
925			ctor_immd(pc, imm->u.ImmediateFloat32[0].Float,
926				      imm->u.ImmediateFloat32[1].Float,
927				      imm->u.ImmediateFloat32[2].Float,
928				      imm->u.ImmediateFloat32[3].Float);
929		}
930			break;
931		case TGSI_TOKEN_TYPE_DECLARATION:
932		{
933			const struct tgsi_full_declaration *d;
934			unsigned last;
935
936			d = &p.FullToken.FullDeclaration;
937			last = d->u.DeclarationRange.Last;
938
939			switch (d->Declaration.File) {
940			case TGSI_FILE_TEMPORARY:
941				if (pc->temp_nr < (last + 1))
942					pc->temp_nr = last + 1;
943				break;
944			case TGSI_FILE_OUTPUT:
945				if (pc->result_nr < (last + 1))
946					pc->result_nr = last + 1;
947				break;
948			case TGSI_FILE_INPUT:
949				if (pc->attr_nr < (last + 1))
950					pc->attr_nr = last + 1;
951				break;
952			case TGSI_FILE_CONSTANT:
953				if (pc->param_nr < (last + 1))
954					pc->param_nr = last + 1;
955				break;
956			default:
957				NOUVEAU_ERR("bad decl file %d\n",
958					    d->Declaration.File);
959				goto out_err;
960			}
961		}
962			break;
963		case TGSI_TOKEN_TYPE_INSTRUCTION:
964			break;
965		default:
966			break;
967		}
968	}
969
970	NOUVEAU_ERR("%d temps\n", pc->temp_nr);
971	if (pc->temp_nr) {
972		pc->temp = calloc(pc->temp_nr * 4, sizeof(struct nv50_reg));
973		if (!pc->temp)
974			goto out_err;
975
976		for (i = 0; i < pc->temp_nr; i++) {
977			for (c = 0; c < 4; c++) {
978				pc->temp[i*4+c].type = P_TEMP;
979				pc->temp[i*4+c].hw = -1;
980				pc->temp[i*4+c].index = i;
981			}
982		}
983	}
984
985	NOUVEAU_ERR("%d attrib regs\n", pc->attr_nr);
986	if (pc->attr_nr) {
987		struct nv50_reg *iv = NULL, *tmp = NULL;
988		int aid = 0;
989
990		pc->attr = calloc(pc->attr_nr * 4, sizeof(struct nv50_reg));
991		if (!pc->attr)
992			goto out_err;
993
994		if (pc->p->type == NV50_PROG_FRAGMENT) {
995			iv = alloc_temp(pc, NULL);
996			aid++;
997		}
998
999		for (i = 0; i < pc->attr_nr; i++) {
1000			struct nv50_reg *a = &pc->attr[i*4];
1001
1002			for (c = 0; c < 4; c++) {
1003				if (pc->p->type == NV50_PROG_FRAGMENT) {
1004					struct nv50_reg *at =
1005						alloc_temp(pc, NULL);
1006					pc->attr[i*4+c].type = at->type;
1007					pc->attr[i*4+c].hw = at->hw;
1008					pc->attr[i*4+c].index = at->index;
1009				} else {
1010					pc->p->cfg.vp.attr[aid/32] |=
1011						(1 << (aid % 32));
1012					pc->attr[i*4+c].type = P_ATTR;
1013					pc->attr[i*4+c].hw = aid++;
1014					pc->attr[i*4+c].index = i;
1015				}
1016			}
1017
1018			if (pc->p->type != NV50_PROG_FRAGMENT)
1019				continue;
1020
1021			emit_interp(pc, iv, iv, iv, FALSE);
1022			tmp = alloc_temp(pc, NULL);
1023			{
1024				unsigned inst[2] = { 0, 0 };
1025				inst[0]  = 0x90000000;
1026				inst[0] |= (tmp->hw << 2);
1027				emit(pc, inst);
1028			}
1029			emit_interp(pc, &a[0], &a[0], tmp, TRUE);
1030			emit_interp(pc, &a[1], &a[1], tmp, TRUE);
1031			emit_interp(pc, &a[2], &a[2], tmp, TRUE);
1032			emit_interp(pc, &a[3], &a[3], tmp, TRUE);
1033			free_temp(pc, tmp);
1034		}
1035
1036		if (iv)
1037			free_temp(pc, iv);
1038	}
1039
1040	NOUVEAU_ERR("%d result regs\n", pc->result_nr);
1041	if (pc->result_nr) {
1042		int rid = 0;
1043
1044		pc->result = calloc(pc->result_nr * 4, sizeof(struct nv50_reg));
1045		if (!pc->result)
1046			goto out_err;
1047
1048		for (i = 0; i < pc->result_nr; i++) {
1049			for (c = 0; c < 4; c++) {
1050				if (pc->p->type == NV50_PROG_FRAGMENT)
1051					pc->result[i*4+c].type = P_TEMP;
1052				else
1053					pc->result[i*4+c].type = P_RESULT;
1054				pc->result[i*4+c].hw = rid++;
1055				pc->result[i*4+c].index = i;
1056			}
1057		}
1058	}
1059
1060	NOUVEAU_ERR("%d param regs\n", pc->param_nr);
1061	if (pc->param_nr) {
1062		int rid = 0;
1063
1064		pc->param = calloc(pc->param_nr * 4, sizeof(struct nv50_reg));
1065		if (!pc->param)
1066			goto out_err;
1067
1068		for (i = 0; i < pc->param_nr; i++) {
1069			for (c = 0; c < 4; c++) {
1070				pc->param[i*4+c].type = P_CONST;
1071				pc->param[i*4+c].hw = rid++;
1072				pc->param[i*4+c].index = i;
1073			}
1074		}
1075	}
1076
1077	if (pc->immd_nr) {
1078		int rid = 0;
1079
1080		pc->immd = calloc(pc->immd_nr * 4, sizeof(struct nv50_reg));
1081		if (!pc->immd)
1082			goto out_err;
1083
1084		for (i = 0; i < pc->immd_nr; i++) {
1085			for (c = 0; c < 4; c++) {
1086				pc->immd[i*4+c].type = P_IMMD;
1087				pc->immd[i*4+c].hw = rid++;
1088				pc->immd[i*4+c].index = i;
1089			}
1090		}
1091	}
1092
1093	ret = TRUE;
1094out_err:
1095	tgsi_parse_free(&p);
1096	return ret;
1097}
1098
1099static boolean
1100nv50_program_tx(struct nv50_program *p)
1101{
1102	struct tgsi_parse_context parse;
1103	struct nv50_pc *pc;
1104	boolean ret;
1105
1106	pc = CALLOC_STRUCT(nv50_pc);
1107	if (!pc)
1108		return FALSE;
1109	pc->p = p;
1110	pc->p->cfg.high_temp = 4;
1111
1112	ret = nv50_program_tx_prep(pc);
1113	if (ret == FALSE)
1114		goto out_cleanup;
1115
1116	tgsi_parse_init(&parse, pc->p->pipe.tokens);
1117	while (!tgsi_parse_end_of_tokens(&parse)) {
1118		const union tgsi_full_token *tok = &parse.FullToken;
1119
1120		tgsi_parse_token(&parse);
1121
1122		switch (tok->Token.Type) {
1123		case TGSI_TOKEN_TYPE_INSTRUCTION:
1124			ret = nv50_program_tx_insn(pc, tok);
1125			if (ret == FALSE)
1126				goto out_err;
1127			break;
1128		default:
1129			break;
1130		}
1131	}
1132
1133	p->immd_nr = pc->immd_nr * 4;
1134	p->immd = pc->immd_buf;
1135
1136out_err:
1137	tgsi_parse_free(&parse);
1138
1139out_cleanup:
1140	return ret;
1141}
1142
1143static void
1144nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p)
1145{
1146	int i;
1147
1148	if (nv50_program_tx(p) == FALSE)
1149		assert(0);
1150	/* *not* sufficient, it's fine if last inst is long and
1151	 * NOT immd - otherwise it's fucked fucked fucked */
1152	p->insns[p->insns_nr - 1] |= 0x00000001;
1153
1154	if (p->type == NV50_PROG_VERTEX) {
1155	for (i = 0; i < p->insns_nr; i++)
1156		NOUVEAU_ERR("VP0x%08x\n", p->insns[i]);
1157	} else {
1158	for (i = 0; i < p->insns_nr; i++)
1159		NOUVEAU_ERR("FP0x%08x\n", p->insns[i]);
1160	}
1161
1162	p->translated = TRUE;
1163}
1164
1165static void
1166nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)
1167{
1168	int i;
1169
1170	for (i = 0; i < p->immd_nr; i++) {
1171		BEGIN_RING(tesla, 0x0f00, 2);
1172		OUT_RING  ((NV50_CB_PMISC << 16) | (i << 8));
1173		OUT_RING  (fui(p->immd[i]));
1174	}
1175}
1176
1177static void
1178nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
1179{
1180	struct pipe_winsys *ws = nv50->pipe.winsys;
1181	void *map;
1182
1183	if (!p->buffer)
1184		p->buffer = ws->buffer_create(ws, 0x100, 0, p->insns_nr * 4);
1185	map = ws->buffer_map(ws, p->buffer, PIPE_BUFFER_USAGE_CPU_WRITE);
1186	memcpy(map, p->insns, p->insns_nr * 4);
1187	ws->buffer_unmap(ws, p->buffer);
1188}
1189
1190void
1191nv50_vertprog_validate(struct nv50_context *nv50)
1192{
1193	struct nouveau_grobj *tesla = nv50->screen->tesla;
1194	struct nv50_program *p = nv50->vertprog;
1195	struct nouveau_stateobj *so;
1196
1197	if (!p->translated) {
1198		nv50_program_validate(nv50, p);
1199		if (!p->translated)
1200			assert(0);
1201	}
1202
1203	nv50_program_validate_data(nv50, p);
1204	nv50_program_validate_code(nv50, p);
1205
1206	so = so_new(11, 2);
1207	so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2);
1208	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
1209		  NOUVEAU_BO_HIGH, 0, 0);
1210	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
1211		  NOUVEAU_BO_LOW, 0, 0);
1212	so_method(so, tesla, 0x1650, 2);
1213	so_data  (so, p->cfg.vp.attr[0]);
1214	so_data  (so, p->cfg.vp.attr[1]);
1215	so_method(so, tesla, 0x16ac, 2);
1216	so_data  (so, 8);
1217	so_data  (so, p->cfg.high_temp);
1218	so_method(so, tesla, 0x140c, 1);
1219	so_data  (so, 0); /* program start offset */
1220	so_emit(nv50->screen->nvws, so);
1221	so_ref(NULL, &so);
1222}
1223
1224void
1225nv50_fragprog_validate(struct nv50_context *nv50)
1226{
1227	struct nouveau_grobj *tesla = nv50->screen->tesla;
1228	struct nv50_program *p = nv50->fragprog;
1229	struct nouveau_stateobj *so;
1230
1231	if (!p->translated) {
1232		nv50_program_validate(nv50, p);
1233		if (!p->translated)
1234			assert(0);
1235	}
1236
1237	nv50_program_validate_data(nv50, p);
1238	nv50_program_validate_code(nv50, p);
1239
1240	so = so_new(7, 2);
1241	so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2);
1242	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
1243		  NOUVEAU_BO_HIGH, 0, 0);
1244	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
1245		  NOUVEAU_BO_LOW, 0, 0);
1246	so_method(so, tesla, 0x198c, 1);
1247	so_data  (so, p->cfg.high_temp);
1248	so_method(so, tesla, 0x1414, 1);
1249	so_data  (so, 0); /* program start offset */
1250	so_emit(nv50->screen->nvws, so);
1251	so_ref(NULL, &so);
1252}
1253
1254void
1255nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
1256{
1257	struct pipe_winsys *ws = nv50->pipe.winsys;
1258
1259	if (p->insns_nr) {
1260		if (p->insns)
1261			FREE(p->insns);
1262		p->insns_nr = 0;
1263	}
1264
1265	if (p->buffer)
1266		pipe_buffer_reference(ws, &p->buffer, NULL);
1267
1268	p->translated = 0;
1269}
1270
1271