nv50_program.c revision faa1c02546db00f69c66db18076b5b0ac86d7138
1#include "pipe/p_context.h"
2#include "pipe/p_defines.h"
3#include "pipe/p_state.h"
4#include "pipe/p_inlines.h"
5
6#include "pipe/p_shader_tokens.h"
7#include "tgsi/util/tgsi_parse.h"
8#include "tgsi/util/tgsi_util.h"
9
10#include "nv50_context.h"
11#include "nv50_state.h"
12
/* Size of the table used to hand out hardware temporaries (see r_temp). */
#define NV50_SU_MAX_TEMP 64

/* TODO:
 *
 * ARL
 * LIT - other oddities
 *
 * MSB - like MAD, but MUL+SUB
 * 	- Drop it, and instead introduce a way to negate arguments for
 * 	  the ops that support it.
 *
 * Look into inlining IMMD for ops other than MOV (make it general?)
 * 	- Maybe even relax the restrictions a bit: P_RESULT + P_IMMD can't
 * 	  be done directly, but we can emit to P_TEMP first and MOV later.
 * 	  NVIDIA does this.
 *
 * Verify half-insns work where expected, and force-disable them where
 * they don't - MUL has them forcibly disabled at the moment, as that
 * fixes POW.
 */

/* One scalar register (a single channel) as seen by the translator. */
struct nv50_reg {
	/* Register file this channel belongs to. */
	enum {
		P_TEMP   = 0,
		P_ATTR   = 1,
		P_RESULT = 2,
		P_CONST  = 3,
		P_IMMD   = 4
	} type;

	/* Index of the owning TGSI register, or -1 for anonymous
	 * registers created by the translator itself. */
	int index;

	/* Hardware slot / offset; -1 while a P_TEMP is still unassigned. */
	int hw;
	int neg;
};
42
43struct nv50_pc {
44	struct nv50_program *p;
45
46	/* hw resources */
47	struct nv50_reg *r_temp[NV50_SU_MAX_TEMP];
48
49	/* tgsi resources */
50	struct nv50_reg *temp;
51	int temp_nr;
52	struct nv50_reg *attr;
53	int attr_nr;
54	struct nv50_reg *result;
55	int result_nr;
56	struct nv50_reg *param;
57	int param_nr;
58	struct nv50_reg *immd;
59	float *immd_buf;
60	int immd_nr;
61
62	struct nv50_reg *temp_temp[8];
63	unsigned temp_temp_nr;
64};
65
66static void
67alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
68{
69	int i;
70
71	if (reg->type != P_TEMP)
72		return;
73
74	if (reg->hw >= 0) {
75		/*XXX: do this here too to catch FP temp-as-attr usage..
76		 *     not clean, but works */
77		if (pc->p->cfg.high_temp < (reg->hw + 1))
78			pc->p->cfg.high_temp = reg->hw + 1;
79		return;
80	}
81
82	for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
83		if (!(pc->r_temp[i])) {
84			pc->r_temp[i] = reg;
85			reg->hw = i;
86			if (pc->p->cfg.high_temp < (i + 1))
87				pc->p->cfg.high_temp = i + 1;
88			return;
89		}
90	}
91
92	assert(0);
93}
94
95static struct nv50_reg *
96alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
97{
98	struct nv50_reg *r;
99	int i;
100
101	if (dst && dst->type == P_TEMP && dst->hw == -1)
102		return dst;
103
104	for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
105		if (!pc->r_temp[i]) {
106			r = CALLOC_STRUCT(nv50_reg);
107			r->type = P_TEMP;
108			r->index = -1;
109			r->hw = i;
110			pc->r_temp[i] = r;
111			return r;
112		}
113	}
114
115	assert(0);
116	return NULL;
117}
118
119static void
120free_temp(struct nv50_pc *pc, struct nv50_reg *r)
121{
122	if (r->index == -1) {
123		FREE(pc->r_temp[r->hw]);
124		pc->r_temp[r->hw] = NULL;
125	}
126}
127
128static struct nv50_reg *
129temp_temp(struct nv50_pc *pc)
130{
131	if (pc->temp_temp_nr >= 8)
132		assert(0);
133
134	pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL);
135	return pc->temp_temp[pc->temp_temp_nr++];
136}
137
138static void
139kill_temp_temp(struct nv50_pc *pc)
140{
141	int i;
142
143	for (i = 0; i < pc->temp_temp_nr; i++)
144		free_temp(pc, pc->temp_temp[i]);
145	pc->temp_temp_nr = 0;
146}
147
148static int
149ctor_immd(struct nv50_pc *pc, float x, float y, float z, float w)
150{
151	pc->immd_buf = realloc(pc->immd_buf, (pc->immd_nr + 1) * 4 *
152					     sizeof(float));
153	pc->immd_buf[(pc->immd_nr * 4) + 0] = x;
154	pc->immd_buf[(pc->immd_nr * 4) + 1] = y;
155	pc->immd_buf[(pc->immd_nr * 4) + 2] = z;
156	pc->immd_buf[(pc->immd_nr * 4) + 3] = w;
157
158	return pc->immd_nr++;
159}
160
161static struct nv50_reg *
162alloc_immd(struct nv50_pc *pc, float f)
163{
164	struct nv50_reg *r = CALLOC_STRUCT(nv50_reg);
165	unsigned hw;
166
167	hw = ctor_immd(pc, f, 0, 0, 0);
168	r->type = P_IMMD;
169	r->hw = hw;
170	r->index = -1;
171	return r;
172}
173
174static struct nv50_reg *
175tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
176{
177	switch (dst->DstRegister.File) {
178	case TGSI_FILE_TEMPORARY:
179		return &pc->temp[dst->DstRegister.Index * 4 + c];
180	case TGSI_FILE_OUTPUT:
181		return &pc->result[dst->DstRegister.Index * 4 + c];
182	case TGSI_FILE_NULL:
183		return NULL;
184	default:
185		break;
186	}
187
188	return NULL;
189}
190
191static struct nv50_reg *
192tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src)
193{
194	struct nv50_reg *r = NULL;
195	unsigned c;
196
197	c = tgsi_util_get_full_src_register_extswizzle(src, chan);
198	switch (c) {
199	case TGSI_EXTSWIZZLE_X:
200	case TGSI_EXTSWIZZLE_Y:
201	case TGSI_EXTSWIZZLE_Z:
202	case TGSI_EXTSWIZZLE_W:
203		switch (src->SrcRegister.File) {
204		case TGSI_FILE_INPUT:
205			r = &pc->attr[src->SrcRegister.Index * 4 + c];
206			break;
207		case TGSI_FILE_TEMPORARY:
208			r = &pc->temp[src->SrcRegister.Index * 4 + c];
209			break;
210		case TGSI_FILE_CONSTANT:
211			r = &pc->param[src->SrcRegister.Index * 4 + c];
212			break;
213		case TGSI_FILE_IMMEDIATE:
214			r = &pc->immd[src->SrcRegister.Index * 4 + c];
215			break;
216		default:
217			assert(0);
218			break;
219		}
220		break;
221	case TGSI_EXTSWIZZLE_ZERO:
222		r = alloc_immd(pc, 0.0);
223		break;
224	case TGSI_EXTSWIZZLE_ONE:
225		r = alloc_immd(pc, 1.0);
226		break;
227	default:
228		assert(0);
229		break;
230	}
231
232	switch (tgsi_util_get_full_src_register_sign_mode(src, chan)) {
233	case TGSI_UTIL_SIGN_KEEP:
234		break;
235	default:
236		assert(0);
237		break;
238	}
239
240	return r;
241}
242
243static void
244emit(struct nv50_pc *pc, unsigned *inst)
245{
246	struct nv50_program *p = pc->p;
247
248	if (inst[0] & 1) {
249		p->insns_nr += 2;
250		p->insns = realloc(p->insns, sizeof(unsigned) * p->insns_nr);
251		memcpy(p->insns + (p->insns_nr - 2), inst, sizeof(unsigned)*2);
252	} else {
253		p->insns_nr += 1;
254		p->insns = realloc(p->insns, sizeof(unsigned) * p->insns_nr);
255		memcpy(p->insns + (p->insns_nr - 1), inst, sizeof(unsigned));
256	}
257}
258
259static INLINE void set_long(struct nv50_pc *, unsigned *);
260
261static boolean
262is_long(unsigned *inst)
263{
264	if (inst[0] & 1)
265		return TRUE;
266	return FALSE;
267}
268
269static boolean
270is_immd(unsigned *inst)
271{
272	if (is_long(inst) && (inst[1] & 3) == 3)
273		return TRUE;
274	return FALSE;
275}
276
277static INLINE void
278set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx, unsigned *inst)
279{
280	set_long(pc, inst);
281	inst[1] &= ~((0x1f << 7) | (0x3 << 12));
282	inst[1] |= (pred << 7) | (idx << 12);
283}
284
285static INLINE void
286set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx, unsigned *inst)
287{
288	set_long(pc, inst);
289	inst[1] &= ~((0x3 << 4) | (1 << 6));
290	inst[1] |= (idx << 4) | (on << 6);
291}
292
293static INLINE void
294set_long(struct nv50_pc *pc, unsigned *inst)
295{
296	if (is_long(inst))
297		return;
298
299	inst[0] |= 1;
300	set_pred(pc, 0xf, 0, inst);
301	set_pred_wr(pc, 0, 0, inst);
302}
303
304static INLINE void
305set_dst(struct nv50_pc *pc, struct nv50_reg *dst, unsigned *inst)
306{
307	if (dst->type == P_RESULT) {
308		set_long(pc, inst);
309		inst[1] |= 0x00000008;
310	}
311
312	alloc_reg(pc, dst);
313	inst[0] |= (dst->hw << 2);
314}
315
316static INLINE void
317set_immd(struct nv50_pc *pc, struct nv50_reg *imm, unsigned *inst)
318{
319	unsigned val = fui(pc->immd_buf[imm->hw]); /* XXX */
320
321	set_long(pc, inst);
322	/*XXX: can't be predicated - bits overlap.. catch cases where both
323	 *     are required and avoid them. */
324	set_pred(pc, 0, 0, inst);
325	set_pred_wr(pc, 0, 0, inst);
326
327	inst[1] |= 0x00000002 | 0x00000001;
328	inst[0] |= (val & 0x3f) << 16;
329	inst[1] |= (val >> 6) << 2;
330}
331
332static void
333emit_interp(struct nv50_pc *pc, struct nv50_reg *dst,
334	    struct nv50_reg *src, struct nv50_reg *iv, boolean noperspective)
335{
336	unsigned inst[2] = { 0, 0 };
337
338	inst[0] |= 0x80000000;
339	set_dst(pc, dst, inst);
340	alloc_reg(pc, iv);
341	inst[0] |= (iv->hw << 9);
342	alloc_reg(pc, src);
343	inst[0] |= (src->hw << 16);
344	if (noperspective)
345		inst[0] |= (1 << 25);
346
347	emit(pc, inst);
348}
349
350static void
351set_cseg(struct nv50_pc *pc, struct nv50_reg *src, unsigned *inst)
352{
353	set_long(pc, inst);
354	if (src->type == P_IMMD) {
355		inst[1] |= (NV50_CB_PMISC << 22);
356	} else {
357		if (pc->p->type == NV50_PROG_VERTEX)
358			inst[1] |= (NV50_CB_PVP << 22);
359		else
360			inst[1] |= (NV50_CB_PFP << 22);
361	}
362}
363
364static void
365emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
366{
367	unsigned inst[2] = { 0, 0 };
368
369	inst[0] |= 0x10000000;
370
371	set_dst(pc, dst, inst);
372
373	if (dst->type != P_RESULT && src->type == P_IMMD) {
374		set_immd(pc, src, inst);
375		/*XXX: 32-bit, but steals part of "half" reg space - need to
376		 *     catch and handle this case if/when we do half-regs
377		 */
378		inst[0] |= 0x00008000;
379	} else
380	if (src->type == P_IMMD || src->type == P_CONST) {
381		set_long(pc, inst);
382		set_cseg(pc, src, inst);
383		inst[0] |= (src->hw << 9);
384		inst[1] |= 0x20000000; /* src0 const? */
385	} else {
386		if (src->type == P_ATTR) {
387			set_long(pc, inst);
388			inst[1] |= 0x00200000;
389		}
390
391		alloc_reg(pc, src);
392		inst[0] |= (src->hw << 9);
393	}
394
395	/* We really should support "half" instructions here at some point,
396	 * but I don't feel confident enough about them yet.
397	 */
398	set_long(pc, inst);
399	if (is_long(inst) && !is_immd(inst)) {
400		inst[1] |= 0x04000000; /* 32-bit */
401		inst[1] |= 0x0003c000; /* "subsubop" 0xf == mov */
402	}
403
404	emit(pc, inst);
405}
406
407static boolean
408check_swap_src_0_1(struct nv50_pc *pc,
409		   struct nv50_reg **s0, struct nv50_reg **s1)
410{
411	struct nv50_reg *src0 = *s0, *src1 = *s1;
412
413	if (src0->type == P_CONST) {
414		if (src1->type != P_CONST) {
415			*s0 = src1;
416			*s1 = src0;
417			return TRUE;
418		}
419	} else
420	if (src1->type == P_ATTR) {
421		if (src0->type != P_ATTR) {
422			*s0 = src1;
423			*s1 = src0;
424			return TRUE;
425		}
426	}
427
428	return FALSE;
429}
430
431static void
432set_src_0(struct nv50_pc *pc, struct nv50_reg *src, unsigned *inst)
433{
434	if (src->type == P_ATTR) {
435		set_long(pc, inst);
436		inst[1] |= 0x00200000;
437	} else
438	if (src->type == P_CONST || src->type == P_IMMD) {
439		struct nv50_reg *temp = temp_temp(pc);
440
441		emit_mov(pc, temp, src);
442		src = temp;
443	}
444
445	alloc_reg(pc, src);
446	inst[0] |= (src->hw << 9);
447}
448
449static void
450set_src_1(struct nv50_pc *pc, struct nv50_reg *src, unsigned *inst)
451{
452	if (src->type == P_ATTR) {
453		struct nv50_reg *temp = temp_temp(pc);
454
455		emit_mov(pc, temp, src);
456		src = temp;
457	} else
458	if (src->type == P_CONST || src->type == P_IMMD) {
459		set_cseg(pc, src, inst);
460		inst[0] |= 0x00800000;
461	}
462
463	alloc_reg(pc, src);
464	inst[0] |= (src->hw << 16);
465}
466
467static void
468set_src_2(struct nv50_pc *pc, struct nv50_reg *src, unsigned *inst)
469{
470	set_long(pc, inst);
471
472	if (src->type == P_ATTR) {
473		struct nv50_reg *temp = temp_temp(pc);
474
475		emit_mov(pc, temp, src);
476		src = temp;
477	} else
478	if (src->type == P_CONST || src->type == P_IMMD) {
479		set_cseg(pc, src, inst);
480		inst[0] |= 0x01000000;
481	}
482
483	alloc_reg(pc, src);
484	inst[1] |= (src->hw << 14);
485}
486
/* Emit dst = src0 * src1 (opcode bits 0xc0000000), always as a long
 * instruction. */
static void
emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
	 struct nv50_reg *src1)
{
	unsigned inst[2] = { 0xc0000000, 0 };

	set_long(pc, inst);

	/* Operand order does not matter, so pick the encodable one. */
	check_swap_src_0_1(pc, &src0, &src1);

	set_dst(pc, dst, inst);
	set_src_0(pc, src0, inst);
	set_src_1(pc, src1, inst);

	emit(pc, inst);
}
503
/*
 * Emit dst = src0 + src1 (opcode bits 0xb0000000).  The second operand
 * goes into the src2 position when the instruction has become long by
 * then, and into the src1 position otherwise.
 */
static void
emit_add(struct nv50_pc *pc, struct nv50_reg *dst,
	 struct nv50_reg *src0, struct nv50_reg *src1)
{
	unsigned inst[2] = { 0xb0000000, 0 };

	check_swap_src_0_1(pc, &src0, &src1);

	set_dst(pc, dst, inst);
	set_src_0(pc, src0, inst);

	if (!is_long(inst))
		set_src_1(pc, src1, inst);
	else
		set_src_2(pc, src1, inst);

	emit(pc, inst);
}
522
/*
 * Emit a min/max style operation (opcode bits 0xb0000000, long form) with
 * the sub-operation selector "sub" stored at bit 29 of the second word.
 */
static void
emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst,
	    struct nv50_reg *src0, struct nv50_reg *src1)
{
	unsigned inst[2] = { 0xb0000000, 0 };

	set_long(pc, inst);
	inst[1] |= (sub << 29);

	check_swap_src_0_1(pc, &src0, &src1);

	set_dst(pc, dst, inst);
	set_src_0(pc, src0, inst);
	set_src_1(pc, src1, inst);

	emit(pc, inst);
}
540
541static void
542emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
543	 struct nv50_reg *src1)
544{
545	unsigned inst[2] = { 0, 0 };
546
547	inst[0] |= 0xb0000000;
548
549	set_long(pc, inst);
550	if (check_swap_src_0_1(pc, &src0, &src1))
551		inst[1] |= 0x04000000;
552	else
553		inst[1] |= 0x08000000;
554
555	set_dst(pc, dst, inst);
556	set_src_0(pc, src0, inst);
557	set_src_2(pc, src1, inst);
558
559	emit(pc, inst);
560}
561
/* Emit dst = src0 * src1 + src2 (opcode bits 0xe0000000). */
static void
emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
	 struct nv50_reg *src1, struct nv50_reg *src2)
{
	unsigned inst[2] = { 0xe0000000, 0 };

	/* Only the two multiplicands may trade places. */
	check_swap_src_0_1(pc, &src0, &src1);

	set_dst(pc, dst, inst);
	set_src_0(pc, src0, inst);
	set_src_1(pc, src1, inst);
	set_src_2(pc, src2, inst);

	emit(pc, inst);
}
578
/* Emit dst = src0 * src1 - src2: the MAD opcode (0xe0000000) in long form
 * with the subtract flag set in the second word. */
static void
emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
	 struct nv50_reg *src1, struct nv50_reg *src2)
{
	unsigned inst[2] = { 0xe0000000, 0 };

	set_long(pc, inst);
	inst[1] |= 0x08000000; /* src0 * src1 - src2 */

	check_swap_src_0_1(pc, &src0, &src1);

	set_dst(pc, dst, inst);
	set_src_0(pc, src0, inst);
	set_src_1(pc, src1, inst);
	set_src_2(pc, src2, inst);

	emit(pc, inst);
}
597
/*
 * Emit a single-operand operation from the 0x90000000 opcode group.  A
 * non-zero selector "sub" makes the instruction long and is stored at
 * bit 29 of the second word; selector 0 keeps the short encoding.
 */
static void
emit_flop(struct nv50_pc *pc, unsigned sub,
	  struct nv50_reg *dst, struct nv50_reg *src)
{
	unsigned inst[2] = { 0x90000000, 0 };

	if (sub != 0) {
		set_long(pc, inst);
		inst[1] |= (sub << 29);
	}

	set_dst(pc, dst, inst);
	set_src_0(pc, src, inst);

	emit(pc, inst);
}
615
/*
 * Emit the preparation step that the callers run before the EX2 flop
 * (opcode bits 0xb0000000, long form, selector 6 plus flag 0x00004000).
 */
static void
emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	unsigned inst[2] = { 0xb0000000, 0 };

	set_dst(pc, dst, inst);
	set_src_0(pc, src, inst);

	set_long(pc, inst);
	inst[1] |= (6 << 29) | 0x00004000;

	emit(pc, inst);
}
630
631/*XXX: inaccurate results.. why? */
632#define ALLOW_SET_SWAP 0
633
634static void
635emit_set(struct nv50_pc *pc, unsigned c_op, struct nv50_reg *dst,
636	 struct nv50_reg *src0, struct nv50_reg *src1)
637{
638	unsigned inst[2] = { 0, 0 };
639#if ALLOW_SET_SWAP
640	unsigned inv_cop[8] = { 0, 6, 2, 4, 3, 5, 1, 7 };
641#endif
642	struct nv50_reg *rdst;
643
644#if ALLOW_SET_SWAP
645	assert(c_op <= 7);
646	if (check_swap_src_0_1(pc, &src0, &src1))
647		c_op = inv_cop[c_op];
648#endif
649
650	rdst = dst;
651	if (dst->type != P_TEMP)
652		dst = alloc_temp(pc, NULL);
653
654	/* set.u32 */
655	set_long(pc, inst);
656	inst[0] |= 0xb0000000;
657	inst[1] |= (3 << 29);
658	inst[1] |= (c_op << 14);
659	/*XXX: breaks things, .u32 by default?
660	 *     decuda will disasm as .u16 and use .lo/.hi regs, but this
661	 *     doesn't seem to match what the hw actually does.
662	inst[1] |= 0x04000000; << breaks things.. .u32 by default?
663	 */
664	set_dst(pc, dst, inst);
665	set_src_0(pc, src0, inst);
666	set_src_1(pc, src1, inst);
667	emit(pc, inst);
668
669	/* cvt.f32.u32 */
670	inst[0] = 0xa0000001;
671	inst[1] = 0x64014780;
672	set_dst(pc, rdst, inst);
673	set_src_0(pc, dst, inst);
674	emit(pc, inst);
675
676	if (dst != rdst)
677		free_temp(pc, dst);
678}
679
/* Emit dst = floor(src) as a long cvt instruction working on 32-bit
 * floats in integer mode. */
static void
emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	unsigned inst[2] = { 0xa0000000 /* cvt */, 0 };

	set_long(pc, inst);
	inst[1] |= (6 << 29)		/* cvt */
		|  0x08000000		/* integer mode */
		|  0x04000000		/* 32 bit */
		|  ((0x1 << 3) << 14)	/* .rn */
		|  (1 << 14);		/* src .f32 */

	set_dst(pc, dst, inst);
	set_src_0(pc, src, inst);

	emit(pc, inst);
}
697
/*
 * Emit dst = pow(v, e) as a four-instruction sequence through a scratch
 * register: flop 3 on v (the flop used for LG2), multiply by e, the
 * pre-EX2 step, then flop 6 (the flop used for EX2).
 */
static void
emit_pow(struct nv50_pc *pc, struct nv50_reg *dst,
	 struct nv50_reg *v, struct nv50_reg *e)
{
	struct nv50_reg *scratch = alloc_temp(pc, NULL);

	emit_flop(pc, 3, scratch, v);
	emit_mul(pc, scratch, scratch, e);
	emit_preex2(pc, scratch, scratch);
	emit_flop(pc, 6, dst, scratch);

	free_temp(pc, scratch);
}
711
712static boolean
713nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
714{
715	const struct tgsi_full_instruction *inst = &tok->FullInstruction;
716	struct nv50_reg *rdst[4], *dst[4], *src[3][4], *temp;
717	unsigned mask, sat;
718	int i, c;
719
720	NOUVEAU_ERR("insn %p\n", tok);
721
722	mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
723	sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE;
724
725	for (c = 0; c < 4; c++) {
726		if (mask & (1 << c))
727			dst[c] = tgsi_dst(pc, c, &inst->FullDstRegisters[0]);
728		else
729			dst[c] = NULL;
730	}
731
732	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
733		for (c = 0; c < 4; c++)
734			src[i][c] = tgsi_src(pc, c, &inst->FullSrcRegisters[i]);
735	}
736
737	if (sat) {
738		for (c = 0; c < 4; c++) {
739			rdst[c] = dst[c];
740			dst[c] = temp_temp(pc);
741		}
742	}
743
744	switch (inst->Instruction.Opcode) {
745	case TGSI_OPCODE_ABS:
746		for (c = 0; c < 4; c++) {
747			unsigned inst[2] = { 0, 0 };
748
749			inst[0] = 0xa0000000; /* cvt */
750			set_long(pc, inst);
751			inst[1] |= (6 << 29); /* cvt */
752			inst[1] |= 0x04000000; /* 32 bit */
753			inst[1] |= (1 << 14); /* src .f32 */
754			inst[1] |= ((1 << 6) << 14); /* .abs */
755			set_dst(pc, dst[c], inst);
756			set_src_0(pc, src[0][c], inst);
757			emit(pc, inst);
758		}
759		break;
760	case TGSI_OPCODE_ADD:
761		for (c = 0; c < 4; c++) {
762			if (!(mask & (1 << c)))
763				continue;
764			emit_add(pc, dst[c], src[0][c], src[1][c]);
765		}
766		break;
767	case TGSI_OPCODE_COS:
768		for (c = 0; c < 4; c++) {
769			if (!(mask & (1 << c)))
770				continue;
771			emit_flop(pc, 5, dst[c], src[0][c]);
772		}
773		break;
774	case TGSI_OPCODE_DP3:
775		temp = alloc_temp(pc, NULL);
776		emit_mul(pc, temp, src[0][0], src[1][0]);
777		emit_mad(pc, temp, src[0][1], src[1][1], temp);
778		emit_mad(pc, temp, src[0][2], src[1][2], temp);
779		for (c = 0; c < 4; c++) {
780			if (!(mask & (1 << c)))
781				continue;
782			emit_mov(pc, dst[c], temp);
783		}
784		free_temp(pc, temp);
785		break;
786	case TGSI_OPCODE_DP4:
787		temp = alloc_temp(pc, NULL);
788		emit_mul(pc, temp, src[0][0], src[1][0]);
789		emit_mad(pc, temp, src[0][1], src[1][1], temp);
790		emit_mad(pc, temp, src[0][2], src[1][2], temp);
791		emit_mad(pc, temp, src[0][3], src[1][3], temp);
792		for (c = 0; c < 4; c++) {
793			if (!(mask & (1 << c)))
794				continue;
795			emit_mov(pc, dst[c], temp);
796		}
797		free_temp(pc, temp);
798		break;
799	case TGSI_OPCODE_DPH:
800		temp = alloc_temp(pc, NULL);
801		emit_mul(pc, temp, src[0][0], src[1][0]);
802		emit_mad(pc, temp, src[0][1], src[1][1], temp);
803		emit_mad(pc, temp, src[0][2], src[1][2], temp);
804		emit_add(pc, temp, src[1][3], temp);
805		for (c = 0; c < 4; c++) {
806			if (!(mask & (1 << c)))
807				continue;
808			emit_mov(pc, dst[c], temp);
809		}
810		free_temp(pc, temp);
811		break;
812	case TGSI_OPCODE_DST:
813	{
814		struct nv50_reg *one = alloc_immd(pc, 1.0);
815		emit_mov(pc, dst[0], one);
816		emit_mul(pc, dst[1], src[0][1], src[1][1]);
817		emit_mov(pc, dst[2], src[0][2]);
818		emit_mov(pc, dst[3], src[1][3]);
819		FREE(one);
820	}
821		break;
822	case TGSI_OPCODE_EX2:
823		temp = alloc_temp(pc, NULL);
824		for (c = 0; c < 4; c++) {
825			if (!(mask & (1 << c)))
826				continue;
827			emit_preex2(pc, temp, src[0][c]);
828			emit_flop(pc, 6, dst[c], temp);
829		}
830		free_temp(pc, temp);
831		break;
832	case TGSI_OPCODE_FLR:
833		for (c = 0; c < 4; c++) {
834			if (!(mask & (1 << c)))
835				continue;
836			emit_flr(pc, dst[c], src[0][c]);
837		}
838		break;
839	case TGSI_OPCODE_FRC:
840		temp = alloc_temp(pc, NULL);
841		for (c = 0; c < 4; c++) {
842			if (!(mask & (1 << c)))
843				continue;
844			emit_flr(pc, temp, src[0][c]);
845			emit_sub(pc, dst[c], src[0][c], temp);
846		}
847		free_temp(pc, temp);
848		break;
849	case TGSI_OPCODE_LG2:
850		for (c = 0; c < 4; c++) {
851			if (!(mask & (1 << c)))
852				continue;
853			emit_flop(pc, 3, dst[c], src[0][c]);
854		}
855		break;
856	case TGSI_OPCODE_MAD:
857		for (c = 0; c < 4; c++) {
858			if (!(mask & (1 << c)))
859				continue;
860			emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]);
861		}
862		break;
863	case TGSI_OPCODE_MAX:
864		for (c = 0; c < 4; c++) {
865			if (!(mask & (1 << c)))
866				continue;
867			emit_minmax(pc, 4, dst[c], src[0][c], src[1][c]);
868		}
869		break;
870	case TGSI_OPCODE_MIN:
871		for (c = 0; c < 4; c++) {
872			if (!(mask & (1 << c)))
873				continue;
874			emit_minmax(pc, 5, dst[c], src[0][c], src[1][c]);
875		}
876		break;
877	case TGSI_OPCODE_MOV:
878		for (c = 0; c < 4; c++) {
879			if (!(mask & (1 << c)))
880				continue;
881			emit_mov(pc, dst[c], src[0][c]);
882		}
883		break;
884	case TGSI_OPCODE_MUL:
885		for (c = 0; c < 4; c++) {
886			if (!(mask & (1 << c)))
887				continue;
888			emit_mul(pc, dst[c], src[0][c], src[1][c]);
889		}
890		break;
891	case TGSI_OPCODE_POW:
892		temp = alloc_temp(pc, NULL);
893		emit_pow(pc, temp, src[0][0], src[1][0]);
894		for (c = 0; c < 4; c++) {
895			if (!(mask & (1 << c)))
896				continue;
897			emit_mov(pc, dst[c], temp);
898		}
899		free_temp(pc, temp);
900		break;
901	case TGSI_OPCODE_RCP:
902		for (c = 0; c < 4; c++) {
903			if (!(mask & (1 << c)))
904				continue;
905			emit_flop(pc, 0, dst[c], src[0][c]);
906		}
907		break;
908	case TGSI_OPCODE_RSQ:
909		for (c = 0; c < 4; c++) {
910			if (!(mask & (1 << c)))
911				continue;
912			emit_flop(pc, 2, dst[c], src[0][c]);
913		}
914		break;
915	case TGSI_OPCODE_SGE:
916		for (c = 0; c < 4; c++) {
917			if (!(mask & (1 << c)))
918				continue;
919			emit_set(pc, 6, dst[c], src[0][c], src[1][c]);
920		}
921		break;
922	case TGSI_OPCODE_SIN:
923		for (c = 0; c < 4; c++) {
924			if (!(mask & (1 << c)))
925				continue;
926			emit_flop(pc, 4, dst[c], src[0][c]);
927		}
928		break;
929	case TGSI_OPCODE_SLT:
930		for (c = 0; c < 4; c++) {
931			if (!(mask & (1 << c)))
932				continue;
933			emit_set(pc, 1, dst[c], src[0][c], src[1][c]);
934		}
935		break;
936	case TGSI_OPCODE_SUB:
937		for (c = 0; c < 4; c++) {
938			if (!(mask & (1 << c)))
939				continue;
940			emit_sub(pc, dst[c], src[0][c], src[1][c]);
941		}
942		break;
943	case TGSI_OPCODE_XPD:
944		temp = alloc_temp(pc, NULL);
945		emit_mul(pc, temp, src[0][2], src[1][1]);
946		emit_msb(pc, dst[0], src[0][1], src[1][2], temp);
947		emit_mul(pc, temp, src[0][0], src[1][2]);
948		emit_msb(pc, dst[1], src[0][2], src[1][0], temp);
949		emit_mul(pc, temp, src[0][1], src[1][0]);
950		emit_msb(pc, dst[2], src[0][0], src[1][1], temp);
951		free_temp(pc, temp);
952		break;
953	case TGSI_OPCODE_END:
954		break;
955	default:
956		NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode);
957		return FALSE;
958	}
959
960	if (sat) {
961		for (c = 0; c < 4; c++) {
962			unsigned inst[2] = { 0, 0 };
963
964			if (!(mask & (1 << c)))
965				continue;
966
967			inst[0] = 0xa0000000; /* cvt */
968			set_long(pc, inst);
969			inst[1] |= (6 << 29); /* cvt */
970			inst[1] |= 0x04000000; /* 32 bit */
971			inst[1] |= (1 << 14); /* src .f32 */
972			inst[1] |= ((1 << 5) << 14); /* .sat */
973			set_dst(pc, rdst[c], inst);
974			set_src_0(pc, dst[c], inst);
975			emit(pc, inst);
976		}
977	}
978
979	kill_temp_temp(pc);
980	return TRUE;
981}
982
983static boolean
984nv50_program_tx_prep(struct nv50_pc *pc)
985{
986	struct tgsi_parse_context p;
987	boolean ret = FALSE;
988	unsigned i, c;
989
990	tgsi_parse_init(&p, pc->p->pipe.tokens);
991	while (!tgsi_parse_end_of_tokens(&p)) {
992		const union tgsi_full_token *tok = &p.FullToken;
993
994		tgsi_parse_token(&p);
995		switch (tok->Token.Type) {
996		case TGSI_TOKEN_TYPE_IMMEDIATE:
997		{
998			const struct tgsi_full_immediate *imm =
999				&p.FullToken.FullImmediate;
1000
1001			ctor_immd(pc, imm->u.ImmediateFloat32[0].Float,
1002				      imm->u.ImmediateFloat32[1].Float,
1003				      imm->u.ImmediateFloat32[2].Float,
1004				      imm->u.ImmediateFloat32[3].Float);
1005		}
1006			break;
1007		case TGSI_TOKEN_TYPE_DECLARATION:
1008		{
1009			const struct tgsi_full_declaration *d;
1010			unsigned last;
1011
1012			d = &p.FullToken.FullDeclaration;
1013			last = d->u.DeclarationRange.Last;
1014
1015			switch (d->Declaration.File) {
1016			case TGSI_FILE_TEMPORARY:
1017				if (pc->temp_nr < (last + 1))
1018					pc->temp_nr = last + 1;
1019				break;
1020			case TGSI_FILE_OUTPUT:
1021				if (pc->result_nr < (last + 1))
1022					pc->result_nr = last + 1;
1023				break;
1024			case TGSI_FILE_INPUT:
1025				if (pc->attr_nr < (last + 1))
1026					pc->attr_nr = last + 1;
1027				break;
1028			case TGSI_FILE_CONSTANT:
1029				if (pc->param_nr < (last + 1))
1030					pc->param_nr = last + 1;
1031				break;
1032			default:
1033				NOUVEAU_ERR("bad decl file %d\n",
1034					    d->Declaration.File);
1035				goto out_err;
1036			}
1037		}
1038			break;
1039		case TGSI_TOKEN_TYPE_INSTRUCTION:
1040			break;
1041		default:
1042			break;
1043		}
1044	}
1045
1046	NOUVEAU_ERR("%d temps\n", pc->temp_nr);
1047	if (pc->temp_nr) {
1048		pc->temp = calloc(pc->temp_nr * 4, sizeof(struct nv50_reg));
1049		if (!pc->temp)
1050			goto out_err;
1051
1052		for (i = 0; i < pc->temp_nr; i++) {
1053			for (c = 0; c < 4; c++) {
1054				pc->temp[i*4+c].type = P_TEMP;
1055				pc->temp[i*4+c].hw = -1;
1056				pc->temp[i*4+c].index = i;
1057			}
1058		}
1059	}
1060
1061	NOUVEAU_ERR("%d attrib regs\n", pc->attr_nr);
1062	if (pc->attr_nr) {
1063		struct nv50_reg *iv = NULL, *tmp = NULL;
1064		int aid = 0;
1065
1066		pc->attr = calloc(pc->attr_nr * 4, sizeof(struct nv50_reg));
1067		if (!pc->attr)
1068			goto out_err;
1069
1070		if (pc->p->type == NV50_PROG_FRAGMENT) {
1071			iv = alloc_temp(pc, NULL);
1072			aid++;
1073		}
1074
1075		for (i = 0; i < pc->attr_nr; i++) {
1076			struct nv50_reg *a = &pc->attr[i*4];
1077
1078			for (c = 0; c < 4; c++) {
1079				if (pc->p->type == NV50_PROG_FRAGMENT) {
1080					struct nv50_reg *at =
1081						alloc_temp(pc, NULL);
1082					pc->attr[i*4+c].type = at->type;
1083					pc->attr[i*4+c].hw = at->hw;
1084					pc->attr[i*4+c].index = at->index;
1085				} else {
1086					pc->p->cfg.vp.attr[aid/32] |=
1087						(1 << (aid % 32));
1088					pc->attr[i*4+c].type = P_ATTR;
1089					pc->attr[i*4+c].hw = aid++;
1090					pc->attr[i*4+c].index = i;
1091				}
1092			}
1093
1094			if (pc->p->type != NV50_PROG_FRAGMENT)
1095				continue;
1096
1097			emit_interp(pc, iv, iv, iv, FALSE);
1098			tmp = alloc_temp(pc, NULL);
1099			{
1100				unsigned inst[2] = { 0, 0 };
1101				inst[0]  = 0x90000000;
1102				inst[0] |= (tmp->hw << 2);
1103				emit(pc, inst);
1104			}
1105			emit_interp(pc, &a[0], &a[0], tmp, TRUE);
1106			emit_interp(pc, &a[1], &a[1], tmp, TRUE);
1107			emit_interp(pc, &a[2], &a[2], tmp, TRUE);
1108			emit_interp(pc, &a[3], &a[3], tmp, TRUE);
1109			free_temp(pc, tmp);
1110		}
1111
1112		if (iv)
1113			free_temp(pc, iv);
1114	}
1115
1116	NOUVEAU_ERR("%d result regs\n", pc->result_nr);
1117	if (pc->result_nr) {
1118		int rid = 0;
1119
1120		pc->result = calloc(pc->result_nr * 4, sizeof(struct nv50_reg));
1121		if (!pc->result)
1122			goto out_err;
1123
1124		for (i = 0; i < pc->result_nr; i++) {
1125			for (c = 0; c < 4; c++) {
1126				if (pc->p->type == NV50_PROG_FRAGMENT)
1127					pc->result[i*4+c].type = P_TEMP;
1128				else
1129					pc->result[i*4+c].type = P_RESULT;
1130				pc->result[i*4+c].hw = rid++;
1131				pc->result[i*4+c].index = i;
1132			}
1133		}
1134	}
1135
1136	NOUVEAU_ERR("%d param regs\n", pc->param_nr);
1137	if (pc->param_nr) {
1138		int rid = 0;
1139
1140		pc->param = calloc(pc->param_nr * 4, sizeof(struct nv50_reg));
1141		if (!pc->param)
1142			goto out_err;
1143
1144		for (i = 0; i < pc->param_nr; i++) {
1145			for (c = 0; c < 4; c++) {
1146				pc->param[i*4+c].type = P_CONST;
1147				pc->param[i*4+c].hw = rid++;
1148				pc->param[i*4+c].index = i;
1149			}
1150		}
1151	}
1152
1153	if (pc->immd_nr) {
1154		int rid = 0;
1155
1156		pc->immd = calloc(pc->immd_nr * 4, sizeof(struct nv50_reg));
1157		if (!pc->immd)
1158			goto out_err;
1159
1160		for (i = 0; i < pc->immd_nr; i++) {
1161			for (c = 0; c < 4; c++) {
1162				pc->immd[i*4+c].type = P_IMMD;
1163				pc->immd[i*4+c].hw = rid++;
1164				pc->immd[i*4+c].index = i;
1165			}
1166		}
1167	}
1168
1169	ret = TRUE;
1170out_err:
1171	tgsi_parse_free(&p);
1172	return ret;
1173}
1174
1175static boolean
1176nv50_program_tx(struct nv50_program *p)
1177{
1178	struct tgsi_parse_context parse;
1179	struct nv50_pc *pc;
1180	boolean ret;
1181
1182	pc = CALLOC_STRUCT(nv50_pc);
1183	if (!pc)
1184		return FALSE;
1185	pc->p = p;
1186	pc->p->cfg.high_temp = 4;
1187
1188	ret = nv50_program_tx_prep(pc);
1189	if (ret == FALSE)
1190		goto out_cleanup;
1191
1192	tgsi_parse_init(&parse, pc->p->pipe.tokens);
1193	while (!tgsi_parse_end_of_tokens(&parse)) {
1194		const union tgsi_full_token *tok = &parse.FullToken;
1195
1196		tgsi_parse_token(&parse);
1197
1198		switch (tok->Token.Type) {
1199		case TGSI_TOKEN_TYPE_INSTRUCTION:
1200			ret = nv50_program_tx_insn(pc, tok);
1201			if (ret == FALSE)
1202				goto out_err;
1203			break;
1204		default:
1205			break;
1206		}
1207	}
1208
1209	p->immd_nr = pc->immd_nr * 4;
1210	p->immd = pc->immd_buf;
1211
1212out_err:
1213	tgsi_parse_free(&parse);
1214
1215out_cleanup:
1216	return ret;
1217}
1218
1219static void
1220nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p)
1221{
1222	int i;
1223
1224	if (nv50_program_tx(p) == FALSE)
1225		assert(0);
1226	/* *not* sufficient, it's fine if last inst is long and
1227	 * NOT immd - otherwise it's fucked fucked fucked */
1228	p->insns[p->insns_nr - 1] |= 0x00000001;
1229
1230	if (p->type == NV50_PROG_VERTEX) {
1231	for (i = 0; i < p->insns_nr; i++)
1232		NOUVEAU_ERR("VP0x%08x\n", p->insns[i]);
1233	} else {
1234	for (i = 0; i < p->insns_nr; i++)
1235		NOUVEAU_ERR("FP0x%08x\n", p->insns[i]);
1236	}
1237
1238	p->translated = TRUE;
1239}
1240
1241static void
1242nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)
1243{
1244	int i;
1245
1246	for (i = 0; i < p->immd_nr; i++) {
1247		BEGIN_RING(tesla, 0x0f00, 2);
1248		OUT_RING  ((NV50_CB_PMISC << 16) | (i << 8));
1249		OUT_RING  (fui(p->immd[i]));
1250	}
1251}
1252
1253static void
1254nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
1255{
1256	struct pipe_winsys *ws = nv50->pipe.winsys;
1257	void *map;
1258
1259	if (!p->buffer)
1260		p->buffer = ws->buffer_create(ws, 0x100, 0, p->insns_nr * 4);
1261	map = ws->buffer_map(ws, p->buffer, PIPE_BUFFER_USAGE_CPU_WRITE);
1262	memcpy(map, p->insns, p->insns_nr * 4);
1263	ws->buffer_unmap(ws, p->buffer);
1264}
1265
1266void
1267nv50_vertprog_validate(struct nv50_context *nv50)
1268{
1269	struct nouveau_grobj *tesla = nv50->screen->tesla;
1270	struct nv50_program *p = nv50->vertprog;
1271	struct nouveau_stateobj *so;
1272
1273	if (!p->translated) {
1274		nv50_program_validate(nv50, p);
1275		if (!p->translated)
1276			assert(0);
1277	}
1278
1279	nv50_program_validate_data(nv50, p);
1280	nv50_program_validate_code(nv50, p);
1281
1282	so = so_new(11, 2);
1283	so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2);
1284	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
1285		  NOUVEAU_BO_HIGH, 0, 0);
1286	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
1287		  NOUVEAU_BO_LOW, 0, 0);
1288	so_method(so, tesla, 0x1650, 2);
1289	so_data  (so, p->cfg.vp.attr[0]);
1290	so_data  (so, p->cfg.vp.attr[1]);
1291	so_method(so, tesla, 0x16ac, 2);
1292	so_data  (so, 8);
1293	so_data  (so, p->cfg.high_temp);
1294	so_method(so, tesla, 0x140c, 1);
1295	so_data  (so, 0); /* program start offset */
1296	so_emit(nv50->screen->nvws, so);
1297	so_ref(NULL, &so);
1298}
1299
1300void
1301nv50_fragprog_validate(struct nv50_context *nv50)
1302{
1303	struct nouveau_grobj *tesla = nv50->screen->tesla;
1304	struct nv50_program *p = nv50->fragprog;
1305	struct nouveau_stateobj *so;
1306
1307	if (!p->translated) {
1308		nv50_program_validate(nv50, p);
1309		if (!p->translated)
1310			assert(0);
1311	}
1312
1313	nv50_program_validate_data(nv50, p);
1314	nv50_program_validate_code(nv50, p);
1315
1316	so = so_new(7, 2);
1317	so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2);
1318	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
1319		  NOUVEAU_BO_HIGH, 0, 0);
1320	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
1321		  NOUVEAU_BO_LOW, 0, 0);
1322	so_method(so, tesla, 0x198c, 1);
1323	so_data  (so, p->cfg.high_temp);
1324	so_method(so, tesla, 0x1414, 1);
1325	so_data  (so, 0); /* program start offset */
1326	so_emit(nv50->screen->nvws, so);
1327	so_ref(NULL, &so);
1328}
1329
1330void
1331nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
1332{
1333	struct pipe_winsys *ws = nv50->pipe.winsys;
1334
1335	if (p->insns_nr) {
1336		if (p->insns)
1337			FREE(p->insns);
1338		p->insns_nr = 0;
1339	}
1340
1341	if (p->buffer)
1342		pipe_buffer_reference(ws, &p->buffer, NULL);
1343
1344	p->translated = 0;
1345}
1346
1347