nv50_program.c revision 9a37a56c8ab8c64bdadb1e1e807f885d6a5e3121
1#include "pipe/p_context.h"
2#include "pipe/p_defines.h"
3#include "pipe/p_state.h"
4#include "pipe/p_inlines.h"
5
6#include "pipe/p_shader_tokens.h"
7#include "tgsi/util/tgsi_parse.h"
8#include "tgsi/util/tgsi_util.h"
9
10#include "nv50_context.h"
11#include "nv50_state.h"
12
13#define NV50_SU_MAX_TEMP 64
14
15/* ARL - gallium craps itself on progs/vp/arl.txt
16 *
17 * MSB - Like MAD, but MUL+SUB
18 * 	- Fuck it off, introduce a way to negate args for ops that
19 * 	  support it.
20 *
21 * Look into inlining IMMD for ops other than MOV (make it general?)
22 * 	- Maybe even relax restrictions a bit, can't do P_RESULT + P_IMMD,
23 * 	  but can emit to P_TEMP first - then MOV later. NVIDIA does this
24 *
25 * Hmmm.. what happens if we have src1+src2 both consts.. ouch !
26 *
27 * Verify half-insns work where expected - and force disable them where they
28 * don't work - MUL has it forcibly disabled atm as it fixes POW..
29 */
30struct nv50_reg {
31	enum {
32		P_TEMP,
33		P_ATTR,
34		P_RESULT,
35		P_CONST,
36		P_IMMD
37	} type;
38	int index;
39
40	int hw;
41	int neg;
42};
43
44struct nv50_pc {
45	struct nv50_program *p;
46
47	/* hw resources */
48	struct nv50_reg *r_temp[NV50_SU_MAX_TEMP];
49
50	/* tgsi resources */
51	struct nv50_reg *temp;
52	int temp_nr;
53	struct nv50_reg *attr;
54	int attr_nr;
55	struct nv50_reg *result;
56	int result_nr;
57	struct nv50_reg *param;
58	int param_nr;
59	struct nv50_reg *immd;
60	float *immd_buf;
61	int immd_nr;
62
63	struct nv50_reg *temp_temp[8];
64	unsigned temp_temp_nr;
65};
66
67static void
68alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
69{
70	int i;
71
72	if (reg->type != P_TEMP)
73		return;
74
75	if (reg->hw >= 0) {
76		/*XXX: do this here too to catch FP temp-as-attr usage..
77		 *     not clean, but works */
78		if (pc->p->cfg.high_temp < (reg->hw + 1))
79			pc->p->cfg.high_temp = reg->hw + 1;
80		return;
81	}
82
83	for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
84		if (!(pc->r_temp[i])) {
85			pc->r_temp[i] = reg;
86			reg->hw = i;
87			if (pc->p->cfg.high_temp < (i + 1))
88				pc->p->cfg.high_temp = i + 1;
89			return;
90		}
91	}
92
93	assert(0);
94}
95
96static struct nv50_reg *
97alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
98{
99	struct nv50_reg *r;
100	int i;
101
102	if (dst && dst->type == P_TEMP && dst->hw == -1)
103		return dst;
104
105	for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
106		if (!pc->r_temp[i]) {
107			r = CALLOC_STRUCT(nv50_reg);
108			r->type = P_TEMP;
109			r->index = -1;
110			r->hw = i;
111			pc->r_temp[i] = r;
112			return r;
113		}
114	}
115
116	assert(0);
117	return NULL;
118}
119
120static void
121free_temp(struct nv50_pc *pc, struct nv50_reg *r)
122{
123	if (r->index == -1) {
124		FREE(pc->r_temp[r->hw]);
125		pc->r_temp[r->hw] = NULL;
126	}
127}
128
129static struct nv50_reg *
130temp_temp(struct nv50_pc *pc)
131{
132	if (pc->temp_temp_nr >= 8)
133		assert(0);
134
135	pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL);
136	return pc->temp_temp[pc->temp_temp_nr++];
137}
138
139static void
140kill_temp_temp(struct nv50_pc *pc)
141{
142	int i;
143
144	for (i = 0; i < pc->temp_temp_nr; i++)
145		free_temp(pc, pc->temp_temp[i]);
146	pc->temp_temp_nr = 0;
147}
148
149static int
150ctor_immd(struct nv50_pc *pc, float x, float y, float z, float w)
151{
152	pc->immd_buf = realloc(pc->immd_buf, (pc->immd_nr + 1) * 4 *
153					     sizeof(float));
154	pc->immd_buf[(pc->immd_nr * 4) + 0] = x;
155	pc->immd_buf[(pc->immd_nr * 4) + 1] = y;
156	pc->immd_buf[(pc->immd_nr * 4) + 2] = z;
157	pc->immd_buf[(pc->immd_nr * 4) + 3] = w;
158
159	return pc->immd_nr++;
160}
161
162static struct nv50_reg *
163alloc_immd(struct nv50_pc *pc, float f)
164{
165	struct nv50_reg *r = CALLOC_STRUCT(nv50_reg);
166	unsigned hw;
167
168	hw = ctor_immd(pc, f, 0, 0, 0) * 4;
169	r->type = P_IMMD;
170	r->hw = hw;
171	r->index = -1;
172	return r;
173}
174
175static void
176emit(struct nv50_pc *pc, unsigned *inst)
177{
178	struct nv50_program *p = pc->p;
179
180	if (inst[0] & 1) {
181		p->insns_nr += 2;
182		p->insns = realloc(p->insns, sizeof(unsigned) * p->insns_nr);
183		memcpy(p->insns + (p->insns_nr - 2), inst, sizeof(unsigned)*2);
184	} else {
185		p->insns_nr += 1;
186		p->insns = realloc(p->insns, sizeof(unsigned) * p->insns_nr);
187		memcpy(p->insns + (p->insns_nr - 1), inst, sizeof(unsigned));
188	}
189}
190
191static INLINE void set_long(struct nv50_pc *, unsigned *);
192
193static boolean
194is_long(unsigned *inst)
195{
196	if (inst[0] & 1)
197		return TRUE;
198	return FALSE;
199}
200
201static boolean
202is_immd(unsigned *inst)
203{
204	if (is_long(inst) && (inst[1] & 3) == 3)
205		return TRUE;
206	return FALSE;
207}
208
209static INLINE void
210set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx, unsigned *inst)
211{
212	set_long(pc, inst);
213	inst[1] &= ~((0x1f << 7) | (0x3 << 12));
214	inst[1] |= (pred << 7) | (idx << 12);
215}
216
217static INLINE void
218set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx, unsigned *inst)
219{
220	set_long(pc, inst);
221	inst[1] &= ~((0x3 << 4) | (1 << 6));
222	inst[1] |= (idx << 4) | (on << 6);
223}
224
225static INLINE void
226set_long(struct nv50_pc *pc, unsigned *inst)
227{
228	if (is_long(inst))
229		return;
230
231	inst[0] |= 1;
232	set_pred(pc, 0xf, 0, inst);
233	set_pred_wr(pc, 0, 0, inst);
234}
235
236static INLINE void
237set_dst(struct nv50_pc *pc, struct nv50_reg *dst, unsigned *inst)
238{
239	if (dst->type == P_RESULT) {
240		set_long(pc, inst);
241		inst[1] |= 0x00000008;
242	}
243
244	alloc_reg(pc, dst);
245	inst[0] |= (dst->hw << 2);
246}
247
248static INLINE void
249set_immd(struct nv50_pc *pc, struct nv50_reg *imm, unsigned *inst)
250{
251	unsigned val = fui(pc->immd_buf[imm->hw]); /* XXX */
252
253	set_long(pc, inst);
254	/*XXX: can't be predicated - bits overlap.. catch cases where both
255	 *     are required and avoid them. */
256	set_pred(pc, 0, 0, inst);
257	set_pred_wr(pc, 0, 0, inst);
258
259	inst[1] |= 0x00000002 | 0x00000001;
260	inst[0] |= (val & 0x3f) << 16;
261	inst[1] |= (val >> 6) << 2;
262}
263
264static void
265emit_interp(struct nv50_pc *pc, struct nv50_reg *dst,
266	    struct nv50_reg *src, struct nv50_reg *iv, boolean noperspective)
267{
268	unsigned inst[2] = { 0, 0 };
269
270	inst[0] |= 0x80000000;
271	set_dst(pc, dst, inst);
272	alloc_reg(pc, iv);
273	inst[0] |= (iv->hw << 9);
274	alloc_reg(pc, src);
275	inst[0] |= (src->hw << 16);
276	if (noperspective)
277		inst[0] |= (1 << 25);
278
279	emit(pc, inst);
280}
281
282static void
283set_cseg(struct nv50_pc *pc, struct nv50_reg *src, unsigned *inst)
284{
285	set_long(pc, inst);
286	if (src->type == P_IMMD) {
287		inst[1] |= (NV50_CB_PMISC << 22);
288	} else {
289		if (pc->p->type == NV50_PROG_VERTEX)
290			inst[1] |= (NV50_CB_PVP << 22);
291		else
292			inst[1] |= (NV50_CB_PFP << 22);
293	}
294}
295
296static void
297emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
298{
299	unsigned inst[2] = { 0, 0 };
300
301	inst[0] |= 0x10000000;
302
303	set_dst(pc, dst, inst);
304
305	if (dst->type != P_RESULT && src->type == P_IMMD) {
306		set_immd(pc, src, inst);
307		/*XXX: 32-bit, but steals part of "half" reg space - need to
308		 *     catch and handle this case if/when we do half-regs
309		 */
310		inst[0] |= 0x00008000;
311	} else
312	if (src->type == P_IMMD || src->type == P_CONST) {
313		set_long(pc, inst);
314		set_cseg(pc, src, inst);
315		inst[0] |= (src->hw << 9);
316		inst[1] |= 0x20000000; /* src0 const? */
317	} else {
318		if (src->type == P_ATTR) {
319			set_long(pc, inst);
320			inst[1] |= 0x00200000;
321		}
322
323		alloc_reg(pc, src);
324		inst[0] |= (src->hw << 9);
325	}
326
327	/* We really should support "half" instructions here at some point,
328	 * but I don't feel confident enough about them yet.
329	 */
330	set_long(pc, inst);
331	if (is_long(inst) && !is_immd(inst)) {
332		inst[1] |= 0x04000000; /* 32-bit */
333		inst[1] |= 0x0003c000; /* "subsubop" 0xf == mov */
334	}
335
336	emit(pc, inst);
337}
338
339static boolean
340check_swap_src_0_1(struct nv50_pc *pc,
341		   struct nv50_reg **s0, struct nv50_reg **s1)
342{
343	struct nv50_reg *src0 = *s0, *src1 = *s1;
344
345	if (src0->type == P_CONST) {
346		if (src1->type != P_CONST) {
347			*s0 = src1;
348			*s1 = src0;
349			return TRUE;
350		}
351	} else
352	if (src1->type == P_ATTR) {
353		if (src0->type != P_ATTR) {
354			*s0 = src1;
355			*s1 = src0;
356			return TRUE;
357		}
358	}
359
360	return FALSE;
361}
362
363static void
364set_src_0(struct nv50_pc *pc, struct nv50_reg *src, unsigned *inst)
365{
366	if (src->type == P_ATTR) {
367		set_long(pc, inst);
368		inst[1] |= 0x00200000;
369	} else
370	if (src->type == P_CONST || src->type == P_IMMD) {
371		struct nv50_reg *temp = temp_temp(pc);
372
373		emit_mov(pc, temp, src);
374		src = temp;
375	}
376
377	alloc_reg(pc, src);
378	inst[0] |= (src->hw << 9);
379}
380
381static void
382set_src_1(struct nv50_pc *pc, struct nv50_reg *src, unsigned *inst)
383{
384	if (src->type == P_ATTR) {
385		struct nv50_reg *temp = temp_temp(pc);
386
387		emit_mov(pc, temp, src);
388		src = temp;
389	} else
390	if (src->type == P_CONST || src->type == P_IMMD) {
391		set_cseg(pc, src, inst);
392		inst[0] |= 0x00800000;
393	}
394
395	alloc_reg(pc, src);
396	inst[0] |= (src->hw << 16);
397}
398
399static void
400set_src_2(struct nv50_pc *pc, struct nv50_reg *src, unsigned *inst)
401{
402	set_long(pc, inst);
403
404	if (src->type == P_ATTR) {
405		struct nv50_reg *temp = temp_temp(pc);
406
407		emit_mov(pc, temp, src);
408		src = temp;
409	} else
410	if (src->type == P_CONST || src->type == P_IMMD) {
411		set_cseg(pc, src, inst);
412		inst[0] |= 0x01000000;
413	}
414
415	alloc_reg(pc, src);
416	inst[1] |= (src->hw << 14);
417}
418
419static void
420emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
421	 struct nv50_reg *src1)
422{
423	unsigned inst[2] = { 0, 0 };
424
425	inst[0] |= 0xc0000000;
426	set_long(pc, inst);
427
428	check_swap_src_0_1(pc, &src0, &src1);
429	set_dst(pc, dst, inst);
430	set_src_0(pc, src0, inst);
431	set_src_1(pc, src1, inst);
432
433	emit(pc, inst);
434}
435
436static void
437emit_add(struct nv50_pc *pc, struct nv50_reg *dst,
438	 struct nv50_reg *src0, struct nv50_reg *src1)
439{
440	unsigned inst[2] = { 0, 0 };
441
442	inst[0] |= 0xb0000000;
443
444	check_swap_src_0_1(pc, &src0, &src1);
445	set_dst(pc, dst, inst);
446	set_src_0(pc, src0, inst);
447	if (is_long(inst))
448		set_src_2(pc, src1, inst);
449	else
450		set_src_1(pc, src1, inst);
451
452	emit(pc, inst);
453}
454
455static void
456emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst,
457	    struct nv50_reg *src0, struct nv50_reg *src1)
458{
459	unsigned inst[2] = { 0, 0 };
460
461	set_long(pc, inst);
462	inst[0] |= 0xb0000000;
463	inst[1] |= (sub << 29);
464
465	check_swap_src_0_1(pc, &src0, &src1);
466	set_dst(pc, dst, inst);
467	set_src_0(pc, src0, inst);
468	set_src_1(pc, src1, inst);
469
470	emit(pc, inst);
471}
472
473static void
474emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
475	 struct nv50_reg *src1)
476{
477	unsigned inst[2] = { 0, 0 };
478
479	inst[0] |= 0xb0000000;
480
481	set_long(pc, inst);
482	if (check_swap_src_0_1(pc, &src0, &src1))
483		inst[1] |= 0x04000000;
484	else
485		inst[1] |= 0x08000000;
486
487	set_dst(pc, dst, inst);
488	set_src_0(pc, src0, inst);
489	set_src_2(pc, src1, inst);
490
491	emit(pc, inst);
492}
493
494static void
495emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
496	 struct nv50_reg *src1, struct nv50_reg *src2)
497{
498	unsigned inst[2] = { 0, 0 };
499
500	inst[0] |= 0xe0000000;
501
502	check_swap_src_0_1(pc, &src0, &src1);
503	set_dst(pc, dst, inst);
504	set_src_0(pc, src0, inst);
505	set_src_1(pc, src1, inst);
506	set_src_2(pc, src2, inst);
507
508	emit(pc, inst);
509}
510
511static void
512emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
513	 struct nv50_reg *src1, struct nv50_reg *src2)
514{
515	unsigned inst[2] = { 0, 0 };
516
517	inst[0] |= 0xe0000000;
518	set_long(pc, inst);
519	inst[1] |= 0x08000000; /* src0 * src1 - src2 */
520
521	check_swap_src_0_1(pc, &src0, &src1);
522	set_dst(pc, dst, inst);
523	set_src_0(pc, src0, inst);
524	set_src_1(pc, src1, inst);
525	set_src_2(pc, src2, inst);
526
527	emit(pc, inst);
528}
529
530static void
531emit_flop(struct nv50_pc *pc, unsigned sub,
532	  struct nv50_reg *dst, struct nv50_reg *src)
533{
534	unsigned inst[2] = { 0, 0 };
535
536	inst[0] |= 0x90000000;
537	if (sub) {
538		set_long(pc, inst);
539		inst[1] |= (sub << 29);
540	}
541
542	set_dst(pc, dst, inst);
543	set_src_0(pc, src, inst);
544
545	emit(pc, inst);
546}
547
548static void
549emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
550{
551	unsigned inst[2] = { 0, 0 };
552
553	inst[0] |= 0xb0000000;
554
555	set_dst(pc, dst, inst);
556	set_src_0(pc, src, inst);
557	set_long(pc, inst);
558	inst[1] |= (6 << 29) | 0x00004000;
559
560	emit(pc, inst);
561}
562
563/*XXX: inaccurate results.. why? */
564#define ALLOW_SET_SWAP 0
565
566static void
567emit_set(struct nv50_pc *pc, unsigned c_op, struct nv50_reg *dst,
568	 struct nv50_reg *src0, struct nv50_reg *src1)
569{
570	unsigned inst[2] = { 0, 0 };
571#if ALLOW_SET_SWAP
572	unsigned inv_cop[8] = { 0, 6, 2, 4, 3, 5, 1, 7 };
573#endif
574	struct nv50_reg *rdst;
575
576#if ALLOW_SET_SWAP
577	assert(c_op <= 7);
578	if (check_swap_src_0_1(pc, &src0, &src1))
579		c_op = inv_cop[c_op];
580#endif
581
582	rdst = dst;
583	if (dst->type != P_TEMP)
584		dst = alloc_temp(pc, NULL);
585
586	/* set.u32 */
587	set_long(pc, inst);
588	inst[0] |= 0xb0000000;
589	inst[1] |= (3 << 29);
590	inst[1] |= (c_op << 14);
591	/*XXX: breaks things, .u32 by default?
592	 *     decuda will disasm as .u16 and use .lo/.hi regs, but this
593	 *     doesn't seem to match what the hw actually does.
594	inst[1] |= 0x04000000; << breaks things.. .u32 by default?
595	 */
596	set_dst(pc, dst, inst);
597	set_src_0(pc, src0, inst);
598	set_src_1(pc, src1, inst);
599	emit(pc, inst);
600
601	/* cvt.f32.u32 */
602	inst[0] = 0xa0000001;
603	inst[1] = 0x64014780;
604	set_dst(pc, rdst, inst);
605	set_src_0(pc, dst, inst);
606	emit(pc, inst);
607
608	if (dst != rdst)
609		free_temp(pc, dst);
610}
611
612static void
613emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
614{
615	unsigned inst[2] = { 0, 0 };
616
617	inst[0] = 0xa0000000; /* cvt */
618	set_long(pc, inst);
619	inst[1] |= (6 << 29); /* cvt */
620	inst[1] |= 0x08000000; /* integer mode */
621	inst[1] |= 0x04000000; /* 32 bit */
622	inst[1] |= ((0x1 << 3)) << 14; /* .rn */
623	inst[1] |= (1 << 14); /* src .f32 */
624	set_dst(pc, dst, inst);
625	set_src_0(pc, src, inst);
626
627	emit(pc, inst);
628}
629
630static void
631emit_pow(struct nv50_pc *pc, struct nv50_reg *dst,
632	 struct nv50_reg *v, struct nv50_reg *e)
633{
634	struct nv50_reg *temp = alloc_temp(pc, NULL);
635
636	emit_flop(pc, 3, temp, v);
637	emit_mul(pc, temp, temp, e);
638	emit_preex2(pc, temp, temp);
639	emit_flop(pc, 6, dst, temp);
640
641	free_temp(pc, temp);
642}
643
644static void
645emit_abs(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
646{
647	unsigned inst[2] = { 0, 0 };
648
649	inst[0] = 0xa0000000; /* cvt */
650	set_long(pc, inst);
651	inst[1] |= (6 << 29); /* cvt */
652	inst[1] |= 0x04000000; /* 32 bit */
653	inst[1] |= (1 << 14); /* src .f32 */
654	inst[1] |= ((1 << 6) << 14); /* .abs */
655	set_dst(pc, dst, inst);
656	set_src_0(pc, src, inst);
657
658	emit(pc, inst);
659}
660
661static void
662emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, struct nv50_reg **src)
663{
664	struct nv50_reg *one = alloc_immd(pc, 1.0);
665	struct nv50_reg *zero = alloc_immd(pc, 0.0);
666	struct nv50_reg *neg128 = alloc_immd(pc, -127.999999);
667	struct nv50_reg *pos128 = alloc_immd(pc,  127.999999);
668	struct nv50_reg *tmp[4];
669
670	emit_mov(pc, dst[0], one);
671	emit_mov(pc, dst[3], one);
672
673	tmp[0] = temp_temp(pc);
674	emit_minmax(pc, 4, dst[1], src[0], zero);
675	set_pred_wr(pc, 1, 0, &pc->p->insns[pc->p->insns_nr - 2]);
676
677	tmp[1] = temp_temp(pc);
678	emit_minmax(pc, 4, tmp[1], src[1], zero);
679
680	tmp[3] = temp_temp(pc);
681	emit_minmax(pc, 4, tmp[3], src[3], neg128);
682	emit_minmax(pc, 5, tmp[3], tmp[3], pos128);
683
684	emit_pow(pc, dst[2], tmp[1], tmp[3]);
685	emit_mov(pc, dst[2], zero);
686	set_pred(pc, 3, 0, &pc->p->insns[pc->p->insns_nr - 2]);
687}
688
689static struct nv50_reg *
690tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
691{
692	switch (dst->DstRegister.File) {
693	case TGSI_FILE_TEMPORARY:
694		return &pc->temp[dst->DstRegister.Index * 4 + c];
695	case TGSI_FILE_OUTPUT:
696		return &pc->result[dst->DstRegister.Index * 4 + c];
697	case TGSI_FILE_NULL:
698		return NULL;
699	default:
700		break;
701	}
702
703	return NULL;
704}
705
706static struct nv50_reg *
707tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src)
708{
709	struct nv50_reg *r = NULL;
710	struct nv50_reg *temp;
711	unsigned c;
712
713	c = tgsi_util_get_full_src_register_extswizzle(src, chan);
714	switch (c) {
715	case TGSI_EXTSWIZZLE_X:
716	case TGSI_EXTSWIZZLE_Y:
717	case TGSI_EXTSWIZZLE_Z:
718	case TGSI_EXTSWIZZLE_W:
719		switch (src->SrcRegister.File) {
720		case TGSI_FILE_INPUT:
721			r = &pc->attr[src->SrcRegister.Index * 4 + c];
722			break;
723		case TGSI_FILE_TEMPORARY:
724			r = &pc->temp[src->SrcRegister.Index * 4 + c];
725			break;
726		case TGSI_FILE_CONSTANT:
727			r = &pc->param[src->SrcRegister.Index * 4 + c];
728			break;
729		case TGSI_FILE_IMMEDIATE:
730			r = &pc->immd[src->SrcRegister.Index * 4 + c];
731			break;
732		default:
733			assert(0);
734			break;
735		}
736		break;
737	case TGSI_EXTSWIZZLE_ZERO:
738		r = alloc_immd(pc, 0.0);
739		break;
740	case TGSI_EXTSWIZZLE_ONE:
741		r = alloc_immd(pc, 1.0);
742		break;
743	default:
744		assert(0);
745		break;
746	}
747
748	switch (tgsi_util_get_full_src_register_sign_mode(src, chan)) {
749	case TGSI_UTIL_SIGN_KEEP:
750		break;
751	case TGSI_UTIL_SIGN_CLEAR:
752		temp = temp_temp(pc);
753		emit_abs(pc, temp, r);
754		r = temp;
755		break;
756	default:
757		assert(0);
758		break;
759	}
760
761	return r;
762}
763
764static boolean
765nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
766{
767	const struct tgsi_full_instruction *inst = &tok->FullInstruction;
768	struct nv50_reg *rdst[4], *dst[4], *src[3][4], *temp;
769	unsigned mask, sat;
770	int i, c;
771
772	NOUVEAU_ERR("insn %p\n", tok);
773
774	mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
775	sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE;
776
777	for (c = 0; c < 4; c++) {
778		if (mask & (1 << c))
779			dst[c] = tgsi_dst(pc, c, &inst->FullDstRegisters[0]);
780		else
781			dst[c] = NULL;
782	}
783
784	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
785		for (c = 0; c < 4; c++)
786			src[i][c] = tgsi_src(pc, c, &inst->FullSrcRegisters[i]);
787	}
788
789	if (sat) {
790		for (c = 0; c < 4; c++) {
791			rdst[c] = dst[c];
792			dst[c] = temp_temp(pc);
793		}
794	}
795
796	switch (inst->Instruction.Opcode) {
797	case TGSI_OPCODE_ABS:
798		for (c = 0; c < 4; c++) {
799			if (!(mask & (1 << c)))
800				continue;
801			emit_abs(pc, dst[c], src[0][c]);
802		}
803		break;
804	case TGSI_OPCODE_ADD:
805		for (c = 0; c < 4; c++) {
806			if (!(mask & (1 << c)))
807				continue;
808			emit_add(pc, dst[c], src[0][c], src[1][c]);
809		}
810		break;
811	case TGSI_OPCODE_COS:
812		for (c = 0; c < 4; c++) {
813			if (!(mask & (1 << c)))
814				continue;
815			emit_flop(pc, 5, dst[c], src[0][c]);
816		}
817		break;
818	case TGSI_OPCODE_DP3:
819		temp = alloc_temp(pc, NULL);
820		emit_mul(pc, temp, src[0][0], src[1][0]);
821		emit_mad(pc, temp, src[0][1], src[1][1], temp);
822		emit_mad(pc, temp, src[0][2], src[1][2], temp);
823		for (c = 0; c < 4; c++) {
824			if (!(mask & (1 << c)))
825				continue;
826			emit_mov(pc, dst[c], temp);
827		}
828		free_temp(pc, temp);
829		break;
830	case TGSI_OPCODE_DP4:
831		temp = alloc_temp(pc, NULL);
832		emit_mul(pc, temp, src[0][0], src[1][0]);
833		emit_mad(pc, temp, src[0][1], src[1][1], temp);
834		emit_mad(pc, temp, src[0][2], src[1][2], temp);
835		emit_mad(pc, temp, src[0][3], src[1][3], temp);
836		for (c = 0; c < 4; c++) {
837			if (!(mask & (1 << c)))
838				continue;
839			emit_mov(pc, dst[c], temp);
840		}
841		free_temp(pc, temp);
842		break;
843	case TGSI_OPCODE_DPH:
844		temp = alloc_temp(pc, NULL);
845		emit_mul(pc, temp, src[0][0], src[1][0]);
846		emit_mad(pc, temp, src[0][1], src[1][1], temp);
847		emit_mad(pc, temp, src[0][2], src[1][2], temp);
848		emit_add(pc, temp, src[1][3], temp);
849		for (c = 0; c < 4; c++) {
850			if (!(mask & (1 << c)))
851				continue;
852			emit_mov(pc, dst[c], temp);
853		}
854		free_temp(pc, temp);
855		break;
856	case TGSI_OPCODE_DST:
857	{
858		struct nv50_reg *one = alloc_immd(pc, 1.0);
859		if (mask & (1 << 0))
860			emit_mov(pc, dst[0], one);
861		if (mask & (1 << 1))
862			emit_mul(pc, dst[1], src[0][1], src[1][1]);
863		if (mask & (1 << 2))
864			emit_mov(pc, dst[2], src[0][2]);
865		if (mask & (1 << 3))
866			emit_mov(pc, dst[3], src[1][3]);
867		FREE(one);
868	}
869		break;
870	case TGSI_OPCODE_EX2:
871		temp = alloc_temp(pc, NULL);
872		for (c = 0; c < 4; c++) {
873			if (!(mask & (1 << c)))
874				continue;
875			emit_preex2(pc, temp, src[0][c]);
876			emit_flop(pc, 6, dst[c], temp);
877		}
878		free_temp(pc, temp);
879		break;
880	case TGSI_OPCODE_FLR:
881		for (c = 0; c < 4; c++) {
882			if (!(mask & (1 << c)))
883				continue;
884			emit_flr(pc, dst[c], src[0][c]);
885		}
886		break;
887	case TGSI_OPCODE_FRC:
888		temp = alloc_temp(pc, NULL);
889		for (c = 0; c < 4; c++) {
890			if (!(mask & (1 << c)))
891				continue;
892			emit_flr(pc, temp, src[0][c]);
893			emit_sub(pc, dst[c], src[0][c], temp);
894		}
895		free_temp(pc, temp);
896		break;
897	case TGSI_OPCODE_LIT:
898		/*XXX: writemask */
899		emit_lit(pc, &dst[0], &src[0][0]);
900		break;
901	case TGSI_OPCODE_LG2:
902		for (c = 0; c < 4; c++) {
903			if (!(mask & (1 << c)))
904				continue;
905			emit_flop(pc, 3, dst[c], src[0][c]);
906		}
907		break;
908	case TGSI_OPCODE_MAD:
909		for (c = 0; c < 4; c++) {
910			if (!(mask & (1 << c)))
911				continue;
912			emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]);
913		}
914		break;
915	case TGSI_OPCODE_MAX:
916		for (c = 0; c < 4; c++) {
917			if (!(mask & (1 << c)))
918				continue;
919			emit_minmax(pc, 4, dst[c], src[0][c], src[1][c]);
920		}
921		break;
922	case TGSI_OPCODE_MIN:
923		for (c = 0; c < 4; c++) {
924			if (!(mask & (1 << c)))
925				continue;
926			emit_minmax(pc, 5, dst[c], src[0][c], src[1][c]);
927		}
928		break;
929	case TGSI_OPCODE_MOV:
930		for (c = 0; c < 4; c++) {
931			if (!(mask & (1 << c)))
932				continue;
933			emit_mov(pc, dst[c], src[0][c]);
934		}
935		break;
936	case TGSI_OPCODE_MUL:
937		for (c = 0; c < 4; c++) {
938			if (!(mask & (1 << c)))
939				continue;
940			emit_mul(pc, dst[c], src[0][c], src[1][c]);
941		}
942		break;
943	case TGSI_OPCODE_POW:
944		temp = alloc_temp(pc, NULL);
945		emit_pow(pc, temp, src[0][0], src[1][0]);
946		for (c = 0; c < 4; c++) {
947			if (!(mask & (1 << c)))
948				continue;
949			emit_mov(pc, dst[c], temp);
950		}
951		free_temp(pc, temp);
952		break;
953	case TGSI_OPCODE_RCP:
954		for (c = 0; c < 4; c++) {
955			if (!(mask & (1 << c)))
956				continue;
957			emit_flop(pc, 0, dst[c], src[0][c]);
958		}
959		break;
960	case TGSI_OPCODE_RSQ:
961		for (c = 0; c < 4; c++) {
962			if (!(mask & (1 << c)))
963				continue;
964			emit_flop(pc, 2, dst[c], src[0][c]);
965		}
966		break;
967	case TGSI_OPCODE_SGE:
968		for (c = 0; c < 4; c++) {
969			if (!(mask & (1 << c)))
970				continue;
971			emit_set(pc, 6, dst[c], src[0][c], src[1][c]);
972		}
973		break;
974	case TGSI_OPCODE_SIN:
975		for (c = 0; c < 4; c++) {
976			if (!(mask & (1 << c)))
977				continue;
978			emit_flop(pc, 4, dst[c], src[0][c]);
979		}
980		break;
981	case TGSI_OPCODE_SLT:
982		for (c = 0; c < 4; c++) {
983			if (!(mask & (1 << c)))
984				continue;
985			emit_set(pc, 1, dst[c], src[0][c], src[1][c]);
986		}
987		break;
988	case TGSI_OPCODE_SUB:
989		for (c = 0; c < 4; c++) {
990			if (!(mask & (1 << c)))
991				continue;
992			emit_sub(pc, dst[c], src[0][c], src[1][c]);
993		}
994		break;
995	case TGSI_OPCODE_XPD:
996		temp = alloc_temp(pc, NULL);
997		if (mask & (1 << 0)) {
998			emit_mul(pc, temp, src[0][2], src[1][1]);
999			emit_msb(pc, dst[0], src[0][1], src[1][2], temp);
1000		}
1001		if (mask & (1 << 1)) {
1002			emit_mul(pc, temp, src[0][0], src[1][2]);
1003			emit_msb(pc, dst[1], src[0][2], src[1][0], temp);
1004		}
1005		if (mask & (1 << 2)) {
1006			emit_mul(pc, temp, src[0][1], src[1][0]);
1007			emit_msb(pc, dst[2], src[0][0], src[1][1], temp);
1008		}
1009		free_temp(pc, temp);
1010		break;
1011	case TGSI_OPCODE_END:
1012		break;
1013	default:
1014		NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode);
1015		return FALSE;
1016	}
1017
1018	if (sat) {
1019		for (c = 0; c < 4; c++) {
1020			unsigned inst[2] = { 0, 0 };
1021
1022			if (!(mask & (1 << c)))
1023				continue;
1024
1025			inst[0] = 0xa0000000; /* cvt */
1026			set_long(pc, inst);
1027			inst[1] |= (6 << 29); /* cvt */
1028			inst[1] |= 0x04000000; /* 32 bit */
1029			inst[1] |= (1 << 14); /* src .f32 */
1030			inst[1] |= ((1 << 5) << 14); /* .sat */
1031			set_dst(pc, rdst[c], inst);
1032			set_src_0(pc, dst[c], inst);
1033			emit(pc, inst);
1034		}
1035	}
1036
1037	kill_temp_temp(pc);
1038	return TRUE;
1039}
1040
1041static boolean
1042nv50_program_tx_prep(struct nv50_pc *pc)
1043{
1044	struct tgsi_parse_context p;
1045	boolean ret = FALSE;
1046	unsigned i, c;
1047
1048	tgsi_parse_init(&p, pc->p->pipe.tokens);
1049	while (!tgsi_parse_end_of_tokens(&p)) {
1050		const union tgsi_full_token *tok = &p.FullToken;
1051
1052		tgsi_parse_token(&p);
1053		switch (tok->Token.Type) {
1054		case TGSI_TOKEN_TYPE_IMMEDIATE:
1055		{
1056			const struct tgsi_full_immediate *imm =
1057				&p.FullToken.FullImmediate;
1058
1059			ctor_immd(pc, imm->u.ImmediateFloat32[0].Float,
1060				      imm->u.ImmediateFloat32[1].Float,
1061				      imm->u.ImmediateFloat32[2].Float,
1062				      imm->u.ImmediateFloat32[3].Float);
1063		}
1064			break;
1065		case TGSI_TOKEN_TYPE_DECLARATION:
1066		{
1067			const struct tgsi_full_declaration *d;
1068			unsigned last;
1069
1070			d = &p.FullToken.FullDeclaration;
1071			last = d->u.DeclarationRange.Last;
1072
1073			switch (d->Declaration.File) {
1074			case TGSI_FILE_TEMPORARY:
1075				if (pc->temp_nr < (last + 1))
1076					pc->temp_nr = last + 1;
1077				break;
1078			case TGSI_FILE_OUTPUT:
1079				if (pc->result_nr < (last + 1))
1080					pc->result_nr = last + 1;
1081				break;
1082			case TGSI_FILE_INPUT:
1083				if (pc->attr_nr < (last + 1))
1084					pc->attr_nr = last + 1;
1085				break;
1086			case TGSI_FILE_CONSTANT:
1087				if (pc->param_nr < (last + 1))
1088					pc->param_nr = last + 1;
1089				break;
1090			default:
1091				NOUVEAU_ERR("bad decl file %d\n",
1092					    d->Declaration.File);
1093				goto out_err;
1094			}
1095		}
1096			break;
1097		case TGSI_TOKEN_TYPE_INSTRUCTION:
1098			break;
1099		default:
1100			break;
1101		}
1102	}
1103
1104	NOUVEAU_ERR("%d temps\n", pc->temp_nr);
1105	if (pc->temp_nr) {
1106		pc->temp = calloc(pc->temp_nr * 4, sizeof(struct nv50_reg));
1107		if (!pc->temp)
1108			goto out_err;
1109
1110		for (i = 0; i < pc->temp_nr; i++) {
1111			for (c = 0; c < 4; c++) {
1112				pc->temp[i*4+c].type = P_TEMP;
1113				pc->temp[i*4+c].hw = -1;
1114				pc->temp[i*4+c].index = i;
1115			}
1116		}
1117	}
1118
1119	NOUVEAU_ERR("%d attrib regs\n", pc->attr_nr);
1120	if (pc->attr_nr) {
1121		struct nv50_reg *iv = NULL, *tmp = NULL;
1122		int aid = 0;
1123
1124		pc->attr = calloc(pc->attr_nr * 4, sizeof(struct nv50_reg));
1125		if (!pc->attr)
1126			goto out_err;
1127
1128		if (pc->p->type == NV50_PROG_FRAGMENT) {
1129			iv = alloc_temp(pc, NULL);
1130			aid++;
1131		}
1132
1133		for (i = 0; i < pc->attr_nr; i++) {
1134			struct nv50_reg *a = &pc->attr[i*4];
1135
1136			for (c = 0; c < 4; c++) {
1137				if (pc->p->type == NV50_PROG_FRAGMENT) {
1138					struct nv50_reg *at =
1139						alloc_temp(pc, NULL);
1140					pc->attr[i*4+c].type = at->type;
1141					pc->attr[i*4+c].hw = at->hw;
1142					pc->attr[i*4+c].index = at->index;
1143				} else {
1144					pc->p->cfg.vp.attr[aid/32] |=
1145						(1 << (aid % 32));
1146					pc->attr[i*4+c].type = P_ATTR;
1147					pc->attr[i*4+c].hw = aid++;
1148					pc->attr[i*4+c].index = i;
1149				}
1150			}
1151
1152			if (pc->p->type != NV50_PROG_FRAGMENT)
1153				continue;
1154
1155			emit_interp(pc, iv, iv, iv, FALSE);
1156			tmp = alloc_temp(pc, NULL);
1157			{
1158				unsigned inst[2] = { 0, 0 };
1159				inst[0]  = 0x90000000;
1160				inst[0] |= (tmp->hw << 2);
1161				emit(pc, inst);
1162			}
1163			emit_interp(pc, &a[0], &a[0], tmp, TRUE);
1164			emit_interp(pc, &a[1], &a[1], tmp, TRUE);
1165			emit_interp(pc, &a[2], &a[2], tmp, TRUE);
1166			emit_interp(pc, &a[3], &a[3], tmp, TRUE);
1167			free_temp(pc, tmp);
1168		}
1169
1170		if (iv)
1171			free_temp(pc, iv);
1172	}
1173
1174	NOUVEAU_ERR("%d result regs\n", pc->result_nr);
1175	if (pc->result_nr) {
1176		int rid = 0;
1177
1178		pc->result = calloc(pc->result_nr * 4, sizeof(struct nv50_reg));
1179		if (!pc->result)
1180			goto out_err;
1181
1182		for (i = 0; i < pc->result_nr; i++) {
1183			for (c = 0; c < 4; c++) {
1184				if (pc->p->type == NV50_PROG_FRAGMENT)
1185					pc->result[i*4+c].type = P_TEMP;
1186				else
1187					pc->result[i*4+c].type = P_RESULT;
1188				pc->result[i*4+c].hw = rid++;
1189				pc->result[i*4+c].index = i;
1190			}
1191		}
1192	}
1193
1194	NOUVEAU_ERR("%d param regs\n", pc->param_nr);
1195	if (pc->param_nr) {
1196		int rid = 0;
1197
1198		pc->param = calloc(pc->param_nr * 4, sizeof(struct nv50_reg));
1199		if (!pc->param)
1200			goto out_err;
1201
1202		for (i = 0; i < pc->param_nr; i++) {
1203			for (c = 0; c < 4; c++) {
1204				pc->param[i*4+c].type = P_CONST;
1205				pc->param[i*4+c].hw = rid++;
1206				pc->param[i*4+c].index = i;
1207			}
1208		}
1209	}
1210
1211	if (pc->immd_nr) {
1212		int rid = 0;
1213
1214		pc->immd = calloc(pc->immd_nr * 4, sizeof(struct nv50_reg));
1215		if (!pc->immd)
1216			goto out_err;
1217
1218		for (i = 0; i < pc->immd_nr; i++) {
1219			for (c = 0; c < 4; c++) {
1220				pc->immd[i*4+c].type = P_IMMD;
1221				pc->immd[i*4+c].hw = rid++;
1222				pc->immd[i*4+c].index = i;
1223			}
1224		}
1225	}
1226
1227	ret = TRUE;
1228out_err:
1229	tgsi_parse_free(&p);
1230	return ret;
1231}
1232
1233static boolean
1234nv50_program_tx(struct nv50_program *p)
1235{
1236	struct tgsi_parse_context parse;
1237	struct nv50_pc *pc;
1238	boolean ret;
1239
1240	pc = CALLOC_STRUCT(nv50_pc);
1241	if (!pc)
1242		return FALSE;
1243	pc->p = p;
1244	pc->p->cfg.high_temp = 4;
1245
1246	ret = nv50_program_tx_prep(pc);
1247	if (ret == FALSE)
1248		goto out_cleanup;
1249
1250	tgsi_parse_init(&parse, pc->p->pipe.tokens);
1251	while (!tgsi_parse_end_of_tokens(&parse)) {
1252		const union tgsi_full_token *tok = &parse.FullToken;
1253
1254		tgsi_parse_token(&parse);
1255
1256		switch (tok->Token.Type) {
1257		case TGSI_TOKEN_TYPE_INSTRUCTION:
1258			ret = nv50_program_tx_insn(pc, tok);
1259			if (ret == FALSE)
1260				goto out_err;
1261			break;
1262		default:
1263			break;
1264		}
1265	}
1266
1267	p->immd_nr = pc->immd_nr * 4;
1268	p->immd = pc->immd_buf;
1269
1270out_err:
1271	tgsi_parse_free(&parse);
1272
1273out_cleanup:
1274	return ret;
1275}
1276
1277static void
1278nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p)
1279{
1280	int i;
1281
1282	if (nv50_program_tx(p) == FALSE)
1283		assert(0);
1284	/* *not* sufficient, it's fine if last inst is long and
1285	 * NOT immd - otherwise it's fucked fucked fucked */
1286	p->insns[p->insns_nr - 1] |= 0x00000001;
1287
1288	if (p->type == NV50_PROG_VERTEX) {
1289	for (i = 0; i < p->insns_nr; i++)
1290		NOUVEAU_ERR("VP0x%08x\n", p->insns[i]);
1291	} else {
1292	for (i = 0; i < p->insns_nr; i++)
1293		NOUVEAU_ERR("FP0x%08x\n", p->insns[i]);
1294	}
1295
1296	p->translated = TRUE;
1297}
1298
1299static void
1300nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)
1301{
1302	int i;
1303
1304	for (i = 0; i < p->immd_nr; i++) {
1305		BEGIN_RING(tesla, 0x0f00, 2);
1306		OUT_RING  ((NV50_CB_PMISC << 16) | (i << 8));
1307		OUT_RING  (fui(p->immd[i]));
1308	}
1309}
1310
1311static void
1312nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
1313{
1314	struct pipe_winsys *ws = nv50->pipe.winsys;
1315	void *map;
1316
1317	if (!p->buffer)
1318		p->buffer = ws->buffer_create(ws, 0x100, 0, p->insns_nr * 4);
1319	map = ws->buffer_map(ws, p->buffer, PIPE_BUFFER_USAGE_CPU_WRITE);
1320	memcpy(map, p->insns, p->insns_nr * 4);
1321	ws->buffer_unmap(ws, p->buffer);
1322}
1323
1324void
1325nv50_vertprog_validate(struct nv50_context *nv50)
1326{
1327	struct nouveau_grobj *tesla = nv50->screen->tesla;
1328	struct nv50_program *p = nv50->vertprog;
1329	struct nouveau_stateobj *so;
1330
1331	if (!p->translated) {
1332		nv50_program_validate(nv50, p);
1333		if (!p->translated)
1334			assert(0);
1335	}
1336
1337	nv50_program_validate_data(nv50, p);
1338	nv50_program_validate_code(nv50, p);
1339
1340	so = so_new(11, 2);
1341	so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2);
1342	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
1343		  NOUVEAU_BO_HIGH, 0, 0);
1344	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
1345		  NOUVEAU_BO_LOW, 0, 0);
1346	so_method(so, tesla, 0x1650, 2);
1347	so_data  (so, p->cfg.vp.attr[0]);
1348	so_data  (so, p->cfg.vp.attr[1]);
1349	so_method(so, tesla, 0x16ac, 2);
1350	so_data  (so, 8);
1351	so_data  (so, p->cfg.high_temp);
1352	so_method(so, tesla, 0x140c, 1);
1353	so_data  (so, 0); /* program start offset */
1354	so_emit(nv50->screen->nvws, so);
1355	so_ref(NULL, &so);
1356}
1357
1358void
1359nv50_fragprog_validate(struct nv50_context *nv50)
1360{
1361	struct nouveau_grobj *tesla = nv50->screen->tesla;
1362	struct nv50_program *p = nv50->fragprog;
1363	struct nouveau_stateobj *so;
1364
1365	if (!p->translated) {
1366		nv50_program_validate(nv50, p);
1367		if (!p->translated)
1368			assert(0);
1369	}
1370
1371	nv50_program_validate_data(nv50, p);
1372	nv50_program_validate_code(nv50, p);
1373
1374	so = so_new(7, 2);
1375	so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2);
1376	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
1377		  NOUVEAU_BO_HIGH, 0, 0);
1378	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
1379		  NOUVEAU_BO_LOW, 0, 0);
1380	so_method(so, tesla, 0x198c, 1);
1381	so_data  (so, p->cfg.high_temp);
1382	so_method(so, tesla, 0x1414, 1);
1383	so_data  (so, 0); /* program start offset */
1384	so_emit(nv50->screen->nvws, so);
1385	so_ref(NULL, &so);
1386}
1387
1388void
1389nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
1390{
1391	struct pipe_winsys *ws = nv50->pipe.winsys;
1392
1393	if (p->insns_nr) {
1394		if (p->insns)
1395			FREE(p->insns);
1396		p->insns_nr = 0;
1397	}
1398
1399	if (p->buffer)
1400		pipe_buffer_reference(ws, &p->buffer, NULL);
1401
1402	p->translated = 0;
1403}
1404
1405