nv50_program.c revision afcaeaa0e4dc3ced40621c76304a2c0c5a3ab403
1#include "pipe/p_context.h"
2#include "pipe/p_defines.h"
3#include "pipe/p_state.h"
4#include "pipe/p_inlines.h"
5
6#include "pipe/p_shader_tokens.h"
7#include "tgsi/util/tgsi_parse.h"
8#include "tgsi/util/tgsi_util.h"
9
10#include "nv50_context.h"
11#include "nv50_state.h"
12
/* Number of hardware temporary slots tracked by the register allocator;
 * this is the size of nv50_pc::r_temp and the bound of the slot searches
 * in alloc_reg()/alloc_temp().
 */
#define NV50_SU_MAX_TEMP 64
14
/* TODO / known issues:
 *
 * ARL - gallium fails on progs/vp/arl.txt
 *
 * MSB - Like MAD, but MUL+SUB
 * 	- Drop it, and instead introduce a way to negate args for ops that
 * 	  support it.
 *
 * Look into inlining IMMD for ops other than MOV (make it general?)
 * 	- Maybe even relax restrictions a bit: can't do P_RESULT + P_IMMD,
 * 	  but can emit to P_TEMP first, then MOV later. NVIDIA does this.
 *
 * What happens if src1 and src2 are both consts?
 *
 * Verify half-insns work where expected - and force disable them where they
 * don't work - MUL has it forcibly disabled at the moment as it fixes POW.
 *
 * Watch out for dst==src vectors: components that are still needed can be
 * overwritten, e.g. SUB R0, R0.yzxw, R0
 */
/* One scalar register (a single component) as seen by the code generator. */
struct nv50_reg {
	/* register file the component belongs to */
	enum {
		P_TEMP,
		P_ATTR,
		P_RESULT,
		P_CONST,
		P_IMMD
	} type;
	/* TGSI register index, or -1 for registers created internally by
	 * alloc_temp()/alloc_immd() */
	int index;

	/* hardware slot / offset; -1 marks a temp with no slot assigned yet */
	int hw;
	/* not referenced by the code in this part of the file */
	int neg;
};
46
47struct nv50_pc {
48	struct nv50_program *p;
49
50	/* hw resources */
51	struct nv50_reg *r_temp[NV50_SU_MAX_TEMP];
52
53	/* tgsi resources */
54	struct nv50_reg *temp;
55	int temp_nr;
56	struct nv50_reg *attr;
57	int attr_nr;
58	struct nv50_reg *result;
59	int result_nr;
60	struct nv50_reg *param;
61	int param_nr;
62	struct nv50_reg *immd;
63	float *immd_buf;
64	int immd_nr;
65
66	struct nv50_reg *temp_temp[8];
67	unsigned temp_temp_nr;
68};
69
70static void
71alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
72{
73	int i;
74
75	if (reg->type != P_TEMP)
76		return;
77
78	if (reg->hw >= 0) {
79		/*XXX: do this here too to catch FP temp-as-attr usage..
80		 *     not clean, but works */
81		if (pc->p->cfg.high_temp < (reg->hw + 1))
82			pc->p->cfg.high_temp = reg->hw + 1;
83		return;
84	}
85
86	for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
87		if (!(pc->r_temp[i])) {
88			pc->r_temp[i] = reg;
89			reg->hw = i;
90			if (pc->p->cfg.high_temp < (i + 1))
91				pc->p->cfg.high_temp = i + 1;
92			return;
93		}
94	}
95
96	assert(0);
97}
98
99static struct nv50_reg *
100alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
101{
102	struct nv50_reg *r;
103	int i;
104
105	if (dst && dst->type == P_TEMP && dst->hw == -1)
106		return dst;
107
108	for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
109		if (!pc->r_temp[i]) {
110			r = CALLOC_STRUCT(nv50_reg);
111			r->type = P_TEMP;
112			r->index = -1;
113			r->hw = i;
114			pc->r_temp[i] = r;
115			return r;
116		}
117	}
118
119	assert(0);
120	return NULL;
121}
122
123static void
124free_temp(struct nv50_pc *pc, struct nv50_reg *r)
125{
126	if (r->index == -1) {
127		FREE(pc->r_temp[r->hw]);
128		pc->r_temp[r->hw] = NULL;
129	}
130}
131
132static struct nv50_reg *
133temp_temp(struct nv50_pc *pc)
134{
135	if (pc->temp_temp_nr >= 8)
136		assert(0);
137
138	pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL);
139	return pc->temp_temp[pc->temp_temp_nr++];
140}
141
142static void
143kill_temp_temp(struct nv50_pc *pc)
144{
145	int i;
146
147	for (i = 0; i < pc->temp_temp_nr; i++)
148		free_temp(pc, pc->temp_temp[i]);
149	pc->temp_temp_nr = 0;
150}
151
152static int
153ctor_immd(struct nv50_pc *pc, float x, float y, float z, float w)
154{
155	pc->immd_buf = realloc(pc->immd_buf, (pc->immd_nr + 1) * 4 *
156					     sizeof(float));
157	pc->immd_buf[(pc->immd_nr * 4) + 0] = x;
158	pc->immd_buf[(pc->immd_nr * 4) + 1] = y;
159	pc->immd_buf[(pc->immd_nr * 4) + 2] = z;
160	pc->immd_buf[(pc->immd_nr * 4) + 3] = w;
161
162	return pc->immd_nr++;
163}
164
165static struct nv50_reg *
166alloc_immd(struct nv50_pc *pc, float f)
167{
168	struct nv50_reg *r = CALLOC_STRUCT(nv50_reg);
169	unsigned hw;
170
171	hw = ctor_immd(pc, f, 0, 0, 0) * 4;
172	r->type = P_IMMD;
173	r->hw = hw;
174	r->index = -1;
175	return r;
176}
177
178static void
179emit(struct nv50_pc *pc, unsigned *inst)
180{
181	struct nv50_program *p = pc->p;
182
183	if (inst[0] & 1) {
184		p->insns_nr += 2;
185		p->insns = realloc(p->insns, sizeof(unsigned) * p->insns_nr);
186		memcpy(p->insns + (p->insns_nr - 2), inst, sizeof(unsigned)*2);
187	} else {
188		p->insns_nr += 1;
189		p->insns = realloc(p->insns, sizeof(unsigned) * p->insns_nr);
190		memcpy(p->insns + (p->insns_nr - 1), inst, sizeof(unsigned));
191	}
192}
193
194static INLINE void set_long(struct nv50_pc *, unsigned *);
195
196static boolean
197is_long(unsigned *inst)
198{
199	if (inst[0] & 1)
200		return TRUE;
201	return FALSE;
202}
203
204static boolean
205is_immd(unsigned *inst)
206{
207	if (is_long(inst) && (inst[1] & 3) == 3)
208		return TRUE;
209	return FALSE;
210}
211
212static INLINE void
213set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx, unsigned *inst)
214{
215	set_long(pc, inst);
216	inst[1] &= ~((0x1f << 7) | (0x3 << 12));
217	inst[1] |= (pred << 7) | (idx << 12);
218}
219
220static INLINE void
221set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx, unsigned *inst)
222{
223	set_long(pc, inst);
224	inst[1] &= ~((0x3 << 4) | (1 << 6));
225	inst[1] |= (idx << 4) | (on << 6);
226}
227
228static INLINE void
229set_long(struct nv50_pc *pc, unsigned *inst)
230{
231	if (is_long(inst))
232		return;
233
234	inst[0] |= 1;
235	set_pred(pc, 0xf, 0, inst);
236	set_pred_wr(pc, 0, 0, inst);
237}
238
239static INLINE void
240set_dst(struct nv50_pc *pc, struct nv50_reg *dst, unsigned *inst)
241{
242	if (dst->type == P_RESULT) {
243		set_long(pc, inst);
244		inst[1] |= 0x00000008;
245	}
246
247	alloc_reg(pc, dst);
248	inst[0] |= (dst->hw << 2);
249}
250
251static INLINE void
252set_immd(struct nv50_pc *pc, struct nv50_reg *imm, unsigned *inst)
253{
254	unsigned val = fui(pc->immd_buf[imm->hw]); /* XXX */
255
256	set_long(pc, inst);
257	/*XXX: can't be predicated - bits overlap.. catch cases where both
258	 *     are required and avoid them. */
259	set_pred(pc, 0, 0, inst);
260	set_pred_wr(pc, 0, 0, inst);
261
262	inst[1] |= 0x00000002 | 0x00000001;
263	inst[0] |= (val & 0x3f) << 16;
264	inst[1] |= (val >> 6) << 2;
265}
266
267static void
268emit_interp(struct nv50_pc *pc, struct nv50_reg *dst,
269	    struct nv50_reg *src, struct nv50_reg *iv, boolean noperspective)
270{
271	unsigned inst[2] = { 0, 0 };
272
273	inst[0] |= 0x80000000;
274	set_dst(pc, dst, inst);
275	alloc_reg(pc, iv);
276	inst[0] |= (iv->hw << 9);
277	alloc_reg(pc, src);
278	inst[0] |= (src->hw << 16);
279	if (noperspective)
280		inst[0] |= (1 << 25);
281
282	emit(pc, inst);
283}
284
285static void
286set_cseg(struct nv50_pc *pc, struct nv50_reg *src, unsigned *inst)
287{
288	set_long(pc, inst);
289	if (src->type == P_IMMD) {
290		inst[1] |= (NV50_CB_PMISC << 22);
291	} else {
292		if (pc->p->type == NV50_PROG_VERTEX)
293			inst[1] |= (NV50_CB_PVP << 22);
294		else
295			inst[1] |= (NV50_CB_PFP << 22);
296	}
297}
298
299static void
300emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
301{
302	unsigned inst[2] = { 0, 0 };
303
304	inst[0] |= 0x10000000;
305
306	set_dst(pc, dst, inst);
307
308	if (dst->type != P_RESULT && src->type == P_IMMD) {
309		set_immd(pc, src, inst);
310		/*XXX: 32-bit, but steals part of "half" reg space - need to
311		 *     catch and handle this case if/when we do half-regs
312		 */
313		inst[0] |= 0x00008000;
314	} else
315	if (src->type == P_IMMD || src->type == P_CONST) {
316		set_long(pc, inst);
317		set_cseg(pc, src, inst);
318		inst[0] |= (src->hw << 9);
319		inst[1] |= 0x20000000; /* src0 const? */
320	} else {
321		if (src->type == P_ATTR) {
322			set_long(pc, inst);
323			inst[1] |= 0x00200000;
324		}
325
326		alloc_reg(pc, src);
327		inst[0] |= (src->hw << 9);
328	}
329
330	/* We really should support "half" instructions here at some point,
331	 * but I don't feel confident enough about them yet.
332	 */
333	set_long(pc, inst);
334	if (is_long(inst) && !is_immd(inst)) {
335		inst[1] |= 0x04000000; /* 32-bit */
336		inst[1] |= 0x0003c000; /* "subsubop" 0xf == mov */
337	}
338
339	emit(pc, inst);
340}
341
342static boolean
343check_swap_src_0_1(struct nv50_pc *pc,
344		   struct nv50_reg **s0, struct nv50_reg **s1)
345{
346	struct nv50_reg *src0 = *s0, *src1 = *s1;
347
348	if (src0->type == P_CONST) {
349		if (src1->type != P_CONST) {
350			*s0 = src1;
351			*s1 = src0;
352			return TRUE;
353		}
354	} else
355	if (src1->type == P_ATTR) {
356		if (src0->type != P_ATTR) {
357			*s0 = src1;
358			*s1 = src0;
359			return TRUE;
360		}
361	}
362
363	return FALSE;
364}
365
366static void
367set_src_0(struct nv50_pc *pc, struct nv50_reg *src, unsigned *inst)
368{
369	if (src->type == P_ATTR) {
370		set_long(pc, inst);
371		inst[1] |= 0x00200000;
372	} else
373	if (src->type == P_CONST || src->type == P_IMMD) {
374		struct nv50_reg *temp = temp_temp(pc);
375
376		emit_mov(pc, temp, src);
377		src = temp;
378	}
379
380	alloc_reg(pc, src);
381	inst[0] |= (src->hw << 9);
382}
383
384static void
385set_src_1(struct nv50_pc *pc, struct nv50_reg *src, unsigned *inst)
386{
387	if (src->type == P_ATTR) {
388		struct nv50_reg *temp = temp_temp(pc);
389
390		emit_mov(pc, temp, src);
391		src = temp;
392	} else
393	if (src->type == P_CONST || src->type == P_IMMD) {
394		set_cseg(pc, src, inst);
395		inst[0] |= 0x00800000;
396	}
397
398	alloc_reg(pc, src);
399	inst[0] |= (src->hw << 16);
400}
401
402static void
403set_src_2(struct nv50_pc *pc, struct nv50_reg *src, unsigned *inst)
404{
405	set_long(pc, inst);
406
407	if (src->type == P_ATTR) {
408		struct nv50_reg *temp = temp_temp(pc);
409
410		emit_mov(pc, temp, src);
411		src = temp;
412	} else
413	if (src->type == P_CONST || src->type == P_IMMD) {
414		set_cseg(pc, src, inst);
415		inst[0] |= 0x01000000;
416	}
417
418	alloc_reg(pc, src);
419	inst[1] |= (src->hw << 14);
420}
421
/* Emit dst = src0 * src1 (opcode bits 0xc0000000).
 * The long encoding is always used for this instruction.
 */
static void
emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
	 struct nv50_reg *src1)
{
	unsigned inst[2] = { 0xc0000000, 0 };

	set_long(pc, inst);

	/* operand order does not matter, so the swap result is ignored */
	check_swap_src_0_1(pc, &src0, &src1);

	set_dst(pc, dst, inst);
	set_src_0(pc, src0, inst);
	set_src_1(pc, src1, inst);

	emit(pc, inst);
}
438
/* Emit dst = src0 + src1 (opcode bits 0xb0000000).
 * If encoding dst/src0 already forced the long form, the second operand
 * goes into the src2 field, otherwise into the src1 field.
 */
static void
emit_add(struct nv50_pc *pc, struct nv50_reg *dst,
	 struct nv50_reg *src0, struct nv50_reg *src1)
{
	unsigned inst[2] = { 0xb0000000, 0 };

	check_swap_src_0_1(pc, &src0, &src1);

	set_dst(pc, dst, inst);
	set_src_0(pc, src0, inst);

	if (!is_long(inst))
		set_src_1(pc, src1, inst);
	else
		set_src_2(pc, src1, inst);

	emit(pc, inst);
}
457
/* Emit a long-form min/max style instruction (opcode bits 0xb0000000)
 * with the sub-operation `sub` placed in bits 29+ of the second word.
 * Callers in this file use sub 4 for MAX and sub 5 for MIN.
 */
static void
emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst,
	    struct nv50_reg *src0, struct nv50_reg *src1)
{
	unsigned inst[2] = { 0xb0000000, 0 };

	set_long(pc, inst);
	inst[1] |= (sub << 29);

	check_swap_src_0_1(pc, &src0, &src1);

	set_dst(pc, dst, inst);
	set_src_0(pc, src0, inst);
	set_src_1(pc, src1, inst);

	emit(pc, inst);
}
475
476static void
477emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
478	 struct nv50_reg *src1)
479{
480	unsigned inst[2] = { 0, 0 };
481
482	inst[0] |= 0xb0000000;
483
484	set_long(pc, inst);
485	if (check_swap_src_0_1(pc, &src0, &src1))
486		inst[1] |= 0x04000000;
487	else
488		inst[1] |= 0x08000000;
489
490	set_dst(pc, dst, inst);
491	set_src_0(pc, src0, inst);
492	set_src_2(pc, src1, inst);
493
494	emit(pc, inst);
495}
496
/* Emit dst = src0 * src1 + src2 (opcode bits 0xe0000000). */
static void
emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
	 struct nv50_reg *src1, struct nv50_reg *src2)
{
	unsigned inst[2] = { 0xe0000000, 0 };

	/* only the two multiplicands may trade places */
	check_swap_src_0_1(pc, &src0, &src1);

	set_dst(pc, dst, inst);
	set_src_0(pc, src0, inst);
	set_src_1(pc, src1, inst);
	set_src_2(pc, src2, inst);

	emit(pc, inst);
}
513
/* Emit dst = src0 * src1 - src2: the MAD opcode in long form with bit
 * 0x08000000 of the second word set.
 */
static void
emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
	 struct nv50_reg *src1, struct nv50_reg *src2)
{
	unsigned inst[2] = { 0xe0000000, 0 };

	set_long(pc, inst);
	inst[1] |= 0x08000000; /* src0 * src1 - src2 */

	check_swap_src_0_1(pc, &src0, &src1);

	set_dst(pc, dst, inst);
	set_src_0(pc, src0, inst);
	set_src_1(pc, src1, inst);
	set_src_2(pc, src2, inst);

	emit(pc, inst);
}
532
/* Emit a single-operand function instruction (opcode bits 0x90000000).
 * A non-zero `sub` selects the function via bits 29+ of the second word
 * and forces the long form.  Callers in this file use 0 for RCP, 2 for
 * RSQ, 3 for LG2, 4 for SIN, 5 for COS and 6 for EX2.
 */
static void
emit_flop(struct nv50_pc *pc, unsigned sub,
	  struct nv50_reg *dst, struct nv50_reg *src)
{
	unsigned inst[2] = { 0x90000000, 0 };

	if (sub != 0) {
		set_long(pc, inst);
		inst[1] |= (sub << 29);
	}

	set_dst(pc, dst, inst);
	set_src_0(pc, src, inst);

	emit(pc, inst);
}
550
/* Emit the pre-EX2 instruction; its result is what callers in this file
 * feed to emit_flop(pc, 6, ...) to compute EX2.
 */
static void
emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	unsigned inst[2] = { 0, 0 };

	inst[0] |= 0xb0000000;

	set_dst(pc, dst, inst);
	set_src_0(pc, src, inst);
	set_long(pc, inst);
	/* unsigned constant: 6 << 29 does not fit in a signed int */
	inst[1] |= (6u << 29) | 0x00004000;

	emit(pc, inst);
}
565
566/*XXX: inaccurate results.. why? */
567#define ALLOW_SET_SWAP 0
568
569static void
570emit_set(struct nv50_pc *pc, unsigned c_op, struct nv50_reg *dst,
571	 struct nv50_reg *src0, struct nv50_reg *src1)
572{
573	unsigned inst[2] = { 0, 0 };
574#if ALLOW_SET_SWAP
575	unsigned inv_cop[8] = { 0, 6, 2, 4, 3, 5, 1, 7 };
576#endif
577	struct nv50_reg *rdst;
578
579#if ALLOW_SET_SWAP
580	assert(c_op <= 7);
581	if (check_swap_src_0_1(pc, &src0, &src1))
582		c_op = inv_cop[c_op];
583#endif
584
585	rdst = dst;
586	if (dst->type != P_TEMP)
587		dst = alloc_temp(pc, NULL);
588
589	/* set.u32 */
590	set_long(pc, inst);
591	inst[0] |= 0xb0000000;
592	inst[1] |= (3 << 29);
593	inst[1] |= (c_op << 14);
594	/*XXX: breaks things, .u32 by default?
595	 *     decuda will disasm as .u16 and use .lo/.hi regs, but this
596	 *     doesn't seem to match what the hw actually does.
597	inst[1] |= 0x04000000; << breaks things.. .u32 by default?
598	 */
599	set_dst(pc, dst, inst);
600	set_src_0(pc, src0, inst);
601	set_src_1(pc, src1, inst);
602	emit(pc, inst);
603
604	/* cvt.f32.u32 */
605	inst[0] = 0xa0000001;
606	inst[1] = 0x64014780;
607	set_dst(pc, rdst, inst);
608	set_src_0(pc, dst, inst);
609	emit(pc, inst);
610
611	if (dst != rdst)
612		free_temp(pc, dst);
613}
614
/* Emit dst = floor(src), implemented with the cvt instruction in
 * integer mode (used for TGSI FLR and as the first half of FRC).
 */
static void
emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	unsigned inst[2] = { 0, 0 };

	inst[0] = 0xa0000000; /* cvt */
	set_long(pc, inst);
	/* unsigned constant: 6 << 29 does not fit in a signed int */
	inst[1] |= (6u << 29); /* cvt */
	inst[1] |= 0x08000000; /* integer mode */
	inst[1] |= 0x04000000; /* 32 bit */
	inst[1] |= ((0x1 << 3)) << 14; /* .rn */
	inst[1] |= (1 << 14); /* src .f32 */
	set_dst(pc, dst, inst);
	set_src_0(pc, src, inst);

	emit(pc, inst);
}
632
/* Emit dst = pow(v, e), computed as ex2(e * lg2(v)) through a private
 * temp that is released before returning.
 */
static void
emit_pow(struct nv50_pc *pc, struct nv50_reg *dst,
	 struct nv50_reg *v, struct nv50_reg *e)
{
	struct nv50_reg *acc = alloc_temp(pc, NULL);

	emit_flop(pc, 3, acc, v);	/* acc = lg2(v) */
	emit_mul(pc, acc, acc, e);	/* acc *= e */
	emit_preex2(pc, acc, acc);
	emit_flop(pc, 6, dst, acc);	/* dst = ex2(acc) */

	free_temp(pc, acc);
}
646
/* Emit dst = |src|, implemented with the cvt instruction and its .abs
 * modifier.
 */
static void
emit_abs(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	unsigned inst[2] = { 0, 0 };

	inst[0] = 0xa0000000; /* cvt */
	set_long(pc, inst);
	/* unsigned constant: 6 << 29 does not fit in a signed int */
	inst[1] |= (6u << 29); /* cvt */
	inst[1] |= 0x04000000; /* 32 bit */
	inst[1] |= (1 << 14); /* src .f32 */
	inst[1] |= ((1 << 6) << 14); /* .abs */
	set_dst(pc, dst, inst);
	set_src_0(pc, src, inst);

	emit(pc, inst);
}
663
664static void
665emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, struct nv50_reg **src)
666{
667	struct nv50_reg *one = alloc_immd(pc, 1.0);
668	struct nv50_reg *zero = alloc_immd(pc, 0.0);
669	struct nv50_reg *neg128 = alloc_immd(pc, -127.999999);
670	struct nv50_reg *pos128 = alloc_immd(pc,  127.999999);
671	struct nv50_reg *tmp[4];
672
673	emit_mov(pc, dst[0], one);
674	emit_mov(pc, dst[3], one);
675
676	tmp[0] = temp_temp(pc);
677	emit_minmax(pc, 4, dst[1], src[0], zero);
678	set_pred_wr(pc, 1, 0, &pc->p->insns[pc->p->insns_nr - 2]);
679
680	tmp[1] = temp_temp(pc);
681	emit_minmax(pc, 4, tmp[1], src[1], zero);
682
683	tmp[3] = temp_temp(pc);
684	emit_minmax(pc, 4, tmp[3], src[3], neg128);
685	emit_minmax(pc, 5, tmp[3], tmp[3], pos128);
686
687	emit_pow(pc, dst[2], tmp[1], tmp[3]);
688	emit_mov(pc, dst[2], zero);
689	set_pred(pc, 3, 0, &pc->p->insns[pc->p->insns_nr - 2]);
690}
691
692static struct nv50_reg *
693tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
694{
695	switch (dst->DstRegister.File) {
696	case TGSI_FILE_TEMPORARY:
697		return &pc->temp[dst->DstRegister.Index * 4 + c];
698	case TGSI_FILE_OUTPUT:
699		return &pc->result[dst->DstRegister.Index * 4 + c];
700	case TGSI_FILE_NULL:
701		return NULL;
702	default:
703		break;
704	}
705
706	return NULL;
707}
708
709static struct nv50_reg *
710tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src)
711{
712	struct nv50_reg *r = NULL;
713	struct nv50_reg *temp;
714	unsigned c;
715
716	c = tgsi_util_get_full_src_register_extswizzle(src, chan);
717	switch (c) {
718	case TGSI_EXTSWIZZLE_X:
719	case TGSI_EXTSWIZZLE_Y:
720	case TGSI_EXTSWIZZLE_Z:
721	case TGSI_EXTSWIZZLE_W:
722		switch (src->SrcRegister.File) {
723		case TGSI_FILE_INPUT:
724			r = &pc->attr[src->SrcRegister.Index * 4 + c];
725			break;
726		case TGSI_FILE_TEMPORARY:
727			r = &pc->temp[src->SrcRegister.Index * 4 + c];
728			break;
729		case TGSI_FILE_CONSTANT:
730			r = &pc->param[src->SrcRegister.Index * 4 + c];
731			break;
732		case TGSI_FILE_IMMEDIATE:
733			r = &pc->immd[src->SrcRegister.Index * 4 + c];
734			break;
735		default:
736			assert(0);
737			break;
738		}
739		break;
740	case TGSI_EXTSWIZZLE_ZERO:
741		r = alloc_immd(pc, 0.0);
742		break;
743	case TGSI_EXTSWIZZLE_ONE:
744		r = alloc_immd(pc, 1.0);
745		break;
746	default:
747		assert(0);
748		break;
749	}
750
751	switch (tgsi_util_get_full_src_register_sign_mode(src, chan)) {
752	case TGSI_UTIL_SIGN_KEEP:
753		break;
754	case TGSI_UTIL_SIGN_CLEAR:
755		temp = temp_temp(pc);
756		emit_abs(pc, temp, r);
757		r = temp;
758		break;
759	default:
760		assert(0);
761		break;
762	}
763
764	return r;
765}
766
767static boolean
768nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
769{
770	const struct tgsi_full_instruction *inst = &tok->FullInstruction;
771	struct nv50_reg *rdst[4], *dst[4], *src[3][4], *temp;
772	unsigned mask, sat;
773	int i, c;
774
775	NOUVEAU_ERR("insn %p\n", tok);
776
777	mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
778	sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE;
779
780	for (c = 0; c < 4; c++) {
781		if (mask & (1 << c))
782			dst[c] = tgsi_dst(pc, c, &inst->FullDstRegisters[0]);
783		else
784			dst[c] = NULL;
785	}
786
787	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
788		for (c = 0; c < 4; c++)
789			src[i][c] = tgsi_src(pc, c, &inst->FullSrcRegisters[i]);
790	}
791
792	if (sat) {
793		for (c = 0; c < 4; c++) {
794			rdst[c] = dst[c];
795			dst[c] = temp_temp(pc);
796		}
797	}
798
799	switch (inst->Instruction.Opcode) {
800	case TGSI_OPCODE_ABS:
801		for (c = 0; c < 4; c++) {
802			if (!(mask & (1 << c)))
803				continue;
804			emit_abs(pc, dst[c], src[0][c]);
805		}
806		break;
807	case TGSI_OPCODE_ADD:
808		for (c = 0; c < 4; c++) {
809			if (!(mask & (1 << c)))
810				continue;
811			emit_add(pc, dst[c], src[0][c], src[1][c]);
812		}
813		break;
814	case TGSI_OPCODE_COS:
815		for (c = 0; c < 4; c++) {
816			if (!(mask & (1 << c)))
817				continue;
818			emit_flop(pc, 5, dst[c], src[0][c]);
819		}
820		break;
821	case TGSI_OPCODE_DP3:
822		temp = alloc_temp(pc, NULL);
823		emit_mul(pc, temp, src[0][0], src[1][0]);
824		emit_mad(pc, temp, src[0][1], src[1][1], temp);
825		emit_mad(pc, temp, src[0][2], src[1][2], temp);
826		for (c = 0; c < 4; c++) {
827			if (!(mask & (1 << c)))
828				continue;
829			emit_mov(pc, dst[c], temp);
830		}
831		free_temp(pc, temp);
832		break;
833	case TGSI_OPCODE_DP4:
834		temp = alloc_temp(pc, NULL);
835		emit_mul(pc, temp, src[0][0], src[1][0]);
836		emit_mad(pc, temp, src[0][1], src[1][1], temp);
837		emit_mad(pc, temp, src[0][2], src[1][2], temp);
838		emit_mad(pc, temp, src[0][3], src[1][3], temp);
839		for (c = 0; c < 4; c++) {
840			if (!(mask & (1 << c)))
841				continue;
842			emit_mov(pc, dst[c], temp);
843		}
844		free_temp(pc, temp);
845		break;
846	case TGSI_OPCODE_DPH:
847		temp = alloc_temp(pc, NULL);
848		emit_mul(pc, temp, src[0][0], src[1][0]);
849		emit_mad(pc, temp, src[0][1], src[1][1], temp);
850		emit_mad(pc, temp, src[0][2], src[1][2], temp);
851		emit_add(pc, temp, src[1][3], temp);
852		for (c = 0; c < 4; c++) {
853			if (!(mask & (1 << c)))
854				continue;
855			emit_mov(pc, dst[c], temp);
856		}
857		free_temp(pc, temp);
858		break;
859	case TGSI_OPCODE_DST:
860	{
861		struct nv50_reg *one = alloc_immd(pc, 1.0);
862		if (mask & (1 << 0))
863			emit_mov(pc, dst[0], one);
864		if (mask & (1 << 1))
865			emit_mul(pc, dst[1], src[0][1], src[1][1]);
866		if (mask & (1 << 2))
867			emit_mov(pc, dst[2], src[0][2]);
868		if (mask & (1 << 3))
869			emit_mov(pc, dst[3], src[1][3]);
870		FREE(one);
871	}
872		break;
873	case TGSI_OPCODE_EX2:
874		temp = alloc_temp(pc, NULL);
875		for (c = 0; c < 4; c++) {
876			if (!(mask & (1 << c)))
877				continue;
878			emit_preex2(pc, temp, src[0][c]);
879			emit_flop(pc, 6, dst[c], temp);
880		}
881		free_temp(pc, temp);
882		break;
883	case TGSI_OPCODE_FLR:
884		for (c = 0; c < 4; c++) {
885			if (!(mask & (1 << c)))
886				continue;
887			emit_flr(pc, dst[c], src[0][c]);
888		}
889		break;
890	case TGSI_OPCODE_FRC:
891		temp = alloc_temp(pc, NULL);
892		for (c = 0; c < 4; c++) {
893			if (!(mask & (1 << c)))
894				continue;
895			emit_flr(pc, temp, src[0][c]);
896			emit_sub(pc, dst[c], src[0][c], temp);
897		}
898		free_temp(pc, temp);
899		break;
900	case TGSI_OPCODE_LIT:
901		/*XXX: writemask */
902		emit_lit(pc, &dst[0], &src[0][0]);
903		break;
904	case TGSI_OPCODE_LG2:
905		for (c = 0; c < 4; c++) {
906			if (!(mask & (1 << c)))
907				continue;
908			emit_flop(pc, 3, dst[c], src[0][c]);
909		}
910		break;
911	case TGSI_OPCODE_MAD:
912		for (c = 0; c < 4; c++) {
913			if (!(mask & (1 << c)))
914				continue;
915			emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]);
916		}
917		break;
918	case TGSI_OPCODE_MAX:
919		for (c = 0; c < 4; c++) {
920			if (!(mask & (1 << c)))
921				continue;
922			emit_minmax(pc, 4, dst[c], src[0][c], src[1][c]);
923		}
924		break;
925	case TGSI_OPCODE_MIN:
926		for (c = 0; c < 4; c++) {
927			if (!(mask & (1 << c)))
928				continue;
929			emit_minmax(pc, 5, dst[c], src[0][c], src[1][c]);
930		}
931		break;
932	case TGSI_OPCODE_MOV:
933		for (c = 0; c < 4; c++) {
934			if (!(mask & (1 << c)))
935				continue;
936			emit_mov(pc, dst[c], src[0][c]);
937		}
938		break;
939	case TGSI_OPCODE_MUL:
940		for (c = 0; c < 4; c++) {
941			if (!(mask & (1 << c)))
942				continue;
943			emit_mul(pc, dst[c], src[0][c], src[1][c]);
944		}
945		break;
946	case TGSI_OPCODE_POW:
947		temp = alloc_temp(pc, NULL);
948		emit_pow(pc, temp, src[0][0], src[1][0]);
949		for (c = 0; c < 4; c++) {
950			if (!(mask & (1 << c)))
951				continue;
952			emit_mov(pc, dst[c], temp);
953		}
954		free_temp(pc, temp);
955		break;
956	case TGSI_OPCODE_RCP:
957		for (c = 0; c < 4; c++) {
958			if (!(mask & (1 << c)))
959				continue;
960			emit_flop(pc, 0, dst[c], src[0][c]);
961		}
962		break;
963	case TGSI_OPCODE_RSQ:
964		for (c = 0; c < 4; c++) {
965			if (!(mask & (1 << c)))
966				continue;
967			emit_flop(pc, 2, dst[c], src[0][c]);
968		}
969		break;
970	case TGSI_OPCODE_SGE:
971		for (c = 0; c < 4; c++) {
972			if (!(mask & (1 << c)))
973				continue;
974			emit_set(pc, 6, dst[c], src[0][c], src[1][c]);
975		}
976		break;
977	case TGSI_OPCODE_SIN:
978		for (c = 0; c < 4; c++) {
979			if (!(mask & (1 << c)))
980				continue;
981			emit_flop(pc, 4, dst[c], src[0][c]);
982		}
983		break;
984	case TGSI_OPCODE_SLT:
985		for (c = 0; c < 4; c++) {
986			if (!(mask & (1 << c)))
987				continue;
988			emit_set(pc, 1, dst[c], src[0][c], src[1][c]);
989		}
990		break;
991	case TGSI_OPCODE_SUB:
992		for (c = 0; c < 4; c++) {
993			if (!(mask & (1 << c)))
994				continue;
995			emit_sub(pc, dst[c], src[0][c], src[1][c]);
996		}
997		break;
998	case TGSI_OPCODE_XPD:
999		temp = alloc_temp(pc, NULL);
1000		if (mask & (1 << 0)) {
1001			emit_mul(pc, temp, src[0][2], src[1][1]);
1002			emit_msb(pc, dst[0], src[0][1], src[1][2], temp);
1003		}
1004		if (mask & (1 << 1)) {
1005			emit_mul(pc, temp, src[0][0], src[1][2]);
1006			emit_msb(pc, dst[1], src[0][2], src[1][0], temp);
1007		}
1008		if (mask & (1 << 2)) {
1009			emit_mul(pc, temp, src[0][1], src[1][0]);
1010			emit_msb(pc, dst[2], src[0][0], src[1][1], temp);
1011		}
1012		free_temp(pc, temp);
1013		break;
1014	case TGSI_OPCODE_END:
1015		break;
1016	default:
1017		NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode);
1018		return FALSE;
1019	}
1020
1021	if (sat) {
1022		for (c = 0; c < 4; c++) {
1023			unsigned inst[2] = { 0, 0 };
1024
1025			if (!(mask & (1 << c)))
1026				continue;
1027
1028			inst[0] = 0xa0000000; /* cvt */
1029			set_long(pc, inst);
1030			inst[1] |= (6 << 29); /* cvt */
1031			inst[1] |= 0x04000000; /* 32 bit */
1032			inst[1] |= (1 << 14); /* src .f32 */
1033			inst[1] |= ((1 << 5) << 14); /* .sat */
1034			set_dst(pc, rdst[c], inst);
1035			set_src_0(pc, dst[c], inst);
1036			emit(pc, inst);
1037		}
1038	}
1039
1040	kill_temp_temp(pc);
1041	return TRUE;
1042}
1043
1044static boolean
1045nv50_program_tx_prep(struct nv50_pc *pc)
1046{
1047	struct tgsi_parse_context p;
1048	boolean ret = FALSE;
1049	unsigned i, c;
1050
1051	tgsi_parse_init(&p, pc->p->pipe.tokens);
1052	while (!tgsi_parse_end_of_tokens(&p)) {
1053		const union tgsi_full_token *tok = &p.FullToken;
1054
1055		tgsi_parse_token(&p);
1056		switch (tok->Token.Type) {
1057		case TGSI_TOKEN_TYPE_IMMEDIATE:
1058		{
1059			const struct tgsi_full_immediate *imm =
1060				&p.FullToken.FullImmediate;
1061
1062			ctor_immd(pc, imm->u.ImmediateFloat32[0].Float,
1063				      imm->u.ImmediateFloat32[1].Float,
1064				      imm->u.ImmediateFloat32[2].Float,
1065				      imm->u.ImmediateFloat32[3].Float);
1066		}
1067			break;
1068		case TGSI_TOKEN_TYPE_DECLARATION:
1069		{
1070			const struct tgsi_full_declaration *d;
1071			unsigned last;
1072
1073			d = &p.FullToken.FullDeclaration;
1074			last = d->u.DeclarationRange.Last;
1075
1076			switch (d->Declaration.File) {
1077			case TGSI_FILE_TEMPORARY:
1078				if (pc->temp_nr < (last + 1))
1079					pc->temp_nr = last + 1;
1080				break;
1081			case TGSI_FILE_OUTPUT:
1082				if (pc->result_nr < (last + 1))
1083					pc->result_nr = last + 1;
1084				break;
1085			case TGSI_FILE_INPUT:
1086				if (pc->attr_nr < (last + 1))
1087					pc->attr_nr = last + 1;
1088				break;
1089			case TGSI_FILE_CONSTANT:
1090				if (pc->param_nr < (last + 1))
1091					pc->param_nr = last + 1;
1092				break;
1093			default:
1094				NOUVEAU_ERR("bad decl file %d\n",
1095					    d->Declaration.File);
1096				goto out_err;
1097			}
1098		}
1099			break;
1100		case TGSI_TOKEN_TYPE_INSTRUCTION:
1101			break;
1102		default:
1103			break;
1104		}
1105	}
1106
1107	NOUVEAU_ERR("%d temps\n", pc->temp_nr);
1108	if (pc->temp_nr) {
1109		pc->temp = calloc(pc->temp_nr * 4, sizeof(struct nv50_reg));
1110		if (!pc->temp)
1111			goto out_err;
1112
1113		for (i = 0; i < pc->temp_nr; i++) {
1114			for (c = 0; c < 4; c++) {
1115				pc->temp[i*4+c].type = P_TEMP;
1116				pc->temp[i*4+c].hw = -1;
1117				pc->temp[i*4+c].index = i;
1118			}
1119		}
1120	}
1121
1122	NOUVEAU_ERR("%d attrib regs\n", pc->attr_nr);
1123	if (pc->attr_nr) {
1124		struct nv50_reg *iv = NULL, *tmp = NULL;
1125		int aid = 0;
1126
1127		pc->attr = calloc(pc->attr_nr * 4, sizeof(struct nv50_reg));
1128		if (!pc->attr)
1129			goto out_err;
1130
1131		if (pc->p->type == NV50_PROG_FRAGMENT) {
1132			iv = alloc_temp(pc, NULL);
1133			aid++;
1134		}
1135
1136		for (i = 0; i < pc->attr_nr; i++) {
1137			struct nv50_reg *a = &pc->attr[i*4];
1138
1139			for (c = 0; c < 4; c++) {
1140				if (pc->p->type == NV50_PROG_FRAGMENT) {
1141					struct nv50_reg *at =
1142						alloc_temp(pc, NULL);
1143					pc->attr[i*4+c].type = at->type;
1144					pc->attr[i*4+c].hw = at->hw;
1145					pc->attr[i*4+c].index = at->index;
1146				} else {
1147					pc->p->cfg.vp.attr[aid/32] |=
1148						(1 << (aid % 32));
1149					pc->attr[i*4+c].type = P_ATTR;
1150					pc->attr[i*4+c].hw = aid++;
1151					pc->attr[i*4+c].index = i;
1152				}
1153			}
1154
1155			if (pc->p->type != NV50_PROG_FRAGMENT)
1156				continue;
1157
1158			emit_interp(pc, iv, iv, iv, FALSE);
1159			tmp = alloc_temp(pc, NULL);
1160			{
1161				unsigned inst[2] = { 0, 0 };
1162				inst[0]  = 0x90000000;
1163				inst[0] |= (tmp->hw << 2);
1164				emit(pc, inst);
1165			}
1166			emit_interp(pc, &a[0], &a[0], tmp, TRUE);
1167			emit_interp(pc, &a[1], &a[1], tmp, TRUE);
1168			emit_interp(pc, &a[2], &a[2], tmp, TRUE);
1169			emit_interp(pc, &a[3], &a[3], tmp, TRUE);
1170			free_temp(pc, tmp);
1171		}
1172
1173		if (iv)
1174			free_temp(pc, iv);
1175	}
1176
1177	NOUVEAU_ERR("%d result regs\n", pc->result_nr);
1178	if (pc->result_nr) {
1179		int rid = 0;
1180
1181		pc->result = calloc(pc->result_nr * 4, sizeof(struct nv50_reg));
1182		if (!pc->result)
1183			goto out_err;
1184
1185		for (i = 0; i < pc->result_nr; i++) {
1186			for (c = 0; c < 4; c++) {
1187				if (pc->p->type == NV50_PROG_FRAGMENT)
1188					pc->result[i*4+c].type = P_TEMP;
1189				else
1190					pc->result[i*4+c].type = P_RESULT;
1191				pc->result[i*4+c].hw = rid++;
1192				pc->result[i*4+c].index = i;
1193			}
1194		}
1195	}
1196
1197	NOUVEAU_ERR("%d param regs\n", pc->param_nr);
1198	if (pc->param_nr) {
1199		int rid = 0;
1200
1201		pc->param = calloc(pc->param_nr * 4, sizeof(struct nv50_reg));
1202		if (!pc->param)
1203			goto out_err;
1204
1205		for (i = 0; i < pc->param_nr; i++) {
1206			for (c = 0; c < 4; c++) {
1207				pc->param[i*4+c].type = P_CONST;
1208				pc->param[i*4+c].hw = rid++;
1209				pc->param[i*4+c].index = i;
1210			}
1211		}
1212	}
1213
1214	if (pc->immd_nr) {
1215		int rid = 0;
1216
1217		pc->immd = calloc(pc->immd_nr * 4, sizeof(struct nv50_reg));
1218		if (!pc->immd)
1219			goto out_err;
1220
1221		for (i = 0; i < pc->immd_nr; i++) {
1222			for (c = 0; c < 4; c++) {
1223				pc->immd[i*4+c].type = P_IMMD;
1224				pc->immd[i*4+c].hw = rid++;
1225				pc->immd[i*4+c].index = i;
1226			}
1227		}
1228	}
1229
1230	ret = TRUE;
1231out_err:
1232	tgsi_parse_free(&p);
1233	return ret;
1234}
1235
1236static boolean
1237nv50_program_tx(struct nv50_program *p)
1238{
1239	struct tgsi_parse_context parse;
1240	struct nv50_pc *pc;
1241	boolean ret;
1242
1243	pc = CALLOC_STRUCT(nv50_pc);
1244	if (!pc)
1245		return FALSE;
1246	pc->p = p;
1247	pc->p->cfg.high_temp = 4;
1248
1249	ret = nv50_program_tx_prep(pc);
1250	if (ret == FALSE)
1251		goto out_cleanup;
1252
1253	tgsi_parse_init(&parse, pc->p->pipe.tokens);
1254	while (!tgsi_parse_end_of_tokens(&parse)) {
1255		const union tgsi_full_token *tok = &parse.FullToken;
1256
1257		tgsi_parse_token(&parse);
1258
1259		switch (tok->Token.Type) {
1260		case TGSI_TOKEN_TYPE_INSTRUCTION:
1261			ret = nv50_program_tx_insn(pc, tok);
1262			if (ret == FALSE)
1263				goto out_err;
1264			break;
1265		default:
1266			break;
1267		}
1268	}
1269
1270	p->immd_nr = pc->immd_nr * 4;
1271	p->immd = pc->immd_buf;
1272
1273out_err:
1274	tgsi_parse_free(&parse);
1275
1276out_cleanup:
1277	return ret;
1278}
1279
1280static void
1281nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p)
1282{
1283	int i;
1284
1285	if (nv50_program_tx(p) == FALSE)
1286		assert(0);
1287	/* *not* sufficient, it's fine if last inst is long and
1288	 * NOT immd - otherwise it's fucked fucked fucked */
1289	p->insns[p->insns_nr - 1] |= 0x00000001;
1290
1291	if (p->type == NV50_PROG_VERTEX) {
1292	for (i = 0; i < p->insns_nr; i++)
1293		NOUVEAU_ERR("VP0x%08x\n", p->insns[i]);
1294	} else {
1295	for (i = 0; i < p->insns_nr; i++)
1296		NOUVEAU_ERR("FP0x%08x\n", p->insns[i]);
1297	}
1298
1299	p->translated = TRUE;
1300}
1301
1302static void
1303nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)
1304{
1305	int i;
1306
1307	for (i = 0; i < p->immd_nr; i++) {
1308		BEGIN_RING(tesla, 0x0f00, 2);
1309		OUT_RING  ((NV50_CB_PMISC << 16) | (i << 8));
1310		OUT_RING  (fui(p->immd[i]));
1311	}
1312}
1313
1314static void
1315nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
1316{
1317	struct pipe_winsys *ws = nv50->pipe.winsys;
1318	void *map;
1319
1320	if (!p->buffer)
1321		p->buffer = ws->buffer_create(ws, 0x100, 0, p->insns_nr * 4);
1322	map = ws->buffer_map(ws, p->buffer, PIPE_BUFFER_USAGE_CPU_WRITE);
1323	memcpy(map, p->insns, p->insns_nr * 4);
1324	ws->buffer_unmap(ws, p->buffer);
1325}
1326
1327void
1328nv50_vertprog_validate(struct nv50_context *nv50)
1329{
1330	struct nouveau_grobj *tesla = nv50->screen->tesla;
1331	struct nv50_program *p = nv50->vertprog;
1332	struct nouveau_stateobj *so;
1333
1334	if (!p->translated) {
1335		nv50_program_validate(nv50, p);
1336		if (!p->translated)
1337			assert(0);
1338	}
1339
1340	nv50_program_validate_data(nv50, p);
1341	nv50_program_validate_code(nv50, p);
1342
1343	so = so_new(11, 2);
1344	so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2);
1345	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
1346		  NOUVEAU_BO_HIGH, 0, 0);
1347	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
1348		  NOUVEAU_BO_LOW, 0, 0);
1349	so_method(so, tesla, 0x1650, 2);
1350	so_data  (so, p->cfg.vp.attr[0]);
1351	so_data  (so, p->cfg.vp.attr[1]);
1352	so_method(so, tesla, 0x16ac, 2);
1353	so_data  (so, 8);
1354	so_data  (so, p->cfg.high_temp);
1355	so_method(so, tesla, 0x140c, 1);
1356	so_data  (so, 0); /* program start offset */
1357	so_emit(nv50->screen->nvws, so);
1358	so_ref(NULL, &so);
1359}
1360
1361void
1362nv50_fragprog_validate(struct nv50_context *nv50)
1363{
1364	struct nouveau_grobj *tesla = nv50->screen->tesla;
1365	struct nv50_program *p = nv50->fragprog;
1366	struct nouveau_stateobj *so;
1367
1368	if (!p->translated) {
1369		nv50_program_validate(nv50, p);
1370		if (!p->translated)
1371			assert(0);
1372	}
1373
1374	nv50_program_validate_data(nv50, p);
1375	nv50_program_validate_code(nv50, p);
1376
1377	so = so_new(7, 2);
1378	so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2);
1379	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
1380		  NOUVEAU_BO_HIGH, 0, 0);
1381	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
1382		  NOUVEAU_BO_LOW, 0, 0);
1383	so_method(so, tesla, 0x198c, 1);
1384	so_data  (so, p->cfg.high_temp);
1385	so_method(so, tesla, 0x1414, 1);
1386	so_data  (so, 0); /* program start offset */
1387	so_emit(nv50->screen->nvws, so);
1388	so_ref(NULL, &so);
1389}
1390
1391void
1392nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
1393{
1394	struct pipe_winsys *ws = nv50->pipe.winsys;
1395
1396	if (p->insns_nr) {
1397		if (p->insns)
1398			FREE(p->insns);
1399		p->insns_nr = 0;
1400	}
1401
1402	if (p->buffer)
1403		pipe_buffer_reference(ws, &p->buffer, NULL);
1404
1405	p->translated = 0;
1406}
1407
1408