nv50_program.c revision 34abb858e2aaef2c1a066a7cdb3e0376d6c9f6bd
1#include "pipe/p_context.h"
2#include "pipe/p_defines.h"
3#include "pipe/p_state.h"
4#include "pipe/p_inlines.h"
5
6#include "pipe/p_shader_tokens.h"
7#include "tgsi/util/tgsi_parse.h"
8#include "tgsi/util/tgsi_util.h"
9
10#include "nv50_context.h"
11#include "nv50_state.h"
12
13#define NV50_SU_MAX_TEMP 64
14
15/* ARL
16 * LIT - other buggery
17 * POW
18 * SAT
19 *
20 * MSB - Like MAD, but MUL+SUB
21 * 	- Fuck it off, introduce a way to negate args for ops that
22 * 	  support it.
23 *
24 * Look into inlining IMMD for ops other than MOV
25 * 	- Maybe even relax restrictions a bit, can't do P_RESULT + P_IMMD,
26 * 	  but can emit to P_TEMP first - then MOV later. NVIDIA does this
27 */
/* One scalar (single-channel) register operand tracked by the translator. */
struct nv50_reg {
	/* Register file the operand belongs to. */
	enum {
		P_TEMP,
		P_ATTR,
		P_RESULT,
		P_CONST,
		P_IMMD
	} type;

	/* TGSI register number; -1 marks registers created by the
	 * translator itself (see alloc_temp() / alloc_immd()). */
	int index;

	/* Hardware slot / offset; -1 while a P_TEMP has no slot yet. */
	int hw;

	/* Not referenced by any code visible in this file. */
	int neg;
};
41
42struct nv50_pc {
43	struct nv50_program *p;
44
45	/* hw resources */
46	struct nv50_reg *r_temp[NV50_SU_MAX_TEMP];
47
48	/* tgsi resources */
49	struct nv50_reg *temp;
50	int temp_nr;
51	struct nv50_reg *attr;
52	int attr_nr;
53	struct nv50_reg *result;
54	int result_nr;
55	struct nv50_reg *param;
56	int param_nr;
57	struct nv50_reg *immd;
58	float *immd_buf;
59	int immd_nr;
60
61	struct nv50_reg *temp_temp[8];
62	unsigned temp_temp_nr;
63};
64
65static void
66alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
67{
68	int i;
69
70	if (reg->type != P_TEMP)
71		return;
72
73	if (reg->hw >= 0) {
74		/*XXX: do this here too to catch FP temp-as-attr usage..
75		 *     not clean, but works */
76		if (pc->p->cfg.high_temp < (reg->hw + 1))
77			pc->p->cfg.high_temp = reg->hw + 1;
78		return;
79	}
80
81	for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
82		if (!(pc->r_temp[i])) {
83			pc->r_temp[i] = reg;
84			reg->hw = i;
85			if (pc->p->cfg.high_temp < (i + 1))
86				pc->p->cfg.high_temp = i + 1;
87			return;
88		}
89	}
90
91	assert(0);
92}
93
94static struct nv50_reg *
95alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
96{
97	struct nv50_reg *r;
98	int i;
99
100	if (dst && dst->type == P_TEMP && dst->hw == -1)
101		return dst;
102
103	for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
104		if (!pc->r_temp[i]) {
105			r = CALLOC_STRUCT(nv50_reg);
106			r->type = P_TEMP;
107			r->index = -1;
108			r->hw = i;
109			pc->r_temp[i] = r;
110			return r;
111		}
112	}
113
114	assert(0);
115	return NULL;
116}
117
118static void
119free_temp(struct nv50_pc *pc, struct nv50_reg *r)
120{
121	if (r->index == -1) {
122		FREE(pc->r_temp[r->hw]);
123		pc->r_temp[r->hw] = NULL;
124	}
125}
126
127static struct nv50_reg *
128temp_temp(struct nv50_pc *pc)
129{
130	if (pc->temp_temp_nr >= 8)
131		assert(0);
132
133	pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL);
134	return pc->temp_temp[pc->temp_temp_nr++];
135}
136
137static void
138kill_temp_temp(struct nv50_pc *pc)
139{
140	int i;
141
142	for (i = 0; i < pc->temp_temp_nr; i++)
143		free_temp(pc, pc->temp_temp[i]);
144	pc->temp_temp_nr = 0;
145}
146
147static int
148ctor_immd(struct nv50_pc *pc, float x, float y, float z, float w)
149{
150	pc->immd_buf = realloc(pc->immd_buf, (pc->immd_nr + 1) * 4 *
151					     sizeof(float));
152	pc->immd_buf[(pc->immd_nr * 4) + 0] = x;
153	pc->immd_buf[(pc->immd_nr * 4) + 1] = x;
154	pc->immd_buf[(pc->immd_nr * 4) + 2] = x;
155	pc->immd_buf[(pc->immd_nr * 4) + 3] = x;
156
157	return pc->immd_nr++;
158}
159
160static struct nv50_reg *
161alloc_immd(struct nv50_pc *pc, float f)
162{
163	struct nv50_reg *r = CALLOC_STRUCT(nv50_reg);
164	unsigned hw;
165
166	hw = ctor_immd(pc, f, 0, 0, 0);
167	r->type = P_IMMD;
168	r->hw = hw;
169	r->index = -1;
170	return r;
171}
172
173static struct nv50_reg *
174tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
175{
176	switch (dst->DstRegister.File) {
177	case TGSI_FILE_TEMPORARY:
178		return &pc->temp[dst->DstRegister.Index * 4 + c];
179	case TGSI_FILE_OUTPUT:
180		return &pc->result[dst->DstRegister.Index * 4 + c];
181	case TGSI_FILE_NULL:
182		return NULL;
183	default:
184		break;
185	}
186
187	return NULL;
188}
189
190static struct nv50_reg *
191tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src)
192{
193	struct nv50_reg *r = NULL;
194	unsigned c;
195
196	c = tgsi_util_get_full_src_register_extswizzle(src, chan);
197	switch (c) {
198	case TGSI_EXTSWIZZLE_X:
199	case TGSI_EXTSWIZZLE_Y:
200	case TGSI_EXTSWIZZLE_Z:
201	case TGSI_EXTSWIZZLE_W:
202		switch (src->SrcRegister.File) {
203		case TGSI_FILE_INPUT:
204			r = &pc->attr[src->SrcRegister.Index * 4 + c];
205			break;
206		case TGSI_FILE_TEMPORARY:
207			r = &pc->temp[src->SrcRegister.Index * 4 + c];
208			break;
209		case TGSI_FILE_CONSTANT:
210			r = &pc->param[src->SrcRegister.Index * 4 + c];
211			break;
212		case TGSI_FILE_IMMEDIATE:
213			r = &pc->immd[src->SrcRegister.Index * 4 + c];
214			break;
215		default:
216			assert(0);
217			break;
218		}
219		break;
220	case TGSI_EXTSWIZZLE_ZERO:
221		r = alloc_immd(pc, 0.0);
222		break;
223	case TGSI_EXTSWIZZLE_ONE:
224		r = alloc_immd(pc, 1.0);
225		break;
226	default:
227		assert(0);
228		break;
229	}
230
231	switch (tgsi_util_get_full_src_register_sign_mode(src, chan)) {
232	case TGSI_UTIL_SIGN_KEEP:
233		break;
234	default:
235		assert(0);
236		break;
237	}
238
239	return r;
240}
241
242static void
243emit(struct nv50_pc *pc, unsigned *inst)
244{
245	struct nv50_program *p = pc->p;
246
247	if (inst[0] & 1) {
248		p->insns_nr += 2;
249		p->insns = realloc(p->insns, sizeof(unsigned) * p->insns_nr);
250		memcpy(p->insns + (p->insns_nr - 2), inst, sizeof(unsigned)*2);
251	} else {
252		p->insns_nr += 1;
253		p->insns = realloc(p->insns, sizeof(unsigned) * p->insns_nr);
254		memcpy(p->insns + (p->insns_nr - 1), inst, sizeof(unsigned));
255	}
256}
257
258static INLINE void set_long(struct nv50_pc *, unsigned *);
259
260static boolean
261is_long(unsigned *inst)
262{
263	if (inst[0] & 1)
264		return TRUE;
265	return FALSE;
266}
267
268static boolean
269is_immd(unsigned *inst)
270{
271	if (is_long(inst) && (inst[1] & 3) == 3)
272		return TRUE;
273	return FALSE;
274}
275
276static INLINE void
277set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx, unsigned *inst)
278{
279	set_long(pc, inst);
280	inst[1] &= ~((0x1f << 7) | (0x3 << 12));
281	inst[1] |= (pred << 7) | (idx << 12);
282}
283
284static INLINE void
285set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx, unsigned *inst)
286{
287	set_long(pc, inst);
288	inst[1] &= ~((0x3 << 4) | (1 << 6));
289	inst[1] |= (idx << 4) | (on << 6);
290}
291
292static INLINE void
293set_long(struct nv50_pc *pc, unsigned *inst)
294{
295	if (is_long(inst))
296		return;
297
298	inst[0] |= 1;
299	set_pred(pc, 0xf, 0, inst);
300	set_pred_wr(pc, 0, 0, inst);
301}
302
303static INLINE void
304set_dst(struct nv50_pc *pc, struct nv50_reg *dst, unsigned *inst)
305{
306	if (dst->type == P_RESULT) {
307		set_long(pc, inst);
308		inst[1] |= 0x00000008;
309	}
310
311	alloc_reg(pc, dst);
312	inst[0] |= (dst->hw << 2);
313}
314
315static INLINE void
316set_immd(struct nv50_pc *pc, struct nv50_reg *imm, unsigned *inst)
317{
318	unsigned val = fui(pc->immd_buf[imm->hw]); /* XXX */
319
320	set_long(pc, inst);
321	/*XXX: can't be predicated - bits overlap.. catch cases where both
322	 *     are required and avoid them. */
323	set_pred(pc, 0, 0, inst);
324	set_pred_wr(pc, 0, 0, inst);
325
326	inst[1] |= 0x00000002 | 0x00000001;
327	inst[0] |= (val & 0x3f) << 16;
328	inst[1] |= (val >> 6) << 2;
329}
330
331static void
332emit_interp(struct nv50_pc *pc, struct nv50_reg *dst,
333	    struct nv50_reg *src, struct nv50_reg *iv, boolean noperspective)
334{
335	unsigned inst[2] = { 0, 0 };
336
337	inst[0] |= 0x80000000;
338	set_dst(pc, dst, inst);
339	alloc_reg(pc, iv);
340	inst[0] |= (iv->hw << 9);
341	alloc_reg(pc, src);
342	inst[0] |= (src->hw << 16);
343	if (noperspective)
344		inst[0] |= (1 << 25);
345
346	emit(pc, inst);
347}
348
349static void
350set_cseg(struct nv50_pc *pc, struct nv50_reg *src, unsigned *inst)
351{
352	set_long(pc, inst);
353	if (src->type == P_IMMD) {
354		inst[1] |= (NV50_CB_PMISC << 22);
355	} else {
356		if (pc->p->type == NV50_PROG_VERTEX)
357			inst[1] |= (NV50_CB_PVP << 22);
358		else
359			inst[1] |= (NV50_CB_PFP << 22);
360	}
361}
362
363static void
364emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
365{
366	unsigned inst[2] = { 0, 0 };
367
368	inst[0] |= 0x10000000;
369
370	set_dst(pc, dst, inst);
371
372	if (dst->type != P_RESULT && src->type == P_IMMD) {
373		set_immd(pc, src, inst);
374		/*XXX: 32-bit, but steals part of "half" reg space - need to
375		 *     catch and handle this case if/when we do half-regs
376		 */
377		inst[0] |= 0x00008000;
378	} else
379	if (src->type == P_IMMD || src->type == P_CONST) {
380		set_long(pc, inst);
381		set_cseg(pc, src, inst);
382		inst[0] |= (src->hw << 9);
383		inst[1] |= 0x20000000; /* src0 const? */
384	} else {
385		if (src->type == P_ATTR) {
386			set_long(pc, inst);
387			inst[1] |= 0x00200000;
388		}
389
390		alloc_reg(pc, src);
391		inst[0] |= (src->hw << 9);
392	}
393
394	/* We really should support "half" instructions here at some point,
395	 * but I don't feel confident enough about them yet.
396	 */
397	set_long(pc, inst);
398	if (is_long(inst) && !is_immd(inst)) {
399		inst[1] |= 0x04000000; /* 32-bit */
400		inst[1] |= 0x0003c000; /* "subsubop" 0xf == mov */
401	}
402
403	emit(pc, inst);
404}
405
406static boolean
407check_swap_src_0_1(struct nv50_pc *pc,
408		   struct nv50_reg **s0, struct nv50_reg **s1)
409{
410	struct nv50_reg *src0 = *s0, *src1 = *s1;
411
412	if (src0->type == P_CONST) {
413		if (src1->type != P_CONST) {
414			*s0 = src1;
415			*s1 = src0;
416			return TRUE;
417		}
418	} else
419	if (src1->type == P_ATTR) {
420		if (src0->type != P_ATTR) {
421			*s0 = src1;
422			*s1 = src0;
423			return TRUE;
424		}
425	}
426
427	return FALSE;
428}
429
430static void
431set_src_0(struct nv50_pc *pc, struct nv50_reg *src, unsigned *inst)
432{
433	if (src->type == P_ATTR) {
434		set_long(pc, inst);
435		inst[1] |= 0x00200000;
436	} else
437	if (src->type == P_CONST || src->type == P_IMMD) {
438		struct nv50_reg *temp = temp_temp(pc);
439
440		emit_mov(pc, temp, src);
441		src = temp;
442	}
443
444	alloc_reg(pc, src);
445	inst[0] |= (src->hw << 9);
446}
447
448static void
449set_src_1(struct nv50_pc *pc, struct nv50_reg *src, unsigned *inst)
450{
451	if (src->type == P_ATTR) {
452		struct nv50_reg *temp = temp_temp(pc);
453
454		emit_mov(pc, temp, src);
455		src = temp;
456	} else
457	if (src->type == P_CONST || src->type == P_IMMD) {
458		set_cseg(pc, src, inst);
459		inst[0] |= 0x00800000;
460	}
461
462	alloc_reg(pc, src);
463	inst[0] |= (src->hw << 16);
464}
465
466static void
467set_src_2(struct nv50_pc *pc, struct nv50_reg *src, unsigned *inst)
468{
469	set_long(pc, inst);
470
471	if (src->type == P_ATTR) {
472		struct nv50_reg *temp = temp_temp(pc);
473
474		emit_mov(pc, temp, src);
475		src = temp;
476	} else
477	if (src->type == P_CONST || src->type == P_IMMD) {
478		set_cseg(pc, src, inst);
479		inst[0] |= 0x01000000;
480	}
481
482	alloc_reg(pc, src);
483	inst[1] |= (src->hw << 14);
484}
485
/* Emit dst = src0 * src1 (opcode word 0xc0000000). */
static void
emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
	 struct nv50_reg *src1)
{
	unsigned op[2] = { 0xc0000000, 0 };

	/* Multiplication commutes, so the operands may be reordered. */
	check_swap_src_0_1(pc, &src0, &src1);

	set_dst(pc, dst, op);
	set_src_0(pc, src0, op);
	set_src_1(pc, src1, op);

	emit(pc, op);
}
501
/* Emit dst = src0 + src1 (opcode word 0xb0000000).  The second operand
 * goes into the src1 field of a short instruction, or into the src2
 * field once the instruction has become long. */
static void
emit_add(struct nv50_pc *pc, struct nv50_reg *dst,
	 struct nv50_reg *src0, struct nv50_reg *src1)
{
	unsigned op[2] = { 0xb0000000, 0 };

	check_swap_src_0_1(pc, &src0, &src1);

	set_dst(pc, dst, op);
	set_src_0(pc, src0, op);

	if (!is_long(op))
		set_src_1(pc, src1, op);
	else
		set_src_2(pc, src1, op);

	emit(pc, op);
}
520
/* Emit a min/max style operation: a long 0xb0000000 instruction whose
 * sub-opcode (bits 29..31 of word 1) is given by sub. */
static void
emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst,
	    struct nv50_reg *src0, struct nv50_reg *src1)
{
	unsigned op[2] = { 0xb0000000, 0 };

	set_long(pc, op);
	op[1] |= (sub << 29);

	check_swap_src_0_1(pc, &src0, &src1);

	set_dst(pc, dst, op);
	set_src_0(pc, src0, op);
	set_src_1(pc, src1, op);

	emit(pc, op);
}
538
/* Emit dst = src0 - src1 as a long add with one operand negated.  When
 * the operands had to be swapped a different negate flag is used so the
 * result keeps its sign. */
static void
emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
	 struct nv50_reg *src1)
{
	unsigned op[2] = { 0xb0000000, 0 };

	set_long(pc, op);
	op[1] |= check_swap_src_0_1(pc, &src0, &src1) ? 0x04000000
						       : 0x08000000;

	set_dst(pc, dst, op);
	set_src_0(pc, src0, op);
	set_src_2(pc, src1, op);

	emit(pc, op);
}
559
/* Emit dst = src0 * src1 + src2 (opcode word 0xe0000000). */
static void
emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
	 struct nv50_reg *src1, struct nv50_reg *src2)
{
	unsigned op[2] = { 0xe0000000, 0 };

	/* Only the two factors may be reordered. */
	check_swap_src_0_1(pc, &src0, &src1);

	set_dst(pc, dst, op);
	set_src_0(pc, src0, op);
	set_src_1(pc, src1, op);
	set_src_2(pc, src2, op);

	emit(pc, op);
}
576
/* Emit dst = src0 * src1 - src2: the mad opcode in long form with the
 * subtract flag set in word 1. */
static void
emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
	 struct nv50_reg *src1, struct nv50_reg *src2)
{
	unsigned op[2] = { 0xe0000000, 0 };

	set_long(pc, op);
	op[1] |= 0x08000000; /* src0 * src1 - src2 */

	check_swap_src_0_1(pc, &src0, &src1);

	set_dst(pc, dst, op);
	set_src_0(pc, src0, op);
	set_src_1(pc, src1, op);
	set_src_2(pc, src2, op);

	emit(pc, op);
}
595
/* Emit a single-operand 0x90000000 instruction.  A non-zero sub selects
 * the operation through bits 29..31 of word 1 and therefore forces the
 * long encoding; sub == 0 keeps the instruction short. */
static void
emit_flop(struct nv50_pc *pc, unsigned sub,
	  struct nv50_reg *dst, struct nv50_reg *src)
{
	unsigned op[2] = { 0x90000000, 0 };

	if (sub != 0) {
		set_long(pc, op);
		op[1] |= (sub << 29);
	}

	set_dst(pc, dst, op);
	set_src_0(pc, src, op);

	emit(pc, op);
}
613
/* Emit the operand pre-processing instruction that the EX2 translation
 * issues before the actual exponent instruction. */
static void
emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	unsigned op[2] = { 0xb0000000, 0 };

	set_dst(pc, dst, op);
	set_src_0(pc, src, op);

	set_long(pc, op);
	/* sub-opcode 6 in bits 29..31, plus bit 14 */
	op[1] |= 0xc0000000 | 0x00004000;

	emit(pc, op);
}
628/*XXX: inaccurate results.. why? */
629#define ALLOW_SET_SWAP 0
630
631static void
632emit_set(struct nv50_pc *pc, unsigned c_op, struct nv50_reg *dst,
633	 struct nv50_reg *src0, struct nv50_reg *src1)
634{
635	unsigned inst[2] = { 0, 0 };
636#if ALLOW_SET_SWAP
637	unsigned inv_cop[8] = { 0, 6, 2, 4, 3, 5, 1, 7 };
638#endif
639	struct nv50_reg *rdst;
640
641#if ALLOW_SET_SWAP
642	assert(c_op <= 7);
643	if (check_swap_src_0_1(pc, &src0, &src1))
644		c_op = inv_cop[c_op];
645#endif
646
647	rdst = dst;
648	if (dst->type != P_TEMP)
649		dst = alloc_temp(pc, NULL);
650
651	/* set.u32 */
652	set_long(pc, inst);
653	inst[0] |= 0xb0000000;
654	inst[1] |= (3 << 29);
655	inst[1] |= (c_op << 14);
656	/*XXX: breaks things, .u32 by default?
657	 *     decuda will disasm as .u16 and use .lo/.hi regs, but this
658	 *     doesn't seem to match what the hw actually does.
659	inst[1] |= 0x04000000; << breaks things.. .u32 by default?
660	 */
661	set_dst(pc, dst, inst);
662	set_src_0(pc, src0, inst);
663	set_src_1(pc, src1, inst);
664	emit(pc, inst);
665
666	/* cvt.f32.u32 */
667	inst[0] = 0xa0000001;
668	inst[1] = 0x64014780;
669	set_dst(pc, rdst, inst);
670	set_src_0(pc, dst, inst);
671	emit(pc, inst);
672
673	if (dst != rdst)
674		free_temp(pc, dst);
675}
676
/* Emit dst = floor(src) as a float-to-float convert in integer rounding
 * mode.
 *
 * All mode bits live in the second instruction word, so the instruction
 * must stay in the long encoding.  The opcode is therefore OR-ed into
 * word 0: assigning it (as was done before) cleared the long flag set by
 * set_long(), and emit() then dropped word 1 whenever dst was not a
 * P_RESULT register.
 */
static void
emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	unsigned inst[2] = { 0, 0 };

	set_long(pc, inst);
	inst[0] |= 0xa0000000; /* cvt */
	inst[1] |= (6u << 29); /* cvt */
	inst[1] |= 0x08000000; /* integer mode */
	inst[1] |= 0x04000000; /* 32 bit */
	inst[1] |= ((0x1 << 3)) << 14; /* .rn */
	inst[1] |= (1 << 14); /* src .f32 */
	set_dst(pc, dst, inst);
	set_src_0(pc, src, inst);

	emit(pc, inst);
}
694
695static boolean
696nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
697{
698	const struct tgsi_full_instruction *inst = &tok->FullInstruction;
699	struct nv50_reg *dst[4], *src[3][4], *temp;
700	unsigned mask;
701	int i, c;
702
703	NOUVEAU_ERR("insn %p\n", tok);
704
705	mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
706
707	for (c = 0; c < 4; c++) {
708		if (mask & (1 << c))
709			dst[c] = tgsi_dst(pc, c, &inst->FullDstRegisters[0]);
710		else
711			dst[c] = NULL;
712	}
713
714	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
715		for (c = 0; c < 4; c++)
716			src[i][c] = tgsi_src(pc, c, &inst->FullSrcRegisters[i]);
717	}
718
719	switch (inst->Instruction.Opcode) {
720	case TGSI_OPCODE_ABS:
721		for (c = 0; c < 4; c++) {
722			unsigned inst[2] = { 0, 0 };
723
724			set_long(pc, inst);
725			inst[0] = 0xa0000000; /* cvt */
726			inst[1] |= (6 << 29); /* cvt */
727			inst[1] |= 0x04000000; /* 32 bit */
728			inst[1] |= (1 << 14); /* src .f32 */
729			inst[1] |= ((1 << 6) << 14); /* .abs */
730			set_dst(pc, dst[c], inst);
731			set_src_0(pc, src[0][c], inst);
732			emit(pc, inst);
733		}
734		break;
735	case TGSI_OPCODE_ADD:
736		for (c = 0; c < 4; c++) {
737			if (!(mask & (1 << c)))
738				continue;
739			emit_add(pc, dst[c], src[0][c], src[1][c]);
740		}
741		break;
742	case TGSI_OPCODE_COS:
743		for (c = 0; c < 4; c++) {
744			if (!(mask & (1 << c)))
745				continue;
746			emit_flop(pc, 5, dst[c], src[0][c]);
747		}
748		break;
749	case TGSI_OPCODE_DP3:
750		temp = alloc_temp(pc, NULL);
751		emit_mul(pc, temp, src[0][0], src[1][0]);
752		emit_mad(pc, temp, src[0][1], src[1][1], temp);
753		emit_mad(pc, temp, src[0][2], src[1][2], temp);
754		for (c = 0; c < 4; c++) {
755			if (!(mask & (1 << c)))
756				continue;
757			emit_mov(pc, dst[c], temp);
758		}
759		free_temp(pc, temp);
760		break;
761	case TGSI_OPCODE_DP4:
762		temp = alloc_temp(pc, NULL);
763		emit_mul(pc, temp, src[0][0], src[1][0]);
764		emit_mad(pc, temp, src[0][1], src[1][1], temp);
765		emit_mad(pc, temp, src[0][2], src[1][2], temp);
766		emit_mad(pc, temp, src[0][3], src[1][3], temp);
767		for (c = 0; c < 4; c++) {
768			if (!(mask & (1 << c)))
769				continue;
770			emit_mov(pc, dst[c], temp);
771		}
772		free_temp(pc, temp);
773		break;
774	case TGSI_OPCODE_DPH:
775		temp = alloc_temp(pc, NULL);
776		emit_mul(pc, temp, src[0][0], src[1][0]);
777		emit_mad(pc, temp, src[0][1], src[1][1], temp);
778		emit_mad(pc, temp, src[0][2], src[1][2], temp);
779		emit_add(pc, temp, src[1][3], temp);
780		for (c = 0; c < 4; c++) {
781			if (!(mask & (1 << c)))
782				continue;
783			emit_mov(pc, dst[c], temp);
784		}
785		free_temp(pc, temp);
786		break;
787	case TGSI_OPCODE_DST:
788	{
789		struct nv50_reg *one = alloc_immd(pc, 1.0);
790		emit_mov(pc, dst[0], one);
791		emit_mul(pc, dst[1], src[0][1], src[1][1]);
792		emit_mov(pc, dst[2], src[0][2]);
793		emit_mov(pc, dst[3], src[1][3]);
794		FREE(one);
795	}
796		break;
797	case TGSI_OPCODE_EX2:
798		temp = alloc_temp(pc, NULL);
799		for (c = 0; c < 4; c++) {
800			if (!(mask & (1 << c)))
801				continue;
802			emit_preex2(pc, temp, src[0][c]);
803			emit_flop(pc, 6, dst[c], temp);
804		}
805		free_temp(pc, temp);
806		break;
807	case TGSI_OPCODE_FLR:
808		for (c = 0; c < 4; c++) {
809			if (!(mask & (1 << c)))
810				continue;
811			emit_flr(pc, dst[c], src[0][c]);
812		}
813		break;
814	case TGSI_OPCODE_FRC:
815		temp = alloc_temp(pc, NULL);
816		for (c = 0; c < 4; c++) {
817			if (!(mask & (1 << c)))
818				continue;
819			emit_flr(pc, temp, src[0][c]);
820			emit_sub(pc, dst[c], src[0][c], temp);
821		}
822		free_temp(pc, temp);
823		break;
824	case TGSI_OPCODE_LG2:
825		for (c = 0; c < 4; c++) {
826			if (!(mask & (1 << c)))
827				continue;
828			emit_flop(pc, 3, dst[c], src[0][c]);
829		}
830		break;
831	case TGSI_OPCODE_MAD:
832		for (c = 0; c < 4; c++) {
833			if (!(mask & (1 << c)))
834				continue;
835			emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]);
836		}
837		break;
838	case TGSI_OPCODE_MAX:
839		for (c = 0; c < 4; c++) {
840			if (!(mask & (1 << c)))
841				continue;
842			emit_minmax(pc, 4, dst[c], src[0][c], src[1][c]);
843		}
844		break;
845	case TGSI_OPCODE_MIN:
846		for (c = 0; c < 4; c++) {
847			if (!(mask & (1 << c)))
848				continue;
849			emit_minmax(pc, 5, dst[c], src[0][c], src[1][c]);
850		}
851		break;
852	case TGSI_OPCODE_MOV:
853		for (c = 0; c < 4; c++) {
854			if (!(mask & (1 << c)))
855				continue;
856			emit_mov(pc, dst[c], src[0][c]);
857		}
858		break;
859	case TGSI_OPCODE_MUL:
860		for (c = 0; c < 4; c++) {
861			if (!(mask & (1 << c)))
862				continue;
863			emit_mul(pc, dst[c], src[0][c], src[1][c]);
864		}
865		break;
866	case TGSI_OPCODE_RCP:
867		for (c = 0; c < 4; c++) {
868			if (!(mask & (1 << c)))
869				continue;
870			emit_flop(pc, 0, dst[c], src[0][c]);
871		}
872		break;
873	case TGSI_OPCODE_RSQ:
874		for (c = 0; c < 4; c++) {
875			if (!(mask & (1 << c)))
876				continue;
877			emit_flop(pc, 2, dst[c], src[0][c]);
878		}
879		break;
880	case TGSI_OPCODE_SGE:
881		for (c = 0; c < 4; c++) {
882			if (!(mask & (1 << c)))
883				continue;
884			emit_set(pc, 6, dst[c], src[0][c], src[1][c]);
885		}
886		break;
887	case TGSI_OPCODE_SIN:
888		for (c = 0; c < 4; c++) {
889			if (!(mask & (1 << c)))
890				continue;
891			emit_flop(pc, 4, dst[c], src[0][c]);
892		}
893		break;
894	case TGSI_OPCODE_SLT:
895		for (c = 0; c < 4; c++) {
896			if (!(mask & (1 << c)))
897				continue;
898			emit_set(pc, 1, dst[c], src[0][c], src[1][c]);
899		}
900		break;
901	case TGSI_OPCODE_SUB:
902		for (c = 0; c < 4; c++) {
903			if (!(mask & (1 << c)))
904				continue;
905			emit_sub(pc, dst[c], src[0][c], src[1][c]);
906		}
907		break;
908	case TGSI_OPCODE_XPD:
909		temp = alloc_temp(pc, NULL);
910		emit_mul(pc, temp, src[0][2], src[1][1]);
911		emit_msb(pc, dst[0], src[0][1], src[1][2], temp);
912		emit_mul(pc, temp, src[0][0], src[1][2]);
913		emit_msb(pc, dst[1], src[0][2], src[1][0], temp);
914		emit_mul(pc, temp, src[0][1], src[1][0]);
915		emit_msb(pc, dst[2], src[0][0], src[1][1], temp);
916		free_temp(pc, temp);
917		break;
918	case TGSI_OPCODE_END:
919		break;
920	default:
921		NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode);
922		return FALSE;
923	}
924
925	kill_temp_temp(pc);
926	return TRUE;
927}
928
929static boolean
930nv50_program_tx_prep(struct nv50_pc *pc)
931{
932	struct tgsi_parse_context p;
933	boolean ret = FALSE;
934	unsigned i, c;
935
936	tgsi_parse_init(&p, pc->p->pipe.tokens);
937	while (!tgsi_parse_end_of_tokens(&p)) {
938		const union tgsi_full_token *tok = &p.FullToken;
939
940		tgsi_parse_token(&p);
941		switch (tok->Token.Type) {
942		case TGSI_TOKEN_TYPE_IMMEDIATE:
943		{
944			const struct tgsi_full_immediate *imm =
945				&p.FullToken.FullImmediate;
946
947			ctor_immd(pc, imm->u.ImmediateFloat32[0].Float,
948				      imm->u.ImmediateFloat32[1].Float,
949				      imm->u.ImmediateFloat32[2].Float,
950				      imm->u.ImmediateFloat32[3].Float);
951		}
952			break;
953		case TGSI_TOKEN_TYPE_DECLARATION:
954		{
955			const struct tgsi_full_declaration *d;
956			unsigned last;
957
958			d = &p.FullToken.FullDeclaration;
959			last = d->u.DeclarationRange.Last;
960
961			switch (d->Declaration.File) {
962			case TGSI_FILE_TEMPORARY:
963				if (pc->temp_nr < (last + 1))
964					pc->temp_nr = last + 1;
965				break;
966			case TGSI_FILE_OUTPUT:
967				if (pc->result_nr < (last + 1))
968					pc->result_nr = last + 1;
969				break;
970			case TGSI_FILE_INPUT:
971				if (pc->attr_nr < (last + 1))
972					pc->attr_nr = last + 1;
973				break;
974			case TGSI_FILE_CONSTANT:
975				if (pc->param_nr < (last + 1))
976					pc->param_nr = last + 1;
977				break;
978			default:
979				NOUVEAU_ERR("bad decl file %d\n",
980					    d->Declaration.File);
981				goto out_err;
982			}
983		}
984			break;
985		case TGSI_TOKEN_TYPE_INSTRUCTION:
986			break;
987		default:
988			break;
989		}
990	}
991
992	NOUVEAU_ERR("%d temps\n", pc->temp_nr);
993	if (pc->temp_nr) {
994		pc->temp = calloc(pc->temp_nr * 4, sizeof(struct nv50_reg));
995		if (!pc->temp)
996			goto out_err;
997
998		for (i = 0; i < pc->temp_nr; i++) {
999			for (c = 0; c < 4; c++) {
1000				pc->temp[i*4+c].type = P_TEMP;
1001				pc->temp[i*4+c].hw = -1;
1002				pc->temp[i*4+c].index = i;
1003			}
1004		}
1005	}
1006
1007	NOUVEAU_ERR("%d attrib regs\n", pc->attr_nr);
1008	if (pc->attr_nr) {
1009		struct nv50_reg *iv = NULL, *tmp = NULL;
1010		int aid = 0;
1011
1012		pc->attr = calloc(pc->attr_nr * 4, sizeof(struct nv50_reg));
1013		if (!pc->attr)
1014			goto out_err;
1015
1016		if (pc->p->type == NV50_PROG_FRAGMENT) {
1017			iv = alloc_temp(pc, NULL);
1018			aid++;
1019		}
1020
1021		for (i = 0; i < pc->attr_nr; i++) {
1022			struct nv50_reg *a = &pc->attr[i*4];
1023
1024			for (c = 0; c < 4; c++) {
1025				if (pc->p->type == NV50_PROG_FRAGMENT) {
1026					struct nv50_reg *at =
1027						alloc_temp(pc, NULL);
1028					pc->attr[i*4+c].type = at->type;
1029					pc->attr[i*4+c].hw = at->hw;
1030					pc->attr[i*4+c].index = at->index;
1031				} else {
1032					pc->p->cfg.vp.attr[aid/32] |=
1033						(1 << (aid % 32));
1034					pc->attr[i*4+c].type = P_ATTR;
1035					pc->attr[i*4+c].hw = aid++;
1036					pc->attr[i*4+c].index = i;
1037				}
1038			}
1039
1040			if (pc->p->type != NV50_PROG_FRAGMENT)
1041				continue;
1042
1043			emit_interp(pc, iv, iv, iv, FALSE);
1044			tmp = alloc_temp(pc, NULL);
1045			{
1046				unsigned inst[2] = { 0, 0 };
1047				inst[0]  = 0x90000000;
1048				inst[0] |= (tmp->hw << 2);
1049				emit(pc, inst);
1050			}
1051			emit_interp(pc, &a[0], &a[0], tmp, TRUE);
1052			emit_interp(pc, &a[1], &a[1], tmp, TRUE);
1053			emit_interp(pc, &a[2], &a[2], tmp, TRUE);
1054			emit_interp(pc, &a[3], &a[3], tmp, TRUE);
1055			free_temp(pc, tmp);
1056		}
1057
1058		if (iv)
1059			free_temp(pc, iv);
1060	}
1061
1062	NOUVEAU_ERR("%d result regs\n", pc->result_nr);
1063	if (pc->result_nr) {
1064		int rid = 0;
1065
1066		pc->result = calloc(pc->result_nr * 4, sizeof(struct nv50_reg));
1067		if (!pc->result)
1068			goto out_err;
1069
1070		for (i = 0; i < pc->result_nr; i++) {
1071			for (c = 0; c < 4; c++) {
1072				if (pc->p->type == NV50_PROG_FRAGMENT)
1073					pc->result[i*4+c].type = P_TEMP;
1074				else
1075					pc->result[i*4+c].type = P_RESULT;
1076				pc->result[i*4+c].hw = rid++;
1077				pc->result[i*4+c].index = i;
1078			}
1079		}
1080	}
1081
1082	NOUVEAU_ERR("%d param regs\n", pc->param_nr);
1083	if (pc->param_nr) {
1084		int rid = 0;
1085
1086		pc->param = calloc(pc->param_nr * 4, sizeof(struct nv50_reg));
1087		if (!pc->param)
1088			goto out_err;
1089
1090		for (i = 0; i < pc->param_nr; i++) {
1091			for (c = 0; c < 4; c++) {
1092				pc->param[i*4+c].type = P_CONST;
1093				pc->param[i*4+c].hw = rid++;
1094				pc->param[i*4+c].index = i;
1095			}
1096		}
1097	}
1098
1099	if (pc->immd_nr) {
1100		int rid = 0;
1101
1102		pc->immd = calloc(pc->immd_nr * 4, sizeof(struct nv50_reg));
1103		if (!pc->immd)
1104			goto out_err;
1105
1106		for (i = 0; i < pc->immd_nr; i++) {
1107			for (c = 0; c < 4; c++) {
1108				pc->immd[i*4+c].type = P_IMMD;
1109				pc->immd[i*4+c].hw = rid++;
1110				pc->immd[i*4+c].index = i;
1111			}
1112		}
1113	}
1114
1115	ret = TRUE;
1116out_err:
1117	tgsi_parse_free(&p);
1118	return ret;
1119}
1120
1121static boolean
1122nv50_program_tx(struct nv50_program *p)
1123{
1124	struct tgsi_parse_context parse;
1125	struct nv50_pc *pc;
1126	boolean ret;
1127
1128	pc = CALLOC_STRUCT(nv50_pc);
1129	if (!pc)
1130		return FALSE;
1131	pc->p = p;
1132	pc->p->cfg.high_temp = 4;
1133
1134	ret = nv50_program_tx_prep(pc);
1135	if (ret == FALSE)
1136		goto out_cleanup;
1137
1138	tgsi_parse_init(&parse, pc->p->pipe.tokens);
1139	while (!tgsi_parse_end_of_tokens(&parse)) {
1140		const union tgsi_full_token *tok = &parse.FullToken;
1141
1142		tgsi_parse_token(&parse);
1143
1144		switch (tok->Token.Type) {
1145		case TGSI_TOKEN_TYPE_INSTRUCTION:
1146			ret = nv50_program_tx_insn(pc, tok);
1147			if (ret == FALSE)
1148				goto out_err;
1149			break;
1150		default:
1151			break;
1152		}
1153	}
1154
1155	p->immd_nr = pc->immd_nr * 4;
1156	p->immd = pc->immd_buf;
1157
1158out_err:
1159	tgsi_parse_free(&parse);
1160
1161out_cleanup:
1162	return ret;
1163}
1164
1165static void
1166nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p)
1167{
1168	int i;
1169
1170	if (nv50_program_tx(p) == FALSE)
1171		assert(0);
1172	/* *not* sufficient, it's fine if last inst is long and
1173	 * NOT immd - otherwise it's fucked fucked fucked */
1174	p->insns[p->insns_nr - 1] |= 0x00000001;
1175
1176	if (p->type == NV50_PROG_VERTEX) {
1177	for (i = 0; i < p->insns_nr; i++)
1178		NOUVEAU_ERR("VP0x%08x\n", p->insns[i]);
1179	} else {
1180	for (i = 0; i < p->insns_nr; i++)
1181		NOUVEAU_ERR("FP0x%08x\n", p->insns[i]);
1182	}
1183
1184	p->translated = TRUE;
1185}
1186
1187static void
1188nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)
1189{
1190	int i;
1191
1192	for (i = 0; i < p->immd_nr; i++) {
1193		BEGIN_RING(tesla, 0x0f00, 2);
1194		OUT_RING  ((NV50_CB_PMISC << 16) | (i << 8));
1195		OUT_RING  (fui(p->immd[i]));
1196	}
1197}
1198
1199static void
1200nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
1201{
1202	struct pipe_winsys *ws = nv50->pipe.winsys;
1203	void *map;
1204
1205	if (!p->buffer)
1206		p->buffer = ws->buffer_create(ws, 0x100, 0, p->insns_nr * 4);
1207	map = ws->buffer_map(ws, p->buffer, PIPE_BUFFER_USAGE_CPU_WRITE);
1208	memcpy(map, p->insns, p->insns_nr * 4);
1209	ws->buffer_unmap(ws, p->buffer);
1210}
1211
1212void
1213nv50_vertprog_validate(struct nv50_context *nv50)
1214{
1215	struct nouveau_grobj *tesla = nv50->screen->tesla;
1216	struct nv50_program *p = nv50->vertprog;
1217	struct nouveau_stateobj *so;
1218
1219	if (!p->translated) {
1220		nv50_program_validate(nv50, p);
1221		if (!p->translated)
1222			assert(0);
1223	}
1224
1225	nv50_program_validate_data(nv50, p);
1226	nv50_program_validate_code(nv50, p);
1227
1228	so = so_new(11, 2);
1229	so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2);
1230	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
1231		  NOUVEAU_BO_HIGH, 0, 0);
1232	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
1233		  NOUVEAU_BO_LOW, 0, 0);
1234	so_method(so, tesla, 0x1650, 2);
1235	so_data  (so, p->cfg.vp.attr[0]);
1236	so_data  (so, p->cfg.vp.attr[1]);
1237	so_method(so, tesla, 0x16ac, 2);
1238	so_data  (so, 8);
1239	so_data  (so, p->cfg.high_temp);
1240	so_method(so, tesla, 0x140c, 1);
1241	so_data  (so, 0); /* program start offset */
1242	so_emit(nv50->screen->nvws, so);
1243	so_ref(NULL, &so);
1244}
1245
1246void
1247nv50_fragprog_validate(struct nv50_context *nv50)
1248{
1249	struct nouveau_grobj *tesla = nv50->screen->tesla;
1250	struct nv50_program *p = nv50->fragprog;
1251	struct nouveau_stateobj *so;
1252
1253	if (!p->translated) {
1254		nv50_program_validate(nv50, p);
1255		if (!p->translated)
1256			assert(0);
1257	}
1258
1259	nv50_program_validate_data(nv50, p);
1260	nv50_program_validate_code(nv50, p);
1261
1262	so = so_new(7, 2);
1263	so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2);
1264	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
1265		  NOUVEAU_BO_HIGH, 0, 0);
1266	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
1267		  NOUVEAU_BO_LOW, 0, 0);
1268	so_method(so, tesla, 0x198c, 1);
1269	so_data  (so, p->cfg.high_temp);
1270	so_method(so, tesla, 0x1414, 1);
1271	so_data  (so, 0); /* program start offset */
1272	so_emit(nv50->screen->nvws, so);
1273	so_ref(NULL, &so);
1274}
1275
1276void
1277nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
1278{
1279	struct pipe_winsys *ws = nv50->pipe.winsys;
1280
1281	if (p->insns_nr) {
1282		if (p->insns)
1283			FREE(p->insns);
1284		p->insns_nr = 0;
1285	}
1286
1287	if (p->buffer)
1288		pipe_buffer_reference(ws, &p->buffer, NULL);
1289
1290	p->translated = 0;
1291}
1292
1293