nv50_program.c revision bb9efb5534a652878161e28bd73039eff5b11014
1#include "pipe/p_context.h"
2#include "pipe/p_defines.h"
3#include "pipe/p_state.h"
4#include "pipe/p_inlines.h"
5
6#include "pipe/p_shader_tokens.h"
7#include "tgsi/util/tgsi_parse.h"
8#include "tgsi/util/tgsi_util.h"
9
10#include "nv50_context.h"
11
12#define NV50_SU_MAX_TEMP 64
13#define NV50_PROGRAM_DUMP
14
15/* ARL - gallium craps itself on progs/vp/arl.txt
16 *
17 * MSB - Like MAD, but MUL+SUB
18 * 	- Fuck it off, introduce a way to negate args for ops that
19 * 	  support it.
20 *
21 * Look into inlining IMMD for ops other than MOV (make it general?)
22 * 	- Maybe even relax restrictions a bit, can't do P_RESULT + P_IMMD,
23 * 	  but can emit to P_TEMP first - then MOV later. NVIDIA does this
24 *
25 * In ops such as ADD it's possible to construct a bad opcode in the !is_long()
26 * case, if the emit_src() causes the inst to suddenly become long.
27 *
28 * Verify half-insns work where expected - and force disable them where they
29 * don't work - MUL has it forcibly disabled atm as it fixes POW..
30 *
31 * FUCK! watch dst==src vectors, can overwrite components that are needed.
32 * 	ie. SUB R0, R0.yzxw, R0
33 *
34 * Things to check with renouveau:
35 * 	FP attr/result assignment - how?
36 * 		attrib
37 * 			- 0x16bc maps vp output onto fp hpos
38 * 			- 0x16c0 maps vp output onto fp col0
39 * 		result
40 * 			- colr always 0-3
41 * 			- depr always 4
42 * 0x16bc->0x16e8 --> some binding between vp/fp regs
43 * 0x16b8 --> VP output count
44 *
45 * 0x1298 --> "MOV rcol.x, fcol.y" "MOV depr, fcol.y" = 0x00000005
46 * 	      "MOV rcol.x, fcol.y" = 0x00000004
47 * 0x19a8 --> as above but 0x00000100 and 0x00000000
48 * 	- 0x00100000 used when KIL used
49 * 0x196c --> as above but 0x00000011 and 0x00000000
50 *
51 * 0x1988 --> 0xXXNNNNNN
52 * 	- XX == FP high something
53 */
/* Descriptor for one scalar (single-channel) shader register. */
struct nv50_reg {
	/* Register file the value lives in. */
	enum {
		P_TEMP = 0,
		P_ATTR = 1,
		P_RESULT = 2,
		P_CONST = 3,
		P_IMMD = 4
	} type;

	/* TGSI register index; -1 marks registers created internally by
	 * alloc_temp()/alloc_immd() rather than declared by the shader.
	 */
	int index;

	/* Hardware register number (-1 while a P_TEMP is still unassigned).
	 * For P_IMMD this is the float offset into nv50_pc::immd_buf.
	 */
	int hw;

	/* NOTE(review): not referenced by the visible code; presumably a
	 * "negate this operand" flag - confirm before relying on it.
	 */
	int neg;
};
67
68struct nv50_pc {
69	struct nv50_program *p;
70
71	/* hw resources */
72	struct nv50_reg *r_temp[NV50_SU_MAX_TEMP];
73
74	/* tgsi resources */
75	struct nv50_reg *temp;
76	int temp_nr;
77	struct nv50_reg *attr;
78	int attr_nr;
79	struct nv50_reg *result;
80	int result_nr;
81	struct nv50_reg *param;
82	int param_nr;
83	struct nv50_reg *immd;
84	float *immd_buf;
85	int immd_nr;
86
87	struct nv50_reg *temp_temp[16];
88	unsigned temp_temp_nr;
89};
90
91static void
92alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
93{
94	int i;
95
96	if (reg->type == P_RESULT) {
97		if (pc->p->cfg.high_result < (reg->hw + 1))
98			pc->p->cfg.high_result = reg->hw + 1;
99	}
100
101	if (reg->type != P_TEMP)
102		return;
103
104	if (reg->hw >= 0) {
105		/*XXX: do this here too to catch FP temp-as-attr usage..
106		 *     not clean, but works */
107		if (pc->p->cfg.high_temp < (reg->hw + 1))
108			pc->p->cfg.high_temp = reg->hw + 1;
109		return;
110	}
111
112	for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
113		if (!(pc->r_temp[i])) {
114			pc->r_temp[i] = reg;
115			reg->hw = i;
116			if (pc->p->cfg.high_temp < (i + 1))
117				pc->p->cfg.high_temp = i + 1;
118			return;
119		}
120	}
121
122	assert(0);
123}
124
125static struct nv50_reg *
126alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
127{
128	struct nv50_reg *r;
129	int i;
130
131	if (dst && dst->type == P_TEMP && dst->hw == -1)
132		return dst;
133
134	for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
135		if (!pc->r_temp[i]) {
136			r = CALLOC_STRUCT(nv50_reg);
137			r->type = P_TEMP;
138			r->index = -1;
139			r->hw = i;
140			pc->r_temp[i] = r;
141			return r;
142		}
143	}
144
145	assert(0);
146	return NULL;
147}
148
149static void
150free_temp(struct nv50_pc *pc, struct nv50_reg *r)
151{
152	if (r->index == -1) {
153		unsigned hw = r->hw;
154
155		FREE(pc->r_temp[hw]);
156		pc->r_temp[hw] = NULL;
157	}
158}
159
160static struct nv50_reg *
161temp_temp(struct nv50_pc *pc)
162{
163	if (pc->temp_temp_nr >= 16)
164		assert(0);
165
166	pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL);
167	return pc->temp_temp[pc->temp_temp_nr++];
168}
169
170static void
171kill_temp_temp(struct nv50_pc *pc)
172{
173	int i;
174
175	for (i = 0; i < pc->temp_temp_nr; i++)
176		free_temp(pc, pc->temp_temp[i]);
177	pc->temp_temp_nr = 0;
178}
179
180static int
181ctor_immd(struct nv50_pc *pc, float x, float y, float z, float w)
182{
183	pc->immd_buf = realloc(pc->immd_buf, (pc->immd_nr + 1) * 4 *
184					     sizeof(float));
185	pc->immd_buf[(pc->immd_nr * 4) + 0] = x;
186	pc->immd_buf[(pc->immd_nr * 4) + 1] = y;
187	pc->immd_buf[(pc->immd_nr * 4) + 2] = z;
188	pc->immd_buf[(pc->immd_nr * 4) + 3] = w;
189
190	return pc->immd_nr++;
191}
192
193static struct nv50_reg *
194alloc_immd(struct nv50_pc *pc, float f)
195{
196	struct nv50_reg *r = CALLOC_STRUCT(nv50_reg);
197	unsigned hw;
198
199	hw = ctor_immd(pc, f, 0, 0, 0) * 4;
200	r->type = P_IMMD;
201	r->hw = hw;
202	r->index = -1;
203	return r;
204}
205
206static struct nv50_program_exec *
207exec(struct nv50_pc *pc)
208{
209	struct nv50_program_exec *e = CALLOC_STRUCT(nv50_program_exec);
210
211	e->param.index = -1;
212	return e;
213}
214
215static void
216emit(struct nv50_pc *pc, struct nv50_program_exec *e)
217{
218	struct nv50_program *p = pc->p;
219
220	if (p->exec_tail)
221		p->exec_tail->next = e;
222	if (!p->exec_head)
223		p->exec_head = e;
224	p->exec_tail = e;
225	p->exec_size += (e->inst[0] & 1) ? 2 : 1;
226}
227
228static INLINE void set_long(struct nv50_pc *, struct nv50_program_exec *);
229
230static boolean
231is_long(struct nv50_program_exec *e)
232{
233	if (e->inst[0] & 1)
234		return TRUE;
235	return FALSE;
236}
237
238static boolean
239is_immd(struct nv50_program_exec *e)
240{
241	if (is_long(e) && (e->inst[1] & 3) == 3)
242		return TRUE;
243	return FALSE;
244}
245
246static INLINE void
247set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx,
248	 struct nv50_program_exec *e)
249{
250	set_long(pc, e);
251	e->inst[1] &= ~((0x1f << 7) | (0x3 << 12));
252	e->inst[1] |= (pred << 7) | (idx << 12);
253}
254
255static INLINE void
256set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx,
257	    struct nv50_program_exec *e)
258{
259	set_long(pc, e);
260	e->inst[1] &= ~((0x3 << 4) | (1 << 6));
261	e->inst[1] |= (idx << 4) | (on << 6);
262}
263
264static INLINE void
265set_long(struct nv50_pc *pc, struct nv50_program_exec *e)
266{
267	if (is_long(e))
268		return;
269
270	e->inst[0] |= 1;
271	set_pred(pc, 0xf, 0, e);
272	set_pred_wr(pc, 0, 0, e);
273}
274
275static INLINE void
276set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e)
277{
278	if (dst->type == P_RESULT) {
279		set_long(pc, e);
280		e->inst[1] |= 0x00000008;
281	}
282
283	alloc_reg(pc, dst);
284	e->inst[0] |= (dst->hw << 2);
285}
286
287static INLINE void
288set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)
289{
290	unsigned val = fui(pc->immd_buf[imm->hw]); /* XXX */
291
292	set_long(pc, e);
293	/*XXX: can't be predicated - bits overlap.. catch cases where both
294	 *     are required and avoid them. */
295	set_pred(pc, 0, 0, e);
296	set_pred_wr(pc, 0, 0, e);
297
298	e->inst[1] |= 0x00000002 | 0x00000001;
299	e->inst[0] |= (val & 0x3f) << 16;
300	e->inst[1] |= (val >> 6) << 2;
301}
302
303static void
304emit_interp(struct nv50_pc *pc, struct nv50_reg *dst,
305	    struct nv50_reg *src, struct nv50_reg *iv, boolean noperspective)
306{
307	struct nv50_program_exec *e = exec(pc);
308
309	e->inst[0] |= 0x80000000;
310	set_dst(pc, dst, e);
311	alloc_reg(pc, iv);
312	e->inst[0] |= (iv->hw << 9);
313	alloc_reg(pc, src);
314	e->inst[0] |= (src->hw << 16);
315	if (noperspective)
316		e->inst[0] |= (1 << 25);
317
318	emit(pc, e);
319}
320
321static void
322set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s,
323	 struct nv50_program_exec *e)
324{
325	set_long(pc, e);
326#if 1
327	e->inst[1] |= (1 << 22);
328#else
329	if (src->type == P_IMMD) {
330		e->inst[1] |= (NV50_CB_PMISC << 22);
331	} else {
332		if (pc->p->type == PIPE_SHADER_VERTEX)
333			e->inst[1] |= (NV50_CB_PVP << 22);
334		else
335			e->inst[1] |= (NV50_CB_PFP << 22);
336	}
337#endif
338
339	e->param.index = src->hw;
340	e->param.shift = s;
341	e->param.mask = m << (s % 32);
342}
343
344static void
345emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
346{
347	struct nv50_program_exec *e = exec(pc);
348
349	e->inst[0] |= 0x10000000;
350
351	set_dst(pc, dst, e);
352
353	if (0 && dst->type != P_RESULT && src->type == P_IMMD) {
354		set_immd(pc, src, e);
355		/*XXX: 32-bit, but steals part of "half" reg space - need to
356		 *     catch and handle this case if/when we do half-regs
357		 */
358		e->inst[0] |= 0x00008000;
359	} else
360	if (src->type == P_IMMD || src->type == P_CONST) {
361		set_long(pc, e);
362		set_data(pc, src, 0x7f, 9, e);
363		e->inst[1] |= 0x20000000; /* src0 const? */
364	} else {
365		if (src->type == P_ATTR) {
366			set_long(pc, e);
367			e->inst[1] |= 0x00200000;
368		}
369
370		alloc_reg(pc, src);
371		e->inst[0] |= (src->hw << 9);
372	}
373
374	/* We really should support "half" instructions here at some point,
375	 * but I don't feel confident enough about them yet.
376	 */
377	set_long(pc, e);
378	if (is_long(e) && !is_immd(e)) {
379		e->inst[1] |= 0x04000000; /* 32-bit */
380		e->inst[1] |= 0x0003c000; /* "subsubop" 0xf == mov */
381	}
382
383	emit(pc, e);
384}
385
386static boolean
387check_swap_src_0_1(struct nv50_pc *pc,
388		   struct nv50_reg **s0, struct nv50_reg **s1)
389{
390	struct nv50_reg *src0 = *s0, *src1 = *s1;
391
392	if (src0->type == P_CONST) {
393		if (src1->type != P_CONST) {
394			*s0 = src1;
395			*s1 = src0;
396			return TRUE;
397		}
398	} else
399	if (src1->type == P_ATTR) {
400		if (src0->type != P_ATTR) {
401			*s0 = src1;
402			*s1 = src0;
403			return TRUE;
404		}
405	}
406
407	return FALSE;
408}
409
410static void
411set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
412{
413	if (src->type == P_ATTR) {
414		set_long(pc, e);
415		e->inst[1] |= 0x00200000;
416	} else
417	if (src->type == P_CONST || src->type == P_IMMD) {
418		struct nv50_reg *temp = temp_temp(pc);
419
420		emit_mov(pc, temp, src);
421		src = temp;
422	}
423
424	alloc_reg(pc, src);
425	e->inst[0] |= (src->hw << 9);
426}
427
428static void
429set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
430{
431	if (src->type == P_ATTR) {
432		struct nv50_reg *temp = temp_temp(pc);
433
434		emit_mov(pc, temp, src);
435		src = temp;
436	} else
437	if (src->type == P_CONST || src->type == P_IMMD) {
438		assert(!(e->inst[0] & 0x00800000));
439		if (e->inst[0] & 0x01000000) {
440			struct nv50_reg *temp = temp_temp(pc);
441
442			emit_mov(pc, temp, src);
443			src = temp;
444		} else {
445			set_data(pc, src, 0x7f, 16, e);
446			e->inst[0] |= 0x00800000;
447		}
448	}
449
450	alloc_reg(pc, src);
451	e->inst[0] |= (src->hw << 16);
452}
453
454static void
455set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
456{
457	set_long(pc, e);
458
459	if (src->type == P_ATTR) {
460		struct nv50_reg *temp = temp_temp(pc);
461
462		emit_mov(pc, temp, src);
463		src = temp;
464	} else
465	if (src->type == P_CONST || src->type == P_IMMD) {
466		assert(!(e->inst[0] & 0x01000000));
467		if (e->inst[0] & 0x00800000) {
468			struct nv50_reg *temp = temp_temp(pc);
469
470			emit_mov(pc, temp, src);
471			src = temp;
472		} else {
473			set_data(pc, src, 0x7f, 32+14, e);
474			e->inst[0] |= 0x01000000;
475		}
476	}
477
478	alloc_reg(pc, src);
479	e->inst[1] |= (src->hw << 14);
480}
481
482static void
483emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
484	 struct nv50_reg *src1)
485{
486	struct nv50_program_exec *e = exec(pc);
487
488	e->inst[0] |= 0xc0000000;
489	set_long(pc, e);
490
491	check_swap_src_0_1(pc, &src0, &src1);
492	set_dst(pc, dst, e);
493	set_src_0(pc, src0, e);
494	set_src_1(pc, src1, e);
495
496	emit(pc, e);
497}
498
499static void
500emit_add(struct nv50_pc *pc, struct nv50_reg *dst,
501	 struct nv50_reg *src0, struct nv50_reg *src1)
502{
503	struct nv50_program_exec *e = exec(pc);
504
505	e->inst[0] |= 0xb0000000;
506
507	check_swap_src_0_1(pc, &src0, &src1);
508	set_dst(pc, dst, e);
509	set_src_0(pc, src0, e);
510	if (is_long(e))
511		set_src_2(pc, src1, e);
512	else
513		set_src_1(pc, src1, e);
514
515	emit(pc, e);
516}
517
518static void
519emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst,
520	    struct nv50_reg *src0, struct nv50_reg *src1)
521{
522	struct nv50_program_exec *e = exec(pc);
523
524	set_long(pc, e);
525	e->inst[0] |= 0xb0000000;
526	e->inst[1] |= (sub << 29);
527
528	check_swap_src_0_1(pc, &src0, &src1);
529	set_dst(pc, dst, e);
530	set_src_0(pc, src0, e);
531	set_src_1(pc, src1, e);
532
533	emit(pc, e);
534}
535
536static void
537emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
538	 struct nv50_reg *src1)
539{
540	struct nv50_program_exec *e = exec(pc);
541
542	e->inst[0] |= 0xb0000000;
543
544	set_long(pc, e);
545	if (check_swap_src_0_1(pc, &src0, &src1))
546		e->inst[1] |= 0x04000000;
547	else
548		e->inst[1] |= 0x08000000;
549
550	set_dst(pc, dst, e);
551	set_src_0(pc, src0, e);
552	set_src_2(pc, src1, e);
553
554	emit(pc, e);
555}
556
557static void
558emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
559	 struct nv50_reg *src1, struct nv50_reg *src2)
560{
561	struct nv50_program_exec *e = exec(pc);
562
563	e->inst[0] |= 0xe0000000;
564
565	check_swap_src_0_1(pc, &src0, &src1);
566	set_dst(pc, dst, e);
567	set_src_0(pc, src0, e);
568	set_src_1(pc, src1, e);
569	set_src_2(pc, src2, e);
570
571	emit(pc, e);
572}
573
574static void
575emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
576	 struct nv50_reg *src1, struct nv50_reg *src2)
577{
578	struct nv50_program_exec *e = exec(pc);
579
580	e->inst[0] |= 0xe0000000;
581	set_long(pc, e);
582	e->inst[1] |= 0x08000000; /* src0 * src1 - src2 */
583
584	check_swap_src_0_1(pc, &src0, &src1);
585	set_dst(pc, dst, e);
586	set_src_0(pc, src0, e);
587	set_src_1(pc, src1, e);
588	set_src_2(pc, src2, e);
589
590	emit(pc, e);
591}
592
593static void
594emit_flop(struct nv50_pc *pc, unsigned sub,
595	  struct nv50_reg *dst, struct nv50_reg *src)
596{
597	struct nv50_program_exec *e = exec(pc);
598
599	e->inst[0] |= 0x90000000;
600	if (sub) {
601		set_long(pc, e);
602		e->inst[1] |= (sub << 29);
603	}
604
605	set_dst(pc, dst, e);
606	set_src_0(pc, src, e);
607
608	emit(pc, e);
609}
610
611static void
612emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
613{
614	struct nv50_program_exec *e = exec(pc);
615
616	e->inst[0] |= 0xb0000000;
617
618	set_dst(pc, dst, e);
619	set_src_0(pc, src, e);
620	set_long(pc, e);
621	e->inst[1] |= (6 << 29) | 0x00004000;
622
623	emit(pc, e);
624}
625
626static void
627emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
628{
629	struct nv50_program_exec *e = exec(pc);
630
631	e->inst[0] |= 0xb0000000;
632
633	set_dst(pc, dst, e);
634	set_src_0(pc, src, e);
635	set_long(pc, e);
636	e->inst[1] |= (6 << 29);
637
638	emit(pc, e);
639}
640
641static void
642emit_set(struct nv50_pc *pc, unsigned c_op, struct nv50_reg *dst,
643	 struct nv50_reg *src0, struct nv50_reg *src1)
644{
645	struct nv50_program_exec *e = exec(pc);
646	unsigned inv_cop[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };
647	struct nv50_reg *rdst;
648
649	assert(c_op <= 7);
650	if (check_swap_src_0_1(pc, &src0, &src1))
651		c_op = inv_cop[c_op];
652
653	rdst = dst;
654	if (dst->type != P_TEMP)
655		dst = alloc_temp(pc, NULL);
656
657	/* set.u32 */
658	set_long(pc, e);
659	e->inst[0] |= 0xb0000000;
660	e->inst[1] |= (3 << 29);
661	e->inst[1] |= (c_op << 14);
662	/*XXX: breaks things, .u32 by default?
663	 *     decuda will disasm as .u16 and use .lo/.hi regs, but this
664	 *     doesn't seem to match what the hw actually does.
665	inst[1] |= 0x04000000; << breaks things.. .u32 by default?
666	 */
667	set_dst(pc, dst, e);
668	set_src_0(pc, src0, e);
669	set_src_1(pc, src1, e);
670	emit(pc, e);
671
672	/* cvt.f32.u32 */
673	e = exec(pc);
674	e->inst[0] = 0xa0000001;
675	e->inst[1] = 0x64014780;
676	set_dst(pc, rdst, e);
677	set_src_0(pc, dst, e);
678	emit(pc, e);
679
680	if (dst != rdst)
681		free_temp(pc, dst);
682}
683
684static void
685emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
686{
687	struct nv50_program_exec *e = exec(pc);
688
689	e->inst[0] = 0xa0000000; /* cvt */
690	set_long(pc, e);
691	e->inst[1] |= (6 << 29); /* cvt */
692	e->inst[1] |= 0x08000000; /* integer mode */
693	e->inst[1] |= 0x04000000; /* 32 bit */
694	e->inst[1] |= ((0x1 << 3)) << 14; /* .rn */
695	e->inst[1] |= (1 << 14); /* src .f32 */
696	set_dst(pc, dst, e);
697	set_src_0(pc, src, e);
698
699	emit(pc, e);
700}
701
/*
 * Emit dst = v ^ e, computed as ex2(e * lg2(v)) through a private
 * temporary that is released before returning.
 */
static void
emit_pow(struct nv50_pc *pc, struct nv50_reg *dst,
	 struct nv50_reg *v, struct nv50_reg *e)
{
	struct nv50_reg *acc = alloc_temp(pc, NULL);

	emit_flop(pc, 3, acc, v);	/* acc = lg2(v) */
	emit_mul(pc, acc, acc, e);	/* acc *= e */
	emit_preex2(pc, acc, acc);	/* prepare the ex2 input */
	emit_flop(pc, 6, dst, acc);	/* dst = ex2(acc) */

	free_temp(pc, acc);
}
715
716static void
717emit_abs(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
718{
719	struct nv50_program_exec *e = exec(pc);
720
721	e->inst[0] = 0xa0000000; /* cvt */
722	set_long(pc, e);
723	e->inst[1] |= (6 << 29); /* cvt */
724	e->inst[1] |= 0x04000000; /* 32 bit */
725	e->inst[1] |= (1 << 14); /* src .f32 */
726	e->inst[1] |= ((1 << 6) << 14); /* .abs */
727	set_dst(pc, dst, e);
728	set_src_0(pc, src, e);
729
730	emit(pc, e);
731}
732
733static void
734emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
735	 struct nv50_reg **src)
736{
737	struct nv50_reg *one = alloc_immd(pc, 1.0);
738	struct nv50_reg *zero = alloc_immd(pc, 0.0);
739	struct nv50_reg *neg128 = alloc_immd(pc, -127.999999);
740	struct nv50_reg *pos128 = alloc_immd(pc,  127.999999);
741	struct nv50_reg *tmp[4];
742
743	if (mask & (1 << 0))
744		emit_mov(pc, dst[0], one);
745
746	if (mask & (1 << 3))
747		emit_mov(pc, dst[3], one);
748
749	if (mask & (3 << 1)) {
750		if (mask & (1 << 1))
751			tmp[0] = dst[1];
752		else
753			tmp[0] = temp_temp(pc);
754		emit_minmax(pc, 4, tmp[0], src[0], zero);
755	}
756
757	if (mask & (1 << 2)) {
758		set_pred_wr(pc, 1, 0, pc->p->exec_tail);
759
760		tmp[1] = temp_temp(pc);
761		emit_minmax(pc, 4, tmp[1], src[1], zero);
762
763		tmp[3] = temp_temp(pc);
764		emit_minmax(pc, 4, tmp[3], src[3], neg128);
765		emit_minmax(pc, 5, tmp[3], tmp[3], pos128);
766
767		emit_pow(pc, dst[2], tmp[1], tmp[3]);
768		emit_mov(pc, dst[2], zero);
769		set_pred(pc, 3, 0, pc->p->exec_tail);
770	}
771}
772
773static void
774emit_neg(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
775{
776	struct nv50_program_exec *e = exec(pc);
777
778	set_long(pc, e);
779	e->inst[0] |= 0xa0000000; /* delta */
780	e->inst[1] |= (7 << 29); /* delta */
781	e->inst[1] |= 0x04000000; /* negate arg0? probably not */
782	e->inst[1] |= (1 << 14); /* src .f32 */
783	set_dst(pc, dst, e);
784	set_src_0(pc, src, e);
785
786	emit(pc, e);
787}
788
789static struct nv50_reg *
790tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
791{
792	switch (dst->DstRegister.File) {
793	case TGSI_FILE_TEMPORARY:
794		return &pc->temp[dst->DstRegister.Index * 4 + c];
795	case TGSI_FILE_OUTPUT:
796		return &pc->result[dst->DstRegister.Index * 4 + c];
797	case TGSI_FILE_NULL:
798		return NULL;
799	default:
800		break;
801	}
802
803	return NULL;
804}
805
806static struct nv50_reg *
807tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src)
808{
809	struct nv50_reg *r = NULL;
810	struct nv50_reg *temp;
811	unsigned c;
812
813	c = tgsi_util_get_full_src_register_extswizzle(src, chan);
814	switch (c) {
815	case TGSI_EXTSWIZZLE_X:
816	case TGSI_EXTSWIZZLE_Y:
817	case TGSI_EXTSWIZZLE_Z:
818	case TGSI_EXTSWIZZLE_W:
819		switch (src->SrcRegister.File) {
820		case TGSI_FILE_INPUT:
821			r = &pc->attr[src->SrcRegister.Index * 4 + c];
822			break;
823		case TGSI_FILE_TEMPORARY:
824			r = &pc->temp[src->SrcRegister.Index * 4 + c];
825			break;
826		case TGSI_FILE_CONSTANT:
827			r = &pc->param[src->SrcRegister.Index * 4 + c];
828			break;
829		case TGSI_FILE_IMMEDIATE:
830			r = &pc->immd[src->SrcRegister.Index * 4 + c];
831			break;
832		case TGSI_FILE_SAMPLER:
833			break;
834		default:
835			assert(0);
836			break;
837		}
838		break;
839	case TGSI_EXTSWIZZLE_ZERO:
840		r = alloc_immd(pc, 0.0);
841		break;
842	case TGSI_EXTSWIZZLE_ONE:
843		r = alloc_immd(pc, 1.0);
844		break;
845	default:
846		assert(0);
847		break;
848	}
849
850	switch (tgsi_util_get_full_src_register_sign_mode(src, chan)) {
851	case TGSI_UTIL_SIGN_KEEP:
852		break;
853	case TGSI_UTIL_SIGN_CLEAR:
854		temp = temp_temp(pc);
855		emit_abs(pc, temp, r);
856		r = temp;
857		break;
858	case TGSI_UTIL_SIGN_TOGGLE:
859		temp = temp_temp(pc);
860		emit_neg(pc, temp, r);
861		r = temp;
862		break;
863	case TGSI_UTIL_SIGN_SET:
864		temp = temp_temp(pc);
865		emit_abs(pc, temp, r);
866		emit_neg(pc, temp, r);
867		r = temp;
868		break;
869	default:
870		assert(0);
871		break;
872	}
873
874	return r;
875}
876
877static boolean
878nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
879{
880	const struct tgsi_full_instruction *inst = &tok->FullInstruction;
881	struct nv50_reg *rdst[4], *dst[4], *src[3][4], *temp;
882	unsigned mask, sat;
883	int i, c;
884
885	NOUVEAU_ERR("insn %p\n", tok);
886
887	mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
888	sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE;
889
890	for (c = 0; c < 4; c++) {
891		if (mask & (1 << c))
892			dst[c] = tgsi_dst(pc, c, &inst->FullDstRegisters[0]);
893		else
894			dst[c] = NULL;
895	}
896
897	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
898		for (c = 0; c < 4; c++)
899			src[i][c] = tgsi_src(pc, c, &inst->FullSrcRegisters[i]);
900	}
901
902	if (sat) {
903		for (c = 0; c < 4; c++) {
904			rdst[c] = dst[c];
905			dst[c] = temp_temp(pc);
906		}
907	}
908
909	switch (inst->Instruction.Opcode) {
910	case TGSI_OPCODE_ABS:
911		for (c = 0; c < 4; c++) {
912			if (!(mask & (1 << c)))
913				continue;
914			emit_abs(pc, dst[c], src[0][c]);
915		}
916		break;
917	case TGSI_OPCODE_ADD:
918		for (c = 0; c < 4; c++) {
919			if (!(mask & (1 << c)))
920				continue;
921			emit_add(pc, dst[c], src[0][c], src[1][c]);
922		}
923		break;
924	case TGSI_OPCODE_COS:
925		temp = alloc_temp(pc, NULL);
926		emit_precossin(pc, temp, src[0][0]);
927		emit_flop(pc, 5, temp, temp);
928		for (c = 0; c < 4; c++) {
929			if (!(mask & (1 << c)))
930				continue;
931			emit_mov(pc, dst[c], temp);
932		}
933		break;
934	case TGSI_OPCODE_DP3:
935		temp = alloc_temp(pc, NULL);
936		emit_mul(pc, temp, src[0][0], src[1][0]);
937		emit_mad(pc, temp, src[0][1], src[1][1], temp);
938		emit_mad(pc, temp, src[0][2], src[1][2], temp);
939		for (c = 0; c < 4; c++) {
940			if (!(mask & (1 << c)))
941				continue;
942			emit_mov(pc, dst[c], temp);
943		}
944		free_temp(pc, temp);
945		break;
946	case TGSI_OPCODE_DP4:
947		temp = alloc_temp(pc, NULL);
948		emit_mul(pc, temp, src[0][0], src[1][0]);
949		emit_mad(pc, temp, src[0][1], src[1][1], temp);
950		emit_mad(pc, temp, src[0][2], src[1][2], temp);
951		emit_mad(pc, temp, src[0][3], src[1][3], temp);
952		for (c = 0; c < 4; c++) {
953			if (!(mask & (1 << c)))
954				continue;
955			emit_mov(pc, dst[c], temp);
956		}
957		free_temp(pc, temp);
958		break;
959	case TGSI_OPCODE_DPH:
960		temp = alloc_temp(pc, NULL);
961		emit_mul(pc, temp, src[0][0], src[1][0]);
962		emit_mad(pc, temp, src[0][1], src[1][1], temp);
963		emit_mad(pc, temp, src[0][2], src[1][2], temp);
964		emit_add(pc, temp, src[1][3], temp);
965		for (c = 0; c < 4; c++) {
966			if (!(mask & (1 << c)))
967				continue;
968			emit_mov(pc, dst[c], temp);
969		}
970		free_temp(pc, temp);
971		break;
972	case TGSI_OPCODE_DST:
973	{
974		struct nv50_reg *one = alloc_immd(pc, 1.0);
975		if (mask & (1 << 0))
976			emit_mov(pc, dst[0], one);
977		if (mask & (1 << 1))
978			emit_mul(pc, dst[1], src[0][1], src[1][1]);
979		if (mask & (1 << 2))
980			emit_mov(pc, dst[2], src[0][2]);
981		if (mask & (1 << 3))
982			emit_mov(pc, dst[3], src[1][3]);
983		FREE(one);
984	}
985		break;
986	case TGSI_OPCODE_EX2:
987		temp = alloc_temp(pc, NULL);
988		emit_preex2(pc, temp, src[0][0]);
989		emit_flop(pc, 6, temp, temp);
990		for (c = 0; c < 4; c++) {
991			if (!(mask & (1 << c)))
992				continue;
993			emit_mov(pc, dst[c], temp);
994		}
995		free_temp(pc, temp);
996		break;
997	case TGSI_OPCODE_FLR:
998		for (c = 0; c < 4; c++) {
999			if (!(mask & (1 << c)))
1000				continue;
1001			emit_flr(pc, dst[c], src[0][c]);
1002		}
1003		break;
1004	case TGSI_OPCODE_FRC:
1005		temp = alloc_temp(pc, NULL);
1006		for (c = 0; c < 4; c++) {
1007			if (!(mask & (1 << c)))
1008				continue;
1009			emit_flr(pc, temp, src[0][c]);
1010			emit_sub(pc, dst[c], src[0][c], temp);
1011		}
1012		free_temp(pc, temp);
1013		break;
1014	case TGSI_OPCODE_LIT:
1015		emit_lit(pc, &dst[0], mask, &src[0][0]);
1016		break;
1017	case TGSI_OPCODE_LG2:
1018		temp = alloc_temp(pc, NULL);
1019		emit_flop(pc, 3, temp, src[0][0]);
1020		for (c = 0; c < 4; c++) {
1021			if (!(mask & (1 << c)))
1022				continue;
1023			emit_mov(pc, dst[c], temp);
1024		}
1025		break;
1026	case TGSI_OPCODE_LRP:
1027		for (c = 0; c < 4; c++) {
1028			if (!(mask & (1 << c)))
1029				continue;
1030			/*XXX: we can do better than this */
1031			temp = alloc_temp(pc, NULL);
1032			emit_neg(pc, temp, src[0][c]);
1033			emit_mad(pc, temp, temp, src[2][c], src[2][c]);
1034			emit_mad(pc, dst[c], src[0][c], src[1][c], temp);
1035			free_temp(pc, temp);
1036		}
1037		break;
1038	case TGSI_OPCODE_MAD:
1039		for (c = 0; c < 4; c++) {
1040			if (!(mask & (1 << c)))
1041				continue;
1042			emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]);
1043		}
1044		break;
1045	case TGSI_OPCODE_MAX:
1046		for (c = 0; c < 4; c++) {
1047			if (!(mask & (1 << c)))
1048				continue;
1049			emit_minmax(pc, 4, dst[c], src[0][c], src[1][c]);
1050		}
1051		break;
1052	case TGSI_OPCODE_MIN:
1053		for (c = 0; c < 4; c++) {
1054			if (!(mask & (1 << c)))
1055				continue;
1056			emit_minmax(pc, 5, dst[c], src[0][c], src[1][c]);
1057		}
1058		break;
1059	case TGSI_OPCODE_MOV:
1060		for (c = 0; c < 4; c++) {
1061			if (!(mask & (1 << c)))
1062				continue;
1063			emit_mov(pc, dst[c], src[0][c]);
1064		}
1065		break;
1066	case TGSI_OPCODE_MUL:
1067		for (c = 0; c < 4; c++) {
1068			if (!(mask & (1 << c)))
1069				continue;
1070			emit_mul(pc, dst[c], src[0][c], src[1][c]);
1071		}
1072		break;
1073	case TGSI_OPCODE_POW:
1074		temp = alloc_temp(pc, NULL);
1075		emit_pow(pc, temp, src[0][0], src[1][0]);
1076		for (c = 0; c < 4; c++) {
1077			if (!(mask & (1 << c)))
1078				continue;
1079			emit_mov(pc, dst[c], temp);
1080		}
1081		free_temp(pc, temp);
1082		break;
1083	case TGSI_OPCODE_RCP:
1084		for (c = 0; c < 4; c++) {
1085			if (!(mask & (1 << c)))
1086				continue;
1087			emit_flop(pc, 0, dst[c], src[0][0]);
1088		}
1089		break;
1090	case TGSI_OPCODE_RSQ:
1091		for (c = 0; c < 4; c++) {
1092			if (!(mask & (1 << c)))
1093				continue;
1094			emit_flop(pc, 2, dst[c], src[0][0]);
1095		}
1096		break;
1097	case TGSI_OPCODE_SCS:
1098		temp = alloc_temp(pc, NULL);
1099		emit_precossin(pc, temp, src[0][0]);
1100		if (mask & (1 << 0))
1101			emit_flop(pc, 5, dst[0], temp);
1102		if (mask & (1 << 1))
1103			emit_flop(pc, 4, dst[1], temp);
1104		break;
1105	case TGSI_OPCODE_SGE:
1106		for (c = 0; c < 4; c++) {
1107			if (!(mask & (1 << c)))
1108				continue;
1109			emit_set(pc, 6, dst[c], src[0][c], src[1][c]);
1110		}
1111		break;
1112	case TGSI_OPCODE_SIN:
1113		temp = alloc_temp(pc, NULL);
1114		emit_precossin(pc, temp, src[0][0]);
1115		emit_flop(pc, 4, temp, temp);
1116		for (c = 0; c < 4; c++) {
1117			if (!(mask & (1 << c)))
1118				continue;
1119			emit_mov(pc, dst[c], temp);
1120		}
1121		break;
1122	case TGSI_OPCODE_SLT:
1123		for (c = 0; c < 4; c++) {
1124			if (!(mask & (1 << c)))
1125				continue;
1126			emit_set(pc, 1, dst[c], src[0][c], src[1][c]);
1127		}
1128		break;
1129	case TGSI_OPCODE_SUB:
1130		for (c = 0; c < 4; c++) {
1131			if (!(mask & (1 << c)))
1132				continue;
1133			emit_sub(pc, dst[c], src[0][c], src[1][c]);
1134		}
1135		break;
1136	case TGSI_OPCODE_TEX:
1137		{
1138			struct nv50_reg *t0, *t1;
1139			struct nv50_program_exec *e;
1140
1141			t0 = alloc_temp(pc, NULL);
1142			t0 = alloc_temp(pc, NULL);
1143			t1 = alloc_temp(pc, NULL);
1144			emit_mov(pc, t0, src[0][0]);
1145			emit_mov(pc, t1, src[0][1]);
1146
1147			e = exec(pc);
1148			e->inst[0] = 0xf0400000;
1149			set_long(pc, e);
1150			e->inst[1] |= 0x0000c004;
1151			set_dst(pc, t0, e);
1152			emit(pc, e);
1153			free_temp(pc, t0);
1154			free_temp(pc, t1);
1155		}
1156		break;
1157	case TGSI_OPCODE_XPD:
1158		temp = alloc_temp(pc, NULL);
1159		if (mask & (1 << 0)) {
1160			emit_mul(pc, temp, src[0][2], src[1][1]);
1161			emit_msb(pc, dst[0], src[0][1], src[1][2], temp);
1162		}
1163		if (mask & (1 << 1)) {
1164			emit_mul(pc, temp, src[0][0], src[1][2]);
1165			emit_msb(pc, dst[1], src[0][2], src[1][0], temp);
1166		}
1167		if (mask & (1 << 2)) {
1168			emit_mul(pc, temp, src[0][1], src[1][0]);
1169			emit_msb(pc, dst[2], src[0][0], src[1][1], temp);
1170		}
1171		free_temp(pc, temp);
1172		break;
1173	case TGSI_OPCODE_END:
1174		break;
1175	default:
1176		NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode);
1177		return FALSE;
1178	}
1179
1180	if (sat) {
1181		for (c = 0; c < 4; c++) {
1182			struct nv50_program_exec *e;
1183
1184			if (!(mask & (1 << c)))
1185				continue;
1186			e = exec(pc);
1187
1188			e->inst[0] = 0xa0000000; /* cvt */
1189			set_long(pc, e);
1190			e->inst[1] |= (6 << 29); /* cvt */
1191			e->inst[1] |= 0x04000000; /* 32 bit */
1192			e->inst[1] |= (1 << 14); /* src .f32 */
1193			e->inst[1] |= ((1 << 5) << 14); /* .sat */
1194			set_dst(pc, rdst[c], e);
1195			set_src_0(pc, dst[c], e);
1196			emit(pc, e);
1197		}
1198	}
1199
1200	kill_temp_temp(pc);
1201	return TRUE;
1202}
1203
1204static boolean
1205nv50_program_tx_prep(struct nv50_pc *pc)
1206{
1207	struct tgsi_parse_context p;
1208	boolean ret = FALSE;
1209	unsigned i, c;
1210
1211	tgsi_parse_init(&p, pc->p->pipe.tokens);
1212	while (!tgsi_parse_end_of_tokens(&p)) {
1213		const union tgsi_full_token *tok = &p.FullToken;
1214
1215		tgsi_parse_token(&p);
1216		switch (tok->Token.Type) {
1217		case TGSI_TOKEN_TYPE_IMMEDIATE:
1218		{
1219			const struct tgsi_full_immediate *imm =
1220				&p.FullToken.FullImmediate;
1221
1222			ctor_immd(pc, imm->u.ImmediateFloat32[0].Float,
1223				      imm->u.ImmediateFloat32[1].Float,
1224				      imm->u.ImmediateFloat32[2].Float,
1225				      imm->u.ImmediateFloat32[3].Float);
1226		}
1227			break;
1228		case TGSI_TOKEN_TYPE_DECLARATION:
1229		{
1230			const struct tgsi_full_declaration *d;
1231			unsigned last;
1232
1233			d = &p.FullToken.FullDeclaration;
1234			last = d->u.DeclarationRange.Last;
1235
1236			switch (d->Declaration.File) {
1237			case TGSI_FILE_TEMPORARY:
1238				if (pc->temp_nr < (last + 1))
1239					pc->temp_nr = last + 1;
1240				break;
1241			case TGSI_FILE_OUTPUT:
1242				if (pc->result_nr < (last + 1))
1243					pc->result_nr = last + 1;
1244				break;
1245			case TGSI_FILE_INPUT:
1246				if (pc->attr_nr < (last + 1))
1247					pc->attr_nr = last + 1;
1248				break;
1249			case TGSI_FILE_CONSTANT:
1250				if (pc->param_nr < (last + 1))
1251					pc->param_nr = last + 1;
1252				break;
1253			case TGSI_FILE_SAMPLER:
1254				break;
1255			default:
1256				NOUVEAU_ERR("bad decl file %d\n",
1257					    d->Declaration.File);
1258				goto out_err;
1259			}
1260		}
1261			break;
1262		case TGSI_TOKEN_TYPE_INSTRUCTION:
1263			break;
1264		default:
1265			break;
1266		}
1267	}
1268
1269	NOUVEAU_ERR("%d temps\n", pc->temp_nr);
1270	if (pc->temp_nr) {
1271		pc->temp = calloc(pc->temp_nr * 4, sizeof(struct nv50_reg));
1272		if (!pc->temp)
1273			goto out_err;
1274
1275		for (i = 0; i < pc->temp_nr; i++) {
1276			for (c = 0; c < 4; c++) {
1277				pc->temp[i*4+c].type = P_TEMP;
1278				pc->temp[i*4+c].hw = -1;
1279				pc->temp[i*4+c].index = i;
1280			}
1281		}
1282	}
1283
1284	NOUVEAU_ERR("%d attrib regs\n", pc->attr_nr);
1285	if (pc->attr_nr) {
1286		struct nv50_reg *iv = NULL;
1287		int aid = 0;
1288
1289		pc->attr = calloc(pc->attr_nr * 4, sizeof(struct nv50_reg));
1290		if (!pc->attr)
1291			goto out_err;
1292
1293		if (pc->p->type == PIPE_SHADER_FRAGMENT) {
1294			iv = alloc_temp(pc, NULL);
1295			emit_interp(pc, iv, iv, iv, FALSE);
1296			emit_flop(pc, 0, iv, iv);
1297			aid++;
1298		}
1299
1300		for (i = 0; i < pc->attr_nr; i++) {
1301			struct nv50_reg *a = &pc->attr[i*4];
1302
1303			for (c = 0; c < 4; c++) {
1304				if (pc->p->type == PIPE_SHADER_FRAGMENT) {
1305					struct nv50_reg *at =
1306						alloc_temp(pc, NULL);
1307					pc->attr[i*4+c].type = at->type;
1308					pc->attr[i*4+c].hw = at->hw;
1309					pc->attr[i*4+c].index = at->index;
1310				} else {
1311					pc->p->cfg.vp.attr[aid/32] |=
1312						(1 << (aid % 32));
1313					pc->attr[i*4+c].type = P_ATTR;
1314					pc->attr[i*4+c].hw = aid++;
1315					pc->attr[i*4+c].index = i;
1316				}
1317			}
1318
1319			if (pc->p->type != PIPE_SHADER_FRAGMENT)
1320				continue;
1321
1322			emit_interp(pc, &a[0], &a[0], iv, TRUE);
1323			emit_interp(pc, &a[1], &a[1], iv, TRUE);
1324			emit_interp(pc, &a[2], &a[2], iv, TRUE);
1325			emit_interp(pc, &a[3], &a[3], iv, TRUE);
1326		}
1327
1328		if (iv)
1329			free_temp(pc, iv);
1330	}
1331
1332	NOUVEAU_ERR("%d result regs\n", pc->result_nr);
1333	if (pc->result_nr) {
1334		int rid = 0;
1335
1336		pc->result = calloc(pc->result_nr * 4, sizeof(struct nv50_reg));
1337		if (!pc->result)
1338			goto out_err;
1339
1340		for (i = 0; i < pc->result_nr; i++) {
1341			for (c = 0; c < 4; c++) {
1342				if (pc->p->type == PIPE_SHADER_FRAGMENT) {
1343					pc->result[i*4+c].type = P_TEMP;
1344					pc->result[i*4+c].hw = -1;
1345				} else {
1346					pc->result[i*4+c].type = P_RESULT;
1347					pc->result[i*4+c].hw = rid++;
1348				}
1349				pc->result[i*4+c].index = i;
1350			}
1351		}
1352	}
1353
1354	NOUVEAU_ERR("%d param regs\n", pc->param_nr);
1355	if (pc->param_nr) {
1356		int rid = 0;
1357
1358		pc->param = calloc(pc->param_nr * 4, sizeof(struct nv50_reg));
1359		if (!pc->param)
1360			goto out_err;
1361
1362		for (i = 0; i < pc->param_nr; i++) {
1363			for (c = 0; c < 4; c++) {
1364				pc->param[i*4+c].type = P_CONST;
1365				pc->param[i*4+c].hw = rid++;
1366				pc->param[i*4+c].index = i;
1367			}
1368		}
1369	}
1370
1371	if (pc->immd_nr) {
1372		int rid = pc->param_nr * 4;
1373
1374		pc->immd = calloc(pc->immd_nr * 4, sizeof(struct nv50_reg));
1375		if (!pc->immd)
1376			goto out_err;
1377
1378		for (i = 0; i < pc->immd_nr; i++) {
1379			for (c = 0; c < 4; c++) {
1380				pc->immd[i*4+c].type = P_IMMD;
1381				pc->immd[i*4+c].hw = rid++;
1382				pc->immd[i*4+c].index = i;
1383			}
1384		}
1385	}
1386
1387	ret = TRUE;
1388out_err:
1389	tgsi_parse_free(&p);
1390	return ret;
1391}
1392
1393static boolean
1394nv50_program_tx(struct nv50_program *p)
1395{
1396	struct tgsi_parse_context parse;
1397	struct nv50_pc *pc;
1398	boolean ret;
1399
1400	pc = CALLOC_STRUCT(nv50_pc);
1401	if (!pc)
1402		return FALSE;
1403	pc->p = p;
1404	pc->p->cfg.high_temp = 4;
1405
1406	ret = nv50_program_tx_prep(pc);
1407	if (ret == FALSE)
1408		goto out_cleanup;
1409
1410	tgsi_parse_init(&parse, pc->p->pipe.tokens);
1411	while (!tgsi_parse_end_of_tokens(&parse)) {
1412		const union tgsi_full_token *tok = &parse.FullToken;
1413
1414		tgsi_parse_token(&parse);
1415
1416		switch (tok->Token.Type) {
1417		case TGSI_TOKEN_TYPE_INSTRUCTION:
1418			ret = nv50_program_tx_insn(pc, tok);
1419			if (ret == FALSE)
1420				goto out_err;
1421			break;
1422		default:
1423			break;
1424		}
1425	}
1426
1427	if (p->type == PIPE_SHADER_FRAGMENT) {
1428		struct nv50_reg out;
1429
1430		out.type = P_TEMP;
1431		for (out.hw = 0; out.hw < pc->result_nr * 4; out.hw++)
1432			emit_mov(pc, &out, &pc->result[out.hw]);
1433	}
1434
1435	assert(is_long(pc->p->exec_tail) && !is_immd(pc->p->exec_head));
1436	pc->p->exec_tail->inst[1] |= 0x00000001;
1437
1438	p->param_nr = pc->param_nr * 4;
1439	p->immd_nr = pc->immd_nr * 4;
1440	p->immd = pc->immd_buf;
1441
1442out_err:
1443	tgsi_parse_free(&parse);
1444
1445out_cleanup:
1446	return ret;
1447}
1448
1449static void
1450nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p)
1451{
1452	if (nv50_program_tx(p) == FALSE)
1453		assert(0);
1454	p->translated = TRUE;
1455}
1456
1457static void
1458nv50_program_upload_data(struct nv50_context *nv50, float *map,
1459			 unsigned start, unsigned count)
1460{
1461	while (count) {
1462		unsigned nr = count > 2047 ? 2047 : count;
1463
1464		BEGIN_RING(tesla, 0x00000f00, 1);
1465		OUT_RING  ((NV50_CB_PMISC << 0) | (start << 8));
1466		BEGIN_RING(tesla, 0x40000f04, nr);
1467		OUT_RINGp (map, nr);
1468
1469		map += nr;
1470		start += nr;
1471		count -= nr;
1472	}
1473}
1474
1475static void
1476nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)
1477{
1478	struct nouveau_winsys *nvws = nv50->screen->nvws;
1479	struct pipe_winsys *ws = nv50->pipe.winsys;
1480	unsigned nr = p->param_nr + p->immd_nr;
1481
1482	if (!p->data && nr) {
1483		struct nouveau_resource *heap = nv50->screen->vp_data_heap;
1484
1485		if (nvws->res_alloc(heap, nr, p, &p->data)) {
1486			while (heap->next && heap->size < nr) {
1487				struct nv50_program *evict = heap->next->priv;
1488				nvws->res_free(&evict->data);
1489			}
1490
1491			if (nvws->res_alloc(heap, nr, p, &p->data))
1492				assert(0);
1493		}
1494	}
1495
1496	if (p->param_nr) {
1497		float *map = ws->buffer_map(ws, nv50->constbuf[p->type],
1498					    PIPE_BUFFER_USAGE_CPU_READ);
1499		nv50_program_upload_data(nv50, map, p->data->start,
1500					 p->param_nr);
1501		ws->buffer_unmap(ws, nv50->constbuf[p->type]);
1502	}
1503
1504	if (p->immd_nr) {
1505		nv50_program_upload_data(nv50, p->immd,
1506					 p->data->start + p->param_nr,
1507					 p->immd_nr);
1508	}
1509}
1510
1511static void
1512nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
1513{
1514	struct pipe_winsys *ws = nv50->pipe.winsys;
1515	struct nv50_program_exec *e;
1516	struct nouveau_stateobj *so;
1517	const unsigned flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_WR;
1518	unsigned start, count, *up, *ptr;
1519	boolean upload = FALSE;
1520
1521	if (!p->buffer) {
1522		p->buffer = ws->buffer_create(ws, 0x100, 0, p->exec_size * 4);
1523		upload = TRUE;
1524	}
1525
1526	if (p->data && p->data->start != p->data_start) {
1527		for (e = p->exec_head; e; e = e->next) {
1528			unsigned ei, ci;
1529
1530			if (e->param.index < 0)
1531				continue;
1532			ei = e->param.shift >> 5;
1533			ci = e->param.index + p->data->start;
1534
1535			e->inst[ei] &= ~e->param.mask;
1536			e->inst[ei] |= (ci << e->param.shift);
1537		}
1538
1539		p->data_start = p->data->start;
1540		upload = TRUE;
1541	}
1542
1543	if (!upload)
1544		return FALSE;
1545
1546	up = ptr = MALLOC(p->exec_size * 4);
1547	for (e = p->exec_head; e; e = e->next) {
1548		*(ptr++) = e->inst[0];
1549		if (is_long(e))
1550			*(ptr++) = e->inst[1];
1551	}
1552
1553	so = so_new(3,2);
1554	so_method(so, nv50->screen->tesla, 0x1280, 3);
1555	so_reloc (so, p->buffer, 0, flags | NOUVEAU_BO_HIGH, 0, 0);
1556	so_reloc (so, p->buffer, 0, flags | NOUVEAU_BO_LOW, 0, 0);
1557	so_data  (so, (NV50_CB_PUPLOAD << 16) | 0x0800); //(p->exec_size * 4));
1558
1559	start = 0; count = p->exec_size;
1560	while (count) {
1561		struct nouveau_winsys *nvws = nv50->screen->nvws;
1562		unsigned nr;
1563
1564		so_emit(nvws, so);
1565
1566		nr = MIN2(count, 2047);
1567		nr = MIN2(nvws->channel->pushbuf->remaining, nr);
1568		if (nvws->channel->pushbuf->remaining < (nr + 3)) {
1569			FIRE_RING(NULL);
1570			continue;
1571		}
1572
1573		BEGIN_RING(tesla, 0x0f00, 1);
1574		OUT_RING  ((start << 8) | NV50_CB_PUPLOAD);
1575		BEGIN_RING(tesla, 0x40000f04, nr);
1576		OUT_RINGp (up + start, nr);
1577
1578		start += nr;
1579		count -= nr;
1580	}
1581
1582	FREE(up);
1583	so_ref(NULL, &so);
1584}
1585
1586void
1587nv50_vertprog_validate(struct nv50_context *nv50)
1588{
1589	struct nouveau_grobj *tesla = nv50->screen->tesla;
1590	struct nv50_program *p = nv50->vertprog;
1591	struct nouveau_stateobj *so;
1592
1593	if (!p->translated) {
1594		nv50_program_validate(nv50, p);
1595		if (!p->translated)
1596			assert(0);
1597	}
1598
1599	nv50_program_validate_data(nv50, p);
1600	nv50_program_validate_code(nv50, p);
1601
1602	so = so_new(11, 2);
1603	so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2);
1604	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
1605		  NOUVEAU_BO_HIGH, 0, 0);
1606	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
1607		  NOUVEAU_BO_LOW, 0, 0);
1608	so_method(so, tesla, 0x1650, 2);
1609	so_data  (so, p->cfg.vp.attr[0]);
1610	so_data  (so, p->cfg.vp.attr[1]);
1611	so_method(so, tesla, 0x16b8, 1);
1612	so_data  (so, p->cfg.high_result);
1613	so_method(so, tesla, 0x16ac, 2);
1614	so_data  (so, p->cfg.high_result); //8);
1615	so_data  (so, p->cfg.high_temp);
1616	so_method(so, tesla, 0x140c, 1);
1617	so_data  (so, 0); /* program start offset */
1618	so_ref(so, &nv50->state.vertprog);
1619}
1620
1621void
1622nv50_fragprog_validate(struct nv50_context *nv50)
1623{
1624	struct nouveau_grobj *tesla = nv50->screen->tesla;
1625	struct nv50_program *p = nv50->fragprog;
1626	struct nouveau_stateobj *so;
1627
1628	if (!p->translated) {
1629		nv50_program_validate(nv50, p);
1630		if (!p->translated)
1631			assert(0);
1632	}
1633
1634	nv50_program_validate_data(nv50, p);
1635	nv50_program_validate_code(nv50, p);
1636
1637	so = so_new(64, 2);
1638	so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2);
1639	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
1640		  NOUVEAU_BO_HIGH, 0, 0);
1641	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
1642		  NOUVEAU_BO_LOW, 0, 0);
1643	so_method(so, tesla, 0x1904, 4);
1644	so_data  (so, 0x01040404); /* p: 0x01000404 */
1645	so_data  (so, 0x00000004);
1646	so_data  (so, 0x00000000);
1647	so_data  (so, 0x00000000);
1648	so_method(so, tesla, 0x16bc, 3); /*XXX: fixme */
1649	so_data  (so, 0x03020100);
1650	so_data  (so, 0x07060504);
1651	so_data  (so, 0x0b0a0908);
1652	so_method(so, tesla, 0x1988, 2);
1653	so_data  (so, 0x08080408); //0x08040404); /* p: 0x0f000401 */
1654	so_data  (so, p->cfg.high_temp);
1655	so_method(so, tesla, 0x1414, 1);
1656	so_data  (so, 0); /* program start offset */
1657	so_ref(so, &nv50->state.fragprog);
1658}
1659
1660void
1661nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
1662{
1663	struct pipe_winsys *ws = nv50->pipe.winsys;
1664
1665	while (p->exec_head) {
1666		struct nv50_program_exec *e = p->exec_head;
1667
1668		p->exec_head = e->next;
1669		FREE(e);
1670	}
1671	p->exec_tail = NULL;
1672	p->exec_size = 0;
1673
1674	if (p->buffer)
1675		pipe_buffer_reference(ws, &p->buffer, NULL);
1676
1677	p->translated = 0;
1678}
1679
1680