nv50_program.c revision fd7412a7f1beab8b81ce307b1054331eee102e8b
1#include "pipe/p_context.h"
2#include "pipe/p_defines.h"
3#include "pipe/p_state.h"
4#include "pipe/p_inlines.h"
5
6#include "pipe/p_shader_tokens.h"
7#include "tgsi/util/tgsi_parse.h"
8#include "tgsi/util/tgsi_util.h"
9
10#include "nv50_context.h"
11
12#define NV50_SU_MAX_TEMP 64
13#define NV50_PROGRAM_DUMP
14
15/* ARL - gallium craps itself on progs/vp/arl.txt
16 *
17 * MSB - Like MAD, but MUL+SUB
 * 	- Drop it as a separate op; instead introduce a way to negate
 * 	  args for ops that support it.
20 *
21 * Look into inlining IMMD for ops other than MOV (make it general?)
22 * 	- Maybe even relax restrictions a bit, can't do P_RESULT + P_IMMD,
23 * 	  but can emit to P_TEMP first - then MOV later. NVIDIA does this
24 *
25 * In ops such as ADD it's possible to construct a bad opcode in the !is_long()
26 * case, if the emit_src() causes the inst to suddenly become long.
27 *
28 * Verify half-insns work where expected - and force disable them where they
29 * don't work - MUL has it forcibly disabled atm as it fixes POW..
30 *
 * NOTE: watch out for dst==src vectors - writing one component can
 * 	overwrite a component that is still needed, e.g. SUB R0, R0.yzxw, R0
33 *
34 * Things to check with renouveau:
35 * 	FP attr/result assignment - how?
36 * 		attrib
37 * 			- 0x16bc maps vp output onto fp hpos
38 * 			- 0x16c0 maps vp output onto fp col0
39 * 		result
40 * 			- colr always 0-3
41 * 			- depr always 4
42 * 0x16bc->0x16e8 --> some binding between vp/fp regs
43 * 0x16b8 --> VP output count
44 *
45 * 0x1298 --> "MOV rcol.x, fcol.y" "MOV depr, fcol.y" = 0x00000005
46 * 	      "MOV rcol.x, fcol.y" = 0x00000004
47 * 0x19a8 --> as above but 0x00000100 and 0x00000000
48 * 	- 0x00100000 used when KIL used
49 * 0x196c --> as above but 0x00000011 and 0x00000000
50 *
51 * 0x1988 --> 0xXXNNNNNN
52 * 	- XX == FP high something
53 */
/* Descriptor for one scalar register operand used by the translator. */
struct nv50_reg {
	/* Register file the operand lives in. */
	enum {
		P_TEMP,
		P_ATTR,
		P_RESULT,
		P_CONST,
		P_IMMD
	} type;
	/* -1 for descriptors created by alloc_temp()/alloc_immd(); for TGSI
	 * temporaries nv50_program_tx_prep() stores the TGSI index here. */
	int index;

	/* Hardware slot. -1 on a P_TEMP means "not assigned yet" (see
	 * alloc_reg()); for alloc_immd() results it is a float offset into
	 * nv50_pc::immd_buf. */
	int hw;
	/* NOTE(review): not referenced by the code visible in this part of
	 * the file - confirm intended use. */
	int neg;
};
67
/* Per-translation state ("program context") used while converting a TGSI
 * program into nv50 instructions. */
struct nv50_pc {
	/* Program being built; emit() appends instructions to it. */
	struct nv50_program *p;

	/* hw resources */
	/* r_temp[i] is non-NULL while hardware temp slot i is in use
	 * (maintained by alloc_reg()/alloc_temp()/free_temp()). */
	struct nv50_reg *r_temp[NV50_SU_MAX_TEMP];

	/* tgsi resources */
	/* The arrays below are indexed as (tgsi_index * 4 + component), see
	 * tgsi_dst()/tgsi_src(); the *_nr fields count vec4 registers. */
	struct nv50_reg *temp;
	int temp_nr;
	struct nv50_reg *attr;
	int attr_nr;
	struct nv50_reg *result;
	int result_nr;
	struct nv50_reg *param;
	int param_nr;
	struct nv50_reg *immd;
	/* Immediate values, four floats per entry; grown by ctor_immd(). */
	float *immd_buf;
	int immd_nr;

	/* Scratch temps handed out by temp_temp() and released together by
	 * kill_temp_temp(). */
	struct nv50_reg *temp_temp[16];
	unsigned temp_temp_nr;
};
90
91static void
92alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
93{
94	int i;
95
96	if (reg->type == P_RESULT) {
97		if (pc->p->cfg.high_result < (reg->hw + 1))
98			pc->p->cfg.high_result = reg->hw + 1;
99	}
100
101	if (reg->type != P_TEMP)
102		return;
103
104	if (reg->hw >= 0) {
105		/*XXX: do this here too to catch FP temp-as-attr usage..
106		 *     not clean, but works */
107		if (pc->p->cfg.high_temp < (reg->hw + 1))
108			pc->p->cfg.high_temp = reg->hw + 1;
109		return;
110	}
111
112	for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
113		if (!(pc->r_temp[i])) {
114			pc->r_temp[i] = reg;
115			reg->hw = i;
116			if (pc->p->cfg.high_temp < (i + 1))
117				pc->p->cfg.high_temp = i + 1;
118			return;
119		}
120	}
121
122	assert(0);
123}
124
125static struct nv50_reg *
126alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
127{
128	struct nv50_reg *r;
129	int i;
130
131	if (dst && dst->type == P_TEMP && dst->hw == -1)
132		return dst;
133
134	for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
135		if (!pc->r_temp[i]) {
136			r = CALLOC_STRUCT(nv50_reg);
137			r->type = P_TEMP;
138			r->index = -1;
139			r->hw = i;
140			pc->r_temp[i] = r;
141			return r;
142		}
143	}
144
145	assert(0);
146	return NULL;
147}
148
149static void
150free_temp(struct nv50_pc *pc, struct nv50_reg *r)
151{
152	if (r->index == -1) {
153		unsigned hw = r->hw;
154
155		FREE(pc->r_temp[hw]);
156		pc->r_temp[hw] = NULL;
157	}
158}
159
160static struct nv50_reg *
161temp_temp(struct nv50_pc *pc)
162{
163	if (pc->temp_temp_nr >= 16)
164		assert(0);
165
166	pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL);
167	return pc->temp_temp[pc->temp_temp_nr++];
168}
169
170static void
171kill_temp_temp(struct nv50_pc *pc)
172{
173	int i;
174
175	for (i = 0; i < pc->temp_temp_nr; i++)
176		free_temp(pc, pc->temp_temp[i]);
177	pc->temp_temp_nr = 0;
178}
179
180static int
181ctor_immd(struct nv50_pc *pc, float x, float y, float z, float w)
182{
183	pc->immd_buf = realloc(pc->immd_buf, (pc->immd_nr + 1) * 4 *
184					     sizeof(float));
185	pc->immd_buf[(pc->immd_nr * 4) + 0] = x;
186	pc->immd_buf[(pc->immd_nr * 4) + 1] = y;
187	pc->immd_buf[(pc->immd_nr * 4) + 2] = z;
188	pc->immd_buf[(pc->immd_nr * 4) + 3] = w;
189
190	return pc->immd_nr++;
191}
192
193static struct nv50_reg *
194alloc_immd(struct nv50_pc *pc, float f)
195{
196	struct nv50_reg *r = CALLOC_STRUCT(nv50_reg);
197	unsigned hw;
198
199	hw = ctor_immd(pc, f, 0, 0, 0) * 4;
200	r->type = P_IMMD;
201	r->hw = hw;
202	r->index = -1;
203	return r;
204}
205
206static struct nv50_program_exec *
207exec(struct nv50_pc *pc)
208{
209	struct nv50_program_exec *e = CALLOC_STRUCT(nv50_program_exec);
210
211	e->param.index = -1;
212	return e;
213}
214
215static void
216emit(struct nv50_pc *pc, struct nv50_program_exec *e)
217{
218	struct nv50_program *p = pc->p;
219
220	if (p->exec_tail)
221		p->exec_tail->next = e;
222	if (!p->exec_head)
223		p->exec_head = e;
224	p->exec_tail = e;
225	p->exec_size += (e->inst[0] & 1) ? 2 : 1;
226}
227
228static INLINE void set_long(struct nv50_pc *, struct nv50_program_exec *);
229
230static boolean
231is_long(struct nv50_program_exec *e)
232{
233	if (e->inst[0] & 1)
234		return TRUE;
235	return FALSE;
236}
237
238static boolean
239is_immd(struct nv50_program_exec *e)
240{
241	if (is_long(e) && (e->inst[1] & 3) == 3)
242		return TRUE;
243	return FALSE;
244}
245
246static INLINE void
247set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx,
248	 struct nv50_program_exec *e)
249{
250	set_long(pc, e);
251	e->inst[1] &= ~((0x1f << 7) | (0x3 << 12));
252	e->inst[1] |= (pred << 7) | (idx << 12);
253}
254
255static INLINE void
256set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx,
257	    struct nv50_program_exec *e)
258{
259	set_long(pc, e);
260	e->inst[1] &= ~((0x3 << 4) | (1 << 6));
261	e->inst[1] |= (idx << 4) | (on << 6);
262}
263
264static INLINE void
265set_long(struct nv50_pc *pc, struct nv50_program_exec *e)
266{
267	if (is_long(e))
268		return;
269
270	e->inst[0] |= 1;
271	set_pred(pc, 0xf, 0, e);
272	set_pred_wr(pc, 0, 0, e);
273}
274
275static INLINE void
276set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e)
277{
278	if (dst->type == P_RESULT) {
279		set_long(pc, e);
280		e->inst[1] |= 0x00000008;
281	}
282
283	alloc_reg(pc, dst);
284	e->inst[0] |= (dst->hw << 2);
285}
286
287static INLINE void
288set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)
289{
290	unsigned val = fui(pc->immd_buf[imm->hw]); /* XXX */
291
292	set_long(pc, e);
293	/*XXX: can't be predicated - bits overlap.. catch cases where both
294	 *     are required and avoid them. */
295	set_pred(pc, 0, 0, e);
296	set_pred_wr(pc, 0, 0, e);
297
298	e->inst[1] |= 0x00000002 | 0x00000001;
299	e->inst[0] |= (val & 0x3f) << 16;
300	e->inst[1] |= (val >> 6) << 2;
301}
302
303static void
304emit_interp(struct nv50_pc *pc, struct nv50_reg *dst,
305	    struct nv50_reg *src, struct nv50_reg *iv, boolean noperspective)
306{
307	struct nv50_program_exec *e = exec(pc);
308
309	e->inst[0] |= 0x80000000;
310	set_dst(pc, dst, e);
311	alloc_reg(pc, iv);
312	e->inst[0] |= (iv->hw << 9);
313	alloc_reg(pc, src);
314	e->inst[0] |= (src->hw << 16);
315	if (noperspective)
316		e->inst[0] |= (1 << 25);
317
318	emit(pc, e);
319}
320
321static void
322set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s,
323	 struct nv50_program_exec *e)
324{
325	set_long(pc, e);
326#if 1
327	e->inst[1] |= (1 << 22);
328#else
329	if (src->type == P_IMMD) {
330		e->inst[1] |= (NV50_CB_PMISC << 22);
331	} else {
332		if (pc->p->type == PIPE_SHADER_VERTEX)
333			e->inst[1] |= (NV50_CB_PVP << 22);
334		else
335			e->inst[1] |= (NV50_CB_PFP << 22);
336	}
337#endif
338
339	e->param.index = src->hw;
340	e->param.shift = s;
341	e->param.mask = m << (s % 32);
342}
343
344static void
345emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
346{
347	struct nv50_program_exec *e = exec(pc);
348
349	e->inst[0] |= 0x10000000;
350
351	set_dst(pc, dst, e);
352
353	if (0 && dst->type != P_RESULT && src->type == P_IMMD) {
354		set_immd(pc, src, e);
355		/*XXX: 32-bit, but steals part of "half" reg space - need to
356		 *     catch and handle this case if/when we do half-regs
357		 */
358		e->inst[0] |= 0x00008000;
359	} else
360	if (src->type == P_IMMD || src->type == P_CONST) {
361		set_long(pc, e);
362		set_data(pc, src, 0x7f, 9, e);
363		e->inst[1] |= 0x20000000; /* src0 const? */
364	} else {
365		if (src->type == P_ATTR) {
366			set_long(pc, e);
367			e->inst[1] |= 0x00200000;
368		}
369
370		alloc_reg(pc, src);
371		e->inst[0] |= (src->hw << 9);
372	}
373
374	/* We really should support "half" instructions here at some point,
375	 * but I don't feel confident enough about them yet.
376	 */
377	set_long(pc, e);
378	if (is_long(e) && !is_immd(e)) {
379		e->inst[1] |= 0x04000000; /* 32-bit */
380		e->inst[1] |= 0x0003c000; /* "subsubop" 0xf == mov */
381	}
382
383	emit(pc, e);
384}
385
386static boolean
387check_swap_src_0_1(struct nv50_pc *pc,
388		   struct nv50_reg **s0, struct nv50_reg **s1)
389{
390	struct nv50_reg *src0 = *s0, *src1 = *s1;
391
392	if (src0->type == P_CONST) {
393		if (src1->type != P_CONST) {
394			*s0 = src1;
395			*s1 = src0;
396			return TRUE;
397		}
398	} else
399	if (src1->type == P_ATTR) {
400		if (src0->type != P_ATTR) {
401			*s0 = src1;
402			*s1 = src0;
403			return TRUE;
404		}
405	}
406
407	return FALSE;
408}
409
410static void
411set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
412{
413	if (src->type == P_ATTR) {
414		set_long(pc, e);
415		e->inst[1] |= 0x00200000;
416	} else
417	if (src->type == P_CONST || src->type == P_IMMD) {
418		struct nv50_reg *temp = temp_temp(pc);
419
420		emit_mov(pc, temp, src);
421		src = temp;
422	}
423
424	alloc_reg(pc, src);
425	e->inst[0] |= (src->hw << 9);
426}
427
428static void
429set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
430{
431	if (src->type == P_ATTR) {
432		struct nv50_reg *temp = temp_temp(pc);
433
434		emit_mov(pc, temp, src);
435		src = temp;
436	} else
437	if (src->type == P_CONST || src->type == P_IMMD) {
438		assert(!(e->inst[0] & 0x00800000));
439		if (e->inst[0] & 0x01000000) {
440			struct nv50_reg *temp = temp_temp(pc);
441
442			emit_mov(pc, temp, src);
443			src = temp;
444		} else {
445			set_data(pc, src, 0x7f, 16, e);
446			e->inst[0] |= 0x00800000;
447		}
448	}
449
450	alloc_reg(pc, src);
451	e->inst[0] |= (src->hw << 16);
452}
453
454static void
455set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
456{
457	set_long(pc, e);
458
459	if (src->type == P_ATTR) {
460		struct nv50_reg *temp = temp_temp(pc);
461
462		emit_mov(pc, temp, src);
463		src = temp;
464	} else
465	if (src->type == P_CONST || src->type == P_IMMD) {
466		assert(!(e->inst[0] & 0x01000000));
467		if (e->inst[0] & 0x00800000) {
468			struct nv50_reg *temp = temp_temp(pc);
469
470			emit_mov(pc, temp, src);
471			src = temp;
472		} else {
473			set_data(pc, src, 0x7f, 32+14, e);
474			e->inst[0] |= 0x01000000;
475		}
476	}
477
478	alloc_reg(pc, src);
479	e->inst[1] |= (src->hw << 14);
480}
481
482static void
483emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
484	 struct nv50_reg *src1)
485{
486	struct nv50_program_exec *e = exec(pc);
487
488	e->inst[0] |= 0xc0000000;
489	set_long(pc, e);
490
491	check_swap_src_0_1(pc, &src0, &src1);
492	set_dst(pc, dst, e);
493	set_src_0(pc, src0, e);
494	set_src_1(pc, src1, e);
495
496	emit(pc, e);
497}
498
499static void
500emit_add(struct nv50_pc *pc, struct nv50_reg *dst,
501	 struct nv50_reg *src0, struct nv50_reg *src1)
502{
503	struct nv50_program_exec *e = exec(pc);
504
505	e->inst[0] |= 0xb0000000;
506
507	check_swap_src_0_1(pc, &src0, &src1);
508	set_dst(pc, dst, e);
509	set_src_0(pc, src0, e);
510	if (is_long(e))
511		set_src_2(pc, src1, e);
512	else
513		set_src_1(pc, src1, e);
514
515	emit(pc, e);
516}
517
518static void
519emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst,
520	    struct nv50_reg *src0, struct nv50_reg *src1)
521{
522	struct nv50_program_exec *e = exec(pc);
523
524	set_long(pc, e);
525	e->inst[0] |= 0xb0000000;
526	e->inst[1] |= (sub << 29);
527
528	check_swap_src_0_1(pc, &src0, &src1);
529	set_dst(pc, dst, e);
530	set_src_0(pc, src0, e);
531	set_src_1(pc, src1, e);
532
533	emit(pc, e);
534}
535
536static void
537emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
538	 struct nv50_reg *src1)
539{
540	struct nv50_program_exec *e = exec(pc);
541
542	e->inst[0] |= 0xb0000000;
543
544	set_long(pc, e);
545	if (check_swap_src_0_1(pc, &src0, &src1))
546		e->inst[1] |= 0x04000000;
547	else
548		e->inst[1] |= 0x08000000;
549
550	set_dst(pc, dst, e);
551	set_src_0(pc, src0, e);
552	set_src_2(pc, src1, e);
553
554	emit(pc, e);
555}
556
557static void
558emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
559	 struct nv50_reg *src1, struct nv50_reg *src2)
560{
561	struct nv50_program_exec *e = exec(pc);
562
563	e->inst[0] |= 0xe0000000;
564
565	check_swap_src_0_1(pc, &src0, &src1);
566	set_dst(pc, dst, e);
567	set_src_0(pc, src0, e);
568	set_src_1(pc, src1, e);
569	set_src_2(pc, src2, e);
570
571	emit(pc, e);
572}
573
574static void
575emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
576	 struct nv50_reg *src1, struct nv50_reg *src2)
577{
578	struct nv50_program_exec *e = exec(pc);
579
580	e->inst[0] |= 0xe0000000;
581	set_long(pc, e);
582	e->inst[1] |= 0x08000000; /* src0 * src1 - src2 */
583
584	check_swap_src_0_1(pc, &src0, &src1);
585	set_dst(pc, dst, e);
586	set_src_0(pc, src0, e);
587	set_src_1(pc, src1, e);
588	set_src_2(pc, src2, e);
589
590	emit(pc, e);
591}
592
593static void
594emit_flop(struct nv50_pc *pc, unsigned sub,
595	  struct nv50_reg *dst, struct nv50_reg *src)
596{
597	struct nv50_program_exec *e = exec(pc);
598
599	e->inst[0] |= 0x90000000;
600	if (sub) {
601		set_long(pc, e);
602		e->inst[1] |= (sub << 29);
603	}
604
605	set_dst(pc, dst, e);
606	set_src_0(pc, src, e);
607
608	emit(pc, e);
609}
610
611static void
612emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
613{
614	struct nv50_program_exec *e = exec(pc);
615
616	e->inst[0] |= 0xb0000000;
617
618	set_dst(pc, dst, e);
619	set_src_0(pc, src, e);
620	set_long(pc, e);
621	e->inst[1] |= (6 << 29) | 0x00004000;
622
623	emit(pc, e);
624}
625
626static void
627emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
628{
629	struct nv50_program_exec *e = exec(pc);
630
631	e->inst[0] |= 0xb0000000;
632
633	set_dst(pc, dst, e);
634	set_src_0(pc, src, e);
635	set_long(pc, e);
636	e->inst[1] |= (6 << 29);
637
638	emit(pc, e);
639}
640
641static void
642emit_set(struct nv50_pc *pc, unsigned c_op, struct nv50_reg *dst,
643	 struct nv50_reg *src0, struct nv50_reg *src1)
644{
645	struct nv50_program_exec *e = exec(pc);
646	unsigned inv_cop[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };
647	struct nv50_reg *rdst;
648
649	assert(c_op <= 7);
650	if (check_swap_src_0_1(pc, &src0, &src1))
651		c_op = inv_cop[c_op];
652
653	rdst = dst;
654	if (dst->type != P_TEMP)
655		dst = alloc_temp(pc, NULL);
656
657	/* set.u32 */
658	set_long(pc, e);
659	e->inst[0] |= 0xb0000000;
660	e->inst[1] |= (3 << 29);
661	e->inst[1] |= (c_op << 14);
662	/*XXX: breaks things, .u32 by default?
663	 *     decuda will disasm as .u16 and use .lo/.hi regs, but this
664	 *     doesn't seem to match what the hw actually does.
665	inst[1] |= 0x04000000; << breaks things.. .u32 by default?
666	 */
667	set_dst(pc, dst, e);
668	set_src_0(pc, src0, e);
669	set_src_1(pc, src1, e);
670	emit(pc, e);
671
672	/* cvt.f32.u32 */
673	e = exec(pc);
674	e->inst[0] = 0xa0000001;
675	e->inst[1] = 0x64014780;
676	set_dst(pc, rdst, e);
677	set_src_0(pc, dst, e);
678	emit(pc, e);
679
680	if (dst != rdst)
681		free_temp(pc, dst);
682}
683
684static void
685emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
686{
687	struct nv50_program_exec *e = exec(pc);
688
689	e->inst[0] = 0xa0000000; /* cvt */
690	set_long(pc, e);
691	e->inst[1] |= (6 << 29); /* cvt */
692	e->inst[1] |= 0x08000000; /* integer mode */
693	e->inst[1] |= 0x04000000; /* 32 bit */
694	e->inst[1] |= ((0x1 << 3)) << 14; /* .rn */
695	e->inst[1] |= (1 << 14); /* src .f32 */
696	set_dst(pc, dst, e);
697	set_src_0(pc, src, e);
698
699	emit(pc, e);
700}
701
702static void
703emit_pow(struct nv50_pc *pc, struct nv50_reg *dst,
704	 struct nv50_reg *v, struct nv50_reg *e)
705{
706	struct nv50_reg *temp = alloc_temp(pc, NULL);
707
708	emit_flop(pc, 3, temp, v);
709	emit_mul(pc, temp, temp, e);
710	emit_preex2(pc, temp, temp);
711	emit_flop(pc, 6, dst, temp);
712
713	free_temp(pc, temp);
714}
715
716static void
717emit_abs(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
718{
719	struct nv50_program_exec *e = exec(pc);
720
721	e->inst[0] = 0xa0000000; /* cvt */
722	set_long(pc, e);
723	e->inst[1] |= (6 << 29); /* cvt */
724	e->inst[1] |= 0x04000000; /* 32 bit */
725	e->inst[1] |= (1 << 14); /* src .f32 */
726	e->inst[1] |= ((1 << 6) << 14); /* .abs */
727	set_dst(pc, dst, e);
728	set_src_0(pc, src, e);
729
730	emit(pc, e);
731}
732
733static void
734emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
735	 struct nv50_reg **src)
736{
737	struct nv50_reg *one = alloc_immd(pc, 1.0);
738	struct nv50_reg *zero = alloc_immd(pc, 0.0);
739	struct nv50_reg *neg128 = alloc_immd(pc, -127.999999);
740	struct nv50_reg *pos128 = alloc_immd(pc,  127.999999);
741	struct nv50_reg *tmp[4];
742
743	if (mask & (1 << 0))
744		emit_mov(pc, dst[0], one);
745
746	if (mask & (1 << 3))
747		emit_mov(pc, dst[3], one);
748
749	if (mask & (3 << 1)) {
750		if (mask & (1 << 1))
751			tmp[0] = dst[1];
752		else
753			tmp[0] = temp_temp(pc);
754		emit_minmax(pc, 4, tmp[0], src[0], zero);
755	}
756
757	if (mask & (1 << 2)) {
758		set_pred_wr(pc, 1, 0, pc->p->exec_tail);
759
760		tmp[1] = temp_temp(pc);
761		emit_minmax(pc, 4, tmp[1], src[1], zero);
762
763		tmp[3] = temp_temp(pc);
764		emit_minmax(pc, 4, tmp[3], src[3], neg128);
765		emit_minmax(pc, 5, tmp[3], tmp[3], pos128);
766
767		emit_pow(pc, dst[2], tmp[1], tmp[3]);
768		emit_mov(pc, dst[2], zero);
769		set_pred(pc, 3, 0, pc->p->exec_tail);
770	}
771}
772
773static void
774emit_neg(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
775{
776	struct nv50_program_exec *e = exec(pc);
777
778	set_long(pc, e);
779	e->inst[0] |= 0xa0000000; /* delta */
780	e->inst[1] |= (7 << 29); /* delta */
781	e->inst[1] |= 0x04000000; /* negate arg0? probably not */
782	e->inst[1] |= (1 << 14); /* src .f32 */
783	set_dst(pc, dst, e);
784	set_src_0(pc, src, e);
785
786	emit(pc, e);
787}
788
789static struct nv50_reg *
790tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
791{
792	switch (dst->DstRegister.File) {
793	case TGSI_FILE_TEMPORARY:
794		return &pc->temp[dst->DstRegister.Index * 4 + c];
795	case TGSI_FILE_OUTPUT:
796		return &pc->result[dst->DstRegister.Index * 4 + c];
797	case TGSI_FILE_NULL:
798		return NULL;
799	default:
800		break;
801	}
802
803	return NULL;
804}
805
806static struct nv50_reg *
807tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src)
808{
809	struct nv50_reg *r = NULL;
810	struct nv50_reg *temp;
811	unsigned c;
812
813	c = tgsi_util_get_full_src_register_extswizzle(src, chan);
814	switch (c) {
815	case TGSI_EXTSWIZZLE_X:
816	case TGSI_EXTSWIZZLE_Y:
817	case TGSI_EXTSWIZZLE_Z:
818	case TGSI_EXTSWIZZLE_W:
819		switch (src->SrcRegister.File) {
820		case TGSI_FILE_INPUT:
821			r = &pc->attr[src->SrcRegister.Index * 4 + c];
822			break;
823		case TGSI_FILE_TEMPORARY:
824			r = &pc->temp[src->SrcRegister.Index * 4 + c];
825			break;
826		case TGSI_FILE_CONSTANT:
827			r = &pc->param[src->SrcRegister.Index * 4 + c];
828			break;
829		case TGSI_FILE_IMMEDIATE:
830			r = &pc->immd[src->SrcRegister.Index * 4 + c];
831			break;
832		case TGSI_FILE_SAMPLER:
833			break;
834		default:
835			assert(0);
836			break;
837		}
838		break;
839	case TGSI_EXTSWIZZLE_ZERO:
840		r = alloc_immd(pc, 0.0);
841		break;
842	case TGSI_EXTSWIZZLE_ONE:
843		r = alloc_immd(pc, 1.0);
844		break;
845	default:
846		assert(0);
847		break;
848	}
849
850	switch (tgsi_util_get_full_src_register_sign_mode(src, chan)) {
851	case TGSI_UTIL_SIGN_KEEP:
852		break;
853	case TGSI_UTIL_SIGN_CLEAR:
854		temp = temp_temp(pc);
855		emit_abs(pc, temp, r);
856		r = temp;
857		break;
858	case TGSI_UTIL_SIGN_TOGGLE:
859		temp = temp_temp(pc);
860		emit_neg(pc, temp, r);
861		r = temp;
862		break;
863	case TGSI_UTIL_SIGN_SET:
864		temp = temp_temp(pc);
865		emit_abs(pc, temp, r);
866		emit_neg(pc, temp, r);
867		r = temp;
868		break;
869	default:
870		assert(0);
871		break;
872	}
873
874	return r;
875}
876
877static boolean
878nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
879{
880	const struct tgsi_full_instruction *inst = &tok->FullInstruction;
881	struct nv50_reg *rdst[4], *dst[4], *src[3][4], *temp;
882	unsigned mask, sat;
883	int i, c;
884
885	NOUVEAU_ERR("insn %p\n", tok);
886
887	mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
888	sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE;
889
890	for (c = 0; c < 4; c++) {
891		if (mask & (1 << c))
892			dst[c] = tgsi_dst(pc, c, &inst->FullDstRegisters[0]);
893		else
894			dst[c] = NULL;
895	}
896
897	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
898		for (c = 0; c < 4; c++)
899			src[i][c] = tgsi_src(pc, c, &inst->FullSrcRegisters[i]);
900	}
901
902	if (sat) {
903		for (c = 0; c < 4; c++) {
904			rdst[c] = dst[c];
905			dst[c] = temp_temp(pc);
906		}
907	}
908
909	switch (inst->Instruction.Opcode) {
910	case TGSI_OPCODE_ABS:
911		for (c = 0; c < 4; c++) {
912			if (!(mask & (1 << c)))
913				continue;
914			emit_abs(pc, dst[c], src[0][c]);
915		}
916		break;
917	case TGSI_OPCODE_ADD:
918		for (c = 0; c < 4; c++) {
919			if (!(mask & (1 << c)))
920				continue;
921			emit_add(pc, dst[c], src[0][c], src[1][c]);
922		}
923		break;
924	case TGSI_OPCODE_COS:
925		temp = alloc_temp(pc, NULL);
926		emit_precossin(pc, temp, src[0][0]);
927		emit_flop(pc, 5, temp, temp);
928		for (c = 0; c < 4; c++) {
929			if (!(mask & (1 << c)))
930				continue;
931			emit_mov(pc, dst[c], temp);
932		}
933		break;
934	case TGSI_OPCODE_DP3:
935		temp = alloc_temp(pc, NULL);
936		emit_mul(pc, temp, src[0][0], src[1][0]);
937		emit_mad(pc, temp, src[0][1], src[1][1], temp);
938		emit_mad(pc, temp, src[0][2], src[1][2], temp);
939		for (c = 0; c < 4; c++) {
940			if (!(mask & (1 << c)))
941				continue;
942			emit_mov(pc, dst[c], temp);
943		}
944		free_temp(pc, temp);
945		break;
946	case TGSI_OPCODE_DP4:
947		temp = alloc_temp(pc, NULL);
948		emit_mul(pc, temp, src[0][0], src[1][0]);
949		emit_mad(pc, temp, src[0][1], src[1][1], temp);
950		emit_mad(pc, temp, src[0][2], src[1][2], temp);
951		emit_mad(pc, temp, src[0][3], src[1][3], temp);
952		for (c = 0; c < 4; c++) {
953			if (!(mask & (1 << c)))
954				continue;
955			emit_mov(pc, dst[c], temp);
956		}
957		free_temp(pc, temp);
958		break;
959	case TGSI_OPCODE_DPH:
960		temp = alloc_temp(pc, NULL);
961		emit_mul(pc, temp, src[0][0], src[1][0]);
962		emit_mad(pc, temp, src[0][1], src[1][1], temp);
963		emit_mad(pc, temp, src[0][2], src[1][2], temp);
964		emit_add(pc, temp, src[1][3], temp);
965		for (c = 0; c < 4; c++) {
966			if (!(mask & (1 << c)))
967				continue;
968			emit_mov(pc, dst[c], temp);
969		}
970		free_temp(pc, temp);
971		break;
972	case TGSI_OPCODE_DST:
973	{
974		struct nv50_reg *one = alloc_immd(pc, 1.0);
975		if (mask & (1 << 0))
976			emit_mov(pc, dst[0], one);
977		if (mask & (1 << 1))
978			emit_mul(pc, dst[1], src[0][1], src[1][1]);
979		if (mask & (1 << 2))
980			emit_mov(pc, dst[2], src[0][2]);
981		if (mask & (1 << 3))
982			emit_mov(pc, dst[3], src[1][3]);
983		FREE(one);
984	}
985		break;
986	case TGSI_OPCODE_EX2:
987		temp = alloc_temp(pc, NULL);
988		emit_preex2(pc, temp, src[0][0]);
989		emit_flop(pc, 6, temp, temp);
990		for (c = 0; c < 4; c++) {
991			if (!(mask & (1 << c)))
992				continue;
993			emit_mov(pc, dst[c], temp);
994		}
995		free_temp(pc, temp);
996		break;
997	case TGSI_OPCODE_FLR:
998		for (c = 0; c < 4; c++) {
999			if (!(mask & (1 << c)))
1000				continue;
1001			emit_flr(pc, dst[c], src[0][c]);
1002		}
1003		break;
1004	case TGSI_OPCODE_FRC:
1005		temp = alloc_temp(pc, NULL);
1006		for (c = 0; c < 4; c++) {
1007			if (!(mask & (1 << c)))
1008				continue;
1009			emit_flr(pc, temp, src[0][c]);
1010			emit_sub(pc, dst[c], src[0][c], temp);
1011		}
1012		free_temp(pc, temp);
1013		break;
1014	case TGSI_OPCODE_LIT:
1015		emit_lit(pc, &dst[0], mask, &src[0][0]);
1016		break;
1017	case TGSI_OPCODE_LG2:
1018		temp = alloc_temp(pc, NULL);
1019		emit_flop(pc, 3, temp, src[0][0]);
1020		for (c = 0; c < 4; c++) {
1021			if (!(mask & (1 << c)))
1022				continue;
1023			emit_mov(pc, dst[c], temp);
1024		}
1025		break;
1026	case TGSI_OPCODE_LRP:
1027		for (c = 0; c < 4; c++) {
1028			if (!(mask & (1 << c)))
1029				continue;
1030			/*XXX: we can do better than this */
1031			temp = alloc_temp(pc, NULL);
1032			emit_neg(pc, temp, src[0][c]);
1033			emit_mad(pc, temp, temp, src[2][c], src[2][c]);
1034			emit_mad(pc, dst[c], src[0][c], src[1][c], temp);
1035			free_temp(pc, temp);
1036		}
1037		break;
1038	case TGSI_OPCODE_MAD:
1039		for (c = 0; c < 4; c++) {
1040			if (!(mask & (1 << c)))
1041				continue;
1042			emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]);
1043		}
1044		break;
1045	case TGSI_OPCODE_MAX:
1046		for (c = 0; c < 4; c++) {
1047			if (!(mask & (1 << c)))
1048				continue;
1049			emit_minmax(pc, 4, dst[c], src[0][c], src[1][c]);
1050		}
1051		break;
1052	case TGSI_OPCODE_MIN:
1053		for (c = 0; c < 4; c++) {
1054			if (!(mask & (1 << c)))
1055				continue;
1056			emit_minmax(pc, 5, dst[c], src[0][c], src[1][c]);
1057		}
1058		break;
1059	case TGSI_OPCODE_MOV:
1060		for (c = 0; c < 4; c++) {
1061			if (!(mask & (1 << c)))
1062				continue;
1063			emit_mov(pc, dst[c], src[0][c]);
1064		}
1065		break;
1066	case TGSI_OPCODE_MUL:
1067		for (c = 0; c < 4; c++) {
1068			if (!(mask & (1 << c)))
1069				continue;
1070			emit_mul(pc, dst[c], src[0][c], src[1][c]);
1071		}
1072		break;
1073	case TGSI_OPCODE_POW:
1074		temp = alloc_temp(pc, NULL);
1075		emit_pow(pc, temp, src[0][0], src[1][0]);
1076		for (c = 0; c < 4; c++) {
1077			if (!(mask & (1 << c)))
1078				continue;
1079			emit_mov(pc, dst[c], temp);
1080		}
1081		free_temp(pc, temp);
1082		break;
1083	case TGSI_OPCODE_RCP:
1084		for (c = 0; c < 4; c++) {
1085			if (!(mask & (1 << c)))
1086				continue;
1087			emit_flop(pc, 0, dst[c], src[0][0]);
1088		}
1089		break;
1090	case TGSI_OPCODE_RSQ:
1091		for (c = 0; c < 4; c++) {
1092			if (!(mask & (1 << c)))
1093				continue;
1094			emit_flop(pc, 2, dst[c], src[0][0]);
1095		}
1096		break;
1097	case TGSI_OPCODE_SCS:
1098		temp = alloc_temp(pc, NULL);
1099		emit_precossin(pc, temp, src[0][0]);
1100		if (mask & (1 << 0))
1101			emit_flop(pc, 5, dst[0], temp);
1102		if (mask & (1 << 1))
1103			emit_flop(pc, 4, dst[1], temp);
1104		break;
1105	case TGSI_OPCODE_SGE:
1106		for (c = 0; c < 4; c++) {
1107			if (!(mask & (1 << c)))
1108				continue;
1109			emit_set(pc, 6, dst[c], src[0][c], src[1][c]);
1110		}
1111		break;
1112	case TGSI_OPCODE_SIN:
1113		temp = alloc_temp(pc, NULL);
1114		emit_precossin(pc, temp, src[0][0]);
1115		emit_flop(pc, 4, temp, temp);
1116		for (c = 0; c < 4; c++) {
1117			if (!(mask & (1 << c)))
1118				continue;
1119			emit_mov(pc, dst[c], temp);
1120		}
1121		break;
1122	case TGSI_OPCODE_SLT:
1123		for (c = 0; c < 4; c++) {
1124			if (!(mask & (1 << c)))
1125				continue;
1126			emit_set(pc, 1, dst[c], src[0][c], src[1][c]);
1127		}
1128		break;
1129	case TGSI_OPCODE_SUB:
1130		for (c = 0; c < 4; c++) {
1131			if (!(mask & (1 << c)))
1132				continue;
1133			emit_sub(pc, dst[c], src[0][c], src[1][c]);
1134		}
1135		break;
1136	case TGSI_OPCODE_TEX:
1137		{
1138			struct nv50_reg *t0, *t1, *t2, *t3;
1139			struct nv50_program_exec *e;
1140
1141			t0 = alloc_temp(pc, NULL);
1142			t0 = alloc_temp(pc, NULL);
1143			t1 = alloc_temp(pc, NULL);
1144			t2 = alloc_temp(pc, NULL);
1145			t3 = alloc_temp(pc, NULL);
1146			emit_mov(pc, t0, src[0][0]);
1147			emit_mov(pc, t1, src[0][1]);
1148
1149			e = exec(pc);
1150			e->inst[0] = 0xf0400000;
1151			set_long(pc, e);
1152			e->inst[1] |= 0x0000c004;
1153			set_dst(pc, t0, e);
1154			emit(pc, e);
1155
1156			if (mask & (1 << 0)) emit_mov(pc, dst[0], t0);
1157			if (mask & (1 << 1)) emit_mov(pc, dst[1], t1);
1158			if (mask & (1 << 2)) emit_mov(pc, dst[2], t2);
1159			if (mask & (1 << 3)) emit_mov(pc, dst[3], t3);
1160
1161			free_temp(pc, t0);
1162			free_temp(pc, t1);
1163			free_temp(pc, t2);
1164			free_temp(pc, t3);
1165		}
1166		break;
1167	case TGSI_OPCODE_XPD:
1168		temp = alloc_temp(pc, NULL);
1169		if (mask & (1 << 0)) {
1170			emit_mul(pc, temp, src[0][2], src[1][1]);
1171			emit_msb(pc, dst[0], src[0][1], src[1][2], temp);
1172		}
1173		if (mask & (1 << 1)) {
1174			emit_mul(pc, temp, src[0][0], src[1][2]);
1175			emit_msb(pc, dst[1], src[0][2], src[1][0], temp);
1176		}
1177		if (mask & (1 << 2)) {
1178			emit_mul(pc, temp, src[0][1], src[1][0]);
1179			emit_msb(pc, dst[2], src[0][0], src[1][1], temp);
1180		}
1181		free_temp(pc, temp);
1182		break;
1183	case TGSI_OPCODE_END:
1184		break;
1185	default:
1186		NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode);
1187		return FALSE;
1188	}
1189
1190	if (sat) {
1191		for (c = 0; c < 4; c++) {
1192			struct nv50_program_exec *e;
1193
1194			if (!(mask & (1 << c)))
1195				continue;
1196			e = exec(pc);
1197
1198			e->inst[0] = 0xa0000000; /* cvt */
1199			set_long(pc, e);
1200			e->inst[1] |= (6 << 29); /* cvt */
1201			e->inst[1] |= 0x04000000; /* 32 bit */
1202			e->inst[1] |= (1 << 14); /* src .f32 */
1203			e->inst[1] |= ((1 << 5) << 14); /* .sat */
1204			set_dst(pc, rdst[c], e);
1205			set_src_0(pc, dst[c], e);
1206			emit(pc, e);
1207		}
1208	}
1209
1210	kill_temp_temp(pc);
1211	return TRUE;
1212}
1213
1214static boolean
1215nv50_program_tx_prep(struct nv50_pc *pc)
1216{
1217	struct tgsi_parse_context p;
1218	boolean ret = FALSE;
1219	unsigned i, c;
1220
1221	tgsi_parse_init(&p, pc->p->pipe.tokens);
1222	while (!tgsi_parse_end_of_tokens(&p)) {
1223		const union tgsi_full_token *tok = &p.FullToken;
1224
1225		tgsi_parse_token(&p);
1226		switch (tok->Token.Type) {
1227		case TGSI_TOKEN_TYPE_IMMEDIATE:
1228		{
1229			const struct tgsi_full_immediate *imm =
1230				&p.FullToken.FullImmediate;
1231
1232			ctor_immd(pc, imm->u.ImmediateFloat32[0].Float,
1233				      imm->u.ImmediateFloat32[1].Float,
1234				      imm->u.ImmediateFloat32[2].Float,
1235				      imm->u.ImmediateFloat32[3].Float);
1236		}
1237			break;
1238		case TGSI_TOKEN_TYPE_DECLARATION:
1239		{
1240			const struct tgsi_full_declaration *d;
1241			unsigned last;
1242
1243			d = &p.FullToken.FullDeclaration;
1244			last = d->u.DeclarationRange.Last;
1245
1246			switch (d->Declaration.File) {
1247			case TGSI_FILE_TEMPORARY:
1248				if (pc->temp_nr < (last + 1))
1249					pc->temp_nr = last + 1;
1250				break;
1251			case TGSI_FILE_OUTPUT:
1252				if (pc->result_nr < (last + 1))
1253					pc->result_nr = last + 1;
1254				break;
1255			case TGSI_FILE_INPUT:
1256				if (pc->attr_nr < (last + 1))
1257					pc->attr_nr = last + 1;
1258				break;
1259			case TGSI_FILE_CONSTANT:
1260				if (pc->param_nr < (last + 1))
1261					pc->param_nr = last + 1;
1262				break;
1263			case TGSI_FILE_SAMPLER:
1264				break;
1265			default:
1266				NOUVEAU_ERR("bad decl file %d\n",
1267					    d->Declaration.File);
1268				goto out_err;
1269			}
1270		}
1271			break;
1272		case TGSI_TOKEN_TYPE_INSTRUCTION:
1273			break;
1274		default:
1275			break;
1276		}
1277	}
1278
1279	NOUVEAU_ERR("%d temps\n", pc->temp_nr);
1280	if (pc->temp_nr) {
1281		pc->temp = calloc(pc->temp_nr * 4, sizeof(struct nv50_reg));
1282		if (!pc->temp)
1283			goto out_err;
1284
1285		for (i = 0; i < pc->temp_nr; i++) {
1286			for (c = 0; c < 4; c++) {
1287				pc->temp[i*4+c].type = P_TEMP;
1288				pc->temp[i*4+c].hw = -1;
1289				pc->temp[i*4+c].index = i;
1290			}
1291		}
1292	}
1293
1294	NOUVEAU_ERR("%d attrib regs\n", pc->attr_nr);
1295	if (pc->attr_nr) {
1296		struct nv50_reg *iv = NULL;
1297		int aid = 0;
1298
1299		pc->attr = calloc(pc->attr_nr * 4, sizeof(struct nv50_reg));
1300		if (!pc->attr)
1301			goto out_err;
1302
1303		if (pc->p->type == PIPE_SHADER_FRAGMENT) {
1304			iv = alloc_temp(pc, NULL);
1305			emit_interp(pc, iv, iv, iv, FALSE);
1306			emit_flop(pc, 0, iv, iv);
1307			aid++;
1308		}
1309
1310		for (i = 0; i < pc->attr_nr; i++) {
1311			struct nv50_reg *a = &pc->attr[i*4];
1312
1313			for (c = 0; c < 4; c++) {
1314				if (pc->p->type == PIPE_SHADER_FRAGMENT) {
1315					struct nv50_reg *at =
1316						alloc_temp(pc, NULL);
1317					pc->attr[i*4+c].type = at->type;
1318					pc->attr[i*4+c].hw = at->hw;
1319					pc->attr[i*4+c].index = at->index;
1320				} else {
1321					pc->p->cfg.vp.attr[aid/32] |=
1322						(1 << (aid % 32));
1323					pc->attr[i*4+c].type = P_ATTR;
1324					pc->attr[i*4+c].hw = aid++;
1325					pc->attr[i*4+c].index = i;
1326				}
1327			}
1328
1329			if (pc->p->type != PIPE_SHADER_FRAGMENT)
1330				continue;
1331
1332			emit_interp(pc, &a[0], &a[0], iv, TRUE);
1333			emit_interp(pc, &a[1], &a[1], iv, TRUE);
1334			emit_interp(pc, &a[2], &a[2], iv, TRUE);
1335			emit_interp(pc, &a[3], &a[3], iv, TRUE);
1336		}
1337
1338		if (iv)
1339			free_temp(pc, iv);
1340	}
1341
1342	NOUVEAU_ERR("%d result regs\n", pc->result_nr);
1343	if (pc->result_nr) {
1344		int rid = 0;
1345
1346		pc->result = calloc(pc->result_nr * 4, sizeof(struct nv50_reg));
1347		if (!pc->result)
1348			goto out_err;
1349
1350		for (i = 0; i < pc->result_nr; i++) {
1351			for (c = 0; c < 4; c++) {
1352				if (pc->p->type == PIPE_SHADER_FRAGMENT) {
1353					pc->result[i*4+c].type = P_TEMP;
1354					pc->result[i*4+c].hw = -1;
1355				} else {
1356					pc->result[i*4+c].type = P_RESULT;
1357					pc->result[i*4+c].hw = rid++;
1358				}
1359				pc->result[i*4+c].index = i;
1360			}
1361		}
1362	}
1363
1364	NOUVEAU_ERR("%d param regs\n", pc->param_nr);
1365	if (pc->param_nr) {
1366		int rid = 0;
1367
1368		pc->param = calloc(pc->param_nr * 4, sizeof(struct nv50_reg));
1369		if (!pc->param)
1370			goto out_err;
1371
1372		for (i = 0; i < pc->param_nr; i++) {
1373			for (c = 0; c < 4; c++) {
1374				pc->param[i*4+c].type = P_CONST;
1375				pc->param[i*4+c].hw = rid++;
1376				pc->param[i*4+c].index = i;
1377			}
1378		}
1379	}
1380
1381	if (pc->immd_nr) {
1382		int rid = pc->param_nr * 4;
1383
1384		pc->immd = calloc(pc->immd_nr * 4, sizeof(struct nv50_reg));
1385		if (!pc->immd)
1386			goto out_err;
1387
1388		for (i = 0; i < pc->immd_nr; i++) {
1389			for (c = 0; c < 4; c++) {
1390				pc->immd[i*4+c].type = P_IMMD;
1391				pc->immd[i*4+c].hw = rid++;
1392				pc->immd[i*4+c].index = i;
1393			}
1394		}
1395	}
1396
1397	ret = TRUE;
1398out_err:
1399	tgsi_parse_free(&p);
1400	return ret;
1401}
1402
1403static boolean
1404nv50_program_tx(struct nv50_program *p)
1405{
1406	struct tgsi_parse_context parse;
1407	struct nv50_pc *pc;
1408	boolean ret;
1409
1410	pc = CALLOC_STRUCT(nv50_pc);
1411	if (!pc)
1412		return FALSE;
1413	pc->p = p;
1414	pc->p->cfg.high_temp = 4;
1415
1416	ret = nv50_program_tx_prep(pc);
1417	if (ret == FALSE)
1418		goto out_cleanup;
1419
1420	tgsi_parse_init(&parse, pc->p->pipe.tokens);
1421	while (!tgsi_parse_end_of_tokens(&parse)) {
1422		const union tgsi_full_token *tok = &parse.FullToken;
1423
1424		tgsi_parse_token(&parse);
1425
1426		switch (tok->Token.Type) {
1427		case TGSI_TOKEN_TYPE_INSTRUCTION:
1428			ret = nv50_program_tx_insn(pc, tok);
1429			if (ret == FALSE)
1430				goto out_err;
1431			break;
1432		default:
1433			break;
1434		}
1435	}
1436
1437	if (p->type == PIPE_SHADER_FRAGMENT) {
1438		struct nv50_reg out;
1439
1440		out.type = P_TEMP;
1441		for (out.hw = 0; out.hw < pc->result_nr * 4; out.hw++)
1442			emit_mov(pc, &out, &pc->result[out.hw]);
1443	}
1444
1445	assert(is_long(pc->p->exec_tail) && !is_immd(pc->p->exec_head));
1446	pc->p->exec_tail->inst[1] |= 0x00000001;
1447
1448	p->param_nr = pc->param_nr * 4;
1449	p->immd_nr = pc->immd_nr * 4;
1450	p->immd = pc->immd_buf;
1451
1452out_err:
1453	tgsi_parse_free(&parse);
1454
1455out_cleanup:
1456	return ret;
1457}
1458
1459static void
1460nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p)
1461{
1462	if (nv50_program_tx(p) == FALSE)
1463		assert(0);
1464	p->translated = TRUE;
1465}
1466
1467static void
1468nv50_program_upload_data(struct nv50_context *nv50, float *map,
1469			 unsigned start, unsigned count)
1470{
1471	while (count) {
1472		unsigned nr = count > 2047 ? 2047 : count;
1473
1474		BEGIN_RING(tesla, 0x00000f00, 1);
1475		OUT_RING  ((NV50_CB_PMISC << 0) | (start << 8));
1476		BEGIN_RING(tesla, 0x40000f04, nr);
1477		OUT_RINGp (map, nr);
1478
1479		map += nr;
1480		start += nr;
1481		count -= nr;
1482	}
1483}
1484
1485static void
1486nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)
1487{
1488	struct nouveau_winsys *nvws = nv50->screen->nvws;
1489	struct pipe_winsys *ws = nv50->pipe.winsys;
1490	unsigned nr = p->param_nr + p->immd_nr;
1491
1492	if (!p->data && nr) {
1493		struct nouveau_resource *heap = nv50->screen->vp_data_heap;
1494
1495		if (nvws->res_alloc(heap, nr, p, &p->data)) {
1496			while (heap->next && heap->size < nr) {
1497				struct nv50_program *evict = heap->next->priv;
1498				nvws->res_free(&evict->data);
1499			}
1500
1501			if (nvws->res_alloc(heap, nr, p, &p->data))
1502				assert(0);
1503		}
1504	}
1505
1506	if (p->param_nr) {
1507		float *map = ws->buffer_map(ws, nv50->constbuf[p->type],
1508					    PIPE_BUFFER_USAGE_CPU_READ);
1509		nv50_program_upload_data(nv50, map, p->data->start,
1510					 p->param_nr);
1511		ws->buffer_unmap(ws, nv50->constbuf[p->type]);
1512	}
1513
1514	if (p->immd_nr) {
1515		nv50_program_upload_data(nv50, p->immd,
1516					 p->data->start + p->param_nr,
1517					 p->immd_nr);
1518	}
1519}
1520
1521static void
1522nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
1523{
1524	struct pipe_winsys *ws = nv50->pipe.winsys;
1525	struct nv50_program_exec *e;
1526	struct nouveau_stateobj *so;
1527	const unsigned flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_WR;
1528	unsigned start, count, *up, *ptr;
1529	boolean upload = FALSE;
1530
1531	if (!p->buffer) {
1532		p->buffer = ws->buffer_create(ws, 0x100, 0, p->exec_size * 4);
1533		upload = TRUE;
1534	}
1535
1536	if (p->data && p->data->start != p->data_start) {
1537		for (e = p->exec_head; e; e = e->next) {
1538			unsigned ei, ci;
1539
1540			if (e->param.index < 0)
1541				continue;
1542			ei = e->param.shift >> 5;
1543			ci = e->param.index + p->data->start;
1544
1545			e->inst[ei] &= ~e->param.mask;
1546			e->inst[ei] |= (ci << e->param.shift);
1547		}
1548
1549		p->data_start = p->data->start;
1550		upload = TRUE;
1551	}
1552
1553	if (!upload)
1554		return FALSE;
1555
1556	up = ptr = MALLOC(p->exec_size * 4);
1557	for (e = p->exec_head; e; e = e->next) {
1558		*(ptr++) = e->inst[0];
1559		if (is_long(e))
1560			*(ptr++) = e->inst[1];
1561	}
1562
1563	so = so_new(3,2);
1564	so_method(so, nv50->screen->tesla, 0x1280, 3);
1565	so_reloc (so, p->buffer, 0, flags | NOUVEAU_BO_HIGH, 0, 0);
1566	so_reloc (so, p->buffer, 0, flags | NOUVEAU_BO_LOW, 0, 0);
1567	so_data  (so, (NV50_CB_PUPLOAD << 16) | 0x0800); //(p->exec_size * 4));
1568
1569	start = 0; count = p->exec_size;
1570	while (count) {
1571		struct nouveau_winsys *nvws = nv50->screen->nvws;
1572		unsigned nr;
1573
1574		so_emit(nvws, so);
1575
1576		nr = MIN2(count, 2047);
1577		nr = MIN2(nvws->channel->pushbuf->remaining, nr);
1578		if (nvws->channel->pushbuf->remaining < (nr + 3)) {
1579			FIRE_RING(NULL);
1580			continue;
1581		}
1582
1583		BEGIN_RING(tesla, 0x0f00, 1);
1584		OUT_RING  ((start << 8) | NV50_CB_PUPLOAD);
1585		BEGIN_RING(tesla, 0x40000f04, nr);
1586		OUT_RINGp (up + start, nr);
1587
1588		start += nr;
1589		count -= nr;
1590	}
1591
1592	FREE(up);
1593	so_ref(NULL, &so);
1594}
1595
1596void
1597nv50_vertprog_validate(struct nv50_context *nv50)
1598{
1599	struct nouveau_grobj *tesla = nv50->screen->tesla;
1600	struct nv50_program *p = nv50->vertprog;
1601	struct nouveau_stateobj *so;
1602
1603	if (!p->translated) {
1604		nv50_program_validate(nv50, p);
1605		if (!p->translated)
1606			assert(0);
1607	}
1608
1609	nv50_program_validate_data(nv50, p);
1610	nv50_program_validate_code(nv50, p);
1611
1612	so = so_new(11, 2);
1613	so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2);
1614	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
1615		  NOUVEAU_BO_HIGH, 0, 0);
1616	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
1617		  NOUVEAU_BO_LOW, 0, 0);
1618	so_method(so, tesla, 0x1650, 2);
1619	so_data  (so, p->cfg.vp.attr[0]);
1620	so_data  (so, p->cfg.vp.attr[1]);
1621	so_method(so, tesla, 0x16b8, 1);
1622	so_data  (so, p->cfg.high_result);
1623	so_method(so, tesla, 0x16ac, 2);
1624	so_data  (so, p->cfg.high_result); //8);
1625	so_data  (so, p->cfg.high_temp);
1626	so_method(so, tesla, 0x140c, 1);
1627	so_data  (so, 0); /* program start offset */
1628	so_ref(so, &nv50->state.vertprog);
1629}
1630
1631void
1632nv50_fragprog_validate(struct nv50_context *nv50)
1633{
1634	struct nouveau_grobj *tesla = nv50->screen->tesla;
1635	struct nv50_program *p = nv50->fragprog;
1636	struct nouveau_stateobj *so;
1637
1638	if (!p->translated) {
1639		nv50_program_validate(nv50, p);
1640		if (!p->translated)
1641			assert(0);
1642	}
1643
1644	nv50_program_validate_data(nv50, p);
1645	nv50_program_validate_code(nv50, p);
1646
1647	so = so_new(64, 2);
1648	so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2);
1649	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
1650		  NOUVEAU_BO_HIGH, 0, 0);
1651	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
1652		  NOUVEAU_BO_LOW, 0, 0);
1653	so_method(so, tesla, 0x1904, 4);
1654	so_data  (so, 0x01040404); /* p: 0x01000404 */
1655	so_data  (so, 0x00000004);
1656	so_data  (so, 0x00000000);
1657	so_data  (so, 0x00000000);
1658	so_method(so, tesla, 0x16bc, 3); /*XXX: fixme */
1659	so_data  (so, 0x03020100);
1660	so_data  (so, 0x07060504);
1661	so_data  (so, 0x0b0a0908);
1662	so_method(so, tesla, 0x1988, 2);
1663	so_data  (so, 0x08080408); //0x08040404); /* p: 0x0f000401 */
1664	so_data  (so, p->cfg.high_temp);
1665	so_method(so, tesla, 0x1414, 1);
1666	so_data  (so, 0); /* program start offset */
1667	so_ref(so, &nv50->state.fragprog);
1668}
1669
1670void
1671nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
1672{
1673	struct pipe_winsys *ws = nv50->pipe.winsys;
1674
1675	while (p->exec_head) {
1676		struct nv50_program_exec *e = p->exec_head;
1677
1678		p->exec_head = e->next;
1679		FREE(e);
1680	}
1681	p->exec_tail = NULL;
1682	p->exec_size = 0;
1683
1684	if (p->buffer)
1685		pipe_buffer_reference(ws, &p->buffer, NULL);
1686
1687	p->translated = 0;
1688}
1689
1690