nv50_program.c revision cae38d0fcc6c936d3a4dc25ca2dbef3d106d05a5
1#include "pipe/p_context.h"
2#include "pipe/p_defines.h"
3#include "pipe/p_state.h"
4#include "pipe/p_inlines.h"
5
6#include "pipe/p_shader_tokens.h"
7#include "tgsi/util/tgsi_parse.h"
8#include "tgsi/util/tgsi_util.h"
9
10#include "nv50_context.h"
11
12#define NV50_SU_MAX_TEMP 64
13#define NV50_PROGRAM_DUMP
14
15/* ARL - gallium craps itself on progs/vp/arl.txt
16 *
17 * MSB - Like MAD, but MUL+SUB
18 * 	- Fuck it off, introduce a way to negate args for ops that
19 * 	  support it.
20 *
21 * Look into inlining IMMD for ops other than MOV (make it general?)
22 * 	- Maybe even relax restrictions a bit, can't do P_RESULT + P_IMMD,
23 * 	  but can emit to P_TEMP first - then MOV later. NVIDIA does this
24 *
25 * In ops such as ADD it's possible to construct a bad opcode in the !is_long()
26 * case, if the emit_src() causes the inst to suddenly become long.
27 *
28 * Verify half-insns work where expected - and force disable them where they
29 * don't work - MUL has it forcibly disabled atm as it fixes POW..
30 *
31 * FUCK! watch dst==src vectors, can overwrite components that are needed.
32 * 	ie. SUB R0, R0.yzxw, R0
33 *
34 * Things to check with renouveau:
35 * 	FP attr/result assignment - how?
36 * 		attrib
37 * 			- 0x16bc maps vp output onto fp hpos
38 * 			- 0x16c0 maps vp output onto fp col0
39 * 		result
40 * 			- colr always 0-3
41 * 			- depr always 4
42 * 0x16bc->0x16e8 --> some binding between vp/fp regs
43 * 0x16b8 --> VP output count
44 *
45 * 0x1298 --> "MOV rcol.x, fcol.y" "MOV depr, fcol.y" = 0x00000005
46 * 	      "MOV rcol.x, fcol.y" = 0x00000004
47 * 0x19a8 --> as above but 0x00000100 and 0x00000000
48 * 	- 0x00100000 used when KIL used
49 * 0x196c --> as above but 0x00000011 and 0x00000000
50 *
51 * 0x1988 --> 0xXXNNNNNN
52 * 	- XX == FP high something
53 */
/* One scalar register operand as tracked by the shader translator. */
struct nv50_reg {
	/* Register file the operand belongs to. */
	enum {
		P_TEMP,
		P_ATTR,
		P_RESULT,
		P_CONST,
		P_IMMD
	} type;
	/* TGSI register index; alloc_temp() and alloc_immd() set this to -1
	 * for registers the translator creates on its own. */
	int index;

	/* Hardware slot. On a P_TEMP, -1 means no slot has been assigned yet
	 * (see alloc_reg()); on a P_IMMD it is an offset into
	 * nv50_pc::immd_buf (see alloc_immd()/set_immd()). */
	int hw;
	/* NOTE(review): not referenced by any code visible in this part of
	 * the file. */
	int neg;
};
67
/* Per-translation state used while turning a TGSI shader into nv50 code. */
struct nv50_pc {
	/* Program being built; emitted instructions are linked onto it. */
	struct nv50_program *p;

	/* hw resources */
	/* r_temp[i] is non-NULL while hardware temp slot i is occupied. */
	struct nv50_reg *r_temp[NV50_SU_MAX_TEMP];

	/* tgsi resources */
	/* Each array below is indexed as (tgsi_index * 4 + channel). */
	struct nv50_reg *temp;
	int temp_nr;
	struct nv50_reg *attr;
	int attr_nr;
	struct nv50_reg *result;
	int result_nr;
	struct nv50_reg *param;
	int param_nr;
	struct nv50_reg *immd;
	/* Immediate values, four floats per entry (see ctor_immd()). */
	float *immd_buf;
	int immd_nr;

	/* Scratch temps handed out by temp_temp() and released in bulk by
	 * kill_temp_temp(). */
	struct nv50_reg *temp_temp[16];
	unsigned temp_temp_nr;
};
90
91static void
92alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
93{
94	int i;
95
96	if (reg->type == P_RESULT) {
97		if (pc->p->cfg.high_result < (reg->hw + 1))
98			pc->p->cfg.high_result = reg->hw + 1;
99	}
100
101	if (reg->type != P_TEMP)
102		return;
103
104	if (reg->hw >= 0) {
105		/*XXX: do this here too to catch FP temp-as-attr usage..
106		 *     not clean, but works */
107		if (pc->p->cfg.high_temp < (reg->hw + 1))
108			pc->p->cfg.high_temp = reg->hw + 1;
109		return;
110	}
111
112	for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
113		if (!(pc->r_temp[i])) {
114			pc->r_temp[i] = reg;
115			reg->hw = i;
116			if (pc->p->cfg.high_temp < (i + 1))
117				pc->p->cfg.high_temp = i + 1;
118			return;
119		}
120	}
121
122	assert(0);
123}
124
125static struct nv50_reg *
126alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
127{
128	struct nv50_reg *r;
129	int i;
130
131	if (dst && dst->type == P_TEMP && dst->hw == -1)
132		return dst;
133
134	for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
135		if (!pc->r_temp[i]) {
136			r = CALLOC_STRUCT(nv50_reg);
137			r->type = P_TEMP;
138			r->index = -1;
139			r->hw = i;
140			pc->r_temp[i] = r;
141			return r;
142		}
143	}
144
145	assert(0);
146	return NULL;
147}
148
149static void
150free_temp(struct nv50_pc *pc, struct nv50_reg *r)
151{
152	if (r->index == -1) {
153		unsigned hw = r->hw;
154
155		FREE(pc->r_temp[hw]);
156		pc->r_temp[hw] = NULL;
157	}
158}
159
160static struct nv50_reg *
161temp_temp(struct nv50_pc *pc)
162{
163	if (pc->temp_temp_nr >= 16)
164		assert(0);
165
166	pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL);
167	return pc->temp_temp[pc->temp_temp_nr++];
168}
169
170static void
171kill_temp_temp(struct nv50_pc *pc)
172{
173	int i;
174
175	for (i = 0; i < pc->temp_temp_nr; i++)
176		free_temp(pc, pc->temp_temp[i]);
177	pc->temp_temp_nr = 0;
178}
179
180static int
181ctor_immd(struct nv50_pc *pc, float x, float y, float z, float w)
182{
183	pc->immd_buf = realloc(pc->immd_buf, (pc->immd_nr + 1) * 4 *
184					     sizeof(float));
185	pc->immd_buf[(pc->immd_nr * 4) + 0] = x;
186	pc->immd_buf[(pc->immd_nr * 4) + 1] = y;
187	pc->immd_buf[(pc->immd_nr * 4) + 2] = z;
188	pc->immd_buf[(pc->immd_nr * 4) + 3] = w;
189
190	return pc->immd_nr++;
191}
192
193static struct nv50_reg *
194alloc_immd(struct nv50_pc *pc, float f)
195{
196	struct nv50_reg *r = CALLOC_STRUCT(nv50_reg);
197	unsigned hw;
198
199	hw = ctor_immd(pc, f, 0, 0, 0) * 4;
200	r->type = P_IMMD;
201	r->hw = hw;
202	r->index = -1;
203	return r;
204}
205
206static struct nv50_program_exec *
207exec(struct nv50_pc *pc)
208{
209	struct nv50_program_exec *e = CALLOC_STRUCT(nv50_program_exec);
210
211	e->param.index = -1;
212	return e;
213}
214
215static void
216emit(struct nv50_pc *pc, struct nv50_program_exec *e)
217{
218	struct nv50_program *p = pc->p;
219
220	if (p->exec_tail)
221		p->exec_tail->next = e;
222	if (!p->exec_head)
223		p->exec_head = e;
224	p->exec_tail = e;
225	p->exec_size += (e->inst[0] & 1) ? 2 : 1;
226}
227
228static INLINE void set_long(struct nv50_pc *, struct nv50_program_exec *);
229
230static boolean
231is_long(struct nv50_program_exec *e)
232{
233	if (e->inst[0] & 1)
234		return TRUE;
235	return FALSE;
236}
237
238static boolean
239is_immd(struct nv50_program_exec *e)
240{
241	if (is_long(e) && (e->inst[1] & 3) == 3)
242		return TRUE;
243	return FALSE;
244}
245
246static INLINE void
247set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx,
248	 struct nv50_program_exec *e)
249{
250	set_long(pc, e);
251	e->inst[1] &= ~((0x1f << 7) | (0x3 << 12));
252	e->inst[1] |= (pred << 7) | (idx << 12);
253}
254
255static INLINE void
256set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx,
257	    struct nv50_program_exec *e)
258{
259	set_long(pc, e);
260	e->inst[1] &= ~((0x3 << 4) | (1 << 6));
261	e->inst[1] |= (idx << 4) | (on << 6);
262}
263
264static INLINE void
265set_long(struct nv50_pc *pc, struct nv50_program_exec *e)
266{
267	if (is_long(e))
268		return;
269
270	e->inst[0] |= 1;
271	set_pred(pc, 0xf, 0, e);
272	set_pred_wr(pc, 0, 0, e);
273}
274
275static INLINE void
276set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e)
277{
278	if (dst->type == P_RESULT) {
279		set_long(pc, e);
280		e->inst[1] |= 0x00000008;
281	}
282
283	alloc_reg(pc, dst);
284	e->inst[0] |= (dst->hw << 2);
285}
286
287static INLINE void
288set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)
289{
290	unsigned val = fui(pc->immd_buf[imm->hw]); /* XXX */
291
292	set_long(pc, e);
293	/*XXX: can't be predicated - bits overlap.. catch cases where both
294	 *     are required and avoid them. */
295	set_pred(pc, 0, 0, e);
296	set_pred_wr(pc, 0, 0, e);
297
298	e->inst[1] |= 0x00000002 | 0x00000001;
299	e->inst[0] |= (val & 0x3f) << 16;
300	e->inst[1] |= (val >> 6) << 2;
301}
302
303static void
304emit_interp(struct nv50_pc *pc, struct nv50_reg *dst,
305	    struct nv50_reg *src, struct nv50_reg *iv, boolean noperspective)
306{
307	struct nv50_program_exec *e = exec(pc);
308
309	e->inst[0] |= 0x80000000;
310	set_dst(pc, dst, e);
311	alloc_reg(pc, iv);
312	e->inst[0] |= (iv->hw << 9);
313	alloc_reg(pc, src);
314	e->inst[0] |= (src->hw << 16);
315	if (noperspective)
316		e->inst[0] |= (1 << 25);
317
318	emit(pc, e);
319}
320
321static void
322set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s,
323	 struct nv50_program_exec *e)
324{
325	set_long(pc, e);
326#if 1
327	e->inst[1] |= (1 << 22);
328#else
329	if (src->type == P_IMMD) {
330		e->inst[1] |= (NV50_CB_PMISC << 22);
331	} else {
332		if (pc->p->type == PIPE_SHADER_VERTEX)
333			e->inst[1] |= (NV50_CB_PVP << 22);
334		else
335			e->inst[1] |= (NV50_CB_PFP << 22);
336	}
337#endif
338
339	e->param.index = src->hw;
340	e->param.shift = s;
341	e->param.mask = m << (s % 32);
342}
343
/* Emit "MOV dst, src" (opcode bits 0x10000000).
 *
 * Constant and immediate sources are read through the constant buffer via
 * set_data(); attribute sources set an extra flag in inst[1]; everything
 * else is encoded as a plain register at bit 9 of inst[0]. The instruction
 * is always emitted in its long form.
 */
static void
emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] |= 0x10000000;

	set_dst(pc, dst, e);

	/* The leading "0 &&" disables the inline-immediate path, so
	 * immediates currently take the constant-buffer branch below. */
	if (0 && dst->type != P_RESULT && src->type == P_IMMD) {
		set_immd(pc, src, e);
		/*XXX: 32-bit, but steals part of "half" reg space - need to
		 *     catch and handle this case if/when we do half-regs
		 */
		e->inst[0] |= 0x00008000;
	} else
	if (src->type == P_IMMD || src->type == P_CONST) {
		set_long(pc, e);
		/* Operand offset is patched in later at bit 9, 7 bits wide. */
		set_data(pc, src, 0x7f, 9, e);
		e->inst[1] |= 0x20000000; /* src0 const? */
	} else {
		if (src->type == P_ATTR) {
			set_long(pc, e);
			e->inst[1] |= 0x00200000;
		}

		/* Plain register source: make sure it has a hardware slot. */
		alloc_reg(pc, src);
		e->inst[0] |= (src->hw << 9);
	}

	/* We really should support "half" instructions here at some point,
	 * but I don't feel confident enough about them yet.
	 */
	set_long(pc, e);
	/* After set_long() the first test is always true; the is_immd()
	 * test only matters for the disabled inline-immediate path. */
	if (is_long(e) && !is_immd(e)) {
		e->inst[1] |= 0x04000000; /* 32-bit */
		e->inst[1] |= 0x0003c000; /* "subsubop" 0xf == mov */
	}

	emit(pc, e);
}
385
386static boolean
387check_swap_src_0_1(struct nv50_pc *pc,
388		   struct nv50_reg **s0, struct nv50_reg **s1)
389{
390	struct nv50_reg *src0 = *s0, *src1 = *s1;
391
392	if (src0->type == P_CONST) {
393		if (src1->type != P_CONST) {
394			*s0 = src1;
395			*s1 = src0;
396			return TRUE;
397		}
398	} else
399	if (src1->type == P_ATTR) {
400		if (src0->type != P_ATTR) {
401			*s0 = src1;
402			*s1 = src0;
403			return TRUE;
404		}
405	}
406
407	return FALSE;
408}
409
410static void
411set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
412{
413	if (src->type == P_ATTR) {
414		set_long(pc, e);
415		e->inst[1] |= 0x00200000;
416	} else
417	if (src->type == P_CONST || src->type == P_IMMD) {
418		struct nv50_reg *temp = temp_temp(pc);
419
420		emit_mov(pc, temp, src);
421		src = temp;
422	}
423
424	alloc_reg(pc, src);
425	e->inst[0] |= (src->hw << 9);
426}
427
428static void
429set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
430{
431	if (src->type == P_ATTR) {
432		struct nv50_reg *temp = temp_temp(pc);
433
434		emit_mov(pc, temp, src);
435		src = temp;
436	} else
437	if (src->type == P_CONST || src->type == P_IMMD) {
438		assert(!(e->inst[0] & 0x00800000));
439		if (e->inst[0] & 0x01000000) {
440			struct nv50_reg *temp = temp_temp(pc);
441
442			emit_mov(pc, temp, src);
443			src = temp;
444		} else {
445			set_data(pc, src, 0x7f, 16, e);
446			e->inst[0] |= 0x00800000;
447		}
448	}
449
450	alloc_reg(pc, src);
451	e->inst[0] |= (src->hw << 16);
452}
453
454static void
455set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
456{
457	set_long(pc, e);
458
459	if (src->type == P_ATTR) {
460		struct nv50_reg *temp = temp_temp(pc);
461
462		emit_mov(pc, temp, src);
463		src = temp;
464	} else
465	if (src->type == P_CONST || src->type == P_IMMD) {
466		assert(!(e->inst[0] & 0x01000000));
467		if (e->inst[0] & 0x00800000) {
468			struct nv50_reg *temp = temp_temp(pc);
469
470			emit_mov(pc, temp, src);
471			src = temp;
472		} else {
473			set_data(pc, src, 0x7f, 32+14, e);
474			e->inst[0] |= 0x01000000;
475		}
476	}
477
478	alloc_reg(pc, src);
479	e->inst[1] |= (src->hw << 14);
480}
481
482static void
483emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
484	 struct nv50_reg *src1)
485{
486	struct nv50_program_exec *e = exec(pc);
487
488	e->inst[0] |= 0xc0000000;
489	set_long(pc, e);
490
491	check_swap_src_0_1(pc, &src0, &src1);
492	set_dst(pc, dst, e);
493	set_src_0(pc, src0, e);
494	set_src_1(pc, src1, e);
495
496	emit(pc, e);
497}
498
499static void
500emit_add(struct nv50_pc *pc, struct nv50_reg *dst,
501	 struct nv50_reg *src0, struct nv50_reg *src1)
502{
503	struct nv50_program_exec *e = exec(pc);
504
505	e->inst[0] |= 0xb0000000;
506
507	check_swap_src_0_1(pc, &src0, &src1);
508	set_dst(pc, dst, e);
509	set_src_0(pc, src0, e);
510	if (is_long(e))
511		set_src_2(pc, src1, e);
512	else
513		set_src_1(pc, src1, e);
514
515	emit(pc, e);
516}
517
518static void
519emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst,
520	    struct nv50_reg *src0, struct nv50_reg *src1)
521{
522	struct nv50_program_exec *e = exec(pc);
523
524	set_long(pc, e);
525	e->inst[0] |= 0xb0000000;
526	e->inst[1] |= (sub << 29);
527
528	check_swap_src_0_1(pc, &src0, &src1);
529	set_dst(pc, dst, e);
530	set_src_0(pc, src0, e);
531	set_src_1(pc, src1, e);
532
533	emit(pc, e);
534}
535
536static void
537emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
538	 struct nv50_reg *src1)
539{
540	struct nv50_program_exec *e = exec(pc);
541
542	e->inst[0] |= 0xb0000000;
543
544	set_long(pc, e);
545	if (check_swap_src_0_1(pc, &src0, &src1))
546		e->inst[1] |= 0x04000000;
547	else
548		e->inst[1] |= 0x08000000;
549
550	set_dst(pc, dst, e);
551	set_src_0(pc, src0, e);
552	set_src_2(pc, src1, e);
553
554	emit(pc, e);
555}
556
557static void
558emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
559	 struct nv50_reg *src1, struct nv50_reg *src2)
560{
561	struct nv50_program_exec *e = exec(pc);
562
563	e->inst[0] |= 0xe0000000;
564
565	check_swap_src_0_1(pc, &src0, &src1);
566	set_dst(pc, dst, e);
567	set_src_0(pc, src0, e);
568	set_src_1(pc, src1, e);
569	set_src_2(pc, src2, e);
570
571	emit(pc, e);
572}
573
574static void
575emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
576	 struct nv50_reg *src1, struct nv50_reg *src2)
577{
578	struct nv50_program_exec *e = exec(pc);
579
580	e->inst[0] |= 0xe0000000;
581	set_long(pc, e);
582	e->inst[1] |= 0x08000000; /* src0 * src1 - src2 */
583
584	check_swap_src_0_1(pc, &src0, &src1);
585	set_dst(pc, dst, e);
586	set_src_0(pc, src0, e);
587	set_src_1(pc, src1, e);
588	set_src_2(pc, src2, e);
589
590	emit(pc, e);
591}
592
593static void
594emit_flop(struct nv50_pc *pc, unsigned sub,
595	  struct nv50_reg *dst, struct nv50_reg *src)
596{
597	struct nv50_program_exec *e = exec(pc);
598
599	e->inst[0] |= 0x90000000;
600	if (sub) {
601		set_long(pc, e);
602		e->inst[1] |= (sub << 29);
603	}
604
605	set_dst(pc, dst, e);
606	set_src_0(pc, src, e);
607
608	emit(pc, e);
609}
610
611static void
612emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
613{
614	struct nv50_program_exec *e = exec(pc);
615
616	e->inst[0] |= 0xb0000000;
617
618	set_dst(pc, dst, e);
619	set_src_0(pc, src, e);
620	set_long(pc, e);
621	e->inst[1] |= (6 << 29) | 0x00004000;
622
623	emit(pc, e);
624}
625
626static void
627emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
628{
629	struct nv50_program_exec *e = exec(pc);
630
631	e->inst[0] |= 0xb0000000;
632
633	set_dst(pc, dst, e);
634	set_src_0(pc, src, e);
635	set_long(pc, e);
636	e->inst[1] |= (6 << 29);
637
638	emit(pc, e);
639}
640
641static void
642emit_set(struct nv50_pc *pc, unsigned c_op, struct nv50_reg *dst,
643	 struct nv50_reg *src0, struct nv50_reg *src1)
644{
645	struct nv50_program_exec *e = exec(pc);
646	unsigned inv_cop[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };
647	struct nv50_reg *rdst;
648
649	assert(c_op <= 7);
650	if (check_swap_src_0_1(pc, &src0, &src1))
651		c_op = inv_cop[c_op];
652
653	rdst = dst;
654	if (dst->type != P_TEMP)
655		dst = alloc_temp(pc, NULL);
656
657	/* set.u32 */
658	set_long(pc, e);
659	e->inst[0] |= 0xb0000000;
660	e->inst[1] |= (3 << 29);
661	e->inst[1] |= (c_op << 14);
662	/*XXX: breaks things, .u32 by default?
663	 *     decuda will disasm as .u16 and use .lo/.hi regs, but this
664	 *     doesn't seem to match what the hw actually does.
665	inst[1] |= 0x04000000; << breaks things.. .u32 by default?
666	 */
667	set_dst(pc, dst, e);
668	set_src_0(pc, src0, e);
669	set_src_1(pc, src1, e);
670	emit(pc, e);
671
672	/* cvt.f32.u32 */
673	e = exec(pc);
674	e->inst[0] = 0xa0000001;
675	e->inst[1] = 0x64014780;
676	set_dst(pc, rdst, e);
677	set_src_0(pc, dst, e);
678	emit(pc, e);
679
680	if (dst != rdst)
681		free_temp(pc, dst);
682}
683
684static void
685emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
686{
687	struct nv50_program_exec *e = exec(pc);
688
689	e->inst[0] = 0xa0000000; /* cvt */
690	set_long(pc, e);
691	e->inst[1] |= (6 << 29); /* cvt */
692	e->inst[1] |= 0x08000000; /* integer mode */
693	e->inst[1] |= 0x04000000; /* 32 bit */
694	e->inst[1] |= ((0x1 << 3)) << 14; /* .rn */
695	e->inst[1] |= (1 << 14); /* src .f32 */
696	set_dst(pc, dst, e);
697	set_src_0(pc, src, e);
698
699	emit(pc, e);
700}
701
702static void
703emit_pow(struct nv50_pc *pc, struct nv50_reg *dst,
704	 struct nv50_reg *v, struct nv50_reg *e)
705{
706	struct nv50_reg *temp = alloc_temp(pc, NULL);
707
708	emit_flop(pc, 3, temp, v);
709	emit_mul(pc, temp, temp, e);
710	emit_preex2(pc, temp, temp);
711	emit_flop(pc, 6, dst, temp);
712
713	free_temp(pc, temp);
714}
715
716static void
717emit_abs(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
718{
719	struct nv50_program_exec *e = exec(pc);
720
721	e->inst[0] = 0xa0000000; /* cvt */
722	set_long(pc, e);
723	e->inst[1] |= (6 << 29); /* cvt */
724	e->inst[1] |= 0x04000000; /* 32 bit */
725	e->inst[1] |= (1 << 14); /* src .f32 */
726	e->inst[1] |= ((1 << 6) << 14); /* .abs */
727	set_dst(pc, dst, e);
728	set_src_0(pc, src, e);
729
730	emit(pc, e);
731}
732
733static void
734emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
735	 struct nv50_reg **src)
736{
737	struct nv50_reg *one = alloc_immd(pc, 1.0);
738	struct nv50_reg *zero = alloc_immd(pc, 0.0);
739	struct nv50_reg *neg128 = alloc_immd(pc, -127.999999);
740	struct nv50_reg *pos128 = alloc_immd(pc,  127.999999);
741	struct nv50_reg *tmp[4];
742
743	if (mask & (1 << 0))
744		emit_mov(pc, dst[0], one);
745
746	if (mask & (1 << 3))
747		emit_mov(pc, dst[3], one);
748
749	if (mask & (3 << 1)) {
750		if (mask & (1 << 1))
751			tmp[0] = dst[1];
752		else
753			tmp[0] = temp_temp(pc);
754		emit_minmax(pc, 4, tmp[0], src[0], zero);
755	}
756
757	if (mask & (1 << 2)) {
758		set_pred_wr(pc, 1, 0, pc->p->exec_tail);
759
760		tmp[1] = temp_temp(pc);
761		emit_minmax(pc, 4, tmp[1], src[1], zero);
762
763		tmp[3] = temp_temp(pc);
764		emit_minmax(pc, 4, tmp[3], src[3], neg128);
765		emit_minmax(pc, 5, tmp[3], tmp[3], pos128);
766
767		emit_pow(pc, dst[2], tmp[1], tmp[3]);
768		emit_mov(pc, dst[2], zero);
769		set_pred(pc, 3, 0, pc->p->exec_tail);
770	}
771}
772
773static void
774emit_neg(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
775{
776	struct nv50_program_exec *e = exec(pc);
777
778	set_long(pc, e);
779	e->inst[0] |= 0xa0000000; /* delta */
780	e->inst[1] |= (7 << 29); /* delta */
781	e->inst[1] |= 0x04000000; /* negate arg0? probably not */
782	e->inst[1] |= (1 << 14); /* src .f32 */
783	set_dst(pc, dst, e);
784	set_src_0(pc, src, e);
785
786	emit(pc, e);
787}
788
789static struct nv50_reg *
790tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
791{
792	switch (dst->DstRegister.File) {
793	case TGSI_FILE_TEMPORARY:
794		return &pc->temp[dst->DstRegister.Index * 4 + c];
795	case TGSI_FILE_OUTPUT:
796		return &pc->result[dst->DstRegister.Index * 4 + c];
797	case TGSI_FILE_NULL:
798		return NULL;
799	default:
800		break;
801	}
802
803	return NULL;
804}
805
806static struct nv50_reg *
807tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src)
808{
809	struct nv50_reg *r = NULL;
810	struct nv50_reg *temp;
811	unsigned c;
812
813	c = tgsi_util_get_full_src_register_extswizzle(src, chan);
814	switch (c) {
815	case TGSI_EXTSWIZZLE_X:
816	case TGSI_EXTSWIZZLE_Y:
817	case TGSI_EXTSWIZZLE_Z:
818	case TGSI_EXTSWIZZLE_W:
819		switch (src->SrcRegister.File) {
820		case TGSI_FILE_INPUT:
821			r = &pc->attr[src->SrcRegister.Index * 4 + c];
822			break;
823		case TGSI_FILE_TEMPORARY:
824			r = &pc->temp[src->SrcRegister.Index * 4 + c];
825			break;
826		case TGSI_FILE_CONSTANT:
827			r = &pc->param[src->SrcRegister.Index * 4 + c];
828			break;
829		case TGSI_FILE_IMMEDIATE:
830			r = &pc->immd[src->SrcRegister.Index * 4 + c];
831			break;
832		case TGSI_FILE_SAMPLER:
833			break;
834		default:
835			assert(0);
836			break;
837		}
838		break;
839	case TGSI_EXTSWIZZLE_ZERO:
840		r = alloc_immd(pc, 0.0);
841		break;
842	case TGSI_EXTSWIZZLE_ONE:
843		r = alloc_immd(pc, 1.0);
844		break;
845	default:
846		assert(0);
847		break;
848	}
849
850	switch (tgsi_util_get_full_src_register_sign_mode(src, chan)) {
851	case TGSI_UTIL_SIGN_KEEP:
852		break;
853	case TGSI_UTIL_SIGN_CLEAR:
854		temp = temp_temp(pc);
855		emit_abs(pc, temp, r);
856		r = temp;
857		break;
858	case TGSI_UTIL_SIGN_TOGGLE:
859		temp = temp_temp(pc);
860		emit_neg(pc, temp, r);
861		r = temp;
862		break;
863	case TGSI_UTIL_SIGN_SET:
864		temp = temp_temp(pc);
865		emit_abs(pc, temp, r);
866		emit_neg(pc, temp, r);
867		r = temp;
868		break;
869	default:
870		assert(0);
871		break;
872	}
873
874	return r;
875}
876
877static boolean
878nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
879{
880	const struct tgsi_full_instruction *inst = &tok->FullInstruction;
881	struct nv50_reg *rdst[4], *dst[4], *src[3][4], *temp;
882	unsigned mask, sat;
883	int i, c;
884
885	NOUVEAU_ERR("insn %p\n", tok);
886
887	mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
888	sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE;
889
890	for (c = 0; c < 4; c++) {
891		if (mask & (1 << c))
892			dst[c] = tgsi_dst(pc, c, &inst->FullDstRegisters[0]);
893		else
894			dst[c] = NULL;
895	}
896
897	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
898		for (c = 0; c < 4; c++)
899			src[i][c] = tgsi_src(pc, c, &inst->FullSrcRegisters[i]);
900	}
901
902	if (sat) {
903		for (c = 0; c < 4; c++) {
904			rdst[c] = dst[c];
905			dst[c] = temp_temp(pc);
906		}
907	}
908
909	switch (inst->Instruction.Opcode) {
910	case TGSI_OPCODE_ABS:
911		for (c = 0; c < 4; c++) {
912			if (!(mask & (1 << c)))
913				continue;
914			emit_abs(pc, dst[c], src[0][c]);
915		}
916		break;
917	case TGSI_OPCODE_ADD:
918		for (c = 0; c < 4; c++) {
919			if (!(mask & (1 << c)))
920				continue;
921			emit_add(pc, dst[c], src[0][c], src[1][c]);
922		}
923		break;
924	case TGSI_OPCODE_COS:
925		temp = alloc_temp(pc, NULL);
926		emit_precossin(pc, temp, src[0][0]);
927		emit_flop(pc, 5, temp, temp);
928		for (c = 0; c < 4; c++) {
929			if (!(mask & (1 << c)))
930				continue;
931			emit_mov(pc, dst[c], temp);
932		}
933		break;
934	case TGSI_OPCODE_DP3:
935		temp = alloc_temp(pc, NULL);
936		emit_mul(pc, temp, src[0][0], src[1][0]);
937		emit_mad(pc, temp, src[0][1], src[1][1], temp);
938		emit_mad(pc, temp, src[0][2], src[1][2], temp);
939		for (c = 0; c < 4; c++) {
940			if (!(mask & (1 << c)))
941				continue;
942			emit_mov(pc, dst[c], temp);
943		}
944		free_temp(pc, temp);
945		break;
946	case TGSI_OPCODE_DP4:
947		temp = alloc_temp(pc, NULL);
948		emit_mul(pc, temp, src[0][0], src[1][0]);
949		emit_mad(pc, temp, src[0][1], src[1][1], temp);
950		emit_mad(pc, temp, src[0][2], src[1][2], temp);
951		emit_mad(pc, temp, src[0][3], src[1][3], temp);
952		for (c = 0; c < 4; c++) {
953			if (!(mask & (1 << c)))
954				continue;
955			emit_mov(pc, dst[c], temp);
956		}
957		free_temp(pc, temp);
958		break;
959	case TGSI_OPCODE_DPH:
960		temp = alloc_temp(pc, NULL);
961		emit_mul(pc, temp, src[0][0], src[1][0]);
962		emit_mad(pc, temp, src[0][1], src[1][1], temp);
963		emit_mad(pc, temp, src[0][2], src[1][2], temp);
964		emit_add(pc, temp, src[1][3], temp);
965		for (c = 0; c < 4; c++) {
966			if (!(mask & (1 << c)))
967				continue;
968			emit_mov(pc, dst[c], temp);
969		}
970		free_temp(pc, temp);
971		break;
972	case TGSI_OPCODE_DST:
973	{
974		struct nv50_reg *one = alloc_immd(pc, 1.0);
975		if (mask & (1 << 0))
976			emit_mov(pc, dst[0], one);
977		if (mask & (1 << 1))
978			emit_mul(pc, dst[1], src[0][1], src[1][1]);
979		if (mask & (1 << 2))
980			emit_mov(pc, dst[2], src[0][2]);
981		if (mask & (1 << 3))
982			emit_mov(pc, dst[3], src[1][3]);
983		FREE(one);
984	}
985		break;
986	case TGSI_OPCODE_EX2:
987		temp = alloc_temp(pc, NULL);
988		emit_preex2(pc, temp, src[0][0]);
989		emit_flop(pc, 6, temp, temp);
990		for (c = 0; c < 4; c++) {
991			if (!(mask & (1 << c)))
992				continue;
993			emit_mov(pc, dst[c], temp);
994		}
995		free_temp(pc, temp);
996		break;
997	case TGSI_OPCODE_FLR:
998		for (c = 0; c < 4; c++) {
999			if (!(mask & (1 << c)))
1000				continue;
1001			emit_flr(pc, dst[c], src[0][c]);
1002		}
1003		break;
1004	case TGSI_OPCODE_FRC:
1005		temp = alloc_temp(pc, NULL);
1006		for (c = 0; c < 4; c++) {
1007			if (!(mask & (1 << c)))
1008				continue;
1009			emit_flr(pc, temp, src[0][c]);
1010			emit_sub(pc, dst[c], src[0][c], temp);
1011		}
1012		free_temp(pc, temp);
1013		break;
1014	case TGSI_OPCODE_LIT:
1015		emit_lit(pc, &dst[0], mask, &src[0][0]);
1016		break;
1017	case TGSI_OPCODE_LG2:
1018		temp = alloc_temp(pc, NULL);
1019		emit_flop(pc, 3, temp, src[0][0]);
1020		for (c = 0; c < 4; c++) {
1021			if (!(mask & (1 << c)))
1022				continue;
1023			emit_mov(pc, dst[c], temp);
1024		}
1025		break;
1026	case TGSI_OPCODE_LRP:
1027		for (c = 0; c < 4; c++) {
1028			if (!(mask & (1 << c)))
1029				continue;
1030			/*XXX: we can do better than this */
1031			temp = alloc_temp(pc, NULL);
1032			emit_neg(pc, temp, src[0][c]);
1033			emit_mad(pc, temp, temp, src[2][c], src[2][c]);
1034			emit_mad(pc, dst[c], src[0][c], src[1][c], temp);
1035			free_temp(pc, temp);
1036		}
1037		break;
1038	case TGSI_OPCODE_MAD:
1039		for (c = 0; c < 4; c++) {
1040			if (!(mask & (1 << c)))
1041				continue;
1042			emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]);
1043		}
1044		break;
1045	case TGSI_OPCODE_MAX:
1046		for (c = 0; c < 4; c++) {
1047			if (!(mask & (1 << c)))
1048				continue;
1049			emit_minmax(pc, 4, dst[c], src[0][c], src[1][c]);
1050		}
1051		break;
1052	case TGSI_OPCODE_MIN:
1053		for (c = 0; c < 4; c++) {
1054			if (!(mask & (1 << c)))
1055				continue;
1056			emit_minmax(pc, 5, dst[c], src[0][c], src[1][c]);
1057		}
1058		break;
1059	case TGSI_OPCODE_MOV:
1060		for (c = 0; c < 4; c++) {
1061			if (!(mask & (1 << c)))
1062				continue;
1063			emit_mov(pc, dst[c], src[0][c]);
1064		}
1065		break;
1066	case TGSI_OPCODE_MUL:
1067		for (c = 0; c < 4; c++) {
1068			if (!(mask & (1 << c)))
1069				continue;
1070			emit_mul(pc, dst[c], src[0][c], src[1][c]);
1071		}
1072		break;
1073	case TGSI_OPCODE_POW:
1074		temp = alloc_temp(pc, NULL);
1075		emit_pow(pc, temp, src[0][0], src[1][0]);
1076		for (c = 0; c < 4; c++) {
1077			if (!(mask & (1 << c)))
1078				continue;
1079			emit_mov(pc, dst[c], temp);
1080		}
1081		free_temp(pc, temp);
1082		break;
1083	case TGSI_OPCODE_RCP:
1084		for (c = 0; c < 4; c++) {
1085			if (!(mask & (1 << c)))
1086				continue;
1087			emit_flop(pc, 0, dst[c], src[0][0]);
1088		}
1089		break;
1090	case TGSI_OPCODE_RSQ:
1091		for (c = 0; c < 4; c++) {
1092			if (!(mask & (1 << c)))
1093				continue;
1094			emit_flop(pc, 2, dst[c], src[0][0]);
1095		}
1096		break;
1097	case TGSI_OPCODE_SCS:
1098		temp = alloc_temp(pc, NULL);
1099		emit_precossin(pc, temp, src[0][0]);
1100		if (mask & (1 << 0))
1101			emit_flop(pc, 5, dst[0], temp);
1102		if (mask & (1 << 1))
1103			emit_flop(pc, 4, dst[1], temp);
1104		break;
1105	case TGSI_OPCODE_SGE:
1106		for (c = 0; c < 4; c++) {
1107			if (!(mask & (1 << c)))
1108				continue;
1109			emit_set(pc, 6, dst[c], src[0][c], src[1][c]);
1110		}
1111		break;
1112	case TGSI_OPCODE_SIN:
1113		temp = alloc_temp(pc, NULL);
1114		emit_precossin(pc, temp, src[0][0]);
1115		emit_flop(pc, 4, temp, temp);
1116		for (c = 0; c < 4; c++) {
1117			if (!(mask & (1 << c)))
1118				continue;
1119			emit_mov(pc, dst[c], temp);
1120		}
1121		break;
1122	case TGSI_OPCODE_SLT:
1123		for (c = 0; c < 4; c++) {
1124			if (!(mask & (1 << c)))
1125				continue;
1126			emit_set(pc, 1, dst[c], src[0][c], src[1][c]);
1127		}
1128		break;
1129	case TGSI_OPCODE_SUB:
1130		for (c = 0; c < 4; c++) {
1131			if (!(mask & (1 << c)))
1132				continue;
1133			emit_sub(pc, dst[c], src[0][c], src[1][c]);
1134		}
1135		break;
1136	case TGSI_OPCODE_TEX:
1137		break;
1138	case TGSI_OPCODE_XPD:
1139		temp = alloc_temp(pc, NULL);
1140		if (mask & (1 << 0)) {
1141			emit_mul(pc, temp, src[0][2], src[1][1]);
1142			emit_msb(pc, dst[0], src[0][1], src[1][2], temp);
1143		}
1144		if (mask & (1 << 1)) {
1145			emit_mul(pc, temp, src[0][0], src[1][2]);
1146			emit_msb(pc, dst[1], src[0][2], src[1][0], temp);
1147		}
1148		if (mask & (1 << 2)) {
1149			emit_mul(pc, temp, src[0][1], src[1][0]);
1150			emit_msb(pc, dst[2], src[0][0], src[1][1], temp);
1151		}
1152		free_temp(pc, temp);
1153		break;
1154	case TGSI_OPCODE_END:
1155		break;
1156	default:
1157		NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode);
1158		return FALSE;
1159	}
1160
1161	if (sat) {
1162		for (c = 0; c < 4; c++) {
1163			struct nv50_program_exec *e;
1164
1165			if (!(mask & (1 << c)))
1166				continue;
1167			e = exec(pc);
1168
1169			e->inst[0] = 0xa0000000; /* cvt */
1170			set_long(pc, e);
1171			e->inst[1] |= (6 << 29); /* cvt */
1172			e->inst[1] |= 0x04000000; /* 32 bit */
1173			e->inst[1] |= (1 << 14); /* src .f32 */
1174			e->inst[1] |= ((1 << 5) << 14); /* .sat */
1175			set_dst(pc, rdst[c], e);
1176			set_src_0(pc, dst[c], e);
1177			emit(pc, e);
1178		}
1179	}
1180
1181	kill_temp_temp(pc);
1182	return TRUE;
1183}
1184
1185static boolean
1186nv50_program_tx_prep(struct nv50_pc *pc)
1187{
1188	struct tgsi_parse_context p;
1189	boolean ret = FALSE;
1190	unsigned i, c;
1191
1192	tgsi_parse_init(&p, pc->p->pipe.tokens);
1193	while (!tgsi_parse_end_of_tokens(&p)) {
1194		const union tgsi_full_token *tok = &p.FullToken;
1195
1196		tgsi_parse_token(&p);
1197		switch (tok->Token.Type) {
1198		case TGSI_TOKEN_TYPE_IMMEDIATE:
1199		{
1200			const struct tgsi_full_immediate *imm =
1201				&p.FullToken.FullImmediate;
1202
1203			ctor_immd(pc, imm->u.ImmediateFloat32[0].Float,
1204				      imm->u.ImmediateFloat32[1].Float,
1205				      imm->u.ImmediateFloat32[2].Float,
1206				      imm->u.ImmediateFloat32[3].Float);
1207		}
1208			break;
1209		case TGSI_TOKEN_TYPE_DECLARATION:
1210		{
1211			const struct tgsi_full_declaration *d;
1212			unsigned last;
1213
1214			d = &p.FullToken.FullDeclaration;
1215			last = d->u.DeclarationRange.Last;
1216
1217			switch (d->Declaration.File) {
1218			case TGSI_FILE_TEMPORARY:
1219				if (pc->temp_nr < (last + 1))
1220					pc->temp_nr = last + 1;
1221				break;
1222			case TGSI_FILE_OUTPUT:
1223				if (pc->result_nr < (last + 1))
1224					pc->result_nr = last + 1;
1225				break;
1226			case TGSI_FILE_INPUT:
1227				if (pc->attr_nr < (last + 1))
1228					pc->attr_nr = last + 1;
1229				break;
1230			case TGSI_FILE_CONSTANT:
1231				if (pc->param_nr < (last + 1))
1232					pc->param_nr = last + 1;
1233				break;
1234			case TGSI_FILE_SAMPLER:
1235				break;
1236			default:
1237				NOUVEAU_ERR("bad decl file %d\n",
1238					    d->Declaration.File);
1239				goto out_err;
1240			}
1241		}
1242			break;
1243		case TGSI_TOKEN_TYPE_INSTRUCTION:
1244			break;
1245		default:
1246			break;
1247		}
1248	}
1249
1250	NOUVEAU_ERR("%d temps\n", pc->temp_nr);
1251	if (pc->temp_nr) {
1252		pc->temp = calloc(pc->temp_nr * 4, sizeof(struct nv50_reg));
1253		if (!pc->temp)
1254			goto out_err;
1255
1256		for (i = 0; i < pc->temp_nr; i++) {
1257			for (c = 0; c < 4; c++) {
1258				pc->temp[i*4+c].type = P_TEMP;
1259				pc->temp[i*4+c].hw = -1;
1260				pc->temp[i*4+c].index = i;
1261			}
1262		}
1263	}
1264
1265	NOUVEAU_ERR("%d attrib regs\n", pc->attr_nr);
1266	if (pc->attr_nr) {
1267		struct nv50_reg *iv = NULL;
1268		int aid = 0;
1269
1270		pc->attr = calloc(pc->attr_nr * 4, sizeof(struct nv50_reg));
1271		if (!pc->attr)
1272			goto out_err;
1273
1274		if (pc->p->type == PIPE_SHADER_FRAGMENT) {
1275			iv = alloc_temp(pc, NULL);
1276			emit_interp(pc, iv, iv, iv, FALSE);
1277			emit_flop(pc, 0, iv, iv);
1278			aid++;
1279		}
1280
1281		for (i = 0; i < pc->attr_nr; i++) {
1282			struct nv50_reg *a = &pc->attr[i*4];
1283
1284			for (c = 0; c < 4; c++) {
1285				if (pc->p->type == PIPE_SHADER_FRAGMENT) {
1286					struct nv50_reg *at =
1287						alloc_temp(pc, NULL);
1288					pc->attr[i*4+c].type = at->type;
1289					pc->attr[i*4+c].hw = at->hw;
1290					pc->attr[i*4+c].index = at->index;
1291				} else {
1292					pc->p->cfg.vp.attr[aid/32] |=
1293						(1 << (aid % 32));
1294					pc->attr[i*4+c].type = P_ATTR;
1295					pc->attr[i*4+c].hw = aid++;
1296					pc->attr[i*4+c].index = i;
1297				}
1298			}
1299
1300			if (pc->p->type != PIPE_SHADER_FRAGMENT)
1301				continue;
1302
1303			emit_interp(pc, &a[0], &a[0], iv, TRUE);
1304			emit_interp(pc, &a[1], &a[1], iv, TRUE);
1305			emit_interp(pc, &a[2], &a[2], iv, TRUE);
1306			emit_interp(pc, &a[3], &a[3], iv, TRUE);
1307		}
1308
1309		if (iv)
1310			free_temp(pc, iv);
1311	}
1312
1313	NOUVEAU_ERR("%d result regs\n", pc->result_nr);
1314	if (pc->result_nr) {
1315		int rid = 0;
1316
1317		pc->result = calloc(pc->result_nr * 4, sizeof(struct nv50_reg));
1318		if (!pc->result)
1319			goto out_err;
1320
1321		for (i = 0; i < pc->result_nr; i++) {
1322			for (c = 0; c < 4; c++) {
1323				if (pc->p->type == PIPE_SHADER_FRAGMENT) {
1324					pc->result[i*4+c].type = P_TEMP;
1325					pc->result[i*4+c].hw = -1;
1326				} else {
1327					pc->result[i*4+c].type = P_RESULT;
1328					pc->result[i*4+c].hw = rid++;
1329				}
1330				pc->result[i*4+c].index = i;
1331			}
1332		}
1333	}
1334
1335	NOUVEAU_ERR("%d param regs\n", pc->param_nr);
1336	if (pc->param_nr) {
1337		int rid = 0;
1338
1339		pc->param = calloc(pc->param_nr * 4, sizeof(struct nv50_reg));
1340		if (!pc->param)
1341			goto out_err;
1342
1343		for (i = 0; i < pc->param_nr; i++) {
1344			for (c = 0; c < 4; c++) {
1345				pc->param[i*4+c].type = P_CONST;
1346				pc->param[i*4+c].hw = rid++;
1347				pc->param[i*4+c].index = i;
1348			}
1349		}
1350	}
1351
1352	if (pc->immd_nr) {
1353		int rid = pc->param_nr * 4;
1354
1355		pc->immd = calloc(pc->immd_nr * 4, sizeof(struct nv50_reg));
1356		if (!pc->immd)
1357			goto out_err;
1358
1359		for (i = 0; i < pc->immd_nr; i++) {
1360			for (c = 0; c < 4; c++) {
1361				pc->immd[i*4+c].type = P_IMMD;
1362				pc->immd[i*4+c].hw = rid++;
1363				pc->immd[i*4+c].index = i;
1364			}
1365		}
1366	}
1367
1368	ret = TRUE;
1369out_err:
1370	tgsi_parse_free(&p);
1371	return ret;
1372}
1373
1374static boolean
1375nv50_program_tx(struct nv50_program *p)
1376{
1377	struct tgsi_parse_context parse;
1378	struct nv50_pc *pc;
1379	boolean ret;
1380
1381	pc = CALLOC_STRUCT(nv50_pc);
1382	if (!pc)
1383		return FALSE;
1384	pc->p = p;
1385	pc->p->cfg.high_temp = 4;
1386
1387	ret = nv50_program_tx_prep(pc);
1388	if (ret == FALSE)
1389		goto out_cleanup;
1390
1391	tgsi_parse_init(&parse, pc->p->pipe.tokens);
1392	while (!tgsi_parse_end_of_tokens(&parse)) {
1393		const union tgsi_full_token *tok = &parse.FullToken;
1394
1395		tgsi_parse_token(&parse);
1396
1397		switch (tok->Token.Type) {
1398		case TGSI_TOKEN_TYPE_INSTRUCTION:
1399			ret = nv50_program_tx_insn(pc, tok);
1400			if (ret == FALSE)
1401				goto out_err;
1402			break;
1403		default:
1404			break;
1405		}
1406	}
1407
1408	if (p->type == PIPE_SHADER_FRAGMENT) {
1409		struct nv50_reg out;
1410
1411		out.type = P_TEMP;
1412		for (out.hw = 0; out.hw < pc->result_nr * 4; out.hw++)
1413			emit_mov(pc, &out, &pc->result[out.hw]);
1414	}
1415
1416	assert(is_long(pc->p->exec_tail) && !is_immd(pc->p->exec_head));
1417	pc->p->exec_tail->inst[1] |= 0x00000001;
1418
1419	p->param_nr = pc->param_nr * 4;
1420	p->immd_nr = pc->immd_nr * 4;
1421	p->immd = pc->immd_buf;
1422
1423out_err:
1424	tgsi_parse_free(&parse);
1425
1426out_cleanup:
1427	return ret;
1428}
1429
1430static void
1431nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p)
1432{
1433	if (nv50_program_tx(p) == FALSE)
1434		assert(0);
1435	p->translated = TRUE;
1436}
1437
1438static void
1439nv50_program_upload_data(struct nv50_context *nv50, float *map,
1440			 unsigned start, unsigned count)
1441{
1442	while (count) {
1443		unsigned nr = count > 2047 ? 2047 : count;
1444
1445		BEGIN_RING(tesla, 0x00000f00, 1);
1446		OUT_RING  ((NV50_CB_PMISC << 0) | (start << 8));
1447		BEGIN_RING(tesla, 0x40000f04, nr);
1448		OUT_RINGp (map, nr);
1449
1450		map += nr;
1451		start += nr;
1452		count -= nr;
1453	}
1454}
1455
1456static void
1457nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)
1458{
1459	struct nouveau_winsys *nvws = nv50->screen->nvws;
1460	struct pipe_winsys *ws = nv50->pipe.winsys;
1461	unsigned nr = p->param_nr + p->immd_nr;
1462
1463	if (!p->data && nr) {
1464		struct nouveau_resource *heap = nv50->screen->vp_data_heap;
1465
1466		if (nvws->res_alloc(heap, nr, p, &p->data)) {
1467			while (heap->next && heap->size < nr) {
1468				struct nv50_program *evict = heap->next->priv;
1469				nvws->res_free(&evict->data);
1470			}
1471
1472			if (nvws->res_alloc(heap, nr, p, &p->data))
1473				assert(0);
1474		}
1475	}
1476
1477	if (p->param_nr) {
1478		float *map = ws->buffer_map(ws, nv50->constbuf[p->type],
1479					    PIPE_BUFFER_USAGE_CPU_READ);
1480		nv50_program_upload_data(nv50, map, p->data->start,
1481					 p->param_nr);
1482		ws->buffer_unmap(ws, nv50->constbuf[p->type]);
1483	}
1484
1485	if (p->immd_nr) {
1486		nv50_program_upload_data(nv50, p->immd,
1487					 p->data->start + p->param_nr,
1488					 p->immd_nr);
1489	}
1490}
1491
1492static void
1493nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
1494{
1495	struct pipe_winsys *ws = nv50->pipe.winsys;
1496	struct nv50_program_exec *e;
1497	struct nouveau_stateobj *so;
1498	const unsigned flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_WR;
1499	unsigned start, count, *up, *ptr;
1500	boolean upload = FALSE;
1501
1502	if (!p->buffer) {
1503		p->buffer = ws->buffer_create(ws, 0x100, 0, p->exec_size * 4);
1504		upload = TRUE;
1505	}
1506
1507	if (p->data && p->data->start != p->data_start) {
1508		for (e = p->exec_head; e; e = e->next) {
1509			unsigned ei, ci;
1510
1511			if (e->param.index < 0)
1512				continue;
1513			ei = e->param.shift >> 5;
1514			ci = e->param.index + p->data->start;
1515
1516			e->inst[ei] &= ~e->param.mask;
1517			e->inst[ei] |= (ci << e->param.shift);
1518		}
1519
1520		p->data_start = p->data->start;
1521		upload = TRUE;
1522	}
1523
1524	if (!upload)
1525		return FALSE;
1526
1527	up = ptr = MALLOC(p->exec_size * 4);
1528	for (e = p->exec_head; e; e = e->next) {
1529		*(ptr++) = e->inst[0];
1530		if (is_long(e))
1531			*(ptr++) = e->inst[1];
1532	}
1533
1534	so = so_new(3,2);
1535	so_method(so, nv50->screen->tesla, 0x1280, 3);
1536	so_reloc (so, p->buffer, 0, flags | NOUVEAU_BO_HIGH, 0, 0);
1537	so_reloc (so, p->buffer, 0, flags | NOUVEAU_BO_LOW, 0, 0);
1538	so_data  (so, (NV50_CB_PUPLOAD << 16) | 0x0800); //(p->exec_size * 4));
1539
1540	start = 0; count = p->exec_size;
1541	while (count) {
1542		struct nouveau_winsys *nvws = nv50->screen->nvws;
1543		unsigned nr;
1544
1545		so_emit(nvws, so);
1546
1547		nr = MIN2(count, 2047);
1548		nr = MIN2(nvws->channel->pushbuf->remaining, nr);
1549		if (nvws->channel->pushbuf->remaining < (nr + 3)) {
1550			FIRE_RING(NULL);
1551			continue;
1552		}
1553
1554		BEGIN_RING(tesla, 0x0f00, 1);
1555		OUT_RING  ((start << 8) | NV50_CB_PUPLOAD);
1556		BEGIN_RING(tesla, 0x40000f04, nr);
1557		OUT_RINGp (up + start, nr);
1558
1559		start += nr;
1560		count -= nr;
1561	}
1562
1563	FREE(up);
1564	so_ref(NULL, &so);
1565}
1566
1567void
1568nv50_vertprog_validate(struct nv50_context *nv50)
1569{
1570	struct nouveau_grobj *tesla = nv50->screen->tesla;
1571	struct nv50_program *p = nv50->vertprog;
1572	struct nouveau_stateobj *so;
1573
1574	if (!p->translated) {
1575		nv50_program_validate(nv50, p);
1576		if (!p->translated)
1577			assert(0);
1578	}
1579
1580	nv50_program_validate_data(nv50, p);
1581	nv50_program_validate_code(nv50, p);
1582
1583	so = so_new(11, 2);
1584	so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2);
1585	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
1586		  NOUVEAU_BO_HIGH, 0, 0);
1587	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
1588		  NOUVEAU_BO_LOW, 0, 0);
1589	so_method(so, tesla, 0x1650, 2);
1590	so_data  (so, p->cfg.vp.attr[0]);
1591	so_data  (so, p->cfg.vp.attr[1]);
1592	so_method(so, tesla, 0x16b8, 1);
1593	so_data  (so, p->cfg.high_result);
1594	so_method(so, tesla, 0x16ac, 2);
1595	so_data  (so, 8);
1596	so_data  (so, p->cfg.high_temp);
1597	so_method(so, tesla, 0x140c, 1);
1598	so_data  (so, 0); /* program start offset */
1599	so_emit(nv50->screen->nvws, so);
1600	so_ref(NULL, &so);
1601}
1602
1603void
1604nv50_fragprog_validate(struct nv50_context *nv50)
1605{
1606	struct nouveau_grobj *tesla = nv50->screen->tesla;
1607	struct nv50_program *p = nv50->fragprog;
1608	struct nouveau_stateobj *so;
1609
1610	if (!p->translated) {
1611		nv50_program_validate(nv50, p);
1612		if (!p->translated)
1613			assert(0);
1614	}
1615
1616	nv50_program_validate_data(nv50, p);
1617	nv50_program_validate_code(nv50, p);
1618
1619	so = so_new(64, 2);
1620	so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2);
1621	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
1622		  NOUVEAU_BO_HIGH, 0, 0);
1623	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
1624		  NOUVEAU_BO_LOW, 0, 0);
1625	so_method(so, tesla, 0x1904, 4);
1626	so_data  (so, 0x01040404); /* p: 0x01000404 */
1627	so_data  (so, 0x00000004);
1628	so_data  (so, 0x00000000);
1629	so_data  (so, 0x00000000);
1630	so_method(so, tesla, 0x16bc, 3); /*XXX: fixme */
1631	so_data  (so, 0x03020100);
1632	so_data  (so, 0x07060504);
1633	so_data  (so, 0x0b0a0908);
1634	so_method(so, tesla, 0x1988, 2);
1635	so_data  (so, 0x08040404); /* p: 0x0f000401 */
1636	so_data  (so, p->cfg.high_temp);
1637	so_method(so, tesla, 0x1414, 1);
1638	so_data  (so, 0); /* program start offset */
1639	so_emit(nv50->screen->nvws, so);
1640	so_ref(NULL, &so);
1641}
1642
1643void
1644nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
1645{
1646	struct pipe_winsys *ws = nv50->pipe.winsys;
1647
1648	while (p->exec_head) {
1649		struct nv50_program_exec *e = p->exec_head;
1650
1651		p->exec_head = e->next;
1652		FREE(e);
1653	}
1654	p->exec_tail = NULL;
1655	p->exec_size = 0;
1656
1657	if (p->buffer)
1658		pipe_buffer_reference(ws, &p->buffer, NULL);
1659
1660	p->translated = 0;
1661}
1662
1663