nv50_program.c revision 229992d2812581ffae24d69a5a983d2c8441f720
1/*
2 * Copyright 2008 Ben Skeggs
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20 * SOFTWARE.
21 */
22
23#include "pipe/p_context.h"
24#include "pipe/p_defines.h"
25#include "pipe/p_state.h"
26#include "pipe/p_inlines.h"
27
28#include "pipe/p_shader_tokens.h"
29#include "tgsi/tgsi_parse.h"
30#include "tgsi/tgsi_util.h"
31
32#include "nv50_context.h"
33
34#define NV50_SU_MAX_TEMP 64
35//#define NV50_PROGRAM_DUMP
36
37/* ARL - gallium craps itself on progs/vp/arl.txt
38 *
39 * MSB - Like MAD, but MUL+SUB
40 * 	- Fuck it off, introduce a way to negate args for ops that
41 * 	  support it.
42 *
43 * Look into inlining IMMD for ops other than MOV (make it general?)
44 * 	- Maybe even relax restrictions a bit, can't do P_RESULT + P_IMMD,
45 * 	  but can emit to P_TEMP first - then MOV later. NVIDIA does this
46 *
47 * In ops such as ADD it's possible to construct a bad opcode in the !is_long()
48 * case, if the emit_src() causes the inst to suddenly become long.
49 *
50 * Verify half-insns work where expected - and force disable them where they
51 * don't work - MUL has it forcibly disabled atm as it fixes POW..
52 *
53 * FUCK! watch dst==src vectors, can overwrite components that are needed.
54 * 	ie. SUB R0, R0.yzxw, R0
55 *
56 * Things to check with renouveau:
57 * 	FP attr/result assignment - how?
58 * 		attrib
59 * 			- 0x16bc maps vp output onto fp hpos
60 * 			- 0x16c0 maps vp output onto fp col0
61 * 		result
62 * 			- colr always 0-3
63 * 			- depr always 4
64 * 0x16bc->0x16e8 --> some binding between vp/fp regs
65 * 0x16b8 --> VP output count
66 *
67 * 0x1298 --> "MOV rcol.x, fcol.y" "MOV depr, fcol.y" = 0x00000005
68 * 	      "MOV rcol.x, fcol.y" = 0x00000004
69 * 0x19a8 --> as above but 0x00000100 and 0x00000000
70 * 	- 0x00100000 used when KIL used
71 * 0x196c --> as above but 0x00000011 and 0x00000000
72 *
73 * 0x1988 --> 0xXXNNNNNN
74 * 	- XX == FP high something
75 */
/*
 * One scalar register as handled by the code generator.
 *
 * type  - register file the value belongs to.
 * index - -1 marks registers created on the fly (alloc_temp(),
 *         alloc_temp4(), alloc_immd()); only those are released by
 *         free_temp() / the per-instruction cleanup.  Other values are
 *         assigned outside this part of the file.
 * hw    - hardware slot.  For P_TEMP, -1 means "no slot assigned yet"
 *         (see alloc_reg()).  For P_IMMD it is the offset, in floats,
 *         into nv50_pc::immd_buf.
 * neg   - not referenced by the code visible here.
 */
struct nv50_reg {
	enum {
		P_TEMP   = 0,
		P_ATTR   = 1,
		P_RESULT = 2,
		P_CONST  = 3,
		P_IMMD   = 4
	} type;
	int index;

	int hw;
	int neg;
};
89
90struct nv50_pc {
91	struct nv50_program *p;
92
93	/* hw resources */
94	struct nv50_reg *r_temp[NV50_SU_MAX_TEMP];
95
96	/* tgsi resources */
97	struct nv50_reg *temp;
98	int temp_nr;
99	struct nv50_reg *attr;
100	int attr_nr;
101	struct nv50_reg *result;
102	int result_nr;
103	struct nv50_reg *param;
104	int param_nr;
105	struct nv50_reg *immd;
106	float *immd_buf;
107	int immd_nr;
108
109	struct nv50_reg *temp_temp[16];
110	unsigned temp_temp_nr;
111};
112
113static void
114alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
115{
116	int i;
117
118	if (reg->type == P_RESULT) {
119		if (pc->p->cfg.high_result < (reg->hw + 1))
120			pc->p->cfg.high_result = reg->hw + 1;
121	}
122
123	if (reg->type != P_TEMP)
124		return;
125
126	if (reg->hw >= 0) {
127		/*XXX: do this here too to catch FP temp-as-attr usage..
128		 *     not clean, but works */
129		if (pc->p->cfg.high_temp < (reg->hw + 1))
130			pc->p->cfg.high_temp = reg->hw + 1;
131		return;
132	}
133
134	for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
135		if (!(pc->r_temp[i])) {
136			pc->r_temp[i] = reg;
137			reg->hw = i;
138			if (pc->p->cfg.high_temp < (i + 1))
139				pc->p->cfg.high_temp = i + 1;
140			return;
141		}
142	}
143
144	assert(0);
145}
146
147static struct nv50_reg *
148alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
149{
150	struct nv50_reg *r;
151	int i;
152
153	if (dst && dst->type == P_TEMP && dst->hw == -1)
154		return dst;
155
156	for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
157		if (!pc->r_temp[i]) {
158			r = CALLOC_STRUCT(nv50_reg);
159			r->type = P_TEMP;
160			r->index = -1;
161			r->hw = i;
162			pc->r_temp[i] = r;
163			return r;
164		}
165	}
166
167	assert(0);
168	return NULL;
169}
170
171static void
172free_temp(struct nv50_pc *pc, struct nv50_reg *r)
173{
174	if (r->index == -1) {
175		unsigned hw = r->hw;
176
177		FREE(pc->r_temp[hw]);
178		pc->r_temp[hw] = NULL;
179	}
180}
181
182static int
183alloc_temp4(struct nv50_pc *pc, struct nv50_reg *dst[4], int idx)
184{
185	int i;
186
187	if ((idx + 4) >= NV50_SU_MAX_TEMP)
188		return 1;
189
190	if (pc->r_temp[idx] || pc->r_temp[idx + 1] ||
191	    pc->r_temp[idx + 2] || pc->r_temp[idx + 3])
192		return alloc_temp4(pc, dst, idx + 1);
193
194	for (i = 0; i < 4; i++) {
195		dst[i] = CALLOC_STRUCT(nv50_reg);
196		dst[i]->type = P_TEMP;
197		dst[i]->index = -1;
198		dst[i]->hw = idx + i;
199		pc->r_temp[idx + i] = dst[i];
200	}
201
202	return 0;
203}
204
/*
 * Release the four registers of a quad, in order reg[0] .. reg[3],
 * by handing each one to free_temp().
 */
static void
free_temp4(struct nv50_pc *pc, struct nv50_reg *reg[4])
{
	struct nv50_reg **cur = reg;
	struct nv50_reg **end = reg + 4;

	while (cur != end)
		free_temp(pc, *cur++);
}
213
214static struct nv50_reg *
215temp_temp(struct nv50_pc *pc)
216{
217	if (pc->temp_temp_nr >= 16)
218		assert(0);
219
220	pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL);
221	return pc->temp_temp[pc->temp_temp_nr++];
222}
223
224static void
225kill_temp_temp(struct nv50_pc *pc)
226{
227	int i;
228
229	for (i = 0; i < pc->temp_temp_nr; i++)
230		free_temp(pc, pc->temp_temp[i]);
231	pc->temp_temp_nr = 0;
232}
233
234static int
235ctor_immd(struct nv50_pc *pc, float x, float y, float z, float w)
236{
237	pc->immd_buf = REALLOC(pc->immd_buf, (pc->immd_nr * r * sizeof(float)),
238			       (pc->immd_nr + 1) * 4 * sizeof(float));
239	pc->immd_buf[(pc->immd_nr * 4) + 0] = x;
240	pc->immd_buf[(pc->immd_nr * 4) + 1] = y;
241	pc->immd_buf[(pc->immd_nr * 4) + 2] = z;
242	pc->immd_buf[(pc->immd_nr * 4) + 3] = w;
243
244	return pc->immd_nr++;
245}
246
247static struct nv50_reg *
248alloc_immd(struct nv50_pc *pc, float f)
249{
250	struct nv50_reg *r = CALLOC_STRUCT(nv50_reg);
251	unsigned hw;
252
253	hw = ctor_immd(pc, f, 0, 0, 0) * 4;
254	r->type = P_IMMD;
255	r->hw = hw;
256	r->index = -1;
257	return r;
258}
259
260static struct nv50_program_exec *
261exec(struct nv50_pc *pc)
262{
263	struct nv50_program_exec *e = CALLOC_STRUCT(nv50_program_exec);
264
265	e->param.index = -1;
266	return e;
267}
268
269static void
270emit(struct nv50_pc *pc, struct nv50_program_exec *e)
271{
272	struct nv50_program *p = pc->p;
273
274	if (p->exec_tail)
275		p->exec_tail->next = e;
276	if (!p->exec_head)
277		p->exec_head = e;
278	p->exec_tail = e;
279	p->exec_size += (e->inst[0] & 1) ? 2 : 1;
280}
281
282static INLINE void set_long(struct nv50_pc *, struct nv50_program_exec *);
283
284static boolean
285is_long(struct nv50_program_exec *e)
286{
287	if (e->inst[0] & 1)
288		return TRUE;
289	return FALSE;
290}
291
292static boolean
293is_immd(struct nv50_program_exec *e)
294{
295	if (is_long(e) && (e->inst[1] & 3) == 3)
296		return TRUE;
297	return FALSE;
298}
299
300static INLINE void
301set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx,
302	 struct nv50_program_exec *e)
303{
304	set_long(pc, e);
305	e->inst[1] &= ~((0x1f << 7) | (0x3 << 12));
306	e->inst[1] |= (pred << 7) | (idx << 12);
307}
308
309static INLINE void
310set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx,
311	    struct nv50_program_exec *e)
312{
313	set_long(pc, e);
314	e->inst[1] &= ~((0x3 << 4) | (1 << 6));
315	e->inst[1] |= (idx << 4) | (on << 6);
316}
317
318static INLINE void
319set_long(struct nv50_pc *pc, struct nv50_program_exec *e)
320{
321	if (is_long(e))
322		return;
323
324	e->inst[0] |= 1;
325	set_pred(pc, 0xf, 0, e);
326	set_pred_wr(pc, 0, 0, e);
327}
328
329static INLINE void
330set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e)
331{
332	if (dst->type == P_RESULT) {
333		set_long(pc, e);
334		e->inst[1] |= 0x00000008;
335	}
336
337	alloc_reg(pc, dst);
338	e->inst[0] |= (dst->hw << 2);
339}
340
341static INLINE void
342set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)
343{
344	unsigned val = fui(pc->immd_buf[imm->hw]); /* XXX */
345
346	set_long(pc, e);
347	/*XXX: can't be predicated - bits overlap.. catch cases where both
348	 *     are required and avoid them. */
349	set_pred(pc, 0, 0, e);
350	set_pred_wr(pc, 0, 0, e);
351
352	e->inst[1] |= 0x00000002 | 0x00000001;
353	e->inst[0] |= (val & 0x3f) << 16;
354	e->inst[1] |= (val >> 6) << 2;
355}
356
357static void
358emit_interp(struct nv50_pc *pc, struct nv50_reg *dst,
359	    struct nv50_reg *src, struct nv50_reg *iv)
360{
361	struct nv50_program_exec *e = exec(pc);
362
363	e->inst[0] |= 0x80000000;
364	set_dst(pc, dst, e);
365	alloc_reg(pc, src);
366	e->inst[0] |= (src->hw << 16);
367	if (iv) {
368		e->inst[0] |= (1 << 25);
369		alloc_reg(pc, iv);
370		e->inst[0] |= (iv->hw << 9);
371	}
372
373	emit(pc, e);
374}
375
376static void
377set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s,
378	 struct nv50_program_exec *e)
379{
380	set_long(pc, e);
381#if 1
382	e->inst[1] |= (1 << 22);
383#else
384	if (src->type == P_IMMD) {
385		e->inst[1] |= (NV50_CB_PMISC << 22);
386	} else {
387		if (pc->p->type == PIPE_SHADER_VERTEX)
388			e->inst[1] |= (NV50_CB_PVP << 22);
389		else
390			e->inst[1] |= (NV50_CB_PFP << 22);
391	}
392#endif
393
394	e->param.index = src->hw;
395	e->param.shift = s;
396	e->param.mask = m << (s % 32);
397}
398
399static void
400emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
401{
402	struct nv50_program_exec *e = exec(pc);
403
404	e->inst[0] |= 0x10000000;
405
406	set_dst(pc, dst, e);
407
408	if (0 && dst->type != P_RESULT && src->type == P_IMMD) {
409		set_immd(pc, src, e);
410		/*XXX: 32-bit, but steals part of "half" reg space - need to
411		 *     catch and handle this case if/when we do half-regs
412		 */
413		e->inst[0] |= 0x00008000;
414	} else
415	if (src->type == P_IMMD || src->type == P_CONST) {
416		set_long(pc, e);
417		set_data(pc, src, 0x7f, 9, e);
418		e->inst[1] |= 0x20000000; /* src0 const? */
419	} else {
420		if (src->type == P_ATTR) {
421			set_long(pc, e);
422			e->inst[1] |= 0x00200000;
423		}
424
425		alloc_reg(pc, src);
426		e->inst[0] |= (src->hw << 9);
427	}
428
429	/* We really should support "half" instructions here at some point,
430	 * but I don't feel confident enough about them yet.
431	 */
432	set_long(pc, e);
433	if (is_long(e) && !is_immd(e)) {
434		e->inst[1] |= 0x04000000; /* 32-bit */
435		e->inst[1] |= 0x0003c000; /* "subsubop" 0xf == mov */
436	}
437
438	emit(pc, e);
439}
440
441static boolean
442check_swap_src_0_1(struct nv50_pc *pc,
443		   struct nv50_reg **s0, struct nv50_reg **s1)
444{
445	struct nv50_reg *src0 = *s0, *src1 = *s1;
446
447	if (src0->type == P_CONST) {
448		if (src1->type != P_CONST) {
449			*s0 = src1;
450			*s1 = src0;
451			return TRUE;
452		}
453	} else
454	if (src1->type == P_ATTR) {
455		if (src0->type != P_ATTR) {
456			*s0 = src1;
457			*s1 = src0;
458			return TRUE;
459		}
460	}
461
462	return FALSE;
463}
464
465static void
466set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
467{
468	if (src->type == P_ATTR) {
469		set_long(pc, e);
470		e->inst[1] |= 0x00200000;
471	} else
472	if (src->type == P_CONST || src->type == P_IMMD) {
473		struct nv50_reg *temp = temp_temp(pc);
474
475		emit_mov(pc, temp, src);
476		src = temp;
477	}
478
479	alloc_reg(pc, src);
480	e->inst[0] |= (src->hw << 9);
481}
482
483static void
484set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
485{
486	if (src->type == P_ATTR) {
487		struct nv50_reg *temp = temp_temp(pc);
488
489		emit_mov(pc, temp, src);
490		src = temp;
491	} else
492	if (src->type == P_CONST || src->type == P_IMMD) {
493		assert(!(e->inst[0] & 0x00800000));
494		if (e->inst[0] & 0x01000000) {
495			struct nv50_reg *temp = temp_temp(pc);
496
497			emit_mov(pc, temp, src);
498			src = temp;
499		} else {
500			set_data(pc, src, 0x7f, 16, e);
501			e->inst[0] |= 0x00800000;
502		}
503	}
504
505	alloc_reg(pc, src);
506	e->inst[0] |= (src->hw << 16);
507}
508
509static void
510set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
511{
512	set_long(pc, e);
513
514	if (src->type == P_ATTR) {
515		struct nv50_reg *temp = temp_temp(pc);
516
517		emit_mov(pc, temp, src);
518		src = temp;
519	} else
520	if (src->type == P_CONST || src->type == P_IMMD) {
521		assert(!(e->inst[0] & 0x01000000));
522		if (e->inst[0] & 0x00800000) {
523			struct nv50_reg *temp = temp_temp(pc);
524
525			emit_mov(pc, temp, src);
526			src = temp;
527		} else {
528			set_data(pc, src, 0x7f, 32+14, e);
529			e->inst[0] |= 0x01000000;
530		}
531	}
532
533	alloc_reg(pc, src);
534	e->inst[1] |= (src->hw << 14);
535}
536
537static void
538emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
539	 struct nv50_reg *src1)
540{
541	struct nv50_program_exec *e = exec(pc);
542
543	e->inst[0] |= 0xc0000000;
544	set_long(pc, e);
545
546	check_swap_src_0_1(pc, &src0, &src1);
547	set_dst(pc, dst, e);
548	set_src_0(pc, src0, e);
549	set_src_1(pc, src1, e);
550
551	emit(pc, e);
552}
553
554static void
555emit_add(struct nv50_pc *pc, struct nv50_reg *dst,
556	 struct nv50_reg *src0, struct nv50_reg *src1)
557{
558	struct nv50_program_exec *e = exec(pc);
559
560	e->inst[0] |= 0xb0000000;
561
562	check_swap_src_0_1(pc, &src0, &src1);
563	set_dst(pc, dst, e);
564	set_src_0(pc, src0, e);
565	if (is_long(e))
566		set_src_2(pc, src1, e);
567	else
568		set_src_1(pc, src1, e);
569
570	emit(pc, e);
571}
572
573static void
574emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst,
575	    struct nv50_reg *src0, struct nv50_reg *src1)
576{
577	struct nv50_program_exec *e = exec(pc);
578
579	set_long(pc, e);
580	e->inst[0] |= 0xb0000000;
581	e->inst[1] |= (sub << 29);
582
583	check_swap_src_0_1(pc, &src0, &src1);
584	set_dst(pc, dst, e);
585	set_src_0(pc, src0, e);
586	set_src_1(pc, src1, e);
587
588	emit(pc, e);
589}
590
591static void
592emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
593	 struct nv50_reg *src1)
594{
595	struct nv50_program_exec *e = exec(pc);
596
597	e->inst[0] |= 0xb0000000;
598
599	set_long(pc, e);
600	if (check_swap_src_0_1(pc, &src0, &src1))
601		e->inst[1] |= 0x04000000;
602	else
603		e->inst[1] |= 0x08000000;
604
605	set_dst(pc, dst, e);
606	set_src_0(pc, src0, e);
607	set_src_2(pc, src1, e);
608
609	emit(pc, e);
610}
611
612static void
613emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
614	 struct nv50_reg *src1, struct nv50_reg *src2)
615{
616	struct nv50_program_exec *e = exec(pc);
617
618	e->inst[0] |= 0xe0000000;
619
620	check_swap_src_0_1(pc, &src0, &src1);
621	set_dst(pc, dst, e);
622	set_src_0(pc, src0, e);
623	set_src_1(pc, src1, e);
624	set_src_2(pc, src2, e);
625
626	emit(pc, e);
627}
628
629static void
630emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
631	 struct nv50_reg *src1, struct nv50_reg *src2)
632{
633	struct nv50_program_exec *e = exec(pc);
634
635	e->inst[0] |= 0xe0000000;
636	set_long(pc, e);
637	e->inst[1] |= 0x08000000; /* src0 * src1 - src2 */
638
639	check_swap_src_0_1(pc, &src0, &src1);
640	set_dst(pc, dst, e);
641	set_src_0(pc, src0, e);
642	set_src_1(pc, src1, e);
643	set_src_2(pc, src2, e);
644
645	emit(pc, e);
646}
647
648static void
649emit_flop(struct nv50_pc *pc, unsigned sub,
650	  struct nv50_reg *dst, struct nv50_reg *src)
651{
652	struct nv50_program_exec *e = exec(pc);
653
654	e->inst[0] |= 0x90000000;
655	if (sub) {
656		set_long(pc, e);
657		e->inst[1] |= (sub << 29);
658	}
659
660	set_dst(pc, dst, e);
661	set_src_0(pc, src, e);
662
663	emit(pc, e);
664}
665
666static void
667emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
668{
669	struct nv50_program_exec *e = exec(pc);
670
671	e->inst[0] |= 0xb0000000;
672
673	set_dst(pc, dst, e);
674	set_src_0(pc, src, e);
675	set_long(pc, e);
676	e->inst[1] |= (6 << 29) | 0x00004000;
677
678	emit(pc, e);
679}
680
681static void
682emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
683{
684	struct nv50_program_exec *e = exec(pc);
685
686	e->inst[0] |= 0xb0000000;
687
688	set_dst(pc, dst, e);
689	set_src_0(pc, src, e);
690	set_long(pc, e);
691	e->inst[1] |= (6 << 29);
692
693	emit(pc, e);
694}
695
696static void
697emit_set(struct nv50_pc *pc, unsigned c_op, struct nv50_reg *dst,
698	 struct nv50_reg *src0, struct nv50_reg *src1)
699{
700	struct nv50_program_exec *e = exec(pc);
701	unsigned inv_cop[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };
702	struct nv50_reg *rdst;
703
704	assert(c_op <= 7);
705	if (check_swap_src_0_1(pc, &src0, &src1))
706		c_op = inv_cop[c_op];
707
708	rdst = dst;
709	if (dst->type != P_TEMP)
710		dst = alloc_temp(pc, NULL);
711
712	/* set.u32 */
713	set_long(pc, e);
714	e->inst[0] |= 0xb0000000;
715	e->inst[1] |= (3 << 29);
716	e->inst[1] |= (c_op << 14);
717	/*XXX: breaks things, .u32 by default?
718	 *     decuda will disasm as .u16 and use .lo/.hi regs, but this
719	 *     doesn't seem to match what the hw actually does.
720	inst[1] |= 0x04000000; << breaks things.. .u32 by default?
721	 */
722	set_dst(pc, dst, e);
723	set_src_0(pc, src0, e);
724	set_src_1(pc, src1, e);
725	emit(pc, e);
726
727	/* cvt.f32.u32 */
728	e = exec(pc);
729	e->inst[0] = 0xa0000001;
730	e->inst[1] = 0x64014780;
731	set_dst(pc, rdst, e);
732	set_src_0(pc, dst, e);
733	emit(pc, e);
734
735	if (dst != rdst)
736		free_temp(pc, dst);
737}
738
739static void
740emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
741{
742	struct nv50_program_exec *e = exec(pc);
743
744	e->inst[0] = 0xa0000000; /* cvt */
745	set_long(pc, e);
746	e->inst[1] |= (6 << 29); /* cvt */
747	e->inst[1] |= 0x08000000; /* integer mode */
748	e->inst[1] |= 0x04000000; /* 32 bit */
749	e->inst[1] |= ((0x1 << 3)) << 14; /* .rn */
750	e->inst[1] |= (1 << 14); /* src .f32 */
751	set_dst(pc, dst, e);
752	set_src_0(pc, src, e);
753
754	emit(pc, e);
755}
756
/*
 * Emit dst = pow(v, e), computed as ex2(lg2(v) * e).
 *
 * Sequence: flop 3 (LG2) on v, multiply by e, the pre-EX2 step, then
 * flop 6 (EX2) into dst.  A private temporary holds the intermediate
 * value and is released before returning.  Note that 'e' here is the
 * exponent register, not an instruction record.
 */
static void
emit_pow(struct nv50_pc *pc, struct nv50_reg *dst,
	 struct nv50_reg *v, struct nv50_reg *e)
{
	struct nv50_reg *acc = alloc_temp(pc, NULL);

	emit_flop(pc, 3, acc, v);
	emit_mul(pc, acc, acc, e);
	emit_preex2(pc, acc, acc);
	emit_flop(pc, 6, dst, acc);

	free_temp(pc, acc);
}
770
771static void
772emit_abs(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
773{
774	struct nv50_program_exec *e = exec(pc);
775
776	e->inst[0] = 0xa0000000; /* cvt */
777	set_long(pc, e);
778	e->inst[1] |= (6 << 29); /* cvt */
779	e->inst[1] |= 0x04000000; /* 32 bit */
780	e->inst[1] |= (1 << 14); /* src .f32 */
781	e->inst[1] |= ((1 << 6) << 14); /* .abs */
782	set_dst(pc, dst, e);
783	set_src_0(pc, src, e);
784
785	emit(pc, e);
786}
787
788static void
789emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
790	 struct nv50_reg **src)
791{
792	struct nv50_reg *one = alloc_immd(pc, 1.0);
793	struct nv50_reg *zero = alloc_immd(pc, 0.0);
794	struct nv50_reg *neg128 = alloc_immd(pc, -127.999999);
795	struct nv50_reg *pos128 = alloc_immd(pc,  127.999999);
796	struct nv50_reg *tmp[4];
797
798	if (mask & (1 << 0))
799		emit_mov(pc, dst[0], one);
800
801	if (mask & (1 << 3))
802		emit_mov(pc, dst[3], one);
803
804	if (mask & (3 << 1)) {
805		if (mask & (1 << 1))
806			tmp[0] = dst[1];
807		else
808			tmp[0] = temp_temp(pc);
809		emit_minmax(pc, 4, tmp[0], src[0], zero);
810	}
811
812	if (mask & (1 << 2)) {
813		set_pred_wr(pc, 1, 0, pc->p->exec_tail);
814
815		tmp[1] = temp_temp(pc);
816		emit_minmax(pc, 4, tmp[1], src[1], zero);
817
818		tmp[3] = temp_temp(pc);
819		emit_minmax(pc, 4, tmp[3], src[3], neg128);
820		emit_minmax(pc, 5, tmp[3], tmp[3], pos128);
821
822		emit_pow(pc, dst[2], tmp[1], tmp[3]);
823		emit_mov(pc, dst[2], zero);
824		set_pred(pc, 3, 0, pc->p->exec_tail);
825	}
826
827	FREE(pos128);
828	FREE(neg128);
829	FREE(zero);
830	FREE(one);
831}
832
833static void
834emit_neg(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
835{
836	struct nv50_program_exec *e = exec(pc);
837
838	set_long(pc, e);
839	e->inst[0] |= 0xa0000000; /* delta */
840	e->inst[1] |= (7 << 29); /* delta */
841	e->inst[1] |= 0x04000000; /* negate arg0? probably not */
842	e->inst[1] |= (1 << 14); /* src .f32 */
843	set_dst(pc, dst, e);
844	set_src_0(pc, src, e);
845
846	emit(pc, e);
847}
848
849static void
850emit_kil(struct nv50_pc *pc, struct nv50_reg *src)
851{
852	struct nv50_program_exec *e;
853	const int r_pred = 1;
854
855	/* Sets predicate reg ? */
856	e = exec(pc);
857	e->inst[0] = 0xa00001fd;
858	e->inst[1] = 0xc4014788;
859	set_src_0(pc, src, e);
860	set_pred_wr(pc, 1, r_pred, e);
861	emit(pc, e);
862
863	/* This is probably KILP */
864	e = exec(pc);
865	e->inst[0] = 0x000001fe;
866	set_long(pc, e);
867	set_pred(pc, 1 /* LT? */, r_pred, e);
868	emit(pc, e);
869}
870
871static struct nv50_reg *
872tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
873{
874	switch (dst->DstRegister.File) {
875	case TGSI_FILE_TEMPORARY:
876		return &pc->temp[dst->DstRegister.Index * 4 + c];
877	case TGSI_FILE_OUTPUT:
878		return &pc->result[dst->DstRegister.Index * 4 + c];
879	case TGSI_FILE_NULL:
880		return NULL;
881	default:
882		break;
883	}
884
885	return NULL;
886}
887
888static struct nv50_reg *
889tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src)
890{
891	struct nv50_reg *r = NULL;
892	struct nv50_reg *temp;
893	unsigned sgn, c;
894
895	sgn = tgsi_util_get_full_src_register_sign_mode(src, chan);
896
897	c = tgsi_util_get_full_src_register_extswizzle(src, chan);
898	switch (c) {
899	case TGSI_EXTSWIZZLE_X:
900	case TGSI_EXTSWIZZLE_Y:
901	case TGSI_EXTSWIZZLE_Z:
902	case TGSI_EXTSWIZZLE_W:
903		switch (src->SrcRegister.File) {
904		case TGSI_FILE_INPUT:
905			r = &pc->attr[src->SrcRegister.Index * 4 + c];
906			break;
907		case TGSI_FILE_TEMPORARY:
908			r = &pc->temp[src->SrcRegister.Index * 4 + c];
909			break;
910		case TGSI_FILE_CONSTANT:
911			r = &pc->param[src->SrcRegister.Index * 4 + c];
912			break;
913		case TGSI_FILE_IMMEDIATE:
914			r = &pc->immd[src->SrcRegister.Index * 4 + c];
915			break;
916		case TGSI_FILE_SAMPLER:
917			break;
918		default:
919			assert(0);
920			break;
921		}
922		break;
923	case TGSI_EXTSWIZZLE_ZERO:
924		r = alloc_immd(pc, 0.0);
925		return r;
926	case TGSI_EXTSWIZZLE_ONE:
927		if (sgn == TGSI_UTIL_SIGN_TOGGLE || sgn == TGSI_UTIL_SIGN_SET)
928			return alloc_immd(pc, -1.0);
929		return alloc_immd(pc, 1.0);
930	default:
931		assert(0);
932		break;
933	}
934
935	switch (sgn) {
936	case TGSI_UTIL_SIGN_KEEP:
937		break;
938	case TGSI_UTIL_SIGN_CLEAR:
939		temp = temp_temp(pc);
940		emit_abs(pc, temp, r);
941		r = temp;
942		break;
943	case TGSI_UTIL_SIGN_TOGGLE:
944		temp = temp_temp(pc);
945		emit_neg(pc, temp, r);
946		r = temp;
947		break;
948	case TGSI_UTIL_SIGN_SET:
949		temp = temp_temp(pc);
950		emit_abs(pc, temp, r);
951		emit_neg(pc, temp, r);
952		r = temp;
953		break;
954	default:
955		assert(0);
956		break;
957	}
958
959	return r;
960}
961
962static boolean
963nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
964{
965	const struct tgsi_full_instruction *inst = &tok->FullInstruction;
966	struct nv50_reg *rdst[4], *dst[4], *src[3][4], *temp;
967	unsigned mask, sat, unit;
968	int i, c;
969
970	mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
971	sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE;
972
973	for (c = 0; c < 4; c++) {
974		if (mask & (1 << c))
975			dst[c] = tgsi_dst(pc, c, &inst->FullDstRegisters[0]);
976		else
977			dst[c] = NULL;
978		rdst[c] = NULL;
979		src[0][c] = NULL;
980		src[1][c] = NULL;
981		src[2][c] = NULL;
982	}
983
984	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
985		const struct tgsi_full_src_register *fs = &inst->FullSrcRegisters[i];
986
987		if (fs->SrcRegister.File == TGSI_FILE_SAMPLER)
988			unit = fs->SrcRegister.Index;
989
990		for (c = 0; c < 4; c++)
991			src[i][c] = tgsi_src(pc, c, fs);
992	}
993
994	if (sat) {
995		for (c = 0; c < 4; c++) {
996			rdst[c] = dst[c];
997			dst[c] = temp_temp(pc);
998		}
999	}
1000
1001	switch (inst->Instruction.Opcode) {
1002	case TGSI_OPCODE_ABS:
1003		for (c = 0; c < 4; c++) {
1004			if (!(mask & (1 << c)))
1005				continue;
1006			emit_abs(pc, dst[c], src[0][c]);
1007		}
1008		break;
1009	case TGSI_OPCODE_ADD:
1010		for (c = 0; c < 4; c++) {
1011			if (!(mask & (1 << c)))
1012				continue;
1013			emit_add(pc, dst[c], src[0][c], src[1][c]);
1014		}
1015		break;
1016	case TGSI_OPCODE_COS:
1017		temp = temp_temp(pc);
1018		emit_precossin(pc, temp, src[0][0]);
1019		emit_flop(pc, 5, temp, temp);
1020		for (c = 0; c < 4; c++) {
1021			if (!(mask & (1 << c)))
1022				continue;
1023			emit_mov(pc, dst[c], temp);
1024		}
1025		break;
1026	case TGSI_OPCODE_DP3:
1027		temp = temp_temp(pc);
1028		emit_mul(pc, temp, src[0][0], src[1][0]);
1029		emit_mad(pc, temp, src[0][1], src[1][1], temp);
1030		emit_mad(pc, temp, src[0][2], src[1][2], temp);
1031		for (c = 0; c < 4; c++) {
1032			if (!(mask & (1 << c)))
1033				continue;
1034			emit_mov(pc, dst[c], temp);
1035		}
1036		break;
1037	case TGSI_OPCODE_DP4:
1038		temp = temp_temp(pc);
1039		emit_mul(pc, temp, src[0][0], src[1][0]);
1040		emit_mad(pc, temp, src[0][1], src[1][1], temp);
1041		emit_mad(pc, temp, src[0][2], src[1][2], temp);
1042		emit_mad(pc, temp, src[0][3], src[1][3], temp);
1043		for (c = 0; c < 4; c++) {
1044			if (!(mask & (1 << c)))
1045				continue;
1046			emit_mov(pc, dst[c], temp);
1047		}
1048		break;
1049	case TGSI_OPCODE_DPH:
1050		temp = temp_temp(pc);
1051		emit_mul(pc, temp, src[0][0], src[1][0]);
1052		emit_mad(pc, temp, src[0][1], src[1][1], temp);
1053		emit_mad(pc, temp, src[0][2], src[1][2], temp);
1054		emit_add(pc, temp, src[1][3], temp);
1055		for (c = 0; c < 4; c++) {
1056			if (!(mask & (1 << c)))
1057				continue;
1058			emit_mov(pc, dst[c], temp);
1059		}
1060		break;
1061	case TGSI_OPCODE_DST:
1062	{
1063		struct nv50_reg *one = alloc_immd(pc, 1.0);
1064		if (mask & (1 << 0))
1065			emit_mov(pc, dst[0], one);
1066		if (mask & (1 << 1))
1067			emit_mul(pc, dst[1], src[0][1], src[1][1]);
1068		if (mask & (1 << 2))
1069			emit_mov(pc, dst[2], src[0][2]);
1070		if (mask & (1 << 3))
1071			emit_mov(pc, dst[3], src[1][3]);
1072		FREE(one);
1073	}
1074		break;
1075	case TGSI_OPCODE_EX2:
1076		temp = temp_temp(pc);
1077		emit_preex2(pc, temp, src[0][0]);
1078		emit_flop(pc, 6, temp, temp);
1079		for (c = 0; c < 4; c++) {
1080			if (!(mask & (1 << c)))
1081				continue;
1082			emit_mov(pc, dst[c], temp);
1083		}
1084		break;
1085	case TGSI_OPCODE_FLR:
1086		for (c = 0; c < 4; c++) {
1087			if (!(mask & (1 << c)))
1088				continue;
1089			emit_flr(pc, dst[c], src[0][c]);
1090		}
1091		break;
1092	case TGSI_OPCODE_FRC:
1093		temp = temp_temp(pc);
1094		for (c = 0; c < 4; c++) {
1095			if (!(mask & (1 << c)))
1096				continue;
1097			emit_flr(pc, temp, src[0][c]);
1098			emit_sub(pc, dst[c], src[0][c], temp);
1099		}
1100		break;
1101	case TGSI_OPCODE_KIL:
1102		emit_kil(pc, src[0][0]);
1103		emit_kil(pc, src[0][1]);
1104		emit_kil(pc, src[0][2]);
1105		emit_kil(pc, src[0][3]);
1106		break;
1107	case TGSI_OPCODE_LIT:
1108		emit_lit(pc, &dst[0], mask, &src[0][0]);
1109		break;
1110	case TGSI_OPCODE_LG2:
1111		temp = temp_temp(pc);
1112		emit_flop(pc, 3, temp, src[0][0]);
1113		for (c = 0; c < 4; c++) {
1114			if (!(mask & (1 << c)))
1115				continue;
1116			emit_mov(pc, dst[c], temp);
1117		}
1118		break;
1119	case TGSI_OPCODE_LRP:
1120		temp = temp_temp(pc);
1121		for (c = 0; c < 4; c++) {
1122			if (!(mask & (1 << c)))
1123				continue;
1124			emit_sub(pc, temp, src[1][c], src[2][c]);
1125			emit_mad(pc, dst[c], temp, src[0][c], src[2][c]);
1126		}
1127		break;
1128	case TGSI_OPCODE_MAD:
1129		for (c = 0; c < 4; c++) {
1130			if (!(mask & (1 << c)))
1131				continue;
1132			emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]);
1133		}
1134		break;
1135	case TGSI_OPCODE_MAX:
1136		for (c = 0; c < 4; c++) {
1137			if (!(mask & (1 << c)))
1138				continue;
1139			emit_minmax(pc, 4, dst[c], src[0][c], src[1][c]);
1140		}
1141		break;
1142	case TGSI_OPCODE_MIN:
1143		for (c = 0; c < 4; c++) {
1144			if (!(mask & (1 << c)))
1145				continue;
1146			emit_minmax(pc, 5, dst[c], src[0][c], src[1][c]);
1147		}
1148		break;
1149	case TGSI_OPCODE_MOV:
1150		for (c = 0; c < 4; c++) {
1151			if (!(mask & (1 << c)))
1152				continue;
1153			emit_mov(pc, dst[c], src[0][c]);
1154		}
1155		break;
1156	case TGSI_OPCODE_MUL:
1157		for (c = 0; c < 4; c++) {
1158			if (!(mask & (1 << c)))
1159				continue;
1160			emit_mul(pc, dst[c], src[0][c], src[1][c]);
1161		}
1162		break;
1163	case TGSI_OPCODE_POW:
1164		temp = temp_temp(pc);
1165		emit_pow(pc, temp, src[0][0], src[1][0]);
1166		for (c = 0; c < 4; c++) {
1167			if (!(mask & (1 << c)))
1168				continue;
1169			emit_mov(pc, dst[c], temp);
1170		}
1171		break;
1172	case TGSI_OPCODE_RCP:
1173		for (c = 0; c < 4; c++) {
1174			if (!(mask & (1 << c)))
1175				continue;
1176			emit_flop(pc, 0, dst[c], src[0][0]);
1177		}
1178		break;
1179	case TGSI_OPCODE_RSQ:
1180		for (c = 0; c < 4; c++) {
1181			if (!(mask & (1 << c)))
1182				continue;
1183			emit_flop(pc, 2, dst[c], src[0][0]);
1184		}
1185		break;
1186	case TGSI_OPCODE_SCS:
1187		temp = temp_temp(pc);
1188		emit_precossin(pc, temp, src[0][0]);
1189		if (mask & (1 << 0))
1190			emit_flop(pc, 5, dst[0], temp);
1191		if (mask & (1 << 1))
1192			emit_flop(pc, 4, dst[1], temp);
1193		break;
1194	case TGSI_OPCODE_SGE:
1195		for (c = 0; c < 4; c++) {
1196			if (!(mask & (1 << c)))
1197				continue;
1198			emit_set(pc, 6, dst[c], src[0][c], src[1][c]);
1199		}
1200		break;
1201	case TGSI_OPCODE_SIN:
1202		temp = temp_temp(pc);
1203		emit_precossin(pc, temp, src[0][0]);
1204		emit_flop(pc, 4, temp, temp);
1205		for (c = 0; c < 4; c++) {
1206			if (!(mask & (1 << c)))
1207				continue;
1208			emit_mov(pc, dst[c], temp);
1209		}
1210		break;
1211	case TGSI_OPCODE_SLT:
1212		for (c = 0; c < 4; c++) {
1213			if (!(mask & (1 << c)))
1214				continue;
1215			emit_set(pc, 1, dst[c], src[0][c], src[1][c]);
1216		}
1217		break;
1218	case TGSI_OPCODE_SUB:
1219		for (c = 0; c < 4; c++) {
1220			if (!(mask & (1 << c)))
1221				continue;
1222			emit_sub(pc, dst[c], src[0][c], src[1][c]);
1223		}
1224		break;
1225	case TGSI_OPCODE_TEX:
1226	case TGSI_OPCODE_TXP:
1227	{
1228		struct nv50_reg *t[4];
1229		struct nv50_program_exec *e;
1230
1231		alloc_temp4(pc, t, 0);
1232		emit_mov(pc, t[0], src[0][0]);
1233		emit_mov(pc, t[1], src[0][1]);
1234
1235		e = exec(pc);
1236		e->inst[0] = 0xf6400000;
1237		e->inst[0] |= (unit << 9);
1238		set_long(pc, e);
1239		e->inst[1] |= 0x0000c004;
1240		set_dst(pc, t[0], e);
1241		emit(pc, e);
1242
1243		if (mask & (1 << 0)) emit_mov(pc, dst[0], t[0]);
1244		if (mask & (1 << 1)) emit_mov(pc, dst[1], t[1]);
1245		if (mask & (1 << 2)) emit_mov(pc, dst[2], t[2]);
1246		if (mask & (1 << 3)) emit_mov(pc, dst[3], t[3]);
1247
1248		free_temp4(pc, t);
1249	}
1250		break;
1251	case TGSI_OPCODE_XPD:
1252		temp = temp_temp(pc);
1253		if (mask & (1 << 0)) {
1254			emit_mul(pc, temp, src[0][2], src[1][1]);
1255			emit_msb(pc, dst[0], src[0][1], src[1][2], temp);
1256		}
1257		if (mask & (1 << 1)) {
1258			emit_mul(pc, temp, src[0][0], src[1][2]);
1259			emit_msb(pc, dst[1], src[0][2], src[1][0], temp);
1260		}
1261		if (mask & (1 << 2)) {
1262			emit_mul(pc, temp, src[0][1], src[1][0]);
1263			emit_msb(pc, dst[2], src[0][0], src[1][1], temp);
1264		}
1265		break;
1266	case TGSI_OPCODE_END:
1267		break;
1268	default:
1269		NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode);
1270		return FALSE;
1271	}
1272
1273	if (sat) {
1274		for (c = 0; c < 4; c++) {
1275			struct nv50_program_exec *e;
1276
1277			if (!(mask & (1 << c)))
1278				continue;
1279			e = exec(pc);
1280
1281			e->inst[0] = 0xa0000000; /* cvt */
1282			set_long(pc, e);
1283			e->inst[1] |= (6 << 29); /* cvt */
1284			e->inst[1] |= 0x04000000; /* 32 bit */
1285			e->inst[1] |= (1 << 14); /* src .f32 */
1286			e->inst[1] |= ((1 << 5) << 14); /* .sat */
1287			set_dst(pc, rdst[c], e);
1288			set_src_0(pc, dst[c], e);
1289			emit(pc, e);
1290		}
1291	}
1292
1293	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1294		for (c = 0; c < 4; c++) {
1295			if (!src[i][c])
1296				continue;
1297			if (src[i][c]->index == -1 && src[i][c]->type == P_IMMD)
1298				FREE(src[i][c]);
1299		}
1300	}
1301
1302	kill_temp_temp(pc);
1303	return TRUE;
1304}
1305
1306static boolean
1307nv50_program_tx_prep(struct nv50_pc *pc)
1308{
1309	struct tgsi_parse_context p;
1310	boolean ret = FALSE;
1311	unsigned i, c;
1312
1313	tgsi_parse_init(&p, pc->p->pipe.tokens);
1314	while (!tgsi_parse_end_of_tokens(&p)) {
1315		const union tgsi_full_token *tok = &p.FullToken;
1316
1317		tgsi_parse_token(&p);
1318		switch (tok->Token.Type) {
1319		case TGSI_TOKEN_TYPE_IMMEDIATE:
1320		{
1321			const struct tgsi_full_immediate *imm =
1322				&p.FullToken.FullImmediate;
1323
1324			ctor_immd(pc, imm->u.ImmediateFloat32[0].Float,
1325				      imm->u.ImmediateFloat32[1].Float,
1326				      imm->u.ImmediateFloat32[2].Float,
1327				      imm->u.ImmediateFloat32[3].Float);
1328		}
1329			break;
1330		case TGSI_TOKEN_TYPE_DECLARATION:
1331		{
1332			const struct tgsi_full_declaration *d;
1333			unsigned last;
1334
1335			d = &p.FullToken.FullDeclaration;
1336			last = d->DeclarationRange.Last;
1337
1338			switch (d->Declaration.File) {
1339			case TGSI_FILE_TEMPORARY:
1340				if (pc->temp_nr < (last + 1))
1341					pc->temp_nr = last + 1;
1342				break;
1343			case TGSI_FILE_OUTPUT:
1344				if (pc->result_nr < (last + 1))
1345					pc->result_nr = last + 1;
1346				break;
1347			case TGSI_FILE_INPUT:
1348				if (pc->attr_nr < (last + 1))
1349					pc->attr_nr = last + 1;
1350				break;
1351			case TGSI_FILE_CONSTANT:
1352				if (pc->param_nr < (last + 1))
1353					pc->param_nr = last + 1;
1354				break;
1355			case TGSI_FILE_SAMPLER:
1356				break;
1357			default:
1358				NOUVEAU_ERR("bad decl file %d\n",
1359					    d->Declaration.File);
1360				goto out_err;
1361			}
1362		}
1363			break;
1364		case TGSI_TOKEN_TYPE_INSTRUCTION:
1365			break;
1366		default:
1367			break;
1368		}
1369	}
1370
1371	if (pc->temp_nr) {
1372		pc->temp = CALLOC(pc->temp_nr * 4, sizeof(struct nv50_reg));
1373		if (!pc->temp)
1374			goto out_err;
1375
1376		for (i = 0; i < pc->temp_nr; i++) {
1377			for (c = 0; c < 4; c++) {
1378				pc->temp[i*4+c].type = P_TEMP;
1379				pc->temp[i*4+c].hw = -1;
1380				pc->temp[i*4+c].index = i;
1381			}
1382		}
1383	}
1384
1385	if (pc->attr_nr) {
1386		struct nv50_reg *iv = NULL;
1387		int aid = 0;
1388
1389		pc->attr = CALLOC(pc->attr_nr * 4, sizeof(struct nv50_reg));
1390		if (!pc->attr)
1391			goto out_err;
1392
1393		if (pc->p->type == PIPE_SHADER_FRAGMENT) {
1394			iv = alloc_temp(pc, NULL);
1395			emit_interp(pc, iv, iv, NULL);
1396			emit_flop(pc, 0, iv, iv);
1397			aid++;
1398		}
1399
1400		for (i = 0; i < pc->attr_nr; i++) {
1401			struct nv50_reg *a = &pc->attr[i*4];
1402
1403			for (c = 0; c < 4; c++) {
1404				if (pc->p->type == PIPE_SHADER_FRAGMENT) {
1405					struct nv50_reg *at =
1406						alloc_temp(pc, NULL);
1407					pc->attr[i*4+c].type = at->type;
1408					pc->attr[i*4+c].hw = at->hw;
1409					pc->attr[i*4+c].index = at->index;
1410				} else {
1411					pc->p->cfg.vp.attr[aid/32] |=
1412						(1 << (aid % 32));
1413					pc->attr[i*4+c].type = P_ATTR;
1414					pc->attr[i*4+c].hw = aid++;
1415					pc->attr[i*4+c].index = i;
1416				}
1417			}
1418
1419			if (pc->p->type != PIPE_SHADER_FRAGMENT)
1420				continue;
1421
1422			emit_interp(pc, &a[0], &a[0], iv);
1423			emit_interp(pc, &a[1], &a[1], iv);
1424			emit_interp(pc, &a[2], &a[2], iv);
1425			emit_interp(pc, &a[3], &a[3], iv);
1426		}
1427
1428		if (iv)
1429			free_temp(pc, iv);
1430	}
1431
1432	if (pc->result_nr) {
1433		int rid = 0;
1434
1435		pc->result = CALLOC(pc->result_nr * 4, sizeof(struct nv50_reg));
1436		if (!pc->result)
1437			goto out_err;
1438
1439		for (i = 0; i < pc->result_nr; i++) {
1440			for (c = 0; c < 4; c++) {
1441				if (pc->p->type == PIPE_SHADER_FRAGMENT) {
1442					pc->result[i*4+c].type = P_TEMP;
1443					pc->result[i*4+c].hw = -1;
1444				} else {
1445					pc->result[i*4+c].type = P_RESULT;
1446					pc->result[i*4+c].hw = rid++;
1447				}
1448				pc->result[i*4+c].index = i;
1449			}
1450		}
1451	}
1452
1453	if (pc->param_nr) {
1454		int rid = 0;
1455
1456		pc->param = CALLOC(pc->param_nr * 4, sizeof(struct nv50_reg));
1457		if (!pc->param)
1458			goto out_err;
1459
1460		for (i = 0; i < pc->param_nr; i++) {
1461			for (c = 0; c < 4; c++) {
1462				pc->param[i*4+c].type = P_CONST;
1463				pc->param[i*4+c].hw = rid++;
1464				pc->param[i*4+c].index = i;
1465			}
1466		}
1467	}
1468
1469	if (pc->immd_nr) {
1470		int rid = pc->param_nr * 4;
1471
1472		pc->immd = CALLOC(pc->immd_nr * 4, sizeof(struct nv50_reg));
1473		if (!pc->immd)
1474			goto out_err;
1475
1476		for (i = 0; i < pc->immd_nr; i++) {
1477			for (c = 0; c < 4; c++) {
1478				pc->immd[i*4+c].type = P_IMMD;
1479				pc->immd[i*4+c].hw = rid++;
1480				pc->immd[i*4+c].index = i;
1481			}
1482		}
1483	}
1484
1485	ret = TRUE;
1486out_err:
1487	tgsi_parse_free(&p);
1488	return ret;
1489}
1490
1491static void
1492free_nv50_pc(struct nv50_pc *pc)
1493{
1494	unsigned i;
1495
1496	if (pc->immd)
1497		FREE(pc->immd);
1498	if (pc->param)
1499		FREE(pc->param);
1500	if (pc->result)
1501		FREE(pc->result);
1502	if (pc->attr)
1503		FREE(pc->attr);
1504	if (pc->temp)
1505		FREE(pc->temp);
1506
1507	for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
1508		/* deallocate fragment program attributes */
1509		if (pc->r_temp[i] && pc->r_temp[i]->index == -1)
1510			FREE(pc->r_temp[i]);
1511	}
1512
1513	FREE(pc);
1514}
1515
1516static boolean
1517nv50_program_tx(struct nv50_program *p)
1518{
1519	struct tgsi_parse_context parse;
1520	struct nv50_pc *pc;
1521	boolean ret;
1522
1523	pc = CALLOC_STRUCT(nv50_pc);
1524	if (!pc)
1525		return FALSE;
1526	pc->p = p;
1527	pc->p->cfg.high_temp = 4;
1528
1529	ret = nv50_program_tx_prep(pc);
1530	if (ret == FALSE)
1531		goto out_cleanup;
1532
1533	tgsi_parse_init(&parse, pc->p->pipe.tokens);
1534	while (!tgsi_parse_end_of_tokens(&parse)) {
1535		const union tgsi_full_token *tok = &parse.FullToken;
1536
1537		tgsi_parse_token(&parse);
1538
1539		switch (tok->Token.Type) {
1540		case TGSI_TOKEN_TYPE_INSTRUCTION:
1541			ret = nv50_program_tx_insn(pc, tok);
1542			if (ret == FALSE)
1543				goto out_err;
1544			break;
1545		default:
1546			break;
1547		}
1548	}
1549
1550	if (p->type == PIPE_SHADER_FRAGMENT) {
1551		struct nv50_reg out;
1552
1553		out.type = P_TEMP;
1554		for (out.hw = 0; out.hw < pc->result_nr * 4; out.hw++)
1555			emit_mov(pc, &out, &pc->result[out.hw]);
1556	}
1557
1558	assert(is_long(pc->p->exec_tail) && !is_immd(pc->p->exec_head));
1559	pc->p->exec_tail->inst[1] |= 0x00000001;
1560
1561	p->param_nr = pc->param_nr * 4;
1562	p->immd_nr = pc->immd_nr * 4;
1563	p->immd = pc->immd_buf;
1564
1565out_err:
1566	tgsi_parse_free(&parse);
1567
1568out_cleanup:
1569	free_nv50_pc(pc);
1570	return ret;
1571}
1572
1573static void
1574nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p)
1575{
1576	if (nv50_program_tx(p) == FALSE)
1577		assert(0);
1578	p->translated = TRUE;
1579}
1580
1581static void
1582nv50_program_upload_data(struct nv50_context *nv50, float *map,
1583			 unsigned start, unsigned count)
1584{
1585	struct nouveau_channel *chan = nv50->screen->nvws->channel;
1586	struct nouveau_grobj *tesla = nv50->screen->tesla;
1587
1588	while (count) {
1589		unsigned nr = count > 2047 ? 2047 : count;
1590
1591		BEGIN_RING(chan, tesla, 0x00000f00, 1);
1592		OUT_RING  (chan, (NV50_CB_PMISC << 0) | (start << 8));
1593		BEGIN_RING(chan, tesla, 0x40000f04, nr);
1594		OUT_RINGp (chan, map, nr);
1595
1596		map += nr;
1597		start += nr;
1598		count -= nr;
1599	}
1600}
1601
1602static void
1603nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)
1604{
1605	struct nouveau_winsys *nvws = nv50->screen->nvws;
1606	struct pipe_winsys *ws = nv50->pipe.winsys;
1607	unsigned nr = p->param_nr + p->immd_nr;
1608
1609	if (!p->data && nr) {
1610		struct nouveau_resource *heap = nv50->screen->vp_data_heap;
1611
1612		if (nvws->res_alloc(heap, nr, p, &p->data)) {
1613			while (heap->next && heap->size < nr) {
1614				struct nv50_program *evict = heap->next->priv;
1615				nvws->res_free(&evict->data);
1616			}
1617
1618			if (nvws->res_alloc(heap, nr, p, &p->data))
1619				assert(0);
1620		}
1621	}
1622
1623	if (p->param_nr) {
1624		float *map = ws->buffer_map(ws, nv50->constbuf[p->type],
1625					    PIPE_BUFFER_USAGE_CPU_READ);
1626		nv50_program_upload_data(nv50, map, p->data->start,
1627					 p->param_nr);
1628		ws->buffer_unmap(ws, nv50->constbuf[p->type]);
1629	}
1630
1631	if (p->immd_nr) {
1632		nv50_program_upload_data(nv50, p->immd,
1633					 p->data->start + p->param_nr,
1634					 p->immd_nr);
1635	}
1636}
1637
1638static void
1639nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
1640{
1641	struct nouveau_channel *chan = nv50->screen->nvws->channel;
1642	struct nouveau_grobj *tesla = nv50->screen->tesla;
1643	struct pipe_screen *screen = nv50->pipe.screen;
1644	struct nv50_program_exec *e;
1645	struct nouveau_stateobj *so;
1646	const unsigned flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_WR;
1647	unsigned start, count, *up, *ptr;
1648	boolean upload = FALSE;
1649
1650	if (!p->buffer) {
1651		p->buffer = screen->buffer_create(screen, 0x100, 0, p->exec_size * 4);
1652		upload = TRUE;
1653	}
1654
1655	if (p->data && p->data->start != p->data_start) {
1656		for (e = p->exec_head; e; e = e->next) {
1657			unsigned ei, ci;
1658
1659			if (e->param.index < 0)
1660				continue;
1661			ei = e->param.shift >> 5;
1662			ci = e->param.index + p->data->start;
1663
1664			e->inst[ei] &= ~e->param.mask;
1665			e->inst[ei] |= (ci << e->param.shift);
1666		}
1667
1668		p->data_start = p->data->start;
1669		upload = TRUE;
1670	}
1671
1672	if (!upload)
1673		return;
1674
1675#ifdef NV50_PROGRAM_DUMP
1676	NOUVEAU_ERR("-------\n");
1677	for (e = p->exec_head; e; e = e->next) {
1678		NOUVEAU_ERR("0x%08x\n", e->inst[0]);
1679		if (is_long(e))
1680			NOUVEAU_ERR("0x%08x\n", e->inst[1]);
1681	}
1682#endif
1683
1684	up = ptr = MALLOC(p->exec_size * 4);
1685	for (e = p->exec_head; e; e = e->next) {
1686		*(ptr++) = e->inst[0];
1687		if (is_long(e))
1688			*(ptr++) = e->inst[1];
1689	}
1690
1691	so = so_new(4,2);
1692	so_method(so, nv50->screen->tesla, 0x1280, 3);
1693	so_reloc (so, p->buffer, 0, flags | NOUVEAU_BO_HIGH, 0, 0);
1694	so_reloc (so, p->buffer, 0, flags | NOUVEAU_BO_LOW, 0, 0);
1695	so_data  (so, (NV50_CB_PUPLOAD << 16) | 0x0800); //(p->exec_size * 4));
1696
1697	start = 0; count = p->exec_size;
1698	while (count) {
1699		struct nouveau_winsys *nvws = nv50->screen->nvws;
1700		unsigned nr;
1701
1702		so_emit(nvws, so);
1703
1704		nr = MIN2(count, 2047);
1705		nr = MIN2(nvws->channel->pushbuf->remaining, nr);
1706		if (nvws->channel->pushbuf->remaining < (nr + 3)) {
1707			FIRE_RING(chan);
1708			continue;
1709		}
1710
1711		BEGIN_RING(chan, tesla, 0x0f00, 1);
1712		OUT_RING  (chan, (start << 8) | NV50_CB_PUPLOAD);
1713		BEGIN_RING(chan, tesla, 0x40000f04, nr);
1714		OUT_RINGp (chan, up + start, nr);
1715
1716		start += nr;
1717		count -= nr;
1718	}
1719
1720	FREE(up);
1721	so_ref(NULL, &so);
1722}
1723
1724void
1725nv50_vertprog_validate(struct nv50_context *nv50)
1726{
1727	struct nouveau_grobj *tesla = nv50->screen->tesla;
1728	struct nv50_program *p = nv50->vertprog;
1729	struct nouveau_stateobj *so;
1730
1731	if (!p->translated) {
1732		nv50_program_validate(nv50, p);
1733		if (!p->translated)
1734			assert(0);
1735	}
1736
1737	nv50_program_validate_data(nv50, p);
1738	nv50_program_validate_code(nv50, p);
1739
1740	so = so_new(13, 2);
1741	so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2);
1742	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
1743		  NOUVEAU_BO_HIGH, 0, 0);
1744	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
1745		  NOUVEAU_BO_LOW, 0, 0);
1746	so_method(so, tesla, 0x1650, 2);
1747	so_data  (so, p->cfg.vp.attr[0]);
1748	so_data  (so, p->cfg.vp.attr[1]);
1749	so_method(so, tesla, 0x16b8, 1);
1750	so_data  (so, p->cfg.high_result);
1751	so_method(so, tesla, 0x16ac, 2);
1752	so_data  (so, p->cfg.high_result); //8);
1753	so_data  (so, p->cfg.high_temp);
1754	so_method(so, tesla, 0x140c, 1);
1755	so_data  (so, 0); /* program start offset */
1756	so_ref(so, &nv50->state.vertprog);
1757	so_ref(NULL, &so);
1758}
1759
1760void
1761nv50_fragprog_validate(struct nv50_context *nv50)
1762{
1763	struct nouveau_grobj *tesla = nv50->screen->tesla;
1764	struct nv50_program *p = nv50->fragprog;
1765	struct nouveau_stateobj *so;
1766
1767	if (!p->translated) {
1768		nv50_program_validate(nv50, p);
1769		if (!p->translated)
1770			assert(0);
1771	}
1772
1773	nv50_program_validate_data(nv50, p);
1774	nv50_program_validate_code(nv50, p);
1775
1776	so = so_new(64, 2);
1777	so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2);
1778	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
1779		  NOUVEAU_BO_HIGH, 0, 0);
1780	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
1781		  NOUVEAU_BO_LOW, 0, 0);
1782	so_method(so, tesla, 0x1904, 4);
1783	so_data  (so, 0x00040404); /* p: 0x01000404 */
1784	so_data  (so, 0x00000004);
1785	so_data  (so, 0x00000000);
1786	so_data  (so, 0x00000000);
1787	so_method(so, tesla, 0x16bc, 3); /*XXX: fixme */
1788	so_data  (so, 0x03020100);
1789	so_data  (so, 0x07060504);
1790	so_data  (so, 0x0b0a0908);
1791	so_method(so, tesla, 0x1988, 2);
1792	so_data  (so, 0x08080408); //0x08040404); /* p: 0x0f000401 */
1793	so_data  (so, p->cfg.high_temp);
1794	so_method(so, tesla, 0x1414, 1);
1795	so_data  (so, 0); /* program start offset */
1796	so_ref(so, &nv50->state.fragprog);
1797	so_ref(NULL, &so);
1798}
1799
1800void
1801nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
1802{
1803	struct pipe_screen *pscreen = nv50->pipe.screen;
1804
1805	while (p->exec_head) {
1806		struct nv50_program_exec *e = p->exec_head;
1807
1808		p->exec_head = e->next;
1809		FREE(e);
1810	}
1811	p->exec_tail = NULL;
1812	p->exec_size = 0;
1813
1814	if (p->buffer)
1815		pipe_buffer_reference(&p->buffer, NULL);
1816
1817	nv50->screen->nvws->res_free(&p->data);
1818
1819	p->translated = 0;
1820}
1821
1822